date:20230414




On 2023/4/15 00:02, Mayuresh Chitale wrote:

When misa.F is clear, TB_FLAGS.MSTATUS_HS_FS field is unused and can
be used to save the current state of smstateen0.FCSR check which is
needed by the floating point translation routines.

Signed-off-by: Mayuresh Chitale 
---
  target/riscv/cpu_helper.c | 12 
  target/riscv/translate.c  |  7 +++
  2 files changed, 19 insertions(+)

diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index 433ea529b0..fd1731cc39 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -105,6 +105,18 @@ void cpu_get_tb_cpu_state(CPURISCVState *env, target_ulong 
*pc,
  flags = FIELD_DP32(flags, TB_FLAGS, MSTATUS_HS_VS,
 get_field(env->mstatus_hs, MSTATUS_VS));
  }
+/*
+ * If misa.F is 0 then the MSTATUS_HS_FS field of the tb->flags
+ * can be used to pass the current state of the smstateen.FCSR bit
+ * which must be checked for in the floating point translation routines
+ */
+if (!riscv_has_ext(env, RVF)) {
+if (smstateen_acc_ok(env, 0, SMSTATEEN0_FCSR) == RISCV_EXCP_NONE) {
+flags = FIELD_DP32(flags, TB_FLAGS, MSTATUS_HS_FS, 1);
+} else {
+flags = FIELD_DP32(flags, TB_FLAGS, MSTATUS_HS_FS, 0);
+}
+}
  if (cpu->cfg.debug && !icount_enabled()) {
  flags = FIELD_DP32(flags, TB_FLAGS, ITRIGGER, env->itrigger_enabled);
  }
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index d0094922b6..e29bbb8b70 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -79,6 +79,7 @@ typedef struct DisasContext {
  int frm;
  RISCVMXL ol;
  bool virt_inst_excp;
+bool smstateen_fcsr_ok;
  bool virt_enabled;
  const RISCVCPUConfig *cfg_ptr;
  bool hlsx;
@@ -1202,6 +1203,12 @@ static void riscv_tr_init_disas_context(DisasContextBase 
*dcbase, CPUState *cs)
  ctx->itrigger = FIELD_EX32(tb_flags, TB_FLAGS, ITRIGGER);
  ctx->zero = tcg_constant_tl(0);
  ctx->virt_inst_excp = false;
+if (has_ext(ctx, RVF)) {
+ctx->smstateen_fcsr_ok = 1;
+} else {
+ctx->smstateen_fcsr_ok = FIELD_EX32(tb_flags, TB_FLAGS,
+ MSTATUS_HS_FS);


By the way, it may introduce new question when MSTATUS_FS and 
MSTATUS_HS_FS is merged to save bits in tb_flag


by Richerd's patchset: 20230412114333.118895-5-richard.hender...@linaro.org

such as: the check "s->mstatus_fs == 0" in require_rvf() will be false 
if smstateen_fcsr_ok is true.


However, this should be true in this case to indicate F is diabled.

So we may need to set ctx->mstatus_fs = 0 here once merged with 
Richerd's patchset.


Regards,

Weiwei Li


+}
  }
  
  static void riscv_tr_tb_start(DisasContextBase *db, CPUState *cpu)

Re: [RFC PATCH v2 3/4] target/riscv: check smstateen fcsr flag




On 2023/4/15 00:02, Mayuresh Chitale wrote:

If misa.F and smstateen_fcsr_ok flag are clear then all the floating
point instructions must generate an appropriate exception.

Signed-off-by: Mayuresh Chitale 
---
  target/riscv/insn_trans/trans_rvd.c.inc   | 13 
  target/riscv/insn_trans/trans_rvf.c.inc   | 24 +++
  target/riscv/insn_trans/trans_rvzfh.c.inc | 18 ++---
  3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/target/riscv/insn_trans/trans_rvd.c.inc 
b/target/riscv/insn_trans/trans_rvd.c.inc
index 2c51e01c40..d9e0cf116f 100644
--- a/target/riscv/insn_trans/trans_rvd.c.inc
+++ b/target/riscv/insn_trans/trans_rvd.c.inc
@@ -18,10 +18,15 @@
   * this program.  If not, see .
   */
  
-#define REQUIRE_ZDINX_OR_D(ctx) do { \

-if (!ctx->cfg_ptr->ext_zdinx) { \
-REQUIRE_EXT(ctx, RVD); \
-} \
+#define REQUIRE_ZDINX_OR_D(ctx) do {   \
+if (!has_ext(ctx, RVD)) {  \
+if (!ctx->cfg_ptr->ext_zdinx) {\
+return false;  \
+}  \
+if (!smstateen_fcsr_check(ctx)) {  \
+return false;  \
+}  \
+}  \
  } while (0)
  
  #define REQUIRE_EVEN(ctx, reg) do { \

diff --git a/target/riscv/insn_trans/trans_rvf.c.inc 
b/target/riscv/insn_trans/trans_rvf.c.inc
index 9e9fa2087a..6bf6fe16be 100644
--- a/target/riscv/insn_trans/trans_rvf.c.inc
+++ b/target/riscv/insn_trans/trans_rvf.c.inc
@@ -24,10 +24,26 @@
  return false; \
  } while (0)
  
-#define REQUIRE_ZFINX_OR_F(ctx) do {\

-if (!ctx->cfg_ptr->ext_zfinx) { \
-REQUIRE_EXT(ctx, RVF); \
-} \
+static inline bool smstateen_fcsr_check(DisasContext *ctx)
+{
+#ifndef CONFIG_USER_ONLY
+if (!has_ext(ctx, RVF) && !ctx->smstateen_fcsr_ok) {


We needn't check RVF here. Two reasons:

1. This check only be done when RVF is diabled.

2. ctx->smstateen_fcsr_ok is always be true (set in last patch) when RVF 
is enabled



+ctx->virt_inst_excp = ctx->virt_enabled;
+return false;
+}
+#endif
+return true;
+}
+
+#define REQUIRE_ZFINX_OR_F(ctx) do {   \
+if (!has_ext(ctx, RVF)) {  \
+if (!ctx->cfg_ptr->ext_zfinx) {\
+return false;  \
+}  \
+if (!smstateen_fcsr_check(ctx)) {  \
+return false;  \
+}  \
+}  \
  } while (0)
  
  #define REQUIRE_ZCF(ctx) do {  \

diff --git a/target/riscv/insn_trans/trans_rvzfh.c.inc 
b/target/riscv/insn_trans/trans_rvzfh.c.inc
index 74dde37ff7..74a125e4c0 100644
--- a/target/riscv/insn_trans/trans_rvzfh.c.inc
+++ b/target/riscv/insn_trans/trans_rvzfh.c.inc
@@ -16,28 +16,40 @@
   * this program.  If not, see .
   */
  
-#define REQUIRE_ZFH(ctx) do { \

+#define REQUIRE_ZFH(ctx) do {  \
  if (!ctx->cfg_ptr->ext_zfh) {  \
-return false; \
-} \
+return false;  \
+}  \
+if (!smstateen_fcsr_check(ctx)) {  \
+return false;  \
+}  \


This is unnecessary here. This check is only for Zhinx.

Similar to REQUIRE_ZFHMIN.


  } while (0)
  
  #define REQUIRE_ZHINX_OR_ZFH(ctx) do { \

  if (!ctx->cfg_ptr->ext_zhinx && !ctx->cfg_ptr->ext_zfh) { \
  return false;  \
  }  \
+if (!smstateen_fcsr_check(ctx)) {  \
+return false;  \
+}  \


This check is only for Zhinx here. So it's better to take the similar 
way as REQUIRE_ZFINX_OR_F.


Similar to REQUIRE_ZFHMIN_OR_ZHINXMIN.

Regards,

Weiwei Li


  } while (0)
  
  #define REQUIRE_ZFHMIN(ctx) do {  \

  if (!ctx->cfg_ptr->ext_zfhmin) {  \
  return false; \
  } \
+if (!smstateen_fcsr_check(ctx)) { \
+return false; \
+} \
  } while (0)
  
  #define REQUIRE_ZFHMIN_OR_ZHINXMIN(ctx) do { \

  if (!(ctx->cfg_ptr->ext_zfhmin || ctx->cfg_ptr->ext_zhinxmin)) { \
  return false;\
  }\
+if (!smstateen_fcsr_check(ctx)) {\
+return false;\
+}\
  } while (0)
  
  static bool trans_flh(DisasContext

Re: [RFC PATCH v2 2/4] target/riscv: Reuse TB_FLAGS.MSTATUS_HFS_FS




On 2023/4/15 00:02, Mayuresh Chitale wrote:

When misa.F is clear, TB_FLAGS.MSTATUS_HS_FS field is unused and can
be used to save the current state of smstateen0.FCSR check which is
needed by the floating point translation routines.

Signed-off-by: Mayuresh Chitale 
---
  target/riscv/cpu_helper.c | 12 
  target/riscv/translate.c  |  7 +++
  2 files changed, 19 insertions(+)

diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index 433ea529b0..fd1731cc39 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -105,6 +105,18 @@ void cpu_get_tb_cpu_state(CPURISCVState *env, target_ulong 
*pc,
  flags = FIELD_DP32(flags, TB_FLAGS, MSTATUS_HS_VS,
 get_field(env->mstatus_hs, MSTATUS_VS));
  }
+/*
+ * If misa.F is 0 then the MSTATUS_HS_FS field of the tb->flags
+ * can be used to pass the current state of the smstateen.FCSR bit
+ * which must be checked for in the floating point translation routines
+ */
+if (!riscv_has_ext(env, RVF)) {
+if (smstateen_acc_ok(env, 0, SMSTATEEN0_FCSR) == RISCV_EXCP_NONE) {
+flags = FIELD_DP32(flags, TB_FLAGS, MSTATUS_HS_FS, 1);
+} else {
+flags = FIELD_DP32(flags, TB_FLAGS, MSTATUS_HS_FS, 0);
+}
+}
  if (cpu->cfg.debug && !icount_enabled()) {
  flags = FIELD_DP32(flags, TB_FLAGS, ITRIGGER, env->itrigger_enabled);
  }
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index d0094922b6..e29bbb8b70 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -79,6 +79,7 @@ typedef struct DisasContext {
  int frm;
  RISCVMXL ol;
  bool virt_inst_excp;
+bool smstateen_fcsr_ok;
  bool virt_enabled;
  const RISCVCPUConfig *cfg_ptr;
  bool hlsx;
@@ -1202,6 +1203,12 @@ static void riscv_tr_init_disas_context(DisasContextBase 
*dcbase, CPUState *cs)
  ctx->itrigger = FIELD_EX32(tb_flags, TB_FLAGS, ITRIGGER);
  ctx->zero = tcg_constant_tl(0);
  ctx->virt_inst_excp = false;
+if (has_ext(ctx, RVF)) {
+ctx->smstateen_fcsr_ok = 1;
+} else {
+ctx->smstateen_fcsr_ok = FIELD_EX32(tb_flags, TB_FLAGS,
+ MSTATUS_HS_FS);


Seems unaligned with tb_flags.

Otherwise, LGTM.

Reviewed-by: Weiwei Li 

Weiwei Li


+}
  }
  
  static void riscv_tr_tb_start(DisasContextBase *db, CPUState *cpu)

Re: [RFC PATCH v2 1/4] target/riscv: smstateen check for fcsr




On 2023/4/15 00:01, Mayuresh Chitale wrote:

If smstateen is implemented and smtateen0.fcsr is clear and misa.F
is off then the floating point operations must return illegal
instruction exception or virtual instruction trap, if relevant.

Signed-off-by: Mayuresh Chitale 
---
  target/riscv/csr.c | 23 +++
  1 file changed, 23 insertions(+)

diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index f4d2dcfdc8..8ae9e95f9f 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -82,6 +82,10 @@ static RISCVException fs(CPURISCVState *env, int csrno)
  !riscv_cpu_cfg(env)->ext_zfinx) {
  return RISCV_EXCP_ILLEGAL_INST;
  }
+
+if (!env->debugger && !riscv_cpu_fp_enabled(env)) {
+return smstateen_acc_ok(env, 0, SMSTATEEN0_FCSR);
+}
  #endif
  return RISCV_EXCP_NONE;
  }
@@ -2081,6 +2085,9 @@ static RISCVException write_mstateen0(CPURISCVState *env, 
int csrno,
target_ulong new_val)
  {
  uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
+if (!riscv_has_ext(env, RVF)) {
+wr_mask |= SMSTATEEN0_FCSR;
+}
  
  return write_mstateen(env, csrno, wr_mask, new_val);

  }
@@ -2117,6 +2124,10 @@ static RISCVException write_mstateen0h(CPURISCVState 
*env, int csrno,
  {
  uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
  
+if (!riscv_has_ext(env, RVF)) {

+wr_mask |= SMSTATEEN0_FCSR;
+}
+


FCSR is bit 1 of mstateen0.  So it seems unnecessary to be added to 
wr_mask for mstateen0h.


Similar to hstateen0h.

Otherwise,  Reviewed-by: Weiwei Li 

Weiwei Li

  return write_mstateenh(env, csrno, wr_mask, new_val);
  }
  
@@ -2154,6 +2165,10 @@ static RISCVException write_hstateen0(CPURISCVState *env, int csrno,

  {
  uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
  
+if (!riscv_has_ext(env, RVF)) {

+wr_mask |= SMSTATEEN0_FCSR;
+}
+
  return write_hstateen(env, csrno, wr_mask, new_val);
  }
  
@@ -2193,6 +2208,10 @@ static RISCVException write_hstateen0h(CPURISCVState *env, int csrno,

  {
  uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
  
+if (!riscv_has_ext(env, RVF)) {

+wr_mask |= SMSTATEEN0_FCSR;
+}
+
  return write_hstateenh(env, csrno, wr_mask, new_val);
  }
  
@@ -2240,6 +2259,10 @@ static RISCVException write_sstateen0(CPURISCVState *env, int csrno,

  {
  uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
  
+if (!riscv_has_ext(env, RVF)) {

+wr_mask |= SMSTATEEN0_FCSR;
+}
+
  return write_sstateen(env, csrno, wr_mask, new_val);
  }

Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory

On Fri, Apr 14, 2023, Sean Christopherson wrote:
> On Fri, Apr 14, 2023, Ackerley Tng wrote:
> > Sean Christopherson  writes:
> > >   if (WARN_ON_ONCE(file->private_data)) {
> > >   err = -EEXIST;
> > >   goto err_fd;
> > >   }
> > 
> > Did you intend this as a check that the backing filesystem isn't using
> > the private_data field in the mapping?
> >
> > I think you meant file->f_mapping->private_data.
> 
> Ya, sounds right.  I should have added disclaimers that (a) I wrote this quite
> quickly and (b) it's compile tested only at this point.

FWIW, here's a very lightly tested version that doesn't explode on a basic 
selftest.

https://github.com/sean-jc/linux/tree/x86/upm_base_support

Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory

On Fri, Apr 14, 2023, Ackerley Tng wrote:
> Sean Christopherson  writes:
> 
> > On Thu, Apr 13, 2023, Christian Brauner wrote:
> > > * by a mount option to tmpfs that makes it act
> > >in this restricted manner then you don't need an ioctl() and can get
> > >away with regular open calls. Such a tmpfs instance would only create
> > >regular, restricted memfds.
> 
> > I'd prefer to not go this route, becuase IIUC, it would require relatively
> > invasive changes to shmem code, and IIUC would require similar changes to
> > other support backings in the future, e.g. hugetlbfs?  And as above, I
> > don't think any of the potential use cases need restrictedmem to be a
> > uniquely identifiable mount.
> 
> FWIW, I'm starting to look at extending restrictedmem to hugetlbfs and
> the separation that the current implementation has is very helpful. Also
> helps that hugetlbfs and tmpfs are structured similarly, I guess.
> 
> > One of the goals (hopefully not a pipe dream) is to design restrictmem in
> > such a way that extending it to support other backing types isn't terribly
> > difficult.  In case it's not obvious, most of us working on this stuff
> > aren't filesystems experts, and many of us aren't mm experts either.  The
> > more we (KVM folks for the most part) can leverage existing code to do the
> > heavy lifting, the better.
> 
> > After giving myself a bit of a crash course in file systems, would
> > something like the below have any chance of (a) working, (b) getting
> > merged, and (c) being maintainable?
> 
> > The idea is similar to a stacking filesystem, but instead of stacking,
> > restrictedmem hijacks a f_ops and a_ops to create a lightweight shim around
> > tmpfs.  There are undoubtedly issues and edge cases, I'm just looking for a
> > quick "yes, this might be doable" or a "no, that's absolutely bonkers,
> > don't try it".
> 
> Not an FS expert by any means, but I did think of approaching it this
> way as well!
> 
> "Hijacking" perhaps gives this approach a bit of a negative connotation.

Heh, commandeer then.

> I thought this is pretty close to subclassing (as in Object
> Oriented Programming). When some methods (e.g. fallocate) are called,
> restrictedmem does some work, and calls the same method in the
> superclass.
> 
> The existing restrictedmem code is a more like instantiating an shmem
> object and keeping that object as a field within the restrictedmem
> object.
> 
> Some (maybe small) issues I can think of now:
> 
> (1)
> 
> One difficulty with this approach is that other functions may make
> assumptions about private_data being of a certain type, or functions may
> use private_data.
> 
> I checked and IIUC neither shmem nor hugetlbfs use the private_data
> field in the inode's i_mapping (also file's f_mapping).
> 
> But there's fs/buffer.c which uses private_data, although those
> functions seem to be used by FSes like ext4 and fat, not memory-backed
> FSes.
> 
> We can probably fix this if any backing filesystems of restrictedmem,
> like tmpfs and future ones use private_data.

Ya, if we go the route of poking into f_ops and stuff, I would want to add
WARN_ON_ONCE() hardening of everything that restrictemem wants to "commandeer" 
;-)

> > static int restrictedmem_file_create(struct file *file)
> > {
> > struct address_space *mapping = file->f_mapping;
> > struct restrictedmem *rm;
> 
> > rm = kzalloc(sizeof(*rm), GFP_KERNEL);
> > if (!rm)
> > return -ENOMEM;
> 
> > rm->backing_f_ops = file->f_op;
> > rm->backing_a_ops = mapping->a_ops;
> > rm->file = file;
> 
> We don't really need to do this, since rm->file is already the same as
> file, we could just pass the file itself when it's needed

Aha!  I was working on getting rid of it, but forgot to go back and do another
pass.

> > init_rwsem(>lock);
> > xa_init(>bindings);
> 
> > file->f_flags |= O_LARGEFILE;
> 
> > file->f_op = _fops;
> > mapping->a_ops = _aops;
> 
> I think we probably have to override inode_operations as well, because
> otherwise other methods would become available to a restrictedmem file
> (like link, unlink, mkdir, tmpfile). Or maybe that's a feature instead
> of a bug.

I think we want those?  What we want to restrict are operations that require
read/write/execute access to the file, everything else should be ok. fallocate()
is a special case because restrictmem needs to tell KVM to unmap the memory when
a hole is punched.  I assume ->setattr() needs similar treatment to handle
ftruncate()?

I'd love to hear Christian's input on this aspect of things.

> > if (WARN_ON_ONCE(file->private_data)) {
> > err = -EEXIST;
> > goto err_fd;
> > }
> 
> Did you intend this as a check that the backing filesystem isn't using
> the private_data field in the mapping?
>
> I think you meant file->f_mapping->private_data.

Ya, sounds right.  I should have added disclaimers that (a) I wrote this quite
quickly and (b) it's compile

Re: [PATCH v7 00/14] KVM: mm: fd-based approach for supporting KVM guest private memory

2023-04-14 Thread Ackerley Tng

Sean Christopherson  writes:

On Thu, Apr 13, 2023, Christian Brauner wrote:

On Thu, Aug 18, 2022 at 04:24:21PM +0300, Kirill A . Shutemov wrote:
> On Wed, Aug 17, 2022 at 10:40:12PM -0700, Hugh Dickins wrote:
> > Here's what I would prefer, and imagine much easier for you to  
maintain;

> > but I'm no system designer, and may be misunderstanding throughout.
> >
> > QEMU gets fd from opening /dev/kvm_something, uses ioctls (or perhaps
> > the fallocate syscall interface itself) to allocate and free the  
memory,
> > ioctl for initializing some of it too.  KVM in control of whether  
that
> > fd can be read or written or mmap'ed or whatever, no need to prevent  
it
> > in shmem.c, no need for flags, seals, notifications to and fro  
because
> > KVM is already in control and knows the history.  If shmem actually  
has
> > value, call into it underneath - somewhat like SysV SHM, and  
/dev/zero
> > mmap, and i915/gem make use of it underneath.  If shmem has nothing  
to

> > add, just allocate and free kernel memory directly, recorded in your
> > own xarray.
>
> I guess shim layer on top of shmem *can* work. I don't see immediately  
why
> it would not. But I'm not sure it is right direction. We risk creating  
yet

> another parallel VM with own rules/locking/accounting that opaque to
> core-mm.

Sorry for necrobumping this thread but I've been reviewing the

No worries, I'm just stoked someone who actually knows what they're doing  
is

chiming in :-)

+1, thanks Christian!

memfd_restricted() extension that Ackerley is currently working on. I
was pointed to this thread as this is what the extension is building
on but I'll reply to both threads here.

 From a glance at v10, memfd_restricted() is currently implemented as an
in-kernel stacking filesystem. A call to memfd_restricted() creates a
new restricted memfd file and a new unlinked tmpfs file and stashes the
tmpfs file into the memfd file's private data member. It then uses the
tmpfs file's f_ops and i_ops to perform the relevant file and inode
operations. So it has the same callstack as a general stacking
filesystem like overlayfs in some cases:

 memfd_restricted->getattr()
 -> tmpfs->getattr()

...

Since you're effectively acting like a stacking filesystem you should
really use the device number of your memfd restricted filesystem. IOW,
sm like:

 stat->dev = memfd_restricted_dentry->d_sb->s_dev;

But then you run into trouble if you want to go forward with Ackerley's
extension that allows to explicitly pass in tmpfs fds to
memfd_restricted(). Afaict, two tmpfs instances might allocate the same
inode number. So now the inode and device number pair isn't unique
anymore.

So you might best be served by allocating and reporting your own inode
numbers as well.

But if you want to preserve the inode number and device number of the
relevant tmpfs instance but still report memfd restricted as your
filesystem type

Unless I missed something along the way, reporting memfd_restricted as a  
distinct
filesystem is very much a non-goal.  AFAIK it's purely a side effect of  
the

proposed implementation.

then I think it's reasonable to ask whether a stacking implementation  
really

makes sense here.

If you extend memfd_restricted() or even consider extending it in the
future to take tmpfs file descriptors as arguments to identify the tmpfs
instance in which to allocate the underlying tmpfs file for the new
restricted memfd file you should really consider a tmpfs based
implementation.

Because at that point it just feels like a pointless wrapper to get
custom f_ops and i_ops. Plus it's wasteful because you allocate dentries
and inodes that you don't really care about at all.

Just off the top of my hat you might be better served:
* by a new ioctl() on tmpfs instances that
   yield regular tmpfs file descriptors with restricted f_ops and i_ops.
   That's not that different from btrfs subvolumes which effectively are
   directories but are created through an ioctl().

I think this is more or less what we want to do, except via a dedicated  
syscall
instead of an ioctl() so that the primary interface isn't strictly tied  
to tmpfs,

e.g. so that it can be extended to other backing types in the future.

* by a mount option to tmpfs that makes it act
   in this restricted manner then you don't need an ioctl() and can get
   away with regular open calls. Such a tmpfs instance would only create
   regular, restricted memfds.

I'd prefer to not go this route, becuase IIUC, it would require  
relatively invasive
changes to shmem code, and IIUC would require similar changes to other  
support
backings in the future, e.g. hugetlbfs?  And as above, I don't think any  
of the

potential use cases need restrictedmem to be a uniquely identifiable
mount.

FWIW, I'm starting to look at extending restrictedmem to hugetlbfs and
the separation that the current implementation has is very helpful. Also
helps that

[PATCH v13 0/3] Add support for TPM devices over I2C bus



Hello,
Incorporated review comments from Corey Minyard. Please review.

This drop adds support for the TPM devices attached to the I2C bus. It
only supports the TPM2 protocol. You need to run it with the external
TPM emulator like swtpm. I have tested it with swtpm.

I have refered to the work done by zhdan...@meta.com but at the core
level out implementation is different.
https://github.com/theopolis/qemu/commit/2e2e57cde9e419c36af8071bb85392ad1ed70966

Based-on: $MESSAGE_ID

Ninad Palsule (3):
  docs: Add support for TPM devices over I2C bus
  tpm: Extend common APIs to support TPM TIS I2C
  tpm: Add support for TPM device over I2C bus

 docs/specs/tpm.rst  |  21 ++
 hw/arm/Kconfig  |   1 +
 hw/tpm/Kconfig  |   7 +
 hw/tpm/meson.build  |   1 +
 hw/tpm/tpm_tis.h|   3 +
 hw/tpm/tpm_tis_common.c |  36 ++-
 hw/tpm/tpm_tis_i2c.c| 571 
 hw/tpm/trace-events |   6 +
 include/hw/acpi/tpm.h   |  41 +++
 include/sysemu/tpm.h|   3 +
 10 files changed, 682 insertions(+), 8 deletions(-)
 create mode 100644 hw/tpm/tpm_tis_i2c.c

-- 
2.37.2

[PATCH v13 1/3] docs: Add support for TPM devices over I2C bus

From: Ninad Palsule 

This is a documentation change for I2C TPM device support.

Qemu already supports devices attached to ISA and sysbus.
This drop adds support for the I2C bus attached TPM devices.

Signed-off-by: Ninad Palsule 
Reviewed-by: Stefan Berger 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Joel Stanley 

---
V2:

Incorporated Stephen's review comments
- Added example in the document.

---
V4:
Incorporate Cedric & Stefan's comments

- Added example for ast2600-evb
- Corrected statement about arm virtual machine.

---
V6:
Incorporated review comments from Stefan.

---
V8:

Incorporate review comments from Joel and Stefan

- Removed the rainier example
- Added step required to configure on ast2600-evb
---
 docs/specs/tpm.rst | 21 +
 1 file changed, 21 insertions(+)

diff --git a/docs/specs/tpm.rst b/docs/specs/tpm.rst
index 535912a92b..efe124a148 100644
--- a/docs/specs/tpm.rst
+++ b/docs/specs/tpm.rst
@@ -21,12 +21,16 @@ QEMU files related to TPM TIS interface:
  - ``hw/tpm/tpm_tis_common.c``
  - ``hw/tpm/tpm_tis_isa.c``
  - ``hw/tpm/tpm_tis_sysbus.c``
+ - ``hw/tpm/tpm_tis_i2c.c``
  - ``hw/tpm/tpm_tis.h``
 
 Both an ISA device and a sysbus device are available. The former is
 used with pc/q35 machine while the latter can be instantiated in the
 Arm virt machine.
 
+An I2C device support is also provided which can be instantiated in the Arm
+based emulation machines. This device only supports the TPM 2 protocol.
+
 CRB interface
 -
 
@@ -348,6 +352,23 @@ In case an Arm virt machine is emulated, use the following 
command line:
 -drive if=pflash,format=raw,file=flash0.img,readonly=on \
 -drive if=pflash,format=raw,file=flash1.img
 
+In case a ast2600-evb bmc machine is emulated and you want to use a TPM device
+attached to I2C bus, use the following command line:
+
+.. code-block:: console
+
+  qemu-system-arm -M ast2600-evb -nographic \
+-kernel arch/arm/boot/zImage \
+-dtb arch/arm/boot/dts/aspeed-ast2600-evb.dtb \
+-initrd rootfs.cpio \
+-chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
+-tpmdev emulator,id=tpm0,chardev=chrtpm \
+-device tpm-tis-i2c,tpmdev=tpm0,bus=aspeed.i2c.bus.12,address=0x2e
+
+  For testing, use this command to load the driver to the correct address
+
+  echo tpm_tis_i2c 0x2e > /sys/bus/i2c/devices/i2c-12/new_device
+
 In case SeaBIOS is used as firmware, it should show the TPM menu item
 after entering the menu with 'ESC'.
 
-- 
2.37.2

[PATCH v13 2/3] tpm: Extend common APIs to support TPM TIS I2C

From: Ninad Palsule 

Qemu already supports devices attached to ISA and sysbus. This drop adds
support for the I2C bus attached TPM devices.

This commit includes changes for the common code.
- Added support for the new checksum registers which are required for
  the I2C support. The checksum calculation is handled in the qemu
  common code.
- Added wrapper function for read and write data so that I2C code can
  call it without MMIO interface.

The TPM TIS I2C spec describes in the table in section "Interface Locality
Usage per Register" that the TPM_INT_ENABLE and TPM_INT_STATUS registers
must be writable for any locality even if the locality is not the active
locality. Therefore, remove the checks whether the writing locality is the
active locality for these registers.

Signed-off-by: Ninad Palsule 
Signed-off-by: Stefan Berger 
Reviewed-by: Stefan Berger 
Tested-by: Stefan Berger 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Joel Stanley 
Tested-by: Joel Stanley 

---
V2:

Incorporated Stephen's comments.

- Removed checksum enable and checksum get registers.
- Added checksum calculation function which can be called from
  i2c layer.

---
V3:
Incorporated review comments from Cedric and Stefan.

- Pass locality to the checksum calculation function and cleanup
- Moved I2C related definations in the acpi/tpm.h

---
V4:

Incorporated review comments by Stefan

- Remove the check for locality while calculating checksum
- Use bswap16 instead of cpu_ti_be16.
- Rename TPM_I2C register by dropping _TIS_ from it.

---
V7:

Incorporated review comments from Stefan.

- Removed locality check from INT_ENABLE and INT_STATUS registers write
  path.
- Moved TPM_DATA_CSUM_ENABLED define in the tpm.h

---
V8:
Incorporated review comments from Stefan

- Moved the INT_ENABLE mask to tpm.h file.

---
V12:
Incorporated review comments from Stefan.
- Moved STS read/write mask to tpm.h
---
 hw/tpm/tpm_tis.h|  3 +++
 hw/tpm/tpm_tis_common.c | 36 
 include/hw/acpi/tpm.h   | 41 +
 3 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/hw/tpm/tpm_tis.h b/hw/tpm/tpm_tis.h
index f6b5872ba6..6f29a508dd 100644
--- a/hw/tpm/tpm_tis.h
+++ b/hw/tpm/tpm_tis.h
@@ -86,5 +86,8 @@ int tpm_tis_pre_save(TPMState *s);
 void tpm_tis_reset(TPMState *s);
 enum TPMVersion tpm_tis_get_tpm_version(TPMState *s);
 void tpm_tis_request_completed(TPMState *s, int ret);
+uint32_t tpm_tis_read_data(TPMState *s, hwaddr addr, unsigned size);
+void tpm_tis_write_data(TPMState *s, hwaddr addr, uint64_t val, uint32_t size);
+uint16_t tpm_tis_get_checksum(TPMState *s);
 
 #endif /* TPM_TPM_TIS_H */
diff --git a/hw/tpm/tpm_tis_common.c b/hw/tpm/tpm_tis_common.c
index 503be2a541..c07c179dbc 100644
--- a/hw/tpm/tpm_tis_common.c
+++ b/hw/tpm/tpm_tis_common.c
@@ -26,6 +26,8 @@
 #include "hw/irq.h"
 #include "hw/isa/isa.h"
 #include "qapi/error.h"
+#include "qemu/bswap.h"
+#include "qemu/crc-ccitt.h"
 #include "qemu/module.h"
 
 #include "hw/acpi/tpm.h"
@@ -447,6 +449,23 @@ static uint64_t tpm_tis_mmio_read(void *opaque, hwaddr 
addr,
 return val;
 }
 
+/*
+ * A wrapper read function so that it can be directly called without
+ * mmio.
+ */
+uint32_t tpm_tis_read_data(TPMState *s, hwaddr addr, unsigned size)
+{
+return tpm_tis_mmio_read(s, addr, size);
+}
+
+/*
+ * Calculate current data buffer checksum
+ */
+uint16_t tpm_tis_get_checksum(TPMState *s)
+{
+return bswap16(crc_ccitt(0, s->buffer, s->rw_offset));
+}
+
 /*
  * Write a value to a register of the TIS interface
  * See specs pages 33-63 for description of the registers
@@ -588,10 +607,6 @@ static void tpm_tis_mmio_write(void *opaque, hwaddr addr,
 
 break;
 case TPM_TIS_REG_INT_ENABLE:
-if (s->active_locty != locty) {
-break;
-}
-
 s->loc[locty].inte &= mask;
 s->loc[locty].inte |= (val & (TPM_TIS_INT_ENABLED |
 TPM_TIS_INT_POLARITY_MASK |
@@ -601,10 +616,6 @@ static void tpm_tis_mmio_write(void *opaque, hwaddr addr,
 /* hard wired -- ignore */
 break;
 case TPM_TIS_REG_INT_STATUS:
-if (s->active_locty != locty) {
-break;
-}
-
 /* clearing of interrupt flags */
 if (((val & TPM_TIS_INTERRUPTS_SUPPORTED)) &&
 (s->loc[locty].ints & TPM_TIS_INTERRUPTS_SUPPORTED)) {
@@ -767,6 +778,15 @@ static void tpm_tis_mmio_write(void *opaque, hwaddr addr,
 }
 }
 
+/*
+ * A wrapper write function so that it can be directly called without
+ * mmio.
+ */
+void tpm_tis_write_data(TPMState *s, hwaddr addr, uint64_t val, uint32_t size)
+{
+tpm_tis_mmio_write(s, addr, val, size);
+}
+
 const MemoryRegionOps tpm_tis_memory_ops = {
 .read = tpm_tis_mmio_read,
 .write = tpm_tis_mmio_write,
diff --git a/include/hw/acpi/tpm.h b/include/hw/acpi/tpm.h
index 559ba6906c..579c45f5ba 100644
--- a/include/hw/acpi/tpm.h
+++ b/include/hw/acpi/tpm.h
@@ -93,6

[PATCH v13 3/3] tpm: Add support for TPM device over I2C bus

From: Ninad Palsule 

Qemu already supports devices attached to ISA and sysbus. This drop adds
support for the I2C bus attached TPM devices. I2C model only supports
TPM2 protocol.

This commit includes changes for the common code.
- Added I2C emulation model. Logic was added in the model to temporarily
  cache the data as I2C interface works per byte basis.
- New tpm type "tpm-tis-i2c" added for I2C support. The user has to
  provide this string on command line.

Testing:
  TPM I2C device module is tested using SWTPM (software based TPM
  package). Qemu uses the rainier machine and is connected to swtpm over
  the socket interface.

  The command to start swtpm is as follows:
  $ swtpm socket --tpmstate dir=/tmp/mytpm1\
 --ctrl type=unixio,path=/tmp/mytpm1/swtpm-sock  \
 --tpm2 --log level=100

  The command to start qemu is as follows:
  $ qemu-system-arm -M rainier-bmc -nographic \
-kernel ${IMAGEPATH}/fitImage-linux.bin \
-dtb ${IMAGEPATH}/aspeed-bmc-ibm-rainier.dtb \
-initrd ${IMAGEPATH}/obmc-phosphor-initramfs.rootfs.cpio.xz \
-drive 
file=${IMAGEPATH}/obmc-phosphor-image.rootfs.wic.qcow2,if=sd,index=2 \
-net nic -net 
user,hostfwd=:127.0.0.1:-:22,hostfwd=:127.0.0.1:2443-:443 \
-chardev socket,id=chrtpm,path=/tmp/mytpm1/swtpm-sock \
-tpmdev emulator,id=tpm0,chardev=chrtpm \
-device tpm-tis-i2c,tpmdev=tpm0,bus=aspeed.i2c.bus.12,address=0x2e

Signed-off-by: Ninad Palsule 
Reviewed-by: Stefan Berger 
Tested-by: Stefan Berger 
Reviewed-by: Cédric Le Goater 
Reviewed-by: Joel Stanley 
Tested-by: Joel Stanley 
---
V2:
Incorporated Stephen's review comments.
- Handled checksum related register in I2C layer
- Defined I2C interface capabilities and return those instead of
  capabilities from TPM TIS. Add required capabilities from TIS.
- Do not cache FIFO data in the I2C layer.
- Make sure that Device address change register is not passed to I2C
  layer as capability indicate that it is not supported.
- Added boundary checks.
- Make sure that bits 26-31 are zeroed for the TPM_STS register on read
- Updated Kconfig files for new define.

---
V3:
- Moved processing of register TPM_I2C_LOC_SEL in the I2C. So I2C layer
  remembers the locality and pass it to TIS on each read/write.
- The write data is no more cached in the I2C layer so the buffer size
  is reduced to 16 bytes.
- Checksum registers are now managed by the I2C layer. Added new
  function in TIS layer to return the checksum and used that to process
  the request.
- Now 2-4 byte register value will be passed to TIS layer in a single
  write call instead of 1 byte at a time. Added functions to convert
  between little endian stream of bytes to single 32 bit unsigned
  integer. Similarly 32  bit integer to stream of bytes.
- Added restriction on device change register.
- Replace few if-else statement with switch statement for clarity.
- Log warning when unknown register is received.
- Moved all register definations to acpi/tmp.h

---
V4:
Incorporated review comments from Cedric and Stefan.
- Reduced data[] size from 16 byte to 5 bytes.
- Added register name in the mapping table which can be used for
  tracing.
- Removed the endian conversion functions instead used simple logic
  provided by Stefan.
- Rename I2C registers to reduce the length.
- Added traces for send, recv and event functions. You can turn on trace
  on command line by using "-trace "tpm_tis_i2c*" option.

---
V5:
Fixed issues reported by Stefan's test.
- Added mask for the INT_ENABLE register.
- Use correct TIS register for reading interrupt capability.
- Cleanup how register is converted from I2C to TIS and also saved
  information like tis_addr and register name in the i2cst so that we
  can only convert it once on i2c_send.
- Trace register number for unknown registers.

---
V6:
Fixed review comments from Stefan.
- Fixed some variable size.
- Removed unused variables.
- Added vmstat backin to handle migration.
- Added post load phase to reload tis address and register name.

---
V7:
Incorporated review comments from Stefan.
- Added tpm_tis_i2c_initfn function
- Set the device catagory DEVICE_CATEGORY_MISC.
- Corrected default locality selection.
- Other cleanup. Include file cleanup.

---
V8:
Incorporated review comments from Stefan.
- Removed the irq initialization as linux doesn't support interrupts for
  TPM
- Handle INT_CAPABILITY register in I2C only and return 0 to indicate
  that it is not supported.

---
V9:
- Added copyright
- Added set data function and called it few places.
- Rename function tpm_i2c_interface_capability

---
V10:
- Fixed the copyright text.

---
V11:
- As per specs changed STS register to support read/write in the middle
- Fixed issue in the checksum register

---
V12:
- Added validation for the locality.
- Applied correct mask for STS read and write.

---
V13:
- Improved comment
- Removed extra line.
---
 hw/arm/Kconfig

Re: [PATCH v10 9/9] KVM: Enable and expose KVM_MEM_PRIVATE

On Tue, Mar 28, 2023, Chao Peng wrote:
> On Fri, Mar 24, 2023 at 10:29:25AM +0800, Xiaoyao Li wrote:
> > On 3/24/2023 10:10 AM, Chao Peng wrote:
> > > On Wed, Mar 22, 2023 at 05:41:31PM -0700, Isaku Yamahata wrote:
> > > > On Wed, Mar 08, 2023 at 03:40:26PM +0800,
> > > > Chao Peng  wrote:
> > > > 
> > > > > On Wed, Mar 08, 2023 at 12:13:24AM +, Ackerley Tng wrote:
> > > > > > Chao Peng  writes:
> > > > > > 
> > > > > > > On Sat, Jan 14, 2023 at 12:01:01AM +, Sean Christopherson 
> > > > > > > wrote:
> > > > > > > > On Fri, Dec 02, 2022, Chao Peng wrote:
> > > > > > > +static bool kvm_check_rmem_offset_alignment(u64 offset, u64 gpa)
> > > > > > > +{
> > > > > > > + if (!offset)
> > > > > > > + return true;
> > > > > > > + if (!gpa)
> > > > > > > + return false;
> > > > > > > +
> > > > > > > + return !!(count_trailing_zeros(offset) >= 
> > > > > > > count_trailing_zeros(gpa));
> > > > 
> > > > This check doesn't work expected. For example, offset = 2GB, gpa=4GB
> > > > this check fails.
> > > 
> > > This case is expected to fail as Sean initially suggested[*]:
> > >I would rather reject memslot if the gfn has lesser alignment than
> > >the offset. I'm totally ok with this approach _if_ there's a use case.
> > >Until such a use case presents itself, I would rather be conservative
> > >from a uAPI perspective.
> > > 
> > > I understand that we put tighter restriction on this but if you see such
> > > restriction is really a big issue for real usage, instead of a
> > > theoretical problem, then we can loosen the check here. But at that time
> > > below code is kind of x86 specific and may need improve.
> > > 
> > > BTW, in latest code, I replaced count_trailing_zeros() with fls64():
> > >return !!(fls64(offset) >= fls64(gpa));
> > 
> > wouldn't it be !!(ffs64(offset) <= ffs64(gpa)) ?
> 
> As the function document explains, here we want to return true when
> ALIGNMENT(offset) >= ALIGNMENT(gpa), so '>=' is what we need.
> 
> It's worthy clarifying that in Sean's original suggestion he actually
> mentioned the opposite. He said 'reject memslot if the gfn has lesser
> alignment than the offset', but I wonder this is his purpose, since
> if ALIGNMENT(offset) < ALIGNMENT(gpa), we wouldn't be possible to map
> the page as largepage. Consider we have below config:
> 
>   gpa=2M, offset=1M
> 
> In this case KVM tries to map gpa at 2M as 2M hugepage but the physical
> page at the offset(1M) in private_fd cannot provide the 2M page due to
> misalignment.
> 
> But as we discussed in the off-list thread, here we do find a real use
> case indicating this check is too strict. i.e. QEMU immediately fails
> when launch a guest > 2G memory. For this case QEMU splits guest memory
> space into two slots:
> 
>   Slot#1(ram_below_4G): gpa=0x0, offset=0x0, size=2G
>   Slot#2(ram_above_4G): gpa=4G,  offset=2G,  size=totalsize-2G
> 
> This strict alignment check fails for slot#2 because offset(2G) has less
> alignment than gpa(4G). To allow this, one solution can revert to my
> previous change in kvm_alloc_memslot_metadata() to disallow hugepage
> only when the offset/gpa are not aligned to related page size.
> 
> Sean, How do you think?

I agree, a pure alignment check is too restrictive, and not really what I 
intended
despite past me literally saying that's what I wanted :-)  I think I may have 
also
inverted the "less alignment" statement, but luckily I believe that ends up 
being
a moot point.

The goal is to avoid having to juggle scenarios where KVM wants to create a 
hugepage,
but restrictedmem can't provide one because of a misaligned file offset.  I 
think
the rule we want is that the offset must be aligned to the largest page size 
allowed
by the memslot _size_.  E.g. on x86, if the memslot size is >=1GiB then the 
offset
must be 1GiB or beter, ditto for >=2MiB and >=4KiB (ignoring that 4KiB is 
already a
requirement).

We could loosen that to say the largest size allowed by the memslot, but I don't
think that's worth the effort unless it's trivially easy to implement in code,
e.g. KVM could technically allow a 4KiB aligned offset if the memslot is 2MiB
sized but only 4KiB aligned on the GPA.  I doubt there's a real use case for 
such
a memslot, so I want to disallow that unless it's super easy to implement.

[PATCH 1/2] travis.yml: Add missing clang-10 package to the 'Clang (disable-tcg)' job

2023-04-14 Thread Vaibhav Jain

Since commit 74a1b256d775("configure: Bump minimum Clang version to 10.0") qemu
needs Clang version 10.0 as the minimum version to build qemu with
Clang. However 'focal' ships by default with Clang version 7.0.0 which causes an
error while executing the 'Clang (disable-tcg)' travis job of the form below:


$clang --version

clang version 7.0.0 (tags/RELEASE_700/final)


 ERROR: You need at least GCC v7.4 or Clang v10.0 (or XCode Clang v12.0)

 # QEMU configure log Fri 14 Apr 2023 03:48:22 PM UTC

 # Configured with: '../configure' '--disable-docs' '--disable-tools'
 '--disable-containers' '--disable-tcg' '--enable-kvm' '--disable-tools'
 '--enable-fdt=system' '--host-cc=clang' '--cxx=clang++'

Fix this by adding 'clang-10' to the 'apt_packages' section of the "[s390x]
Clang (disable-tcg)" job and updating the compiler to 'clang-10'.

Signed-off-by: Vaibhav Jain 
---
 .travis.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index cf088ba4cf..11894eb810 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -243,7 +243,7 @@ jobs:
 - name: "[s390x] Clang (disable-tcg)"
   arch: s390x
   dist: focal
-  compiler: clang
+  compiler: clang-10
   addons:
 apt_packages:
   - libaio-dev
@@ -269,6 +269,7 @@ jobs:
   - libvdeplug-dev
   - libvte-2.91-dev
   - ninja-build
+  - clang-10
   env:
 - TEST_CMD="make check-unit"
 - CONFIG="--disable-containers --disable-tcg --enable-kvm 
--disable-tools
-- 
2.39.2

[PATCH 2/2] travis.yml: Add missing 'flex' package to 'GCC (user)' job

2023-04-14 Thread Vaibhav Jain

Since commit fd8171fe52b5e("target/hexagon: import lexer for idef-parser") the
hexagon target uses 'flex' to generate idef-parser. However 'focal' may not have
'flex' pre-installed, consequently following error is seen with travis when
trying to execute the 'GCC (user)' job that also tries to build hexagon user
binary:


export CONFIG="--disable-containers --disable-system"

 Program flex found: NO

../target/hexagon/meson.build:179:4: ERROR: Program 'flex' not found or not
executable


Fix this by explicitly add 'flex' to the list of addon apt-packages for the
'GCC (user)' job.

Signed-off-by: Vaibhav Jain 
---
 .travis.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.travis.yml b/.travis.yml
index 11894eb810..8dc71c294d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -237,6 +237,7 @@ jobs:
   - libglib2.0-dev
   - libgnutls28-dev
   - ninja-build
+  - flex
   env:
 - CONFIG="--disable-containers --disable-system"
 
-- 
2.39.2

Re: [PATCH 1/3] target/arm: Don't allow stage 2 page table walks to downgrade to NS


On 14/4/23 18:04, Peter Maydell wrote:

Bit 63 in a Table descriptor is only the NSTable bit for stage 1
translations; in stage 2 it is RES0.  We were incorrectly looking at
it all the time.

This causes problems if:
  * the stage 2 table descriptor was incorrectly setting the RES0 bit
  * we are doing a stage 2 translation in Secure address space for
a NonSecure stage 1 regime -- in this case we would incorrectly
do an immediate downgrade to NonSecure

A bug elsewhere in the code currently prevents us from getting
to the second situation, but when we fix that it will be possible.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Peter Maydell 
---
  target/arm/ptw.c | 5 +++--
  1 file changed, 3 insertions(+), 2 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 2/2] chardev: Allow setting file chardev input file on the command line

2023-04-14 Thread Marc-André Lureau

On Thu, Apr 13, 2023 at 7:07 PM Peter Maydell 
wrote:

> Our 'file' chardev backend supports both "output from this chardev
> is written to a file" and "input from this chardev should be read
> from a file" (except on Windows). However, you can only set up
> the input file if you're using the QMP interface -- there is no
> command line syntax to do it.
>
> Add command line syntax to allow specifying an input file
> as well as an output file, using a new 'input-path' suboption.
>
> The specific use case I have is that I'd like to be able to
> feed fuzzer reproducer input into qtest without having to use
> '-qtest stdio' and put the input onto stdin. Being able to
> use a file chardev like this:
>  -chardev file,id=repro,path=/dev/null,input-path=repro.txt -qtest
> chardev:repro
> means that stdio is free for use by gdb.
>
> Signed-off-by: Peter Maydell 
>

Reviewed-by: Marc-André Lureau 



> ---
> The "not on Windows" ifdeffery is because qmp_chardev_open_file()
> does something similar; it seems likely to produce a nicer
> error message to catch it at parse time rather than open time.
>

(I wonder if we could actually reduce the win32-specific code, I'll have a
look eventually)

---
>  chardev/char-file.c |  8 
>  chardev/char.c  |  3 +++
>  qemu-options.hx | 10 --
>  3 files changed, 19 insertions(+), 2 deletions(-)
>
> diff --git a/chardev/char-file.c b/chardev/char-file.c
> index 3a7b9caf6f0..263e6da5636 100644
> --- a/chardev/char-file.c
> +++ b/chardev/char-file.c
> @@ -100,6 +100,7 @@ static void qemu_chr_parse_file_out(QemuOpts *opts,
> ChardevBackend *backend,
>  Error **errp)
>  {
>  const char *path = qemu_opt_get(opts, "path");
> +const char *inpath = qemu_opt_get(opts, "input-path");
>  ChardevFile *file;
>
>  backend->type = CHARDEV_BACKEND_KIND_FILE;
> @@ -107,9 +108,16 @@ static void qemu_chr_parse_file_out(QemuOpts *opts,
> ChardevBackend *backend,
>  error_setg(errp, "chardev: file: no filename given");
>  return;
>  }
> +#ifdef _WIN32
> +if (inpath) {
> +error_setg(errp, "chardev: file: input-path not supported on
> Windows");
> +return;
> +}
> +#endif
>  file = backend->u.file.data = g_new0(ChardevFile, 1);
>  qemu_chr_parse_common(opts, qapi_ChardevFile_base(file));
>  file->out = g_strdup(path);
> +file->in = g_strdup(inpath);
>
>  file->has_append = true;
>  file->append = qemu_opt_get_bool(opts, "append", false);
> diff --git a/chardev/char.c b/chardev/char.c
> index e69390601fc..661ad8176a9 100644
> --- a/chardev/char.c
> +++ b/chardev/char.c
> @@ -805,6 +805,9 @@ QemuOptsList qemu_chardev_opts = {
>  },{
>  .name = "path",
>  .type = QEMU_OPT_STRING,
> +},{
> +.name = "input-path",
> +.type = QEMU_OPT_STRING,
>  },{
>  .name = "host",
>  .type = QEMU_OPT_STRING,
> diff --git a/qemu-options.hx b/qemu-options.hx
> index 59bdf67a2c5..31d08c60264 100644
> --- a/qemu-options.hx
> +++ b/qemu-options.hx
> @@ -3360,7 +3360,7 @@ DEF("chardev", HAS_ARG, QEMU_OPTION_chardev,
>  "-chardev
> vc,id=id[[,width=width][,height=height]][[,cols=cols][,rows=rows]]\n"
>  " [,mux=on|off][,logfile=PATH][,logappend=on|off]\n"
>  "-chardev
> ringbuf,id=id[,size=size][,logfile=PATH][,logappend=on|off]\n"
> -"-chardev
> file,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n"
> +"-chardev
> file,id=id,path=path[,input-file=input-file][,mux=on|off][,logfile=PATH][,logappend=on|off]\n"
>  "-chardev
> pipe,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n"
>  #ifdef _WIN32
>  "-chardev
> console,id=id[,mux=on|off][,logfile=PATH][,logappend=on|off]\n"
> @@ -3563,13 +3563,19 @@ The available backends are:
>  Create a ring buffer with fixed size ``size``. size must be a power
>  of two and defaults to ``64K``.
>
> -``-chardev file,id=id,path=path``
> +``-chardev file,id=id,path=path[,input-path=input-path]``
>  Log all traffic received from the guest to a file.
>
>  ``path`` specifies the path of the file to be opened. This file will
>  be created if it does not already exist, and overwritten if it does.
>  ``path`` is required.
>
> +If ``input-path`` is specified, this is the path of a second file
> +which will be used for input. If ``input-path`` is not specified,
> +no input will be available from the chardev.
> +
> +Note that ``input-path`` is not supported on Windows hosts.
> +
>  ``-chardev pipe,id=id,path=path``
>  Create a two-way connection to the guest. The behaviour differs
>  slightly between Windows hosts and other hosts:
> --
> 2.34.1
>
>

Re: [PATCH V9 00/46] Live Update

2023-04-14 Thread Michael Galaxy

Greetings,

For what its worth, our team has been aggressively testing this patch
series and to date have not found any deficiencies or bottlenecks
whatsoever.

Overall, with a very well-tuned system (and linux kernel), we are
getting live update downtimes just above 15 seconds, and the vast
majority of that downtime is kind of wasted time during kexec
(particularly with PCI device probing, if anybody has any tips on that)
and has nothing to do whatsoever with the QEMU-side of things. (We use
this patchset in a PMEM-based configuration which has been working out
extremely well so far).

We also tested "back-to-back" live updates as you would expect to do in
production.

Overall we've done thousands of live updates over the past few months on
many different types of hardware without failure.

Steven has been really responsive in answering some of our usability
questions and we were able to fix those issues.

We will continue our testing throughout the year with more
heavily-loaded workloads, but all in all we would very much be
interested in seeing further reviews on this patch series from others.

*
*---
Tested-by: Michael Galaxy

On 12/7/22 09:48, Steven Sistare wrote:

This series desperately needs review in its intersection with live migration.
The code in other areas has been reviewed and revised multiple times -- thank
you!

David, Juan, can you spare some time to review this? I have done my best to
order
the patches logically (see the labelled groups in this email), and to provide
complete and clear cover letter and commit messages. Can I do anything to
facilitate,
like doing a code walk through via zoom?

And of course, I welcome anyone's feedback.

Here is the original posting.

https://urldefense.com/v3/__https://lore.kernel.org/qemu-devel/1658851843-236870-1-git-send-email-steven.sist...@oracle.com/__;!!GjvTz_vk!U6U0yYHuO4GRyGQUNpn0TQdwlC2QjJfgYC0yE249AuONq8-5rs48pZ6l0K-LOSRHMA9cU2op2U8GC9hU3EAFRUVu$

- Steve

On 7/26/2022 12:09 PM, Steve Sistare wrote:

This version of the live update patch series integrates live update into the
live migration framework. The new interfaces are:
* mode (migration parameter)
* cpr-exec-args (migration parameter)
* file (migration URI)
* migrate-mode-enable (command-line argument)
* only-cpr-capable (command-line argument)

Provide the cpr-exec and cpr-reboot migration modes for live update. These
save and restore VM state, with minimal guest pause time, so that qemu may be
updated to a new version in between. The caller sets the mode parameter
before invoking the migrate or migrate-incoming commands.

In cpr-reboot mode, the migrate command saves state to a file, allowing
one to quit qemu, reboot to an updated kernel, start an updated version of
qemu, and resume via the migrate-incoming command. The caller must specify
a migration URI that writes to and reads from a file. Unlike normal mode,
the use of certain local storage options does not block the migration, but
the caller must not modify guest block devices between the quit and restart.
The guest RAM memory-backend must be shared, and the @x-ignore-shared
migration capability must be set, to avoid saving it to the file. Guest RAM
must be non-volatile across reboot, which can be achieved by backing it with
a dax device, or /dev/shm PKRAM as proposed in
https://urldefense.com/v3/__https://lore.kernel.org/lkml/1617140178-8773-1-git-send-email-anthony.yznaga@oracle.com__;!!GjvTz_vk!U6U0yYHuO4GRyGQUNpn0TQdwlC2QjJfgYC0yE249AuONq8-5rs48pZ6l0K-LOSRHMA9cU2op2U8GC9hU3AKRJQux$
but this is not enforced. The restarted qemu arguments must match those used

to initially start qemu, plus the -incoming option.

The reboot mode supports vfio devices if the caller first suspends the guest,
such as by issuing guest-suspend-ram to the qemu guest agent. The guest
drivers' suspend methods flush outstanding requests and re-initialize the
devices, and thus there is no device state to save and restore. After
issuing migrate-incoming, the caller must issue a system_wakeup command to
resume.

In cpr-exec mode, the migrate command saves state to a file and directly
exec's a new version of qemu on the same host, replacing the original process
while retaining its PID. The caller must specify a migration URI that writes
to and reads from a file, and resumes execution via the migrate-incoming
command. Arguments for the new qemu process are taken from the cpr-exec-args
migration parameter, and must include the -incoming option.

Guest RAM must be backed by a memory backend with share=on, but cannot be
memory-backend-ram. The memory is re-mmap'd in the updated process, so guest
ram is efficiently preserved in place, albeit with new virtual addresses.
In addition, the '-migrate-mode-enable cpr-exec' option is required. This
causes secondary guest ram blocks (those not specified on the command line)
to be allocated by mmap'ing a memfd. The memfds are kept open across exec,
their values

Re: [PATCH 0/3] SDL2 usability fixes

2023-04-14 Thread Volker Rümelin


Am 13.04.23 um 22:43 schrieb Bernhard Beschow:


Am 13. April 2023 17:54:34 UTC schrieb "Volker Rümelin" 
:
I'm trying to use QEMU on Windows hosts for fun and for profit. 
While the GTK
GUI doesn't seem to support OpenGL under Windows the SDL2 GUI does. 
Hence I
used the SDL2 GUI where I ran into several issues of which three 
are fixed in

this series, which are:

* Alt+Tab switches tasks on the host rather than in the guest in 
fullscreen mode

* Alt+F4 closes QEMU rather than a graphical task in the guest
* AltGr keyboard modifier isn't recognized by a Linux guest

More information about each issue is provided in the patches.

Bernhard Beschow (3):
    ui/sdl2: Grab Alt+Tab also in fullscreen mode
    ui/sdl2: Grab Alt+F4 also under Windows
    ui/sdl2-input: Fix AltGr modifier on Windows hosts

   ui/sdl2-input.c | 13 +
   ui/sdl2.c   |  2 ++
   2 files changed, 15 insertions(+)


Hi Bernhard,

Hi Volker,

I don't think these patches are necessary. The AltGr key and the 
keyboard grab was fixed in 2020 with commit 830473455f ("ui/sdl2: 
fix handling of AltGr key on Windows") and a few commits before.
Indeed, this patch addresses the AltGr issue. What I noticed in my 
case is that the AltGr behavior is different, depending on whether 
the *guest* is in graphics mode or not. Pressing AltGr in graphics 
mode issues two key modifiers while only one is issued when the guest 
is in text mode. I'll recheck tomorrow when I have access to a 
Windows host.


Hi Bernhard,

the AltGr behavior depends on the keyboard grab. The AltGr key works 
without keyboard grabbed and it doesn't with keyboard grabbed. That's 
a bug.


Hi Bernhard,

this used to work before SDL2-2.0.16. There was no code for Windows in 
SDL2 to handle keyboard grabs. Version 2.0.16 introduced a Windows low 
level keyboard hook procedure to grab the keyboard. Windows calls this 
callback before the QEMU keyboard hook procedure. This explains the 
observed behavior.


The fix is to disable SDL_SetHint(SDL_HINT_GRAB_KEYBOARD, "1"); for 
Windows. This restores the QEMU keyboard grab for all Windows SDL2 
versions. Your first two patches are also necessary. But I think you 
will need a #ifdef SDL_HINT_ALLOW_ALT_TAB_WHILE_GRABBED for older SDL2 
versions.


I'll write a patch similar to this one after more tests.

diff --git a/ui/sdl2.c b/ui/sdl2.c
index b011e8d759..9f6b959464 100644
--- a/ui/sdl2.c
+++ b/ui/sdl2.c
@@ -855,7 +855,9 @@ static void sdl2_display_init(DisplayState *ds, 
DisplayOptions *o)
 #ifdef SDL_HINT_VIDEO_X11_NET_WM_BYPASS_COMPOSITOR /* only available since SDL 
2.0.8 */
 SDL_SetHint(SDL_HINT_VIDEO_X11_NET_WM_BYPASS_COMPOSITOR, "0");
 #endif
+#ifndef CONFIG_WIN32
 SDL_SetHint(SDL_HINT_GRAB_KEYBOARD, "1");
+#endif
 #ifdef SDL_HINT_ALLOW_ALT_TAB_WHILE_GRABBED
 /* available since SDL 2.0.16 */
 SDL_SetHint(SDL_HINT_ALLOW_ALT_TAB_WHILE_GRABBED, "0");
--
2.35.3

With best regards,
Volker





What about the other two issues? My other two patches override SDL2 
defaults which aren't addressed yet in QEMU AFAICS. The Alt+Tab one 
isn't even Windows-specific.


Something broke in the last few weeks. At the moment my Linux guest 
fails to start on Windows with -display sdl. QEMU locks up a short 
time after the Linux kernel starts.
This doesn't happen for me with 8.0rc4 and latest msys2 environment. 
I'm running with `-accel whpx -vga none -device virtio-vga-gl 
-display sdl,gl=on` and I even get decent OpenGL accelleration when 
the Linux guest is in graphics mode, with wobbly windows etc. 
Sometimes QEMU aborts when it can't map some OpenGL stuff when the 
guest enters graphics mode but once that succeeds it runs absolutely 
stable.



I'll try to find the commit that caused this regression.


I tested with QEMU 7.1 and sometimes it also locks up. This indicates 
that the QEMU code doesn't have an issue.


With best regards,
Volker


Yes, this would be interesting.

Best regards,
Bernhard


With best regards,
Volker

Re: [PATCH 1/2] qtest: Don't assert on "-qtest chardev:myid"

2023-04-14 Thread Marc-André Lureau

On Thu, Apr 13, 2023 at 7:07 PM Peter Maydell 
wrote:

> If the -qtest command line argument is passed a string that says
> "use this chardev for I/O", then it will assert:
>
> $ ./build/clang/qemu-system-i386 -chardev file,path=/dev/null,id=myid
> -qtest chardev:myid
> Unexpected error in qtest_set_chardev() at ../../softmmu/qtest.c:1011:
> qemu-system-i386: Cannot find character device 'qtest'
> Aborted (core dumped)
>
> This is because in qtest_server_init() we assume that when we create
> the chardev with qemu_chr_new() it will always have the name "qtest".
> This is true if qemu_chr_new() had to create a new chardev, but not
> true if one already existed and is being referred to with
> "chardev:myid".
>
> Use the name of the chardev we get back from qemu_chr_new() as the
> string to set the qtest 'chardev' property to, instead of hardcoding
> it to "qtest".
>
> Signed-off-by: Peter Maydell 
>

Reviewed-by: Marc-André Lureau 



> ---
>  softmmu/qtest.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/softmmu/qtest.c b/softmmu/qtest.c
> index 34bd2a33a76..26852996b5b 100644
> --- a/softmmu/qtest.c
> +++ b/softmmu/qtest.c
> @@ -867,7 +867,7 @@ void qtest_server_init(const char *qtest_chrdev, const
> char *qtest_log, Error **
>  }
>
>  qtest = object_new(TYPE_QTEST);
> -object_property_set_str(qtest, "chardev", "qtest", _abort);
> +object_property_set_str(qtest, "chardev", chr->label, _abort);
>  if (qtest_log) {
>  object_property_set_str(qtest, "log", qtest_log, _abort);
>  }
> --
> 2.34.1
>
>

Re: [QEMU][PATCH v3 2/4] hw/net/can: Introduce Xilinx Versal CANFD controller

2023-04-14 Thread Vikram Garhwal


Hi Peter & Francisco,

Apologies for long delay with next version i.e. v4. I was on family 
leave and resumed work two weeks back.


Please see my comments below.

On 12/19/22 8:11 AM, Peter Maydell wrote:

On Wed, 7 Dec 2022 at 02:13, Vikram Garhwal  wrote:

The Xilinx Versal CANFD controller is developed based on SocketCAN, QEMU CAN bus
implementation. Bus connection and socketCAN connection for each CAN module
can be set through command lines.

Signed-off-by: Vikram Garhwal
---
  hw/net/can/meson.build |1 +
  hw/net/can/trace-events|7 +
  hw/net/can/xlnx-versal-canfd.c | 2121 
  include/hw/net/xlnx-versal-canfd.h |   90 ++
  4 files changed, 2219 insertions(+)
  create mode 100644 hw/net/can/xlnx-versal-canfd.c
  create mode 100644 include/hw/net/xlnx-versal-canfd.h




@@ -0,0 +1,2121 @@
+/*
+ * QEMU model of the Xilinx Versal CANFD device.
+ *
+ * This implementation is based on the following datasheet:
+ *https://docs.xilinx.com/v/u/2.0-English/pg223-canfd
+ *
+ * Copyright (c) 2022 AMD Inc.
+ *
+ * Written-by: Vikram Garhwal

Missing space before '<'.


+static void canfd_config_reset(XlnxVersalCANFDState *s)
+{
+
+unsigned int i;
+
+/* Reset all the configuration register. */

"registers"


+for (i = 0; i < R_RX_FIFO_WATERMARK_REGISTER; ++i) {
+register_reset(>reg_info[i]);
+}
+
+canfd_update_irq(s);
+}
+static void store_rx_sequential(XlnxVersalCANFDState *s,
+const qemu_can_frame *frame,
+uint32_t fill_level, uint32_t read_index,
+uint32_t store_location, uint8_t rx_fifo,
+bool rx_fifo_id, uint8_t filter_index)
+{
+int i;
+bool is_canfd_frame;
+uint8_t dlc = frame->can_dlc;
+uint32_t dlc_reg_val = 0;
+uint32_t data_reg_val = 0;
+
+/* Getting RX0/1 fill level */
+if ((fill_level) > rx_fifo - 1) {
+g_autofree char *path = object_get_canonical_path(OBJECT(s));
+
+qemu_log_mask(LOG_GUEST_ERROR, "%s: RX%d Buffer is full. Discarding 
the"
+  " message\n", path, rx_fifo_id);
+
+/* Set the corresponding RF buffer overflow interrupt. */
+if (rx_fifo_id == 0) {
+ARRAY_FIELD_DP32(s->regs, INTERRUPT_STATUS_REGISTER, RXFOFLW, 1);
+} else {
+ARRAY_FIELD_DP32(s->regs, INTERRUPT_STATUS_REGISTER, RXFOFLW_1, 1);
+}
+} else {
+uint16_t rx_timestamp = CANFD_TIMER_MAX -
+ptimer_get_count(s->canfd_timer);
+
+if (rx_timestamp == 0x) {
+ARRAY_FIELD_DP32(s->regs, INTERRUPT_STATUS_REGISTER, TSCNT_OFLW, 
1);
+} else {
+ARRAY_FIELD_DP32(s->regs, TIMESTAMP_REGISTER, TIMESTAMP_CNT,
+ rx_timestamp);
+}
+
+if (rx_fifo_id == 0) {
+ARRAY_FIELD_DP32(s->regs, RX_FIFO_STATUS_REGISTER, FL,
+ fill_level + 1);
+assert(store_location <=
+  R_RB_ID_REGISTER + (s->cfg.rx0_fifo *
+  NUM_REGS_PER_MSG_SPACE));
+} else {
+ARRAY_FIELD_DP32(s->regs, RX_FIFO_STATUS_REGISTER, FL_1,
+ fill_level + 1);
+assert(store_location <=
+  R_RB_ID_REGISTER_1 + (s->cfg.rx1_fifo *
+NUM_REGS_PER_MSG_SPACE));
+}
+
+s->regs[store_location] = frame->can_id;
+
+if (frame->flags == QEMU_CAN_FRMF_TYPE_FD) {
+is_canfd_frame = true;
+
+for (i = 0; i < ARRAY_SIZE(canfd_dlc_array); i++) {
+if (canfd_dlc_array[i] == frame->can_dlc) {
+dlc = 8 + i;
+}
+
+dlc_reg_val = FIELD_DP32(0, RB_DLC_REGISTER, DLC, dlc);
+}
+} else {
+is_canfd_frame = false;
+if (frame->can_dlc > 8) {
+dlc = 8;
+}
+
+dlc_reg_val = FIELD_DP32(0, RB_DLC_REGISTER, DLC, frame->can_dlc);
+}
+
+dlc_reg_val |= FIELD_DP32(0, RB_DLC_REGISTER, FDF, is_canfd_frame);
+dlc_reg_val |= FIELD_DP32(0, RB_DLC_REGISTER, TIMESTAMP, rx_timestamp);
+dlc_reg_val |= FIELD_DP32(0, RB_DLC_REGISTER, MATCHED_FILTER_INDEX,
+  filter_index);
+s->regs[store_location + 1] = dlc_reg_val;
+
+for (i = 0; i <= dlc; i++) {
+data_reg_val = FIELD_DP32(0, RB_DW0_REGISTER, DATA_BYTES3,
+  frame->data[4 * i]);
+data_reg_val |= FIELD_DP32(0, RB_DW0_REGISTER, DATA_BYTES2,
+   frame->data[4 * i + 1]);
+data_reg_val |= FIELD_DP32(0, RB_DW0_REGISTER, DATA_BYTES1,
+   frame->data[4 * i + 2]);
+

Re: [RFC PATCH 0/6] Setting memory policy for restrictedmem file

On Fri, Apr 14, 2023, Michal Hocko wrote:
> On Fri 14-04-23 00:11:49, Ackerley Tng wrote:
> > 3. A more generic fbind(): it seems like this new functionality is
> >really only needed for restrictedmem files, hence a separate,
> >specific syscall was proposed to avoid complexities with handling
> >conflicting policies that may be specified via other syscalls like
> >mbind()
> 
> I do not think it is a good idea to make the syscall restrict mem
> specific.

+1.  IMO, any uAPI that isn't directly related to the fundamental properties of
restricted memory, i.e. isn't truly unique to restrictedmem, should be added as
generic fd-based uAPI.

> History shows that users are much more creative when it comes
> to usecases than us. I do understand that the nature of restricted
> memory is that it is not mapable but memory policies without a mapping
> are a reasonable concept in genereal. After all this just tells where
> the memory should be allocated from. Do we need to implement that for
> any other fs? No, you can safely return EINVAL for anything but
> memfd_restricted fd for now but you shouldn't limit usecases upfront.

I would even go a step further and say that we should seriously reconsider the
design/implemenation of memfd_restricted() if a generic fbind() needs explicit
handling from the restricted memory code.  One of the goals with 
memfd_restricted()
is to rely on the underlying backing store to handle all of the "normal" 
behaviors.

[PATCH 10/12] hw/virtio: add config support to vhost-user-device

To use the generic device the user will need to provide the config
region size via the command line. We also add a notifier so the guest
can be pinged if the remote daemon updates the config.

With these changes:

  -device vhost-user-device-pci,virtio-id=41,num_vqs=2,config_size=8

is equivalent to:

  -device vhost-user-gpio-pci

Signed-off-by: Alex Bennée 
---
 include/hw/virtio/vhost-user-device.h |  1 +
 hw/virtio/vhost-user-device.c | 58 ++-
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/include/hw/virtio/vhost-user-device.h 
b/include/hw/virtio/vhost-user-device.h
index 8d77f06721..cb98e0dcaa 100644
--- a/include/hw/virtio/vhost-user-device.h
+++ b/include/hw/virtio/vhost-user-device.h
@@ -21,6 +21,7 @@ struct VHostUserDevice {
 CharBackend chardev;
 uint16_t virtio_id;
 uint32_t num_vqs;
+uint32_t config_size;
 /* State tracking */
 VhostUserState vhost_user;
 struct vhost_virtqueue *vhost_vq;
diff --git a/hw/virtio/vhost-user-device.c b/hw/virtio/vhost-user-device.c
index bfbf3b29cb..977cfed247 100644
--- a/hw/virtio/vhost-user-device.c
+++ b/hw/virtio/vhost-user-device.c
@@ -117,6 +117,42 @@ static uint64_t vud_get_features(VirtIODevice *vdev,
 return vud->vhost_dev.features & ~(1ULL << VHOST_USER_F_PROTOCOL_FEATURES);
 }
 
+/*
+ * To handle VirtIO config we need to know the size of the config
+ * space. We don't cache the config but re-fetch it from the guest
+ * every time in case something has changed.
+ */
+static void vud_get_config(VirtIODevice *vdev, uint8_t *config)
+{
+VHostUserDevice *vud = VHOST_USER_DEVICE(vdev);
+Error *local_err = NULL;
+
+/*
+ * There will have been a warning during vhost_dev_init, but lets
+ * assert here as nothing will go right now.
+ */
+g_assert(vud->config_size && vud->vhost_user.supports_config == true);
+
+if (vhost_dev_get_config(>vhost_dev, config,
+ vud->config_size, _err)) {
+error_report_err(local_err);
+}
+}
+
+/*
+ * When the daemon signals an update to the config we just need to
+ * signal the guest as we re-read the config on demand above.
+ */
+static int vud_config_notifier(struct vhost_dev *dev)
+{
+virtio_notify_config(dev->vdev);
+return 0;
+}
+
+const VhostDevConfigOps vud_config_ops = {
+.vhost_dev_config_notifier = vud_config_notifier,
+};
+
 static void vud_handle_output(VirtIODevice *vdev, VirtQueue *vq)
 {
 /*
@@ -141,12 +177,21 @@ static int vud_connect(DeviceState *dev)
 {
 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
 VHostUserDevice *vud = VHOST_USER_DEVICE(vdev);
+struct vhost_dev *vhost_dev = >vhost_dev;
 
 if (vud->connected) {
 return 0;
 }
 vud->connected = true;
 
+/*
+ * If we support VHOST_USER_GET_CONFIG we must enable the notifier
+ * so we can ping the guest when it updates.
+ */
+if (vud->vhost_user.supports_config) {
+vhost_dev_set_config_notifier(vhost_dev, _config_ops);
+}
+
 /* restore vhost state */
 if (virtio_device_started(vdev, vdev->status)) {
 vud_start(vdev);
@@ -214,11 +259,20 @@ static void vud_device_realize(DeviceState *dev, Error 
**errp)
 vud->num_vqs = 1; /* reasonable default? */
 }
 
+/*
+ * We can't handle config requests unless we know the size of the
+ * config region, specialisations of the vhost-user-device will be
+ * able to set this.
+ */
+if (vud->config_size) {
+vud->vhost_user.supports_config = true;
+}
+
 if (!vhost_user_init(>vhost_user, >chardev, errp)) {
 return;
 }
 
-virtio_init(vdev, vud->virtio_id, 0);
+virtio_init(vdev, vud->virtio_id, vud->config_size);
 
 /*
  * Disable guest notifiers, by default all notifications will be via the
@@ -271,6 +325,7 @@ static Property vud_properties[] = {
 DEFINE_PROP_CHR("chardev", VHostUserDevice, chardev),
 DEFINE_PROP_UINT16("virtio-id", VHostUserDevice, virtio_id, 0),
 DEFINE_PROP_UINT32("num_vqs", VHostUserDevice, num_vqs, 1),
+DEFINE_PROP_UINT32("config_size", VHostUserDevice, config_size, 0),
 DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -285,6 +340,7 @@ static void vud_class_init(ObjectClass *klass, void *data)
 vdc->realize = vud_device_realize;
 vdc->unrealize = vud_device_unrealize;
 vdc->get_features = vud_get_features;
+vdc->get_config = vud_get_config;
 vdc->set_status = vud_set_status;
 }
 
-- 
2.39.2

[PATCH 12/12] docs/system: add a basic enumeration of vhost-user devices

Signed-off-by: Alex Bennée 
---
 docs/system/devices/vhost-user-rng.rst |  2 ++
 docs/system/devices/vhost-user.rst | 41 ++
 2 files changed, 43 insertions(+)

diff --git a/docs/system/devices/vhost-user-rng.rst 
b/docs/system/devices/vhost-user-rng.rst
index a145d4105c..ead1405326 100644
--- a/docs/system/devices/vhost-user-rng.rst
+++ b/docs/system/devices/vhost-user-rng.rst
@@ -1,3 +1,5 @@
+.. _vhost_user_rng:
+
 QEMU vhost-user-rng - RNG emulation
 ===
 
diff --git a/docs/system/devices/vhost-user.rst 
b/docs/system/devices/vhost-user.rst
index 86128114fa..99b352823e 100644
--- a/docs/system/devices/vhost-user.rst
+++ b/docs/system/devices/vhost-user.rst
@@ -15,6 +15,47 @@ to the guest. The code is mostly boilerplate although each 
device has
 a ``chardev`` option which specifies the ID of the ``--chardev``
 device that connects via a socket to the vhost-user *daemon*.
 
+Each device will have an virtio-mmio and virtio-pci variant. See your
+platform details for what sort of virtio bus to use.
+
+.. list-table:: vhost-user devices
+  :widths: 20 20 60
+  :header-rows: 1
+
+  * - Device
+- Type
+- Notes
+  * - vhost-user-device
+- Generic
+- You must manually specify ``virtio-id`` and the correct ``num_vqs``
+  * - vhost-user-blk
+- Block storage
+-
+  * - vhost-user-fs
+- File based storage driver
+- See https://gitlab.com/virtio-fs/virtiofsd
+  * - vhost-user-scsi
+- SCSI based storage
+- See contrib/vhost-user/scsi
+  * - vhost-user-gpio
+- Proxy gpio pins to host
+- See https://github.com/rust-vmm/vhost-device
+  * - vhost-user-i2c
+- Proxy i2c devices to host
+- See https://github.com/rust-vmm/vhost-device
+  * - vhost-user-input
+- Generic input driver
+- See contrib/vhost-user-input
+  * - vhost-user-rng
+- Entropy driver
+- :ref:`vhost_user_rng`
+  * - vhost-user-gpu
+- GPU driver
+-
+  * - vhost-user-vsock
+- Socket based communication
+- See https://github.com/rust-vmm/vhost-device
+
 vhost-user daemon
 =
 
-- 
2.39.2

[PATCH 02/12] include/hw/virtio: document virtio_notify_config

Signed-off-by: Alex Bennée 
---
 include/hw/virtio/virtio.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index f236e94ca6..22ec098462 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -274,6 +274,13 @@ extern const VMStateInfo virtio_vmstate_info;
 
 int virtio_load(VirtIODevice *vdev, QEMUFile *f, int version_id);
 
+/**
+ * virtio_notify_config() - signal a change to device config
+ * @vdev: the virtio device
+ *
+ * Assuming the virtio device is up (VIRTIO_CONFIG_S_DRIVER_OK) this
+ * will trigger a guest interrupt and update the config version.
+ */
 void virtio_notify_config(VirtIODevice *vdev);
 
 bool virtio_queue_get_notification(VirtQueue *vq);
-- 
2.39.2

[PATCH 05/12] virtio: add generic vhost-user-device

In theory we shouldn't need to repeat so much boilerplate to support
vhost-user backends. This provides a generic vhost-user-device for
which the user needs to provide the few bits of information that
aren't currently provided by the vhost-user protocol. This should
provide a baseline implementation from which the other vhost-user stub
can specialise.

Signed-off-by: Alex Bennée 
---
 include/hw/virtio/vhost-user-device.h |  32 +++
 hw/virtio/vhost-user-device.c | 303 ++
 hw/virtio/meson.build |   2 +
 3 files changed, 337 insertions(+)
 create mode 100644 include/hw/virtio/vhost-user-device.h
 create mode 100644 hw/virtio/vhost-user-device.c

diff --git a/include/hw/virtio/vhost-user-device.h 
b/include/hw/virtio/vhost-user-device.h
new file mode 100644
index 00..8d77f06721
--- /dev/null
+++ b/include/hw/virtio/vhost-user-device.h
@@ -0,0 +1,32 @@
+/*
+ * Vhost-user generic virtio device
+ *
+ * Copyright (c) 2023 Linaro Ltd
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#ifndef QEMU_VHOST_USER_DEVICE_H
+#define QEMU_VHOST_USER_DEVICE_H
+
+#include "hw/virtio/vhost.h"
+#include "hw/virtio/vhost-user.h"
+
+#define TYPE_VHOST_USER_DEVICE "vhost-user-device"
+OBJECT_DECLARE_SIMPLE_TYPE(VHostUserDevice, VHOST_USER_DEVICE)
+
+struct VHostUserDevice {
+VirtIODevice parent;
+/* Properties */
+CharBackend chardev;
+uint16_t virtio_id;
+uint32_t num_vqs;
+/* State tracking */
+VhostUserState vhost_user;
+struct vhost_virtqueue *vhost_vq;
+struct vhost_dev vhost_dev;
+GPtrArray *vqs;
+bool connected;
+};
+
+#endif /* QEMU_VHOST_USER_DEVICE_H */
diff --git a/hw/virtio/vhost-user-device.c b/hw/virtio/vhost-user-device.c
new file mode 100644
index 00..bfbf3b29cb
--- /dev/null
+++ b/hw/virtio/vhost-user-device.c
@@ -0,0 +1,303 @@
+/*
+ * Generic vhost-user stub. This can be used to connect to any
+ * vhost-user backend. All configuration details must be handled by
+ * the vhost-user daemon itself
+ *
+ * Copyright (c) 2023 Linaro Ltd
+ * Author: Alex Bennée 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/virtio-bus.h"
+#include "hw/virtio/vhost-user-device.h"
+#include "qemu/error-report.h"
+
+static void vud_start(VirtIODevice *vdev)
+{
+BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+VHostUserDevice *vud = VHOST_USER_DEVICE(vdev);
+int ret, i;
+
+if (!k->set_guest_notifiers) {
+error_report("binding does not support guest notifiers");
+return;
+}
+
+ret = vhost_dev_enable_notifiers(>vhost_dev, vdev);
+if (ret < 0) {
+error_report("Error enabling host notifiers: %d", -ret);
+return;
+}
+
+ret = k->set_guest_notifiers(qbus->parent, vud->vhost_dev.nvqs, true);
+if (ret < 0) {
+error_report("Error binding guest notifier: %d", -ret);
+goto err_host_notifiers;
+}
+
+vud->vhost_dev.acked_features = vdev->guest_features;
+
+ret = vhost_dev_start(>vhost_dev, vdev, true);
+if (ret < 0) {
+error_report("Error starting vhost-user-device: %d", -ret);
+goto err_guest_notifiers;
+}
+
+/*
+ * guest_notifier_mask/pending not used yet, so just unmask
+ * everything here. virtio-pci will do the right thing by
+ * enabling/disabling irqfd.
+ */
+for (i = 0; i < vud->vhost_dev.nvqs; i++) {
+vhost_virtqueue_mask(>vhost_dev, vdev, i, false);
+}
+
+return;
+
+err_guest_notifiers:
+k->set_guest_notifiers(qbus->parent, vud->vhost_dev.nvqs, false);
+err_host_notifiers:
+vhost_dev_disable_notifiers(>vhost_dev, vdev);
+}
+
+static void vud_stop(VirtIODevice *vdev)
+{
+VHostUserDevice *vud = VHOST_USER_DEVICE(vdev);
+BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
+VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+int ret;
+
+if (!k->set_guest_notifiers) {
+return;
+}
+
+vhost_dev_stop(>vhost_dev, vdev, true);
+
+ret = k->set_guest_notifiers(qbus->parent, vud->vhost_dev.nvqs, false);
+if (ret < 0) {
+error_report("vhost guest notifier cleanup failed: %d", ret);
+return;
+}
+
+vhost_dev_disable_notifiers(>vhost_dev, vdev);
+}
+
+static void vud_set_status(VirtIODevice *vdev, uint8_t status)
+{
+VHostUserDevice *vud = VHOST_USER_DEVICE(vdev);
+bool should_start = virtio_device_should_start(vdev, status);
+
+if (vhost_dev_is_started(>vhost_dev) == should_start) {
+return;
+}
+
+if (should_start) {
+vud_start(vdev);
+} else {
+vud_stop(vdev);
+}
+}
+
+/*
+ * For an implementation where everything is delegated to the backend
+ * we don't do anything other than return the full feature set offered
+ * by the daemon (module the reserved feature bit).
+ */
+static

[PATCH 06/12] virtio: add PCI stub for vhost-user-device

This is all pretty much boilerplate.

Signed-off-by: Alex Bennée 
---
 hw/virtio/vhost-user-device-pci.c | 71 +++
 hw/virtio/meson.build |  1 +
 2 files changed, 72 insertions(+)
 create mode 100644 hw/virtio/vhost-user-device-pci.c

diff --git a/hw/virtio/vhost-user-device-pci.c 
b/hw/virtio/vhost-user-device-pci.c
new file mode 100644
index 00..96bf99d5fd
--- /dev/null
+++ b/hw/virtio/vhost-user-device-pci.c
@@ -0,0 +1,71 @@
+/*
+ * Vhost-user generic virtio device PCI glue
+ *
+ * Copyright (c) 2023 Linaro Ltd
+ * Author: Alex Bennée 
+ *
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include "qemu/osdep.h"
+#include "hw/qdev-properties.h"
+#include "hw/virtio/vhost-user-device.h"
+#include "hw/virtio/virtio-pci.h"
+
+struct VHostUserDevicePCI {
+VirtIOPCIProxy parent_obj;
+VHostUserDevice vdev;
+};
+
+typedef struct VHostUserDevicePCI VHostUserDevicePCI;
+
+#define TYPE_VHOST_USER_DEVICE_PCI "vhost-user-device-pci-base"
+
+DECLARE_INSTANCE_CHECKER(VHostUserDevicePCI,
+ VHOST_USER_DEVICE_PCI,
+ TYPE_VHOST_USER_DEVICE_PCI)
+
+static void vhost_user_device_pci_realize(VirtIOPCIProxy *vpci_dev, Error 
**errp)
+{
+VHostUserDevicePCI *dev = VHOST_USER_DEVICE_PCI(vpci_dev);
+DeviceState *vdev = DEVICE(>vdev);
+
+vpci_dev->nvectors = 1;
+qdev_realize(vdev, BUS(_dev->bus), errp);
+}
+
+static void vhost_user_device_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+VirtioPCIClass *k = VIRTIO_PCI_CLASS(klass);
+PCIDeviceClass *pcidev_k = PCI_DEVICE_CLASS(klass);
+k->realize = vhost_user_device_pci_realize;
+set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
+pcidev_k->vendor_id = PCI_VENDOR_ID_REDHAT_QUMRANET;
+pcidev_k->device_id = 0; /* Set by virtio-pci based on virtio id */
+pcidev_k->revision = 0x00;
+pcidev_k->class_id = PCI_CLASS_COMMUNICATION_OTHER;
+}
+
+static void vhost_user_device_pci_instance_init(Object *obj)
+{
+VHostUserDevicePCI *dev = VHOST_USER_DEVICE_PCI(obj);
+
+virtio_instance_init_common(obj, >vdev, sizeof(dev->vdev),
+TYPE_VHOST_USER_DEVICE);
+}
+
+static const VirtioPCIDeviceTypeInfo vhost_user_device_pci_info = {
+.base_name = TYPE_VHOST_USER_DEVICE_PCI,
+.non_transitional_name = "vhost-user-device-pci",
+.instance_size = sizeof(VHostUserDevicePCI),
+.instance_init = vhost_user_device_pci_instance_init,
+.class_init = vhost_user_device_pci_class_init,
+};
+
+static void vhost_user_device_pci_register(void)
+{
+virtio_pci_types_register(_user_device_pci_info);
+}
+
+type_init(vhost_user_device_pci_register);
diff --git a/hw/virtio/meson.build b/hw/virtio/meson.build
index 43e5fa3f7d..c0a86b94ae 100644
--- a/hw/virtio/meson.build
+++ b/hw/virtio/meson.build
@@ -13,6 +13,7 @@ if have_vhost
 # fixme - this really should be generic
 specific_virtio_ss.add(files('vhost-user.c'))
 softmmu_virtio_ss.add(files('vhost-user-device.c'))
+softmmu_virtio_ss.add(when: 'CONFIG_VIRTIO_PCI', if_true: 
files('vhost-user-device-pci.c'))
   endif
   if have_vhost_vdpa
 specific_virtio_ss.add(files('vhost-vdpa.c', 'vhost-shadow-virtqueue.c'))
-- 
2.39.2

[PATCH 09/12] hw/virtio: derive vhost-user-rng from vhost-user-device

Now we can take advantage of our new base class and make
vhost-user-rng a much simpler boilerplate wrapper.

Signed-off-by: Alex Bennée 
---
 include/hw/virtio/vhost-user-rng.h |  11 +-
 hw/virtio/vhost-user-rng.c | 264 +
 2 files changed, 8 insertions(+), 267 deletions(-)

diff --git a/include/hw/virtio/vhost-user-rng.h 
b/include/hw/virtio/vhost-user-rng.h
index ddd9f01eea..5be2999498 100644
--- a/include/hw/virtio/vhost-user-rng.h
+++ b/include/hw/virtio/vhost-user-rng.h
@@ -12,21 +12,14 @@
 #include "hw/virtio/virtio.h"
 #include "hw/virtio/vhost.h"
 #include "hw/virtio/vhost-user.h"
-#include "chardev/char-fe.h"
+#include "hw/virtio/vhost-user-device.h"
 
 #define TYPE_VHOST_USER_RNG "vhost-user-rng"
 OBJECT_DECLARE_SIMPLE_TYPE(VHostUserRNG, VHOST_USER_RNG)
 
 struct VHostUserRNG {
 /*< private >*/
-VirtIODevice parent;
-CharBackend chardev;
-struct vhost_virtqueue *vhost_vq;
-struct vhost_dev vhost_dev;
-VhostUserState vhost_user;
-VirtQueue *req_vq;
-bool connected;
-
+VHostUserDevice parent;
 /*< public >*/
 };
 
diff --git a/hw/virtio/vhost-user-rng.c b/hw/virtio/vhost-user-rng.c
index efc54cd3fb..1f23442b81 100644
--- a/hw/virtio/vhost-user-rng.c
+++ b/hw/virtio/vhost-user-rng.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 2021 Mathieu Poirier 
  *
- * Implementation seriously tailored on vhost-user-i2c.c
+ * Simple wrapper of the generic vhost-user-device.
  *
  * SPDX-License-Identifier: GPL-2.0-or-later
  */
@@ -13,281 +13,29 @@
 #include "hw/qdev-properties.h"
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/vhost-user-rng.h"
-#include "qemu/error-report.h"
 #include "standard-headers/linux/virtio_ids.h"
 
-static const int feature_bits[] = {
-VIRTIO_F_RING_RESET,
-VHOST_INVALID_FEATURE_BIT
-};
-
-static void vu_rng_start(VirtIODevice *vdev)
-{
-VHostUserRNG *rng = VHOST_USER_RNG(vdev);
-BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
-VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
-int ret;
-int i;
-
-if (!k->set_guest_notifiers) {
-error_report("binding does not support guest notifiers");
-return;
-}
-
-ret = vhost_dev_enable_notifiers(>vhost_dev, vdev);
-if (ret < 0) {
-error_report("Error enabling host notifiers: %d", -ret);
-return;
-}
-
-ret = k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, true);
-if (ret < 0) {
-error_report("Error binding guest notifier: %d", -ret);
-goto err_host_notifiers;
-}
-
-rng->vhost_dev.acked_features = vdev->guest_features;
-ret = vhost_dev_start(>vhost_dev, vdev, true);
-if (ret < 0) {
-error_report("Error starting vhost-user-rng: %d", -ret);
-goto err_guest_notifiers;
-}
-
-/*
- * guest_notifier_mask/pending not used yet, so just unmask
- * everything here. virtio-pci will do the right thing by
- * enabling/disabling irqfd.
- */
-for (i = 0; i < rng->vhost_dev.nvqs; i++) {
-vhost_virtqueue_mask(>vhost_dev, vdev, i, false);
-}
-
-return;
-
-err_guest_notifiers:
-k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, false);
-err_host_notifiers:
-vhost_dev_disable_notifiers(>vhost_dev, vdev);
-}
-
-static void vu_rng_stop(VirtIODevice *vdev)
-{
-VHostUserRNG *rng = VHOST_USER_RNG(vdev);
-BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
-VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
-int ret;
-
-if (!k->set_guest_notifiers) {
-return;
-}
-
-vhost_dev_stop(>vhost_dev, vdev, true);
-
-ret = k->set_guest_notifiers(qbus->parent, rng->vhost_dev.nvqs, false);
-if (ret < 0) {
-error_report("vhost guest notifier cleanup failed: %d", ret);
-return;
-}
-
-vhost_dev_disable_notifiers(>vhost_dev, vdev);
-}
-
-static void vu_rng_set_status(VirtIODevice *vdev, uint8_t status)
-{
-VHostUserRNG *rng = VHOST_USER_RNG(vdev);
-bool should_start = virtio_device_should_start(vdev, status);
-
-if (vhost_dev_is_started(>vhost_dev) == should_start) {
-return;
-}
-
-if (should_start) {
-vu_rng_start(vdev);
-} else {
-vu_rng_stop(vdev);
-}
-}
-
-static uint64_t vu_rng_get_features(VirtIODevice *vdev,
-uint64_t requested_features, Error **errp)
-{
-VHostUserRNG *rng = VHOST_USER_RNG(vdev);
-
-return vhost_get_features(>vhost_dev, feature_bits,
-  requested_features);
-}
-
-static void vu_rng_handle_output(VirtIODevice *vdev, VirtQueue *vq)
-{
-/*
- * Not normally called; it's the daemon that handles the queue;
- * however virtio's cleanup path can call this.
- */
-}
-
-static void vu_rng_guest_notifier_mask(VirtIODevice *vdev, int idx, bool mask)
-{
-VHostUserRNG *rng = VHOST_USER_RNG(vdev);
-
-vhost_virtqueue_mask(>vhost_dev, vdev, idx, mask);
-}
-
-static bool

[PATCH 11/12] hw/virtio: derive vhost-user-gpio from vhost-user-device (!BROKEN)

Now we can take advantage of our new base class and make
vhost-user-gpio a much simpler boilerplate wrapper.

[AJB - and this breaks because of the class init propery hack leading
to the config getting overriden and firing the assert. I need a clean
QOM way to do this derived class]

Signed-off-by: Alex Bennée 
---
 include/hw/virtio/vhost-user-gpio.h |  23 +-
 hw/virtio/vhost-user-gpio.c | 405 ++--
 2 files changed, 17 insertions(+), 411 deletions(-)

diff --git a/include/hw/virtio/vhost-user-gpio.h 
b/include/hw/virtio/vhost-user-gpio.h
index a9d3f9b049..82a2e36c21 100644
--- a/include/hw/virtio/vhost-user-gpio.h
+++ b/include/hw/virtio/vhost-user-gpio.h
@@ -12,33 +12,14 @@
 #include "hw/virtio/virtio.h"
 #include "hw/virtio/vhost.h"
 #include "hw/virtio/vhost-user.h"
-#include "standard-headers/linux/virtio_gpio.h"
-#include "chardev/char-fe.h"
+#include "hw/virtio/vhost-user-device.h"
 
 #define TYPE_VHOST_USER_GPIO "vhost-user-gpio-device"
 OBJECT_DECLARE_SIMPLE_TYPE(VHostUserGPIO, VHOST_USER_GPIO);
 
 struct VHostUserGPIO {
 /*< private >*/
-VirtIODevice parent_obj;
-CharBackend chardev;
-struct virtio_gpio_config config;
-struct vhost_virtqueue *vhost_vqs;
-struct vhost_dev vhost_dev;
-VhostUserState vhost_user;
-VirtQueue *command_vq;
-VirtQueue *interrupt_vq;
-/**
- * There are at least two steps of initialization of the
- * vhost-user device. The first is a "connect" step and
- * second is a "start" step. Make a separation between
- * those initialization phases by using two fields.
- *
- * @connected: see vu_gpio_connect()/vu_gpio_disconnect()
- * @started_vu: see vu_gpio_start()/vu_gpio_stop()
- */
-bool connected;
-bool started_vu;
+VHostUserDevice parent;
 /*< public >*/
 };
 
diff --git a/hw/virtio/vhost-user-gpio.c b/hw/virtio/vhost-user-gpio.c
index 3b013f2d0f..c76af1fc69 100644
--- a/hw/virtio/vhost-user-gpio.c
+++ b/hw/virtio/vhost-user-gpio.c
@@ -11,413 +11,38 @@
 #include "hw/qdev-properties.h"
 #include "hw/virtio/virtio-bus.h"
 #include "hw/virtio/vhost-user-gpio.h"
-#include "qemu/error-report.h"
 #include "standard-headers/linux/virtio_ids.h"
-#include "trace.h"
-
-#define REALIZE_CONNECTION_RETRIES 3
-#define VHOST_NVQS 2
-
-/* Features required from VirtIO */
-static const int feature_bits[] = {
-VIRTIO_F_VERSION_1,
-VIRTIO_F_NOTIFY_ON_EMPTY,
-VIRTIO_RING_F_INDIRECT_DESC,
-VIRTIO_RING_F_EVENT_IDX,
-VIRTIO_GPIO_F_IRQ,
-VIRTIO_F_RING_RESET,
-VHOST_INVALID_FEATURE_BIT
-};
-
-static void vu_gpio_get_config(VirtIODevice *vdev, uint8_t *config)
-{
-VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
-
-memcpy(config, >config, sizeof(gpio->config));
-}
-
-static int vu_gpio_config_notifier(struct vhost_dev *dev)
-{
-VHostUserGPIO *gpio = VHOST_USER_GPIO(dev->vdev);
-
-memcpy(dev->vdev->config, >config, sizeof(gpio->config));
-virtio_notify_config(dev->vdev);
-
-return 0;
-}
-
-const VhostDevConfigOps gpio_ops = {
-.vhost_dev_config_notifier = vu_gpio_config_notifier,
-};
-
-static int vu_gpio_start(VirtIODevice *vdev)
-{
-BusState *qbus = BUS(qdev_get_parent_bus(DEVICE(vdev)));
-VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
-VHostUserGPIO *gpio = VHOST_USER_GPIO(vdev);
-struct vhost_dev *vhost_dev = >vhost_dev;
-int ret, i;
-
-if (!k->set_guest_notifiers) {
-error_report("binding does not support guest notifiers");
-return -ENOSYS;
-}
-
-ret = vhost_dev_enable_notifiers(vhost_dev, vdev);
-if (ret < 0) {
-error_report("Error enabling host notifiers: %d", ret);
-return ret;
-}
-
-ret = k->set_guest_notifiers(qbus->parent, vhost_dev->nvqs, true);
-if (ret < 0) {
-error_report("Error binding guest notifier: %d", ret);
-goto err_host_notifiers;
-}
-
-/*
- * Before we start up we need to ensure we have the final feature
- * set needed for the vhost configuration. The backend may also
- * apply backend_features when the feature set is sent.
- */
-vhost_ack_features(>vhost_dev, feature_bits, vdev->guest_features);
-
-ret = vhost_dev_start(>vhost_dev, vdev, false);
-if (ret < 0) {
-error_report("Error starting vhost-user-gpio: %d", ret);
-goto err_guest_notifiers;
-}
-gpio->started_vu = true;
-
-/*
- * guest_notifier_mask/pending not used yet, so just unmask
- * everything here. virtio-pci will do the right thing by
- * enabling/disabling irqfd.
- */
-for (i = 0; i < gpio->vhost_dev.nvqs; i++) {
-vhost_virtqueue_mask(>vhost_dev, vdev, i, false);
-}
-
-/*
- * As we must have VHOST_USER_F_PROTOCOL_FEATURES (because
- * VHOST_USER_GET_CONFIG requires it) we need to explicitly enable
- * the vrings.
- */
-g_assert(vhost_dev->vhost_ops &&
- vhost_dev->vhost_ops->vhost_set_vring_enable);
-ret =

[PATCH 1/3] target/arm: Don't allow stage 2 page table walks to downgrade to NS

Bit 63 in a Table descriptor is only the NSTable bit for stage 1
translations; in stage 2 it is RES0.  We were incorrectly looking at
it all the time.

This causes problems if:
 * the stage 2 table descriptor was incorrectly setting the RES0 bit
 * we are doing a stage 2 translation in Secure address space for
   a NonSecure stage 1 regime -- in this case we would incorrectly
   do an immediate downgrade to NonSecure

A bug elsewhere in the code currently prevents us from getting
to the second situation, but when we fix that it will be possible.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Peter Maydell 
---
 target/arm/ptw.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index 6d72950a795..06865227642 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -1403,17 +1403,18 @@ static bool get_phys_addr_lpae(CPUARMState *env, 
S1Translate *ptw,
 descaddrmask &= ~indexmask_grainsize;
 
 /*
- * Secure accesses start with the page table in secure memory and
+ * Secure stage 1 accesses start with the page table in secure memory and
  * can be downgraded to non-secure at any step. Non-secure accesses
  * remain non-secure. We implement this by just ORing in the NSTable/NS
  * bits at each step.
+ * Stage 2 never gets this kind of downgrade.
  */
 tableattrs = is_secure ? 0 : (1 << 4);
 
  next_level:
 descaddr |= (address >> (stride * (4 - level))) & indexmask;
 descaddr &= ~7ULL;
-nstable = extract32(tableattrs, 4, 1);
+nstable = !regime_is_stage2(mmu_idx) && extract32(tableattrs, 4, 1);
 if (nstable) {
 /*
  * Stage2_S -> Stage2 or Phys_S -> Phys_NS
-- 
2.34.1

[PATCH 04/12] include/hw/virtio: document some more usage of notifiers

Lets document some more of the core VirtIODevice structure.

Signed-off-by: Alex Bennée 
---
 include/hw/virtio/virtio.h | 8 
 1 file changed, 8 insertions(+)

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 1ba7a9dd74..ef77e9ef0e 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -150,10 +150,18 @@ struct VirtIODevice
 VMChangeStateEntry *vmstate;
 char *bus_name;
 uint8_t device_endian;
+/**
+ * @user_guest_notifier_mask: gate usage of ->guest_notifier_mask() 
callback.
+ * This is used to suppress the masking of guest updates for
+ * vhost-user devices which are asynchronous by design.
+ */
 bool use_guest_notifier_mask;
 AddressSpace *dma_as;
 QLIST_HEAD(, VirtQueue) *vector_queues;
 QTAILQ_ENTRY(VirtIODevice) next;
+/**
+ * @config_notifier: the event notifier that handles config events
+ */
 EventNotifier config_notifier;
 };
 
-- 
2.39.2

[PATCH 03/12] include/hw/virtio: add kerneldoc for virtio_init

Signed-off-by: Alex Bennée 
---
 include/hw/virtio/virtio.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h
index 22ec098462..1ba7a9dd74 100644
--- a/include/hw/virtio/virtio.h
+++ b/include/hw/virtio/virtio.h
@@ -217,6 +217,12 @@ struct VirtioDeviceClass {
 void virtio_instance_init_common(Object *proxy_obj, void *data,
  size_t vdev_size, const char *vdev_name);
 
+/**
+ * virtio_init() - initialise the common VirtIODevice structure
+ * @vdev: pointer to VirtIODevice
+ * @device_id: the VirtIO device ID (see virtio_ids.h)
+ * @config_size: size of the config space
+ */
 void virtio_init(VirtIODevice *vdev, uint16_t device_id, size_t config_size);
 
 void virtio_cleanup(VirtIODevice *vdev);
-- 
2.39.2

[PATCH 07/12] include: attempt to document device_class_set_props

I'm still not sure how I achieve by use case of the parent class
defining the following properties:

  static Property vud_properties[] = {
  DEFINE_PROP_CHR("chardev", VHostUserDevice, chardev),
  DEFINE_PROP_UINT16("id", VHostUserDevice, id, 0),
  DEFINE_PROP_UINT32("num_vqs", VHostUserDevice, num_vqs, 1),
  DEFINE_PROP_END_OF_LIST(),
  };

But for the specialisation of the class I want the id to default to
the actual device id, e.g.:

  static Property vu_rng_properties[] = {
  DEFINE_PROP_UINT16("id", VHostUserDevice, id, VIRTIO_ID_RNG),
  DEFINE_PROP_UINT32("num_vqs", VHostUserDevice, num_vqs, 1),
  DEFINE_PROP_END_OF_LIST(),
  };

And so far the API for doing that isn't super clear.

Signed-off-by: Alex Bennée 
---
 include/hw/qdev-core.h | 9 +
 1 file changed, 9 insertions(+)

diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index bd50ad5ee1..d4bbc30c92 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -776,6 +776,15 @@ BusState *sysbus_get_default(void);
 char *qdev_get_fw_dev_path(DeviceState *dev);
 char *qdev_get_own_fw_dev_path_from_handler(BusState *bus, DeviceState *dev);
 
+/**
+ * device_class_set_props(): add a set of properties to an device
+ * @dc: the parent DeviceClass all devices inherit
+ * @props: an array of properties, terminate by DEFINE_PROP_END_OF_LIST()
+ *
+ * This will add a set of properties to the object. It will fault if
+ * you attempt to add an existing property defined by a parent class.
+ * To modify an inherited property you need to use
+ */
 void device_class_set_props(DeviceClass *dc, Property *props);
 
 /**
-- 
2.39.2

[PATCH 00/12] virtio: add vhost-user-generic and reduce copy and paste

A lot of our vhost-user stubs are large chunks of boilerplate that do
(mostly) the same thing. This series attempts to fix that by defining
a new base class for vhost-user devices and then converting the rng
and gpio devices to be based off them. You can even use
vhost-user-device directly if you supply it with the right magic
numbers (which is helpful for development).

However the final patch runs into the weeds because I don't yet have a
clean way to represent in QOM the fixing of certain properties for the
specialised classes.

The series is a net reduction in code and an increase in
documentation but obviously needs to iron out a few more warts. I'm
open to suggestions on the best way to tweak the QOM stuff.

Alex.

Alex Bennée (12):
  hw/virtio: fix typo in VIRTIO_CONFIG_IRQ_IDX comments
  include/hw/virtio: document virtio_notify_config
  include/hw/virtio: add kerneldoc for virtio_init
  include/hw/virtio: document some more usage of notifiers
  virtio: add generic vhost-user-device
  virtio: add PCI stub for vhost-user-device
  include: attempt to document device_class_set_props
  qom: allow for properties to become "fixed"
  hw/virtio: derive vhost-user-rng from vhost-user-device
  hw/virtio: add config support to vhost-user-device
  hw/virtio: derive vhost-user-gpio from vhost-user-device (!BROKEN)
  docs/system: add a basic enumeration of vhost-user devices

 docs/system/devices/vhost-user-rng.rst |   2 +
 docs/system/devices/vhost-user.rst |  41 +++
 qapi/qom.json  |   2 +
 include/hw/qdev-core.h |   9 +
 include/hw/virtio/vhost-user-device.h  |  33 ++
 include/hw/virtio/vhost-user-gpio.h|  23 +-
 include/hw/virtio/vhost-user-rng.h |  11 +-
 include/hw/virtio/virtio.h |  21 ++
 include/qom/object.h   |  16 +-
 hw/display/vhost-user-gpu.c|   4 +-
 hw/net/virtio-net.c|   4 +-
 hw/virtio/vhost-user-device-pci.c  |  71 +
 hw/virtio/vhost-user-device.c  | 359 ++
 hw/virtio/vhost-user-fs.c  |   4 +-
 hw/virtio/vhost-user-gpio.c| 405 +
 hw/virtio/vhost-user-rng.c | 264 +---
 hw/virtio/vhost-vsock-common.c |   4 +-
 hw/virtio/virtio-crypto.c  |   4 +-
 qom/object.c   |  14 +
 qom/object_interfaces.c|   9 +-
 qom/qom-qmp-cmds.c |   1 +
 softmmu/qdev-monitor.c |   1 +
 hw/virtio/meson.build  |   3 +
 23 files changed, 613 insertions(+), 692 deletions(-)
 create mode 100644 include/hw/virtio/vhost-user-device.h
 create mode 100644 hw/virtio/vhost-user-device-pci.c
 create mode 100644 hw/virtio/vhost-user-device.c

-- 
2.39.2

[PATCH 01/12] hw/virtio: fix typo in VIRTIO_CONFIG_IRQ_IDX comments

Fixes: 544f0278af (virtio: introduce macro VIRTIO_CONFIG_IRQ_IDX)
Signed-off-by: Alex Bennée 
---
 hw/display/vhost-user-gpu.c| 4 ++--
 hw/net/virtio-net.c| 4 ++--
 hw/virtio/vhost-user-fs.c  | 4 ++--
 hw/virtio/vhost-user-gpio.c| 2 +-
 hw/virtio/vhost-vsock-common.c | 4 ++--
 hw/virtio/virtio-crypto.c  | 4 ++--
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/hw/display/vhost-user-gpu.c b/hw/display/vhost-user-gpu.c
index 71dfd956b8..7c61a7c3ac 100644
--- a/hw/display/vhost-user-gpu.c
+++ b/hw/display/vhost-user-gpu.c
@@ -489,7 +489,7 @@ vhost_user_gpu_guest_notifier_pending(VirtIODevice *vdev, 
int idx)
 
 /*
  * Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
- * as the Marco of configure interrupt's IDX, If this driver does not
+ * as the macro of configure interrupt's IDX, If this driver does not
  * support, the function will return
  */
 
@@ -506,7 +506,7 @@ vhost_user_gpu_guest_notifier_mask(VirtIODevice *vdev, int 
idx, bool mask)
 
 /*
  * Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
- * as the Marco of configure interrupt's IDX, If this driver does not
+ * as the macro of configure interrupt's IDX, If this driver does not
  * support, the function will return
  */
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 53e1c32643..c53616a080 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -3359,7 +3359,7 @@ static bool 
virtio_net_guest_notifier_pending(VirtIODevice *vdev, int idx)
 }
 /*
  * Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
- * as the Marco of configure interrupt's IDX, If this driver does not
+ * as the macro of configure interrupt's IDX, If this driver does not
  * support, the function will return false
  */
 
@@ -3391,7 +3391,7 @@ static void virtio_net_guest_notifier_mask(VirtIODevice 
*vdev, int idx,
 }
 /*
  *Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
- * as the Marco of configure interrupt's IDX, If this driver does not
+ * as the macro of configure interrupt's IDX, If this driver does not
  * support, the function will return
  */
 
diff --git a/hw/virtio/vhost-user-fs.c b/hw/virtio/vhost-user-fs.c
index 83fc20e49e..49d699ffc2 100644
--- a/hw/virtio/vhost-user-fs.c
+++ b/hw/virtio/vhost-user-fs.c
@@ -161,7 +161,7 @@ static void vuf_guest_notifier_mask(VirtIODevice *vdev, int 
idx,
 
 /*
  * Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
- * as the Marco of configure interrupt's IDX, If this driver does not
+ * as the macro of configure interrupt's IDX, If this driver does not
  * support, the function will return
  */
 
@@ -177,7 +177,7 @@ static bool vuf_guest_notifier_pending(VirtIODevice *vdev, 
int idx)
 
 /*
  * Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
- * as the Marco of configure interrupt's IDX, If this driver does not
+ * as the macro of configure interrupt's IDX, If this driver does not
  * support, the function will return
  */
 
diff --git a/hw/virtio/vhost-user-gpio.c b/hw/virtio/vhost-user-gpio.c
index d6927b610a..3b013f2d0f 100644
--- a/hw/virtio/vhost-user-gpio.c
+++ b/hw/virtio/vhost-user-gpio.c
@@ -194,7 +194,7 @@ static void vu_gpio_guest_notifier_mask(VirtIODevice *vdev, 
int idx, bool mask)
 
 /*
  * Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
- * as the Marco of configure interrupt's IDX, If this driver does not
+ * as the macro of configure interrupt's IDX, If this driver does not
  * support, the function will return
  */
 
diff --git a/hw/virtio/vhost-vsock-common.c b/hw/virtio/vhost-vsock-common.c
index d2b5519d5a..623bdf91cc 100644
--- a/hw/virtio/vhost-vsock-common.c
+++ b/hw/virtio/vhost-vsock-common.c
@@ -129,7 +129,7 @@ static void 
vhost_vsock_common_guest_notifier_mask(VirtIODevice *vdev, int idx,
 
 /*
  * Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
- * as the Marco of configure interrupt's IDX, If this driver does not
+ * as the macro of configure interrupt's IDX, If this driver does not
  * support, the function will return
  */
 
@@ -146,7 +146,7 @@ static bool 
vhost_vsock_common_guest_notifier_pending(VirtIODevice *vdev,
 
 /*
  * Add the check for configure interrupt, Use VIRTIO_CONFIG_IRQ_IDX -1
- * as the Marco of configure interrupt's IDX, If this driver does not
+ * as the macro of configure interrupt's IDX, If this driver does not
  * support, the function will return
  */
 
diff --git a/hw/virtio/virtio-crypto.c b/hw/virtio/virtio-crypto.c
index 802e1b9659..6b3e607329 100644
--- a/hw/virtio/virtio-crypto.c
+++ b/hw/virtio/virtio-crypto.c
@@ -1208,7 +1208,7 @@ static void 
virtio_crypto_guest_notifier_mask(VirtIODevice *vdev, int idx,
 
 /*
  * Add the

[PATCH 2/3] target/arm: Set ptw->out_secure correctly for stage 2 translations

In S1_ptw_translate(), we are in one of two situations:
 (1) translating an S1 page table walk through S2
 (2) translating an S2 page table walk through a physical regime

In case (1), ptw->in_secure indicates whether S1 is a Secure or
NonSecure translation regime.  In case (2), ptw->in_secure indicates
whether this stage 2 translation is for a Secure IPA or a NonSecure
IPA.  In particular, because of the VTCR_EL2.NSW and VSTCR_EL2.SW
bits, we can be doing a translation of a NonSecure IPA where the page
tables are in the Secure space.

Correct the setting of the ptw->out_secure value for the
regime-is-physical case:
 * it depends on whether s2_mmu_idx is Phys_S or Phys, not
   on the value of is_secure
 * we don't need to look at the VTCR_EL2.NSW and VSTCR.SW bits
   again here, as we already did that in get_phys_addr_twostage()
   to set the correct in_ptw_idx

This error doesn't currently cause a problem in itself because
we are incorrectly setting ptw->in_secure to s2walk_secure in
get_phys_addr_twostage(), but we need to fix this before we can
correct that problem.

Cc: qemu-sta...@nongnu.org
Signed-off-by: Peter Maydell 
---
 target/arm/ptw.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index 06865227642..c1e124df495 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -249,7 +249,7 @@ static bool S1_ptw_translate(CPUARMState *env, S1Translate 
*ptw,
 /* Regime is physical. */
 ptw->out_phys = addr;
 pte_attrs = 0;
-pte_secure = is_secure;
+pte_secure = s2_mmu_idx == ARMMMUIdx_Phys_S;
 }
 ptw->out_host = NULL;
 ptw->out_rw = false;
@@ -291,13 +291,15 @@ static bool S1_ptw_translate(CPUARMState *env, 
S1Translate *ptw,
 fi->s1ns = !is_secure;
 return false;
 }
+/* Check if page table walk is to secure or non-secure PA space. */
+ptw->out_secure = (is_secure
+   && !(pte_secure
+? env->cp15.vstcr_el2 & VSTCR_SW
+: env->cp15.vtcr_el2 & VTCR_NSW));
+} else {
+/* Regime is physical */
+ptw->out_secure = pte_secure;
 }
-
-/* Check if page table walk is to secure or non-secure PA space. */
-ptw->out_secure = (is_secure
-   && !(pte_secure
-? env->cp15.vstcr_el2 & VSTCR_SW
-: env->cp15.vtcr_el2 & VTCR_NSW));
 ptw->out_be = regime_translation_big_endian(env, mmu_idx);
 return true;
 
-- 
2.34.1

[PATCH 08/12] qom: allow for properties to become "fixed"

When specialising general purpose objects it is sometimes useful to
"fix" some of the properties that were configurable by the base
classes. We will use this facility when specialising
vhost-user-device.

Signed-off-by: Alex Bennée 
---
 qapi/qom.json   |  2 ++
 include/qom/object.h| 16 +++-
 qom/object.c| 14 ++
 qom/object_interfaces.c |  9 ++---
 qom/qom-qmp-cmds.c  |  1 +
 softmmu/qdev-monitor.c  |  1 +
 6 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/qapi/qom.json b/qapi/qom.json
index a877b879b9..4cda191f00 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -33,12 +33,14 @@
 # @description: if specified, the description of the property.
 #
 # @default-value: the default value, if any (since 5.0)
+# @fixed: if specified if value has been fixed (since 8.1)
 #
 # Since: 1.2
 ##
 { 'struct': 'ObjectPropertyInfo',
   'data': { 'name': 'str',
 'type': 'str',
+'fixed': 'bool',
 '*description': 'str',
 '*default-value': 'any' } }
 
diff --git a/include/qom/object.h b/include/qom/object.h
index ef7258a5e1..f18d1a8254 100644
--- a/include/qom/object.h
+++ b/include/qom/object.h
@@ -97,6 +97,8 @@ struct ObjectProperty
 ObjectPropertyInit *init;
 void *opaque;
 QObject *defval;
+/** @fixed: if the property has been fixed at its default */
+bool fixed;
 };
 
 /**
@@ -,6 +1113,17 @@ void object_property_set_default_int(ObjectProperty 
*prop, int64_t value);
  */
 void object_property_set_default_uint(ObjectProperty *prop, uint64_t value);
 
+/**
+ * object_property_fix_default_uint:
+ * @prop: the property to be fixed
+ * @value: the fixed value to be written to the property
+ *
+ * When specialising an object it may make send to fix some values and
+ * not allow them to be changed. This can only be applied to
+ * properties that previously had a default and now cannot be changed.
+ */
+void object_property_fix_default_uint(ObjectProperty *prop, uint64_t value);
+
 /**
  * object_property_find:
  * @obj: the object
@@ -1961,13 +1974,14 @@ size_t object_type_get_instance_size(const char 
*typename);
  * object_property_help:
  * @name: the name of the property
  * @type: the type of the property
+ * @fixed: has the value been fixed
  * @defval: the default value
  * @description: description of the property
  *
  * Returns: a user-friendly formatted string describing the property
  * for help purposes.
  */
-char *object_property_help(const char *name, const char *type,
+char *object_property_help(const char *name, const char *type, bool fixed,
QObject *defval, const char *description);
 
 G_DEFINE_AUTOPTR_CLEANUP_FUNC(Object, object_unref)
diff --git a/qom/object.c b/qom/object.c
index e25f1e96db..b5aba3ffc8 100644
--- a/qom/object.c
+++ b/qom/object.c
@@ -1584,6 +1584,20 @@ void object_property_set_default_uint(ObjectProperty 
*prop, uint64_t value)
 object_property_set_default(prop, QOBJECT(qnum_from_uint(value)));
 }
 
+static void object_property_fix_default(ObjectProperty *prop, QObject *defval)
+{
+g_assert(prop->init == object_property_init_defval);
+g_assert(!prop->fixed);
+
+prop->defval = defval;
+prop->fixed = true;
+}
+
+void object_property_fix_default_uint(ObjectProperty *prop, uint64_t value)
+{
+object_property_fix_default(prop, QOBJECT(qnum_from_uint(value)));
+}
+
 bool object_property_set_uint(Object *obj, const char *name,
   uint64_t value, Error **errp)
 {
diff --git a/qom/object_interfaces.c b/qom/object_interfaces.c
index 7d31589b04..e351938f8f 100644
--- a/qom/object_interfaces.c
+++ b/qom/object_interfaces.c
@@ -161,7 +161,7 @@ void user_creatable_add_qapi(ObjectOptions *options, Error 
**errp)
 visit_free(v);
 }
 
-char *object_property_help(const char *name, const char *type,
+char *object_property_help(const char *name, const char *type, bool fixed,
QObject *defval, const char *description)
 {
 GString *str = g_string_new(NULL);
@@ -179,7 +179,9 @@ char *object_property_help(const char *name, const char 
*type,
 if (defval) {
 g_autofree char *def_json = g_string_free(qobject_to_json(defval),
   false);
-g_string_append_printf(str, " (default: %s)", def_json);
+g_string_append_printf(str, " (%s: %s)",
+   fixed ? "fixed" : "default",
+   def_json);
 }
 
 return g_string_free(str, false);
@@ -220,7 +222,8 @@ bool type_print_class_properties(const char *type)
 
 g_ptr_array_add(array,
 object_property_help(prop->name, prop->type,
- prop->defval, prop->description));
+ prop->fixed, prop->defval,
+ prop->description));
 }

[PATCH 3/3] target/arm: handle ipa_secure vs s2walk_secure correctly

In get_phys_addr_twostage() when we set up the stage 2 translation,
we currently incorrectly set all of in_mmu_idx, in_ptw_idx and
in_secure based on s2walk_secure.

Here s2walk_secure is true if we should be doing this stage 2
walk to physical memory. ipa_secure is true if the intermediate
physical address we got from stage 1 is secure. The VSTCR_EL2.SW
and VTCR_EL2.NSW bits allow the guest to configure secure EL2
so that these two things are different, eg:
 * a stage 2 walk for an NS IPA done to secure physical memory
   (where the translation table base address and other parameters
   for the walk come from the NS control registers VTTBR_EL2
   and VTCR_EL2)
 * a stage 2 walk for an S IPA done to non-secure physical memory
   (where the parameters from the walk come from the S control
   registers VSTTBR_EL2 and VSTCR_EL2)

To tell get_phys_addr_lpae() to do this, we need to pass in an
in_mmu_idx based on ipa_secure, and an in_ptw_idx based on
s2walk_secure.  The in_secure parameter follows in_mmu_idx, as usual.

Note that this change relies on fixes in the previous two commits
("Don't allow stage 2 page table walks to downgrade to NS" and
"Set ptw->out_secure correctly for stage 2 translations").

Cc: qemu-sta...@nongnu.org
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1600
Signed-off-by: Peter Maydell 
---
This change also means that v8M, which also uses
get_phys_addr_twostage(), is no longer using a ptw->in_mmu_idx
calculated based on s2walk_secure, which was always a little
strange given that v8M doesn't do any kind of s2 walk, even
though it didn't produce incorrect behaviour.
---
 target/arm/ptw.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/arm/ptw.c b/target/arm/ptw.c
index c1e124df495..2ee2b0b9241 100644
--- a/target/arm/ptw.c
+++ b/target/arm/ptw.c
@@ -2741,9 +2741,9 @@ static bool get_phys_addr_twostage(CPUARMState *env, 
S1Translate *ptw,
 }
 
 is_el0 = ptw->in_mmu_idx == ARMMMUIdx_Stage1_E0;
-ptw->in_mmu_idx = s2walk_secure ? ARMMMUIdx_Stage2_S : ARMMMUIdx_Stage2;
+ptw->in_mmu_idx = ipa_secure ? ARMMMUIdx_Stage2_S : ARMMMUIdx_Stage2;
 ptw->in_ptw_idx = s2walk_secure ? ARMMMUIdx_Phys_S : ARMMMUIdx_Phys_NS;
-ptw->in_secure = s2walk_secure;
+ptw->in_secure = ipa_secure;
 
 /*
  * S1 is done, now do S2 translation.
-- 
2.34.1

[PATCH 0/3] target/arm: Fix handling of VSTCR_EL2.SW and VTCR_EL2.NSW

When FEAT_SEL2 (secure EL2) is implemented, the bits
VSTCR_EL2.SW and VTCR_EL2.NSW allow the guest to set things up
so that the stage 2 walk for an IPA is done to the other
address space, eg
 * a stage 2 walk for an NS IPA done to secure physical memory
   (where the translation table base address and other parameters
   for the walk come from the NS control registers VTTBR_EL2
   and VTCR_EL2)
 * a stage 2 walk for an S IPA done to non-secure physical memory
   (where the parameters from the walk come from the S control
   registers VSTTBR_EL2 and VSTCR_EL2)

We tried to implement this, but didn't get it right -- in
get_phys_addr_twostage() we identify whether we need to do
the s2 walk in Secure or NonSecure, but then we fail to pay
attention to whether we were doing the walk for an NS or S IPA.
The fix for this is simple -- set ptw->in_mmu_idx and ptw->in_secure
based on ipa_secure, with only ptw->in_ptw_idx depending on
s2walk_secure. However to make this work we first need to fix
a couple of places in the ptw code that were incorrectly looking
at ptw->in_secure when they either should not be or should
be doing something based on ptw->in_ptw_idx.

This fixes https://gitlab.com/qemu-project/qemu/-/issues/1600 .
NB: I have tested that this fixes the test case in the bug, and
that it doesn't break 'make check-avocado', but I don't have a
huge supply of EL2-using guests to hand so the patchset hasn't
received exhaustive testing. Plus this area of the architecture
and this bit of QEMU's codebase are pretty hairy -- so careful
review would be a good idea :-)

thanks
-- PMM

Peter Maydell (3):
  target/arm: Don't allow stage 2 page table walks to downgrade to NS
  target/arm: Set ptw->out_secure correctly for stage 2 translations
  target/arm: handle ipa_secure vs s2walk_secure correctly

 target/arm/ptw.c | 25 ++---
 1 file changed, 14 insertions(+), 11 deletions(-)

-- 
2.34.1

[RFC PATCH v2 2/4] target/riscv: Reuse TB_FLAGS.MSTATUS_HFS_FS

When misa.F is clear, TB_FLAGS.MSTATUS_HS_FS field is unused and can
be used to save the current state of smstateen0.FCSR check which is
needed by the floating point translation routines.

Signed-off-by: Mayuresh Chitale 
---
 target/riscv/cpu_helper.c | 12 
 target/riscv/translate.c  |  7 +++
 2 files changed, 19 insertions(+)

diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c
index 433ea529b0..fd1731cc39 100644
--- a/target/riscv/cpu_helper.c
+++ b/target/riscv/cpu_helper.c
@@ -105,6 +105,18 @@ void cpu_get_tb_cpu_state(CPURISCVState *env, target_ulong 
*pc,
 flags = FIELD_DP32(flags, TB_FLAGS, MSTATUS_HS_VS,
get_field(env->mstatus_hs, MSTATUS_VS));
 }
+/*
+ * If misa.F is 0 then the MSTATUS_HS_FS field of the tb->flags
+ * can be used to pass the current state of the smstateen.FCSR bit
+ * which must be checked for in the floating point translation routines
+ */
+if (!riscv_has_ext(env, RVF)) {
+if (smstateen_acc_ok(env, 0, SMSTATEEN0_FCSR) == RISCV_EXCP_NONE) {
+flags = FIELD_DP32(flags, TB_FLAGS, MSTATUS_HS_FS, 1);
+} else {
+flags = FIELD_DP32(flags, TB_FLAGS, MSTATUS_HS_FS, 0);
+}
+}
 if (cpu->cfg.debug && !icount_enabled()) {
 flags = FIELD_DP32(flags, TB_FLAGS, ITRIGGER, env->itrigger_enabled);
 }
diff --git a/target/riscv/translate.c b/target/riscv/translate.c
index d0094922b6..e29bbb8b70 100644
--- a/target/riscv/translate.c
+++ b/target/riscv/translate.c
@@ -79,6 +79,7 @@ typedef struct DisasContext {
 int frm;
 RISCVMXL ol;
 bool virt_inst_excp;
+bool smstateen_fcsr_ok;
 bool virt_enabled;
 const RISCVCPUConfig *cfg_ptr;
 bool hlsx;
@@ -1202,6 +1203,12 @@ static void riscv_tr_init_disas_context(DisasContextBase 
*dcbase, CPUState *cs)
 ctx->itrigger = FIELD_EX32(tb_flags, TB_FLAGS, ITRIGGER);
 ctx->zero = tcg_constant_tl(0);
 ctx->virt_inst_excp = false;
+if (has_ext(ctx, RVF)) {
+ctx->smstateen_fcsr_ok = 1;
+} else {
+ctx->smstateen_fcsr_ok = FIELD_EX32(tb_flags, TB_FLAGS,
+ MSTATUS_HS_FS);
+}
 }
 
 static void riscv_tr_tb_start(DisasContextBase *db, CPUState *cpu)
-- 
2.34.1

[RFC PATCH v2 0/4] Smstateen FCSR

Patch 4 and 5 of the smstateen series need to be re-submitted with
changes described in the email below.
https://lists.nongnu.org/archive/html/qemu-riscv/2022-11/msg00155.html
Hence splitting the patch 4 of the original series into three and
re-submitting along with the original patch 5.

Changes in v2:
 - Improve patch 1 description
 - Reuse TB_FLAGS.HS_FS for smstateen
 - Convert smstateen_fcsr_check to function
 - Add fcsr check for zdinx

Mayuresh Chitale (4):
  target/riscv: smstateen check for fcsr
  target/riscv: Reuse TB_FLAGS.MSTATUS_HFS_FS
  target/riscv: check smstateen fcsr flag
  target/riscv: smstateen knobs

 target/riscv/cpu.c|  3 ++-
 target/riscv/cpu_helper.c | 12 
 target/riscv/csr.c| 23 ++
 target/riscv/insn_trans/trans_rvd.c.inc   | 13 
 target/riscv/insn_trans/trans_rvf.c.inc   | 24 +++
 target/riscv/insn_trans/trans_rvzfh.c.inc | 18 ++---
 target/riscv/translate.c  |  7 +++
 7 files changed, 88 insertions(+), 12 deletions(-)

-- 
2.34.1

[RFC PATCH v2 4/4] target/riscv: smstateen knobs

Add knobs to allow users to enable smstateen and also export it via the
ISA extension string.

Signed-off-by: Mayuresh Chitale 
Reviewed-by: Weiwei Li
Reviewed-by: Alistair Francis 
---
 target/riscv/cpu.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
index fab38859ec..5c00106899 100644
--- a/target/riscv/cpu.c
+++ b/target/riscv/cpu.c
@@ -119,6 +119,7 @@ static const struct isa_ext_data isa_edata_arr[] = {
 ISA_EXT_DATA_ENTRY(zhinx, PRIV_VERSION_1_12_0, ext_zhinx),
 ISA_EXT_DATA_ENTRY(zhinxmin, PRIV_VERSION_1_12_0, ext_zhinxmin),
 ISA_EXT_DATA_ENTRY(smaia, PRIV_VERSION_1_12_0, ext_smaia),
+ISA_EXT_DATA_ENTRY(smstateen, PRIV_VERSION_1_12_0, ext_smstateen),
 ISA_EXT_DATA_ENTRY(ssaia, PRIV_VERSION_1_12_0, ext_ssaia),
 ISA_EXT_DATA_ENTRY(sscofpmf, PRIV_VERSION_1_12_0, ext_sscofpmf),
 ISA_EXT_DATA_ENTRY(sstc, PRIV_VERSION_1_12_0, ext_sstc),
@@ -1498,8 +1499,8 @@ static Property riscv_cpu_extensions[] = {
 DEFINE_PROP_UINT16("vlen", RISCVCPU, cfg.vlen, 128),
 DEFINE_PROP_UINT16("elen", RISCVCPU, cfg.elen, 64),
 
+DEFINE_PROP_BOOL("smstateen", RISCVCPU, cfg.ext_smstateen, false),
 DEFINE_PROP_BOOL("svadu", RISCVCPU, cfg.ext_svadu, true),
-
 DEFINE_PROP_BOOL("svinval", RISCVCPU, cfg.ext_svinval, false),
 DEFINE_PROP_BOOL("svnapot", RISCVCPU, cfg.ext_svnapot, false),
 DEFINE_PROP_BOOL("svpbmt", RISCVCPU, cfg.ext_svpbmt, false),
-- 
2.34.1

[RFC PATCH v2 1/4] target/riscv: smstateen check for fcsr

If smstateen is implemented and smtateen0.fcsr is clear and misa.F
is off then the floating point operations must return illegal
instruction exception or virtual instruction trap, if relevant.

Signed-off-by: Mayuresh Chitale 
---
 target/riscv/csr.c | 23 +++
 1 file changed, 23 insertions(+)

diff --git a/target/riscv/csr.c b/target/riscv/csr.c
index f4d2dcfdc8..8ae9e95f9f 100644
--- a/target/riscv/csr.c
+++ b/target/riscv/csr.c
@@ -82,6 +82,10 @@ static RISCVException fs(CPURISCVState *env, int csrno)
 !riscv_cpu_cfg(env)->ext_zfinx) {
 return RISCV_EXCP_ILLEGAL_INST;
 }
+
+if (!env->debugger && !riscv_cpu_fp_enabled(env)) {
+return smstateen_acc_ok(env, 0, SMSTATEEN0_FCSR);
+}
 #endif
 return RISCV_EXCP_NONE;
 }
@@ -2081,6 +2085,9 @@ static RISCVException write_mstateen0(CPURISCVState *env, 
int csrno,
   target_ulong new_val)
 {
 uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
+if (!riscv_has_ext(env, RVF)) {
+wr_mask |= SMSTATEEN0_FCSR;
+}
 
 return write_mstateen(env, csrno, wr_mask, new_val);
 }
@@ -2117,6 +2124,10 @@ static RISCVException write_mstateen0h(CPURISCVState 
*env, int csrno,
 {
 uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
 
+if (!riscv_has_ext(env, RVF)) {
+wr_mask |= SMSTATEEN0_FCSR;
+}
+
 return write_mstateenh(env, csrno, wr_mask, new_val);
 }
 
@@ -2154,6 +2165,10 @@ static RISCVException write_hstateen0(CPURISCVState 
*env, int csrno,
 {
 uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
 
+if (!riscv_has_ext(env, RVF)) {
+wr_mask |= SMSTATEEN0_FCSR;
+}
+
 return write_hstateen(env, csrno, wr_mask, new_val);
 }
 
@@ -2193,6 +2208,10 @@ static RISCVException write_hstateen0h(CPURISCVState 
*env, int csrno,
 {
 uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
 
+if (!riscv_has_ext(env, RVF)) {
+wr_mask |= SMSTATEEN0_FCSR;
+}
+
 return write_hstateenh(env, csrno, wr_mask, new_val);
 }
 
@@ -2240,6 +2259,10 @@ static RISCVException write_sstateen0(CPURISCVState 
*env, int csrno,
 {
 uint64_t wr_mask = SMSTATEEN_STATEEN | SMSTATEEN0_HSENVCFG;
 
+if (!riscv_has_ext(env, RVF)) {
+wr_mask |= SMSTATEEN0_FCSR;
+}
+
 return write_sstateen(env, csrno, wr_mask, new_val);
 }
 
-- 
2.34.1

[RFC PATCH v2 3/4] target/riscv: check smstateen fcsr flag

If misa.F and smstateen_fcsr_ok flag are clear then all the floating
point instructions must generate an appropriate exception.

Signed-off-by: Mayuresh Chitale 
---
 target/riscv/insn_trans/trans_rvd.c.inc   | 13 
 target/riscv/insn_trans/trans_rvf.c.inc   | 24 +++
 target/riscv/insn_trans/trans_rvzfh.c.inc | 18 ++---
 3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/target/riscv/insn_trans/trans_rvd.c.inc 
b/target/riscv/insn_trans/trans_rvd.c.inc
index 2c51e01c40..d9e0cf116f 100644
--- a/target/riscv/insn_trans/trans_rvd.c.inc
+++ b/target/riscv/insn_trans/trans_rvd.c.inc
@@ -18,10 +18,15 @@
  * this program.  If not, see .
  */
 
-#define REQUIRE_ZDINX_OR_D(ctx) do { \
-if (!ctx->cfg_ptr->ext_zdinx) { \
-REQUIRE_EXT(ctx, RVD); \
-} \
+#define REQUIRE_ZDINX_OR_D(ctx) do {   \
+if (!has_ext(ctx, RVD)) {  \
+if (!ctx->cfg_ptr->ext_zdinx) {\
+return false;  \
+}  \
+if (!smstateen_fcsr_check(ctx)) {  \
+return false;  \
+}  \
+}  \
 } while (0)
 
 #define REQUIRE_EVEN(ctx, reg) do { \
diff --git a/target/riscv/insn_trans/trans_rvf.c.inc 
b/target/riscv/insn_trans/trans_rvf.c.inc
index 9e9fa2087a..6bf6fe16be 100644
--- a/target/riscv/insn_trans/trans_rvf.c.inc
+++ b/target/riscv/insn_trans/trans_rvf.c.inc
@@ -24,10 +24,26 @@
 return false; \
 } while (0)
 
-#define REQUIRE_ZFINX_OR_F(ctx) do {\
-if (!ctx->cfg_ptr->ext_zfinx) { \
-REQUIRE_EXT(ctx, RVF); \
-} \
+static inline bool smstateen_fcsr_check(DisasContext *ctx)
+{
+#ifndef CONFIG_USER_ONLY
+if (!has_ext(ctx, RVF) && !ctx->smstateen_fcsr_ok) {
+ctx->virt_inst_excp = ctx->virt_enabled;
+return false;
+}
+#endif
+return true;
+}
+
+#define REQUIRE_ZFINX_OR_F(ctx) do {   \
+if (!has_ext(ctx, RVF)) {  \
+if (!ctx->cfg_ptr->ext_zfinx) {\
+return false;  \
+}  \
+if (!smstateen_fcsr_check(ctx)) {  \
+return false;  \
+}  \
+}  \
 } while (0)
 
 #define REQUIRE_ZCF(ctx) do {  \
diff --git a/target/riscv/insn_trans/trans_rvzfh.c.inc 
b/target/riscv/insn_trans/trans_rvzfh.c.inc
index 74dde37ff7..74a125e4c0 100644
--- a/target/riscv/insn_trans/trans_rvzfh.c.inc
+++ b/target/riscv/insn_trans/trans_rvzfh.c.inc
@@ -16,28 +16,40 @@
  * this program.  If not, see .
  */
 
-#define REQUIRE_ZFH(ctx) do { \
+#define REQUIRE_ZFH(ctx) do {  \
 if (!ctx->cfg_ptr->ext_zfh) {  \
-return false; \
-} \
+return false;  \
+}  \
+if (!smstateen_fcsr_check(ctx)) {  \
+return false;  \
+}  \
 } while (0)
 
 #define REQUIRE_ZHINX_OR_ZFH(ctx) do { \
 if (!ctx->cfg_ptr->ext_zhinx && !ctx->cfg_ptr->ext_zfh) { \
 return false;  \
 }  \
+if (!smstateen_fcsr_check(ctx)) {  \
+return false;  \
+}  \
 } while (0)
 
 #define REQUIRE_ZFHMIN(ctx) do {  \
 if (!ctx->cfg_ptr->ext_zfhmin) {  \
 return false; \
 } \
+if (!smstateen_fcsr_check(ctx)) { \
+return false; \
+} \
 } while (0)
 
 #define REQUIRE_ZFHMIN_OR_ZHINXMIN(ctx) do { \
 if (!(ctx->cfg_ptr->ext_zfhmin || ctx->cfg_ptr->ext_zhinxmin)) { \
 return false;\
 }\
+if (!smstateen_fcsr_check(ctx)) {\
+return false;\
+}\
 } while (0)
 
 static bool trans_flh(DisasContext *ctx, arg_flh *a)
-- 
2.34.1

Re: [Socratic RFC PATCH] include: attempt to document device_class_set_props



Alex Bennée  writes:

> Peter Maydell  writes:
>
>> On Mon, 27 Mar 2023 at 23:10, Alex Bennée  wrote:
>>>
>>>
>>> Mark Cave-Ayland  writes:
>>>
>>> > On 27/03/2023 17:12, Alex Bennée wrote:
>>> >
>>> >> Mark Cave-Ayland  writes:
>>> >>
>>> >>> On 27/03/2023 14:15, Alex Bennée wrote:
>>> >>>
>>>  I'm still not sure how I achieve by use case of the parent class
>>>  defining the following properties:
>>>  static Property vud_properties[] = {
>>>  DEFINE_PROP_CHR("chardev", VHostUserDevice, chardev),
>>>  DEFINE_PROP_UINT16("id", VHostUserDevice, id, 0),
>>>  DEFINE_PROP_UINT32("num_vqs", VHostUserDevice, num_vqs, 1),
>>>  DEFINE_PROP_END_OF_LIST(),
>>>  };
>>>  But for the specialisation of the class I want the id to default to
>>>  the actual device id, e.g.:
>>>  static Property vu_rng_properties[] = {
>>>  DEFINE_PROP_UINT16("id", VHostUserDevice, id, VIRTIO_ID_RNG),
>>>  DEFINE_PROP_UINT32("num_vqs", VHostUserDevice, num_vqs, 1),
>>>  DEFINE_PROP_END_OF_LIST(),
>>>  };
>>>  And so far the API for doing that isn't super clear.
>>>  Signed-off-by: Alex Bennée 
>>>  ---
>>> include/hw/qdev-core.h | 9 +
>>> 1 file changed, 9 insertions(+)
>>>  diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
>>>  index bd50ad5ee1..d4bbc30c92 100644
>>>  --- a/include/hw/qdev-core.h
>>>  +++ b/include/hw/qdev-core.h
>>>  @@ -776,6 +776,15 @@ BusState *sysbus_get_default(void);
>>> char *qdev_get_fw_dev_path(DeviceState *dev);
>>> char *qdev_get_own_fw_dev_path_from_handler(BusState *bus, 
>>>  DeviceState *dev);
>>> +/**
>>>  + * device_class_set_props(): add a set of properties to an device
>>>  + * @dc: the parent DeviceClass all devices inherit
>>>  + * @props: an array of properties, terminate by 
>>>  DEFINE_PROP_END_OF_LIST()
>>>  + *
>>>  + * This will add a set of properties to the object. It will fault if
>>>  + * you attempt to add an existing property defined by a parent class.
>>>  + * To modify an inherited property you need to use
>>>  + */
>>> void device_class_set_props(DeviceClass *dc, Property *props);
>>>   /**
>>> >>>
>>> >>> Hmmm that's an interesting one. Looking at the source in
>>> >>> hw/core/qdev-properties.c you could possibly get away with something
>>> >>> like this in vu_rng_class_init():
>>> >>>
>>> >>>  ObjectProperty *op = object_class_property_find(klass, "id");
>>> >>>  object_property_set_default_uint(op, VIRTIO_ID_RNG);
>>> >>>
>>> >>> Of course this is all completely untested :)
>>> >> Sadly we assert on the existing prop->defval:
>>> >>static void object_property_set_default(ObjectProperty *prop,
>>> >> QObject *defval)
>>> >>{
>>> >>assert(!prop->defval);
>>> >>assert(!prop->init);
>>> >>prop->defval = defval;
>>> >>prop->init = object_property_init_defval;
>>> >>}
>>> >> Maybe the assert is too aggressive or we need a different helper,
>>> >> maybe
>>> >> a:
>>> >>void object_property_update_default_uint(ObjectProperty *prop,
>>> >> uint64_t value)
>>> >> ?
>>> >
>>> > It seems in that case once the default has been set, it is impossible
>>> > to change. The only other immediate option I can think of is to define
>>> > a specific DEFINE_VHOST_PROPERTIES macro in a similar way to
>>> > DEFINE_AUDIO_PROPERTIES which you can use to set the common properties
>>> > for all VHostUserDevice devices, including providing the default ID.
>>>
>>> I tried this: allow the default to change
>>>
>>> modified   qom/object.c
>>> @@ -1557,11 +1557,16 @@ static void object_property_init_defval(Object 
>>> *obj, ObjectProperty *prop)
>>>
>>>  static void object_property_set_default(ObjectProperty *prop, QObject 
>>> *defval)
>>>  {
>>> -assert(!prop->defval);
>>> -assert(!prop->init);
>>> +if (prop->init == object_property_init_defval) {
>>> +fprintf(stderr, "%s: updating existing defval\n", __func__);
>>> +prop->defval = defval;
>>> +} else {
>>> +assert(!prop->defval);
>>> +assert(!prop->init);
>>>
>>> -prop->defval = defval;
>>> -prop->init = object_property_init_defval;
>>> +prop->defval = defval;
>>> +prop->init = object_property_init_defval;
>>> +}
>>>  }
>>
>> I think this leaves the door open to bugs where you create
>> the property, somebody looks at it, and then you update
>> the default value afterwards...
>
> Really the pattern I have is:
>
>   vhost-user-device has the property and is configurable
>   vhost-user-rng-device specialises vhost-user-device and fixes the value
>
> I'm not sure how best to represent that. This should all be happening at
> class_init time.

Of course enabling my second derived class I ran into:

  ERROR:../../qom/object.c:1590:object_property_fix_default:

Re: [PATCH 0/3] SDL2 usability fixes

2023-04-14 Thread BALATON Zoltan


On Fri, 14 Apr 2023, Bernhard Beschow wrote:

Am 14. April 2023 06:53:18 UTC schrieb "Volker Rümelin" :

Am 13.04.23 um 22:43 schrieb Bernhard Beschow:

Am 13. April 2023 17:54:34 UTC schrieb "Volker Rümelin" :

I'm trying to use QEMU on Windows hosts for fun and for profit. While the GTK
GUI doesn't seem to support OpenGL under Windows the SDL2 GUI does. Hence I
used the SDL2 GUI where I ran into several issues of which three are fixed in
this series, which are:

* Alt+Tab switches tasks on the host rather than in the guest in fullscreen mode
* Alt+F4 closes QEMU rather than a graphical task in the guest
* AltGr keyboard modifier isn't recognized by a Linux guest

More information about each issue is provided in the patches.

Bernhard Beschow (3):
ui/sdl2: Grab Alt+Tab also in fullscreen mode
ui/sdl2: Grab Alt+F4 also under Windows
ui/sdl2-input: Fix AltGr modifier on Windows hosts

   ui/sdl2-input.c | 13 +
   ui/sdl2.c   |  2 ++
   2 files changed, 15 insertions(+)


Hi Bernhard,

Hi Volker,


I don't think these patches are necessary. The AltGr key and the keyboard grab was fixed 
in 2020 with commit 830473455f ("ui/sdl2: fix handling of AltGr key on 
Windows") and a few commits before.

Indeed, this patch addresses the AltGr issue. What I noticed in my case is that 
the AltGr behavior is different, depending on whether the *guest* is in 
graphics mode or not. Pressing AltGr in graphics mode issues two key modifiers 
while only one is issued when the guest is in text mode. I'll recheck tomorrow 
when I have access to a Windows host.


Hi Bernhard,


Hi Volker,



the AltGr behavior depends on the keyboard grab. The AltGr key works without 
keyboard grabbed and it doesn't with keyboard grabbed. That's a bug.


Interesting. The keyboard is grabbed automatically for some reason when 
the guest enters graphics mode. Together with what you describe this 
could explain the difference in behavior I'm seeing.


Not sure how it works on Windows but keyboard grab may depend on the 
drivers or devices in the guest. I think using a usb-tablet may auto-grab 
mouse while using a mouse needs to click in the window to grab. Also not 
sure how this relates to keyboard at all so maybe this is not relevant 
here in which case sorry for the noise. I guess what I wanted to say is 
also check what command line you use (what input devices you VM has) and 
what guest side drivers you use that may have an effect (such as some 
vmware drivers could or maybe some other drivers). In any case first you 
should sync to make sure you're on the same page and testing the same 
thing to avoid some confusion. Sorry if this is not really helpful.


Regrads,
BALATON Zoltan

Re: clean after distclean gobbles source files


On 14/04/2023 17.30, Steven Sistare wrote:

On 4/13/2023 7:41 AM, Thomas Huth wrote:

On 07/04/2023 17.44, Steven Sistare wrote:

Run 'make distclean', and GNUmakefile is removed.
But, GNUmakefile is where we cd to build/.
Run 'make distclean' or 'make clean' again, and Makefile applies
the clean actions, such as this one, at the top level of the tree:

  find . \( -name '*.so' -o -name '*.dll' -o \
    -name '*.[oda]' -o -name '*.gcno' \) -type f \
  ! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-aarch64.a \
  ! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-arm.a \
  -exec rm {} +

For example, it removes the .d source files in 'meson/test cases/d/*/*.d'.
The damage could be worse in the future if more suffixes are cleaned.

I don't have a suggested fix.  Recursion and the GNUmakefile bootstrap
make it non-trivial.


That's somewhat ugly, indeed.

We could maybe disallow make [dist]clean if running in-tree? Something like 
that:

diff a/Makefile b/Makefile
--- a/Makefile
+++ b/Makefile
@@ -26,7 +26,7 @@ quiet-command-run = $(if $(V),,$(if $2,printf "  %-7s %s\n" $2 $3 
&& ))$1
  quiet-@ = $(if $(V),,@)
  quiet-command = $(quiet-@)$(call quiet-command-run,$1,$2,$3)
  
-UNCHECKED_GOALS := %clean TAGS cscope ctags dist \

+UNCHECKED_GOALS := TAGS cscope ctags dist \
  help check-help print-% \
  docker docker-% vm-help vm-test vm-build-%
  
@@ -201,7 +201,7 @@ recurse-distclean: $(addsuffix /distclean, $(ROMS))
  
  ##
  
-clean: recurse-clean

+clean: config-host.mak recurse-clean
     -$(quiet-@)test -f build.ninja && $(NINJA) $(NINJAFLAGS) -t clean || :
     -$(quiet-@)test -f build.ninja && $(NINJA) $(NINJAFLAGS) clean-ctlist 
|| :
     find . \( -name '*.so' -o -name '*.dll' -o \


... or if we still want to allow that, maybe just make an exception for the *.d 
files:

diff --git a/Makefile b/Makefile
index e421f8a1f4..0cb2a7aa98 100644
--- a/Makefile
+++ b/Makefile
@@ -208,6 +208,7 @@ clean: recurse-clean
   -name '*.[oda]' -o -name '*.gcno' \) -type f \
     ! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-aarch64.a \
     ! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-arm.a \
+   ! -path './meson/test cases/d/*/*.d' \
     -exec rm {} +
     rm -f TAGS cscope.* *~ */*~
  


What do you think?


Actually, all make targets are broken if we do not cd to build first.


I think some of them work from the source directory, too... e.g. "make help" 
or "make vm-build-XXX" or "make dist" ... not sure how important this 
possibility is ... I guess "make dist" is still a thing? Michael?



This should do the trick.  If you agree, I will submit a patch.

diff --git a/Makefile b/Makefile
index a48103c..3d03101 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,10 @@ ifneq ($(words $(subst :, ,$(CURDIR))), 1)
$(error main directory cannot contain spaces nor colons)
  endif

+ifneq ($(notdir $(CURDIR)),build)
+$(error To build in tree, run configure first.)
+endif


If we decide to go down that road, I think you should remove the existing 
"Please call configure before running make" UNCHECKED_GOALS logic in that 
file, too.


 Thomas

[PATCH v2] migration: Handle block device inactivation failures better

2023-04-14 Thread Eric Blake

Consider what happens when performing a migration between two host
machines connected to an NFS server serving multiple block devices to
the guest, when the NFS server becomes unavailable.  The migration
attempts to inactivate all block devices on the source (a necessary
step before the destination can take over); but if the NFS server is
non-responsive, the attempt to inactivate can itself fail.  When that
happens, the destination fails to get the migrated guest (good,
because the source wasn't able to flush everything properly):

  (qemu) qemu-kvm: load of migration failed: Input/output error

at which point, our only hope for the guest is for the source to take
back control.  With the current code base, the host outputs a message, but then 
appears to resume:

  (qemu) qemu-kvm: qemu_savevm_state_complete_precopy_non_iterable: 
bdrv_inactivate_all() failed (-1)

  (src qemu)info status
   VM status: running

but a second migration attempt now asserts:

  (src qemu) qemu-kvm: ../block.c:6738: int 
bdrv_inactivate_recurse(BlockDriverState *): Assertion `!(bs->open_flags & 
BDRV_O_INACTIVE)' failed.

Whether the guest is recoverable on the source after the first failure
is debatable, but what we do not want is to have qemu itself fail due
to an assertion.  It looks like the problem is as follows:

In migration.c:migration_completion(), the source sets 'inactivate' to
true (since COLO is not enabled), then tries
savevm.c:qemu_savevm_state_complete_precopy() with a request to
inactivate block devices.  In turn, this calls
block.c:bdrv_inactivate_all(), which fails when flushing runs up
against the non-responsive NFS server.  With savevm failing, we are
now left in a state where some, but not all, of the block devices have
been inactivated; but migration_completion() then jumps to 'fail'
rather than 'fail_invalidate' and skips an attempt to reclaim those
those disks by calling bdrv_activate_all().  Even if we do attempt to
reclaim disks, we aren't taking note of failure there, either.

Thus, we have reached a state where the migration engine has forgotten
all state about whether a block device is inactive, because we did not
set s->block_inactive in enough places; so migration allows the source
to reach vm_start() and resume execution, violating the block layer
invariant that the guest CPUs should not be restarted while a device
is inactive.  Note that the code in migration.c:migrate_fd_cancel()
will also try to reactivate all block devices if s->block_inactive was
set, but because we failed to set that flag after the first failure,
the source assumes it has reclaimed all devices, even though it still
has remaining inactivated devices and does not try again.  Normally,
qmp_cont() will also try to reactivate all disks (or correctly fail if
the disks are not reclaimable because NFS is not yet back up), but the
auto-resumption of the source after a migration failure does not go
through qmp_cont().  And because we have left the block layer in an
inconsistent state with devices still inactivated, the later migration
attempt is hitting the assertion failure.

Since it is important to not resume the source with inactive disks,
this patch marks s->block_inactive before attempting inactivation,
rather than after succeeding, in order to prevent any vm_start() until
it has successfully reactivated all devices.

See also https://bugzilla.redhat.com/show_bug.cgi?id=2058982

Signed-off-by: Eric Blake 

---

v2: Set s->block_inactive sooner [Juan]
---
 migration/migration.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/migration/migration.c b/migration/migration.c
index bda47891933..cb0d42c0610 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3444,13 +3444,11 @@ static void migration_completion(MigrationState *s)
 MIGRATION_STATUS_DEVICE);
 }
 if (ret >= 0) {
+s->block_inactive = inactivate;
 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
 ret = qemu_savevm_state_complete_precopy(s->to_dst_file, false,
  inactivate);
 }
-if (inactivate && ret >= 0) {
-s->block_inactive = true;
-}
 }
 qemu_mutex_unlock_iothread();

@@ -3522,6 +3520,7 @@ fail_invalidate:
 bdrv_activate_all(_err);
 if (local_err) {
 error_report_err(local_err);
+s->block_inactive = true;
 } else {
 s->block_inactive = false;
 }

base-commit: 7dbd6f8a27e30fe14adb3d5869097cddf24038d6
-- 
2.39.2

Re: clean after distclean gobbles source files

2023-04-14 Thread Steven Sistare

On 4/13/2023 7:41 AM, Thomas Huth wrote:
> On 07/04/2023 17.44, Steven Sistare wrote:
>> Run 'make distclean', and GNUmakefile is removed.
>> But, GNUmakefile is where we cd to build/.
>> Run 'make distclean' or 'make clean' again, and Makefile applies
>> the clean actions, such as this one, at the top level of the tree:
>>
>>  find . \( -name '*.so' -o -name '*.dll' -o \
>>    -name '*.[oda]' -o -name '*.gcno' \) -type f \
>>  ! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-aarch64.a \
>>  ! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-arm.a \
>>  -exec rm {} +
>>
>> For example, it removes the .d source files in 'meson/test cases/d/*/*.d'.
>> The damage could be worse in the future if more suffixes are cleaned.
>>
>> I don't have a suggested fix.  Recursion and the GNUmakefile bootstrap
>> make it non-trivial.
> 
> That's somewhat ugly, indeed.
> 
> We could maybe disallow make [dist]clean if running in-tree? Something like 
> that:
> 
> diff a/Makefile b/Makefile
> --- a/Makefile
> +++ b/Makefile
> @@ -26,7 +26,7 @@ quiet-command-run = $(if $(V),,$(if $2,printf "  %-7s %s\n" 
> $2 $3 && ))$1
>  quiet-@ = $(if $(V),,@)
>  quiet-command = $(quiet-@)$(call quiet-command-run,$1,$2,$3)
>  
> -UNCHECKED_GOALS := %clean TAGS cscope ctags dist \
> +UNCHECKED_GOALS := TAGS cscope ctags dist \
>  help check-help print-% \
>  docker docker-% vm-help vm-test vm-build-%
>  
> @@ -201,7 +201,7 @@ recurse-distclean: $(addsuffix /distclean, $(ROMS))
>  
>  ##
>  
> -clean: recurse-clean
> +clean: config-host.mak recurse-clean
>     -$(quiet-@)test -f build.ninja && $(NINJA) $(NINJAFLAGS) -t clean || :
>     -$(quiet-@)test -f build.ninja && $(NINJA) $(NINJAFLAGS) clean-ctlist 
> || :
>     find . \( -name '*.so' -o -name '*.dll' -o \
> 
> 
> ... or if we still want to allow that, maybe just make an exception for the 
> *.d files:
> 
> diff --git a/Makefile b/Makefile
> index e421f8a1f4..0cb2a7aa98 100644
> --- a/Makefile
> +++ b/Makefile
> @@ -208,6 +208,7 @@ clean: recurse-clean
>   -name '*.[oda]' -o -name '*.gcno' \) -type f \
>     ! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-aarch64.a \
>     ! -path ./roms/edk2/ArmPkg/Library/GccLto/liblto-arm.a \
> +   ! -path './meson/test cases/d/*/*.d' \
>     -exec rm {} +
>     rm -f TAGS cscope.* *~ */*~
>  
> 
> What do you think?

Actually, all make targets are broken if we do not cd to build first.

This should do the trick.  If you agree, I will submit a patch.

diff --git a/Makefile b/Makefile
index a48103c..3d03101 100644
--- a/Makefile
+++ b/Makefile
@@ -4,6 +4,10 @@ ifneq ($(words $(subst :, ,$(CURDIR))), 1)
   $(error main directory cannot contain spaces nor colons)
 endif

+ifneq ($(notdir $(CURDIR)),build)
+$(error To build in tree, run configure first.)
+endif
+
 # Always point to the root of the build tree (needs GNU make).
 BUILD_DIR=$(CURDIR)

- Steve

Re: [PATCH] target/i386: Fix exception classes for SSE/AVX instructions.


Hi Ricky,

On 12/2/23 09:28, Ricky Zhou wrote:

Fix the exception classes for some SSE/AVX instructions to match what is
documented in the Intel manual.

Most of these changes have no functional effect on the behavior that
qemu implements (primarily >= 16-byte memory alignment checks). For
example, since qemu does not implement the AC flag, there is no
difference in behavior between Exception Classes 4 and 5 for
instructions where the SSE version only takes <16 byte memory operands.


Having this patch split in 2 (documentation first, logical change then)
would ease code review.


There is one functional change:

Before this change, MOVNTPS and MOVNTPD were labeled as Exception Class
4 (only requiring alignment for legacy SSE instructions). This changes
them to Exception Class 1 (always requiring memory alignment), as
documented in the Intel manual.


This could be a 3rd patch.


Signed-off-by: Ricky Zhou 
---
  target/i386/tcg/decode-new.c.inc | 79 
  1 file changed, 40 insertions(+), 39 deletions(-)


Regards,

Phil.

Re: [PATCH 2/4] vhost-user: Interface for migration state transfer

2023-04-14 Thread Eugenio Perez Martin

On Thu, Apr 13, 2023 at 7:55 PM Hanna Czenczek  wrote:
>
> On 13.04.23 13:38, Stefan Hajnoczi wrote:
> > On Thu, 13 Apr 2023 at 05:24, Hanna Czenczek  wrote:
> >> On 12.04.23 23:06, Stefan Hajnoczi wrote:
> >>> On Tue, Apr 11, 2023 at 05:05:13PM +0200, Hanna Czenczek wrote:
>  So-called "internal" virtio-fs migration refers to transporting the
>  back-end's (virtiofsd's) state through qemu's migration stream.  To do
>  this, we need to be able to transfer virtiofsd's internal state to and
>  from virtiofsd.
> 
>  Because virtiofsd's internal state will not be too large, we believe it
>  is best to transfer it as a single binary blob after the streaming
>  phase.  Because this method should be useful to other vhost-user
>  implementations, too, it is introduced as a general-purpose addition to
>  the protocol, not limited to vhost-user-fs.
> 
>  These are the additions to the protocol:
>  - New vhost-user protocol feature VHOST_USER_PROTOCOL_F_MIGRATORY_STATE:
>  This feature signals support for transferring state, and is added so
>  that migration can fail early when the back-end has no support.
> 
>  - SET_DEVICE_STATE_FD function: Front-end and back-end negotiate a pipe
>  over which to transfer the state.  The front-end sends an FD to the
>  back-end into/from which it can write/read its state, and the 
>  back-end
>  can decide to either use it, or reply with a different FD for the
>  front-end to override the front-end's choice.
>  The front-end creates a simple pipe to transfer the state, but maybe
>  the back-end already has an FD into/from which it has to write/read
>  its state, in which case it will want to override the simple pipe.
>  Conversely, maybe in the future we find a way to have the front-end
>  get an immediate FD for the migration stream (in some cases), in 
>  which
>  case we will want to send this to the back-end instead of creating a
>  pipe.
>  Hence the negotiation: If one side has a better idea than a plain
>  pipe, we will want to use that.
> 
>  - CHECK_DEVICE_STATE: After the state has been transferred through the
>  pipe (the end indicated by EOF), the front-end invokes this function
>  to verify success.  There is no in-band way (through the pipe) to
>  indicate failure, so we need to check explicitly.
> 
>  Once the transfer pipe has been established via SET_DEVICE_STATE_FD
>  (which includes establishing the direction of transfer and migration
>  phase), the sending side writes its data into the pipe, and the reading
>  side reads it until it sees an EOF.  Then, the front-end will check for
>  success via CHECK_DEVICE_STATE, which on the destination side includes
>  checking for integrity (i.e. errors during deserialization).
> 
>  Suggested-by: Stefan Hajnoczi 
>  Signed-off-by: Hanna Czenczek 
>  ---
> include/hw/virtio/vhost-backend.h |  24 +
> include/hw/virtio/vhost.h |  79 
> hw/virtio/vhost-user.c| 147 ++
> hw/virtio/vhost.c |  37 
> 4 files changed, 287 insertions(+)
> 
>  diff --git a/include/hw/virtio/vhost-backend.h 
>  b/include/hw/virtio/vhost-backend.h
>  index ec3fbae58d..5935b32fe3 100644
>  --- a/include/hw/virtio/vhost-backend.h
>  +++ b/include/hw/virtio/vhost-backend.h
>  @@ -26,6 +26,18 @@ typedef enum VhostSetConfigType {
> VHOST_SET_CONFIG_TYPE_MIGRATION = 1,
> } VhostSetConfigType;
> 
>  +typedef enum VhostDeviceStateDirection {
>  +/* Transfer state from back-end (device) to front-end */
>  +VHOST_TRANSFER_STATE_DIRECTION_SAVE = 0,
>  +/* Transfer state from front-end to back-end (device) */
>  +VHOST_TRANSFER_STATE_DIRECTION_LOAD = 1,
>  +} VhostDeviceStateDirection;
>  +
>  +typedef enum VhostDeviceStatePhase {
>  +/* The device (and all its vrings) is stopped */
>  +VHOST_TRANSFER_STATE_PHASE_STOPPED = 0,
>  +} VhostDeviceStatePhase;
> >>> vDPA has:
> >>>
> >>> /* Suspend a device so it does not process virtqueue requests anymore
> >>>  *
> >>>  * After the return of ioctl the device must preserve all the 
> >>> necessary state
> >>>  * (the virtqueue vring base plus the possible device specific 
> >>> states) that is
> >>>  * required for restoring in the future. The device must not change 
> >>> its
> >>>  * configuration after that point.
> >>>  */
> >>> #define VHOST_VDPA_SUSPEND  _IO(VHOST_VIRTIO, 0x7D)
> >>>
> >>> /* Resume a device so it can resume processing virtqueue requests
> >>>  *
> >>>  * After the return of this ioctl the device will have restored all 
> >>> the
> >>>  *

Re: virtio-iommu hotplug issue

2023-04-14 Thread Jean-Philippe Brucker

On Thu, Apr 13, 2023 at 08:01:54PM +0900, Akihiko Odaki wrote:
> Yes, that's right. The guest can dynamically create and delete VFs. The
> device is emulated by QEMU: igb, an Intel NIC recently added to QEMU and
> projected to be released as part of QEMU 8.0.

Ah great, that's really useful, I'll add it to my tests

> > Yes, I think this is an issue in the virtio-iommu driver, which should be
> > sending a DETACH request when the VF is disabled, likely from
> > viommu_release_device(). I'll work on a fix unless you would like to do it
> 
> It will be nice if you prepare a fix. I will test your patch with my
> workload if you share it with me.

I sent a fix:
https://lore.kernel.org/linux-iommu/20230414150744.562456-1-jean-phili...@linaro.org/

Thank you for reporting this, it must have been annoying to debug

Thanks,
Jean

Re: [PATCH 39/40] MAINTAINERS: Add a reviewer for network packet abstractions


On 14/4/23 13:37, Akihiko Odaki wrote:

I have made significant changes for network packet abstractions so add
me as a reviewer.

Signed-off-by: Akihiko Odaki 
---
  MAINTAINERS | 1 +
  1 file changed, 1 insertion(+)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 38/40] vmxnet3: Do not depend on PC


On 14/4/23 13:37, Akihiko Odaki wrote:

vmxnet3 has no dependency on PC, and VMware Fusion actually makes it
available on Apple Silicon according to:
https://kb.vmware.com/s/article/90364

Signed-off-by: Akihiko Odaki 
---
  hw/net/Kconfig | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 25/40] net/eth: Use void pointers


On 14/4/23 13:37, Akihiko Odaki wrote:

The uses of uint8_t pointers were misleading as they are never accessed
as an array of octets and it even require more strict alignment to
access as struct eth_header.

Signed-off-by: Akihiko Odaki 
---
  include/net/eth.h | 4 ++--
  net/eth.c | 6 +++---
  2 files changed, 5 insertions(+), 5 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 24/40] igb: Fix igb_mac_reg_init alignment


On 14/4/23 13:37, Akihiko Odaki wrote:

Signed-off-by: Akihiko Odaki 
---
  hw/net/igb_core.c | 96 +++
  1 file changed, 48 insertions(+), 48 deletions(-)


"Fix igb_mac_reg_init() coding style alignment" to clarify
this isn't about data alignment.

Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 23/40] igb: Share common VF constants


On 14/4/23 13:37, Akihiko Odaki wrote:

The constants need to be consistent between the PF and VF.

Signed-off-by: Akihiko Odaki 
---
  hw/net/igb.c| 10 +-
  hw/net/igb_common.h |  8 
  hw/net/igbvf.c  |  7 ---
  3 files changed, 13 insertions(+), 12 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 17/40] igb: Always log status after building rx metadata


On 14/4/23 13:37, Akihiko Odaki wrote:

Without this change, the status flags may not be traced e.g. if checksum
offloading is disabled.

Signed-off-by: Akihiko Odaki 
---
  hw/net/igb_core.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé 


diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 5fdc8bc42d..ccc5a626b4 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1303,9 +1303,8 @@ igb_build_rx_metadata(IGBCore *core,
  trace_e1000e_rx_metadata_l4_cso_disabled();
  }
  
-trace_e1000e_rx_metadata_status_flags(*status_flags);

-
  func_exit:
+trace_e1000e_rx_metadata_status_flags(*status_flags);
  *status_flags = cpu_to_le32(*status_flags);
  }
  


So igb_build_rx_metadata() is very similar to
e1000e_build_rx_metadata()...

Re: [PATCH 16/40] e1000e: Always log status after building rx metadata


On 14/4/23 13:37, Akihiko Odaki wrote:

Without this change, the status flags may not be traced e.g. if checksum
offloading is disabled.

Signed-off-by: Akihiko Odaki 
---
  hw/net/e1000e_core.c | 3 +--
  1 file changed, 1 insertion(+), 2 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 15/40] e1000x: Take CRC into consideration for size check


On 14/4/23 13:37, Akihiko Odaki wrote:

Section 13.7.15 Receive Length Error Count says:

  Packets over 1522 bytes are oversized if LongPacketEnable is 0b
(RCTL.LPE). If LongPacketEnable (LPE) is 1b, then an incoming packet
is considered oversized if it exceeds 16384 bytes.



These lengths are based on bytes in the received packet from
 through , inclusively.


As QEMU processes packets without CRC, the number of bytes for CRC
need to be subtracted.

Signed-off-by: Akihiko Odaki 
---
  hw/net/e1000x_common.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/net/e1000x_common.c b/hw/net/e1000x_common.c
index 6cc23138a8..b4dfc74b66 100644
--- a/hw/net/e1000x_common.c
+++ b/hw/net/e1000x_common.c
@@ -142,10 +142,10 @@ bool e1000x_is_oversized(uint32_t *mac, size_t size)
  {
  /* this is the size past which hardware will
 drop packets when setting LPE=0 */
-static const int maximum_ethernet_vlan_size = 1522;
+static const int maximum_ethernet_vlan_size = 1522 - 4;
  /* this is the size past which hardware will
 drop packets when setting LPE=1 */
-static const int maximum_ethernet_lpe_size = 16 * KiB;
+static const int maximum_ethernet_lpe_size = 16 * KiB - 4;
  
  if ((size > maximum_ethernet_lpe_size ||

  (size > maximum_ethernet_vlan_size


IMHO this function could be simplified. Something like:

  bool long_packet_enabled = mac[RCTL] & E1000_RCTL_LPE;
  size_t oversize = long_packet_enabled ? 16 * KiB : ETH_VLAN_MAXSIZE;
  size_t crc32_size = sizeof(uint32_t);

  if (mac[RCTL] & E1000_RCTL_SBP) {
return false;
  }

  if (size + crc32_size > oversize ) {
...
return true;
  }

  return false;
}

[PATCH 1/2] gitlab-ci: Avoid to re-run "configure" in the device-crash-test jobs

After "make check-venv" had been added to these jobs, they started
to re-run "configure" each time since our logic in the makefile
thinks that some files are out of date here. Avoid it with the same
trick that we are using in buildtest-template.yml already by disabling
the up-to-date check via NINJA=":".

Fixes: 1d8cf47e5b ("tests: run 'device-crash-test' from tests/venv")
Signed-off-by: Thomas Huth 
---
 .gitlab-ci.d/buildtest.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.d/buildtest.yml b/.gitlab-ci.d/buildtest.yml
index ba6f551752..333eea9dd3 100644
--- a/.gitlab-ci.d/buildtest.yml
+++ b/.gitlab-ci.d/buildtest.yml
@@ -102,7 +102,7 @@ crash-test-debian:
 IMAGE: debian-amd64
   script:
 - cd build
-- make check-venv
+- make NINJA=":" check-venv
 - tests/venv/bin/python3 scripts/device-crash-test -q ./qemu-system-i386
 
 build-system-fedora:
@@ -145,7 +145,7 @@ crash-test-fedora:
 IMAGE: fedora
   script:
 - cd build
-- make check-venv
+- make NINJA=":" check-venv
 - tests/venv/bin/python3 scripts/device-crash-test -q ./qemu-system-ppc
 - tests/venv/bin/python3 scripts/device-crash-test -q ./qemu-system-riscv32
 
-- 
2.31.1

[PATCH 0/2] Improvements for the device-crash-test jobs

Improve the runtime of the device-crash-test jobs by avoiding
to run "configure" again and by forcing to test with TCG only
(instead of testing twice, with TCG and KVM).

Thomas Huth (2):
  gitlab-ci: Avoid to re-run "configure" in the device-crash-test jobs
  scripts/device-crash-test: Add a parameter to run with TCG only

 .gitlab-ci.d/buildtest.yml | 6 +++---
 scripts/device-crash-test  | 4 +++-
 2 files changed, 6 insertions(+), 4 deletions(-)

-- 
2.31.1

[PATCH 2/2] scripts/device-crash-test: Add a parameter to run with TCG only

We're currently facing the problem that the device-crash-test script
runs twice as long in the CI when a runner supports KVM - which sometimes
results in a timeout of the CI job. To get a more deterministic runtime
here, add an option to the script that allows to run it with TCG only.

Reported-by: Eldon Stegall 
Signed-off-by: Thomas Huth 
---
 .gitlab-ci.d/buildtest.yml | 2 +-
 scripts/device-crash-test  | 4 +++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.gitlab-ci.d/buildtest.yml b/.gitlab-ci.d/buildtest.yml
index 333eea9dd3..bb3650a51c 100644
--- a/.gitlab-ci.d/buildtest.yml
+++ b/.gitlab-ci.d/buildtest.yml
@@ -103,7 +103,7 @@ crash-test-debian:
   script:
 - cd build
 - make NINJA=":" check-venv
-- tests/venv/bin/python3 scripts/device-crash-test -q ./qemu-system-i386
+- tests/venv/bin/python3 scripts/device-crash-test -q --tcg-only 
./qemu-system-i386
 
 build-system-fedora:
   extends:
diff --git a/scripts/device-crash-test b/scripts/device-crash-test
index 73bcb98693..b74d887331 100755
--- a/scripts/device-crash-test
+++ b/scripts/device-crash-test
@@ -397,7 +397,7 @@ def binariesToTest(args, testcase):
 
 
 def accelsToTest(args, testcase):
-if getBinaryInfo(args, testcase['binary']).kvm_available:
+if getBinaryInfo(args, testcase['binary']).kvm_available and not 
args.tcg_only:
 yield 'kvm'
 yield 'tcg'
 
@@ -510,6 +510,8 @@ def main():
 help="Full mode: test cases that are expected to fail")
 parser.add_argument('--strict', action='store_true', dest='strict',
 help="Treat all warnings as fatal")
+parser.add_argument('--tcg-only', action='store_true', dest='tcg_only',
+help="Only test with TCG accelerator")
 parser.add_argument('qemu', nargs='*', metavar='QEMU',
 help='QEMU binary to run')
 args = parser.parse_args()
-- 
2.31.1

Re: [PATCH 2/2] chardev: Allow setting file chardev input file on the command line


On 13/04/2023 17.07, Peter Maydell wrote:

Our 'file' chardev backend supports both "output from this chardev
is written to a file" and "input from this chardev should be read
from a file" (except on Windows). However, you can only set up
the input file if you're using the QMP interface -- there is no
command line syntax to do it.

Add command line syntax to allow specifying an input file
as well as an output file, using a new 'input-path' suboption.

The specific use case I have is that I'd like to be able to
feed fuzzer reproducer input into qtest without having to use
'-qtest stdio' and put the input onto stdin. Being able to
use a file chardev like this:
  -chardev file,id=repro,path=/dev/null,input-path=repro.txt -qtest 
chardev:repro
means that stdio is free for use by gdb.

Signed-off-by: Peter Maydell 
---

...

diff --git a/qemu-options.hx b/qemu-options.hx
index 59bdf67a2c5..31d08c60264 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3360,7 +3360,7 @@ DEF("chardev", HAS_ARG, QEMU_OPTION_chardev,
  "-chardev 
vc,id=id[[,width=width][,height=height]][[,cols=cols][,rows=rows]]\n"
  " [,mux=on|off][,logfile=PATH][,logappend=on|off]\n"
  "-chardev ringbuf,id=id[,size=size][,logfile=PATH][,logappend=on|off]\n"
-"-chardev 
file,id=id,path=path[,mux=on|off][,logfile=PATH][,logappend=on|off]\n"
+"-chardev 
file,id=id,path=path[,input-file=input-file][,mux=on|off][,logfile=PATH][,logappend=on|off]\n"


s/input-file=/input-path=/

 Thomas

Re: [PATCH 1/2] qtest: Don't assert on "-qtest chardev:myid"


On 13/04/2023 17.07, Peter Maydell wrote:

If the -qtest command line argument is passed a string that says
"use this chardev for I/O", then it will assert:

$ ./build/clang/qemu-system-i386 -chardev file,path=/dev/null,id=myid -qtest 
chardev:myid
Unexpected error in qtest_set_chardev() at ../../softmmu/qtest.c:1011:
qemu-system-i386: Cannot find character device 'qtest'
Aborted (core dumped)

This is because in qtest_server_init() we assume that when we create
the chardev with qemu_chr_new() it will always have the name "qtest".
This is true if qemu_chr_new() had to create a new chardev, but not
true if one already existed and is being referred to with
"chardev:myid".

Use the name of the chardev we get back from qemu_chr_new() as the
string to set the qtest 'chardev' property to, instead of hardcoding
it to "qtest".

Signed-off-by: Peter Maydell 
---
  softmmu/qtest.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/softmmu/qtest.c b/softmmu/qtest.c
index 34bd2a33a76..26852996b5b 100644
--- a/softmmu/qtest.c
+++ b/softmmu/qtest.c
@@ -867,7 +867,7 @@ void qtest_server_init(const char *qtest_chrdev, const char 
*qtest_log, Error **
  }
  
  qtest = object_new(TYPE_QTEST);

-object_property_set_str(qtest, "chardev", "qtest", _abort);
+object_property_set_str(qtest, "chardev", chr->label, _abort);
  if (qtest_log) {
  object_property_set_str(qtest, "log", qtest_log, _abort);
  }


Reviewed-by: Thomas Huth

[PATCH] accel/tcg/tcg-accel-ops-rr: ensure fairness with icount

2023-04-14 Thread Jamie Iles

The round-robin scheduler will iterate over the CPU list with an
assigned budget until the next timer expiry and may exit early because
of a TB exit.  This is fine under normal operation but with icount
enabled and SMP it is possible for a CPU to be starved of run time and
the system live-locks.

For example, booting a riscv64 platform with '-icount
shift=0,align=off,sleep=on -smp 2' we observe a livelock once the kernel
has timers enabled and starts performing TLB shootdowns.  In this case
we have CPU 0 in M-mode with interrupts disabled sending an IPI to CPU
1.  As we enter the TCG loop, we assign the icount budget to next timer
interrupt to CPU 0 and begin executing where the guest is sat in a busy
loop exhausting all of the budget before we try to execute CPU 1 which
is the target of the IPI but CPU 1 is left with no budget with which to
execute and the process repeats.

We try here to add some fairness by splitting the budget across all of
the CPUs on the thread fairly before entering each one.  The CPU count
is cached on CPU list generation ID to avoid iterating the list on each
loop iteration.  With this change it is possible to boot an SMP rv64
guest with icount enabled and no hangs.

Signed-off-by: Jamie Iles 
---
 accel/tcg/tcg-accel-ops-icount.c | 16 ++--
 accel/tcg/tcg-accel-ops-icount.h |  3 ++-
 accel/tcg/tcg-accel-ops-rr.c | 26 +-
 3 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/accel/tcg/tcg-accel-ops-icount.c b/accel/tcg/tcg-accel-ops-icount.c
index 84cc7421be88..a7ffc8a68bad 100644
--- a/accel/tcg/tcg-accel-ops-icount.c
+++ b/accel/tcg/tcg-accel-ops-icount.c
@@ -89,7 +89,19 @@ void icount_handle_deadline(void)
 }
 }
 
-void icount_prepare_for_run(CPUState *cpu)
+int64_t icount_cpu_timeslice(int cpu_count)
+{
+int64_t limit = icount_get_limit();
+int64_t timeslice = limit / cpu_count;
+
+if (timeslice == 0) {
+timeslice = limit;
+}
+
+return timeslice;
+}
+
+void icount_prepare_for_run(CPUState *cpu, int64_t timeslice)
 {
 int insns_left;
 
@@ -101,7 +113,7 @@ void icount_prepare_for_run(CPUState *cpu)
 g_assert(cpu_neg(cpu)->icount_decr.u16.low == 0);
 g_assert(cpu->icount_extra == 0);
 
-cpu->icount_budget = icount_get_limit();
+cpu->icount_budget = MIN(icount_get_limit(), timeslice);
 insns_left = MIN(0x, cpu->icount_budget);
 cpu_neg(cpu)->icount_decr.u16.low = insns_left;
 cpu->icount_extra = cpu->icount_budget - insns_left;
diff --git a/accel/tcg/tcg-accel-ops-icount.h b/accel/tcg/tcg-accel-ops-icount.h
index 1b6fd9c60751..e8785a0e196d 100644
--- a/accel/tcg/tcg-accel-ops-icount.h
+++ b/accel/tcg/tcg-accel-ops-icount.h
@@ -11,7 +11,8 @@
 #define TCG_ACCEL_OPS_ICOUNT_H
 
 void icount_handle_deadline(void);
-void icount_prepare_for_run(CPUState *cpu);
+void icount_prepare_for_run(CPUState *cpu, int64_t timeslice);
+int64_t icount_cpu_timeslice(int cpu_count);
 void icount_process_data(CPUState *cpu);
 
 void icount_handle_interrupt(CPUState *cpu, int mask);
diff --git a/accel/tcg/tcg-accel-ops-rr.c b/accel/tcg/tcg-accel-ops-rr.c
index 290833a37fb2..bccb3670a656 100644
--- a/accel/tcg/tcg-accel-ops-rr.c
+++ b/accel/tcg/tcg-accel-ops-rr.c
@@ -139,6 +139,25 @@ static void rr_force_rcu(Notifier *notify, void *data)
 rr_kick_next_cpu();
 }
 
+static int rr_cpu_count(void)
+{
+static unsigned int last_gen_id = ~0;
+static int cpu_count;
+CPUState *cpu;
+
+cpu_list_lock();
+if (cpu_list_generation_id_get() != last_gen_id) {
+cpu_count = 0;
+CPU_FOREACH(cpu) {
+++cpu_count;
+}
+last_gen_id = cpu_list_generation_id_get();
+}
+cpu_list_unlock();
+
+return cpu_count;
+}
+
 /*
  * In the single-threaded case each vCPU is simulated in turn. If
  * there is more than a single vCPU we create a simple timer to kick
@@ -185,6 +204,9 @@ static void *rr_cpu_thread_fn(void *arg)
 cpu->exit_request = 1;
 
 while (1) {
+int cpu_count = rr_cpu_count();
+int64_t icount_timeslice = INT64_MAX;
+
 qemu_mutex_unlock_iothread();
 replay_mutex_lock();
 qemu_mutex_lock_iothread();
@@ -197,6 +219,8 @@ static void *rr_cpu_thread_fn(void *arg)
  * waking up the I/O thread and waiting for completion.
  */
 icount_handle_deadline();
+
+icount_timeslice = icount_cpu_timeslice(cpu_count);
 }
 
 replay_mutex_unlock();
@@ -218,7 +242,7 @@ static void *rr_cpu_thread_fn(void *arg)
 
 qemu_mutex_unlock_iothread();
 if (icount_enabled()) {
-icount_prepare_for_run(cpu);
+icount_prepare_for_run(cpu, icount_timeslice);
 }
 r = tcg_cpus_exec(cpu);
 if (icount_enabled()) {
-- 
2.25.1

Re: [PATCH 09/40] Fix references to igb Avocado test


On 14/4/23 13:37, Akihiko Odaki wrote:

Fixes: 9f95111474 ("tests/avocado: re-factor igb test to avoid timeouts")
Signed-off-by: Akihiko Odaki 
---
  MAINTAINERS| 2 +-
  docs/system/devices/igb.rst| 2 +-
  scripts/ci/org.centos/stream/8/x86_64/test-avocado | 2 +-
  3 files changed, 3 insertions(+), 3 deletions(-)


Oops.

Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 08/40] igb: Always copy ethernet header


On 14/4/23 13:37, Akihiko Odaki wrote:

igb_receive_internal() used to check the iov length to determine
copy the iovs to a contiguous buffer, but the check is flawed in two
ways:
- It does not ensure that iovcnt > 0.
- It does not take virtio-net header into consideration.

The size of this copy is just 22 octets, which can be even less than
the code size required for checks. This (wrong) optimization is probably
not worth so just remove it. Removing this also allows igb to assume
aligned accesses for the ethernet header.

Fixes: 3a977deebe ("Intrdocue igb device emulation")
Signed-off-by: Akihiko Odaki 
---
  hw/net/igb_core.c | 39 +--
  1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 53f60fc3d3..1d188b526c 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c




-static uint16_t igb_receive_assign(IGBCore *core, const struct eth_header 
*ehdr,
+static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header,
 size_t size, E1000E_RSSInfo *rss_info,
 bool *external_tx)
  {
  static const int ta_shift[] = { 4, 3, 2, 0 };
+const struct eth_header *ehdr = _header->eth;
  uint32_t f, ra[2], *macp, rctl = core->mac[RCTL];
  uint16_t queues = 0;
  uint16_t oversized = 0;
-uint16_t vid = lduw_be_p(_GET_VLAN_HDR(ehdr)->h_tci) & VLAN_VID_MASK;
+uint16_t vid = be16_to_cpu(l2_header->vlan[0].h_tci) & VLAN_VID_MASK;


Why this API change? Are we certain tci is aligned in host memory?

Re: [PATCH 04/40] igb: Include the second VLAN tag in the buffer


On 14/4/23 16:32, Philippe Mathieu-Daudé wrote:

On 14/4/23 16:28, Philippe Mathieu-Daudé wrote:

On 14/4/23 13:37, Akihiko Odaki wrote:

Signed-off-by: Akihiko Odaki 
---
  hw/net/igb_core.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 55de212447..f725ab97ae 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1590,7 +1590,7 @@ static ssize_t
  igb_receive_internal(IGBCore *core, const struct iovec *iov, int 
iovcnt,

   bool has_vnet, bool *external_tx)
  {
-    static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4);
+    static const int maximum_ethernet_hdr_len = (ETH_HLEN + 8);


Aren't VLAN tags 16-bit wide? Could you convert this magic value
to some verbose-but-obvious definitions?


Digging a bit more, is this struct vlan_header?


And now I see in patch #08 "igb: Always copy ethernet header":

  +typedef struct L2Header {
  +struct eth_header eth;
  +struct vlan_header vlan[2];
  +} L2Header;

Maybe add it first, and use sizeof(L2Header) here directly?

Re: [PATCH 04/40] igb: Include the second VLAN tag in the buffer


On 14/4/23 16:28, Philippe Mathieu-Daudé wrote:

On 14/4/23 13:37, Akihiko Odaki wrote:

Signed-off-by: Akihiko Odaki 
---
  hw/net/igb_core.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 55de212447..f725ab97ae 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1590,7 +1590,7 @@ static ssize_t
  igb_receive_internal(IGBCore *core, const struct iovec *iov, int 
iovcnt,

   bool has_vnet, bool *external_tx)
  {
-    static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4);
+    static const int maximum_ethernet_hdr_len = (ETH_HLEN + 8);


Aren't VLAN tags 16-bit wide? Could you convert this magic value
to some verbose-but-obvious definitions?


Digging a bit more, is this struct vlan_header?


Is it worth adding a vlan_tag_t typedef in include/net/eth.h?


  uint16_t queues = 0;
  uint32_t n = 0;

Re: [PATCH 04/40] igb: Include the second VLAN tag in the buffer


On 14/4/23 13:37, Akihiko Odaki wrote:

Signed-off-by: Akihiko Odaki 
---
  hw/net/igb_core.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 55de212447..f725ab97ae 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1590,7 +1590,7 @@ static ssize_t
  igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt,
   bool has_vnet, bool *external_tx)
  {
-static const int maximum_ethernet_hdr_len = (ETH_HLEN + 4);
+static const int maximum_ethernet_hdr_len = (ETH_HLEN + 8);


Aren't VLAN tags 16-bit wide? Could you convert this magic value
to some verbose-but-obvious definitions?

Is it worth adding a vlan_tag_t typedef in include/net/eth.h?


  uint16_t queues = 0;
  uint32_t n = 0;

Re: [PATCH 01/40] hw/net/net_tx_pkt: Decouple from PCI


On 14/4/23 13:36, Akihiko Odaki wrote:

This also fixes the leak of memory mapping when the specified memory is
partially mapped.

Fixes: e263cd49c7 ("Packet abstraction for VMWARE network devices")
Signed-off-by: Akihiko Odaki 
---
  hw/net/net_tx_pkt.c  | 65 +++-
  hw/net/net_tx_pkt.h  | 38 +++---


Preferably split the patch in at least 2, first the back-end,
then the front-ends.

Also consider installing scripts/git.orderfile when posting
API changes, as this eases email review workflow (no need to
scroll up/down frenetically to follow).


  hw/net/e1000e_core.c | 13 +
  hw/net/igb_core.c| 13 -



  hw/net/vmxnet3.c | 14 +-
  5 files changed, 83 insertions(+), 60 deletions(-)

Re: [PATCH 2/2] chardev: Allow setting file chardev input file on the command line

On Fri, 14 Apr 2023 at 15:03, Philippe Mathieu-Daudé  wrote:
>
> On 13/4/23 17:07, Peter Maydell wrote:
> > --- a/qemu-options.hx
> > +++ b/qemu-options.hx
>
>
> > -``-chardev file,id=id,path=path``
> > +``-chardev file,id=id,path=path[,input-path=input-path]``
> >   Log all traffic received from the guest to a file.
> >
> >   ``path`` specifies the path of the file to be opened. This file will
> >   be created if it does not already exist, and overwritten if it does.
> >   ``path`` is required.
>
> I find "path" vs. "input-path" confusing and would rather rename it as
> "output-path" for consistency; or at least add an alias.
> Possibly deprecating the "path" alias. Maybe matter of taste...

The much more common use is the preexisting one of "write the
output to the file". I don't particularly want to break all
the uses of that just because we added this option.

thanks
-- PMM

Re: [RFC PATCH v2 02/10] tests: add python3-venv dependency


On 14/4/23 07:54, John Snow wrote:

Several debian-based tests need the python3-venv dependency as a
consequence of Debian debundling the "ensurepip" module normally
included with Python.

As mkvenv.py stands as of this commit, Debian requires EITHER:

(A) setuptools and pip, or
(B) ensurepip

mkvenv is a few seconds faster if you have setuptools and pip, so
developers should prefer the first requirement. For the purposes of CI,
the time-save is a wash; it's only a matter of who is responsible for
installing pip and when; the timing is about the same.

Arbitrarily, I chose adding ensurepip to the test configuration because
it is normally part of the Python stdlib, and always having it allows us
a more consistent cross-platform environment.

Signed-off-by: John Snow 
---
  tests/docker/dockerfiles/debian-all-test-cross.docker | 3 ++-
  tests/docker/dockerfiles/debian-hexagon-cross.docker  | 3 ++-
  tests/docker/dockerfiles/debian-riscv64-cross.docker  | 3 ++-
  tests/docker/dockerfiles/debian-tricore-cross.docker  | 3 ++-
  4 files changed, 8 insertions(+), 4 deletions(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 2/2] chardev: Allow setting file chardev input file on the command line


On 13/4/23 17:07, Peter Maydell wrote:

Our 'file' chardev backend supports both "output from this chardev
is written to a file" and "input from this chardev should be read
from a file" (except on Windows). However, you can only set up
the input file if you're using the QMP interface -- there is no
command line syntax to do it.

Add command line syntax to allow specifying an input file
as well as an output file, using a new 'input-path' suboption.

The specific use case I have is that I'd like to be able to
feed fuzzer reproducer input into qtest without having to use
'-qtest stdio' and put the input onto stdin. Being able to
use a file chardev like this:
  -chardev file,id=repro,path=/dev/null,input-path=repro.txt -qtest 
chardev:repro
means that stdio is free for use by gdb.

Signed-off-by: Peter Maydell 
---
The "not on Windows" ifdeffery is because qmp_chardev_open_file()
does something similar; it seems likely to produce a nicer
error message to catch it at parse time rather than open time.
---
  chardev/char-file.c |  8 
  chardev/char.c  |  3 +++
  qemu-options.hx | 10 --
  3 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/chardev/char-file.c b/chardev/char-file.c
index 3a7b9caf6f0..263e6da5636 100644
--- a/chardev/char-file.c
+++ b/chardev/char-file.c
@@ -100,6 +100,7 @@ static void qemu_chr_parse_file_out(QemuOpts *opts, 
ChardevBackend *backend,
  Error **errp)
  {
  const char *path = qemu_opt_get(opts, "path");
+const char *inpath = qemu_opt_get(opts, "input-path");




  file->out = g_strdup(path);
+file->in = g_strdup(inpath);




diff --git a/chardev/char.c b/chardev/char.c
index e69390601fc..661ad8176a9 100644
--- a/chardev/char.c
+++ b/chardev/char.c
@@ -805,6 +805,9 @@ QemuOptsList qemu_chardev_opts = {
  },{
  .name = "path",
  .type = QEMU_OPT_STRING,
+},{
+.name = "input-path",
+.type = QEMU_OPT_STRING,
  },{




diff --git a/qemu-options.hx b/qemu-options.hx
index 59bdf67a2c5..31d08c60264 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx




-``-chardev file,id=id,path=path``
+``-chardev file,id=id,path=path[,input-path=input-path]``
  Log all traffic received from the guest to a file.
  
  ``path`` specifies the path of the file to be opened. This file will

  be created if it does not already exist, and overwritten if it does.
  ``path`` is required.


I find "path" vs. "input-path" confusing and would rather rename it as
"output-path" for consistency; or at least add an alias.
Possibly deprecating the "path" alias. Maybe matter of taste...

Can be done as follow-up, so:
Reviewed-by: Philippe Mathieu-Daudé 


+If ``input-path`` is specified, this is the path of a second file
+which will be used for input. If ``input-path`` is not specified,
+no input will be available from the chardev.
+
+Note that ``input-path`` is not supported on Windows hosts.

Re: [PATCH] rtl8139: fix large_send_mss divide-by-zero


On 13/4/23 19:19, Stefan Hajnoczi wrote:

If the driver sets large_send_mss to 0 then a divide-by-zero occurs.
Even if the division wasn't a problem, the for loop that emits MSS-sized
packets would never terminate.

Solve these issues by skipping offloading when large_send_mss=0.

This issue was found by OSS-Fuzz as part of Alexander Bulekov's device
fuzzing work. The reproducer is:

   $ cat << EOF | ./qemu-system-i386 -display none -machine accel=qtest, -m \
   512M,slots=1,maxmem=0x -machine q35 -nodefaults -device \
   rtl8139,netdev=net0 -netdev user,id=net0 -device \
   pc-dimm,id=nv1,memdev=mem1,addr=0xb800a6460280 -object \
   memory-backend-ram,id=mem1,size=2M  -qtest stdio
   outl 0xcf8 0x8814
   outl 0xcfc 0xe000
   outl 0xcf8 0x8804
   outw 0xcfc 0x06
   write 0xe037 0x1 0x04
   write 0xe0e0 0x2 0x01
   write 0x1 0x1 0x04
   write 0x3 0x1 0x98
   write 0xa 0x1 0x8c
   write 0xb 0x1 0x02
   write 0xc 0x1 0x46
   write 0xd 0x1 0xa6
   write 0xf 0x1 0xb8
   write 0xb800a646028c000c 0x1 0x08
   write 0xb800a646028c000e 0x1 0x47
   write 0xb800a646028c0010 0x1 0x02
   write 0xb800a646028c0017 0x1 0x06
   write 0xb800a646028c0036 0x1 0x80
   write 0xe0d9 0x1 0x40
   EOF

Buglink: https://gitlab.com/qemu-project/qemu/-/issues/1582
Fixes: 6d71357a3b65 ("rtl8139: honor large send MSS value")
Reported-by: Alexander Bulekov 
Cc: Peter Maydell 
Signed-off-by: Stefan Hajnoczi 
---
  hw/net/rtl8139.c | 3 +++
  1 file changed, 3 insertions(+)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH] hw/core: Move numa.c into the target independent source set


On 13/4/23 20:26, Thomas Huth wrote:

There is nothing that depends on target specific macros in this
file, so we can move it to the common source set to avoid that
we have to compile this file multiple times (one time for each
target).

Signed-off-by: Thomas Huth 
---
  hw/core/meson.build | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)


Reviewed-by: Philippe Mathieu-Daudé

Re: [PATCH 0/3] SDL2 usability fixes

2023-04-14 Thread Bernhard Beschow




Am 14. April 2023 06:53:18 UTC schrieb "Volker Rümelin" :
>Am 13.04.23 um 22:43 schrieb Bernhard Beschow:
>> 
>> Am 13. April 2023 17:54:34 UTC schrieb "Volker Rümelin" 
>> :
 I'm trying to use QEMU on Windows hosts for fun and for profit. While the 
 GTK
 GUI doesn't seem to support OpenGL under Windows the SDL2 GUI does. Hence I
 used the SDL2 GUI where I ran into several issues of which three are fixed 
 in
 this series, which are:
 
 * Alt+Tab switches tasks on the host rather than in the guest in 
 fullscreen mode
 * Alt+F4 closes QEMU rather than a graphical task in the guest
 * AltGr keyboard modifier isn't recognized by a Linux guest
 
 More information about each issue is provided in the patches.
 
 Bernhard Beschow (3):
 ui/sdl2: Grab Alt+Tab also in fullscreen mode
 ui/sdl2: Grab Alt+F4 also under Windows
 ui/sdl2-input: Fix AltGr modifier on Windows hosts
 
ui/sdl2-input.c | 13 +
ui/sdl2.c   |  2 ++
2 files changed, 15 insertions(+)
 
>>> Hi Bernhard,
>> Hi Volker,
>> 
>>> I don't think these patches are necessary. The AltGr key and the keyboard 
>>> grab was fixed in 2020 with commit 830473455f ("ui/sdl2: fix handling of 
>>> AltGr key on Windows") and a few commits before.
>> Indeed, this patch addresses the AltGr issue. What I noticed in my case is 
>> that the AltGr behavior is different, depending on whether the *guest* is in 
>> graphics mode or not. Pressing AltGr in graphics mode issues two key 
>> modifiers while only one is issued when the guest is in text mode. I'll 
>> recheck tomorrow when I have access to a Windows host.
>
>Hi Bernhard,

Hi Volker,

>
>the AltGr behavior depends on the keyboard grab. The AltGr key works without 
>keyboard grabbed and it doesn't with keyboard grabbed. That's a bug.

Interesting. The keyboard is grabbed automatically for some reason when the 
guest enters graphics mode. Together with what you describe this could explain 
the difference in behavior I'm seeing.

Best regards,
Bernhard
>
>> 
>> What about the other two issues? My other two patches override SDL2 defaults 
>> which aren't addressed yet in QEMU AFAICS. The Alt+Tab one isn't even 
>> Windows-specific.
>> 
>>> Something broke in the last few weeks. At the moment my Linux guest fails 
>>> to start on Windows with -display sdl. QEMU locks up a short time after the 
>>> Linux kernel starts.
>> This doesn't happen for me with 8.0rc4 and latest msys2 environment. I'm 
>> running with `-accel whpx -vga none -device virtio-vga-gl -display 
>> sdl,gl=on` and I even get decent OpenGL accelleration when the Linux guest 
>> is in graphics mode, with wobbly windows etc. Sometimes QEMU aborts when it 
>> can't map some OpenGL stuff when the guest enters graphics mode but once 
>> that succeeds it runs absolutely stable.
>> 
>>> I'll try to find the commit that caused this regression.
>
>I tested with QEMU 7.1 and sometimes it also locks up. This indicates that the 
>QEMU code doesn't have an issue.
>
>With best regards,
>Volker
>
>> Yes, this would be interesting.
>> 
>> Best regards,
>> Bernhard
>> 
>>> With best regards,
>>> Volker
>

Re: [RFC PATCH] migration: Handle block device inactivation failures better

2023-04-14 Thread Eric Blake

On Fri, Apr 14, 2023 at 02:15:45PM +0200, Juan Quintela wrote:
> Eric Blake  wrote:
> > Consider what happens when performing a migration between two host
> > machines connected to an NFS server serving multiple block devices to
> > the guest, when the NFS server becomes unavailable.  The migration
> > attempts to inactivate all block devices on the source (a necessary
> > step before the destination can take over); but if the NFS server is
> > non-responsive, the attempt to inactivate can itself fail.  When that
> > happens, the destination fails to get the migrated guest (good,
> > because the source wasn't able to flush everything properly):
> >
> >   (qemu) qemu-kvm: load of migration failed: Input/output error
> >
> > at which point, our only hope for the guest is for the source to take
> > back control.  With the current code base, the host outputs a message, but 
> > then appears to resume:
> >
> >   (qemu) qemu-kvm: qemu_savevm_state_complete_precopy_non_iterable: 
> > bdrv_inactivate_all() failed (-1)
> >
> >   (src qemu)info status
> >VM status: running
> >
> > but a second migration attempt now asserts:
> >
> >   (src qemu) qemu-kvm: ../block.c:6738: int 
> > bdrv_inactivate_recurse(BlockDriverState *): Assertion `!(bs->open_flags & 
> > BDRV_O_INACTIVE)' failed.
> >
> > Whether the guest is recoverable on the source after the first failure
> > is debatable, but what we do not want is to have qemu itself fail due
> > to an assertion.  It looks like the problem is as follows:
> >
> > In migration.c:migration_completion(), the source sets 'inactivate' to
> > true (since COLO is not enabled), then tries
> > savevm.c:qemu_savevm_state_complete_precopy() with a request to
> > inactivate block devices.  In turn, this calls
> > block.c:bdrv_inactivate_all(), which fails when flushing runs up
> > against the non-responsive NFS server.  With savevm failing, we are
> > now left in a state where some, but not all, of the block devices have
> > been inactivated; the 'fail_invalidate' label of
> > migration_completion() then wants to reclaim those disks by calling
> > bdrv_activate_all() - but this too can fail, yet nothing takes note of
> > that failure.
> >
> > Thus, we have reached a state where the migration engine has forgotten
> > all state about whether a block device is inactive, because we did not
> > set s->block_inactive; so migration allows the source to reach
> > vm_start() and resume execution, violating the block layer invariant
> > that the guest CPUs should not be restarted while a device is
> > inactive.  Note that the code in migration.c:migrate_fd_cancel() will
> > also try to reactivate all block devices if s->block_inactive was set,
> > but because we failed to set that flag after the first failure, the
> > source assumes it has reclaimed all devices, even though it still has
> > remaining inactivated devices and does not try again.  Normally,
> > qmp_cont() will also try to reactivate all disks (or correctly fail if
> > the disks are not reclaimable because NFS is not yet back up), but the
> > auto-resumption of the source after a migration failure does not go
> > through qmp_cont().  And because we have left the block layer in an
> > inconsistent state with devices still inactivated, the later migration
> > attempt is hitting the assertion failure.
> >
> > Since it is important to not resume the source with inactive disks,
> > this patch tries harder at tracking whether migration attempted to
> > inactivate any devices, in order to prevent any vm_start() until it
> > has successfully reactivated all devices.
> >
> > See also https://bugzilla.redhat.com/show_bug.cgi?id=2058982
> >
> > Signed-off-by: Eric Blake 
> 
> Wow
> 
> Such a big comment for such a small patch.
> And then people think that there is not "nuance" anymore :-)

It took me a while to figure out the control paths - it is not obvious
that not all migration paths attempt to deactivate images, and
therefore not all paths need to try to reactivate them.  Yes, this
commit message is proof that our code base is complex!

> > @@ -3518,6 +3519,7 @@ fail_invalidate:
> >  bdrv_activate_all(_err);
> >  if (local_err) {
> >  error_report_err(local_err);
> > +s->block_inactive = inactivate;
> 
> Why not just put here:
> 
> s->block_inactive = true;
> 
> And call it a day?

Interesting idea.  Note that it is possible to reach the
fail_invalidate label WITHOUT having first tried to invalidate disk
images (for example, if s->state != MIGRATION_STATUS_ACTIVE on entry,
but qemu_file_get_error() fails) - but then again, in the
fail_invalidate label, we only attempt to reactivate if s->state is
MIGRATION_STATUS_ACTIVE or MIGRATION_STATUS_DEVICE (the latter
possible right before we attempt to inactivate devices above).

> 
> if bdrv_activate_all() fails, we can't continue anyways.

Good point.  And I think it points out yet another problem: if
qemu_savevm_state_complete_precopy() fails, we 'goto

Re: [PATCH 0/3] SDL2 usability fixes

2023-04-14 Thread Bernhard Beschow




Am 13. April 2023 21:40:29 UTC schrieb "Volker Rümelin" :
>Am 13.04.23 um 22:43 schrieb Bernhard Beschow:
>> 
>> Am 13. April 2023 17:54:34 UTC schrieb "Volker Rümelin" 
>> :
 I'm trying to use QEMU on Windows hosts for fun and for profit. While the 
 GTK
 GUI doesn't seem to support OpenGL under Windows the SDL2 GUI does. Hence I
 used the SDL2 GUI where I ran into several issues of which three are fixed 
 in
 this series, which are:
 
 * Alt+Tab switches tasks on the host rather than in the guest in 
 fullscreen mode
 * Alt+F4 closes QEMU rather than a graphical task in the guest
 * AltGr keyboard modifier isn't recognized by a Linux guest
 
 More information about each issue is provided in the patches.
 
 Bernhard Beschow (3):
 ui/sdl2: Grab Alt+Tab also in fullscreen mode
 ui/sdl2: Grab Alt+F4 also under Windows
 ui/sdl2-input: Fix AltGr modifier on Windows hosts
 
ui/sdl2-input.c | 13 +
ui/sdl2.c   |  2 ++
2 files changed, 15 insertions(+)
 
>>> Hi Bernhard,
>> Hi Volker,
>> 
>>> I don't think these patches are necessary. The AltGr key and the keyboard 
>>> grab was fixed in 2020 with commit 830473455f ("ui/sdl2: fix handling of 
>>> AltGr key on Windows") and a few commits before.
>> Indeed, this patch addresses the AltGr issue. What I noticed in my case is 
>> that the AltGr behavior is different, depending on whether the *guest* is in 
>> graphics mode or not. Pressing AltGr in graphics mode issues two key 
>> modifiers while only one is issued when the guest is in text mode. I'll 
>> recheck tomorrow when I have access to a Windows host.
>> 
>> What about the other two issues? My other two patches override SDL2 defaults 
>> which aren't addressed yet in QEMU AFAICS. The Alt+Tab one isn't even 
>> Windows-specific.
>
>Hi Bernhard,

Hi Volker,

>
>the keyboard behavior on Windows and Linux is identical. With the QEMU window 
>activated and keyboard not grabbed, those key combos like Alt-Tab or Alt-F4 
>are sent to the host. With the QEMU window activated and keyboard grabbed they 
>are sent to the guest.

That's the behavior I'd expect. I've confirmed today with 8.0rc4 that patches 1 
and 2 are needed to achieve this. I'm using a Windows 11 host and I'm compiling 
under msys2 on the same Windows host.

Best regards,
Bernhard

>I'm not so sure if this should be changed only for SDL on Windows.
>
>With best regards,
>Volker
>
>>> Something broke in the last few weeks. At the moment my Linux guest fails 
>>> to start on Windows with -display sdl. QEMU locks up a short time after the 
>>> Linux kernel starts.
>> This doesn't happen for me with 8.0rc4 and latest msys2 environment. I'm 
>> running with `-accel whpx -vga none -device virtio-vga-gl -display 
>> sdl,gl=on` and I even get decent OpenGL accelleration when the Linux guest 
>> is in graphics mode, with wobbly windows etc. Sometimes QEMU aborts when it 
>> can't map some OpenGL stuff when the guest enters graphics mode but once 
>> that succeeds it runs absolutely stable.
>> 
>>> I'll try to find the commit that caused this regression.
>> Yes, this would be interesting.
>> 
>> Best regards,
>> Bernhard
>> 
>>> With best regards,
>>> Volker
>

Re: [RFC 1/1] add support of `--initrd` for ELF-ARM kernels

2023-04-14 Thread Lankes, Stefan


> Am 14.04.2023 um 10:54 schrieb Alex Bennée :
> 

Hello Alex,

> 
> Where are these DTB nodes documented?

Yes, it is currently missing.

> 
> Also could you not achieve the same thing using the guest-loader which
> uses the multiboot spec and sets:
> 
>const char *compat[2] = { "multiboot,module", "multiboot,ramdisk" };
>if (qemu_fdt_setprop_string_array(fdt, node, "compatible",
>  (char **) ,
>  ARRAY_SIZE(compat)) < 0) {
>error_setg(errp, "couldn't set %s/compatible", node);
>return;
>}
> 

Thanks for the hint. I will check it.

Cheers

Stefan



smime.p7s
Description: S/MIME cryptographic signature

Re: [RFC PATCH] migration: Handle block device inactivation failures better

2023-04-14 Thread Juan Quintela

Eric Blake  wrote:
> Consider what happens when performing a migration between two host
> machines connected to an NFS server serving multiple block devices to
> the guest, when the NFS server becomes unavailable.  The migration
> attempts to inactivate all block devices on the source (a necessary
> step before the destination can take over); but if the NFS server is
> non-responsive, the attempt to inactivate can itself fail.  When that
> happens, the destination fails to get the migrated guest (good,
> because the source wasn't able to flush everything properly):
>
>   (qemu) qemu-kvm: load of migration failed: Input/output error
>
> at which point, our only hope for the guest is for the source to take
> back control.  With the current code base, the host outputs a message, but 
> then appears to resume:
>
>   (qemu) qemu-kvm: qemu_savevm_state_complete_precopy_non_iterable: 
> bdrv_inactivate_all() failed (-1)
>
>   (src qemu)info status
>VM status: running
>
> but a second migration attempt now asserts:
>
>   (src qemu) qemu-kvm: ../block.c:6738: int 
> bdrv_inactivate_recurse(BlockDriverState *): Assertion `!(bs->open_flags & 
> BDRV_O_INACTIVE)' failed.
>
> Whether the guest is recoverable on the source after the first failure
> is debatable, but what we do not want is to have qemu itself fail due
> to an assertion.  It looks like the problem is as follows:
>
> In migration.c:migration_completion(), the source sets 'inactivate' to
> true (since COLO is not enabled), then tries
> savevm.c:qemu_savevm_state_complete_precopy() with a request to
> inactivate block devices.  In turn, this calls
> block.c:bdrv_inactivate_all(), which fails when flushing runs up
> against the non-responsive NFS server.  With savevm failing, we are
> now left in a state where some, but not all, of the block devices have
> been inactivated; the 'fail_invalidate' label of
> migration_completion() then wants to reclaim those disks by calling
> bdrv_activate_all() - but this too can fail, yet nothing takes note of
> that failure.
>
> Thus, we have reached a state where the migration engine has forgotten
> all state about whether a block device is inactive, because we did not
> set s->block_inactive; so migration allows the source to reach
> vm_start() and resume execution, violating the block layer invariant
> that the guest CPUs should not be restarted while a device is
> inactive.  Note that the code in migration.c:migrate_fd_cancel() will
> also try to reactivate all block devices if s->block_inactive was set,
> but because we failed to set that flag after the first failure, the
> source assumes it has reclaimed all devices, even though it still has
> remaining inactivated devices and does not try again.  Normally,
> qmp_cont() will also try to reactivate all disks (or correctly fail if
> the disks are not reclaimable because NFS is not yet back up), but the
> auto-resumption of the source after a migration failure does not go
> through qmp_cont().  And because we have left the block layer in an
> inconsistent state with devices still inactivated, the later migration
> attempt is hitting the assertion failure.
>
> Since it is important to not resume the source with inactive disks,
> this patch tries harder at tracking whether migration attempted to
> inactivate any devices, in order to prevent any vm_start() until it
> has successfully reactivated all devices.
>
> See also https://bugzilla.redhat.com/show_bug.cgi?id=2058982
>
> Signed-off-by: Eric Blake 

Wow

Such a big comment for such a small patch.
And then people think that there is not "nuance" anymore :-)
> @@ -3518,6 +3519,7 @@ fail_invalidate:
>  bdrv_activate_all(_err);
>  if (local_err) {
>  error_report_err(local_err);
> +s->block_inactive = inactivate;

Why not just put here:

s->block_inactive = true;

And call it a day?

if bdrv_activate_all() fails, we can't continue anyways.

Or I am missing something?

Later, Juan.

>  } else {
>  s->block_inactive = false;
>  }
>
> base-commit: f1426881a827a6d3f31b65616c4a8db1e9e7c45e

Re: [PATCH 0/5] Cleanup [h_enter|spapr_exit]_nested routines

Harsh Prateek Bora  writes:

> This patchset introduces helper routines to enable (and does) cleaning
> up of h_enter_nested() and spapr_exit_nested() routines in existing api
> for nested virtualization on Power/SPAPR for better code readability / 
> maintenance. No functional changes intended with this patchset.

Do we have tests for this nested virtual hypervisor code? It would be
good to have at least a smoke test. I remember someone started adding
support to qemu-ppc-boot, probably Murilo. I suggest you take a look
around and maybe see what it takes to add it.

Just a L0 TCG with a small-ish qemu inside of it and a script to start
an L2.

>
> Harsh Prateek Bora (5):
>   ppc: spapr: cleanup cr get/store with helper routines.
>   ppc: spapr: cleanup h_enter_nested() with helper routines.
>   ppc: spapr: assert early rather late in h_enter_nested()
>   ppc: spapr: cleanup spapr_exit_nested() with helper routines.
>   MAINTAINERS: Adding myself in the list for ppc/spapr
>
>  MAINTAINERS  |   1 +
>  hw/ppc/spapr_hcall.c | 251 ---
>  target/ppc/cpu.c |  17 +++
>  target/ppc/cpu.h |   2 +
>  4 files changed, 161 insertions(+), 110 deletions(-)

Re: [RFC PATCH 0/1] Implement entropy leak reporting for virtio-rng

2023-04-14 Thread Amit Shah

On Thu, 2023-04-13 at 15:36 +0200, Babis Chalios wrote:
> 
> On 11/4/23 18:20, Jason A. Donenfeld wrote:
> > CAUTION: This email originated from outside of the organization. Do not 
> > click links or open attachments unless you can confirm the sender and know 
> > the content is safe.
> > 
> > 
> > 
> > On Tue, Apr 11, 2023 at 6:19 PM Amit Shah  wrote:
> > > Hey Babis,
> > > 
> > > On Mon, 2023-04-03 at 12:52 +0200, Babis Chalios wrote:
> > > > This patchset implements the entropy leak reporting feature proposal [1]
> > > > for virtio-rng devices.
> > > > 
> > > > Entropy leaking (as defined in the specification proposal) typically
> > > > happens when we take a snapshot of a VM or while we resume a VM from a
> > > > snapshot. In these cases, we want to let the guest know so that it can
> > > > reset state that needs to be uniqueue, for example.
> > > > 
> > > > This feature is offering functionality similar to what VMGENID does.
> > > > However, it allows to build mechanisms on the guest side to notify
> > > > user-space applications, like VMGENID for userspace and additionally for
> > > > kernel.
> > > > 
> > > > The new specification describes two request types that the guest might
> > > > place in the queues for the device to perform, a fill-on-leak request
> > > > where the device needs to fill with random bytes a buffer and a
> > > > copy-on-leak request where the device needs to perform a copy between
> > > > two guest-provided buffers. We currently trigger the handling of guest
> > > > requests when saving the VM state and when loading a VM from a snapshot
> > > > file.
> > > > 
> > > > This is an RFC, since the corresponding specification changes have not
> > > > yet been merged. It also aims to allow testing a respective patch-set
> > > > implementing the feature in the Linux front-end driver[2].
> > > > 
> > > > However, I would like to ask the community's opinion regarding the
> > > > handling of the fill-on-leak requests. Essentially, these requests are
> > > > very similar to the normal virtio-rng entropy requests, with the catch
> > > > that we should complete these requests before resuming the VM, so that
> > > > we avoid race-conditions in notifying the guest about entropy leak
> > > > events. This means that we cannot rely on the RngBackend's API, which is
> > > > asynchronous. At the moment, I have handled that using getrandom(), but
> > > > I would like a solution which doesn't work only with (relatively new)
> > > > Linux hosts. I am inclined to solve that by extending the RngBackend API
> > > > with a synchronous call to request for random bytes and I'd like to hear
> > > > opinion's on this approach.
> > > The patch looks OK - I suggest you add a new sync call that also probes
> > > for the availability of getrandom().
> > qemu_guest_getrandom_nofail?
> 
> That should work, I think. Any objections to this Amit?

That's one way to do it - and is fine too - but still needs probing
before calling in that function to ensure it's not going to fail.

Amit

Re: [PATCH 4/5] ppc: spapr: cleanup spapr_exit_nested() with helper routines.

Harsh Prateek Bora  writes:

> Currently, in spapr_exit_nested(), it does a lot of register state
> restoring from ptregs/hvstate after mapping each of those before
> restoring the L1 host state. This patch breaks down those set of ops
> to respective helper routines for better code readability/maintenance.
>
> Signed-off-by: Harsh Prateek Bora 
> ---
>  hw/ppc/spapr_hcall.c | 132 +--
>  1 file changed, 78 insertions(+), 54 deletions(-)
>
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index a77b4c9076..ed3a21471d 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -1701,36 +1701,23 @@ static target_ulong h_enter_nested(PowerPCCPU *cpu,
>  return env->gpr[3];
>  }
>  
> -void spapr_exit_nested(PowerPCCPU *cpu, int excp)
> +static void restore_hvstate_from_env(struct kvmppc_hv_guest_state *hvstate,
> + CPUPPCState *env, int excp)
>  {
> -CPUState *cs = CPU(cpu);
> -CPUPPCState *env = >env;
> -SpaprCpuState *spapr_cpu = spapr_cpu_state(cpu);
> -target_ulong r3_return = env->excp_vectors[excp]; /* hcall return value 
> */
> -target_ulong hv_ptr = spapr_cpu->nested_host_state->gpr[4];
> -target_ulong regs_ptr = spapr_cpu->nested_host_state->gpr[5];
> -struct kvmppc_hv_guest_state *hvstate;
> -struct kvmppc_pt_regs *regs;
> -hwaddr len;
> -
> -assert(spapr_cpu->in_nested);
> -
> -cpu_ppc_hdecr_exit(env);
> -
> -len = sizeof(*hvstate);
> -hvstate = address_space_map(CPU(cpu)->as, hv_ptr, , true,
> -MEMTXATTRS_UNSPECIFIED);
> -if (len != sizeof(*hvstate)) {
> -address_space_unmap(CPU(cpu)->as, hvstate, len, 0, true);
> -r3_return = H_PARAMETER;
> -goto out_restore_l1;
> -}
> -
> -hvstate->cfar = env->cfar;
> -hvstate->lpcr = env->spr[SPR_LPCR];
> -hvstate->pcr = env->spr[SPR_PCR];
> -hvstate->dpdes = env->spr[SPR_DPDES];
> -hvstate->hfscr = env->spr[SPR_HFSCR];
> +hvstate->cfar= env->cfar;
> +hvstate->lpcr= env->spr[SPR_LPCR];
> +hvstate->pcr = env->spr[SPR_PCR];
> +hvstate->dpdes   = env->spr[SPR_DPDES];
> +hvstate->hfscr   = env->spr[SPR_HFSCR];
> +/* HEIR should be implemented for HV mode and saved here. */
> +hvstate->srr0= env->spr[SPR_SRR0];
> +hvstate->srr1= env->spr[SPR_SRR1];
> +hvstate->sprg[0] = env->spr[SPR_SPRG0];
> +hvstate->sprg[1] = env->spr[SPR_SPRG1];
> +hvstate->sprg[2] = env->spr[SPR_SPRG2];
> +hvstate->sprg[3] = env->spr[SPR_SPRG3];
> +hvstate->pidr= env->spr[SPR_BOOKS_PID];
> +hvstate->ppr = env->spr[SPR_PPR];

Just leave these lines as they were. Let's avoid spending time
discussing code style. Also, the patch became harder to review by having
these unrelated changes interspersed.

Re: [PATCH 5/5] MAINTAINERS: Adding myself in the list for ppc/spapr

2023-04-14 Thread Daniel Henrique Barboza





On 3/31/23 03:53, Harsh Prateek Bora wrote:

Would like to get notified of changes in this area and review them.

Signed-off-by: Harsh Prateek Bora 
---


All reviewers are welcome.

Reviewed-by: Daniel Henrique Barboza 




  MAINTAINERS | 1 +
  1 file changed, 1 insertion(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 9b56ccdd92..be99e5c4e9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1406,6 +1406,7 @@ M: Daniel Henrique Barboza 
  R: Cédric Le Goater 
  R: David Gibson 
  R: Greg Kurz 
+R: Harsh Prateek Bora 
  L: qemu-...@nongnu.org
  S: Odd Fixes
  F: hw/*/spapr*

Re: [PATCH 3/5] ppc: spapr: assert early rather late in h_enter_nested()

Harsh Prateek Bora  writes:

> Currently, it asserts very late in the code flow if lpid is already 
> initialized.

That's not about initializing. It is about making sure the LPIDR is
0. Which has a specific meaning according to the ISA.

> Ideally, it should assert in the beginning if that is the case. This patch
> brings assert check in the beginning alongwith the related initialization.
>

Maybe just leave it where it is. There's not much to gain from moving this.

Re: [PATCH] migration: mark mixed functions that can suspend

2023-04-14 Thread Juan Quintela

Paolo Bonzini  wrote:
> There should be no paths from a coroutine_fn to aio_poll, however in
> practice coroutine_mixed_fn will call aio_poll in the !qemu_in_coroutine()
> path.  By marking mixed functions, we can track accurately the call paths
> that execute entirely in coroutine context, and find more missing
> coroutine_fn markers.  This results in more accurate checks that
> coroutine code does not end up blocking.
>
> If the marking were extended transitively to all functions that call
> these ones, static analysis could be done much more efficiently.
> However, this is a start and makes it possible to use vrc's path-based
> searches to find potential bugs where coroutine_fns call blocking functions.
>
> Signed-off-by: Paolo Bonzini 
> ---
>  include/migration/qemu-file-types.h |  4 ++--
>  migration/qemu-file.c   | 14 +++---
>  migration/qemu-file.h   |  6 +++---
>  3 files changed, 12 insertions(+), 12 deletions(-)

Hi Paolo

I think you are still missing some qemu_get_* functions.

Or puting as a question, why this functions don't need the mark?


qemu_get_ubyte()
qemu_get_be16()
qemu_get_be32()
qemu_get_be64()

And the same for the functions that end with an 's.

And to add insult to injury (I know, I know), some functions are marked
in .c but not in the .h

qemu_get_byte() cames to mind.

>From my point of view, every function that is qemu_get_* or qemu_peek_*
in either of the three files should get the annotation.

Or what I am missing here?

Later, Juan.

Re: [PATCH 2/5] ppc: spapr: cleanup h_enter_nested() with helper routines.

Harsh Prateek Bora  writes:

> h_enter_nested() currently does a lot of register specific operations
> which should be abstracted logically to simplify the code for better
> readability. This patch breaks down relevant blocks into respective
> helper routines to make use of them for better readability/maintenance.
>
> Signed-off-by: Harsh Prateek Bora 
> ---
>  hw/ppc/spapr_hcall.c | 99 +++-
>  1 file changed, 60 insertions(+), 39 deletions(-)
>
> diff --git a/hw/ppc/spapr_hcall.c b/hw/ppc/spapr_hcall.c
> index 124cee5e53..a13e5256ab 100644
> --- a/hw/ppc/spapr_hcall.c
> +++ b/hw/ppc/spapr_hcall.c
> @@ -1544,6 +1544,62 @@ static target_ulong h_copy_tofrom_guest(PowerPCCPU 
> *cpu,
>  return H_FUNCTION;
>  }
>  
> +static void restore_hdec_from_hvstate(CPUPPCState *dst,
> +  struct kvmppc_hv_guest_state *hv_state,
> +  target_ulong now)
> +{
> +target_ulong hdec;

add a blank line here

> +assert(hv_state);
> +hdec = hv_state->hdec_expiry - now;
> +cpu_ppc_hdecr_init(dst);
> +cpu_ppc_store_hdecr(dst, hdec);
> +}
> +
> +static void restore_lpcr_from_hvstate(PowerPCCPU *cpu,
> +  struct kvmppc_hv_guest_state *hv_state)
> +{
> +PowerPCCPUClass *pcc = POWERPC_CPU_GET_CLASS(cpu);
> +CPUPPCState *dst = >env;
> +target_ulong lpcr, lpcr_mask;

here as well

> +assert(hv_state);
> +lpcr_mask = LPCR_DPFD | LPCR_ILE | LPCR_AIL | LPCR_LD | LPCR_MER;
> +lpcr = (dst->spr[SPR_LPCR] & ~lpcr_mask) | (hv_state->lpcr & lpcr_mask);
> +lpcr |= LPCR_HR | LPCR_UPRT | LPCR_GTSE | LPCR_HVICE | LPCR_HDICE;
> +lpcr &= ~LPCR_LPES0;
> +dst->spr[SPR_LPCR] = lpcr & pcc->lpcr_mask;
> +}
> +
> +static void restore_env_from_ptregs_hvstate(CPUPPCState *env,

Take a look at how the kernel does it. It might be better to have ptregs
and hv regs separate. Also probably better to have some terms specific
to the domain (l2 state, l1 state, etc).

> +struct kvmppc_pt_regs *regs,
> +struct kvmppc_hv_guest_state 
> *hv_state)
> +{
> +assert(env);
> +assert(regs);
> +assert(hv_state);
> +assert(sizeof(env->gpr) == sizeof(regs->gpr));
> +memcpy(env->gpr, regs->gpr, sizeof(env->gpr));
> +env->nip = regs->nip;
> +env->msr = regs->msr;
> +env->lr = regs->link;
> +env->ctr = regs->ctr;
> +cpu_write_xer(env, regs->xer);
> +ppc_store_cr(env, regs->ccr);
> +/* hv_state->amor is not used in api v1 */

That's not really an API thing. More of an oversight.

> +env->spr[SPR_HFSCR] = hv_state->hfscr;
> +/* TCG does not implement DAWR*, CIABR, PURR, SPURR, IC, VTB, HEIR SPRs*/
> +env->cfar = hv_state->cfar;
> +env->spr[SPR_PCR]  = hv_state->pcr;
> +env->spr[SPR_DPDES] = hv_state->dpdes;
> +env->spr[SPR_SRR0]  = hv_state->srr0;
> +env->spr[SPR_SRR1]  = hv_state->srr1;
> +env->spr[SPR_SPRG0] = hv_state->sprg[0];
> +env->spr[SPR_SPRG1] = hv_state->sprg[1];
> +env->spr[SPR_SPRG2] = hv_state->sprg[2];
> +env->spr[SPR_SPRG3] = hv_state->sprg[3];
> +env->spr[SPR_BOOKS_PID] = hv_state->pidr;
> +env->spr[SPR_PPR]   = hv_state->ppr;

I would advise against the extra spacing inside functions.

[PATCH 08/40] igb: Always copy ethernet header

igb_receive_internal() used to check the iov length to determine
copy the iovs to a contiguous buffer, but the check is flawed in two
ways:
- It does not ensure that iovcnt > 0.
- It does not take virtio-net header into consideration.

The size of this copy is just 22 octets, which can be even less than
the code size required for checks. This (wrong) optimization is probably
not worth so just remove it. Removing this also allows igb to assume
aligned accesses for the ethernet header.

Fixes: 3a977deebe ("Intrdocue igb device emulation")
Signed-off-by: Akihiko Odaki 
---
 hw/net/igb_core.c | 39 +--
 1 file changed, 21 insertions(+), 18 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 53f60fc3d3..1d188b526c 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -67,6 +67,11 @@ typedef struct IGBTxPktVmdqCallbackContext {
 NetClientState *nc;
 } IGBTxPktVmdqCallbackContext;
 
+typedef struct L2Header {
+struct eth_header eth;
+struct vlan_header vlan[2];
+} L2Header;
+
 static ssize_t
 igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt,
  bool has_vnet, bool *external_tx);
@@ -961,15 +966,16 @@ igb_rx_is_oversized(IGBCore *core, uint16_t qn, size_t 
size)
 return size > (lpe ? max_ethernet_lpe_size : max_ethernet_vlan_size);
 }
 
-static uint16_t igb_receive_assign(IGBCore *core, const struct eth_header 
*ehdr,
+static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header,
size_t size, E1000E_RSSInfo *rss_info,
bool *external_tx)
 {
 static const int ta_shift[] = { 4, 3, 2, 0 };
+const struct eth_header *ehdr = _header->eth;
 uint32_t f, ra[2], *macp, rctl = core->mac[RCTL];
 uint16_t queues = 0;
 uint16_t oversized = 0;
-uint16_t vid = lduw_be_p(_GET_VLAN_HDR(ehdr)->h_tci) & VLAN_VID_MASK;
+uint16_t vid = be16_to_cpu(l2_header->vlan[0].h_tci) & VLAN_VID_MASK;
 bool accepted = false;
 int i;
 
@@ -1590,14 +1596,13 @@ static ssize_t
 igb_receive_internal(IGBCore *core, const struct iovec *iov, int iovcnt,
  bool has_vnet, bool *external_tx)
 {
-static const int maximum_ethernet_hdr_len = (ETH_HLEN + 8);
-
 uint16_t queues = 0;
 uint32_t n = 0;
-uint8_t min_buf[ETH_ZLEN];
+union {
+L2Header l2_header;
+uint8_t octets[ETH_ZLEN];
+} min_buf;
 struct iovec min_iov;
-struct eth_header *ehdr;
-uint8_t *filter_buf;
 size_t size, orig_size;
 size_t iov_ofs = 0;
 E1000E_RxRing rxr;
@@ -1623,24 +1628,21 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 net_rx_pkt_unset_vhdr(core->rx_pkt);
 }
 
-filter_buf = iov->iov_base + iov_ofs;
 orig_size = iov_size(iov, iovcnt);
 size = orig_size - iov_ofs;
 
 /* Pad to minimum Ethernet frame length */
 if (size < sizeof(min_buf)) {
-iov_to_buf(iov, iovcnt, iov_ofs, min_buf, size);
-memset(_buf[size], 0, sizeof(min_buf) - size);
+iov_to_buf(iov, iovcnt, iov_ofs, _buf, size);
+memset(_buf.octets[size], 0, sizeof(min_buf) - size);
 e1000x_inc_reg_if_not_full(core->mac, RUC);
-min_iov.iov_base = filter_buf = min_buf;
+min_iov.iov_base = _buf;
 min_iov.iov_len = size = sizeof(min_buf);
 iovcnt = 1;
 iov = _iov;
 iov_ofs = 0;
-} else if (iov->iov_len < maximum_ethernet_hdr_len) {
-/* This is very unlikely, but may happen. */
-iov_to_buf(iov, iovcnt, iov_ofs, min_buf, maximum_ethernet_hdr_len);
-filter_buf = min_buf;
+} else {
+iov_to_buf(iov, iovcnt, iov_ofs, _buf, sizeof(min_buf.l2_header));
 }
 
 /* Discard oversized packets if !LPE and !SBP. */
@@ -1648,11 +1650,12 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 return orig_size;
 }
 
-ehdr = PKT_GET_ETH_HDR(filter_buf);
-net_rx_pkt_set_packet_type(core->rx_pkt, get_eth_packet_type(ehdr));
+net_rx_pkt_set_packet_type(core->rx_pkt,
+   get_eth_packet_type(_buf.l2_header.eth));
 net_rx_pkt_set_protocols(core->rx_pkt, iov, iovcnt, iov_ofs);
 
-queues = igb_receive_assign(core, ehdr, size, _info, external_tx);
+queues = igb_receive_assign(core, _buf.l2_header, size,
+_info, external_tx);
 if (!queues) {
 trace_e1000e_rx_flt_dropped();
 return orig_size;
-- 
2.40.0

[PATCH 26/40] net/eth: Always add VLAN tag

It is possible to have another VLAN tag even if the packet is already
tagged.

Signed-off-by: Akihiko Odaki 
---
 hw/net/net_tx_pkt.c | 16 +++-
 include/net/eth.h   |  4 ++--
 net/eth.c   | 22 ++
 3 files changed, 15 insertions(+), 27 deletions(-)

diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c
index ce6b102391..af8f77a3f0 100644
--- a/hw/net/net_tx_pkt.c
+++ b/hw/net/net_tx_pkt.c
@@ -40,7 +40,10 @@ struct NetTxPkt {
 
 struct iovec *vec;
 
-uint8_t l2_hdr[ETH_MAX_L2_HDR_LEN];
+struct {
+struct eth_header eth;
+struct vlan_header vlan[3];
+} l2_hdr;
 union {
 struct ip_header ip;
 struct ip6_header ip6;
@@ -365,18 +368,13 @@ bool net_tx_pkt_build_vheader(struct NetTxPkt *pkt, bool 
tso_enable,
 void net_tx_pkt_setup_vlan_header_ex(struct NetTxPkt *pkt,
 uint16_t vlan, uint16_t vlan_ethtype)
 {
-bool is_new;
 assert(pkt);
 
 eth_setup_vlan_headers(pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_base,
-vlan, vlan_ethtype, _new);
+   >vec[NET_TX_PKT_L2HDR_FRAG].iov_len,
+   vlan, vlan_ethtype);
 
-/* update l2hdrlen */
-if (is_new) {
-pkt->hdr_len += sizeof(struct vlan_header);
-pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len +=
-sizeof(struct vlan_header);
-}
+pkt->hdr_len += sizeof(struct vlan_header);
 }
 
 bool net_tx_pkt_add_raw_fragment(struct NetTxPkt *pkt, void *base, size_t len)
diff --git a/include/net/eth.h b/include/net/eth.h
index 2f87a72170..2bbd04ec3b 100644
--- a/include/net/eth.h
+++ b/include/net/eth.h
@@ -351,8 +351,8 @@ eth_strip_vlan_ex(const struct iovec *iov, int iovcnt, 
size_t iovoff,
 uint16_t
 eth_get_l3_proto(const struct iovec *l2hdr_iov, int iovcnt, size_t l2hdr_len);
 
-void eth_setup_vlan_headers(struct eth_header *ehdr, uint16_t vlan_tag,
-uint16_t vlan_ethtype, bool *is_new);
+void eth_setup_vlan_headers(struct eth_header *ehdr, size_t *ehdr_size,
+uint16_t vlan_tag, uint16_t vlan_ethtype);
 
 
 uint8_t eth_get_gso_type(uint16_t l3_proto, uint8_t *l3_hdr, uint8_t l4proto);
diff --git a/net/eth.c b/net/eth.c
index f7ffbda600..5307978486 100644
--- a/net/eth.c
+++ b/net/eth.c
@@ -21,26 +21,16 @@
 #include "net/checksum.h"
 #include "net/tap.h"
 
-void eth_setup_vlan_headers(struct eth_header *ehdr, uint16_t vlan_tag,
-uint16_t vlan_ethtype, bool *is_new)
+void eth_setup_vlan_headers(struct eth_header *ehdr, size_t *ehdr_size,
+uint16_t vlan_tag, uint16_t vlan_ethtype)
 {
 struct vlan_header *vhdr = PKT_GET_VLAN_HDR(ehdr);
 
-switch (be16_to_cpu(ehdr->h_proto)) {
-case ETH_P_VLAN:
-case ETH_P_DVLAN:
-/* vlan hdr exists */
-*is_new = false;
-break;
-
-default:
-/* No VLAN header, put a new one */
-vhdr->h_proto = ehdr->h_proto;
-ehdr->h_proto = cpu_to_be16(vlan_ethtype);
-*is_new = true;
-break;
-}
+memmove(vhdr + 1, vhdr, *ehdr_size - ETH_HLEN);
 vhdr->h_tci = cpu_to_be16(vlan_tag);
+vhdr->h_proto = ehdr->h_proto;
+ehdr->h_proto = cpu_to_be16(vlan_ethtype);
+*ehdr_size += sizeof(*vhdr);
 }
 
 uint8_t
-- 
2.40.0

[PATCH 40/40] docs/system/devices/igb: Note igb is tested for DPDK

Signed-off-by: Akihiko Odaki 
---
 docs/system/devices/igb.rst | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/docs/system/devices/igb.rst b/docs/system/devices/igb.rst
index afe036dad2..60c10bf7c7 100644
--- a/docs/system/devices/igb.rst
+++ b/docs/system/devices/igb.rst
@@ -14,7 +14,8 @@ Limitations
 ===
 
 This igb implementation was tested with Linux Test Project [2]_ and Windows HLK
-[3]_ during the initial development. The command used when testing with LTP is:
+[3]_ during the initial development. Later it was also tested with DPDK Test
+Suite [4]_. The command used when testing with LTP is:
 
 .. code-block:: shell
 
@@ -22,8 +23,8 @@ This igb implementation was tested with Linux Test Project 
[2]_ and Windows HLK
 
 Be aware that this implementation lacks many functionalities available with the
 actual hardware, and you may experience various failures if you try to use it
-with a different operating system other than Linux and Windows or if you try
-functionalities not covered by the tests.
+with a different operating system other than DPDK, Linux, and Windows or if you
+try functionalities not covered by the tests.
 
 Using igb
 =
@@ -32,7 +33,7 @@ Using igb should be nothing different from using another 
network device. See
 :ref:`pcsys_005fnetwork` in general.
 
 However, you may also need to perform additional steps to activate SR-IOV
-feature on your guest. For Linux, refer to [4]_.
+feature on your guest. For Linux, refer to [5]_.
 
 Developing igb
 ==
@@ -68,4 +69,5 @@ References
 .. [1] 
https://www.intel.com/content/dam/www/public/us/en/documents/datasheets/82576eb-gigabit-ethernet-controller-datasheet.pdf
 .. [2] https://github.com/linux-test-project/ltp
 .. [3] https://learn.microsoft.com/en-us/windows-hardware/test/hlk/
-.. [4] https://docs.kernel.org/PCI/pci-iov-howto.html
+.. [4] https://doc.dpdk.org/dts/gsg/
+.. [5] https://docs.kernel.org/PCI/pci-iov-howto.html
-- 
2.40.0

[PATCH 17/40] igb: Always log status after building rx metadata

Without this change, the status flags may not be traced e.g. if checksum
offloading is disabled.

Signed-off-by: Akihiko Odaki 
---
 hw/net/igb_core.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 5fdc8bc42d..ccc5a626b4 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -1303,9 +1303,8 @@ igb_build_rx_metadata(IGBCore *core,
 trace_e1000e_rx_metadata_l4_cso_disabled();
 }
 
-trace_e1000e_rx_metadata_status_flags(*status_flags);
-
 func_exit:
+trace_e1000e_rx_metadata_status_flags(*status_flags);
 *status_flags = cpu_to_le32(*status_flags);
 }
 
-- 
2.40.0

[PATCH 27/40] hw/net/net_rx_pkt: Enforce alignment for eth_header

eth_strip_vlan and eth_strip_vlan_ex refers to ehdr_buf as struct
eth_header. Enforce alignment for the structure.

Signed-off-by: Akihiko Odaki 
---
 hw/net/net_rx_pkt.c | 11 +++
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/hw/net/net_rx_pkt.c b/hw/net/net_rx_pkt.c
index 6125a063d7..1de42b4f51 100644
--- a/hw/net/net_rx_pkt.c
+++ b/hw/net/net_rx_pkt.c
@@ -23,7 +23,10 @@
 
 struct NetRxPkt {
 struct virtio_net_hdr virt_hdr;
-uint8_t ehdr_buf[sizeof(struct eth_header) + sizeof(struct vlan_header)];
+struct {
+struct eth_header eth;
+struct vlan_header vlan;
+} ehdr_buf;
 struct iovec *vec;
 uint16_t vec_len_total;
 uint16_t vec_len;
@@ -89,7 +92,7 @@ net_rx_pkt_pull_data(struct NetRxPkt *pkt,
 if (pkt->ehdr_buf_len) {
 net_rx_pkt_iovec_realloc(pkt, iovcnt + 1);
 
-pkt->vec[0].iov_base = pkt->ehdr_buf;
+pkt->vec[0].iov_base = >ehdr_buf;
 pkt->vec[0].iov_len = pkt->ehdr_buf_len;
 
 pkt->tot_len = pllen + pkt->ehdr_buf_len;
@@ -120,7 +123,7 @@ void net_rx_pkt_attach_iovec(struct NetRxPkt *pkt,
 assert(pkt);
 
 if (strip_vlan) {
-pkt->ehdr_buf_len = eth_strip_vlan(iov, iovcnt, iovoff, pkt->ehdr_buf,
+pkt->ehdr_buf_len = eth_strip_vlan(iov, iovcnt, iovoff, >ehdr_buf,
, );
 } else {
 pkt->ehdr_buf_len = 0;
@@ -142,7 +145,7 @@ void net_rx_pkt_attach_iovec_ex(struct NetRxPkt *pkt,
 
 if (strip_vlan) {
 pkt->ehdr_buf_len = eth_strip_vlan_ex(iov, iovcnt, iovoff, vet,
-  pkt->ehdr_buf,
+  >ehdr_buf,
   , );
 } else {
 pkt->ehdr_buf_len = 0;
-- 
2.40.0

[PATCH 10/40] tests/avocado: Remove unused imports

Signed-off-by: Akihiko Odaki 
---
 tests/avocado/netdev-ethtool.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/avocado/netdev-ethtool.py b/tests/avocado/netdev-ethtool.py
index f7e9464184..8de118e313 100644
--- a/tests/avocado/netdev-ethtool.py
+++ b/tests/avocado/netdev-ethtool.py
@@ -7,7 +7,6 @@
 
 from avocado import skip
 from avocado_qemu import QemuSystemTest
-from avocado_qemu import exec_command, exec_command_and_wait_for_pattern
 from avocado_qemu import wait_for_console_pattern
 
 class NetDevEthtool(QemuSystemTest):
-- 
2.40.0

[PATCH 30/40] igb: Implement igb-specific oversize check

igb has a configurable size limit for LPE, and uses different limits
depending on whether the packet is treated as a VLAN packet.

Signed-off-by: Akihiko Odaki 
---
 hw/net/igb_core.c | 41 +++--
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 2013a9a53d..569897fb99 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -954,16 +954,21 @@ igb_rx_l4_cso_enabled(IGBCore *core)
 return !!(core->mac[RXCSUM] & E1000_RXCSUM_TUOFLD);
 }
 
-static bool
-igb_rx_is_oversized(IGBCore *core, uint16_t qn, size_t size)
+static bool igb_rx_is_oversized(IGBCore *core, const struct eth_header *ehdr,
+size_t size, bool lpe, uint16_t rlpml)
 {
-uint16_t pool = qn % IGB_NUM_VM_POOLS;
-bool lpe = !!(core->mac[VMOLR0 + pool] & E1000_VMOLR_LPE);
-int max_ethernet_lpe_size =
-core->mac[VMOLR0 + pool] & E1000_VMOLR_RLPML_MASK;
-int max_ethernet_vlan_size = 1522;
+size += 4;
+
+if (lpe) {
+return size > rlpml;
+}
+
+if (e1000x_is_vlan_packet(ehdr, core->mac[VET] & 0x) &&
+e1000x_vlan_rx_filter_enabled(core->mac)) {
+return size > 1522;
+}
 
-return size > (lpe ? max_ethernet_lpe_size : max_ethernet_vlan_size);
+return size > 1518;
 }
 
 static uint16_t igb_receive_assign(IGBCore *core, const L2Header *l2_header,
@@ -976,6 +981,8 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
L2Header *l2_header,
 uint16_t queues = 0;
 uint16_t oversized = 0;
 uint16_t vid = be16_to_cpu(l2_header->vlan[0].h_tci) & VLAN_VID_MASK;
+bool lpe;
+uint16_t rlpml;
 int i;
 
 memset(rss_info, 0, sizeof(E1000E_RSSInfo));
@@ -984,6 +991,14 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
L2Header *l2_header,
 *external_tx = true;
 }
 
+lpe = !!(core->mac[RCTL] & E1000_RCTL_LPE);
+rlpml = core->mac[RLPML];
+if (!(core->mac[RCTL] & E1000_RCTL_SBP) &&
+igb_rx_is_oversized(core, ehdr, size, lpe, rlpml)) {
+trace_e1000x_rx_oversized(size);
+return queues;
+}
+
 if (e1000x_is_vlan_packet(ehdr, core->mac[VET] & 0x) &&
 !e1000x_rx_vlan_filter(core->mac, PKT_GET_VLAN_HDR(ehdr))) {
 return queues;
@@ -1067,7 +1082,10 @@ static uint16_t igb_receive_assign(IGBCore *core, const 
L2Header *l2_header,
 queues &= core->mac[VFRE];
 if (queues) {
 for (i = 0; i < IGB_NUM_VM_POOLS; i++) {
-if ((queues & BIT(i)) && igb_rx_is_oversized(core, i, size)) {
+lpe = !!(core->mac[VMOLR0 + i] & E1000_VMOLR_LPE);
+rlpml = core->mac[VMOLR0 + i] & E1000_VMOLR_RLPML_MASK;
+if ((queues & BIT(i)) &&
+igb_rx_is_oversized(core, ehdr, size, lpe, rlpml)) {
 oversized |= BIT(i);
 }
 }
@@ -1609,11 +1627,6 @@ igb_receive_internal(IGBCore *core, const struct iovec 
*iov, int iovcnt,
 iov_to_buf(iov, iovcnt, iov_ofs, _buf, sizeof(min_buf.l2_header));
 }
 
-/* Discard oversized packets if !LPE and !SBP. */
-if (e1000x_is_oversized(core->mac, size)) {
-return orig_size;
-}
-
 net_rx_pkt_set_packet_type(core->rx_pkt,
get_eth_packet_type(_buf.l2_header.eth));
 net_rx_pkt_set_protocols(core->rx_pkt, iov, iovcnt, iov_ofs);
-- 
2.40.0

[PATCH 28/40] tests/qtest/libqos/igb: Set GPIE.Multiple_MSIX

GPIE.Multiple_MSIX is not set by default, and needs to be set to get
interrupts from multiple MSI-X vectors.

Signed-off-by: Akihiko Odaki 
---
 tests/qtest/libqos/igb.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/qtest/libqos/igb.c b/tests/qtest/libqos/igb.c
index 12fb531bf0..a603468beb 100644
--- a/tests/qtest/libqos/igb.c
+++ b/tests/qtest/libqos/igb.c
@@ -114,6 +114,7 @@ static void igb_pci_start_hw(QOSGraphObject *obj)
 e1000e_macreg_write(>e1000e, E1000_RCTL, E1000_RCTL_EN);
 
 /* Enable all interrupts */
+e1000e_macreg_write(>e1000e, E1000_GPIE,  E1000_GPIE_MSIX_MODE);
 e1000e_macreg_write(>e1000e, E1000_IMS,  0x);
 e1000e_macreg_write(>e1000e, E1000_EIMS, 0x);
 
-- 
2.40.0

[PATCH 22/40] igb: Add more definitions for Tx descriptor

Signed-off-by: Akihiko Odaki 
---
 hw/net/igb_core.c |  2 +-
 hw/net/igb_regs.h | 32 +++-
 2 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index e5a7021c0e..350462c40c 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -418,7 +418,7 @@ igb_setup_tx_offloads(IGBCore *core, struct igb_tx *tx)
 {
 if (tx->first_cmd_type_len & E1000_ADVTXD_DCMD_TSE) {
 uint32_t idx = (tx->first_olinfo_status >> 4) & 1;
-uint32_t mss = tx->ctx[idx].mss_l4len_idx >> 16;
+uint32_t mss = tx->ctx[idx].mss_l4len_idx >> E1000_ADVTXD_MSS_SHIFT;
 if (!net_tx_pkt_build_vheader(tx->tx_pkt, true, true, mss)) {
 return false;
 }
diff --git a/hw/net/igb_regs.h b/hw/net/igb_regs.h
index c5c5b3c3b8..22ce909173 100644
--- a/hw/net/igb_regs.h
+++ b/hw/net/igb_regs.h
@@ -42,11 +42,6 @@ union e1000_adv_tx_desc {
 } wb;
 };
 
-#define E1000_ADVTXD_DTYP_CTXT  0x0020 /* Advanced Context Descriptor */
-#define E1000_ADVTXD_DTYP_DATA  0x0030 /* Advanced Data Descriptor */
-#define E1000_ADVTXD_DCMD_DEXT  0x2000 /* Descriptor Extension (1=Adv) */
-#define E1000_ADVTXD_DCMD_TSE   0x8000 /* TCP/UDP Segmentation Enable */
-
 #define E1000_ADVTXD_POTS_IXSM  0x0100 /* Insert TCP/UDP Checksum */
 #define E1000_ADVTXD_POTS_TXSM  0x0200 /* Insert TCP/UDP Checksum */
 
@@ -151,6 +146,10 @@ union e1000_adv_rx_desc {
 #define IGB_82576_VF_DEV_ID0x10CA
 #define IGB_I350_VF_DEV_ID 0x1520
 
+/* VLAN info */
+#define IGB_TX_FLAGS_VLAN_MASK 0x
+#define IGB_TX_FLAGS_VLAN_SHIFT16
+
 /* from igb/e1000_82575.h */
 
 #define E1000_MRQC_ENABLE_RSS_MQ0x0002
@@ -160,6 +159,29 @@ union e1000_adv_rx_desc {
 #define E1000_MRQC_RSS_FIELD_IPV6_UDP   0x0080
 #define E1000_MRQC_RSS_FIELD_IPV6_UDP_EX0x0100
 
+/* Adv Transmit Descriptor Config Masks */
+#define E1000_ADVTXD_MAC_TSTAMP   0x0008 /* IEEE1588 Timestamp packet */
+#define E1000_ADVTXD_DTYP_CTXT0x0020 /* Advanced Context Descriptor */
+#define E1000_ADVTXD_DTYP_DATA0x0030 /* Advanced Data Descriptor */
+#define E1000_ADVTXD_DCMD_EOP 0x0100 /* End of Packet */
+#define E1000_ADVTXD_DCMD_IFCS0x0200 /* Insert FCS (Ethernet CRC) */
+#define E1000_ADVTXD_DCMD_RS  0x0800 /* Report Status */
+#define E1000_ADVTXD_DCMD_DEXT0x2000 /* Descriptor extension (1=Adv) */
+#define E1000_ADVTXD_DCMD_VLE 0x4000 /* VLAN pkt enable */
+#define E1000_ADVTXD_DCMD_TSE 0x8000 /* TCP Seg enable */
+#define E1000_ADVTXD_PAYLEN_SHIFT14 /* Adv desc PAYLEN shift */
+
+#define E1000_ADVTXD_MACLEN_SHIFT9  /* Adv ctxt desc mac len shift */
+#define E1000_ADVTXD_TUCMD_L4T_UDP 0x  /* L4 Packet TYPE of UDP */
+#define E1000_ADVTXD_TUCMD_IPV40x0400  /* IP Packet Type: 1=IPv4 */
+#define E1000_ADVTXD_TUCMD_L4T_TCP 0x0800  /* L4 Packet TYPE of TCP */
+#define E1000_ADVTXD_TUCMD_L4T_SCTP 0x1000 /* L4 packet TYPE of SCTP */
+/* IPSec Encrypt Enable for ESP */
+#define E1000_ADVTXD_L4LEN_SHIFT 8  /* Adv ctxt L4LEN shift */
+#define E1000_ADVTXD_MSS_SHIFT  16  /* Adv ctxt MSS shift */
+/* Adv ctxt IPSec SA IDX mask */
+/* Adv ctxt IPSec ESP len mask */
+
 /* Additional Transmit Descriptor Control definitions */
 #define E1000_TXDCTL_QUEUE_ENABLE  0x0200 /* Enable specific Tx Queue */
 
-- 
2.40.0

[PATCH 09/40] Fix references to igb Avocado test

Fixes: 9f95111474 ("tests/avocado: re-factor igb test to avoid timeouts")
Signed-off-by: Akihiko Odaki 
---
 MAINTAINERS| 2 +-
 docs/system/devices/igb.rst| 2 +-
 scripts/ci/org.centos/stream/8/x86_64/test-avocado | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index ef45b5e71e..c31d2279ab 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2256,7 +2256,7 @@ R: Sriram Yagnaraman 
 S: Maintained
 F: docs/system/devices/igb.rst
 F: hw/net/igb*
-F: tests/avocado/igb.py
+F: tests/avocado/netdev-ethtool.py
 F: tests/qtest/igb-test.c
 F: tests/qtest/libqos/igb.c
 
diff --git a/docs/system/devices/igb.rst b/docs/system/devices/igb.rst
index 70edadd574..afe036dad2 100644
--- a/docs/system/devices/igb.rst
+++ b/docs/system/devices/igb.rst
@@ -60,7 +60,7 @@ Avocado test and can be ran with the following command:
 
 .. code:: shell
 
-  make check-avocado AVOCADO_TESTS=tests/avocado/igb.py
+  make check-avocado AVOCADO_TESTS=tests/avocado/netdev-ethtool.py
 
 References
 ==
diff --git a/scripts/ci/org.centos/stream/8/x86_64/test-avocado 
b/scripts/ci/org.centos/stream/8/x86_64/test-avocado
index d2c0e5fb4c..a1aa601ee3 100755
--- a/scripts/ci/org.centos/stream/8/x86_64/test-avocado
+++ b/scripts/ci/org.centos/stream/8/x86_64/test-avocado
@@ -30,7 +30,7 @@ make get-vm-images
 tests/avocado/cpu_queries.py:QueryCPUModelExpansion.test \
 tests/avocado/empty_cpu_model.py:EmptyCPUModel.test \
 tests/avocado/hotplug_cpu.py:HotPlugCPU.test \
-tests/avocado/igb.py:IGB.test \
+tests/avocado/netdev-ethtool.py:NetDevEthtool.test_igb_nomsi \
 tests/avocado/info_usernet.py:InfoUsernet.test_hostfwd \
 tests/avocado/intel_iommu.py:IntelIOMMU.test_intel_iommu \
 tests/avocado/intel_iommu.py:IntelIOMMU.test_intel_iommu_pt \
-- 
2.40.0

[PATCH 33/40] igb: Implement Tx SCTP CSO

Signed-off-by: Akihiko Odaki 
---
 hw/net/igb_core.c   | 12 +++-
 hw/net/net_tx_pkt.c | 18 ++
 hw/net/net_tx_pkt.h |  8 
 3 files changed, 33 insertions(+), 5 deletions(-)

diff --git a/hw/net/igb_core.c b/hw/net/igb_core.c
index 0e1b681613..955db1b1dc 100644
--- a/hw/net/igb_core.c
+++ b/hw/net/igb_core.c
@@ -432,8 +432,9 @@ igb_tx_insert_vlan(IGBCore *core, uint16_t qn, struct 
igb_tx *tx,
 static bool
 igb_setup_tx_offloads(IGBCore *core, struct igb_tx *tx)
 {
+uint32_t idx = (tx->first_olinfo_status >> 4) & 1;
+
 if (tx->first_cmd_type_len & E1000_ADVTXD_DCMD_TSE) {
-uint32_t idx = (tx->first_olinfo_status >> 4) & 1;
 uint32_t mss = tx->ctx[idx].mss_l4len_idx >> E1000_ADVTXD_MSS_SHIFT;
 if (!net_tx_pkt_build_vheader(tx->tx_pkt, true, true, mss)) {
 return false;
@@ -444,10 +445,11 @@ igb_setup_tx_offloads(IGBCore *core, struct igb_tx *tx)
 return true;
 }
 
-if (tx->first_olinfo_status & E1000_ADVTXD_POTS_TXSM) {
-if (!net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0)) {
-return false;
-}
+if ((tx->first_olinfo_status & E1000_ADVTXD_POTS_TXSM) &&
+!((tx->ctx[idx].type_tucmd_mlhl & E1000_ADVTXD_TUCMD_L4T_SCTP) ?
+  net_tx_pkt_update_sctp_checksum(tx->tx_pkt) :
+  net_tx_pkt_build_vheader(tx->tx_pkt, false, true, 0))) {
+return false;
 }
 
 if (tx->first_olinfo_status & E1000_ADVTXD_POTS_IXSM) {
diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c
index af8f77a3f0..2e5f58b3c9 100644
--- a/hw/net/net_tx_pkt.c
+++ b/hw/net/net_tx_pkt.c
@@ -16,6 +16,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/crc32c.h"
 #include "net/eth.h"
 #include "net/checksum.h"
 #include "net/tap.h"
@@ -135,6 +136,23 @@ void net_tx_pkt_update_ip_checksums(struct NetTxPkt *pkt)
  pkt->virt_hdr.csum_offset, , sizeof(csum));
 }
 
+bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt)
+{
+uint32_t csum = 0;
+struct iovec *pl_start_frag = pkt->vec + NET_TX_PKT_PL_START_FRAG;
+
+if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, , 
sizeof(csum)) < sizeof(csum)) {
+return false;
+}
+
+csum = cpu_to_le32(iov_crc32c(0x, pl_start_frag, 
pkt->payload_frags));
+if (iov_from_buf(pl_start_frag, pkt->payload_frags, 8, , 
sizeof(csum)) < sizeof(csum)) {
+return false;
+}
+
+return true;
+}
+
 static void net_tx_pkt_calculate_hdr_len(struct NetTxPkt *pkt)
 {
 pkt->hdr_len = pkt->vec[NET_TX_PKT_L2HDR_FRAG].iov_len +
diff --git a/hw/net/net_tx_pkt.h b/hw/net/net_tx_pkt.h
index f5cd44da6f..fc00d7941d 100644
--- a/hw/net/net_tx_pkt.h
+++ b/hw/net/net_tx_pkt.h
@@ -116,6 +116,14 @@ void net_tx_pkt_update_ip_checksums(struct NetTxPkt *pkt);
  */
 void net_tx_pkt_update_ip_hdr_checksum(struct NetTxPkt *pkt);
 
+/**
+ * Calculate the SCTP checksum.
+ *
+ * @pkt:packet
+ *
+ */
+bool net_tx_pkt_update_sctp_checksum(struct NetTxPkt *pkt);
+
 /**
  * get length of all populated data.
  *
-- 
2.40.0

[PATCH 39/40] MAINTAINERS: Add a reviewer for network packet abstractions