[PATCH v3] nbd/server: do not poll within a coroutine context

2024-04-03 Thread Eric Blake
From: Zhu Yangyang 

Coroutines are not supposed to block. Instead, they should yield.

The client performs TLS upgrade outside of an AIOContext, during
synchronous handshake; this still requires g_main_loop.  But the
server responds to TLS upgrade inside a coroutine, so a nested
g_main_loop is wrong.  Since the two callbacks no longer share more
than the setting of data.complete and data.error, it's just as easy to
use static helpers instead of trying to share a common code path.

Fixes: f95910f ("nbd: implement TLS support in the protocol negotiation")
Signed-off-by: Zhu Yangyang 
[eblake: move callbacks to their use point]
Signed-off-by: Eric Blake 
---

After looking at this more, I'm less convinced that there is enough
common code here to even be worth trying to share in common.c.  This
takes the essence of the v2 patch, but refactors it a bit.

v2: https://lists.gnu.org/archive/html/qemu-devel/2024-04/msg00019.html

 nbd/nbd-internal.h | 20 ++--
 nbd/client.c   | 21 +
 nbd/common.c   | 11 ---
 nbd/server.c   | 21 -
 4 files changed, 43 insertions(+), 30 deletions(-)

diff --git a/nbd/nbd-internal.h b/nbd/nbd-internal.h
index dfa02f77ee4..087c6bfc002 100644
--- a/nbd/nbd-internal.h
+++ b/nbd/nbd-internal.h
@@ -63,6 +63,16 @@
 #define NBD_SET_TIMEOUT _IO(0xab, 9)
 #define NBD_SET_FLAGS   _IO(0xab, 10)

+/* Used in NBD_OPT_STARTTLS handling */
+struct NBDTLSHandshakeData {
+bool complete;
+Error *error;
+union {
+GMainLoop *loop;
+Coroutine *co;
+} u;
+};
+
 /* nbd_write
  * Writes @size bytes to @ioc. Returns 0 on success.
  */
@@ -72,16 +82,6 @@ static inline int nbd_write(QIOChannel *ioc, const void 
*buffer, size_t size,
 return qio_channel_write_all(ioc, buffer, size, errp) < 0 ? -EIO : 0;
 }

-struct NBDTLSHandshakeData {
-GMainLoop *loop;
-bool complete;
-Error *error;
-};
-
-
-void nbd_tls_handshake(QIOTask *task,
-   void *opaque);
-
 int nbd_drop(QIOChannel *ioc, size_t size, Error **errp);

 #endif
diff --git a/nbd/client.c b/nbd/client.c
index 29ffc609a4b..c9dc5265404 100644
--- a/nbd/client.c
+++ b/nbd/client.c
@@ -596,6 +596,18 @@ static int nbd_request_simple_option(QIOChannel *ioc, int 
opt, bool strict,
 return 1;
 }

+/* Callback to learn when QIO TLS upgrade is complete */
+static void nbd_client_tls_handshake(QIOTask *task, void *opaque)
+{
+struct NBDTLSHandshakeData *data = opaque;
+
+qio_task_propagate_error(task, &data->error);
+data->complete = true;
+if (data->u.loop) {
+g_main_loop_quit(data->u.loop);
+}
+}
+
 static QIOChannel *nbd_receive_starttls(QIOChannel *ioc,
 QCryptoTLSCreds *tlscreds,
 const char *hostname, Error **errp)
@@ -619,18 +631,19 @@ static QIOChannel *nbd_receive_starttls(QIOChannel *ioc,
 return NULL;
 }
 qio_channel_set_name(QIO_CHANNEL(tioc), "nbd-client-tls");
-data.loop = g_main_loop_new(g_main_context_default(), FALSE);
 trace_nbd_receive_starttls_tls_handshake();
 qio_channel_tls_handshake(tioc,
-  nbd_tls_handshake,
+  nbd_client_tls_handshake,
                              &data,
   NULL,
   NULL);

 if (!data.complete) {
-g_main_loop_run(data.loop);
+data.u.loop = g_main_loop_new(g_main_context_default(), FALSE);
+g_main_loop_run(data.u.loop);
+g_main_loop_unref(data.u.loop);
 }
-g_main_loop_unref(data.loop);
+
 if (data.error) {
 error_propagate(errp, data.error);
 object_unref(OBJECT(tioc));
diff --git a/nbd/common.c b/nbd/common.c
index 3247c1d618a..589a748cfe6 100644
--- a/nbd/common.c
+++ b/nbd/common.c
@@ -47,17 +47,6 @@ int nbd_drop(QIOChannel *ioc, size_t size, Error **errp)
 }


-void nbd_tls_handshake(QIOTask *task,
-   void *opaque)
-{
-struct NBDTLSHandshakeData *data = opaque;
-
-qio_task_propagate_error(task, &data->error);
-data->complete = true;
-g_main_loop_quit(data->loop);
-}
-
-
 const char *nbd_opt_lookup(uint32_t opt)
 {
 switch (opt) {
diff --git a/nbd/server.c b/nbd/server.c
index c3484cc1ebc..d16726a6326 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -748,6 +748,17 @@ static int nbd_negotiate_handle_info(NBDClient *client, 
Error **errp)
 return rc;
 }

+/* Callback to learn when QIO TLS upgrade is complete */
+static void nbd_server_tls_handshake(QIOTask *task, void *opaque)
+{
+struct NBDTLSHandshakeData *data = opaque;
+
+qio_task_propagate_error(task, &data->error);
+data->complete = true;
+if (!qemu_coroutine_entered(data->u.co)) {
+aio_co_wake(data->u.co);
+}
+}

 /* Handle NBD_OPT_STARTTLS. Return NULL to drop connection, or else the
  * new channel for all further (now-encrypted) 

Re: [PATCH v10 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-04-03 Thread Ho-Ren (Jack) Chuang
Hi Jonathan,

Thank you for your feedback. I will fix them (inlined) in the next V11.

On Wed, Apr 3, 2024 at 10:04 AM Jonathan Cameron
 wrote:
>
> A few minor comments inline.
>
> > diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> > index a44c03c2ba3a..16769552a338 100644
> > --- a/include/linux/memory-tiers.h
> > +++ b/include/linux/memory-tiers.h
> > @@ -140,12 +140,13 @@ static inline int mt_perf_to_adistance(struct 
> > access_coordinate *perf, int *adis
> >   return -EIO;
> >  }
> >
> > -struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct 
> > list_head *memory_types)
> > +static inline struct memory_dev_type *mt_find_alloc_memory_type(int adist,
> > + struct list_head *memory_types)
> >  {
> >   return NULL;
> >  }
> >
> > -void mt_put_memory_types(struct list_head *memory_types)
> > +static inline void mt_put_memory_types(struct list_head *memory_types)
> >  {
> Why in this patch and not previous one?

I've also noticed this issue. I will fix it in the next V11.

> >
> >  }
> > diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> > index 974af10cfdd8..44fa10980d37 100644
> > --- a/mm/memory-tiers.c
> > +++ b/mm/memory-tiers.c
> > @@ -36,6 +36,11 @@ struct node_memory_type_map {
> >
> >  static DEFINE_MUTEX(memory_tier_lock);
> >  static LIST_HEAD(memory_tiers);
> > +/*
> > + * The list is used to store all memory types that are not created
> > + * by a device driver.
> > + */
> > +static LIST_HEAD(default_memory_types);
> >  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
> >  struct memory_dev_type *default_dram_type;
> >
> > @@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion 
> > __read_mostly;
> >
> >  static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
> >
> > +/* The lock is used to protect `default_dram_perf*` info and nid. */
> > +static DEFINE_MUTEX(default_dram_perf_lock);
> >  static bool default_dram_perf_error;
> >  static struct access_coordinate default_dram_perf;
> >  static int default_dram_perf_ref_nid = NUMA_NO_NODE;
> > @@ -505,7 +512,8 @@ static inline void __init_node_memory_type(int node, 
> > struct memory_dev_type *mem
> >  static struct memory_tier *set_node_memory_tier(int node)
> >  {
> >   struct memory_tier *memtier;
> > - struct memory_dev_type *memtype;
> > + struct memory_dev_type *mtype = default_dram_type;
>
> Does the rename add anything major to the patch?
> If not I'd leave it alone to reduce the churn and give
> a more readable patch.  If it is worth doing perhaps
> a precursor patch?
>

Either name works. Keeping it the same name will make the code
easier to follow. I agree! Thanks.

> > + int adist = MEMTIER_ADISTANCE_DRAM;
> >   pg_data_t *pgdat = NODE_DATA(node);
> >
> >
> > @@ -514,11 +522,20 @@ static struct memory_tier *set_node_memory_tier(int 
> > node)
> >   if (!node_state(node, N_MEMORY))
> >   return ERR_PTR(-EINVAL);
> >
> > - __init_node_memory_type(node, default_dram_type);
> > + mt_calc_adistance(node, &adist);
> > + if (node_memory_types[node].memtype == NULL) {
> > + mtype = mt_find_alloc_memory_type(adist, 
> > &default_memory_types);
> > + if (IS_ERR(mtype)) {
> > + mtype = default_dram_type;
> > + pr_info("Failed to allocate a memory type. Fall 
> > back.\n");
> > + }
> > + }
> > +
> > + __init_node_memory_type(node, mtype);
> >
> > - memtype = node_memory_types[node].memtype;
> > - node_set(node, memtype->nodes);
> > - memtier = find_create_memory_tier(memtype);
> > + mtype = node_memory_types[node].memtype;
> > + node_set(node, mtype->nodes);
> > + memtier = find_create_memory_tier(mtype);
> >   if (!IS_ERR(memtier))
> >   rcu_assign_pointer(pgdat->memtier, memtier);
> >   return memtier;
> > @@ -655,6 +672,33 @@ void mt_put_memory_types(struct list_head 
> > *memory_types)
> >  }
> >  EXPORT_SYMBOL_GPL(mt_put_memory_types);
> >
> > +/*
> > + * This is invoked via `late_initcall()` to initialize memory tiers for
> > + * CPU-less memory nodes after driver initialization, which is
> > + * expected to provide `adistance` algorithms.
> > + */
> > +static int __init memory_tier_late_init(void)
> > +{
> > + int nid;
> > +
> > + mutex_lock(&memory_tier_lock);
> > + for_each_node_state(nid, N_MEMORY)
> > + if (node_memory_types[nid].memtype == NULL)
> > + /*
> > +  * Some device drivers may have initialized memory 
> > tiers
> > +  * between `memory_tier_init()` and 
> > `memory_tier_late_init()`,
> > +  * potentially bringing online memory nodes and
> > +  * configuring memory tiers. Exclude them here.
> > +  */
>
> Does the comment refer to this path, or to ones where memtype is set?
>

Yes, the comment is for explaining why the 

[PATCH] hw/usb: Check cycle bit before trace call

2024-04-03 Thread Ian Moffett
Signed-off-by: Ian Moffett 
---
 hw/usb/hcd-xhci.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c
index ad40232eb6..ea5bc64b26 100644
--- a/hw/usb/hcd-xhci.c
+++ b/hw/usb/hcd-xhci.c
@@ -710,13 +710,13 @@ static TRBType xhci_ring_fetch(XHCIState *xhci, XHCIRing 
*ring, XHCITRB *trb,
 le32_to_cpus(&trb->status);
 le32_to_cpus(&trb->control);

-trace_usb_xhci_fetch_trb(ring->dequeue, trb_name(trb),
- trb->parameter, trb->status, trb->control);
-
 if ((trb->control & TRB_C) != ring->ccs) {
 return 0;
 }

+trace_usb_xhci_fetch_trb(ring->dequeue, trb_name(trb),
+ trb->parameter, trb->status, trb->control);
+
 type = TRB_TYPE(*trb);

 if (type != TR_LINK) {
--
2.44.0




Re: [PATCH v10 1/2] memory tier: dax/kmem: introduce an abstract layer for finding, allocating, and putting memory types

2024-04-03 Thread Ho-Ren (Jack) Chuang
Hi Jonathan,

Thanks for your feedback. I will fix them (inlined) in the next V11.
No worries, it's never too late!

On Wed, Apr 3, 2024 at 9:52 AM Jonathan Cameron
 wrote:
>
> On Tue,  2 Apr 2024 00:17:37 +
> "Ho-Ren (Jack) Chuang"  wrote:
>
> > Since different memory devices require finding, allocating, and putting
> > memory types, these common steps are abstracted in this patch,
> > enhancing the scalability and conciseness of the code.
> >
> > Signed-off-by: Ho-Ren (Jack) Chuang 
> > Reviewed-by: "Huang, Ying" 
>
> Hi,
>
> I know this is a late entry to the discussion but a few comments inline.
> (sorry I didn't look earlier!)
>
> All opportunities to improve code complexity and readability as a result
> of your factoring out.
>
> Jonathan
>
>
> > ---
> >  drivers/dax/kmem.c   | 20 ++--
> >  include/linux/memory-tiers.h | 13 +
> >  mm/memory-tiers.c| 32 
> >  3 files changed, 47 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> > index 42ee360cf4e3..01399e5b53b2 100644
> > --- a/drivers/dax/kmem.c
> > +++ b/drivers/dax/kmem.c
> > @@ -55,21 +55,10 @@ static LIST_HEAD(kmem_memory_types);
> >
> >  static struct memory_dev_type *kmem_find_alloc_memory_type(int adist)
> >  {
> > - bool found = false;
> >   struct memory_dev_type *mtype;
> >
> >   mutex_lock(&kmem_memory_type_lock);
> could use
>
> > guard(mutex)(&kmem_memory_type_lock);
> > return mt_find_alloc_memory_type(adist, &kmem_memory_types);
>

I will change it accordingly.

> I'm fine if you ignore this comment though as may be other functions in
> here that could take advantage of the cleanup.h stuff in a future patch.
>
> > - list_for_each_entry(mtype, _memory_types, list) {
> > - if (mtype->adistance == adist) {
> > - found = true;
> > - break;
> > - }
> > - }
> > - if (!found) {
> > - mtype = alloc_memory_type(adist);
> > - if (!IS_ERR(mtype))
> > - list_add(&mtype->list, &kmem_memory_types);
> > - }
> > + mtype = mt_find_alloc_memory_type(adist, &kmem_memory_types);
> >   mutex_unlock(&kmem_memory_type_lock);
> >
> >   return mtype;
>
> > diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> > index 69e781900082..a44c03c2ba3a 100644
> > --- a/include/linux/memory-tiers.h
> > +++ b/include/linux/memory-tiers.h
> > @@ -48,6 +48,9 @@ int mt_calc_adistance(int node, int *adist);
> >  int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
> >const char *source);
> >  int mt_perf_to_adistance(struct access_coordinate *perf, int *adist);
> > +struct memory_dev_type *mt_find_alloc_memory_type(int adist,
> > + struct list_head 
> > *memory_types);
>
> That indent looks unusual.  Align the start of struct with start of int.
>

I can make this aligned but it will show another warning:
"WARNING: line length of 131 exceeds 100 columns"
Is this ok?

> > +void mt_put_memory_types(struct list_head *memory_types);
> >  #ifdef CONFIG_MIGRATION
> >  int next_demotion_node(int node);
> >  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
> > @@ -136,5 +139,15 @@ static inline int mt_perf_to_adistance(struct 
> > access_coordinate *perf, int *adis
> >  {
> >   return -EIO;
> >  }
> > +
> > +struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct 
> > list_head *memory_types)
> > +{
> > + return NULL;
> > +}
> > +
> > +void mt_put_memory_types(struct list_head *memory_types)
> > +{
> > +
> No blank line needed here.

Will fix.

> > +}
> >  #endif   /* CONFIG_NUMA */
> >  #endif  /* _LINUX_MEMORY_TIERS_H */
> > diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> > index 0537664620e5..974af10cfdd8 100644
> > --- a/mm/memory-tiers.c
> > +++ b/mm/memory-tiers.c
> > @@ -623,6 +623,38 @@ void clear_node_memory_type(int node, struct 
> > memory_dev_type *memtype)
> >  }
> >  EXPORT_SYMBOL_GPL(clear_node_memory_type);
> >
> > +struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct 
> > list_head *memory_types)
>
> Breaking this out as a separate function provides opportunity to improve it.
> Maybe a follow up patch makes sense given it would no longer be a straight
> forward code move.  However in my view it would be simple enough to be obvious
> even within this patch.
>

I will just keep this as is for now to minimize the changes aka mistakes.

> > +{
> > + bool found = false;
> > + struct memory_dev_type *mtype;
> > +
> > + list_for_each_entry(mtype, memory_types, list) {
> > + if (mtype->adistance == adist) {
> > + found = true;
>
> Why not return here?
> return mtype;
>

Yes, I can return here. I will do that and take care of the ptr
returning at this point.

> > +  

Re: Point where target instructions are read

2024-04-03 Thread Richard Henderson

On 4/3/24 08:15, Gautam Bhat wrote:

On Tue, Apr 2, 2024 at 2:01 AM Richard Henderson
 wrote:


The boot process must cooperate somehow.

When using loader, you must link the image such that it loads at the pc reset 
address
defined by the architecture manual.


r~


I changed my loading options to the following now to have better control:

./qemu-system-msp430 -machine msp430-launchpad -device
loader,file=simple_test.bin,addr=0xFFFE,cpu-num=0,force
-raw=on -d in_asm,out_asm

Here simple_test.bin is the raw binary file converted using objcopy.
addr=0xFFFE is the vector location where the PC will load with the
starting address.

Now how do I load the address in that reset vector location and set my
PC? Is there some example code that I can look at?


Hmm.  I can't find an example.  I see a TODO for m68k which *should* be loading the pc 
from the reset vector on reset.


What I think should work is something like

void msp430_cpu_reset_hold(Object *obj)
{
standard stuff, mostly zeroing registers.
}

void msp430_cpu_reset_exit(Object *obj)
{
MSP430CPUClass *mcc = MSP430_CPU_GET_CLASS(obj);
CPUState *cs = CPU(obj);
CPUMSP430State *env = cpu_env(cs);
MemTxResult res;

if (mcc->parent_phases.exit) {
mcc->parent_phases.exit(obj);
}

/* Load PC from the Hard Reset interrupt vector. */
env->pc = address_space_lduw(cs->as, 0xfffe, MEMTXATTRS_UNSPECIFIED, &res);
assert(res == MEMTX_OK);
}

void msp430_cpu_class_init(ObjectClass *c, void *data)
{
MSP430CPUClass *mcc = MSP430_CPU_CLASS(c);
ResettableClass *rc = RESETTABLE_CLASS(c);

resettable_class_set_parent_phases(rc, NULL,
   msp430_cpu_reset_hold,
   msp430_cpu_reset_exit,
   &mcc->parent_phases);
}

The loader device populates ram during the reset hold phase, so I believe you need to wait 
until after that is complete to perform the load, thus the reset_exit hook.



r~



Re: Point where target instructions are read

2024-04-03 Thread BALATON Zoltan

On Wed, 3 Apr 2024, Gautam Bhat wrote:

On Tue, Apr 2, 2024 at 2:01 AM Richard Henderson
 wrote:


The boot process must cooperate somehow.

When using loader, you must link the image such that it loads at the pc reset 
address
defined by the architecture manual.


r~


I changed my loading options to the following now to have better control:

./qemu-system-msp430 -machine msp430-launchpad -device
loader,file=simple_test.bin,addr=0xFFFE,cpu-num=0,force
-raw=on -d in_asm,out_asm


Check the docs on the generic loader: 
https://www.qemu.org/docs/master/system/generic-loader.html
I think when using cpu-num it will also set the PC but I don't know much 
about it. Maybe you could start qemu with -S option then do info registers 
in QEMU monitor to check the status to find out what's happening. If real 
board has firmware maybe you need to use that or emulate it in the board 
code if the boot loader expects it to be present.


Regards,
BALATON Zoltan


Here simple_test.bin is the raw binary file converted using objcopy.
addr=0xFFFE is the vector location where the PC will load with the
starting address.

Now how do I load the address in that reset vector location and set my
PC? Is there some example code that I can look at?

-Gautam.



Re: TCG change broke MorphOS boot on sam460ex

2024-04-03 Thread BALATON Zoltan

On Wed, 3 Apr 2024, Nicholas Piggin wrote:

On Tue Apr 2, 2024 at 9:32 PM AEST, BALATON Zoltan wrote:

On Thu, 21 Mar 2024, BALATON Zoltan wrote:

On 27/2/24 17:47, BALATON Zoltan wrote:

Hello,

Commit 18a536f1f8 (accel/tcg: Always require can_do_io) broke booting
MorphOS on sam460ex (this was before 8.2.0 and I thought I've verified it
before that release but apparently missed it back then). It can be
reproduced with https://www.morphos-team.net/morphos-3.18.iso and following
command:

qemu-system-ppc -M sam460ex -serial stdio -d unimp,guest_errors \
   -drive if=none,id=cd,format=raw,file=morphos-3.18.iso \
   -device ide-cd,drive=cd,bus=ide.1


Any idea on this one? While MorphOS boots on other machines and other OSes
seem to boot on this machine it may still suggest there's some problem
somewhere as this worked before. So it may worth investigating it to make
sure there's no bug that could affect other OSes too even if they boot. I
don't know how to debug this so some help would be needed.


In the bad case it crashes after running this TB:


IN:
0x00c01354:  38c00040  li   r6, 0x40
0x00c01358:  38e10204  addi r7, r1, 0x204
0x00c0135c:  39010104  addi r8, r1, 0x104
0x00c01360:  39410004  addi r10, r1, 4
0x00c01364:  3920  li   r9, 0
0x00c01368:  7cc903a6  mtctrr6
0x00c0136c:  84c70004  lwzu r6, 4(r7)
0x00c01370:  7cc907a4  tlbwehi  r6, r9
0x00c01374:  84c80004  lwzu r6, 4(r8)
0x00c01378:  7cc90fa4  tlbwelo  r6, r9
0x00c0137c:  84ca0004  lwzu r6, 4(r10)
0x00c01380:  7cc917a4  tlbwehi  r6, r9
0x00c01384:  39290001  addi r9, r9, 1
0x00c01388:  4200ffe4  bdnz 0xc0136c

IN:
0x00c01374: unable to read memory


"unable to read memory" is the tracer, it does actually translate
the address, but it points to a wayward real address which returns
0 to TCG, which is an invalid instruction.

The good case instead doesn't exit the TB after 0x00c01370 but after
the complete loop at the bdnz. That look like this after the same
first TB:


IN:
0x00c0136c:  84c70004  lwzu r6, 4(r7)
0x00c01370:  7cc907a4  tlbwehi  r6, r9
0x00c01374:  84c80004  lwzu r6, 4(r8)
0x00c01378:  7cc90fa4  tlbwelo  r6, r9
0x00c0137c:  84ca0004  lwzu r6, 4(r10)
0x00c01380:  7cc917a4  tlbwehi  r6, r9
0x00c01384:  39290001  addi r9, r9, 1
0x00c01388:  4200ffe4  bdnz 0xc0136c

IN:
0x00c0138c:  4c00012c  isync

All the tlbwe are executed in the same TB. MMU tracing shows the
first tlbwehi creates a new valid(!) TLB for 0x-0x1
that has a garbage RPN because the tlbwelo did not run yet.

What's happening in the bad case is that the translator breaks
and "re-fetches" instructions in the middle of that sequence, and
that's where the bogus translation causes 0 to be returned. The
good case the whole block is executed in the same fetch which
creates correct translations.

So it looks like a morphos bug, the can-do-io change just happens
to cause it to re-fetch in that place, but that could happen for
a number of reasons, so you can't rely on TLB *only* changing or
ifetch *only* re-fetching at a sync point like isync.


Thanks a lot for the analysis. Probably it works on a real machine due to 
cache effects so maybe it was just luck this did not break.



I would expect code like this to write an invalid entry with tlbwehi,
then tlbwelo to set the correct RPN, then make the entry valid with
the second tlbwehi. It would probably fix the bug if you just did the
first tlbwehi with r6=0 (or at least without the 0x200 bit set).


I think I had to fix a similar issue in AROS years ago when I've first 
tried to make sam460ex emulation work and used AROS for testing:

https://github.com/aros-development-team/AROS/commit/586a8ada8a5b861a77cab177d39e01de8c3f4cf5

I can't fix MorphOS as it's not open source but hope MorphOS people will 
get to know about this and do something with it. It still works better on 
other emulated machines such as pegasos2 and mac99 so it's not a big deal, 
just wanted to make sure it would not be a bug that could affect other 
OSes on sam460ex.


Thank you,
BALATON Zoltan

Hermetic virtio-vsock in QEMU

2024-04-03 Thread Roman Kiryanov
Hi Peter, Alex and QEMU,

I work in Android Studio Emulator and we use virtio-vsock to emulate
devices (e.g. sensors) which live in the Emulator binary. We need to run on
Windows and in environments without CONFIG_VHOST_VSOCK, that is why we
cannot use vhost-vsock and invented our implementation. I tried to grep the
QEMU8 sources and I believe virtio-vsock is not available there.

Do you think it is a good idea to implement virtio-vsock in QEMU (e.g. to
run on Windows)? If the answer is yes, could you please point where I could
start to build an upstreamable solution (not Android Emulator specific)? It
is not clear to me how we should make the device available for clients
(sensors, modem, adb, etc) in a generic way.

Thank you.

Regards,
Roman.


Re: [PATCH 06/19] block/stream: fix -Werror=maybe-uninitialized false-positives

2024-04-03 Thread Vladimir Sementsov-Ogievskiy

On 03.04.24 20:50, Eric Blake wrote:

On Wed, Apr 03, 2024 at 01:24:11PM +0400, Marc-André Lureau wrote:

Unfortunately, it doesn't work in all cases. It seems to have issues
with some guards:
../block/stream.c: In function ‘stream_run’:
../block/stream.c:216:12: error: ‘ret’ may be used uninitialized
[-Werror=maybe-uninitialized]
216 | if (ret < 0) {



That one looks like:

int ret;
WITH_GRAPH_RDLOCK_GUARD() {
   ret = ...;
}
if (copy) {
   ret = ...;
}
if (ret < 0)

I suspect the compiler is seeing the uncertainty possible from the
second conditional, and letting it take priority over the certainty
that the tweaked macro provided for the first conditional.





So, updated macro helps in some cases, but doesn't help here? Interesting, why.


What should we do? change the macros + cherry-pick the missing
false-positives, or keep this series as is?


An uglier macro, with sufficient comments as to why it is ugly (in
order to let us have fewer false positives where we have to add
initializers) may be less churn in the code base, but I'm not
necessarily sold on the ugly macro.  Let's see if anyone else
expresses an opinion.







I think macro + missing is better. No reason to add dead-initializations in 
cases where new macros helps.


Ok


Still, would be good to understand, what's the difference, why it help on some 
cases and not help in another.


I don't know, it's like if the analyzer was lazy for this particular
case, although there is nothing much different from other usages.

If I replace:
for (... *var2 = (void *)true; var2;
with:
for (... *var2 = (void *)true; var2 || true;

then it doesn't warn..


but it also doesn't work.  We want the body to execute exactly once,
not infloop.




Interestingly as well, if I change:
 for (... *var2 = (void *)true; var2; var2 = NULL)
for:
 for (... *var2 = GML_OBJ_(); var2; var2 = NULL)

GML_OBJ_() simply being &(GraphLockable) { }), an empty compound
literal, then it doesn't work, in all usages.


So the compiler is not figuring out that the compound literal is
sufficient for an unconditional one time through the for loop body.

What's worse, different compiler versions will change behavior over
time.  Making the code ugly to pacify a particular compiler, when that
compiler might improve in the future, is a form of chasing windmills.



All in all, I am not sure the trick of using var2 is really reliable either.


And that's my biggest argument for not making the macro not more
complex than it already is.



All sounds reasonable, I'm not sure now.

I still don't like an idea to satisfy compiler false-positive warnings by extra 
initializations.. Interesting that older versions do have unitialized-use 
warnings, but don't fail here (or nobody check?). Is it necessary to fix them 
at all? Older versions of compiler don't produce these warnings?  Is it 
possible that some optimizations in new GCC version somehow breaks our WITH_ 
hack, so that it really lead to uninitialized behavior? And we just mask real 
problem with these patches?

Wouldn't it more correct to just drop WITH_ hack, and move to a bit uglier but 
more gcc-native and fair

{
   QEMU_LOCK_GUARD(lock);
   ...
}

?

--
Best regards,
Vladimir




Re: [PATCH 1/1] migration/multifd: solve zero page causing multiple page faults

2024-04-03 Thread Peter Xu
On Tue, Apr 02, 2024 at 09:57:49AM -0300, Fabiano Rosas wrote:
> Yuan Liu  writes:
> 
> > Implemented recvbitmap tracking of received pages in multifd.
> >
> > If the zero page appears for the first time in the recvbitmap, this
> > page is not checked and set.
> >
> > If the zero page has already appeared in the recvbitmap, there is no
> > need to check the data but directly set the data to 0, because it is
> > unlikely that the zero page will be migrated multiple times.
> >
> > Signed-off-by: Yuan Liu 
> 
> Reviewed-by: Fabiano Rosas 
> 

I queued it with below squashed to update the comment. I hope it works for
you.  Thanks,

===8<===
diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
index 848915ea5b..7062da380b 100644
--- a/include/exec/ramblock.h
+++ b/include/exec/ramblock.h
@@ -57,7 +57,7 @@ struct RAMBlock {
 off_t bitmap_offset;
 uint64_t pages_offset;
 
-/* bitmap of already received pages in postcopy */
+/* Bitmap of already received pages.  Only used on destination side. */
 unsigned long *receivedmap;
 
 /*
===8<===

-- 
Peter Xu




riscv disassembler error with pmpcfg0

2024-04-03 Thread Eric DeVolder
I've been using QEMU8 to collect instruction information on U-Boot + OpenSBI.

I'm running QEMU in this fashion to collect the information:

# qemu-system-riscv64 -plugin file=qemu/build/contrib/plugins/libexeclog.so 
-singlestep -d plugin,nochain -D execlog.txt ...

When examining the instruction trace in execlog, I've noticed that the 
disassembly for pmpcfg0 is erroneous, for example:

0, 0x5456, 0x3a002573, "csrrs   a0,pmpcfg3,zero"

the CSR encoded in the instruction above is 0x3a0, which is pmpcfg0 (which also 
matches the code I'm examining).

For the Uboot+OpenSBI code I'm examining, pmpcfg0/3 is the only one that 
appears to have a problem.

I also checked QEMU9 and it behaves as described above as well.

I'm willing to provide a fix if I can get some advice/pointers on how this 
disassembly statement is generated...I did take a quick look but it didn't 
appear obvious how...

Thanks,
eric



Re: [PATCH v2] e1000: Convert debug macros into tracepoints.

2024-04-03 Thread Austin Clements
At this point there's not much of my original code left. :D Don, you're
welcome to take the credit in the commit.

On Wed, Apr 3, 2024, 9:46 AM Don Porter  wrote:

> From: Austin Clements 
>
> The E1000 debug messages are very useful for developing drivers.
> Make these available to users without recompiling QEMU.
>
> Signed-off-by: Austin Clements 
> [geo...@ldpreload.com: Rebased on top of 2.9.0]
> Signed-off-by: Geoffrey Thomas 
> Signed-off-by: Don Porter 
> ---
>  hw/net/e1000.c  | 90 +++--
>  hw/net/trace-events | 25 -
>  2 files changed, 54 insertions(+), 61 deletions(-)
>
> diff --git a/hw/net/e1000.c b/hw/net/e1000.c
> index 43f3a4a701..24475636a3 100644
> --- a/hw/net/e1000.c
> +++ b/hw/net/e1000.c
> @@ -44,26 +44,6 @@
>  #include "trace.h"
>  #include "qom/object.h"
>
> -/* #define E1000_DEBUG */
> -
> -#ifdef E1000_DEBUG
> -enum {
> -DEBUG_GENERAL,  DEBUG_IO,   DEBUG_MMIO, DEBUG_INTERRUPT,
> -DEBUG_RX,   DEBUG_TX,   DEBUG_MDIC, DEBUG_EEPROM,
> -DEBUG_UNKNOWN,  DEBUG_TXSUM,DEBUG_TXERR,DEBUG_RXERR,
> -DEBUG_RXFILTER, DEBUG_PHY,  DEBUG_NOTYET,
> -};
> -#define DBGBIT(x)(1<<(x))
> -static int debugflags = DBGBIT(TXERR) | DBGBIT(GENERAL);
> -
> -#define DBGOUT(what, fmt, ...) do { \
> -if (debugflags & DBGBIT(what)) \
> -fprintf(stderr, "e1000: " fmt, ## __VA_ARGS__); \
> -} while (0)
> -#else
> -#define DBGOUT(what, fmt, ...) do {} while (0)
> -#endif
> -
>  #define IOPORT_SIZE   0x40
>  #define PNPMMIO_SIZE  0x2
>
> @@ -351,8 +331,7 @@ e1000_mit_timer(void *opaque)
>  static void
>  set_ics(E1000State *s, int index, uint32_t val)
>  {
> -DBGOUT(INTERRUPT, "set_ics %x, ICR %x, IMR %x\n", val,
> s->mac_reg[ICR],
> -s->mac_reg[IMS]);
> +trace_e1000_set_ics(val, s->mac_reg[ICR], s->mac_reg[IMS]);
>  set_interrupt_cause(s, 0, val | s->mac_reg[ICR]);
>  }
>
> @@ -425,8 +404,7 @@ set_rx_control(E1000State *s, int index, uint32_t val)
>  s->mac_reg[RCTL] = val;
>  s->rxbuf_size = e1000x_rxbufsize(val);
>  s->rxbuf_min_shift = ((val / E1000_RCTL_RDMTS_QUAT) & 3) + 1;
> -DBGOUT(RX, "RCTL: %d, mac_reg[RCTL] = 0x%x\n", s->mac_reg[RDT],
> -   s->mac_reg[RCTL]);
> +trace_e1000_set_rx_control(s->mac_reg[RDT], s->mac_reg[RCTL]);
>  timer_mod(s->flush_queue_timer,
>qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 1000);
>  }
> @@ -440,16 +418,16 @@ set_mdic(E1000State *s, int index, uint32_t val)
>  if ((val & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT != 1) // phy #
>  val = s->mac_reg[MDIC] | E1000_MDIC_ERROR;
>  else if (val & E1000_MDIC_OP_READ) {
> -DBGOUT(MDIC, "MDIC read reg 0x%x\n", addr);
> +trace_e1000_mdic_read_register(addr);
>  if (!(phy_regcap[addr] & PHY_R)) {
> -DBGOUT(MDIC, "MDIC read reg %x unhandled\n", addr);
> +trace_e1000_mdic_read_register_unhandled(addr);
>  val |= E1000_MDIC_ERROR;
>  } else
>  val = (val ^ data) | s->phy_reg[addr];
>  } else if (val & E1000_MDIC_OP_WRITE) {
> -DBGOUT(MDIC, "MDIC write reg 0x%x, value 0x%x\n", addr, data);
> +trace_e1000_mdic_write_register(addr, data);
>  if (!(phy_regcap[addr] & PHY_W)) {
> -DBGOUT(MDIC, "MDIC write reg %x unhandled\n", addr);
> +trace_e1000_mdic_write_register_unhandled(addr);
>  val |= E1000_MDIC_ERROR;
>  } else {
>  if (addr < NPHYWRITEOPS && phyreg_writeops[addr]) {
> @@ -471,8 +449,8 @@ get_eecd(E1000State *s, int index)
>  {
>  uint32_t ret = E1000_EECD_PRES|E1000_EECD_GNT |
> s->eecd_state.old_eecd;
>
> -DBGOUT(EEPROM, "reading eeprom bit %d (reading %d)\n",
> -   s->eecd_state.bitnum_out, s->eecd_state.reading);
> +trace_e1000_get_eecd(s->eecd_state.bitnum_out, s->eecd_state.reading);
> +
>  if (!s->eecd_state.reading ||
>  ((s->eeprom_data[(s->eecd_state.bitnum_out >> 4) & 0x3f] >>
>((s->eecd_state.bitnum_out & 0xf) ^ 0xf))) & 1)
> @@ -511,9 +489,8 @@ set_eecd(E1000State *s, int index, uint32_t val)
>  s->eecd_state.reading = (((s->eecd_state.val_in >> 6) & 7) ==
>  EEPROM_READ_OPCODE_MICROWIRE);
>  }
> -DBGOUT(EEPROM, "eeprom bitnum in %d out %d, reading %d\n",
> -   s->eecd_state.bitnum_in, s->eecd_state.bitnum_out,
> -   s->eecd_state.reading);
> +trace_e1000_set_eecd(s->eecd_state.bitnum_in,
> s->eecd_state.bitnum_out,
> + s->eecd_state.reading);
>  }
>
>  static uint32_t
> @@ -580,8 +557,7 @@ xmit_seg(E1000State *s)
>
>  if (tp->cptse) {
>  css = props->ipcss;
> -DBGOUT(TXSUM, "frames %d size %d ipcss %d\n",
> -   frames, tp->size, css);
> +trace_e1000_xmit_seg1(frames, tp->size, css);
>  if (props->ip) {/* IPv4 */
>  stw_be_p(tp->data+css+2, tp->size - css);
>   

Re: Intention to work on GSoC project

2024-04-03 Thread Eugenio Perez Martin
On Wed, Apr 3, 2024 at 4:36 PM Sahil  wrote:
>
> Hi,
>
> Thank you for the reply.
>
> On Tuesday, April 2, 2024 5:08:24 PM IST Eugenio Perez Martin wrote:
> > [...]
> > > > > Q2.
> > > > > In the Red Hat article, just below the first listing ("Memory layout 
> > > > > of a
> > > > > packed virtqueue descriptor"), there's the following line referring 
> > > > > to the
> > > > > buffer id in "virtq_desc":
> > > > > > This time, the id field is not an index for the device to look for 
> > > > > > the
> > > > > > buffer: it is an opaque value for it, only has meaning for the 
> > > > > > driver.
> > > > >
> > > > > But the device returns the buffer id when it writes the used 
> > > > > descriptor to
> > > > > the descriptor ring. The "only has meaning for the driver" part has 
> > > > > got me
> > > > > a little confused. Which buffer id is this that the device returns? 
> > > > > Is it related
> > > > > to the buffer id in the available descriptor?
> > > >
> > > > In my understanding, buffer id is the element that avail descriptor
> > > > marks to identify when adding descriptors to table. Device will returns
> > > > the buffer id in the processed descriptor or the last descriptor in a
> > > > chain, and write it to the descriptor that used idx refers to (first
> > > > one in the chain). Then used idx increments.
> > > >
> > > > The Packed Virtqueue blog [1] is helpful, but some details in the
> > > > examples
> > > > are making me confused.
> > > >
> > > > Q1.
> > > > In the last step of the two-entries descriptor table example, it says
> > > > both buffers #0 and #1 are available for the device. I understand
> > > > descriptor[0] is available and descriptor[1] is not, but there is no ID 
> > > > #0
> > > > now. So does the device got buffer #0 by notification beforehand? If so,
> > > > does it mean buffer #0 will be lost when notifications are disabled?
> >
> > I guess you mean the table labeled "Figure: Full two-entries descriptor
> > table".
> >
> > Take into account that the descriptor table is not the state of all
> > the descriptors. That information must be maintained by the device and
> > the driver internally.
> >
> > The descriptor table is used as a circular buffer, where one part is
> > writable by the driver and the other part is writable by the device.
> > For the device to override the descriptor table entry where descriptor
> > id 0 used to be does not mean that the descriptor id 0 is used. It
> > just means that the device communicates to the driver that descriptor
> > 1 is used, and both sides need to keep the descriptor state
> > coherently.
> >
> > > I too have a similar question and understanding the relation between
> > > buffer
> > > ids in the used and available descriptors might give more insight into
> > > this. For available descriptors, the buffer id is used to associate
> > > descriptors with a particular buffer. I am still not very sure about ids
> > > in used descriptors.
> > >
> > > Regarding Q1, both buffers #0 and #1 are available. In the mentioned
> > > figure, both descriptor[0] and descriptor[1] are available. This figure
> > > follows the figure with the caption "Using first buffer out of order". So
> > > in the first figure the device reads buffer #1 and writes the used
> > > descriptor but it still has buffer #0 to read. That still belongs to the
> > > device while buffer #1 can now be handled by the driver once again. So in
> > > the next figure, the driver makes buffer #1 available again. The device
> > > can still read buffer #0 from the previous batch of available
> > > descriptors.
> > >
> > > Based on what I have understood, the driver can't touch the descriptor
> > > corresponding to buffer #0 until the device acknowledges it. I did find
> > > the
> > > figure a little confusing as well. I think once the meaning of buffer id
> > > is clear from the driver's and device's perspective, it'll be easier to
> > > understand the figure.
> >
> > I think you got it right. Please let me know if you have further questions.
>
> I would like to clarify one thing in the figure "Full two-entries descriptor
> table". The driver can only overwrite a used descriptor in the descriptor
> ring, right?

Except for the first round, the driver can only write to used entries
in the descriptor table. In other words, their avail and used flags
must be equal.

> And likewise for the device?

Yes, but with avail descs. I think you got this already, but I want to
be as complete as possible here.

> So in the figure, the driver will
> have to wait until descriptor[1] is used before it can overwrite it?
>

Yes, but I think it is easier to think that both descriptor id 0 and 1
are available already. The descriptor id must be less than virtqueue
size.

An entry with a valid buffer and length must be invalid because of the
descriptor id in that situation, either because it is a number > vq
length or because it is a descriptor already available.

> Suppose the device marks descriptor[0] as used. 

Re: [PATCH v3] input-linux: Add option to not grab a device upon guest startup

2024-04-03 Thread Markus Armbruster
Justinien Bouron  writes:

>> Again, QAPI schema
>
> Pardon my ignorance, but are you writing this because there is a problem with
> the QAPI schema changes that I would need to fix and re-submit?
> Or is it just here to indicate that you've reviewed the change made to the
> schema?

The latter.

Reviewed-by: means I reviewed the entire patch, and approve it.

Acked-by: is weaker, and its meaning depends on context.  In this case,
it means I reviewed just the QAPI schema, and approve it.

Details (likely more than you ever wanted to know) at:
https://www.kernel.org/doc/html/latest/process/submitting-patches.html




Re: [PATCH v6 09/12] hw/cxl/events: Add qmp interfaces to add/release dynamic capacity extents

2024-04-03 Thread Gregory Price
On Mon, Mar 25, 2024 at 12:02:27PM -0700, nifan@gmail.com wrote:
> From: Fan Ni 
> 
> To simulate FM functionalities for initiating Dynamic Capacity Add
> (Opcode 5604h) and Dynamic Capacity Release (Opcode 5605h) as in CXL spec
> r3.1 7.6.7.6.5 and 7.6.7.6.6, we implemented two QMP interfaces to issue
> add/release dynamic capacity extents requests.
> 
... snip 
> +
> +/*
> + * The main function to process dynamic capacity event. Currently DC extents
> + * add/release requests are processed.
> + */
> +static void qmp_cxl_process_dynamic_capacity(const char *path, CxlEventLog 
> log,
> + CXLDCEventType type, uint16_t 
> hid,
> + uint8_t rid,
> + CXLDCExtentRecordList *records,
> + Error **errp)
> +{
... snip 
> +/* Sanity check and count the extents */
> +list = records;
> +while (list) {
> +offset = list->value->offset;
> +len = list->value->len;
> +dpa = offset + dcd->dc.regions[rid].base;
> +
> +if (len == 0) {
> +error_setg(errp, "extent with 0 length is not allowed");
> +return;
> +}
> +
> +if (offset % block_size || len % block_size) {
> +error_setg(errp, "dpa or len is not aligned to region block 
> size");
> +return;
> +}
> +
> +if (offset + len > dcd->dc.regions[rid].len) {
> +error_setg(errp, "extent range is beyond the region end");
> +return;
> +}
> +
> +/* No duplicate or overlapped extents are allowed */
> +if (test_any_bits_set(blk_bitmap, offset / block_size,
> +  len / block_size)) {
> +error_setg(errp, "duplicate or overlapped extents are detected");
> +return;
> +}
> +bitmap_set(blk_bitmap, offset / block_size, len / block_size);
> +
> +num_extents++;

I think num_extents is always equal to the length of the list, otherwise
this code will return with error.

Nitpick:
This can be moved to the bottom w/ `list = list->next` to express that a
little more clearly.

> +if (type == DC_EVENT_RELEASE_CAPACITY) {
> +if (cxl_extents_overlaps_dpa_range(&dcd->dc.extents_pending,
> +   dpa, len)) {
> +error_setg(errp,
> +   "cannot release extent with pending DPA range");
> +return;
> +}
> +if (!cxl_extents_contains_dpa_range(&dcd->dc.extents,
> +dpa, len)) {
> +error_setg(errp,
> +   "cannot release extent with non-existing DPA 
> range");
> +return;
> +}
> +}
> +list = list->next;
> +}
> +
> +if (num_extents == 0) {

Since num_extents is always the length of the list, this is equivalent to
`if (!records)` prior to the while loop. Makes it a little more clear that:

1. There must be at least 1 extent
2. All extents must be valid for the command to be serviced.

> +error_setg(errp, "no valid extents to send to process");
> +return;
> +}
> +

I'm looking at adding the MHD extensions around this point, e.g.:

/* If MHD cannot allocate requested extents, the cmd fails */
if (type == DC_EVENT_ADD_CAPACITY && dcd->mhd_dcd_extents_allocate &&
num_extents != dcd->mhd_dcd_extents_allocate(...))
return;

where mhd_dcd_extents_allocate checks the MHD block bitmap and tags
for correctness (shared // no double-allocations, etc). On success,
it garuantees proper ownership.

the release path would then be done in the release response path from
the host, as opposed to the release event injection.

Do you see any issues with that flow?

> +/* Create extent list for event being passed to host */
> +i = 0;
> +list = records;
> +extents = g_new0(CXLDCExtentRaw, num_extents);
> +while (list) {
> +offset = list->value->offset;
> +len = list->value->len;
> +dpa = dcd->dc.regions[rid].base + offset;
> +
> +extents[i].start_dpa = dpa;
> +extents[i].len = len;
> +memset(extents[i].tag, 0, 0x10);
> +extents[i].shared_seq = 0;
> +list = list->next;
> +i++;
> +}
> +
> +/*
> + * CXL r3.1 section 8.2.9.2.1.6: Dynamic Capacity Event Record
> + *
> + * All Dynamic Capacity event records shall set the Event Record Severity
> + * field in the Common Event Record Format to Informational Event. All
> + * Dynamic Capacity related events shall be logged in the Dynamic 
> Capacity
> + * Event Log.
> + */
> +cxl_assign_event_header(hdr, &dynamic_capacity_uuid, flags, sizeof(dCap),
> +cxl_device_get_timestamp(&dcd->cxl_dstate));
> +
> +dCap.type = type;
> +/* FIXME: 

Re: [PATCH] target/hppa: Fix IIAOQ, IIASQ for pa2.0

2024-04-03 Thread Helge Deller

On 4/2/24 03:25, Richard Henderson wrote:

The contents of IIAOQ depend on PSW_W.
Follow the text in "Interruption Instruction Address Queues",
pages 2-13 through 2-15.

Reported-by: Sven Schnelle 
Fixes: b10700d826c ("target/hppa: Update IIAOQ, IIASQ for pa2.0")
Signed-off-by: Richard Henderson 


Tested-by: Helge Deller 

Helge


---

Sven, I looked again through IIAOQ documentation and it does seem
like some of the bits are wrong, both on interrupt delivery and RFI.


r~

---
  target/hppa/int_helper.c | 20 +++-
  target/hppa/sys_helper.c | 18 +-
  2 files changed, 20 insertions(+), 18 deletions(-)

diff --git a/target/hppa/int_helper.c b/target/hppa/int_helper.c
index 90437a92cd..a667ee380d 100644
--- a/target/hppa/int_helper.c
+++ b/target/hppa/int_helper.c
@@ -107,14 +107,10 @@ void hppa_cpu_do_interrupt(CPUState *cs)

  /* step 3 */
  /*
- * For pa1.x, IIASQ is simply a copy of IASQ.
- * For pa2.0, IIASQ is the top bits of the virtual address,
- *or zero if translation is disabled.
+ * IIASQ is the top bits of the virtual address, or zero if translation
+ * is disabled -- with PSW_W == 0, this will reduce to the space.
   */
-if (!hppa_is_pa20(env)) {
-env->cr[CR_IIASQ] = env->iasq_f >> 32;
-env->cr_back[0] = env->iasq_b >> 32;
-} else if (old_psw & PSW_C) {
+if (old_psw & PSW_C) {
  env->cr[CR_IIASQ] =
  hppa_form_gva_psw(old_psw, env->iasq_f, env->iaoq_f) >> 32;
  env->cr_back[0] =
@@ -123,8 +119,14 @@ void hppa_cpu_do_interrupt(CPUState *cs)
  env->cr[CR_IIASQ] = 0;
  env->cr_back[0] = 0;
  }
-env->cr[CR_IIAOQ] = env->iaoq_f;
-env->cr_back[1] = env->iaoq_b;
+/* IIAOQ is the full offset for wide mode, or 32 bits for narrow mode. */
+if (old_psw & PSW_W) {
+env->cr[CR_IIAOQ] = env->iaoq_f;
+env->cr_back[1] = env->iaoq_b;
+} else {
+env->cr[CR_IIAOQ] = (uint32_t)env->iaoq_f;
+env->cr_back[1] = (uint32_t)env->iaoq_b;
+}

  if (old_psw & PSW_Q) {
  /* step 5 */
diff --git a/target/hppa/sys_helper.c b/target/hppa/sys_helper.c
index 208e51c086..22d6c89964 100644
--- a/target/hppa/sys_helper.c
+++ b/target/hppa/sys_helper.c
@@ -78,21 +78,21 @@ target_ulong HELPER(swap_system_mask)(CPUHPPAState *env, 
target_ulong nsm)

  void HELPER(rfi)(CPUHPPAState *env)
  {
-env->iasq_f = (uint64_t)env->cr[CR_IIASQ] << 32;
-env->iasq_b = (uint64_t)env->cr_back[0] << 32;
-env->iaoq_f = env->cr[CR_IIAOQ];
-env->iaoq_b = env->cr_back[1];
+uint64_t mask;
+
+cpu_hppa_put_psw(env, env->cr[CR_IPSW]);

  /*
   * For pa2.0, IIASQ is the top bits of the virtual address.
   * To recreate the space identifier, remove the offset bits.
+ * For pa1.x, the mask reduces to no change to space.
   */
-if (hppa_is_pa20(env)) {
-env->iasq_f &= ~env->iaoq_f;
-env->iasq_b &= ~env->iaoq_b;
-}
+mask = gva_offset_mask(env->psw);

-cpu_hppa_put_psw(env, env->cr[CR_IPSW]);
+env->iaoq_f = env->cr[CR_IIAOQ];
+env->iaoq_b = env->cr_back[1];
+env->iasq_f = (env->cr[CR_IIASQ] << 32) & ~(env->iaoq_f & mask);
+env->iasq_b = (env->cr_back[0] << 32) & ~(env->iaoq_b & mask);
  }

  static void getshadowregs(CPUHPPAState *env)





Re: Point where target instructions are read

2024-04-03 Thread Gautam Bhat
On Tue, Apr 2, 2024 at 2:01 AM Richard Henderson
 wrote:

> The boot process must cooperate somehow.
>
> When using loader, you must link the image such that it loads at the pc reset 
> address
> defined by the architecture manual.
>
>
> r~

I changed my loading options to the following now to have better control:

./qemu-system-msp430 -machine msp430-launchpad -device
loader,file=simple_test.bin,addr=0xFFFE,cpu-num=0,force
-raw=on -d in_asm,out_asm

Here simple_test.bin is the raw binary file converted using objcopy.
addr=0xFFFE is the vector location where the PC will load with the
starting address.

Now how do I load the address in that reset vector location and set my
PC? Is there some example code that I can look at?

-Gautam.



Re: [PATCH v2] e1000: Convert debug macros into tracepoints.

2024-04-03 Thread Richard Henderson

On 4/3/24 03:45, Don Porter wrote:

From: Austin Clements

The E1000 debug messages are very useful for developing drivers.
Make these available to users without recompiling QEMU.

Signed-off-by: Austin Clements
[geo...@ldpreload.com: Rebased on top of 2.9.0]
Signed-off-by: Geoffrey Thomas
Signed-off-by: Don Porter
---
  hw/net/e1000.c  | 90 +++--
  hw/net/trace-events | 25 -
  2 files changed, 54 insertions(+), 61 deletions(-)


Reviewed-by: Richard Henderson 


r~



Re: [PATCH 06/19] block/stream: fix -Werror=maybe-uninitialized false-positives

2024-04-03 Thread Eric Blake
On Wed, Apr 03, 2024 at 01:24:11PM +0400, Marc-André Lureau wrote:
> > > Unfortunately, it doesn't work in all cases. It seems to have issues
> > > with some guards:
> > > ../block/stream.c: In function ‘stream_run’:
> > > ../block/stream.c:216:12: error: ‘ret’ may be used uninitialized
> > > [-Werror=maybe-uninitialized]
> > >216 | if (ret < 0) {
> > >

That one looks like:

int ret;
WITH_GRAPH_RDLOCK_GUARD() {
  ret = ...;
}
if (copy) {
  ret = ...;
}
if (ret < 0)

I suspect the compiler is seeing the uncertainty possible from the
second conditional, and letting it take priority over the certainty
that the tweaked macro provided for the first conditional.

> > >
> >
> > So, updated macro helps in some cases, but doesn't help here? Intersting, 
> > why.
> >
> > > What should we do? change the macros + cherry-pick the missing
> > > false-positives, or keep this series as is?

An uglier macro, with sufficient comments as to why it is ugly (in
order to let us have fewer false positives where we have to add
initializers) may be less churn in the code base, but I'm not
necessarily sold on the ugly macro.  Let's see if anyone else
expresses an opinion.


> > >
> > >
> >
> > I think marco + missing is better. No reason to add dead-initializations in 
> > cases where new macros helps.
> 
> Ok
> 
> > Still, would be good to understand, what's the difference, why it help on 
> > some cases and not help in another.
> 
> I don't know, it's like if the analyzer was lazy for this particular
> case, although there is nothing much different from other usages.
> 
> If I replace:
> for (... *var2 = (void *)true; var2;
> with:
> for (... *var2 = (void *)true; var2 || true;
> 
> then it doesn't warn..

but it also doesn't work.  We want the body to execute exactly once,
not infloop.


> 
> Interestingly as well, if I change:
> for (... *var2 = (void *)true; var2; var2 = NULL)
> for:
> for (... *var2 = GML_OBJ_(); var2; var2 = NULL)
> 
> GML_OBJ_() simply being &(GraphLockable) { }), an empty compound
> literal, then it doesn't work, in all usages.

So the compiler is not figuring out that the compound literal is
sufficient for an unconditional one time through the for loop body.

What's worse, different compiler versions will change behavior over
time.  Making the code ugly to pacify a particular compiler, when that
compiler might improve in the future, is a form of chasing windmills.

> 
> All in all, I am not sure the trick of using var2 is really reliable either.

And that's my biggest argument for not making the macro more
complex than it already is.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.
Virtualization:  qemu.org | libguestfs.org




Re: [PATCH v10 2/2] memory tier: create CPUless memory tiers after obtaining HMAT info

2024-04-03 Thread Jonathan Cameron via
A few minor comments inline.

> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index a44c03c2ba3a..16769552a338 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -140,12 +140,13 @@ static inline int mt_perf_to_adistance(struct 
> access_coordinate *perf, int *adis
>   return -EIO;
>  }
>  
> -struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct 
> list_head *memory_types)
> +static inline struct memory_dev_type *mt_find_alloc_memory_type(int adist,
> + struct list_head *memory_types)
>  {
>   return NULL;
>  }
>  
> -void mt_put_memory_types(struct list_head *memory_types)
> +static inline void mt_put_memory_types(struct list_head *memory_types)
>  {
Why in this patch and not previous one?
>  
>  }
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 974af10cfdd8..44fa10980d37 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -36,6 +36,11 @@ struct node_memory_type_map {
>  
>  static DEFINE_MUTEX(memory_tier_lock);
>  static LIST_HEAD(memory_tiers);
> +/*
> + * The list is used to store all memory types that are not created
> + * by a device driver.
> + */
> +static LIST_HEAD(default_memory_types);
>  static struct node_memory_type_map node_memory_types[MAX_NUMNODES];
>  struct memory_dev_type *default_dram_type;
>  
> @@ -108,6 +113,8 @@ static struct demotion_nodes *node_demotion __read_mostly;
>  
>  static BLOCKING_NOTIFIER_HEAD(mt_adistance_algorithms);
>  
> +/* The lock is used to protect `default_dram_perf*` info and nid. */
> +static DEFINE_MUTEX(default_dram_perf_lock);
>  static bool default_dram_perf_error;
>  static struct access_coordinate default_dram_perf;
>  static int default_dram_perf_ref_nid = NUMA_NO_NODE;
> @@ -505,7 +512,8 @@ static inline void __init_node_memory_type(int node, 
> struct memory_dev_type *mem
>  static struct memory_tier *set_node_memory_tier(int node)
>  {
>   struct memory_tier *memtier;
> - struct memory_dev_type *memtype;
> + struct memory_dev_type *mtype = default_dram_type;

Does the rename add anything major to the patch?
If not I'd leave it alone to reduce the churn and give
a more readable patch.  If it is worth doing perhaps
a precursor patch?

> + int adist = MEMTIER_ADISTANCE_DRAM;
>   pg_data_t *pgdat = NODE_DATA(node);
>  
>  
> @@ -514,11 +522,20 @@ static struct memory_tier *set_node_memory_tier(int 
> node)
>   if (!node_state(node, N_MEMORY))
>   return ERR_PTR(-EINVAL);
>  
> - __init_node_memory_type(node, default_dram_type);
> + mt_calc_adistance(node, &adist);
> + if (node_memory_types[node].memtype == NULL) {
> + mtype = mt_find_alloc_memory_type(adist, &default_memory_types);
> + if (IS_ERR(mtype)) {
> + mtype = default_dram_type;
> + pr_info("Failed to allocate a memory type. Fall 
> back.\n");
> + }
> + }
> +
> + __init_node_memory_type(node, mtype);
>  
> - memtype = node_memory_types[node].memtype;
> - node_set(node, memtype->nodes);
> - memtier = find_create_memory_tier(memtype);
> + mtype = node_memory_types[node].memtype;
> + node_set(node, mtype->nodes);
> + memtier = find_create_memory_tier(mtype);
>   if (!IS_ERR(memtier))
>   rcu_assign_pointer(pgdat->memtier, memtier);
>   return memtier;
> @@ -655,6 +672,33 @@ void mt_put_memory_types(struct list_head *memory_types)
>  }
>  EXPORT_SYMBOL_GPL(mt_put_memory_types);
>  
> +/*
> + * This is invoked via `late_initcall()` to initialize memory tiers for
> + * CPU-less memory nodes after driver initialization, which is
> + * expected to provide `adistance` algorithms.
> + */
> +static int __init memory_tier_late_init(void)
> +{
> + int nid;
> +
> + mutex_lock(&memory_tier_lock);
> + for_each_node_state(nid, N_MEMORY)
> + if (node_memory_types[nid].memtype == NULL)
> + /*
> +  * Some device drivers may have initialized memory tiers
> +  * between `memory_tier_init()` and 
> `memory_tier_late_init()`,
> +  * potentially bringing online memory nodes and
> +  * configuring memory tiers. Exclude them here.
> +  */

Does the comment refer to this path, or to ones where memtype is set?

> + set_node_memory_tier(nid);

Given the large comment I would add {} to help with readability.
You could flip the logic to reduce indent
for_each_node_state(nid, N_MEMORY) {
if (node_memory_types[nid].memtype)
continue;
/*
 * Some device drivers may have initialized memory tiers
 * between `memory_tier_init()` and `memory_tier_late_init()`,
 * potentially bringing online memory nodes and
 * configuring memory tiers. Exclude them 

Re: [PATCH v10 1/2] memory tier: dax/kmem: introduce an abstract layer for finding, allocating, and putting memory types

2024-04-03 Thread Jonathan Cameron via
On Tue,  2 Apr 2024 00:17:37 +
"Ho-Ren (Jack) Chuang"  wrote:

> Since different memory devices require finding, allocating, and putting
> memory types, these common steps are abstracted in this patch,
> enhancing the scalability and conciseness of the code.
> 
> Signed-off-by: Ho-Ren (Jack) Chuang 
> Reviewed-by: "Huang, Ying" 

Hi,

I know this is a late entry to the discussion but a few comments inline.
(sorry I didn't look earlier!)

All opportunities to improve code complexity and readability as a result
of your factoring out.

Jonathan


> ---
>  drivers/dax/kmem.c   | 20 ++--
>  include/linux/memory-tiers.h | 13 +
>  mm/memory-tiers.c| 32 
>  3 files changed, 47 insertions(+), 18 deletions(-)
> 
> diff --git a/drivers/dax/kmem.c b/drivers/dax/kmem.c
> index 42ee360cf4e3..01399e5b53b2 100644
> --- a/drivers/dax/kmem.c
> +++ b/drivers/dax/kmem.c
> @@ -55,21 +55,10 @@ static LIST_HEAD(kmem_memory_types);
>  
>  static struct memory_dev_type *kmem_find_alloc_memory_type(int adist)
>  {
> - bool found = false;
>   struct memory_dev_type *mtype;
>  
>   mutex_lock(&kmem_memory_type_lock);
could use

guard(mutex)(&kmem_memory_type_lock);
return mt_find_alloc_memory_type(adist, &kmem_memory_types);

I'm fine if you ignore this comment though as may be other functions in
here that could take advantage of the cleanup.h stuff in a future patch.

> - list_for_each_entry(mtype, _memory_types, list) {
> - if (mtype->adistance == adist) {
> - found = true;
> - break;
> - }
> - }
> - if (!found) {
> - mtype = alloc_memory_type(adist);
> - if (!IS_ERR(mtype))
> - list_add(&mtype->list, &kmem_memory_types);
> - }
> + mtype = mt_find_alloc_memory_type(adist, &kmem_memory_types);
>   mutex_unlock(&kmem_memory_type_lock);
>  
>   return mtype;
 
> diff --git a/include/linux/memory-tiers.h b/include/linux/memory-tiers.h
> index 69e781900082..a44c03c2ba3a 100644
> --- a/include/linux/memory-tiers.h
> +++ b/include/linux/memory-tiers.h
> @@ -48,6 +48,9 @@ int mt_calc_adistance(int node, int *adist);
>  int mt_set_default_dram_perf(int nid, struct access_coordinate *perf,
>const char *source);
>  int mt_perf_to_adistance(struct access_coordinate *perf, int *adist);
> +struct memory_dev_type *mt_find_alloc_memory_type(int adist,
> + struct list_head 
> *memory_types);

That indent looks unusual.  Align the start of struct with start of int.

> +void mt_put_memory_types(struct list_head *memory_types);
>  #ifdef CONFIG_MIGRATION
>  int next_demotion_node(int node);
>  void node_get_allowed_targets(pg_data_t *pgdat, nodemask_t *targets);
> @@ -136,5 +139,15 @@ static inline int mt_perf_to_adistance(struct 
> access_coordinate *perf, int *adis
>  {
>   return -EIO;
>  }
> +
> +struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct 
> list_head *memory_types)
> +{
> + return NULL;
> +}
> +
> +void mt_put_memory_types(struct list_head *memory_types)
> +{
> +
No blank line needed here. 
> +}
>  #endif   /* CONFIG_NUMA */
>  #endif  /* _LINUX_MEMORY_TIERS_H */
> diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
> index 0537664620e5..974af10cfdd8 100644
> --- a/mm/memory-tiers.c
> +++ b/mm/memory-tiers.c
> @@ -623,6 +623,38 @@ void clear_node_memory_type(int node, struct 
> memory_dev_type *memtype)
>  }
>  EXPORT_SYMBOL_GPL(clear_node_memory_type);
>  
> +struct memory_dev_type *mt_find_alloc_memory_type(int adist, struct 
> list_head *memory_types)

Breaking this out as a separate function provides opportunity to improve it.
Maybe a follow up patch makes sense given it would no longer be a straight
forward code move.  However in my view it would be simple enough to be obvious
even within this patch.

> +{
> + bool found = false;
> + struct memory_dev_type *mtype;
> +
> + list_for_each_entry(mtype, memory_types, list) {
> + if (mtype->adistance == adist) {
> + found = true;

Why not return here?
return mtype;

> + break;
> + }
> + }
> + if (!found) {

If returning above, no need for found variable - just do this unconditionally.
+ I suggest you flip logic for simpler to follow code flow.
It's more code but I think a bit easier to read as error handling is
out of the main simple flow.

mtype = alloc_memory_type(adist);
if (IS_ERR(mtype))
return mtype;

list_add(&mtype->list, memory_types);

return mtype;

> + mtype = alloc_memory_type(adist);
> + if (!IS_ERR(mtype))
> + list_add(&mtype->list, memory_types);
> + }
> +
> + return mtype;
> +}
> +EXPORT_SYMBOL_GPL(mt_find_alloc_memory_type);
> +
> +void mt_put_memory_types(struct 

Re: [RFC v2 1/5] virtio: Initialize sequence variables

2024-04-03 Thread Jonah Palmer




On 4/3/24 6:18 AM, Eugenio Perez Martin wrote:

On Thu, Mar 28, 2024 at 5:22 PM Jonah Palmer  wrote:


Initialize sequence variables for VirtQueue and VirtQueueElement
structures. A VirtQueue's sequence variables are initialized when a
VirtQueue is being created or reset. A VirtQueueElement's sequence
variable is initialized when a VirtQueueElement is being initialized.
These variables will be used to support the VIRTIO_F_IN_ORDER feature.

A VirtQueue's used_seq_idx represents the next expected index in a
sequence of VirtQueueElements to be processed (put on the used ring).
The next VirtQueueElement added to the used ring must match this
sequence number before additional elements can be safely added to the
used ring. It's also particularly useful for helping find the number of
new elements added to the used ring.

A VirtQueue's current_seq_idx represents the current sequence index.
This value is essentially a counter where the value is assigned to a new
VirtQueueElement and then incremented. Given its uint16_t type, this
sequence number can be between 0 and 65,535.

A VirtQueueElement's seq_idx represents the sequence number assigned to
the VirtQueueElement when it was created. This value must match with the
VirtQueue's used_seq_idx before the element can be put on the used ring
by the device.

Signed-off-by: Jonah Palmer 
---
  hw/virtio/virtio.c | 18 ++
  include/hw/virtio/virtio.h |  1 +
  2 files changed, 19 insertions(+)

diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index fb6b4ccd83..069d96df99 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -132,6 +132,10 @@ struct VirtQueue
  uint16_t used_idx;
  bool used_wrap_counter;

+/* In-Order sequence indices */
+uint16_t used_seq_idx;
+uint16_t current_seq_idx;
+


I'm having a hard time understanding the difference between these and
last_avail_idx and used_idx. It seems to me if we replace them
everything will work? What am I missing?



For used_seq_idx, it does work like used_idx except the difference is 
when their values get updated, specifically for the split VQ case.


As you know, for the split VQ case, the used_idx is updated during 
virtqueue_split_flush. However, imagine a batch of elements coming in 
where virtqueue_split_fill is called multiple times before 
virtqueue_split_flush. We want to make sure we write these elements to 
the used ring in-order and we'll know its order based on used_seq_idx.


Alternatively, I thought about replicating the logic for the packed VQ 
case (where this used_seq_idx isn't used) where we start looking at 
vq->used_elems[vq->used_idx] and iterate through until we find a used 
element, but I wasn't sure how to handle the case where elements get 
used (written to the used ring) and new elements get put in used_elems 
before the used_idx is updated. Since this search would require us to 
always start at index vq->used_idx.


For example, say, of three elements getting filled (elem0 - elem2), 
elem1 and elem0 come back first (vq->used_idx = 0):


elem1 - not in-order
elem0 - in-order, vq->used_elems[vq->used_idx + 1] (elem1) also now
in-order, write elem0 and elem1 to used ring, mark elements as
used

Then elem2 comes back, but vq->used_idx is still 0, so how do we know to 
ignore the used elements at vq->used_idx (elem0) and vq->used_idx + 1 
(elem1) and iterate to vq->used_idx + 2 (elem2)?


Hmm... now that I'm thinking about it, maybe for the split VQ case we 
could continue looking through the vq->used_elems array until we find an 
unused element... but then again how would we (1) know if the element is 
in-order and (2) know when to stop searching?


In any case, the use of this variable could be seen as an optimization 
as its value will tell us where to start looking in vq->used_elems 
instead of always starting at vq->used_idx.


If this is like a one-shot scenario where one element gets written and 
then flushed after, then yes in this case used_seq_idx == used_idx.


--

For current_seq_idx, this is pretty much just a counter. Every new 
VirtQueueElement created from virtqueue_pop is given a number and the 
counter is incremented. Like grabbing a ticket number and waiting for 
your number to be called. The next person to grab a ticket number will 
be your number + 1.


Let me know if I'm making any sense. Thanks :)

Jonah


  /* Last used index value we have signalled on */
  uint16_t signalled_used;

@@ -1621,6 +1625,11 @@ static void *virtqueue_split_pop(VirtQueue *vq, size_t 
sz)
  elem->in_sg[i] = iov[out_num + i];
  }

+/* Assign sequence index for in-order processing */
+if (virtio_vdev_has_feature(vdev, VIRTIO_F_IN_ORDER)) {
+elem->seq_idx = vq->current_seq_idx++;
+}
+
  vq->inuse++;

  trace_virtqueue_pop(vq, elem, elem->in_num, elem->out_num);
@@ -1760,6 +1769,11 @@ static void *virtqueue_packed_pop(VirtQueue *vq, size_t 
sz)
  vq->shadow_avail_idx = 

Re: [PULL v2 0/6] lsi, vga fixes for 2024-04-02

2024-04-03 Thread Peter Maydell
On Wed, 3 Apr 2024 at 11:07, Paolo Bonzini  wrote:
>
> The following changes since commit 7fcf7575f3d201fc84ae168017ffdfd6c86257a6:
>
>   Merge tag 'pull-target-arm-20240402' of 
> https://git.linaro.org/people/pmaydell/qemu-arm into staging (2024-04-02 
> 11:34:49 +0100)
>
> are available in the Git repository at:
>
>   https://gitlab.com/bonzini/qemu.git tags/for-upstream
>
> for you to fetch changes up to 8fc4bdc537d901c200e43122e32bcb40dc8fed37:
>
>   pc_q35: remove unnecessary m->alias assignment (2024-04-02 18:08:59 +0200)
>
> 
> * lsi53c895a: fix assertion failure with invalid Block Move
> * vga: fix assertion failure with 4- and 16-color modes
> * remove unnecessary assignment
>
> 


Applied, thanks.

Please update the changelog at https://wiki.qemu.org/ChangeLog/9.0
for any user-visible changes.

-- PMM



Re: [PATCH] migration: Yield coroutine when receiving MIG_CMD_POSTCOPY_LISTEN

2024-04-03 Thread Peter Xu
On Wed, Apr 03, 2024 at 04:04:21PM +, Wang, Wei W wrote:
> On Wednesday, April 3, 2024 10:42 PM, Peter Xu wrote:
> > On Wed, Apr 03, 2024 at 04:35:35PM +0800, Wang, Lei wrote:
> > > We should change the following line from
> > >
> > >   while (!qemu_sem_timedwait(>postcopy_qemufile_dst_done,
> > 100)) {
> > >
> > > to
> > >
> > >   while (qemu_sem_timedwait(>postcopy_qemufile_dst_done,
> > 100)) {
> > 
> > Stupid me.. :(  Thanks for figuring this out.
> > 
> > >
> > > After that fix, test passed and no segfault.
> > >
> > > Given that the test shows a yield to the main loop won't introduce
> > > much overhead (<1ms), how about first yield unconditionally, then we
> > > enter the while loop to wait for several ms and yield periodically?
> > 
> > Shouldn't the expectation be that this should return immediately without a
> > wait?  We're already processing LISTEN command, and on the source as you
> > said it was much after the connect().  It won't guarantee the ordering but 
> > IIUC
> > the majority should still have a direct hit?
> > 
> > What we can do though is reducing the 100ms timeout if you see that's
> > perhaps a risk of having too large a downtime when by accident.  We can even
> > do it in a tight loop here considering downtime is important, but to 
> > provide an
> > intermediate ground: how about 100ms -> 1ms poll?
> 
> Would it be better to use busy wait here, instead of blocking for even 1ms 
> here?
> It's likely that the preempt channel is waiting for the main thread to 
> dispatch for accept(),
> but we are calling qemu_sem_timedwait here to block the main thread for 1 
> more ms.

I think it's about the expectation of whether we should already received
that sem post.  My understanding is in most cases we should directly return
and avoid such wait.

Per my previous experience, 1ms is not a major issue to be added on top of
downtime in corner cases like this.

We do have a lot of other potential optimizations to reduce downtime, or I
should say in the other way, that..  there can be a lot of cases where we
can hit much larger downtime than expected. Consider when we don't even
account downtime for device states for now, either load_state or
save_state, we only count RAM but that's far from accurate.. and we do have
more chances to optimize.  Some are listed here, but some may not:

https://wiki.qemu.org/ToDo/LiveMigration#Optimizations

If you agree with my above "expectation" statement, I'd say we should avoid
using a busy loop whenever possible in QEMU unless extremely necessary.

> 
> 
> > 
> > If you agree (and also to Wei; please review this and comment if there's 
> > any!),
> > would you write up the commit log, fully test it in whatever way you could,
> > and resend as a formal patch (please do this before Friday if possible)?  
> > You
> > can keep a "Suggested-by:" for me.  I want to queue it for
> > rc3 if it can catch it. It seems important if Wei can always reproduce it.
> 
> Not sure if Lei would be able to be online as the following two days are Chinese 
> holiday.
> If not, I could help take over to send late tomorrow. Let's see.

Oops, I forgot that even if I was aware..

Please do so if you can do this.  Thank you, Wei!  (I hope you can switch
some working hours later on!)

Let me know if that doesn't work; it'll be all fine.

Thanks,

-- 
Peter Xu




RE: [PATCH] migration: Yield coroutine when receiving MIG_CMD_POSTCOPY_LISTEN

2024-04-03 Thread Wang, Wei W
On Wednesday, April 3, 2024 10:42 PM, Peter Xu wrote:
> On Wed, Apr 03, 2024 at 04:35:35PM +0800, Wang, Lei wrote:
> > We should change the following line from
> >
> > while (!qemu_sem_timedwait(>postcopy_qemufile_dst_done,
> 100)) {
> >
> > to
> >
> > while (qemu_sem_timedwait(>postcopy_qemufile_dst_done,
> 100)) {
> 
> Stupid me.. :(  Thanks for figuring this out.
> 
> >
> > After that fix, test passed and no segfault.
> >
> > Given that the test shows a yield to the main loop won't introduce
> > much overhead (<1ms), how about first yield unconditionally, then we
> > enter the while loop to wait for several ms and yield periodically?
> 
> Shouldn't the expectation be that this should return immediately without a
> wait?  We're already processing LISTEN command, and on the source as you
> said it was much after the connect().  It won't guarantee the ordering but 
> IIUC
> the majority should still have a direct hit?
> 
> What we can do though is reducing the 100ms timeout if you see that's
> perhaps a risk of having too large a downtime when by accident.  We can even
> do it in a tight loop here considering downtime is important, but to provide 
> an
> intermediate ground: how about 100ms -> 1ms poll?

Would it be better to use busy wait here, instead of blocking for even 1ms here?
It's likely that the preempt channel is waiting for the main thread to dispatch 
for accept(),
but we are calling qemu_sem_timedwait here to block the main thread for 1 more 
ms.


> 
> If you agree (and also to Wei; please review this and comment if there's 
> any!),
> would you write up the commit log, fully test it in whatever way you could,
> and resend as a formal patch (please do this before Friday if possible)?  You
> can keep a "Suggested-by:" for me.  I want to queue it for
> rc3 if it can catch it. It seems important if Wei can always reproduce it.

Not sure if Lei would be able to be online as the following two days are Chinese 
holiday.
If not, I could help take over to send late tomorrow. Let's see.


Re: [PATCH v3] input-linux: Add option to not grab a device upon guest startup

2024-04-03 Thread Justinien Bouron
> Again, QAPI schema

Pardon my ignorance, but are you writing this because there is a problem with
the QAPI schema changes that I would need to fix and re-submit?
Or is it just here to indicate that you've reviewed the change made to the
schema?

Regards,
Justinien Bouron



[PATCH net v4] virtio_net: Do not send RSS key if it is not supported

2024-04-03 Thread Breno Leitao
There is a bug when setting the RSS options in virtio_net that can break
the whole machine, getting the kernel into an infinite loop.

Running the following command in any QEMU virtual machine with virtionet
will reproduce this problem:

# ethtool -X eth0  hfunc toeplitz

This is how the problem happens:

1) ethtool_set_rxfh() calls virtnet_set_rxfh()

2) virtnet_set_rxfh() calls virtnet_commit_rss_command()

3) virtnet_commit_rss_command() populates 4 entries for the rss
scatter-gather

4) Since the command above does not have a key, then the last
scatter-gatter entry will be zeroed, since rss_key_size == 0.
sg_buf_size = vi->rss_key_size;

5) This buffer is passed to qemu, but qemu is not happy with a buffer
with zero length, and do the following in virtqueue_map_desc() (QEMU
function):

  if (!sz) {
  virtio_error(vdev, "virtio: zero sized buffers are not allowed");

6) virtio_error() (also QEMU function) set the device as broken

vdev->broken = true;

7) Qemu bails out, and does not respond to this crazy kernel.

8) The kernel is waiting for the response to come back (function
virtnet_send_command())

9) The kernel is waiting doing the following :

  while (!virtqueue_get_buf(vi->cvq, ) &&
 !virtqueue_is_broken(vi->cvq))
  cpu_relax();

10) None of the following functions above is true, thus, the kernel
loops here forever. Keeping in mind that virtqueue_is_broken() does
not look at the qemu `vdev->broken`, so, it never realizes that the
virtio is broken at QEMU side.

Fix it by not sending RSS commands if the feature is not available in
the device.

Fixes: c7114b1249fa ("drivers/net/virtio_net: Added basic RSS support.")
Cc: sta...@vger.kernel.org
Cc: qemu-devel@nongnu.org
Signed-off-by: Breno Leitao 
Reviewed-by: Heng Qi 
---
Changelog:

V2:
  * Moved from creating a valid packet, by rejecting the request
completely.
V3:
  * Got some good feedback from and Xuan Zhuo and Heng Qi, and reworked
the rejection path.
V4:
  * Added a comment in an "if" clause, as suggested by Michael S. Tsirkin.

---
 drivers/net/virtio_net.c | 26 ++
 1 file changed, 22 insertions(+), 4 deletions(-)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index c22d1118a133..115c3c5414f2 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -3807,6 +3807,7 @@ static int virtnet_set_rxfh(struct net_device *dev,
struct netlink_ext_ack *extack)
 {
struct virtnet_info *vi = netdev_priv(dev);
+   bool update = false;
int i;
 
if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE &&
@@ -3814,13 +3815,28 @@ static int virtnet_set_rxfh(struct net_device *dev,
return -EOPNOTSUPP;
 
if (rxfh->indir) {
+   if (!vi->has_rss)
+   return -EOPNOTSUPP;
+
for (i = 0; i < vi->rss_indir_table_size; ++i)
vi->ctrl->rss.indirection_table[i] = rxfh->indir[i];
+   update = true;
}
-   if (rxfh->key)
+
+   if (rxfh->key) {
+   /* If either _F_HASH_REPORT or _F_RSS are negotiated, the
+* device provides hash calculation capabilities, that is,
+* hash_key is configured.
+*/
+   if (!vi->has_rss && !vi->has_rss_hash_report)
+   return -EOPNOTSUPP;
+
memcpy(vi->ctrl->rss.key, rxfh->key, vi->rss_key_size);
+   update = true;
+   }
 
-   virtnet_commit_rss_command(vi);
+   if (update)
+   virtnet_commit_rss_command(vi);
 
return 0;
 }
@@ -4729,13 +4745,15 @@ static int virtnet_probe(struct virtio_device *vdev)
if (virtio_has_feature(vdev, VIRTIO_NET_F_HASH_REPORT))
vi->has_rss_hash_report = true;
 
-   if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS))
+   if (virtio_has_feature(vdev, VIRTIO_NET_F_RSS)) {
vi->has_rss = true;
 
-   if (vi->has_rss || vi->has_rss_hash_report) {
vi->rss_indir_table_size =
virtio_cread16(vdev, offsetof(struct virtio_net_config,
rss_max_indirection_table_length));
+   }
+
+   if (vi->has_rss || vi->has_rss_hash_report) {
vi->rss_key_size =
virtio_cread8(vdev, offsetof(struct virtio_net_config, 
rss_max_key_size));
 
-- 
2.43.0




Re: [PATCH v3 2/2] xen: fix stubdom PCI addr

2024-04-03 Thread Anthony PERARD
On Wed, Mar 27, 2024 at 04:05:15AM +0100, Marek Marczykowski-Górecki wrote:
> When running in a stubdomain, the config space access via sysfs needs to
> use BDF as seen inside stubdomain (connected via xen-pcifront), which is
> different from the real BDF. For other purposes (hypercall parameters
> etc), the real BDF needs to be used.
> Get the in-stubdomain BDF by looking up relevant PV PCI xenstore
> entries.
> 
> Signed-off-by: Marek Marczykowski-Górecki 

Reviewed-by: Anthony PERARD 

Thanks,

-- 
Anthony PERARD



Re: [PATCH v3 1/2] hw/xen: detect when running inside stubdomain

2024-04-03 Thread Anthony PERARD
On Wed, Mar 27, 2024 at 04:05:14AM +0100, Marek Marczykowski-Górecki wrote:
> Introduce global xen_is_stubdomain variable when qemu is running inside
> a stubdomain instead of dom0. This will be relevant for subsequent
> patches, as few things like accessing PCI config space need to be done
> differently.
> 
> Signed-off-by: Marek Marczykowski-Górecki 

Reviewed-by: Anthony PERARD 

Thanks,


-- 
Anthony PERARD



Re: [PATCH v2] hw/i386/acpi: Set PCAT_COMPAT bit only when pic is not disabled

2024-04-03 Thread Xiaoyao Li

On 4/3/2024 11:12 PM, Igor Mammedov wrote:

On Wed,  3 Apr 2024 10:59:53 -0400
Xiaoyao Li  wrote:


A value 1 of PCAT_COMPAT (bit 0) of MADT.Flags indicates that the system
also has a PC-AT-compatible dual-8259 setup, i.e., the PIC.

When PIC is not enabled (pic=off) for x86 machine, the PCAT_COMPAT bit
needs to be cleared. Otherwise, the guest thinks there is a present PIC.


Can you add to commit message reproducer (aka qemu CLI and relevant
logs/symptoms observed on guest side)?


When booting a VM with "-machine xx,pic=off", there is supposed to be no 
PIC for the guest. When guest probes PIC, it should find nothing and log 
of below should be printed:


  [0.155970] Using NULL legacy PIC

However, the fact is that no such log is printed in the guest kernel when the 
VM is created with "pic=off". This is because the guest thinks there is a 
present PIC, due to pcat_compat being reported as 1 in MADT. See Linux code of 
probe_8259A() in arch/x86/kernel/i8259.c 





   
Signed-off-by: Xiaoyao Li   
---

changes in v2:
- Clarify more in commit message;
---
  hw/i386/acpi-common.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c
index 20f19269da40..0cc2919bb851 100644
--- a/hw/i386/acpi-common.c
+++ b/hw/i386/acpi-common.c
@@ -107,7 +107,9 @@ void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
  acpi_table_begin(, table_data);
  /* Local APIC Address */
  build_append_int_noprefix(table_data, APIC_DEFAULT_ADDRESS, 4);
-build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags */
+/* Flags. bit 0: PCAT_COMPAT */
+build_append_int_noprefix(table_data,
+  x86ms->pic != ON_OFF_AUTO_OFF ? 1 : 0 , 4);
  
  for (i = 0; i < apic_ids->len; i++) {

  pc_madt_cpu_entry(i, apic_ids, table_data, false);







Re: [PATCH v2] hw/i386/acpi: Set PCAT_COMPAT bit only when pic is not disabled

2024-04-03 Thread Igor Mammedov
On Wed,  3 Apr 2024 10:59:53 -0400
Xiaoyao Li  wrote:

> A value 1 of PCAT_COMPAT (bit 0) of MADT.Flags indicates that the system
> also has a PC-AT-compatible dual-8259 setup, i.e., the PIC.
> 
> When PIC is not enabled (pic=off) for x86 machine, the PCAT_COMPAT bit
> needs to be cleared. Otherwise, the guest thinks there is a present PIC.

Can you add to commit message reproducer (aka qemu CLI and relevant
logs/symptoms observed on guest side)?

> 
> Signed-off-by: Xiaoyao Li 
> ---
> changes in v2:
> - Clarify more in commit message;
> ---
>  hw/i386/acpi-common.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c
> index 20f19269da40..0cc2919bb851 100644
> --- a/hw/i386/acpi-common.c
> +++ b/hw/i386/acpi-common.c
> @@ -107,7 +107,9 @@ void acpi_build_madt(GArray *table_data, BIOSLinker 
> *linker,
>  acpi_table_begin(, table_data);
>  /* Local APIC Address */
>  build_append_int_noprefix(table_data, APIC_DEFAULT_ADDRESS, 4);
> -build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags 
> */
> +/* Flags. bit 0: PCAT_COMPAT */
> +build_append_int_noprefix(table_data,
> +  x86ms->pic != ON_OFF_AUTO_OFF ? 1 : 0 , 4);
>  
>  for (i = 0; i < apic_ids->len; i++) {
>  pc_madt_cpu_entry(i, apic_ids, table_data, false);




Re: [RFC PATCH v2 4/6] cxl/core: report poison when injecting from debugfs

2024-04-03 Thread Shiyang Ruan via




在 2024/3/30 9:52, Dan Williams 写道:

Shiyang Ruan wrote:

Poison injection from debugfs is silent too.  Add calling
cxl_mem_report_poison() to make it able to do memory_failure().


Why does this needs to be signalled? It is a debug interface, the
debugger can also trigger a read after the injection, or trigger page
soft-offline.


I was thinking that the poison injection should trigger a chain of 
events.  So, for debugfs they should be independent, right?  I wasn't 
aware of this.  Will drop this patch.



--
Thanks,
Ruan.



[PATCH v2] hw/i386/acpi: Set PCAT_COMPAT bit only when pic is not disabled

2024-04-03 Thread Xiaoyao Li
A value 1 of PCAT_COMPAT (bit 0) of MADT.Flags indicates that the system
also has a PC-AT-compatible dual-8259 setup, i.e., the PIC.

When PIC is not enabled (pic=off) for x86 machine, the PCAT_COMPAT bit
needs to be cleared. Otherwise, the guest thinks there is a present PIC.

Signed-off-by: Xiaoyao Li 
---
changes in v2:
- Clarify more in commit message;
---
 hw/i386/acpi-common.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/i386/acpi-common.c b/hw/i386/acpi-common.c
index 20f19269da40..0cc2919bb851 100644
--- a/hw/i386/acpi-common.c
+++ b/hw/i386/acpi-common.c
@@ -107,7 +107,9 @@ void acpi_build_madt(GArray *table_data, BIOSLinker *linker,
 acpi_table_begin(, table_data);
 /* Local APIC Address */
 build_append_int_noprefix(table_data, APIC_DEFAULT_ADDRESS, 4);
-build_append_int_noprefix(table_data, 1 /* PCAT_COMPAT */, 4); /* Flags */
+/* Flags. bit 0: PCAT_COMPAT */
+build_append_int_noprefix(table_data,
+  x86ms->pic != ON_OFF_AUTO_OFF ? 1 : 0 , 4);
 
 for (i = 0; i < apic_ids->len; i++) {
 pc_madt_cpu_entry(i, apic_ids, table_data, false);
-- 
2.34.1




Re: [RFC PATCH v2 3/6] cxl/core: add report option for cxl_mem_get_poison()

2024-04-03 Thread Shiyang Ruan via




在 2024/3/30 9:50, Dan Williams 写道:

Shiyang Ruan wrote:

The GMER only has "Physical Address" field, no such one indicates length.
So, when a poison event is received, we could use GET_POISON_LIST command
to get the poison list.  Now driver has cxl_mem_get_poison(), so
reuse it and add a parameter 'bool report', report poison record to MCE
if set true.


I am not sure I agree with the rationale here because there is no
correlation between the event being signaled and the current state of
the poison list. It also establishes race between multiple GMER events,
i.e. imagine the hardware sends 4 GMER events to communicate a 256B
poison discovery event. Does the driver need logic to support GMER event
2, 3, and 4 if it already say all 256B of poison after processing GMER
event 1?


Yes, I didn't thought about that.



I think the best the driver can do is assume at least 64B of poison
per-event and depend on multiple notifications to handle larger poison
lengths.


Agree.  This also makes things easier.

And for qemu, I'm thinking of making a patch to limit the length of a 
poison record when injecting.  The length should be between 64B and 4KiB per 
GMER. And emit many GMERs if length > 4KiB.




Otherwise, the poison list is really only useful for pre-populating
pages to offline after a reboot, i.e. to catch the kernel up with the
state of poison pages after a reboot.


Got it.


--
Thanks,
Ruan.



Re: [PATCH] migration: Yield coroutine when receiving MIG_CMD_POSTCOPY_LISTEN

2024-04-03 Thread Peter Xu
On Wed, Apr 03, 2024 at 04:35:35PM +0800, Wang, Lei wrote:
> We should change the following line from
> 
>   while (!qemu_sem_timedwait(>postcopy_qemufile_dst_done, 100)) {
> 
> to
> 
>   while (qemu_sem_timedwait(>postcopy_qemufile_dst_done, 100)) {

Stupid me.. :(  Thanks for figuring this out.

> 
> After that fix, test passed and no segfault.
> 
> Given that the test shows a yield to the main loop won't introduce much 
> overhead
> (<1ms), how about first yield unconditionally, then we enter the while loop to
> wait for several ms and yield periodically?

Shouldn't the expectation be that this should return immediately without a
wait?  We're already processing LISTEN command, and on the source as you
said it was much after the connect().  It won't guarantee the ordering but
IIUC the majority should still have a direct hit?

What we can do though is reducing the 100ms timeout if you see that's
perhaps a risk of having too large a downtime when by accident.  We can
even do it in a tight loop here considering downtime is important, but to
provide an intermediate ground: how about 100ms -> 1ms poll?

If you agree (and also to Wei; please review this and comment if there's
any!), would you write up the commit log, fully test it in whatever way you
could, and resend as a formal patch (please do this before Friday if
possible)?  You can keep a "Suggested-by:" for me.  I want to queue it for
rc3 if it can catch it. It seems important if Wei can always reproduce it.

Thanks,

-- 
Peter Xu




Re: Intention to work on GSoC project

2024-04-03 Thread Sahil
Hi,

Thank you for the reply.

On Tuesday, April 2, 2024 5:08:24 PM IST Eugenio Perez Martin wrote:
> [...]
> > > > Q2.
> > > > In the Red Hat article, just below the first listing ("Memory layout of 
> > > > a
> > > > packed virtqueue descriptor"), there's the following line referring to 
> > > > the 
> > > > buffer id in "virtq_desc":
> > > > > This time, the id field is not an index for the device to look for the
> > > > > buffer: it is an opaque value for it, only has meaning for the driver.
> > > > 
> > > > But the device returns the buffer id when it writes the used descriptor 
> > > > to
> > > > the descriptor ring. The "only has meaning for the driver" part has got 
> > > > me
> > > > a little confused. Which buffer id is this that the device returns? Is 
> > > > it related
> > > > to the buffer id in the available descriptor?
> > > 
> > > In my understanding, buffer id is the element that avail descriptor
> > > marks to identify when adding descriptors to table. Device will returns
> > > the buffer id in the processed descriptor or the last descriptor in a
> > > chain, and write it to the descriptor that used idx refers to (first
> > > one in the chain). Then used idx increments.
> > > 
> > > The Packed Virtqueue blog [1] is helpful, but some details in the
> > > examples
> > > are making me confused.
> > > 
> > > Q1.
> > > In the last step of the two-entries descriptor table example, it says
> > > both buffers #0 and #1 are available for the device. I understand
> > > descriptor[0] is available and descriptor[1] is not, but there is no ID #0
> > > now. So does the device got buffer #0 by notification beforehand? If so,
> > > does it mean buffer #0 will be lost when notifications are disabled?
> 
> I guess you mean the table labeled "Figure: Full two-entries descriptor
> table".
> 
> Take into account that the descriptor table is not the state of all
> the descriptors. That information must be maintained by the device and
> the driver internally.
> 
> The descriptor table is used as a circular buffer, where one part is
> writable by the driver and the other part is writable by the device.
> For the device to override the descriptor table entry where descriptor
> id 0 used to be does not mean that the descriptor id 0 is used. It
> just means that the device communicates to the driver that descriptor
> 1 is used, and both sides need to keep the descriptor state
> coherently.
> 
> > I too have a similar question and understanding the relation between
> > buffer
> > ids in the used and available descriptors might give more insight into
> > this. For available descriptors, the buffer id is used to associate
> > descriptors with a particular buffer. I am still not very sure about ids
> > in used descriptors.
> > 
> > Regarding Q1, both buffers #0 and #1 are available. In the mentioned
> > figure, both descriptor[0] and descriptor[1] are available. This figure
> > follows the figure with the caption "Using first buffer out of order". So
> > in the first figure the device reads buffer #1 and writes the used
> > descriptor but it still has buffer #0 to read. That still belongs to the
> > device while buffer #1 can now be handled by the driver once again. So in
> > the next figure, the driver makes buffer #1 available again. The device
> > can still read buffer #0 from the previous batch of available
> > descriptors.
> > 
> > Based on what I have understood, the driver can't touch the descriptor
> > corresponding to buffer #0 until the device acknowledges it. I did find
> > the
> > figure a little confusing as well. I think once the meaning of buffer id
> > is clear from the driver's and device's perspective, it'll be easier to
> > understand the figure.
> 
> I think you got it right. Please let me know if you have further questions.

I would like to clarify one thing in the figure "Full two-entries descriptor
table". The driver can only overwrite a used descriptor in the descriptor
ring, right? And likewise for the device? So in the figure, the driver will
have to wait until descriptor[1] is used before it can overwrite it?

Suppose the device marks descriptor[0] as used. I think the driver will
not be able to overwrite that descriptor entry because it has to go in
order and is at descriptor[1]. Is that correct? Is it possible for the driver
to go "backwards" in the descriptor ring?

> > I am also not very sure about what happens when notifications are
> > disabled.
> > I'll have to read up on that again. But I believe the driver still won't
> > be able to touch #0 until the device uses it.
> 
> If one side disables notification it needs to check the indexes or the
> flags by its own means: Timers, read the memory in a busy loop, etc.

Understood. Thank you for the clarification.

I have some questions from the "Virtio live migration technical deep
dive" article [1].

Q1.
In the paragraph just above Figure 6, there is the following line:
> the vhost kernel thread and QEMU may run in different CPU threads,

[PATCH v2] e1000: Convert debug macros into tracepoints.

2024-04-03 Thread Don Porter
From: Austin Clements 

The E1000 debug messages are very useful for developing drivers.
Make these available to users without recompiling QEMU.

Signed-off-by: Austin Clements 
[geo...@ldpreload.com: Rebased on top of 2.9.0]
Signed-off-by: Geoffrey Thomas 
Signed-off-by: Don Porter 
---
 hw/net/e1000.c  | 90 +++--
 hw/net/trace-events | 25 -
 2 files changed, 54 insertions(+), 61 deletions(-)

diff --git a/hw/net/e1000.c b/hw/net/e1000.c
index 43f3a4a701..24475636a3 100644
--- a/hw/net/e1000.c
+++ b/hw/net/e1000.c
@@ -44,26 +44,6 @@
 #include "trace.h"
 #include "qom/object.h"
 
-/* #define E1000_DEBUG */
-
-#ifdef E1000_DEBUG
-enum {
-DEBUG_GENERAL,  DEBUG_IO,   DEBUG_MMIO, DEBUG_INTERRUPT,
-DEBUG_RX,   DEBUG_TX,   DEBUG_MDIC, DEBUG_EEPROM,
-DEBUG_UNKNOWN,  DEBUG_TXSUM,DEBUG_TXERR,DEBUG_RXERR,
-DEBUG_RXFILTER, DEBUG_PHY,  DEBUG_NOTYET,
-};
-#define DBGBIT(x)(1mac_reg[IMS]);
+trace_e1000_set_ics(val, s->mac_reg[ICR], s->mac_reg[IMS]);
 set_interrupt_cause(s, 0, val | s->mac_reg[ICR]);
 }
 
@@ -425,8 +404,7 @@ set_rx_control(E1000State *s, int index, uint32_t val)
 s->mac_reg[RCTL] = val;
 s->rxbuf_size = e1000x_rxbufsize(val);
 s->rxbuf_min_shift = ((val / E1000_RCTL_RDMTS_QUAT) & 3) + 1;
-DBGOUT(RX, "RCTL: %d, mac_reg[RCTL] = 0x%x\n", s->mac_reg[RDT],
-   s->mac_reg[RCTL]);
+trace_e1000_set_rx_control(s->mac_reg[RDT], s->mac_reg[RCTL]);
 timer_mod(s->flush_queue_timer,
   qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + 1000);
 }
@@ -440,16 +418,16 @@ set_mdic(E1000State *s, int index, uint32_t val)
 if ((val & E1000_MDIC_PHY_MASK) >> E1000_MDIC_PHY_SHIFT != 1) // phy #
 val = s->mac_reg[MDIC] | E1000_MDIC_ERROR;
 else if (val & E1000_MDIC_OP_READ) {
-DBGOUT(MDIC, "MDIC read reg 0x%x\n", addr);
+trace_e1000_mdic_read_register(addr);
 if (!(phy_regcap[addr] & PHY_R)) {
-DBGOUT(MDIC, "MDIC read reg %x unhandled\n", addr);
+trace_e1000_mdic_read_register_unhandled(addr);
 val |= E1000_MDIC_ERROR;
 } else
 val = (val ^ data) | s->phy_reg[addr];
 } else if (val & E1000_MDIC_OP_WRITE) {
-DBGOUT(MDIC, "MDIC write reg 0x%x, value 0x%x\n", addr, data);
+trace_e1000_mdic_write_register(addr, data);
 if (!(phy_regcap[addr] & PHY_W)) {
-DBGOUT(MDIC, "MDIC write reg %x unhandled\n", addr);
+trace_e1000_mdic_write_register_unhandled(addr);
 val |= E1000_MDIC_ERROR;
 } else {
 if (addr < NPHYWRITEOPS && phyreg_writeops[addr]) {
@@ -471,8 +449,8 @@ get_eecd(E1000State *s, int index)
 {
 uint32_t ret = E1000_EECD_PRES|E1000_EECD_GNT | s->eecd_state.old_eecd;
 
-DBGOUT(EEPROM, "reading eeprom bit %d (reading %d)\n",
-   s->eecd_state.bitnum_out, s->eecd_state.reading);
+trace_e1000_get_eecd(s->eecd_state.bitnum_out, s->eecd_state.reading);
+
 if (!s->eecd_state.reading ||
 ((s->eeprom_data[(s->eecd_state.bitnum_out >> 4) & 0x3f] >>
   ((s->eecd_state.bitnum_out & 0xf) ^ 0xf))) & 1)
@@ -511,9 +489,8 @@ set_eecd(E1000State *s, int index, uint32_t val)
 s->eecd_state.reading = (((s->eecd_state.val_in >> 6) & 7) ==
 EEPROM_READ_OPCODE_MICROWIRE);
 }
-DBGOUT(EEPROM, "eeprom bitnum in %d out %d, reading %d\n",
-   s->eecd_state.bitnum_in, s->eecd_state.bitnum_out,
-   s->eecd_state.reading);
+trace_e1000_set_eecd(s->eecd_state.bitnum_in, s->eecd_state.bitnum_out,
+ s->eecd_state.reading);
 }
 
 static uint32_t
@@ -580,8 +557,7 @@ xmit_seg(E1000State *s)
 
 if (tp->cptse) {
 css = props->ipcss;
-DBGOUT(TXSUM, "frames %d size %d ipcss %d\n",
-   frames, tp->size, css);
+trace_e1000_xmit_seg1(frames, tp->size, css);
 if (props->ip) {/* IPv4 */
 stw_be_p(tp->data+css+2, tp->size - css);
 stw_be_p(tp->data+css+4,
@@ -591,7 +567,7 @@ xmit_seg(E1000State *s)
 }
 css = props->tucss;
 len = tp->size - css;
-DBGOUT(TXSUM, "tcp %d tucss %d len %d\n", props->tcp, css, len);
+trace_e1000_xmit_seg2(props->tcp, css, len);
 if (props->tcp) {
 sofar = frames * props->mss;
 stl_be_p(tp->data+css+4, ldl_be_p(tp->data+css+4)+sofar); /* seq */
@@ -759,7 +735,7 @@ start_xmit(E1000State *s)
 uint32_t tdh_start = s->mac_reg[TDH], cause = E1000_ICS_TXQE;
 
 if (!(s->mac_reg[TCTL] & E1000_TCTL_EN)) {
-DBGOUT(TX, "tx disabled\n");
+trace_e1000_start_xmit_fail1();
 return;
 }
 
@@ -773,9 +749,9 @@ start_xmit(E1000State *s)
sizeof(struct e1000_tx_desc) * s->mac_reg[TDH];
 pci_dma_read(d, base, , sizeof(desc));
 
-DBGOUT(TX, "index %d: %p : %x %x\n", s->mac_reg[TDH],

Re: [PATCH] hw/i386/acpi: Set PCAT_COMPAT bit only when pic is not disabled

2024-04-03 Thread Kirill A. Shutemov
On Wed, Apr 03, 2024 at 10:03:15AM +0800, Xiaoyao Li wrote:
> On 4/2/2024 10:31 PM, Michael S. Tsirkin wrote:
> > On Tue, Apr 02, 2024 at 09:18:44PM +0800, Xiaoyao Li wrote:
> > > On 4/2/2024 6:02 PM, Michael S. Tsirkin wrote:
> > > > On Tue, Apr 02, 2024 at 04:25:16AM -0400, Xiaoyao Li wrote:
> > > > > Set MADT.FLAGS[bit 0].PCAT_COMPAT based on x86ms->pic.
> > > > > 
> > > > > Signed-off-by: Xiaoyao Li 
> > > > 
> > > > Please include more info in the commit log:
> > > > what is the behaviour you observe, why it is wrong,
> > > > how does the patch fix it, what is guest behaviour
> > > > before and after.
> > > 
> > > Sorry, I thought it was straightforward.
> > > 
> > > A value 1 of PCAT_COMPAT (bit 0) of MADT.Flags indicates that the system
> > > also has a PC-AT-compatible dual-8259 setup, i.e., the PIC.
> > > 
> > > When PIC is not enabled for x86 machine, the PCAT_COMPAT bit needs to be
> > > cleared. Otherwise, the guest thinks there is a present PIC even if it is
> > > booted with pic=off on QEMU.
> > > 
> > > (I haven't seen real issue from Linux guest. The user of PIC inside guest
> > > seems only the pit calibration. Whether pit calibration is triggered 
> > > depends
> > > on other things. But logically, current code is wrong, we need to fix it
> > > anyway.
> > > 
> > > @Isaku, please share more info if you have)
> > > 
> 
> + Kirill,
> 
> It seems to have issue with legacy irqs with PCAT_COMPAT set 1 while no PIC
> on QEMU side. Kirill, could you elaborate it?

TDX guest cannot support PIC because the platform doesn't allow direct
interrupt injection, only posted interrupts.

For TDX guest kernel we had a patch[1] that forces no-PIC, but it is not
upstreamable as it is a hack.

I looked around to find The Right Way™ to achieve the same effect and
discovered that we only have PIC ops hooked up because kernel bypasses[2]
PIC enumeration because PCAT_COMPAT is set. Which is wrong for TDX guest
or other platforms without PIC.

I am not aware about any user-visible issues due to it, but maybe they are
just not discovered yet.

[1] 
https://lore.kernel.org/linux-kernel/b29f00c1eb5cff585ec2b999b69923c13418ecc4.1619458733.git.sathyanarayanan.kuppusw...@linux.intel.com/
[2] 
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/kernel/i8259.c#n322

-- 
  Kiryl Shutsemau / Kirill A. Shutemov



Re: [PATCH net v3] virtio_net: Do not send RSS key if it is not supported

2024-04-03 Thread Breno Leitao
On Sun, Mar 31, 2024 at 04:20:30PM -0400, Michael S. Tsirkin wrote:
> On Fri, Mar 29, 2024 at 10:16:41AM -0700, Breno Leitao wrote:
> > @@ -3814,13 +3815,24 @@ static int virtnet_set_rxfh(struct net_device *dev,
> > return -EOPNOTSUPP;
> >  
> > if (rxfh->indir) {
> > +   if (!vi->has_rss)
> > +   return -EOPNOTSUPP;
> > +
> > for (i = 0; i < vi->rss_indir_table_size; ++i)
> > vi->ctrl->rss.indirection_table[i] = rxfh->indir[i];
> > +   update = true;
> > }
> > -   if (rxfh->key)
> > +
> > +   if (rxfh->key) {
> > +   if (!vi->has_rss && !vi->has_rss_hash_report)
> > +   return -EOPNOTSUPP;
> 
> 
> What's the logic here? Is it || or &&? A comment can't hurt.

If rxfh carries a key, then the device needs to have either the has_rss or
has_rss_hash_report "features".

These are basically virtio features VIRTIO_NET_F_HASH_REPORT and
VIRTIO_NET_F_RSS that are set at virtio_probe.

I will add the comment and respin the series.



Re: [PATCH v3] input-linux: Add option to not grab a device upon guest startup

2024-04-03 Thread Markus Armbruster
Justinien Bouron  writes:

> Depending on your use-case, it might be inconvenient to have qemu grab
> the input device from the host immediately upon starting the guest.
>
> Added a new bool option to input-linux: grab-on-startup. If true, the
> device is grabbed as soon as the guest is started, otherwise it is not
> grabbed until the toggle combination is entered. To avoid breaking
> existing setups, the default value of grab-on-startup is true, i.e. same
> behaviour as before this change.
>
> Signed-off-by: Justinien Bouron 

Again, QAPI schema
Acked-by: Markus Armbruster 




Re: [PATCH v11 18/23] hw/intc/arm_gicv3: Handle icv_nmiar1_read() for icc_nmiar1_read()

2024-04-03 Thread Peter Maydell
On Wed, 3 Apr 2024 at 04:16, Jinjie Ruan  wrote:
> On 2024/4/3 0:12, Peter Maydell wrote:
> >> @@ -776,7 +811,11 @@ static uint64_t icv_iar_read(CPUARMState *env, const 
> >> ARMCPRegInfo *ri)
> >>  if (thisgrp == grp && icv_hppi_can_preempt(cs, lr)) {
> >>  intid = ich_lr_vintid(lr);
> >>  if (!gicv3_intid_is_special(intid)) {
> >> -icv_activate_irq(cs, idx, grp);
> >> +if (!(lr & ICH_LR_EL2_NMI)) {
> >
> > This is missing checks on both whether the GIC has NMI support and
> > on whether the SCTLR NMI bit is set (compare pseudocode
> > VirtualReadIAR1()). I suggest defining a
> >
> > bool nmi = cs->gic->nmi_support &&
> > (env->cp15.sctlr_el[arm_current_el(env)] & SCTLR_NMI) &&
> > (lr & ICH_LR_EL2_NMI);
>
> The nmi_support check is redundant, as if FEAT_GICv3_NMI is unsupported,
> the ICH_LR_EL2.NMI is RES0, so if ICH_LR_EL2.NMI is 1, FEAT_GICv3_NMI
> has been surely realized.

As far as I can see you haven't changed ich_lr_write() to enforce
that, though, so the guest can write 1 to the NMI bit even if the
GIC doesn't support FEAT_GICv3_NMI. If you want to skip checking
nmi_support here you need to enforce that the NMI bit in the LR
is 0 in ich_lr_write().

thanks
-- PMM



[PATCH v2 02/10] backends/confidential-guest-support: Add IGVM file parameter

2024-04-03 Thread Roy Hopkins
In order to add support for parsing IGVM files for secure virtual
machines, a the path to an IGVM file needs to be specified as
part of the guest configuration. It makes sense to add this to
the ConfidentialGuestSupport object as this is common to all secure
virtual machines that potentially could support IGVM based
configuration.

This patch allows the filename to be configured via the QEMU
object model in preparation for subsequent patches that will read and
parse the IGVM file.

Signed-off-by: Roy Hopkins 
---
 backends/confidential-guest-support.c | 21 +
 include/exec/confidential-guest-support.h |  9 +
 qapi/qom.json | 13 +
 qemu-options.hx   |  8 +++-
 4 files changed, 50 insertions(+), 1 deletion(-)

diff --git a/backends/confidential-guest-support.c 
b/backends/confidential-guest-support.c
index 052fde8db0..da436fb736 100644
--- a/backends/confidential-guest-support.c
+++ b/backends/confidential-guest-support.c
@@ -20,8 +20,29 @@ OBJECT_DEFINE_ABSTRACT_TYPE(ConfidentialGuestSupport,
 CONFIDENTIAL_GUEST_SUPPORT,
 OBJECT)
 
+#if defined(CONFIG_IGVM)
+static char *get_igvm(Object *obj, Error **errp)
+{
+ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj);
+return g_strdup(cgs->igvm_filename);
+}
+
+static void set_igvm(Object *obj, const char *value, Error **errp)
+{
+ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj);
+g_free(cgs->igvm_filename);
+cgs->igvm_filename = g_strdup(value);
+}
+#endif
+
 static void confidential_guest_support_class_init(ObjectClass *oc, void *data)
 {
+#if defined(CONFIG_IGVM)
+object_class_property_add_str(oc, "igvm-file",
+get_igvm, set_igvm);
+object_class_property_set_description(oc, "igvm-file",
+"Set the IGVM filename to use");
+#endif
 }
 
 static void confidential_guest_support_init(Object *obj)
diff --git a/include/exec/confidential-guest-support.h 
b/include/exec/confidential-guest-support.h
index ba2dd4b5df..ec74da8877 100644
--- a/include/exec/confidential-guest-support.h
+++ b/include/exec/confidential-guest-support.h
@@ -51,6 +51,15 @@ struct ConfidentialGuestSupport {
  * so 'ready' is not set, we'll abort.
  */
 bool ready;
+
+#if defined(CONFIG_IGVM)
+/*
+ * igvm_filename: Optional filename that specifies a file that contains
+ *the configuration of the guest in Independent Guest
+ *Virtual Machine (IGVM) format.
+ */
+char *igvm_filename;
+#endif
 };
 
 typedef struct ConfidentialGuestSupportClass {
diff --git a/qapi/qom.json b/qapi/qom.json
index 85e6b4f84a..5935e1b7a6 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -874,6 +874,18 @@
   'base': 'RngProperties',
   'data': { '*filename': 'str' } }
 
+##
+# @ConfidentialGuestProperties:
+#
+# Properties common to objects that are derivatives of 
confidential-guest-support.
+#
+# @igvm-file: IGVM file to use to configure guest (default: none)
+#
+# Since: 9.1
+##
+{ 'struct': 'ConfidentialGuestProperties',
+  'data': { '*igvm-file': 'str' } }
+
 ##
 # @SevGuestProperties:
 #
@@ -901,6 +913,7 @@
 # Since: 2.12
 ##
 { 'struct': 'SevGuestProperties',
+  'base': 'ConfidentialGuestProperties',
   'data': { '*sev-device': 'str',
 '*dh-cert-file': 'str',
 '*session-file': 'str',
diff --git a/qemu-options.hx b/qemu-options.hx
index 7fd1713fa8..0466bb048b 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -5655,7 +5655,7 @@ SRST
  -object secret,id=sec0,keyid=secmaster0,format=base64,\\
  data=$SECRET,iv=$(

[PATCH v2 10/10] docs/interop/firmware.json: Add igvm to FirmwareDevice

2024-04-03 Thread Roy Hopkins
Create an enum entry within FirmwareDevice for 'igvm' to describe that
an IGVM file can be used to map firmware into memory as an alternative
to pre-existing firmware devices.

Signed-off-by: Roy Hopkins 
---
 docs/interop/firmware.json | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/docs/interop/firmware.json b/docs/interop/firmware.json
index 54a1fc6c10..9a9178606e 100644
--- a/docs/interop/firmware.json
+++ b/docs/interop/firmware.json
@@ -55,10 +55,17 @@
 #
 # @memory: The firmware is to be mapped into memory.
 #
+# @igvm: The firmware is defined by a file conforming to the IGVM
+#specification and mapped into memory according to directives
+#defined in the file. This is similar to @memory but may
+#include additional processing defined by the IGVM file
+#including initial CPU state or population of metadata into
+#the guest address space.
+#
 # Since: 3.0
 ##
 { 'enum' : 'FirmwareDevice',
-  'data' : [ 'flash', 'kernel', 'memory' ] }
+  'data' : [ 'flash', 'kernel', 'memory', 'igvm' ] }
 
 ##
 # @FirmwareTarget:
-- 
2.43.0




[PATCH v2 08/10] i386/sev: Implement ConfidentialGuestSupport functions for SEV

2024-04-03 Thread Roy Hopkins
The ConfidentialGuestSupport object defines a number of virtual
functions that are called during processing of IGVM directives to query
or configure initial guest state. In order to support processing of IGVM
files, these functions need to be implemented by relevant isolation
hardware support code such as SEV.

This commit implements the required functions for SEV-ES and adds
support for processing IGVM files for configuring the guest.

Signed-off-by: Roy Hopkins 
---
 target/i386/sev.c | 137 ++
 1 file changed, 137 insertions(+)

diff --git a/target/i386/sev.c b/target/i386/sev.c
index 31dfdc3fe5..46313e7024 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -37,6 +37,7 @@
 #include "qapi/qapi-commands-misc-target.h"
 #include "exec/confidential-guest-support.h"
 #include "hw/i386/pc.h"
+#include "hw/i386/e820_memory_layout.h"
 #include "exec/address-spaces.h"
 
 #define TYPE_SEV_GUEST "sev-guest"
@@ -170,6 +171,9 @@ static const char *const sev_fw_errlist[] = {
 
 #define SEV_FW_MAX_ERROR  ARRAY_SIZE(sev_fw_errlist)
 
+static int sev_launch_update_data(SevGuestState *sev_guest, uint8_t *addr,
+  uint64_t len);
+
 static int
 sev_ioctl(int fd, int cmd, void *data, int *error)
 {
@@ -304,6 +308,14 @@ sev_guest_finalize(Object *obj)
 {
 }
 
+static int cgs_check_support(ConfidentialGuestPlatformType platform,
+ uint16_t platform_version, uint8_t highest_vtl,
+ uint64_t shared_gpa_boundary)
+{
+return (((platform == CGS_PLATFORM_SEV_ES) && sev_es_enabled()) ||
+((platform == CGS_PLATFORM_SEV) && sev_enabled())) ? 1 : 0;
+}
+
 static void sev_apply_cpu_context(CPUState *cpu)
 {
 SevGuestState *sev_guest = SEV_GUEST(MACHINE(qdev_get_machine())->cgs);
@@ -384,6 +396,54 @@ static void sev_apply_cpu_context(CPUState *cpu)
 }
 }
 
+static int check_vmsa_supported(const struct sev_es_save_area *vmsa)
+{
+struct sev_es_save_area vmsa_check;
+size_t i;
+/*
+ * Clear all supported fields so we can then check the entire structure
+ * is zero.
+ */
+memcpy(_check, vmsa, sizeof(struct sev_es_save_area));
+memset(_check.es, 0, sizeof(vmsa_check.es));
+memset(_check.cs, 0, sizeof(vmsa_check.cs));
+memset(_check.ss, 0, sizeof(vmsa_check.ss));
+memset(_check.ds, 0, sizeof(vmsa_check.ds));
+memset(_check.fs, 0, sizeof(vmsa_check.fs));
+memset(_check.gs, 0, sizeof(vmsa_check.gs));
+vmsa_check.efer = 0;
+vmsa_check.cr0 = 0;
+vmsa_check.cr3 = 0;
+vmsa_check.cr4 = 0;
+vmsa_check.xcr0 = 0;
+vmsa_check.dr6 = 0;
+vmsa_check.dr7 = 0;
+vmsa_check.rax = 0;
+vmsa_check.rcx = 0;
+vmsa_check.rdx = 0;
+vmsa_check.rbx = 0;
+vmsa_check.rsp = 0;
+vmsa_check.rbp = 0;
+vmsa_check.rsi = 0;
+vmsa_check.rdi = 0;
+vmsa_check.r8 = 0;
+vmsa_check.r9 = 0;
+vmsa_check.r10 = 0;
+vmsa_check.r11 = 0;
+vmsa_check.r12 = 0;
+vmsa_check.r13 = 0;
+vmsa_check.r14 = 0;
+vmsa_check.r15 = 0;
+vmsa_check.rip = 0;
+
+for (i = 0; i < sizeof(vmsa_check); ++i) {
+if (((uint8_t *)_check)[i]) {
+return 0;
+}
+}
+return 1;
+}
+
 static int sev_set_cpu_context(uint16_t cpu_index, const void *ctx,
uint32_t ctx_len, hwaddr gpa)
 {
@@ -446,6 +506,77 @@ static int sev_set_cpu_context(uint16_t cpu_index, const 
void *ctx,
 return 0;
 }
 
+static int cgs_set_guest_state(hwaddr gpa, uint8_t *ptr, uint64_t len,
+   ConfidentialGuestPageType memory_type,
+   uint16_t cpu_index, Error **errp)
+{
+SevGuestState *sev = SEV_GUEST(MACHINE(qdev_get_machine())->cgs);
+int ret = 1;
+
+if (!sev_enabled()) {
+error_setg(errp, "%s: attempt to configure guest memory, but SEV "
+ "is not enabled",
+ __func__);
+} else if (memory_type == CGS_PAGE_TYPE_VMSA) {
+if (!sev_es_enabled()) {
+error_setg(errp,
+   "%s: attempt to configure initial VMSA, but SEV-ES "
+   "is not supported",
+   __func__);
+} else {
+if (!check_vmsa_supported((const struct sev_es_save_area *)ptr)) {
+error_setg(errp,
+   "%s: The VMSA contains fields that are not "
+   "synchronized with KVM. Continuing would result in "
+   "either unpredictable guest behavior, or a "
+   "mismatched launch measurement.",
+   __func__);
+} else {
+ret = sev_set_cpu_context(cpu_index, ptr, len, gpa);
+}
+}
+} else if ((memory_type == CGS_PAGE_TYPE_ZERO) ||
+   (memory_type == CGS_PAGE_TYPE_NORMAL)) {
+ret = sev_launch_update_data(sev, 

[PATCH v9 09/20] virtio-net: Copy header only when necessary

2024-04-03 Thread Akihiko Odaki
The copied header is only used for byte swapping.

Signed-off-by: Akihiko Odaki 
---
 hw/net/virtio-net.c | 26 --
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index e33bdbfd84a5..ca0fbf7b7654 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -360,7 +360,8 @@ static void virtio_net_vnet_endian_status(VirtIONet *n, 
uint8_t status)
  * can't do it, we fallback onto fixing the headers in the core
  * virtio-net code.
  */
-n->needs_vnet_hdr_swap = virtio_net_set_vnet_endian(vdev, n->nic->ncs,
+n->needs_vnet_hdr_swap = n->has_vnet_hdr &&
+ virtio_net_set_vnet_endian(vdev, n->nic->ncs,
 queue_pairs, true);
 } else if (virtio_net_started(n, vdev->status)) {
 /* After using the device, we need to reset the network backend to
@@ -2767,7 +2768,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
 return -EINVAL;
 }
 
-if (n->has_vnet_hdr) {
+if (n->needs_vnet_hdr_swap) {
 if (iov_to_buf(out_sg, out_num, 0, , n->guest_hdr_len) <
 n->guest_hdr_len) {
 virtio_error(vdev, "virtio-net header incorrect");
@@ -2775,19 +2776,16 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
 g_free(elem);
 return -EINVAL;
 }
-if (n->needs_vnet_hdr_swap) {
-virtio_net_hdr_swap(vdev, (void *) );
-sg2[0].iov_base = 
-sg2[0].iov_len = n->guest_hdr_len;
-out_num = iov_copy([1], ARRAY_SIZE(sg2) - 1,
-   out_sg, out_num,
-   n->guest_hdr_len, -1);
-if (out_num == VIRTQUEUE_MAX_SIZE) {
-goto drop;
-}
-out_num += 1;
-out_sg = sg2;
+virtio_net_hdr_swap(vdev, (void *) );
+sg2[0].iov_base = 
+sg2[0].iov_len = n->guest_hdr_len;
+out_num = iov_copy([1], ARRAY_SIZE(sg2) - 1, out_sg, out_num,
+   n->guest_hdr_len, -1);
+if (out_num == VIRTQUEUE_MAX_SIZE) {
+goto drop;
 }
+out_num += 1;
+out_sg = sg2;
 }
 /*
  * If host wants to see the guest header as is, we can

-- 
2.44.0




[PATCH v9 12/20] virtio-net: Unify the logic to update NIC state for RSS

2024-04-03 Thread Akihiko Odaki
The code to attach or detach the eBPF program to RSS were duplicated so
unify them into one function to save some code.

Signed-off-by: Akihiko Odaki 
---
 hw/net/virtio-net.c | 90 +
 1 file changed, 36 insertions(+), 54 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 1ac9c06f6865..61b49e335dea 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1232,18 +1232,6 @@ static int virtio_net_handle_announce(VirtIONet *n, 
uint8_t cmd,
 }
 }
 
-static void virtio_net_detach_epbf_rss(VirtIONet *n);
-
-static void virtio_net_disable_rss(VirtIONet *n)
-{
-if (n->rss_data.enabled) {
-trace_virtio_net_rss_disable();
-}
-n->rss_data.enabled = false;
-
-virtio_net_detach_epbf_rss(n);
-}
-
 static bool virtio_net_attach_ebpf_to_backend(NICState *nic, int prog_fd)
 {
 NetClientState *nc = qemu_get_peer(qemu_get_queue(nic), 0);
@@ -1291,6 +1279,40 @@ static void virtio_net_detach_epbf_rss(VirtIONet *n)
 virtio_net_attach_ebpf_to_backend(n->nic, -1);
 }
 
+static void virtio_net_commit_rss_config(VirtIONet *n)
+{
+if (n->rss_data.enabled) {
+n->rss_data.enabled_software_rss = n->rss_data.populate_hash;
+if (n->rss_data.populate_hash) {
+virtio_net_detach_epbf_rss(n);
+} else if (!virtio_net_attach_epbf_rss(n)) {
+if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
+warn_report("Can't load eBPF RSS for vhost");
+} else {
+warn_report("Can't load eBPF RSS - fallback to software RSS");
+n->rss_data.enabled_software_rss = true;
+}
+}
+
+trace_virtio_net_rss_enable(n->rss_data.hash_types,
+n->rss_data.indirections_len,
+sizeof(n->rss_data.key));
+} else {
+virtio_net_detach_epbf_rss(n);
+trace_virtio_net_rss_disable();
+}
+}
+
+static void virtio_net_disable_rss(VirtIONet *n)
+{
+if (!n->rss_data.enabled) {
+return;
+}
+
+n->rss_data.enabled = false;
+virtio_net_commit_rss_config(n);
+}
+
 static bool virtio_net_load_ebpf_fds(VirtIONet *n)
 {
 int fds[EBPF_RSS_MAX_FDS] = { [0 ... EBPF_RSS_MAX_FDS - 1] = -1};
@@ -1455,28 +1477,7 @@ static uint16_t virtio_net_handle_rss(VirtIONet *n,
 goto error;
 }
 n->rss_data.enabled = true;
-
-if (!n->rss_data.populate_hash) {
-if (!virtio_net_attach_epbf_rss(n)) {
-/* EBPF must be loaded for vhost */
-if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
-warn_report("Can't load eBPF RSS for vhost");
-goto error;
-}
-/* fallback to software RSS */
-warn_report("Can't load eBPF RSS - fallback to software RSS");
-n->rss_data.enabled_software_rss = true;
-}
-} else {
-/* use software RSS for hash populating */
-/* and detach eBPF if was loaded before */
-virtio_net_detach_epbf_rss(n);
-n->rss_data.enabled_software_rss = true;
-}
-
-trace_virtio_net_rss_enable(n->rss_data.hash_types,
-n->rss_data.indirections_len,
-temp.b);
+virtio_net_commit_rss_config(n);
 return queue_pairs;
 error:
 trace_virtio_net_rss_error(err_msg, err_value);
@@ -3092,26 +3093,7 @@ static int virtio_net_post_load_device(void *opaque, int 
version_id)
 }
 }
 
-if (n->rss_data.enabled) {
-n->rss_data.enabled_software_rss = n->rss_data.populate_hash;
-if (!n->rss_data.populate_hash) {
-if (!virtio_net_attach_epbf_rss(n)) {
-if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
-warn_report("Can't post-load eBPF RSS for vhost");
-} else {
-warn_report("Can't post-load eBPF RSS - "
-"fallback to software RSS");
-n->rss_data.enabled_software_rss = true;
-}
-}
-}
-
-trace_virtio_net_rss_enable(n->rss_data.hash_types,
-n->rss_data.indirections_len,
-sizeof(n->rss_data.key));
-} else {
-trace_virtio_net_rss_disable();
-}
+virtio_net_commit_rss_config(n);
 return 0;
 }
 

-- 
2.44.0




[PATCH v9 20/20] ebpf: Add a separate target for skeleton

2024-04-03 Thread Akihiko Odaki
This generalizes the rule to generate the skeleton and allows adding
another.

Signed-off-by: Akihiko Odaki 
---
 tools/ebpf/Makefile.ebpf | 15 ---
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/tools/ebpf/Makefile.ebpf b/tools/ebpf/Makefile.ebpf
index 3391e7ce0898..572ca5987ae6 100755
--- a/tools/ebpf/Makefile.ebpf
+++ b/tools/ebpf/Makefile.ebpf
@@ -1,23 +1,24 @@
-OBJS = rss.bpf.o
+SKELETONS = rss.bpf.skeleton.h
 
 LLVM_STRIP ?= llvm-strip
 CLANG ?= clang
 INC_FLAGS = `$(CLANG) -print-file-name=include`
 EXTRA_CFLAGS ?= -O2 -g -target bpf
 
-all: $(OBJS)
+all: $(SKELETONS)
 
 .PHONY: clean
 
 clean:
-   rm -f $(OBJS)
-   rm -f rss.bpf.skeleton.h
+   rm -f $(SKELETONS) $(SKELETONS:%.skeleton.h=%.o)
 
-$(OBJS):  %.o:%.c
+%.o: %.c
$(CLANG) $(INC_FLAGS) \
 -D__KERNEL__ -D__ASM_SYSREG_H \
 -I../include $(LINUXINCLUDE) \
 $(EXTRA_CFLAGS) -c $< -o $@
$(LLVM_STRIP) -g $@
-   bpftool gen skeleton rss.bpf.o > rss.bpf.skeleton.h
-   cp rss.bpf.skeleton.h ../../ebpf/
+
+%.skeleton.h: %.o
+   bpftool gen skeleton $< > $@
+   cp $@ ../../ebpf/

-- 
2.44.0




[PATCH v2 00/10] Introduce support for IGVM files

2024-04-03 Thread Roy Hopkins
Here is v2 of the set of patches to add support for IGVM files to QEMU. These
address all of the comments on v1 [1]. These patches are also available
to view on github: [2].

Changes in v2:

  * Fixed various spelling and documentation errors from Stefano.
  * Addressed readability and other suggested code changes from Daniel.
  * igvm.c: Fix issue in prepare_memory() which resulted in the wrong start 
index
 being used at the start of a page range if the next or last directive did
 not follow the previous one.
  * igvm.c: Fix usage of IGVM compatibility mask.
  * igvm.c: Fix issue in page_attrs_equal() which treated zero and normal pages
as equal. This could affect the SEV measurement.
  * Improve and clarify handling of IGVM to VMSA to KVM CPU state conversion.
The specific registers that are synchronized are now documented and a check
is performed during IGVM file parsing to determine if any registers outside
the supported set are non-zero making it easier to determine the cause of
any mismatch of launch measurement.
  * Significant rework of error handling in ConfidentialGuestSupport and the
IGVM parser.
  * confidential-guest-support: Remove TDX and other non-currently-supported
platforms.
  * Exit with error if any unknown IGVM directives are encountered.
  * Rework handling of firmware so if an IGVM file is provided in addition to
a firmware file then an error is generated.
  * Update firmware.json to add an 'igvm' firmware device.

Thanks to Daniel, Stefano, Ani and everyone else that has taken time to review
this so far.

[1] Link to v1:
https://lore.kernel.org/qemu-devel/cover.1709044754.git.roy.hopk...@suse.com/

[2] v2 patches also available here:
https://github.com/roy-hopkins/qemu/tree/igvm_master_v2

Roy Hopkins (10):
  meson: Add optional dependency on IGVM library
  backends/confidential-guest-support: Add IGVM file parameter
  backends/confidential-guest-support: Add functions to support IGVM
  backends/igvm: Implement parsing and processing of IGVM files
  i386/pc: Process IGVM file during PC initialization if present
  i386/pc_sysfw: Ensure sysfw flash configuration does not conflict with
IGVM
  i386/sev: Refactor setting of reset vector and initial CPU state
  i386/sev: Implement ConfidentialGuestSupport functions for SEV
  docs/system: Add documentation on support for IGVM
  docs/interop/firmware.json: Add igvm to FirmwareDevice

 backends/confidential-guest-support.c  |  75 +++
 backends/igvm.c| 744 +
 backends/meson.build   |   4 +
 docs/interop/firmware.json |   9 +-
 docs/system/i386/amd-memory-encryption.rst |   2 +
 docs/system/igvm.rst   | 129 
 docs/system/index.rst  |   1 +
 hw/i386/pc_piix.c  |   4 +
 hw/i386/pc_q35.c   |   4 +
 hw/i386/pc_sysfw.c |  22 +-
 include/exec/confidential-guest-support.h  | 105 +++
 include/exec/igvm.h|  36 +
 meson.build|   8 +
 meson_options.txt  |   2 +
 qapi/qom.json  |  13 +
 qemu-options.hx|   8 +-
 scripts/meson-buildoptions.sh  |   3 +
 target/i386/sev.c  | 425 +++-
 target/i386/sev.h  | 110 +++
 19 files changed, 1671 insertions(+), 33 deletions(-)
 create mode 100644 backends/igvm.c
 create mode 100644 docs/system/igvm.rst
 create mode 100644 include/exec/igvm.h

--
2.43.0




[PATCH v9 15/20] virtio-net: Always set populate_hash

2024-04-03 Thread Akihiko Odaki
The member is not cleared during reset so may have a stale value.

Signed-off-by: Akihiko Odaki 
---
 hw/net/virtio-net.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 86929c9e1fad..2de073ce18fd 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -651,6 +651,7 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int 
mergeable_rx_bufs,
 n->guest_hdr_len = n->mergeable_rx_bufs ?
 sizeof(struct virtio_net_hdr_mrg_rxbuf) :
 sizeof(struct virtio_net_hdr);
+n->rss_data.populate_hash = false;
 }
 
 for (i = 0; i < n->max_queue_pairs; i++) {

-- 
2.44.0




[PATCH v9 13/20] virtio-net: Return an error when vhost cannot enable RSS

2024-04-03 Thread Akihiko Odaki
vhost requires eBPF for RSS. When eBPF is not available, virtio-net
implicitly disables RSS even if the user explicitly requests it. Return
an error instead of implicitly disabling RSS if RSS is requested but not
available.

Signed-off-by: Akihiko Odaki 
---
 hw/net/virtio-net.c | 97 ++---
 1 file changed, 48 insertions(+), 49 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 61b49e335dea..3d53eba88cfc 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -793,9 +793,6 @@ static uint64_t virtio_net_get_features(VirtIODevice *vdev, 
uint64_t features,
 return features;
 }
 
-if (!ebpf_rss_is_loaded(>ebpf_rss)) {
-virtio_clear_feature(, VIRTIO_NET_F_RSS);
-}
 features = vhost_net_get_features(get_vhost_net(nc->peer), features);
 vdev->backend_features = features;
 
@@ -3591,6 +3588,50 @@ static bool failover_hide_primary_device(DeviceListener 
*listener,
 return qatomic_read(>failover_primary_hidden);
 }
 
+static void virtio_net_device_unrealize(DeviceState *dev)
+{
+VirtIODevice *vdev = VIRTIO_DEVICE(dev);
+VirtIONet *n = VIRTIO_NET(dev);
+int i, max_queue_pairs;
+
+if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
+virtio_net_unload_ebpf(n);
+}
+
+/* This will stop vhost backend if appropriate. */
+virtio_net_set_status(vdev, 0);
+
+g_free(n->netclient_name);
+n->netclient_name = NULL;
+g_free(n->netclient_type);
+n->netclient_type = NULL;
+
+g_free(n->mac_table.macs);
+g_free(n->vlans);
+
+if (n->failover) {
+qobject_unref(n->primary_opts);
+device_listener_unregister(>primary_listener);
+migration_remove_notifier(>migration_state);
+} else {
+assert(n->primary_opts == NULL);
+}
+
+max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
+for (i = 0; i < max_queue_pairs; i++) {
+virtio_net_del_queue(n, i);
+}
+/* delete also control vq */
+virtio_del_queue(vdev, max_queue_pairs * 2);
+qemu_announce_timer_del(>announce_timer, false);
+g_free(n->vqs);
+qemu_del_nic(n->nic);
+virtio_net_rsc_cleanup(n);
+g_free(n->rss_data.indirections_table);
+net_rx_pkt_uninit(n->rx_pkt);
+virtio_cleanup(vdev);
+}
+
 static void virtio_net_device_realize(DeviceState *dev, Error **errp)
 {
 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -3760,53 +3801,11 @@ static void virtio_net_device_realize(DeviceState *dev, 
Error **errp)
 
 net_rx_pkt_init(>rx_pkt);
 
-if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
-virtio_net_load_ebpf(n);
-}
-}
-
-static void virtio_net_device_unrealize(DeviceState *dev)
-{
-VirtIODevice *vdev = VIRTIO_DEVICE(dev);
-VirtIONet *n = VIRTIO_NET(dev);
-int i, max_queue_pairs;
-
-if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
-virtio_net_unload_ebpf(n);
+if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS) &&
+!virtio_net_load_ebpf(n) && get_vhost_net(nc->peer)) {
+virtio_net_device_unrealize(dev);
+error_setg(errp, "Can't load eBPF RSS for vhost");
 }
-
-/* This will stop vhost backend if appropriate. */
-virtio_net_set_status(vdev, 0);
-
-g_free(n->netclient_name);
-n->netclient_name = NULL;
-g_free(n->netclient_type);
-n->netclient_type = NULL;
-
-g_free(n->mac_table.macs);
-g_free(n->vlans);
-
-if (n->failover) {
-qobject_unref(n->primary_opts);
-device_listener_unregister(>primary_listener);
-migration_remove_notifier(>migration_state);
-} else {
-assert(n->primary_opts == NULL);
-}
-
-max_queue_pairs = n->multiqueue ? n->max_queue_pairs : 1;
-for (i = 0; i < max_queue_pairs; i++) {
-virtio_net_del_queue(n, i);
-}
-/* delete also control vq */
-virtio_del_queue(vdev, max_queue_pairs * 2);
-qemu_announce_timer_del(>announce_timer, false);
-g_free(n->vqs);
-qemu_del_nic(n->nic);
-virtio_net_rsc_cleanup(n);
-g_free(n->rss_data.indirections_table);
-net_rx_pkt_uninit(n->rx_pkt);
-virtio_cleanup(vdev);
 }
 
 static void virtio_net_reset(VirtIODevice *vdev)

-- 
2.44.0




[PATCH v2 05/10] i386/pc: Process IGVM file during PC initialization if present

2024-04-03 Thread Roy Hopkins
An IGVM file contains configuration of a guest that supports
confidential computing hardware. As part of the PC system
initialisation, the IGVM needs to be processed to apply this
configuration before the guest is started.

This patch introduces processing of a provided IGVM file at the end of
the current PC initialization steps. If an IGVM file has been provided
then the directives in the file are processed completing the
initialization of the target.

If no IGVM file has been specified by the user then there are no
intended consequences from these changes.

Signed-off-by: Roy Hopkins 
---
 backends/confidential-guest-support.c | 18 ++
 hw/i386/pc_piix.c |  4 
 hw/i386/pc_q35.c  |  4 
 include/exec/confidential-guest-support.h | 17 +
 4 files changed, 43 insertions(+)

diff --git a/backends/confidential-guest-support.c 
b/backends/confidential-guest-support.c
index adfe447334..79c0f3fc56 100644
--- a/backends/confidential-guest-support.c
+++ b/backends/confidential-guest-support.c
@@ -88,3 +88,21 @@ static void confidential_guest_support_init(Object *obj)
 static void confidential_guest_support_finalize(Object *obj)
 {
 }
+
+bool cgs_is_igvm(ConfidentialGuestSupport *cgs)
+{
+#if defined(CONFIG_IGVM)
+return cgs && cgs->igvm;
+#else
+return false;
+#endif
+}
+
+void cgs_process_igvm(ConfidentialGuestSupport *cgs)
+{
+#if defined(CONFIG_IGVM)
+if (cgs && cgs_is_igvm(cgs)) {
+igvm_process(cgs, _fatal);
+}
+#endif
+}
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 18ba076609..f63ddb8e83 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -68,6 +68,7 @@
 #include "hw/i386/acpi-build.h"
 #include "kvm/kvm-cpu.h"
 #include "target/i386/cpu.h"
+#include "exec/confidential-guest-support.h"
 
 #define XEN_IOAPIC_NUM_PIRQS 128ULL
 
@@ -366,6 +367,9 @@ static void pc_init1(MachineState *machine, const char 
*pci_type)
x86_nvdimm_acpi_dsmio,
x86ms->fw_cfg, OBJECT(pcms));
 }
+
+/* Apply confidential guest state from IGVM if supplied */
+cgs_process_igvm(machine->cgs);
 }
 
 typedef enum PCSouthBridgeOption {
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index b5922b44af..3f24728cd3 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -60,6 +60,7 @@
 #include "hw/mem/nvdimm.h"
 #include "hw/i386/acpi-build.h"
 #include "target/i386/cpu.h"
+#include "exec/confidential-guest-support.h"
 
 /* ICH9 AHCI has 6 ports */
 #define MAX_SATA_PORTS 6
@@ -327,6 +328,9 @@ static void pc_q35_init(MachineState *machine)
x86_nvdimm_acpi_dsmio,
x86ms->fw_cfg, OBJECT(pcms));
 }
+
+/* Apply confidential guest state from IGVM if supplied */
+cgs_process_igvm(machine->cgs);
 }
 
 #define DEFINE_Q35_MACHINE(suffix, name, compatfn, optionfn) \
diff --git a/include/exec/confidential-guest-support.h 
b/include/exec/confidential-guest-support.h
index 9419e91249..c380eee2c3 100644
--- a/include/exec/confidential-guest-support.h
+++ b/include/exec/confidential-guest-support.h
@@ -145,6 +145,23 @@ typedef struct ConfidentialGuestSupportClass {
 ObjectClass parent;
 } ConfidentialGuestSupportClass;
 
+/*
+ * Check whether the configuration of the confidential guest is provided
+ * using an IGVM file. IGVM configuration can include the system firmware,
+ * initial CPU state and other configuration that should override standard
+ * system initialization. This function should be used by platforms to
+ * determine which devices and configuration to include during system
+ * initialization.
+ */
+bool cgs_is_igvm(ConfidentialGuestSupport *cgs);
+/*
+ * If IGVM is supported and an IGVM file has been specified then the
+ * configuration described in the file is applied to the guest.
+ * Configuration of a confidential guest includes the layout of the
+ * guest memory, including firmware and initial CPU state.
+ */
+void cgs_process_igvm(ConfidentialGuestSupport *cgs);
+
 #endif /* !CONFIG_USER_ONLY */
 
 #endif /* QEMU_CONFIDENTIAL_GUEST_SUPPORT_H */
-- 
2.43.0




[PATCH v2 07/10] i386/sev: Refactor setting of reset vector and initial CPU state

2024-04-03 Thread Roy Hopkins
When an SEV guest is started, the reset vector and state are
extracted from metadata that is contained in the firmware volume.

In preparation for using IGVM to setup the initial CPU state,
the code has been refactored to populate vmcb_save_area for each
CPU which is then applied during guest startup and CPU reset.

Signed-off-by: Roy Hopkins 
---
 target/i386/sev.c | 288 +-
 target/i386/sev.h | 110 ++
 2 files changed, 369 insertions(+), 29 deletions(-)

diff --git a/target/i386/sev.c b/target/i386/sev.c
index 72930ff0dc..31dfdc3fe5 100644
--- a/target/i386/sev.c
+++ b/target/i386/sev.c
@@ -74,9 +74,7 @@ struct SevGuestState {
 SevState state;
 gchar *measurement;
 
-uint32_t reset_cs;
-uint32_t reset_ip;
-bool reset_data_valid;
+QTAILQ_HEAD(, SevLaunchVmsa) launch_vmsa;
 };
 
 #define DEFAULT_GUEST_POLICY0x1 /* disable debug */
@@ -99,6 +97,12 @@ typedef struct QEMU_PACKED SevHashTableDescriptor {
 /* hard code sha256 digest size */
 #define HASH_SIZE 32
 
+/* Convert between SEV-ES VMSA and SegmentCache flags/attributes */
+#define FLAGS_VMSA_TO_SEGCACHE(flags) \
+flags) & 0xff00) << 12) | (((flags) & 0xff) << 8))
+#define FLAGS_SEGCACHE_TO_VMSA(flags) \
+flags) & 0xff00) >> 8) | (((flags) & 0xf0) >> 12))
+
 typedef struct QEMU_PACKED SevHashTableEntry {
 QemuUUID guid;
 uint16_t len;
@@ -125,6 +129,15 @@ typedef struct QEMU_PACKED PaddedSevHashTable {
 QEMU_BUILD_BUG_ON(sizeof(PaddedSevHashTable) % 16 != 0);
 
 static SevGuestState *sev_guest;
+
+typedef struct SevLaunchVmsa {
+QTAILQ_ENTRY(SevLaunchVmsa) next;
+
+uint16_t cpu_index;
+uint64_t gpa;
+struct sev_es_save_area vmsa;
+} SevLaunchVmsa;
+
 static Error *sev_mig_blocker;
 
 static const char *const sev_fw_errlist[] = {
@@ -291,6 +304,148 @@ sev_guest_finalize(Object *obj)
 {
 }
 
+static void sev_apply_cpu_context(CPUState *cpu)
+{
+SevGuestState *sev_guest = SEV_GUEST(MACHINE(qdev_get_machine())->cgs);
+X86CPU *x86;
+CPUX86State *env;
+struct SevLaunchVmsa *launch_vmsa;
+
+/* See if an initial VMSA has been provided for this CPU */
+QTAILQ_FOREACH(launch_vmsa, _guest->launch_vmsa, next)
+{
+if (cpu->cpu_index == launch_vmsa->cpu_index) {
+x86 = X86_CPU(cpu);
+env = >env;
+
+/*
+ * Ideally we would provide the VMSA directly to kvm which would
+ * ensure that the resulting initial VMSA measurement which is
+ * calculated during KVM_SEV_LAUNCH_UPDATE_VMSA is calculated from
+ * exactly what we provide here. Currently this is not possible so
+ * we need to copy the parts of the VMSA structure that we 
currently
+ * support into the CPU state.
+ */
+cpu_load_efer(env, launch_vmsa->vmsa.efer);
+cpu_x86_update_cr4(env, launch_vmsa->vmsa.cr4);
+cpu_x86_update_cr0(env, launch_vmsa->vmsa.cr0);
+cpu_x86_update_cr3(env, launch_vmsa->vmsa.cr3);
+env->xcr0 = launch_vmsa->vmsa.xcr0;
+
+cpu_x86_load_seg_cache(
+env, R_CS, launch_vmsa->vmsa.cs.selector,
+launch_vmsa->vmsa.cs.base, launch_vmsa->vmsa.cs.limit,
+FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.cs.attrib));
+cpu_x86_load_seg_cache(
+env, R_DS, launch_vmsa->vmsa.ds.selector,
+launch_vmsa->vmsa.ds.base, launch_vmsa->vmsa.ds.limit,
+FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.ds.attrib));
+cpu_x86_load_seg_cache(
+env, R_ES, launch_vmsa->vmsa.es.selector,
+launch_vmsa->vmsa.es.base, launch_vmsa->vmsa.es.limit,
+FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.es.attrib));
+cpu_x86_load_seg_cache(
+env, R_FS, launch_vmsa->vmsa.fs.selector,
+launch_vmsa->vmsa.fs.base, launch_vmsa->vmsa.fs.limit,
+FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.fs.attrib));
+cpu_x86_load_seg_cache(
+env, R_GS, launch_vmsa->vmsa.gs.selector,
+launch_vmsa->vmsa.gs.base, launch_vmsa->vmsa.gs.limit,
+FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.gs.attrib));
+cpu_x86_load_seg_cache(
+env, R_SS, launch_vmsa->vmsa.ss.selector,
+launch_vmsa->vmsa.ss.base, launch_vmsa->vmsa.ss.limit,
+FLAGS_VMSA_TO_SEGCACHE(launch_vmsa->vmsa.ss.attrib));
+
+env->dr[6] = launch_vmsa->vmsa.dr6;
+env->dr[7] = launch_vmsa->vmsa.dr7;
+
+env->regs[R_EAX] = launch_vmsa->vmsa.rax;
+env->regs[R_ECX] = launch_vmsa->vmsa.rcx;
+env->regs[R_EDX] = launch_vmsa->vmsa.rdx;
+env->regs[R_EBX] = launch_vmsa->vmsa.rbx;
+env->regs[R_ESP] = launch_vmsa->vmsa.rsp;
+env->regs[R_EBP] = launch_vmsa->vmsa.rbp;

[PATCH v9 10/20] virtio-net: Shrink header byte swapping buffer

2024-04-03 Thread Akihiko Odaki
Byte swapping is only performed for the part of header shared with the
legacy standard and the buffer only needs to cover it.

Signed-off-by: Akihiko Odaki 
---
 hw/net/virtio-net.c | 17 ++---
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index ca0fbf7b7654..5aa0527a1921 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -676,11 +676,6 @@ static void virtio_net_set_mrg_rx_bufs(VirtIONet *n, int 
mergeable_rx_bufs,
 
 n->mergeable_rx_bufs = mergeable_rx_bufs;
 
-/*
- * Note: when extending the vnet header, please make sure to
- * change the vnet header copying logic in virtio_net_flush_tx()
- * as well.
- */
 if (version_1) {
 n->guest_hdr_len = hash_report ?
 sizeof(struct virtio_net_hdr_v1_hash) :
@@ -2752,7 +2747,7 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
 ssize_t ret;
 unsigned int out_num;
 struct iovec sg[VIRTQUEUE_MAX_SIZE], sg2[VIRTQUEUE_MAX_SIZE + 1], 
*out_sg;
-struct virtio_net_hdr_v1_hash vhdr;
+struct virtio_net_hdr vhdr;
 
 elem = virtqueue_pop(q->tx_vq, sizeof(VirtQueueElement));
 if (!elem) {
@@ -2769,18 +2764,18 @@ static int32_t virtio_net_flush_tx(VirtIONetQueue *q)
 }
 
 if (n->needs_vnet_hdr_swap) {
-if (iov_to_buf(out_sg, out_num, 0, , n->guest_hdr_len) <
-n->guest_hdr_len) {
+if (iov_to_buf(out_sg, out_num, 0, , sizeof(vhdr)) <
+sizeof(vhdr)) {
 virtio_error(vdev, "virtio-net header incorrect");
 virtqueue_detach_element(q->tx_vq, elem, 0);
 g_free(elem);
 return -EINVAL;
 }
-virtio_net_hdr_swap(vdev, (void *) );
+virtio_net_hdr_swap(vdev, );
 sg2[0].iov_base = 
-sg2[0].iov_len = n->guest_hdr_len;
+sg2[0].iov_len = sizeof(vhdr);
 out_num = iov_copy([1], ARRAY_SIZE(sg2) - 1, out_sg, out_num,
-   n->guest_hdr_len, -1);
+   sizeof(vhdr), -1);
 if (out_num == VIRTQUEUE_MAX_SIZE) {
 goto drop;
 }

-- 
2.44.0




[PATCH v9 03/20] net: Move virtio-net header length assertion

2024-04-03 Thread Akihiko Odaki
The virtio-net header length assertion should happen for all clients.

Signed-off-by: Akihiko Odaki 
---
 net/net.c | 5 +
 net/tap.c | 3 ---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/net/net.c b/net/net.c
index bd51037ebfb0..db096765f4b2 100644
--- a/net/net.c
+++ b/net/net.c
@@ -56,6 +56,7 @@
 #include "net/filter.h"
 #include "qapi/string-output-visitor.h"
 #include "qapi/qobject-input-visitor.h"
+#include "standard-headers/linux/virtio_net.h"
 
 /* Net bridge is currently not supported for W32. */
 #if !defined(_WIN32)
@@ -550,6 +551,10 @@ void qemu_set_vnet_hdr_len(NetClientState *nc, int len)
 return;
 }
 
+assert(len == sizeof(struct virtio_net_hdr_mrg_rxbuf) ||
+   len == sizeof(struct virtio_net_hdr) ||
+   len == sizeof(struct virtio_net_hdr_v1_hash));
+
 nc->vnet_hdr_len = len;
 nc->info->set_vnet_hdr_len(nc, len);
 }
diff --git a/net/tap.c b/net/tap.c
index c848844955df..49edf6c2b6e1 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -267,9 +267,6 @@ static void tap_set_vnet_hdr_len(NetClientState *nc, int 
len)
 TAPState *s = DO_UPCAST(TAPState, nc, nc);
 
 assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
-assert(len == sizeof(struct virtio_net_hdr_mrg_rxbuf) ||
-   len == sizeof(struct virtio_net_hdr) ||
-   len == sizeof(struct virtio_net_hdr_v1_hash));
 
 tap_fd_set_vnet_hdr_len(s->fd, len);
 s->host_vnet_hdr_len = len;

-- 
2.44.0




[PATCH v2 01/10] meson: Add optional dependency on IGVM library

2024-04-03 Thread Roy Hopkins
The IGVM library allows Independent Guest Virtual Machine files to be
parsed and processed. IGVM files are used to configure guest memory
layout, initial processor state and other configuration pertaining to
secure virtual machines.

This adds the --enable-igvm configure option, enabled by default, which
attempts to locate and link against the IGVM library via pkgconfig and
sets CONFIG_IGVM if found.

The library is added to the system_ss target in backends/meson.build
where the IGVM parsing will be performed by the ConfidentialGuestSupport
object.

Signed-off-by: Roy Hopkins 
---
 backends/meson.build  | 3 +++
 meson.build   | 8 
 meson_options.txt | 2 ++
 scripts/meson-buildoptions.sh | 3 +++
 4 files changed, 16 insertions(+)

diff --git a/backends/meson.build b/backends/meson.build
index 8b2b111497..d550ac19f7 100644
--- a/backends/meson.build
+++ b/backends/meson.build
@@ -30,5 +30,8 @@ if have_vhost_user_crypto
 endif
 system_ss.add(when: gio, if_true: files('dbus-vmstate.c'))
 system_ss.add(when: 'CONFIG_SGX', if_true: files('hostmem-epc.c'))
+if igvm.found()
+  system_ss.add(igvm)
+endif
 
 subdir('tpm')
diff --git a/meson.build b/meson.build
index c9c3217ba4..f0b5a29ce7 100644
--- a/meson.build
+++ b/meson.build
@@ -1232,6 +1232,12 @@ if host_os == 'linux' and (have_system or have_tools)
method: 'pkg-config',
required: get_option('libudev'))
 endif
+igvm = not_found
+if not get_option('igvm').auto() or have_system
+  igvm = dependency('igvm',
+   method: 'pkg-config',
+   required: get_option('igvm'))
+endif
 
 mpathlibs = [libudev]
 mpathpersist = not_found
@@ -2320,6 +2326,7 @@ config_host_data.set('CONFIG_CFI', get_option('cfi'))
 config_host_data.set('CONFIG_SELINUX', selinux.found())
 config_host_data.set('CONFIG_XEN_BACKEND', xen.found())
 config_host_data.set('CONFIG_LIBDW', libdw.found())
+config_host_data.set('CONFIG_IGVM', igvm.found())
 if xen.found()
   # protect from xen.version() having less than three components
   xen_version = xen.version().split('.') + ['0', '0']
@@ -4456,6 +4463,7 @@ summary_info += {'seccomp support':   seccomp}
 summary_info += {'GlusterFS support': glusterfs}
 summary_info += {'hv-balloon support': hv_balloon}
 summary_info += {'TPM support':   have_tpm}
+summary_info += {'IGVM support':  igvm}
 summary_info += {'libssh support':libssh}
 summary_info += {'lzo support':   lzo}
 summary_info += {'snappy support':snappy}
diff --git a/meson_options.txt b/meson_options.txt
index 0a99a059ec..4eaba64f4b 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -109,6 +109,8 @@ option('dbus_display', type: 'feature', value: 'auto',
description: '-display dbus support')
 option('tpm', type : 'feature', value : 'auto',
description: 'TPM support')
+option('igvm', type: 'feature', value: 'auto',
+   description: 'Independent Guest Virtual Machine (IGVM) file support')
 
 # Do not enable it by default even for Mingw32, because it doesn't
 # work on Wine.
diff --git a/scripts/meson-buildoptions.sh b/scripts/meson-buildoptions.sh
index 680fa3f581..38a8183625 100644
--- a/scripts/meson-buildoptions.sh
+++ b/scripts/meson-buildoptions.sh
@@ -126,6 +126,7 @@ meson_options_help() {
   printf "%s\n" '  hv-balloon  hv-balloon driver (requires Glib 2.68+ 
GTree API)'
   printf "%s\n" '  hvf HVF acceleration support'
   printf "%s\n" '  iconv   Font glyph conversion support'
+  printf "%s\n" '  igvmIGVM file support'
   printf "%s\n" '  jackJACK sound support'
   printf "%s\n" '  keyring Linux keyring support'
   printf "%s\n" '  kvm KVM acceleration support'
@@ -342,6 +343,8 @@ _meson_option_parse() {
 --iasl=*) quote_sh "-Diasl=$2" ;;
 --enable-iconv) printf "%s" -Diconv=enabled ;;
 --disable-iconv) printf "%s" -Diconv=disabled ;;
+--enable-igvm) printf "%s" -Digvm=enabled ;;
+--disable-igvm) printf "%s" -Digvm=disabled ;;
 --includedir=*) quote_sh "-Dincludedir=$2" ;;
 --enable-install-blobs) printf "%s" -Dinstall_blobs=true ;;
 --disable-install-blobs) printf "%s" -Dinstall_blobs=false ;;
-- 
2.43.0




[PATCH v9 19/20] ebpf: Refactor tun_rss_steering_prog()

2024-04-03 Thread Akihiko Odaki
This saves branches and makes later BPF program changes easier.

Signed-off-by: Akihiko Odaki 
---
 tools/ebpf/rss.bpf.c | 26 +++---
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/tools/ebpf/rss.bpf.c b/tools/ebpf/rss.bpf.c
index 77434435ac15..c989cb3cd82c 100644
--- a/tools/ebpf/rss.bpf.c
+++ b/tools/ebpf/rss.bpf.c
@@ -547,27 +547,23 @@ int tun_rss_steering_prog(struct __sk_buff *skb)
 config = bpf_map_lookup_elem(_rss_map_configurations, );
 toe = bpf_map_lookup_elem(_rss_map_toeplitz_key, );
 
-if (config && toe) {
-if (!config->redirect) {
-return config->default_queue;
-}
+if (!config || !toe) {
+return 0;
+}
 
-if (calculate_rss_hash(skb, config, toe, )) {
-__u32 table_idx = hash % config->indirections_len;
-__u16 *queue = 0;
+if (config->redirect && calculate_rss_hash(skb, config, toe, )) {
+__u32 table_idx = hash % config->indirections_len;
+__u16 *queue = 0;
 
-queue = bpf_map_lookup_elem(_rss_map_indirection_table,
-_idx);
+queue = bpf_map_lookup_elem(_rss_map_indirection_table,
+_idx);
 
-if (queue) {
-return *queue;
-}
+if (queue) {
+return *queue;
 }
-
-return config->default_queue;
 }
 
-return 0;
+return config->default_queue;
 }
 
 char _license[] SEC("license") = "GPL v2";

-- 
2.44.0




[PATCH v9 11/20] virtio-net: Disable RSS on reset

2024-04-03 Thread Akihiko Odaki
RSS is disabled by default.

Fixes: 590790297c ("virtio-net: implement RSS configuration command")
Signed-off-by: Akihiko Odaki 
Reviewed-by: Michael Tokarev 
---
 hw/net/virtio-net.c | 70 +++--
 1 file changed, 36 insertions(+), 34 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 5aa0527a1921..1ac9c06f6865 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -600,40 +600,6 @@ static void virtio_net_queue_enable(VirtIODevice *vdev, 
uint32_t queue_index)
 }
 }
 
-static void virtio_net_reset(VirtIODevice *vdev)
-{
-VirtIONet *n = VIRTIO_NET(vdev);
-int i;
-
-/* Reset back to compatibility mode */
-n->promisc = 1;
-n->allmulti = 0;
-n->alluni = 0;
-n->nomulti = 0;
-n->nouni = 0;
-n->nobcast = 0;
-/* multiqueue is disabled by default */
-n->curr_queue_pairs = 1;
-timer_del(n->announce_timer.tm);
-n->announce_timer.round = 0;
-n->status &= ~VIRTIO_NET_S_ANNOUNCE;
-
-/* Flush any MAC and VLAN filter table state */
-n->mac_table.in_use = 0;
-n->mac_table.first_multi = 0;
-n->mac_table.multi_overflow = 0;
-n->mac_table.uni_overflow = 0;
-memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
-memcpy(>mac[0], >nic->conf->macaddr, sizeof(n->mac));
-qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
-memset(n->vlans, 0, MAX_VLAN >> 3);
-
-/* Flush any async TX */
-for (i = 0;  i < n->max_queue_pairs; i++) {
-flush_or_purge_queued_packets(qemu_get_subqueue(n->nic, i));
-}
-}
-
 static void peer_test_vnet_hdr(VirtIONet *n)
 {
 NetClientState *nc = qemu_get_queue(n->nic);
@@ -3861,6 +3827,42 @@ static void virtio_net_device_unrealize(DeviceState *dev)
 virtio_cleanup(vdev);
 }
 
+static void virtio_net_reset(VirtIODevice *vdev)
+{
+VirtIONet *n = VIRTIO_NET(vdev);
+int i;
+
+/* Reset back to compatibility mode */
+n->promisc = 1;
+n->allmulti = 0;
+n->alluni = 0;
+n->nomulti = 0;
+n->nouni = 0;
+n->nobcast = 0;
+/* multiqueue is disabled by default */
+n->curr_queue_pairs = 1;
+timer_del(n->announce_timer.tm);
+n->announce_timer.round = 0;
+n->status &= ~VIRTIO_NET_S_ANNOUNCE;
+
+/* Flush any MAC and VLAN filter table state */
+n->mac_table.in_use = 0;
+n->mac_table.first_multi = 0;
+n->mac_table.multi_overflow = 0;
+n->mac_table.uni_overflow = 0;
+memset(n->mac_table.macs, 0, MAC_TABLE_ENTRIES * ETH_ALEN);
+memcpy(>mac[0], >nic->conf->macaddr, sizeof(n->mac));
+qemu_format_nic_info_str(qemu_get_queue(n->nic), n->mac);
+memset(n->vlans, 0, MAX_VLAN >> 3);
+
+/* Flush any async TX */
+for (i = 0;  i < n->max_queue_pairs; i++) {
+flush_or_purge_queued_packets(qemu_get_subqueue(n->nic, i));
+}
+
+virtio_net_disable_rss(n);
+}
+
 static void virtio_net_instance_init(Object *obj)
 {
 VirtIONet *n = VIRTIO_NET(obj);

-- 
2.44.0




[PATCH v9 02/20] tap: Remove qemu_using_vnet_hdr()

2024-04-03 Thread Akihiko Odaki
Since qemu_set_vnet_hdr_len() is always called when
qemu_using_vnet_hdr() is called, we can merge them and save some code.

For consistency, express that the virtio-net header is not in use by
returning 0 with qemu_get_vnet_hdr_len() instead of having a dedicated
function, qemu_get_using_vnet_hdr().

Signed-off-by: Akihiko Odaki 
---
 include/net/net.h   |  7 ---
 hw/net/e1000e.c |  1 -
 hw/net/igb.c|  1 -
 hw/net/net_tx_pkt.c |  4 ++--
 hw/net/virtio-net.c |  3 ---
 hw/net/vmxnet3.c|  2 --
 net/dump.c  |  4 +---
 net/net.c   | 24 +---
 net/netmap.c|  5 -
 net/tap.c   | 28 +---
 10 files changed, 5 insertions(+), 74 deletions(-)

diff --git a/include/net/net.h b/include/net/net.h
index b1f9b35fcca1..6fe5a0aee833 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -57,8 +57,6 @@ typedef bool (HasUfo)(NetClientState *);
 typedef bool (HasUso)(NetClientState *);
 typedef bool (HasVnetHdr)(NetClientState *);
 typedef bool (HasVnetHdrLen)(NetClientState *, int);
-typedef bool (GetUsingVnetHdr)(NetClientState *);
-typedef void (UsingVnetHdr)(NetClientState *, bool);
 typedef void (SetOffload)(NetClientState *, int, int, int, int, int, int, int);
 typedef int (GetVnetHdrLen)(NetClientState *);
 typedef void (SetVnetHdrLen)(NetClientState *, int);
@@ -88,10 +86,7 @@ typedef struct NetClientInfo {
 HasUso *has_uso;
 HasVnetHdr *has_vnet_hdr;
 HasVnetHdrLen *has_vnet_hdr_len;
-GetUsingVnetHdr *get_using_vnet_hdr;
-UsingVnetHdr *using_vnet_hdr;
 SetOffload *set_offload;
-GetVnetHdrLen *get_vnet_hdr_len;
 SetVnetHdrLen *set_vnet_hdr_len;
 SetVnetLE *set_vnet_le;
 SetVnetBE *set_vnet_be;
@@ -194,8 +189,6 @@ bool qemu_has_ufo(NetClientState *nc);
 bool qemu_has_uso(NetClientState *nc);
 bool qemu_has_vnet_hdr(NetClientState *nc);
 bool qemu_has_vnet_hdr_len(NetClientState *nc, int len);
-bool qemu_get_using_vnet_hdr(NetClientState *nc);
-void qemu_using_vnet_hdr(NetClientState *nc, bool enable);
 void qemu_set_offload(NetClientState *nc, int csum, int tso4, int tso6,
   int ecn, int ufo, int uso4, int uso6);
 int qemu_get_vnet_hdr_len(NetClientState *nc);
diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
index 7c6f6029518c..d0dde767f6aa 100644
--- a/hw/net/e1000e.c
+++ b/hw/net/e1000e.c
@@ -352,7 +352,6 @@ e1000e_init_net_peer(E1000EState *s, PCIDevice *pci_dev, 
uint8_t *macaddr)
 for (i = 0; i < s->conf.peers.queues; i++) {
 nc = qemu_get_subqueue(s->nic, i);
 qemu_set_vnet_hdr_len(nc->peer, sizeof(struct virtio_net_hdr));
-qemu_using_vnet_hdr(nc->peer, true);
 }
 }
 
diff --git a/hw/net/igb.c b/hw/net/igb.c
index 9b37523d6df8..1224c7ba8e38 100644
--- a/hw/net/igb.c
+++ b/hw/net/igb.c
@@ -349,7 +349,6 @@ igb_init_net_peer(IGBState *s, PCIDevice *pci_dev, uint8_t 
*macaddr)
 for (i = 0; i < s->conf.peers.queues; i++) {
 nc = qemu_get_subqueue(s->nic, i);
 qemu_set_vnet_hdr_len(nc->peer, sizeof(struct virtio_net_hdr));
-qemu_using_vnet_hdr(nc->peer, true);
 }
 }
 
diff --git a/hw/net/net_tx_pkt.c b/hw/net/net_tx_pkt.c
index 2134a18c4c90..903238dca24d 100644
--- a/hw/net/net_tx_pkt.c
+++ b/hw/net/net_tx_pkt.c
@@ -578,7 +578,7 @@ static void net_tx_pkt_sendv(
 {
 NetClientState *nc = opaque;
 
-if (qemu_get_using_vnet_hdr(nc->peer)) {
+if (qemu_get_vnet_hdr_len(nc->peer)) {
 qemu_sendv_packet(nc, virt_iov, virt_iov_cnt);
 } else {
 qemu_sendv_packet(nc, iov, iov_cnt);
@@ -808,7 +808,7 @@ static bool net_tx_pkt_do_sw_fragmentation(struct NetTxPkt 
*pkt,
 
 bool net_tx_pkt_send(struct NetTxPkt *pkt, NetClientState *nc)
 {
-bool offload = qemu_get_using_vnet_hdr(nc->peer);
+bool offload = qemu_get_vnet_hdr_len(nc->peer);
 return net_tx_pkt_send_custom(pkt, offload, net_tx_pkt_sendv, nc);
 }
 
diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 58014a92ad19..f6112c0ac97d 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -3794,9 +3794,6 @@ static void virtio_net_device_realize(DeviceState *dev, 
Error **errp)
 
 peer_test_vnet_hdr(n);
 if (peer_has_vnet_hdr(n)) {
-for (i = 0; i < n->max_queue_pairs; i++) {
-qemu_using_vnet_hdr(qemu_get_subqueue(n->nic, i)->peer, true);
-}
 n->host_hdr_len = sizeof(struct virtio_net_hdr);
 } else {
 n->host_hdr_len = 0;
diff --git a/hw/net/vmxnet3.c b/hw/net/vmxnet3.c
index 707487c63666..63a91877730d 100644
--- a/hw/net/vmxnet3.c
+++ b/hw/net/vmxnet3.c
@@ -2091,8 +2091,6 @@ static void vmxnet3_net_init(VMXNET3State *s)
 if (s->peer_has_vhdr) {
 qemu_set_vnet_hdr_len(qemu_get_queue(s->nic)->peer,
 sizeof(struct virtio_net_hdr));
-
-qemu_using_vnet_hdr(qemu_get_queue(s->nic)->peer, 1);
 }
 
 qemu_format_nic_info_str(qemu_get_queue(s->nic), s->conf.macaddr.a);
diff --git a/net/dump.c b/net/dump.c
index 

[PATCH v9 05/20] tap: Call tap_receive_iov() from tap_receive()

2024-04-03 Thread Akihiko Odaki
This removes the duplicate logic currently found in both
tap_receive_iov() and tap_receive().

Suggested-by: "Zhang, Chen" 
Signed-off-by: Akihiko Odaki 
---
 net/tap.c | 35 +--
 1 file changed, 5 insertions(+), 30 deletions(-)

diff --git a/net/tap.c b/net/tap.c
index 99c59ee46881..9825518ff1f3 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -133,39 +133,14 @@ static ssize_t tap_receive_iov(NetClientState *nc, const 
struct iovec *iov,
 return tap_write_packet(s, iovp, iovcnt);
 }
 
-static ssize_t tap_receive_raw(NetClientState *nc, const uint8_t *buf, size_t 
size)
-{
-TAPState *s = DO_UPCAST(TAPState, nc, nc);
-struct iovec iov[2];
-int iovcnt = 0;
-struct virtio_net_hdr_mrg_rxbuf hdr = { };
-
-if (s->host_vnet_hdr_len) {
-iov[iovcnt].iov_base = 
-iov[iovcnt].iov_len  = s->host_vnet_hdr_len;
-iovcnt++;
-}
-
-iov[iovcnt].iov_base = (char *)buf;
-iov[iovcnt].iov_len  = size;
-iovcnt++;
-
-return tap_write_packet(s, iov, iovcnt);
-}
-
 static ssize_t tap_receive(NetClientState *nc, const uint8_t *buf, size_t size)
 {
-TAPState *s = DO_UPCAST(TAPState, nc, nc);
-struct iovec iov[1];
-
-if (s->host_vnet_hdr_len && !s->using_vnet_hdr) {
-return tap_receive_raw(nc, buf, size);
-}
-
-iov[0].iov_base = (char *)buf;
-iov[0].iov_len  = size;
+struct iovec iov = {
+.iov_base = (void *)buf,
+.iov_len = size
+};
 
-return tap_write_packet(s, iov, 1);
+return tap_receive_iov(nc, , 1);
 }
 
 #ifndef __sun__

-- 
2.44.0




[PATCH v9 17/20] ebpf: Fix RSS error handling

2024-04-03 Thread Akihiko Odaki
calculate_rss_hash() was using hash value 0 to tell if it calculated
a hash, but the hash value may be 0 on a rare occasion. Have a
distinct bool value for correctness.

Fixes: f3fa412de2 ("ebpf: Added eBPF RSS program.")
Signed-off-by: Akihiko Odaki 
---
 ebpf/rss.bpf.skeleton.h | 1210 +++
 tools/ebpf/rss.bpf.c|   20 +-
 2 files changed, 610 insertions(+), 620 deletions(-)

diff --git a/ebpf/rss.bpf.skeleton.h b/ebpf/rss.bpf.skeleton.h
index aed4ef9a0335..e41ed8890191 100644
--- a/ebpf/rss.bpf.skeleton.h
+++ b/ebpf/rss.bpf.skeleton.h
@@ -165,7 +165,7 @@ rss_bpf__create_skeleton(struct rss_bpf *obj)
s->progs[0].prog = >progs.tun_rss_steering_prog;
s->progs[0].link = >links.tun_rss_steering_prog;
 
-   s->data = (void *)rss_bpf__elf_bytes(>data_sz);
+   s->data = rss_bpf__elf_bytes(>data_sz);
 
obj->skeleton = s;
return 0;
@@ -176,194 +176,188 @@ err:
 
 static inline const void *rss_bpf__elf_bytes(size_t *sz)
 {
-   *sz = 20600;
-   return (const void *)"\
+   static const char data[] __attribute__((__aligned__(8))) = "\
 \x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\x38\x4d\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0d\0\
-\x01\0\xbf\x19\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x63\x1a\x4c\xff\0\0\0\0\xbf\xa7\
-\0\0\0\0\0\0\x07\x07\0\0\x4c\xff\xff\xff\x18\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
+\0\0\0\0\0\0\0\0\0\0\0\xb8\x4b\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0d\0\
+\x01\0\xbf\x19\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x63\x1a\x54\xff\0\0\0\0\xbf\xa7\
+\0\0\0\0\0\0\x07\x07\0\0\x54\xff\xff\xff\x18\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
 \xbf\x72\0\0\0\0\0\0\x85\0\0\0\x01\0\0\0\xbf\x06\0\0\0\0\0\0\x18\x01\0\0\0\0\0\
 \0\0\0\0\0\0\0\0\0\xbf\x72\0\0\0\0\0\0\x85\0\0\0\x01\0\0\0\xbf\x07\0\0\0\0\0\0\
-\x18\0\0\0\xff\xff\xff\xff\0\0\0\0\0\0\0\0\x15\x06\x61\x02\0\0\0\0\xbf\x78\0\0\
-\0\0\0\0\x15\x08\x5f\x02\0\0\0\0\x71\x61\0\0\0\0\0\0\x55\x01\x01\0\0\0\0\0\x05\
-\0\x58\x02\0\0\0\0\xb7\x01\0\0\0\0\0\0\x63\x1a\xc0\xff\0\0\0\0\x7b\x1a\xb8\xff\
-\0\0\0\0\x7b\x1a\xb0\xff\0\0\0\0\x7b\x1a\xa8\xff\0\0\0\0\x7b\x1a\xa0\xff\0\0\0\
-\0\x63\x1a\x98\xff\0\0\0\0\x7b\x1a\x90\xff\0\0\0\0\x7b\x1a\x88\xff\0\0\0\0\x7b\
-\x1a\x80\xff\0\0\0\0\x7b\x1a\x78\xff\0\0\0\0\x7b\x1a\x70\xff\0\0\0\0\x7b\x1a\
-\x68\xff\0\0\0\0\x7b\x1a\x60\xff\0\0\0\0\x7b\x1a\x58\xff\0\0\0\0\x7b\x1a\x50\
-\xff\0\0\0\0\x15\x09\x47\x02\0\0\0\0\x6b\x1a\xc8\xff\0\0\0\0\xbf\xa3\0\0\0\0\0\
-\0\x07\x03\0\0\xc8\xff\xff\xff\xbf\x91\0\0\0\0\0\0\xb7\x02\0\0\x0c\0\0\0\xb7\
+\x18\0\0\0\xff\xff\xff\xff\0\0\0\0\0\0\0\0\x15\x06\x4f\x02\0\0\0\0\xbf\x78\0\0\
+\0\0\0\0\x15\x08\x4d\x02\0\0\0\0\x71\x61\0\0\0\0\0\0\x55\x01\x01\0\0\0\0\0\x05\
+\0\x46\x02\0\0\0\0\xb7\x01\0\0\0\0\0\0\x63\x1a\xc8\xff\0\0\0\0\x7b\x1a\xc0\xff\
+\0\0\0\0\x7b\x1a\xb8\xff\0\0\0\0\x7b\x1a\xb0\xff\0\0\0\0\x7b\x1a\xa8\xff\0\0\0\
+\0\x63\x1a\xa0\xff\0\0\0\0\x7b\x1a\x98\xff\0\0\0\0\x7b\x1a\x90\xff\0\0\0\0\x7b\
+\x1a\x88\xff\0\0\0\0\x7b\x1a\x80\xff\0\0\0\0\x7b\x1a\x78\xff\0\0\0\0\x7b\x1a\
+\x70\xff\0\0\0\0\x7b\x1a\x68\xff\0\0\0\0\x7b\x1a\x60\xff\0\0\0\0\x7b\x1a\x58\
+\xff\0\0\0\0\x15\x09\x35\x02\0\0\0\0\x6b\x1a\xd0\xff\0\0\0\0\xbf\xa3\0\0\0\0\0\
+\0\x07\x03\0\0\xd0\xff\xff\xff\xbf\x91\0\0\0\0\0\0\xb7\x02\0\0\x0c\0\0\0\xb7\
 \x04\0\0\x02\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x44\0\0\0\x67\0\0\0\x20\0\0\0\
-\x77\0\0\0\x20\0\0\0\x55\0\x3c\x02\0\0\0\0\xb7\x02\0\0\x10\0\0\0\x69\xa1\xc8\
+\x77\0\0\0\x20\0\0\0\x55\0\x2a\x02\0\0\0\0\xb7\x02\0\0\x10\0\0\0\x69\xa1\xd0\
 \xff\0\0\0\0\xbf\x13\0\0\0\0\0\0\xdc\x03\0\0\x10\0\0\0\x15\x03\x02\0\0\x81\0\0\
 \x55\x03\x0b\0\xa8\x88\0\0\xb7\x02\0\0\x14\0\0\0\xbf\xa3\0\0\0\0\0\0\x07\x03\0\
-\0\xc8\xff\xff\xff\xbf\x91\0\0\0\0\0\0\xb7\x04\0\0\x02\0\0\0\xb7\x05\0\0\0\0\0\
-\0\x85\0\0\0\x44\0\0\0\x67\0\0\0\x20\0\0\0\x77\0\0\0\x20\0\0\0\x55\0\x2c\x02\0\
-\0\0\0\x69\xa1\xc8\xff\0\0\0\0\x15\x01\x2a\x02\0\0\0\0\x7b\x9a\x38\xff\0\0\0\0\
-\x15\x01\x56\0\x86\xdd\0\0\x55\x01\x3b\0\x08\0\0\0\xb7\x01\0\0\x01\0\0\0\x73\
-\x1a\x50\xff\0\0\0\0\xb7\x01\0\0\0\0\0\0\x63\x1a\xd8\xff\0\0\0\0\x7b\x1a\xd0\
-\xff\0\0\0\0\x7b\x1a\xc8\xff\0\0\0\0\xbf\xa3\0\0\0\0\0\0\x07\x03\0\0\xc8\xff\
-\xff\xff\x79\xa1\x38\xff\0\0\0\0\xb7\x02\0\0\0\0\0\0\xb7\x04\0\0\x14\0\0\0\xb7\
+\0\xd0\xff\xff\xff\xbf\x91\0\0\0\0\0\0\xb7\x04\0\0\x02\0\0\0\xb7\x05\0\0\0\0\0\
+\0\x85\0\0\0\x44\0\0\0\x67\0\0\0\x20\0\0\0\x77\0\0\0\x20\0\0\0\x55\0\x1a\x02\0\
+\0\0\0\x69\xa1\xd0\xff\0\0\0\0\x15\x01\x18\x02\0\0\0\0\x15\x01\x21\0\x86\xdd\0\
+\0\x7b\x9a\x48\xff\0\0\0\0\x55\x01\xf6\0\x08\0\0\0\xb7\x01\0\0\x01\0\0\0\x73\
+\x1a\x58\xff\0\0\0\0\xb7\x01\0\0\0\0\0\0\x63\x1a\xe0\xff\0\0\0\0\x7b\x1a\xd8\
+\xff\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xbf\xa3\0\0\0\0\0\0\x07\x03\0\0\xd0\xff\
+\xff\xff\x79\xa1\x48\xff\0\0\0\0\xb7\x02\0\0\0\0\0\0\xb7\x04\0\0\x14\0\0\0\xb7\
 \x05\0\0\x01\0\0\0\x85\0\0\0\x44\0\0\0\x67\0\0\0\x20\0\0\0\x77\0\0\0\x20\0\0\0\
-\x55\0\x17\x02\0\0\0\0\x69\xa1\xce\xff\0\0\0\0\x57\x01\0\0\x3f\xff\0\0\xb7\x02\

[PATCH v9 14/20] virtio-net: Report RSS warning at device realization

2024-04-03 Thread Akihiko Odaki
Warning about RSS fallback at device realization allows the user to
notice the configuration problem early.

Signed-off-by: Akihiko Odaki 
---
 hw/net/virtio-net.c | 23 ---
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 3d53eba88cfc..86929c9e1fad 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1261,10 +1261,12 @@ static bool virtio_net_attach_epbf_rss(VirtIONet *n)
 
 if (!ebpf_rss_set_all(>ebpf_rss, ,
   n->rss_data.indirections_table, n->rss_data.key)) {
+error_report("Failed to configure eBPF RSS");
 return false;
 }
 
 if (!virtio_net_attach_ebpf_to_backend(n->nic, n->ebpf_rss.program_fd)) {
+error_report("Failed to attach eBPF to backend");
 return false;
 }
 
@@ -1279,16 +1281,10 @@ static void virtio_net_detach_epbf_rss(VirtIONet *n)
 static void virtio_net_commit_rss_config(VirtIONet *n)
 {
 if (n->rss_data.enabled) {
-n->rss_data.enabled_software_rss = n->rss_data.populate_hash;
+n->rss_data.enabled_software_rss = n->rss_data.populate_hash ||
+   !virtio_net_attach_epbf_rss(n);
 if (n->rss_data.populate_hash) {
 virtio_net_detach_epbf_rss(n);
-} else if (!virtio_net_attach_epbf_rss(n)) {
-if (get_vhost_net(qemu_get_queue(n->nic)->peer)) {
-warn_report("Can't load eBPF RSS for vhost");
-} else {
-warn_report("Can't load eBPF RSS - fallback to software RSS");
-n->rss_data.enabled_software_rss = true;
-}
 }
 
 trace_virtio_net_rss_enable(n->rss_data.hash_types,
@@ -3802,9 +3798,14 @@ static void virtio_net_device_realize(DeviceState *dev, 
Error **errp)
 net_rx_pkt_init(>rx_pkt);
 
 if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS) &&
-!virtio_net_load_ebpf(n) && get_vhost_net(nc->peer)) {
-virtio_net_device_unrealize(dev);
-error_setg(errp, "Can't load eBPF RSS for vhost");
+!virtio_net_load_ebpf(n)) {
+if (get_vhost_net(nc->peer)) {
+error_setg(errp, "Can't load eBPF RSS for vhost");
+virtio_net_device_unrealize(dev);
+return;
+}
+
+warn_report("Can't load eBPF RSS - fallback to software RSS");
 }
 }
 

-- 
2.44.0




[PATCH v2 09/10] docs/system: Add documentation on support for IGVM

2024-04-03 Thread Roy Hopkins
IGVM support has been implemented for Confidential Guests that support
AMD SEV and AMD SEV-ES. Add some documentation that gives some
background on the IGVM format and how to use it to configure a
confidential guest.

Signed-off-by: Roy Hopkins 
---
 docs/system/i386/amd-memory-encryption.rst |   2 +
 docs/system/igvm.rst   | 129 +
 docs/system/index.rst  |   1 +
 3 files changed, 132 insertions(+)
 create mode 100644 docs/system/igvm.rst

diff --git a/docs/system/i386/amd-memory-encryption.rst 
b/docs/system/i386/amd-memory-encryption.rst
index e9bc142bc1..a253bf7db1 100644
--- a/docs/system/i386/amd-memory-encryption.rst
+++ b/docs/system/i386/amd-memory-encryption.rst
@@ -1,3 +1,5 @@
+.. _amd-sev:
+
 AMD Secure Encrypted Virtualization (SEV)
 =
 
diff --git a/docs/system/igvm.rst b/docs/system/igvm.rst
new file mode 100644
index 00..b07c11fa6e
--- /dev/null
+++ b/docs/system/igvm.rst
@@ -0,0 +1,129 @@
+Independent Guest Virtual Machine (IGVM) support
+
+
+IGVM files are designed to encapsulate all the information required to launch a
+virtual machine on any given virtualization stack in a deterministic way. This
+allows the cryptographic measurement of initial guest state for Confidential
+Guests to be calculated when the IGVM file is built, allowing a relying party 
to
+verify the initial state of a guest via a remote attestation.
+
+QEMU supports IGVM files through the Confidential Guest Support object. An igvm
+filename can optionally be passed to the object which will subsequently be
+parsed and used to configure the guest state prior to launching the guest.
+
+Further Information on IGVM
+---
+
+Information about the IGVM format, including links to the format specification
+and documentation for the Rust and C libraries can be found at the project
+repository:
+
+https://github.com/microsoft/igvm
+
+
+Supported Platforms
+---
+
+Currently, IGVM files can be provided for Confidential Guests on host systems
+that support AMD SEV and SEV-ES running under KVM.
+
+
+Limitations when using IGVM with AMD SEV and SEV-ES
+---
+
+IGVM files configure the initial state of the guest using a set of directives.
+Not every directive is supported by every Confidential Guest type. For example,
+AMD SEV does not support encrypted save state regions, therefore setting the
+initial CPU state using IGVM for SEV is not possible. When an IGVM file 
contains
+directives that are not supported for the active platform, an error is 
displayed
+and the guest launch is aborted.
+
+The table below describes the list of directives that are supported for SEV and
+SEV-ES.
+
+.. list-table:: SEV & SEV-ES Supported Directives
+   :widths: 35 65
+   :header-rows: 1
+
+   * - IGVM directive
+ - Notes
+   * - IGVM_VHT_PAGE_DATA
+ - ``NORMAL`` zero, measured and unmeasured page types are supported. Other
+   page types result in an error.
+   * - IGVM_VHT_PARAMETER_AREA
+ -
+   * - IGVM_VHT_PARAMETER_INSERT
+ -
+   * - IGVM_VHT_MEMORY_MAP
+ - The memory map page is populated using entries from the E820 table.
+   * - IGVM_VHT_VP_COUNT_PARAMETER
+ - The guest parameter page is populated with the CPU count.
+   * - IGVM_VHT_ENVIRONMENT_INFO_PARAMETER
+ - The ``memory_is_shared`` parameter is set to 1 in the guest parameter
+   page.
+
+.. list-table:: Additional SEV-ES Supported Directives
+   :widths: 25 75
+   :header-rows: 1
+
+   * - IGVM directive
+ - Notes
+   * - IGVM_VHT_VP_CONTEXT
+ - Setting of the initial CPU state for the boot CPU and additional CPUs is
+   supported with limitations on the fields that can be provided in the
+   VMSA. See below for details on which fields are supported.
+
+Initial CPU state with SEV-ES VMSA
+--
+
+The initial state of guest CPUs can be defined in the IGVM file for AMD SEV-ES.
+The state data is provided as a VMSA structure as defined in Table B-4 in the
+AMD64 Architecture Programmer's Manual, Volume 2 [1].
+
+The IGVM VMSA is translated to CPU state in QEMU which is then synchronized
+by KVM to the guest VMSA during the launch process where it contributes to the
+launch measurement. See :ref:`amd-sev` for details on the launch process and
+guest launch measurement.
+
+It is important that no information is lost or changed when translating the
+VMSA provided by the IGVM file into the VSMA that is used to launch the guest.
+Therefore, QEMU restricts the VMSA fields that can be provided in the IGVM
+VMSA structure to the following registers:
+
+RAX, RCX, RDX, RBX, RBP, RSI, RDI, R8-R15, RSP, RIP, CS, DS, ES, FS, GS, SS,
+CR0, CR3, CR4, XCR0, EFER.
+
+When processing the IGVM file, QEMU will check if any fields other than the
+above are non-zero and generate an error if 

[PATCH v9 07/20] virtio-net: Do not propagate ebpf-rss-fds errors

2024-04-03 Thread Akihiko Odaki
Propagating ebpf-rss-fds errors has several problems.

First, it makes device realization fail and disables the fallback to the
conventional eBPF loading.

Second, it leaks memory by making device realization fail without
freeing memory already allocated.

Third, the convention is to set an error when a function returns false,
but virtio_net_load_ebpf_fds() and virtio_net_load_ebpf() return false
without setting an error, which is confusing.

Remove the propagation to fix these problems.

Fixes: 0524ea0510a3 ("ebpf: Added eBPF initialization by fds.")
Signed-off-by: Akihiko Odaki 
---
 hw/net/virtio-net.c | 23 ++-
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index f6112c0ac97d..8ede38aadbbe 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1329,24 +1329,22 @@ static void virtio_net_detach_epbf_rss(VirtIONet *n)
 virtio_net_attach_ebpf_to_backend(n->nic, -1);
 }
 
-static bool virtio_net_load_ebpf_fds(VirtIONet *n, Error **errp)
+static bool virtio_net_load_ebpf_fds(VirtIONet *n)
 {
 int fds[EBPF_RSS_MAX_FDS] = { [0 ... EBPF_RSS_MAX_FDS - 1] = -1};
 int ret = true;
 int i = 0;
 
-ERRP_GUARD();
-
 if (n->nr_ebpf_rss_fds != EBPF_RSS_MAX_FDS) {
-error_setg(errp,
-  "Expected %d file descriptors but got %d",
-  EBPF_RSS_MAX_FDS, n->nr_ebpf_rss_fds);
+warn_report("Expected %d file descriptors but got %d",
+EBPF_RSS_MAX_FDS, n->nr_ebpf_rss_fds);
return false;
}
 
 for (i = 0; i < n->nr_ebpf_rss_fds; i++) {
-fds[i] = monitor_fd_param(monitor_cur(), n->ebpf_rss_fds[i], errp);
-if (*errp) {
+fds[i] = monitor_fd_param(monitor_cur(), n->ebpf_rss_fds[i],
+  _warn);
+if (fds[i] < 0) {
 ret = false;
 goto exit;
 }
@@ -1355,7 +1353,7 @@ static bool virtio_net_load_ebpf_fds(VirtIONet *n, Error 
**errp)
 ret = ebpf_rss_load_fds(>ebpf_rss, fds[0], fds[1], fds[2], fds[3]);
 
 exit:
-if (!ret || *errp) {
+if (!ret) {
 for (i = 0; i < n->nr_ebpf_rss_fds && fds[i] != -1; i++) {
 close(fds[i]);
 }
@@ -1364,13 +1362,12 @@ exit:
 return ret;
 }
 
-static bool virtio_net_load_ebpf(VirtIONet *n, Error **errp)
+static bool virtio_net_load_ebpf(VirtIONet *n)
 {
 bool ret = false;
 
 if (virtio_net_attach_ebpf_to_backend(n->nic, -1)) {
-if (!(n->ebpf_rss_fds
-&& virtio_net_load_ebpf_fds(n, errp))) {
+if (!(n->ebpf_rss_fds && virtio_net_load_ebpf_fds(n))) {
 ret = ebpf_rss_load(>ebpf_rss);
 }
 }
@@ -3825,7 +3822,7 @@ static void virtio_net_device_realize(DeviceState *dev, 
Error **errp)
 net_rx_pkt_init(>rx_pkt);
 
 if (virtio_has_feature(n->host_features, VIRTIO_NET_F_RSS)) {
-virtio_net_load_ebpf(n, errp);
+virtio_net_load_ebpf(n);
 }
 }
 

-- 
2.44.0




[PATCH v2 03/10] backends/confidential-guest-support: Add functions to support IGVM

2024-04-03 Thread Roy Hopkins
In preparation for supporting the processing of IGVM files to configure
guests, this adds a set of functions to ConfidentialGuestSupport
allowing configuration of secure virtual machines that can be
implemented for each supported isolation platform type such as Intel TDX
or AMD SEV-SNP. These functions will be called by IGVM processing code
in subsequent patches.

This commit provides a default implementation of the functions that
either perform no action or generate a warning or error when they are
called. Targets that support ConfidentialGuestSupport should override
these implementations.

Signed-off-by: Roy Hopkins 
---
 backends/confidential-guest-support.c | 32 ++
 include/exec/confidential-guest-support.h | 74 +++
 2 files changed, 106 insertions(+)

diff --git a/backends/confidential-guest-support.c 
b/backends/confidential-guest-support.c
index da436fb736..cb0bc543c0 100644
--- a/backends/confidential-guest-support.c
+++ b/backends/confidential-guest-support.c
@@ -14,6 +14,8 @@
 #include "qemu/osdep.h"
 
 #include "exec/confidential-guest-support.h"
+#include "qemu/error-report.h"
+#include "qapi/error.h"
 
 OBJECT_DEFINE_ABSTRACT_TYPE(ConfidentialGuestSupport,
 confidential_guest_support,
@@ -45,8 +47,38 @@ static void 
confidential_guest_support_class_init(ObjectClass *oc, void *data)
 #endif
 }
 
+static int check_support(ConfidentialGuestPlatformType platform,
+ uint16_t platform_version, uint8_t highest_vtl,
+ uint64_t shared_gpa_boundary)
+{
+/* Default: no support. */
+return 0;
+}
+
+static int set_guest_state(hwaddr gpa, uint8_t *ptr, uint64_t len,
+   ConfidentialGuestPageType memory_type,
+   uint16_t cpu_index, Error **errp)
+{
+error_setg(errp,
+   "Setting confidential guest state is not supported for this 
platform");
+return -1;
+}
+
+static int get_mem_map_entry(int index, ConfidentialGuestMemoryMapEntry *entry,
+ Error **errp)
+{
+error_setg(
+errp,
+"Obtaining the confidential guest memory map is not supported for this 
platform");
+return -1;
+}
+
 static void confidential_guest_support_init(Object *obj)
 {
+ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj);
+cgs->check_support = check_support;
+cgs->set_guest_state = set_guest_state;
+cgs->get_mem_map_entry = get_mem_map_entry;
 }
 
 static void confidential_guest_support_finalize(Object *obj)
diff --git a/include/exec/confidential-guest-support.h 
b/include/exec/confidential-guest-support.h
index ec74da8877..a8ad84fa07 100644
--- a/include/exec/confidential-guest-support.h
+++ b/include/exec/confidential-guest-support.h
@@ -21,10 +21,44 @@
 #ifndef CONFIG_USER_ONLY
 
 #include "qom/object.h"
+#include "exec/hwaddr.h"
+
+#if defined(CONFIG_IGVM)
+#include "igvm/igvm.h"
+#endif
 
 #define TYPE_CONFIDENTIAL_GUEST_SUPPORT "confidential-guest-support"
 OBJECT_DECLARE_SIMPLE_TYPE(ConfidentialGuestSupport, 
CONFIDENTIAL_GUEST_SUPPORT)
 
+typedef enum ConfidentialGuestPlatformType {
+CGS_PLATFORM_SEV,
+CGS_PLATFORM_SEV_ES,
+} ConfidentialGuestPlatformType;
+
+typedef enum ConfidentialGuestMemoryType {
+CGS_MEM_RAM,
+CGS_MEM_RESERVED,
+CGS_MEM_ACPI,
+CGS_MEM_NVS,
+CGS_MEM_UNUSABLE,
+} ConfidentialGuestMemoryType;
+
+typedef struct ConfidentialGuestMemoryMapEntry {
+uint64_t gpa;
+uint64_t size;
+ConfidentialGuestMemoryType type;
+} ConfidentialGuestMemoryMapEntry;
+
+typedef enum ConfidentialGuestPageType {
+CGS_PAGE_TYPE_NORMAL,
+CGS_PAGE_TYPE_VMSA,
+CGS_PAGE_TYPE_ZERO,
+CGS_PAGE_TYPE_UNMEASURED,
+CGS_PAGE_TYPE_SECRETS,
+CGS_PAGE_TYPE_CPUID,
+CGS_PAGE_TYPE_REQUIRED_MEMORY,
+} ConfidentialGuestPageType;
+
 struct ConfidentialGuestSupport {
 Object parent;
 
@@ -60,6 +94,46 @@ struct ConfidentialGuestSupport {
  */
 char *igvm_filename;
 #endif
+
+/*
+ * The following virtual methods need to be implemented by systems that
+ * support confidential guests that can be configured with IGVM and are
+ * used during processing of the IGVM file with process_igvm().
+ */
+
+/*
+ * Check for to see if this confidential guest supports a particular
+ * platform or configuration
+ */
+int (*check_support)(ConfidentialGuestPlatformType platform,
+ uint16_t platform_version, uint8_t highest_vtl,
+ uint64_t shared_gpa_boundary);
+
+/*
+ * Configure part of the state of a guest for a particular set of data, 
page
+ * type and gpa. This can be used for example to pre-populate and measure
+ * guest memory contents, define private ranges or set the initial CPU 
state
+ * for one or more CPUs.
+ *
+ * If memory_type is CGS_PAGE_TYPE_VMSA then ptr points to the initial CPU
+ * context for a virtual 

[PATCH v2 06/10] i386/pc_sysfw: Ensure sysfw flash configuration does not conflict with IGVM

2024-04-03 Thread Roy Hopkins
When using an IGVM file the configuration of the system firmware is
defined by IGVM directives contained in the file. In this case the user
should not configure any pflash devices.

This commit skips initialization of the ROM mode when pflash0 is not set
then checks to ensure no pflash devices have been configured when using
IGVM, exiting with an error message if this is not the case.

Signed-off-by: Roy Hopkins 
---
 hw/i386/pc_sysfw.c | 23 +--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/hw/i386/pc_sysfw.c b/hw/i386/pc_sysfw.c
index 3efabbbab2..2412f26225 100644
--- a/hw/i386/pc_sysfw.c
+++ b/hw/i386/pc_sysfw.c
@@ -226,8 +226,13 @@ void pc_system_firmware_init(PCMachineState *pcms,
 }
 
 if (!pflash_blk[0]) {
-/* Machine property pflash0 not set, use ROM mode */
-x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, false);
+/*
+ * Machine property pflash0 not set, use ROM mode unless using IGVM,
+ * in which case the firmware must be provided by the IGVM file.
+ */
+if (!cgs_is_igvm(MACHINE(pcms)->cgs)) {
+x86_bios_rom_init(MACHINE(pcms), "bios.bin", rom_memory, false);
+}
 } else {
 if (kvm_enabled() && !kvm_readonly_mem_enabled()) {
 /*
@@ -243,6 +248,20 @@ void pc_system_firmware_init(PCMachineState *pcms,
 }
 
 pc_system_flash_cleanup_unused(pcms);
+
+/*
+ * The user should not have specified any pflash devices when using IGVM
+ * to configure the guest.
+ */
+if (cgs_is_igvm(MACHINE(pcms)->cgs)) {
+for (i = 0; i < ARRAY_SIZE(pcms->flash); i++) {
+if (pcms->flash[i]) {
+error_report("pflash devices cannot be configured when "
+ "using IGVM");
+exit(1);
+}
+}
+}
 }
 
 void x86_firmware_configure(void *ptr, int size)
-- 
2.43.0




[PATCH v2 04/10] backends/igvm: Implement parsing and processing of IGVM files

2024-04-03 Thread Roy Hopkins
This commit adds an implementation of an IGVM loader which parses the
file specified as a parameter to ConfidentialGuestSupport and provides
a function that uses the interface in the same object to configure and
populate guest memory based on the contents of the file.

The IGVM file is parsed when a filename is provided but the code to
process the IGVM file is not yet hooked into target systems. This will
follow in a later commit.

Signed-off-by: Roy Hopkins 
---
 backends/confidential-guest-support.c |   4 +
 backends/igvm.c   | 745 ++
 backends/meson.build  |   1 +
 include/exec/confidential-guest-support.h |   5 +
 include/exec/igvm.h   |  36 ++
 5 files changed, 791 insertions(+)
 create mode 100644 backends/igvm.c
 create mode 100644 include/exec/igvm.h

diff --git a/backends/confidential-guest-support.c 
b/backends/confidential-guest-support.c
index cb0bc543c0..adfe447334 100644
--- a/backends/confidential-guest-support.c
+++ b/backends/confidential-guest-support.c
@@ -16,6 +16,7 @@
 #include "exec/confidential-guest-support.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
+#include "exec/igvm.h"
 
 OBJECT_DEFINE_ABSTRACT_TYPE(ConfidentialGuestSupport,
 confidential_guest_support,
@@ -34,6 +35,9 @@ static void set_igvm(Object *obj, const char *value, Error 
**errp)
 ConfidentialGuestSupport *cgs = CONFIDENTIAL_GUEST_SUPPORT(obj);
 g_free(cgs->igvm_filename);
 cgs->igvm_filename = g_strdup(value);
+#if defined(CONFIG_IGVM)
+igvm_file_init(cgs, errp);
+#endif
 }
 #endif
 
diff --git a/backends/igvm.c b/backends/igvm.c
new file mode 100644
index 00..87e6032a2e
--- /dev/null
+++ b/backends/igvm.c
@@ -0,0 +1,745 @@
+/*
+ * QEMU IGVM configuration backend for Confidential Guests
+ *
+ * Copyright (C) 2023-2024 SUSE
+ *
+ * Authors:
+ *  Roy Hopkins 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#if defined(CONFIG_IGVM)
+
+#include "exec/confidential-guest-support.h"
+#include "qemu/queue.h"
+#include "qemu/typedefs.h"
+
+#include "exec/igvm.h"
+#include "qemu/error-report.h"
+#include "hw/boards.h"
+#include "qapi/error.h"
+#include "exec/address-spaces.h"
+
+#include 
+#include 
+#include 
+
+typedef struct IgvmParameterData {
+QTAILQ_ENTRY(IgvmParameterData) next;
+uint8_t *data;
+uint32_t size;
+uint32_t index;
+} IgvmParameterData;
+
+static QTAILQ_HEAD(, IgvmParameterData) parameter_data;
+
+static int directive_page_data(ConfidentialGuestSupport *cgs, int i,
+   uint32_t compatibility_mask,
+   const uint8_t *header_data, Error **errp);
+static int directive_vp_context(ConfidentialGuestSupport *cgs, int i,
+uint32_t compatibility_mask,
+const uint8_t *header_data, Error **errp);
+static int directive_parameter_area(ConfidentialGuestSupport *cgs, int i,
+uint32_t compatibility_mask,
+const uint8_t *header_data, Error **errp);
+static int directive_parameter_insert(ConfidentialGuestSupport *cgs, int i,
+  uint32_t compatibility_mask,
+  const uint8_t *header_data, Error 
**errp);
+static int directive_memory_map(ConfidentialGuestSupport *cgs, int i,
+uint32_t compatibility_mask,
+const uint8_t *header_data, Error **errp);
+static int directive_vp_count(ConfidentialGuestSupport *cgs, int i,
+  uint32_t compatibility_mask,
+  const uint8_t *header_data, Error **errp);
+static int directive_environment_info(ConfidentialGuestSupport *cgs, int i,
+  uint32_t compatibility_mask,
+  const uint8_t *header_data, Error 
**errp);
+static int directive_required_memory(ConfidentialGuestSupport *cgs, int i,
+ uint32_t compatibility_mask,
+ const uint8_t *header_data, Error **errp);
+
+struct IGVMDirectiveHandler {
+uint32_t type;
+int (*handler)(ConfidentialGuestSupport *cgs, int i,
+   uint32_t compatibility_mask, const uint8_t *header_data,
+   Error **errp);
+};
+
+static struct IGVMDirectiveHandler directive_handlers[] = {
+{ IGVM_VHT_PAGE_DATA, directive_page_data },
+{ IGVM_VHT_VP_CONTEXT, directive_vp_context },
+{ IGVM_VHT_PARAMETER_AREA, directive_parameter_area },
+{ IGVM_VHT_PARAMETER_INSERT, directive_parameter_insert },
+{ IGVM_VHT_MEMORY_MAP, directive_memory_map },
+{ IGVM_VHT_VP_COUNT_PARAMETER, directive_vp_count },
+{ 

[PATCH v9 16/20] virtio-net: Do not write hashes to peer buffer

2024-04-03 Thread Akihiko Odaki
The peer buffer is qualified with const and not meant to be modified.
It also prevents enabling VIRTIO_NET_F_HASH_REPORT for peers without
virtio-net header support.

Signed-off-by: Akihiko Odaki 
---
 hw/net/virtio-net.c | 36 +---
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 2de073ce18fd..ff1884564d0d 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -1823,16 +1823,9 @@ static uint8_t virtio_net_get_hash_type(bool hasip4,
 return 0xff;
 }
 
-static void virtio_set_packet_hash(const uint8_t *buf, uint8_t report,
-   uint32_t hash)
-{
-struct virtio_net_hdr_v1_hash *hdr = (void *)buf;
-hdr->hash_value = hash;
-hdr->hash_report = report;
-}
-
 static int virtio_net_process_rss(NetClientState *nc, const uint8_t *buf,
-  size_t size)
+  size_t size,
+  struct virtio_net_hdr_v1_hash *hdr)
 {
 VirtIONet *n = qemu_get_nic_opaque(nc);
 unsigned int index = nc->queue_index, new_index = index;
@@ -1863,7 +1856,8 @@ static int virtio_net_process_rss(NetClientState *nc, 
const uint8_t *buf,
  n->rss_data.hash_types);
 if (net_hash_type > NetPktRssIpV6UdpEx) {
 if (n->rss_data.populate_hash) {
-virtio_set_packet_hash(buf, VIRTIO_NET_HASH_REPORT_NONE, 0);
+hdr->hash_value = VIRTIO_NET_HASH_REPORT_NONE;
+hdr->hash_report = 0;
 }
 return n->rss_data.redirect ? n->rss_data.default_queue : -1;
 }
@@ -1871,7 +1865,8 @@ static int virtio_net_process_rss(NetClientState *nc, 
const uint8_t *buf,
 hash = net_rx_pkt_calc_rss_hash(pkt, net_hash_type, n->rss_data.key);
 
 if (n->rss_data.populate_hash) {
-virtio_set_packet_hash(buf, reports[net_hash_type], hash);
+hdr->hash_value = hash;
+hdr->hash_report = reports[net_hash_type];
 }
 
 if (n->rss_data.redirect) {
@@ -1891,7 +1886,7 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, 
const uint8_t *buf,
 VirtQueueElement *elems[VIRTQUEUE_MAX_SIZE];
 size_t lens[VIRTQUEUE_MAX_SIZE];
 struct iovec mhdr_sg[VIRTQUEUE_MAX_SIZE];
-struct virtio_net_hdr_mrg_rxbuf mhdr;
+struct virtio_net_hdr_v1_hash extra_hdr;
 unsigned mhdr_cnt = 0;
 size_t offset, i, guest_offset, j;
 ssize_t err;
@@ -1901,7 +1896,7 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, 
const uint8_t *buf,
 }
 
 if (!no_rss && n->rss_data.enabled && n->rss_data.enabled_software_rss) {
-int index = virtio_net_process_rss(nc, buf, size);
+int index = virtio_net_process_rss(nc, buf, size, _hdr);
 if (index >= 0) {
 NetClientState *nc2 = qemu_get_subqueue(n->nic, index);
 return virtio_net_receive_rcu(nc2, buf, size, true);
@@ -1961,15 +1956,17 @@ static ssize_t virtio_net_receive_rcu(NetClientState 
*nc, const uint8_t *buf,
 if (n->mergeable_rx_bufs) {
 mhdr_cnt = iov_copy(mhdr_sg, ARRAY_SIZE(mhdr_sg),
 sg, elem->in_num,
-offsetof(typeof(mhdr), num_buffers),
-sizeof(mhdr.num_buffers));
+offsetof(typeof(extra_hdr), 
hdr.num_buffers),
+sizeof(extra_hdr.hdr.num_buffers));
 }
 
 receive_header(n, sg, elem->in_num, buf, size);
 if (n->rss_data.populate_hash) {
-offset = sizeof(mhdr);
+offset = offsetof(typeof(extra_hdr), hash_value);
 iov_from_buf(sg, elem->in_num, offset,
- buf + offset, n->host_hdr_len - sizeof(mhdr));
+ (char *)_hdr + offset,
+ sizeof(extra_hdr.hash_value) +
+ sizeof(extra_hdr.hash_report));
 }
 offset = n->host_hdr_len;
 total += n->guest_hdr_len;
@@ -2015,10 +2012,11 @@ static ssize_t virtio_net_receive_rcu(NetClientState 
*nc, const uint8_t *buf,
 }
 
 if (mhdr_cnt) {
-virtio_stw_p(vdev, _buffers, i);
+virtio_stw_p(vdev, _hdr.hdr.num_buffers, i);
 iov_from_buf(mhdr_sg, mhdr_cnt,
  0,
- _buffers, sizeof mhdr.num_buffers);
+ _hdr.hdr.num_buffers,
+ sizeof extra_hdr.hdr.num_buffers);
 }
 
 for (j = 0; j < i; j++) {

-- 
2.44.0




[PATCH v9 06/20] tap: Shrink zeroed virtio-net header

2024-04-03 Thread Akihiko Odaki
tap prepends a zeroed virtio-net header when writing a packet to a
tap with virtio-net header enabled but not in use. This only happens
when s->host_vnet_hdr_len == sizeof(struct virtio_net_hdr).

Signed-off-by: Akihiko Odaki 
---
 net/tap.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/tap.c b/net/tap.c
index 9825518ff1f3..51f7aec39d9e 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -119,7 +119,7 @@ static ssize_t tap_receive_iov(NetClientState *nc, const 
struct iovec *iov,
 TAPState *s = DO_UPCAST(TAPState, nc, nc);
 const struct iovec *iovp = iov;
 g_autofree struct iovec *iov_copy = NULL;
-struct virtio_net_hdr_mrg_rxbuf hdr = { };
+struct virtio_net_hdr hdr = { };
 
 if (s->host_vnet_hdr_len && !s->using_vnet_hdr) {
 iov_copy = g_new(struct iovec, iovcnt + 1);

-- 
2.44.0




[PATCH v9 18/20] ebpf: Return 0 when configuration fails

2024-04-03 Thread Akihiko Odaki
The kernel interprets the returned value as an unsigned 32-bit so -1
will mean queue 4294967295, which is awkward. Return 0 instead.

Signed-off-by: Akihiko Odaki 
---
 ebpf/rss.bpf.skeleton.h | 1532 +++
 tools/ebpf/rss.bpf.c|2 +-
 2 files changed, 767 insertions(+), 767 deletions(-)

diff --git a/ebpf/rss.bpf.skeleton.h b/ebpf/rss.bpf.skeleton.h
index e41ed8890191..647212e5dd0c 100644
--- a/ebpf/rss.bpf.skeleton.h
+++ b/ebpf/rss.bpf.skeleton.h
@@ -178,786 +178,786 @@ static inline const void *rss_bpf__elf_bytes(size_t *sz)
 {
static const char data[] __attribute__((__aligned__(8))) = "\
 \x7f\x45\x4c\x46\x02\x01\x01\0\0\0\0\0\0\0\0\0\x01\0\xf7\0\x01\0\0\0\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\0\0\xb8\x4b\0\0\0\0\0\0\0\0\0\0\x40\0\0\0\0\0\x40\0\x0d\0\
-\x01\0\xbf\x19\0\0\0\0\0\0\xb7\x01\0\0\0\0\0\0\x63\x1a\x54\xff\0\0\0\0\xbf\xa7\
-\0\0\0\0\0\0\x07\x07\0\0\x54\xff\xff\xff\x18\x01\0\0\0\0\0\0\0\0\0\0\0\0\0\0\
-\xbf\x72\0\0\0\0\0\0\x85\0\0\0\x01\0\0\0\xbf\x06\0\0\0\0\0\0\x18\x01\0\0\0\0\0\
-\0\0\0\0\0\0\0\0\0\xbf\x72\0\0\0\0\0\0\x85\0\0\0\x01\0\0\0\xbf\x07\0\0\0\0\0\0\
-\x18\0\0\0\xff\xff\xff\xff\0\0\0\0\0\0\0\0\x15\x06\x4f\x02\0\0\0\0\xbf\x78\0\0\
-\0\0\0\0\x15\x08\x4d\x02\0\0\0\0\x71\x61\0\0\0\0\0\0\x55\x01\x01\0\0\0\0\0\x05\
-\0\x46\x02\0\0\0\0\xb7\x01\0\0\0\0\0\0\x63\x1a\xc8\xff\0\0\0\0\x7b\x1a\xc0\xff\
-\0\0\0\0\x7b\x1a\xb8\xff\0\0\0\0\x7b\x1a\xb0\xff\0\0\0\0\x7b\x1a\xa8\xff\0\0\0\
-\0\x63\x1a\xa0\xff\0\0\0\0\x7b\x1a\x98\xff\0\0\0\0\x7b\x1a\x90\xff\0\0\0\0\x7b\
-\x1a\x88\xff\0\0\0\0\x7b\x1a\x80\xff\0\0\0\0\x7b\x1a\x78\xff\0\0\0\0\x7b\x1a\
-\x70\xff\0\0\0\0\x7b\x1a\x68\xff\0\0\0\0\x7b\x1a\x60\xff\0\0\0\0\x7b\x1a\x58\
-\xff\0\0\0\0\x15\x09\x35\x02\0\0\0\0\x6b\x1a\xd0\xff\0\0\0\0\xbf\xa3\0\0\0\0\0\
-\0\x07\x03\0\0\xd0\xff\xff\xff\xbf\x91\0\0\0\0\0\0\xb7\x02\0\0\x0c\0\0\0\xb7\
-\x04\0\0\x02\0\0\0\xb7\x05\0\0\0\0\0\0\x85\0\0\0\x44\0\0\0\x67\0\0\0\x20\0\0\0\
-\x77\0\0\0\x20\0\0\0\x55\0\x2a\x02\0\0\0\0\xb7\x02\0\0\x10\0\0\0\x69\xa1\xd0\
-\xff\0\0\0\0\xbf\x13\0\0\0\0\0\0\xdc\x03\0\0\x10\0\0\0\x15\x03\x02\0\0\x81\0\0\
-\x55\x03\x0b\0\xa8\x88\0\0\xb7\x02\0\0\x14\0\0\0\xbf\xa3\0\0\0\0\0\0\x07\x03\0\
-\0\xd0\xff\xff\xff\xbf\x91\0\0\0\0\0\0\xb7\x04\0\0\x02\0\0\0\xb7\x05\0\0\0\0\0\
-\0\x85\0\0\0\x44\0\0\0\x67\0\0\0\x20\0\0\0\x77\0\0\0\x20\0\0\0\x55\0\x1a\x02\0\
-\0\0\0\x69\xa1\xd0\xff\0\0\0\0\x15\x01\x18\x02\0\0\0\0\x15\x01\x21\0\x86\xdd\0\
-\0\x7b\x9a\x48\xff\0\0\0\0\x55\x01\xf6\0\x08\0\0\0\xb7\x01\0\0\x01\0\0\0\x73\
-\x1a\x58\xff\0\0\0\0\xb7\x01\0\0\0\0\0\0\x63\x1a\xe0\xff\0\0\0\0\x7b\x1a\xd8\
-\xff\0\0\0\0\x7b\x1a\xd0\xff\0\0\0\0\xbf\xa3\0\0\0\0\0\0\x07\x03\0\0\xd0\xff\
-\xff\xff\x79\xa1\x48\xff\0\0\0\0\xb7\x02\0\0\0\0\0\0\xb7\x04\0\0\x14\0\0\0\xb7\
-\x05\0\0\x01\0\0\0\x85\0\0\0\x44\0\0\0\x67\0\0\0\x20\0\0\0\x77\0\0\0\x20\0\0\0\
-\x55\0\x05\x02\0\0\0\0\x69\xa1\xd6\xff\0\0\0\0\x57\x01\0\0\x3f\xff\0\0\xb7\x04\
-\0\0\x01\0\0\0\x55\x01\x01\0\0\0\0\0\xb7\x04\0\0\0\0\0\0\x61\xa1\xdc\xff\0\0\0\
-\0\x63\x1a\x64\xff\0\0\0\0\x61\xa1\xe0\xff\0\0\0\0\x63\x1a\x68\xff\0\0\0\0\x71\
-\xa9\xd9\xff\0\0\0\0\x71\xa2\xd0\xff\0\0\0\0\x67\x02\0\0\x02\0\0\0\x57\x02\0\0\
-\x3c\0\0\0\x73\x4a\x5e\xff\0\0\0\0\x05\0\xbc\0\0\0\0\0\xb7\x01\0\0\x01\0\0\0\
-\x73\x1a\x59\xff\0\0\0\0\xb7\x01\0\0\0\0\0\0\x7b\x1a\xf0\xff\0\0\0\0\x7b\x1a\
-\xe8\xff\0\0\0\0\x7b\x1a\xe0\xff\0\0\0\0\x7b\x1a\xd8\xff\0\0\0\0\x7b\x1a\xd0\
-\xff\0\0\0\0\xbf\xa3\0\0\0\0\0\0\x07\x03\0\0\xd0\xff\xff\xff\xbf\x91\0\0\0\0\0\
-\0\xb7\x02\0\0\0\0\0\0\xb7\x04\0\0\x28\0\0\0\xb7\x05\0\0\x01\0\0\0\x85\0\0\0\
-\x44\0\0\0\x67\0\0\0\x20\0\0\0\x77\0\0\0\x20\0\0\0\x55\0\xe4\x01\0\0\0\0\xb7\
-\x03\0\0\x28\0\0\0\x7b\x9a\x48\xff\0\0\0\0\x79\xa1\xe0\xff\0\0\0\0\x63\x1a\x6c\
-\xff\0\0\0\0\x77\x01\0\0\x20\0\0\0\x63\x1a\x70\xff\0\0\0\0\x79\xa1\xd8\xff\0\0\
-\0\0\x63\x1a\x64\xff\0\0\0\0\x77\x01\0\0\x20\0\0\0\x63\x1a\x68\xff\0\0\0\0\x79\
-\xa1\xe8\xff\0\0\0\0\x63\x1a\x74\xff\0\0\0\0\x77\x01\0\0\x20\0\0\0\x63\x1a\x78\
-\xff\0\0\0\0\x79\xa1\xf0\xff\0\0\0\0\x63\x1a\x7c\xff\0\0\0\0\x77\x01\0\0\x20\0\
-\0\0\x63\x1a\x80\xff\0\0\0\0\x71\xa9\xd6\xff\0\0\0\0\x25\x09\x93\0\x3c\0\0\0\
-\xb7\x01\0\0\x01\0\0\0\x6f\x91\0\0\0\0\0\0\x18\x02\0\0\x01\0\0\0\0\0\0\0\0\x18\
-\0\x1c\x5f\x21\0\0\0\0\0\0\x55\x01\x01\0\0\0\0\0\x05\0\x8c\0\0\0\0\0\xb7\x01\0\
-\0\0\0\0\0\x6b\x1a\xfe\xff\0\0\0\0\xb7\x02\0\0\x28\0\0\0\xbf\xa1\0\0\0\0\0\0\
-\x07\x01\0\0\x94\xff\xff\xff\x7b\x1a\x20\xff\0\0\0\0\xbf\xa1\0\0\0\0\0\0\x07\
-\x01\0\0\x84\xff\xff\xff\x7b\x1a\x18\xff\0\0\0\0\xb7\x01\0\0\0\0\0\0\x7b\x1a\
-\x38\xff\0\0\0\0\x7b\x7a\x30\xff\0\0\0\0\x7b\x8a\x28\xff\0\0\0\0\xbf\xa3\0\0\0\
-\0\0\0\x07\x03\0\0\xfe\xff\xff\xff\x79\xa1\x48\xff\0\0\0\0\x7b\x2a\x40\xff\0\0\
-\0\0\xb7\x04\0\0\x02\0\0\0\xb7\x05\0\0\x01\0\0\0\x85\0\0\0\x44\0\0\0\x67\0\0\0\
-\x20\0\0\0\x77\0\0\0\x20\0\0\0\x55\0\xb2\x01\0\0\0\0\xbf\x91\0\0\0\0\0\0\x15\
-\x01\x22\0\x3c\0\0\0\x15\x01\x58\0\x2c\0\0\0\x79\xa2\x40\xff\0\0\0\0\x55\x01\
-\x59\0\x2b\0\0\0\xb7\x01\0\0\0\0\0\0\x63\x1a\xf8\xff\0\0\0\0\xbf\xa3\0\0\0\0\0\

[PATCH v9 04/20] net: Remove receive_raw()

2024-04-03 Thread Akihiko Odaki
While netmap implements the virtio-net header, it does not implement
receive_raw(). Instead of implementing receive_raw for netmap, add
virtio-net headers in the common code and use receive_iov()/receive()
instead. This also fixes the buffer size for the virtio-net header.

Fixes: fbbdbddec0 ("tap: allow extended virtio header with hash info")
Signed-off-by: Akihiko Odaki 
---
 include/net/net.h |  1 -
 net/net.c | 18 --
 net/tap.c |  1 -
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/include/net/net.h b/include/net/net.h
index 6fe5a0aee833..c8f679761bf9 100644
--- a/include/net/net.h
+++ b/include/net/net.h
@@ -72,7 +72,6 @@ typedef struct NetClientInfo {
 NetClientDriver type;
 size_t size;
 NetReceive *receive;
-NetReceive *receive_raw;
 NetReceiveIOV *receive_iov;
 NetCanReceive *can_receive;
 NetStart *start;
diff --git a/net/net.c b/net/net.c
index db096765f4b2..6938da05e077 100644
--- a/net/net.c
+++ b/net/net.c
@@ -787,11 +787,7 @@ static ssize_t nc_sendv_compat(NetClientState *nc, const 
struct iovec *iov,
 offset = iov_to_buf(iov, iovcnt, 0, buf, offset);
 }
 
-if (flags & QEMU_NET_PACKET_FLAG_RAW && nc->info->receive_raw) {
-ret = nc->info->receive_raw(nc, buffer, offset);
-} else {
-ret = nc->info->receive(nc, buffer, offset);
-}
+ret = nc->info->receive(nc, buffer, offset);
 
 g_free(buf);
 return ret;
@@ -806,6 +802,8 @@ static ssize_t qemu_deliver_packet_iov(NetClientState 
*sender,
 MemReentrancyGuard *owned_reentrancy_guard;
 NetClientState *nc = opaque;
 int ret;
+struct virtio_net_hdr_v1_hash vnet_hdr = { };
+g_autofree struct iovec *iov_copy = NULL;
 
 
 if (nc->link_down) {
@@ -824,7 +822,15 @@ static ssize_t qemu_deliver_packet_iov(NetClientState 
*sender,
 owned_reentrancy_guard->engaged_in_io = true;
 }
 
-if (nc->info->receive_iov && !(flags & QEMU_NET_PACKET_FLAG_RAW)) {
+if ((flags & QEMU_NET_PACKET_FLAG_RAW) && nc->vnet_hdr_len) {
+iov_copy = g_new(struct iovec, iovcnt + 1);
+iov_copy[0].iov_base = _hdr;
+iov_copy[0].iov_len =  nc->vnet_hdr_len;
+memcpy(_copy[1], iov, iovcnt * sizeof(*iov));
+iov = iov_copy;
+}
+
+if (nc->info->receive_iov) {
 ret = nc->info->receive_iov(nc, iov, iovcnt);
 } else {
 ret = nc_sendv_compat(nc, iov, iovcnt, flags);
diff --git a/net/tap.c b/net/tap.c
index 49edf6c2b6e1..99c59ee46881 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -360,7 +360,6 @@ static NetClientInfo net_tap_info = {
 .type = NET_CLIENT_DRIVER_TAP,
 .size = sizeof(TAPState),
 .receive = tap_receive,
-.receive_raw = tap_receive_raw,
 .receive_iov = tap_receive_iov,
 .poll = tap_poll,
 .cleanup = tap_cleanup,

-- 
2.44.0




[PATCH v9 01/20] tap: Remove tap_probe_vnet_hdr_len()

2024-04-03 Thread Akihiko Odaki
It was necessary since a Linux older than 2.6.35 may implement the
virtio-net header but may not allow changing its length. Remove it
since such an old Linux is no longer supported.

Signed-off-by: Akihiko Odaki 
Acked-by: Michael S. Tsirkin 
---
 net/tap_int.h |  1 -
 net/tap-bsd.c |  5 -
 net/tap-linux.c   | 20 
 net/tap-solaris.c |  5 -
 net/tap-stub.c|  5 -
 net/tap.c |  8 ++--
 6 files changed, 2 insertions(+), 42 deletions(-)

diff --git a/net/tap_int.h b/net/tap_int.h
index 9a2175655bb0..8857ff299d22 100644
--- a/net/tap_int.h
+++ b/net/tap_int.h
@@ -35,7 +35,6 @@ ssize_t tap_read_packet(int tapfd, uint8_t *buf, int maxlen);
 
 void tap_set_sndbuf(int fd, const NetdevTapOptions *tap, Error **errp);
 int tap_probe_vnet_hdr(int fd, Error **errp);
-int tap_probe_vnet_hdr_len(int fd, int len);
 int tap_probe_has_ufo(int fd);
 int tap_probe_has_uso(int fd);
 void tap_fd_set_offload(int fd, int csum, int tso4, int tso6, int ecn, int ufo,
diff --git a/net/tap-bsd.c b/net/tap-bsd.c
index 274ea7bd2c3c..b4c84441ba8b 100644
--- a/net/tap-bsd.c
+++ b/net/tap-bsd.c
@@ -217,11 +217,6 @@ int tap_probe_has_uso(int fd)
 return 0;
 }
 
-int tap_probe_vnet_hdr_len(int fd, int len)
-{
-return 0;
-}
-
 void tap_fd_set_vnet_hdr_len(int fd, int len)
 {
 }
diff --git a/net/tap-linux.c b/net/tap-linux.c
index c7e514ecb04b..1226d5fda2d9 100644
--- a/net/tap-linux.c
+++ b/net/tap-linux.c
@@ -185,26 +185,6 @@ int tap_probe_has_uso(int fd)
 return 1;
 }
 
-/* Verify that we can assign given length */
-int tap_probe_vnet_hdr_len(int fd, int len)
-{
-int orig;
-if (ioctl(fd, TUNGETVNETHDRSZ, ) == -1) {
-return 0;
-}
-if (ioctl(fd, TUNSETVNETHDRSZ, ) == -1) {
-return 0;
-}
-/* Restore original length: we can't handle failure. */
-if (ioctl(fd, TUNSETVNETHDRSZ, ) == -1) {
-fprintf(stderr, "TUNGETVNETHDRSZ ioctl() failed: %s. Exiting.\n",
-strerror(errno));
-abort();
-return -errno;
-}
-return 1;
-}
-
 void tap_fd_set_vnet_hdr_len(int fd, int len)
 {
 if (ioctl(fd, TUNSETVNETHDRSZ, ) == -1) {
diff --git a/net/tap-solaris.c b/net/tap-solaris.c
index 08b13af51257..51b7830bef1d 100644
--- a/net/tap-solaris.c
+++ b/net/tap-solaris.c
@@ -221,11 +221,6 @@ int tap_probe_has_uso(int fd)
 return 0;
 }
 
-int tap_probe_vnet_hdr_len(int fd, int len)
-{
-return 0;
-}
-
 void tap_fd_set_vnet_hdr_len(int fd, int len)
 {
 }
diff --git a/net/tap-stub.c b/net/tap-stub.c
index 4b24f61e3a6c..38673434cbd6 100644
--- a/net/tap-stub.c
+++ b/net/tap-stub.c
@@ -52,11 +52,6 @@ int tap_probe_has_uso(int fd)
 return 0;
 }
 
-int tap_probe_vnet_hdr_len(int fd, int len)
-{
-return 0;
-}
-
 void tap_fd_set_vnet_hdr_len(int fd, int len)
 {
 }
diff --git a/net/tap.c b/net/tap.c
index baaa2f7a9ac7..72ae95894ff1 100644
--- a/net/tap.c
+++ b/net/tap.c
@@ -259,11 +259,7 @@ static bool tap_has_vnet_hdr(NetClientState *nc)
 
 static bool tap_has_vnet_hdr_len(NetClientState *nc, int len)
 {
-TAPState *s = DO_UPCAST(TAPState, nc, nc);
-
-assert(nc->info->type == NET_CLIENT_DRIVER_TAP);
-
-return !!tap_probe_vnet_hdr_len(s->fd, len);
+return tap_has_vnet_hdr(nc);
 }
 
 static int tap_get_vnet_hdr_len(NetClientState *nc)
@@ -432,7 +428,7 @@ static TAPState *net_tap_fd_init(NetClientState *peer,
  * Make sure host header length is set correctly in tap:
  * it might have been modified by another instance of qemu.
  */
-if (tap_probe_vnet_hdr_len(s->fd, s->host_vnet_hdr_len)) {
+if (vnet_hdr) {
 tap_fd_set_vnet_hdr_len(s->fd, s->host_vnet_hdr_len);
 }
 tap_read_poll(s, true);

-- 
2.44.0




[PATCH v9 08/20] virtio-net: Add only one queue pair when realizing

2024-04-03 Thread Akihiko Odaki
Multiqueue usage is not negotiated yet when realizing. If more than
one queue is added and the guest never requests to enable multiqueue,
the extra queues will not be deleted when unrealizing and leak.

Fixes: f9d6dbf0bf6e ("virtio-net: remove virtio queues if the guest doesn't 
support multiqueue")
Signed-off-by: Akihiko Odaki 
---
 hw/net/virtio-net.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c
index 8ede38aadbbe..e33bdbfd84a5 100644
--- a/hw/net/virtio-net.c
+++ b/hw/net/virtio-net.c
@@ -3759,9 +3759,7 @@ static void virtio_net_device_realize(DeviceState *dev, 
Error **errp)
 n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n),
 n->net_conf.tx_queue_size);
 
-for (i = 0; i < n->max_queue_pairs; i++) {
-virtio_net_add_queue(n, i);
-}
+virtio_net_add_queue(n, 0);
 
 n->ctrl_vq = virtio_add_queue(vdev, 64, virtio_net_handle_ctrl);
 qemu_macaddr_default_if_unset(>nic_conf.macaddr);

-- 
2.44.0




[PATCH v9 00/20] virtio-net RSS/hash report fixes and improvements

2024-04-03 Thread Akihiko Odaki
This series contains fixes and improvements for virtio-net RSS and hash
reporting feature.

V7 -> V8:
  Reset author email addresses.
  Rebased.

V6 -> V7:
  Dropped patch "virtio-net: Do not clear VIRTIO_NET_F_HASH_REPORT".
  Dropped the changes to remove packet flags.
  Re-introduced tap_receive() and changed it to call tap_receive_iov().
  Removed tap_get_vnet_hdr_len().
  Fixed tap initialization not to call tap_fd_set_vnet_hdr_len() for tap
  without virtio-net header.
  Changed to call error_report() instead of warn_report() for
  programming errors.

V5 -> V6:
  Corrected the message for patch "virtio-net: Return an error when vhost
  cannot enable RSS".
  Removed changes to introduce asserts from "virtio-net: Return an error
  when vhost cannot enable RSS".
  Reorganized patches "virtio-net: Return an error when vhost cannot
  enable RSS" and "virtio-net: Do not clear VIRTIO_NET_F_RSS". This
  version now contains patches "virtio-net: Return an error when vhost
  cannot enable RSS" and "virtio-net: Enable software RSS".
  Rebased.

V4 -> V5:
  Added patch "virtio-net: Do not write hashes to peer buffer".

V3 -> V4:
  Extract patches "tap: Remove tap_receive()" and  "net: Remove flag
  propagation" from "net: Remove receive_raw()".
  Added patch "virtio-net: Always set populate_hash".
  Added patch "virtio-net: Do not clear VIRTIO_NET_F_HASH_REPORT".
  Added patch "ebpf: Use standard section name".
  Added patch "ebpf: Simplify error handling".
  Added patch "ebpf: Return 0 when configuration fails".
  Added patch "ebpf: Refactor tun_rss_steering_prog()".
  Added patch "ebpf: Add a separate target for skeleton".

V2 -> V3:
  Added patch "tap: Remove tap_probe_vnet_hdr_len()".
  Added patch "tap: Remove qemu_using_vnet_hdr()".
  Added patch "net: Move virtio-net header length assertion".
  Added patch "net: Remove receive_raw()".
  Added patch "tap: Shrink zeroed virtio-net header".
  Dropped patch "tap: Fix virtio-net header buffer size".

V1 -> V2:
  Added patch "ebpf: Fix RSS error handling".

Signed-off-by: Akihiko Odaki 
---
Changes in v9:
- Added patch "virtio-net: Do not propagate ebpf-rss-fds errors".
- Added patch "virtio-net: Shrink header byte swapping buffer".
- Rebased.
- Link to v8: 
https://lore.kernel.org/r/20231210-rss-v8-0-9553ee714...@daynix.com

---
Akihiko Odaki (20):
  tap: Remove tap_probe_vnet_hdr_len()
  tap: Remove qemu_using_vnet_hdr()
  net: Move virtio-net header length assertion
  net: Remove receive_raw()
  tap: Call tap_receive_iov() from tap_receive()
  tap: Shrink zeroed virtio-net header
  virtio-net: Do not propagate ebpf-rss-fds errors
  virtio-net: Add only one queue pair when realizing
  virtio-net: Copy header only when necessary
  virtio-net: Shrink header byte swapping buffer
  virtio-net: Disable RSS on reset
  virtio-net: Unify the logic to update NIC state for RSS
  virtio-net: Return an error when vhost cannot enable RSS
  virtio-net: Report RSS warning at device realization
  virtio-net: Always set populate_hash
  virtio-net: Do not write hashes to peer buffer
  ebpf: Fix RSS error handling
  ebpf: Return 0 when configuration fails
  ebpf: Refactor tun_rss_steering_prog()
  ebpf: Add a separate target for skeleton

 ebpf/rss.bpf.skeleton.h  | 1558 +++---
 include/net/net.h|8 -
 net/tap_int.h|1 -
 hw/net/e1000e.c  |1 -
 hw/net/igb.c |1 -
 hw/net/net_tx_pkt.c  |4 +-
 hw/net/virtio-net.c  |  344 +-
 hw/net/vmxnet3.c |2 -
 net/dump.c   |4 +-
 net/net.c|   47 +-
 net/netmap.c |5 -
 net/tap-bsd.c|5 -
 net/tap-linux.c  |   20 -
 net/tap-solaris.c|5 -
 net/tap-stub.c   |5 -
 net/tap.c|   77 +--
 tools/ebpf/rss.bpf.c |   44 +-
 tools/ebpf/Makefile.ebpf |   15 +-
 18 files changed, 988 insertions(+), 1158 deletions(-)
---
base-commit: e5c6528dce86d7a9ada7ecf02fcb7b8560955131
change-id: 20231210-rss-e7c98e722253

Best regards,
-- 
Akihiko Odaki 




Re: [RFC PATCH 00/12] SMMUv3 nested translation support

2024-04-03 Thread Mostafa Saleh
Hi Nicolin,

On Tue, Apr 02, 2024 at 03:28:12PM -0700, Nicolin Chen wrote:
> Hi Mostafa,
> 
> On Mon, Mar 25, 2024 at 10:13:56AM +, Mostafa Saleh wrote:
> > 
> > Currently, QEMU supports emulating either stage-1 or stage-2 SMMUs
> > but not nested instances.
> > This patch series adds support for nested translation in SMMUv3,
> > this is controlled by property “arm-smmuv3.stage=nested”, and
advertised to guests as (IDR0.S1P == 1 && IDR0.S2P == 1)
> 
> IIUIC, with this series, vSMMU will support a virtualized 2-stage
> translation in a guest VM, right? I wonder how it would interact

I always get confused with terminologies when dealing with QEMU;
as the host can mean the actual host (which is x86_64 in my case)
and the guest would be aarch64 Linux fully emulated by QEMU, and the
emulated guest can be considered a host and launch its guests with
KVM for example. This also will be more fun with guests supporting
nested virtualization :)

For simplicity, I will consider:
- HOST: the fully emulated QEMU guest (aarch64) running on my machine.
- GUEST: Any guest launched by the HOST (through KVM for example)
- QEMU: Is the instance of QEMU emulating the HOST (built for x86)
- QEMU-VMM: Is the instance of QEMU running on the HOST (built for
  aarch64) which launches VMs(GUESTs).

With that, AFAIU, vSMMU is the SMMUv3 emulation used for GUESTs with
QEMU-VMM, where it has hooks in CMDQ and then the QEMU-VMM will issue
IOCTLs to the HOST to do the actual SMMU work (through iommufd or IIRC
there were previous patches from Eric that did that also), also the
vSMMU is out of tree AFAICT.

In that case, this work is orthogonal to that, the nested SMMUv3
emulation in this series mainly targets QEMU which is advertised
to the HOST, which then allows it to use iommufd with GUESTs.

In theory, that work can be extended to QEMU-VMM with vSMMU, but
I guess that would be a lot of work as the VMM needs to collapse
both stages as the kernel provides only one address space for the VMM.

Mainly, I use these patches to test nesting patches I am hacking for
KVM; they can also be used with your patches to test iommufd without
needing hardware. (See testing section in the cover letter)

> with the ongoing 2-stage nesting support with host and guest. Or
> is it supposed to be just a total orthogonal feature without any
> interaction with the host system?

Are you referring to the iommufd work on Linux to support nesting?


Thanks,
Mostafa
> Thanks
> Nicolin
> 
> > Main changes(architecture):
> > 
> > 1) CDs are considered IPA and translated with stage-2.
> > 2) TTBx and tables for stage-1 are considered IPA and translated
> >with stage-2.
> > 3) Translate the IPA address with stage-2.
> > 
> > TLBs:
> > ==
> > TLBs are the most tricky part.
> > 
> > 1) General design
> >Unified(Combined) design is used, where a new tag is added "stage"
> >which has 2 valid values:
> >- STAGE_1: Meaning this entry translates VA to PADDR, it can be
> >  cached from fully nested configuration or from stage-1 only.
> >  It doesn't support separate cached entries (VA to IPA).
> > 
> >- STAGE_2: Meaning this translates IPA to PADDR, cached from
> >  stage-2  only configuration.
> > 
> >TLBs are also modified to cache 2 permissions, a new permission added
> >"parent_perm."
> > 
> >For non-nested configuration, perm == parent_perm and nothing
> >changes. This is used to know which stage to use in case there is
> >a permission fault from a TLB entry.
> > 
> > 2) Caching in TLB
> >Stage-1 and stage-2 are inserted in the TLB as is.
> >For nested translation, both entries are combined into one TLB
> >entry. Everything is used from stage-1, except:
> >- translated_addr from stage-2.
> >- parent_perm is from stage-2.
> >- addr_mask: is the minimum of both.
> > 
> > 3) TLB Lookup
> >For stage-1 and nested translations, it looks for STAGE_1 entries.
> >For stage-2 it looks for STAGE_2 TLB entries.
> > 
> > 4) TLB invalidation
> >- Stage-1 commands (CMD_TLBI_NH_VAA, SMMU_CMD_TLBI_NH_VA,
> >  SMMU_CMD_TLBI_NH_ALL): Invalidate TLBs tagged with SMMU_STAGE_1.
> >- Stage-2 commands (CMD_TLBI_S2_IPA): Invalidate TLBs tagged with
> >  SMMU_STAGE_2.
> >- All (SMMU_CMD_TLBI_S12_VMALL): Will invalidate both, this is
> >  communicated to the TLB as SMMU_NESTED which is (SMMU_STAGE_1 |
> >  SMMU_STAGE_2) which uses it as a mask.
> > 
> >As far as I understand, this is compliant with the ARM
> >architecture, based on:
> >- ARM ARM DDI 0487J.a: RLGSCG, RTVTYQ, RGNJPZ
> >- ARM IHI 0070F.b: 16.2 Caching
> > 
> >An alternative approach would be to instantiate 2 TLBs, one per
> >each stage. I haven’t investigated that.
> > 
> > Others
> > ===
> > - Advertise SMMUv3.2-S2FWB, it is NOP for QEMU as it doesn’t support
> >   attributes.
> > 
> > - OAS: A typical setup with nesting is to share CPU stage-2 with the
> >   

[PATCH 6/6] bios-tables-test: Add data for complex numa test (GI, GP etc)

2024-04-03 Thread Jonathan Cameron via
Given this is a new configuration, there are effects on APIC, CEDT
and DSDT, but the key elements are in SRAT (plus related data in
HMAT).  The configuration has nodes to exercise many different combinations.

0) CPUs + Memory
1) GI only
2) GP only
3) CPUS only
4) Memory only
5) CPUs + HP memory

GI node, GP Node, Memory only node, hotplug memory
only node, latency and bandwidth such that in Linux Access0
(any initiator) and Access1 (CPU initiators only) give different
answers.  The following is cropped to remove details of each entry.

[000h  004h]   Signature : "SRAT"[System Resource 
Affinity Table]

[030h 0048 001h]   Subtable Type : 00 [Processor Local APIC/SAPIC 
Affinity]
[032h 0050 001h] Proximity Domain Low(8) : 00
[033h 0051 001h] Apic ID : 00

[040h 0064 001h]   Subtable Type : 00 [Processor Local APIC/SAPIC 
Affinity]
[042h 0066 001h] Proximity Domain Low(8) : 03   


   [043h 0067 001h] 
Apic ID : 01

[050h 0080 001h]   Subtable Type : 00 [Processor Local APIC/SAPIC 
Affinity]
[052h 0082 001h] Proximity Domain Low(8) : 05
[053h 0083 001h] Apic ID : 02

[060h 0096 001h]   Subtable Type : 01 [Memory Affinity]
[062h 0098 004h]Proximity Domain : 
[068h 0104 008h]Base Address : 
[070h 0112 008h]  Address Length : 000A

[088h 0136 001h]   Subtable Type : 01 [Memory Affinity]
[08Ah 0138 004h]Proximity Domain : 
[090h 0144 008h]Base Address : 0010
[098h 0152 008h]  Address Length : 03F0
[0A8h 0168 008h]   Reserved3 : 

[0B0h 0176 001h]   Subtable Type : 01 [Memory Affinity]
[0B2h 0178 004h]Proximity Domain : 0004
[0B8h 0184 008h]Base Address : 0400
[0C0h 0192 008h]  Address Length : 0400

//Comment in hw/i386/aml-build.c on why these exist - not part of
//ACPI requirements.
[0D8h 0216 001h]   Subtable Type : 01 [Memory Affinity]
[0DAh 0218 004h]Proximity Domain : 
[0E0h 0224 008h]Base Address : 
[0E8h 0232 008h]  Address Length : 

[100h 0256 001h]   Subtable Type : 01 [Memory Affinity]
[102h 0258 004h]Proximity Domain : 
[108h 0264 008h]Base Address : 
[110h 0272 008h]  Address Length : 

[128h 0296 001h]   Subtable Type : 01 [Memory Affinity]
[12Ah 0298 004h]Proximity Domain : 
[130h 0304 008h]Base Address : 
[138h 0312 008h]  Address Length : 

[150h 0336 001h]   Subtable Type : 01 [Memory Affinity]
[152h 0338 004h]Proximity Domain : 
[158h 0344 008h]Base Address : 
[160h 0352 008h]  Address Length : 

[178h 0376 001h]   Subtable Type : 01 [Memory Affinity]
[17Ah 0378 004h]Proximity Domain : 
[180h 0384 008h]Base Address : 
[188h 0392 008h]  Address Length : 
// End of strange empty Memory Affinity structures.

[1A0h 0416 001h]   Subtable Type : 05 [Generic Initiator Affinity]
[1A3h 0419 001h]  Device Handle Type : 01
[1A4h 0420 004h]Proximity Domain : 0001
[1A8h 0424 010h]   Device Handle : 00 00 10 00 00 00 00 00 00 00 00 
00 00 00 00 00

[1C0h 0448 001h]   Subtable Type : 06 [Generic Port Affinity]
[1C3h 0451 001h]  Device Handle Type : 00
[1C4h 0452 004h]Proximity Domain : 0002
[1C8h 0456 010h]   Device Handle : 41 43 50 49 30 30 31 36 40 00 00 
00 00 00 00 00

[1E0h 0480 001h]   Subtable Type : 01 [Memory Affinity]
[1E2h 0482 004h]Proximity Domain : 0005
[1E8h 0488 008h]Base Address : 0001
[1F0h 0496 008h]  Address Length : 9000
[1FCh 0508 004h]   Flags (decoded below) : 0003
 Enabled : 1
   Hot Pluggable : 1
Non-Volatile : 0

Example block from HMAT:
[0F0h 0240 002h]  Structure Type : 0001 [System Locality Latency 
and Bandwidth Information]  

   [0F2h 0242 002h]  

[PATCH 5/6] bios-tables-test: Add complex SRAT / HMAT test for GI GP

2024-04-03 Thread Jonathan Cameron via
Add a test with 6 nodes to exercise most interesting corner cases
of SRAT and HMAT generation including the new Generic Initiator
and Generic Port Affinity structures.  More details of the
set up in the following patch adding the table data.

Signed-off-by: Jonathan Cameron 
---
 tests/qtest/bios-tables-test.c | 92 ++
 1 file changed, 92 insertions(+)

diff --git a/tests/qtest/bios-tables-test.c b/tests/qtest/bios-tables-test.c
index d1ff4db7a2..1651d06b7b 100644
--- a/tests/qtest/bios-tables-test.c
+++ b/tests/qtest/bios-tables-test.c
@@ -1862,6 +1862,96 @@ static void test_acpi_q35_tcg_acpi_hmat_noinitiator(void)
 free_test_data();
 }
 
+/* Test intended to hit corner cases of SRAT and HMAT */
+static void test_acpi_q35_tcg_acpi_hmat_generic_x(void)
+{
+test_data data = {};
+
+data.machine = MACHINE_Q35;
+data.variant = ".acpihmat-generic-x";
+test_acpi_one(" -machine hmat=on,cxl=on"
+  " -smp 3,sockets=3"
+  " -m 128M,maxmem=384M,slots=2"
+  " -device virtio-rng-pci,id=gidev"
+  " -device pxb-cxl,bus_nr=64,bus=pcie.0,id=cxl.1"
+  " -object memory-backend-ram,size=64M,id=ram0"
+  " -object memory-backend-ram,size=64M,id=ram1"
+  " -numa node,nodeid=0,cpus=0,memdev=ram0"
+  " -numa node,nodeid=1"
+  " -object acpi-generic-initiator,id=gi0,pci-dev=gidev,node=1"
+  " -numa node,nodeid=2"
+  " -object acpi-generic-port,id=gp0,pci-bus=cxl.1,node=2"
+  " -numa node,nodeid=3,cpus=1"
+  " -numa node,nodeid=4,memdev=ram1"
+  " -numa node,nodeid=5,cpus=2"
+  " -numa hmat-lb,initiator=0,target=0,hierarchy=memory,"
+  "data-type=access-latency,latency=10"
+  " -numa hmat-lb,initiator=0,target=0,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=800M"
+  " -numa hmat-lb,initiator=0,target=2,hierarchy=memory,"
+  "data-type=access-latency,latency=100"
+  " -numa hmat-lb,initiator=0,target=2,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=200M"
+  " -numa hmat-lb,initiator=0,target=4,hierarchy=memory,"
+  "data-type=access-latency,latency=100"
+  " -numa hmat-lb,initiator=0,target=4,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=200M"
+  " -numa hmat-lb,initiator=0,target=5,hierarchy=memory,"
+  "data-type=access-latency,latency=200"
+  " -numa hmat-lb,initiator=0,target=5,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=400M"
+  " -numa hmat-lb,initiator=1,target=0,hierarchy=memory,"
+  "data-type=access-latency,latency=500"
+  " -numa hmat-lb,initiator=1,target=0,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=100M"
+  " -numa hmat-lb,initiator=1,target=2,hierarchy=memory,"
+  "data-type=access-latency,latency=50"
+  " -numa hmat-lb,initiator=1,target=2,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=400M"
+  " -numa hmat-lb,initiator=1,target=4,hierarchy=memory,"
+  "data-type=access-latency,latency=50"
+  " -numa hmat-lb,initiator=1,target=4,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=800M"
+  " -numa hmat-lb,initiator=1,target=5,hierarchy=memory,"
+  "data-type=access-latency,latency=500"
+  " -numa hmat-lb,initiator=1,target=5,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=100M"
+  " -numa hmat-lb,initiator=3,target=0,hierarchy=memory,"
+  "data-type=access-latency,latency=20"
+  " -numa hmat-lb,initiator=3,target=0,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=400M"
+  " -numa hmat-lb,initiator=3,target=2,hierarchy=memory,"
+  "data-type=access-latency,latency=80"
+  " -numa hmat-lb,initiator=3,target=2,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=200M"
+  " -numa hmat-lb,initiator=3,target=4,hierarchy=memory,"
+  "data-type=access-latency,latency=80"
+  " -numa hmat-lb,initiator=3,target=4,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=200M"
+  " -numa hmat-lb,initiator=3,target=5,hierarchy=memory,"
+  "data-type=access-latency,latency=20"
+  " -numa hmat-lb,initiator=3,target=5,hierarchy=memory,"
+  "data-type=access-bandwidth,bandwidth=400M"
+ 

[PATCH 4/6] bios-tables-test: Allow for new acpihmat-generic-x test data.

2024-04-03 Thread Jonathan Cameron via
The test to be added exercises many corners of the SRAT and HMAT
table generation.

Signed-off-by: Jonathan Cameron 
---
 tests/qtest/bios-tables-test-allowed-diff.h | 5 +
 tests/data/acpi/q35/APIC.acpihmat-generic-x | 0
 tests/data/acpi/q35/CEDT.acpihmat-generic-x | 0
 tests/data/acpi/q35/DSDT.acpihmat-generic-x | 0
 tests/data/acpi/q35/HMAT.acpihmat-generic-x | 0
 tests/data/acpi/q35/SRAT.acpihmat-generic-x | 0
 6 files changed, 5 insertions(+)

diff --git a/tests/qtest/bios-tables-test-allowed-diff.h 
b/tests/qtest/bios-tables-test-allowed-diff.h
index dfb8523c8b..a5aa801c99 100644
--- a/tests/qtest/bios-tables-test-allowed-diff.h
+++ b/tests/qtest/bios-tables-test-allowed-diff.h
@@ -1 +1,6 @@
 /* List of comma-separated changed AML files to ignore */
+"tests/data/acpi/q35/APIC.acpihmat-generic-x",
+"tests/data/acpi/q35/CEDT.acpihmat-generic-x",
+"tests/data/acpi/q35/DSDT.acpihmat-generic-x",
+"tests/data/acpi/q35/HMAT.acpihmat-generic-x",
+"tests/data/acpi/q35/SRAT.acpihmat-generic-x",
diff --git a/tests/data/acpi/q35/APIC.acpihmat-generic-x 
b/tests/data/acpi/q35/APIC.acpihmat-generic-x
new file mode 100644
index 00..e69de29bb2
diff --git a/tests/data/acpi/q35/CEDT.acpihmat-generic-x 
b/tests/data/acpi/q35/CEDT.acpihmat-generic-x
new file mode 100644
index 00..e69de29bb2
diff --git a/tests/data/acpi/q35/DSDT.acpihmat-generic-x 
b/tests/data/acpi/q35/DSDT.acpihmat-generic-x
new file mode 100644
index 00..e69de29bb2
diff --git a/tests/data/acpi/q35/HMAT.acpihmat-generic-x 
b/tests/data/acpi/q35/HMAT.acpihmat-generic-x
new file mode 100644
index 00..e69de29bb2
diff --git a/tests/data/acpi/q35/SRAT.acpihmat-generic-x 
b/tests/data/acpi/q35/SRAT.acpihmat-generic-x
new file mode 100644
index 00..e69de29bb2
-- 
2.39.2




[PATCH 3/6] hw/acpi: Generic Port Affinity Structure support

2024-04-03 Thread Jonathan Cameron via
These are very similar to the recently added Generic Initiators
but instead of representing an initiator of memory traffic they
represent an edge point beyond which may lie either targets or
initiators.  Here we add these ports such that they may
be targets of hmat_lb records to describe the latency and
bandwidth from host side initiators to the port.  A discoverable
mechanism such as UEFI CDAT read from CXL devices and switches
is used to discover the remainder of the path and the OS can build
up full latency and bandwidth numbers as needed for work and data
placement decisions.

Signed-off-by: Jonathan Cameron 
---
 qapi/qom.json|  18 +++
 include/hw/acpi/acpi_generic_initiator.h |  18 ++-
 include/hw/pci/pci_bridge.h  |   1 +
 hw/acpi/acpi_generic_initiator.c | 141 +--
 hw/pci-bridge/pci_expander_bridge.c  |   1 -
 5 files changed, 141 insertions(+), 38 deletions(-)

diff --git a/qapi/qom.json b/qapi/qom.json
index 85e6b4f84a..5480d9ca24 100644
--- a/qapi/qom.json
+++ b/qapi/qom.json
@@ -826,6 +826,22 @@
   'data': { 'pci-dev': 'str',
 'node': 'uint32' } }
 
+
+##
+# @AcpiGenericPortProperties:
+#
+# Properties for acpi-generic-port objects.
+#
+# @pci-bus: PCI bus of the hostbridge associated with this SRAT entry
+#
+# @node: numa node associated with the PCI device
+#
+# Since: 9.1
+##
+{ 'struct': 'AcpiGenericPortProperties',
+  'data': { 'pci-bus': 'str',
+'node': 'uint32' } }
+
 ##
 # @RngProperties:
 #
@@ -944,6 +960,7 @@
 { 'enum': 'ObjectType',
   'data': [
 'acpi-generic-initiator',
+'acpi-generic-port',
 'authz-list',
 'authz-listfile',
 'authz-pam',
@@ -1016,6 +1033,7 @@
   'discriminator': 'qom-type',
   'data': {
   'acpi-generic-initiator': 'AcpiGenericInitiatorProperties',
+  'acpi-generic-port':  'AcpiGenericPortProperties',
   'authz-list': 'AuthZListProperties',
   'authz-listfile': 'AuthZListFileProperties',
   'authz-pam':  'AuthZPAMProperties',
diff --git a/include/hw/acpi/acpi_generic_initiator.h 
b/include/hw/acpi/acpi_generic_initiator.h
index 26e2bd92d4..49ac448034 100644
--- a/include/hw/acpi/acpi_generic_initiator.h
+++ b/include/hw/acpi/acpi_generic_initiator.h
@@ -30,6 +30,12 @@ typedef struct AcpiGenericInitiator {
 AcpiGenericNode parent;
 } AcpiGenericInitiator;
 
+#define TYPE_ACPI_GENERIC_PORT "acpi-generic-port"
+
+typedef struct AcpiGenericPort {
+AcpiGenericInitiator parent;
+} AcpiGenericPort;
+
 /*
  * ACPI 6.3:
  * Table 5-81 Flags – Generic Initiator Affinity Structure
@@ -49,8 +55,16 @@ typedef enum {
  * Table 5-80 Device Handle - PCI
  */
 typedef struct PCIDeviceHandle {
-uint16_t segment;
-uint16_t bdf;
+union {
+struct {
+uint16_t segment;
+uint16_t bdf;
+};
+struct {
+uint64_t hid;
+uint32_t uid;
+};
+};
 } PCIDeviceHandle;
 
 void build_srat_generic_pci_initiator(GArray *table_data);
diff --git a/include/hw/pci/pci_bridge.h b/include/hw/pci/pci_bridge.h
index 5cd452115a..5456e24883 100644
--- a/include/hw/pci/pci_bridge.h
+++ b/include/hw/pci/pci_bridge.h
@@ -102,6 +102,7 @@ typedef struct PXBPCIEDev {
 PXBDev parent_obj;
 } PXBPCIEDev;
 
+#define TYPE_PXB_CXL_BUS "pxb-cxl-bus"
 #define TYPE_PXB_DEV "pxb"
 OBJECT_DECLARE_SIMPLE_TYPE(PXBDev, PXB_DEV)
 
diff --git a/hw/acpi/acpi_generic_initiator.c b/hw/acpi/acpi_generic_initiator.c
index c054e0e27d..85191e90ab 100644
--- a/hw/acpi/acpi_generic_initiator.c
+++ b/hw/acpi/acpi_generic_initiator.c
@@ -7,6 +7,7 @@
 #include "hw/acpi/acpi_generic_initiator.h"
 #include "hw/acpi/aml-build.h"
 #include "hw/boards.h"
+#include "hw/pci/pci_bridge.h"
 #include "hw/pci/pci_device.h"
 #include "qemu/error-report.h"
 
@@ -18,6 +19,10 @@ typedef struct AcpiGenericInitiatorClass {
  AcpiGenericNodeClass parent_class;
 } AcpiGenericInitiatorClass;
 
+typedef struct AcpiGenericPortClass {
+AcpiGenericInitiatorClass parent;
+} AcpiGenericPortClass;
+
 OBJECT_DEFINE_ABSTRACT_TYPE(AcpiGenericNode, acpi_generic_node,
 ACPI_GENERIC_NODE, OBJECT)
 
@@ -30,6 +35,13 @@ OBJECT_DEFINE_TYPE_WITH_INTERFACES(AcpiGenericInitiator, 
acpi_generic_initiator,
 
 OBJECT_DECLARE_SIMPLE_TYPE(AcpiGenericInitiator, ACPI_GENERIC_INITIATOR)
 
+OBJECT_DEFINE_TYPE_WITH_INTERFACES(AcpiGenericPort, acpi_generic_port,
+   ACPI_GENERIC_PORT, ACPI_GENERIC_NODE,
+   { TYPE_USER_CREATABLE },
+   { NULL })
+
+OBJECT_DECLARE_SIMPLE_TYPE(AcpiGenericPort, ACPI_GENERIC_PORT)
+
 static void acpi_generic_node_init(Object *obj)
 {
 AcpiGenericNode *gn = ACPI_GENERIC_NODE(obj);
@@ -53,6 +65,14 @@ static void acpi_generic_initiator_finalize(Object *obj)
 {
 }
 
+static void acpi_generic_port_init(Object *obj)
+{
+}
+
+static void acpi_generic_port_finalize(Object *obj)
+{
+}
+
 static 

[PATCH 2/6] hw/acpi: Insert an acpi-generic-node base under acpi-generic-initiator

2024-04-03 Thread Jonathan Cameron via
This will simplify reuse when adding acpi-generic-port.
Note that some error_printf() messages will now print acpi-generic-node
whereas others will move to type specific cases in next patch so
are left alone for now.

Signed-off-by: Jonathan Cameron 
---
 include/hw/acpi/acpi_generic_initiator.h | 15 -
 hw/acpi/acpi_generic_initiator.c | 78 +++-
 2 files changed, 62 insertions(+), 31 deletions(-)

diff --git a/include/hw/acpi/acpi_generic_initiator.h 
b/include/hw/acpi/acpi_generic_initiator.h
index a304bad73e..26e2bd92d4 100644
--- a/include/hw/acpi/acpi_generic_initiator.h
+++ b/include/hw/acpi/acpi_generic_initiator.h
@@ -8,15 +8,26 @@
 
 #include "qom/object_interfaces.h"
 
-#define TYPE_ACPI_GENERIC_INITIATOR "acpi-generic-initiator"
+/*
+ * Abstract type to be used as base for
+ * - acpi-generic-initator
+ * - acpi-generic-port
+ */
+#define TYPE_ACPI_GENERIC_NODE "acpi-generic-node"
 
-typedef struct AcpiGenericInitiator {
+typedef struct AcpiGenericNode {
 /* private */
 Object parent;
 
 /* public */
 char *pci_dev;
 uint16_t node;
+} AcpiGenericNode;
+
+#define TYPE_ACPI_GENERIC_INITIATOR "acpi-generic-initiator"
+
+typedef struct AcpiGenericInitiator {
+AcpiGenericNode parent;
 } AcpiGenericInitiator;
 
 /*
diff --git a/hw/acpi/acpi_generic_initiator.c b/hw/acpi/acpi_generic_initiator.c
index 18a939b0e5..c054e0e27d 100644
--- a/hw/acpi/acpi_generic_initiator.c
+++ b/hw/acpi/acpi_generic_initiator.c
@@ -10,45 +10,61 @@
 #include "hw/pci/pci_device.h"
 #include "qemu/error-report.h"
 
-typedef struct AcpiGenericInitiatorClass {
+typedef struct AcpiGenericNodeClass {
 ObjectClass parent_class;
+} AcpiGenericNodeClass;
+
+typedef struct AcpiGenericInitiatorClass {
+ AcpiGenericNodeClass parent_class;
 } AcpiGenericInitiatorClass;
 
+OBJECT_DEFINE_ABSTRACT_TYPE(AcpiGenericNode, acpi_generic_node,
+ACPI_GENERIC_NODE, OBJECT)
+
+OBJECT_DECLARE_SIMPLE_TYPE(AcpiGenericNode, ACPI_GENERIC_NODE)
+
 OBJECT_DEFINE_TYPE_WITH_INTERFACES(AcpiGenericInitiator, 
acpi_generic_initiator,
-   ACPI_GENERIC_INITIATOR, OBJECT,
+   ACPI_GENERIC_INITIATOR, ACPI_GENERIC_NODE,
{ TYPE_USER_CREATABLE },
{ NULL })
 
 OBJECT_DECLARE_SIMPLE_TYPE(AcpiGenericInitiator, ACPI_GENERIC_INITIATOR)
 
+static void acpi_generic_node_init(Object *obj)
+{
+AcpiGenericNode *gn = ACPI_GENERIC_NODE(obj);
+
+gn->node = MAX_NODES;
+gn->pci_dev = NULL;
+}
+
 static void acpi_generic_initiator_init(Object *obj)
 {
-AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+}
+
+static void acpi_generic_node_finalize(Object *obj)
+{
+AcpiGenericNode *gn = ACPI_GENERIC_NODE(obj);
 
-gi->node = MAX_NODES;
-gi->pci_dev = NULL;
+g_free(gn->pci_dev);
 }
 
 static void acpi_generic_initiator_finalize(Object *obj)
 {
-AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
-
-g_free(gi->pci_dev);
 }
 
-static void acpi_generic_initiator_set_pci_device(Object *obj, const char *val,
-  Error **errp)
+static void acpi_generic_node_set_pci_device(Object *obj, const char *val,
+ Error **errp)
 {
-AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+AcpiGenericNode *gn = ACPI_GENERIC_NODE(obj);
 
-gi->pci_dev = g_strdup(val);
+gn->pci_dev = g_strdup(val);
 }
-
-static void acpi_generic_initiator_set_node(Object *obj, Visitor *v,
-const char *name, void *opaque,
-Error **errp)
+static void acpi_generic_node_set_node(Object *obj, Visitor *v,
+   const char *name, void *opaque,
+   Error **errp)
 {
-AcpiGenericInitiator *gi = ACPI_GENERIC_INITIATOR(obj);
+AcpiGenericNode *gn = ACPI_GENERIC_NODE(obj);
 MachineState *ms = MACHINE(qdev_get_machine());
 uint32_t value;
 
@@ -58,20 +74,24 @@ static void acpi_generic_initiator_set_node(Object *obj, 
Visitor *v,
 
 if (value >= MAX_NODES) {
 error_printf("%s: Invalid NUMA node specified\n",
- TYPE_ACPI_GENERIC_INITIATOR);
+ TYPE_ACPI_GENERIC_NODE);
 exit(1);
 }
 
-gi->node = value;
-ms->numa_state->nodes[gi->node].has_gi = true;
+gn->node = value;
+ms->numa_state->nodes[gn->node].has_gi = true;
 }
 
-static void acpi_generic_initiator_class_init(ObjectClass *oc, void *data)
+static void acpi_generic_node_class_init(ObjectClass *oc, void *data)
 {
 object_class_property_add_str(oc, "pci-dev", NULL,
-acpi_generic_initiator_set_pci_device);
+acpi_generic_node_set_pci_device);
 object_class_property_add(oc, "node", "int", NULL,
-acpi_generic_initiator_set_node, NULL, NULL);
+acpi_generic_node_set_node, NULL, NULL);
+}
+

[PATCH 1/6] hw/acpi/GI: Fix trivial parameter alignment issue.

2024-04-03 Thread Jonathan Cameron via
Before making additional modifications, tidy up this misleading indentation.

Signed-off-by: Jonathan Cameron 
---
 hw/acpi/acpi_generic_initiator.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hw/acpi/acpi_generic_initiator.c b/hw/acpi/acpi_generic_initiator.c
index 17b9a052f5..18a939b0e5 100644
--- a/hw/acpi/acpi_generic_initiator.c
+++ b/hw/acpi/acpi_generic_initiator.c
@@ -132,7 +132,7 @@ static int build_all_acpi_generic_initiators(Object *obj, 
void *opaque)
 
 dev_handle.segment = 0;
 dev_handle.bdf = PCI_BUILD_BDF(pci_bus_num(pci_get_bus(pci_dev)),
-   pci_dev->devfn);
+   pci_dev->devfn);
 
 build_srat_generic_pci_initiator_affinity(table_data,
   gi->node, _handle);
-- 
2.39.2




[PATCH 0/6 qemu] acpi: NUMA nodes for CXL HB as GP + complex NUMA test.

2024-04-03 Thread Jonathan Cameron via
ACPI 6.5 introduced Generic Port Affinity Structures to close a system
description gap that was a problem for CXL memory systems.
It defines a new SRAT Affinity structure (and hence allows creation of an
ACPI Proximity Node which can only be defined via an SRAT structure)
for the boundary between a discoverable fabric and non-discoverable
system interconnects, etc.

The HMAT data on latency and bandwidth is combined with discoverable
information from the CXL bus (link speeds, lane counts) and CXL devices
(switch port to port characteristics and USP to memory, via CDAT tables
read from the device).  QEMU has supported the rest of the elements
of this chain for a while but now the kernel has caught up and we need
the missing element of Generic Ports (this code has been used extensively
in testing and debugging that kernel support, some resulting fixes
currently under review).

Generic Port Affinity Structures are very similar to the recently
added Generic Initiator Affinity Structures (GI) so this series
factors out and reuses much of that infrastructure.
There are subtle differences (beyond the obvious structure ID change).

- The ACPI spec example (and linux kernel support) has a Generic
  Port not as associated with the CXL root port, but rather with
  the CXL Host bridge. As a result, an ACPI handle is used (rather
  than the PCI SBDF option for GIs). In QEMU the easiest way
  to get to this is to target the root bridge PCI Bus, and
  conveniently the root bridge bus number is used for the UID allowing
  us to construct an appropriate entry.

A key addition of this series is a complex NUMA topology example that
stretches the QEMU emulation code for GI, GP and nodes with just
CPUS, just memory, just hot pluggable memory, mixture of memory and CPUs.

A similar test showed up a few NUMA related bugs with fixes applied for
9.0 (note that one of these needs linux booted to identify that it
rejects the HMAT table and this test is a regression test for the
table generation only).

https://lore.kernel.org/qemu-devel/2eb6672cfdaea7dacd8e9bb0523887f13b9f85ce.1710282274.git@redhat.com/
https://lore.kernel.org/qemu-devel/74e2845c5f95b0c139c79233ddb65bb17f2dd679.1710282274.git@redhat.com/

Jonathan Cameron (6):
  hw/acpi/GI: Fix trivial parameter alignment issue.
  hw/acpi: Insert an acpi-generic-node base under acpi-generic-initiator
  hw/acpi: Generic Port Affinity Structure support
  bios-tables-test: Allow for new acpihmat-generic-x test data.
  bios-tables-test: Add complex SRAT / HMAT test for GI GP
  bios-tables-test: Add data for complex numa test (GI, GP etc)

 qapi/qom.json   |  18 ++
 include/hw/acpi/acpi_generic_initiator.h|  33 +++-
 include/hw/pci/pci_bridge.h |   1 +
 hw/acpi/acpi_generic_initiator.c| 199 ++--
 hw/pci-bridge/pci_expander_bridge.c |   1 -
 tests/qtest/bios-tables-test.c  |  92 +
 tests/data/acpi/q35/APIC.acpihmat-generic-x | Bin 0 -> 136 bytes
 tests/data/acpi/q35/CEDT.acpihmat-generic-x | Bin 0 -> 68 bytes
 tests/data/acpi/q35/DSDT.acpihmat-generic-x | Bin 0 -> 10400 bytes
 tests/data/acpi/q35/HMAT.acpihmat-generic-x | Bin 0 -> 360 bytes
 tests/data/acpi/q35/SRAT.acpihmat-generic-x | Bin 0 -> 520 bytes
 11 files changed, 285 insertions(+), 59 deletions(-)
 create mode 100644 tests/data/acpi/q35/APIC.acpihmat-generic-x
 create mode 100644 tests/data/acpi/q35/CEDT.acpihmat-generic-x
 create mode 100644 tests/data/acpi/q35/DSDT.acpihmat-generic-x
 create mode 100644 tests/data/acpi/q35/HMAT.acpihmat-generic-x
 create mode 100644 tests/data/acpi/q35/SRAT.acpihmat-generic-x

-- 
2.39.2




[PATCH v12 20/23] hw/intc/arm_gicv3: Report the NMI interrupt in gicv3_cpuif_update()

2024-04-03 Thread Jinjie Ruan via
In CPU Interface, if the IRQ has the non-maskable property, report NMI to
the corresponding PE.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
Reviewed-by: Peter Maydell 
---
v12:
- Add Reviewed-by.
v10:
- superprio -> nmi.
- Update the commit message, superpriority -> non-maskable.
v6:
- Add Reviewed-by.
v4:
- Swap the ordering of the IFs.
v3:
- Remove handling nmi_is_irq flag.
---
 hw/intc/arm_gicv3_cpuif.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index 526740aa7e..93476f4744 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -1037,6 +1037,7 @@ void gicv3_cpuif_update(GICv3CPUState *cs)
 /* Tell the CPU about its highest priority pending interrupt */
 int irqlevel = 0;
 int fiqlevel = 0;
+int nmilevel = 0;
 ARMCPU *cpu = ARM_CPU(cs->cpu);
 CPUARMState *env = >env;
 
@@ -1075,6 +1076,8 @@ void gicv3_cpuif_update(GICv3CPUState *cs)
 
 if (isfiq) {
 fiqlevel = 1;
+} else if (cs->hppi.nmi) {
+nmilevel = 1;
 } else {
 irqlevel = 1;
 }
@@ -1084,6 +1087,7 @@ void gicv3_cpuif_update(GICv3CPUState *cs)
 
 qemu_set_irq(cs->parent_fiq, fiqlevel);
 qemu_set_irq(cs->parent_irq, irqlevel);
+qemu_set_irq(cs->parent_nmi, nmilevel);
 }
 
 static uint64_t icc_pmr_read(CPUARMState *env, const ARMCPRegInfo *ri)
-- 
2.34.1




[PATCH 1/2] scsi-disk: Introduce the migrate_emulate_scsi_request field

2024-04-03 Thread Hyman Huang
To indicate to the destination whether or not emulated SCSI
requests are sent, introduce the migrate_emulate_scsi_request
in struct SCSIDiskState. It seeks to achieve migration backend
compatibility.

This commit sets the stage for the next one, which addresses
the crash of a VM configured with a CDROM during live migration.

Signed-off-by: Hyman Huang 
---
 hw/scsi/scsi-disk.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 4bd7af9d0c..0985676f73 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -111,6 +111,7 @@ struct SCSIDiskState {
  * 0x- reserved
  */
 uint16_t rotation_rate;
+bool migrate_emulate_scsi_request;
 };
 
 static void scsi_free_request(SCSIRequest *req)
@@ -3133,11 +3134,21 @@ static Property scsi_hd_properties[] = {
 DEFINE_PROP_END_OF_LIST(),
 };
 
+static int scsi_disk_pre_save(void *opaque)
+{
+SCSIDiskState *dev = opaque;
+dev->migrate_emulate_scsi_request = false;
+
+return 0;
+}
+
 static const VMStateDescription vmstate_scsi_disk_state = {
 .name = "scsi-disk",
-.version_id = 1,
+.version_id = 2,
 .minimum_version_id = 1,
+.pre_save = scsi_disk_pre_save,
 .fields = (const VMStateField[]) {
+VMSTATE_BOOL_V(migrate_emulate_scsi_request, SCSIDiskState, 2),
 VMSTATE_SCSI_DEVICE(qdev, SCSIDiskState),
 VMSTATE_BOOL(media_changed, SCSIDiskState),
 VMSTATE_BOOL(media_event, SCSIDiskState),
-- 
2.39.3




[PATCH v12 18/23] hw/intc/arm_gicv3: Handle icv_nmiar1_read() for icc_nmiar1_read()

2024-04-03 Thread Jinjie Ruan via
Implement icv_nmiar1_read() for icc_nmiar1_read(), so add definition for
ICH_LR_EL2.NMI and ICH_AP1R_EL2.NMI bit.

If FEAT_GICv3_NMI is supported, ich_ap_write() should consider ICV_AP1R_EL1.NMI
bit. In icv_activate_irq() and icv_eoir_write(), the ICV_AP1R_EL1.NMI bit
should be set or clear according to the Non-maskable property. And the RPR
priority should also update the NMI bit according to the APR priority NMI bit.

By the way, add gicv3_icv_nmiar1_read trace event.

If the hpp irq is an NMI, the icv iar read should return 1022 and trap for
NMI again.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v12:
- When NMI is 1, the virtual interrupt's priority is 0x0.
- Make the thisnmi logic more concisely in hppvi_index().
- Use is_nmi to simplify the code and check is_nmi before comparing vpmr.
- Remove redundant nmi_support check in ich_highest_active_virt_prio(),
  hppvi_index(), icv_hppi_can_preempt(), icv_rpr_read() and icv_activate_irq().
- Also check sctlrx.NMI in icv_iar_read().
- Check icv_hppi_can_preempt() for icv_nmiar1_read().
- Check ICH_LR_EL2.NMI after check icv_hppi_can_preempt() as icv_iar_read()
  do it in icv_nmiar1_read().
- Fix the comment style in icv_nmiar1_read().
- Correct thisnmi to bool in icv_eoir_write().
- Check thisnmi and nmi both true instead of identical in icv_eoir_write().
v11:
- Deal with NMI in the callers instead of ich_highest_active_virt_prio().
- Set either NMI or a group-priority bit, not both.
- Only set AP NMI bits in the 0 reg.
- Handle NMI in hppvi_index(), icv_hppi_can_preempt() and icv_eoir_write().
v10:
- Rename ICH_AP1R_EL2_NMI to ICV_AP1R_EL1_NMI.
- Add ICV_RPR_EL1_NMI definition.
- Set ICV_RPR_EL1.NMI according to the ICV_AP1R_EL1.NMI in
  ich_highest_active_virt_prio().
v9:
- Correct the INTID_NMI logic.
v8:
- Fix an unexpected interrupt bug when sending VNMI by running qemu VM.
v7:
- Add Reviewed-by.
v6:
- Implement icv_nmiar1_read().
---
 hw/intc/arm_gicv3_cpuif.c | 100 +-
 hw/intc/gicv3_internal.h  |   4 ++
 hw/intc/trace-events  |   1 +
 3 files changed, 93 insertions(+), 12 deletions(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index a5a1ef93ca..526740aa7e 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -157,6 +157,10 @@ static int ich_highest_active_virt_prio(GICv3CPUState *cs)
 int i;
 int aprmax = ich_num_aprs(cs);
 
+if (cs->ich_apr[GICV3_G1NS][0] & ICV_AP1R_EL1_NMI) {
+return 0x0;
+}
+
 for (i = 0; i < aprmax; i++) {
 uint32_t apr = cs->ich_apr[GICV3_G0][i] |
 cs->ich_apr[GICV3_G1NS][i];
@@ -191,6 +195,7 @@ static int hppvi_index(GICv3CPUState *cs)
  * correct behaviour.
  */
 int prio = 0xff;
+bool nmi = false;
 
 if (!(cs->ich_vmcr_el2 & (ICH_VMCR_EL2_VENG0 | ICH_VMCR_EL2_VENG1))) {
 /* Both groups disabled, definitely nothing to do */
@@ -199,6 +204,7 @@ static int hppvi_index(GICv3CPUState *cs)
 
 for (i = 0; i < cs->num_list_regs; i++) {
 uint64_t lr = cs->ich_lr_el2[i];
+bool thisnmi;
 int thisprio;
 
 if (ich_lr_state(lr) != ICH_LR_EL2_STATE_PENDING) {
@@ -217,10 +223,12 @@ static int hppvi_index(GICv3CPUState *cs)
 }
 }
 
+thisnmi = lr & ICH_LR_EL2_NMI;
 thisprio = ich_lr_prio(lr);
 
-if (thisprio < prio) {
+if ((thisprio < prio) || ((thisprio == prio) && (thisnmi & (!nmi {
 prio = thisprio;
+nmi = thisnmi;
 idx = i;
 }
 }
@@ -289,6 +297,7 @@ static bool icv_hppi_can_preempt(GICv3CPUState *cs, 
uint64_t lr)
  * equivalent of these checks.
  */
 int grp;
+bool is_nmi;
 uint32_t mask, prio, rprio, vpmr;
 
 if (!(cs->ich_hcr_el2 & ICH_HCR_EL2_EN)) {
@@ -301,10 +310,11 @@ static bool icv_hppi_can_preempt(GICv3CPUState *cs, 
uint64_t lr)
  */
 
 prio = ich_lr_prio(lr);
+is_nmi = lr & ICH_LR_EL2_NMI;
 vpmr = extract64(cs->ich_vmcr_el2, ICH_VMCR_EL2_VPMR_SHIFT,
  ICH_VMCR_EL2_VPMR_LENGTH);
 
-if (prio >= vpmr) {
+if (!is_nmi && prio >= vpmr) {
 /* Priority mask masks this interrupt */
 return false;
 }
@@ -326,6 +336,11 @@ static bool icv_hppi_can_preempt(GICv3CPUState *cs, 
uint64_t lr)
 return true;
 }
 
+if ((prio & mask) == (rprio & mask) && is_nmi &&
+!(cs->ich_apr[GICV3_G1NS][0] & ICV_AP1R_EL1_NMI)) {
+return true;
+}
+
 return false;
 }
 
@@ -550,7 +565,11 @@ static void icv_ap_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 
 trace_gicv3_icv_ap_write(ri->crm & 1, regno, gicv3_redist_affid(cs), 
value);
 
-cs->ich_apr[grp][regno] = value & 0xU;
+if (cs->gic->nmi_support) {
+cs->ich_apr[grp][regno] = value & (0xU | ICV_AP1R_EL1_NMI);
+} else {
+cs->ich_apr[grp][regno] = value & 0xU;
+}
 
 gicv3_cpuif_virt_irq_fiq_update(cs);
 

[PATCH v12 12/23] target/arm: Handle NMI in arm_cpu_do_interrupt_aarch64()

2024-04-03 Thread Jinjie Ruan via
According to Arm GIC section 4.6.3 Interrupt superpriority, the interrupt
with superpriority is always IRQ, never FIQ, so the NMI exception trap entry
behaves like an IRQ. And VINMI (vIRQ with Superpriority) can be raised from the
GIC or come from the hcrx_el2.HCRX_VINMI bit, VFNMI(vFIQ with Superpriority)
come from the hcrx_el2.HCRX_VFNMI bit.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v9:
- Update the commit message.
- Handle VINMI and VFNMI.
v7:
- Add Reviewed-by.
v6:
- Not combine VFNMI with CPU_INTERRUPT_VNMI.
v4:
- Also handle VNMI in arm_cpu_do_interrupt_aarch64().
v3:
- Remove the FIQ NMI handle.
---
 target/arm/helper.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 65f2ddfa56..0455f20ccc 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -11649,10 +11649,13 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs)
 break;
 case EXCP_IRQ:
 case EXCP_VIRQ:
+case EXCP_NMI:
+case EXCP_VINMI:
 addr += 0x80;
 break;
 case EXCP_FIQ:
 case EXCP_VFIQ:
+case EXCP_VFNMI:
 addr += 0x100;
 break;
 case EXCP_VSERR:
-- 
2.34.1




[PATCH v12 01/23] target/arm: Handle HCR_EL2 accesses for bits introduced with FEAT_NMI

2024-04-03 Thread Jinjie Ruan via
FEAT_NMI defines another three new bits in HCRX_EL2: TALLINT, HCRX_VINMI and
HCRX_VFNMI. When the feature is enabled, allow these bits to be written in
HCRX_EL2.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v12:
- Remove the redundant blank line.
v9:
- Declare cpu variable to reuse latter.
v4:
- Update the comment for FEAT_NMI in hcrx_write().
- Update the commit message, s/thress/three/g.
v3:
- Add Reviewed-by.
- Add HCRX_VINMI and HCRX_VFNMI support in HCRX_EL2.
- Update the commit message.
---
 target/arm/cpu-features.h | 5 +
 target/arm/helper.c   | 8 +++-
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/target/arm/cpu-features.h b/target/arm/cpu-features.h
index e5758d9fbc..b300d0446d 100644
--- a/target/arm/cpu-features.h
+++ b/target/arm/cpu-features.h
@@ -681,6 +681,11 @@ static inline bool isar_feature_aa64_sme(const 
ARMISARegisters *id)
 return FIELD_EX64(id->id_aa64pfr1, ID_AA64PFR1, SME) != 0;
 }
 
+static inline bool isar_feature_aa64_nmi(const ARMISARegisters *id)
+{
+return FIELD_EX64(id->id_aa64pfr1, ID_AA64PFR1, NMI) != 0;
+}
+
 static inline bool isar_feature_aa64_tgran4_lpa2(const ARMISARegisters *id)
 {
 return FIELD_SEX64(id->id_aa64mmfr0, ID_AA64MMFR0, TGRAN4) >= 1;
diff --git a/target/arm/helper.c b/target/arm/helper.c
index 3f3a5b55d4..408922c94d 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -6183,13 +6183,19 @@ bool el_is_in_host(CPUARMState *env, int el)
 static void hcrx_write(CPUARMState *env, const ARMCPRegInfo *ri,
uint64_t value)
 {
+ARMCPU *cpu = env_archcpu(env);
 uint64_t valid_mask = 0;
 
 /* FEAT_MOPS adds MSCEn and MCE2 */
-if (cpu_isar_feature(aa64_mops, env_archcpu(env))) {
+if (cpu_isar_feature(aa64_mops, cpu)) {
 valid_mask |= HCRX_MSCEN | HCRX_MCE2;
 }
 
+/* FEAT_NMI adds TALLINT, VINMI and VFNMI */
+if (cpu_isar_feature(aa64_nmi, cpu)) {
+valid_mask |= HCRX_TALLINT | HCRX_VINMI | HCRX_VFNMI;
+}
+
 /* Clear RES0 bits.  */
 env->cp15.hcrx_el2 = value & valid_mask;
 }
-- 
2.34.1




[PATCH v12 23/23] hw/arm/virt: Add FEAT_GICv3_NMI feature support in virt GIC

2024-04-03 Thread Jinjie Ruan via
A PE that implements FEAT_NMI and FEAT_GICv3 also implements
FEAT_GICv3_NMI. A PE that does not implement FEAT_NMI, does not implement
FEAT_GICv3_NMI.

So include support for the FEAT_GICv3_NMI feature as part of virt platform
GIC initialization if FEAT_NMI and FEAT_GICv3 are supported.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v4:
- Add Reviewed-by.
v3:
- Adjust to be the last after add FEAT_NMI to max.
- Check whether support FEAT_NMI and FEAT_GICv3 for FEAT_GICv3_NMI.
---
 hw/arm/virt.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index ef2e6c2c4d..63d9f5b553 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -729,6 +729,19 @@ static void create_v2m(VirtMachineState *vms)
 vms->msi_controller = VIRT_MSI_CTRL_GICV2M;
 }
 
+/*
+ * A PE that implements FEAT_NMI and FEAT_GICv3 also implements
+ * FEAT_GICv3_NMI. A PE that does not implement FEAT_NMI, does not implement
+ * FEAT_GICv3_NMI.
+ */
+static bool gicv3_nmi_present(VirtMachineState *vms)
+{
+ARMCPU *cpu = ARM_CPU(qemu_get_cpu(0));
+
+return cpu_isar_feature(aa64_nmi, cpu) &&
+   (vms->gic_version != VIRT_GIC_VERSION_2);
+}
+
 static void create_gic(VirtMachineState *vms, MemoryRegion *mem)
 {
 MachineState *ms = MACHINE(vms);
@@ -802,6 +815,11 @@ static void create_gic(VirtMachineState *vms, MemoryRegion 
*mem)
   vms->virt);
 }
 }
+
+if (gicv3_nmi_present(vms)) {
+qdev_prop_set_bit(vms->gic, "has-nmi", true);
+}
+
 gicbusdev = SYS_BUS_DEVICE(vms->gic);
 sysbus_realize_and_unref(gicbusdev, _fatal);
 sysbus_mmio_map(gicbusdev, 0, vms->memmap[VIRT_GIC_DIST].base);
-- 
2.34.1




[PATCH v12 08/23] target/arm: Handle IS/FS in ISR_EL1 for NMI, VINMI and VFNMI

2024-04-03 Thread Jinjie Ruan via
Add IS and FS bit in ISR_EL1 and handle the read. With CPU_INTERRUPT_NMI or
CPU_INTERRUPT_VINMI, both CPSR_I and ISR_IS must be set. With
CPU_INTERRUPT_VFNMI, both CPSR_F and ISR_FS must be set.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v9:
- CPU_INTERRUPT_VNMI -> CPU_INTERRUPT_VINMI.
- Handle CPSR_F and ISR_FS according to CPU_INTERRUPT_VFNMI instead of
  CPU_INTERRUPT_VFIQ and HCRX_EL2.VFNMI.
- Update the commit message.
v7:
- env->cp15.hcrx_el2 -> arm_hcrx_el2_eff().
- Add Reviewed-by.
v6:
- Verify that HCR_EL2.VF is set before checking VFNMI.
v4;
- Also handle VNMI.
v3:
- CPU_INTERRUPT_NMI do not set FIQ, so remove it.
- With CPU_INTERRUPT_NMI, both CPSR_I and ISR_IS must be set.
---
 target/arm/cpu.h|  2 ++
 target/arm/helper.c | 13 +
 2 files changed, 15 insertions(+)

diff --git a/target/arm/cpu.h b/target/arm/cpu.h
index 08a6bc50de..97997dbd08 100644
--- a/target/arm/cpu.h
+++ b/target/arm/cpu.h
@@ -1398,6 +1398,8 @@ void pmu_init(ARMCPU *cpu);
 #define CPSR_N (1U << 31)
 #define CPSR_NZCV (CPSR_N | CPSR_Z | CPSR_C | CPSR_V)
 #define CPSR_AIF (CPSR_A | CPSR_I | CPSR_F)
+#define ISR_FS (1U << 9)
+#define ISR_IS (1U << 10)
 
 #define CPSR_IT (CPSR_IT_0_1 | CPSR_IT_2_7)
 #define CACHED_CPSR_BITS (CPSR_T | CPSR_AIF | CPSR_GE | CPSR_IT | CPSR_Q \
diff --git a/target/arm/helper.c b/target/arm/helper.c
index d9814433e1..0e7eefd7e5 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -2021,16 +2021,29 @@ static uint64_t isr_read(CPUARMState *env, const 
ARMCPRegInfo *ri)
 if (cs->interrupt_request & CPU_INTERRUPT_VIRQ) {
 ret |= CPSR_I;
 }
+if (cs->interrupt_request & CPU_INTERRUPT_VINMI) {
+ret |= ISR_IS;
+ret |= CPSR_I;
+}
 } else {
 if (cs->interrupt_request & CPU_INTERRUPT_HARD) {
 ret |= CPSR_I;
 }
+
+if (cs->interrupt_request & CPU_INTERRUPT_NMI) {
+ret |= ISR_IS;
+ret |= CPSR_I;
+}
 }
 
 if (hcr_el2 & HCR_FMO) {
 if (cs->interrupt_request & CPU_INTERRUPT_VFIQ) {
 ret |= CPSR_F;
 }
+if (cs->interrupt_request & CPU_INTERRUPT_VFNMI) {
+ret |= ISR_FS;
+ret |= CPSR_F;
+}
 } else {
 if (cs->interrupt_request & CPU_INTERRUPT_FIQ) {
 ret |= CPSR_F;
-- 
2.34.1




[PATCH 2/2] scsi-disk: Fix the migration crash of the CDROM device with USB bus

2024-04-03 Thread Hyman Huang
When configuring VMs with the CDROM device using the USB bus
in Libvirt, do as follows:


  
  
  
  
  



The destination Qemu process crashed, causing the VM migration
to fail; the backtrace reveals the following:

Program terminated with signal SIGSEGV, Segmentation fault.
0  __memmove_sse2_unaligned_erms () at 
../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:312
312movq-8(%rsi,%rdx), %rcx
[Current thread is 1 (Thread 0x7f0a9025fc00 (LWP 3286206))]
(gdb) bt
0  __memmove_sse2_unaligned_erms () at 
../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:312
1  memcpy (__len=8, __src=, __dest=) at 
/usr/include/bits/string_fortified.h:34
2  iov_from_buf_full (iov=, iov_cnt=, 
offset=, buf=0x0, bytes=bytes@entry=8) at ../util/iov.c:33
3  iov_from_buf (bytes=8, buf=, offset=, 
iov_cnt=, iov=)
   at 
/usr/src/debug/qemu-6-6.2.0-75.7.oe1.smartx.git.40.x86_64/include/qemu/iov.h:49
4  usb_packet_copy (p=p@entry=0x56066b2fb5a0, ptr=, 
bytes=bytes@entry=8) at ../hw/usb/core.c:636
5  usb_msd_copy_data (s=s@entry=0x56066c62c770, p=p@entry=0x56066b2fb5a0) at 
../hw/usb/dev-storage.c:186
6  usb_msd_handle_data (dev=0x56066c62c770, p=0x56066b2fb5a0) at 
../hw/usb/dev-storage.c:496
7  usb_handle_packet (dev=0x56066c62c770, p=p@entry=0x56066b2fb5a0) at 
../hw/usb/core.c:455
8  uhci_handle_td (s=s@entry=0x56066bd5f210, q=0x56066bb7fbd0, q@entry=0x0, 
qh_addr=qh_addr@entry=902518530, td=td@entry=0x7fffe6e788f0, td_addr=,
   int_mask=int_mask@entry=0x7fffe6e788e4) at ../hw/usb/hcd-uhci.c:885
9  uhci_process_frame (s=s@entry=0x56066bd5f210) at ../hw/usb/hcd-uhci.c:1061
10 uhci_frame_timer (opaque=opaque@entry=0x56066bd5f210) at 
../hw/usb/hcd-uhci.c:1159
11 timerlist_run_timers (timer_list=0x56066af26bd0) at ../util/qemu-timer.c:642
12 qemu_clock_run_timers (type=QEMU_CLOCK_VIRTUAL) at ../util/qemu-timer.c:656
13 qemu_clock_run_all_timers () at ../util/qemu-timer.c:738
14 main_loop_wait (nonblocking=nonblocking@entry=0) at ../util/main-loop.c:542
15 qemu_main_loop () at ../softmmu/runstate.c:739
16 main (argc=, argv=, envp=) at 
../softmmu/main.c:52
(gdb) frame 5
(gdb) p ((SCSIDiskReq *)s->req)->iov
$1 = {iov_base = 0x0, iov_len = 0}
(gdb) p/x s->req->tag
$2 = 0x472

The scsi commands that the CDROM issued are wrapped as the
payload of the USB protocol in Qemu's implementation of a
USB mass storage device, which is used to implement a
CDROM device that uses a USB bus.

In general, the USB controller processes SCSI commands in
two phases. Sending the OUT USB package that encapsulates
the SCSI command is the first stage; scsi-disk would handle
this by emulating the SCSI operation. Receiving the IN USB
package containing the SCSI operation's output is the second
stage. Additionally, the SCSI request tag tracks the request
during the procedure.

Since QEMU did not migrate the in-flight SCSI request, the
output of the SCSI command may be lost if the live migration is
initiated between the two previously mentioned steps.

In our scenario, the SCSI command is GET_EVENT_STATUS_NOTIFICATION,
the QEMU log information below demonstrates how the SCSI command
is being handled (first step) on the source:

usb_packet_state_change bus 0, port 2, ep 2, packet 0x559f9ba14b00, state undef 
-> setup
usb_msd_cmd_submit lun 0, tag 0x472, flags 0x0080, len 10, data-len 8

After migration, the VM crashed as soon as the destination's UHCI
controller began processing the remaining portion of the SCSI
request (second step)! Here is how the QEMU logged out:

usb_packet_state_change bus 0, port 2, ep 1, packet 0x56066b2fb5a0, state undef 
-> setup
usb_msd_data_in 8/8 (scsi 8)
shutting down, reason=crashed

To summarize, the missing scsi request during a live migration
may cause a VM configured with a CDROM to crash.

Migrating the SCSI request that the scsi-disk is handling is
the simple approach, assuming that it actually exists.

Signed-off-by: Hyman Huang 
---
 hw/scsi/scsi-disk.c | 24 +++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index 0985676f73..d6e9d9e8d4 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -160,6 +160,16 @@ static void scsi_disk_save_request(QEMUFile *f, 
SCSIRequest *req)
 }
 }
 
+static void scsi_disk_emulate_save_request(QEMUFile *f, SCSIRequest *req)
+{
+SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
+SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
+
+if (s->migrate_emulate_scsi_request) {
+scsi_disk_save_request(f, req);
+}
+}
+
 static void scsi_disk_load_request(QEMUFile *f, SCSIRequest *req)
 {
 SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
@@ -183,6 +193,16 @@ static void scsi_disk_load_request(QEMUFile *f, 
SCSIRequest *req)
 qemu_iovec_init_external(>qiov, >iov, 1);
 }
 
+static void scsi_disk_emulate_load_request(QEMUFile *f, SCSIRequest *req)
+{
+SCSIDiskReq *r = DO_UPCAST(SCSIDiskReq, req, req);
+SCSIDiskState *s = DO_UPCAST(SCSIDiskState, 

[PATCH v12 14/23] hw/intc/arm_gicv3: Add irq non-maskable property

2024-04-03 Thread Jinjie Ruan via
A SPI, PPI or SGI interrupt can have non-maskable property. So maintain
non-maskable property in PendingIrq and GICR/GICD. Since add new device
state, it also needs to be migrated, so also save NMI info in
vmstate_gicv3_cpu and vmstate_gicv3.

Signed-off-by: Jinjie Ruan 
Acked-by: Richard Henderson 
Reviewed-by: Peter Maydell 
---
v12:
- nmi_needed -> gicv3_cpu_nmi_needed.
- needed_nmi -> gicv3_nmi_needed.
- Add Reviewed-by.
v11:
- Put vmstate_gicv3_cpu_nmi and vmstate_gicv3_gicd_nmi into existing list.
- Remove the excess != 0.
v10:
- superprio -> nmi, gicr_isuperprio -> gicr_inmir0.
- Save NMI state in vmstate_gicv3_cpu and vmstate_gicv3.
- Update the commit message.
v3:
- Place this ahead of implement GICR_INMIR.
- Add Acked-by.
---
 hw/intc/arm_gicv3_common.c | 38 ++
 include/hw/intc/arm_gicv3_common.h |  4 
 2 files changed, 42 insertions(+)

diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c
index 2d2cea6858..9810558b07 100644
--- a/hw/intc/arm_gicv3_common.c
+++ b/hw/intc/arm_gicv3_common.c
@@ -164,6 +164,24 @@ const VMStateDescription vmstate_gicv3_gicv4 = {
 }
 };
 
+static bool gicv3_cpu_nmi_needed(void *opaque)
+{
+GICv3CPUState *cs = opaque;
+
+return cs->gic->nmi_support;
+}
+
+static const VMStateDescription vmstate_gicv3_cpu_nmi = {
+.name = "arm_gicv3_cpu/nmi",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = gicv3_cpu_nmi_needed,
+.fields = (const VMStateField[]) {
+VMSTATE_UINT32(gicr_inmir0, GICv3CPUState),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static const VMStateDescription vmstate_gicv3_cpu = {
 .name = "arm_gicv3_cpu",
 .version_id = 1,
@@ -196,6 +214,7 @@ static const VMStateDescription vmstate_gicv3_cpu = {
 _gicv3_cpu_virt,
 _gicv3_cpu_sre_el1,
 _gicv3_gicv4,
+_gicv3_cpu_nmi,
 NULL
 }
 };
@@ -238,6 +257,24 @@ const VMStateDescription 
vmstate_gicv3_gicd_no_migration_shift_bug = {
 }
 };
 
+static bool gicv3_nmi_needed(void *opaque)
+{
+GICv3State *cs = opaque;
+
+return cs->nmi_support;
+}
+
+const VMStateDescription vmstate_gicv3_gicd_nmi = {
+.name = "arm_gicv3/gicd_nmi",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = gicv3_nmi_needed,
+.fields = (const VMStateField[]) {
+VMSTATE_UINT32_ARRAY(nmi, GICv3State, GICV3_BMP_SIZE),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static const VMStateDescription vmstate_gicv3 = {
 .name = "arm_gicv3",
 .version_id = 1,
@@ -266,6 +303,7 @@ static const VMStateDescription vmstate_gicv3 = {
 },
 .subsections = (const VMStateDescription * const []) {
 _gicv3_gicd_no_migration_shift_bug,
+_gicv3_gicd_nmi,
 NULL
 }
 };
diff --git a/include/hw/intc/arm_gicv3_common.h 
b/include/hw/intc/arm_gicv3_common.h
index 4358c5319c..88533749eb 100644
--- a/include/hw/intc/arm_gicv3_common.h
+++ b/include/hw/intc/arm_gicv3_common.h
@@ -146,6 +146,7 @@ typedef struct {
 int irq;
 uint8_t prio;
 int grp;
+bool nmi;
 } PendingIrq;
 
 struct GICv3CPUState {
@@ -172,6 +173,7 @@ struct GICv3CPUState {
 uint32_t gicr_ienabler0;
 uint32_t gicr_ipendr0;
 uint32_t gicr_iactiver0;
+uint32_t gicr_inmir0;
 uint32_t edge_trigger; /* ICFGR0 and ICFGR1 even bits */
 uint32_t gicr_igrpmodr0;
 uint32_t gicr_nsacr;
@@ -275,6 +277,7 @@ struct GICv3State {
 GIC_DECLARE_BITMAP(active);   /* GICD_ISACTIVER */
 GIC_DECLARE_BITMAP(level);/* Current level */
 GIC_DECLARE_BITMAP(edge_trigger); /* GICD_ICFGR even bits */
+GIC_DECLARE_BITMAP(nmi);  /* GICD_INMIR */
 uint8_t gicd_ipriority[GICV3_MAXIRQ];
 uint64_t gicd_irouter[GICV3_MAXIRQ];
 /* Cached information: pointer to the cpu i/f for the CPUs specified
@@ -314,6 +317,7 @@ GICV3_BITMAP_ACCESSORS(pending)
 GICV3_BITMAP_ACCESSORS(active)
 GICV3_BITMAP_ACCESSORS(level)
 GICV3_BITMAP_ACCESSORS(edge_trigger)
+GICV3_BITMAP_ACCESSORS(nmi)
 
 #define TYPE_ARM_GICV3_COMMON "arm-gicv3-common"
 typedef struct ARMGICv3CommonClass ARMGICv3CommonClass;
-- 
2.34.1




[PATCH v12 21/23] hw/intc/arm_gicv3: Report the VINMI interrupt

2024-04-03 Thread Jinjie Ruan via
In vCPU Interface, if the vIRQ has the non-maskable property, report
vINMI to the corresponding vPE.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
Reviewed-by: Peter Maydell 
---
v12:
- Do not check nmi_support repetitively.
- Add Reviewed-by.
v10:
- Update the commit message, superpriority -> non-maskable.
v9:
- Update the commit subject and message, vNMI -> vINMI.
v6:
- Add Reviewed-by.
---
 hw/intc/arm_gicv3_cpuif.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index 93476f4744..f54b3b45ec 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -480,6 +480,7 @@ void gicv3_cpuif_virt_irq_fiq_update(GICv3CPUState *cs)
 int idx;
 int irqlevel = 0;
 int fiqlevel = 0;
+int nmilevel = 0;
 
 idx = hppvi_index(cs);
 trace_gicv3_cpuif_virt_update(gicv3_redist_affid(cs), idx,
@@ -497,9 +498,17 @@ void gicv3_cpuif_virt_irq_fiq_update(GICv3CPUState *cs)
 uint64_t lr = cs->ich_lr_el2[idx];
 
 if (icv_hppi_can_preempt(cs, lr)) {
-/* Virtual interrupts are simple: G0 are always FIQ, and G1 IRQ */
+/*
+ * Virtual interrupts are simple: G0 are always FIQ, and G1 are
+ * IRQ or NMI which depends on the ICH_LR_EL2.NMI to have
+ * non-maskable property.
+ */
 if (lr & ICH_LR_EL2_GROUP) {
-irqlevel = 1;
+if (lr & ICH_LR_EL2_NMI) {
+nmilevel = 1;
+} else {
+irqlevel = 1;
+}
 } else {
 fiqlevel = 1;
 }
@@ -509,6 +518,7 @@ void gicv3_cpuif_virt_irq_fiq_update(GICv3CPUState *cs)
 trace_gicv3_cpuif_virt_set_irqs(gicv3_redist_affid(cs), fiqlevel, 
irqlevel);
 qemu_set_irq(cs->parent_vfiq, fiqlevel);
 qemu_set_irq(cs->parent_virq, irqlevel);
+qemu_set_irq(cs->parent_vnmi, nmilevel);
 }
 
 static void gicv3_cpuif_virt_update(GICv3CPUState *cs)
-- 
2.34.1




[PATCH v12 06/23] target/arm: Add support for Non-maskable Interrupt

2024-04-03 Thread Jinjie Ruan via
This only implements the external delivery method via the GICv3.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v12:
- Correct the comment style in arm_cpu_initfn().
v10:
- In arm_cpu_exec_interrupt(), if SCTLR_ELx.NMI is 0, NMI -> IRQ,
  VINMI -> VIRQ, VFNMI -> VFIQ.
- Make arm_cpu_update_virq() and arm_cpu_update_vfiq() check that it is not a
  VINMI/VFNMI, so only set 1 bit in interrupt_request, not 2.
v9:
- Update the GPIOs passed in the arm_cpu_kvm_set_irq, and update the comment.
- Definitely not merge VINMI and VFNMI into EXCP_VNMI.
- Update VINMI and VFNMI when writing HCR_EL2 or HCRX_EL2.
v8:
- Fix the rcu stall after sending a VNMI in qemu VM.
v7:
- Add Reviewed-by.
v6:
- env->cp15.hcr_el2 -> arm_hcr_el2_eff().
- env->cp15.hcrx_el2 -> arm_hcrx_el2_eff().
- Not include VF && VFNMI in CPU_INTERRUPT_VNMI.
v4:
- Accept NMI unconditionally for arm_cpu_has_work() but add comment.
- Change from & to && for EXCP_IRQ or EXCP_FIQ.
- Refator nmi mask in arm_excp_unmasked().
- Also handle VNMI in arm_cpu_exec_interrupt() and arm_cpu_set_irq().
- Rename virtual to Virtual.
v3:
- Not include CPU_INTERRUPT_NMI when FEAT_NMI not enabled
- Add ARM_CPU_VNMI.
- Refator nmi mask in arm_excp_unmasked().
- Test SCTLR_ELx.NMI for ALLINT mask for NMI.
---
 target/arm/cpu-qom.h   |   5 +-
 target/arm/cpu.c   | 147 ++---
 target/arm/cpu.h   |   6 ++
 target/arm/helper.c|  33 +++--
 target/arm/internals.h |  18 +
 5 files changed, 193 insertions(+), 16 deletions(-)

diff --git a/target/arm/cpu-qom.h b/target/arm/cpu-qom.h
index 8e032691db..b497667d61 100644
--- a/target/arm/cpu-qom.h
+++ b/target/arm/cpu-qom.h
@@ -36,11 +36,14 @@ DECLARE_CLASS_CHECKERS(AArch64CPUClass, AARCH64_CPU,
 #define ARM_CPU_TYPE_SUFFIX "-" TYPE_ARM_CPU
 #define ARM_CPU_TYPE_NAME(name) (name ARM_CPU_TYPE_SUFFIX)
 
-/* Meanings of the ARMCPU object's four inbound GPIO lines */
+/* Meanings of the ARMCPU object's seven inbound GPIO lines */
 #define ARM_CPU_IRQ 0
 #define ARM_CPU_FIQ 1
 #define ARM_CPU_VIRQ 2
 #define ARM_CPU_VFIQ 3
+#define ARM_CPU_NMI 4
+#define ARM_CPU_VINMI 5
+#define ARM_CPU_VFNMI 6
 
 /* For M profile, some registers are banked secure vs non-secure;
  * these are represented as a 2-element array where the first element
diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index ab8d007a86..d2dfd36fd4 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -122,6 +122,13 @@ void arm_restore_state_to_opc(CPUState *cs,
 }
 #endif /* CONFIG_TCG */
 
+/*
+ * With SCTLR_ELx.NMI == 0, IRQ with Superpriority is masked identically with
+ * IRQ without Superpriority. Moreover, if the GIC is configured so that
+ * FEAT_GICv3_NMI is only set if FEAT_NMI is set, then we won't ever see
+ * CPU_INTERRUPT_*NMI anyway. So we might as well accept NMI here
+ * unconditionally.
+ */
 static bool arm_cpu_has_work(CPUState *cs)
 {
 ARMCPU *cpu = ARM_CPU(cs);
@@ -129,6 +136,7 @@ static bool arm_cpu_has_work(CPUState *cs)
 return (cpu->power_state != PSCI_OFF)
 && cs->interrupt_request &
 (CPU_INTERRUPT_FIQ | CPU_INTERRUPT_HARD
+ | CPU_INTERRUPT_NMI | CPU_INTERRUPT_VINMI | CPU_INTERRUPT_VFNMI
  | CPU_INTERRUPT_VFIQ | CPU_INTERRUPT_VIRQ | CPU_INTERRUPT_VSERR
  | CPU_INTERRUPT_EXITTB);
 }
@@ -668,6 +676,7 @@ static inline bool arm_excp_unmasked(CPUState *cs, unsigned 
int excp_idx,
 CPUARMState *env = cpu_env(cs);
 bool pstate_unmasked;
 bool unmasked = false;
+bool allIntMask = false;
 
 /*
  * Don't take exceptions if they target a lower EL.
@@ -678,13 +687,36 @@ static inline bool arm_excp_unmasked(CPUState *cs, 
unsigned int excp_idx,
 return false;
 }
 
+if (cpu_isar_feature(aa64_nmi, env_archcpu(env)) &&
+env->cp15.sctlr_el[target_el] & SCTLR_NMI && cur_el == target_el) {
+allIntMask = env->pstate & PSTATE_ALLINT ||
+ ((env->cp15.sctlr_el[target_el] & SCTLR_SPINTMASK) &&
+  (env->pstate & PSTATE_SP));
+}
+
 switch (excp_idx) {
+case EXCP_NMI:
+pstate_unmasked = !allIntMask;
+break;
+
+case EXCP_VINMI:
+if (!(hcr_el2 & HCR_IMO) || (hcr_el2 & HCR_TGE)) {
+/* VINMIs are only taken when hypervized.  */
+return false;
+}
+return !allIntMask;
+case EXCP_VFNMI:
+if (!(hcr_el2 & HCR_FMO) || (hcr_el2 & HCR_TGE)) {
+/* VFNMIs are only taken when hypervized.  */
+return false;
+}
+return !allIntMask;
 case EXCP_FIQ:
-pstate_unmasked = !(env->daif & PSTATE_F);
+pstate_unmasked = (!(env->daif & PSTATE_F)) && (!allIntMask);
 break;
 
 case EXCP_IRQ:
-pstate_unmasked = !(env->daif & PSTATE_I);
+pstate_unmasked = (!(env->daif & PSTATE_I)) && (!allIntMask);
 break;
 
 case EXCP_VFIQ:
@@ -692,13 +724,13 @@ static inline bool arm_excp_unmasked(CPUState *cs, 
unsigned 

[PATCH v12 15/23] hw/intc/arm_gicv3_redist: Implement GICR_INMIR0

2024-04-03 Thread Jinjie Ruan via
Add GICR_INMIR0 register and support access GICR_INMIR0.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
Reviewed-by: Peter Maydell 
---
v11:
- Add new Reviewed-by.
v10:
- gicr_isuperprio -> gicr_inmir0.
v6:
- Add Reviewed-by.
v4:
- Make the GICR_INMIR0 implementation more clearer.
---
 hw/intc/arm_gicv3_redist.c | 19 +++
 hw/intc/gicv3_internal.h   |  1 +
 2 files changed, 20 insertions(+)

diff --git a/hw/intc/arm_gicv3_redist.c b/hw/intc/arm_gicv3_redist.c
index 8153525849..ed1f9d1e44 100644
--- a/hw/intc/arm_gicv3_redist.c
+++ b/hw/intc/arm_gicv3_redist.c
@@ -35,6 +35,15 @@ static int gicr_ns_access(GICv3CPUState *cs, int irq)
 return extract32(cs->gicr_nsacr, irq * 2, 2);
 }
 
+static void gicr_write_bitmap_reg(GICv3CPUState *cs, MemTxAttrs attrs,
+  uint32_t *reg, uint32_t val)
+{
+/* Helper routine to implement writing to a "set" register */
+val &= mask_group(cs, attrs);
+*reg = val;
+gicv3_redist_update(cs);
+}
+
 static void gicr_write_set_bitmap_reg(GICv3CPUState *cs, MemTxAttrs attrs,
   uint32_t *reg, uint32_t val)
 {
@@ -406,6 +415,10 @@ static MemTxResult gicr_readl(GICv3CPUState *cs, hwaddr 
offset,
 *data = value;
 return MEMTX_OK;
 }
+case GICR_INMIR0:
+*data = cs->gic->nmi_support ?
+gicr_read_bitmap_reg(cs, attrs, cs->gicr_inmir0) : 0;
+return MEMTX_OK;
 case GICR_ICFGR0:
 case GICR_ICFGR1:
 {
@@ -555,6 +568,12 @@ static MemTxResult gicr_writel(GICv3CPUState *cs, hwaddr 
offset,
 gicv3_redist_update(cs);
 return MEMTX_OK;
 }
+case GICR_INMIR0:
+if (cs->gic->nmi_support) {
+gicr_write_bitmap_reg(cs, attrs, &cs->gicr_inmir0, value);
+}
+return MEMTX_OK;
+
 case GICR_ICFGR0:
 /* Register is all RAZ/WI or RAO/WI bits */
 return MEMTX_OK;
diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h
index 8f4ebed2f4..21697ecf39 100644
--- a/hw/intc/gicv3_internal.h
+++ b/hw/intc/gicv3_internal.h
@@ -110,6 +110,7 @@
 #define GICR_ICFGR1   (GICR_SGI_OFFSET + 0x0C04)
 #define GICR_IGRPMODR0(GICR_SGI_OFFSET + 0x0D00)
 #define GICR_NSACR(GICR_SGI_OFFSET + 0x0E00)
+#define GICR_INMIR0   (GICR_SGI_OFFSET + 0x0F80)
 
 /* VLPI redistributor registers, offsets from VLPI_base */
 #define GICR_VPROPBASER   (GICR_VLPI_OFFSET + 0x70)
-- 
2.34.1




[PATCH v12 10/23] hw/arm/virt: Wire NMI and VINMI irq lines from GIC to CPU

2024-04-03 Thread Jinjie Ruan via
Wire the new NMI and VINMI interrupt line from the GIC to each CPU.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v9:
- Rename ARM_CPU_VNMI to ARM_CPU_VINMI.
- Update the commit message.
v4:
- Add Reviewed-by.
v3:
- Also add VNMI wire.
---
 hw/arm/virt.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index a9a913aead..ef2e6c2c4d 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -821,7 +821,8 @@ static void create_gic(VirtMachineState *vms, MemoryRegion 
*mem)
 
 /* Wire the outputs from each CPU's generic timer and the GICv3
  * maintenance interrupt signal to the appropriate GIC PPI inputs,
- * and the GIC's IRQ/FIQ/VIRQ/VFIQ interrupt outputs to the CPU's inputs.
+ * and the GIC's IRQ/FIQ/VIRQ/VFIQ/NMI/VINMI interrupt outputs to the
+ * CPU's inputs.
  */
 for (i = 0; i < smp_cpus; i++) {
 DeviceState *cpudev = DEVICE(qemu_get_cpu(i));
@@ -865,6 +866,10 @@ static void create_gic(VirtMachineState *vms, MemoryRegion 
*mem)
qdev_get_gpio_in(cpudev, ARM_CPU_VIRQ));
 sysbus_connect_irq(gicbusdev, i + 3 * smp_cpus,
qdev_get_gpio_in(cpudev, ARM_CPU_VFIQ));
+sysbus_connect_irq(gicbusdev, i + 4 * smp_cpus,
+   qdev_get_gpio_in(cpudev, ARM_CPU_NMI));
+sysbus_connect_irq(gicbusdev, i + 5 * smp_cpus,
+   qdev_get_gpio_in(cpudev, ARM_CPU_VINMI));
 }
 
 fdt_add_gic_node(vms);
-- 
2.34.1




[PATCH v12 13/23] hw/intc/arm_gicv3: Add has-nmi property to GICv3 device

2024-04-03 Thread Jinjie Ruan via
Add a property has-nmi to the GICv3 device, and use this to set
the NMI bit in the GICD_TYPER register. This isn't visible to
guests yet because the property defaults to false and we won't
set it in the board code until we've landed all of the changes
needed to implement FEAT_GICV3_NMI.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
Reviewed-by: Peter Maydell 
---
v12:
- Update the subject and commit message.
- Add Reviewed-by.
v10:
- Adjust to before add irq non-maskable property.
v4:
- Add Reviewed-by.
---
 hw/intc/arm_gicv3_common.c | 1 +
 hw/intc/arm_gicv3_dist.c   | 2 ++
 hw/intc/gicv3_internal.h   | 1 +
 include/hw/intc/arm_gicv3_common.h | 1 +
 4 files changed, 5 insertions(+)

diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c
index c52f060026..2d2cea6858 100644
--- a/hw/intc/arm_gicv3_common.c
+++ b/hw/intc/arm_gicv3_common.c
@@ -569,6 +569,7 @@ static Property arm_gicv3_common_properties[] = {
 DEFINE_PROP_UINT32("num-irq", GICv3State, num_irq, 32),
 DEFINE_PROP_UINT32("revision", GICv3State, revision, 3),
 DEFINE_PROP_BOOL("has-lpi", GICv3State, lpi_enable, 0),
+DEFINE_PROP_BOOL("has-nmi", GICv3State, nmi_support, 0),
 DEFINE_PROP_BOOL("has-security-extensions", GICv3State, security_extn, 0),
 /*
  * Compatibility property: force 8 bits of physical priority, even
diff --git a/hw/intc/arm_gicv3_dist.c b/hw/intc/arm_gicv3_dist.c
index 35e850685c..22ddc0d666 100644
--- a/hw/intc/arm_gicv3_dist.c
+++ b/hw/intc/arm_gicv3_dist.c
@@ -389,6 +389,7 @@ static bool gicd_readl(GICv3State *s, hwaddr offset,
  *  by GICD_TYPER.IDbits)
  * MBIS == 0 (message-based SPIs not supported)
  * SecurityExtn == 1 if security extns supported
+ * NMI = 1 if Non-maskable interrupt property is supported
  * CPUNumber == 0 since for us ARE is always 1
  * ITLinesNumber == (((max SPI IntID + 1) / 32) - 1)
  */
@@ -402,6 +403,7 @@ static bool gicd_readl(GICv3State *s, hwaddr offset,
 bool dvis = s->revision >= 4;
 
 *data = (1 << 25) | (1 << 24) | (dvis << 18) | (sec_extn << 10) |
+(s->nmi_support << GICD_TYPER_NMI_SHIFT) |
 (s->lpi_enable << GICD_TYPER_LPIS_SHIFT) |
 (0xf << 19) | itlinesnumber;
 return true;
diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h
index 29d5cdc1b6..8f4ebed2f4 100644
--- a/hw/intc/gicv3_internal.h
+++ b/hw/intc/gicv3_internal.h
@@ -68,6 +68,7 @@
 #define GICD_CTLR_E1NWF (1U << 7)
 #define GICD_CTLR_RWP   (1U << 31)
 
+#define GICD_TYPER_NMI_SHIFT   9
 #define GICD_TYPER_LPIS_SHIFT  17
 
 /* 16 bits EventId */
diff --git a/include/hw/intc/arm_gicv3_common.h 
b/include/hw/intc/arm_gicv3_common.h
index 7324c7d983..4358c5319c 100644
--- a/include/hw/intc/arm_gicv3_common.h
+++ b/include/hw/intc/arm_gicv3_common.h
@@ -249,6 +249,7 @@ struct GICv3State {
 uint32_t num_irq;
 uint32_t revision;
 bool lpi_enable;
+bool nmi_support;
 bool security_extn;
 bool force_8bit_prio;
 bool irq_reset_nonsecure;
-- 
2.34.1




[PATCH v12 09/23] target/arm: Handle PSTATE.ALLINT on taking an exception

2024-04-03 Thread Jinjie Ruan via
Set or clear PSTATE.ALLINT on taking an exception to ELx according to the
SCTLR_ELx.SPINTMASK bit.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v9:
- Not check SCTLR_NMI in arm_cpu_do_interrupt_aarch64().
v3:
- Add Reviewed-by.
---
 target/arm/helper.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 0e7eefd7e5..65f2ddfa56 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -11729,6 +11729,14 @@ static void arm_cpu_do_interrupt_aarch64(CPUState *cs)
 }
 }
 
+if (cpu_isar_feature(aa64_nmi, cpu)) {
+if (!(env->cp15.sctlr_el[new_el] & SCTLR_SPINTMASK)) {
+new_mode |= PSTATE_ALLINT;
+} else {
+new_mode &= ~PSTATE_ALLINT;
+}
+}
+
 pstate_write(env, PSTATE_DAIF | new_mode);
 env->aarch64 = true;
 aarch64_restore_sp(env, new_el);
-- 
2.34.1




[PATCH v12 17/23] hw/intc/arm_gicv3: Add NMI handling CPU interface registers

2024-04-03 Thread Jinjie Ruan via
Add the NMIAR CPU interface registers which deal with acknowledging NMI.

When introduce NMI interrupt, there are some updates to the semantics for the
register ICC_IAR1_EL1 and ICC_HPPIR1_EL1. For ICC_IAR1_EL1 register, it
should return 1022 if the intid has non-maskable property. And for
ICC_NMIAR1_EL1 register, it should return 1023 if the intid does not have the
non-maskable property. However, these are not necessary for the ICC_HPPIR1_EL1
register.

And the APR and RPR has NMI bits which should be handled correctly.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v12:
- pPriority<63> = ICC_AP1R_EL1NS<63> if HaveNMIExt() and HaveEL(EL3) and
  (IsNonSecure()); fix the wrong writing.
- Do not check nmi_support repetitively in icc_hppi_can_preempt()
  and icc_activate_irq().
- Check hppi.nmi after check icc_hppi_can_preempt() for icc_iar1_read() and
  icc_nmiar1_read().
v11:
- Handle NMI priority in icc_highest_active_prio() and handle NMI RPR in
  icc_rpr_read() separately.
- Only set NMI bit for a NMI and and ordinary priority bit for a non-NMI in
  icc_activate_irq().
- Only clear APR bit for AP1R0 in icc_drop_prio().
- Check special INTID_* in callers instead of passing two extra boolean args
  for ack functions.
- Handle NMI in icc_hppi_can_preempt() and icc_highest_active_group().
- Also check icc_hppi_can_preempt() for icc_nmiar1_read().
v10:
- is_nmi -> nmi.
- is_hppi -> hppi.
- Exchange the order of nmi and hppi parameters.
- superprio -> nmi.
- Handle APR and RPR NMI bits.
- Update the commit message, super priority -> non-maskable property.
v7:
- Add Reviewed-by.
v4:
- Define ICC_NMIAR1_EL1 only if FEAT_GICv3_NMI is implemented.
- Check sctrl_elx.SCTLR_NMI to return 1022 for icc_iar1_read().
- Add gicv3_icc_nmiar1_read() trace event.
- Do not check icc_hppi_can_preempt() for icc_nmiar1_read().
- Add icv_nmiar1_read() and call it when EL2Enabled() and HCR_EL2.IMO == '1'
---
 hw/intc/arm_gicv3_cpuif.c | 137 --
 hw/intc/gicv3_internal.h  |   5 ++
 hw/intc/trace-events  |   1 +
 3 files changed, 138 insertions(+), 5 deletions(-)

diff --git a/hw/intc/arm_gicv3_cpuif.c b/hw/intc/arm_gicv3_cpuif.c
index e1a60d8c15..a5a1ef93ca 100644
--- a/hw/intc/arm_gicv3_cpuif.c
+++ b/hw/intc/arm_gicv3_cpuif.c
@@ -795,6 +795,13 @@ static uint64_t icv_iar_read(CPUARMState *env, const 
ARMCPRegInfo *ri)
 return intid;
 }
 
+static uint64_t icv_nmiar1_read(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+/* todo */
+uint64_t intid = INTID_SPURIOUS;
+return intid;
+}
+
 static uint32_t icc_fullprio_mask(GICv3CPUState *cs)
 {
 /*
@@ -832,6 +839,23 @@ static int icc_highest_active_prio(GICv3CPUState *cs)
  */
 int i;
 
+if (cs->gic->nmi_support) {
+/*
+ * If an NMI is active this takes precedence over anything else
+ * for priority purposes; the NMI bit is only in the AP1R0 bit.
+ * We return here the effective priority of the NMI, which is
+ * either 0x0 or 0x80. Callers will need to check NMI again for
+ * purposes of either setting the RPR register bits or for
+ * prioritization of NMI vs non-NMI.
+ */
+if (cs->icc_apr[GICV3_G1][0] & ICC_AP1R_EL1_NMI) {
+return 0;
+}
+if (cs->icc_apr[GICV3_G1NS][0] & ICC_AP1R_EL1_NMI) {
+return (cs->gic->gicd_ctlr & GICD_CTLR_DS) ? 0 : 0x80;
+}
+}
+
 for (i = 0; i < icc_num_aprs(cs); i++) {
 uint32_t apr = cs->icc_apr[GICV3_G0][i] |
 cs->icc_apr[GICV3_G1][i] | cs->icc_apr[GICV3_G1NS][i];
@@ -898,12 +922,24 @@ static bool icc_hppi_can_preempt(GICv3CPUState *cs)
  */
 int rprio;
 uint32_t mask;
+ARMCPU *cpu = ARM_CPU(cs->cpu);
+CPUARMState *env = &cpu->env;
 
 if (icc_no_enabled_hppi(cs)) {
 return false;
 }
 
-if (cs->hppi.prio >= cs->icc_pmr_el1) {
+if (cs->hppi.nmi) {
+if (!(cs->gic->gicd_ctlr & GICD_CTLR_DS) &&
+cs->hppi.grp == GICV3_G1NS) {
+if (cs->icc_pmr_el1 < 0x80) {
+return false;
+}
+if (arm_is_secure(env) && cs->icc_pmr_el1 == 0x80) {
+return false;
+}
+}
+} else if (cs->hppi.prio >= cs->icc_pmr_el1) {
 /* Priority mask masks this interrupt */
 return false;
 }
@@ -923,6 +959,12 @@ static bool icc_hppi_can_preempt(GICv3CPUState *cs)
 return true;
 }
 
+if (cs->hppi.nmi && (cs->hppi.prio & mask) == (rprio & mask)) {
+if (!(cs->icc_apr[cs->hppi.grp][0] & ICC_AP1R_EL1_NMI)) {
+return true;
+}
+}
+
 return false;
 }
 
@@ -1044,8 +1086,13 @@ static void icc_activate_irq(GICv3CPUState *cs, int irq)
 int aprbit = prio >> (8 - cs->prebits);
 int regno = aprbit / 32;
 int regbit = aprbit % 32;
+bool nmi = cs->hppi.nmi;
 
-cs->icc_apr[cs->hppi.grp][regno] |= (1 << regbit);
+if (nmi) {
+

[PATCH v12 16/23] hw/intc/arm_gicv3: Implement GICD_INMIR

2024-04-03 Thread Jinjie Ruan via
Add GICD_INMIR, GICD_INMIRnE register and support access GICD_INMIR0.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
Reviewed-by: Peter Maydell 
---
v11:
- Add new Reviewed-by.
v10:
- superprio -> nmi.
v4:
- Make the GICD_INMIR implementation more clearer.
- Update the commit message.
v3:
- Add Reviewed-by.
---
 hw/intc/arm_gicv3_dist.c | 34 ++
 hw/intc/gicv3_internal.h |  2 ++
 2 files changed, 36 insertions(+)

diff --git a/hw/intc/arm_gicv3_dist.c b/hw/intc/arm_gicv3_dist.c
index 22ddc0d666..d8207acb22 100644
--- a/hw/intc/arm_gicv3_dist.c
+++ b/hw/intc/arm_gicv3_dist.c
@@ -89,6 +89,29 @@ static int gicd_ns_access(GICv3State *s, int irq)
 return extract32(s->gicd_nsacr[irq / 16], (irq % 16) * 2, 2);
 }
 
+static void gicd_write_bitmap_reg(GICv3State *s, MemTxAttrs attrs,
+  uint32_t *bmp, maskfn *maskfn,
+  int offset, uint32_t val)
+{
+/*
+ * Helper routine to implement writing to a "set" register
+ * (GICD_INMIR, etc).
+ * Semantics implemented here:
+ * RAZ/WI for SGIs, PPIs, unimplemented IRQs
+ * Bits corresponding to Group 0 or Secure Group 1 interrupts RAZ/WI.
+ * offset should be the offset in bytes of the register from the start
+ * of its group.
+ */
+int irq = offset * 8;
+
+if (irq < GIC_INTERNAL || irq >= s->num_irq) {
+return;
+}
+val &= mask_group_and_nsacr(s, attrs, maskfn, irq);
+*gic_bmp_ptr32(bmp, irq) = val;
+gicv3_update(s, irq, 32);
+}
+
 static void gicd_write_set_bitmap_reg(GICv3State *s, MemTxAttrs attrs,
   uint32_t *bmp,
   maskfn *maskfn,
@@ -545,6 +568,11 @@ static bool gicd_readl(GICv3State *s, hwaddr offset,
 /* RAZ/WI since affinity routing is always enabled */
 *data = 0;
 return true;
+case GICD_INMIR ... GICD_INMIR + 0x7f:
+*data = (!s->nmi_support) ? 0 :
+gicd_read_bitmap_reg(s, attrs, s->nmi, NULL,
+ offset - GICD_INMIR);
+return true;
 case GICD_IROUTER ... GICD_IROUTER + 0x1fdf:
 {
 uint64_t r;
@@ -754,6 +782,12 @@ static bool gicd_writel(GICv3State *s, hwaddr offset,
 case GICD_SPENDSGIR ... GICD_SPENDSGIR + 0xf:
 /* RAZ/WI since affinity routing is always enabled */
 return true;
+case GICD_INMIR ... GICD_INMIR + 0x7f:
+if (s->nmi_support) {
+gicd_write_bitmap_reg(s, attrs, s->nmi, NULL,
+  offset - GICD_INMIR, value);
+}
+return true;
 case GICD_IROUTER ... GICD_IROUTER + 0x1fdf:
 {
 uint64_t r;
diff --git a/hw/intc/gicv3_internal.h b/hw/intc/gicv3_internal.h
index 21697ecf39..8d793243f4 100644
--- a/hw/intc/gicv3_internal.h
+++ b/hw/intc/gicv3_internal.h
@@ -52,6 +52,8 @@
 #define GICD_SGIR0x0F00
 #define GICD_CPENDSGIR   0x0F10
 #define GICD_SPENDSGIR   0x0F20
+#define GICD_INMIR   0x0F80
+#define GICD_INMIRnE 0x3B00
 #define GICD_IROUTER 0x6000
 #define GICD_IDREGS  0xFFD0
 
-- 
2.34.1




[PATCH v12 03/23] target/arm: Add support for FEAT_NMI, Non-maskable Interrupt

2024-04-03 Thread Jinjie Ruan via
Add support for FEAT_NMI. NMI (FEAT_NMI) is a mandatory feature in
ARMv8.8-A and ARMv9.3-A.

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v3:
- Add Reviewed-by.
- Adjust to before the MSR patches.
---
 target/arm/internals.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/target/arm/internals.h b/target/arm/internals.h
index dd3da211a3..516e0584bf 100644
--- a/target/arm/internals.h
+++ b/target/arm/internals.h
@@ -1229,6 +1229,9 @@ static inline uint32_t aarch64_pstate_valid_mask(const 
ARMISARegisters *id)
 if (isar_feature_aa64_mte(id)) {
 valid |= PSTATE_TCO;
 }
+if (isar_feature_aa64_nmi(id)) {
+valid |= PSTATE_ALLINT;
+}
 
 return valid;
 }
-- 
2.34.1




[PATCH v12 05/23] target/arm: Support MSR access to ALLINT

2024-04-03 Thread Jinjie Ruan via
Support ALLINT msr access as follow:
mrs <Xt>, ALLINT    // read allint
msr ALLINT, #<imm>  // write allint with imm

Signed-off-by: Jinjie Ruan 
Reviewed-by: Richard Henderson 
---
v9:
- Move nmi_reginfo and related functions inside an existing ifdef
  TARGET_AARCH64 to solve the --target-list=aarch64-softmmu,arm-softmmu
  compilation problem.
- Check 'isread' when writing to ALLINT.
v5:
- Add Reviewed-by.
v4:
- Remove arm_is_el2_enabled() check in allint_check().
- Change to env->pstate instead of env->allint.
v3:
- Remove EL0 check in aa64_allint_access() which already checks in .access
  PL1_RW.
- Use arm_hcrx_el2_eff() in aa64_allint_access() instead of env->cp15.hcrx_el2.
- Make ALLINT msr access function controlled by aa64_nmi.
---
 target/arm/helper.c | 35 +++
 1 file changed, 35 insertions(+)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index 408922c94d..5ed3eacbea 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -7496,6 +7496,37 @@ static const ARMCPRegInfo rme_mte_reginfo[] = {
   .opc0 = 1, .opc1 = 6, .crn = 7, .crm = 14, .opc2 = 5,
   .access = PL3_W, .type = ARM_CP_NOP },
 };
+
+static void aa64_allint_write(CPUARMState *env, const ARMCPRegInfo *ri,
+  uint64_t value)
+{
+env->pstate = (env->pstate & ~PSTATE_ALLINT) | (value & PSTATE_ALLINT);
+}
+
+static uint64_t aa64_allint_read(CPUARMState *env, const ARMCPRegInfo *ri)
+{
+return env->pstate & PSTATE_ALLINT;
+}
+
+static CPAccessResult aa64_allint_access(CPUARMState *env,
+ const ARMCPRegInfo *ri, bool isread)
+{
+if (!isread && arm_current_el(env) == 1 &&
+(arm_hcrx_el2_eff(env) & HCRX_TALLINT)) {
+return CP_ACCESS_TRAP_EL2;
+}
+return CP_ACCESS_OK;
+}
+
+static const ARMCPRegInfo nmi_reginfo[] = {
+{ .name = "ALLINT", .state = ARM_CP_STATE_AA64,
+  .opc0 = 3, .opc1 = 0, .opc2 = 0, .crn = 4, .crm = 3,
+  .type = ARM_CP_NO_RAW,
+  .access = PL1_RW, .accessfn = aa64_allint_access,
+  .fieldoffset = offsetof(CPUARMState, pstate),
+  .writefn = aa64_allint_write, .readfn = aa64_allint_read,
+  .resetfn = arm_cp_reset_ignore },
+};
 #endif /* TARGET_AARCH64 */
 
 static void define_pmu_regs(ARMCPU *cpu)
@@ -9890,6 +9921,10 @@ void register_cp_regs_for_features(ARMCPU *cpu)
 if (cpu_isar_feature(aa64_nv2, cpu)) {
 define_arm_cp_regs(cpu, nv2_reginfo);
 }
+
+if (cpu_isar_feature(aa64_nmi, cpu)) {
+define_arm_cp_regs(cpu, nmi_reginfo);
+}
 #endif
 
 if (cpu_isar_feature(any_predinv, cpu)) {
-- 
2.34.1




  1   2   >