Re: [PATCH] qemu/bitops.h: Locate changed bits

2024-05-28 Thread Wang, Lei
On 5/29/2024 12:59, Tong Ho wrote:
> Add inlined functions to obtain a mask of changed bits.  3 flavors
> are added: toggled, changed to 1, changed to 0.
> 
> These newly added utilities aid common device behaviors where
> actions are taken only when a register's bit(s) are changed.
> 
> Signed-off-by: Tong Ho 
> ---
>  include/qemu/bitops.h | 33 +
>  1 file changed, 33 insertions(+)
> 
> diff --git a/include/qemu/bitops.h b/include/qemu/bitops.h
> index 2c0a2fe751..7a701474ea 100644
> --- a/include/qemu/bitops.h
> +++ b/include/qemu/bitops.h
> @@ -148,6 +148,39 @@ static inline int test_bit(long nr, const unsigned long *addr)
>  return 1UL & (addr[BIT_WORD(nr)] >> (nr & (BITS_PER_LONG-1)));
>  }
>  
> +/**
> + * find_bits_changed - Returns a mask of bits changed.
> + * @ref_bits: the reference bits against which the test is made.
> + * @chk_bits: the bits to be checked.
> + */
> +static inline unsigned long find_bits_changed(unsigned long ref_bits,
> +  unsigned long chk_bits)
> +{
> +return ref_bits ^ chk_bits;
> +}
> +
> +/**
> + * find_bits_to_1 - Returns a mask of bits changed from 0 to 1.
> + * @ref_bits: the reference bits against which the test is made.
> + * @chk_bits: the bits to be checked.
> + */
> +static inline unsigned long find_bits_to_1(unsigned long ref_bits,
> +   unsigned long chk_bits)
> +{
> +return find_bits_changed(ref_bits, chk_bits) & chk_bits;
> +}
> +
> +/**
> + * find_bits_to_0 - Returns a mask of bits changed from 1 to 0.
> + * @ref_bits: the reference bits against which the test is made.
> + * @chk_bits: the bits to be checked.
> + */
> +static inline unsigned long find_bits_to_0(unsigned long ref_bits,
> +   unsigned long chk_bits)
> +{
> +return find_bits_to_1(chk_bits, ref_bits);
> +}
> +
>  /**
>   * find_last_bit - find the last set bit in a memory region
>   * @addr: The address to start the search at

Reviewed-by: Lei Wang 
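For archive readers, a quick sketch of the usage pattern these helpers enable
in a device model's register-write path (device and mask names are
hypothetical, not part of the patch):

#include "qemu/bitops.h"

#define IRQ_ENABLE_MASK 0x1UL

typedef struct MyDeviceState {
    unsigned long ctrl;   /* last value written to the CTRL register */
} MyDeviceState;

static void mydev_ctrl_write(MyDeviceState *s, unsigned long val)
{
    unsigned long rose = find_bits_to_1(s->ctrl, val);  /* went 0 -> 1 */
    unsigned long fell = find_bits_to_0(s->ctrl, val);  /* went 1 -> 0 */

    s->ctrl = val;

    if (rose & IRQ_ENABLE_MASK) {
        /* act only on the enable edge, not on every write */
    }
    if (fell & IRQ_ENABLE_MASK) {
        /* interrupt enable was just cleared */
    }
}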



Re: [PATCH 1/1] prealloc: add truncate mode for prealloc filter

2024-05-28 Thread Wang, Lei
On 5/1/2024 1:05, Denis V. Lunev via wrote:
> Preallocate filter allows to implement really interesting setups.
> 
> Assume that we have
> * shared block device, f.e. iSCSI LUN, implemented with some HW device
> * clustered LVM on top of it
> * QCOW2 image stored inside LVM volume
> 
> This allows very cheap clustered setups with all QCOW2 features intact.
> Currently supported setups using QCOW2 with data_file option are not
> so cool as snapshots are not allowed, QCOW2 should be placed into some
> additional distributed storage and so on.
> 
> Though QCOW2 inside LVM volume has a drawback. The image is growing and
> in order to accommodate that image LVM volume is to be resized. This
> could be done externally using ENOSPACE event/condition but this is
> cumbersome.
> 
> This patch introduces native implementation for such a setup. We should
> just put prealloc filter in between QCOW2 format and file nodes. In that
> case LVM will be resized at proper moment and that is done effectively
> as resizing is done in chunks.
> 
> The patch adds allocation mode for this purpose in order to distinguish
> 'fallocate' for ordinary file system and 'truncate'.
> 
> Signed-off-by: Denis V. Lunev 
> CC: Alexander Ivanov 
> CC: Kevin Wolf 
> CC: Hanna Reitz 
> CC: Vladimir Sementsov-Ogievskiy 
> ---
>  block/preallocate.c | 50 +++--
>  1 file changed, 48 insertions(+), 2 deletions(-)
> 
> diff --git a/block/preallocate.c b/block/preallocate.c
> index 4d82125036..6d31627325 100644
> --- a/block/preallocate.c
> +++ b/block/preallocate.c
> @@ -33,10 +33,24 @@
>  #include "block/block-io.h"
>  #include "block/block_int.h"
>  
> +typedef enum PreallocateMode {
> +PREALLOCATE_MODE_FALLOCATE = 0,
> +PREALLOCATE_MODE_TRUNCATE = 1,
> +PREALLOCATE_MODE__MAX = 2,
> +} PreallocateMode;
> +
> +static QEnumLookup prealloc_mode_lookup = {
> +.array = (const char *const[]) {
> +"falloc",
> +"truncate",
> +},
> +.size = PREALLOCATE_MODE__MAX,
> +};
>  
>  typedef struct PreallocateOpts {
>  int64_t prealloc_size;
>  int64_t prealloc_align;
> +PreallocateMode prealloc_mode;
>  } PreallocateOpts;
>  
>  typedef struct BDRVPreallocateState {
> @@ -79,6 +93,7 @@ typedef struct BDRVPreallocateState {
>  
>  #define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align"
>  #define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size"
> +#define PREALLOCATE_OPT_MODE "mode"

Why not keep the names consistent? I mean:

#define PREALLOCATE_OPT_PREALLOC_MODE "prealloc-mode"
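For illustration, with the option as posted ("mode"), the LVM setup from the
commit message would be expressed roughly like this (node names and device
path are made up):

-blockdev driver=host_device,node-name=lvm,filename=/dev/vg0/vm0 \
-blockdev driver=preallocate,node-name=prealloc,file=lvm,mode=truncate,prealloc-size=134217728 \
-blockdev driver=qcow2,node-name=disk,file=prealloc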

>  static QemuOptsList runtime_opts = {
>  .name = "preallocate",
>  .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
> @@ -94,7 +109,14 @@ static QemuOptsList runtime_opts = {
>  .type = QEMU_OPT_SIZE,
>  .help = "how much to preallocate, default 128M",
>  },
> -{ /* end of list */ }
> +{
> +.name = PREALLOCATE_OPT_MODE,
> +.type = QEMU_OPT_STRING,
> +.help = "Preallocation mode on image expansion "
> +"(allowed values: falloc, truncate)",
> +.def_value_str = "falloc",
> +},
> +{ /* end of list */ },
>  },
>  };
>  
> @@ -102,6 +124,8 @@ static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
>  BlockDriverState *child_bs, Error **errp)
>  {
>  QemuOpts *opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
> +Error *local_err = NULL;
> +char *buf;
>  
>  if (!qemu_opts_absorb_qdict(opts, options, errp)) {
>  return false;
> @@ -112,6 +136,17 @@ static bool preallocate_absorb_opts(PreallocateOpts *dest, QDict *options,
>  dest->prealloc_size =
>  qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB);
>  
> +buf = qemu_opt_get_del(opts, PREALLOCATE_OPT_MODE);
> +/* prealloc_mode can be downgraded later during allocate_clusters */
> +dest->prealloc_mode = qapi_enum_parse(&prealloc_mode_lookup, buf,
> +  PREALLOCATE_MODE_FALLOCATE,
> +  &local_err);
> +g_free(buf);
> +if (local_err != NULL) {
> +error_propagate(errp, local_err);
> +return false;
> +}
> +
>  qemu_opts_del(opts);
>  
>  if (!QEMU_IS_ALIGNED(dest->prealloc_align, BDRV_SECTOR_SIZE)) {
> @@ -335,9 +370,20 @@ handle_write(BlockDriverState *bs, int64_t offset, int64_t bytes,
>  
>  want_merge_zero = want_merge_zero && (prealloc_start <= offset);
>  
> -ret = bdrv_co_pwrite_zeroes(
> +switch (s->opts.prealloc_mode) {
> +case PREALLOCATE_MODE_FALLOCATE:
> +ret = bdrv_co_pwrite_zeroes(
>  bs->file, prealloc_start, prealloc_end - prealloc_start,
>  BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT);
> +break;
> +case PREALLOCATE_MODE_TRUNCATE:
> +ret = bdrv_co_truncate(bs->file, prealloc_end, false,
> +  

Re: Missing features in QEMU SapphireRapid definition

2024-04-23 Thread Wang, Lei
Hi Manish,

Thanks for the information. It seems those 3 features are indeed missing from
the SPR CPU model definition. We are currently polishing a new version of the
SPR CPU model; in the meantime, you can launch it with:

-cpu SapphireRapids,+cldemote,+movdiri,+movdir64b

as a workaround. Sorry for the inconvenience:)
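To double-check that the workaround took effect, the flags can be inspected
from inside a Linux guest, e.g.:

grep -o -w -e cldemote -e movdiri -e movdir64b /proc/cpuinfo | sort | uniq -c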

Thanks,
Lei

On 4/23/2024 14:57, Manish Mishra wrote:
> Hi Everyone,
> 
> We see few features like movdiri, movdiri64b, cldemote are present on
> SapphireRapid nodes and are also mentioned as supported in intel manual.  But
> these are missing from the QEMU definition of SapphireRapid CPUs
> https://github.com/qemu/qemu/commit/7eb061b06e97af9a8da7f31b839d78997ae737fc
> .
> There may be other features too missing but we were particularly interested in
> these three. So just wanted to understand if there is any reason for this.  
> Any
> help on this will be really appreciated. 
> 
> Thanks
> 
> Manish Mishra
> 
>  
> 



Re: [PATCH v2] migration/postcopy: ensure preempt channel is ready before loading states

2024-04-06 Thread Wang, Lei
On 4/6/2024 5:53, Peter Xu wrote:
> On Fri, Apr 05, 2024 at 11:40:56AM +0800, Wei Wang wrote:
>> Before loading the guest states, ensure that the preempt channel has been
>> ready to use, as some of the states (e.g. via virtio_load) might trigger
>> page faults that will be handled through the preempt channel. So yield to
>> the main thread in the case that the channel create event hasn't been
>> dispatched.
>>
>> Originally-by: Lei Wang 
>> Link: 
>> https://lore.kernel.org/all/9aa5d1be-7801-40dd-83fd-f7e041ced...@intel.com/T/
>> Suggested-by: Peter Xu 
> 
> The current version doesn't have any of my credits. :) Thanks, but I'll
> just drop it to reflect reality, so we keep the credit to the right ones.
> 
>> Signed-off-by: Lei Wang 
>> Signed-off-by: Wei Wang 
>> ---
>>  migration/savevm.c | 17 +
>>  1 file changed, 17 insertions(+)
>>
>> diff --git a/migration/savevm.c b/migration/savevm.c
>> index 388d7af7cd..63f9991a8a 100644
>> --- a/migration/savevm.c
>> +++ b/migration/savevm.c
>> @@ -2342,6 +2342,23 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
>>  
>>  QEMUFile *packf = qemu_file_new_input(QIO_CHANNEL(bioc));
>>  
>> +/*
>> + * Before loading the guest states, ensure that the preempt channel has
>> + * been ready to use, as some of the states (e.g. via virtio_load) might
>> + * trigger page faults that will be handled through the preempt channel.
>> + * So yield to the main thread in the case that the channel create event
>> + * hasn't been dispatched.
> 
> I'll squash below into it.  If any of you disagree please shoot: valid
> until this Sunday.

I'm OK with it.

> 
> + * TODO: if we can move migration loadvm out of main thread, then we
> + * won't block main thread from polling the accept() fds.  We can drop
> + * this as a whole when that is done.
> 
> Huge thanks to both!

Thank you Peter :)

> 
>> + */
>> +do {
>> +if (!migrate_postcopy_preempt() || !qemu_in_coroutine() ||
>> +mis->postcopy_qemufile_dst) {
>> +break;
>> +}
>> +
>> +aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
>> +qemu_coroutine_yield();
>> +} while (1);
>> +
>>  ret = qemu_loadvm_state_main(packf, mis);
>>  trace_loadvm_handle_cmd_packaged_main(ret);
>>  qemu_fclose(packf);
>> -- 
>> 2.27.0
>>
> 



Re: [PATCH v1] migration/postcopy: ensure preempt channel is ready before loading states

2024-04-04 Thread Wang, Lei
On 4/5/2024 0:25, Wang, Wei W wrote:
> On Thursday, April 4, 2024 10:12 PM, Peter Xu wrote:
>> On Thu, Apr 04, 2024 at 06:05:50PM +0800, Wei Wang wrote:
>>> Before loading the guest states, ensure that the preempt channel has
>>> been ready to use, as some of the states (e.g. via virtio_load) might
>>> trigger page faults that will be handled through the preempt channel.
>>> So yield to the main thread in the case that the channel create event
>>> has been dispatched.
>>>
>>> Originally-by: Lei Wang 
>>> Link:
>>> https://lore.kernel.org/all/9aa5d1be-7801-40dd-83fd-f7e041ced249@intel
>>> .com/T/
>>> Suggested-by: Peter Xu 
>>> Signed-off-by: Lei Wang 
>>> Signed-off-by: Wei Wang 
>>> ---
>>>  migration/savevm.c | 17 +
>>>  1 file changed, 17 insertions(+)
>>>
>>> diff --git a/migration/savevm.c b/migration/savevm.c index
>>> 388d7af7cd..fbc9f2bdd4 100644
>>> --- a/migration/savevm.c
>>> +++ b/migration/savevm.c
>>> @@ -2342,6 +2342,23 @@ static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
>>>
>>>  QEMUFile *packf = qemu_file_new_input(QIO_CHANNEL(bioc));
>>>
>>> +/*
>>> + * Before loading the guest states, ensure that the preempt channel has
>>> + * been ready to use, as some of the states (e.g. via virtio_load) 
>>> might
>>> + * trigger page faults that will be handled through the preempt 
>>> channel.
>>> + * So yield to the main thread in the case that the channel create 
>>> event
>>> + * has been dispatched.
>>> + */
>>> +do {
>>> +if (!migrate_postcopy_preempt() || !qemu_in_coroutine() ||
>>> +mis->postcopy_qemufile_dst) {
>>> +break;
>>> +}
>>> +
>>> +aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
>>> +qemu_coroutine_yield();
>>> +} while (!qemu_sem_timedwait(&mis->postcopy_qemufile_dst_done, 1));
>>
>> I think we need s/!// here, so the same mistake I made?  I think we need to
>> rework the retval of qemu_sem_timedwait() at some point later..
> 
> No. qemu_sem_timedwait returns false when timeout, which means sem isn’t 
> posted yet.
> So it needs to go back to the loop. (the patch was tested)

On timeout, qemu_sem_timedwait() returns -1. I think the patch may have passed
your test because there is always at least one yield (the first yield in the
do ... while) in loadvm_handle_cmd_packaged()?
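For reference, the corrected wait loop would look roughly like this (sketch
only, reusing the calls from the patch):

do {
    /* let the main loop run so the channel-create event gets dispatched */
    aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
    qemu_coroutine_yield();
    /* qemu_sem_timedwait(): 0 = acquired, -1 = timed out; loop on timeout */
} while (qemu_sem_timedwait(&mis->postcopy_qemufile_dst_done, 1) != 0);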

> 
>>
>> Besides, this patch kept the sem_wait() in postcopy_preempt_thread() so it
>> will wait() on this sem again.  If this qemu_sem_timedwait() accidentally
>> consumed the sem count then I think the other thread can hang forever?
> 
> I can get the issue you mentioned, and seems better to be placed before the 
> creation of
> the preempt thread. Then we probably don’t need to wait_sem in the preempt 
> thread, as the
> channel is guaranteed to be ready when it runs?
> 
> Update will be:
> 
> diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
> index eccff499cb..5a70ce4f23 100644
> --- a/migration/postcopy-ram.c
> +++ b/migration/postcopy-ram.c
> @@ -1254,6 +1254,15 @@ int postcopy_ram_incoming_setup(MigrationIncomingState *mis)
>  }
> 
>  if (migrate_postcopy_preempt()) {
> +do {
> +if (!migrate_postcopy_preempt() || !qemu_in_coroutine() ||
> +mis->postcopy_qemufile_dst) {
> +break;
> +}
> +aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
> +qemu_coroutine_yield();
> +} while (!qemu_sem_timedwait(&mis->postcopy_qemufile_dst_done, 1));
> +
>  /*
>   * This thread needs to be created after the temp pages because
>   * it'll fetch RAM_CHANNEL_POSTCOPY PostcopyTmpPage immediately.
> @@ -1743,12 +1752,6 @@ void *postcopy_preempt_thread(void *opaque)
> 
>  qemu_sem_post(&mis->thread_sync_sem);
> 
> -/*
> - * The preempt channel is established in asynchronous way.  Wait
> - * for its completion.
> - */
> -qemu_sem_wait(&mis->postcopy_qemufile_dst_done);
> 
> 
> 
> 
> 
> 
> 



Re: [PATCH] migration: Yield coroutine when receiving MIG_CMD_POSTCOPY_LISTEN

2024-04-03 Thread Wang, Lei
On 4/3/2024 5:39, Peter Xu wrote:
> I'm not 100% sure such thing (no matter here or moved into it, which
> does look cleaner) would work for us.
>
> The problem is I still don't yet see an ordering restricted on top of
> (1)
> accept() happens, and (2) receive LISTEN cmd here.  What happens if
> the
> accept() request is not yet received when reaching LISTEN?  Or is it
> always guaranteed the accept(fd) will always be polled here?
>
> For example, the source QEMU (no matter pre-7.2 or later) will always
> setup the preempt channel asynchrounously, then IIUC it can connect()
> after sending the whole chunk of packed data which should include this
> LISTEN.  I think it means it's not guaranteed this will 100% work, but
> maybe further reduce the possibility of the race.

 I think the following code:

 postcopy_start() ->
postcopy_preempt_establish_channel() ->
    qemu_sem_wait(&s->postcopy_qemufile_src_sem);

 can guarantee that the connect() syscall is successful so the destination 
 side
 receives the connect() request before it loads the LISTEN command, 
 otherwise
 it won't post the sem:

 postcopy_preempt_send_channel_new() ->
postcopy_preempt_send_channel_done() ->
    qemu_sem_post(&s->postcopy_qemufile_src_sem);

>>>
>>> Yes. But as mentioned in another thread, connect() and accept() are async.
>>> So in theory accept() could still come later after the LISTEN command.
>>
>> IIUC accept() is the callback and will be triggered by the connect() event.
>>
>> The reason accept() is not called in the destination is the main loop doesn't
>> get a chance to handle other events (connect()), so if we can guarantee
>> connect() is before LISTEN, then when handling the LISTEN cmd, we yield to 
>> the
>> main loop and the connect() event is there, then we can handle it by calling 
>> the
>> accept():
>>
>> qio_net_listener_channel_func
>>  qio_channel_socket_accept
>>  qemu_accept
>>  accept
>>
>> so it seems the case where accept() comes later, after LISTEN, is within our expectation?
> 
> The major thing uncertain to me is "accept() will return with a valid fd"
> on dest host is not guaranteed to order against anything.
> 
> For example, I won't be surprised if a kernel implementation provides an
> async model of "accept()" syscall, so that even if the other side returned
> with "connect()", the "accept()" can still fetch nothing if the async model
> will need a delay for the new channel to be finally delivered to the
> "accept()" thread context.  It just sounds like tricky to rely on such
> thing.
> 
> What I proposed below shouldn't rely on any kernel details on how accept()
> could be implemented, it simply waits for the fd to be created before doing
> anything else (including creating the preempt thread and process packed
> data).

Thanks for the detailed explanation!

> 
>>
>>>
>
> One right fix that I can think of is moving the sem_wait() into
> the main thread too, so we wait for the sem _before_ reading the
> packed data, so there's no chance of fault.  However I don't think
> sem_wait() will be smart enough to yield when in a coroutine..  In the
> long term run I think we should really make migration loadvm to do
> work in the thread rather than the main thread.  I think it means we
> have one more example to be listed in this todo so that's preferred..
>
> https://wiki.qemu.org/ToDo/LiveMigration#Create_a_thread_for_migration
> _destination
>
> I attached such draft patch below, but I'm not sure it'll work.  Let
> me know how both of you think about it.

 Sadly it doesn't work, there is an unknown segfault.
> 
> Could you paste the stack of the segfault ("(gdb) thread apply all bt")?
> Or help to figure out what is wrong?
> 
> Since I cannot reproduce myself, I may need your help debugging this.  If
> you agree with what I said above and agree on such fix, please also feel
> free to go ahead and fix the segfault.

We should change the following line from

while (!qemu_sem_timedwait(&mis->postcopy_qemufile_dst_done, 100)) {

to

while (qemu_sem_timedwait(&mis->postcopy_qemufile_dst_done, 100)) {

After that fix, the test passed and there was no segfault.

Given that the test shows a yield to the main loop won't introduce much
overhead (<1ms), how about yielding unconditionally first, then entering the
while loop to wait for several ms and yield periodically?
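Roughly like this (sketch only; the 100ms interval is illustrative):

/* yield once unconditionally so the main loop can dispatch the
 * pending preempt-channel connect() event at least once */
aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
qemu_coroutine_yield();

/* then poll the semaphore, yielding between attempts */
while (qemu_sem_timedwait(&mis->postcopy_qemufile_dst_done, 100)) {
    aio_co_schedule(qemu_get_current_aio_context(), qemu_coroutine_self());
    qemu_coroutine_yield();
}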



Re: [PATCH] migration: Yield coroutine when receiving MIG_CMD_POSTCOPY_LISTEN

2024-04-02 Thread Wang, Lei
On 4/2/2024 15:25, Wang, Wei W wrote:
> On Tuesday, April 2, 2024 2:56 PM, Wang, Lei4 wrote:
>> On 4/2/2024 0:13, Peter Xu wrote:
>>> On Fri, Mar 29, 2024 at 08:54:07AM +, Wang, Wei W wrote:
 On Friday, March 29, 2024 11:32 AM, Wang, Lei4 wrote:
> When using the post-copy preemption feature to perform post-copy
> live migration, the below scenario could lead to a deadlock and the
> migration will never finish:
>
>  - Source connect() the preemption channel in postcopy_start().
>  - Source and the destination side TCP stack finished the 3-way handshake
>thus the connection is successful.
>  - The destination side main thread is handling the loading of the bulk
>> RAM
>pages thus it doesn't start to handle the pending connection event in 
> the
>event loop. and doesn't post the semaphore
>> postcopy_qemufile_dst_done for
>the preemption thread.
>  - The source side sends non-iterative device states, such as the virtio
>states.
>  - The destination main thread starts to receive the virtio states, this
>process may lead to a page fault (e.g., 
> virtio_load()->vring_avail_idx()
>may trigger a page fault since the avail ring page may not be received
>yet).
>>>
>>> Ouch.  Yeah I think this part got overlooked when working on the
>>> preempt channel.
>>>
>  - The page request is sent back to the source side. Source sends the page
>content to the destination side preemption thread.
>  - Since the event is not arrived and the semaphore
>postcopy_qemufile_dst_done is not posted, the preemption thread in
>destination side is blocked, and cannot handle receiving the page.
>  - The QEMU main load thread on the destination side is stuck at the page
>fault, and cannot yield and handle the connect() event for the
>preemption channel to unblock the preemption thread.
>  - The postcopy will stuck there forever since this is a deadlock.
>
> The key point to reproduce this bug is that the source side is
> sending pages at a rate faster than the destination handling,
> otherwise, the qemu_get_be64() in
> ram_load_precopy() will have a chance to yield since at that time
> there are no pending data in the buffer to get. This will make this
> bug harder to be reproduced.
>>>
>>> How hard would this reproduce?
>>
>> We can manually make this easier to reproduce by adding the following code
>> to make the destination busier to load the pages:
>>
>> diff --git a/migration/ram.c b/migration/ram.c index 0ad9fbba48..0b42877e1f
>> 100644
>> --- a/migration/ram.c
>> +++ b/migration/ram.c
>> @@ -4232,6 +4232,7 @@ static int ram_load_precopy(QEMUFile *f)  {
>>  MigrationIncomingState *mis = migration_incoming_get_current();
>>  int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
>> +volatile unsigned long long a;
>>
>>  if (!migrate_compress()) {
>>  invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE; @@ -4347,6
>> +4348,7 @@ static int ram_load_precopy(QEMUFile *f)
>>  break;
>>
>>  case RAM_SAVE_FLAG_PAGE:
>> +for (a = 0; a < 1; a++);
>>  qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
>>  break;
>>
> 
> Which version of QEMU are you using?
> I tried with the latest upstream QEMU (e.g. v8.2.0 release, 1600b9f46b1bd), 
> it's
> always reproducible without any changes (with local migration tests).

I'm using the latest tip:

6af9d12c88b9720f209912f6e4b01fefe5906d59

and it cannot be reproduced without the modification.

> 
> 
>>>
>>> I'm thinking whether this should be 9.0 material or 9.1.  It's pretty
>>> late for 9.0 though, but we can still discuss.
>>>
>
> Fix this by yielding the load coroutine when receiving
> MIG_CMD_POSTCOPY_LISTEN so the main event loop can handle the
> connection event before loading the non-iterative devices state to
> avoid the deadlock condition.
>
> Signed-off-by: Lei Wang 

 This seems to be a regression issue caused by this commit:
 737840e2c6ea (migration: Use the number of transferred bytes
 directly)

 Adding qemu_fflush back to migration_rate_exceeded() or
 ram_save_iterate seems to work (might not be a good fix though).

> ---
>  migration/savevm.c | 5 +
>  1 file changed, 5 insertions(+)
>
> diff --git a/migration/savevm.c b/migration/savevm.c index
> e386c5267f..8fd4dc92f2 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -2445,6 +2445,11 @@ static int
>> loadvm_process_command(QEMUFile *f)
>  return loadvm_postcopy_handle_advise(mis, len);
>
>  case MIG_CMD_POSTCOPY_LISTEN:
> +if (migrate_postcopy_preempt() && qemu_in_coroutine()) {
> +aio_co_schedule(qemu_get_current_aio_context(),
> +qemu_coroutine_self());
> 

Re: [PATCH] migration: Yield coroutine when receiving MIG_CMD_POSTCOPY_LISTEN

2024-04-02 Thread Wang, Lei
On 4/2/2024 0:13, Peter Xu wrote:
> On Fri, Mar 29, 2024 at 08:54:07AM +, Wang, Wei W wrote:
>> On Friday, March 29, 2024 11:32 AM, Wang, Lei4 wrote:
>>> When using the post-copy preemption feature to perform post-copy live
>>> migration, the below scenario could lead to a deadlock and the migration 
>>> will
>>> never finish:
>>>
>>>  - Source connect() the preemption channel in postcopy_start().
>>>  - Source and the destination side TCP stack finished the 3-way handshake
>>>thus the connection is successful.
>>>  - The destination side main thread is handling the loading of the bulk RAM
>>>pages thus it doesn't start to handle the pending connection event in the
>>>event loop. and doesn't post the semaphore postcopy_qemufile_dst_done for
>>>the preemption thread.
>>>  - The source side sends non-iterative device states, such as the virtio
>>>states.
>>>  - The destination main thread starts to receive the virtio states, this
>>>process may lead to a page fault (e.g., virtio_load()->vring_avail_idx()
>>>may trigger a page fault since the avail ring page may not be received
>>>yet).
> 
> Ouch.  Yeah I think this part got overlooked when working on the preempt
> channel.
> 
>>>  - The page request is sent back to the source side. Source sends the page
>>>content to the destination side preemption thread.
>>>  - Since the event is not arrived and the semaphore
>>>postcopy_qemufile_dst_done is not posted, the preemption thread in
>>>destination side is blocked, and cannot handle receiving the page.
>>>  - The QEMU main load thread on the destination side is stuck at the page
>>>fault, and cannot yield and handle the connect() event for the
>>>preemption channel to unblock the preemption thread.
>>>  - The postcopy will stuck there forever since this is a deadlock.
>>>
>>> The key point to reproduce this bug is that the source side is sending 
>>> pages at a
>>> rate faster than the destination handling, otherwise, the qemu_get_be64() in
>>> ram_load_precopy() will have a chance to yield since at that time there are 
>>> no
>>> pending data in the buffer to get. This will make this bug harder to be
>>> reproduced.
> 
> How hard would this reproduce?

We can manually make this easier to reproduce by adding the following code to
make the destination busier to load the pages:

diff --git a/migration/ram.c b/migration/ram.c
index 0ad9fbba48..0b42877e1f 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -4232,6 +4232,7 @@ static int ram_load_precopy(QEMUFile *f)
 {
 MigrationIncomingState *mis = migration_incoming_get_current();
 int flags = 0, ret = 0, invalid_flags = 0, len = 0, i = 0;
+volatile unsigned long long a;

 if (!migrate_compress()) {
 invalid_flags |= RAM_SAVE_FLAG_COMPRESS_PAGE;
@@ -4347,6 +4348,7 @@ static int ram_load_precopy(QEMUFile *f)
 break;

 case RAM_SAVE_FLAG_PAGE:
+for (a = 0; a < 1; a++);
 qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
 break;

> 
> I'm thinking whether this should be 9.0 material or 9.1.  It's pretty late
> for 9.0 though, but we can still discuss.
> 
>>>
>>> Fix this by yielding the load coroutine when receiving
>>> MIG_CMD_POSTCOPY_LISTEN so the main event loop can handle the
>>> connection event before loading the non-iterative devices state to avoid the
>>> deadlock condition.
>>>
>>> Signed-off-by: Lei Wang 
>>
>> This seems to be a regression issue caused by this commit:
>> 737840e2c6ea (migration: Use the number of transferred bytes directly)
>>
>> Adding qemu_fflush back to migration_rate_exceeded() or ram_save_iterate
>> seems to work (might not be a good fix though).
>>
>>> ---
>>>  migration/savevm.c | 5 +
>>>  1 file changed, 5 insertions(+)
>>>
>>> diff --git a/migration/savevm.c b/migration/savevm.c index
>>> e386c5267f..8fd4dc92f2 100644
>>> --- a/migration/savevm.c
>>> +++ b/migration/savevm.c
>>> @@ -2445,6 +2445,11 @@ static int loadvm_process_command(QEMUFile *f)
>>>  return loadvm_postcopy_handle_advise(mis, len);
>>>
>>>  case MIG_CMD_POSTCOPY_LISTEN:
>>> +if (migrate_postcopy_preempt() && qemu_in_coroutine()) {
>>> +aio_co_schedule(qemu_get_current_aio_context(),
>>> +qemu_coroutine_self());
>>> +qemu_coroutine_yield();
>>> +}
>>
>> The above could be moved to loadvm_postcopy_handle_listen().
> 
> I'm not 100% sure such thing (no matter here or moved into it, which does
> look cleaner) would work for us.
> 
> The problem is I still don't yet see an ordering restricted on top of (1)
> accept() happens, and (2) receive LISTEN cmd here.  What happens if the
> accept() request is not yet received when reaching LISTEN?  Or is it always
> guaranteed the accept(fd) will always be polled here?
> 
> For example, the source QEMU (no matter pre-7.2 or later) will always setup
> the preempt channel asynchrounously, then IIUC it can 

Re: [PATCH v5 15/65] i386/tdx: Get tdx_capabilities via KVM_TDX_CAPABILITIES

2024-03-18 Thread Wang, Lei
On 2/29/2024 14:36, Xiaoyao Li wrote:
> KVM provides TDX capabilities via sub command KVM_TDX_CAPABILITIES of
> IOCTL(KVM_MEMORY_ENCRYPT_OP). Get the capabilities when initializing
> TDX context. It will be used to validate user's setting later.
> 
> Since there is no interface reporting how many cpuid configs contains in
> KVM_TDX_CAPABILITIES, QEMU chooses to try starting with a known number
> and abort when it exceeds KVM_MAX_CPUID_ENTRIES.
> 
> Besides, introduce the interfaces to invoke TDX "ioctls" at different
> scope (KVM, VM and VCPU) in preparation.

tdx_platform_ioctl() was dropped because it had no user, so I suggest
rephrasing this statement: no KVM-scope ioctl interface is introduced in this
patch anymore.

> 
> Signed-off-by: Xiaoyao Li 
> ---
> Changes in v4:
> - use {} to initialize struct kvm_tdx_cmd, to avoid memset();
> - remove tdx_platform_ioctl() because no user;
> 
> Changes in v3:
> - rename __tdx_ioctl() to tdx_ioctl_internal()
> - Pass errp in get_tdx_capabilities();
> 
> changes in v2:
>   - Make the error message more clear;
> 
> changes in v1:
>   - start from nr_cpuid_configs = 6 for the loop;
>   - stop the loop when nr_cpuid_configs exceeds KVM_MAX_CPUID_ENTRIES;
> ---
>  target/i386/kvm/kvm.c  |  2 -
>  target/i386/kvm/kvm_i386.h |  2 +
>  target/i386/kvm/tdx.c  | 91 +-
>  3 files changed, 92 insertions(+), 3 deletions(-)
> 
> diff --git a/target/i386/kvm/kvm.c b/target/i386/kvm/kvm.c
> index 52d99d30bdc8..0e68e80f4291 100644
> --- a/target/i386/kvm/kvm.c
> +++ b/target/i386/kvm/kvm.c
> @@ -1685,8 +1685,6 @@ static int hyperv_init_vcpu(X86CPU *cpu)
>  
>  static Error *invtsc_mig_blocker;
>  
> -#define KVM_MAX_CPUID_ENTRIES  100
> -
>  static void kvm_init_xsave(CPUX86State *env)
>  {
>  if (has_xsave2) {
> diff --git a/target/i386/kvm/kvm_i386.h b/target/i386/kvm/kvm_i386.h
> index 55fb25fa8e2e..c3ef46a97a7b 100644
> --- a/target/i386/kvm/kvm_i386.h
> +++ b/target/i386/kvm/kvm_i386.h
> @@ -13,6 +13,8 @@
>  
>  #include "sysemu/kvm.h"
>  
> +#define KVM_MAX_CPUID_ENTRIES  100
> +
>  #ifdef CONFIG_KVM
>  
>  #define kvm_pit_in_kernel() \
> diff --git a/target/i386/kvm/tdx.c b/target/i386/kvm/tdx.c
> index d9a1dd46dc69..2b956450a083 100644
> --- a/target/i386/kvm/tdx.c
> +++ b/target/i386/kvm/tdx.c
> @@ -12,18 +12,107 @@
>   */
>  
>  #include "qemu/osdep.h"
> +#include "qemu/error-report.h"
> +#include "qapi/error.h"
>  #include "qom/object_interfaces.h"
> +#include "sysemu/kvm.h"
>  
>  #include "hw/i386/x86.h"
> +#include "kvm_i386.h"
>  #include "tdx.h"
>  
> +static struct kvm_tdx_capabilities *tdx_caps;
> +
> +enum tdx_ioctl_level{
> +TDX_VM_IOCTL,
> +TDX_VCPU_IOCTL,
> +};
> +
> +static int tdx_ioctl_internal(void *state, enum tdx_ioctl_level level, int cmd_id,
> +__u32 flags, void *data)
> +{
> +struct kvm_tdx_cmd tdx_cmd = {};
> +int r;
> +
> +tdx_cmd.id = cmd_id;
> +tdx_cmd.flags = flags;
> +tdx_cmd.data = (__u64)(unsigned long)data;
> +
> +switch (level) {
> +case TDX_VM_IOCTL:
> +r = kvm_vm_ioctl(kvm_state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
> +break;
> +case TDX_VCPU_IOCTL:
> +r = kvm_vcpu_ioctl(state, KVM_MEMORY_ENCRYPT_OP, &tdx_cmd);
> +break;
> +default:
> +error_report("Invalid tdx_ioctl_level %d", level);
> +exit(1);
> +}
> +
> +return r;
> +}
> +
> +static inline int tdx_vm_ioctl(int cmd_id, __u32 flags, void *data)
> +{
> +return tdx_ioctl_internal(NULL, TDX_VM_IOCTL, cmd_id, flags, data);
> +}
> +
> +static inline int tdx_vcpu_ioctl(void *vcpu_fd, int cmd_id, __u32 flags,
> + void *data)
> +{
> +return  tdx_ioctl_internal(vcpu_fd, TDX_VCPU_IOCTL, cmd_id, flags, data);
> +}
> +
> +static int get_tdx_capabilities(Error **errp)
> +{
> +struct kvm_tdx_capabilities *caps;
> +/* 1st generation of TDX reports 6 cpuid configs */
> +int nr_cpuid_configs = 6;
> +size_t size;
> +int r;
> +
> +do {
> +size = sizeof(struct kvm_tdx_capabilities) +
> +   nr_cpuid_configs * sizeof(struct kvm_tdx_cpuid_config);
> +caps = g_malloc0(size);
> +caps->nr_cpuid_configs = nr_cpuid_configs;
> +
> +r = tdx_vm_ioctl(KVM_TDX_CAPABILITIES, 0, caps);
> +if (r == -E2BIG) {
> +g_free(caps);
> +nr_cpuid_configs *= 2;
> +if (nr_cpuid_configs > KVM_MAX_CPUID_ENTRIES) {
> +error_setg(errp, "%s: KVM TDX seems broken that number of 
> CPUID "
> +   "entries in kvm_tdx_capabilities exceeds limit 
> %d",
> +   __func__, KVM_MAX_CPUID_ENTRIES);
> +return r;
> +}
> +} else if (r < 0) {
> +g_free(caps);
> +error_setg_errno(errp, -r, "%s: KVM_TDX_CAPABILITIES failed", 
> __func__);
> +return r;
> +}
> +}
> +while (r == -E2BIG);
> +
> +tdx_caps 

Re: [PATCH v5 08/65] kvm: handle KVM_EXIT_MEMORY_FAULT

2024-03-18 Thread Wang, Lei
On 2/29/2024 14:36, Xiaoyao Li wrote:
> From: Chao Peng 
> 
> When geeting KVM_EXIT_MEMORY_FAULT exit, it indicates userspace needs to
> do the memory conversion on the RAMBlock to turn the memory into desired
> attribute, i.e., private/shared.
> 
> Currently only KVM_MEMORY_EXIT_FLAG_PRIVATE in flags is valid when
> KVM_EXIT_MEMORY_FAULT happens.
> 
> Note, KVM_EXIT_MEMORY_FAULT makes sense only when the RAMBlock has
> guest_memfd memory backend.
> 
> Note, KVM_EXIT_MEMORY_FAULT returns with -EFAULT, so special handling is
> added.
> 
> When page is converted from shared to private, the original shared
> memory can be discarded via ram_block_discard_range(). Note, shared
> memory can be discarded only when it's not back'ed by hugetlb because
> hugetlb is supposed to be pre-allocated and no need for discarding.
> 
> Signed-off-by: Chao Peng 
> Co-developed-by: Xiaoyao Li 
> Signed-off-by: Xiaoyao Li 
> 
> ---
> Changes in v4:
> - open-coded ram_block_discard logic;
> - change warn_report() to error_report(); (Daniel)
> ---
>  accel/kvm/kvm-all.c | 94 -
>  1 file changed, 84 insertions(+), 10 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 70d482a2c936..87e4275932a7 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -2903,6 +2903,68 @@ static void kvm_eat_signals(CPUState *cpu)
>  } while (sigismember(&chkset, SIG_IPI));
>  }
>  
> +static int kvm_convert_memory(hwaddr start, hwaddr size, bool to_private)
> +{
> +MemoryRegionSection section;
> +ram_addr_t offset;
> +MemoryRegion *mr;
> +RAMBlock *rb;
> +void *addr;
> +int ret = -1;
> +
> +if (!QEMU_PTR_IS_ALIGNED(start, qemu_host_page_size) ||
> +!QEMU_PTR_IS_ALIGNED(size, qemu_host_page_size)) {
> +return -1;
> +}
> +
> +if (!size) {
> +return -1;
> +}
> +
> +section = memory_region_find(get_system_memory(), start, size);
> +mr = section.mr;
> +if (!mr) {
> +return -1;
> +}
> +
> +if (memory_region_has_guest_memfd(mr)) {
> +if (to_private) {
> +ret = kvm_set_memory_attributes_private(start, size);
> +} else {
> +ret = kvm_set_memory_attributes_shared(start, size);
> +}
> +
> +if (ret) {
> +memory_region_unref(section.mr);
> +return ret;
> +}
> +
> +addr = memory_region_get_ram_ptr(mr) + section.offset_within_region;
> +rb = qemu_ram_block_from_host(addr, false, &offset);
> +
> +if (to_private) {
> +if (rb->page_size != qemu_host_page_size) {
> +/*
> +* shared memory is back'ed by  hugetlb, which is supposed to 
> be
> +* pre-allocated and doesn't need to be discarded
> +*/

Nit: comment indentation is broken here.

> +return 0;
> +} else {
> +ret = ram_block_discard_range(rb, offset, size);
> +}
> +} else {
> +ret = ram_block_discard_guest_memfd_range(rb, offset, size);
> +}
> +} else {
> +error_report("Convert non guest_memfd backed memory region "
> +"(0x%"HWADDR_PRIx" ,+ 0x%"HWADDR_PRIx") to %s",

Same as above.

> +start, size, to_private ? "private" : "shared");
> +}
> +
> +memory_region_unref(section.mr);
> +return ret;
> +}
> +
>  int kvm_cpu_exec(CPUState *cpu)
>  {
>  struct kvm_run *run = cpu->kvm_run;
> @@ -2970,18 +3032,20 @@ int kvm_cpu_exec(CPUState *cpu)
>  ret = EXCP_INTERRUPT;
>  break;
>  }
> -fprintf(stderr, "error: kvm run failed %s\n",
> -strerror(-run_ret));
> +if (!(run_ret == -EFAULT && run->exit_reason == 
> KVM_EXIT_MEMORY_FAULT)) {
> +fprintf(stderr, "error: kvm run failed %s\n",
> +strerror(-run_ret));
>  #ifdef TARGET_PPC
> -if (run_ret == -EBUSY) {
> -fprintf(stderr,
> -"This is probably because your SMT is enabled.\n"
> -"VCPU can only run on primary threads with all "
> -"secondary threads offline.\n");
> -}
> +if (run_ret == -EBUSY) {
> +fprintf(stderr,
> +"This is probably because your SMT is enabled.\n"
> +"VCPU can only run on primary threads with all "
> +"secondary threads offline.\n");
> +}
>  #endif
> -ret = -1;
> -break;
> +ret = -1;
> +break;
> +}
>  }
>  
>  trace_kvm_run_exit(cpu->cpu_index, run->exit_reason);
> @@ -3064,6 +3128,16 @@ int kvm_cpu_exec(CPUState *cpu)
>  break;
>  }
>  break;
> +case 

Re: [PATCH v5 06/65] kvm: Introduce support for memory_attributes

2024-03-18 Thread Wang, Lei
On 2/29/2024 14:36, Xiaoyao Li wrote:
> Introduce the helper functions to set the attributes of a range of
> memory to private or shared.
> 
> This is necessary to notify KVM the private/shared attribute of each gpa
> range. KVM needs the information to decide the GPA needs to be mapped at
> hva-based shared memory or guest_memfd based private memory.
> 
> Signed-off-by: Xiaoyao Li 
> ---
> Changes in v4:
> - move the check of kvm_supported_memory_attributes to the common
>   kvm_set_memory_attributes(); (Wang Wei)
> - change warn_report() to error_report() in kvm_set_memory_attributes()
>   and drop the __func__; (Daniel)
> ---
>  accel/kvm/kvm-all.c  | 44 
>  include/sysemu/kvm.h |  3 +++
>  2 files changed, 47 insertions(+)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index cd0aa7545a1f..70d482a2c936 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -92,6 +92,7 @@ static bool kvm_has_guest_debug;
>  static int kvm_sstep_flags;
>  static bool kvm_immediate_exit;
>  static bool kvm_guest_memfd_supported;
> +static uint64_t kvm_supported_memory_attributes;
>  static hwaddr kvm_max_slot_size = ~0;
>  
>  static const KVMCapabilityInfo kvm_required_capabilites[] = {
> @@ -1304,6 +1305,46 @@ void kvm_set_max_memslot_size(hwaddr max_slot_size)
>  kvm_max_slot_size = max_slot_size;
>  }
>  
> +static int kvm_set_memory_attributes(hwaddr start, hwaddr size, uint64_t 
> attr)
> +{
> +struct kvm_memory_attributes attrs;
> +int r;
> +
> +if (kvm_supported_memory_attributes == 0) {
> +error_report("No memory attribute supported by KVM\n");
> +return -EINVAL;
> +}
> +
> +if ((attr & kvm_supported_memory_attributes) != attr) {
> +error_report("memory attribute 0x%lx not supported by KVM,"
> + " supported bits are 0x%lx\n",
> + attr, kvm_supported_memory_attributes);
> +return -EINVAL;
> +}
> +
> +attrs.attributes = attr;
> +attrs.address = start;
> +attrs.size = size;
> +attrs.flags = 0;
> +
> +r = kvm_vm_ioctl(kvm_state, KVM_SET_MEMORY_ATTRIBUTES, &attrs);
> +if (r) {
> +error_report("failed to set memory (0x%lx+%#zx) with attr 0x%lx 
> error '%s'",
> + start, size, attr, strerror(errno));
> +}
> +return r;
> +}
> +
> +int kvm_set_memory_attributes_private(hwaddr start, hwaddr size)
> +{
> +return kvm_set_memory_attributes(start, size, 
> KVM_MEMORY_ATTRIBUTE_PRIVATE);
> +}
> +
> +int kvm_set_memory_attributes_shared(hwaddr start, hwaddr size)
> +{
> +return kvm_set_memory_attributes(start, size, 0);
> +}
> +
>  /* Called with KVMMemoryListener.slots_lock held */
>  static void kvm_set_phys_mem(KVMMemoryListener *kml,
>   MemoryRegionSection *section, bool add)
> @@ -2439,6 +2480,9 @@ static int kvm_init(MachineState *ms)
>  
>  kvm_guest_memfd_supported = kvm_check_extension(s, KVM_CAP_GUEST_MEMFD);
>  
> +ret = kvm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);
> +kvm_supported_memory_attributes = ret > 0 ? ret : 0;

kvm_check_extension() only returns non-negative values, so we can just do:

kvm_supported_memory_attributes = kvm_check_extension(s, KVM_CAP_MEMORY_ATTRIBUTES);

> +
>  if (object_property_find(OBJECT(current_machine), "kvm-type")) {
>  g_autofree char *kvm_type = 
> object_property_get_str(OBJECT(current_machine),
>  "kvm-type",
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 6cdf82de8372..8e83adfbbd19 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -546,4 +546,7 @@ uint32_t kvm_dirty_ring_size(void);
>  bool kvm_hwpoisoned_mem(void);
>  
>  int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp);
> +
> +int kvm_set_memory_attributes_private(hwaddr start, hwaddr size);
> +int kvm_set_memory_attributes_shared(hwaddr start, hwaddr size);
>  #endif



Re: [PATCH] migration: Don't serialize migration while can't switchover

2024-02-26 Thread Wang, Lei
On 2/22/2024 23:56, Avihai Horon wrote:
> Currently, migration code serializes device data sending during pre-copy
> iterative phase. As noted in the code comment, this is done to prevent
> faster changing device from sending its data over and over.
> 
> However, with switchover-ack capability enabled, this behavior can be
> problematic and may prevent migration from converging. The problem lies
> in the fact that an earlier device may never finish sending its data and
> thus block other devices from sending theirs.
> 
> This bug was observed in several VFIO migration scenarios where some
> workload on the VM prevented RAM from ever reaching a hard zero, not
> allowing VFIO initial pre-copy data to be sent, and thus destination
> could not ack switchover. Note that the same scenario, but without
> switchover-ack, would converge.
> 
> Fix it by not serializing device data sending during pre-copy iterative
> phase if switchover was not acked yet.

Hi Avihai,

Can this bug be solved by ordering the priorities of the different devices' handlers?

> 
> Fixes: 1b4adb10f898 ("migration: Implement switchover ack logic")
> Signed-off-by: Avihai Horon 
> ---
>  migration/savevm.h|  2 +-
>  migration/migration.c |  4 ++--
>  migration/savevm.c| 22 +++---
>  3 files changed, 18 insertions(+), 10 deletions(-)
> 
> diff --git a/migration/savevm.h b/migration/savevm.h
> index 74669733dd6..d4a368b522b 100644
> --- a/migration/savevm.h
> +++ b/migration/savevm.h
> @@ -36,7 +36,7 @@ void qemu_savevm_state_setup(QEMUFile *f);
>  bool qemu_savevm_state_guest_unplug_pending(void);
>  int qemu_savevm_state_resume_prepare(MigrationState *s);
>  void qemu_savevm_state_header(QEMUFile *f);
> -int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
> +int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy, bool can_switchover);
>  void qemu_savevm_state_cleanup(void);
>  void qemu_savevm_state_complete_postcopy(QEMUFile *f);
>  int qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only,
> diff --git a/migration/migration.c b/migration/migration.c
> index ab21de2cadb..d8bfe1fb1b9 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -3133,7 +3133,7 @@ static MigIterateState migration_iteration_run(MigrationState *s)
>  }
>  
>  /* Just another iteration step */
> -qemu_savevm_state_iterate(s->to_dst_file, in_postcopy);
> +qemu_savevm_state_iterate(s->to_dst_file, in_postcopy, can_switchover);
>  return MIG_ITERATE_RESUME;
>  }
>  
> @@ -3216,7 +3216,7 @@ static MigIterateState bg_migration_iteration_run(MigrationState *s)
>  {
>  int res;
>  
> -res = qemu_savevm_state_iterate(s->to_dst_file, false);
> +res = qemu_savevm_state_iterate(s->to_dst_file, false, true);
>  if (res > 0) {
>  bg_migration_completion(s);
>  return MIG_ITERATE_BREAK;
> diff --git a/migration/savevm.c b/migration/savevm.c
> index d612c8a9020..3a012796375 100644
> --- a/migration/savevm.c
> +++ b/migration/savevm.c
> @@ -1386,7 +1386,7 @@ int qemu_savevm_state_resume_prepare(MigrationState *s)
>   *   0 : We haven't finished, caller have to go again
>   *   1 : We have finished, we can go to complete phase
>   */
> -int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
> +int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy, bool can_switchover)
>  {
>  SaveStateEntry *se;
>  int ret = 1;
> @@ -1430,12 +1430,20 @@ int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
>   "%d(%s): %d",
>   se->section_id, se->idstr, ret);
>  qemu_file_set_error(f, ret);
> +return ret;
>  }
> -if (ret <= 0) {
> -/* Do not proceed to the next vmstate before this one reported
> -   completion of the current stage. This serializes the migration
> -   and reduces the probability that a faster changing state is
> -   synchronized over and over again. */
> +
> +if (ret == 0 && can_switchover) {
> +/*
> + * Do not proceed to the next vmstate before this one reported
> + * completion of the current stage. This serializes the migration
> + * and reduces the probability that a faster changing state is
> + * synchronized over and over again.
> + * Do it only if migration can switchover. If migration can't
> + * switchover yet, do proceed to let other devices send their 
> data
> + * too, as this may be required for switchover to be acked and
> + * migration to converge.
> + */
>  break;
>  }
>  }
> @@ -1724,7 +1732,7 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
>  qemu_savevm_state_setup(f);
>  
>  while (qemu_file_get_error(f) == 0) {
> -if (qemu_savevm_state_iterate(f, false) > 0) {
> +if (qemu_savevm_state_iterate(f, false, true) 

Re: [PATCH v2 1/7] migration/multifd: Add new migration option zero-page-detection.

2024-02-25 Thread Wang, Lei
On 2/17/2024 6:39, Hao Xiang wrote:
> This new parameter controls where the zero page checking is running.
> 1. If this parameter is set to 'legacy', zero page checking is
> done in the migration main thread.
> 2. If this parameter is set to 'none', zero page checking is disabled.
> 
> Signed-off-by: Hao Xiang 
> ---
>  hw/core/qdev-properties-system.c| 10 ++
>  include/hw/qdev-properties-system.h |  4 
>  migration/migration-hmp-cmds.c  |  9 +
>  migration/options.c | 21 
>  migration/options.h |  1 +
>  migration/ram.c |  4 
>  qapi/migration.json | 30 ++---
>  7 files changed, 76 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/core/qdev-properties-system.c 
> b/hw/core/qdev-properties-system.c
> index 1a396521d5..63843f18b5 100644
> --- a/hw/core/qdev-properties-system.c
> +++ b/hw/core/qdev-properties-system.c
> @@ -679,6 +679,16 @@ const PropertyInfo qdev_prop_mig_mode = {
>  .set_default_value = qdev_propinfo_set_default_value_enum,
>  };
>  
> +const PropertyInfo qdev_prop_zero_page_detection = {
> +.name = "ZeroPageDetection",
> +.description = "zero_page_detection values, "
> +   "multifd,legacy,none",

Nit: Maybe multifd/legacy/none?



Re: [External] Re: [PATCH v2 09/20] util/dsa: Implement DSA task asynchronous completion thread model.

2023-12-18 Thread Wang, Lei
On 12/19/2023 2:57, Hao Xiang wrote:
> On Sun, Dec 17, 2023 at 7:11 PM Wang, Lei wrote:
>>
>> On 11/14/2023 13:40, Hao Xiang wrote:
>>> * Create a dedicated thread for DSA task completion.
>>> * DSA completion thread runs a loop and poll for completed tasks.
>>> * Start and stop DSA completion thread during DSA device start stop.
>>>
>>> User space application can directly submit task to Intel DSA
>>> accelerator by writing to DSA's device memory (mapped in user space).
>>
>>> +}
>>> +return;
>>> +}
>>> +} else {
>>> +assert(batch_status == DSA_COMP_BATCH_FAIL ||
>>> +batch_status == DSA_COMP_BATCH_PAGE_FAULT);
>>
>> Nit: indentation is broken here.
>>
>>> +}
>>> +
>>> +for (int i = 0; i < count; i++) {
>>> +
>>> +completion = &batch_task->completions[i];
>>> +status = completion->status;
>>> +
>>> +if (status == DSA_COMP_SUCCESS) {
>>> +results[i] = (completion->result == 0);
>>> +continue;
>>> +}
>>> +
>>> +if (status != DSA_COMP_PAGE_FAULT_NOBOF) {
>>> +fprintf(stderr,
>>> +"Unexpected completion status = %u.\n", status);
>>> +assert(false);
>>> +}
>>> +}
>>> +}
>>> +
>>> +/**
>>> + * @brief Handles an asynchronous DSA batch task completion.
>>> + *
>>> + * @param task A pointer to the batch buffer zero task structure.
>>> + */
>>> +static void
>>> +dsa_batch_task_complete(struct buffer_zero_batch_task *batch_task)
>>> +{
>>> +batch_task->status = DSA_TASK_COMPLETION;
>>> +batch_task->completion_callback(batch_task);
>>> +}
>>> +
>>> +/**
>>> + * @brief The function entry point called by a dedicated DSA
>>> + *work item completion thread.
>>> + *
>>> + * @param opaque A pointer to the thread context.
>>> + *
>>> + * @return void* Not used.
>>> + */
>>> +static void *
>>> +dsa_completion_loop(void *opaque)
>>
>> Per my understanding, if a multifd sending thread corresponds to a DSA 
>> device,
>> then the batch tasks are executed in parallel which means a task may be
>> completed slower than another even if this task is enqueued earlier than it. 
>> If
>> we poll on the slower task first it will block the handling of the faster 
>> one,
>> even if the zero checking task for that thread is finished and it can go 
>> ahead
>> and send the data to the wire, this may lower the network resource 
>> utilization.
>>
> 
> Hi Lei, thanks for reviewing. You are correct that we can keep pulling
> a task enqueued first while others in the queue have already been
> completed. In fact, only one DSA completion thread (pulling thread) is
> used here even when multiple DSA devices are used. The pulling loop is
> the most CPU intensive activity in the DSA workflow and that acts
> directly against the goal of saving CPU usage. The trade-off I want to
> take here is a slightly higher latency on DSA task completion but more
> CPU savings. A single DSA engine can reach 30 GB/s throughput on
> memory comparison operation. We use kernel tcp stack for network
> transfer. The best I see is around 10GB/s throughput.  RDMA can
> potentially go higher but I am not sure if it can go higher than 30
> GB/s throughput anytime soon.

Hi Hao, that makes sense: if the DSA is faster than the network, then a little
latency in DSA completion checking is tolerable. In the long term, I think the
best form for the DSA task checking thread is to use an fd or something
similar that can multiplex the checking of different DSA devices; then we can
serve DSA tasks in the order they complete rather than FCFS.
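Something along these lines, purely illustrative (it assumes each DSA device
exposed a pollable completion fd, which the current code does not; completions
are polled in memory today):

#include <poll.h>
#include <stdbool.h>
#include <glib.h>

/* hypothetical helper: drain and complete the finished batch tasks of
 * one device */
static void dsa_complete_batches_for_device(int dev_idx);

/* block until any device signals a completed batch, then serve
 * completions in arrival order instead of queue order */
static void dsa_serve_completions(int *completion_fds, int n, bool *running)
{
    g_autofree struct pollfd *pfds = g_new0(struct pollfd, n);

    for (int i = 0; i < n; i++) {
        pfds[i].fd = completion_fds[i];
        pfds[i].events = POLLIN;
    }

    while (*running) {
        if (poll(pfds, n, -1) <= 0) {
            continue;   /* interrupted or error: retry */
        }
        for (int i = 0; i < n; i++) {
            if (pfds[i].revents & POLLIN) {
                dsa_complete_batches_for_device(i);
            }
        }
    }
}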

> 
>>> +{
>>> +struct dsa_completion_thread *thread_context =
>>> +(struct dsa_completion_thread *)opaque;
>>> +struct buffer_zero_batch_task *batch_task;
>>> +struct dsa_device_group *group = thread_context->group;
>>> +
>>> +rcu_register_thread();
>>> +
>>> +thread_context->thread_id = qemu_get_thread_id();
>>> +qemu_sem_post(&thread_context->sem_init_done);
>>> +
>>> +while (thread_context->running) {
>>> +batch_task = dsa_task_dequeue(group);
>>> +assert(batch_task != NULL || !group->

Re: [PATCH v2 13/20] migration/multifd: Prepare to introduce DSA acceleration on the multifd path.

2023-12-17 Thread Wang, Lei
On 11/14/2023 13:40, Hao Xiang wrote:
> 1. Refactor multifd_send_thread function.
> 2. Implement buffer_is_zero_use_cpu to handle CPU based zero page
> checking.
> 3. Introduce the batch task structure in MultiFDSendParams.
> 
> Signed-off-by: Hao Xiang 
> ---
>  migration/multifd.c | 82 -
>  migration/multifd.h |  3 ++
>  2 files changed, 70 insertions(+), 15 deletions(-)
> 
> diff --git a/migration/multifd.c b/migration/multifd.c
> index 1198ffde9c..68ab97f918 100644
> --- a/migration/multifd.c
> +++ b/migration/multifd.c
> @@ -14,6 +14,8 @@
>  #include "qemu/cutils.h"
>  #include "qemu/rcu.h"
>  #include "qemu/cutils.h"
> +#include "qemu/dsa.h"
> +#include "qemu/memalign.h"
>  #include "exec/target_page.h"
>  #include "sysemu/sysemu.h"
>  #include "exec/ramblock.h"
> @@ -574,6 +576,11 @@ void multifd_save_cleanup(void)
>  p->name = NULL;
>  multifd_pages_clear(p->pages);
>  p->pages = NULL;
> +g_free(p->addr);
> +p->addr = NULL;
> +buffer_zero_batch_task_destroy(p->batch_task);
> +qemu_vfree(p->batch_task);
> +p->batch_task = NULL;
>  p->packet_len = 0;
>  g_free(p->packet);
>  p->packet = NULL;
> @@ -678,13 +685,66 @@ int multifd_send_sync_main(QEMUFile *f)
>  return 0;
>  }
>  
> +static void set_page(MultiFDSendParams *p, bool zero_page, uint64_t offset)
> +{
> +RAMBlock *rb = p->pages->block;
> +if (zero_page) {
> +p->zero[p->zero_num] = offset;
> +p->zero_num++;
> +ram_release_page(rb->idstr, offset);
> +} else {
> +p->normal[p->normal_num] = offset;
> +p->normal_num++;
> +}
> +}
> +
> +static void buffer_is_zero_use_cpu(MultiFDSendParams *p)
> +{
> +const void **buf = (const void **)p->addr;
> +assert(!migrate_use_main_zero_page());
> +
> +for (int i = 0; i < p->pages->num; i++) {
> +p->batch_task->results[i] = buffer_is_zero(buf[i], p->page_size);
> +}
> +}
> +
> +static void set_normal_pages(MultiFDSendParams *p)
> +{
> +for (int i = 0; i < p->pages->num; i++) {
> +p->batch_task->results[i] = false;
> +}
> +}
> +
> +static void multifd_zero_page_check(MultiFDSendParams *p)
> +{
> +/* older qemu don't understand zero page on multifd channel */
> +bool use_multifd_zero_page = !migrate_use_main_zero_page();
> +
> +RAMBlock *rb = p->pages->block;
> +
> +for (int i = 0; i < p->pages->num; i++) {
> +p->addr[i] = (ram_addr_t)(rb->host + p->pages->offset[i]);
> +}
> +
> +if (use_multifd_zero_page) {
> +buffer_is_zero_use_cpu(p);
> +} else {
> +// No zero page checking. All pages are normal pages.

Please pay attention to the comment style here and in other patches.

> +set_normal_pages(p);
> +}
> +
> +for (int i = 0; i < p->pages->num; i++) {
> +uint64_t offset = p->pages->offset[i];
> +bool zero_page = p->batch_task->results[i];
> +set_page(p, zero_page, offset);
> +}
> +}
> +
>  static void *multifd_send_thread(void *opaque)
>  {
>  MultiFDSendParams *p = opaque;
>  MigrationThread *thread = NULL;
>  Error *local_err = NULL;
> -/* qemu older than 8.2 don't understand zero page on multifd channel */
> -bool use_multifd_zero_page = !migrate_use_main_zero_page();
>  int ret = 0;
>  bool use_zero_copy_send = migrate_zero_copy_send();
>  
> @@ -710,7 +770,6 @@ static void *multifd_send_thread(void *opaque)
>  qemu_mutex_lock(>mutex);
>  
>  if (p->pending_job) {
> -RAMBlock *rb = p->pages->block;
>  uint64_t packet_num = p->packet_num;
>  uint32_t flags;
>  
> @@ -723,18 +782,7 @@ static void *multifd_send_thread(void *opaque)
>  p->iovs_num = 1;
>  }
>  
> -for (int i = 0; i < p->pages->num; i++) {
> -uint64_t offset = p->pages->offset[i];
> -if (use_multifd_zero_page &&
> -buffer_is_zero(rb->host + offset, p->page_size)) {
> -p->zero[p->zero_num] = offset;
> -p->zero_num++;
> -ram_release_page(rb->idstr, offset);
> -} else {
> -p->normal[p->normal_num] = offset;
> -p->normal_num++;
> -}
> -}
> +multifd_zero_page_check(p);
>  
>  if (p->normal_num) {
>  ret = multifd_send_state->ops->send_prepare(p, &local_err);
> @@ -976,6 +1024,10 @@ int multifd_save_setup(Error **errp)
>  p->pending_job = 0;
>  p->id = i;
>  p->pages = multifd_pages_init(page_count);
> +p->addr = g_new0(ram_addr_t, page_count);
> +p->batch_task =
> +(struct buffer_zero_batch_task *)qemu_memalign(64, sizeof(*p->batch_task));
> +buffer_zero_batch_task_init(p->batch_task, page_count);
>  p->packet_len = 

Re: [PATCH v2 12/20] migration/multifd: Add new migration option for multifd DSA offloading.

2023-12-17 Thread Wang, Lei
On 11/14/2023 13:40, Hao Xiang wrote:
> Intel DSA offloading is an optional feature that turns on if
> proper hardware and software stack is available. To turn on
> DSA offloading in multifd live migration:
> 
> multifd-dsa-accel="[dsa_dev_path1] ] [dsa_dev_path2] ... [dsa_dev_pathX]"

Nit: a redundant bracket ]

> 
> This feature is turned off by default.
> 
> Signed-off-by: Hao Xiang 

[...]



Re: [PATCH v2 09/20] util/dsa: Implement DSA task asynchronous completion thread model.

2023-12-17 Thread Wang, Lei
On 11/14/2023 13:40, Hao Xiang wrote:
> * Create a dedicated thread for DSA task completion.
> * DSA completion thread runs a loop and poll for completed tasks.
> * Start and stop DSA completion thread during DSA device start stop.
> 
> User space application can directly submit task to Intel DSA
> accelerator by writing to DSA's device memory (mapped in user space).

> +}
> +return;
> +}
> +} else {
> +assert(batch_status == DSA_COMP_BATCH_FAIL ||
> +batch_status == DSA_COMP_BATCH_PAGE_FAULT);

Nit: indentation is broken here.

> +}
> +
> +for (int i = 0; i < count; i++) {
> +
> +completion = &batch_task->completions[i];
> +status = completion->status;
> +
> +if (status == DSA_COMP_SUCCESS) {
> +results[i] = (completion->result == 0);
> +continue;
> +}
> +
> +if (status != DSA_COMP_PAGE_FAULT_NOBOF) {
> +fprintf(stderr,
> +"Unexpected completion status = %u.\n", status);
> +assert(false);
> +}
> +}
> +}
> +
> +/**
> + * @brief Handles an asynchronous DSA batch task completion.
> + *
> + * @param task A pointer to the batch buffer zero task structure.
> + */
> +static void
> +dsa_batch_task_complete(struct buffer_zero_batch_task *batch_task)
> +{
> +batch_task->status = DSA_TASK_COMPLETION;
> +batch_task->completion_callback(batch_task);
> +}
> +
> +/**
> + * @brief The function entry point called by a dedicated DSA
> + *work item completion thread.
> + *
> + * @param opaque A pointer to the thread context.
> + *
> + * @return void* Not used.
> + */
> +static void *
> +dsa_completion_loop(void *opaque)

Per my understanding, if each multifd sending thread corresponds to a DSA
device, the batch tasks are executed in parallel, which means a task may
complete later than another even if it was enqueued earlier. If we poll on the
slower task first, it will block the handling of the faster one: even though
the zero-checking task for that thread has finished and the thread could go
ahead and send its data to the wire, it has to wait. This may lower network
resource utilization.
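
A minimal sketch of this concern (not QEMU code): with a single FIFO queue,
blocking on tasks strictly in dequeue order can stall behind a slow task
even when later tasks are already done; a hypothetical alternative is to
reap completions in whatever order they finish:

#include <stdbool.h>
#include <stddef.h>

struct task {
    volatile int done;   /* set asynchronously by the accelerator */
    int reaped;          /* completion callback already delivered */
    struct task *next;
};

/* The FIFO pattern: a slow task at the head delays every later task,
 * even ones that completed long ago. */
static void poll_fifo(struct task *head)
{
    for (struct task *t = head; t; t = t->next) {
        while (!t->done) {
            /* busy-wait on the head-of-line task */
        }
        /* deliver completion for t */
    }
}

/* Out-of-order reaping: finished tasks are delivered immediately,
 * regardless of enqueue order. */
static void poll_any_order(struct task *head)
{
    bool pending = true;
    while (pending) {
        pending = false;
        for (struct task *t = head; t; t = t->next) {
            if (t->done && !t->reaped) {
                t->reaped = 1;   /* deliver completion for t */
            } else if (!t->done) {
                pending = true;
            }
        }
    }
}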

> +{
> +struct dsa_completion_thread *thread_context =
> +(struct dsa_completion_thread *)opaque;
> +struct buffer_zero_batch_task *batch_task;
> +struct dsa_device_group *group = thread_context->group;
> +
> +rcu_register_thread();
> +
> +thread_context->thread_id = qemu_get_thread_id();
> +qemu_sem_post(&thread_context->sem_init_done);
> +
> +while (thread_context->running) {
> +batch_task = dsa_task_dequeue(group);
> +assert(batch_task != NULL || !group->running);
> +if (!group->running) {
> +assert(!thread_context->running);
> +break;
> +}
> +if (batch_task->task_type == DSA_TASK) {
> +poll_task_completion(batch_task);
> +} else {
> +assert(batch_task->task_type == DSA_BATCH_TASK);
> +poll_batch_task_completion(batch_task);
> +}
> +
> +dsa_batch_task_complete(batch_task);
> +}
> +
> +rcu_unregister_thread();
> +return NULL;
> +}
> +
> +/**
> + * @brief Initializes a DSA completion thread.
> + *
> + * @param completion_thread A pointer to the completion thread context.
> + * @param group A pointer to the DSA device group.
> + */
> +static void
> +dsa_completion_thread_init(
> +struct dsa_completion_thread *completion_thread,
> +struct dsa_device_group *group)
> +{
> +completion_thread->stopping = false;
> +completion_thread->running = true;
> +completion_thread->thread_id = -1;
> +qemu_sem_init(&completion_thread->sem_init_done, 0);
> +completion_thread->group = group;
> +
> +qemu_thread_create(&completion_thread->thread,
> +   DSA_COMPLETION_THREAD,
> +   dsa_completion_loop,
> +   completion_thread,
> +   QEMU_THREAD_JOINABLE);
> +
> +/* Wait for initialization to complete */
> +while (completion_thread->thread_id == -1) {
> +qemu_sem_wait(&completion_thread->sem_init_done);
> +}
> +}
> +
> +/**
> + * @brief Stops the completion thread (and implicitly, the device group).
> + *
> + * @param opaque A pointer to the completion thread.
> + */
> +static void dsa_completion_thread_stop(void *opaque)
> +{
> +struct dsa_completion_thread *thread_context =
> +(struct dsa_completion_thread *)opaque;
> +
> +struct dsa_device_group *group = thread_context->group;
> +
> +qemu_mutex_lock(&group->task_queue_lock);
> +
> +thread_context->stopping = true;
> +thread_context->running = false;
> +
> +dsa_device_group_stop(group);
> +
> +qemu_cond_signal(&group->task_queue_cond);
> +qemu_mutex_unlock(&group->task_queue_lock);
> +
> +qemu_thread_join(&thread_context->thread);
> +
> +qemu_sem_destroy(&thread_context->sem_init_done);
> +}
> +
>  /**
>   * @brief Check if DSA is running.
>   *
> @@ 

Re: [PATCH v2 03/20] multifd: Zero pages transmission

2023-12-17 Thread Wang, Lei
On 11/14/2023 13:40, Hao Xiang wrote:
> From: Juan Quintela 
> 
> This implements the zero page dection and handling.

s/dection/detection

> 
> Signed-off-by: Juan Quintela 
> ---
>  migration/multifd.c | 41 +++--
>  migration/multifd.h |  5 +

[...]

> +/*
> + * This array contains the pointers to:

It contains the offsets within the RAMBlock, not real pointers.

> + *  - normal pages (initial normal_pages entries)
> + *  - zero pages (following zero_pages entries)
> + */
>  uint64_t offset[];
>  } __attribute__((packed)) MultiFDPacket_t;
>  
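
To illustrate, a hypothetical receiver-side reading of the packed array,
where every entry is a guest-RAM offset into the packet's RAMBlock rather
than a host pointer (a sketch only, not the actual unfill code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* The first normal_num entries of offset[] name pages whose payload
 * follows on the wire; the next zero_num entries name pages the receiver
 * zero-fills locally, so no payload is transferred for them. */
static void apply_packet_offsets(uint8_t *ramblock_host,
                                 const uint64_t *offset,
                                 uint32_t normal_num, uint32_t zero_num,
                                 size_t page_size)
{
    for (uint32_t i = 0; i < normal_num; i++) {
        /* read page_size bytes from the channel into
         * ramblock_host + offset[i] */
    }
    for (uint32_t i = 0; i < zero_num; i++) {
        memset(ramblock_host + offset[normal_num + i], 0, page_size);
    }
}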



Re: [PATCH 1/5] migration: Store downtime timestamps in an array

2023-09-27 Thread Wang, Lei
On 9/27/2023 0:18, Joao Martins wrote:
> Right now downtime_start is stored in MigrationState.
> 
> In preparation to having more downtime timestamps during
> switchover, move downtime_start to an array namely, @timestamp.
> 
> Add a setter/getter surrounding which timestamps to record,
> to make it easier to spread to various migration functions.
> 
> Signed-off-by: Joao Martins 
> ---
>  qapi/migration.json   | 14 ++
>  migration/migration.h |  7 +--
>  migration/migration.c | 24 
>  3 files changed, 39 insertions(+), 6 deletions(-)
> 
> diff --git a/qapi/migration.json b/qapi/migration.json
> index 8843e74b59c7..b836cc881d33 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -190,6 +190,20 @@
>  { 'struct': 'VfioStats',
>'data': {'transferred': 'int' } }
>  
> +##
> +# @MigrationDowntime:
> +#
> +# An enumeration of downtime timestamps for all
> +# steps of the switchover.
> +#
> +# @start:Timestamp taken at the start of the switchover right before
   ^
Suggest adding a space here to keep consistent with other attributes.


> +#we stop the VM.
> +#
> +# Since: 8.2
> +##
> +{ 'enum': 'MigrationDowntime',
> +  'data': [ 'start' ] }
> +
>  ##
>  # @MigrationInfo:
>  #
> diff --git a/migration/migration.h b/migration/migration.h
> index c390500604b6..180dc31c5306 100644
> --- a/migration/migration.h
> +++ b/migration/migration.h
> @@ -319,8 +319,8 @@ struct MigrationState {
>  int64_t start_time;
>  /* Total time used by latest migration (ms) */
>  int64_t total_time;
> -/* Timestamp when VM is down (ms) to migrate the last stuff */
> -int64_t downtime_start;
> +/* Timestamps e.g. when VM is down (ms) to migrate the last stuff */
> +int64_t timestamp[MIGRATION_DOWNTIME__MAX];
>  int64_t downtime;
>  int64_t expected_downtime;
>  bool capabilities[MIGRATION_CAPABILITY__MAX];
> @@ -516,4 +516,7 @@ void migration_populate_vfio_info(MigrationInfo *info);
>  void migration_reset_vfio_bytes_transferred(void);
>  void postcopy_temp_page_reset(PostcopyTmpPage *tmp_page);
>  
> +void migration_set_timestamp(MigrationDowntime tm);
> +int64_t migration_get_timestamp(MigrationDowntime tm);
> +
>  #endif
> diff --git a/migration/migration.c b/migration/migration.c
> index d61e5727429a..dd955c61acc7 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -2312,6 +2312,21 @@ static int migration_maybe_pause(MigrationState *s,
>  return s->state == new_state ? 0 : -EINVAL;
>  }
>  
> +void migration_set_timestamp(MigrationDowntime type)
> +{
> +MigrationState *s = migrate_get_current();
> +
> +s->timestamp[type] = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +}
> +
> +int64_t migration_get_timestamp(MigrationDowntime type)
> +{
> +MigrationState *s = migrate_get_current();
> +
> +return s->timestamp[type];
> +}
> +
> +
>  /**
>   * migration_completion: Used by migration_thread when there's not much left.
>   *   The caller 'breaks' the loop when this returns.
> @@ -2325,7 +2340,7 @@ static void migration_completion(MigrationState *s)
>  
>  if (s->state == MIGRATION_STATUS_ACTIVE) {
>  qemu_mutex_lock_iothread();
> -s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +migration_set_timestamp(MIGRATION_DOWNTIME_START);
>  qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER, NULL);
>  
>  s->vm_old_state = runstate_get();
> @@ -2670,7 +2685,7 @@ static void migration_calculate_complete(MigrationState 
> *s)
>   * It's still not set, so we are precopy migration.  For
>   * postcopy, downtime is calculated during postcopy_start().
>   */
> -s->downtime = end_time - s->downtime_start;
> +s->downtime = end_time - 
> migration_get_timestamp(MIGRATION_DOWNTIME_START);
>  }
>  
>  transfer_time = s->total_time - s->setup_time;
> @@ -3069,7 +3084,8 @@ static void bg_migration_vm_start_bh(void *opaque)
>  s->vm_start_bh = NULL;
>  
>  vm_start();
> -s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - s->downtime_start;
> +s->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) -
> + migration_get_timestamp(MIGRATION_DOWNTIME_START);
>  }
>  
>  /**
> @@ -3134,7 +3150,7 @@ static void *bg_migration_thread(void *opaque)
>  s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
>  
>  trace_migration_thread_setup_complete();
> -s->downtime_start = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
> +migration_set_timestamp(MIGRATION_DOWNTIME_START);
>  
>  qemu_mutex_lock_iothread();
>  



Re: [RFC PATCH v2 02/21] RAMBlock: Add support of KVM private gmem

2023-09-14 Thread Wang, Lei
On 9/14/2023 11:50, Xiaoyao Li wrote:
> From: Chao Peng 
> 
> Add KVM gmem support to RAMBlock so both normal hva based memory
> and kvm gmem fd based private memory can be associated in one RAMBlock.
> 
> Introduce new flag RAM_KVM_GMEM. It calls KVM ioctl to create private
> gmem for the RAMBlock when it's set.
> 
> Signed-off-by: Xiaoyao Li 

A kind reminder: the author's Signed-off-by is missing.

> ---
>  accel/kvm/kvm-all.c | 17 +
>  include/exec/memory.h   |  3 +++
>  include/exec/ramblock.h |  1 +
>  include/sysemu/kvm.h|  2 ++
>  softmmu/physmem.c   | 18 +++---
>  5 files changed, 38 insertions(+), 3 deletions(-)
> 
> diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c
> index 60aacd925393..185ae16d9620 100644
> --- a/accel/kvm/kvm-all.c
> +++ b/accel/kvm/kvm-all.c
> @@ -4225,3 +4225,20 @@ void query_stats_schemas_cb(StatsSchemaList **result, 
> Error **errp)
>  query_stats_schema_vcpu(first_cpu, &stats_args);
>  }
>  }
> +
> +int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp)
> +{
> +int fd;
> +struct kvm_create_guest_memfd gmem = {
> +.size = size,
> +/* TODO: to decide whether KVM_GUEST_MEMFD_ALLOW_HUGEPAGE is 
> supported */
> +.flags = flags,
> +};
> +
> +fd = kvm_vm_ioctl(kvm_state, KVM_CREATE_GUEST_MEMFD, &gmem);
> +if (fd < 0) {
> +error_setg_errno(errp, errno, "%s: error creating kvm gmem\n", 
> __func__);
> +}
> +
> +return fd;
> +}
> diff --git a/include/exec/memory.h b/include/exec/memory.h
> index 68284428f87c..227cb2578e95 100644
> --- a/include/exec/memory.h
> +++ b/include/exec/memory.h
> @@ -235,6 +235,9 @@ typedef struct IOMMUTLBEvent {
>  /* RAM is an mmap-ed named file */
>  #define RAM_NAMED_FILE (1 << 9)
>  
> +/* RAM can be private that has kvm gmem backend */
> +#define RAM_KVM_GMEM(1 << 10)
> +
>  static inline void iommu_notifier_init(IOMMUNotifier *n, IOMMUNotify fn,
> IOMMUNotifierFlag flags,
> hwaddr start, hwaddr end,
> diff --git a/include/exec/ramblock.h b/include/exec/ramblock.h
> index 69c6a5390293..0d158b3909c9 100644
> --- a/include/exec/ramblock.h
> +++ b/include/exec/ramblock.h
> @@ -41,6 +41,7 @@ struct RAMBlock {
>  QLIST_HEAD(, RAMBlockNotifier) ramblock_notifiers;
>  int fd;
>  uint64_t fd_offset;
> +int gmem_fd;
>  size_t page_size;
>  /* dirty bitmap used during migration */
>  unsigned long *bmap;
> diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
> index 115f0cca79d1..f5b74c8dd8c5 100644
> --- a/include/sysemu/kvm.h
> +++ b/include/sysemu/kvm.h
> @@ -580,4 +580,6 @@ bool kvm_arch_cpu_check_are_resettable(void);
>  bool kvm_dirty_ring_enabled(void);
>  
>  uint32_t kvm_dirty_ring_size(void);
> +
> +int kvm_create_guest_memfd(uint64_t size, uint64_t flags, Error **errp);
>  #endif
> diff --git a/softmmu/physmem.c b/softmmu/physmem.c
> index 3df73542e1fe..2d98a88f41f0 100644
> --- a/softmmu/physmem.c
> +++ b/softmmu/physmem.c
> @@ -1824,6 +1824,16 @@ static void ram_block_add(RAMBlock *new_block, Error 
> **errp)
>  }
>  }
>  
> +if (kvm_enabled() && new_block->flags & RAM_KVM_GMEM &&
> +new_block->gmem_fd < 0) {
> +new_block->gmem_fd = kvm_create_guest_memfd(new_block->max_length,
> +0, errp);
> +if (new_block->gmem_fd < 0) {
> +qemu_mutex_unlock_ramlist();
> +return;
> +}
> +}
> +
>  new_ram_size = MAX(old_ram_size,
>(new_block->offset + new_block->max_length) >> 
> TARGET_PAGE_BITS);
>  if (new_ram_size > old_ram_size) {
> @@ -1885,7 +1895,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, 
> MemoryRegion *mr,
>  
>  /* Just support these ram flags by now. */
>  assert((ram_flags & ~(RAM_SHARED | RAM_PMEM | RAM_NORESERVE |
> -  RAM_PROTECTED | RAM_NAMED_FILE)) == 0);
> +  RAM_PROTECTED | RAM_NAMED_FILE | RAM_KVM_GMEM)) == 
> 0);
>  
>  if (xen_enabled()) {
>  error_setg(errp, "-mem-path not supported with Xen");
> @@ -1920,6 +1930,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, 
> MemoryRegion *mr,
>  new_block->used_length = size;
>  new_block->max_length = size;
>  new_block->flags = ram_flags;
> +new_block->gmem_fd = -1;
>  new_block->host = file_ram_alloc(new_block, size, fd, readonly,
>   !file_size, offset, errp);
>  if (!new_block->host) {
> @@ -1978,7 +1989,7 @@ RAMBlock *qemu_ram_alloc_internal(ram_addr_t size, 
> ram_addr_t max_size,
>  Error *local_err = NULL;
>  
>  assert((ram_flags & ~(RAM_SHARED | RAM_RESIZEABLE | RAM_PREALLOC |
> -  RAM_NORESERVE)) == 0);
> +  RAM_NORESERVE| RAM_KVM_GMEM)) == 0);
>  assert(!host ^ (ram_flags & RAM_PREALLOC));
>  
>

Re: [PATCH for-8.2 v2 2/2] migration: Allow user to specify migration switchover bandwidth

2023-09-05 Thread Wang, Lei
On 9/6/2023 0:46, Peter Xu wrote:
> On Fri, Sep 01, 2023 at 09:37:32AM +0100, Daniel P. Berrangé wrote:
 When the user wants to have migration only use 5Gbps out of that 10Gbps,
 one can set max-bandwidth to 5Gbps, along with max-switchover-bandwidth to
 5Gbps so it'll never use over 5Gbps too (so the user can have the rest
>>>
>>> Hi Peter. I'm curious if we specify max-switchover-bandwidth to 5Gbps over a
>>> 10Gbps network, in the completion stage will it send the remaining data in 
>>> 5Gbps
>>> using downtime_limit time or in 10Gbps (saturate the network) using the
>>> downtime_limit / 2 time? Seems this parameter won't rate limit the final 
>>> stage:)
>>
>> Effectively the mgmt app is telling QEMU to assume that this
>> much bandwidth is available for use during switchover. If QEMU
>> determines that, given this available bandwidth, the remaining
>> data can be sent over the link within the downtime limit, it
>> will perform the switchover. When sending this sitchover data,
>> it will actually transmit the data at full line rate IIUC.
> 
> I'm right at reposting this patch, but then I found that the
> max-available-bandwidth is indeed confusing (as Lei's question shows).
> 
> We do have all the bandwidth throttling values in the pattern of
> max-*-bandwidth and this one will start to be the outlier that it won't
> really throttle the network.
> 
> If the old name "available-bandwidth" is too general, I'm now considering
> "avail-switchover-bandwidth" just to leave max- out of the name to
> differenciate, if some day we want to add a real throttle for switchover we
> can still have a sane name.
> 
> Any objections before I repost?

I'm also OK with it. "avail" has the semantics of a lower bound on the
bandwidth at switchover, so we can promise at least that amount of bandwidth
can be used; it covers both the throttling and non-throttling cases.
"switchover" means this parameter only works in the switchover phase rather
than the bulk stage.
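
For context, the switchover decision this value feeds is essentially a
threshold comparison. A simplified sketch (names approximate; the real
logic lives in the migration thread's counter update):

#include <stdbool.h>
#include <stdint.h>

/*
 * Switch over once the data still pending can plausibly be sent within
 * the allowed downtime at the assumed bandwidth.  With
 * avail-switchover-bandwidth set, 'bandwidth' comes from the user instead
 * of the estimate derived from the observed QEMUFile send speed.
 */
static bool should_switchover(uint64_t pending_bytes,
                              uint64_t bandwidth_bytes_per_ms,
                              uint64_t downtime_limit_ms)
{
    return pending_bytes <= bandwidth_bytes_per_ms * downtime_limit_ms;
}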

> 
> Thanks,
> 



Re: [PATCH for-8.2 v2 2/2] migration: Allow user to specify migration switchover bandwidth

2023-09-01 Thread Wang, Lei
On 8/3/2023 23:53, Peter Xu wrote:
> Migration bandwidth is a very important value to live migration.  It's
> because it's one of the major factors that we'll make decision on when to
> switchover to destination in a precopy process.
> 
> This value is currently estimated by QEMU during the whole live migration
> process by monitoring how fast we were sending the data.  This can be the
> most accurate bandwidth if in the ideal world, where we're always feeding
> unlimited data to the migration channel, and then it'll be limited to the
> bandwidth that is available.
> 
> However in reality it may be very different, e.g., over a 10Gbps network we
> can see query-migrate showing migration bandwidth of only a few tens of
> MB/s just because there are plenty of other things the migration thread
> might be doing.  For example, the migration thread can be busy scanning
> zero pages, or it can be fetching dirty bitmap from other external dirty
> sources (like vhost or KVM).  It means we may not be pushing data as much
> as possible to migration channel, so the bandwidth estimated from "how many
> data we sent in the channel" can be dramatically inaccurate sometimes,
> e.g., that a few tens of MB/s even if 10Gbps available, and then the
> decision to switchover will be further affected by this.
> 
> The migration may not even converge at all with the downtime specified,
> with that wrong estimation of bandwidth.
> 
> The issue is QEMU itself may not be able to avoid those uncertainties on
> measuing the real "available migration bandwidth".  At least not something
> I can think of so far.
> 
> One way to fix this is when the user is fully aware of the available
> bandwidth, then we can allow the user to help providing an accurate value.
> 
> For example, if the user has a dedicated channel of 10Gbps for migration
> for this specific VM, the user can specify this bandwidth so QEMU can
> always do the calculation based on this fact, trusting the user as long as
> specified.
> 
> A new parameter "max-switchover-bandwidth" is introduced just for this. So
> when the user specified this parameter, instead of trusting the estimated
> value from QEMU itself (based on the QEMUFile send speed), let's trust the
> user more by using this value to decide when to switchover, assuming that
> we'll have such bandwidth available then.
> 
> When the user wants to have migration only use 5Gbps out of that 10Gbps,
> one can set max-bandwidth to 5Gbps, along with max-switchover-bandwidth to
> 5Gbps so it'll never use over 5Gbps too (so the user can have the rest

Hi Peter. I'm curious: if we specify max-switchover-bandwidth as 5Gbps over a
10Gbps network, will the completion stage send the remaining data at 5Gbps,
taking the full downtime_limit, or at 10Gbps (saturating the network), taking
only downtime_limit / 2? It seems this parameter won't rate-limit the final
stage :)

> 5Gbps for other things).  So it can be useful even if the network is not
> dedicated, but as long as the user can know a solid value.
> 
> This can resolve issues like "unconvergence migration" which is caused by
> hilarious low "migration bandwidth" detected for whatever reason.
> 
> Reported-by: Zhiyi Guo 
> Signed-off-by: Peter Xu 
> ---
>  qapi/migration.json| 14 +-
>  migration/migration.h  |  2 +-
>  migration/options.h|  1 +
>  migration/migration-hmp-cmds.c | 14 ++
>  migration/migration.c  | 19 +++
>  migration/options.c| 28 
>  migration/trace-events |  2 +-
>  7 files changed, 73 insertions(+), 7 deletions(-)
> 
> diff --git a/qapi/migration.json b/qapi/migration.json
> index bb798f87a5..6a04fb7d36 100644
> --- a/qapi/migration.json
> +++ b/qapi/migration.json
> @@ -759,6 +759,16 @@
>  # @max-bandwidth: to set maximum speed for migration.  maximum speed
>  # in bytes per second.  (Since 2.8)
>  #
> +# @max-switchover-bandwidth: to set available bandwidth for migration.
> +# By default, this value is zero, means the user is not aware of
> +# the available bandwidth that can be used by QEMU migration, so
> +# QEMU will estimate the bandwidth automatically.  This can be set
> +# when the estimated value is not accurate, while the user is able
> +# to guarantee such bandwidth is available for migration purpose
> +# during the migration procedure.  When specified correctly, this
> +# can make the switchover decision much more accurate, which will
> +# also be based on the max downtime specified.  (Since 8.2)
> +#
>  # @downtime-limit: set maximum tolerated downtime for migration.
>  # maximum downtime in milliseconds (Since 2.8)
>  #
> @@ -840,7 +850,7 @@
> 'cpu-throttle-initial', 'cpu-throttle-increment',
> 'cpu-throttle-tailslow',
> 'tls-creds', 'tls-hostname', 'tls-authz', 'max-bandwidth',
> -   'downtime-limit',
> +   

Re: [PATCH 0/7] Add new CPU model EmeraldRapids and GraniteRapids

2023-06-15 Thread Wang, Lei
On 6/16/2023 11:23, Tao Su wrote:
> This patch series mainly updates SapphireRapids CPU model and adds
> new CPU model EmeraldRapids and GraniteRapids.
> 
> Bit 13 (ARCH_CAP_FBSDP_NO), bit 14 (ARCH_CAP_FBSDP_NO) and bit 15

Bit 13 should be MSR_ARCH_CAP_SBDR_SSDP_NO, right?
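
For reference, the bit layout being discussed, matching Linux's
arch/x86/include/asm/msr-index.h (the QEMU-side names are presumably the
ones added by patch 5 of this series):

/* MSR_IA32_ARCH_CAPABILITIES bits 13..15 */
#define MSR_ARCH_CAP_SBDR_SSDP_NO  (1U << 13) /* bit 13: SBDR/SSDP not affected */
#define MSR_ARCH_CAP_FBSDP_NO      (1U << 14) /* bit 14: FBSDP not affected */
#define MSR_ARCH_CAP_PSDP_NO       (1U << 15) /* bit 15: PSDP not affected */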

> (ARCH_CAP_PSDP_NO) of MSR_IA32_ARCH_CAPABILITIES are enumerated starting
> from latest SapphireRapids, which are missed in current SapphireRapids
> CPU model, so add a new version for SapphireRapids CPU model to expose
> these bits.
> 
> Add EmeraldRapids CPU model to this series, since EmeraldRapids also
> enumerates these bits. The original patch of EmeraldRapids CPU model is
> in [1].
> 
> GraniteRapids is Intel's successor to EmeraldRapids, an Intel 3 process
> microarchitecture for enthusiasts and servers, which adds new features
> based on SapphireRapids and EmeraldRapids.
> 
> [1]
> https://lore.kernel.org/qemu-devel/20230515025308.1050277-1-qian@intel.com/
> 
> Lei Wang (1):
>   target/i386: Add few security fix bits in ARCH_CAPABILITIES into
> SapphireRapids CPU model
> 
> Qian Wen (1):
>   target/i386: Add new CPU model EmeraldRapids
> 
> Tao Su (5):
>   target/i386: Add FEAT_7_1_EDX to adjust feature level
>   target/i386: Add support for MCDT_NO in CPUID enumeration
>   target/i386: Allow MCDT_NO if host supports
>   target/i386: Add new bit definitions of MSR_IA32_ARCH_CAPABILITIES
>   target/i386: Add new CPU model GraniteRapids
> 
>  target/i386/cpu.c | 303 +-
>  target/i386/cpu.h |   8 ++
>  target/i386/kvm/kvm.c |   5 +
>  3 files changed, 314 insertions(+), 2 deletions(-)
> 
> 
> base-commit: 7efd65423ab22e6f5890ca08ae40c84d6660242f



Re: [PATCH v2 4/4] migration/calc-dirty-rate: tool to predict migration time

2023-05-29 Thread Wang, Lei
On 4/27/2023 20:43, Andrei Gudkov via wrote:
> Signed-off-by: Andrei Gudkov 
> ---
>  MAINTAINERS  |   1 +
>  scripts/predict_migration.py | 283 +++
>  2 files changed, 284 insertions(+)
>  create mode 100644 scripts/predict_migration.py
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index fc225e66df..0c578446cf 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -3167,6 +3167,7 @@ F: docs/devel/migration.rst
>  F: qapi/migration.json
>  F: tests/migration/
>  F: util/userfaultfd.c
> +F: scripts/predict_migration.py
>  
>  D-Bus
>  M: Marc-André Lureau 
> diff --git a/scripts/predict_migration.py b/scripts/predict_migration.py
> new file mode 100644
> index 00..c92a97585f
> --- /dev/null
> +++ b/scripts/predict_migration.py
> @@ -0,0 +1,283 @@
> +#!/usr/bin/env python3
> +#
> +# Predicts time required to migrate VM under given max downtime constraint.
> +#
> +# Copyright (c) 2023 HUAWEI TECHNOLOGIES CO.,LTD.
> +#
> +# Authors:
> +#  Andrei Gudkov 
> +#
> +# This work is licensed under the terms of the GNU GPL, version 2 or
> +# later.  See the COPYING file in the top-level directory.
> +
> +
> +# Usage:
> +#
> +# Step 1. Collect dirty page statistics from live VM:
> +# $ scripts/predict_migration.py calc-dirty-rate <host> <port> >dirty.json
> +# <...takes 1 minute by default...>
> +#
> +# Step 2. Run predictor against collected data:
> +# $ scripts/predict_migration.py predict < dirty.json
> +# Downtime> |    125ms |    250ms |    500ms |   1000ms |   5000ms |    unlim |
> +# --------------------------------------------------------------------------
> +#  100 Mbps |        - |        - |        - |        - |        - |   16m45s |
> +#    1 Gbps |        - |        - |        - |        - |        - |    1m39s |
> +#    2 Gbps |        - |        - |        - |        - |    1m55s |      50s |
> +#  2.5 Gbps |        - |        - |        - |        - |    1m12s |      40s |
> +#    5 Gbps |        - |        - |        - |      29s |      25s |      20s |
> +#   10 Gbps |      13s |      13s |      12s |      12s |      12s |      10s |
> +#   25 Gbps |       5s |       5s |       5s |       5s |       4s |       4s |
> +#   40 Gbps |       3s |       3s |       3s |       3s |       3s |       3s |
> +#
> +# The latter prints table that lists estimated time it will take to migrate 
> VM.
> +# This time depends on the network bandwidth and max allowed downtime.
> +# Dash indicates that migration does not converge.
> +# Prediction takes care only about migrating RAM and only in pre-copy mode.
> +# Other features, such as compression or local disk migration, are not 
> supported
> +
> +
> +import sys
> +import os
> +import math
> +import json
> +from dataclasses import dataclass
> +import asyncio
> +import argparse
> +
> +sys.path.append(os.path.join(os.path.dirname(__file__), '..', 'python'))
> +from qemu.qmp import QMPClient
> +
> +async def calc_dirty_rate(host, port, calc_time, sample_pages):
> +client = QMPClient()
> +try:
> +await client.connect((host, port))
> +args = {
> +'calc-time': calc_time,
> +'sample-pages': sample_pages
> +}
> +await client.execute('calc-dirty-rate', args)
> +await asyncio.sleep(calc_time)
> +while True:
> +data = await client.execute('query-dirty-rate')
> +if data['status'] == 'measuring':
> +await asyncio.sleep(0.5)
> +elif data['status'] == 'measured':
> +return data
> +else:
> +raise ValueError(data['status'])
> +finally:
> +await client.disconnect()
> +
> +
> +class MemoryModel:
> +"""
> +Models RAM state during pre-copy migration using calc-dirty-rate results.
> +Its primary function is to estimate how many pages will be dirtied
> +after given time starting from "clean" state.
> +This function is non-linear and saturates at some point.
> +"""
> +
> +@dataclass
> +class Point:
> +period_millis:float
> +dirty_pages:float
> +
> +def __init__(self, data):
> +"""
> +:param data: dictionary returned by calc-dirty-rate
> +"""
> +self.__points = self.__make_points(data)
> +self.__page_size = data['page-size']
> +self.__num_total_pages = data['n-total-pages']
> +self.__num_zero_pages = data['n-zero-pages'] / \
> +(data['n-sampled-pages'] / data['n-total-pages'])
> +
> +def __make_points(self, data):
> +points = list()
> +
> +# Add observed points
> +sample_ratio = data['n-sampled-pages'] / data['n-total-pages']
> +for millis,dirty_pages in zip(data['periods'], 
> data['n-dirty-pages']):
> +millis = float(millis)
> +dirty_pages = dirty_pages / sample_ratio
> +points.append(MemoryModel.Point(millis, dirty_pages))
> +
> +  

Re: [PATCH v2 2/4] migration/calc-dirty-rate: detailed stats in sampling mode

2023-05-29 Thread Wang, Lei
On 4/27/2023 20:42, Andrei Gudkov via wrote:
> Collect number of dirty pages for progresseively increasing time
> periods starting with 125ms up to number of seconds specified with
> calc-dirty-rate. Report through qmp and hmp: 1) vector of dirty page
> measurements, 2) page size, 3) total number of VM pages, 4) number
> of sampled pages.
> 
> Signed-off-by: Andrei Gudkov 
> ---
>  migration/dirtyrate.c | 165 +-
>  migration/dirtyrate.h |  25 ++-
>  qapi/migration.json   |  24 +-
>  3 files changed, 160 insertions(+), 54 deletions(-)
> 
> diff --git a/migration/dirtyrate.c b/migration/dirtyrate.c
> index acba3213a3..4491bbe91a 100644
> --- a/migration/dirtyrate.c
> +++ b/migration/dirtyrate.c
> @@ -224,6 +224,7 @@ static struct DirtyRateInfo *query_dirty_rate_info(void)
>  info->calc_time = DirtyStat.calc_time;
>  info->sample_pages = DirtyStat.sample_pages;
>  info->mode = dirtyrate_mode;
> +info->page_size = TARGET_PAGE_SIZE;
>  
>  if (qatomic_read(&CalculatingState) == DIRTY_RATE_STATUS_MEASURED) {
>  info->has_dirty_rate = true;
> @@ -245,6 +246,29 @@ static struct DirtyRateInfo *query_dirty_rate_info(void)
>  info->vcpu_dirty_rate = head;
>  }
>  
> +if (dirtyrate_mode == DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING) {
> +int64List *periods_head = NULL;
> +int64List **periods_tail = &periods_head;
> +int64List *n_dirty_pages_head = NULL;
> +int64List **n_dirty_pages_tail = &n_dirty_pages_head;
> +
> +info->n_total_pages = DirtyStat.page_sampling.n_total_pages;
> +info->has_n_total_pages = true;
> +
> +info->n_sampled_pages = DirtyStat.page_sampling.n_sampled_pages;
> +info->has_n_sampled_pages = true;
> +
> +for (i = 0; i < DirtyStat.page_sampling.n_readings; i++) {
> +DirtyReading *dr = &DirtyStat.page_sampling.readings[i];
> +QAPI_LIST_APPEND(periods_tail, dr->period);
> +QAPI_LIST_APPEND(n_dirty_pages_tail, dr->n_dirty_pages);
> +}
> +info->n_dirty_pages = n_dirty_pages_head;
> +info->periods = periods_head;
> +info->has_n_dirty_pages = true;
> +info->has_periods = true;
> +}
> +
>  if (dirtyrate_mode == DIRTY_RATE_MEASURE_MODE_DIRTY_BITMAP) {
>  info->sample_pages = 0;
>  }
> @@ -265,9 +289,11 @@ static void init_dirtyrate_stat(int64_t start_time,
>  
>  switch (config.mode) {
>  case DIRTY_RATE_MEASURE_MODE_PAGE_SAMPLING:
> -DirtyStat.page_sampling.total_dirty_samples = 0;
> -DirtyStat.page_sampling.total_sample_count = 0;
> -DirtyStat.page_sampling.total_block_mem_MB = 0;
> +DirtyStat.page_sampling.n_total_pages = 0;
> +DirtyStat.page_sampling.n_sampled_pages = 0;
> +DirtyStat.page_sampling.n_readings = 0;
> +DirtyStat.page_sampling.readings = 
> g_try_malloc0_n(MAX_DIRTY_READINGS,
> +  
> sizeof(DirtyReading));
>  break;
>  case DIRTY_RATE_MEASURE_MODE_DIRTY_RING:
>  DirtyStat.dirty_ring.nvcpu = -1;
> @@ -285,28 +311,10 @@ static void cleanup_dirtyrate_stat(struct 
> DirtyRateConfig config)
>  free(DirtyStat.dirty_ring.rates);
>  DirtyStat.dirty_ring.rates = NULL;
>  }
> -}
> -
> -static void update_dirtyrate_stat(struct RamblockDirtyInfo *info)
> -{
> -DirtyStat.page_sampling.total_dirty_samples += info->sample_dirty_count;
> -DirtyStat.page_sampling.total_sample_count += info->sample_pages_count;
> -/* size of total pages in MB */
> -DirtyStat.page_sampling.total_block_mem_MB += (info->ramblock_pages *
> -   TARGET_PAGE_SIZE) >> 20;
> -}
> -
> -static void update_dirtyrate(uint64_t msec)
> -{
> -uint64_t dirtyrate;
> -uint64_t total_dirty_samples = 
> DirtyStat.page_sampling.total_dirty_samples;
> -uint64_t total_sample_count = DirtyStat.page_sampling.total_sample_count;
> -uint64_t total_block_mem_MB = DirtyStat.page_sampling.total_block_mem_MB;
> -
> -dirtyrate = total_dirty_samples * total_block_mem_MB *
> -1000 / (total_sample_count * msec);
> -
> -DirtyStat.dirty_rate = dirtyrate;
> +if (DirtyStat.page_sampling.readings) {
> +free(DirtyStat.page_sampling.readings);
> +DirtyStat.page_sampling.readings = NULL;
> +}
>  }
>  
>  /*
> @@ -377,12 +385,14 @@ static bool save_ramblock_hash(struct RamblockDirtyInfo 
> *info)
>  return false;
>  }
>  
> -rand  = g_rand_new();
> +rand = g_rand_new();
> +DirtyStat.page_sampling.n_total_pages += info->ramblock_pages;
>  for (i = 0; i < sample_pages_count; i++) {
>  info->sample_page_vfn[i] = g_rand_int_range(rand, 0,
>  info->ramblock_pages - 
> 1);
>  

Re: [PATCH] multifd: Set a higher "backlog" default value for listen()

2023-05-18 Thread Wang, Lei
On 5/19/2023 10:44, Wang, Wei W wrote:
> On Friday, May 19, 2023 9:31 AM, Wang, Lei4 wrote:
>> On 5/18/2023 17:16, Juan Quintela wrote:
>>> Lei Wang  wrote:
 When destination VM is launched, the "backlog" parameter for listen()
 is set to 1 as default in socket_start_incoming_migration_internal(),
 which will lead to socket connection error (the queue of pending
 connections is full) when "multifd" and "multifd-channels" are set
 later on and a high number of channels are used. Set it to a
 hard-coded higher default value 512 to fix this issue.

 Reported-by: Wei Wang 
 Signed-off-by: Lei Wang 
>>>
>>> [cc'd daiel who is the maintainer of qio]
>>>
>>> My understanding of that value is that 230 or something like that
>>> would be more than enough.  The maxiimum number of multifd channels is
>> 256.
>>
>> You are right, the "multifd-channels" expects uint8_t, so 256 is enough.
>>
> 
> We can change it to uint16_t or uint32_t, but need to see if listening on a 
> larger
> value is OK to everyone.

Is there any use case for more than 256 migration channels? If not, then I
suppose there's no need to increase it.

> 
> Man page of listen mentions that the  maximum length of the queue for
> incomplete sockets can be set using /proc/sys/net/ipv4/tcp_max_syn_backlog,
> and it is 4096 by default on my machine
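
To make the failure mode concrete: with a backlog of 1, a burst of
simultaneous multifd connect()s can overflow the accept queue before the
main loop gets to accept(). A minimal listener sketch (plain POSIX
sockets, not the QIO code):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdint.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

/* Returns a listening fd, or -1 on error.  'backlog' bounds the queue of
 * connections that completed the handshake but were not yet accept()ed;
 * sizing it to the maximum multifd channel count avoids dropped channels. */
static int listen_for_migration(uint16_t port, int backlog)
{
    int fd = socket(AF_INET, SOCK_STREAM, 0);
    struct sockaddr_in addr;

    if (fd < 0) {
        return -1;
    }
    memset(&addr, 0, sizeof(addr));
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons(port);
    if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(fd, backlog) < 0) {
        close(fd);
        return -1;
    }
    return fd;
}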



Re: [PATCH] multifd: Set a higher "backlog" default value for listen()

2023-05-18 Thread Wang, Lei
On 5/18/2023 17:16, Juan Quintela wrote:
> Lei Wang  wrote:
>> When destination VM is launched, the "backlog" parameter for listen() is set
>> to 1 as default in socket_start_incoming_migration_internal(), which will
>> lead to socket connection error (the queue of pending connections is full)
>> when "multifd" and "multifd-channels" are set later on and a high number of
>> channels are used. Set it to a hard-coded higher default value 512 to fix
>> this issue.
>>
>> Reported-by: Wei Wang 
>> Signed-off-by: Lei Wang 
> 
> [cc'd daiel who is the maintainer of qio]
> 
> My understanding of that value is that 230 or something like that would
> be more than enough.  The maxiimum number of multifd channels is 256.

You are right, the "multifd-channels" expects uint8_t, so 256 is enough.

> 
> Daniel, any opinion?
> 
> Later, Juan.
> 
>> ---
>>  migration/socket.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/migration/socket.c b/migration/socket.c
>> index 1b6f5baefb..b43a66ef7e 100644
>> --- a/migration/socket.c
>> +++ b/migration/socket.c
>> @@ -179,7 +179,7 @@ socket_start_incoming_migration_internal(SocketAddress 
>> *saddr,
>>  QIONetListener *listener = qio_net_listener_new();
>>  MigrationIncomingState *mis = migration_incoming_get_current();
>>  size_t i;
>> -int num = 1;
>> +int num = 512;
>>  
>>  qio_net_listener_set_name(listener, "migration-socket-listener");
> 



Re: [PATCH] docs: build-system: rename "default-configs" to "configs"

2023-05-12 Thread Wang, Lei
There are also several occurrences of "default-configs" in
docs/devel/kconfig.rst; I think they could also be updated in this patch.

BR,
Lei

On 10/4/2021 15:12, Kashyap Chamarthy wrote:
> Commit 812b31d3f9 (configs: rename default-configs to configs and
> reorganise, 2021-07-07) did the rename.
> 
> Reflect that update also in the documentation.
> 
> Signed-off-by: Kashyap Chamarthy 
> ---
>  docs/devel/build-system.rst | 12 ++--
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/docs/devel/build-system.rst b/docs/devel/build-system.rst
> index 0f636d620e..bfa5250802 100644
> --- a/docs/devel/build-system.rst
> +++ b/docs/devel/build-system.rst
> @@ -250,7 +250,7 @@ Target-dependent emulator sourcesets:
>Each emulator also includes sources for files in the ``hw/`` and 
> ``target/``
>subdirectories.  The subdirectory used for each emulator comes
>from the target's definition of ``TARGET_BASE_ARCH`` or (if missing)
> -  ``TARGET_ARCH``, as found in ``default-configs/targets/*.mak``.
> +  ``TARGET_ARCH``, as found in ``configs/targets/*.mak``.
>  
>Each subdirectory in ``hw/`` adds one sourceset to the ``hw_arch`` 
> dictionary,
>for example::
> @@ -307,8 +307,8 @@ Utility sourcesets:
>  The following files concur in the definition of which files are linked
>  into each emulator:
>  
> -``default-configs/devices/*.mak``
> -  The files under ``default-configs/devices/`` control the boards and devices
> +``configs/devices/*.mak``
> +  The files under ``configs/devices/`` control the boards and devices
>that are built into each QEMU system emulation targets. They merely contain
>a list of config variable definitions such as::
>  
> @@ -317,11 +317,11 @@ into each emulator:
>  CONFIG_XLNX_VERSAL=y
>  
>  ``*/Kconfig``
> -  These files are processed together with ``default-configs/devices/*.mak`` 
> and
> +  These files are processed together with ``configs/devices/*.mak`` and
>describe the dependencies between various features, subsystems and
>device models.  They are described in :ref:`kconfig`
>  
> -``default-configs/targets/*.mak``
> +``configs/targets/*.mak``
>These files mostly define symbols that appear in the ``*-config-target.h``
>file for each emulator [#cfgtarget]_.  However, the ``TARGET_ARCH``
>and ``TARGET_BASE_ARCH`` will also be used to select the ``hw/`` and
> @@ -460,7 +460,7 @@ Built by Meson:
>TARGET-NAME is again the name of a system or userspace emulator. The
>config-devices.mak file is automatically generated by make using the
>scripts/make_device_config.sh program, feeding it the
> -  default-configs/$TARGET-NAME file as input.
> +  configs/$TARGET-NAME file as input.
>  
>  ``config-host.h``, ``$TARGET-NAME/config-target.h``, 
> ``$TARGET-NAME/config-devices.h``
>These files are used by source code to determine what features



Re: [PATCH v3 6/8] target/i386/intel-pt: Enable host pass through of Intel PT

2023-02-20 Thread Wang, Lei


On 12/8/2022 2:25 PM, Xiaoyao Li wrote:
> commit e37a5c7fa459 ("i386: Add Intel Processor Trace feature support")
> added the support of Intel PT by making CPUID[14] of PT as fixed feature
> set (from ICX) for any CPU model on any host. This truly breaks the PT
> exposure on Intel SPR platform because SPR has less supported bitmap of
> CPUID(0x14,1):EBX[15:0] than ICX.
> 
> To fix the problem, enable pass through of host's PT capabilities for
> the cases "-cpu host/max" that it won't use default fixed PT feature set
> of ICX but expand automatically based on get_supported_cpuid reported by
> host. Meanwhile, it needs to ensure named CPU model still has the fixed
> PT feature set to not break the live migration case of
> "-cpu named_cpu_model,+intel-pt"
> 
> Introduces env->use_default_intel_pt flag.
>  - True means it's old CPU model that uses fixed PT feature set of ICX.
>  - False means the named CPU model has its own PT feature set.
> 
> Besides, to keep the same behavior for old CPU models that validate PT
> feature set against default fixed PT feature set of ICX in addition to
> validate from host's capabilities (via get_supported_cpuid) in
> x86_cpu_filter_features().
> 
> In the future, new named CPU model, e.g., Sapphire Rapids, can define
> its own PT feature set by setting @has_specific_intel_pt_feature_set to
> true and defines it's own FEAT_14_0_EBX, FEAT_14_0_ECX, FEAT_14_1_EAX
> and FEAT_14_1_EBX.
> 
> Signed-off-by: Xiaoyao Li 
> ---
>  target/i386/cpu.c | 71 ++-
>  target/i386/cpu.h |  1 +
>  2 files changed, 40 insertions(+), 32 deletions(-)
> 
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index e302cbbebfc5..24f3c7b06698 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -5194,6 +5194,21 @@ static void x86_cpu_load_model(X86CPU *cpu, 
> X86CPUModel *model)
>  env->features[w] = def->features[w];
>  }
>  
> +/*
> + * All (old) named CPU models have the same default values for INTEL_PT_*
> + *
> + * Assign the default value here since we don't want to manually 
> copy/paste
> + * it to all entries in builtin_x86_defs.
> + */
> +if (!env->features[FEAT_14_0_EBX] && !env->features[FEAT_14_0_ECX] &&
> +!env->features[FEAT_14_1_EAX] && !env->features[FEAT_14_1_EBX]) {
> +env->use_default_intel_pt = true;
> +env->features[FEAT_14_0_EBX] = INTEL_PT_DEFAULT_0_EBX;
> +env->features[FEAT_14_0_ECX] = INTEL_PT_DEFAULT_0_ECX;
> +env->features[FEAT_14_1_EAX] = INTEL_PT_DEFAULT_1_EAX;
> +env->features[FEAT_14_1_EBX] = INTEL_PT_DEFAULT_1_EBX;
> +}
> +
>  /* legacy-cache defaults to 'off' if CPU model provides cache info */
>  cpu->legacy_cache = !def->cache_info;
>  
> @@ -5716,14 +5731,11 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
> uint32_t count,
>  
>  if (count == 0) {
>  *eax = INTEL_PT_MAX_SUBLEAF;
> -*ebx = INTEL_PT_DEFAULT_0_EBX;
> -*ecx = INTEL_PT_DEFAULT_0_ECX;
> -if (env->features[FEAT_14_0_ECX] & CPUID_14_0_ECX_LIP) {
> -*ecx |= CPUID_14_0_ECX_LIP;
> -}
> +*ebx = env->features[FEAT_14_0_EBX];
> +*ecx = env->features[FEAT_14_0_ECX];
>  } else if (count == 1) {
> -*eax = INTEL_PT_DEFAULT_1_EAX;
> -*ebx = INTEL_PT_DEFAULT_1_EBX;
> +*eax = env->features[FEAT_14_1_EAX];
> +*ebx = env->features[FEAT_14_1_EBX];
>  }
>  break;
>  }
> @@ -6425,6 +6437,7 @@ static void x86_cpu_filter_features(X86CPU *cpu, bool 
> verbose)
>  CPUX86State *env = >env;
>  FeatureWord w;
>  const char *prefix = NULL;
> +uint64_t host_feat;
>  
>  if (verbose) {
>  prefix = accel_uses_host_cpuid()
> @@ -6433,8 +6446,7 @@ static void x86_cpu_filter_features(X86CPU *cpu, bool 
> verbose)
>  }
>  
>  for (w = 0; w < FEATURE_WORDS; w++) {
> -uint64_t host_feat =
> -x86_cpu_get_supported_feature_word(w, false);
> +host_feat = x86_cpu_get_supported_feature_word(w, false);
>  uint64_t requested_features = env->features[w];
>  uint64_t unavailable_features;
>  
> @@ -6458,31 +6470,26 @@ static void x86_cpu_filter_features(X86CPU *cpu, bool 
> verbose)
>  mark_unavailable_features(cpu, w, unavailable_features, prefix);
>  }
>  
> -if ((env->features[FEAT_7_0_EBX] & CPUID_7_0_EBX_INTEL_PT) &&
> -kvm_enabled()) {
> -KVMState *s = CPU(cpu)->kvm_state;
> -uint32_t eax_0 = kvm_arch_get_supported_cpuid(s, 0x14, 0, R_EAX);
> -uint32_t ebx_0 = kvm_arch_get_supported_cpuid(s, 0x14, 0, R_EBX);
> -uint32_t ecx_0 = kvm_arch_get_supported_cpuid(s, 0x14, 0, R_ECX);
> -uint32_t eax_1 = kvm_arch_get_supported_cpuid(s, 0x14, 1, R_EAX);
> -uint32_t ebx_1 = kvm_arch_get_supported_cpuid(s, 0x14, 1, R_EBX);
> -
> -if (!eax_0 ||
> -  

Re: [PATCH v3 4/6] i386: Mask and report unavailable multi-bit feature values

2023-02-08 Thread Wang, Lei
On 2/9/2023 1:59 PM, Xiaoyao Li wrote:
> On 2/9/2023 12:21 PM, Wang, Lei wrote:
>>>>>> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
>>>>>> index 88aa780566..e638a31d34 100644
>>>>>> --- a/target/i386/cpu.c
>>>>>> +++ b/target/i386/cpu.c
>>>>>> @@ -4377,6 +4377,28 @@ static bool x86_cpu_have_filtered_features(X86CPU
>>>>>> *cpu)
>>>>>>    return false;
>>>>>>    }
>>>>>>
>>>>>> +static void mark_unavailable_multi_bit_default(X86CPU *cpu, FeatureWord 
>>>>>> w,
>>>>>> +   int index,
>>>>>> +   const char 
>>>>>> *verbose_prefix)
>>>>>> +{
>>>>>> +    FeatureWordInfo *f = &feature_word_info[w];
>>>>>> +    g_autofree char *feat_word_str = feature_word_description(f);
>>>>>> +    uint64_t host_feat = x86_cpu_get_supported_feature_word(w, false);
>>>>>> +    MultiBitFeatureInfo mf = f->multi_bit_features[index];
>>>>>> +
>>>>>> +    if ((cpu->env.features[w] & mf.mask) &&
>>>>> With this checking bits are all 0 but covered by mf.mask's range are 
>>>>> skippped,
>>>>> even if they're different from the host_feat, please check whether it's
>>>>> desried
>>>>> behavior.
>>>> This is the intended design because there are quite a number of multi-bit
>>>> CPUIDs
>>>> should support passing all 0 to them.
>>> you didn't answer the question. The question is why the handling can be 
>>> skipped
>>> when the value of multi-bit feature is 0.
>> I think the default function should handle the most common case, which is,
>> passing all 0 multi-bits to KVM is valid and shouldn't be warned. E.g, when 
>> we
>> are using some earlier CPU models which doesn't support AMX, we shouldn't be
>> warned that all 0 values don't match the CPUID reported by KVM.
>>
> 
> passing value 0 to KVM is valid, is not the reason why the handling can be 
> skipped.
> 
> The correct reason is that when value is 0, it means the multi-bit feature is
> not enabled/requested. It's always legal and doesn't need any handling. So it
> can be just skipped.

Currently, we can say yes, but I wonder whether there will be a multi-bit
field in the future that only accepts a specific non-zero value.



Re: [PATCH v3 4/6] i386: Mask and report unavailable multi-bit feature values

2023-02-08 Thread Wang, Lei
On 2/9/2023 11:29 AM, Xiaoyao Li wrote:
> On 2/9/2023 9:04 AM, Wang, Lei wrote:
>> On 2/6/2023 3:43 PM, Yuan Yao wrote:
>>> On Fri, Jan 06, 2023 at 12:38:24AM -0800, Lei Wang wrote:
>>>> Some feature words, e.g., feature words in AMX-related CPUID leaf 0x1D and
>>>> 0x1E are not bit-wise but multiple bits represents one value. Handle this
>>>> situation when the values specified are not the same as which are reported
>>>> by KVM. The handling includes:
>>>>
>>>>   - The responsibility of masking bits and giving warnings are delegated to
>>>>     the feature enabler. A framework is also provided to enable this.
>>>>   - To simplify the initialization, a default function is provided if the
>>>>     the function is not specified.
> 
> What's the behavior of default ? you need to call out it clearly.
> 
>>>> The reason why delegating this responsibility rather than just marking
>>>> them as zeros when they are not same is because different multi-bit
>>>> features may have different logic, which is case by case, for example:
>>>>
>>>>   1. CPUID.0x14_0x1:EBX[15:0]. Even though it's multi-bits field, it's a
>>>>  bitmap and each bit represents a separate capability.
>>>>
>>>>   2. CPUID.0x14_0x1:EAX[2:0] represents the number of configurable Address
>>>>  Ranges. 3 bits as a whole to represent a integer value. It means the
>>>>  maximum capability of HW. If KVM reports M, then M to 0 is legal
>>>>  value to configure (because KVM can emulate each value correctly).
>>>>
>>>>   3. CPUID.0x1D_0x1:EAX[31:16] represents palette 1 bytes_per_tile. 16 bits
>>>>  as a whole represent an integer value. It's not like case 2 and SW
>>>>  needs to configure the same value as reported. Because it's not
>>>>  possible for SW to configure to a different value and KVM cannot
>>>>  emulate it.
>>>>
>>>> So marking them blindly as zeros is incorrect, and delegating this
>>>> responsibility can let each multi-bit feature have its own way to mask 
>>>> bits.
> 
> you can first describe there are such 3 cases of multi-bit features and they
> need different handling for checking whether configured value is supported by
> KVM or not. So introduce a handling callback function that each multi-bit
> feature can implement their own. Meanwhile, provide a default handling 
> callback
> that handles case x: when configured value is not the same as the one reported
> by KVM, clearing it to zero to mark it as unavailable.

OK, will rephrase the commit message.

> 
>>>> Signed-off-by: Lei Wang 
>>>> ---
>>>>   target/i386/cpu-internal.h |  2 ++
>>>>   target/i386/cpu.c  | 36 
>>>>   2 files changed, 38 insertions(+)
>>>>
>>>> diff --git a/target/i386/cpu-internal.h b/target/i386/cpu-internal.h
>>>> index 66b3d66cb4..83c7b53926 100644
>>>> --- a/target/i386/cpu-internal.h
>>>> +++ b/target/i386/cpu-internal.h
>>>> @@ -30,6 +30,8 @@ typedef struct MultiBitFeatureInfo {
>>>>   uint64_t mask;
>>>>   unsigned high_bit_position;
>>>>   unsigned low_bit_position;
>>>> +    void (*mark_unavailable_multi_bit)(X86CPU *cpu, FeatureWord w, int 
>>>> index,
>>>> +   const char *verbose_prefix);
>>>>   } MultiBitFeatureInfo;
>>>>
>>>>   typedef struct FeatureWordInfo {
>>>> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
>>>> index 88aa780566..e638a31d34 100644
>>>> --- a/target/i386/cpu.c
>>>> +++ b/target/i386/cpu.c
>>>> @@ -4377,6 +4377,28 @@ static bool x86_cpu_have_filtered_features(X86CPU 
>>>> *cpu)
>>>>   return false;
>>>>   }
>>>>
>>>> +static void mark_unavailable_multi_bit_default(X86CPU *cpu, FeatureWord w,
>>>> +   int index,
>>>> +   const char *verbose_prefix)
>>>> +{
>>>> +    FeatureWordInfo *f = &feature_word_info[w];
>>>> +    g_autofree char *feat_word_str = feature_word_description(f);
>>>> +    uint64_t host_feat = x86_cpu_get_supported_feature_word(w, false);
>>>> +    MultiBitFeatureInfo mf = f->multi_bit_features[index];
>>>> +
>>>> +    if ((cpu->env

Re: [PATCH v3 4/6] i386: Mask and report unavailable multi-bit feature values

2023-02-08 Thread Wang, Lei
On 2/6/2023 3:43 PM, Yuan Yao wrote:
> On Fri, Jan 06, 2023 at 12:38:24AM -0800, Lei Wang wrote:
>> Some feature words, e.g., feature words in AMX-related CPUID leaf 0x1D and
>> 0x1E are not bit-wise but multiple bits represents one value. Handle this
>> situation when the values specified are not the same as which are reported
>> by KVM. The handling includes:
>>
>>  - The responsibility of masking bits and giving warnings are delegated to
>>the feature enabler. A framework is also provided to enable this.
>>  - To simplify the initialization, a default function is provided if the
>>the function is not specified.
>>
>> The reason why delegating this responsibility rather than just marking
>> them as zeros when they are not same is because different multi-bit
>> features may have different logic, which is case by case, for example:
>>
>>  1. CPUID.0x14_0x1:EBX[15:0]. Even though it's multi-bits field, it's a
>> bitmap and each bit represents a separate capability.
>>
>>  2. CPUID.0x14_0x1:EAX[2:0] represents the number of configurable Address
>> Ranges. 3 bits as a whole to represent a integer value. It means the
>> maximum capability of HW. If KVM reports M, then M to 0 is legal
>> value to configure (because KVM can emulate each value correctly).
>>
>>  3. CPUID.0x1D_0x1:EAX[31:16] represents palette 1 bytes_per_tile. 16 bits
>> as a whole represent an integer value. It's not like case 2 and SW
>> needs to configure the same value as reported. Because it's not
>> possible for SW to configure to a different value and KVM cannot
>> emulate it.
>>
>> So marking them blindly as zeros is incorrect, and delegating this
>> responsibility can let each multi-bit feature have its own way to mask bits.
>>
>> Signed-off-by: Lei Wang 
>> ---
>>  target/i386/cpu-internal.h |  2 ++
>>  target/i386/cpu.c  | 36 
>>  2 files changed, 38 insertions(+)
>>
>> diff --git a/target/i386/cpu-internal.h b/target/i386/cpu-internal.h
>> index 66b3d66cb4..83c7b53926 100644
>> --- a/target/i386/cpu-internal.h
>> +++ b/target/i386/cpu-internal.h
>> @@ -30,6 +30,8 @@ typedef struct MultiBitFeatureInfo {
>>  uint64_t mask;
>>  unsigned high_bit_position;
>>  unsigned low_bit_position;
>> +void (*mark_unavailable_multi_bit)(X86CPU *cpu, FeatureWord w, int 
>> index,
>> +   const char *verbose_prefix);
>>  } MultiBitFeatureInfo;
>>
>>  typedef struct FeatureWordInfo {
>> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
>> index 88aa780566..e638a31d34 100644
>> --- a/target/i386/cpu.c
>> +++ b/target/i386/cpu.c
>> @@ -4377,6 +4377,28 @@ static bool x86_cpu_have_filtered_features(X86CPU 
>> *cpu)
>>  return false;
>>  }
>>
>> +static void mark_unavailable_multi_bit_default(X86CPU *cpu, FeatureWord w,
>> +   int index,
>> +   const char *verbose_prefix)
>> +{
>> +FeatureWordInfo *f = &feature_word_info[w];
>> +g_autofree char *feat_word_str = feature_word_description(f);
>> +uint64_t host_feat = x86_cpu_get_supported_feature_word(w, false);
>> +MultiBitFeatureInfo mf = f->multi_bit_features[index];
>> +
>> +if ((cpu->env.features[w] & mf.mask) &&
> 
> With this checking bits are all 0 but covered by mf.mask's range are skippped,
> even if they're different from the host_feat, please check whether it's 
> desried
> behavior.

This is the intended design because quite a number of multi-bit CPUID fields
should support passing all zeros.

>> +((cpu->env.features[w] ^ host_feat) & mf.mask)) {
>> +if (!cpu->force_features) {
>> +cpu->env.features[w] &= ~mf.mask;
>> +}
>> +cpu->filtered_features[w] |= mf.mask;
>> +if (verbose_prefix)
>> +warn_report("%s: %s.%s [%u:%u]", verbose_prefix, feat_word_str,
>> +mf.feat_name, mf.high_bit_position,
>> +mf.low_bit_position);
>> +}
>> +}
>> +
>>  static void mark_unavailable_features(X86CPU *cpu, FeatureWord w, uint64_t 
>> mask,
>>const char *verbose_prefix)
>>  {
>> @@ -6442,6 +6464,20 @@ static void x86_cpu_filter_features(X86CPU *cpu, bool 
>> verbose)
>>  x86_cpu_get_supported_feature_word(w, false);
>>  uint64_t requested_features = env->features[w];
>>  uint64_t unavailable_features = requested_features & ~host_feat;
>> +FeatureWordInfo f = feature_word_info[w];
>> +int i;
>> +
>> +for (i = 0; i < f.num_multi_bit_features; i++) {
>> +MultiBitFeatureInfo mf = f.multi_bit_features[i];
>> +if (mf.mark_unavailable_multi_bit) {
>> +mf.mark_unavailable_multi_bit(cpu, w, i, prefix);
>> +} else {
>> +mark_unavailable_multi_bit_default(cpu, w, i, prefix);
>> +}
>> +
>> +

Re: [PATCH v3 0/6] Support for new CPU model SapphireRapids

2023-02-06 Thread Wang, Lei
On 2/2/2023 7:05 PM, Igor Mammedov wrote:
> On Fri,  6 Jan 2023 00:38:20 -0800
> Lei Wang  wrote:
> 
>> This series aims to add a new CPU model SapphireRapids, and tries to
>> address the problem stated in
>> https://lore.kernel.org/all/20220812055751.14553-1-lei4.w...@intel.com/T/#mcf67dbd1ad37c65d7988c36a2b267be9afd2fb30,
>> so that named CPU model can define its own AMX values, and QEMU won't
>> pass the wrong AMX values to KVM in future platforms if they have
>> different values supported.
>>
>> The original patch is
>> https://lore.kernel.org/all/20220812055751.14553-1-lei4.w...@intel.com/T/#u.
> 
> MultiBitFeatureInfo looks like an interesting
> idea but among fixing whatever issues this has atm,
> you'd probably need to introduce a new  qdev_bitfield property
> infrastructure so that such features could be treated like any
> other qdev properties.
> Also when MultiBitFeatureInfo is added, one should convert all
> other usecases where it's applicable (not only for new code)
> so that we would end up with consolidated approach instead of
> zoo mess.
> 
> I'm not sure all MultiBitFeatureInfo complexity is necessary
> just for adding a new CPU model, I'd rather use ad hoc approach
> that we were using before for non boolean features.

Hi, Igor. I do not quite understand what the "ad hoc approach" means.
Currently, if we specify to a CPU model a multi-bit, non-boolean CPUID value
that differs from the host value, consider the following scenario:

- KVM **ONLY** supports value 5 (101) and,
- QEMU user want to pass value 3 (011) to it,

and follow the current logic:

uint64_t unavailable_features = requested_features & ~host_feat;

then:

1. The warning message will be messy and not intuitive:

requested_features bit 1 is 1 while host_feat bit 1 is 0, so it will warn
about this bit, which is meaningless in isolation for a multi-bit field.


2. Some CPUID bits will "leak" into the final CPUID passed to KVM:

requested_features bit 0 is 1 and host_feat bit 0 is also 1, so this CPUID bit
will be passed through to the host. The requested_features value is 3 (011),
but finally we get 1 (001), which is not what we want (see the worked example
below).

Am I understanding it correctly?
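
As a worked example of the two problems above (hypothetical 3-bit field;
host supports 101b, user requests 011b):

#include <assert.h>
#include <stdint.h>

int main(void)
{
    uint64_t host_feat = 0x5;  /* 101b: the only value KVM supports */
    uint64_t requested = 0x3;  /* 011b: what the user asked for */

    /* Problem 1: bit-wise filtering warns on bit 1 in isolation,
     * which is meaningless for a multi-bit field. */
    uint64_t unavailable = requested & ~host_feat;
    assert(unavailable == 0x2);  /* 010b */

    /* Problem 2: the surviving bit "leaks" through, producing a value
     * that is neither what the user requested nor what KVM supports. */
    uint64_t leaked = requested & ~unavailable;
    assert(leaked == 0x1);  /* 001b */

    return 0;
}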

> 
> And then try to develop MultiBitFeatureInfo & co as a separate
> series to demonstrate how much it will improve current
> cpu models definitions.
> 
> PS:
>  'make check-acceptance' are broken with this
> 
>> ---
>>
>> Changelog:
>>
>> v3:
>>  - Rebase on the latest QEMU (d1852caab131ea898134fdcea8c14bc2ee75fbe9).
>>  - v2: 
>> https://lore.kernel.org/qemu-devel/20221102085256.81139-1-lei4.w...@intel.com/
>>
>> v2:
>>  - Fix when passing all zeros of AMX-related CPUID, QEMU will warn
>>unsupported.
>>  - Remove unnecessary function definition and make code cleaner.
>>  - Fix some typos.
>>  - v1: 
>> https://lore.kernel.org/qemu-devel/20221027020036.373140-1-lei4.w...@intel.com/T/#t
>>
>>
>> Lei Wang (6):
>>   i386: Introduce FeatureWordInfo for AMX CPUID leaf 0x1D and 0x1E
>>   i386: Remove unused parameter "uint32_t bit" in
>> feature_word_description()
>>   i386: Introduce new struct "MultiBitFeatureInfo" for multi-bit
>> features
>>   i386: Mask and report unavailable multi-bit feature values
>>   i386: Initialize AMX CPUID leaves with corresponding env->features[]
>> leaves
>>   i386: Add new CPU model SapphireRapids
>>
>>  target/i386/cpu-internal.h |  11 ++
>>  target/i386/cpu.c  | 311 +++--
>>  target/i386/cpu.h  |  16 ++
>>  3 files changed, 322 insertions(+), 16 deletions(-)
>>
>>
>> base-commit: d1852caab131ea898134fdcea8c14bc2ee75fbe9
> 



Re: [PATCH v3 6/6] i386: Add new CPU model SapphireRapids

2023-02-02 Thread Wang, Lei
On 2/2/2023 6:40 PM, Igor Mammedov wrote:
> On Fri,  6 Jan 2023 00:38:26 -0800
> Lei Wang  wrote:
> 
>> The new CPU model mostly inherits features from Icelake-Server, while
>> adding new features:
>>  - AMX (Advance Matrix eXtensions)
>>  - Bus Lock Debug Exception
>> and new instructions:
>>  - AVX VNNI (Vector Neural Network Instruction):
>> - VPDPBUS: Multiply and Add Unsigned and Signed Bytes
>> - VPDPBUSDS: Multiply and Add Unsigned and Signed Bytes with Saturation
>> - VPDPWSSD: Multiply and Add Signed Word Integers
>> - VPDPWSSDS: Multiply and Add Signed Integers with Saturation
>>  - FP16: Replicates existing AVX512 computational SP (FP32) instructions
>>using FP16 instead of FP32 for ~2X performance gain
>>  - SERIALIZE: Provide software with a simple way to force the processor to
>>complete all modifications, faster, allowed in all privilege levels and
>>not causing an unconditional VM exit
>>  - TSX Suspend Load Address Tracking: Allows programmers to choose which
>>memory accesses do not need to be tracked in the TSX read set
>>  - AVX512_BF16: Vector Neural Network Instructions supporting BFLOAT16
>>inputs and conversion instructions from IEEE single precision
> 
> you should mention all new features here, preferably in format:
>  feature-flag:  expected CPUID feature bits , so reviewer would be able to 
> find it in spec

Thanks, will mention the new features introduced by the new CPU model.

> also you mention that it inherits most of the features from Icelake cpu,
> it would be nice to provide cpuid diff between real Icelake and new cpu
> somewhere in cover letter to simplify comparison.

OK, will add the diff between the command line output here.

>>
>> Features may be added in future versions:
>>  - CET (virtualization support hasn't been merged)
>> Instructions may be added in future versions:
>>  - fast zero-length MOVSB (KVM doesn't support yet)
>>  - fast short STOSB (KVM doesn't support yet)
>>  - fast short CMPSB, SCASB (KVM doesn't support yet)
>>
>> Signed-off-by: Lei Wang 
>> Reviewed-by: Robert Hoo 
>> ---
>>  target/i386/cpu.c | 135 ++
>>  target/i386/cpu.h |   4 ++
>>  2 files changed, 139 insertions(+)
>>
>> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
>> index 946df29a3d..5e1ecd50b3 100644
>> --- a/target/i386/cpu.c
>> +++ b/target/i386/cpu.c
>> @@ -3576,6 +3576,141 @@ static const X86CPUDefinition builtin_x86_defs[] = {
>>  { /* end of list */ }
>>  }
>>  },
>> +{
>> +.name = "SapphireRapids",
>> +.level = 0x20,
>> +.vendor = CPUID_VENDOR_INTEL,
>> +.family = 6,
>> +.model = 143,
>> +.stepping = 4,
>> +/*
>> + * please keep the ascending order so that we can have a clear view 
>> of
>> + * bit position of each feature.
>> + */
> 
> unless you have a way to enforce this comment. I wouldn't expect that would
> work in practice.
> 
> Also If you wish to bring more order here, you should start with a 
> prerequisite
> patch that would do the same to all existing CPU models.
> That way one can easily see a difference between Icelake and new cpu model.
> 
> As it is in this patch (with all unnecessary reordering) it's hard for 
> reviewer
> to see differences.
> I'd suggest to just copy Icelake definition and modify it to suit new cpu 
> model
> (without reordering all feature bits)  

Thanks for the suggestion, will remove the comment and just copy the Icelake
definition and modify it to suit the new CPU model (without reordering all
feature bits).

>> +.features[FEAT_1_EDX] =
>> +CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC |
>> +CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC |
>> +CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV |
>> +CPUID_PAT | CPUID_PSE36 | CPUID_CLFLUSH | CPUID_MMX | 
>> CPUID_FXSR |
>> +CPUID_SSE | CPUID_SSE2,
>> +.features[FEAT_1_ECX] =
>> +CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSSE3 |
>> +CPUID_EXT_FMA | CPUID_EXT_CX16 | CPUID_EXT_PCID | 
>> CPUID_EXT_SSE41 |
>> +CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE |
>> +CPUID_EXT_POPCNT | CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_AES 
>> |
>> +CPUID_EXT_XSAVE | CPUID_EXT_AVX | CPUID_EXT_F16C | 
>> CPUID_EXT_RDRAND,
>> +.features[FEAT_8000_0001_EDX] =
>> +CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB |
>> +CPUID_EXT2_RDTSCP | CPUID_EXT2_LM,
>> +.features[FEAT_8000_0001_ECX] =
>> +CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH,
>> +.features[FEAT_8000_0008_EBX] =
>> +CPUID_8000_0008_EBX_WBNOINVD,
>> +.features[FEAT_7_0_EBX] =
>> +CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_HLE 
>> |
>> +

Re: [PATCH v2 0/6] Support for new CPU model SapphireRapids

2022-12-18 Thread Wang, Lei
Kindly ping for any comments:)

BR,
Lei

On 11/2/2022 4:52 PM, Wang, Lei wrote:
> This series aims to add a new CPU model SapphireRapids, and tries to
> address the problem stated in
> https://lore.kernel.org/all/20220812055751.14553-1-lei4.w...@intel.com/T/#mcf67dbd1ad37c65d7988c36a2b267be9afd2fb30,
> so that named CPU model can define its own AMX values, and QEMU won't
> pass the wrong AMX values to KVM in future platforms if they have
> different values supported.
> 
> The original patch is
> https://lore.kernel.org/all/20220812055751.14553-1-lei4.w...@intel.com/T/#u.
> 
> ---
> 
> Changelog:
> 
> v2:
>  - Fix when passing all zeros of AMX-related CPUID, QEMU will warn
>unsupported.
>  - Remove unnecessary function definition and make code cleaner.
>  - Fix some typos.
>  - v1:
>
> https://lore.kernel.org/qemu-devel/20221027020036.373140-1-lei4.w...@intel.com/T/#t
> 
> Wang, Lei (6):
>   i386: Introduce FeatureWordInfo for AMX CPUID leaf 0x1D and 0x1E
>   i386: Remove unused parameter "uint32_t bit" in
> feature_word_description()
>   i386: Introduce new struct "MultiBitFeatureInfo" for multi-bit
> features
>   i386: Mask and report unavailable multi-bit feature values
>   i386: Initialize AMX CPUID leaves with corresponding env->features[]
> leaves
>   i386: Add new CPU model SapphireRapids
> 
>  target/i386/cpu-internal.h |  11 ++
>  target/i386/cpu.c  | 311 +++--
>  target/i386/cpu.h  |  16 ++
>  3 files changed, 322 insertions(+), 16 deletions(-)
> 



[PATCH v2 1/6] i386: Introduce FeatureWordInfo for AMX CPUID leaf 0x1D and 0x1E

2022-11-02 Thread Wang, Lei
CPUID leaves 0x1D and 0x1E enumerate tile and TMUL information for AMX.

Introduce FeatureWords FEAT_1D_1_EAX, FEAT_1D_1_EBX, FEAT_1D_1_ECX and
FEAT_1E_0_EBX. Thus these AMX features can be expanded with
"-cpu host/max" and can be configured in a named CPU model.

Signed-off-by: Wang, Lei 
---
 target/i386/cpu.c | 55 +++
 target/i386/cpu.h | 12 +++
 2 files changed, 67 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 8a11470507..e98780773c 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1002,6 +1002,45 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 },
 .tcg_features = ~0U,
 },
+[FEAT_1D_1_EAX] = {
+.type = CPUID_FEATURE_WORD,
+.cpuid = {
+.eax = 0x1D,
+.needs_ecx = true, .ecx = 1,
+.reg = R_EAX,
+},
+.migratable_flags = CPUID_AMX_PALETTE_1_TOTAL_TILE_BYTES_MASK |
+CPUID_AMX_PALETTE_1_BYTES_PER_TILE_MASK,
+},
+[FEAT_1D_1_EBX] = {
+.type = CPUID_FEATURE_WORD,
+.cpuid = {
+.eax = 0x1D,
+.needs_ecx = true, .ecx = 1,
+.reg = R_EBX,
+},
+.migratable_flags = CPUID_AMX_PALETTE_1_BYTES_PER_ROW_MASK |
+CPUID_AMX_PALETTE_1_MAX_NAMES_MASK,
+},
+[FEAT_1D_1_ECX] = {
+.type = CPUID_FEATURE_WORD,
+.cpuid = {
+.eax = 0x1D,
+.needs_ecx = true, .ecx = 1,
+.reg = R_ECX,
+},
+.migratable_flags = CPUID_AMX_PALETTE_1_MAX_ROWS_MASK,
+},
+[FEAT_1E_0_EBX] = {
+.type = CPUID_FEATURE_WORD,
+.cpuid = {
+.eax = 0x1E,
+.needs_ecx = true, .ecx = 0,
+.reg = R_EBX,
+},
+.migratable_flags = CPUID_AMX_TMUL_MAX_K_MASK |
+CPUID_AMX_TMUL_MAX_N_MASK,
+},
 /*Below are MSR exposed features*/
 [FEAT_ARCH_CAPABILITIES] = {
 .type = MSR_FEATURE_WORD,
@@ -1371,6 +1410,22 @@ static FeatureDep feature_dependencies[] = {
 .from = { FEAT_7_0_EBX, CPUID_7_0_EBX_INTEL_PT },
 .to = { FEAT_14_0_ECX,  ~0ull },
 },
+{
+.from = { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE },
+.to = { FEAT_1D_1_EAX,  ~0ull },
+},
+{
+.from = { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE },
+.to = { FEAT_1D_1_EBX,  ~0ull },
+},
+{
+.from = { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE },
+.to = { FEAT_1D_1_ECX,  ~0ull },
+},
+{
+.from = { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE },
+.to = { FEAT_1E_0_EBX,  ~0ull },
+},
 {
 .from = { FEAT_8000_0001_EDX,   CPUID_EXT2_RDTSCP },
 .to = { FEAT_VMX_SECONDARY_CTLS,VMX_SECONDARY_EXEC_RDTSCP },
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 7edf5dfac3..1c90fb6c9d 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -583,6 +583,14 @@ typedef enum X86Seg {
  XSTATE_Hi16_ZMM_MASK | XSTATE_PKRU_MASK | \
  XSTATE_XTILE_CFG_MASK | 
XSTATE_XTILE_DATA_MASK)
 
+#define CPUID_AMX_PALETTE_1_TOTAL_TILE_BYTES_MASK 0xffffU
+#define CPUID_AMX_PALETTE_1_BYTES_PER_TILE_MASK   (0xffffU << 16)
+#define CPUID_AMX_PALETTE_1_BYTES_PER_ROW_MASK    0xffffU
+#define CPUID_AMX_PALETTE_1_MAX_NAMES_MASK        (0xffffU << 16)
+#define CPUID_AMX_PALETTE_1_MAX_ROWS_MASK 0xffffU
+#define CPUID_AMX_TMUL_MAX_K_MASK 0xffU
+#define CPUID_AMX_TMUL_MAX_N_MASK (0xffffU << 8)
+
 /* CPUID feature words */
 typedef enum FeatureWord {
 FEAT_1_EDX, /* CPUID[1].EDX */
@@ -603,6 +611,10 @@ typedef enum FeatureWord {
 FEAT_6_EAX, /* CPUID[6].EAX */
 FEAT_XSAVE_XCR0_LO, /* CPUID[EAX=0xd,ECX=0].EAX */
 FEAT_XSAVE_XCR0_HI, /* CPUID[EAX=0xd,ECX=0].EDX */
+FEAT_1D_1_EAX,  /* CPUID[EAX=0x1d,ECX=1].EAX */
+FEAT_1D_1_EBX,  /* CPUID[EAX=0x1d,ECX=1].EBX */
+FEAT_1D_1_ECX,  /* CPUID[EAX=0x1d,ECX=1].ECX */
+FEAT_1E_0_EBX,  /* CPUID[EAX=0x1e,ECX=0].EBX */
 FEAT_ARCH_CAPABILITIES,
 FEAT_CORE_CAPABILITY,
 FEAT_PERF_CAPABILITIES,
-- 
2.34.1




[PATCH v2 5/6] i386: Initialize AMX CPUID leaves with corresponding env->features[] leaves

2022-11-02 Thread Wang, Lei
The AMX-related CPUID values, i.e., CPUID(0x1D,1):EAX, CPUID(0x1D,1):EBX,
CPUID(0x1D,1):ECX and CPUID(0x1E,0):EBX, are hard-coded to the Sapphire
Rapids values without considering future platforms.

Replace these hard-coded values with env->features[], so QEMU can pass the
right values to KVM.
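
For reference, after this change the same register contents are produced as
long as the feature words carry the Sapphire Rapids defaults; a sketch of the
equivalence, using the INTEL_SPR_* constants renamed below:

    /* Sketch: the SPR defaults expressed as feature-word values; after
     * this patch these come from env->features[] instead of being
     * hard-coded in cpu_x86_cpuid(). */
    env->features[FEAT_1D_1_EAX] = INTEL_SPR_AMX_TOTAL_TILE_BYTES |
                                   (INTEL_SPR_AMX_BYTES_PER_TILE << 16);
    env->features[FEAT_1D_1_EBX] = INTEL_SPR_AMX_BYTES_PER_ROW |
                                   (INTEL_SPR_AMX_TILE_MAX_NAMES << 16);
    env->features[FEAT_1D_1_ECX] = INTEL_SPR_AMX_TILE_MAX_ROWS;
    env->features[FEAT_1E_0_EBX] = INTEL_SPR_AMX_TMUL_MAX_K |
                                   (INTEL_SPR_AMX_TMUL_MAX_N << 8);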

Signed-off-by: Wang, Lei 
---
 target/i386/cpu.c | 27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 53223857ba..fce5a04be7 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -576,16 +576,16 @@ static CPUCacheInfo legacy_l3_cache = {
 #define INTEL_PT_PSB_BITMAP  (0x003f << 16) /* Support 
2K,4K,8K,16K,32K,64K */
 
 /* CPUID Leaf 0x1D constants: */
-#define INTEL_AMX_TILE_MAX_SUBLEAF 0x1
-#define INTEL_AMX_TOTAL_TILE_BYTES 0x2000
-#define INTEL_AMX_BYTES_PER_TILE   0x400
-#define INTEL_AMX_BYTES_PER_ROW0x40
-#define INTEL_AMX_TILE_MAX_NAMES   0x8
-#define INTEL_AMX_TILE_MAX_ROWS0x10
+#define INTEL_SPR_AMX_TILE_MAX_SUBLEAF 0x1
+#define INTEL_SPR_AMX_TOTAL_TILE_BYTES 0x2000
+#define INTEL_SPR_AMX_BYTES_PER_TILE   0x400
+#define INTEL_SPR_AMX_BYTES_PER_ROW0x40
+#define INTEL_SPR_AMX_TILE_MAX_NAMES   0x8
+#define INTEL_SPR_AMX_TILE_MAX_ROWS0x10
 
 /* CPUID Leaf 0x1E constants: */
-#define INTEL_AMX_TMUL_MAX_K   0x10
-#define INTEL_AMX_TMUL_MAX_N   0x40
+#define INTEL_SPR_AMX_TMUL_MAX_K   0x10
+#define INTEL_SPR_AMX_TMUL_MAX_N   0x40
 
 void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
   uint32_t vendor2, uint32_t vendor3)
@@ -5765,12 +5765,11 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 
 if (count == 0) {
 /* Highest numbered palette subleaf */
-*eax = INTEL_AMX_TILE_MAX_SUBLEAF;
+*eax = INTEL_SPR_AMX_TILE_MAX_SUBLEAF;
 } else if (count == 1) {
-*eax = INTEL_AMX_TOTAL_TILE_BYTES |
-   (INTEL_AMX_BYTES_PER_TILE << 16);
-*ebx = INTEL_AMX_BYTES_PER_ROW | (INTEL_AMX_TILE_MAX_NAMES << 16);
-*ecx = INTEL_AMX_TILE_MAX_ROWS;
+*eax = env->features[FEAT_1D_1_EAX];
+*ebx = env->features[FEAT_1D_1_EBX];
+*ecx = env->features[FEAT_1D_1_ECX];
 }
 break;
 }
@@ -5786,7 +5785,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 
 if (count == 0) {
 /* Highest numbered palette subleaf */
-*ebx = INTEL_AMX_TMUL_MAX_K | (INTEL_AMX_TMUL_MAX_N << 8);
+*ebx = env->features[FEAT_1E_0_EBX];
 }
 break;
 }
-- 
2.34.1




[PATCH v2 0/6] Support for new CPU model SapphireRapids

2022-11-02 Thread Wang, Lei
This series aims to add a new CPU model SapphireRapids, and tries to
address the problem stated in
https://lore.kernel.org/all/20220812055751.14553-1-lei4.w...@intel.com/T/#mcf67dbd1ad37c65d7988c36a2b267be9afd2fb30,
so that a named CPU model can define its own AMX values, and QEMU won't
pass wrong AMX values to KVM on future platforms if they support
different values.

The original patch is
https://lore.kernel.org/all/20220812055751.14553-1-lei4.w...@intel.com/T/#u.

---

Changelog:

v2:
 - Fix when passing all zeros of AMX-related CPUID, QEMU will warn
   unsupported.
 - Remove unnecessary function definition and make code cleaner.
 - Fix some typos.
 - v1:
   
https://lore.kernel.org/qemu-devel/20221027020036.373140-1-lei4.w...@intel.com/T/#t

Wang, Lei (6):
  i386: Introduce FeatureWordInfo for AMX CPUID leaf 0x1D and 0x1E
  i386: Remove unused parameter "uint32_t bit" in
feature_word_description()
  i386: Introduce new struct "MultiBitFeatureInfo" for multi-bit
features
  i386: Mask and report unavailable multi-bit feature values
  i386: Initialize AMX CPUID leaves with corresponding env->features[]
leaves
  i386: Add new CPU model SapphireRapids

 target/i386/cpu-internal.h |  11 ++
 target/i386/cpu.c  | 311 +++--
 target/i386/cpu.h  |  16 ++
 3 files changed, 322 insertions(+), 16 deletions(-)

-- 
2.34.1




[PATCH v2 2/6] i386: Remove unused parameter "uint32_t bit" in feature_word_description()

2022-11-02 Thread Wang, Lei
Parameter "uint32_t bit" is not used in function feature_word_description(),
so remove it.

Signed-off-by: Wang, Lei 
---
 target/i386/cpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index e98780773c..0083a2a7f7 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -4290,7 +4290,7 @@ static const TypeInfo max_x86_cpu_type_info = {
 .class_init = max_x86_cpu_class_init,
 };
 
-static char *feature_word_description(FeatureWordInfo *f, uint32_t bit)
+static char *feature_word_description(FeatureWordInfo *f)
 {
 assert(f->type == CPUID_FEATURE_WORD || f->type == MSR_FEATURE_WORD);
 
@@ -4329,6 +4329,7 @@ static void mark_unavailable_features(X86CPU *cpu, 
FeatureWord w, uint64_t mask,
 CPUX86State *env = &cpu->env;
 FeatureWordInfo *f = &feature_word_info[w];
 int i;
+g_autofree char *feat_word_str = feature_word_description(f);
 
 if (!cpu->force_features) {
 env->features[w] &= ~mask;
@@ -4341,7 +4342,6 @@ static void mark_unavailable_features(X86CPU *cpu, 
FeatureWord w, uint64_t mask,
 
 for (i = 0; i < 64; ++i) {
 if ((1ULL << i) & mask) {
-g_autofree char *feat_word_str = feature_word_description(f, i);
 warn_report("%s: %s%s%s [bit %d]",
 verbose_prefix,
 feat_word_str,
-- 
2.34.1




[PATCH v2 4/6] i386: Mask and report unavailable multi-bit feature values

2022-11-02 Thread Wang, Lei
Some feature words, e.g., the feature words in AMX-related CPUID leaves 0x1D
and 0x1E, are not bit-wise: multiple bits together represent one value. Handle
the situation where the values specified are not the same as those reported
by KVM. The handling includes:

 - The responsibility of masking bits and giving warnings is delegated to
   the feature enabler. A framework is also provided to enable this.
 - To simplify the initialization, a default function is provided if no
   function is specified.

The reason for delegating this responsibility, rather than just masking the
bits to zero when they do not match, is that different multi-bit features may
have different logic, which must be handled case by case, for example:

 1. CPUID.0x14_0x1:EBX[15:0]. Even though it's a multi-bit field, it's a
    bitmap and each bit represents a separate capability.

 2. CPUID.0x14_0x1:EAX[2:0] represents the number of configurable Address
    Ranges. The 3 bits as a whole represent an integer value, i.e., the
    maximum capability of the hardware. If KVM reports M, then any value
    from 0 to M is legal to configure (because KVM can emulate each such
    value correctly).

 3. CPUID.0x1D_0x1:EAX[31:16] represents palette 1 bytes_per_tile. The 16
    bits as a whole represent an integer value. Unlike case 2, software
    needs to configure exactly the value reported, because software cannot
    configure a different value and have KVM emulate it.

So blindly masking them to zero is incorrect; delegating this responsibility
lets each multi-bit feature mask its bits in its own way.
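
As an illustration of what a per-feature handler could do (a sketch only, not
part of this patch: the clamping policy for a case-2 style range-valued field
is hypothetical, while the callback signature and helpers match this series):

    static void mark_unavailable_range_value(X86CPU *cpu, FeatureWord w,
                                             int index,
                                             const char *verbose_prefix)
    {
        FeatureWordInfo *f = &feature_word_info[w];
        MultiBitFeatureInfo mf = f->multi_bit_features[index];
        uint64_t host = x86_cpu_get_supported_feature_word(w, false);
        uint64_t max = (host & mf.mask) >> mf.low_bit_position;
        uint64_t req = (cpu->env.features[w] & mf.mask) >> mf.low_bit_position;

        if (req > max) {
            /* Requested value exceeds what KVM can emulate: clamp it
             * instead of zeroing the whole field. */
            cpu->env.features[w] &= ~mf.mask;
            cpu->env.features[w] |= max << mf.low_bit_position;
            cpu->filtered_features[w] |= mf.mask;
            if (verbose_prefix) {
                warn_report("%s: %s [%u:%u] clamped to %" PRIu64,
                            verbose_prefix, mf.feat_name,
                            mf.high_bit_position, mf.low_bit_position, max);
            }
        }
    }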

Signed-off-by: Wang, Lei 
---
 target/i386/cpu-internal.h |  2 ++
 target/i386/cpu.c  | 36 
 2 files changed, 38 insertions(+)

diff --git a/target/i386/cpu-internal.h b/target/i386/cpu-internal.h
index 66b3d66cb4..83c7b53926 100644
--- a/target/i386/cpu-internal.h
+++ b/target/i386/cpu-internal.h
@@ -30,6 +30,8 @@ typedef struct MultiBitFeatureInfo {
 uint64_t mask;
 unsigned high_bit_position;
 unsigned low_bit_position;
+void (*mark_unavailable_multi_bit)(X86CPU *cpu, FeatureWord w, int index,
+   const char *verbose_prefix);
 } MultiBitFeatureInfo;
 
 typedef struct FeatureWordInfo {
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 7ae232ab18..53223857ba 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -4377,6 +4377,28 @@ static bool x86_cpu_have_filtered_features(X86CPU *cpu)
 return false;
 }
 
+static void mark_unavailable_multi_bit_default(X86CPU *cpu, FeatureWord w,
+   int index,
+   const char *verbose_prefix)
+{
+FeatureWordInfo *f = &feature_word_info[w];
+g_autofree char *feat_word_str = feature_word_description(f);
+uint64_t host_feat = x86_cpu_get_supported_feature_word(w, false);
+MultiBitFeatureInfo mf = f->multi_bit_features[index];
+
+if ((cpu->env.features[w] & mf.mask) &&
+((cpu->env.features[w] ^ host_feat) & mf.mask)) {
+if (!cpu->force_features) {
+cpu->env.features[w] &= ~mf.mask;
+}
+cpu->filtered_features[w] |= mf.mask;
+if (verbose_prefix)
+warn_report("%s: %s.%s [%u:%u]", verbose_prefix, feat_word_str,
+mf.feat_name, mf.high_bit_position,
+mf.low_bit_position);
+}
+}
+
 static void mark_unavailable_features(X86CPU *cpu, FeatureWord w, uint64_t 
mask,
   const char *verbose_prefix)
 {
@@ -6428,6 +6450,20 @@ static void x86_cpu_filter_features(X86CPU *cpu, bool 
verbose)
 x86_cpu_get_supported_feature_word(w, false);
 uint64_t requested_features = env->features[w];
 uint64_t unavailable_features = requested_features & ~host_feat;
+FeatureWordInfo f = feature_word_info[w];
+int i;
+
+for (i = 0; i < f.num_multi_bit_features; i++) {
+MultiBitFeatureInfo mf = f.multi_bit_features[i];
+if (mf.mark_unavailable_multi_bit) {
+mf.mark_unavailable_multi_bit(cpu, w, i, prefix);
+} else {
+mark_unavailable_multi_bit_default(cpu, w, i, prefix);
+}
+
+unavailable_features &= ~mf.mask;
+}
+
 mark_unavailable_features(cpu, w, unavailable_features, prefix);
 }
 
-- 
2.34.1




[PATCH v2 3/6] i386: Introduce new struct "MultiBitFeatureInfo" for multi-bit features

2022-11-02 Thread Wang, Lei
Some features use multiple CPUID bits to form a value, e.g.,
CPUID(0x1E,0):EBX[23:8] is regarded as the tmul_maxn value for AMX.
Introduce a new struct "MultiBitFeatureInfo" to hold the information for
those features and create a corresponding member in struct FeatureWordInfo,
so that the information can be assigned for each item in the
feature_word_info array and used later.
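
For example, with the masks and bit positions recorded this way, a consumer
can decode one of these values with a plain mask-and-shift. A minimal sketch
(assuming a CPUX86State *env in scope; the names are the ones added in this
series):

    /* Decode tmul_maxn, i.e. CPUID(0x1E,0):EBX[23:8], from its
     * feature word. */
    uint64_t ebx = env->features[FEAT_1E_0_EBX];
    unsigned tmul_maxn = (ebx & CPUID_AMX_TMUL_MAX_N_MASK) >> 8;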

Signed-off-by: Wang, Lei 
---
 target/i386/cpu-internal.h |  9 +++
 target/i386/cpu.c  | 54 ++
 2 files changed, 63 insertions(+)

diff --git a/target/i386/cpu-internal.h b/target/i386/cpu-internal.h
index 9baac5c0b4..66b3d66cb4 100644
--- a/target/i386/cpu-internal.h
+++ b/target/i386/cpu-internal.h
@@ -25,6 +25,13 @@ typedef enum FeatureWordType {
MSR_FEATURE_WORD,
 } FeatureWordType;
 
+typedef struct MultiBitFeatureInfo {
+const char *feat_name;
+uint64_t mask;
+unsigned high_bit_position;
+unsigned low_bit_position;
+} MultiBitFeatureInfo;
+
 typedef struct FeatureWordInfo {
 FeatureWordType type;
 /* feature flags names are taken from "Intel Processor Identification and
@@ -51,6 +58,8 @@ typedef struct FeatureWordInfo {
 uint64_t migratable_flags; /* Feature flags known to be migratable */
 /* Features that shouldn't be auto-enabled by "-cpu host" */
 uint64_t no_autoenable_flags;
+unsigned num_multi_bit_features;
+MultiBitFeatureInfo *multi_bit_features;
 } FeatureWordInfo;
 
 extern FeatureWordInfo feature_word_info[];
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 0083a2a7f7..7ae232ab18 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1011,6 +1011,21 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 },
 .migratable_flags = CPUID_AMX_PALETTE_1_TOTAL_TILE_BYTES_MASK |
 CPUID_AMX_PALETTE_1_BYTES_PER_TILE_MASK,
+.num_multi_bit_features = 2,
+.multi_bit_features = (MultiBitFeatureInfo[]){
+{
+.feat_name = "total_tile_bytes",
+.mask = CPUID_AMX_PALETTE_1_TOTAL_TILE_BYTES_MASK,
+.high_bit_position = 15,
+.low_bit_position = 0,
+},
+{
+.feat_name = "bytes_per_tile",
+.mask = CPUID_AMX_PALETTE_1_BYTES_PER_TILE_MASK,
+.high_bit_position = 31,
+.low_bit_position = 16,
+},
+},
 },
 [FEAT_1D_1_EBX] = {
 .type = CPUID_FEATURE_WORD,
@@ -1021,6 +1036,21 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 },
 .migratable_flags = CPUID_AMX_PALETTE_1_BYTES_PER_ROW_MASK |
 CPUID_AMX_PALETTE_1_MAX_NAMES_MASK,
+.num_multi_bit_features = 2,
+.multi_bit_features = (MultiBitFeatureInfo[]){
+{
+.feat_name = "bytes_per_row",
+.mask = CPUID_AMX_PALETTE_1_BYTES_PER_ROW_MASK,
+.high_bit_position = 15,
+.low_bit_position = 0,
+},
+{
+.feat_name = "max_names",
+.mask = CPUID_AMX_PALETTE_1_MAX_NAMES_MASK,
+.high_bit_position = 31,
+.low_bit_position = 16,
+},
+},
 },
 [FEAT_1D_1_ECX] = {
 .type = CPUID_FEATURE_WORD,
@@ -1030,6 +1060,15 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 .reg = R_ECX,
 },
 .migratable_flags = CPUID_AMX_PALETTE_1_MAX_ROWS_MASK,
+.num_multi_bit_features = 1,
+.multi_bit_features = (MultiBitFeatureInfo[]){
+{
+.feat_name = "max_rows",
+.mask = CPUID_AMX_PALETTE_1_MAX_ROWS_MASK,
+.high_bit_position = 15,
+.low_bit_position = 0,
+},
+},
 },
 [FEAT_1E_0_EBX] = {
 .type = CPUID_FEATURE_WORD,
@@ -1040,6 +1079,21 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 },
 .migratable_flags = CPUID_AMX_TMUL_MAX_K_MASK |
 CPUID_AMX_TMUL_MAX_N_MASK,
+.num_multi_bit_features = 2,
+.multi_bit_features = (MultiBitFeatureInfo[]){
+{
+.feat_name = "tmul_maxk",
+.mask = CPUID_AMX_TMUL_MAX_K_MASK,
+.high_bit_position = 7,
+.low_bit_position = 0,
+},
+{
+.feat_name = "tmul_maxn",
+.mask = CPUID_AMX_TMUL_MAX_N_MASK,
+.high_bit_position = 23,
+.low_bit_position = 8,
+},
+},
 },
 /*Below are MSR exposed features*/
 [FEAT_ARCH_CAPABILITIES] = {
-- 
2.34.1




[PATCH v2 6/6] i386: Add new CPU model SapphireRapids

2022-11-02 Thread Wang, Lei
The new CPU model mostly inherits features from Icelake-Server, while
adding new features:
 - AMX (Advanced Matrix Extensions)
 - Bus Lock Debug Exception
and new instructions:
 - AVX VNNI (Vector Neural Network Instruction):
- VPDPBUSD: Multiply and Add Unsigned and Signed Bytes
- VPDPBUSDS: Multiply and Add Unsigned and Signed Bytes with Saturation
- VPDPWSSD: Multiply and Add Signed Word Integers
- VPDPWSSDS: Multiply and Add Signed Integers with Saturation
 - FP16: Replicates existing AVX512 computational SP (FP32) instructions
   using FP16 instead of FP32 for ~2X performance gain
 - SERIALIZE: Provide software with a simple way to force the processor to
   complete all modifications, faster, allowed in all privilege levels and
   not causing an unconditional VM exit
 - TSX Suspend Load Address Tracking: Allows programmers to choose which
   memory accesses do not need to be tracked in the TSX read set
 - AVX512_BF16: Vector Neural Network Instructions supporting BFLOAT16
   inputs and conversion instructions from IEEE single precision

Features may be added in future versions:
 - CET (virtualization support hasn't been merged)
Instructions may be added in future versions:
 - fast zero-length MOVSB (KVM doesn't support yet)
 - fast short STOSB (KVM doesn't support yet)
 - fast short CMPSB, SCASB (KVM doesn't support yet)
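
As a usage sketch (illustrative command line only, not part of this patch),
the model is then selectable like any other named model, e.g.:

    qemu-system-x86_64 -accel kvm -cpu SapphireRapids -smp 8 -m 4G guest.img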

Signed-off-by: Wang, Lei 
Reviewed-by: Robert Hoo 
---
 target/i386/cpu.c | 135 ++
 target/i386/cpu.h |   4 ++
 2 files changed, 139 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index fce5a04be7..b4513742bd 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -3576,6 +3576,141 @@ static const X86CPUDefinition builtin_x86_defs[] = {
 { /* end of list */ }
 }
 },
+{
+.name = "SapphireRapids",
+.level = 0x20,
+.vendor = CPUID_VENDOR_INTEL,
+.family = 6,
+.model = 143,
+.stepping = 4,
+/*
+ * please keep the ascending order so that we can have a clear view of
+ * bit position of each feature.
+ */
+.features[FEAT_1_EDX] =
+CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC |
+CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC |
+CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV |
+CPUID_PAT | CPUID_PSE36 | CPUID_CLFLUSH | CPUID_MMX | CPUID_FXSR |
+CPUID_SSE | CPUID_SSE2,
+.features[FEAT_1_ECX] =
+CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSSE3 |
+CPUID_EXT_FMA | CPUID_EXT_CX16 | CPUID_EXT_PCID | CPUID_EXT_SSE41 |
+CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE |
+CPUID_EXT_POPCNT | CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_AES |
+CPUID_EXT_XSAVE | CPUID_EXT_AVX | CPUID_EXT_F16C | 
CPUID_EXT_RDRAND,
+.features[FEAT_8000_0001_EDX] =
+CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB |
+CPUID_EXT2_RDTSCP | CPUID_EXT2_LM,
+.features[FEAT_8000_0001_ECX] =
+CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH,
+.features[FEAT_8000_0008_EBX] =
+CPUID_8000_0008_EBX_WBNOINVD,
+.features[FEAT_7_0_EBX] =
+CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_HLE |
+CPUID_7_0_EBX_AVX2 | CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 |
+CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_INVPCID | CPUID_7_0_EBX_RTM |
+CPUID_7_0_EBX_AVX512F | CPUID_7_0_EBX_AVX512DQ |
+CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP |
+CPUID_7_0_EBX_AVX512IFMA | CPUID_7_0_EBX_CLFLUSHOPT |
+CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_AVX512CD | CPUID_7_0_EBX_SHA_NI 
|
+CPUID_7_0_EBX_AVX512BW | CPUID_7_0_EBX_AVX512VL,
+.features[FEAT_7_0_ECX] =
+CPUID_7_0_ECX_AVX512_VBMI | CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU 
|
+CPUID_7_0_ECX_AVX512_VBMI2 | CPUID_7_0_ECX_GFNI |
+CPUID_7_0_ECX_VAES | CPUID_7_0_ECX_VPCLMULQDQ |
+CPUID_7_0_ECX_AVX512VNNI | CPUID_7_0_ECX_AVX512BITALG |
+CPUID_7_0_ECX_AVX512_VPOPCNTDQ | CPUID_7_0_ECX_LA57 |
+CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_BUS_LOCK_DETECT,
+.features[FEAT_7_0_EDX] =
+CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_SERIALIZE |
+CPUID_7_0_EDX_TSX_LDTRK | CPUID_7_0_EDX_AMX_BF16 |
+CPUID_7_0_EDX_AVX512_FP16 | CPUID_7_0_EDX_AMX_TILE |
+CPUID_7_0_EDX_AMX_INT8 | CPUID_7_0_EDX_SPEC_CTRL |
+CPUID_7_0_EDX_ARCH_CAPABILITIES | CPUID_7_0_EDX_SPEC_CTRL_SSBD,
+.features[FEAT_ARCH_CAPABILITIES] =
+MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL |
+MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO |
+MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_TAA_NO,
+.features[

Re: [PATCH] i386/cpu: Adjust cpuid addresable id count to match the spec

2022-10-27 Thread Wang, Lei


On 10/10/2022 10:49 AM, Ilya Oleinik wrote:
> For EBX bits 23 - 16 in CPUID[01] Intel's manual states:
> "
> *   The nearest power-of-2 integer that is not smaller than EBX[23:16]
> is the number of unique initial APICIDs reserved for addressing
> different logical processors in a physical package. This field is
> only valid if CPUID.1.EDX.HTT[bit 28]= 1.
> "
> Ensure this condition is met.
> 
> Additionally, apply efb3934adf9ee7794db7e0ade9f576c634592891 to
> non passthrough cache mode.
> 
> Signed-off-by: Ilya Oleinik 
> ---
>  target/i386/cpu.c | 10 +-
>  1 file changed, 5 insertions(+), 5 deletions(-)
> 
> diff --git a/target/i386/cpu.c b/target/i386/cpu.c
> index ad623d91e4..e793bcc03f 100644
> --- a/target/i386/cpu.c
> +++ b/target/i386/cpu.c
> @@ -245,8 +245,8 @@ static void encode_cache_cpuid4(CPUCacheInfo *cache,
>  *eax = CACHE_TYPE(cache->type) |
> CACHE_LEVEL(cache->level) |
> (cache->self_init ? CACHE_SELF_INIT_LEVEL : 0) |
> -   ((num_cores - 1) << 26) |
> -   ((num_apic_ids - 1) << 14);
> +   ((pow2ceil(num_cores) - 1) << 26) |
> +   ((pow2ceil(num_apic_ids) - 1) << 14);
>  
>  assert(cache->line_size > 0);
>  assert(cache->partitions > 0);
> @@ -5258,7 +5258,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
> uint32_t count,
>  }
>  *edx = env->features[FEAT_1_EDX];
>  if (cs->nr_cores * cs->nr_threads > 1) {
> -*ebx |= (cs->nr_cores * cs->nr_threads) << 16;
> +*ebx |= (pow2ceil(cs->nr_cores * cs->nr_threads)) << 16;
>  *edx |= CPUID_HT;
>  }
>  if (!cpu->enable_pmu) {
> @@ -5313,12 +5313,12 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
> uint32_t count,
>  switch (count) {
>  case 0: /* L1 dcache info */
>  encode_cache_cpuid4(env->cache_info_cpuid4.l1d_cache,
> -1, cs->nr_cores,

Hi Ilya,

Just curious why the original implementation hard-codes the thread count to 1,
and whether there is any reported bug related to this?
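
For reference, pow2ceil() gives the nearest power of two that is not smaller
than its argument, which is exactly what the spec quoted above requires. A
tiny sketch:

    /* pow2ceil() (from qemu/host-utils.h) rounds up to the next power
     * of 2: a 6-thread package must advertise 8 in EBX[23:16], not 6. */
    assert(pow2ceil(6) == 8);
    assert(pow2ceil(8) == 8);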

> +cs->nr_threads, cs->nr_cores,
>  eax, ebx, ecx, edx);
>  break;
>  case 1: /* L1 icache info */
>  encode_cache_cpuid4(env->cache_info_cpuid4.l1i_cache,
> -1, cs->nr_cores,
> +cs->nr_threads, cs->nr_cores,
>  eax, ebx, ecx, edx);
>  break;
>  case 2: /* L2 cache info */



[PATCH 2/6] i386: Remove unused parameter "uint32_t bit" in feature_word_description()

2022-10-26 Thread Wang, Lei
Parameter "uint32_t bit" is not used in function feature_word_description(),
so remove it.

Signed-off-by: Wang, Lei 
---
 target/i386/cpu.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index e98780773c..0083a2a7f7 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -4290,7 +4290,7 @@ static const TypeInfo max_x86_cpu_type_info = {
 .class_init = max_x86_cpu_class_init,
 };
 
-static char *feature_word_description(FeatureWordInfo *f, uint32_t bit)
+static char *feature_word_description(FeatureWordInfo *f)
 {
 assert(f->type == CPUID_FEATURE_WORD || f->type == MSR_FEATURE_WORD);
 
@@ -4329,6 +4329,7 @@ static void mark_unavailable_features(X86CPU *cpu, 
FeatureWord w, uint64_t mask,
 CPUX86State *env = &cpu->env;
 FeatureWordInfo *f = &feature_word_info[w];
 int i;
+g_autofree char *feat_word_str = feature_word_description(f);
 
 if (!cpu->force_features) {
 env->features[w] &= ~mask;
@@ -4341,7 +4342,6 @@ static void mark_unavailable_features(X86CPU *cpu, 
FeatureWord w, uint64_t mask,
 
 for (i = 0; i < 64; ++i) {
 if ((1ULL << i) & mask) {
-g_autofree char *feat_word_str = feature_word_description(f, i);
 warn_report("%s: %s%s%s [bit %d]",
 verbose_prefix,
 feat_word_str,
-- 
2.34.1




[PATCH 0/6] Support for new CPU model SapphireRapids

2022-10-26 Thread Wang, Lei
This series aims to add a new CPU model SapphireRapids, and tries to
address the problem stated in
https://lore.kernel.org/all/20220812055751.14553-1-lei4.w...@intel.com/T/#mcf67dbd1ad37c65d7988c36a2b267be9afd2fb30,
so that a named CPU model can define its own AMX values, and QEMU won't
pass wrong AMX values to KVM on future platforms if they support
different values.

The original patch is
https://lore.kernel.org/all/20220812055751.14553-1-lei4.w...@intel.com/T/#u.

Wang, Lei (6):
  i386: Introduce FeatureWordInfo for AMX CPUID leaf 0x1D and 0x1E
  i386: Remove unused parameter "uint32_t bit" in
feature_word_description()
  i386: Introduce new struct "MultiBitFeatureInfo" for multi-bit
features
  i386: Mask and report unavailable multi-bit feature values
  i386: Initialize AMX CPUID leaves with corresponding env->features[]
leaves
  i386: Add new CPU model SapphireRapids

 target/i386/cpu-internal.h |  11 ++
 target/i386/cpu.c  | 314 ++---
 target/i386/cpu.h  |  18 +++
 3 files changed, 323 insertions(+), 20 deletions(-)

-- 
2.34.1




[PATCH 4/6] i386: Mask and report unavailable multi-bit feature values

2022-10-26 Thread Wang, Lei
Some feature words, e.g., the feature words in AMX-related CPUID leaves 0x1D
and 0x1E, are not bit-wise: multiple bits together represent one value. Handle
the situation where the values specified are not the same as those reported
by KVM. The handling includes:

 - The responsibility of masking bits and giving warnings is delegated to
   the feature enabler. A framework is also provided to enable this.
 - To simplify the initialization, a default function is provided if no
   function is specified.

The reason for delegating this responsibility, rather than just masking the
bits to zero when they do not match, is that different multi-bit features may
have different logic, which must be handled case by case, for example:

 1. CPUID.0x14_0x1:EBX[15:0]. Even though it's a multi-bit field, it's a
    bitmap and each bit represents a separate capability.

 2. CPUID.0x14_0x1:EAX[2:0] represents the number of configurable Address
    Ranges. The 3 bits as a whole represent an integer value, i.e., the
    maximum capability of the hardware. If KVM reports M, then any value
    from 0 to M is legal to configure (because KVM can emulate each such
    value correctly).

 3. CPUID.0x1D_0x1:EAX[31:16] represents palette 1 bytes_per_tile. The 16
    bits as a whole represent an integer value. Unlike case 2, software
    needs to configure exactly the value reported, because software cannot
    configure a different value and have KVM emulate it.

So blindly masking them to zero is incorrect; delegating this responsibility
lets each multi-bit feature mask its bits in its own way.

Signed-off-by: Wang, Lei 
---
 target/i386/cpu-internal.h |  2 ++
 target/i386/cpu.c  | 39 ++
 target/i386/cpu.h  |  2 ++
 3 files changed, 39 insertions(+), 4 deletions(-)

diff --git a/target/i386/cpu-internal.h b/target/i386/cpu-internal.h
index 66b3d66cb4..f973046b4e 100644
--- a/target/i386/cpu-internal.h
+++ b/target/i386/cpu-internal.h
@@ -30,6 +30,8 @@ typedef struct MultiBitFeatureInfo {
 uint64_t mask;
 unsigned high_bit_position;
 unsigned low_bit_position;
+void (*mark_unavailable_bits)(X86CPU *cpu, FeatureWord w, int index,
+  const char *verbose_prefix);
 } MultiBitFeatureInfo;
 
 typedef struct FeatureWordInfo {
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 7ae232ab18..fc120c0694 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -4377,6 +4377,26 @@ static bool x86_cpu_have_filtered_features(X86CPU *cpu)
 return false;
 }
 
+void mark_unavailable_bits(X86CPU *cpu, FeatureWord w, int index,
+   const char *verbose_prefix)
+{
+FeatureWordInfo *f = &feature_word_info[w];
+g_autofree char *feat_word_str = feature_word_description(f);
+uint64_t host_feat = x86_cpu_get_supported_feature_word(w, false);
+MultiBitFeatureInfo mf = f->multi_bit_features[index];
+
+if ((cpu->env.features[w] ^ host_feat) & mf.mask) {
+if (!cpu->force_features) {
+cpu->env.features[w] &= ~mf.mask;
+}
+cpu->filtered_features[w] |= mf.mask;
+if (verbose_prefix)
+warn_report("%s: %s.%s [%u:%u]", verbose_prefix, feat_word_str,
+mf.feat_name, mf.high_bit_position,
+mf.low_bit_position);
+}
+}
+
 static void mark_unavailable_features(X86CPU *cpu, FeatureWord w, uint64_t 
mask,
   const char *verbose_prefix)
 {
@@ -6424,10 +6444,21 @@ static void x86_cpu_filter_features(X86CPU *cpu, bool 
verbose)
 }
 
 for (w = 0; w < FEATURE_WORDS; w++) {
-uint64_t host_feat =
-x86_cpu_get_supported_feature_word(w, false);
-uint64_t requested_features = env->features[w];
-uint64_t unavailable_features = requested_features & ~host_feat;
+uint64_t host_feat = x86_cpu_get_supported_feature_word(w, false);
+FeatureWordInfo f = feature_word_info[w];
+uint64_t unavailable_features = env->features[w] & ~host_feat;
+int i;
+
+for (i = 0; i < f.num_multi_bit_features; i++) {
+MultiBitFeatureInfo mf = f.multi_bit_features[i];
+if (!mf.mark_unavailable_bits) {
+mf.mark_unavailable_bits = mark_unavailable_bits;
+}
+mf.mark_unavailable_bits(cpu, w, i, prefix);
+
+unavailable_features &= ~mf.mask;
+}
+
 mark_unavailable_features(cpu, w, unavailable_features, prefix);
 }
 
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 1c90fb6c9d..824a2b0f85 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -2103,6 +2103,8 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 void cpu_clear_apic_feature(CPUX86State *env);
 void host_cpuid(uint32_t function, uint32_t count,
 uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx);
+v

[PATCH 5/6] i386: Initialize AMX CPUID leaves with corresponding env->features[] leaves

2022-10-26 Thread Wang, Lei
The AMX-related CPUID values, i.e., CPUID(0x1D,1):EAX, CPUID(0x1D,1):EBX,
CPUID(0x1D,1):ECX and CPUID(0x1E,0):EBX, are hard-coded to the Sapphire
Rapids values without considering future platforms.

Replace these hard-coded values with env->features[], so QEMU can pass the
right values to KVM.

Signed-off-by: Wang, Lei 
---
 target/i386/cpu.c | 27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index fc120c0694..21d9529d38 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -576,16 +576,16 @@ static CPUCacheInfo legacy_l3_cache = {
 #define INTEL_PT_PSB_BITMAP  (0x003f << 16) /* Support 
2K,4K,8K,16K,32K,64K */
 
 /* CPUID Leaf 0x1D constants: */
-#define INTEL_AMX_TILE_MAX_SUBLEAF 0x1
-#define INTEL_AMX_TOTAL_TILE_BYTES 0x2000
-#define INTEL_AMX_BYTES_PER_TILE   0x400
-#define INTEL_AMX_BYTES_PER_ROW0x40
-#define INTEL_AMX_TILE_MAX_NAMES   0x8
-#define INTEL_AMX_TILE_MAX_ROWS0x10
+#define INTEL_SPR_AMX_TILE_MAX_SUBLEAF 0x1
+#define INTEL_SPR_AMX_TOTAL_TILE_BYTES 0x2000
+#define INTEL_SPR_AMX_BYTES_PER_TILE   0x400
+#define INTEL_SPR_AMX_BYTES_PER_ROW0x40
+#define INTEL_SPR_AMX_TILE_MAX_NAMES   0x8
+#define INTEL_SPR_AMX_TILE_MAX_ROWS0x10
 
 /* CPUID Leaf 0x1E constants: */
-#define INTEL_AMX_TMUL_MAX_K   0x10
-#define INTEL_AMX_TMUL_MAX_N   0x40
+#define INTEL_SPR_AMX_TMUL_MAX_K   0x10
+#define INTEL_SPR_AMX_TMUL_MAX_N   0x40
 
 void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
   uint32_t vendor2, uint32_t vendor3)
@@ -5763,12 +5763,11 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 
 if (count == 0) {
 /* Highest numbered palette subleaf */
-*eax = INTEL_AMX_TILE_MAX_SUBLEAF;
+*eax = INTEL_SPR_AMX_TILE_MAX_SUBLEAF;
 } else if (count == 1) {
-*eax = INTEL_AMX_TOTAL_TILE_BYTES |
-   (INTEL_AMX_BYTES_PER_TILE << 16);
-*ebx = INTEL_AMX_BYTES_PER_ROW | (INTEL_AMX_TILE_MAX_NAMES << 16);
-*ecx = INTEL_AMX_TILE_MAX_ROWS;
+*eax = env->features[FEAT_1D_1_EAX];
+*ebx = env->features[FEAT_1D_1_EBX];
+*ecx = env->features[FEAT_1D_1_ECX];
 }
 break;
 }
@@ -5784,7 +5783,7 @@ void cpu_x86_cpuid(CPUX86State *env, uint32_t index, 
uint32_t count,
 
 if (count == 0) {
 /* Highest numbered palette subleaf */
-*ebx = INTEL_AMX_TMUL_MAX_K | (INTEL_AMX_TMUL_MAX_N << 8);
+*ebx = env->features[FEAT_1E_0_EBX];
 }
 break;
 }
-- 
2.34.1




[PATCH 3/6] i386: Introduce new struct "MultiBitFeatureInfo" for multi-bit features

2022-10-26 Thread Wang, Lei
Some features use multiple CPUID bits to form a value, e.g.,
CPUID(0x1E,0):EBX[23:8] is regarded as the tmul_maxn value for AMX.
Introduce a new struct "MultiBitFeatureInfo" to hold the information for
those features and create a corresponding member in struct FeatureWordInfo,
so that the information can be assigned for each item in the
feature_word_info array and used later.

Signed-off-by: Wang, Lei 
---
 target/i386/cpu-internal.h |  9 +++
 target/i386/cpu.c  | 54 ++
 2 files changed, 63 insertions(+)

diff --git a/target/i386/cpu-internal.h b/target/i386/cpu-internal.h
index 9baac5c0b4..66b3d66cb4 100644
--- a/target/i386/cpu-internal.h
+++ b/target/i386/cpu-internal.h
@@ -25,6 +25,13 @@ typedef enum FeatureWordType {
MSR_FEATURE_WORD,
 } FeatureWordType;
 
+typedef struct MultiBitFeatureInfo {
+const char *feat_name;
+uint64_t mask;
+unsigned high_bit_position;
+unsigned low_bit_position;
+} MultiBitFeatureInfo;
+
 typedef struct FeatureWordInfo {
 FeatureWordType type;
 /* feature flags names are taken from "Intel Processor Identification and
@@ -51,6 +58,8 @@ typedef struct FeatureWordInfo {
 uint64_t migratable_flags; /* Feature flags known to be migratable */
 /* Features that shouldn't be auto-enabled by "-cpu host" */
 uint64_t no_autoenable_flags;
+unsigned num_multi_bit_features;
+MultiBitFeatureInfo *multi_bit_features;
 } FeatureWordInfo;
 
 extern FeatureWordInfo feature_word_info[];
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 0083a2a7f7..7ae232ab18 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1011,6 +1011,21 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 },
 .migratable_flags = CPUID_AMX_PALETTE_1_TOTAL_TILE_BYTES_MASK |
 CPUID_AMX_PALETTE_1_BYTES_PER_TILE_MASK,
+.num_multi_bit_features = 2,
+.multi_bit_features = (MultiBitFeatureInfo[]){
+{
+.feat_name = "total_tile_bytes",
+.mask = CPUID_AMX_PALETTE_1_TOTAL_TILE_BYTES_MASK,
+.high_bit_position = 15,
+.low_bit_position = 0,
+},
+{
+.feat_name = "bytes_per_tile",
+.mask = CPUID_AMX_PALETTE_1_BYTES_PER_TILE_MASK,
+.high_bit_position = 31,
+.low_bit_position = 16,
+},
+},
 },
 [FEAT_1D_1_EBX] = {
 .type = CPUID_FEATURE_WORD,
@@ -1021,6 +1036,21 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 },
 .migratable_flags = CPUID_AMX_PALETTE_1_BYTES_PER_ROW_MASK |
 CPUID_AMX_PALETTE_1_MAX_NAMES_MASK,
+.num_multi_bit_features = 2,
+.multi_bit_features = (MultiBitFeatureInfo[]){
+{
+.feat_name = "bytes_per_row",
+.mask = CPUID_AMX_PALETTE_1_BYTES_PER_ROW_MASK,
+.high_bit_position = 15,
+.low_bit_position = 0,
+},
+{
+.feat_name = "max_names",
+.mask = CPUID_AMX_PALETTE_1_MAX_NAMES_MASK,
+.high_bit_position = 31,
+.low_bit_position = 16,
+},
+},
 },
 [FEAT_1D_1_ECX] = {
 .type = CPUID_FEATURE_WORD,
@@ -1030,6 +1060,15 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 .reg = R_ECX,
 },
 .migratable_flags = CPUID_AMX_PALETTE_1_MAX_ROWS_MASK,
+.num_multi_bit_features = 1,
+.multi_bit_features = (MultiBitFeatureInfo[]){
+{
+.feat_name = "max_rows",
+.mask = CPUID_AMX_PALETTE_1_MAX_ROWS_MASK,
+.high_bit_position = 15,
+.low_bit_position = 0,
+},
+},
 },
 [FEAT_1E_0_EBX] = {
 .type = CPUID_FEATURE_WORD,
@@ -1040,6 +1079,21 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 },
 .migratable_flags = CPUID_AMX_TMUL_MAX_K_MASK |
 CPUID_AMX_TMUL_MAX_N_MASK,
+.num_multi_bit_features = 2,
+.multi_bit_features = (MultiBitFeatureInfo[]){
+{
+.feat_name = "tmul_maxk",
+.mask = CPUID_AMX_TMUL_MAX_K_MASK,
+.high_bit_position = 7,
+.low_bit_position = 0,
+},
+{
+.feat_name = "tmul_maxn",
+.mask = CPUID_AMX_TMUL_MAX_N_MASK,
+.high_bit_position = 23,
+.low_bit_position = 8,
+},
+},
 },
 /*Below are MSR exposed features*/
 [FEAT_ARCH_CAPABILITIES] = {
-- 
2.34.1




[PATCH 6/6] i386: Add new CPU model SapphireRapids

2022-10-26 Thread Wang, Lei
The new CPU model mostly inherits features from Icelake-Server, while
adding new features:
 - AMX (Advanced Matrix Extensions)
 - Bus Lock Debug Exception
and new instructions:
 - AVX VNNI (Vector Neural Network Instruction):
- VPDPBUSD: Multiply and Add Unsigned and Signed Bytes
- VPDPBUSDS: Multiply and Add Unsigned and Signed Bytes with Saturation
- VPDPWSSD: Multiply and Add Signed Word Integers
- VPDPWSSDS: Multiply and Add Signed Integers with Saturation
 - FP16: Replicates existing AVX512 computational SP (FP32) instructions
   using FP16 instead of FP32 for ~2X performance gain
 - SERIALIZE: Provide software with a simple way to force the processor to
   complete all modifications, faster, allowed in all privilege levels and
   not causing an unconditional VM exit
 - TSX Suspend Load Address Tracking: Allows programmers to choose which
   memory accesses do not need to be tracked in the TSX read set
 - AVX512_BF16: Vector Neural Network Instructions supporting BFLOAT16
   inputs and conversion instructions from IEEE single precision

Features may be added in future versions:
 - CET (virtualization support hasn't been merged)
Instructions may be added in future versions:
 - fast zero-length MOVSB (KVM doesn't support yet)
 - fast short STOSB (KVM doesn't support yet)
 - fast short CMPSB, SCASB (KVM doesn't support yet)

Signed-off-by: Wang, Lei 
Reviewed-by: Robert Hoo 
---
 target/i386/cpu.c | 135 ++
 target/i386/cpu.h |   4 ++
 2 files changed, 139 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 21d9529d38..6bbca600e0 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -3576,6 +3576,141 @@ static const X86CPUDefinition builtin_x86_defs[] = {
 { /* end of list */ }
 }
 },
+{
+.name = "SapphireRapids",
+.level = 0x20,
+.vendor = CPUID_VENDOR_INTEL,
+.family = 6,
+.model = 143,
+.stepping = 4,
+/*
+ * please keep the ascending order so that we can have a clear view of
+ * bit position of each feature.
+ */
+.features[FEAT_1_EDX] =
+CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC |
+CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC |
+CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV |
+CPUID_PAT | CPUID_PSE36 | CPUID_CLFLUSH | CPUID_MMX | CPUID_FXSR |
+CPUID_SSE | CPUID_SSE2,
+.features[FEAT_1_ECX] =
+CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSSE3 |
+CPUID_EXT_FMA | CPUID_EXT_CX16 | CPUID_EXT_PCID | CPUID_EXT_SSE41 |
+CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE |
+CPUID_EXT_POPCNT | CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_AES |
+CPUID_EXT_XSAVE | CPUID_EXT_AVX | CPUID_EXT_F16C | 
CPUID_EXT_RDRAND,
+.features[FEAT_8000_0001_EDX] =
+CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB |
+CPUID_EXT2_RDTSCP | CPUID_EXT2_LM,
+.features[FEAT_8000_0001_ECX] =
+CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH,
+.features[FEAT_8000_0008_EBX] =
+CPUID_8000_0008_EBX_WBNOINVD,
+.features[FEAT_7_0_EBX] =
+CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_HLE |
+CPUID_7_0_EBX_AVX2 | CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 |
+CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_INVPCID | CPUID_7_0_EBX_RTM |
+CPUID_7_0_EBX_AVX512F | CPUID_7_0_EBX_AVX512DQ |
+CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP |
+CPUID_7_0_EBX_AVX512IFMA | CPUID_7_0_EBX_CLFLUSHOPT |
+CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_AVX512CD | CPUID_7_0_EBX_SHA_NI 
|
+CPUID_7_0_EBX_AVX512BW | CPUID_7_0_EBX_AVX512VL,
+.features[FEAT_7_0_ECX] =
+CPUID_7_0_ECX_AVX512_VBMI | CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU 
|
+CPUID_7_0_ECX_AVX512_VBMI2 | CPUID_7_0_ECX_GFNI |
+CPUID_7_0_ECX_VAES | CPUID_7_0_ECX_VPCLMULQDQ |
+CPUID_7_0_ECX_AVX512VNNI | CPUID_7_0_ECX_AVX512BITALG |
+CPUID_7_0_ECX_AVX512_VPOPCNTDQ | CPUID_7_0_ECX_LA57 |
+CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_BUS_LOCK_DETECT,
+.features[FEAT_7_0_EDX] =
+CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_SERIALIZE |
+CPUID_7_0_EDX_TSX_LDTRK | CPUID_7_0_EDX_AMX_BF16 |
+CPUID_7_0_EDX_AVX512_FP16 | CPUID_7_0_EDX_AMX_TILE |
+CPUID_7_0_EDX_AMX_INT8 | CPUID_7_0_EDX_SPEC_CTRL |
+CPUID_7_0_EDX_ARCH_CAPABILITIES | CPUID_7_0_EDX_SPEC_CTRL_SSBD,
+.features[FEAT_ARCH_CAPABILITIES] =
+MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL |
+MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO |
+MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_TAA_NO,
+.features[

[PATCH 1/6] i386: Introduce FeatureWordInfo for AMX CPUID leaf 0x1D and 0x1E

2022-10-26 Thread Wang, Lei
CPUID leaves 0x1D and 0x1E enumerate tile and TMUL information for AMX.

Introduce FeatureWords FEAT_1D_1_EAX, FEAT_1D_1_EBX, FEAT_1D_1_ECX and
FEAT_1E_0_EBX. Thus these AMX features can be expanded with
"-cpu host/max" and can be configured in a named CPU model.

Signed-off-by: Wang, Lei 
---
 target/i386/cpu.c | 55 +++
 target/i386/cpu.h | 12 +++
 2 files changed, 67 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 8a11470507..e98780773c 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -1002,6 +1002,45 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
 },
 .tcg_features = ~0U,
 },
+[FEAT_1D_1_EAX] = {
+.type = CPUID_FEATURE_WORD,
+.cpuid = {
+.eax = 0x1D,
+.needs_ecx = true, .ecx = 1,
+.reg = R_EAX,
+},
+.migratable_flags = CPUID_AMX_PALETTE_1_TOTAL_TILE_BYTES_MASK |
+CPUID_AMX_PALETTE_1_BYTES_PER_TILE_MASK,
+},
+[FEAT_1D_1_EBX] = {
+.type = CPUID_FEATURE_WORD,
+.cpuid = {
+.eax = 0x1D,
+.needs_ecx = true, .ecx = 1,
+.reg = R_EBX,
+},
+.migratable_flags = CPUID_AMX_PALETTE_1_BYTES_PER_ROW_MASK |
+CPUID_AMX_PALETTE_1_MAX_NAMES_MASK,
+},
+[FEAT_1D_1_ECX] = {
+.type = CPUID_FEATURE_WORD,
+.cpuid = {
+.eax = 0x1D,
+.needs_ecx = true, .ecx = 1,
+.reg = R_ECX,
+},
+.migratable_flags = CPUID_AMX_PALETTE_1_MAX_ROWS_MASK,
+},
+[FEAT_1E_0_EBX] = {
+.type = CPUID_FEATURE_WORD,
+.cpuid = {
+.eax = 0x1E,
+.needs_ecx = true, .ecx = 0,
+.reg = R_EBX,
+},
+.migratable_flags = CPUID_AMX_TMUL_MAX_K_MASK |
+CPUID_AMX_TMUL_MAX_N_MASK,
+},
 /*Below are MSR exposed features*/
 [FEAT_ARCH_CAPABILITIES] = {
 .type = MSR_FEATURE_WORD,
@@ -1371,6 +1410,22 @@ static FeatureDep feature_dependencies[] = {
 .from = { FEAT_7_0_EBX, CPUID_7_0_EBX_INTEL_PT },
 .to = { FEAT_14_0_ECX,  ~0ull },
 },
+{
+.from = { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE },
+.to = { FEAT_1D_1_EAX,  ~0ull },
+},
+{
+.from = { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE },
+.to = { FEAT_1D_1_EBX,  ~0ull },
+},
+{
+.from = { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE },
+.to = { FEAT_1D_1_ECX,  ~0ull },
+},
+{
+.from = { FEAT_7_0_EDX, CPUID_7_0_EDX_AMX_TILE },
+.to = { FEAT_1E_0_EBX,  ~0ull },
+},
 {
 .from = { FEAT_8000_0001_EDX,   CPUID_EXT2_RDTSCP },
 .to = { FEAT_VMX_SECONDARY_CTLS,VMX_SECONDARY_EXEC_RDTSCP },
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 7edf5dfac3..1c90fb6c9d 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -583,6 +583,14 @@ typedef enum X86Seg {
  XSTATE_Hi16_ZMM_MASK | XSTATE_PKRU_MASK | \
  XSTATE_XTILE_CFG_MASK | 
XSTATE_XTILE_DATA_MASK)
 
+#define CPUID_AMX_PALETTE_1_TOTAL_TILE_BYTES_MASK 0xffffU
+#define CPUID_AMX_PALETTE_1_BYTES_PER_TILE_MASK   (0xffffU << 16)
+#define CPUID_AMX_PALETTE_1_BYTES_PER_ROW_MASK    0xffffU
+#define CPUID_AMX_PALETTE_1_MAX_NAMES_MASK        (0xffffU << 16)
+#define CPUID_AMX_PALETTE_1_MAX_ROWS_MASK 0xffffU
+#define CPUID_AMX_TMUL_MAX_K_MASK 0xffU
+#define CPUID_AMX_TMUL_MAX_N_MASK (0xffffU << 8)
+
 /* CPUID feature words */
 typedef enum FeatureWord {
 FEAT_1_EDX, /* CPUID[1].EDX */
@@ -603,6 +611,10 @@ typedef enum FeatureWord {
 FEAT_6_EAX, /* CPUID[6].EAX */
 FEAT_XSAVE_XCR0_LO, /* CPUID[EAX=0xd,ECX=0].EAX */
 FEAT_XSAVE_XCR0_HI, /* CPUID[EAX=0xd,ECX=0].EDX */
+FEAT_1D_1_EAX,  /* CPUID[EAX=0x1d,ECX=1].EAX */
+FEAT_1D_1_EBX,  /* CPUID[EAX=0x1d,ECX=1].EBX */
+FEAT_1D_1_ECX,  /* CPUID[EAX=0x1d,ECX=1].ECX */
+FEAT_1E_0_EBX,  /* CPUID[EAX=0x1e,ECX=0].EBX */
 FEAT_ARCH_CAPABILITIES,
 FEAT_CORE_CAPABILITY,
 FEAT_PERF_CAPABILITIES,
-- 
2.34.1




[PATCH] .gitignore: add multiple items to .gitignore

2022-10-20 Thread Wang, Lei
Add /.vscode/, .clang-format and .gdb_history to .gitignore because:

 - For VSCode, workspace settings as well as debugging and task
 configurations are stored at the root in a .vscode folder;
 - For ClangFormat, the .clang-format file is searched relative to
 the current working directory when reading stdin;
 - For GDB, the command history file defaults to the value of the
 environment variable GDBHISTFILE, or to ./.gdb_history if this
 variable is not set.

Signed-off-by: Wang, Lei 
---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 8aab671265..61fa39967b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,10 +1,13 @@
 /GNUmakefile
 /build/
 /.cache/
+/.vscode/
 *.pyc
 .sdk
 .stgit-*
 .git-submodule-status
+.clang-format
+.gdb_history
 cscope.*
 tags
 TAGS
-- 
2.34.1




Re: [PATCH] qemu-config: extract same logic in *_add_opts() to fill_config_groups()

2022-10-16 Thread Wang, Lei
Kindly ping for any comments.

BR,
Lei

On 9/2/2022 3:57 PM, Markus Armbruster wrote:
> Cc: Gerd & Kevin, because they were involved with the code that gets
> refactored here, and no good deed shall go unpunished.
> 
> "Wang, Lei"  writes:
> 
>> QEMU use qemu_add_opts() and qemu_add_drive_opts() to add config options
>> when initialization. Extract the same logic in both functions to a
>> seperate function fill_config_groups() to reduce code redundency.
>>
>> Signed-off-by: Wang, Lei 
>> ---
>>  util/qemu-config.c | 39 ---
>>  1 file changed, 20 insertions(+), 19 deletions(-)
>>
>> diff --git a/util/qemu-config.c b/util/qemu-config.c
>> index 433488aa56..3a1c85223a 100644
>> --- a/util/qemu-config.c
>> +++ b/util/qemu-config.c
>> @@ -282,36 +282,37 @@ QemuOptsList *qemu_find_opts_err(const char *group, 
>> Error **errp)
>>  return find_list(vm_config_groups, group, errp);
>>  }
>>  
>> -void qemu_add_drive_opts(QemuOptsList *list)
>> +static int fill_config_groups(QemuOptsList *groups[], int entries,
>> +  QemuOptsList *list)
>>  {
>> -int entries, i;
>> +int i;
>>  
>> -entries = ARRAY_SIZE(drive_config_groups);
>>  entries--; /* keep list NULL terminated */
>>  for (i = 0; i < entries; i++) {
>> -if (drive_config_groups[i] == NULL) {
>> -drive_config_groups[i] = list;
>> -return;
>> +if (groups[i] == NULL) {
>> +groups[i] = list;
>> +return 0;
>>  }
>>  }
>> -fprintf(stderr, "ran out of space in drive_config_groups");
>> -abort();
>> +return -1;
>>  }
>>  
>> -void qemu_add_opts(QemuOptsList *list)
>> +void qemu_add_drive_opts(QemuOptsList *list)
>>  {
>> -int entries, i;
>> +if (fill_config_groups(drive_config_groups, 
>> ARRAY_SIZE(drive_config_groups),
>> +   list) < 0) {
>> +fprintf(stderr, "ran out of space in drive_config_groups");
>> +abort();
>> +}
>> +}
>>  
>> -entries = ARRAY_SIZE(vm_config_groups);
>> -entries--; /* keep list NULL terminated */
>> -for (i = 0; i < entries; i++) {
>> -if (vm_config_groups[i] == NULL) {
>> -vm_config_groups[i] = list;
>> -return;
>> -}
>> +void qemu_add_opts(QemuOptsList *list)
>> +{
>> +if (fill_config_groups(vm_config_groups, ARRAY_SIZE(vm_config_groups),
>> +   list) < 0) {
>> +fprintf(stderr, "ran out of space in vm_config_groups");
>> +abort();
>>  }
>> -fprintf(stderr, "ran out of space in vm_config_groups");
>> -abort();
>>  }
>>  
>>  /* Returns number of config groups on success, -errno on error */
> 



[PATCH] .gitignore: add .cache/ to .gitignore

2022-09-07 Thread Wang, Lei
$PROJECT/.cache/clangd/index is the intended location for project index
data when using clangd as the language server. Ignore this directory to
keep the git status clean.

Signed-off-by: Wang, Lei 
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 9726a778b3..8aab671265 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 /GNUmakefile
 /build/
+/.cache/
 *.pyc
 .sdk
 .stgit-*
-- 
2.37.3




[PATCH] qemu-config: extract same logic in *_add_opts() to fill_config_groups()

2022-09-02 Thread Wang, Lei
QEMU uses qemu_add_opts() and qemu_add_drive_opts() to add config options
during initialization. Extract the logic shared by both functions into a
separate function, fill_config_groups(), to reduce code redundancy.

Signed-off-by: Wang, Lei 
---
 util/qemu-config.c | 39 ---
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/util/qemu-config.c b/util/qemu-config.c
index 433488aa56..3a1c85223a 100644
--- a/util/qemu-config.c
+++ b/util/qemu-config.c
@@ -282,36 +282,37 @@ QemuOptsList *qemu_find_opts_err(const char *group, Error **errp)
 return find_list(vm_config_groups, group, errp);
 }
 
-void qemu_add_drive_opts(QemuOptsList *list)
+static int fill_config_groups(QemuOptsList *groups[], int entries,
+  QemuOptsList *list)
 {
-int entries, i;
+int i;
 
-entries = ARRAY_SIZE(drive_config_groups);
 entries--; /* keep list NULL terminated */
 for (i = 0; i < entries; i++) {
-if (drive_config_groups[i] == NULL) {
-drive_config_groups[i] = list;
-return;
+if (groups[i] == NULL) {
+groups[i] = list;
+return 0;
 }
 }
-fprintf(stderr, "ran out of space in drive_config_groups");
-abort();
+return -1;
 }
 
-void qemu_add_opts(QemuOptsList *list)
+void qemu_add_drive_opts(QemuOptsList *list)
 {
-int entries, i;
+if (fill_config_groups(drive_config_groups, ARRAY_SIZE(drive_config_groups),
+   list) < 0) {
+fprintf(stderr, "ran out of space in drive_config_groups");
+abort();
+}
+}
 
-entries = ARRAY_SIZE(vm_config_groups);
-entries--; /* keep list NULL terminated */
-for (i = 0; i < entries; i++) {
-if (vm_config_groups[i] == NULL) {
-vm_config_groups[i] = list;
-return;
-}
+void qemu_add_opts(QemuOptsList *list)
+{
+if (fill_config_groups(vm_config_groups, ARRAY_SIZE(vm_config_groups),
+   list) < 0) {
+fprintf(stderr, "ran out of space in vm_config_groups");
+abort();
 }
-fprintf(stderr, "ran out of space in vm_config_groups");
-abort();
 }
 
 /* Returns number of config groups on success, -errno on error */
-- 
2.37.1
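
The refactoring above relies on a NULL-terminated registry array. Below is
a minimal, self-contained sketch of the same pattern; the names OptsList,
fill_groups and MAX_GROUPS are illustrative stand-ins, not QEMU API:

    #include <stdio.h>
    #include <stdlib.h>

    #define MAX_GROUPS 4

    /* Illustrative stand-in for QemuOptsList. */
    typedef struct OptsList { const char *name; } OptsList;

    /* Static storage is zero-initialized, so every slot starts NULL. */
    static OptsList *groups[MAX_GROUPS + 1];

    static int fill_groups(OptsList *arr[], int entries, OptsList *list)
    {
        entries--; /* keep list NULL terminated */
        for (int i = 0; i < entries; i++) {
            if (arr[i] == NULL) {
                arr[i] = list;
                return 0;
            }
        }
        return -1; /* no free slot left */
    }

    int main(void)
    {
        static OptsList drive = { "drive" };

        if (fill_groups(groups, MAX_GROUPS + 1, &drive) < 0) {
            fprintf(stderr, "ran out of space in groups");
            abort();
        }
        /* Consumers can walk to the terminator without a length. */
        for (OptsList **p = groups; *p != NULL; p++) {
            printf("registered: %s\n", (*p)->name);
        }
        return 0;
    }

Decrementing entries before the scan is what guarantees the final slot is
never overwritten, so lookup code never needs a separate length variable.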




Re: [Qemu-devel] [RFC PATCH] Add qemu .clang-format

2022-09-01 Thread Wang, Lei
On 9/1/2022 7:55 PM, Alex Bennée wrote:
> 
> "Wang, Lei"  writes:
> 
>> On 9/1/2022 4:12 PM, Daniel P. Berrangé wrote:
>>> On Thu, Sep 01, 2022 at 09:08:33AM +0800, Wang, Lei wrote:
>>>> On 8/31/2022 6:39 PM, Daniel P. Berrangé wrote:
>>>>> On Wed, Aug 31, 2022 at 05:18:34PM +0800, Wang, Lei wrote:
>>>>>>
>>>>>>
>>>>>> On 8/31/2022 4:49 PM, Daniel P. Berrangé wrote:
>>>>>>> On Wed, Aug 31, 2022 at 02:23:51PM +0800, Wang, Lei wrote:
>>>>>>>>
>>>>>>>> On 10/2/2015 1:30 AM, marcandre.lur...@redhat.com wrote:
>>>>>>>>> From: Marc-André Lureau 
>>>>>>>>>
>>>>>>>>> clang-format is awesome to reflow your code according to qemu coding
>>>>>>>>> style in an editor (in the region you modify).
>>>>>>>>>
>>>>>>>>> (note: clang-tidy should be able to add missing braces around
>>>>>>>>> statements, but I haven't tried it, it's quite recent)
>>>>>>>>>
>>>>>>>>> Signed-off-by: Marc-André Lureau 
>>>>>>>>> ---
>>>>>>>>> .clang-format | 6 ++
>>>>>>>>> 1 file changed, 6 insertions(+)
>>>>>>>>> create mode 100644 .clang-format
>>>>>>>>>
>>>>>>>>> diff --git a/.clang-format b/.clang-format
>>>>>>>>> new file mode 100644
>>>>>>>>> index 000..6422547
>>>>>>>>> --- /dev/null
>>>>>>>>> +++ b/.clang-format
>>>>>>>>> @@ -0,0 +1,6 @@
>>>>>>>>> +BasedOnStyle: LLVM
>>>>>>>>> +IndentWidth: 4
>>>>>>>>> +UseTab: Never
>>>>>>>>> +BreakBeforeBraces: Linux
>>>>>>>>> +AllowShortIfStatementsOnASingleLine: false
>>>>>>>>> +IndentCaseLabels: false
>>>>>>>>
>>>>>>>> Hi, any progress on this? I also found a gist on GitHub which can be a
>>>>>>>> reference: 
>>>>>>>> https://gist.github.com/elmarco/aa5e0b23567f46fb7f0e73cde586a0c1
>>>>>>>
>>>>>>> clang-format is a great tool and I'd highly recommend its use on
>>>>>>> any newly started projects, and even retrospectively on existing
>>>>>>> projects which are small scale. Adding it to large existing projects
>>>>>>> is problematic though.
>>>>>>>
>>>>>>> None of the QEMU code complies with it today and indeed there is
>>>>>>> quite a bit of style variance across different parts of QEMU. If
>>>>>>> we add this config file, and someone makes a 1 line change in a
>>>>>>> file, clang-format will reformat the entire file contents.
>>>>>>>
>>>>>>> The only practical way to introduce use of clang-format would be
>>>>>>> to do a bulk reformat of the entire codebase. That is something
>>>>>>> that is quite disruptive to both people with patches they're
>>>>>>> working on but not submitted yet, as well as people wanting to
>>>>>>> cherry-pick new commits back to old code branches.
>>>>>>>
>>>>>>> With regards,
>>>>>>> Daniel
>>>>>>
>>>>>> I think the benefit of introducing clang-format lies mainly in its
>>>>>> ability to format a code range, which means that for any future
>>>>>> contribution we could encourage a range format before the patch is
>>>>>> generated. This can greatly simplify my workflow, especially because
>>>>>> I use the Neovim + LSP combination, which supports the built-in
>>>>>> function "lua vim.lsp.buf.range_formatting()".
>>>>>
>>>>> IMHO partial format conversions are even worse than full conversions,
>>>>> because they would make code inconsistent within the scope of a file.
>>>>
>>>> So you mean when we're adding new code in an old file, the coding style
>>>> should also be the old one? That sounds a bit unreasonable. I thought we
>>>> were shifting the coding style in an on-demand way, so that we could
>>>> eventually converge on the new style gradually; if we used the old coding
>>>> style every time, that would be impossible.

Re: [Qemu-devel] [RFC PATCH] Add qemu .clang-format

2022-09-01 Thread Wang, Lei
On 9/1/2022 4:12 PM, Daniel P. Berrangé wrote:
> On Thu, Sep 01, 2022 at 09:08:33AM +0800, Wang, Lei wrote:
>> On 8/31/2022 6:39 PM, Daniel P. Berrangé wrote:
>>> On Wed, Aug 31, 2022 at 05:18:34PM +0800, Wang, Lei wrote:
>>>>
>>>>
>>>> On 8/31/2022 4:49 PM, Daniel P. Berrangé wrote:
>>>>> On Wed, Aug 31, 2022 at 02:23:51PM +0800, Wang, Lei wrote:
>>>>>>
>>>>>> On 10/2/2015 1:30 AM, marcandre.lur...@redhat.com wrote:
>>>>>>> From: Marc-André Lureau 
>>>>>>>
>>>>>>> clang-format is awesome to reflow your code according to qemu coding
>>>>>>> style in an editor (in the region you modify).
>>>>>>>
>>>>>>> (note: clang-tidy should be able to add missing braces around
>>>>>>> statements, but I haven't tried it, it's quite recent)
>>>>>>>
>>>>>>> Signed-off-by: Marc-André Lureau 
>>>>>>> ---
>>>>>>> .clang-format | 6 ++
>>>>>>> 1 file changed, 6 insertions(+)
>>>>>>> create mode 100644 .clang-format
>>>>>>>
>>>>>>> diff --git a/.clang-format b/.clang-format
>>>>>>> new file mode 100644
>>>>>>> index 000..6422547
>>>>>>> --- /dev/null
>>>>>>> +++ b/.clang-format
>>>>>>> @@ -0,0 +1,6 @@
>>>>>>> +BasedOnStyle: LLVM
>>>>>>> +IndentWidth: 4
>>>>>>> +UseTab: Never
>>>>>>> +BreakBeforeBraces: Linux
>>>>>>> +AllowShortIfStatementsOnASingleLine: false
>>>>>>> +IndentCaseLabels: false
>>>>>>
>>>>>> Hi, any progress on this? I also found a gist on GitHub which can be a
>>>>>> reference: 
>>>>>> https://gist.github.com/elmarco/aa5e0b23567f46fb7f0e73cde586a0c1
>>>>>
>>>>> clang-format is a great tool and I'd highly recommend its use on
>>>>> any newly started projects, and even retrospectively on existing
>>>>> projects which are small scale. Adding it to large existing projects
>>>>> is problematic though.
>>>>>
>>>>> None of the QEMU code complies with it today and indeed there is
>>>>> quite a bit of style variance across different parts of QEMU. If
>>>>> we add this config file, and someone makes a 1 line change in a
>>>>> file, clang-format will reformat the entire file contents.
>>>>>
>>>>> The only practical way to introduce use of clang-format would be
>>>>> to do a bulk reformat of the entire codebase. That is something
>>>>> that is quite disruptive to both people with patches they're
>>>>> working on but not submitted yet, as well as people wanting to
>>>>> cherry-pick new commits back to old code branches.
>>>>>
>>>>> With regards,
>>>>> Daniel
>>>>
>>>> I think the benefit of introducing clang-format lies mainly in its
>>>> ability to format a code range, which means that for any future
>>>> contribution we could encourage a range format before the patch is
>>>> generated. This can greatly simplify my workflow, especially because
>>>> I use the Neovim + LSP combination, which supports the built-in
>>>> function "lua vim.lsp.buf.range_formatting()".
>>>
>>> IMHO partial format conversions are even worse than full conversions,
>>> because they would make code inconsistent within the scope of a file.
>>
>> So you mean when we're adding new code in an old file, the coding style
>> should also be the old one? That sounds a bit unreasonable. I thought we
>> were shifting the coding style in an on-demand way, so that we could
>> eventually converge on the new style gradually; if we used the old coding
>> style every time, that would be impossible.
> 
> From my POV as a maintainer, the best situation would be consistency across
> the entire codebase. Since we likely won't get that though, then next best
> is consistency across the subsystem directory, and next best is consistency
> across the whole file.  Mixing code styles within a file is the worst IMHO.
> 
>>
>>>> I have no interest in reformatting the existing code and also think
>>>> using it to reformat an entire file shouldn't be encouraged, but we can
>>>> leverage this tool to give future contributions a better experience.
>>>> It's also worth noting that the kernel already has a ".clang-format"
>>>> file, so I think we can give it a try :)

Re: [Qemu-devel] [RFC PATCH] Add qemu .clang-format

2022-08-31 Thread Wang, Lei

On 8/31/2022 6:39 PM, Daniel P. Berrangé wrote:

On Wed, Aug 31, 2022 at 05:18:34PM +0800, Wang, Lei wrote:



On 8/31/2022 4:49 PM, Daniel P. Berrangé wrote:

On Wed, Aug 31, 2022 at 02:23:51PM +0800, Wang, Lei wrote:


On 10/2/2015 1:30 AM, marcandre.lur...@redhat.com wrote:

From: Marc-André Lureau 

clang-format is awesome to reflow your code according to qemu coding
style in an editor (in the region you modify).

(note: clang-tidy should be able to add missing braces around
statements, but I haven't tried it, it's quite recent)

Signed-off-by: Marc-André Lureau 
---
.clang-format | 6 ++
1 file changed, 6 insertions(+)
create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000..6422547
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,6 @@
+BasedOnStyle: LLVM
+IndentWidth: 4
+UseTab: Never
+BreakBeforeBraces: Linux
+AllowShortIfStatementsOnASingleLine: false
+IndentCaseLabels: false


Hi, any progress on this? I also found a gist on GitHub which can be a
reference: https://gist.github.com/elmarco/aa5e0b23567f46fb7f0e73cde586a0c1


clang-format is a great tool and I'd highly recommend its use on
any newly started projects, and even retrospectively on existing
projects which are small scale. Adding it to large existing projects
is problematic though.

None of the QEMU code complies with it today and indeed there is
quite a bit of style variance across different parts of QEMU. If
we add this config file, and someone makes a 1 line change in a
file, clang-format will reformat the entire file contents.

The only practical way to introduce use of clang-format would be
to do a bulk reformat of the entire codebase. That is something
that is quite disruptive to both people with patches they're
working on but not submitted yet, as well as people wanting to
cherry-pick new commits back to old code branches.

With regards,
Daniel


I think the benefit of introducing clang-format lies mainly in its ability
to format a code range, which means that for any future contribution we
could encourage a range format before the patch is generated. This can
greatly simplify my workflow, especially because I use the Neovim + LSP
combination, which supports the built-in function
"lua vim.lsp.buf.range_formatting()".


IMHO partial format conversions are even worse than full conversions,
because they would make code inconsistent within the scope of a file.


So you mean when we're adding new code in an old file, the coding style
should also be the old one? That sounds a bit unreasonable. I thought we
were shifting the coding style in an on-demand way, so that we could
eventually converge on the new style gradually; if we used the old coding
style every time, that would be impossible.



I have no interest in reformatting the existing code and also think using
it to reformat an entire file shouldn't be encouraged, but we can leverage
this tool to give future contributions a better experience. It's also
worth noting that the kernel already has a ".clang-format" file, so I
think we can give it a try :)


The mere action of introducing a .clang-format file in the root of the
repository will cause some contributors' editors to automatically
reformat files every time they are saved. IOW even if you don't want
intend to do reformatting, that will be a net result.

With regards,
Daniel


I think that depends on the developer's configuration. As far as I know,
format on save is a feature that can easily be disabled in most IDEs,
such as VSCode.
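
For what it's worth, the range-based workflow discussed above can also be
driven from the command line; assuming a reasonably recent clang-format,
something like the following (file name and line range are purely
illustrative) reformats only the chosen region:

    # format only lines 120-140 of one file, in place
    clang-format -i --lines=120:140 util/qemu-config.c

    # or let the git-clang-format helper reformat just the lines that
    # differ from a given commit
    git clang-format HEAD~1

Both leave the rest of the file untouched, which sidesteps the whole-file
reflow concern raised earlier in the thread.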




Re: [Qemu-devel] [RFC PATCH] Add qemu .clang-format

2022-08-31 Thread Wang, Lei




On 8/31/2022 4:49 PM, Daniel P. Berrangé wrote:

On Wed, Aug 31, 2022 at 02:23:51PM +0800, Wang, Lei wrote:


On 10/2/2015 1:30 AM, marcandre.lur...@redhat.com wrote:

From: Marc-André Lureau 

clang-format is awesome to reflow your code according to qemu coding
style in an editor (in the region you modify).

(note: clang-tidy should be able to add missing braces around
statements, but I haven't tried it, it's quite recent)

Signed-off-by: Marc-André Lureau 
---
   .clang-format | 6 ++
   1 file changed, 6 insertions(+)
   create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000..6422547
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,6 @@
+BasedOnStyle: LLVM
+IndentWidth: 4
+UseTab: Never
+BreakBeforeBraces: Linux
+AllowShortIfStatementsOnASingleLine: false
+IndentCaseLabels: false


Hi, any progress on this? I also found a gist on GitHub which can be a
reference: https://gist.github.com/elmarco/aa5e0b23567f46fb7f0e73cde586a0c1


clang-format is a great tool and I'd highly recommend its use on
any newly started projects, and even retrospectively on existing
projects which are small scale. Adding it to large existing projects
is problematic though.

None of the QEMU code complies with it today and indeed there is
quite a bit of style variance across different parts of QEMU. If
we add this config file, and someone makes a 1 line change in a
file, clang-format will reformat the entire file contents.

The only practical way to introduce use of clang-format would be
to do a bulk reformat of the entire codebase. That is something
that is quite disruptive to both people with patches they're
working on but not submitted yet, as well as people wanting to
cherry-pick new commits back to old code branches.

With regards,
Daniel


I think the benefit of introducing clang-format lies mainly in its ability
to format a code range, which means that for any future contribution we
could encourage a range format before the patch is generated. This can
greatly simplify my workflow, especially because I use the Neovim + LSP
combination, which supports the built-in function
"lua vim.lsp.buf.range_formatting()".


I have no interest in reformatting the existing code and also think using
it to reformat an entire file shouldn't be encouraged, but we can leverage
this tool to give future contributions a better experience. It's also
worth noting that the kernel already has a ".clang-format" file, so I
think we can give it a try :)


BR,
Lei



Re: [Qemu-devel] [RFC PATCH] Add qemu .clang-format

2022-08-31 Thread Wang, Lei



On 10/2/2015 1:30 AM, marcandre.lur...@redhat.com wrote:

From: Marc-André Lureau 

clang-format is awesome to reflow your code according to qemu coding
style in an editor (in the region you modify).

(note: clang-tidy should be able to add missing braces around
statements, but I haven't tried it, it's quite recent)

Signed-off-by: Marc-André Lureau 
---
  .clang-format | 6 ++
  1 file changed, 6 insertions(+)
  create mode 100644 .clang-format

diff --git a/.clang-format b/.clang-format
new file mode 100644
index 000..6422547
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,6 @@
+BasedOnStyle: LLVM
+IndentWidth: 4
+UseTab: Never
+BreakBeforeBraces: Linux
+AllowShortIfStatementsOnASingleLine: false
+IndentCaseLabels: false


Hi, any progress on this? I also found a gist on GitHub which can be a 
reference: https://gist.github.com/elmarco/aa5e0b23567f46fb7f0e73cde586a0c1




[PATCH] i386: Add new CPU model SapphireRapids

2022-08-11 Thread Wang, Lei
The new CPU model mostly inherits features from Icelake-Server, while
adding new features:
 - AMX (Advanced Matrix Extensions)
 - Bus Lock Debug Exception
and new instructions:
 - AVX VNNI (Vector Neural Network Instruction):
- VPDPBUSD: Multiply and Add Unsigned and Signed Bytes
- VPDPBUSDS: Multiply and Add Unsigned and Signed Bytes with Saturation
- VPDPWSSD: Multiply and Add Signed Word Integers
- VPDPWSSDS: Multiply and Add Signed Integers with Saturation
 - FP16: Replicates existing AVX512 computational SP (FP32) instructions
   using FP16 instead of FP32 for ~2X performance gain
 - SERIALIZE: Provides software with a simple way to force the processor to
   complete all modifications from prior instructions; it is faster than
   existing serializing instructions, allowed at all privilege levels, and
   does not cause an unconditional VM exit
 - TSX Suspend Load Address Tracking: Allows programmers to choose which
   memory accesses do not need to be tracked in the TSX read set
 - AVX512_BF16: Vector Neural Network Instructions supporting BFLOAT16
   inputs and conversion instructions from IEEE single precision

Features may be added in future versions:
 - CET (virtualization support hasn't been merged)
Instructions may be added in future versions:
 - fast zero-length MOVSB (KVM doesn't support yet)
 - fast short STOSB (KVM doesn't support yet)
 - fast short CMPSB, SCASB (KVM doesn't support yet)
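
As a quick, hedged illustration (not part of this patch): guest software
can confirm that several of the new features are actually exposed by
querying CPUID leaf 7 directly. The sketch below assumes GCC or Clang for
<cpuid.h>, and the bit positions follow the Intel SDM; double-check them
against the current manual before relying on them:

    #include <stdio.h>
    #include <cpuid.h> /* GCC/Clang CPUID helpers */

    int main(void)
    {
        unsigned int eax, ebx, ecx, edx;

        /* CPUID.(EAX=7,ECX=0): structured extended feature flags */
        if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
            return 1; /* leaf 7 not supported */
        }
        printf("SERIALIZE:   %s\n", (edx >> 14) & 1 ? "yes" : "no");
        printf("TSX-LDTRK:   %s\n", (edx >> 16) & 1 ? "yes" : "no");
        printf("AVX512-FP16: %s\n", (edx >> 23) & 1 ? "yes" : "no");
        printf("AMX-TILE:    %s\n", (edx >> 24) & 1 ? "yes" : "no");
        return 0;
    }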

Signed-off-by: Wang, Lei 
Reviewed-by: Robert Hoo 
---
 target/i386/cpu.c | 128 ++
 target/i386/cpu.h |   4 ++
 2 files changed, 132 insertions(+)

diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index 1db1278a59..abb43853d4 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -3467,6 +3467,134 @@ static const X86CPUDefinition builtin_x86_defs[] = {
 { /* end of list */ }
 }
 },
+{
+.name = "SapphireRapids",
+.level = 0x20,
+.vendor = CPUID_VENDOR_INTEL,
+.family = 6,
+.model = 143,
+.stepping = 4,
+/*
+ * Please keep the ascending order so that we can have a clear view of
+ * the bit position of each feature.
+ */
+.features[FEAT_1_EDX] =
+CPUID_FP87 | CPUID_VME | CPUID_DE | CPUID_PSE | CPUID_TSC |
+CPUID_MSR | CPUID_PAE | CPUID_MCE | CPUID_CX8 | CPUID_APIC |
+CPUID_SEP | CPUID_MTRR | CPUID_PGE | CPUID_MCA | CPUID_CMOV |
+CPUID_PAT | CPUID_PSE36 | CPUID_CLFLUSH | CPUID_MMX | CPUID_FXSR |
+CPUID_SSE | CPUID_SSE2,
+.features[FEAT_1_ECX] =
+CPUID_EXT_SSE3 | CPUID_EXT_PCLMULQDQ | CPUID_EXT_SSSE3 |
+CPUID_EXT_FMA | CPUID_EXT_CX16 | CPUID_EXT_PCID | CPUID_EXT_SSE41 |
+CPUID_EXT_SSE42 | CPUID_EXT_X2APIC | CPUID_EXT_MOVBE |
+CPUID_EXT_POPCNT | CPUID_EXT_TSC_DEADLINE_TIMER | CPUID_EXT_AES |
+CPUID_EXT_XSAVE | CPUID_EXT_AVX | CPUID_EXT_F16C | CPUID_EXT_RDRAND,
+.features[FEAT_8000_0001_EDX] =
+CPUID_EXT2_SYSCALL | CPUID_EXT2_NX | CPUID_EXT2_PDPE1GB |
+CPUID_EXT2_RDTSCP | CPUID_EXT2_LM,
+.features[FEAT_8000_0001_ECX] =
+CPUID_EXT3_LAHF_LM | CPUID_EXT3_ABM | CPUID_EXT3_3DNOWPREFETCH,
+.features[FEAT_8000_0008_EBX] =
+CPUID_8000_0008_EBX_WBNOINVD,
+.features[FEAT_7_0_EBX] =
+CPUID_7_0_EBX_FSGSBASE | CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_HLE |
+CPUID_7_0_EBX_AVX2 | CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_BMI2 |
+CPUID_7_0_EBX_ERMS | CPUID_7_0_EBX_INVPCID | CPUID_7_0_EBX_RTM |
+CPUID_7_0_EBX_AVX512F | CPUID_7_0_EBX_AVX512DQ |
+CPUID_7_0_EBX_RDSEED | CPUID_7_0_EBX_ADX | CPUID_7_0_EBX_SMAP |
+CPUID_7_0_EBX_AVX512IFMA | CPUID_7_0_EBX_CLFLUSHOPT |
+CPUID_7_0_EBX_CLWB | CPUID_7_0_EBX_AVX512CD | CPUID_7_0_EBX_SHA_NI |
+CPUID_7_0_EBX_AVX512BW | CPUID_7_0_EBX_AVX512VL,
+.features[FEAT_7_0_ECX] =
+CPUID_7_0_ECX_AVX512_VBMI | CPUID_7_0_ECX_UMIP | CPUID_7_0_ECX_PKU |
+CPUID_7_0_ECX_AVX512_VBMI2 | CPUID_7_0_ECX_GFNI |
+CPUID_7_0_ECX_VAES | CPUID_7_0_ECX_VPCLMULQDQ |
+CPUID_7_0_ECX_AVX512VNNI | CPUID_7_0_ECX_AVX512BITALG |
+CPUID_7_0_ECX_AVX512_VPOPCNTDQ | CPUID_7_0_ECX_LA57 |
+CPUID_7_0_ECX_RDPID | CPUID_7_0_ECX_BUS_LOCK_DETECT,
+.features[FEAT_7_0_EDX] =
+CPUID_7_0_EDX_FSRM | CPUID_7_0_EDX_SERIALIZE |
+CPUID_7_0_EDX_TSX_LDTRK | CPUID_7_0_EDX_AMX_BF16 |
+CPUID_7_0_EDX_AVX512_FP16 | CPUID_7_0_EDX_AMX_TILE |
+CPUID_7_0_EDX_AMX_INT8 | CPUID_7_0_EDX_SPEC_CTRL |
+CPUID_7_0_EDX_ARCH_CAPABILITIES | CPUID_7_0_EDX_SPEC_CTRL_SSBD,
+.features[FEAT_ARCH_CAPABILITIES] =
+MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_IBRS_ALL |
+MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY | MSR_ARCH_CAP_MDS_NO |
+MSR_ARCH_CAP_PSCHANGE_MC_NO | MSR_ARCH_CAP_TAA_NO,
+.features[