[PATCH QEMU v7 7/9] migration: Implement dirty-limit convergence algo

2023-07-04 Thread ~hyman
From: Hyman Huang(黄勇) 

Implement dirty-limit convergence algo for live migration,
which is kind of like auto-converge algo but using dirty-limit
instead of cpu throttle to make migration convergent.

Enable the dirty page limit if dirty_rate_high_cnt is greater than 2
when the dirty-limit capability is enabled; disable dirty-limit if
migration is cancelled.

Note that "set_vcpu_dirty_limit", "cancel_vcpu_dirty_limit"
commands are not allowed during dirty-limit live migration.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Markus Armbruster 
Reviewed-by: Juan Quintela 
---
 migration/migration.c  |  3 +++
 migration/ram.c| 36 
 migration/trace-events |  1 +
 softmmu/dirtylimit.c   | 29 +
 4 files changed, 69 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index 096e8191d1..a3791900fd 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -166,6 +166,9 @@ void migration_cancel(const Error *error)
 if (error) {
 migrate_set_error(current_migration, error);
 }
+if (migrate_dirty_limit()) {
+qmp_cancel_vcpu_dirty_limit(false, -1, NULL);
+}
 migrate_fd_cancel(current_migration);
 }
 
diff --git a/migration/ram.c b/migration/ram.c
index b6559f9312..8a86363216 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -46,6 +46,7 @@
 #include "qapi/error.h"
 #include "qapi/qapi-types-migration.h"
 #include "qapi/qapi-events-migration.h"
+#include "qapi/qapi-commands-migration.h"
 #include "qapi/qmp/qerror.h"
 #include "trace.h"
 #include "exec/ram_addr.h"
@@ -59,6 +60,8 @@
 #include "multifd.h"
 #include "sysemu/runstate.h"
 #include "options.h"
+#include "sysemu/dirtylimit.h"
+#include "sysemu/kvm.h"
 
 #include "hw/boards.h" /* for machine_dump_guest_core() */
 
@@ -984,6 +987,37 @@ static void migration_update_rates(RAMState *rs, int64_t 
end_time)
 }
 }
 
+/*
+ * Enable dirty-limit to throttle down the guest
+ */
+static void migration_dirty_limit_guest(void)
+{
+/*
+ * dirty page rate quota for all vCPUs fetched from
+ * migration parameter 'vcpu_dirty_limit'
+ */
+static int64_t quota_dirtyrate;
+MigrationState *s = migrate_get_current();
+
+/*
+ * If dirty limit already enabled and migration parameter
+ * vcpu-dirty-limit untouched.
+ */
+if (dirtylimit_in_service() &&
+quota_dirtyrate == s->parameters.vcpu_dirty_limit) {
+return;
+}
+
+quota_dirtyrate = s->parameters.vcpu_dirty_limit;
+
+/*
+ * Set all vCPU a quota dirtyrate, note that the second
+ * parameter will be ignored if setting all vCPU for the vm
+ */
+qmp_set_vcpu_dirty_limit(false, -1, quota_dirtyrate, NULL);
+trace_migration_dirty_limit_guest(quota_dirtyrate);
+}
+
 static void migration_trigger_throttle(RAMState *rs)
 {
 uint64_t threshold = migrate_throttle_trigger_threshold();
@@ -1013,6 +1047,8 @@ static void migration_trigger_throttle(RAMState *rs)
 trace_migration_throttle();
 mig_throttle_guest_down(bytes_dirty_period,
 bytes_dirty_threshold);
+} else if (migrate_dirty_limit()) {
+migration_dirty_limit_guest();
 }
 }
 }
diff --git a/migration/trace-events b/migration/trace-events
index 5259c1044b..580895e86e 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -93,6 +93,7 @@ migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64
 migration_bitmap_clear_dirty(char *str, uint64_t start, uint64_t size, 
unsigned long page) "rb %s start 0x%"PRIx64" size 0x%"PRIx64" page 0x%lx"
 migration_throttle(void) ""
+migration_dirty_limit_guest(int64_t dirtyrate) "guest dirty page rate limit %" 
PRIi64 " MB/s"
 ram_discard_range(const char *rbname, uint64_t start, size_t len) "%s: start: 
%" PRIx64 " %zx"
 ram_load_loop(const char *rbname, uint64_t addr, int flags, void *host) "%s: 
addr: 0x%" PRIx64 " flags: 0x%x host: %p"
 ram_load_postcopy_loop(int channel, uint64_t addr, int flags) "chan=%d 
addr=0x%" PRIx64 " flags=0x%x"
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index 953ef934bc..5134296667 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -436,6 +436,23 @@ static void dirtylimit_cleanup(void)
 dirtylimit_state_finalize();
 }
 
+/*
+ * dirty page rate limit is not allowed to set if migration
+ * is running with dirty-limit capability enabled.
+ */
+static bool dirtylimit_is_allowed(void)
+{
+MigrationState *ms = migrate_get_current();
+
+if (migration_is_running(ms->state) &&
+(!qemu_thread_is_self(&ms->thread)) &&
+migrate_dirty_limit() &&
+dirtylimit_in_service()) {
+return false;
+}
+return true;
+}
+
 void qmp_cancel_vcpu_dirty_limit(bool has_cpu_index,
  int64_t cpu_index,
  Error **errp)
@@ -449,6 +466,12 @@ void 

[PATCH QEMU v7 1/9] softmmu/dirtylimit: Add parameter check for hmp "set_vcpu_dirty_limit"

2023-07-04 Thread ~hyman
From: Hyman Huang(黄勇) 

The dirty_rate parameter of the hmp command "set_vcpu_dirty_limit" is
invalid if less than 0, so add a parameter check for it.

Note that this patch also deletes the unsolicited help message and
cleans up the code.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Markus Armbruster 
Reviewed-by: Peter Xu 
Reviewed-by: Juan Quintela 
---
 softmmu/dirtylimit.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index 015a9038d1..5c12d26d49 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -515,14 +515,15 @@ void hmp_set_vcpu_dirty_limit(Monitor *mon, const QDict 
*qdict)
 int64_t cpu_index = qdict_get_try_int(qdict, "cpu_index", -1);
 Error *err = NULL;
 
-qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err);
-if (err) {
-hmp_handle_error(mon, err);
-return;
+if (dirty_rate < 0) {
+error_setg(&err, "invalid dirty page limit %ld", dirty_rate);
+goto out;
 }
 
-monitor_printf(mon, "[Please use 'info vcpu_dirty_limit' to query "
-   "dirty limit for virtual CPU]\n");
+qmp_set_vcpu_dirty_limit(!!(cpu_index != -1), cpu_index, dirty_rate, &err);
+
+out:
+hmp_handle_error(mon, err);
 }
 
 static struct DirtyLimitInfo *dirtylimit_query_vcpu(int cpu_index)
-- 
2.38.5




[PATCH QEMU v7 9/9] tests: Add migration dirty-limit capability test

2023-07-04 Thread ~hyman
From: Hyman Huang(黄勇) 

Add a migration dirty-limit capability test for when the kernel
supports the dirty ring.

Migration dirty-limit capability introduce dirty limit
capability, two parameters: x-vcpu-dirty-limit-period and
vcpu-dirty-limit are introduced to implement the live
migration with dirty limit.

The test case does the following things:
1. start src, dst vm and enable dirty-limit capability
2. start migration and then cancel it to check if dirty limit
   stops working.
3. restart dst vm
4. start migrate and enable dirty-limit capability
5. check if migration satisfy the convergence condition
   during pre-switchover phase.

Signed-off-by: Hyman Huang(黄勇) 
---
 tests/qtest/migration-test.c | 155 +++
 1 file changed, 155 insertions(+)

diff --git a/tests/qtest/migration-test.c b/tests/qtest/migration-test.c
index b9cc194100..f55f95c9b0 100644
--- a/tests/qtest/migration-test.c
+++ b/tests/qtest/migration-test.c
@@ -2636,6 +2636,159 @@ static void test_vcpu_dirty_limit(void)
 dirtylimit_stop_vm(vm);
 }
 
+static void migrate_dirty_limit_wait_showup(QTestState *from,
+const int64_t period,
+const int64_t value)
+{
+/* Enable dirty limit capability */
+migrate_set_capability(from, "dirty-limit", true);
+
+/* Set dirty limit parameters */
+migrate_set_parameter_int(from, "x-vcpu-dirty-limit-period", period);
+migrate_set_parameter_int(from, "vcpu-dirty-limit", value);
+
+/* Make sure migrate can't converge */
+migrate_ensure_non_converge(from);
+
+/* To check limit rate after precopy */
+migrate_set_capability(from, "pause-before-switchover", true);
+
+/* Wait for the serial output from the source */
+wait_for_serial("src_serial");
+}
+
+/*
+ * This test does:
+ *  source   target
+ *   migrate_incoming
+ * migrate
+ * migrate_cancel
+ *   restart target
+ * migrate
+ *
+ *  And see that if dirty limit works correctly
+ */
+static void test_migrate_dirty_limit(void)
+{
+g_autofree char *uri = g_strdup_printf("unix:%s/migsocket", tmpfs);
+QTestState *from, *to;
+int64_t remaining;
+uint64_t throttle_us_per_full;
+/*
+ * We want the test to be stable and as fast as possible.
+ * E.g., with 1Gb/s bandwidth migration may pass without dirty limit,
+ * so we need to decrease a bandwidth.
+ */
+const int64_t dirtylimit_period = 1000, dirtylimit_value = 50;
+const int64_t max_bandwidth = 4; /* ~400Mb/s */
+const int64_t downtime_limit = 250; /* 250ms */
+/*
+ * We migrate through unix-socket (> 500Mb/s).
+ * Thus, expected migration speed ~= bandwidth limit (< 500Mb/s).
+ * So, we can predict expected_threshold
+ */
+const int64_t expected_threshold = max_bandwidth * downtime_limit / 1000;
+int max_try_count = 10;
+MigrateCommon args = {
+.start = {
+.hide_stderr = true,
+.use_dirty_ring = true,
+},
+.listen_uri = uri,
+.connect_uri = uri,
+};
+
+/* Start src, dst vm */
+if (test_migrate_start(&from, &to, args.listen_uri, &args)) {
+return;
+}
+
+/* Prepare for dirty limit migration and wait src vm show up */
+migrate_dirty_limit_wait_showup(from, dirtylimit_period, dirtylimit_value);
+
+/* Start migrate */
+migrate_qmp(from, uri, "{}");
+
+/* Wait for dirty limit throttle begin */
+throttle_us_per_full = 0;
+while (throttle_us_per_full == 0) {
+throttle_us_per_full =
+read_migrate_property_int(from, "dirty-limit-throttle-time-per-round");
+usleep(100);
+g_assert_false(got_src_stop);
+}
+
+/* Now cancel migrate and wait for dirty limit throttle switch off */
+migrate_cancel(from);
+wait_for_migration_status(from, "cancelled", NULL);
+
+/* Check if dirty limit throttle switched off, set timeout 1ms */
+do {
+throttle_us_per_full =
+read_migrate_property_int(from, "dirty-limit-throttle-time-per-round");
+usleep(100);
+g_assert_false(got_src_stop);
+} while (throttle_us_per_full != 0 && --max_try_count);
+
+/* Assert dirty limit is not in service */
+g_assert_cmpint(throttle_us_per_full, ==, 0);
+
+args = (MigrateCommon) {
+.start = {
+.only_target = true,
+.use_dirty_ring = true,
+},
+.listen_uri = uri,
+.connect_uri = uri,
+};
+
+/* Restart dst vm, src vm already show up so we needn't wait anymore */
+if (test_migrate_start(&from, &to, args.listen_uri, &args)) {
+return;
+}
+
+/* Start migrate */
+migrate_qmp(from, uri, "{}");
+
+/* Wait for dirty limit throttle begin */
+throttle_us_per_full = 0;
+while (throttle_us_per_full == 0) {
+throttle_us_per_full =
+read_migrate_property_int(from, 

[PATCH QEMU v7 0/9] migration: introduce dirtylimit capability

2023-07-04 Thread ~hyman
Hi, Juan, this version may be the last version.
I rebased on master and fixed the conflicts with
the switchover-ack capability. Please feel free
to take this version to make a PR, or the
previous version if you have already fixed the conflicts.

v7:
1. Rebase on master and fix conflicts

v6:
1. Rebase on master
2. Split the commit "Implement dirty-limit convergence algo" into two as
Juan suggested as the following:
a. Put the detection logic before auto-converge checking
b. Implement dirty-limit convergence algo
3. Put the detection logic before auto-converge checking
4. Sort the migrate_dirty_limit function in commit
"Introduce dirty-limit capability" suggested by Juan
5. Substitute the the int64_t to uint64_t in the last 2 commits
6. Fix the comments spell mistake
7. Add helper function in the commit
"Implement dirty-limit convergence algo" suggested by Juan

v5:
1. Rebase on master and enrich the comment for "dirty-limit" capability,
suggesting by Markus.
2. Drop commits that have already been merged.

v4:
1. Polish the docs and update the release version suggested by Markus
2. Rename the migrate exported info "dirty-limit-throttle-time-per-
round"
   to "dirty-limit-throttle-time-per-full".

v3(resend):
- fix the syntax error of the topic.

v3:
This version make some modifications inspired by Peter and Markus
as following:
1. Do the code clean up in [PATCH v2 02/11] suggested by Markus
2. Replace the [PATCH v2 03/11] with a much simpler patch posted by
   Peter to fix the following bug:
   https://bugzilla.redhat.com/show_bug.cgi?id=2124756
3. Fix the error path of migrate_params_check in [PATCH v2 04/11]
   pointed out by Markus. Enrich the commit message to explain why
   x-vcpu-dirty-limit-period an unstable parameter.
4. Refactor the dirty-limit convergence algo in [PATCH v2 07/11]
   suggested by Peter:
   a. apply blk_mig_bulk_active check before enable dirty-limit
   b. drop the unhelpful check function before enable dirty-limit
   c. change the migration_cancel logic, just cancel dirty-limit
  only if dirty-limit capability turned on.
   d. abstract a code clean commit [PATCH v3 07/10] to adjust
  the check order before enable auto-converge
5. Change the name of observing indexes during dirty-limit live
   migration to make them more easy-understanding. Use the
   maximum throttle time of vpus as "dirty-limit-throttle-time-per-full"
6. Fix some grammatical and spelling errors pointed out by Markus
   and enrich the document about the dirty-limit live migration
   observing indexes "dirty-limit-ring-full-time"
   and "dirty-limit-throttle-time-per-full"
7. Change the default value of x-vcpu-dirty-limit-period to 1000ms,
   which is optimal value pointed out in cover letter in that
   testing environment.
8. Drop the 2 guestperf test commits [PATCH v2 10/11],
   [PATCH v2 11/11] and post them with a standalone series in the
   future.

v2:
This version make a little bit modifications comparing with
version 1 as following:
1. fix the overflow issue reported by Peter Maydell
2. add parameter check for hmp "set_vcpu_dirty_limit" command
3. fix the racing issue between dirty ring reaper thread and
   Qemu main thread.
4. add migrate parameter check for x-vcpu-dirty-limit-period
   and vcpu-dirty-limit.
5. add the logic to forbid hmp/qmp commands set_vcpu_dirty_limit,
   cancel_vcpu_dirty_limit during dirty-limit live migration when
   implement dirty-limit convergence algo.
6. add capability check to ensure auto-converge and dirty-limit
   are mutually exclusive.
7. pre-check if kvm dirty ring size is configured before setting
   dirty-limit migrate parameter

Hyman Huang(黄勇) (9):
  softmmu/dirtylimit: Add parameter check for hmp "set_vcpu_dirty_limit"
  qapi/migration: Introduce x-vcpu-dirty-limit-period parameter
  qapi/migration: Introduce vcpu-dirty-limit parameters
  migration: Introduce dirty-limit capability
  migration: Refactor auto-converge capability logic
  migration: Put the detection logic before auto-converge checking
  migration: Implement dirty-limit convergence algo
  migration: Extend query-migrate to provide dirty page limit info
  tests: Add migration dirty-limit capability test

 include/sysemu/dirtylimit.h|   2 +
 migration/migration-hmp-cmds.c |  26 ++
 migration/migration.c  |  13 +++
 migration/options.c|  73 
 migration/options.h|   1 +
 migration/ram.c|  61 ++---
 migration/trace-events |   1 +
 qapi/migration.json|  75 ++--
 softmmu/dirtylimit.c   |  91 +--
 tests/qtest/migration-test.c   | 155 +
 10 files changed, 473 insertions(+), 25 deletions(-)

-- 
2.38.5



[PATCH QEMU v7 3/9] qapi/migration: Introduce vcpu-dirty-limit parameters

2023-07-04 Thread ~hyman
From: Hyman Huang(黄勇) 

Introduce "vcpu-dirty-limit" migration parameter used
to limit dirty page rate during live migration.

"vcpu-dirty-limit" and "x-vcpu-dirty-limit-period" are
two dirty-limit-related migration parameters, which can
be set before and during live migration by qmp
migrate-set-parameters.

These two parameters are used to help implement the dirty
page rate limit algo of migration.

Signed-off-by: Hyman Huang(黄勇) 
Acked-by: Peter Xu 
Reviewed-by: Juan Quintela 
---
 migration/migration-hmp-cmds.c |  8 
 migration/options.c| 21 +
 qapi/migration.json| 18 +++---
 3 files changed, 44 insertions(+), 3 deletions(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 352e9ec716..35e8020bbf 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -368,6 +368,10 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict 
*qdict)
 monitor_printf(mon, "%s: %" PRIu64 " ms\n",
 MigrationParameter_str(MIGRATION_PARAMETER_X_VCPU_DIRTY_LIMIT_PERIOD),
 params->x_vcpu_dirty_limit_period);
+
+monitor_printf(mon, "%s: %" PRIu64 " MB/s\n",
+MigrationParameter_str(MIGRATION_PARAMETER_VCPU_DIRTY_LIMIT),
+params->vcpu_dirty_limit);
 }
 
 qapi_free_MigrationParameters(params);
@@ -628,6 +632,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict 
*qdict)
 p->has_x_vcpu_dirty_limit_period = true;
+visit_type_size(v, param, &p->x_vcpu_dirty_limit_period, &err);
 break;
+case MIGRATION_PARAMETER_VCPU_DIRTY_LIMIT:
+p->has_vcpu_dirty_limit = true;
+visit_type_size(v, param, &p->vcpu_dirty_limit, &err);
+break;
 default:
 assert(0);
 }
diff --git a/migration/options.c b/migration/options.c
index 1de63ba775..7d2d98830e 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -81,6 +81,7 @@
 DEFINE_PROP_BOOL(name, MigrationState, capabilities[x], false)
 
 #define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD 1000/* milliseconds */
+#define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT1   /* MB/s */
 
 Property migration_properties[] = {
 DEFINE_PROP_BOOL("store-global-state", MigrationState,
@@ -168,6 +169,9 @@ Property migration_properties[] = {
 DEFINE_PROP_UINT64("x-vcpu-dirty-limit-period", MigrationState,
parameters.x_vcpu_dirty_limit_period,
DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD),
+DEFINE_PROP_UINT64("vcpu-dirty-limit", MigrationState,
+   parameters.vcpu_dirty_limit,
+   DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT),
 
 /* Migration capabilities */
 DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
@@ -915,6 +919,8 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
 
 params->has_x_vcpu_dirty_limit_period = true;
 params->x_vcpu_dirty_limit_period = 
s->parameters.x_vcpu_dirty_limit_period;
+params->has_vcpu_dirty_limit = true;
+params->vcpu_dirty_limit = s->parameters.vcpu_dirty_limit;
 
 return params;
 }
@@ -949,6 +955,7 @@ void migrate_params_init(MigrationParameters *params)
 params->has_announce_rounds = true;
 params->has_announce_step = true;
 params->has_x_vcpu_dirty_limit_period = true;
+params->has_vcpu_dirty_limit = true;
 }
 
 /*
@@ -1118,6 +1125,14 @@ bool migrate_params_check(MigrationParameters *params, 
Error **errp)
 return false;
 }
 
+if (params->has_vcpu_dirty_limit &&
+(params->vcpu_dirty_limit < 1)) {
+error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+   "vcpu_dirty_limit",
+   "is invalid, it must greater then 1 MB/s");
+return false;
+}
+
 return true;
 }
 
@@ -1222,6 +1237,9 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 dest->x_vcpu_dirty_limit_period =
 params->x_vcpu_dirty_limit_period;
 }
+if (params->has_vcpu_dirty_limit) {
+dest->vcpu_dirty_limit = params->vcpu_dirty_limit;
+}
 }
 
 static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
@@ -1345,6 +1363,9 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 s->parameters.x_vcpu_dirty_limit_period =
 params->x_vcpu_dirty_limit_period;
 }
+if (params->has_vcpu_dirty_limit) {
+s->parameters.vcpu_dirty_limit = params->vcpu_dirty_limit;
+}
 }
 
 void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
diff --git a/qapi/migration.json b/qapi/migration.json
index 384b768e03..aa590dbf0e 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -793,6 +793,9 @@
 # live migration. Should be in the range 1 to 
1000ms,
 # defaults to 1000ms. (Since 8.1)
 #
+# @vcpu-dirty-limit: Dirtyrate limit (MB/s) during live 

[PATCH QEMU v7 4/9] migration: Introduce dirty-limit capability

2023-07-04 Thread ~hyman
From: Hyman Huang(黄勇) 

Introduce migration dirty-limit capability, which can
be turned on before live migration and limit the dirty
page rate during live migration.

Introduce migrate_dirty_limit function to help check
if dirty-limit capability enabled during live migration.

Meanwhile, refactor vcpu_dirty_rate_stat_collect
so that period can be configured instead of hardcoded.

dirty-limit capability is kind of like auto-converge
but using dirty limit instead of traditional cpu-throttle
to throttle guest down. To enable this feature, turn on
the dirty-limit capability before live migration using
migrate-set-capabilities, and set the parameters
"x-vcpu-dirty-limit-period", "vcpu-dirty-limit" suitably
to speed up convergence.

Signed-off-by: Hyman Huang(黄勇) 
Acked-by: Peter Xu 
Reviewed-by: Juan Quintela 
---
 migration/options.c  | 24 
 migration/options.h  |  1 +
 qapi/migration.json  | 13 -
 softmmu/dirtylimit.c | 12 +++-
 4 files changed, 48 insertions(+), 2 deletions(-)

diff --git a/migration/options.c b/migration/options.c
index 7d2d98830e..8b4eb8c519 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -27,6 +27,7 @@
 #include "qemu-file.h"
 #include "ram.h"
 #include "options.h"
+#include "sysemu/kvm.h"
 
 /* Maximum migrate downtime set to 2000 seconds */
 #define MAX_MIGRATE_DOWNTIME_SECONDS 2000
@@ -196,6 +197,8 @@ Property migration_properties[] = {
 #endif
 DEFINE_PROP_MIG_CAP("x-switchover-ack",
 MIGRATION_CAPABILITY_SWITCHOVER_ACK),
+DEFINE_PROP_MIG_CAP("x-dirty-limit",
+MIGRATION_CAPABILITY_DIRTY_LIMIT),
 
 DEFINE_PROP_END_OF_LIST(),
 };
@@ -242,6 +245,13 @@ bool migrate_dirty_bitmaps(void)
 return s->capabilities[MIGRATION_CAPABILITY_DIRTY_BITMAPS];
 }
 
+bool migrate_dirty_limit(void)
+{
+MigrationState *s = migrate_get_current();
+
+return s->capabilities[MIGRATION_CAPABILITY_DIRTY_LIMIT];
+}
+
 bool migrate_events(void)
 {
 MigrationState *s = migrate_get_current();
@@ -573,6 +583,20 @@ bool migrate_caps_check(bool *old_caps, bool *new_caps, 
Error **errp)
 }
 }
 
+if (new_caps[MIGRATION_CAPABILITY_DIRTY_LIMIT]) {
+if (new_caps[MIGRATION_CAPABILITY_AUTO_CONVERGE]) {
+error_setg(errp, "dirty-limit conflicts with auto-converge"
+   " either of then available currently");
+return false;
+}
+
+if (!kvm_enabled() || !kvm_dirty_ring_enabled()) {
+error_setg(errp, "dirty-limit requires KVM with accelerator"
+   " property 'dirty-ring-size' set");
+return false;
+}
+}
+
 return true;
 }
 
diff --git a/migration/options.h b/migration/options.h
index 9aaf363322..b5a950d4e4 100644
--- a/migration/options.h
+++ b/migration/options.h
@@ -24,6 +24,7 @@ extern Property migration_properties[];
 /* capabilities */
 
 bool migrate_auto_converge(void);
+bool migrate_dirty_limit(void);
 bool migrate_background_snapshot(void);
 bool migrate_block(void);
 bool migrate_colo(void);
diff --git a/qapi/migration.json b/qapi/migration.json
index aa590dbf0e..cc51835cdd 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -497,6 +497,16 @@
 # are present.  'return-path' capability must be enabled to use
 # it.  (since 8.1)
 #
+# @dirty-limit: If enabled, migration will use the dirty-limit algo to
+#   throttle down guest instead of auto-converge algo.
+#   Throttle algo only works when vCPU's dirtyrate greater
+#   than 'vcpu-dirty-limit', read processes in guest os
+#   aren't penalized any more, so this algo can improve
+#   performance of vCPU during live migration. This is an
+#   optional performance feature and should not affect the
+#   correctness of the existing auto-converge algo.
+#   (since 8.1)
+#
 # Features:
 #
 # @unstable: Members @x-colo and @x-ignore-shared are experimental.
@@ -512,7 +522,8 @@
'dirty-bitmaps', 'postcopy-blocktime', 'late-block-activate',
{ 'name': 'x-ignore-shared', 'features': [ 'unstable' ] },
'validate-uuid', 'background-snapshot',
-   'zero-copy-send', 'postcopy-preempt', 'switchover-ack'] }
+   'zero-copy-send', 'postcopy-preempt', 'switchover-ack',
+   'dirty-limit'] }
 
 ##
 # @MigrationCapabilityStatus:
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index 5c12d26d49..953ef934bc 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -24,6 +24,9 @@
 #include "hw/boards.h"
 #include "sysemu/kvm.h"
 #include "trace.h"
+#include "migration/misc.h"
+#include "migration/migration.h"
+#include "migration/options.h"
 
 /*
  * Dirtylimit stop working if dirty page rate error
@@ -75,11 +78,18 @@ static bool dirtylimit_quit;
 
 static void vcpu_dirty_rate_stat_collect(void)
 {
+MigrationState *s = migrate_get_current();
 

[PATCH QEMU v7 5/9] migration: Refactor auto-converge capability logic

2023-07-04 Thread ~hyman
From: Hyman Huang(黄勇) 

Check if block migration is running before throttling
guest down in auto-converge way.

Note that this modification is kind of like code clean,
because block migration does not depend on auto-converge
capability, so the order of checks can be adjusted.

Signed-off-by: Hyman Huang(黄勇) 
Acked-by: Peter Xu 
Reviewed-by: Juan Quintela 
---
 migration/ram.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 5283a75f02..78746849b5 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -995,7 +995,11 @@ static void migration_trigger_throttle(RAMState *rs)
 /* During block migration the auto-converge logic incorrectly detects
  * that ram migration makes no progress. Avoid this by disabling the
  * throttling logic during the bulk phase of block migration. */
-if (migrate_auto_converge() && !blk_mig_bulk_active()) {
+if (blk_mig_bulk_active()) {
+return;
+}
+
+if (migrate_auto_converge()) {
 /* The following detection logic can be refined later. For now:
Check to see if the ratio between dirtied bytes and the approx.
amount of bytes that just got transferred since the last time
-- 
2.38.5




[PATCH QEMU v7 8/9] migration: Extend query-migrate to provide dirty page limit info

2023-07-04 Thread ~hyman
From: Hyman Huang(黄勇) 

Extend query-migrate to provide throttle time and estimated
ring full time with dirty-limit capability enabled, through which
we can observe if dirty limit take effect during live migration.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Markus Armbruster 
Reviewed-by: Juan Quintela 
---
 include/sysemu/dirtylimit.h|  2 ++
 migration/migration-hmp-cmds.c | 10 +
 migration/migration.c  | 10 +
 qapi/migration.json| 16 +-
 softmmu/dirtylimit.c   | 39 ++
 5 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/include/sysemu/dirtylimit.h b/include/sysemu/dirtylimit.h
index 8d2c1f3a6b..d11edb 100644
--- a/include/sysemu/dirtylimit.h
+++ b/include/sysemu/dirtylimit.h
@@ -34,4 +34,6 @@ void dirtylimit_set_vcpu(int cpu_index,
 void dirtylimit_set_all(uint64_t quota,
 bool enable);
 void dirtylimit_vcpu_execute(CPUState *cpu);
+uint64_t dirtylimit_throttle_time_per_round(void);
+uint64_t dirtylimit_ring_full_time(void);
 #endif
diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 35e8020bbf..c115ef2d23 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -190,6 +190,16 @@ void hmp_info_migrate(Monitor *mon, const QDict *qdict)
info->cpu_throttle_percentage);
 }
 
+if (info->has_dirty_limit_throttle_time_per_round) {
+monitor_printf(mon, "dirty-limit throttle time: %" PRIu64 " us\n",
+   info->dirty_limit_throttle_time_per_round);
+}
+
+if (info->has_dirty_limit_ring_full_time) {
+monitor_printf(mon, "dirty-limit ring full time: %" PRIu64 " us\n",
+   info->dirty_limit_ring_full_time);
+}
+
 if (info->has_postcopy_blocktime) {
 monitor_printf(mon, "postcopy blocktime: %u\n",
info->postcopy_blocktime);
diff --git a/migration/migration.c b/migration/migration.c
index a3791900fd..a4dcaa3c91 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -64,6 +64,7 @@
 #include "yank_functions.h"
 #include "sysemu/qtest.h"
 #include "options.h"
+#include "sysemu/dirtylimit.h"
 
 static NotifierList migration_state_notifiers =
 NOTIFIER_LIST_INITIALIZER(migration_state_notifiers);
@@ -974,6 +975,15 @@ static void populate_ram_info(MigrationInfo *info, 
MigrationState *s)
 info->ram->dirty_pages_rate =
stat64_get(&mig_stats.dirty_pages_rate);
 }
+
+if (migrate_dirty_limit() && dirtylimit_in_service()) {
+info->has_dirty_limit_throttle_time_per_round = true;
+info->dirty_limit_throttle_time_per_round =
+dirtylimit_throttle_time_per_round();
+
+info->has_dirty_limit_ring_full_time = true;
+info->dirty_limit_ring_full_time = dirtylimit_ring_full_time();
+}
 }
 
 static void populate_disk_info(MigrationInfo *info)
diff --git a/qapi/migration.json b/qapi/migration.json
index cc51835cdd..ebc15e2782 100644
--- a/qapi/migration.json
+++ b/qapi/migration.json
@@ -250,6 +250,18 @@
 # blocked.  Present and non-empty when migration is blocked.
 # (since 6.0)
 #
+# @dirty-limit-throttle-time-per-round: Maximum throttle time (in 
microseconds) of virtual
+#   CPUs each dirty ring full round, which 
shows how
+#   MigrationCapability dirty-limit 
affects the guest
+#   during live migration. (since 8.1)
+#
+# @dirty-limit-ring-full-time: Estimated average dirty ring full time (in 
microseconds)
+#  each dirty ring full round, note that the value 
equals
+#  dirty ring memory size divided by average dirty 
page rate
+#  of virtual CPU, which can be used to observe 
the average
+#  memory load of virtual CPU indirectly. Note 
that zero
+#  means guest doesn't dirty memory (since 8.1)
+#
 # Since: 0.14
 ##
 { 'struct': 'MigrationInfo',
@@ -267,7 +279,9 @@
'*postcopy-blocktime' : 'uint32',
'*postcopy-vcpu-blocktime': ['uint32'],
'*compression': 'CompressionStats',
-   '*socket-address': ['SocketAddress'] } }
+   '*socket-address': ['SocketAddress'],
+   '*dirty-limit-throttle-time-per-round': 'uint64',
+   '*dirty-limit-ring-full-time': 'uint64'} }
 
 ##
 # @query-migrate:
diff --git a/softmmu/dirtylimit.c b/softmmu/dirtylimit.c
index 5134296667..a0686323e5 100644
--- a/softmmu/dirtylimit.c
+++ b/softmmu/dirtylimit.c
@@ -565,6 +565,45 @@ out:
 hmp_handle_error(mon, err);
 }
 
+/* Return the max throttle time of each virtual CPU */
+uint64_t dirtylimit_throttle_time_per_round(void)
+{
+CPUState *cpu;
+int64_t max = 0;
+
+CPU_FOREACH(cpu) {
+if 

[PATCH QEMU v7 6/9] migration: Put the detection logic before auto-converge checking

2023-07-04 Thread ~hyman
From: Hyman Huang(黄勇) 

This commit is prepared for the implementation of dirty-limit
convergence algo.

The detection logic of the throttling condition can apply to both
the auto-converge and dirty-limit algos, so put it before the
checking logic for the auto-converge feature.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Juan Quintela 
---
 migration/ram.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index 78746849b5..b6559f9312 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -999,17 +999,18 @@ static void migration_trigger_throttle(RAMState *rs)
 return;
 }
 
-if (migrate_auto_converge()) {
-/* The following detection logic can be refined later. For now:
-   Check to see if the ratio between dirtied bytes and the approx.
-   amount of bytes that just got transferred since the last time
-   we were in this routine reaches the threshold. If that happens
-   twice, start or increase throttling. */
-
-if ((bytes_dirty_period > bytes_dirty_threshold) &&
-(++rs->dirty_rate_high_cnt >= 2)) {
+/*
+ * The following detection logic can be refined later. For now:
+ * Check to see if the ratio between dirtied bytes and the approx.
+ * amount of bytes that just got transferred since the last time
+ * we were in this routine reaches the threshold. If that happens
+ * twice, start or increase throttling.
+ */
+if ((bytes_dirty_period > bytes_dirty_threshold) &&
+(++rs->dirty_rate_high_cnt >= 2)) {
+rs->dirty_rate_high_cnt = 0;
+if (migrate_auto_converge()) {
 trace_migration_throttle();
-rs->dirty_rate_high_cnt = 0;
 mig_throttle_guest_down(bytes_dirty_period,
 bytes_dirty_threshold);
 }
-- 
2.38.5




[PATCH QEMU v7 2/9] qapi/migration: Introduce x-vcpu-dirty-limit-period parameter

2023-07-04 Thread ~hyman
From: Hyman Huang(黄勇) 

Introduce "x-vcpu-dirty-limit-period" migration experimental
parameter, which is in the range of 1 to 1000ms and used to
make dirtyrate calculation period configurable.

Currently with the "x-vcpu-dirty-limit-period" varies, the
total time of live migration changes, test results show the
optimal value of "x-vcpu-dirty-limit-period" ranges from
500ms to 1000 ms. "x-vcpu-dirty-limit-period" should be made
stable once it proves best value can not be determined with
developer's experiments.

Signed-off-by: Hyman Huang(黄勇) 
Reviewed-by: Markus Armbruster 
Reviewed-by: Juan Quintela 
---
 migration/migration-hmp-cmds.c |  8 
 migration/options.c| 28 
 qapi/migration.json| 34 +++---
 3 files changed, 63 insertions(+), 7 deletions(-)

diff --git a/migration/migration-hmp-cmds.c b/migration/migration-hmp-cmds.c
index 9885d7c9f7..352e9ec716 100644
--- a/migration/migration-hmp-cmds.c
+++ b/migration/migration-hmp-cmds.c
@@ -364,6 +364,10 @@ void hmp_info_migrate_parameters(Monitor *mon, const QDict 
*qdict)
 }
 }
 }
+
+monitor_printf(mon, "%s: %" PRIu64 " ms\n",
+MigrationParameter_str(MIGRATION_PARAMETER_X_VCPU_DIRTY_LIMIT_PERIOD),
+params->x_vcpu_dirty_limit_period);
 }
 
 qapi_free_MigrationParameters(params);
@@ -620,6 +624,10 @@ void hmp_migrate_set_parameter(Monitor *mon, const QDict 
*qdict)
 error_setg(, "The block-bitmap-mapping parameter can only be set "
"through QMP");
 break;
+case MIGRATION_PARAMETER_X_VCPU_DIRTY_LIMIT_PERIOD:
+p->has_x_vcpu_dirty_limit_period = true;
+visit_type_size(v, param, >x_vcpu_dirty_limit_period, );
+break;
 default:
 assert(0);
 }
diff --git a/migration/options.c b/migration/options.c
index 5a9505adf7..1de63ba775 100644
--- a/migration/options.c
+++ b/migration/options.c
@@ -80,6 +80,8 @@
 #define DEFINE_PROP_MIG_CAP(name, x) \
 DEFINE_PROP_BOOL(name, MigrationState, capabilities[x], false)
 
+#define DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD 1000/* milliseconds */
+
 Property migration_properties[] = {
 DEFINE_PROP_BOOL("store-global-state", MigrationState,
  store_global_state, true),
@@ -163,6 +165,9 @@ Property migration_properties[] = {
 DEFINE_PROP_STRING("tls-creds", MigrationState, parameters.tls_creds),
 DEFINE_PROP_STRING("tls-hostname", MigrationState, 
parameters.tls_hostname),
 DEFINE_PROP_STRING("tls-authz", MigrationState, parameters.tls_authz),
+DEFINE_PROP_UINT64("x-vcpu-dirty-limit-period", MigrationState,
+   parameters.x_vcpu_dirty_limit_period,
+   DEFAULT_MIGRATE_VCPU_DIRTY_LIMIT_PERIOD),
 
 /* Migration capabilities */
 DEFINE_PROP_MIG_CAP("x-xbzrle", MIGRATION_CAPABILITY_XBZRLE),
@@ -908,6 +913,9 @@ MigrationParameters *qmp_query_migrate_parameters(Error 
**errp)
s->parameters.block_bitmap_mapping);
 }
 
+params->has_x_vcpu_dirty_limit_period = true;
+params->x_vcpu_dirty_limit_period = 
s->parameters.x_vcpu_dirty_limit_period;
+
 return params;
 }
 
@@ -940,6 +948,7 @@ void migrate_params_init(MigrationParameters *params)
 params->has_announce_max = true;
 params->has_announce_rounds = true;
 params->has_announce_step = true;
+params->has_x_vcpu_dirty_limit_period = true;
 }
 
 /*
@@ -1100,6 +1109,15 @@ bool migrate_params_check(MigrationParameters *params, 
Error **errp)
 }
 #endif
 
+if (params->has_x_vcpu_dirty_limit_period &&
+(params->x_vcpu_dirty_limit_period < 1 ||
+ params->x_vcpu_dirty_limit_period > 1000)) {
+error_setg(errp, QERR_INVALID_PARAMETER_VALUE,
+   "x-vcpu-dirty-limit-period",
+   "a value between 1 and 1000");
+return false;
+}
+
 return true;
 }
 
@@ -1199,6 +1217,11 @@ static void 
migrate_params_test_apply(MigrateSetParameters *params,
 dest->has_block_bitmap_mapping = true;
 dest->block_bitmap_mapping = params->block_bitmap_mapping;
 }
+
+if (params->has_x_vcpu_dirty_limit_period) {
+dest->x_vcpu_dirty_limit_period =
+params->x_vcpu_dirty_limit_period;
+}
 }
 
 static void migrate_params_apply(MigrateSetParameters *params, Error **errp)
@@ -1317,6 +1340,11 @@ static void migrate_params_apply(MigrateSetParameters 
*params, Error **errp)
 QAPI_CLONE(BitmapMigrationNodeAliasList,
params->block_bitmap_mapping);
 }
+
+if (params->has_x_vcpu_dirty_limit_period) {
+s->parameters.x_vcpu_dirty_limit_period =
+params->x_vcpu_dirty_limit_period;
+}
 }
 
 void qmp_migrate_set_parameters(MigrateSetParameters *params, Error **errp)
diff --git a/qapi/migration.json b/qapi/migration.json
index 47dfef0278..384b768e03 100644
--- 

Re: [PATCH v7 5/6] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-07-04 Thread Ani Sinha



> On 05-Jul-2023, at 7:09 AM, Akihiko Odaki  wrote:
> 
> 
> 
> On 2023/07/05 0:07, Ani Sinha wrote:
>>> On 04-Jul-2023, at 7:58 PM, Igor Mammedov  wrote:
>>> 
>>> On Tue, 4 Jul 2023 19:20:00 +0530
>>> Ani Sinha  wrote:
>>> 
> On 04-Jul-2023, at 6:18 PM, Igor Mammedov  wrote:
> 
> On Tue, 4 Jul 2023 21:02:09 +0900
> Akihiko Odaki  wrote:
> 
>> On 2023/07/04 20:59, Ani Sinha wrote:
>>> 
>>> 
 On 04-Jul-2023, at 5:24 PM, Akihiko Odaki  
 wrote:
 
 On 2023/07/04 20:25, Ani Sinha wrote:
> PCI Express ports only have one slot, so PCI Express devices can only 
> be
> plugged into slot 0 on a PCIE port. Add a warning to let users know 
> when the
> invalid configuration is used. We may enforce this more strongly 
> later on once
> we get more clarity on whether we are introducing a bad regression 
> for users
> currently using the wrong configuration.
> The change has been tested to not break or alter behaviors of ARI 
> capable
> devices by instantiating seven vfs on an emulated igb device (the 
> maximum
> number of vfs the linux igb driver supports). The vfs instantiated 
> correctly
> and are seen to have non-zero device/slot numbers in the conventional 
> PCI BDF
> representation.
> CC: jus...@redhat.com
> CC: imamm...@redhat.com
> CC: m...@redhat.com
> CC: akihiko.od...@daynix.com
> Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
> Signed-off-by: Ani Sinha 
> Reviewed-by: Julia Suvorova 
> ---
> hw/pci/pci.c | 15 +++
> 1 file changed, 15 insertions(+)
> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
> index e2eb4c3b4a..47517ba3db 100644
> --- a/hw/pci/pci.c
> +++ b/hw/pci/pci.c
> @@ -65,6 +65,7 @@ bool pci_available = true;
> static char *pcibus_get_dev_path(DeviceState *dev);
> static char *pcibus_get_fw_dev_path(DeviceState *dev);
> static void pcibus_reset(BusState *qbus);
> +static bool pcie_has_upstream_port(PCIDevice *dev);
>   static Property pci_props[] = {
> DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
> @@ -2121,6 +2122,20 @@ static void pci_qdev_realize(DeviceState 
> *qdev, Error **errp)
> }
> }
> +/*
> + * With SRIOV and ARI, vfs can have non-zero slot in the 
> conventional
> + * PCI interpretation as all five bits reserved for slot 
> addresses are
> + * also used for function bits for the various vfs. Ignore that 
> case.
 
 You don't have to mention SR/IOV; it affects all ARI-capable devices. 
 A PF can also have non-zero slot number in the conventional 
 interpretation so you shouldn't call it vf either.
>>> 
>>> Can you please help write a comment that explains this properly for all 
>>> cases - ARI/non-ARI, PFs and VFs? Once everyone agrees that its clear 
>>> and correct, I will re-spin.
>> 
>> Simply, you can say:
>> With ARI, the slot number field in the conventional PCI interpretation
>> can have a non-zero value as the field bits are reused to extend the
>> function number bits. Ignore that case.
> 
> mentioning 'conventional PCI interpretation' in comment and then 
> immediately
> checking 'pci_is_express(pci_dev)' is confusing. Since comment belongs
> only to PCIE branch it would be better to talk in only about PCIe stuff
> and referring to relevant portions of spec.
 
 Ok so how about this?
 
   * With ARI, devices can have non-zero slot in the traditional BDF
 * representation as all five bits reserved for slot addresses are
 * also used for function bits. Ignore that case.
>>> 
>>> you still refer to traditional (which I misread as 'conventional'),
>>> steal the linux comment and argument it with ARI if necessary,
>>> something like this (probably needs some more massaging):
>> The comment messaging in these patches seems to exceed the value of the 
>> patch itself :-)
>> How about this?
>> /*
>>  * A PCIe Downstream Port normally leads to a Link with only Device
>>  * 0 on it (PCIe spec r3.1, sec 7.3.1).
>>  * With ARI, PCI_SLOT() can return non-zero value as all five bits
>>  * reserved for slot addresses are also used for function bits.
>>  * Hence, ignore ARI capable devices.
>>  */
> 
> Perhaps: s/normally leads to/must lead to/
> 
> From the kernel perspective, they may need to deal with a quirky hardware 
> that does not conform with the specification, but from QEMU perspective, it 
> is what we *must* conform with.

PCI base spec 4.0, rev 3, section 7.3.1 says:

"  
Downstream Ports that do not have ARI 

Re: [PATCH] target/riscv: Add Zihintntl extension ISA string to DTS

2023-07-04 Thread Frank Chang
Reviewed-by: Frank Chang 

On Tue, Jul 4, 2023 at 4:41 PM Jason Chien  wrote:

> RVA23 Profiles states:
> The RVA23 profiles are intended to be used for 64-bit application
> processors that will run rich OS stacks from standard binary OS
> distributions and with a substantial number of third-party binary user
> applications that will be supported over a considerable length of time
> in the field.
>
> The chapter 4 of the unprivileged spec introduces the Zihintntl extension
> and Zihintntl is a mandatory extension presented in RVA23 Profiles, whose
> purpose is to enable application and operating system portability across
> different implementations. Thus the DTS should contain the Zihintntl ISA
> string in order to pass to software.
>
> The unprivileged spec states:
> Like any HINTs, these instructions may be freely ignored. Hence, although
> they are described in terms of cache-based memory hierarchies, they do not
> mandate the provision of caches.
>
> These instructions are encoded with used opcode, e.g. ADD x0, x0, x2, which
> QEMU already supports, and QEMU does not emulate cache. Therefore these
> instructions can be considered as a no-op, and we only need to add a new
> property for the Zihintntl extension.
>
> Signed-off-by: Jason Chien 
> ---
>  target/riscv/cpu.c | 2 ++
>  target/riscv/cpu_cfg.h | 1 +
>  2 files changed, 3 insertions(+)
>
> diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c
> index 881bddf393..6fd21466a4 100644
> --- a/target/riscv/cpu.c
> +++ b/target/riscv/cpu.c
> @@ -81,6 +81,7 @@ static const struct isa_ext_data isa_edata_arr[] = {
>  ISA_EXT_DATA_ENTRY(zicond, PRIV_VERSION_1_12_0, ext_zicond),
>  ISA_EXT_DATA_ENTRY(zicsr, PRIV_VERSION_1_10_0, ext_icsr),
>  ISA_EXT_DATA_ENTRY(zifencei, PRIV_VERSION_1_10_0, ext_ifencei),
> +ISA_EXT_DATA_ENTRY(zihintntl, PRIV_VERSION_1_10_0, ext_zihintntl),
>  ISA_EXT_DATA_ENTRY(zihintpause, PRIV_VERSION_1_10_0, ext_zihintpause),
>  ISA_EXT_DATA_ENTRY(zawrs, PRIV_VERSION_1_12_0, ext_zawrs),
>  ISA_EXT_DATA_ENTRY(zfh, PRIV_VERSION_1_11_0, ext_zfh),
> @@ -1598,6 +1599,7 @@ static Property riscv_cpu_extensions[] = {
>  DEFINE_PROP_BOOL("sscofpmf", RISCVCPU, cfg.ext_sscofpmf, false),
>  DEFINE_PROP_BOOL("Zifencei", RISCVCPU, cfg.ext_ifencei, true),
>  DEFINE_PROP_BOOL("Zicsr", RISCVCPU, cfg.ext_icsr, true),
> +DEFINE_PROP_BOOL("Zihintntl", RISCVCPU, cfg.ext_zihintntl, true),
>  DEFINE_PROP_BOOL("Zihintpause", RISCVCPU, cfg.ext_zihintpause, true),
>  DEFINE_PROP_BOOL("Zawrs", RISCVCPU, cfg.ext_zawrs, true),
>  DEFINE_PROP_BOOL("Zfh", RISCVCPU, cfg.ext_zfh, false),
> diff --git a/target/riscv/cpu_cfg.h b/target/riscv/cpu_cfg.h
> index c4a627d335..c7da2facef 100644
> --- a/target/riscv/cpu_cfg.h
> +++ b/target/riscv/cpu_cfg.h
> @@ -66,6 +66,7 @@ struct RISCVCPUConfig {
>  bool ext_icbom;
>  bool ext_icboz;
>  bool ext_zicond;
> +bool ext_zihintntl;
>  bool ext_zihintpause;
>  bool ext_smstateen;
>  bool ext_sstc;
> --
> 2.17.1
>
>
>


Re: intermittent clang sanitizer failure during 'make check-tcg': null pointer deref in IntervalTreeNode

2023-07-04 Thread Richard Henderson

On 7/4/23 18:20, Peter Maydell wrote:

If you build QEMU with the clang UB sanitizer and do a
'make check-tcg' run, it can fail like this:

   TESTvma-pthread-with-libinsn.so on aarch64
../../util/interval-tree.c:751:32: runtime error: member access within
null pointer of type 'IntervalTreeNode' (aka 'struct
IntervalTreeNode')
SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior
../../util/interval-tree.c:751:32 in

SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior
../../util/interval-tree.c:751:32 in
make[1]: *** [Makefile:181: run-plugin-vma-pthread-with-libinsn.so] Error 124
make: *** [/mnt/nvmedisk/linaro/qemu-from-laptop/qemu/tests/Makefile.include:56:
run-tcg-tests-aarch64-linux-user] Error 2

I only saw this once; when I re-ran the test passed...


Hmm.

The 751:32 reference is "left->subtree_last".

Three lines above we checked

if (node->rb.rb_left) {

but then read it again within the IF

IntervalTreeNode *left = rb_to_itree(node->rb.rb_left);

I suspect a normal optimizing compiler combines these two reads, but UBSAN does not and 
thus the check and the use race.


On the to-do list...


r~



Re: [PULL 00/11] target-arm queue

2023-07-04 Thread Richard Henderson

On 7/5/23 06:57, Richard Henderson wrote:

On 7/4/23 18:36, Peter Maydell wrote:

  docs/system/arm/sbsa.rst  |   5 +-
  hw/arm/sbsa-ref.c |  23 +++--
  hw/misc/allwinner-sramc.c |   1 +
  target/arm/cpu.c  |  65 -
  target/arm/gdbstub.c  |   4 +
  target/arm/helper.c   |  70 +++---
  target/arm/tcg/translate-sme.c    |  24 +++--
  target/xtensa/exc_helper.c    |   3 +
  tests/qtest/xlnx-canfd-test.c |  33 +++
  tests/tcg/aarch64/icivau.c    | 189 ++
  tests/tcg/aarch64/sme-outprod1.c  |  83 +
  hw/arm/Kconfig    |   2 +-
  tests/tcg/aarch64/Makefile.target |  13 ++-
  13 files changed, 436 insertions(+), 79 deletions(-)


There's one more failure:

https://gitlab.com/qemu-project/qemu/-/jobs/4592433432#L3723


/tmp/ccASXpLo.s: Assembler messages:
/tmp/ccASXpLo.s:782: Error: selected processor does not support system register name 
'id_aa64zfr0_el1'
/tmp/ccASXpLo.s:829: Error: selected processor does not support system register name 
'id_aa64smfr0_el1'

make[1]: *** [Makefile:119: sysregs] Error 1


I guess it's the change to Makefile.target, as I don't see any other likely 
candidates.


Ho hum, that's *my* patch 5, "Fix SME full tile indexing".
I'll have a closer look tomorrow.  Sorry about that.


r~




Re: [PULL 00/11] target-arm queue

2023-07-04 Thread Richard Henderson

On 7/4/23 18:36, Peter Maydell wrote:

  docs/system/arm/sbsa.rst  |   5 +-
  hw/arm/sbsa-ref.c |  23 +++--
  hw/misc/allwinner-sramc.c |   1 +
  target/arm/cpu.c  |  65 -
  target/arm/gdbstub.c  |   4 +
  target/arm/helper.c   |  70 +++---
  target/arm/tcg/translate-sme.c|  24 +++--
  target/xtensa/exc_helper.c|   3 +
  tests/qtest/xlnx-canfd-test.c |  33 +++
  tests/tcg/aarch64/icivau.c| 189 ++
  tests/tcg/aarch64/sme-outprod1.c  |  83 +
  hw/arm/Kconfig|   2 +-
  tests/tcg/aarch64/Makefile.target |  13 ++-
  13 files changed, 436 insertions(+), 79 deletions(-)


There's one more failure:

https://gitlab.com/qemu-project/qemu/-/jobs/4592433432#L3723


/tmp/ccASXpLo.s: Assembler messages:
/tmp/ccASXpLo.s:782: Error: selected processor does not support system register 
name 'id_aa64zfr0_el1'
/tmp/ccASXpLo.s:829: Error: selected processor does not support system register 
name 'id_aa64smfr0_el1'
make[1]: *** [Makefile:119: sysregs] Error 1


I guess it's the change to Makefile.target, as I don't see any other likely 
candidates.


r~



RE: [PATCH 2/2] virtio-iommu: Rework the trace in virtio_iommu_set_page_size_mask()

2023-07-04 Thread Duan, Zhenzhong



>-Original Message-
>From: Eric Auger 
>Sent: Tuesday, July 4, 2023 7:15 PM
>To: eric.auger@gmail.com; eric.au...@redhat.com; qemu-
>de...@nongnu.org; qemu-...@nongnu.org; m...@redhat.com; jean-
>phili...@linaro.org; Duan, Zhenzhong 
>Cc: alex.william...@redhat.com; c...@redhat.com;
>bharat.bhus...@nxp.com; peter.mayd...@linaro.org
>Subject: [PATCH 2/2] virtio-iommu: Rework the trace in
>virtio_iommu_set_page_size_mask()
>
>The current error messages in virtio_iommu_set_page_size_mask()
>sound quite similar for different situations and miss the IOMMU
>memory region that causes the issue.
>
>Clarify them and rework the comment.
>
>Also remove the trace when the new page_size_mask is not applied as
>the current frozen granule is kept. This message is rather confusing
>for the end user and anyway the current granule would have been used
>by the driver
>
>Signed-off-by: Eric Auger 
>---
> hw/virtio/virtio-iommu.c | 19 +++
> 1 file changed, 7 insertions(+), 12 deletions(-)
>
>diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
>index 1eaf81bab5..0d9f7196fe 100644
>--- a/hw/virtio/virtio-iommu.c
>+++ b/hw/virtio/virtio-iommu.c
>@@ -1101,29 +1101,24 @@ static int
>virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
>   new_mask);
>
> if ((cur_mask & new_mask) == 0) {
>-error_setg(errp, "virtio-iommu page mask 0x%"PRIx64
>-   " is incompatible with mask 0x%"PRIx64, cur_mask, 
>new_mask);
>+error_setg(errp, "virtio-iommu %s reports a page size mask 0x%"PRIx64
>+   " incompatible with currently supported mask 0x%"PRIx64,
>+   mr->parent_obj.name, new_mask, cur_mask);
> return -1;
> }
>
> /*
>  * Once the granule is frozen we can't change the mask anymore. If by
>  * chance the hotplugged device supports the same granule, we can still
>- * accept it. Having a different masks is possible but the guest will use
>- * sub-optimal block sizes, so warn about it.
>+ * accept it.
>  */
> if (s->granule_frozen) {
>-int new_granule = ctz64(new_mask);
> int cur_granule = ctz64(cur_mask);
>
>-if (new_granule != cur_granule) {
>-error_setg(errp, "virtio-iommu page mask 0x%"PRIx64
>-   " is incompatible with mask 0x%"PRIx64, cur_mask,
>-   new_mask);
>+if (!(BIT(cur_granule) & new_mask)) {

Good catch.

Reviewed-by: Zhenzhong Duan 

Thanks
Zhenzhong

>+error_setg(errp, "virtio-iommu %s does not support frozen granule
>0x%"PRIx64,
>+   mr->parent_obj.name, BIT(cur_granule));
> return -1;
>-} else if (new_mask != cur_mask) {
>-warn_report("virtio-iommu page mask 0x%"PRIx64
>-" does not match 0x%"PRIx64, cur_mask, new_mask);
> }
> return 0;
> }
>--
>2.38.1




Re: [PULL 07/11] tests/tcg/aarch64: Add testcases for IC IVAU and dual-mapped code

2023-07-04 Thread Richard Henderson

On 7/4/23 18:36, Peter Maydell wrote:

+int main(int argc, char **argv)
+{
+const char *shm_name = "qemu-test-tcg-aarch64-icivau";
+int fd;
+
+fd = shm_open(shm_name, O_CREAT | O_RDWR, S_IRUSR | S_IWUSR);


Build failures:

https://gitlab.com/qemu-project/qemu/-/jobs/4592433393#L3958
https://gitlab.com/qemu-project/qemu/-/jobs/4592433395#L4149
https://gitlab.com/qemu-project/qemu/-/jobs/4592433400#L3694


/usr/lib/gcc-cross/aarch64-linux-gnu/10/../../../../aarch64-linux-gnu/bin/ld: 
/usr/lib/gcc-cross/aarch64-linux-gnu/10/../../../../aarch64-linux-gnu/lib/../lib/librt.a(shm_open.o): 
in function `shm_open':

(.text+0x3c): undefined reference to `__shm_directory'
/usr/lib/gcc-cross/aarch64-linux-gnu/10/../../../../aarch64-linux-gnu/bin/ld: 
(.text+0xcc): undefined reference to `pthread_setcancelstate'
/usr/lib/gcc-cross/aarch64-linux-gnu/10/../../../../aarch64-linux-gnu/bin/ld: 
(.text+0xfc): undefined reference to `pthread_setcancelstate'
/usr/lib/gcc-cross/aarch64-linux-gnu/10/../../../../aarch64-linux-gnu/bin/ld: 
/usr/lib/gcc-cross/aarch64-linux-gnu/10/../../../../aarch64-linux-gnu/lib/../lib/librt.a(shm_unlink.o): 
in function `shm_unlink':

(.text+0x30): undefined reference to `__shm_directory'
collect2: error: ld returned 1 exit status
make[1]: *** [Makefile:119: icivau] Error 1
make[1]: *** Waiting for unfinished jobs
make: *** [/builds/qemu-project/qemu/tests/Makefile.include:50: 
build-tcg-tests-aarch64-linux-user] Error 2


It looks like this test needs something else.



r~



RE: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO device assignment

2023-07-04 Thread Duan, Zhenzhong
Hi Eric,

>-Original Message-
>From: Eric Auger 
>Sent: Tuesday, July 4, 2023 7:15 PM
>Subject: [PATCH 1/2] virtio-iommu: Fix 64kB host page size VFIO device
>assignment
>
>When running on a 64kB page size host and protecting a VFIO device
>with the virtio-iommu, qemu crashes with this kind of message:
>
>qemu-kvm: virtio-iommu page mask 0xf000 is incompatible
>with mask 0x2001

Does 0x2001 mean only  512MB and 64KB super page mapping is
supported for host iommu hw? 4KB mapping not supported?

There is a check in guest kernel side hint only 4KB is supported, with
this patch we force viommu->pgsize_bitmap to 0x2001
and fail below check? Does this device work in guest?
Please correct me if I understand wrong.

static int viommu_domain_finalise(struct viommu_endpoint *vdev,
  struct iommu_domain *domain)
{
...
viommu_page_size = 1UL << __ffs(viommu->pgsize_bitmap);
if (viommu_page_size > PAGE_SIZE) {
dev_err(vdev->dev,
"granule 0x%lx larger than system page size 0x%lx\n",
viommu_page_size, PAGE_SIZE);
return -ENODEV;
}

Another question is: Presume 0x2001 does mean only 512MB and 64KB
is supported. Is host hva->hpa mapping ensured to be compatible with at least
64KB mapping? If host mmu only support 4KB mapping or other non-compatible
with 0x2001, will vfio_dma_map() fail?

>qemu: hardware error: vfio: DMA mapping failed, unable to continue
>
>This is due to the fact the IOMMU MR corresponding to the VFIO device
>is enabled very late on domain attach, after the machine init.
>The device reports a minimal 64kB page size but it is too late to be
>applied. virtio_iommu_set_page_size_mask() fails and this causes
>vfio_listener_region_add() to end up with hw_error();
>
>To work around this issue, we transiently enable the IOMMU MR on
>machine init to collect the page size requirements and then restore
>the bypass state.
>
>Fixes: 90519b9053 ("virtio-iommu: Add bypass mode support to assigned
>device")
>Signed-off-by: Eric Auger 
>---
> include/hw/virtio/virtio-iommu.h |  2 ++
> hw/virtio/virtio-iommu.c | 30 --
> hw/virtio/trace-events   |  1 +
> 3 files changed, 31 insertions(+), 2 deletions(-)
>
>diff --git a/include/hw/virtio/virtio-iommu.h b/include/hw/virtio/virtio-
>iommu.h
>index 2ad5ee320b..a93fc5383e 100644
>--- a/include/hw/virtio/virtio-iommu.h
>+++ b/include/hw/virtio/virtio-iommu.h
>@@ -61,6 +61,8 @@ struct VirtIOIOMMU {
> QemuRecMutex mutex;
> GTree *endpoints;
> bool boot_bypass;
>+Notifier machine_done;
>+bool granule_frozen;
> };
>
> #endif
>diff --git a/hw/virtio/virtio-iommu.c b/hw/virtio/virtio-iommu.c
>index 1cd258135d..1eaf81bab5 100644
>--- a/hw/virtio/virtio-iommu.c
>+++ b/hw/virtio/virtio-iommu.c
>@@ -24,6 +24,7 @@
> #include "hw/virtio/virtio.h"
> #include "sysemu/kvm.h"
> #include "sysemu/reset.h"
>+#include "sysemu/sysemu.h"
> #include "qapi/error.h"
> #include "qemu/error-report.h"
> #include "trace.h"
>@@ -1106,12 +1107,12 @@ static int
>virtio_iommu_set_page_size_mask(IOMMUMemoryRegion *mr,
> }
>
> /*
>- * After the machine is finalized, we can't change the mask anymore. If by
>+ * Once the granule is frozen we can't change the mask anymore. If by
>  * chance the hotplugged device supports the same granule, we can still
>  * accept it. Having a different masks is possible but the guest will use
>  * sub-optimal block sizes, so warn about it.
>  */
>-if (phase_check(PHASE_MACHINE_READY)) {
>+if (s->granule_frozen) {
> int new_granule = ctz64(new_mask);
> int cur_granule = ctz64(cur_mask);
>
>@@ -1146,6 +1147,27 @@ static void virtio_iommu_system_reset(void
>*opaque)
>
> }
>
>+static void virtio_iommu_freeze_granule(Notifier *notifier, void *data)
>+{
>+VirtIOIOMMU *s = container_of(notifier, VirtIOIOMMU, machine_done);
>+bool boot_bypass = s->config.bypass;
>+int granule;
>+
>+/*
>+ * Transient IOMMU MR enable to collect page_size_mask requirement
>+ * through memory_region_iommu_set_page_size_mask() called by
>+ * VFIO region_add() callback
>+ */
>+s->config.bypass = 0;
>+virtio_iommu_switch_address_space_all(s);
>+/* restore default */
>+s->config.bypass = boot_bypass;
>+virtio_iommu_switch_address_space_all(s);
>+s->granule_frozen = true;
>+granule = ctz64(s->config.page_size_mask);
>+trace_virtio_iommu_freeze_granule(BIT(granule));
>+}

It looks a bit heavy here just in order to get page_size_mask from host side.
But maybe this is the only way with current interface.

Thanks
Zhenzhong

>+
> static void virtio_iommu_device_realize(DeviceState *dev, Error **errp)
> {
> VirtIODevice *vdev = VIRTIO_DEVICE(dev);
>@@ -1189,6 +1211,9 @@ static void virtio_iommu_device_realize(DeviceState
>*dev, Error **errp)
> 

Re: [PATCH v3] target/ppc: Make checkstop actually stop the system

2023-07-04 Thread Nicholas Piggin
On Tue Jul 4, 2023 at 1:06 AM AEST, BALATON Zoltan wrote:
> On Mon, 3 Jul 2023, Nicholas Piggin wrote:
> > On Mon Jul 3, 2023 at 10:26 PM AEST, BALATON Zoltan wrote:
> >> On Mon, 3 Jul 2023, Nicholas Piggin wrote:
> >>> checkstop state does not halt the system, interrupts continue to be
> >>> serviced, and other CPUs run. Stop the machine with
> >>> qemu_system_guest_panicked.
> >>>
> >>> Change the logging not to print separately to stderr because a
> >>> checkstop is a guest error (or perhaps a simulated machine error)
> >>> rather than a QEMU error. CPU registers are dumped.
> >>>
> >>> Signed-off-by: Nicholas Piggin 
> >>>
> >>> Since v1:
> >>> - Fix loop exit so it stops on the checkstop-causing instruction, rather 
> >>> than
> >>>  after it.
> >>>
> >>> Since v2:
> >>> - Rebase on ppc-next.
> >>
> >> Is this really based on ppc-next or on my series or another patch from
> >> you? I think the patch from my series that introduces the checksrop
> >> function that this patch is changing is not yet in ppc-next so this may
> >> not apply there.
> >
> > It is based on ppc-next unless I've done something silly. Is this
> > the patch you were talking about?
> >
> > https://gitlab.com/danielhb/qemu/-/commit/26d089ac20080066061ed61fb58a5411e275e191
> >
> >> I think you've posted an alternative to the patch moving
> >> checkstop handling to a function and the sc patch which may clash with the
> >> not yet merged parts in my series but i could not follow all these
> >> patches. I'm not sure Daniel could so maybe you could send it as a series
> >> to include all patches you want to add or state what it's based on.
> >
> > Things are getting a little confusing, but I think what Daniel has
> > is okay. Were were talking about changing checkstop with my patch,
> > but no big deal to take yours first, I've reworked things.
>
> Sorry my bad. Yes it's getting confusing to me and missed that Daniel 
> already merged that patch on ppc-next, I thought it was one of the patches 
> not yet merged. So this can go on top then.
>
> There was another change fixing sc that may clash; I'm not sure 
> if I need to do anything about it.

The sc patches still had the problem of changing the syscall dump to
the instruction after syscall IIRC. So that would need to be fixed.
But that does make me a bit less happy about them too... Anyway
resend with fix and rebase if you want  them.

Thanks,
Nick



Re: [PATCH v6 5/5] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-07-04 Thread Akihiko Odaki

On 2023/07/04 23:03, Ani Sinha wrote:




On 04-Jul-2023, at 5:20 PM, Akihiko Odaki  wrote:

On 2023/07/04 20:38, Igor Mammedov wrote:

On Sat, 1 Jul 2023 16:28:30 +0900
Akihiko Odaki  wrote:

On 2023/07/01 0:29, Michael S. Tsirkin wrote:

On Fri, Jun 30, 2023 at 08:36:38PM +0900, Akihiko Odaki wrote:

On 2023/06/30 19:37, Ani Sinha wrote:


  

On 30-Jun-2023, at 3:30 PM, Michael S. Tsirkin  wrote:

On Fri, Jun 30, 2023 at 02:52:52PM +0530, Ani Sinha wrote:


  

On 30-Jun-2023, at 2:13 PM, Michael S. Tsirkin  wrote:

On Fri, Jun 30, 2023 at 02:06:59PM +0530, Ani Sinha wrote:


  

On 30-Jun-2023, at 2:02 PM, Michael S. Tsirkin  wrote:

On Fri, Jun 30, 2023 at 01:11:33PM +0530, Ani Sinha wrote:


Thus the check for unoccupied function 0 needs to use pci_is_vf() instead of 
checking ARI capability, and that can happen in do_pci_register_device().
  

Also where do you propose we move the check?


In pci_qdev_realize(), somewhere after pc->realize() and before option ROM 
loading.


Hmm, I tried this. The issue here is something like this would be now allowed 
since the PF has ARI capability:

-device pcie-root-port,id=p -device igb,bus=p,addr=0x2.0x0

The above should not be allowed and when used, we do not see the igb ethernet 
device from the guest OS.


I think it's allowed because it expects you to hotplug function 0 later,


This is about the igb device being plugged into the non-zero slot of the 
pci-root-port. The guest OS ignores it.


yes but if you later add a device with ARI and with next field pointing
slot 2 guest will suddently find both.


Hmm, I tried this:

-device pcie-root-port,id=p \
-device igb,bus=p,addr=0x2.0x0 \
-device igb,bus=p,addr=0x0.0x0 \

The guest only found the second igb device not the first. You can try too.


Because next parameter in pcie_ari_init does not match.


OK send me a command line that I can test it with. I can’t come up with a case 
that actually works in practice.


I don't think there is one because the code for PCI multifunction does not
care ARI. In my opinion, we need yet another check to make non-SR-IOV
multifunction and ARI capability mutually exclusive; if a function has the
ARI capability and it is not a VF, an attempt to assign non-zero function
number for it should fail.

is it stated somewhere in spec(s) that ARI and !SR-IOV are mutually exclusive?


Why is that? My understanding is that ARI capable devices should also
set the multifunction bit in the header. It's not terribly clear from
the spec though.


Something like the following will not work properly with ARI-capable
device (think of a as an ARI-capable device):
-device a,addr=0x1.0x0,multifunction=on -device a,addr=0x1.0x1

(I had a crazy idea, to use it like that so we could put more devices
on port without resorting to adding extra bridges)
Can you elaborate some more why it won't work?


It won't work because the ARI next function number field is fixed. In this 
case, the field of the Function at 0x1.0x0 should point to 0x1.0x1, but it 
doesn’t.


Where does it point to in this case then? 0x1.0x2 becasue of the stride?


With "[PATCH v5 0/2] pcie: Fix ARI next function numbers"*, it will 
point to 0, which ends the linked list formed with the ARI next number 
fields and make it only contain Function 0.


* 
https://lore.kernel.org/qemu-devel/20230705022421.13115-1-akihiko.od...@daynix.com/





As the result, the Function at 0x1.0x1 won't be recognized.

It's more problematic if some of the Functions are ARI-capable but others are 
not. In my understanding, all Functions in a ARI-capable device need to have 
ARI capability, but that's not enforced.


This is because the next function numbers advertised with ARI are not
updated with the multifunction configuration, but they are hardcoded in
the device implementation. In this sense, the traditional (non-SR/IOV)
multifunction mechanism QEMU has will not work with ARI-capable devices.

   

But it should be a distinct check as it will need to check the function
number bits.
  
  


  
  

no?

I am quite worried about all this work going into blocking
what we think is disallowed configurations. We should have
maybe blocked them originally, but now that we didn't
there's a non-zero chance of regressions,


Sigh,


There's value in patches 1-4 I think - the last patch helped you find
these. so there's value in this work.
  

no medals here for being brave :-)


Try removing support for a 3.5mm jack next. Oh wait ...


Indeed. Everyone uses bluetooth these days. I for one is happy that the jack is 
gone (and they were bold enough to do it while Samsung and others still carry 
the useless port ) :-)


Hello from a guy using a shiny M2 Macbook Air carrying the legacy jack with
a 100-yen earphone. Even people who ported Linux to this machine spent
efforts to get the jack to work on Linux ;)
  
  
  

and the benefit
is not guaranteed.

--
MST






[PATCH v5 2/2] pcie: Specify 0 for ARI next function numbers

2023-07-04 Thread Akihiko Odaki
The current implementers of ARI are all SR-IOV devices. The ARI next
function number field is undefined for VF according to PCI Express Base
Specification Revision 5.0 Version 1.0 section 9.3.7.7. The PF should
end the linked list formed with the field by specifying 0 according to
section 7.8.7.2.

For migration, the field will keep having 1 as its value on the old
virt models.

Fixes: 2503461691 ("pcie: Add some SR/IOV API documentation in 
docs/pcie_sriov.txt")
Fixes: 44c2c09488 ("hw/nvme: Add support for SR-IOV")
Fixes: 3a977deebe ("Intrdocue igb device emulation")
Signed-off-by: Akihiko Odaki 
---
 include/hw/pci/pci.h | 2 ++
 hw/core/machine.c| 1 +
 hw/pci/pci.c | 2 ++
 hw/pci/pcie.c| 2 +-
 4 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
index e6d0574a29..9c5b5eb206 100644
--- a/include/hw/pci/pci.h
+++ b/include/hw/pci/pci.h
@@ -209,6 +209,8 @@ enum {
 QEMU_PCIE_CAP_CXL = (1 << QEMU_PCIE_CXL_BITNR),
 #define QEMU_PCIE_ERR_UNC_MASK_BITNR 11
 QEMU_PCIE_ERR_UNC_MASK = (1 << QEMU_PCIE_ERR_UNC_MASK_BITNR),
+#define QEMU_PCIE_ARI_NEXTFN_1_BITNR 12
+QEMU_PCIE_ARI_NEXTFN_1 = (1 << QEMU_PCIE_ARI_NEXTFN_1_BITNR),
 };
 
 typedef struct PCIINTxRoute {
diff --git a/hw/core/machine.c b/hw/core/machine.c
index 46f8f9a2b0..f0d35c6401 100644
--- a/hw/core/machine.c
+++ b/hw/core/machine.c
@@ -41,6 +41,7 @@
 
 GlobalProperty hw_compat_8_0[] = {
 { "migration", "multifd-flush-after-each-section", "on"},
+{ TYPE_PCI_DEVICE, "x-pcie-ari-nextfn-1", "on" },
 };
 const size_t hw_compat_8_0_len = G_N_ELEMENTS(hw_compat_8_0);
 
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e2eb4c3b4a..45a9bc0da8 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -82,6 +82,8 @@ static Property pci_props[] = {
 DEFINE_PROP_UINT32("acpi-index",  PCIDevice, acpi_index, 0),
 DEFINE_PROP_BIT("x-pcie-err-unc-mask", PCIDevice, cap_present,
 QEMU_PCIE_ERR_UNC_MASK_BITNR, true),
+DEFINE_PROP_BIT("x-pcie-ari-nextfn-1", PCIDevice, cap_present,
+QEMU_PCIE_ARI_NEXTFN_1_BITNR, false),
 DEFINE_PROP_END_OF_LIST()
 };
 
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 9a3f6430e8..cf09e03a10 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -1030,7 +1030,7 @@ void pcie_sync_bridge_lnk(PCIDevice *bridge_dev)
 /* ARI */
 void pcie_ari_init(PCIDevice *dev, uint16_t offset)
 {
-uint16_t nextfn = 1;
+uint16_t nextfn = dev->cap_present & QEMU_PCIE_ARI_NEXTFN_1 ? 1 : 0;
 
 pcie_add_capability(dev, PCI_EXT_CAP_ID_ARI, PCI_ARI_VER,
 offset, PCI_ARI_SIZEOF);
-- 
2.41.0




[PATCH v5 1/2] pcie: Use common ARI next function number

2023-07-04 Thread Akihiko Odaki
Currently the only implementers of ARI are SR-IOV devices, and they
behave similarly. Share the ARI next function number.

Signed-off-by: Akihiko Odaki 
---
 docs/pcie_sriov.txt   | 4 ++--
 include/hw/pci/pcie.h | 2 +-
 hw/net/igb.c  | 2 +-
 hw/net/igbvf.c| 2 +-
 hw/nvme/ctrl.c| 2 +-
 hw/pci/pcie.c | 4 +++-
 6 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/docs/pcie_sriov.txt b/docs/pcie_sriov.txt
index 7eff7f2703..a47aad0bfa 100644
--- a/docs/pcie_sriov.txt
+++ b/docs/pcie_sriov.txt
@@ -48,7 +48,7 @@ setting up a BAR for a VF.
   ...
   int ret = pcie_endpoint_cap_init(d, 0x70);
   ...
-  pcie_ari_init(d, 0x100, 1);
+  pcie_ari_init(d, 0x100);
   ...
 
   /* Add and initialize the SR/IOV capability */
@@ -78,7 +78,7 @@ setting up a BAR for a VF.
   ...
   int ret = pcie_endpoint_cap_init(d, 0x60);
   ...
-  pcie_ari_init(d, 0x100, 1);
+  pcie_ari_init(d, 0x100);
   ...
   memory_region_init(mr, ... )
   pcie_sriov_vf_register_bar(d, bar_nr, mr);
diff --git a/include/hw/pci/pcie.h b/include/hw/pci/pcie.h
index 3cc2b15957..bf7dc5d685 100644
--- a/include/hw/pci/pcie.h
+++ b/include/hw/pci/pcie.h
@@ -134,7 +134,7 @@ void pcie_sync_bridge_lnk(PCIDevice *dev);
 void pcie_acs_init(PCIDevice *dev, uint16_t offset);
 void pcie_acs_reset(PCIDevice *dev);
 
-void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn);
+void pcie_ari_init(PCIDevice *dev, uint16_t offset);
 void pcie_dev_ser_num_init(PCIDevice *dev, uint16_t offset, uint64_t ser_num);
 void pcie_ats_init(PCIDevice *dev, uint16_t offset, bool aligned);
 
diff --git a/hw/net/igb.c b/hw/net/igb.c
index 1c989d7677..8ff832acfc 100644
--- a/hw/net/igb.c
+++ b/hw/net/igb.c
@@ -431,7 +431,7 @@ static void igb_pci_realize(PCIDevice *pci_dev, Error 
**errp)
 hw_error("Failed to initialize AER capability");
 }
 
-pcie_ari_init(pci_dev, 0x150, 1);
+pcie_ari_init(pci_dev, 0x150);
 
 pcie_sriov_pf_init(pci_dev, IGB_CAP_SRIOV_OFFSET, TYPE_IGBVF,
 IGB_82576_VF_DEV_ID, IGB_MAX_VF_FUNCTIONS, IGB_MAX_VF_FUNCTIONS,
diff --git a/hw/net/igbvf.c b/hw/net/igbvf.c
index 284ea61184..d55e1e8a6a 100644
--- a/hw/net/igbvf.c
+++ b/hw/net/igbvf.c
@@ -270,7 +270,7 @@ static void igbvf_pci_realize(PCIDevice *dev, Error **errp)
 hw_error("Failed to initialize AER capability");
 }
 
-pcie_ari_init(dev, 0x150, 1);
+pcie_ari_init(dev, 0x150);
 }
 
 static void igbvf_pci_uninit(PCIDevice *dev)
diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
index fd917fcda1..8b7168a266 100644
--- a/hw/nvme/ctrl.c
+++ b/hw/nvme/ctrl.c
@@ -8088,7 +8088,7 @@ static bool nvme_init_pci(NvmeCtrl *n, PCIDevice 
*pci_dev, Error **errp)
 pcie_endpoint_cap_init(pci_dev, 0x80);
 pcie_cap_flr_init(pci_dev);
 if (n->params.sriov_max_vfs) {
-pcie_ari_init(pci_dev, 0x100, 1);
+pcie_ari_init(pci_dev, 0x100);
 }
 
 /* add one to max_ioqpairs to account for the admin queue pair */
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index b8c24cf45f..9a3f6430e8 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -1028,8 +1028,10 @@ void pcie_sync_bridge_lnk(PCIDevice *bridge_dev)
  */
 
 /* ARI */
-void pcie_ari_init(PCIDevice *dev, uint16_t offset, uint16_t nextfn)
+void pcie_ari_init(PCIDevice *dev, uint16_t offset)
 {
+uint16_t nextfn = 1;
+
 pcie_add_capability(dev, PCI_EXT_CAP_ID_ARI, PCI_ARI_VER,
 offset, PCI_ARI_SIZEOF);
 pci_set_long(dev->config + offset + PCI_ARI_CAP, (nextfn & 0xff) << 8);
-- 
2.41.0




[PATCH v5 0/2] pcie: Fix ARI next function numbers

2023-07-04 Thread Akihiko Odaki
The ARI next function number field is undefined for VF. The PF should
end the linked list formed with the field by specifying 0.

Supersedes: <20230701070133.24877-1-akihiko.od...@daynix.com>
("[PATCH 0/4] pci: Compare function number and ARI next function number")

V4 -> V5:
  Added references to the specification. (Igor Mammedov)

V3 -> V4:
  Corrected the default value of x-pcie-ari-nextfn-1. (Igor Mammedov)
  Added an explanation for migration compatibility. (Igor Mammedov)

V2 -> V3:
  Moved the logic to PCI common infrastructure (Michael S. Tsirkin)

V1 -> V2:
  Fixed migration. (Michael S. Tsirkin)
  Added a caveat comment. (Michael S. Tsirkin)

Akihiko Odaki (2):
  pcie: Use common ARI next function number
  pcie: Specify 0 for ARI next function numbers

 docs/pcie_sriov.txt   | 4 ++--
 include/hw/pci/pci.h  | 2 ++
 include/hw/pci/pcie.h | 2 +-
 hw/core/machine.c | 1 +
 hw/net/igb.c  | 2 +-
 hw/net/igbvf.c| 2 +-
 hw/nvme/ctrl.c| 2 +-
 hw/pci/pci.c  | 2 ++
 hw/pci/pcie.c | 4 +++-
 9 files changed, 14 insertions(+), 7 deletions(-)

-- 
2.41.0




Re: [PATCH RFC v2 3/4] vdpa: Restore packet receive filtering state relative with _F_CTRL_RX feature

2023-07-04 Thread Hawkins Jiawei
On 2023/7/4 23:39, Eugenio Perez Martin wrote:
> On Thu, Jun 29, 2023 at 5:26 PM Hawkins Jiawei  wrote:
>>
>> This patch introduces vhost_vdpa_net_load_rx_mode()
>> and vhost_vdpa_net_load_rx() to restore the packet
>> receive filtering state in relation to
>> VIRTIO_NET_F_CTRL_RX feature at device's startup.
>>
>> Signed-off-by: Hawkins Jiawei 
>> ---
>> v2:
>>- avoid sending CVQ command in default state suggested by Eugenio
>>
>> v1: 
>> https://lore.kernel.org/all/86eeddcd6f6b04e5c1e44e901ddea3b1b8b6c183.1687402580.git.yin31...@gmail.com/
>>
>>   net/vhost-vdpa.c | 104 +++
>>   1 file changed, 104 insertions(+)
>>
>> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
>> index cb45c84c88..9d5d88756c 100644
>> --- a/net/vhost-vdpa.c
>> +++ b/net/vhost-vdpa.c
>> @@ -792,6 +792,106 @@ static int vhost_vdpa_net_load_offloads(VhostVDPAState 
>> *s,
>>   return 0;
>>   }
>>
>> +static int vhost_vdpa_net_load_rx_mode(VhostVDPAState *s,
>> +   uint8_t cmd,
>> +   uint8_t on)
>> +{
>> +ssize_t dev_written;
>> +const struct iovec data = {
>> +.iov_base = ,
>> +.iov_len = sizeof(on),
>> +};
>> +dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_RX,
>> +  cmd, , 1);
>> +if (unlikely(dev_written < 0)) {
>> +return dev_written;
>> +}
>> +if (*s->status != VIRTIO_NET_OK) {
>> +return -EINVAL;
>> +}
>> +
>> +return 0;
>> +}
>> +
>> +static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
>> +  const VirtIONet *n)
>> +{
>> +uint8_t on;
>> +int r;
>> +
>> +if (virtio_vdev_has_feature(>parent_obj, VIRTIO_NET_F_CTRL_RX)) {
>
> Also suggesting early returns here.

So, for CVQ commands related to VIRTIO_NET_F_CTRL_EXTRA_RX, would it be
more appropriate to create a new function, maybe
vhost_vdpa_net_load_rx_extra, to handle them instead of sending those
CVQ commands within this function, if we choose to return early?

>
>> +/* Load the promiscous mode */
>> +if (n->mac_table.uni_overflow) {
>> +/*
>> + * According to VirtIO standard, "Since there are no guarantees,
>> + * it can use a hash filter or silently switch to
>> + * allmulti or promiscuous mode if it is given too many 
>> addresses."
>> + *
>> + * QEMU ignores non-multicast(unicast) MAC addresses and
>> + * marks `uni_overflow` for the device internal state
>> + * if guest sets too many non-multicast(unicast) MAC addresses.
>> + * Therefore, we should turn promiscous mode on in this case.
>> + */
>> +on = 1;
>> +} else {
>> +on = n->promisc;
>> +}
>
> I think we can remove the "on" variable and just do:
>
> /*
>   * According to ...
>   */
> if (n->mac_table.uni_overflow || n->promisc) {
>r = vhost_vdpa_net_load_rx_mode(s, VIRTIO_NET_CTRL_RX_PROMISC, on);
>if (r < 0) {
>  return r;
>}
> ---
>
> And the equivalent for multicast.
>
> Would that make sense?

Yes, I will refactor these according to your suggestion.

Thanks!


>
> Thanks!
>
>> +if (on != 1) {
>> +/*
>> + * According to virtio_net_reset(), device turns promiscuous 
>> mode on
>> + * by default.
>> + *
>> + * Therefore, there is no need to send this CVQ command if the
>> + * driver also sets promiscuous mode on, which aligns with
>> + * the device's defaults.
>> + *
>> + * Note that the device's defaults can mismatch the driver's
>> + * configuration only at live migration.
>> + */
>> +r = vhost_vdpa_net_load_rx_mode(s, VIRTIO_NET_CTRL_RX_PROMISC, 
>> on);
>> +if (r < 0) {
>> +return r;
>> +}
>> +}
>> +
>> +/* Load the all-multicast mode */
>> +if (n->mac_table.multi_overflow) {
>> +/*
>> + * According to VirtIO standard, "Since there are no guarantees,
>> + * it can use a hash filter or silently switch to
>> + * allmulti or promiscuous mode if it is given too many 
>> addresses."
>> + *
>> + * QEMU ignores multicast MAC addresses and
>> + * marks `multi_overflow` for the device internal state
>> + * if guest sets too many multicast MAC addresses.
>> + * Therefore, we should turn all-multicast mode on in this case.
>> + */
>> +on = 1;
>> +} else {
>> +on = n->allmulti;
>> +}
>> +if (on != 0) {
>> +/*
>> + * According to virtio_net_reset(), device turns all-multicast 
>> mode
>> + * off by default.
>> + *
>> + * 

Re: [PATCH] ppc/pnv: Set P10 core xscom region size to match hardware

2023-07-04 Thread Joel Stanley
On Wed, 5 Jul 2023 at 01:27, Nicholas Piggin  wrote:
>
> The P10 core xscom memory regions overlap because the size is wrong.
> The P10 core+L2 xscom region size is allocated as 0x1000 (with some
> unused ranges). "EC" is used as a closer match, as "EX" includes L3
> which has a disjoint xscom range that would require a different
> region if it were implemented.
>
> Signed-off-by: Nicholas Piggin 

Nice, that looks better:

0001-0001000f (prio 0, i/o): xscom-quad.0: 0x10
000100108000-00010010 (prio 0, i/o): xscom-core.3: 0x8000
00010011-000100117fff (prio 0, i/o): xscom-core.2: 0x8000
00010012-000100127fff (prio 0, i/o): xscom-core.1: 0x8000
00010014-000100147fff (prio 0, i/o): xscom-core.0: 0x8000
00010800-0001080f (prio 0, i/o): xscom-quad.4: 0x10
000108108000-00010810 (prio 0, i/o): xscom-core.7: 0x8000
00010811-000108117fff (prio 0, i/o): xscom-core.6: 0x8000
00010812-000108127fff (prio 0, i/o): xscom-core.5: 0x8000
00010814-000108147fff (prio 0, i/o): xscom-core.4: 0x8000

Reviewed-by: Joel Stanley 


> ---
>  hw/ppc/pnv_core.c  | 3 +--
>  include/hw/ppc/pnv_xscom.h | 2 +-
>  2 files changed, 2 insertions(+), 3 deletions(-)
>
> diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
> index b7223bb445..ffbc29cbf4 100644
> --- a/hw/ppc/pnv_core.c
> +++ b/hw/ppc/pnv_core.c
> @@ -299,9 +299,8 @@ static void pnv_core_realize(DeviceState *dev, Error 
> **errp)
>  }
>
>  snprintf(name, sizeof(name), "xscom-core.%d", cc->core_id);
> -/* TODO: check PNV_XSCOM_EX_SIZE for p10 */
>  pnv_xscom_region_init(>xscom_regs, OBJECT(dev), pcc->xscom_ops,
> -  pc, name, PNV_XSCOM_EX_SIZE);
> +  pc, name, PNV10_XSCOM_EC_SIZE);
>
>  qemu_register_reset(pnv_core_reset, pc);
>  return;
> diff --git a/include/hw/ppc/pnv_xscom.h b/include/hw/ppc/pnv_xscom.h
> index f7da9a1dc6..a4c9d95dc5 100644
> --- a/include/hw/ppc/pnv_xscom.h
> +++ b/include/hw/ppc/pnv_xscom.h
> @@ -133,7 +133,7 @@ struct PnvXScomInterfaceClass {
>
>  #define PNV10_XSCOM_EC_BASE(core) \
>  ((uint64_t) PNV10_XSCOM_EQ_BASE(core) | PNV10_XSCOM_EC(core & 0x3))
> -#define PNV10_XSCOM_EC_SIZE0x10
> +#define PNV10_XSCOM_EC_SIZE0x1000
>
>  #define PNV10_XSCOM_PSIHB_BASE 0x3011D00
>  #define PNV10_XSCOM_PSIHB_SIZE 0x100
> --
> 2.40.1
>



Re: [PATCH RFC v2 2/4] vdpa: Restore MAC address filtering state

2023-07-04 Thread Hawkins Jiawei
On 2023/7/4 22:53, Eugenio Perez Martin wrote:
> On Thu, Jun 29, 2023 at 5:26 PM Hawkins Jiawei  wrote:
>>
>> This patch refactors vhost_vdpa_net_load_mac() to
>> restore the MAC address filtering state at device's startup.
>>
>> Signed-off-by: Hawkins Jiawei 
>> ---
>> v2:
>>- use iovec suggested by Eugenio
>>- avoid sending CVQ command in default state
>>
>> v1: 
>> https://lore.kernel.org/all/00f72fe154a882fd6dc15bc39e3a1ac63f9dadce.1687402580.git.yin31...@gmail.com/
>>
>>   net/vhost-vdpa.c | 51 
>>   1 file changed, 51 insertions(+)
>>
>> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
>> index 0bd1c7817c..cb45c84c88 100644
>> --- a/net/vhost-vdpa.c
>> +++ b/net/vhost-vdpa.c
>> @@ -665,6 +665,57 @@ static int vhost_vdpa_net_load_mac(VhostVDPAState *s, 
>> const VirtIONet *n)
>>   }
>>   }
>>
>> +if (virtio_vdev_has_feature(>parent_obj, VIRTIO_NET_F_CTRL_RX)) {
>> +if (n->mac_table.in_use != 0) {
>
> This may be just style nitpicking, but I find it more clear to return
> early if conditions are not met and then send the CVQ command.

Yes, this makes code more clear to read.

But it appears that we may meet a problem if the function
vhost_vdpa_net_load_x() sends multiple CVQ commands. It is possible that
we might not meet the condition for one of the CVQ commands, but we
could still meet the conditions for other CVQ commands.

Therefore, in the case of vhost_vdpa_net_load_x() sending multiple CVQ
commands, if we still hope to use this style, should we split the
function into multiple functions, with each function responsible for
sending only one CVQ command? Or should we jump to the next CVQ command
instead of returning from the function if the conditions are not met?

Thanks!


> Something like:
> /*
>   * According to ...
>   */
> if (!virtio_vdev_has_feature(>parent_obj, VIRTIO_NET_F_CTRL_RX)) ||
> (n->mac_table.in_use == 0)) {
>return 0
> }
>
> uni_entries = n->mac_table.first_multi,
> ...
> ---
>
> Now I just realized vhost_vdpa_net_load_mac does not follow this for
> checking VIRTIO_NET_F_CTRL_MAC_ADDR.
>
> I'm ok if you leave it this way though.
>
> Thanks!
>
>> +/*
>> + * According to virtio_net_reset(), device uses an empty MAC 
>> filter
>> + * table as its default state.
>> + *
>> + * Therefore, there is no need to send this CVQ command if the
>> + * driver also sets an empty MAC filter table, which aligns with
>> + * the device's defaults.
>> + *
>> + * Note that the device's defaults can mismatch the driver's
>> + * configuration only at live migration.
>> + */
>> +uint32_t uni_entries = n->mac_table.first_multi,
>> + uni_macs_size = uni_entries * ETH_ALEN,
>> + mul_entries = n->mac_table.in_use - uni_entries,
>> + mul_macs_size = mul_entries * ETH_ALEN;
>> +struct virtio_net_ctrl_mac uni = {
>> +.entries = cpu_to_le32(uni_entries),
>> +};
>> +struct virtio_net_ctrl_mac mul = {
>> +.entries = cpu_to_le32(mul_entries),
>> +};
>> +const struct iovec data[] = {
>> +{
>> +.iov_base = ,
>> +.iov_len = sizeof(uni),
>> +}, {
>> +.iov_base = n->mac_table.macs,
>> +.iov_len = uni_macs_size,
>> +}, {
>> +.iov_base = ,
>> +.iov_len = sizeof(mul),
>> +}, {
>> +.iov_base = >mac_table.macs[uni_macs_size],
>> +.iov_len = mul_macs_size,
>> +},
>> +};
>> +ssize_t dev_written = vhost_vdpa_net_load_cmd(s,
>> +VIRTIO_NET_CTRL_MAC,
>> +VIRTIO_NET_CTRL_MAC_TABLE_SET,
>> +data, ARRAY_SIZE(data));
>> +if (unlikely(dev_written < 0)) {
>> +return dev_written;
>> +}
>> +if (*s->status != VIRTIO_NET_OK) {
>> +return -EINVAL;
>> +}
>> +}
>> +}
>> +
>>   return 0;
>>   }
>>
>> --
>> 2.25.1
>>
>



Re: [PATCH v7 5/6] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-07-04 Thread Akihiko Odaki




On 2023/07/05 0:07, Ani Sinha wrote:




On 04-Jul-2023, at 7:58 PM, Igor Mammedov  wrote:

On Tue, 4 Jul 2023 19:20:00 +0530
Ani Sinha  wrote:


On 04-Jul-2023, at 6:18 PM, Igor Mammedov  wrote:

On Tue, 4 Jul 2023 21:02:09 +0900
Akihiko Odaki  wrote:


On 2023/07/04 20:59, Ani Sinha wrote:




On 04-Jul-2023, at 5:24 PM, Akihiko Odaki  wrote:

On 2023/07/04 20:25, Ani Sinha wrote:

PCI Express ports only have one slot, so PCI Express devices can only be
plugged into slot 0 on a PCIE port. Add a warning to let users know when the
invalid configuration is used. We may enforce this more strongly later on once
we get more clarity on whether we are introducing a bad regression for users
currenly using the wrong configuration.
The change has been tested to not break or alter behaviors of ARI capable
devices by instantiating seven vfs on an emulated igb device (the maximum
number of vfs the linux igb driver supports). The vfs instantiated correctly
and are seen to have non-zero device/slot numbers in the conventional PCI BDF
representation.
CC: jus...@redhat.com
CC: imamm...@redhat.com
CC: m...@redhat.com
CC: akihiko.od...@daynix.com
Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
Signed-off-by: Ani Sinha 
Reviewed-by: Julia Suvorova 
---
hw/pci/pci.c | 15 +++
1 file changed, 15 insertions(+)
diff --git a/hw/pci/pci.c b/hw/pci/pci.c
index e2eb4c3b4a..47517ba3db 100644
--- a/hw/pci/pci.c
+++ b/hw/pci/pci.c
@@ -65,6 +65,7 @@ bool pci_available = true;
static char *pcibus_get_dev_path(DeviceState *dev);
static char *pcibus_get_fw_dev_path(DeviceState *dev);
static void pcibus_reset(BusState *qbus);
+static bool pcie_has_upstream_port(PCIDevice *dev);
   static Property pci_props[] = {
 DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
@@ -2121,6 +2122,20 @@ static void pci_qdev_realize(DeviceState *qdev, Error 
**errp)
 }
 }
+/*
+ * With SRIOV and ARI, vfs can have non-zero slot in the conventional
+ * PCI interpretation as all five bits reserved for slot addresses are
+ * also used for function bits for the various vfs. Ignore that case.


You don't have to mention SR/IOV; it affects all ARI-capable devices. A PF can 
also have non-zero slot number in the conventional interpretation so you 
shouldn't call it vf either.


Can you please help write a comment that explains this properly for all cases - 
ARI/non-ARI, PFs and VFs? Once everyone agrees that its clear and correct, I 
will re-spin.


Simply, you can say:
With ARI, the slot number field in the conventional PCI interpretation
can have a non-zero value as the field bits are reused to extend the
function number bits. Ignore that case.


mentioning 'conventional PCI interpretation' in comment and then immediately
checking 'pci_is_express(pci_dev)' is confusing. Since comment belongs
only to PCIE branch it would be better to talk in only about PCIe stuff
and referring to relevant portions of spec.


Ok so how about this?

   * With ARI, devices can have non-zero slot in the traditional BDF
 * representation as all five bits reserved for slot addresses are
 * also used for function bits. Ignore that case.


you still refer to traditional (which I misread as 'conventional'),
steal the linux comment and augment it with ARI if necessary,
something like this (probably needs some more massaging):


The comment messaging in these patches seems to exceed the value of the patch 
itself :-)

How about this?

 /*
  * A PCIe Downstream Port normally leads to a Link with only Device
  * 0 on it (PCIe spec r3.1, sec 7.3.1).
  * With ARI, PCI_SLOT() can return non-zero value as all five bits
  * reserved for slot addresses are also used for function bits.
  * Hence, ignore ARI capable devices.
  */


Perhaps: s/normally leads to/must lead to/

From the kernel perspective, they may need to deal with a quirky 
hardware that does not conform with the specification, but from QEMU 
perspective, it is what we *must* conform with.


Otherwise looks good to me.






 /*
 * A PCIe Downstream Port normally leads to a Link with only Device
 * 0 on it (PCIe spec r3.1, sec 7.3.1).
  However PCI_SLOT() is broken if ARI is enabled, hence work around it
  by skipping the check if the latter cap is present.
 */





(for example see how it's done in kernel code: only_one_child(...)

PS:
kernel can be forced  to scan for !0 device numbers, but that's rather
a hack, so we shouldn't really care about that.








+ */
+if (pci_is_express(pci_dev) &&
+!pcie_find_capability(pci_dev, PCI_EXT_CAP_ID_ARI) &&
+pcie_has_upstream_port(pci_dev) &&
+PCI_SLOT(pci_dev->devfn)) {
+warn_report("PCI: slot %d is not valid for %s,"
+" parent device only allows plugging into slot 0.",
+PCI_SLOT(pci_dev->devfn), pci_dev->name);
+}
+
 if (pci_dev->failover_pair_id) {
 if 

[PATCH] ppc/pnv: Set P10 core xscom region size to match hardware

2023-07-04 Thread Nicholas Piggin
The P10 core xscom memory regions overlap because the size is wrong.
The P10 core+L2 xscom region size is allocated as 0x1000 (with some
unused ranges). "EC" is used as a closer match, as "EX" includes L3
which has a disjoint xscom range that would require a different
region if it were implemented.

Signed-off-by: Nicholas Piggin 
---
 hw/ppc/pnv_core.c  | 3 +--
 include/hw/ppc/pnv_xscom.h | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/hw/ppc/pnv_core.c b/hw/ppc/pnv_core.c
index b7223bb445..ffbc29cbf4 100644
--- a/hw/ppc/pnv_core.c
+++ b/hw/ppc/pnv_core.c
@@ -299,9 +299,8 @@ static void pnv_core_realize(DeviceState *dev, Error **errp)
 }
 
 snprintf(name, sizeof(name), "xscom-core.%d", cc->core_id);
-/* TODO: check PNV_XSCOM_EX_SIZE for p10 */
 pnv_xscom_region_init(>xscom_regs, OBJECT(dev), pcc->xscom_ops,
-  pc, name, PNV_XSCOM_EX_SIZE);
+  pc, name, PNV10_XSCOM_EC_SIZE);
 
 qemu_register_reset(pnv_core_reset, pc);
 return;
diff --git a/include/hw/ppc/pnv_xscom.h b/include/hw/ppc/pnv_xscom.h
index f7da9a1dc6..a4c9d95dc5 100644
--- a/include/hw/ppc/pnv_xscom.h
+++ b/include/hw/ppc/pnv_xscom.h
@@ -133,7 +133,7 @@ struct PnvXScomInterfaceClass {
 
 #define PNV10_XSCOM_EC_BASE(core) \
 ((uint64_t) PNV10_XSCOM_EQ_BASE(core) | PNV10_XSCOM_EC(core & 0x3))
-#define PNV10_XSCOM_EC_SIZE0x10
+#define PNV10_XSCOM_EC_SIZE0x1000
 
 #define PNV10_XSCOM_PSIHB_BASE 0x3011D00
 #define PNV10_XSCOM_PSIHB_SIZE 0x100
-- 
2.40.1




Re: [PATCH v2 0/5] ppc/pnv: Extend "quad" model for p10

2023-07-04 Thread Nicholas Piggin
On Tue Jul 4, 2023 at 3:41 PM AEST, Joel Stanley wrote:
> The quad model implements the EC xscoms for the p9 machine, reusing the
> same model for p10 which isn't quite correct. This series adds a PnvQuad
> class and subclasses it for P9 and P10.
>
> I mistakenly thought we needed the quad model to implement the core
> thread state scom on p10, because the read was coming in to the address
> belonging to the quad. In fact the quad region was too large,
> overlapping with the core. This is fixed in v2, and the core thread is
> back where it should be in the core model. This should address Nick's
> feedback on the v1 cover letter.

Already queued, but FWIW these all look good to me. Thanks, this is a
good base to add some more functions on too.

One thing is the core xscom regions seem to still overlap...

Thanks,
Nick

>
> v2 also adds Cedric's r-b, fixes the s/write/read/ mistakes, and is
> checkpatch clean.
>
> v1: https://lore.kernel.org/qemu-devel/20230630035547.80329-1-j...@jms.id.au/
>
> Joel Stanley (5):
>   ppc/pnv: quad xscom callbacks are P9 specific
>   ppc/pnv: Subclass quad xscom callbacks
>   ppc/pnv: Add P10 quad xscom model
>   ppc/pnv: Add P10 core xscom model
>   ppc/pnv: Return zero for core thread state xscom
>
>  include/hw/ppc/pnv_core.h  |  13 ++-
>  include/hw/ppc/pnv_xscom.h |   2 +-
>  hw/ppc/pnv.c   |  11 ++-
>  hw/ppc/pnv_core.c  | 165 +++--
>  4 files changed, 162 insertions(+), 29 deletions(-)




Re: [PATCH RFC v2 1/4] vdpa: Use iovec for vhost_vdpa_net_load_cmd()

2023-07-04 Thread Hawkins Jiawei
On 2023/7/4 22:17, Eugenio Perez Martin wrote:
> On Thu, Jun 29, 2023 at 5:25 PM Hawkins Jiawei  wrote:
>>
>> According to VirtIO standard, "The driver MUST follow
>> the VIRTIO_NET_CTRL_MAC_TABLE_SET command by a le32 number,
>> followed by that number of non-multicast MAC addresses,
>> followed by another le32 number, followed by that number
>> of multicast addresses."
>>
>> Considering that this data is not stored in contiguous memory,
>> this patch refactors vhost_vdpa_net_load_cmd() to accept
>> scattered data, eliminating the need for an additional data copy or
>> packing the data into s->cvq_cmd_out_buffer outside of
>> vhost_vdpa_net_load_cmd().
>>
>> Signed-off-by: Hawkins Jiawei 
>> ---
>> v2:
>>- refactor vhost_vdpa_load_cmd() to accept iovec suggested by
>> Eugenio
>>
>>   net/vhost-vdpa.c | 42 --
>>   1 file changed, 32 insertions(+), 10 deletions(-)
>>
>> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
>> index 6f6a5c6df6..0bd1c7817c 100644
>> --- a/net/vhost-vdpa.c
>> +++ b/net/vhost-vdpa.c
>> @@ -620,29 +620,43 @@ static ssize_t vhost_vdpa_net_cvq_add(VhostVDPAState 
>> *s, size_t out_len,
>>   }
>>
>>   static ssize_t vhost_vdpa_net_load_cmd(VhostVDPAState *s, uint8_t class,
>> -   uint8_t cmd, const void *data,
>> -   size_t data_size)
>> +   uint8_t cmd, const struct iovec 
>> *data,
>> +   size_t data_len)
>>   {
>>   const struct virtio_net_ctrl_hdr ctrl = {
>>   .class = class,
>>   .cmd = cmd,
>>   };
>> +void *cursor = s->cvq_cmd_out_buffer;
>>
>> -assert(data_size < vhost_vdpa_net_cvq_cmd_page_len() - sizeof(ctrl));
>> +/* pack the CVQ command header */
>> +assert(sizeof(ctrl) < vhost_vdpa_net_cvq_cmd_page_len() -
>> +  (cursor - s->cvq_cmd_out_buffer));
>> +memcpy(cursor, , sizeof(ctrl));
>> +cursor += sizeof(ctrl);
>>
>> -memcpy(s->cvq_cmd_out_buffer, , sizeof(ctrl));
>> -memcpy(s->cvq_cmd_out_buffer + sizeof(ctrl), data, data_size);
>> +/* pack the CVQ command command-specific-data */
>> +for (int i = 0; i < data_len; ++i) {
>> +assert(data[i].iov_len < vhost_vdpa_net_cvq_cmd_page_len() -
>> + (cursor - s->cvq_cmd_out_buffer));
>> +memcpy(cursor, data[i].iov_base, data[i].iov_len);
>> +cursor += data[i].iov_len;
>> +}
>
> Can we replace all of the above by iov_to_buf?

Yes, you are right.

We should use iov_to_buf() here. I will refactor this patch according to
your suggestion.

Thanks!


>
>>
>> -return vhost_vdpa_net_cvq_add(s, sizeof(ctrl) + data_size,
>> +return vhost_vdpa_net_cvq_add(s, cursor - s->cvq_cmd_out_buffer,
>> sizeof(virtio_net_ctrl_ack));
>>   }
>>
>>   static int vhost_vdpa_net_load_mac(VhostVDPAState *s, const VirtIONet *n)
>>   {
>>   if (virtio_vdev_has_feature(>parent_obj, 
>> VIRTIO_NET_F_CTRL_MAC_ADDR)) {
>> +const struct iovec data = {
>> +.iov_base = (void *)n->mac,
>> +.iov_len = sizeof(n->mac),
>> +};
>>   ssize_t dev_written = vhost_vdpa_net_load_cmd(s, 
>> VIRTIO_NET_CTRL_MAC,
>> 
>> VIRTIO_NET_CTRL_MAC_ADDR_SET,
>> -  n->mac, sizeof(n->mac));
>> +  , 1);
>>   if (unlikely(dev_written < 0)) {
>>   return dev_written;
>>   }
>> @@ -665,9 +679,13 @@ static int vhost_vdpa_net_load_mq(VhostVDPAState *s,
>>   }
>>
>>   mq.virtqueue_pairs = cpu_to_le16(n->curr_queue_pairs);
>> +const struct iovec data = {
>> +.iov_base = ,
>> +.iov_len = sizeof(mq),
>> +};
>>   dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_MQ,
>> -  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET, 
>> ,
>> -  sizeof(mq));
>> +  VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET,
>> +  , 1);
>>   if (unlikely(dev_written < 0)) {
>>   return dev_written;
>>   }
>> @@ -706,9 +724,13 @@ static int vhost_vdpa_net_load_offloads(VhostVDPAState 
>> *s,
>>   }
>>
>>   offloads = cpu_to_le64(n->curr_guest_offloads);
>> +const struct iovec data = {
>> +.iov_base = ,
>> +.iov_len = sizeof(offloads),
>> +};
>>   dev_written = vhost_vdpa_net_load_cmd(s, 
>> VIRTIO_NET_CTRL_GUEST_OFFLOADS,
>> 
>> VIRTIO_NET_CTRL_GUEST_OFFLOADS_SET,
>> -  , sizeof(offloads));
>> +  , 1);
>>   if (unlikely(dev_written < 0)) {
>>   return dev_written;
>>   }
>> --
>> 

RE: [PATCH] Hexagon: move GETPC() calls to top level helpers

2023-07-04 Thread ltaylorsimpson



-Original Message-
From: Matheus Tavares Bernardino  
Sent: Monday, July 3, 2023 3:50 PM
To: qemu-devel@nongnu.org
Cc: bc...@quicinc.com; quic_mlie...@quicinc.com; ltaylorsimp...@gmail.com
Subject: [PATCH] Hexagon: move GETPC() calls to top level helpers

As docs/devel/loads-stores.rst states:

  ``GETPC()`` should be used with great care: calling
  it in other functions that are *not* the top level
  ``HELPER(foo)`` will cause unexpected behavior. Instead, the
  value of ``GETPC()`` should be read from the helper and passed
  if needed to the functions that the helper calls.

Let's fix the GETPC() usage in Hexagon, making sure it's always called from
top level helpers and passed down to the places where it's needed. There are
two snippets where that is not currently the case:

- probe_store(), which is only called from two helpers, so it's easy to
  move GETPC() up.

- mem_load*() functions, which are also called directly from helpers,
  but through the MEM_LOAD*() set of macros. Note that this are only
  used when compiling with --disable-hexagon-idef-parser.

  In this case, we also take this opportunity to simplify the code,
  unifying the mem_load*() functions.

Signed-off-by: Matheus Tavares Bernardino 
---
 target/hexagon/macros.h| 22 ++---
 target/hexagon/op_helper.h | 11 ++---  target/hexagon/op_helper.c | 49
+++---
 3 files changed, 25 insertions(+), 57 deletions(-)

diff --git a/target/hexagon/macros.h b/target/hexagon/macros.h index
5451b061ee..efb8013912 100644
--- a/target/hexagon/macros.h
+++ b/target/hexagon/macros.h
+
+#define MEM_LOADn(SIZE, VA) ({ \
+check_noshuf(env, pkt_has_store_s1, slot, VA, SIZE); \
+cpu_ldub_data_ra(env, VA, GETPC()); \
+})

Note that check_noshuf calls HELPER(probe_noshuf_load) and
HELPER(commit_store).  Both of those call GETPC() from within.  So, you'll
need to pull the contents into separate functions that take ra as an
argument.

Does this pass the test suite?  You are only using the SIZE parameter in
check_noshuf, but cpu_ldub_data_ra only reads a single byte.

+#define MEM_LOAD1s(VA) ((int8_t)MEM_LOADn(1, VA)) #define 
+MEM_LOAD1u(VA) ((uint8_t)MEM_LOADn(1, VA)) #define MEM_LOAD2s(VA) 
+((int16_t)MEM_LOADn(2, VA)) #define MEM_LOAD2u(VA) 
+((uint16_t)MEM_LOADn(2, VA)) #define MEM_LOAD4s(VA) 
+((int32_t)MEM_LOADn(4, VA)) #define MEM_LOAD4u(VA) 
+((uint32_t)MEM_LOADn(4, VA)) #define MEM_LOAD8s(VA) 
+((int64_t)MEM_LOADn(8, VA)) #define MEM_LOAD8u(VA) 
+((uint64_t)MEM_LOADn(8, VA))

A further cleanup would be to remove these altogether and modify the
definition of fLOAD to use MEM_LOADn.

 
 #define MEM_STORE1(VA, DATA, SLOT) log_store32(env, VA, DATA, 1, SLOT)
#define MEM_STORE2(VA, DATA, SLOT) log_store32(env, VA, DATA, 2, SLOT) diff
--git a/target/hexagon/op_helper.h b/target/hexagon/op_helper.h index
8f3764d15e..845c3d197e 100644
--- a/target/hexagon/op_helper.h
+++ b/target/hexagon/op_helper.h
@@ -19,15 +19,8 @@
 #define HEXAGON_OP_HELPER_H
 
+void check_noshuf(CPUHexagonState *env, bool pkt_has_store_s1,
+  uint32_t slot, target_ulong vaddr, int size);

Does this really need to be non-static?


Thanks,
Taylor





Re: [PATCH] pnv/xive: Allow mmio operations of any size on the ESB CI pages

2023-07-04 Thread Daniel Henrique Barboza

Queued in gitlab.com/danielhb/qemu/tree/ppc-next. Thanks,


Daniel

On 7/4/23 11:48, Frederic Barrat wrote:

We currently only allow 64-bit operations on the ESB CI pages. There's
no real reason for that limitation, skiboot/linux didn't need
more. However the hardware supports any size, so this patch relaxes
that restriction. It impacts both the ESB pages for "normal"
interrupts as well as the ESB pages for escalation interrupts defined
for the ENDs.

Signed-off-by: Frederic Barrat 
---

This should wrap-up the cleanup about mmio size for the xive BARs. The
NVPG and NVC BAR accesses should also be relaxed but we don't really
implement them, any load/store currently fails. Something to address
when/if we implement them.

  hw/intc/xive.c  | 8 
  hw/intc/xive2.c | 4 ++--
  2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index f60c878345..c014e961a4 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -1175,11 +1175,11 @@ static const MemoryRegionOps xive_source_esb_ops = {
  .write = xive_source_esb_write,
  .endianness = DEVICE_BIG_ENDIAN,
  .valid = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  .impl = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  };
@@ -2006,11 +2006,11 @@ static const MemoryRegionOps xive_end_source_ops = {
  .write = xive_end_source_write,
  .endianness = DEVICE_BIG_ENDIAN,
  .valid = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  .impl = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  };
diff --git a/hw/intc/xive2.c b/hw/intc/xive2.c
index 4d9ff41956..c37ef25d44 100644
--- a/hw/intc/xive2.c
+++ b/hw/intc/xive2.c
@@ -954,11 +954,11 @@ static const MemoryRegionOps xive2_end_source_ops = {
  .write = xive2_end_source_write,
  .endianness = DEVICE_BIG_ENDIAN,
  .valid = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  .impl = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  };




Re: [PATCH v2 0/5] ppc/pnv: Extend "quad" model for p10

2023-07-04 Thread Daniel Henrique Barboza

Queued in gitlab.com/danielhb/qemu/tree/ppc-next. Thanks,


Daniel

On 7/4/23 02:41, Joel Stanley wrote:

The quad model implements the EC xscoms for the p9 machine, reusing the
same model for p10 which isn't quite correct. This series adds a PnvQuad
class and subclasses it for P9 and P10.

I mistakenly thought we needed the quad model to implement the core
thread state scom on p10, because the read was coming in to the address
belonging to the quad. In fact the quad region was too large,
overlapping with the core. This is fixed in v2, and the core thread is
back where it should be in the core model. This should address Nick's
feedback on the v1 cover letter.

v2 also adds Cedric's r-b, fixes the s/write/read/ mistakes, and is
checkpatch clean.

v1: https://lore.kernel.org/qemu-devel/20230630035547.80329-1-j...@jms.id.au/

Joel Stanley (5):
   ppc/pnv: quad xscom callbacks are P9 specific
   ppc/pnv: Subclass quad xscom callbacks
   ppc/pnv: Add P10 quad xscom model
   ppc/pnv: Add P10 core xscom model
   ppc/pnv: Return zero for core thread state xscom

  include/hw/ppc/pnv_core.h  |  13 ++-
  include/hw/ppc/pnv_xscom.h |   2 +-
  hw/ppc/pnv.c   |  11 ++-
  hw/ppc/pnv_core.c  | 165 +++--
  4 files changed, 162 insertions(+), 29 deletions(-)





Re: [PATCH] Hexagon: move GETPC() calls to top level helpers

2023-07-04 Thread Matheus Tavares Bernardino
> Matheus Tavares Bernardino  wrote:
>
> diff --git a/target/hexagon/macros.h b/target/hexagon/macros.h
> index 5451b061ee..efb8013912 100644
> --- a/target/hexagon/macros.h
> +++ b/target/hexagon/macros.h
> @@ -173,14 +173,20 @@
>  #define MEM_STORE8(VA, DATA, SLOT) \
>  MEM_STORE8_FUNC(DATA)(cpu_env, VA, DATA, SLOT)
>  #else
> -#define MEM_LOAD1s(VA) ((int8_t)mem_load1(env, pkt_has_store_s1, slot, VA))
> -#define MEM_LOAD1u(VA) ((uint8_t)mem_load1(env, pkt_has_store_s1, slot, VA))
> -#define MEM_LOAD2s(VA) ((int16_t)mem_load2(env, pkt_has_store_s1, slot, VA))
> -#define MEM_LOAD2u(VA) ((uint16_t)mem_load2(env, pkt_has_store_s1, slot, VA))
> -#define MEM_LOAD4s(VA) ((int32_t)mem_load4(env, pkt_has_store_s1, slot, VA))
> -#define MEM_LOAD4u(VA) ((uint32_t)mem_load4(env, pkt_has_store_s1, slot, VA))
> -#define MEM_LOAD8s(VA) ((int64_t)mem_load8(env, pkt_has_store_s1, slot, VA))
> -#define MEM_LOAD8u(VA) ((uint64_t)mem_load8(env, pkt_has_store_s1, slot, VA))
> +
> +#define MEM_LOADn(SIZE, VA) ({ \
> +check_noshuf(env, pkt_has_store_s1, slot, VA, SIZE); \
> +cpu_ldub_data_ra(env, VA, GETPC()); \
> +})
> +
> +#define MEM_LOAD1s(VA) ((int8_t)MEM_LOADn(1, VA))
> +#define MEM_LOAD1u(VA) ((uint8_t)MEM_LOADn(1, VA))
> +#define MEM_LOAD2s(VA) ((int16_t)MEM_LOADn(2, VA))
> +#define MEM_LOAD2u(VA) ((uint16_t)MEM_LOADn(2, VA))
> +#define MEM_LOAD4s(VA) ((int32_t)MEM_LOADn(4, VA))
> +#define MEM_LOAD4u(VA) ((uint32_t)MEM_LOADn(4, VA))
> +#define MEM_LOAD8s(VA) ((int64_t)MEM_LOADn(8, VA))
> +#define MEM_LOAD8u(VA) ((uint64_t)MEM_LOADn(8, VA))

Oops, an oversight from my side: this simplification is not correct
since the mem_load*() functions all call different variants of
cpu_ld*_data_ra(). I'll send a v2 correcting that.



Re: [PATCH qemu v5] aspeed add montblanc bmc reference from fuji

2023-07-04 Thread Michael Tokarev

04.07.2023 14:06, ~ssinprem wrote:

From: Sittisak Sinprem 

- I2C list follow I2C Tree v1.6 20230320
- fru eeprom data use FB FRU format version 4

Signed-off-by: Sittisak Sinprem 


Once again, this has nothing to do with qemu-stable@, it is not a fix
suitable for stable releases.

Thanks,

/mjt



Re: Memory region endianness

2023-07-04 Thread Peter Xu
On Fri, Jun 30, 2023 at 01:37:49AM +0200, BALATON Zoltan wrote:
> Hello,
> 
> Some devices have bits that allow the guest to change endianness of memory
> mapped resources, e.g. ati-vga should allow switching the regs BAR into big
> endian on writing a bit. What's the best way to emulate this?
> 
> The naive way could be to just test for the bit in the memory ops call backs
> and do the swap there, but that would add overhead when it's not needed
> (most guests don't need it) and there are two BARs to access the same
> registers (one is in an IO BAR that aliases part of the MEM space BAR) and
> these may need to have different endianness so I'd rather have the memory
> layer handle it.
> 
> Now the question is how can the endianness be changed from the memory ops
> call back? Is it allowed to overwrite ops.endianness or replace ops with
> another one that has DEVICE_BIG_ENDIAN? In MemoryRegion the ops field is
> declared const and nothing seems to try to change it so I guess it might not
> be changed.
> 
> Then do I need to define two memory regions one with little and another with
> big endian and unmap/map those when the bit is written? Can this be done
> when a write to the bit happens with LE ops then is it possible from the
> callback ro unmap the memory region being written and replace it with
> another? Is there any other easy simple way that I'm missing?

I hope it'll just work as you said.  It seems the same case to me comparing
to other memory region updates within an MMIO callback, no matter whether
the new MR only switched the endianness, or something else has changed.

Thanks,

-- 
Peter Xu




Re: [PATCH] ppc/pegasos2: Add support for -initrd command line option

2023-07-04 Thread BALATON Zoltan

On Tue, 4 Jul 2023, BALATON Zoltan wrote:

Forgot to add commit message here:

This also changes type of sz local variable to ssize_t because it is used 
to store return value of load_elf() and load_image_targphys() that return 
ssize_t.


Should I resend for this or can you amend on commit?

Regards,
BALATON Zoltan


Signed-off-by: BALATON Zoltan 
---
hw/ppc/pegasos2.c | 32 +++-
1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c
index af5489de26..9c9944188b 100644
--- a/hw/ppc/pegasos2.c
+++ b/hw/ppc/pegasos2.c
@@ -44,6 +44,8 @@
#define PROM_ADDR 0xfff00000
#define PROM_SIZE 0x80000

+#define INITRD_MIN_ADDR 0x600000
+
#define KVMPPC_HCALL_BASE0xf000
#define KVMPPC_H_RTAS(KVMPPC_HCALL_BASE + 0x0)
#define KVMPPC_H_VOF_CLIENT  (KVMPPC_HCALL_BASE + 0x5)
@@ -80,6 +82,8 @@ struct Pegasos2MachineState {
uint64_t kernel_addr;
uint64_t kernel_entry;
uint64_t kernel_size;
+uint64_t initrd_addr;
+uint64_t initrd_size;
};

static void *build_fdt(MachineState *machine, int *fdt_size);
@@ -117,7 +121,8 @@ static void pegasos2_init(MachineState *machine)
I2CBus *i2c_bus;
const char *fwname = machine->firmware ?: PROM_FILENAME;
char *filename;
-int i, sz;
+int i;
+ssize_t sz;
uint8_t *spd_data;

/* init CPU */
@@ -213,6 +218,20 @@ static void pegasos2_init(MachineState *machine)
warn_report("Using Virtual OpenFirmware but no -kernel option.");
}

+if (machine->initrd_filename) {
+pm->initrd_addr = pm->kernel_addr + pm->kernel_size + 64 * KiB;
+pm->initrd_addr = ROUND_UP(pm->initrd_addr, 4);
+pm->initrd_addr = MAX(pm->initrd_addr, INITRD_MIN_ADDR);
+sz = load_image_targphys(machine->initrd_filename, pm->initrd_addr,
+ machine->ram_size - pm->initrd_addr);
+if (sz <= 0) {
+error_report("Could not load initrd '%s'",
+ machine->initrd_filename);
+exit(1);
+}
+pm->initrd_size = sz;
+}
+
if (!pm->vof && machine->kernel_cmdline && machine->kernel_cmdline[0]) {
warn_report("Option -append may be ineffective with -bios.");
}
@@ -335,6 +354,11 @@ static void pegasos2_machine_reset(MachineState *machine, 
ShutdownCause reason)
error_report("Memory for kernel is in use");
exit(1);
}
+if (pm->initrd_size &&
+vof_claim(pm->vof, pm->initrd_addr, pm->initrd_size, 0) == -1) {
+error_report("Memory for initrd is in use");
+exit(1);
+}
fdt = build_fdt(machine, &sz);
/* FIXME: VOF assumes entry is same as load address */
d[0] = cpu_to_be64(pm->kernel_entry);
@@ -966,6 +990,12 @@ static void *build_fdt(MachineState *machine, int 
*fdt_size)
qemu_fdt_setprop_string(fdt, "/memory@0", "name", "memory");

qemu_fdt_add_subnode(fdt, "/chosen");
+if (pm->initrd_addr && pm->initrd_size) {
+qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-end",
+  pm->initrd_addr + pm->initrd_size);
+qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-start",
+  pm->initrd_addr);
+}
qemu_fdt_setprop_string(fdt, "/chosen", "bootargs",
machine->kernel_cmdline ?: "");
qemu_fdt_setprop_string(fdt, "/chosen", "name", "chosen");





[PATCH] ppc/pegasos2: Add support for -initrd command line option

2023-07-04 Thread BALATON Zoltan
Signed-off-by: BALATON Zoltan 
---
 hw/ppc/pegasos2.c | 32 +++-
 1 file changed, 31 insertions(+), 1 deletion(-)

diff --git a/hw/ppc/pegasos2.c b/hw/ppc/pegasos2.c
index af5489de26..9c9944188b 100644
--- a/hw/ppc/pegasos2.c
+++ b/hw/ppc/pegasos2.c
@@ -44,6 +44,8 @@
 #define PROM_ADDR 0xfff00000
 #define PROM_SIZE 0x80000
 
+#define INITRD_MIN_ADDR 0x600000
+
 #define KVMPPC_HCALL_BASE0xf000
 #define KVMPPC_H_RTAS(KVMPPC_HCALL_BASE + 0x0)
 #define KVMPPC_H_VOF_CLIENT  (KVMPPC_HCALL_BASE + 0x5)
@@ -80,6 +82,8 @@ struct Pegasos2MachineState {
 uint64_t kernel_addr;
 uint64_t kernel_entry;
 uint64_t kernel_size;
+uint64_t initrd_addr;
+uint64_t initrd_size;
 };
 
 static void *build_fdt(MachineState *machine, int *fdt_size);
@@ -117,7 +121,8 @@ static void pegasos2_init(MachineState *machine)
 I2CBus *i2c_bus;
 const char *fwname = machine->firmware ?: PROM_FILENAME;
 char *filename;
-int i, sz;
+int i;
+ssize_t sz;
 uint8_t *spd_data;
 
 /* init CPU */
@@ -213,6 +218,20 @@ static void pegasos2_init(MachineState *machine)
 warn_report("Using Virtual OpenFirmware but no -kernel option.");
 }
 
+if (machine->initrd_filename) {
+pm->initrd_addr = pm->kernel_addr + pm->kernel_size + 64 * KiB;
+pm->initrd_addr = ROUND_UP(pm->initrd_addr, 4);
+pm->initrd_addr = MAX(pm->initrd_addr, INITRD_MIN_ADDR);
+sz = load_image_targphys(machine->initrd_filename, pm->initrd_addr,
+ machine->ram_size - pm->initrd_addr);
+if (sz <= 0) {
+error_report("Could not load initrd '%s'",
+ machine->initrd_filename);
+exit(1);
+}
+pm->initrd_size = sz;
+}
+
 if (!pm->vof && machine->kernel_cmdline && machine->kernel_cmdline[0]) {
 warn_report("Option -append may be ineffective with -bios.");
 }
@@ -335,6 +354,11 @@ static void pegasos2_machine_reset(MachineState *machine, 
ShutdownCause reason)
 error_report("Memory for kernel is in use");
 exit(1);
 }
+if (pm->initrd_size &&
+vof_claim(pm->vof, pm->initrd_addr, pm->initrd_size, 0) == -1) {
+error_report("Memory for initrd is in use");
+exit(1);
+}
 fdt = build_fdt(machine, &sz);
 /* FIXME: VOF assumes entry is same as load address */
 d[0] = cpu_to_be64(pm->kernel_entry);
@@ -966,6 +990,12 @@ static void *build_fdt(MachineState *machine, int 
*fdt_size)
 qemu_fdt_setprop_string(fdt, "/memory@0", "name", "memory");
 
 qemu_fdt_add_subnode(fdt, "/chosen");
+if (pm->initrd_addr && pm->initrd_size) {
+qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-end",
+  pm->initrd_addr + pm->initrd_size);
+qemu_fdt_setprop_cell(fdt, "/chosen", "linux,initrd-start",
+  pm->initrd_addr);
+}
 qemu_fdt_setprop_string(fdt, "/chosen", "bootargs",
 machine->kernel_cmdline ?: "");
 qemu_fdt_setprop_string(fdt, "/chosen", "name", "chosen");
-- 
2.30.9




Re: [PATCH] xen-block: Avoid leaks on new error path

2023-07-04 Thread Peter Maydell
On Tue, 4 Jul 2023 at 18:19, Anthony PERARD  wrote:
>
> From: Anthony PERARD 
>
> Commit 189829399070 ("xen-block: Use specific blockdev driver")
> introduced a new error path, without taking care of allocated
> resources.
>
> So only allocate the qdicts after the error check, and free both
> `filename` and `driver` when we are about to return and thus taking
> care of both success and error path.
>
> Coverity only spotted the leak of qdicts (*_layer variables).
>
> Reported-by: Peter Maydell 
> Fixes: Coverity CID 1508722, 1398649
> Fixes: 189829399070 ("xen-block: Use specific blockdev driver")
> Signed-off-by: Anthony PERARD 

Reviewed-by: Peter Maydell 

thanks
-- PMM



Re: GLibC AMD CPUID cache reporting regression (was Re: qemu-user self emulation broken with default CPU on x86/x64)

2023-07-04 Thread Florian Weimer
* Daniel P. Berrangé:

> On Mon, Jul 03, 2023 at 06:03:08PM +0200, Pierrick Bouvier wrote:
>> Hi everyone,
>> 
>> Recently (in d135f781 [1], between v7.0.0 and v8.0.0), qemu-user default cpu
>> was updated to "max" instead of qemu32/qemu64.
>> 
>> This change "broke" qemu self emulation if this new default cpu is used.
>> 
>> $ ./qemu-x86_64 ./qemu-x86_64 --version
>> qemu-x86_64: ../util/cacheflush.c:212: init_cache_info: Assertion `(isize &
>> (isize - 1)) == 0' failed.
>> qemu: uncaught target signal 6 (Aborted) - core dumped
>> Aborted
>> 
>> By setting cpu back to qemu64, it works again.
>> $ ./qemu-x86_64 -cpu qemu64 ./qemu-x86_64  --version
>> qemu-x86_64 version 8.0.50 (v8.0.0-2317-ge125b08ed6)
>> Copyright (c) 2003-2023 Fabrice Bellard and the QEMU Project developers
>> 
>> Commenting assert does not work, as qemu aligned malloc fail shortly after.
>> 
>> I'm willing to fix it, but I'm not sure what is the issue with "max" cpu
>> exactly. Is it missing CPU cache line, or something else?
>
> I've observed GLibC is issuing CPUID leaf 0x8000_001d
>
> QEMU 'max' CPU model doesn't defnie xlevel, so QEMU makes it default
> to the same as min_xlevel, which is calculated to be 0x8000_000a.
>
> cpu_x86_cpuid() in QEMU sees CPUID leaf 0x8000_001d is above 0x8000_000a,
> and so considers it an invalid CPUID and thus forces it to report
> 0x0000_000d which is supposedly what an invalid CPUID leaf should do.
>
>
> Net result: glibc is asking for 0x8000_001d, but getting back data
> for 0x0000_000d.
>
> This doesn't end happily for obvious reasons, getting garbage for
> the dcache sizes.
>
>
> The 'qemu64' CPU model also gets CPUID leaf 0x8000_001d capped back
> to 0x0000_000d, but crucially qemu64 lacks the 'xsave' feature bit,
> so QEMU returns all-zeroes for CPUID leaf 0x0000_000d. Still not
> good, but this makes glibc report 0 for DCACHE_*, which in turn
> avoids tripping up the nested qemu which queries DCACHE sysconf.
>
> So the problem is thus more widespread than just 'max' CPU model.
>
> Any QEMU CPU model with vendor=AuthenticAMD and the xsave feature,
> and the xlevel unset, will cause glibc to report garbage for the
> L1D cache info
>
> Any QEMU CPU model with vendor=AuthenticAMD and without the xsave
> feature, and the xlevel unset, will cause glibc to report zeroes
> for L1D cache info
>
> Neither is good, but the latter at least doesn't trip up the
> nested QEMU when it queries L1D cache info.
>
> I'm unsure if QEMU's behaviour is correct with calculating the
> default 'xlevel' values for 'max', but I'm assuming the xlevel
> was correct for Opteron_G4/5 since those are explicitly set
> in the code for a long time.

We are tracking this as:

  New AMD cache size computation logic does not work for some CPUs,
  hypervisors
  

I filed it after we resolved the earlier crashes because the data is
clearly not accurate.  I was also able to confirm that impacts more than
just hypervisors.

Sajan posted a first patch:

  [PATCH] x86: Fix for cache computation on AMD legacy cpus.
  

However, it changes the reported cache sizes on some older CPUs compared
to what we had before (although the values are no longer zero at least).

Thanks,
Florian




Re: GLibC AMD CPUID cache reporting regression (was Re: qemu-user self emulation broken with default CPU on x86/x64)

2023-07-04 Thread Pierrick Bouvier

Thanks for this deep analysis.

Even if the bug is potentially on glibc side, would that be worth to 
change something in qemu CPU description to avoid it (like changing cpuid)?


On 7/3/23 23:05, Daniel P. Berrangé wrote:

On Mon, Jul 03, 2023 at 06:03:08PM +0200, Pierrick Bouvier wrote:

Hi everyone,

Recently (in d135f781 [1], between v7.0.0 and v8.0.0), qemu-user default cpu
was updated to "max" instead of qemu32/qemu64.

This change "broke" qemu self emulation if this new default cpu is used.

$ ./qemu-x86_64 ./qemu-x86_64 --version
qemu-x86_64: ../util/cacheflush.c:212: init_cache_info: Assertion `(isize &
(isize - 1)) == 0' failed.
qemu: uncaught target signal 6 (Aborted) - core dumped
Aborted

By setting cpu back to qemu64, it works again.
$ ./qemu-x86_64 -cpu qemu64 ./qemu-x86_64  --version
qemu-x86_64 version 8.0.50 (v8.0.0-2317-ge125b08ed6)
Copyright (c) 2003-2023 Fabrice Bellard and the QEMU Project developers

Commenting assert does not work, as qemu aligned malloc fail shortly after.

I'm willing to fix it, but I'm not sure what is the issue with "max" cpu
exactly. Is it missing CPU cache line, or something else?


I've observed GLibC is issuing CPUID leaf 0x8000_001d

QEMU 'max' CPU model doesn't define xlevel, so QEMU makes it default
to the same as min_xlevel, which is calculated to be 0x8000_000a.

cpu_x86_cpuid() in QEMU sees CPUID leaf 0x8000_001d is above 0x8000_000a,
and so considers it an invalid CPUID and thus forces it to report
0x0000_000d which is supposedly what an invalid CPUID leaf should do.


Net result: glibc is asking for 0x8000_001d, but getting back data
for 0x0000_000d.

This doesn't end happily for obvious reasons, getting garbage for
the dcache sizes.


The 'qemu64' CPU model also gets CPUID leaf 0x8000_001d capped back
to 0x0000_000d, but crucially qemu64 lacks the 'xsave' feature bit,
so QEMU returns all-zeroes for CPUID leaf 0x0000_000d. Still not
good, but this makes glibc report 0 for DCACHE_*, which in turn
avoids tripping up the nested qemu which queries DCACHE sysconf.

So the problem is thus more widespread than just 'max' CPU model.

Any QEMU CPU model with vendor=AuthenticAMD and the xsave feature,
and the xlevel unset, will cause glibc to report garbage for the
L1D cache info

Any QEMU CPU model with vendor=AuthenticAMD and without the xsave
feature, and the xlevel unset, will cause glibc to report zeroes
for L1D cache info

Neither is good, but the latter at least doesn't trip up the
nested QEMU when it queries L1D cache info.

I'm unsure if QEMU's behaviour is correct with calculating the
default 'xlevel' values for 'max', but I'm assuming the xlevel
was correct for Opteron_G4/5 since those are explicitly set
in the code for a long time.

Over to the GLibC side, I see there was a recent change:

commit 103a469dc7755fd9e8ccf362f3dd4c55dc761908
Author: Sajan Karumanchi 
Date:   Wed Jan 18 18:29:04 2023 +0100

 x86: Cache computation for AMD architecture.
 
 All AMD architectures cache details will be computed based on

 __cpuid__ `0x8000_001D` and the reference to __cpuid__ `0x8000_0006` will 
be
 zeroed out for future architectures.
 
 Reviewed-by: Premachandra Mallappa 



This introduced the use of CPUID leaf 0x8000_001D. Before this point
glibc would use 0x8000_ and 0x8000_0005 to calculate the cache
size.  QEMU worked correctly with this implementation.

   https://sourceware.org/pipermail/libc-alpha/2023-January/144815.html

The reporter said

"Though we have done the testing on Zen and pre-Zen architectures,
 we recommend to carryout the tests from your end too."

it is unclear if their testing would have covered Opteron_G4/Opteron_G5
architectures, and I'm not expecting to have had QEMU testing of course?

I don't have any non-virtual pre-Zen silicon I could verify CPUID
behaviour on. I've not found historic versions of the AMD architecture
reference to see when they first documented 0x8000_001d as a valid
CPUID leaf for getting cache info.

IOW it is still unclear to me whether the root cause bug here is in
QEMU's emulation of CPUID 0x8000_001d, or whether this was actually
a real regression introduced in glibc >= 2.37

I'm tending towards glibc regression though.

Copying Florian and the original AMD patch author

Brief summary

With old glibc 2.36, using QEMU's  qemu64/max CPU models:

# qemu-x86_64-static -cpu qemu64 /bin/getconf -a | grep DCACHE
LEVEL1_DCACHE_SIZE 65536
LEVEL1_DCACHE_ASSOC2
LEVEL1_DCACHE_LINESIZE 64

# qemu-x86_64-static -cpu Opteron_G4 /bin/getconf -a | grep DCACHE
LEVEL1_DCACHE_SIZE 65536
LEVEL1_DCACHE_ASSOC2
LEVEL1_DCACHE_LINESIZE 64

# qemu-x86_64-static -cpu max /bin/getconf -a | grep DCACHE
LEVEL1_DCACHE_SIZE 65536
LEVEL1_DCACHE_ASSOC2
LEVEL1_DCACHE_LINESIZE 64


With new glibc 2.37:

# qemu-x86_64-static -cpu qemu64 /bin/getconf -a | grep DCACHE

Re: [PATCH] linux-user/syscall: Implement execve without execveat

2023-07-04 Thread Pierrick Bouvier

On 7/3/23 19:29, Michael Tokarev wrote:

03.07.2023 18:48, Pierrick Bouvier пишет:

Support for execveat syscall was implemented in 55bbe4 and is available
since QEMU 8.0.0. It relies on host execveat, which is widely available
on most of Linux kernels today.

However, this change breaks qemu-user self emulation, if "host" qemu
version is less than 8.0.0. Indeed, it does not yet implement execveat.
This strange use case happens with most of distribution today having
binfmt support.

With a concrete failing example:
$ qemu-x86_64-7.2 qemu-x86_64-8.0 /bin/bash -c /bin/ls
/bin/bash: line 1: /bin/ls: Function not implemented
-> not implemented means execve returned ENOSYS

qemu-user-static 7.2 and 8.0 can be conveniently grabbed from debian
packages qemu-user-static* [1].

One usage of this is running wine-arm64 from linux-x64 (details [2]).
This is by updating qemu embedded in docker image that we ran into this
issue.

The solution to update host qemu is not always possible. Either it's
complicated or ask you to recompile it, or simply is not accessible
(GitLab CI, GitHub Actions). Thus, it could be worth to implement execve
without relying on execveat, which is the goal of this patch.

This patch was tested with example presented in this commit message.

[1] http://ftp.us.debian.org/debian/pool/main/q/qemu/
[2] https://www.linaro.org/blog/emulate-windows-on-arm/

Signed-off-by: Pierrick Bouvier 
---
   linux-user/syscall.c | 45 +---
   1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index f2cb101d83..b64ec3296a 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -659,6 +659,7 @@ safe_syscall4(pid_t, wait4, pid_t, pid, int *, status, int, 
options, \
   #endif
   safe_syscall5(int, waitid, idtype_t, idtype, id_t, id, siginfo_t *, infop, \
 int, options, struct rusage *, rusage)
+safe_syscall3(int, execve, const char *, filename, char **, argv, char **, 
envp)
   safe_syscall5(int, execveat, int, dirfd, const char *, filename,
 char **, argv, char **, envp, int, flags)
   #if defined(TARGET_NR_select) || defined(TARGET_NR__newselect) || \
@@ -8520,9 +8521,12 @@ static int do_openat(CPUArchState *cpu_env, int dirfd, 
const char *pathname, int
   return safe_openat(dirfd, path(pathname), flags, mode);
   }
   
-static int do_execveat(CPUArchState *cpu_env, int dirfd,

-   abi_long pathname, abi_long guest_argp,
-   abi_long guest_envp, int flags)
+#define IS_EXECVEAT 0
+#define IS_EXECVE 1
+
+static int do_execv(CPUArchState *cpu_env, int dirfd,
+abi_long pathname, abi_long guest_argp,
+abi_long guest_envp, int flags, bool is_execve)
   {
   int ret;
   char **argp, **envp;
@@ -8601,10 +8605,18 @@ static int do_execveat(CPUArchState *cpu_env, int dirfd,
   goto execve_efault;
   }
   
-if (is_proc_myself(p, "exe")) {

-ret = get_errno(safe_execveat(dirfd, exec_path, argp, envp, flags));
+if (is_execve == IS_EXECVE) {


is_execve is either bool or not. I'd use it as bool, and pass true/false.
Right now it is inconsistent.



I'll update this.


+if (is_proc_myself(p, "exe")) {
+ret = get_errno(safe_execve(exec_path, argp, envp));
+} else {
+ret = get_errno(safe_execve(p, argp, envp));
+}
   } else {
-ret = get_errno(safe_execveat(dirfd, p, argp, envp, flags));
+if (is_proc_myself(p, "exe")) {
+ret = get_errno(safe_execveat(dirfd, exec_path, argp, envp, 
flags));
+} else {
+ret = get_errno(safe_execveat(dirfd, p, argp, envp, flags));
+}
   }


And this can be simplified quite a bit by using a condition on
is_proc_myself(p, "exe"):

if (is_proc_myself(p, exe)) {
   p = exec_path;
}
ret = is_execveat ?
  safe_execveat(dirfd, p, argp, envp, flags) :
  safe_execve(p, argp, envp);
ret = get_errno(ret);
...

I dunno which way Laurent might prefer, but to my taste this way it is
much more readable (give or take the proper coding style to use here, -
I don't remember how the arithmetic if should be styled).



Yes, looks good.

   
   unlock_user(p, pathname, 0);

@@ -8633,6 +8645,25 @@ execve_end:
   return ret;
   }
   
+static int do_execveat(CPUArchState *cpu_env, int dirfd,

+   abi_long pathname, abi_long guest_argp,
+   abi_long guest_envp, int flags)
+{
+return do_execv(cpu_env, dirfd,
+pathname, guest_argp, guest_envp, flags,
+IS_EXECVEAT);
+}
+
+static int do_execve(CPUArchState *cpu_env,
+ abi_long pathname, abi_long guest_argp,
+ abi_long guest_envp)
+{
+return do_execv(cpu_env, AT_FDCWD,
+pathname, guest_argp, guest_envp, 0,
+

Re: [PULL 57/62] hw/xen: Support MSI mapping to PIRQ

2023-07-04 Thread David Woodhouse
On Fri, 2023-06-23 at 14:27 +0100, Peter Maydell wrote:
> On Thu, 6 Apr 2023 at 17:25, Woodhouse, David  wrote:
> > 
> > On Thu, 2023-04-06 at 16:48 +0100, Peter Maydell wrote:
> > > On Thu, 2 Mar 2023 at 12:37, Paolo Bonzini  wrote:
> > > > 
> > > > From: David Woodhouse 
> > > > 
> > > > The way that Xen handles MSI PIRQs is kind of awful.
> > > 
> > > > Now that this is working we can finally enable XENFEAT_hvm_pirqs and
> > > > let the guest use it all.
> > > > 
> > > 
> > > Hi; Coverity points out a logic error in this code (CID 1507603):
> > > 
> > > > @@ -1638,6 +1877,7 @@ int xen_physdev_unmap_pirq(struct 
> > > > physdev_unmap_pirq *unmap)
> > > > 
> > > >  /* We can only unmap GSI PIRQs */
> > > >  if (gsi < 0) {
> > > > +    qemu_mutex_unlock(&s->port_lock);
> > > >  return -EINVAL;
> > > >  }
> > > 
> > > One of the things xen_physdev_unmap_pirq() does early is return
> > > if gsi is a negative value...
> > > 
> > > > @@ -1646,6 +1886,12 @@ int xen_physdev_unmap_pirq(struct 
> > > > physdev_unmap_pirq *unmap)
> > > >  pirq_inuse_word(s, pirq) &= ~pirq_inuse_bit(pirq);
> > > > 
> > > >  trace_kvm_xen_unmap_pirq(pirq, gsi);
> > > > +    qemu_mutex_unlock(&s->port_lock);
> > > > +
> > > > +    if (gsi == IRQ_MSI_EMU) {
> > > 
> > > ...but then later we try to test to see if it is IRQ_MSI_EMU.
> > > IRQ_MSI_EMU is -3, so this condition can never be true.
> > > 
> > > > +    kvm_update_msi_routes_all(NULL, true, 0, 0);
> > > > +    }
> > > 
> > > What was the intention here ?
> > 
> > 
> > Hrm the way that Xen automatically maps the MSI to a PIRQ by
> > snooping on the (masked) writes to the MSI target is awful, as noted.
> > 
> > I don't think Linux guests ever do unmap the MSI PIRQ but it might be
> > possible; I'll have to do some experiments in Xen to see what happens.
> 
> Did you ever figure out what the right thing here was ?

Yeah, I do I think that Xen will actually allow the guest to unmap the
IRQ_MSI_EMU pirq, and it will mask the underlying MSI when it does so,
just as it *unmasks* the MSI when the guest binds the irq.

But that's partly because Xen handles the awfulness with snooping on
the MSI table writes differently. The PIRQ isn't automatically marked
as IRQ_MSI_EMU when the MSI message is written; there's a different
structure which handles that. And the PIRQ is only set to IRQ_MSI_EMU
when it's actually *delivered*, AFAICT.

Emulating that more faithfully might actually simplify the locking in
qemu, which is fairly complex around the MSI→PIRQ→MSI call paths which
need care to avoid deadlock. I'll see if I can make it simpler.

In the short term though, I think it's harmless that we don't support
unmapping IRQ_MSI_EMU because AFAICT nobody actually *does* it. But
I'll look at fixing it anyway, and that dead code will no longer be
dead. 


smime.p7s
Description: S/MIME cryptographic signature


[PATCH] xen-block: Avoid leaks on new error path

2023-07-04 Thread Anthony PERARD via
From: Anthony PERARD 

Commit 189829399070 ("xen-block: Use specific blockdev driver")
introduced a new error path, without taking care of allocated
resources.

So only allocate the qdicts after the error check, and free both
`filename` and `driver` when we are about to return and thus taking
care of both success and error path.

Coverity only spotted the leak of qdicts (*_layer variables).

Reported-by: Peter Maydell 
Fixes: Coverity CID 1508722, 1398649
Fixes: 189829399070 ("xen-block: Use specific blockdev driver")
Signed-off-by: Anthony PERARD 
---
 hw/block/xen-block.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/hw/block/xen-block.c b/hw/block/xen-block.c
index f099914831..3906b9058b 100644
--- a/hw/block/xen-block.c
+++ b/hw/block/xen-block.c
@@ -781,14 +781,15 @@ static XenBlockDrive *xen_block_drive_create(const char 
*id,
 drive = g_new0(XenBlockDrive, 1);
 drive->id = g_strdup(id);
 
-file_layer = qdict_new();
-driver_layer = qdict_new();
-
 rc = stat(filename, );
 if (rc) {
 error_setg_errno(errp, errno, "Could not stat file '%s'", filename);
 goto done;
 }
+
+file_layer = qdict_new();
+driver_layer = qdict_new();
+
 if (S_ISBLK(st.st_mode)) {
 qdict_put_str(file_layer, "driver", "host_device");
 } else {
@@ -796,7 +797,6 @@ static XenBlockDrive *xen_block_drive_create(const char *id,
 }
 
 qdict_put_str(file_layer, "filename", filename);
-g_free(filename);
 
 if (mode && *mode != 'w') {
 qdict_put_bool(file_layer, "read-only", true);
@@ -831,7 +831,6 @@ static XenBlockDrive *xen_block_drive_create(const char *id,
 qdict_put_str(file_layer, "locking", "off");
 
 qdict_put_str(driver_layer, "driver", driver);
-g_free(driver);
 
 qdict_put(driver_layer, "file", file_layer);
 
@@ -842,6 +841,8 @@ static XenBlockDrive *xen_block_drive_create(const char *id,
 qobject_unref(driver_layer);
 
 done:
+g_free(filename);
+g_free(driver);
 if (*errp) {
 xen_block_drive_destroy(drive, NULL);
 return NULL;
-- 
Anthony PERARD




Re: [PATCH] pnv/xive: Allow mmio operations of any size on the ESB CI pages

2023-07-04 Thread Cédric Le Goater

On 7/4/23 16:48, Frederic Barrat wrote:

We currently only allow 64-bit operations on the ESB CI pages. There's
no real reason for that limitation, skiboot/linux didn't need
more. However the hardware supports any size, so this patch relaxes
that restriction. It impacts both the ESB pages for "normal"
interrupts as well as the ESB pages for escalation interrupts defined
for the ENDs.

Signed-off-by: Frederic Barrat 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---

This should wrap-up the cleanup about mmio size for the xive BARs. The
NVPG and NVC BAR accesses should also be relaxed but we don't really
implement them, any load/store currently fails. Something to address
when/if we implement them.

  hw/intc/xive.c  | 8 
  hw/intc/xive2.c | 4 ++--
  2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/intc/xive.c b/hw/intc/xive.c
index f60c878345..c014e961a4 100644
--- a/hw/intc/xive.c
+++ b/hw/intc/xive.c
@@ -1175,11 +1175,11 @@ static const MemoryRegionOps xive_source_esb_ops = {
  .write = xive_source_esb_write,
  .endianness = DEVICE_BIG_ENDIAN,
  .valid = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  .impl = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  };
@@ -2006,11 +2006,11 @@ static const MemoryRegionOps xive_end_source_ops = {
  .write = xive_end_source_write,
  .endianness = DEVICE_BIG_ENDIAN,
  .valid = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  .impl = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  };
diff --git a/hw/intc/xive2.c b/hw/intc/xive2.c
index 4d9ff41956..c37ef25d44 100644
--- a/hw/intc/xive2.c
+++ b/hw/intc/xive2.c
@@ -954,11 +954,11 @@ static const MemoryRegionOps xive2_end_source_ops = {
  .write = xive2_end_source_write,
  .endianness = DEVICE_BIG_ENDIAN,
  .valid = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  .impl = {
-.min_access_size = 8,
+.min_access_size = 1,
  .max_access_size = 8,
  },
  };





Re: [RFC PATCH 0/4] ppc: Improve multisocket support

2023-07-04 Thread Cédric Le Goater

On 7/4/23 18:20, Frederic Barrat wrote:



On 04/07/2023 15:49, Cédric Le Goater wrote:

Hello,

Here are changes improving multisocket support of the XIVE models
(POWER9 only). When a source has an END target on another chip, the
XIVE IC will use an MMIO store to forward the notification to the
remote chip. The long term plan is to get rid of pnv_xive_get_remote()
whic is a modeling shortcut. I have had them for while, they compile,
they seem to still work but this is not for merge yet. If someone
could take over, that would be nice.

The best way to test is to start a 2 sockets * 1 cpu system with devices
attached to the PCI buses of chip 0 and to offline CPU 0. All sources
should be configured to be served by CPU 1 on socket 1 and trigger
notifications on chip 0 should be forwarded to chip 1.

Last patch adds support for degenerative interrupts. This is used by
the lowest level FW of POWER systems. Difficult to test.




Thanks for the series! My crystal ball tells me the PC MMIO patch will come 
handy soon (to be adapted for P10 and groups). And the remote routing looks 
pretty interesting too.


I am glad !

I think the first 2 patches are a good addition. They remove a
qdev_get_machine() call which is ugly. P10 has the same kind of
shortcut.

C.





Re: [PATCH v4 6/6] target/riscv: select KVM AIA in riscv virt machine

2023-07-04 Thread Andrew Jones
On Wed, Jun 21, 2023 at 02:54:56PM +, Yong-Xuan Wang wrote:
> Select KVM AIA when the host kernel has in-kernel AIA chip support.
> Since KVM AIA only has one APLIC instance, we map the QEMU APLIC
> devices to KVM APLIC.
> We also extend virt machine to specify the KVM AIA mode. The "kvm-aia"
> parameter is passed along with machine name in QEMU command-line.
> 1) "kvm-aia=emul": IMSIC is emulated by hypervisor
> 2) "kvm-aia=hwaccel": use hardware guest IMSIC
> 3) "kvm-aia=auto": use the hardware guest IMSICs whenever available
>otherwise we fallback to software emulation.
> 
> Signed-off-by: Yong-Xuan Wang 
> Reviewed-by: Jim Shu 
> ---
>  hw/riscv/virt.c | 92 ++---
>  include/hw/riscv/virt.h |  1 +
>  2 files changed, 79 insertions(+), 14 deletions(-)
> 
> diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c
> index 4a1d29a741..efa176a184 100644
> --- a/hw/riscv/virt.c
> +++ b/hw/riscv/virt.c
> @@ -35,6 +35,7 @@
>  #include "hw/riscv/virt.h"
>  #include "hw/riscv/boot.h"
>  #include "hw/riscv/numa.h"
> +#include "kvm_riscv.h"
>  #include "hw/intc/riscv_aclint.h"
>  #include "hw/intc/riscv_aplic.h"
>  #include "hw/intc/riscv_imsic.h"
> @@ -74,6 +75,12 @@
>  #error "Can't accomodate all IMSIC groups in address space"
>  #endif
>  
> +/* KVM AIA only supports APLIC.m. APLIC.w is always emulated by QEMU. */

"APLIC MSI" and "APLIC Wired"

> +static bool virt_use_kvm_aia(RISCVVirtState *s)
> +{
> +return kvm_irqchip_in_kernel() && s->aia_type == 
> VIRT_AIA_TYPE_APLIC_IMSIC;
> +}
> +
>  static const MemMapEntry virt_memmap[] = {
>  [VIRT_DEBUG] ={0x0, 0x100 },
>  [VIRT_MROM] = { 0x1000,0xf000 },
> @@ -642,7 +649,8 @@ static void create_fdt_socket_aplic(RISCVVirtState *s,
>  uint32_t msi_s_phandle,
>  uint32_t *phandle,
>  uint32_t *intc_phandles,
> -uint32_t *aplic_phandles)
> +uint32_t *aplic_phandles,
> +int num_harts)
>  {
>  int cpu;
>  char *aplic_name;
> @@ -653,11 +661,11 @@ static void create_fdt_socket_aplic(RISCVVirtState *s,
>  
>  aplic_m_phandle = (*phandle)++;
>  aplic_s_phandle = (*phandle)++;
> -aplic_cells = g_new0(uint32_t, s->soc[socket].num_harts * 2);
> +aplic_cells = g_new0(uint32_t, num_harts * 2);
>  
>  if (!kvm_enabled()) {
>  /* M-level APLIC node */
> -for (cpu = 0; cpu < s->soc[socket].num_harts; cpu++) {
> +for (cpu = 0; cpu < num_harts; cpu++) {
>  aplic_cells[cpu * 2 + 0] = cpu_to_be32(intc_phandles[cpu]);
>  aplic_cells[cpu * 2 + 1] = cpu_to_be32(IRQ_M_EXT);
>  }

There are a couple other instances of s->soc[socket].num_harts in this
function that can be changed to num_harts.

> @@ -691,7 +699,7 @@ static void create_fdt_socket_aplic(RISCVVirtState *s,
>  }
>  
>  /* S-level APLIC node */
> -for (cpu = 0; cpu < s->soc[socket].num_harts; cpu++) {
> +for (cpu = 0; cpu < num_harts; cpu++) {
>  aplic_cells[cpu * 2 + 0] = cpu_to_be32(intc_phandles[cpu]);
>  aplic_cells[cpu * 2 + 1] = cpu_to_be32(IRQ_S_EXT);
>  }
> @@ -798,17 +806,25 @@ static void create_fdt_sockets(RISCVVirtState *s, const 
> MemMapEntry *memmap,
>  *msi_pcie_phandle = msi_s_phandle;
>  }
>  
> -phandle_pos = ms->smp.cpus;
> -for (socket = (socket_count - 1); socket >= 0; socket--) {
> -phandle_pos -= s->soc[socket].num_harts;
> +/* KVM AIA only has one APLIC instance */
> +if (virt_use_kvm_aia(s)) {
> +create_fdt_socket_aplic(s, memmap, 0,
> +msi_m_phandle, msi_s_phandle, phandle,
> +_phandles[0], xplic_phandles, ms->smp.cpus);
> +} else {
> +phandle_pos = ms->smp.cpus;
> +for (socket = (socket_count - 1); socket >= 0; socket--) {
> +phandle_pos -= s->soc[socket].num_harts;
>  
> -if (s->aia_type == VIRT_AIA_TYPE_NONE) {
> -create_fdt_socket_plic(s, memmap, socket, phandle,
> -_phandles[phandle_pos], xplic_phandles);
> -} else {
> -create_fdt_socket_aplic(s, memmap, socket,
> -msi_m_phandle, msi_s_phandle, phandle,
> -_phandles[phandle_pos], xplic_phandles);
> +if (s->aia_type == VIRT_AIA_TYPE_NONE) {
> +create_fdt_socket_plic(s, memmap, socket, phandle,
> +_phandles[phandle_pos], xplic_phandles);
> +} else {
> +create_fdt_socket_aplic(s, memmap, socket,
> +msi_m_phandle, msi_s_phandle, phandle,
> +_phandles[phandle_pos], xplic_phandles,
> +s->soc[socket].num_harts);
> +}
>  }
>  }
>  
> @@ -819,6 +835,9 @@ static void 

[PULL 02/11] hw/arm/sbsa-ref: use XHCI to replace EHCI

2023-07-04 Thread Peter Maydell
From: Yuquan Wang 

The current sbsa-ref cannot use EHCI controller which is only
able to do 32-bit DMA, since sbsa-ref doesn't have RAM below 4GB.
Hence, this uses XHCI to provide a usb controller with 64-bit
DMA capability instead of EHCI.

We bump the platform version to 0.3 with this change.  Although the
hardware at the USB controller address changes, the firmware and
Linux can both cope with this -- on an older non-XHCI-aware
firmware/kernel setup the probe routine simply fails and the guest
proceeds without any USB.  (This isn't a loss of functionality,
because the old USB controller never worked in the first place.) So
we can call this a backwards-compatible change and only bump the
minor version.

Signed-off-by: Yuquan Wang 
Message-id: 20230621103847.447508-2-wangyuquan1...@phytium.com.cn
[PMM: tweaked commit message; add line to docs about what
 changes in platform version 0.3]
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 docs/system/arm/sbsa.rst |  5 -
 hw/arm/sbsa-ref.c| 23 +--
 hw/arm/Kconfig   |  2 +-
 3 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/docs/system/arm/sbsa.rst b/docs/system/arm/sbsa.rst
index a8e0b530a24..bca61608ff8 100644
--- a/docs/system/arm/sbsa.rst
+++ b/docs/system/arm/sbsa.rst
@@ -19,7 +19,7 @@ The ``sbsa-ref`` board supports:
   - A configurable number of AArch64 CPUs
   - GIC version 3
   - System bus AHCI controller
-  - System bus EHCI controller
+  - System bus XHCI controller
   - CDROM and hard disc on AHCI bus
   - E1000E ethernet card on PCIe bus
   - Bochs display adapter on PCIe bus
@@ -68,3 +68,6 @@ Platform version changes:
 
 0.2
   GIC ITS information is present in devicetree.
+
+0.3
+  The USB controller is an XHCI device, not EHCI
diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c
index 82a28b2e0be..1a8519b868f 100644
--- a/hw/arm/sbsa-ref.c
+++ b/hw/arm/sbsa-ref.c
@@ -42,6 +42,7 @@
 #include "hw/pci-host/gpex.h"
 #include "hw/qdev-properties.h"
 #include "hw/usb.h"
+#include "hw/usb/xhci.h"
 #include "hw/char/pl011.h"
 #include "hw/watchdog/sbsa_gwdt.h"
 #include "net/net.h"
@@ -85,7 +86,7 @@ enum {
 SBSA_SECURE_UART_MM,
 SBSA_SECURE_MEM,
 SBSA_AHCI,
-SBSA_EHCI,
+SBSA_XHCI,
 };
 
 struct SBSAMachineState {
@@ -123,7 +124,7 @@ static const MemMapEntry sbsa_ref_memmap[] = {
 [SBSA_SMMU] =   { 0x6005, 0x0002 },
 /* Space here reserved for more SMMUs */
 [SBSA_AHCI] =   { 0x6010, 0x0001 },
-[SBSA_EHCI] =   { 0x6011, 0x0001 },
+[SBSA_XHCI] =   { 0x6011, 0x0001 },
 /* Space here reserved for other devices */
 [SBSA_PCIE_PIO] =   { 0x7fff, 0x0001 },
 /* 32-bit address PCIE MMIO space */
@@ -143,7 +144,7 @@ static const int sbsa_ref_irqmap[] = {
 [SBSA_SECURE_UART] = 8,
 [SBSA_SECURE_UART_MM] = 9,
 [SBSA_AHCI] = 10,
-[SBSA_EHCI] = 11,
+[SBSA_XHCI] = 11,
 [SBSA_SMMU] = 12, /* ... to 15 */
 [SBSA_GWDT_WS0] = 16,
 };
@@ -230,7 +231,7 @@ static void create_fdt(SBSAMachineState *sms)
  *fw compatibility.
  */
 qemu_fdt_setprop_cell(fdt, "/", "machine-version-major", 0);
-qemu_fdt_setprop_cell(fdt, "/", "machine-version-minor", 2);
+qemu_fdt_setprop_cell(fdt, "/", "machine-version-minor", 3);
 
 if (ms->numa_state->have_numa_distance) {
 int size = nb_numa_nodes * nb_numa_nodes * 3 * sizeof(uint32_t);
@@ -604,13 +605,15 @@ static void create_ahci(const SBSAMachineState *sms)
 }
 }
 
-static void create_ehci(const SBSAMachineState *sms)
+static void create_xhci(const SBSAMachineState *sms)
 {
-hwaddr base = sbsa_ref_memmap[SBSA_EHCI].base;
-int irq = sbsa_ref_irqmap[SBSA_EHCI];
+hwaddr base = sbsa_ref_memmap[SBSA_XHCI].base;
+int irq = sbsa_ref_irqmap[SBSA_XHCI];
+DeviceState *dev = qdev_new(TYPE_XHCI_SYSBUS);
 
-sysbus_create_simple("platform-ehci-usb", base,
- qdev_get_gpio_in(sms->gic, irq));
+sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), _fatal);
+sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, base);
+sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, qdev_get_gpio_in(sms->gic, 
irq));
 }
 
 static void create_smmu(const SBSAMachineState *sms, PCIBus *bus)
@@ -832,7 +835,7 @@ static void sbsa_ref_init(MachineState *machine)
 
 create_ahci(sms);
 
-create_ehci(sms);
+create_xhci(sms);
 
 create_pcie(sms);
 
diff --git a/hw/arm/Kconfig b/hw/arm/Kconfig
index 7de17d1e8c3..7e683484405 100644
--- a/hw/arm/Kconfig
+++ b/hw/arm/Kconfig
@@ -266,7 +266,7 @@ config SBSA_REF
 select PL011 # UART
 select PL031 # RTC
 select PL061 # GPIO
-select USB_EHCI_SYSBUS
+select USB_XHCI_SYSBUS
 select WDT_SBSA
 select BOCHS_DISPLAY
 
-- 
2.34.1




[PULL 07/11] tests/tcg/aarch64: Add testcases for IC IVAU and dual-mapped code

2023-07-04 Thread Peter Maydell
From: John Högberg 

https://gitlab.com/qemu-project/qemu/-/issues/1034

Signed-off-by: John Högberg 
Message-id: 168778890374.24232.340213885153806878...@git.sr.ht
Reviewed-by: Peter Maydell 
[PMM: fixed typo in comment]
Signed-off-by: Peter Maydell 
---
 tests/tcg/aarch64/icivau.c| 189 ++
 tests/tcg/aarch64/Makefile.target |   3 +-
 2 files changed, 191 insertions(+), 1 deletion(-)
 create mode 100644 tests/tcg/aarch64/icivau.c

diff --git a/tests/tcg/aarch64/icivau.c b/tests/tcg/aarch64/icivau.c
new file mode 100644
index 000..77b9e98d5e5
--- /dev/null
+++ b/tests/tcg/aarch64/icivau.c
@@ -0,0 +1,189 @@
+/*
+ * Tests the IC IVAU-driven workaround for catching changes made to dual-mapped
+ * code that would otherwise go unnoticed in user mode.
+ *
+ * Copyright (c) 2023 Ericsson AB
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define MAX_CODE_SIZE 128
+
+typedef int (SelfModTest)(uint32_t, uint32_t*);
+typedef int (BasicTest)(int);
+
+static void mark_code_modified(const uint32_t *exec_data, size_t length)
+{
+int dc_required, ic_required;
+unsigned long ctr_el0;
+
+/*
+ * Clear the data/instruction cache, as indicated by the CTR_ELO.{DIC,IDC}
+ * flags.
+ *
+ * For completeness we might be tempted to assert that we should fail when
+ * the whole code update sequence is omitted, but that would make the test
+ * flaky as it can succeed by coincidence on actual hardware.
+ */
+asm ("mrs %0, ctr_el0\n" : "=r"(ctr_el0));
+
+/* CTR_EL0.IDC */
+dc_required = !((ctr_el0 >> 28) & 1);
+
+/* CTR_EL0.DIC */
+ic_required = !((ctr_el0 >> 29) & 1);
+
+if (dc_required) {
+size_t dcache_stride, i;
+
+/*
+ * Step according to the minimum cache size, as the cache maintenance
+ * instructions operate on the cache line of the given address.
+ *
+ * We assume that exec_data is properly aligned.
+ */
+dcache_stride = (4 << ((ctr_el0 >> 16) & 0xF));
+
+for (i = 0; i < length; i += dcache_stride) {
+const char *dc_addr = &((const char *)exec_data)[i];
+asm volatile ("dc cvau, %x[dc_addr]\n"
+  : /* no outputs */
+  : [dc_addr] "r"(dc_addr)
+  : "memory");
+}
+
+asm volatile ("dmb ish\n");
+}
+
+if (ic_required) {
+size_t icache_stride, i;
+
+icache_stride = (4 << (ctr_el0 & 0xF));
+
+for (i = 0; i < length; i += icache_stride) {
+const char *ic_addr = &((const char *)exec_data)[i];
+asm volatile ("ic ivau, %x[ic_addr]\n"
+  : /* no outputs */
+  : [ic_addr] "r"(ic_addr)
+  : "memory");
+}
+
+asm volatile ("dmb ish\n");
+}
+
+asm volatile ("isb sy\n");
+}
+
+static int basic_test(uint32_t *rw_data, const uint32_t *exec_data)
+{
+/*
+ * As user mode only misbehaved for dual-mapped code when previously
+ * translated code had been changed, we'll start off with this basic test
+ * function to ensure that there's already some translated code at
+ * exec_data before the next test. This should cause the next test to fail
+ * if `mark_code_modified` fails to invalidate the code.
+ *
+ * Note that the payload is in binary form instead of inline assembler
+ * because we cannot use __attribute__((naked)) on this platform and the
+ * workarounds are at least as ugly as this is.
+ */
+static const uint32_t basic_payload[] = {
+0xD65F03C0 /* 0x00: RET */
+};
+
+BasicTest *copied_ptr = (BasicTest *)exec_data;
+
+memcpy(rw_data, basic_payload, sizeof(basic_payload));
+mark_code_modified(exec_data, sizeof(basic_payload));
+
+return copied_ptr(1234) == 1234;
+}
+
+static int self_modification_test(uint32_t *rw_data, const uint32_t *exec_data)
+{
+/*
+ * This test is self-modifying in an attempt to cover an edge case where
+ * the IC IVAU instruction invalidates itself.
+ *
+ * Note that the IC IVAU instruction is 16 bytes into the function, in what
+ * will be the same cache line as the modified instruction on machines with
+ * a cache line size >= 16 bytes.
+ */
+static const uint32_t self_mod_payload[] = {
+/* Overwrite the placeholder instruction with the new one. */
+0xB9001C20, /* 0x00: STR w0, [x1, 0x1C] */
+
+/* Get the executable address of the modified instruction. */
+0x10A8, /* 0x04: ADR x8, <0x1C> */
+
+/* Mark the modified instruction as updated. */
+0xD50B7B28, /* 0x08: DC CVAU x8 */
+0xD5033BBF, /* 0x0C: DMB ISH */
+0xD50B7528, /* 0x10: IC IVAU x8 */
+0xD5033BBF, /* 0x14: DMB ISH */
+0xD5033FDF, /* 0x18: ISB */
+
+   

[PULL 04/11] target/arm: Dump ZA[] when active

2023-07-04 Thread Peter Maydell
From: Richard Henderson 

Always print each matrix row whole, one per line, so that we
get the entire matrix in the proper shape.

Signed-off-by: Richard Henderson 
Message-id: 20230622151201.1578522-3-richard.hender...@linaro.org
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 target/arm/cpu.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index f12c714bc43..adf84f96860 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -1082,6 +1082,24 @@ static void aarch64_cpu_dump_state(CPUState *cs, FILE 
*f, int flags)
  i, q[1], q[0], (i & 1 ? "\n" : " "));
 }
 }
+
+if (cpu_isar_feature(aa64_sme, cpu) &&
+FIELD_EX64(env->svcr, SVCR, ZA) &&
+sme_exception_el(env, el) == 0) {
+int zcr_len = sve_vqm1_for_el_sm(env, el, true);
+int svl = (zcr_len + 1) * 16;
+int svl_lg10 = svl < 100 ? 2 : 3;
+
+for (i = 0; i < svl; i++) {
+qemu_fprintf(f, "ZA[%0*d]=", svl_lg10, i);
+for (j = zcr_len; j >= 0; --j) {
+qemu_fprintf(f, "%016" PRIx64 ":%016" PRIx64 "%c",
+ env->zarray[i].d[2 * j + 1],
+ env->zarray[i].d[2 * j],
+ j ? ':' : '\n');
+}
+}
+}
 }
 
 #else
-- 
2.34.1




[PULL 08/11] tests/qtest: xlnx-canfd-test: Fix code coverity issues

2023-07-04 Thread Peter Maydell
From: Vikram Garhwal 

Following are done to fix the coverity issues:
1. Change read_data to fix the CID 1512899: Out-of-bounds access (OVERRUN)
2. Fix match_rx_tx_data to fix CID 1512900: Logically dead code (DEADCODE)
3. Replace rand() in generate_random_data() with g_rand_int()

Signed-off-by: Vikram Garhwal 
Message-id: 20230628202758.16398-1-vikram.garh...@amd.com
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 tests/qtest/xlnx-canfd-test.c | 33 +++--
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/tests/qtest/xlnx-canfd-test.c b/tests/qtest/xlnx-canfd-test.c
index 76ee106d4f4..78ec9ef2a76 100644
--- a/tests/qtest/xlnx-canfd-test.c
+++ b/tests/qtest/xlnx-canfd-test.c
@@ -170,23 +170,23 @@ static void generate_random_data(uint32_t *buf_tx, bool 
is_canfd_frame)
 /* Generate random TX data for CANFD frame. */
 if (is_canfd_frame) {
 for (int i = 0; i < CANFD_FRAME_SIZE - 2; i++) {
-buf_tx[2 + i] = rand();
+buf_tx[2 + i] = g_random_int();
 }
 } else {
 /* Generate random TX data for CAN frame. */
 for (int i = 0; i < CAN_FRAME_SIZE - 2; i++) {
-buf_tx[2 + i] = rand();
+buf_tx[2 + i] = g_random_int();
 }
 }
 }
 
-static void read_data(QTestState *qts, uint64_t can_base_addr, uint32_t 
*buf_rx)
+static void read_data(QTestState *qts, uint64_t can_base_addr, uint32_t 
*buf_rx,
+  uint32_t frame_size)
 {
 uint32_t int_status;
 uint32_t fifo_status_reg_value;
 /* At which RX FIFO the received data is stored. */
 uint8_t store_ind = 0;
-bool is_canfd_frame = false;
 
 /* Read the interrupt on CANFD rx. */
 int_status = qtest_readl(qts, can_base_addr + R_ISR_OFFSET) & ISR_RXOK;
@@ -207,16 +207,9 @@ static void read_data(QTestState *qts, uint64_t 
can_base_addr, uint32_t *buf_rx)
 buf_rx[0] = qtest_readl(qts, can_base_addr + R_RX0_ID_OFFSET);
 buf_rx[1] = qtest_readl(qts, can_base_addr + R_RX0_DLC_OFFSET);
 
-is_canfd_frame = (buf_rx[1] >> DLC_FD_BIT_SHIFT) & 1;
-
-if (is_canfd_frame) {
-for (int i = 0; i < CANFD_FRAME_SIZE - 2; i++) {
-buf_rx[i + 2] = qtest_readl(qts,
-can_base_addr + R_RX0_DATA1_OFFSET + 4 * 
i);
-}
-} else {
-buf_rx[2] = qtest_readl(qts, can_base_addr + R_RX0_DATA1_OFFSET);
-buf_rx[3] = qtest_readl(qts, can_base_addr + R_RX0_DATA2_OFFSET);
+for (int i = 0; i < frame_size - 2; i++) {
+buf_rx[i + 2] = qtest_readl(qts,
+can_base_addr + R_RX0_DATA1_OFFSET + 4 * i);
 }
 
 /* Clear the RX interrupt. */
@@ -272,10 +265,6 @@ static void match_rx_tx_data(const uint32_t *buf_tx, const 
uint32_t *buf_rx,
 g_assert_cmpint((buf_rx[size] & DLC_FD_BIT_MASK), ==,
 (buf_tx[size] & DLC_FD_BIT_MASK));
 } else {
-if (!is_canfd_frame && size == 4) {
-break;
-}
-
 g_assert_cmpint(buf_rx[size], ==, buf_tx[size]);
 }
 
@@ -318,7 +307,7 @@ static void test_can_data_transfer(void)
 write_data(qts, CANFD0_BASE_ADDR, buf_tx, false);
 
 send_data(qts, CANFD0_BASE_ADDR);
-read_data(qts, CANFD1_BASE_ADDR, buf_rx);
+read_data(qts, CANFD1_BASE_ADDR, buf_rx, CAN_FRAME_SIZE);
 match_rx_tx_data(buf_tx, buf_rx, false);
 
 qtest_quit(qts);
@@ -358,7 +347,7 @@ static void test_canfd_data_transfer(void)
 write_data(qts, CANFD0_BASE_ADDR, buf_tx, true);
 
 send_data(qts, CANFD0_BASE_ADDR);
-read_data(qts, CANFD1_BASE_ADDR, buf_rx);
+read_data(qts, CANFD1_BASE_ADDR, buf_rx, CANFD_FRAME_SIZE);
 match_rx_tx_data(buf_tx, buf_rx, true);
 
 qtest_quit(qts);
@@ -397,7 +386,7 @@ static void test_can_loopback(void)
 write_data(qts, CANFD0_BASE_ADDR, buf_tx, true);
 
 send_data(qts, CANFD0_BASE_ADDR);
-read_data(qts, CANFD0_BASE_ADDR, buf_rx);
+read_data(qts, CANFD0_BASE_ADDR, buf_rx, CANFD_FRAME_SIZE);
 match_rx_tx_data(buf_tx, buf_rx, true);
 
 generate_random_data(buf_tx, true);
@@ -405,7 +394,7 @@ static void test_can_loopback(void)
 write_data(qts, CANFD1_BASE_ADDR, buf_tx, true);
 
 send_data(qts, CANFD1_BASE_ADDR);
-read_data(qts, CANFD1_BASE_ADDR, buf_rx);
+read_data(qts, CANFD1_BASE_ADDR, buf_rx, CANFD_FRAME_SIZE);
 match_rx_tx_data(buf_tx, buf_rx, true);
 
 qtest_quit(qts);
-- 
2.34.1




[PULL 10/11] hw: arm: allwinner-sramc: Set class_size

2023-07-04 Thread Peter Maydell
From: Akihiko Odaki 

AwSRAMCClass is larger than SysBusDeviceClass so the class size must be
advertised accordingly.

Fixes: 05def917e1 ("hw: arm: allwinner-sramc: Add SRAM Controller support for 
R40")
Signed-off-by: Akihiko Odaki 
Reviewed-by: Philippe Mathieu-Daudé 
Reviewed-by: Richard Henderson 
Message-id: 20230628110905.38125-1-akihiko.od...@daynix.com
Signed-off-by: Peter Maydell 
---
 hw/misc/allwinner-sramc.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/hw/misc/allwinner-sramc.c b/hw/misc/allwinner-sramc.c
index a8b731f8f28..d76c24d081f 100644
--- a/hw/misc/allwinner-sramc.c
+++ b/hw/misc/allwinner-sramc.c
@@ -159,6 +159,7 @@ static const TypeInfo allwinner_sramc_info = {
 .parent= TYPE_SYS_BUS_DEVICE,
 .instance_init = allwinner_sramc_init,
 .instance_size = sizeof(AwSRAMCState),
+.class_size= sizeof(AwSRAMCClass),
 .class_init= allwinner_sramc_class_init,
 };
 
-- 
2.34.1




[PULL 03/11] target/arm: Avoid splitting Zregs across lines in dump

2023-07-04 Thread Peter Maydell
From: Richard Henderson 

Allow the line length to extend to 548 columns.  While annoyingly wide,
it's still less confusing than the continuations we print.  Also, the
default VL used by Linux (and max for A64FX) uses only 140 columns.

Signed-off-by: Richard Henderson 
Message-id: 20230622151201.1578522-2-richard.hender...@linaro.org
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 target/arm/cpu.c | 36 ++--
 1 file changed, 14 insertions(+), 22 deletions(-)

diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index a1e77698ba2..f12c714bc43 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -955,7 +955,7 @@ static void aarch64_cpu_dump_state(CPUState *cs, FILE *f, 
int flags)
 ARMCPU *cpu = ARM_CPU(cs);
 CPUARMState *env = >env;
 uint32_t psr = pstate_read(env);
-int i;
+int i, j;
 int el = arm_current_el(env);
 const char *ns_status;
 bool sve;
@@ -1014,7 +1014,7 @@ static void aarch64_cpu_dump_state(CPUState *cs, FILE *f, 
int flags)
 }
 
 if (sve) {
-int j, zcr_len = sve_vqm1_for_el(env, el);
+int zcr_len = sve_vqm1_for_el(env, el);
 
 for (i = 0; i <= FFR_PRED_NUM; i++) {
 bool eol;
@@ -1054,32 +1054,24 @@ static void aarch64_cpu_dump_state(CPUState *cs, FILE 
*f, int flags)
 }
 }
 
-for (i = 0; i < 32; i++) {
-if (zcr_len == 0) {
+if (zcr_len == 0) {
+/*
+ * With vl=16, there are only 37 columns per register,
+ * so output two registers per line.
+ */
+for (i = 0; i < 32; i++) {
 qemu_fprintf(f, "Z%02d=%016" PRIx64 ":%016" PRIx64 "%s",
  i, env->vfp.zregs[i].d[1],
  env->vfp.zregs[i].d[0], i & 1 ? "\n" : " ");
-} else if (zcr_len == 1) {
-qemu_fprintf(f, "Z%02d=%016" PRIx64 ":%016" PRIx64
- ":%016" PRIx64 ":%016" PRIx64 "\n",
- i, env->vfp.zregs[i].d[3], env->vfp.zregs[i].d[2],
- env->vfp.zregs[i].d[1], env->vfp.zregs[i].d[0]);
-} else {
+}
+} else {
+for (i = 0; i < 32; i++) {
+qemu_fprintf(f, "Z%02d=", i);
 for (j = zcr_len; j >= 0; j--) {
-bool odd = (zcr_len - j) % 2 != 0;
-if (j == zcr_len) {
-qemu_fprintf(f, "Z%02d[%x-%x]=", i, j, j - 1);
-} else if (!odd) {
-if (j > 0) {
-qemu_fprintf(f, "   [%x-%x]=", j, j - 1);
-} else {
-qemu_fprintf(f, " [%x]=", j);
-}
-}
 qemu_fprintf(f, "%016" PRIx64 ":%016" PRIx64 "%s",
  env->vfp.zregs[i].d[j * 2 + 1],
- env->vfp.zregs[i].d[j * 2],
- odd || j == 0 ? "\n" : ":");
+ env->vfp.zregs[i].d[j * 2 + 0],
+ j ? ":" : "\n");
 }
 }
 }
-- 
2.34.1




[PULL 05/11] target/arm: Fix SME full tile indexing

2023-07-04 Thread Peter Maydell
From: Richard Henderson 

For the outer product set of insns, which take an entire matrix
tile as output, the argument is not a combined tile+column.
Therefore using get_tile_rowcol was incorrect, as we extracted
the tile number from itself.

The test case relies only on assembler support for SME, since
no release of GCC recognizes -march=armv9-a+sme yet.

Cc: qemu-sta...@nongnu.org
Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1620
Signed-off-by: Richard Henderson 
Message-id: 20230622151201.1578522-5-richard.hender...@linaro.org
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 target/arm/tcg/translate-sme.c| 24 ++---
 tests/tcg/aarch64/sme-outprod1.c  | 83 +++
 tests/tcg/aarch64/Makefile.target | 10 ++--
 3 files changed, 108 insertions(+), 9 deletions(-)
 create mode 100644 tests/tcg/aarch64/sme-outprod1.c

diff --git a/target/arm/tcg/translate-sme.c b/target/arm/tcg/translate-sme.c
index d0054e3f775..6038b0a06f1 100644
--- a/target/arm/tcg/translate-sme.c
+++ b/target/arm/tcg/translate-sme.c
@@ -95,6 +95,21 @@ static TCGv_ptr get_tile_rowcol(DisasContext *s, int esz, 
int rs,
 return addr;
 }
 
+/*
+ * Resolve tile.size[0] to a host pointer.
+ * Used by e.g. outer product insns where we require the entire tile.
+ */
+static TCGv_ptr get_tile(DisasContext *s, int esz, int tile)
+{
+TCGv_ptr addr = tcg_temp_new_ptr();
+int offset;
+
+offset = tile * sizeof(ARMVectorReg) + offsetof(CPUARMState, zarray);
+
+tcg_gen_addi_ptr(addr, cpu_env, offset);
+return addr;
+}
+
 static bool trans_ZERO(DisasContext *s, arg_ZERO *a)
 {
 if (!dc_isar_feature(aa64_sme, s)) {
@@ -260,8 +275,7 @@ static bool do_adda(DisasContext *s, arg_adda *a, MemOp esz,
 return true;
 }
 
-/* Sum XZR+zad to find ZAd. */
-za = get_tile_rowcol(s, esz, 31, a->zad, false);
+za = get_tile(s, esz, a->zad);
 zn = vec_full_reg_ptr(s, a->zn);
 pn = pred_full_reg_ptr(s, a->pn);
 pm = pred_full_reg_ptr(s, a->pm);
@@ -286,8 +300,7 @@ static bool do_outprod(DisasContext *s, arg_op *a, MemOp 
esz,
 return true;
 }
 
-/* Sum XZR+zad to find ZAd. */
-za = get_tile_rowcol(s, esz, 31, a->zad, false);
+za = get_tile(s, esz, a->zad);
 zn = vec_full_reg_ptr(s, a->zn);
 zm = vec_full_reg_ptr(s, a->zm);
 pn = pred_full_reg_ptr(s, a->pn);
@@ -308,8 +321,7 @@ static bool do_outprod_fpst(DisasContext *s, arg_op *a, 
MemOp esz,
 return true;
 }
 
-/* Sum XZR+zad to find ZAd. */
-za = get_tile_rowcol(s, esz, 31, a->zad, false);
+za = get_tile(s, esz, a->zad);
 zn = vec_full_reg_ptr(s, a->zn);
 zm = vec_full_reg_ptr(s, a->zm);
 pn = pred_full_reg_ptr(s, a->pn);
diff --git a/tests/tcg/aarch64/sme-outprod1.c b/tests/tcg/aarch64/sme-outprod1.c
new file mode 100644
index 000..6e5972d75e3
--- /dev/null
+++ b/tests/tcg/aarch64/sme-outprod1.c
@@ -0,0 +1,83 @@
+/*
+ * SME outer product, 1 x 1.
+ * SPDX-License-Identifier: GPL-2.0-or-later
+ */
+
+#include 
+
+extern void foo(float *dst);
+
+asm(
+"  .arch_extension sme\n"
+"  .type foo, @function\n"
+"foo:\n"
+"  stp x29, x30, [sp, -80]!\n"
+"  mov x29, sp\n"
+"  stp d8, d9, [sp, 16]\n"
+"  stp d10, d11, [sp, 32]\n"
+"  stp d12, d13, [sp, 48]\n"
+"  stp d14, d15, [sp, 64]\n"
+"  smstart\n"
+"  ptrue p0.s, vl4\n"
+"  fmov z0.s, #1.0\n"
+/*
+ * An outer product of a vector of 1.0 by itself should be a matrix of 1.0.
+ * Note that we are using tile 1 here (za1.s) rather than tile 0.
+ */
+"  zero {za}\n"
+"  fmopa za1.s, p0/m, p0/m, z0.s, z0.s\n"
+/*
+ * Read the first 4x4 sub-matrix of elements from tile 1:
+ * Note that za1h should be interchangable here.
+ */
+"  mov w12, #0\n"
+"  mova z0.s, p0/m, za1v.s[w12, #0]\n"
+"  mova z1.s, p0/m, za1v.s[w12, #1]\n"
+"  mova z2.s, p0/m, za1v.s[w12, #2]\n"
+"  mova z3.s, p0/m, za1v.s[w12, #3]\n"
+/*
+ * And store them to the input pointer (dst in the C code):
+ */
+"  st1w {z0.s}, p0, [x0]\n"
+"  add x0, x0, #16\n"
+"  st1w {z1.s}, p0, [x0]\n"
+"  add x0, x0, #16\n"
+"  st1w {z2.s}, p0, [x0]\n"
+"  add x0, x0, #16\n"
+"  st1w {z3.s}, p0, [x0]\n"
+"  smstop\n"
+"  ldp d8, d9, [sp, 16]\n"
+"  ldp d10, d11, [sp, 32]\n"
+"  ldp d12, d13, [sp, 48]\n"
+"  ldp d14, d15, [sp, 64]\n"
+"  ldp x29, x30, [sp], 80\n"
+"  ret\n"
+"  .size foo, . - foo"
+);
+
+int main()
+{
+float dst[16];
+int i, j;
+
+foo(dst);
+
+for (i = 0; i < 16; i++) {
+if (dst[i] != 1.0f) {
+break;
+}
+}
+
+if (i == 16) {
+return 0; /* success */
+}
+
+/* failure */
+for (i = 0; i < 4; ++i) {
+for (j = 0; j < 4; ++j) {
+printf("%f ", (double)dst[i * 4 + j]);
+}
+printf("\n");
+}
+return 1;
+}
diff --git a/tests/tcg/aarch64/Makefile.target 
b/tests/tcg/aarch64/Makefile.target
index 

[PULL 00/11] target-arm queue

2023-07-04 Thread Peter Maydell
Just a collection of bug fixes this time around...

thanks
-- PMM

The following changes since commit 2a6ae69154542caa91dd17c40fd3f5ffbec300de:

  Merge tag 'pull-maintainer-ominbus-030723-1' of 
https://gitlab.com/stsquad/qemu into staging (2023-07-04 08:36:44 +0200)

are available in the Git repository at:

  https://git.linaro.org/people/pmaydell/qemu-arm.git 
tags/pull-target-arm-20230704

for you to fetch changes up to 86a78272f094857b4eda79d721c116e93942aa9a:

  target/xtensa: Assert that interrupt level is within bounds (2023-07-04 
14:27:08 +0100)


target-arm queue:
 * Add raw_writes ops for register whose write induce TLB maintenance
 * hw/arm/sbsa-ref: use XHCI to replace EHCI
 * Avoid splitting Zregs across lines in dump
 * Dump ZA[] when active
 * Fix SME full tile indexing
 * Handle IC IVAU to improve compatibility with JITs
 * xlnx-canfd-test: Fix code coverity issues
 * gdbstub: Guard M-profile code with CONFIG_TCG
 * allwinner-sramc: Set class_size
 * target/xtensa: Assert that interrupt level is within bounds


Akihiko Odaki (1):
  hw: arm: allwinner-sramc: Set class_size

Eric Auger (1):
  target/arm: Add raw_writes ops for register whose write induce TLB 
maintenance

Fabiano Rosas (1):
  target/arm: gdbstub: Guard M-profile code with CONFIG_TCG

John Högberg (2):
  target/arm: Handle IC IVAU to improve compatibility with JITs
  tests/tcg/aarch64: Add testcases for IC IVAU and dual-mapped code

Peter Maydell (1):
  target/xtensa: Assert that interrupt level is within bounds

Richard Henderson (3):
  target/arm: Avoid splitting Zregs across lines in dump
  target/arm: Dump ZA[] when active
  target/arm: Fix SME full tile indexing

Vikram Garhwal (1):
  tests/qtest: xlnx-canfd-test: Fix code coverity issues

Yuquan Wang (1):
  hw/arm/sbsa-ref: use XHCI to replace EHCI

 docs/system/arm/sbsa.rst  |   5 +-
 hw/arm/sbsa-ref.c |  23 +++--
 hw/misc/allwinner-sramc.c |   1 +
 target/arm/cpu.c  |  65 -
 target/arm/gdbstub.c  |   4 +
 target/arm/helper.c   |  70 +++---
 target/arm/tcg/translate-sme.c|  24 +++--
 target/xtensa/exc_helper.c|   3 +
 tests/qtest/xlnx-canfd-test.c |  33 +++
 tests/tcg/aarch64/icivau.c| 189 ++
 tests/tcg/aarch64/sme-outprod1.c  |  83 +
 hw/arm/Kconfig|   2 +-
 tests/tcg/aarch64/Makefile.target |  13 ++-
 13 files changed, 436 insertions(+), 79 deletions(-)
 create mode 100644 tests/tcg/aarch64/icivau.c
 create mode 100644 tests/tcg/aarch64/sme-outprod1.c



[PULL 09/11] target/arm: gdbstub: Guard M-profile code with CONFIG_TCG

2023-07-04 Thread Peter Maydell
From: Fabiano Rosas 

This code is only relevant when TCG is present in the build. Building
with --disable-tcg --enable-xen on an x86 host we get:

$ ../configure --target-list=x86_64-softmmu,aarch64-softmmu --disable-tcg 
--enable-xen
$ make -j$(nproc)
...
libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function `m_sysreg_ptr':
 ../target/arm/gdbstub.c:358: undefined reference to `arm_v7m_get_sp_ptr'
 ../target/arm/gdbstub.c:361: undefined reference to `arm_v7m_get_sp_ptr'

libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function 
`arm_gdb_get_m_systemreg':
../target/arm/gdbstub.c:405: undefined reference to `arm_v7m_mrs_control'

Signed-off-by: Fabiano Rosas 
Message-id: 20230628164821.16771-1-faro...@suse.de
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 target/arm/gdbstub.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/target/arm/gdbstub.c b/target/arm/gdbstub.c
index 03b17c814f6..f421c5d041c 100644
--- a/target/arm/gdbstub.c
+++ b/target/arm/gdbstub.c
@@ -324,6 +324,7 @@ static int arm_gen_dynamic_sysreg_xml(CPUState *cs, int 
base_reg)
 return cpu->dyn_sysreg_xml.num;
 }
 
+#ifdef CONFIG_TCG
 typedef enum {
 M_SYSREG_MSP,
 M_SYSREG_PSP,
@@ -481,6 +482,7 @@ static int arm_gen_dynamic_m_secextreg_xml(CPUState *cs, 
int orig_base_reg)
 return cpu->dyn_m_secextreg_xml.num;
 }
 #endif
+#endif /* CONFIG_TCG */
 
 const char *arm_gdb_get_dynamic_xml(CPUState *cs, const char *xmlname)
 {
@@ -561,6 +563,7 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu)
  arm_gen_dynamic_sysreg_xml(cs, cs->gdb_num_regs),
  "system-registers.xml", 0);
 
+#ifdef CONFIG_TCG
 if (arm_feature(env, ARM_FEATURE_M) && tcg_enabled()) {
 gdb_register_coprocessor(cs,
 arm_gdb_get_m_systemreg, arm_gdb_set_m_systemreg,
@@ -575,4 +578,5 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu)
 }
 #endif
 }
+#endif /* CONFIG_TCG */
 }
-- 
2.34.1




[PULL 06/11] target/arm: Handle IC IVAU to improve compatibility with JITs

2023-07-04 Thread Peter Maydell
From: John Högberg 

Unlike architectures with precise self-modifying code semantics
(e.g. x86) ARM processors do not maintain coherency for instruction
execution and memory, requiring an instruction synchronization
barrier on every core that will execute the new code, and on many
models also the explicit use of cache management instructions.

While this is required to make JITs work on actual hardware, QEMU
has gotten away with not handling this since it does not emulate
caches, and unconditionally invalidates code whenever the softmmu
or the user-mode page protection logic detects that code has been
modified.

Unfortunately the latter does not work in the face of dual-mapped
code (a common W^X workaround), where one page is executable and
the other is writable: user-mode has no way to connect one with the
other as that is only known to the kernel and the emulated
application.

This commit works around the issue by telling software that
instruction cache invalidation is required by clearing the
CTR_EL0.DIC flag (regardless of whether the emulated processor
needs it), and then invalidating code in IC IVAU instructions.

Resolves: https://gitlab.com/qemu-project/qemu/-/issues/1034

Co-authored-by: Richard Henderson 
Signed-off-by: John Högberg 
Reviewed-by: Richard Henderson 
Message-id: 168778890374.24232.340213885153806878...@git.sr.ht
[PMM: removed unnecessary AArch64 feature check; moved
 "clear CTR_EL1.DIC" code up a bit so it's not in the middle
 of the vfp/neon related tests]
Signed-off-by: Peter Maydell 
---
 target/arm/cpu.c| 11 +++
 target/arm/helper.c | 47 ++---
 2 files changed, 55 insertions(+), 3 deletions(-)

diff --git a/target/arm/cpu.c b/target/arm/cpu.c
index adf84f96860..822efa5b2c1 100644
--- a/target/arm/cpu.c
+++ b/target/arm/cpu.c
@@ -1694,6 +1694,17 @@ static void arm_cpu_realizefn(DeviceState *dev, Error 
**errp)
 return;
 }
 
+#ifdef CONFIG_USER_ONLY
+/*
+ * User mode relies on IC IVAU instructions to catch modification of
+ * dual-mapped code.
+ *
+ * Clear CTR_EL0.DIC to ensure that software that honors these flags uses
+ * IC IVAU even if the emulated processor does not normally require it.
+ */
+cpu->ctr = FIELD_DP64(cpu->ctr, CTR_EL0, DIC, 0);
+#endif
+
 if (arm_feature(env, ARM_FEATURE_AARCH64) &&
 cpu->has_vfp != cpu->has_neon) {
 /*
diff --git a/target/arm/helper.c b/target/arm/helper.c
index a0b84efab52..8e836aaee13 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -5234,6 +5234,36 @@ static void mdcr_el2_write(CPUARMState *env, const 
ARMCPRegInfo *ri,
 }
 }
 
+#ifdef CONFIG_USER_ONLY
+/*
+ * `IC IVAU` is handled to improve compatibility with JITs that dual-map their
+ * code to get around W^X restrictions, where one region is writable and the
+ * other is executable.
+ *
+ * Since the executable region is never written to we cannot detect code
+ * changes when running in user mode, and rely on the emulated JIT telling us
+ * that the code has changed by executing this instruction.
+ */
+static void ic_ivau_write(CPUARMState *env, const ARMCPRegInfo *ri,
+  uint64_t value)
+{
+uint64_t icache_line_mask, start_address, end_address;
+const ARMCPU *cpu;
+
+cpu = env_archcpu(env);
+
+icache_line_mask = (4 << extract32(cpu->ctr, 0, 4)) - 1;
+start_address = value & ~icache_line_mask;
+end_address = value | icache_line_mask;
+
+mmap_lock();
+
+tb_invalidate_phys_range(start_address, end_address);
+
+mmap_unlock();
+}
+#endif
+
 static const ARMCPRegInfo v8_cp_reginfo[] = {
 /*
  * Minimal set of EL0-visible registers. This will need to be expanded
@@ -5273,7 +5303,10 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
 { .name = "CURRENTEL", .state = ARM_CP_STATE_AA64,
   .opc0 = 3, .opc1 = 0, .opc2 = 2, .crn = 4, .crm = 2,
   .access = PL1_R, .type = ARM_CP_CURRENTEL },
-/* Cache ops: all NOPs since we don't emulate caches */
+/*
+ * Instruction cache ops. All of these except `IC IVAU` NOP because we
+ * don't emulate caches.
+ */
 { .name = "IC_IALLUIS", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 1, .opc2 = 0,
   .access = PL1_W, .type = ARM_CP_NOP,
@@ -5286,9 +5319,17 @@ static const ARMCPRegInfo v8_cp_reginfo[] = {
   .accessfn = access_tocu },
 { .name = "IC_IVAU", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 3, .crn = 7, .crm = 5, .opc2 = 1,
-  .access = PL0_W, .type = ARM_CP_NOP,
+  .access = PL0_W,
   .fgt = FGT_ICIVAU,
-  .accessfn = access_tocu },
+  .accessfn = access_tocu,
+#ifdef CONFIG_USER_ONLY
+  .type = ARM_CP_NO_RAW,
+  .writefn = ic_ivau_write
+#else
+  .type = ARM_CP_NOP
+#endif
+},
+/* Cache ops: all NOPs since we don't emulate caches */
 { .name = "DC_IVAC", .state = ARM_CP_STATE_AA64,
   .opc0 = 1, .opc1 = 0, .crn = 7, .crm = 

[PULL 01/11] target/arm: Add raw_writes ops for register whose write induce TLB maintenance

2023-07-04 Thread Peter Maydell
From: Eric Auger 

Some registers whose 'cooked' writefns induce TLB maintenance do
not have raw_writefn ops defined. If only the writefn ops is set
(ie. no raw_writefn is provided), it is assumed the cooked also
work as the raw one. For those registers it is not obvious the
tlb_flush works on KVM mode so better/safer setting the raw write.

Signed-off-by: Eric Auger 
Suggested-by: Peter Maydell 
Reviewed-by: Peter Maydell 
Signed-off-by: Peter Maydell 
---
 target/arm/helper.c | 23 +--
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/target/arm/helper.c b/target/arm/helper.c
index d08c058e424..a0b84efab52 100644
--- a/target/arm/helper.c
+++ b/target/arm/helper.c
@@ -4189,14 +4189,14 @@ static const ARMCPRegInfo vmsa_cp_reginfo[] = {
   .opc0 = 3, .opc1 = 0, .crn = 2, .crm = 0, .opc2 = 0,
   .access = PL1_RW, .accessfn = access_tvm_trvm,
   .fgt = FGT_TTBR0_EL1,
-  .writefn = vmsa_ttbr_write, .resetvalue = 0,
+  .writefn = vmsa_ttbr_write, .resetvalue = 0, .raw_writefn = raw_write,
   .bank_fieldoffsets = { offsetof(CPUARMState, cp15.ttbr0_s),
  offsetof(CPUARMState, cp15.ttbr0_ns) } },
 { .name = "TTBR1_EL1", .state = ARM_CP_STATE_BOTH,
   .opc0 = 3, .opc1 = 0, .crn = 2, .crm = 0, .opc2 = 1,
   .access = PL1_RW, .accessfn = access_tvm_trvm,
   .fgt = FGT_TTBR1_EL1,
-  .writefn = vmsa_ttbr_write, .resetvalue = 0,
+  .writefn = vmsa_ttbr_write, .resetvalue = 0, .raw_writefn = raw_write,
   .bank_fieldoffsets = { offsetof(CPUARMState, cp15.ttbr1_s),
  offsetof(CPUARMState, cp15.ttbr1_ns) } },
 { .name = "TCR_EL1", .state = ARM_CP_STATE_AA64,
@@ -4456,13 +4456,13 @@ static const ARMCPRegInfo lpae_cp_reginfo[] = {
   .type = ARM_CP_64BIT | ARM_CP_ALIAS,
   .bank_fieldoffsets = { offsetof(CPUARMState, cp15.ttbr0_s),
  offsetof(CPUARMState, cp15.ttbr0_ns) },
-  .writefn = vmsa_ttbr_write, },
+  .writefn = vmsa_ttbr_write, .raw_writefn = raw_write },
 { .name = "TTBR1", .cp = 15, .crm = 2, .opc1 = 1,
   .access = PL1_RW, .accessfn = access_tvm_trvm,
   .type = ARM_CP_64BIT | ARM_CP_ALIAS,
   .bank_fieldoffsets = { offsetof(CPUARMState, cp15.ttbr1_s),
  offsetof(CPUARMState, cp15.ttbr1_ns) },
-  .writefn = vmsa_ttbr_write, },
+  .writefn = vmsa_ttbr_write, .raw_writefn = raw_write },
 };
 
 static uint64_t aa64_fpcr_read(CPUARMState *env, const ARMCPRegInfo *ri)
@@ -5911,7 +5911,7 @@ static const ARMCPRegInfo el2_cp_reginfo[] = {
   .type = ARM_CP_IO,
   .opc0 = 3, .opc1 = 4, .crn = 1, .crm = 1, .opc2 = 0,
   .access = PL2_RW, .fieldoffset = offsetof(CPUARMState, cp15.hcr_el2),
-  .writefn = hcr_write },
+  .writefn = hcr_write, .raw_writefn = raw_write },
 { .name = "HCR", .state = ARM_CP_STATE_AA32,
   .type = ARM_CP_ALIAS | ARM_CP_IO,
   .cp = 15, .opc1 = 4, .crn = 1, .crm = 1, .opc2 = 0,
@@ -5983,6 +5983,7 @@ static const ARMCPRegInfo el2_cp_reginfo[] = {
 { .name = "TCR_EL2", .state = ARM_CP_STATE_BOTH,
   .opc0 = 3, .opc1 = 4, .crn = 2, .crm = 0, .opc2 = 2,
   .access = PL2_RW, .writefn = vmsa_tcr_el12_write,
+  .raw_writefn = raw_write,
   .fieldoffset = offsetof(CPUARMState, cp15.tcr_el[2]) },
 { .name = "VTCR", .state = ARM_CP_STATE_AA32,
   .cp = 15, .opc1 = 4, .crn = 2, .crm = 1, .opc2 = 2,
@@ -5999,10 +6000,10 @@ static const ARMCPRegInfo el2_cp_reginfo[] = {
   .type = ARM_CP_64BIT | ARM_CP_ALIAS,
   .access = PL2_RW, .accessfn = access_el3_aa32ns,
   .fieldoffset = offsetof(CPUARMState, cp15.vttbr_el2),
-  .writefn = vttbr_write },
+  .writefn = vttbr_write, .raw_writefn = raw_write },
 { .name = "VTTBR_EL2", .state = ARM_CP_STATE_AA64,
   .opc0 = 3, .opc1 = 4, .crn = 2, .crm = 1, .opc2 = 0,
-  .access = PL2_RW, .writefn = vttbr_write,
+  .access = PL2_RW, .writefn = vttbr_write, .raw_writefn = raw_write,
   .fieldoffset = offsetof(CPUARMState, cp15.vttbr_el2) },
 { .name = "SCTLR_EL2", .state = ARM_CP_STATE_BOTH,
   .opc0 = 3, .opc1 = 4, .crn = 1, .crm = 0, .opc2 = 0,
@@ -6014,7 +6015,8 @@ static const ARMCPRegInfo el2_cp_reginfo[] = {
   .fieldoffset = offsetof(CPUARMState, cp15.tpidr_el[2]) },
 { .name = "TTBR0_EL2", .state = ARM_CP_STATE_AA64,
   .opc0 = 3, .opc1 = 4, .crn = 2, .crm = 0, .opc2 = 0,
-  .access = PL2_RW, .resetvalue = 0, .writefn = vmsa_tcr_ttbr_el2_write,
+  .access = PL2_RW, .resetvalue = 0,
+  .writefn = vmsa_tcr_ttbr_el2_write, .raw_writefn = raw_write,
   .fieldoffset = offsetof(CPUARMState, cp15.ttbr0_el[2]) },
 { .name = "HTTBR", .cp = 15, .opc1 = 4, .crm = 2,
   .access = PL2_RW, .type = ARM_CP_64BIT | ARM_CP_ALIAS,
@@ -6201,12 +6203,12 @@ static const ARMCPRegInfo el3_cp_reginfo[] = {
 { .name = "SCR_EL3", .state = ARM_CP_STATE_AA64,
   .opc0 = 3, .opc1 = 6, .crn = 1, .crm = 1, 

[PULL 11/11] target/xtensa: Assert that interrupt level is within bounds

2023-07-04 Thread Peter Maydell
In handle_interrupt() we use level as an index into the interrupt_vector[]
array. This is safe because we have checked it against env->config->nlevel,
but Coverity can't see that (and it is only true because each CPU config
sets its XCHAL_NUM_INTLEVELS to something less than MAX_NLEVELS), so it
complains about a possible array overrun (CID 1507131)

Add an assert() which will make Coverity happy and catch the unlikely
case of a mis-set XCHAL_NUM_INTLEVELS in future.

Signed-off-by: Peter Maydell 
Acked-by: Max Filippov 
Message-id: 20230623154135.1930261-1-peter.mayd...@linaro.org
---
 target/xtensa/exc_helper.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/target/xtensa/exc_helper.c b/target/xtensa/exc_helper.c
index d4823a65cda..43f6a862de2 100644
--- a/target/xtensa/exc_helper.c
+++ b/target/xtensa/exc_helper.c
@@ -169,6 +169,9 @@ static void handle_interrupt(CPUXtensaState *env)
 CPUState *cs = env_cpu(env);
 
 if (level > 1) {
+/* env->config->nlevel check should have ensured this */
+assert(level < sizeof(env->config->interrupt_vector));
+
 env->sregs[EPC1 + level - 1] = env->pc;
 env->sregs[EPS2 + level - 2] = env->sregs[PS];
 env->sregs[PS] =
-- 
2.34.1




intermittent clang sanitizer failure during 'make check-tcg': null pointer deref in IntervalTreeNode

2023-07-04 Thread Peter Maydell
If you build QEMU with the clang UB sanitizer and do a
'make check-tcg' run, it can fail like this:

  TESTvma-pthread-with-libinsn.so on aarch64
../../util/interval-tree.c:751:32: runtime error: member access within
null pointer of type 'IntervalTreeNode' (aka 'struct
IntervalTreeNode')
SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior
../../util/interval-tree.c:751:32 in

SUMMARY: UndefinedBehaviorSanitizer: undefined-behavior
../../util/interval-tree.c:751:32 in
make[1]: *** [Makefile:181: run-plugin-vma-pthread-with-libinsn.so] Error 124
make: *** [/mnt/nvmedisk/linaro/qemu-from-laptop/qemu/tests/Makefile.include:56:
run-tcg-tests-aarch64-linux-user] Error 2

I only saw this once; when I re-ran the test passed...

thanks
-- PMM



Re: [RFC PATCH 0/4] ppc: Improve multisocket support

2023-07-04 Thread Frederic Barrat




On 04/07/2023 15:49, Cédric Le Goater wrote:

Hello,

Here are changes improving multisocket support of the XIVE models
(POWER9 only). When a source has an END target on another chip, the
XIVE IC will use an MMIO store to forward the notification to the
remote chip. The long term plan is to get rid of pnv_xive_get_remote()
which is a modeling shortcut. I have had them for a while, they compile,
they seem to still work but this is not for merge yet. If someone
could take over, that would be nice.

The best way to test is to start a 2 sockets * 1 cpu system with devices
attached to the PCI buses of chip 0 and to offline CPU 0. All sources
should be configured to be served by CPU 1 on socket 1 and trigger
notifications on chip 0 should be forwarded to chip 1.

Last patch adds support for degenerative interrupts. This is used by
the lowest level FW of POWER systems. Difficult to test.




Thanks for the series! My crystal ball tells me the PC MMIO patch will 
come handy soon (to be adapted for P10 and groups). And the remote 
routing looks pretty interesting too.

The last patch (LSI) may rot a bit longer though :)

  Fred



Thanks,

C.

Cédric Le Goater (4):
   ppc/xive: introduce a new XiveRouter end_notify() handler
   ppc/pnv: handle END triggers between chips with MMIOs
   ppc/pnv: add support for the PC MMIOs
   ppc/pnv: Add support for degenerative interrupts (POWER LSI)

  hw/intc/pnv_xive_regs.h   |   1 +
  include/hw/ppc/pnv_xive.h |  15 +++
  include/hw/ppc/xive.h |   3 +
  hw/intc/pnv_xive.c| 262 +++---
  hw/intc/xive.c|  50 ++--
  5 files changed, 278 insertions(+), 53 deletions(-)





Re: [PATCH v4 5/6] target/riscv: update APLIC and IMSIC to support KVM AIA

2023-07-04 Thread Andrew Jones
On Wed, Jun 21, 2023 at 02:54:55PM +, Yong-Xuan Wang wrote:
> KVM AIA can't emulate APLIC only.

I think you mean "KVM AIA can't emulate the APLIC alone." ?

> When "aia=aplic" parameter is passed,
> APLIC devices is emulated by QEMU. For "aia=aplic-imsic", remove the
> mmio operations of APLIC when using KVM AIA and send wired interrupt
> signal via KVM_IRQ_LINE API.
> After KVM AIA enabled, MSI messages are delivered by KVM_SIGNAL_MSI API
> when the IMSICs receive mmio write requests.
> 
> Signed-off-by: Yong-Xuan Wang 
> Reviewed-by: Jim Shu 
> ---
>  hw/intc/riscv_aplic.c | 58 +++
>  hw/intc/riscv_imsic.c | 26 +++
>  2 files changed, 63 insertions(+), 21 deletions(-)
> 
> diff --git a/hw/intc/riscv_aplic.c b/hw/intc/riscv_aplic.c
> index afc5b54dbb..b1949636b6 100644
> --- a/hw/intc/riscv_aplic.c
> +++ b/hw/intc/riscv_aplic.c
> @@ -31,6 +31,7 @@
>  #include "hw/irq.h"
>  #include "target/riscv/cpu.h"
>  #include "sysemu/sysemu.h"
> +#include "sysemu/kvm.h"
>  #include "migration/vmstate.h"
>  
>  #define APLIC_MAX_IDC  (1UL << 14)
> @@ -148,6 +149,15 @@
>  
>  #define APLIC_IDC_CLAIMI   0x1c
>  
> +/*
> + * KVM AIA only supports APLIC.m, fallback to QEMU emulation if we want to 
> use
> + * APLIC.w.

I assume APLIC.m and APLIC.w mean "APLIC MSI" and "APLIC Wired". Can you
please spell that out?

> + */
> +static bool is_kvm_aia(bool msimode)
> +{
> +return kvm_irqchip_in_kernel() && msimode;
> +}
> +
>  static uint32_t riscv_aplic_read_input_word(RISCVAPLICState *aplic,
>  uint32_t word)
>  {
> @@ -471,6 +481,13 @@ static uint32_t riscv_aplic_idc_claimi(RISCVAPLICState 
> *aplic, uint32_t idc)
>  return topi;
>  }
>  
> +static void riscv_kvm_aplic_request(void *opaque, int irq, int level)
> +{
> +kvm_set_irq(kvm_state, irq, !!level);
> +
> +return;

Unnecessary 'return'

> +}
> +
>  static void riscv_aplic_request(void *opaque, int irq, int level)
>  {
>  bool update = false;
> @@ -801,29 +818,35 @@ static void riscv_aplic_realize(DeviceState *dev, Error 
> **errp)
>  uint32_t i;
>  RISCVAPLICState *aplic = RISCV_APLIC(dev);
>  
> -aplic->bitfield_words = (aplic->num_irqs + 31) >> 5;
> -aplic->sourcecfg = g_new0(uint32_t, aplic->num_irqs);
> -aplic->state = g_new0(uint32_t, aplic->num_irqs);
> -aplic->target = g_new0(uint32_t, aplic->num_irqs);
> -if (!aplic->msimode) {
> -for (i = 0; i < aplic->num_irqs; i++) {
> -aplic->target[i] = 1;
> +if (!is_kvm_aia(aplic->msimode)) {
> +aplic->bitfield_words = (aplic->num_irqs + 31) >> 5;
> +aplic->sourcecfg = g_new0(uint32_t, aplic->num_irqs);
> +aplic->state = g_new0(uint32_t, aplic->num_irqs);
> +aplic->target = g_new0(uint32_t, aplic->num_irqs);
> +if (!aplic->msimode) {
> +for (i = 0; i < aplic->num_irqs; i++) {
> +aplic->target[i] = 1;
> +}
>  }
> -}
> -aplic->idelivery = g_new0(uint32_t, aplic->num_harts);
> -aplic->iforce = g_new0(uint32_t, aplic->num_harts);
> -aplic->ithreshold = g_new0(uint32_t, aplic->num_harts);
> +aplic->idelivery = g_new0(uint32_t, aplic->num_harts);
> +aplic->iforce = g_new0(uint32_t, aplic->num_harts);
> +aplic->ithreshold = g_new0(uint32_t, aplic->num_harts);
>  
> -memory_region_init_io(>mmio, OBJECT(dev), _aplic_ops, aplic,
> -  TYPE_RISCV_APLIC, aplic->aperture_size);
> -sysbus_init_mmio(SYS_BUS_DEVICE(dev), >mmio);
> +memory_region_init_io(>mmio, OBJECT(dev), _aplic_ops,
> + aplic, TYPE_RISCV_APLIC, aplic->aperture_size);
> +sysbus_init_mmio(SYS_BUS_DEVICE(dev), >mmio);
> +}
>  
>  /*
>   * Only root APLICs have hardware IRQ lines. All non-root APLICs
>   * have IRQ lines delegated by their parent APLIC.
>   */
>  if (!aplic->parent) {
> -qdev_init_gpio_in(dev, riscv_aplic_request, aplic->num_irqs);
> +if (is_kvm_aia(aplic->msimode)) {
> +qdev_init_gpio_in(dev, riscv_kvm_aplic_request, aplic->num_irqs);
> +} else {
> +qdev_init_gpio_in(dev, riscv_aplic_request, aplic->num_irqs);
> +}
>  }
>  
>  /* Create output IRQ lines for non-MSI mode */
> @@ -958,7 +981,10 @@ DeviceState *riscv_aplic_create(hwaddr addr, hwaddr size,
>  qdev_prop_set_bit(dev, "mmode", mmode);
>  
>  sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), _fatal);
> -sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, addr);
> +
> +if (!is_kvm_aia(msimode)) {
> +sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, addr);
> +}
>  
>  if (parent) {
>  riscv_aplic_add_child(parent, dev);
> diff --git a/hw/intc/riscv_imsic.c b/hw/intc/riscv_imsic.c
> index fea3385b51..00fdb60fc6 100644
> --- a/hw/intc/riscv_imsic.c
> +++ b/hw/intc/riscv_imsic.c
> @@ -32,6 +32,7 @@

Re: [PATCH] ui/gtk: set the area of the scanout texture correctly

2023-07-04 Thread Marc-André Lureau
Hi

On Mon, Jun 26, 2023 at 9:49 PM Kim, Dongwon  wrote:

> Hi Marc-André Lureau,
>
> On 6/26/2023 4:56 AM, Marc-André Lureau wrote:
> > Hi
> >
> > On Wed, Jun 21, 2023 at 11:53 PM Dongwon Kim 
> > wrote:
> >
> > x and y offsets and width and height of the scanout texture
> > is not correctly configured in case guest scanout frame is
> > dmabuf.
> >
> > Cc: Gerd Hoffmann 
> > Cc: Marc-André Lureau 
> > Cc: Vivek Kasireddy 
> > Signed-off-by: Dongwon Kim 
> >
> >
> > I find this a bit confusing, and I don't know how to actually test it.
> >
> > The only place where scanout_{width, height} are set is
> > virtio_gpu_create_dmabuf() and there, they have the same values as
> > width and height. it's too easy to get confused with the values imho.
>
> Yes, scanout_width/height are same as width/height as far as there is
> only one guest display exist. But they will be different in case there
> multiple displays on the guest side, configured in extended mode (when
> the guest is running Xorg).
>
> In this case, blob for the guest display is same for scanout 1 and 2 but
> each scanout will have different offset and scanout_width/scanout_height
> to reference a sub region in the same blob(dmabuf).
>
> I added x/y/scanout_width/scanout_height with a previous commit:
>
> commit e86a93f55463c088aa0b5260e915ffbf9f86c62b
> Author: Dongwon Kim 
> Date:   Wed Nov 3 23:51:52 2021 -0700
>
>  virtio-gpu: splitting one extended mode guest fb into n-scanouts
>
> > I find the terminology we use for ScanoutTexture much clearer. It uses
> > backing_{width, height} instead, which indicates quite clearly that
> > the usual x/y/w/h are for the sub-region to be shown.
> yeah agreed. Then dmabuf->width/height should be changed to
> dmabuf->backing_width/height and dmabuf->width/height will be replacing
> dmabuf->scanout_width/scanout_height. I guess this is what you meant,
> right?
>

right, can you send a new patch?
thanks


> > I think we should have a preliminary commit that renames
> > scanout_{width, height}.
> >
> > Please give some help/hints on how to actually test this code too.
>
> So this patch is just to make things look consistent in the code level.
> Having offset (0,0) in this function call for all different scanouts
> didn't look right to me. This code change won't make anything done
> differently though. So no test is applicable.
>
> >
> > Thanks!
> >
> >
> > ---
> >  ui/gtk-egl.c | 3 ++-
> >  ui/gtk-gl-area.c | 3 ++-
> >  2 files changed, 4 insertions(+), 2 deletions(-)
> >
> > diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c
> > index 19130041bc..e99e3b0d8c 100644
> > --- a/ui/gtk-egl.c
> > +++ b/ui/gtk-egl.c
> > @@ -257,7 +257,8 @@ void
> > gd_egl_scanout_dmabuf(DisplayChangeListener *dcl,
> >
> >  gd_egl_scanout_texture(dcl, dmabuf->texture,
> > dmabuf->y0_top, dmabuf->width,
> > dmabuf->height,
> > -   0, 0, dmabuf->width, dmabuf->height);
> > +   dmabuf->x, dmabuf->y,
> > dmabuf->scanout_width,
> > +   dmabuf->scanout_height);
> >
> >  if (dmabuf->allow_fences) {
> >  vc->gfx.guest_fb.dmabuf = dmabuf;
> > diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c
> > index c384a1516b..1605818bd1 100644
> > --- a/ui/gtk-gl-area.c
> > +++ b/ui/gtk-gl-area.c
> > @@ -299,7 +299,8 @@ void
> > gd_gl_area_scanout_dmabuf(DisplayChangeListener *dcl,
> >
> >  gd_gl_area_scanout_texture(dcl, dmabuf->texture,
> > dmabuf->y0_top, dmabuf->width,
> > dmabuf->height,
> > -   0, 0, dmabuf->width, dmabuf->height);
> > +   dmabuf->x, dmabuf->y,
> > dmabuf->scanout_width,
> > +   dmabuf->scanout_height);
> >
> >  if (dmabuf->allow_fences) {
> >  vc->gfx.guest_fb.dmabuf = dmabuf;
> > --
> > 2.34.1
> >
> >
> >
> >
> > --
> > Marc-André Lureau
>


-- 
Marc-André Lureau


Re: [PATCH] virtio-gpu: replace the surface with null surface when resetting

2023-07-04 Thread Marc-André Lureau
On Wed, Jun 28, 2023 at 1:05 AM Dongwon Kim  wrote:

> The primary guest scanout shows the booting screen right after reboot
> but additional guest displays (i.e. max_outputs > 1) will keep displaying
> the old frames until the guest virtio gpu driver gets initialized, which
> could cause some confusion. A better way is to replace the surface with
> a place holder that tells the display is not active during the reset of
> the virtio-gpu device.
>
> And to immediately update the surface with the place holder image after
> the switch, displaychangelistener_gfx_switch needs to be called with
> 'update == TRUE' in dpy_gfx_replace_surface when the new surface is NULL.
>
> Cc: Gerd Hoffmann 
> Cc: Marc-André Lureau 
> Cc: Vivek Kasireddy 
> Signed-off-by: Dongwon Kim 
>

Acked-by: Marc-André Lureau 

---
>  hw/display/virtio-gpu.c |  5 +
>  ui/console.c| 11 ++-
>  2 files changed, 11 insertions(+), 5 deletions(-)
>
> diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
> index 66cddd94d9..de92f003b5 100644
> --- a/hw/display/virtio-gpu.c
> +++ b/hw/display/virtio-gpu.c
> @@ -1354,6 +1354,7 @@ void virtio_gpu_reset(VirtIODevice *vdev)
>  VirtIOGPU *g = VIRTIO_GPU(vdev);
>  struct virtio_gpu_simple_resource *res, *tmp;
>  struct virtio_gpu_ctrl_command *cmd;
> +int i = 0;
>
>  QTAILQ_FOREACH_SAFE(res, >reslist, next, tmp) {
>  virtio_gpu_resource_destroy(g, res);
> @@ -1372,6 +1373,10 @@ void virtio_gpu_reset(VirtIODevice *vdev)
>  g_free(cmd);
>  }
>
> +for (i = 0; i < g->parent_obj.conf.max_outputs; i++) {
> +dpy_gfx_replace_surface(g->parent_obj.scanout[i].con, NULL);
> +}
> +
>  virtio_gpu_base_reset(VIRTIO_GPU_BASE(vdev));
>  }
>
> diff --git a/ui/console.c b/ui/console.c
> index e173731e20..768f39697c 100644
> --- a/ui/console.c
> +++ b/ui/console.c
> @@ -1787,6 +1787,7 @@ void dpy_gfx_replace_surface(QemuConsole *con,
>  static const char placeholder_msg[] = "Display output is not active.";
>  DisplayState *s = con->ds;
>  DisplaySurface *old_surface = con->surface;
> +DisplaySurface *new_surface = surface;
>  DisplayChangeListener *dcl;
>  int width;
>  int height;
> @@ -1800,19 +1801,19 @@ void dpy_gfx_replace_surface(QemuConsole *con,
>  height = 480;
>  }
>
> -surface = qemu_create_placeholder_surface(width, height,
> placeholder_msg);
> +new_surface = qemu_create_placeholder_surface(width, height,
> placeholder_msg);
>  }
>
> -assert(old_surface != surface);
> +assert(old_surface != new_surface);
>
>  con->scanout.kind = SCANOUT_SURFACE;
> -con->surface = surface;
> -dpy_gfx_create_texture(con, surface);
> +con->surface = new_surface;
> +dpy_gfx_create_texture(con, new_surface);
>  QLIST_FOREACH(dcl, >listeners, next) {
>  if (con != (dcl->con ? dcl->con : active_console)) {
>  continue;
>  }
> -displaychangelistener_gfx_switch(dcl, surface, FALSE);
> +displaychangelistener_gfx_switch(dcl, new_surface, surface ?
> FALSE : TRUE);
>  }
>  dpy_gfx_destroy_texture(con, old_surface);
>  qemu_free_displaysurface(old_surface);
> --
> 2.34.1
>
>
>

-- 
Marc-André Lureau


Re: [PATCH] target/arm: Avoid over-length shift in arm_cpu_sve_finalize() error case

2023-07-04 Thread Alex Bennée


Peter Maydell  writes:

> If you build QEMU with the clang sanitizer enabled, you can see it
> fire when running the arm-cpu-features test:
>
> $ QTEST_QEMU_BINARY=./build/arm-clang/qemu-system-aarch64 
> ./build/arm-clang/tests/qtest/arm-cpu-features
> [...]
> ../../target/arm/cpu64.c:125:19: runtime error: shift exponent 64 is too 
> large for 64-bit type 'unsigned long long'
> [...]
>
> This happens because the user can specify some incorrect SVE
> properties that result in our calculating a max_vq of 0.  We catch
> this and error out, but before we do that we calculate
>
>  vq_mask = MAKE_64BIT_MASK(0, max_vq);$
>
> and the MAKE_64BIT_MASK() call is only valid for lengths that are
> greater than zero, so we hit the undefined behaviour.

Hmm that does make me worry we could have more land mines waiting to be
found. Would converting MAKE_64BIT_MASK into an inline function and
asserting be a better solution?

>
> Change the logic so that if max_vq is 0 we specifically set vq_mask
> to 0 without going via MAKE_64BIT_MASK().  This lets us drop the
> max_vq check from the error-exit logic, because if max_vq is 0 then
> vq_map must now be 0.
>
> The UB only happens in the case where the user passed us an incorrect
> set of SVE properties, so it's not a big problem in practice.
>
> Signed-off-by: Peter Maydell 

Reviewed-by: Alex Bennée 

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [PATCH] target/arm: Avoid over-length shift in arm_cpu_sve_finalize() error case

2023-07-04 Thread Peter Maydell
On Tue, 4 Jul 2023 at 16:52, Philippe Mathieu-Daudé  wrote:
>
> On 4/7/23 17:43, Peter Maydell wrote:
> > If you build QEMU with the clang sanitizer enabled, you can see it
> > fire when running the arm-cpu-features test:
> >
> > $ QTEST_QEMU_BINARY=./build/arm-clang/qemu-system-aarch64 
> > ./build/arm-clang/tests/qtest/arm-cpu-features
> > [...]
> > ../../target/arm/cpu64.c:125:19: runtime error: shift exponent 64 is too 
> > large for 64-bit type 'unsigned long long'
> > [...]
> >
> > This happens because the user can specify some incorrect SVE
> > properties that result in our calculating a max_vq of 0.  We catch
> > this and error out, but before we do that we calculate
> >
> >   vq_mask = MAKE_64BIT_MASK(0, max_vq);$
> >
> > and the MAKE_64BIT_MASK() call is only valid for lengths that are
> > greater than zero, so we hit the undefined behaviour.
>
> Can we fix it generically?
>
> -- >8 --
> --- a/include/qemu/bitops.h
> +++ b/include/qemu/bitops.h
> @@ -28,3 +28,3 @@
>   #define MAKE_64BIT_MASK(shift, length) \
> -(((~0ULL) >> (64 - (length))) << (shift))
> +((length) ? (((~0ULL) >> (64 - (length))) << (shift)) : 0)
>
> ---

Only by introducing a conditional in the case where the length
isn't a compile time constant.
Like the extract and deposit functions, the assumption is that
you're operating on a field that actually exists and isn't
zero-width.

thanks
-- PMM



Re: [PATCH] target/arm: gdbstub: Guard M-profile code with CONFIG_TCG

2023-07-04 Thread Philippe Mathieu-Daudé

On 4/7/23 17:44, Peter Maydell wrote:

On Tue, 4 Jul 2023 at 16:21, Philippe Mathieu-Daudé  wrote:


On 28/6/23 18:48, Fabiano Rosas wrote:

This code is only relevant when TCG is present in the build. Building
with --disable-tcg --enable-xen on an x86 host we get:

$ ../configure --target-list=x86_64-softmmu,aarch64-softmmu --disable-tcg 
--enable-xen
$ make -j$(nproc)
...
libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function `m_sysreg_ptr':
   ../target/arm/gdbstub.c:358: undefined reference to `arm_v7m_get_sp_ptr'
   ../target/arm/gdbstub.c:361: undefined reference to `arm_v7m_get_sp_ptr'

libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function 
`arm_gdb_get_m_systemreg':
../target/arm/gdbstub.c:405: undefined reference to `arm_v7m_mrs_control'

Signed-off-by: Fabiano Rosas 
---
This is a respin of:
https://lore.kernel.org/r/20230313151058.19645-5-faro...@suse.de
---
   target/arm/gdbstub.c | 4 
   1 file changed, 4 insertions(+)

diff --git a/target/arm/gdbstub.c b/target/arm/gdbstub.c
index 03b17c814f..f421c5d041 100644
--- a/target/arm/gdbstub.c
+++ b/target/arm/gdbstub.c
@@ -324,6 +324,7 @@ static int arm_gen_dynamic_sysreg_xml(CPUState *cs, int 
base_reg)
   return cpu->dyn_sysreg_xml.num;
   }

+#ifdef CONFIG_TCG


OK.


   typedef enum {
   M_SYSREG_MSP,
   M_SYSREG_PSP,
@@ -481,6 +482,7 @@ static int arm_gen_dynamic_m_secextreg_xml(CPUState *cs, 
int orig_base_reg)
   return cpu->dyn_m_secextreg_xml.num;
   }
   #endif
+#endif /* CONFIG_TCG */

   const char *arm_gdb_get_dynamic_xml(CPUState *cs, const char *xmlname)
   {
@@ -561,6 +563,7 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu)
arm_gen_dynamic_sysreg_xml(cs, 
cs->gdb_num_regs),
"system-registers.xml", 0);

+#ifdef CONFIG_TCG


IIUC tcg_enabled(), this guard shouldn't be necessary; if CONFIG_TCG
is not defined, tcg_enabled() evaluates to 0, and the compiler should
elide the whole block.


IME it's a bit optimistic to assume that the compiler will always
do that, especially with no optimisation enabled.


OK I see, thanks.



Re: [PATCH] ui/gtk: skip refresh if new dmabuf is submitted

2023-07-04 Thread Marc-André Lureau
Hi

On Wed, Jun 28, 2023 at 9:12 PM Dongwon Kim  wrote:

> Skip refresh if a new dmabuf (guest scanout frame) is submitted
> and ready to be drawn because the scanout will be refreshed with
> new frame anyway.
>
> Also, setting scanout mode is better to be done right before
> a draw event is scheduled because the mode can be reset anytime
> after it is set in dpy_gl_scanout_texture by any asynchronous
> dpy_refresh call, which eventually cancels the drawing of the
> guest scanout texture.
>

This is actually quite difficult to analyze. Can you be more explicit? Can
you split the patch and add comments, because the state conditions &
behaviours aren't obvious.

Gerd, any chance you look at it too?


> Cc: Gerd Hoffmann 
> Cc: Marc-André Lureau 
> Cc: Vivek Kasireddy 
> Signed-off-by: Dongwon Kim 
> ---
>  ui/gtk-egl.c | 6 +-
>  ui/gtk-gl-area.c | 6 +-
>  2 files changed, 10 insertions(+), 2 deletions(-)
>
> diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c
> index 19130041bc..62ff6812b6 100644
> --- a/ui/gtk-egl.c
> +++ b/ui/gtk-egl.c
> @@ -143,6 +143,10 @@ void gd_egl_refresh(DisplayChangeListener *dcl)
>  gd_update_monitor_refresh_rate(
>  vc, vc->window ? vc->window : vc->gfx.drawing_area);
>
> +if (vc->gfx.guest_fb.dmabuf &&
> vc->gfx.guest_fb.dmabuf->draw_submitted) {
> +return;
> +}
> +
>  if (!vc->gfx.esurface) {
>  gd_egl_init(vc);
>  if (!vc->gfx.esurface) {
> @@ -236,7 +240,6 @@ void gd_egl_scanout_texture(DisplayChangeListener *dcl,
>  eglMakeCurrent(qemu_egl_display, vc->gfx.esurface,
> vc->gfx.esurface, vc->gfx.ectx);
>
> -gtk_egl_set_scanout_mode(vc, true);
>  egl_fb_setup_for_tex(>gfx.guest_fb, backing_width, backing_height,
>   backing_id, false);
>  }
> @@ -344,6 +347,7 @@ void gd_egl_flush(DisplayChangeListener *dcl,
>  if (vc->gfx.guest_fb.dmabuf &&
> !vc->gfx.guest_fb.dmabuf->draw_submitted) {
>  graphic_hw_gl_block(vc->gfx.dcl.con, true);
>  vc->gfx.guest_fb.dmabuf->draw_submitted = true;
> +gtk_egl_set_scanout_mode(vc, true);
>  gtk_widget_queue_draw_area(area, x, y, w, h);
>  return;
>  }
> diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c
> index c384a1516b..68eff3435b 100644
> --- a/ui/gtk-gl-area.c
> +++ b/ui/gtk-gl-area.c
> @@ -123,6 +123,10 @@ void gd_gl_area_refresh(DisplayChangeListener *dcl)
>
>  gd_update_monitor_refresh_rate(vc, vc->window ? vc->window :
> vc->gfx.drawing_area);
>
> +if (vc->gfx.guest_fb.dmabuf &&
> vc->gfx.guest_fb.dmabuf->draw_submitted) {
> +return;
> +}
> +
>  if (!vc->gfx.gls) {
>  if (!gtk_widget_get_realized(vc->gfx.drawing_area)) {
>  return;
> @@ -261,7 +265,6 @@ void gd_gl_area_scanout_texture(DisplayChangeListener
> *dcl,
>  return;
>  }
>
> -gtk_gl_area_set_scanout_mode(vc, true);
>  egl_fb_setup_for_tex(>gfx.guest_fb, backing_width, backing_height,
>   backing_id, false);
>  }
> @@ -281,6 +284,7 @@ void gd_gl_area_scanout_flush(DisplayChangeListener
> *dcl,
>  if (vc->gfx.guest_fb.dmabuf &&
> !vc->gfx.guest_fb.dmabuf->draw_submitted) {
>  graphic_hw_gl_block(vc->gfx.dcl.con, true);
>  vc->gfx.guest_fb.dmabuf->draw_submitted = true;
> +gtk_gl_area_set_scanout_mode(vc, true);
>  }
>  gtk_gl_area_queue_render(GTK_GL_AREA(vc->gfx.drawing_area));
>  }
> --
> 2.34.1
>
>
>

-- 
Marc-André Lureau


Re: [PATCH v2] vfio: Fix null pointer dereference bug in vfio_bars_finalize()

2023-07-04 Thread Cédric Le Goater

Hello Avihai

On 7/4/23 15:39, Avihai Horon wrote:

vfio_realize() has the following flow:
1. vfio_bars_prepare() -- sets VFIOBAR->size.
2. msix_early_setup().
3. vfio_bars_register() -- allocates VFIOBAR->mr.

After vfio_bars_prepare() is called msix_early_setup() can fail. If it
does fail, vfio_bars_register() is never called and VFIOBAR->mr is not
allocated.

In this case, vfio_bars_finalize() is called as part of the error flow
to free the bars' resources. However, vfio_bars_finalize() calls
object_unparent() for VFIOBAR->mr after checking only VFIOBAR->size, and
thus we get a null pointer dereference.

Fix it by checking VFIOBAR->mr in vfio_bars_finalize().


Did you see the issue by reading the code or did you actually crash
QEMU with a test case ?



Fixes: 89d5202edc50 ("vfio/pci: Allow relocating MSI-X MMIO")
Signed-off-by: Avihai Horon 



Reviewed-by: Cédric Le Goater 

Thanks,

C.



---

  Changes from v1:
  * Assert VFIOBAR->size and set VFIOBAR->mr to NULL to make the code
more accurate. (Philippe)
  * Small reword in the last paragraph of the commit message.

  hw/vfio/pci.c | 4 +++-
  1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index ab6645ba60..bc98791cbb 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -1752,9 +1752,11 @@ static void vfio_bars_finalize(VFIOPCIDevice *vdev)
  
  vfio_bar_quirk_finalize(vdev, i);

  vfio_region_finalize(>region);
-if (bar->size) {
+if (bar->mr) {
+assert(bar->size);
  object_unparent(OBJECT(bar->mr));
  g_free(bar->mr);
+bar->mr = NULL;
  }
  }
  





Re: [PATCH] target/arm: Avoid over-length shift in arm_cpu_sve_finalize() error case

2023-07-04 Thread Philippe Mathieu-Daudé

On 4/7/23 17:43, Peter Maydell wrote:

If you build QEMU with the clang sanitizer enabled, you can see it
fire when running the arm-cpu-features test:

$ QTEST_QEMU_BINARY=./build/arm-clang/qemu-system-aarch64 
./build/arm-clang/tests/qtest/arm-cpu-features
[...]
../../target/arm/cpu64.c:125:19: runtime error: shift exponent 64 is too large 
for 64-bit type 'unsigned long long'
[...]

This happens because the user can specify some incorrect SVE
properties that result in our calculating a max_vq of 0.  We catch
this and error out, but before we do that we calculate

  vq_mask = MAKE_64BIT_MASK(0, max_vq);$

and the MAKE_64BIT_MASK() call is only valid for lengths that are
greater than zero, so we hit the undefined behaviour.


Can we fix it generically?

-- >8 --
--- a/include/qemu/bitops.h
+++ b/include/qemu/bitops.h
@@ -28,3 +28,3 @@
 #define MAKE_64BIT_MASK(shift, length) \
-(((~0ULL) >> (64 - (length))) << (shift))
+((length) ? (((~0ULL) >> (64 - (length))) << (shift)) : 0)

---



Change the logic so that if max_vq is 0 we specifically set vq_mask
to 0 without going via MAKE_64BIT_MASK().  This lets us drop the
max_vq check from the error-exit logic, because if max_vq is 0 then
vq_map must now be 0.

The UB only happens in the case where the user passed us an incorrect
set of SVE properties, so it's not a big problem in practice.

Signed-off-by: Peter Maydell 
---
  target/arm/cpu64.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 6eaf8e32cfa..6012e4ef549 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -122,10 +122,10 @@ void arm_cpu_sve_finalize(ARMCPU *cpu, Error **errp)
  vq = ctz32(tmp) + 1;
  
  max_vq = vq <= ARM_MAX_VQ ? vq - 1 : ARM_MAX_VQ;

-vq_mask = MAKE_64BIT_MASK(0, max_vq);
+vq_mask = max_vq > 0 ? MAKE_64BIT_MASK(0, max_vq) : 0;
  vq_map = vq_supported & ~vq_init & vq_mask;
  
-if (max_vq == 0 || vq_map == 0) {

+if (vq_map == 0) {
  error_setg(errp, "cannot disable sve%d", vq * 128);
  error_append_hint(errp, "Disabling sve%d results in all "
"vector lengths being disabled.\n",


Reviewed-by: Philippe Mathieu-Daudé 




[PATCH v2] i386/xen: consistent locking around Xen singleshot timers

2023-07-04 Thread David Woodhouse
From: David Woodhouse 

Coverity points out (CID 1507534, 1507968) that we sometimes access
env->xen_singleshot_timer_ns under the protection of
env->xen_timers_lock and sometimes not.

This isn't always an issue. There are two modes for the timers; if the
kernel supports the EVTCHN_SEND capability then it handles all the timer
hypercalls and delivery internally, and all we use the field for is to
get/set the timer as part of the vCPU state via an ioctl(). If the
kernel doesn't have that support, then we do all the emulation within
qemu, and *those* are the code paths where we actually care about the
locking.

But it doesn't hurt to be a little bit more consistent and avoid having
to explain *why* it's OK.

Signed-off-by: David Woodhouse 
---
 target/i386/kvm/xen-emu.c | 36 ++--
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/target/i386/kvm/xen-emu.c b/target/i386/kvm/xen-emu.c
index d7c7eb8d9c..9946ff0905 100644
--- a/target/i386/kvm/xen-emu.c
+++ b/target/i386/kvm/xen-emu.c
@@ -43,6 +43,7 @@
 
 static void xen_vcpu_singleshot_timer_event(void *opaque);
 static void xen_vcpu_periodic_timer_event(void *opaque);
+static int vcpuop_stop_singleshot_timer(CPUState *cs);
 
 #ifdef TARGET_X86_64
 #define hypercall_compat32(longmode) (!(longmode))
@@ -466,6 +467,7 @@ void kvm_xen_inject_vcpu_callback_vector(uint32_t vcpu_id, 
int type)
 }
 }
 
+/* Must always be called with xen_timers_lock held */
 static int kvm_xen_set_vcpu_timer(CPUState *cs)
 {
 X86CPU *cpu = X86_CPU(cs);
@@ -483,6 +485,7 @@ static int kvm_xen_set_vcpu_timer(CPUState *cs)
 
 static void do_set_vcpu_timer_virq(CPUState *cs, run_on_cpu_data data)
 {
+QEMU_LOCK_GUARD(_CPU(cs)->env.xen_timers_lock);
 kvm_xen_set_vcpu_timer(cs);
 }
 
@@ -545,7 +548,6 @@ static void do_vcpu_soft_reset(CPUState *cs, 
run_on_cpu_data data)
 env->xen_vcpu_time_info_gpa = INVALID_GPA;
 env->xen_vcpu_runstate_gpa = INVALID_GPA;
 env->xen_vcpu_callback_vector = 0;
-env->xen_singleshot_timer_ns = 0;
 memset(env->xen_virq, 0, sizeof(env->xen_virq));
 
 set_vcpu_info(cs, INVALID_GPA);
@@ -555,8 +557,13 @@ static void do_vcpu_soft_reset(CPUState *cs, 
run_on_cpu_data data)
   INVALID_GPA);
 if (kvm_xen_has_cap(EVTCHN_SEND)) {
 kvm_xen_set_vcpu_callback_vector(cs);
+
+QEMU_LOCK_GUARD(_CPU(cs)->env.xen_timers_lock);
+env->xen_singleshot_timer_ns = 0;
 kvm_xen_set_vcpu_timer(cs);
-}
+} else {
+vcpuop_stop_singleshot_timer(cs);
+};
 
 }
 
@@ -1059,6 +1066,10 @@ static int vcpuop_stop_periodic_timer(CPUState *target)
 return 0;
 }
 
+/*
+ * Userspace handling of timer, for older kernels.
+ * Must always be called with xen_timers_lock held.
+ */
 static int do_set_singleshot_timer(CPUState *cs, uint64_t timeout_abs,
bool future, bool linux_wa)
 {
@@ -1086,12 +1097,8 @@ static int do_set_singleshot_timer(CPUState *cs, 
uint64_t timeout_abs,
 timeout_abs = now + delta;
 }
 
-qemu_mutex_lock(>xen_timers_lock);
-
 timer_mod_ns(env->xen_singleshot_timer, qemu_now + delta);
 env->xen_singleshot_timer_ns = now + delta;
-
-qemu_mutex_unlock(>xen_timers_lock);
 return 0;
 }
 
@@ -1115,6 +1122,7 @@ static int vcpuop_set_singleshot_timer(CPUState *cs, 
uint64_t arg)
 return -EFAULT;
 }
 
+QEMU_LOCK_GUARD(_CPU(cs)->env.xen_timers_lock);
 return do_set_singleshot_timer(cs, sst.timeout_abs_ns,
!!(sst.flags & VCPU_SSHOTTMR_future),
false);
@@ -1141,6 +1149,7 @@ static bool kvm_xen_hcall_set_timer_op(struct 
kvm_xen_exit *exit, X86CPU *cpu,
 if (unlikely(timeout == 0)) {
 err = vcpuop_stop_singleshot_timer(CPU(cpu));
 } else {
+QEMU_LOCK_GUARD(_CPU(cpu)->env.xen_timers_lock);
 err = do_set_singleshot_timer(CPU(cpu), timeout, false, true);
 }
 exit->u.hcall.result = err;
@@ -1826,6 +1835,7 @@ int kvm_put_xen_state(CPUState *cs)
  * If the kernel has EVTCHN_SEND support then it handles timers too,
  * so the timer will be restored by kvm_xen_set_vcpu_timer() below.
  */
+QEMU_LOCK_GUARD(>xen_timers_lock);
 if (env->xen_singleshot_timer_ns) {
 ret = do_set_singleshot_timer(cs, env->xen_singleshot_timer_ns,
 false, false);
@@ -1844,10 +1854,7 @@ int kvm_put_xen_state(CPUState *cs)
 }
 
 if (env->xen_virq[VIRQ_TIMER]) {
-ret = kvm_xen_set_vcpu_timer(cs);
-if (ret < 0) {
-return ret;
-}
+do_set_vcpu_timer_virq(cs, 
RUN_ON_CPU_HOST_INT(env->xen_virq[VIRQ_TIMER]));
 }
 return 0;
 }
@@ -1896,6 +1903,15 @@ int kvm_get_xen_state(CPUState *cs)
 if (ret < 0) {
 return ret;
 }
+
+/*
+ * This locking is fairly pointless, and is here to appease Coverity.
+ 

Re: [PATCH] i386/xen: consistent locking around Xen singleshot timers

2023-07-04 Thread David Woodhouse
On Fri, 2023-06-02 at 17:58 +0100, Peter Maydell wrote:
> On Mon, 22 May 2023 at 19:52, David Woodhouse  wrote:
> > 
> > From: David Woodhouse 
> > 
> > Coverity points out (CID 1507534) that we sometimes access
> > env->xen_singleshot_timer_ns under the protection of
> > env->xen_timers_lock (eg in xen_vcpu_singleshot_timer_event()) and
> > sometimes not (the specific case Coverity complains about is in
> > do_vcpu_soft_reset()).
> > 
> > This isn't strictly an issue. There are two modes for the timers; if
> > the kernel supports the EVTCHN_SEND capability then it handles all the
> > timer hypercalls and delivery internally, and all we need to do is an
> > ioctl to get/set the next timer as part of the vCPU state. If the
> > kernel doesn't have that support, then we do all the emulation within
> > qemu, and *those* are the code paths where we actually care about the
> > locking.
> > 
> > But it doesn't hurt to be a little bit more consistent and avoid having
> > to explain *why* it's OK.
> > 
> > Signed-off-by: David Woodhouse 
> 
> Looking a bit more closely at the Coverity lists, there's also
> CID 1507968 which is for the access to env->xen_singleshot_timer_ns
> in kvm_get_xen_state(), which this patch doesn't touch, I think ?

That one is basically a false positive since it's for the case where
the kernel is handling the timers and all we do is an ioctl to read the
state. Which is *inherently* racy if any other vCPU can be running and
modifying them at the time. And doesn't have anything to race *with* if
not :)

On the other hand, the one in kvm_put_xen_state() which calls
do_set_singleshot_timer() is for the userspace case, and *that* one
probably wants locking. Again in practice it's probably never going to
have anything to race with, but that's a poor excuse. New patch
follows.


smime.p7s
Description: S/MIME cryptographic signature


Re: [PATCH] target/arm: gdbstub: Guard M-profile code with CONFIG_TCG

2023-07-04 Thread Peter Maydell
On Tue, 4 Jul 2023 at 16:21, Philippe Mathieu-Daudé  wrote:
>
> On 28/6/23 18:48, Fabiano Rosas wrote:
> > This code is only relevant when TCG is present in the build. Building
> > with --disable-tcg --enable-xen on an x86 host we get:
> >
> > $ ../configure --target-list=x86_64-softmmu,aarch64-softmmu --disable-tcg 
> > --enable-xen
> > $ make -j$(nproc)
> > ...
> > libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function 
> > `m_sysreg_ptr':
> >   ../target/arm/gdbstub.c:358: undefined reference to `arm_v7m_get_sp_ptr'
> >   ../target/arm/gdbstub.c:361: undefined reference to `arm_v7m_get_sp_ptr'
> >
> > libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function 
> > `arm_gdb_get_m_systemreg':
> > ../target/arm/gdbstub.c:405: undefined reference to `arm_v7m_mrs_control'
> >
> > Signed-off-by: Fabiano Rosas 
> > ---
> > This is a respin of:
> > https://lore.kernel.org/r/20230313151058.19645-5-faro...@suse.de
> > ---
> >   target/arm/gdbstub.c | 4 
> >   1 file changed, 4 insertions(+)
> >
> > diff --git a/target/arm/gdbstub.c b/target/arm/gdbstub.c
> > index 03b17c814f..f421c5d041 100644
> > --- a/target/arm/gdbstub.c
> > +++ b/target/arm/gdbstub.c
> > @@ -324,6 +324,7 @@ static int arm_gen_dynamic_sysreg_xml(CPUState *cs, int 
> > base_reg)
> >   return cpu->dyn_sysreg_xml.num;
> >   }
> >
> > +#ifdef CONFIG_TCG
>
> OK.
>
> >   typedef enum {
> >   M_SYSREG_MSP,
> >   M_SYSREG_PSP,
> > @@ -481,6 +482,7 @@ static int arm_gen_dynamic_m_secextreg_xml(CPUState 
> > *cs, int orig_base_reg)
> >   return cpu->dyn_m_secextreg_xml.num;
> >   }
> >   #endif
> > +#endif /* CONFIG_TCG */
> >
> >   const char *arm_gdb_get_dynamic_xml(CPUState *cs, const char *xmlname)
> >   {
> > @@ -561,6 +563,7 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu)
> >arm_gen_dynamic_sysreg_xml(cs, 
> > cs->gdb_num_regs),
> >"system-registers.xml", 0);
> >
> > +#ifdef CONFIG_TCG
>
> IIUC tcg_enabled(), this guard shouldn't be necessary; if CONFIG_TCG
> is not defined, tcg_enabled() evaluates to 0, and the compiler should
> elide the whole block.

IME it's a bit optimistic to assume that the compiler will always
do that, especially with no optimisation enabled.

thanks
-- PMM



[PATCH] target/arm: Avoid over-length shift in arm_cpu_sve_finalize() error case

2023-07-04 Thread Peter Maydell
If you build QEMU with the clang sanitizer enabled, you can see it
fire when running the arm-cpu-features test:

$ QTEST_QEMU_BINARY=./build/arm-clang/qemu-system-aarch64 
./build/arm-clang/tests/qtest/arm-cpu-features
[...]
../../target/arm/cpu64.c:125:19: runtime error: shift exponent 64 is too large 
for 64-bit type 'unsigned long long'
[...]

This happens because the user can specify some incorrect SVE
properties that result in our calculating a max_vq of 0.  We catch
this and error out, but before we do that we calculate

 vq_mask = MAKE_64BIT_MASK(0, max_vq);$

and the MAKE_64BIT_MASK() call is only valid for lengths that are
greater than zero, so we hit the undefined behaviour.

Change the logic so that if max_vq is 0 we specifically set vq_mask
to 0 without going via MAKE_64BIT_MASK().  This lets us drop the
max_vq check from the error-exit logic, because if max_vq is 0 then
vq_map must now be 0.

The UB only happens in the case where the user passed us an incorrect
set of SVE properties, so it's not a big problem in practice.

Signed-off-by: Peter Maydell 
---
 target/arm/cpu64.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/arm/cpu64.c b/target/arm/cpu64.c
index 6eaf8e32cfa..6012e4ef549 100644
--- a/target/arm/cpu64.c
+++ b/target/arm/cpu64.c
@@ -122,10 +122,10 @@ void arm_cpu_sve_finalize(ARMCPU *cpu, Error **errp)
 vq = ctz32(tmp) + 1;
 
 max_vq = vq <= ARM_MAX_VQ ? vq - 1 : ARM_MAX_VQ;
-vq_mask = MAKE_64BIT_MASK(0, max_vq);
+vq_mask = max_vq > 0 ? MAKE_64BIT_MASK(0, max_vq) : 0;
 vq_map = vq_supported & ~vq_init & vq_mask;
 
-if (max_vq == 0 || vq_map == 0) {
+if (vq_map == 0) {
 error_setg(errp, "cannot disable sve%d", vq * 128);
 error_append_hint(errp, "Disabling sve%d results in all "
   "vector lengths being disabled.\n",
-- 
2.34.1




Re: [PATCH RFC v2 3/4] vdpa: Restore packet receive filtering state relative with _F_CTRL_RX feature

2023-07-04 Thread Eugenio Perez Martin
On Thu, Jun 29, 2023 at 5:26 PM Hawkins Jiawei  wrote:
>
> This patch introduces vhost_vdpa_net_load_rx_mode()
> and vhost_vdpa_net_load_rx() to restore the packet
> receive filtering state in relation to
> VIRTIO_NET_F_CTRL_RX feature at device's startup.
>
> Signed-off-by: Hawkins Jiawei 
> ---
> v2:
>   - avoid sending CVQ command in default state suggested by Eugenio
>
> v1: 
> https://lore.kernel.org/all/86eeddcd6f6b04e5c1e44e901ddea3b1b8b6c183.1687402580.git.yin31...@gmail.com/
>
>  net/vhost-vdpa.c | 104 +++
>  1 file changed, 104 insertions(+)
>
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index cb45c84c88..9d5d88756c 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -792,6 +792,106 @@ static int vhost_vdpa_net_load_offloads(VhostVDPAState 
> *s,
>  return 0;
>  }
>
> +static int vhost_vdpa_net_load_rx_mode(VhostVDPAState *s,
> +   uint8_t cmd,
> +   uint8_t on)
> +{
> +ssize_t dev_written;
> +const struct iovec data = {
> +.iov_base = ,
> +.iov_len = sizeof(on),
> +};
> +dev_written = vhost_vdpa_net_load_cmd(s, VIRTIO_NET_CTRL_RX,
> +  cmd, , 1);
> +if (unlikely(dev_written < 0)) {
> +return dev_written;
> +}
> +if (*s->status != VIRTIO_NET_OK) {
> +return -EINVAL;
> +}
> +
> +return 0;
> +}
> +
> +static int vhost_vdpa_net_load_rx(VhostVDPAState *s,
> +  const VirtIONet *n)
> +{
> +uint8_t on;
> +int r;
> +
> +if (virtio_vdev_has_feature(>parent_obj, VIRTIO_NET_F_CTRL_RX)) {

Also suggesting early returns here.

> +/* Load the promiscuous mode */
> +if (n->mac_table.uni_overflow) {
> +/*
> + * According to VirtIO standard, "Since there are no guarantees,
> + * it can use a hash filter or silently switch to
> + * allmulti or promiscuous mode if it is given too many 
> addresses."
> + *
> + * QEMU ignores non-multicast(unicast) MAC addresses and
> + * marks `uni_overflow` for the device internal state
> + * if guest sets too many non-multicast(unicast) MAC addresses.
> + * Therefore, we should turn promiscuous mode on in this case.
> + */
> +on = 1;
> +} else {
> +on = n->promisc;
> +}

I think we can remove the "on" variable and just do:

/*
 * According to ...
 */
if (n->mac_table.uni_overflow || n->promisc) {
  r = vhost_vdpa_net_load_rx_mode(s, VIRTIO_NET_CTRL_RX_PROMISC, on);
  if (r < 0) {
return r;
  }
---

And the equivalent for multicast.

Would that make sense?

Thanks!

> +if (on != 1) {
> +/*
> + * According to virtio_net_reset(), device turns promiscuous 
> mode on
> + * by default.
> + *
> + * Therefore, there is no need to send this CVQ command if the
> + * driver also sets promiscuous mode on, which aligns with
> + * the device's defaults.
> + *
> + * Note that the device's defaults can mismatch the driver's
> + * configuration only at live migration.
> + */
> +r = vhost_vdpa_net_load_rx_mode(s, VIRTIO_NET_CTRL_RX_PROMISC, 
> on);
> +if (r < 0) {
> +return r;
> +}
> +}
> +
> +/* Load the all-multicast mode */
> +if (n->mac_table.multi_overflow) {
> +/*
> + * According to VirtIO standard, "Since there are no guarantees,
> + * it can use a hash filter or silently switch to
> + * allmulti or promiscuous mode if it is given too many 
> addresses."
> + *
> + * QEMU ignores multicast MAC addresses and
> + * marks `multi_overflow` for the device internal state
> + * if guest sets too many multicast MAC addresses.
> + * Therefore, we should turn all-multicast mode on in this case.
> + */
> +on = 1;
> +} else {
> +on = n->allmulti;
> +}
> +if (on != 0) {
> +/*
> + * According to virtio_net_reset(), device turns all-multicast 
> mode
> + * off by default.
> + *
> + * Therefore, there is no need to send this CVQ command if the
> + * driver also sets all-multicast mode off, which aligns with
> + * the device's defaults.
> + *
> + * Note that the device's defaults can mismatch the driver's
> + * configuration only at live migration.
> + */
> +r = vhost_vdpa_net_load_rx_mode(s, VIRTIO_NET_CTRL_RX_ALLMULTI, 
> on);
> +if (r < 0) {
> +return r;
> +}
> +}
> +}
> +
> +  

[PULL 1/1] block/blkio: fix module_block.py parsing

2023-07-04 Thread Stefan Hajnoczi
When QEMU is built with --enable-modules, the module_block.py script
parses block/*.c to find block drivers that are built as modules. The
script generates a table of block drivers called block_driver_modules[].
This table is used for block driver module loading.

The blkio.c driver uses macros to define its BlockDriver structs. This
was done to avoid code duplication but the module_block.py script is
unable to parse the macro. The result is that libblkio-based block
drivers can be built as modules but will not be found at runtime.

One fix is to make the module_block.py script or build system fancier so
it can parse C macros (e.g. by parsing the preprocessed source code). I
chose not to do this because it raises the complexity of the build,
making future issues harder to debug.

Keep things simple: use the macro to avoid duplicating BlockDriver
function pointers but define .format_name and .protocol_name manually
for each BlockDriver. This way the module_block.py is able to parse the
code.

Also get rid of the block driver name macros (e.g. DRIVER_IO_URING)
because module_block.py cannot parse them either.

Fixes: fd66dbd424f5 ("blkio: add libblkio block driver")
Reported-by: Qing Wang 
Signed-off-by: Stefan Hajnoczi 
Reviewed-by: Stefano Garzarella 
Message-id: 20230704123436.187761-1-stefa...@redhat.com
Cc: Stefano Garzarella 
Signed-off-by: Stefan Hajnoczi 
---
 block/blkio.c | 108 ++
 1 file changed, 56 insertions(+), 52 deletions(-)

diff --git a/block/blkio.c b/block/blkio.c
index 527323d625..1798648134 100644
--- a/block/blkio.c
+++ b/block/blkio.c
@@ -22,16 +22,6 @@
 
 #include "block/block-io.h"
 
-/*
- * Keep the QEMU BlockDriver names identical to the libblkio driver names.
- * Using macros instead of typing out the string literals avoids typos.
- */
-#define DRIVER_IO_URING "io_uring"
-#define DRIVER_NVME_IO_URING "nvme-io_uring"
-#define DRIVER_VIRTIO_BLK_VFIO_PCI "virtio-blk-vfio-pci"
-#define DRIVER_VIRTIO_BLK_VHOST_USER "virtio-blk-vhost-user"
-#define DRIVER_VIRTIO_BLK_VHOST_VDPA "virtio-blk-vhost-vdpa"
-
 /*
  * Allocated bounce buffers are kept in a list sorted by buffer address.
  */
@@ -744,15 +734,15 @@ static int blkio_file_open(BlockDriverState *bs, QDict 
*options, int flags,
 return ret;
 }
 
-if (strcmp(blkio_driver, DRIVER_IO_URING) == 0) {
+if (strcmp(blkio_driver, "io_uring") == 0) {
 ret = blkio_io_uring_open(bs, options, flags, errp);
-} else if (strcmp(blkio_driver, DRIVER_NVME_IO_URING) == 0) {
+} else if (strcmp(blkio_driver, "nvme-io_uring") == 0) {
 ret = blkio_nvme_io_uring(bs, options, flags, errp);
-} else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VFIO_PCI) == 0) {
+} else if (strcmp(blkio_driver, "virtio-blk-vfio-pci") == 0) {
 ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
-} else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_USER) == 0) {
+} else if (strcmp(blkio_driver, "virtio-blk-vhost-user") == 0) {
 ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
-} else if (strcmp(blkio_driver, DRIVER_VIRTIO_BLK_VHOST_VDPA) == 0) {
+} else if (strcmp(blkio_driver, "virtio-blk-vhost-vdpa") == 0) {
 ret = blkio_virtio_blk_common_open(bs, options, flags, errp);
 } else {
 g_assert_not_reached();
@@ -1028,49 +1018,63 @@ static void blkio_refresh_limits(BlockDriverState *bs, 
Error **errp)
  * - truncate
  */
 
-#define BLKIO_DRIVER(name, ...) \
-{ \
-.format_name = name, \
-.protocol_name   = name, \
-.instance_size   = sizeof(BDRVBlkioState), \
-.bdrv_file_open  = blkio_file_open, \
-.bdrv_close  = blkio_close, \
-.bdrv_co_getlength   = blkio_co_getlength, \
-.bdrv_co_truncate= blkio_truncate, \
-.bdrv_co_get_info= blkio_co_get_info, \
-.bdrv_attach_aio_context = blkio_attach_aio_context, \
-.bdrv_detach_aio_context = blkio_detach_aio_context, \
-.bdrv_co_pdiscard= blkio_co_pdiscard, \
-.bdrv_co_preadv  = blkio_co_preadv, \
-.bdrv_co_pwritev = blkio_co_pwritev, \
-.bdrv_co_flush_to_disk   = blkio_co_flush, \
-.bdrv_co_pwrite_zeroes   = blkio_co_pwrite_zeroes, \
-.bdrv_refresh_limits = blkio_refresh_limits, \
-.bdrv_register_buf   = blkio_register_buf, \
-.bdrv_unregister_buf = blkio_unregister_buf, \
-__VA_ARGS__ \
-}
+/*
+ * Do not include .format_name and .protocol_name because module_block.py
+ * does not parse macros in the source code.
+ */
+#define BLKIO_DRIVER_COMMON \
+.instance_size   = sizeof(BDRVBlkioState), \
+.bdrv_file_open  = blkio_file_open, \
+.bdrv_close  = blkio_close, \
+.bdrv_co_getlength   = blkio_co_getlength, \
+.bdrv_co_truncate= blkio_truncate, \
+

[PULL 0/1] Block patches

2023-07-04 Thread Stefan Hajnoczi
The following changes since commit d145c0da22cde391d8c6672d33146ce306e8bf75:

  Merge tag 'pull-tcg-20230701' of https://gitlab.com/rth7680/qemu into staging 
(2023-07-01 08:55:37 +0200)

are available in the Git repository at:

  https://gitlab.com/stefanha/qemu.git tags/block-pull-request

for you to fetch changes up to c21eae1ccc782440f320accb6f90c66cb8f45ee9:

  block/blkio: fix module_block.py parsing (2023-07-04 17:28:25 +0200)


Pull request

Fix --enable-modules with the blkio block driver.



Stefan Hajnoczi (1):
  block/blkio: fix module_block.py parsing

 block/blkio.c | 108 ++
 1 file changed, 56 insertions(+), 52 deletions(-)

-- 
2.40.1




Re: [virtio-dev] [RFC PATCH] docs/interop: define STANDALONE protocol feature for vhost-user

2023-07-04 Thread Alex Bennée


Stefano Garzarella  writes:

> On Tue, Jul 04, 2023 at 01:36:00PM +0100, Alex Bennée wrote:
>>Currently QEMU has to know some details about the back-end to be able
>>to setup the guest. While various parts of the setup can be delegated
>>to the backend (for example config handling) this is a very piecemeal
>>approach.
>>
>>This patch suggests a new feature flag (VHOST_USER_PROTOCOL_F_STANDALONE)
>>which the back-end can advertise which allows a probe message to be
>>sent to get all the details QEMU needs to know in one message.
>>
>>Signed-off-by: Alex Bennée 
>>
>>---
>>Initial RFC for discussion. I intend to prototype this work with QEMU
>>and one of the rust-vmm vhost-user daemons.
>
> Thanks for starting this discussion!
>
> I'm comparing with vhost-vdpa IOCTLs, so my questions may be
> superficial, but they help me understand the differences.

I did have a quick read-through to get a handle on vhost-vdpa but the
docs are fairly empty. However I see there is more detail in the linux
commit so after looking at that I do wonder:

 * The kernel commit defines a subset of VIRTIO_F feature flags. Should
   we do the same for this interface?

 * The VDPA GET/SET STATUS and GET/SET CONFIG ioctls are already covered
   by the equivalent VHOST_USER messages?

>>---
>> docs/interop/vhost-user.rst | 37 +
>> hw/virtio/vhost-user.c  |  8 
>> 2 files changed, 45 insertions(+)
>>
>>diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
>>index 5a070adbc1..85b1b1583a 100644
>>--- a/docs/interop/vhost-user.rst
>>+++ b/docs/interop/vhost-user.rst
>>@@ -275,6 +275,21 @@ Inflight description
>>
>> :queue size: a 16-bit size of virtqueues
>>
>>+Backend specifications
>>+^^
>>+
>>++---+-+++
>>+| device id | config size |   min_vqs  |   max_vqs  |
>>++---+-+++
>>+
>>+:device id: a 32-bit value holding the VirtIO device ID
>>+
>>+:config size: a 32-bit value holding the config size (see 
>>``VHOST_USER_GET_CONFIG``)
>>+
>>+:min_vqs: a 32-bit value holding the minimum number of vqs supported
>
> Why do we need the minimum?

We need to know the minimum number because some devices have fixed VQs
that must be present.

>
>>+
>>+:max_vqs: a 32-bit value holding the maximum number of vqs supported, must 
>>be >= min_vqs
>
> Is this overlap with VHOST_USER_GET_QUEUE_NUM?

Yes it does and I considered implementing a bunch of messages to fill in
around what we already have. However that seemed like it would add a
bunch of complexity to the interface when we could get all the initial
data in one go.

>
>>+
>> C structure
>> ---
>>
>>@@ -296,6 +311,7 @@ In QEMU the vhost-user message is implemented with the 
>>following struct:
>>   VhostUserConfig config;
>>   VhostUserVringArea area;
>>   VhostUserInflight inflight;
>>+  VhostUserBackendSpecs specs;
>>   };
>>   } QEMU_PACKED VhostUserMsg;
>>
>>@@ -316,6 +332,7 @@ replies. Here is a list of the ones that do:
>> * ``VHOST_USER_GET_VRING_BASE``
>> * ``VHOST_USER_SET_LOG_BASE`` (if ``VHOST_USER_PROTOCOL_F_LOG_SHMFD``)
>> * ``VHOST_USER_GET_INFLIGHT_FD`` (if 
>> ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``)
>>+* ``VHOST_USER_GET_BACKEND_SPECS`` (if ``VHOST_USER_PROTOCOL_F_STANDALONE``)
>>
>> .. seealso::
>>
>>@@ -885,6 +902,13 @@ Protocol features
>>   #define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS  15
>>   #define VHOST_USER_PROTOCOL_F_STATUS   16
>>   #define VHOST_USER_PROTOCOL_F_XEN_MMAP 17
>>+  #define VHOST_USER_PROTOCOL_F_STANDALONE   18
>>+
>>+Some features are only valid in the presence of other supporting
>>+features. In the case of ``VHOST_USER_PROTOCOL_F_STANDALONE`` the
>>+backend must also support ``VHOST_USER_PROTOCOL_F_CONFIG`` and
>>+``VHOST_USER_PROTOCOL_F_STATUS``.
>>+
>
> What about adding a new section where we will describe what we mean
> with "standalone" devices?
>
> For example that the entire virtio device is emulated in the backend,
> etc.
>
> By the way, I was thinking more about F_FULL_DEVICE, but I'm not good
> with names, so I'll just throw out an idea :-)

Naming things is hard ;-)

I could add a new section although AIUI there is nothing really in
existing daemons which is split between the front-end and back-end. The
stubs are basically boilerplate and ensure DT/PCIe hubs make the device
appear so once things are discovered QEMU never really gets involved
aside from being a dumb relay.

>
> Thanks,
> Stefano
>
>>
>> Front-end message types
>> ---
>>@@ -1440,6 +1464,19 @@ Front-end message types
>>   query the back-end for its device status as defined in the Virtio
>>   specification.
>>
>>+``VHOST_USER_GET_BACKEND_SPECS``
>>+  :id: 41
>>+  :request payload: N/A
>>+  :reply payload: ``Backend specifications``
>>+
>>+  When the ``VHOST_USER_PROTOCOL_F_STANDALONE`` protocol feature has been

Re: [PATCH] target/arm: gdbstub: Guard M-profile code with CONFIG_TCG

2023-07-04 Thread Philippe Mathieu-Daudé

On 28/6/23 18:48, Fabiano Rosas wrote:

This code is only relevant when TCG is present in the build. Building
with --disable-tcg --enable-xen on an x86 host we get:

$ ../configure --target-list=x86_64-softmmu,aarch64-softmmu --disable-tcg 
--enable-xen
$ make -j$(nproc)
...
libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function `m_sysreg_ptr':
  ../target/arm/gdbstub.c:358: undefined reference to `arm_v7m_get_sp_ptr'
  ../target/arm/gdbstub.c:361: undefined reference to `arm_v7m_get_sp_ptr'

libqemu-aarch64-softmmu.fa.p/target_arm_gdbstub.c.o: in function 
`arm_gdb_get_m_systemreg':
../target/arm/gdbstub.c:405: undefined reference to `arm_v7m_mrs_control'

Signed-off-by: Fabiano Rosas 
---
This is a respin of:
https://lore.kernel.org/r/20230313151058.19645-5-faro...@suse.de
---
  target/arm/gdbstub.c | 4 
  1 file changed, 4 insertions(+)

diff --git a/target/arm/gdbstub.c b/target/arm/gdbstub.c
index 03b17c814f..f421c5d041 100644
--- a/target/arm/gdbstub.c
+++ b/target/arm/gdbstub.c
@@ -324,6 +324,7 @@ static int arm_gen_dynamic_sysreg_xml(CPUState *cs, int 
base_reg)
  return cpu->dyn_sysreg_xml.num;
  }
  
+#ifdef CONFIG_TCG


OK.


  typedef enum {
  M_SYSREG_MSP,
  M_SYSREG_PSP,
@@ -481,6 +482,7 @@ static int arm_gen_dynamic_m_secextreg_xml(CPUState *cs, 
int orig_base_reg)
  return cpu->dyn_m_secextreg_xml.num;
  }
  #endif
+#endif /* CONFIG_TCG */
  
  const char *arm_gdb_get_dynamic_xml(CPUState *cs, const char *xmlname)

  {
@@ -561,6 +563,7 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu)
   arm_gen_dynamic_sysreg_xml(cs, cs->gdb_num_regs),
   "system-registers.xml", 0);
  
+#ifdef CONFIG_TCG


IIUC tcg_enabled(), this guard shouldn't be necessary; if CONFIG_TCG
is not defined, tcg_enabled() evaluates to 0, and the compiler should
elide the whole block.


  if (arm_feature(env, ARM_FEATURE_M) && tcg_enabled()) {
  gdb_register_coprocessor(cs,
  arm_gdb_get_m_systemreg, arm_gdb_set_m_systemreg,
@@ -575,4 +578,5 @@ void arm_cpu_register_gdb_regs_for_features(ARMCPU *cpu)
  }
  #endif
  }
+#endif /* CONFIG_TCG */
  }





Re: [PATCH] ui/gtk: Make sure the right EGL context is currently bound

2023-07-04 Thread Marc-André Lureau
On Wed, Jun 28, 2023 at 9:36 PM Dongwon Kim  wrote:

> Observed a wrong context is bound when changing the scanout mode.
> To prevent problem, it is needed to make sure to bind the right
> context in gtk_egl_set_scanout_mode/gtk_gl_area_set_scanout_mode
> as well as unbind one in the end of gd_egl_update/gd_gl_area_update.
>
> Cc: Gerd Hoffmann 
> Cc: Marc-André Lureau 
> Cc: Vivek Kasireddy 
> Signed-off-by: Dongwon Kim 
>

Reviewed-by: Marc-André Lureau 


> ---
>  ui/gtk-egl.c | 4 
>  ui/gtk-gl-area.c | 2 ++
>  2 files changed, 6 insertions(+)
>
> diff --git a/ui/gtk-egl.c b/ui/gtk-egl.c
> index 19130041bc..79f9f334f2 100644
> --- a/ui/gtk-egl.c
> +++ b/ui/gtk-egl.c
> @@ -31,6 +31,8 @@ static void gtk_egl_set_scanout_mode(VirtualConsole *vc,
> bool scanout)
>
>  vc->gfx.scanout_mode = scanout;
>  if (!vc->gfx.scanout_mode) {
> +eglMakeCurrent(qemu_egl_display, vc->gfx.esurface,
> +   vc->gfx.esurface, vc->gfx.ectx);
>  egl_fb_destroy(&vc->gfx.guest_fb);
>  if (vc->gfx.surface) {
>  surface_gl_destroy_texture(vc->gfx.gls, vc->gfx.ds);
> @@ -134,6 +136,8 @@ void gd_egl_update(DisplayChangeListener *dcl,
> vc->gfx.esurface, vc->gfx.ectx);
>  surface_gl_update_texture(vc->gfx.gls, vc->gfx.ds, x, y, w, h);
>  vc->gfx.glupdates++;
> +eglMakeCurrent(qemu_egl_display, EGL_NO_SURFACE,
> +   EGL_NO_SURFACE, EGL_NO_CONTEXT);
>  }
>
>  void gd_egl_refresh(DisplayChangeListener *dcl)
> diff --git a/ui/gtk-gl-area.c b/ui/gtk-gl-area.c
> index c384a1516b..285e661071 100644
> --- a/ui/gtk-gl-area.c
> +++ b/ui/gtk-gl-area.c
> @@ -26,6 +26,7 @@ static void gtk_gl_area_set_scanout_mode(VirtualConsole
> *vc, bool scanout)
>
>  vc->gfx.scanout_mode = scanout;
>  if (!vc->gfx.scanout_mode) {
> +gtk_gl_area_make_current(GTK_GL_AREA(vc->gfx.drawing_area));
>  egl_fb_destroy(&vc->gfx.guest_fb);
>  if (vc->gfx.surface) {
>  surface_gl_destroy_texture(vc->gfx.gls, vc->gfx.ds);
> @@ -115,6 +116,7 @@ void gd_gl_area_update(DisplayChangeListener *dcl,
>  gtk_gl_area_make_current(GTK_GL_AREA(vc->gfx.drawing_area));
>  surface_gl_update_texture(vc->gfx.gls, vc->gfx.ds, x, y, w, h);
>  vc->gfx.glupdates++;
> +gdk_gl_context_clear_current();
>  }
>
>  void gd_gl_area_refresh(DisplayChangeListener *dcl)
> --
> 2.34.1
>
>
>

-- 
Marc-André Lureau


[PATCH] virtio-blk: fix host notifier issues during dataplane start/stop

2023-07-04 Thread Stefan Hajnoczi
The main loop thread can consume 100% CPU when using --device
virtio-blk-pci,iothread=<iothread>. ppoll() constantly returns but
reading virtqueue host notifiers fails with EAGAIN. The file descriptors
are stale and remain registered with the AioContext because of bugs in
the virtio-blk dataplane start/stop code.

The problem is that the dataplane start/stop code involves drain
operations, which call virtio_blk_drained_begin() and
virtio_blk_drained_end() at points where the host notifier is not
operational:
- In virtio_blk_data_plane_start(), blk_set_aio_context() drains after
  vblk->dataplane_started has been set to true but the host notifier has
  not been attached yet.
- In virtio_blk_data_plane_stop(), blk_drain() and blk_set_aio_context()
  drain after the host notifier has already been detached but with
  vblk->dataplane_started still set to true.

I would like to simplify ->ioeventfd_start/stop() to avoid interactions
with drain entirely, but couldn't find a way to do that. Instead, this
patch accepts the fragile nature of the code and reorders it so that
vblk->dataplane_started is false during drain operations. This way the
virtio_blk_drained_begin() and virtio_blk_drained_end() calls don't
touch the host notifier. The result is that
virtio_blk_data_plane_start() and virtio_blk_data_plane_stop() have
complete control over the host notifier and stale file descriptors are
no longer left in the AioContext.

This patch fixes the 100% CPU consumption in the main loop thread and
correctly moves host notifier processing to the IOThread.

Fixes: 1665d9326fd2 ("virtio-blk: implement BlockDevOps->drained_begin()")
Reported-by: Lukáš Doktor 
Signed-off-by: Stefan Hajnoczi 
---
 hw/block/dataplane/virtio-blk.c | 67 +++--
 1 file changed, 38 insertions(+), 29 deletions(-)

diff --git a/hw/block/dataplane/virtio-blk.c b/hw/block/dataplane/virtio-blk.c
index c227b39408..da36fcfd0b 100644
--- a/hw/block/dataplane/virtio-blk.c
+++ b/hw/block/dataplane/virtio-blk.c
@@ -219,13 +219,6 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
 
 memory_region_transaction_commit();
 
-/*
- * These fields are visible to the IOThread so we rely on implicit barriers
- * in aio_context_acquire() on the write side and aio_notify_accept() on
- * the read side.
- */
-s->starting = false;
-vblk->dataplane_started = true;
 trace_virtio_blk_data_plane_start(s);
 
 old_context = blk_get_aio_context(s->conf->conf.blk);
@@ -244,6 +237,18 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
 event_notifier_set(virtio_queue_get_host_notifier(vq));
 }
 
+/*
+ * These fields must be visible to the IOThread when it processes the
+ * virtqueue, otherwise it will think dataplane has not started yet.
+ *
+ * Make sure ->dataplane_started is false when blk_set_aio_context() is
+ * called above so that draining does not cause the host notifier to be
+ * detached/attached prematurely.
+ */
+s->starting = false;
+vblk->dataplane_started = true;
+smp_wmb(); /* paired with aio_notify_accept() on the read side */
+
 /* Get this show started by hooking up our callbacks */
 if (!blk_in_drain(s->conf->conf.blk)) {
 aio_context_acquire(s->ctx);
@@ -273,7 +278,6 @@ int virtio_blk_data_plane_start(VirtIODevice *vdev)
   fail_guest_notifiers:
 vblk->dataplane_disabled = true;
 s->starting = false;
-vblk->dataplane_started = true;
 return -ENOSYS;
 }
 
@@ -327,6 +331,32 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
 aio_wait_bh_oneshot(s->ctx, virtio_blk_data_plane_stop_bh, s);
 }
 
+/*
+ * Batch all the host notifiers in a single transaction to avoid
+ * quadratic time complexity in address_space_update_ioeventfds().
+ */
+memory_region_transaction_begin();
+
+for (i = 0; i < nvqs; i++) {
+virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
+}
+
+/*
+ * The transaction expects the ioeventfds to be open when it
+ * commits. Do it now, before the cleanup loop.
+ */
+memory_region_transaction_commit();
+
+for (i = 0; i < nvqs; i++) {
+virtio_bus_cleanup_host_notifier(VIRTIO_BUS(qbus), i);
+}
+
+/*
+ * Set ->dataplane_started to false before draining so that host notifiers
+ * are not detached/attached anymore.
+ */
+vblk->dataplane_started = false;
+
 aio_context_acquire(s->ctx);
 
 /* Wait for virtio_blk_dma_restart_bh() and in flight I/O to complete */
@@ -340,32 +370,11 @@ void virtio_blk_data_plane_stop(VirtIODevice *vdev)
 
 aio_context_release(s->ctx);
 
-/*
- * Batch all the host notifiers in a single transaction to avoid
- * quadratic time complexity in address_space_update_ioeventfds().
- */
-memory_region_transaction_begin();
-
-for (i = 0; i < nvqs; i++) {
-virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
-}
-
-/*
- * 

Re: [PATCH v4 4/6] target/riscv: Create an KVM AIA irqchip

2023-07-04 Thread Andrew Jones
On Wed, Jun 21, 2023 at 02:54:54PM +, Yong-Xuan Wang wrote:
> implement a function to create an KVM AIA chip

This is a bit too terse. We should at least summarize the KVM API this
uses.

> 
> Signed-off-by: Yong-Xuan Wang 
> Reviewed-by: Jim Shu 
> ---
>  target/riscv/kvm.c   | 163 +++
>  target/riscv/kvm_riscv.h |   6 ++
>  2 files changed, 169 insertions(+)
> 
> diff --git a/target/riscv/kvm.c b/target/riscv/kvm.c
> index eb469e8ca5..3dd8467031 100644
> --- a/target/riscv/kvm.c
> +++ b/target/riscv/kvm.c
> @@ -34,6 +34,7 @@
>  #include "exec/address-spaces.h"
>  #include "hw/boards.h"
>  #include "hw/irq.h"
> +#include "hw/intc/riscv_imsic.h"
>  #include "qemu/log.h"
>  #include "hw/loader.h"
>  #include "kvm_riscv.h"
> @@ -41,6 +42,7 @@
>  #include "chardev/char-fe.h"
>  #include "migration/migration.h"
>  #include "sysemu/runstate.h"
> +#include "hw/riscv/numa.h"
>  
>  static uint64_t kvm_riscv_reg_id(CPURISCVState *env, uint64_t type,
>   uint64_t idx)
> @@ -548,3 +550,164 @@ bool kvm_arch_cpu_check_are_resettable(void)
>  void kvm_arch_accel_class_init(ObjectClass *oc)
>  {
>  }
> +
> +char *kvm_aia_mode_str(uint64_t aia_mode)
> +{
> +const char *val;
> +
> +switch (aia_mode) {
> +case KVM_DEV_RISCV_AIA_MODE_EMUL:
> +val = "emul";
> +break;
> +case KVM_DEV_RISCV_AIA_MODE_HWACCEL:
> +val = "hwaccel";
> +break;
> +case KVM_DEV_RISCV_AIA_MODE_AUTO:
> +default:
> +val = "auto";
> +break;
> +};
> +
> +return g_strdup(val);

There's no need to duplicate statically allocated strings unless they need
to be manipulated. These strings do not, so this should just be

 const char *kvm_aia_mode_str(uint64_t aia_mode)
 {
switch (aia_mode) {
case KVM_DEV_RISCV_AIA_MODE_EMUL:
return "emul";
...

or even just an array

 const char *kvm_aia_mode_str[] = { "emul", "hwaccel", "auto" };

> +}
> +
> +void kvm_riscv_aia_create(MachineState *machine,
> +  uint64_t aia_mode, uint64_t group_shift,
> +  uint64_t aia_irq_num, uint64_t aia_msi_num,
> +  uint64_t aplic_base, uint64_t imsic_base,
> +  uint64_t guest_num)
> +{
> +int ret, i;
> +int aia_fd = -1;
> +uint64_t default_aia_mode;
> +uint64_t socket_count = riscv_socket_count(machine);
> +uint64_t max_hart_per_socket = 0;
> +uint64_t socket, base_hart, hart_count, socket_imsic_base, imsic_addr;
> +uint64_t socket_bits, hart_bits, guest_bits;
> +
> +aia_fd = kvm_create_device(kvm_state, KVM_DEV_TYPE_RISCV_AIA, false);
> +
> +if (aia_fd < 0) {
> +error_report("Unable to create in-kernel irqchip");
> +exit(1);
> +}
> +

For all the "fail to..." error messages below I would change them to
"failed to..."

> +ret = kvm_device_access(aia_fd, KVM_DEV_RISCV_AIA_GRP_CONFIG,
> +KVM_DEV_RISCV_AIA_CONFIG_MODE,
> +&default_aia_mode, false, NULL);
> +if (ret < 0) {
> +error_report("KVM AIA: fail to get current KVM AIA mode");
> +exit(1);
> +}
> +qemu_log("KVM AIA: default mode is %s\n",
> + kvm_aia_mode_str(default_aia_mode));
> +
> +if (default_aia_mode != aia_mode) {
> +ret = kvm_device_access(aia_fd, KVM_DEV_RISCV_AIA_GRP_CONFIG,
> +KVM_DEV_RISCV_AIA_CONFIG_MODE,
> +&aia_mode, true, NULL);
> +if (ret < 0)
> +warn_report("KVM AIA: fail to set KVM AIA mode");
> +else
> +qemu_log("KVM AIA: set current mode to %s\n",
> + kvm_aia_mode_str(aia_mode));
> +}
> +
> +ret = kvm_device_access(aia_fd, KVM_DEV_RISCV_AIA_GRP_CONFIG,
> +KVM_DEV_RISCV_AIA_CONFIG_SRCS,
> +&aia_irq_num, true, NULL);
> +if (ret < 0) {
> +error_report("KVM AIA: fail to set number of input irq lines");
> +exit(1);
> +}
> +
> +ret = kvm_device_access(aia_fd, KVM_DEV_RISCV_AIA_GRP_CONFIG,
> +KVM_DEV_RISCV_AIA_CONFIG_IDS,
> +&aia_msi_num, true, NULL);
> +if (ret < 0) {
> +error_report("KVM AIA: fail to set number of msi");
> +exit(1);
> +}
> +
> +socket_bits = find_last_bit(&socket_count, BITS_PER_LONG) + 1;
> +ret = kvm_device_access(aia_fd, KVM_DEV_RISCV_AIA_GRP_CONFIG,
> +KVM_DEV_RISCV_AIA_CONFIG_GROUP_BITS,
> +&socket_bits, true, NULL);
> +if (ret < 0) {
> +error_report("KVM AIA: fail to set group_bits");
> +exit(1);
> +}
> +
> +ret = kvm_device_access(aia_fd, KVM_DEV_RISCV_AIA_GRP_CONFIG,
> +KVM_DEV_RISCV_AIA_CONFIG_GROUP_SHIFT,
> +&group_shift, true, NULL);
> +if (ret < 0) {
> +   

[PATCH] i386/xen: fix off-by-one in xen_evtchn_set_gsi()

2023-07-04 Thread Woodhouse, David
Coverity points out (CID 1508128) a bounds checking error. We need to check
for gsi >= IOAPIC_NUM_PINS, not just greater-than.

Also fix up an assert() that has the same problem, that Coverity didn't see.

Signed-off-by: David Woodhouse 
---
 hw/i386/kvm/xen_evtchn.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/i386/kvm/xen_evtchn.c b/hw/i386/kvm/xen_evtchn.c
index 3d810dbd59..0e9c108614 100644
--- a/hw/i386/kvm/xen_evtchn.c
+++ b/hw/i386/kvm/xen_evtchn.c
@@ -1587,7 +1587,7 @@ static int allocate_pirq(XenEvtchnState *s, int type, int 
gsi)
  found:
 pirq_inuse_word(s, pirq) |= pirq_inuse_bit(pirq);
 if (gsi >= 0) {
-assert(gsi <= IOAPIC_NUM_PINS);
+assert(gsi < IOAPIC_NUM_PINS);
 s->gsi_pirq[gsi] = pirq;
 }
 s->pirq[pirq].gsi = gsi;
@@ -1601,7 +1601,7 @@ bool xen_evtchn_set_gsi(int gsi, int level)
 
 assert(qemu_mutex_iothread_locked());
 
-if (!s || gsi < 0 || gsi > IOAPIC_NUM_PINS) {
+if (!s || gsi < 0 || gsi >= IOAPIC_NUM_PINS) {
 return false;
 }
 
-- 
2.34.1




smime.p7s
Description: S/MIME cryptographic signature



Amazon Development Centre (London) Ltd. Registered in England and Wales with 
registration number 04543232 with its registered office at 1 Principal Place, 
Worship Street, London EC2A 2FA, United Kingdom.




Re: [PATCH] virtio-gpu: do not replace surface when scanout is disabled

2023-07-04 Thread Marc-André Lureau
Hi

On Wed, Jun 28, 2023 at 12:32 AM Dongwon Kim  wrote:

> Surface is replaced with a place holder whenever the surface res
> is unreferenced by the guest message. With this logic, there is
> very frequent switching between guest display and the place holder
> image, which is looking like a flickering display if the guest driver
> is designed to unref the current scanout resource before sending out
> a new scanout resource. So it is better to leave the current scanout
> image until there is a new one flushed by the guest.
>
> Cc: Gerd Hoffmann 
> Cc: Marc-André Lureau 
> Cc: Vivek Kasireddy 
> Signed-off-by: Dongwon Kim 
>

Why is the driver not setting a different scanout before destroying the
resource?

I think it's wrong to not replace the surface, as the associated scanout
resource may be destroyed or explicitly disabled for various purposes, and
we don't want to display garbage either.

---
>  hw/display/virtio-gpu.c | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/hw/display/virtio-gpu.c b/hw/display/virtio-gpu.c
> index 66cddd94d9..9d3e922c8f 100644
> --- a/hw/display/virtio-gpu.c
> +++ b/hw/display/virtio-gpu.c
> @@ -387,7 +387,6 @@ static void virtio_gpu_disable_scanout(VirtIOGPU *g,
> int scanout_id)
>  res->scanout_bitmask &= ~(1 << scanout_id);
>  }
>
> -dpy_gfx_replace_surface(scanout->con, NULL);
>  scanout->resource_id = 0;
>  scanout->ds = NULL;
>  scanout->width = 0;
> --
> 2.34.1
>
>
>

-- 
Marc-André Lureau


Re: [PATCH v2 00/19] hw/timer/arm_timer: QOM'ify ARM_TIMER and correct sysbus/irq in ICP_PIT

2023-07-04 Thread Philippe Mathieu-Daudé

On 4/7/23 16:49, Philippe Mathieu-Daudé wrote:

This series converts the ARM_TIMER model to QOM.

Doing so we also correct an abuse of SysBus IRQ in
the ICP PIT model.

Since v1:
- Added pm215's R-b tags
- Addressed Mark/Peter review comments
   - Drop '*State' suffix from structure names
   - Use OR-IRQ gate


(forgot to mention migration normally taken care of)


   - Drop sp804_unrealize()
   - Implement Resettable API
- MMIO-map timer regions into parents

Also, from here it should be easier to add the
ARM AP804 used by BCM2835; see Thomas Venriès's
patch v2:
https://patchwork.kernel.org/project/qemu-devel/patch/1507728290-6062-1-git-send-email-thomas.venr...@gmail.com/




Re: [PATCH v7 5/6] hw/pci: ensure PCIE devices are plugged into only slot 0 of PCIE port

2023-07-04 Thread Ani Sinha



> On 04-Jul-2023, at 7:58 PM, Igor Mammedov  wrote:
> 
> On Tue, 4 Jul 2023 19:20:00 +0530
> Ani Sinha  wrote:
> 
>>> On 04-Jul-2023, at 6:18 PM, Igor Mammedov  wrote:
>>> 
>>> On Tue, 4 Jul 2023 21:02:09 +0900
>>> Akihiko Odaki  wrote:
>>> 
 On 2023/07/04 20:59, Ani Sinha wrote:  
> 
> 
>> On 04-Jul-2023, at 5:24 PM, Akihiko Odaki  
>> wrote:
>> 
>> On 2023/07/04 20:25, Ani Sinha wrote:
>>> PCI Express ports only have one slot, so PCI Express devices can only be
>>> plugged into slot 0 on a PCIE port. Add a warning to let users know 
>>> when the
>>> invalid configuration is used. We may enforce this more strongly later 
>>> on once
>>> we get more clarity on whether we are introducing a bad regression for 
>>> users
>>> currently using the wrong configuration.
>>> The change has been tested to not break or alter behaviors of ARI 
>>> capable
>>> devices by instantiating seven vfs on an emulated igb device (the 
>>> maximum
>>> number of vfs the linux igb driver supports). The vfs instantiated 
>>> correctly
>>> and are seen to have non-zero device/slot numbers in the conventional 
>>> PCI BDF
>>> representation.
>>> CC: jus...@redhat.com
>>> CC: imamm...@redhat.com
>>> CC: m...@redhat.com
>>> CC: akihiko.od...@daynix.com
>>> Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2128929
>>> Signed-off-by: Ani Sinha 
>>> Reviewed-by: Julia Suvorova 
>>> ---
>>> hw/pci/pci.c | 15 +++
>>> 1 file changed, 15 insertions(+)
>>> diff --git a/hw/pci/pci.c b/hw/pci/pci.c
>>> index e2eb4c3b4a..47517ba3db 100644
>>> --- a/hw/pci/pci.c
>>> +++ b/hw/pci/pci.c
>>> @@ -65,6 +65,7 @@ bool pci_available = true;
>>> static char *pcibus_get_dev_path(DeviceState *dev);
>>> static char *pcibus_get_fw_dev_path(DeviceState *dev);
>>> static void pcibus_reset(BusState *qbus);
>>> +static bool pcie_has_upstream_port(PCIDevice *dev);
>>>   static Property pci_props[] = {
>>> DEFINE_PROP_PCI_DEVFN("addr", PCIDevice, devfn, -1),
>>> @@ -2121,6 +2122,20 @@ static void pci_qdev_realize(DeviceState *qdev, 
>>> Error **errp)
>>> }
>>> }
>>> +/*
>>> + * With SRIOV and ARI, vfs can have non-zero slot in the 
>>> conventional
>>> + * PCI interpretation as all five bits reserved for slot addresses 
>>> are
>>> + * also used for function bits for the various vfs. Ignore that 
>>> case.
>> 
>> You don't have to mention SR/IOV; it affects all ARI-capable devices. A 
>> PF can also have non-zero slot number in the conventional interpretation 
>> so you shouldn't call it vf either.
> 
> Can you please help write a comment that explains this properly for all 
> cases - ARI/non-ARI, PFs and VFs? Once everyone agrees that its clear and 
> correct, I will re-spin.
 
 Simply, you can say:
 With ARI, the slot number field in the conventional PCI interpretation 
 can have a non-zero value as the field bits are reused to extend the 
 function number bits. Ignore that case.  
>>> 
>>> mentioning 'conventional PCI interpretation' in comment and then immediately
>>> checking 'pci_is_express(pci_dev)' is confusing. Since comment belongs
>>> only to PCIE branch it would be better to talk in only about PCIe stuff
>>> and referring to relevant portions of spec.  
>> 
>> Ok so how about this?
>> 
>>   * With ARI, devices can have non-zero slot in the traditional BDF  
>> 
>> * representation as all five bits reserved for slot addresses are
>>   
>> * also used for function bits. Ignore that case.  
> 
> you still refer to traditional (which I misread as 'conventional'),
> steal the linux comment and argument it with ARI if necessary,
> something like this (probably needs some more massaging):

The comment messaging in these patches seems to exceed the value of the patch 
itself :-)

How about this?

/*  

 * A PCIe Downstream Port normally leads to a Link with only Device 

 * 0 on it (PCIe spec r3.1, sec 7.3.1). 

 * With ARI, PCI_SLOT() can return non-zero value as all five bits  

 * reserved for slot addresses are also used for function bits. 

 * 

Re: [PATCH 0/2] target/arm: Implement Cortex Neoverse-V1

2023-07-04 Thread Philippe Mathieu-Daudé

On 4/7/23 17:00, Marcin Juszkiewicz wrote:

W dniu 4.07.2023 o 16:54, Philippe Mathieu-Daudé pisze:

On 4/7/23 15:35, Marcin Juszkiewicz wrote:

W dniu 4.07.2023 o 15:06, Peter Maydell pisze:


This patchset implements the Cortex Neoverse-V1 CPU type, as a
representative Armv8.3 (+ some extras from 8.4) CPU matching real
hardware.  The main thing we were waiting for to be able to define
this was FEAT_LSE2, and that is now supported.


Now I can add "reach SBSA level 4" to todo list as it requires v8.3 
cpu (I do not count 'max' cpu type).


Do we need to introduce machine variants, such sbsa-lvl3-ref and
sbsa-lvl4-ref? Or simply sbsa-level3/sbsa-level4?


No such combinations. The plan for sbsa-ref is to have only one platform.

Version of platform is exported in DeviceTree already. TF-A reads it and 
exports via SMC call to EDK2. What changes between versions is present 
in documentation.


Great! I like simplicity :)




Re: [PATCH v3 0/8] Implement Most ARMv8.3 Pointer Authentication Features

2023-07-04 Thread Peter Maydell
On Fri, 9 Jun 2023 at 18:23, Aaron Lindsay  wrote:
>
> Changes from v2 of this patchset [0]:
> - Remove properties for EPAC, Pauth2, FPAC, FPACCombined
> - Separate out aa64isar2 addition into its own patch
> - Comment clarifications
> - Several code formatting/simplifications
> - Rebase on top of latest upstream changes (for example, those which
>   reorganized decoding PAC branch instructions)
>
> [0] - https://lists.nongnu.org/archive/html/qemu-devel/2023-02/msg06494.html

I was looking at trying to bring this into target-arm.next,
since the changes in patches 1 and 2 for code review
comments are not large. However I found that it fails
"make check":

ERROR:../../tests/qtest/arm-cpu-features.c:425:pauth_tests_default:
assertion failed: (g_str_equal(_error, "cannot enable pauth-impdef
without pauth"))

as a result of patch 8's changes to the handling
of the CPU pauth related properties.

The assertion seems to be because patch 8 has changed
the error string to something different from that which
the test case is looking for.

Patch 8 needs to also update the test case to include
testing of the handling of the new properties it adds.

thanks
-- PMM



Re: [PATCH 0/2] target/arm: Implement Cortex Neoverse-V1

2023-07-04 Thread Marcin Juszkiewicz

W dniu 4.07.2023 o 16:54, Philippe Mathieu-Daudé pisze:

On 4/7/23 15:35, Marcin Juszkiewicz wrote:

W dniu 4.07.2023 o 15:06, Peter Maydell pisze:


This patchset implements the Cortex Neoverse-V1 CPU type, as a
representative Armv8.3 (+ some extras from 8.4) CPU matching real
hardware.  The main thing we were waiting for to be able to define
this was FEAT_LSE2, and that is now supported.


Now I can add "reach SBSA level 4" to todo list as it requires v8.3 
cpu (I do not count 'max' cpu type).


Do we need to introduce machine variants, such sbsa-lvl3-ref and
sbsa-lvl4-ref? Or simply sbsa-level3/sbsa-level4?


No such combinations. The plan for sbsa-ref is to have only one platform.

Version of platform is exported in DeviceTree already. TF-A reads it and 
exports via SMC call to EDK2. What changes between versions is present 
in documentation.






Re: [PATCH 2/2] target/arm: Define neoverse-v1

2023-07-04 Thread Alex Bennée


Peter Maydell  writes:

> Now that we have implemented support for FEAT_LSE2, we can define
> a CPU model for the Neoverse-V1, and enable it for the virt and
> sbsa-ref boards.
>
> Signed-off-by: Peter Maydell 

Reviewed-by: Alex Bennée 

-- 
Alex Bennée
Virtualisation Tech Lead @ Linaro



Re: [virtio-dev] [RFC PATCH] docs/interop: define STANDALONE protocol feature for vhost-user

2023-07-04 Thread Stefano Garzarella

On Tue, Jul 04, 2023 at 01:36:00PM +0100, Alex Bennée wrote:

Currently QEMU has to know some details about the back-end to be able
to setup the guest. While various parts of the setup can be delegated
to the backend (for example config handling) this is a very piecemeal
approach.

This patch suggests a new feature flag (VHOST_USER_PROTOCOL_F_STANDALONE)
which the back-end can advertise which allows a probe message to be
sent to get all the details QEMU needs to know in one message.

Signed-off-by: Alex Bennée 

---
Initial RFC for discussion. I intend to prototype this work with QEMU
and one of the rust-vmm vhost-user daemons.


Thanks for starting this discussion!

I'm comparing with vhost-vdpa IOCTLs, so my questions may be
superficial, but they help me understand the differences.


---
docs/interop/vhost-user.rst | 37 +
hw/virtio/vhost-user.c  |  8 
2 files changed, 45 insertions(+)

diff --git a/docs/interop/vhost-user.rst b/docs/interop/vhost-user.rst
index 5a070adbc1..85b1b1583a 100644
--- a/docs/interop/vhost-user.rst
+++ b/docs/interop/vhost-user.rst
@@ -275,6 +275,21 @@ Inflight description

:queue size: a 16-bit size of virtqueues

+Backend specifications
+^^
+
++---+-+++
+| device id | config size |   min_vqs  |   max_vqs  |
++---+-+++
+
+:device id: a 32-bit value holding the VirtIO device ID
+
+:config size: a 32-bit value holding the config size (see 
``VHOST_USER_GET_CONFIG``)
+
+:min_vqs: a 32-bit value holding the minimum number of vqs supported


Why do we need the minimum?


+
+:max_vqs: a 32-bit value holding the maximum number of vqs supported, must be 
>= min_vqs


Is this overlap with VHOST_USER_GET_QUEUE_NUM?


+
C structure
---

@@ -296,6 +311,7 @@ In QEMU the vhost-user message is implemented with the 
following struct:
  VhostUserConfig config;
  VhostUserVringArea area;
  VhostUserInflight inflight;
+  VhostUserBackendSpecs specs;
  };
  } QEMU_PACKED VhostUserMsg;

@@ -316,6 +332,7 @@ replies. Here is a list of the ones that do:
* ``VHOST_USER_GET_VRING_BASE``
* ``VHOST_USER_SET_LOG_BASE`` (if ``VHOST_USER_PROTOCOL_F_LOG_SHMFD``)
* ``VHOST_USER_GET_INFLIGHT_FD`` (if ``VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD``)
+* ``VHOST_USER_GET_BACKEND_SPECS`` (if ``VHOST_USER_PROTOCOL_F_STANDALONE``)

.. seealso::

@@ -885,6 +902,13 @@ Protocol features
  #define VHOST_USER_PROTOCOL_F_CONFIGURE_MEM_SLOTS  15
  #define VHOST_USER_PROTOCOL_F_STATUS   16
  #define VHOST_USER_PROTOCOL_F_XEN_MMAP 17
+  #define VHOST_USER_PROTOCOL_F_STANDALONE   18
+
+Some features are only valid in the presence of other supporting
+features. In the case of ``VHOST_USER_PROTOCOL_F_STANDALONE`` the
+backend must also support ``VHOST_USER_PROTOCOL_F_CONFIG`` and
+``VHOST_USER_PROTOCOL_F_STATUS``.
+


What about adding a new section where we will describe what we mean
with "standalone" devices?

For example that the entire virtio device is emulated in the backend,
etc.

By the way, I was thinking more about F_FULL_DEVICE, but I'm not good
with names, so I'll just throw out an idea :-)

Thanks,
Stefano



Front-end message types
---
@@ -1440,6 +1464,19 @@ Front-end message types
  query the back-end for its device status as defined in the Virtio
  specification.

+``VHOST_USER_GET_BACKEND_SPECS``
+  :id: 41
+  :request payload: N/A
+  :reply payload: ``Backend specifications``
+
+  When the ``VHOST_USER_PROTOCOL_F_STANDALONE`` protocol feature has been
+  successfully negotiated, this message is submitted by the front-end to
+  query the back-end for its capabilities. This is intended to remove
+  the need for the front-end to know ahead of time what the VirtIO
+  device the backend emulates is.
+
+  The reply contains the device id, size of the config space and the
+  range of VirtQueues the backend supports.

Back-end message types
--
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index c4e0cbd702..28b021d5d3 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -202,6 +202,13 @@ typedef struct VhostUserInflight {
uint16_t queue_size;
} VhostUserInflight;

+typedef struct VhostUserBackendSpecs {
+uint32_t device_id;
+uint32_t config_size;
+uint32_t min_vqs;
+uint32_t max_vqs;
+} VhostUserBackendSpecs;
+
typedef struct {
VhostUserRequest request;

@@ -226,6 +233,7 @@ typedef union {
VhostUserCryptoSession session;
VhostUserVringArea area;
VhostUserInflight inflight;
+VhostUserBackendSpecs specs;
} VhostUserPayload;

typedef struct VhostUserMsg {
--
2.39.2


-
To unsubscribe, e-mail: virtio-dev-unsubscr...@lists.oasis-open.org
For additional commands, e-mail: virtio-dev-h...@lists.oasis-open.org






Re: [PATCH 0/2] target/arm: Implement Cortex Neoverse-V1

2023-07-04 Thread Philippe Mathieu-Daudé

On 4/7/23 15:35, Marcin Juszkiewicz wrote:

W dniu 4.07.2023 o 15:06, Peter Maydell pisze:


This patchset implements the Cortex Neoverse-V1 CPU type, as a
representative Armv8.3 (+ some extras from 8.4) CPU matching real
hardware.  The main thing we were waiting for to be able to define
this was FEAT_LSE2, and that is now supported.


Now I can add "reach SBSA level 4" to todo list as it requires v8.3 cpu 
(I do not count 'max' cpu type).


Do we need to introduce machine variants, such sbsa-lvl3-ref and
sbsa-lvl4-ref? Or simply sbsa-level3/sbsa-level4?




Re: [PATCH RFC v2 2/4] vdpa: Restore MAC address filtering state

2023-07-04 Thread Eugenio Perez Martin
On Thu, Jun 29, 2023 at 5:26 PM Hawkins Jiawei  wrote:
>
> This patch refactors vhost_vdpa_net_load_mac() to
> restore the MAC address filtering state at device's startup.
>
> Signed-off-by: Hawkins Jiawei 
> ---
> v2:
>   - use iovec suggested by Eugenio
>   - avoid sending CVQ command in default state
>
> v1: 
> https://lore.kernel.org/all/00f72fe154a882fd6dc15bc39e3a1ac63f9dadce.1687402580.git.yin31...@gmail.com/
>
>  net/vhost-vdpa.c | 51 
>  1 file changed, 51 insertions(+)
>
> diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c
> index 0bd1c7817c..cb45c84c88 100644
> --- a/net/vhost-vdpa.c
> +++ b/net/vhost-vdpa.c
> @@ -665,6 +665,57 @@ static int vhost_vdpa_net_load_mac(VhostVDPAState *s, 
> const VirtIONet *n)
>  }
>  }
>
> +if (virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX)) {
> +if (n->mac_table.in_use != 0) {

This may be just style nitpicking, but I find it more clear to return
early if conditions are not met and then send the CVQ command.
Something like:
/*
 * According to ...
 */
if (!virtio_vdev_has_feature(&n->parent_obj, VIRTIO_NET_F_CTRL_RX) ||
    (n->mac_table.in_use == 0)) {
  return 0
}

uni_entries = n->mac_table.first_multi,
...
---

Now I just realized vhost_vdpa_net_load_mac does not follow this for
checking VIRTIO_NET_F_CTRL_MAC_ADDR.

I'm ok if you leave it this way though.

Thanks!

> +/*
> + * According to virtio_net_reset(), device uses an empty MAC 
> filter
> + * table as its default state.
> + *
> + * Therefore, there is no need to send this CVQ command if the
> + * driver also sets an empty MAC filter table, which aligns with
> + * the device's defaults.
> + *
> + * Note that the device's defaults can mismatch the driver's
> + * configuration only at live migration.
> + */
> +uint32_t uni_entries = n->mac_table.first_multi,
> + uni_macs_size = uni_entries * ETH_ALEN,
> + mul_entries = n->mac_table.in_use - uni_entries,
> + mul_macs_size = mul_entries * ETH_ALEN;
> +struct virtio_net_ctrl_mac uni = {
> +.entries = cpu_to_le32(uni_entries),
> +};
> +struct virtio_net_ctrl_mac mul = {
> +.entries = cpu_to_le32(mul_entries),
> +};
> +const struct iovec data[] = {
> +{
> +.iov_base = &uni,
> +.iov_len = sizeof(uni),
> +}, {
> +.iov_base = n->mac_table.macs,
> +.iov_len = uni_macs_size,
> +}, {
> +.iov_base = &mul,
> +.iov_len = sizeof(mul),
> +}, {
> +.iov_base = &n->mac_table.macs[uni_macs_size],
> +.iov_len = mul_macs_size,
> +},
> +};
> +ssize_t dev_written = vhost_vdpa_net_load_cmd(s,
> +VIRTIO_NET_CTRL_MAC,
> +VIRTIO_NET_CTRL_MAC_TABLE_SET,
> +data, ARRAY_SIZE(data));
> +if (unlikely(dev_written < 0)) {
> +return dev_written;
> +}
> +if (*s->status != VIRTIO_NET_OK) {
> +return -EINVAL;
> +}
> +}
> +}
> +
>  return 0;
>  }
>
> --
> 2.25.1
>




[PATCH v2 14/19] hw/timer/arm_timer: Pass timer output IRQ as parameter to arm_timer_new

2023-07-04 Thread Philippe Mathieu-Daudé
Both SP804Timer/IntegratorPIT peek at ArmTimer internal state.
This is fine so far but we want to convert ArmTimer to QOM
where peeking at QOM state internal should be avoided.
ArmTimer's IRQ is just a pointer, so we can pass/set it via
argument, avoiding accessing ArmTimer internal state except
from the arm_timer_*() methods.

Once ArmTimer gets QOM'ified (in a few commits), it will
inherit the SysBus API. This IRQ will then become a SysBus
IRQ within this ArmTimer object.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/timer/arm_timer.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c
index 68cd50314f..f6bec28884 100644
--- a/hw/timer/arm_timer.c
+++ b/hw/timer/arm_timer.c
@@ -179,7 +179,7 @@ static void arm_timer_reset_hold(ArmTimer *s)
 s->control = TIMER_CTRL_IE;
 }
 
-static ArmTimer *arm_timer_new(uint32_t freq)
+static ArmTimer *arm_timer_new(uint32_t freq, qemu_irq irq_out)
 {
 ArmTimer *s;
 
@@ -187,6 +187,7 @@ static ArmTimer *arm_timer_new(uint32_t freq)
 s->freq = freq;
 arm_timer_reset_hold(s);
 
+s->irq = irq_out;
 s->timer = ptimer_init(arm_timer_tick, s, PTIMER_POLICY_LEGACY);
vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_arm_timer, s);
 return s;
@@ -209,6 +210,7 @@ struct SP804Timer {
 uint32_t freq[2];
 int level[2];
 qemu_irq irq;
+qemu_irq irq_in[2];
 };
 
 static const uint8_t sp804_ids[] = {
@@ -311,8 +313,8 @@ static void sp804_realize(DeviceState *dev, Error **errp)
 SP804Timer *s = SP804_TIMER(dev);
 
 for (unsigned i = 0; i < ARRAY_SIZE(s->timer); i++) {
-s->timer[i] = arm_timer_new(s->freq[i]);
-s->timer[i]->irq = qemu_allocate_irq(sp804_set_irq, s, i);
+s->irq_in[i] = qemu_allocate_irq(sp804_set_irq, s, i);
+s->timer[i] = arm_timer_new(s->freq[i], s->irq_in[i]);
 }
 }
 
@@ -341,6 +343,7 @@ struct IntegratorPIT {
 
 MemoryRegion iomem;
 ArmTimer *timer[3];
+qemu_irq irq_in[3];
 };
 
 static uint64_t icp_pit_read(void *opaque, hwaddr offset,
@@ -392,8 +395,8 @@ static void icp_pit_init(Object *obj)
 SysBusDevice *dev = SYS_BUS_DEVICE(obj);
 
 for (unsigned i = 0; i < ARRAY_SIZE(s->timer); i++) {
-s->timer[i] = arm_timer_new(tmr_freq[i]);
-sysbus_init_irq(dev, &s->timer[i]->irq);
+s->timer[i] = arm_timer_new(tmr_freq[i], s->irq_in[i]);
+sysbus_init_irq(dev, &s->irq_in[i]);
 }
 
memory_region_init_io(&s->iomem, obj, &icp_pit_ops, s,
-- 
2.38.1




[PATCH v2 17/19] hw/timer/arm_timer: QDev'ify ARM_TIMER

2023-07-04 Thread Philippe Mathieu-Daudé
Introduce the ARM_TIMER sysbus device, exposing one output IRQ
and a single MMIO region.

arm_timer_new() is converted as QOM instance init()/finalize()
handlers. Note in arm_timer_finalize() we release a ptimer handle
which was previously leaked.

ArmTimer is directly embedded into SP804Timer/IntegratorPIT,
and is initialized as a QOM child.

Since the timer frequency belongs to ARM_TIMER, have it hold the
QOM property. SP804Timer/IntegratorPIT directly access it.

For IntegratorPIT, each ARM_TIMER sysbus output IRQ is wired as
input IRQ.

For the SP804Timer, we add a TYPE_OR_IRQ to OR the ARM_TIMER sysbus
output IRQs together, exposing a single output IRQ.
The Kconfig entry has to select the OR_IRQ dependency.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/timer/arm_timer.c | 139 ---
 hw/timer/Kconfig |   1 +
 2 files changed, 105 insertions(+), 35 deletions(-)

diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c
index 30e29cc166..8207723ab5 100644
--- a/hw/timer/arm_timer.c
+++ b/hw/timer/arm_timer.c
@@ -12,11 +12,13 @@
 #include "migration/vmstate.h"
 #include "qemu/timer.h"
 #include "hw/irq.h"
+#include "hw/or-irq.h"
 #include "hw/ptimer.h"
 #include "hw/qdev-properties.h"
 #include "qemu/module.h"
 #include "qemu/log.h"
 #include "qom/object.h"
+#include "qapi/error.h"
 
 /* Common timer implementation.  */
 
@@ -29,14 +31,20 @@
 #define TIMER_CTRL_PERIODIC (1 << 6)
 #define TIMER_CTRL_ENABLE   (1 << 7)
 
-typedef struct {
+#define TYPE_ARM_TIMER "arm-timer"
+OBJECT_DECLARE_SIMPLE_TYPE(ArmTimer, ARM_TIMER)
+
+struct ArmTimer {
+SysBusDevice parent_obj;
+
+MemoryRegion iomem;
 ptimer_state *timer;
 uint32_t control;
 uint32_t limit;
 uint32_t freq;
 int int_level;
 qemu_irq irq;
-} ArmTimer;
+};
 
 /* Check all active timers, and schedule the next timer interrupt.  */
 
@@ -152,6 +160,14 @@ static void arm_timer_write(void *opaque, hwaddr offset,
 arm_timer_update(s);
 }
 
+static const MemoryRegionOps arm_timer_ops = {
+.read = arm_timer_read,
+.write = arm_timer_write,
+.endianness = DEVICE_LITTLE_ENDIAN,
+.impl.min_access_size = 4,
+.impl.max_access_size = 4,
+};
+
 static void arm_timer_tick(void *opaque)
 {
 ArmTimer *s = opaque;
@@ -172,25 +188,49 @@ static const VMStateDescription vmstate_arm_timer = {
 }
 };
 
-static void arm_timer_reset_hold(ArmTimer *s)
+static void arm_timer_reset_hold(Object *obj)
 {
+ArmTimer *s = ARM_TIMER(obj);
+
 s->limit = 0;
 s->int_level = 0;
 s->control = TIMER_CTRL_IE;
 }
 
-static ArmTimer *arm_timer_new(uint32_t freq, qemu_irq irq_out)
+static void arm_timer_init(Object *obj)
 {
-ArmTimer *s;
+ArmTimer *s = ARM_TIMER(obj);
+SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
 
-s = g_new0(ArmTimer, 1);
-s->freq = freq;
-arm_timer_reset_hold(s);
-
-s->irq = irq_out;
 s->timer = ptimer_init(arm_timer_tick, s, PTIMER_POLICY_LEGACY);
-vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_arm_timer, s);
-return s;
+
+sysbus_init_irq(sbd, &s->irq);
+
+memory_region_init_io(&s->iomem, obj, &arm_timer_ops, s,
+  "arm_timer", 0x20);
+sysbus_init_mmio(sbd, &s->iomem);
+}
+
+static void arm_timer_finalize(Object *obj)
+{
+ArmTimer *s = ARM_TIMER(obj);
+
+ptimer_free(s->timer);
+}
+
+static Property arm_timer_properties[] = {
+DEFINE_PROP_UINT32("freq", ArmTimer, freq, 0),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void arm_timer_class_init(ObjectClass *oc, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(oc);
+ResettableClass *rc = RESETTABLE_CLASS(oc);
+
+dc->vmsd = &vmstate_arm_timer;
+device_class_set_props(dc, arm_timer_properties);
+rc->phases.hold = arm_timer_reset_hold;
 }
 
 /*
@@ -206,11 +246,10 @@ struct SP804Timer {
 SysBusDevice parent_obj;
 
 MemoryRegion iomem;
-ArmTimer *timer[2];
-uint32_t freq[2];
-int level[2];
+ArmTimer timer[2];
+int mig_v1_level[2];
+OrIRQState irq_orgate;
 qemu_irq irq;
-qemu_irq irq_in[2];
 };
 
 static const uint8_t sp804_ids[] = {
@@ -220,15 +259,6 @@ static const uint8_t sp804_ids[] = {
 0xd, 0xf0, 0x05, 0xb1
 };
 
-/* Merge the IRQs from the two component devices.  */
-static void sp804_set_irq(void *opaque, int irq, int level)
-{
-SP804Timer *s = opaque;
-
-s->level[irq] = level;
-qemu_set_irq(s->irq, s->level[0] || s->level[1]);
-}
-
 static uint64_t sp804_read(void *opaque, hwaddr offset,
unsigned size)
 {
@@ -287,12 +317,26 @@ static const MemoryRegionOps sp804_ops = {
 .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
+static int sp804_post_load(void *opaque, int version_id)
+{
+SP804Timer *s = opaque;
+
+if (version_id < 2) {
+for (unsigned i = 0; i < ARRAY_SIZE(s->timer); i++) {
+qemu_set_irq(qdev_get_gpio_in(DEVICE(&s->irq_orgate), i),
+ s->mig_v1_level[i]);
+}
+}
+

[PATCH v2 15/19] hw/timer/arm_timer: Fix misuse of SysBus IRQ in IntegratorPIT

2023-07-04 Thread Philippe Mathieu-Daudé
SysBus IRQ are *output* IRQs. As some sort of simplification
to avoid to forward it, IntegratorPIT misuses it as ARM timer
input IRQ. Fix that by using a simple IRQ forwarder handler.

Note: sysbus_pass_irq() forwards GPIOs and IRQs from a container
to an inner device but only works with an entire set of IRQs, so
we can not use it here where we forward a single IRQ from each
timer.

Signed-off-by: Philippe Mathieu-Daudé 
Reviewed-by: Peter Maydell 
---
 hw/timer/arm_timer.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c
index f6bec28884..aae7f3cf9d 100644
--- a/hw/timer/arm_timer.c
+++ b/hw/timer/arm_timer.c
@@ -344,6 +344,7 @@ struct IntegratorPIT {
 MemoryRegion iomem;
 ArmTimer *timer[3];
 qemu_irq irq_in[3];
+qemu_irq irq[3];
 };
 
 static uint64_t icp_pit_read(void *opaque, hwaddr offset,
@@ -383,6 +384,13 @@ static const MemoryRegionOps icp_pit_ops = {
 .endianness = DEVICE_NATIVE_ENDIAN,
 };
 
+static void icp_pit_fwd_irq(void *opaque, int n, int level)
+{
+IntegratorPIT *s = opaque;
+
+qemu_set_irq(s->irq[n], level);
+}
+
 static void icp_pit_init(Object *obj)
 {
 static const uint32_t tmr_freq[] = {
@@ -394,9 +402,14 @@ static void icp_pit_init(Object *obj)
 IntegratorPIT *s = INTEGRATOR_PIT(obj);
 SysBusDevice *dev = SYS_BUS_DEVICE(obj);
 
+qdev_init_gpio_in_named(DEVICE(obj), icp_pit_fwd_irq,
+"timer-in", ARRAY_SIZE(s->timer));
+
 for (unsigned i = 0; i < ARRAY_SIZE(s->timer); i++) {
 s->timer[i] = arm_timer_new(tmr_freq[i], s->irq_in[i]);
-sysbus_init_irq(dev, &s->irq_in[i]);
+sysbus_init_irq(dev, &s->irq[i]);
+sysbus_connect_irq(dev, i,
+   qdev_get_gpio_in_named(DEVICE(obj), "timer-in", i));
 }
 
memory_region_init_io(&s->iomem, obj, &icp_pit_ops, s,
-- 
2.38.1




[PATCH v2 16/19] hw/timer/arm_timer: Extract icp_pit_realize() from icp_pit_init()

2023-07-04 Thread Philippe Mathieu-Daudé
To make the next commit easier to digest, extract icp_pit_realize()
from icp_pit_init() as a preliminary step.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/timer/arm_timer.c | 21 +++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c
index aae7f3cf9d..30e29cc166 100644
--- a/hw/timer/arm_timer.c
+++ b/hw/timer/arm_timer.c
@@ -408,8 +408,6 @@ static void icp_pit_init(Object *obj)
 for (unsigned i = 0; i < ARRAY_SIZE(s->timer); i++) {
 s->timer[i] = arm_timer_new(tmr_freq[i], s->irq_in[i]);
sysbus_init_irq(dev, &s->irq[i]);
-sysbus_connect_irq(dev, i,
-   qdev_get_gpio_in_named(DEVICE(obj), "timer-in", i));
 }
 
memory_region_init_io(&s->iomem, obj, &icp_pit_ops, s,
@@ -419,12 +417,31 @@ static void icp_pit_init(Object *obj)
save themselves.  */
 }
 
+static void icp_pit_realize(DeviceState *dev, Error **errp)
+{
+IntegratorPIT *s = INTEGRATOR_PIT(dev);
+
+for (unsigned i = 0; i < ARRAY_SIZE(s->timer); i++) {
+SysBusDevice *tmr = SYS_BUS_DEVICE(&s->timer[i]);
+
+sysbus_connect_irq(tmr, i, qdev_get_gpio_in_named(dev, "timer-in", i));
+}
+}
+
+static void icp_pit_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *k = DEVICE_CLASS(klass);
+
+k->realize = icp_pit_realize;
+}
+
 static const TypeInfo arm_timer_types[] = {
 {
 .name   = TYPE_INTEGRATOR_PIT,
 .parent = TYPE_SYS_BUS_DEVICE,
 .instance_size  = sizeof(IntegratorPIT),
 .instance_init  = icp_pit_init,
+.class_init = icp_pit_class_init,
 
 }, {
 .name   = TYPE_SP804_TIMER,
-- 
2.38.1




[PATCH v2 08/19] hw/timer/arm_timer: Extract arm_timer_reset_hold()

2023-07-04 Thread Philippe Mathieu-Daudé
Extract arm_timer_reset_hold() before converting this model to
QOM/QDev in a few commits. This will become our ResettableHoldPhase
handler.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/timer/arm_timer.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c
index 8dae845998..0d7fac4d78 100644
--- a/hw/timer/arm_timer.c
+++ b/hw/timer/arm_timer.c
@@ -172,13 +172,20 @@ static const VMStateDescription vmstate_arm_timer = {
 }
 };
 
+static void arm_timer_reset_hold(ArmTimer *s)
+{
+s->limit = 0;
+s->int_level = 0;
+s->control = TIMER_CTRL_IE;
+}
+
 static ArmTimer *arm_timer_init(uint32_t freq)
 {
 ArmTimer *s;
 
 s = g_new0(ArmTimer, 1);
 s->freq = freq;
-s->control = TIMER_CTRL_IE;
+arm_timer_reset_hold(s);
 
 s->timer = ptimer_init(arm_timer_tick, s, PTIMER_POLICY_LEGACY);
vmstate_register(NULL, VMSTATE_INSTANCE_ID_ANY, &vmstate_arm_timer, s);
-- 
2.38.1




[PATCH v2 07/19] hw/timer/arm_timer: Rename TYPE_SP804 -> TYPE_SP804_TIMER

2023-07-04 Thread Philippe Mathieu-Daudé
Having a QOM object using its device type as suffix is
often helpful.

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/timer/arm_timer.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c
index 41045de8ed..8dae845998 100644
--- a/hw/timer/arm_timer.c
+++ b/hw/timer/arm_timer.c
@@ -191,8 +191,8 @@ static ArmTimer *arm_timer_init(uint32_t freq)
  * https://developer.arm.com/documentation/ddi0271/latest/
  */
 
-#define TYPE_SP804 "sp804"
-OBJECT_DECLARE_SIMPLE_TYPE(SP804Timer, SP804)
+#define TYPE_SP804_TIMER "sp804"
+OBJECT_DECLARE_SIMPLE_TYPE(SP804Timer, SP804_TIMER)
 
 struct SP804Timer {
 SysBusDevice parent_obj;
@@ -290,7 +290,7 @@ static const VMStateDescription vmstate_sp804 = {
 
 static void sp804_init(Object *obj)
 {
-SP804Timer *s = SP804(obj);
+SP804Timer *s = SP804_TIMER(obj);
 SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
 
sysbus_init_irq(sbd, &s->irq);
@@ -301,7 +301,7 @@ static void sp804_init(Object *obj)
 
 static void sp804_realize(DeviceState *dev, Error **errp)
 {
-SP804Timer *s = SP804(dev);
+SP804Timer *s = SP804_TIMER(dev);
 
 s->timer[0] = arm_timer_init(s->freq0);
 s->timer[1] = arm_timer_init(s->freq1);
@@ -403,7 +403,7 @@ static const TypeInfo arm_timer_types[] = {
 .instance_init  = icp_pit_init,
 
 }, {
-.name   = TYPE_SP804,
+.name   = TYPE_SP804_TIMER,
 .parent = TYPE_SYS_BUS_DEVICE,
 .instance_size  = sizeof(SP804Timer),
 .instance_init  = sp804_init,
-- 
2.38.1




[PATCH v2 10/19] hw/timer/arm_timer: Rename arm_timer_init() -> arm_timer_new()

2023-07-04 Thread Philippe Mathieu-Daudé
QDev models often use foo_new() as the combination of
foo_init() + foo_realize(). Here arm_timer_init() is
such a combination, so rename it to arm_timer_new() to
emphasize that the returned device is already realized.

Signed-off-by: Philippe Mathieu-Daudé 
Reviewed-by: Peter Maydell 
---
 hw/timer/arm_timer.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/hw/timer/arm_timer.c b/hw/timer/arm_timer.c
index cbd82e8365..4ef34b0f60 100644
--- a/hw/timer/arm_timer.c
+++ b/hw/timer/arm_timer.c
@@ -179,7 +179,7 @@ static void arm_timer_reset_hold(ArmTimer *s)
 s->control = TIMER_CTRL_IE;
 }
 
-static ArmTimer *arm_timer_init(uint32_t freq)
+static ArmTimer *arm_timer_new(uint32_t freq)
 {
 ArmTimer *s;
 
@@ -310,8 +310,8 @@ static void sp804_realize(DeviceState *dev, Error **errp)
 {
 SP804Timer *s = SP804_TIMER(dev);
 
-s->timer[0] = arm_timer_init(s->freq0);
-s->timer[1] = arm_timer_init(s->freq1);
+s->timer[0] = arm_timer_new(s->freq0);
+s->timer[1] = arm_timer_new(s->freq1);
 s->timer[0]->irq = qemu_allocate_irq(sp804_set_irq, s, 0);
 s->timer[1]->irq = qemu_allocate_irq(sp804_set_irq, s, 1);
 }
@@ -386,10 +386,10 @@ static void icp_pit_init(Object *obj)
 SysBusDevice *dev = SYS_BUS_DEVICE(obj);
 
 /* Timer 0 runs at the system clock speed (40MHz).  */
-s->timer[0] = arm_timer_init(4000);
+s->timer[0] = arm_timer_new(4000);
 /* The other two timers run at 1MHz.  */
-s->timer[1] = arm_timer_init(100);
-s->timer[2] = arm_timer_init(100);
+s->timer[1] = arm_timer_new(100);
+s->timer[2] = arm_timer_new(100);
 
sysbus_init_irq(dev, &s->timer[0]->irq);
sysbus_init_irq(dev, &s->timer[1]->irq);
-- 
2.38.1




  1   2   3   4   >