[PATCH v16 11/14] hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s)
From: Liu Jingqi This structure describes the memory access latency and bandwidth information from various memory access initiator proximity domains. The latency and bandwidth numbers represented in this structure correspond to rated latency and bandwidth for the platform. The software could use this information as hint for optimization. Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v16: - Add more description for lb_length (Igor) - Drop entry_list and calculate entries in this patch (Igor) Changes in v13: - Calculate the entries in a new patch. --- hw/acpi/hmat.c | 105 - 1 file changed, 104 insertions(+), 1 deletion(-) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 9ff79308a4..ed19ebed2f 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -25,8 +25,10 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "sysemu/numa.h" #include "hw/acpi/hmat.h" +#include "qemu/error-report.h" /* * ACPI 6.3: @@ -67,11 +69,89 @@ static void build_hmat_mpda(GArray *table_data, uint16_t flags, build_append_int_noprefix(table_data, 0, 8); } +/* + * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information + * Structure: Table 5-146 + */ +static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, + uint32_t num_initiator, uint32_t num_target, + uint32_t *initiator_list) +{ +int i, index; +HMAT_LB_Data *lb_data; +uint16_t *entry_list; +uint32_t base; +/* Length in bytes for entire structure */ +uint32_t lb_length += 32 /* Table length upto and including Entry Base Unit */ ++ 4 * num_initiator /* Initiator Proximity Domain List */ ++ 4 * num_target /* Target Proximity Domain List */ ++ 2 * num_initiator * num_target; /* Latency or Bandwidth Entries */ + +/* Type */ +build_append_int_noprefix(table_data, 1, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Length */ +build_append_int_noprefix(table_data, lb_length, 4); +/* Flags: Bits [3:0] Memory Hierarchy, Bits[7:4] Reserved */ +assert(!(hmat_lb->hierarchy >> 4)); +build_append_int_noprefix(table_data, hmat_lb->hierarchy, 1); +/* Data Type */ +build_append_int_noprefix(table_data, hmat_lb->data_type, 1); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Number of Initiator Proximity Domains (s) */ +build_append_int_noprefix(table_data, num_initiator, 4); +/* Number of Target Proximity Domains (t) */ +build_append_int_noprefix(table_data, num_target, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 4); + +/* Entry Base Unit */ +if (hmat_lb->data_type <= HMAT_LB_DATA_WRITE_LATENCY) { +/* Convert latency base from nanoseconds to picosecond */ +base = hmat_lb->base * 1000; +} else { +/* Convert bandwidth base from Byte to Megabyte */ +base = hmat_lb->base / MiB; +} +build_append_int_noprefix(table_data, base, 8); + +/* Initiator Proximity Domain List */ +for (i = 0; i < num_initiator; i++) { +build_append_int_noprefix(table_data, initiator_list[i], 4); +} + +/* Target Proximity Domain List */ +for (i = 0; i < num_target; i++) { +build_append_int_noprefix(table_data, i, 4); +} + +/* Latency or Bandwidth Entries */ +entry_list = g_malloc0(hmat_lb->list->len * sizeof(uint16_t)); +for (i = 0; i < hmat_lb->list->len; i++) { +lb_data = _array_index(hmat_lb->list, HMAT_LB_Data, i); +index = lb_data->initiator * num_target + lb_data->target; + +entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); +} + +for (i = 0; i < num_initiator * num_target; i++) { +build_append_int_noprefix(table_data, entry_list[i], 2); +} + +g_free(entry_list); +} + /* Build HMAT sub table structures */ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) { uint16_t flags; -int i; +uint32_t num_initiator = 0; +uint32_t initiator_list[MAX_NODES]; +int i, hierarchy, type; +HMAT_LB_Info *hmat_lb; for (i = 0; i < numa_state->num_nodes; i++) { flags = 0; @@ -82,6 +162,29 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); } + +for (i = 0; i < numa_state->num_nodes; i++) { +if (numa_state->nodes[i].has_cpu) { +initiator_list[num_initiator++] = i; +} +} + +/* + * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information + * Structure: Table 5-146 + */ +for (hierarchy = HMAT_LB_MEM_MEMORY; + hierarchy <= HMAT_LB_MEM_CACHE_3RD_LEVEL; hierarchy++) { +for (type = HMAT_LB_DATA_ACCESS_LATENCY; + type <=
[PATCH v16 06/14] tests: Add test for QAPI builtin type time
Add tests for time input such as zero, around limit of precision, signed upper limit, actual upper limit, beyond limits, time suffixes, and etc. Signed-off-by: Tao Xu --- Changes in v16: - Update the test cases Changes in v14: - Drop time unit picosecond (Eric) --- tests/test-keyval.c| 89 ++ tests/test-qobject-input-visitor.c | 29 ++ 2 files changed, 118 insertions(+) diff --git a/tests/test-keyval.c b/tests/test-keyval.c index fad941fcb8..39443f7e0c 100644 --- a/tests/test-keyval.c +++ b/tests/test-keyval.c @@ -457,6 +457,94 @@ static void test_keyval_visit_size(void) visit_free(v); } +static void test_keyval_visit_time(void) +{ +Error *err = NULL; +Visitor *v; +QDict *qdict; +uint64_t time; + +/* Lower limit zero */ +qdict = keyval_parse("time1=0", NULL, _abort); +v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); +qobject_unref(qdict); +visit_start_struct(v, NULL, NULL, 0, _abort); +visit_type_time(v, "time1", , _abort); +g_assert_cmpuint(time, ==, 0); +visit_check_struct(v, _abort); +visit_end_struct(v, NULL); +visit_free(v); + +/* Around limit of precision: UINT64_MAX - 1, UINT64_MAX */ +qdict = keyval_parse("time1=18446744073709551614," + "time2=18446744073709551615", + NULL, _abort); +v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); +qobject_unref(qdict); +visit_start_struct(v, NULL, NULL, 0, _abort); +visit_type_time(v, "time1", , _abort); +g_assert_cmphex(time, ==, 0xfffe); +visit_type_time(v, "time2", , _abort); +g_assert_cmphex(time, ==, 0x); +visit_check_struct(v, _abort); +visit_end_struct(v, NULL); +visit_free(v); + +/* Beyond limits */ +qdict = keyval_parse("time1=-1," + "time2=18446744073709551616", /* 2^64 */ + NULL, _abort); +v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); +qobject_unref(qdict); +visit_start_struct(v, NULL, NULL, 0, _abort); +visit_type_time(v, "time1", , ); +error_free_or_abort(); +visit_type_time(v, "time2", , ); +error_free_or_abort(); +visit_end_struct(v, NULL); +visit_free(v); + +/* Suffixes */ +qdict = keyval_parse("time1=2ns,time2=3.4us,time3=5ms,time4=600s", + NULL, _abort); +v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); +qobject_unref(qdict); +visit_start_struct(v, NULL, NULL, 0, _abort); +visit_type_time(v, "time1", , _abort); +g_assert_cmpuint(time, ==, 2); +visit_type_time(v, "time2", , _abort); +g_assert_cmpuint(time, ==, 3400); +visit_type_time(v, "time3", , _abort); +g_assert_cmphex(time, ==, 5 * 1000 * 1000); +visit_type_time(v, "time4", , _abort); +g_assert_cmphex(time, ==, 600 * 10LL); +visit_check_struct(v, _abort); +visit_end_struct(v, NULL); +visit_free(v); + +/* Beyond limit with suffix */ +qdict = keyval_parse("time1=1844674407370955s", NULL, _abort); +v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); +qobject_unref(qdict); +visit_start_struct(v, NULL, NULL, 0, _abort); +visit_type_time(v, "time1", , ); +error_free_or_abort(); +visit_end_struct(v, NULL); +visit_free(v); + +/* Trailing crap */ +qdict = keyval_parse("time1=89ks,time2=ns", NULL, _abort); +v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); +qobject_unref(qdict); +visit_start_struct(v, NULL, NULL, 0, _abort); +visit_type_time(v, "time1", , ); +error_free_or_abort(); +visit_type_time(v, "time2", , );; +error_free_or_abort(); +visit_end_struct(v, NULL); +visit_free(v); +} + static void test_keyval_visit_dict(void) { Error *err = NULL; @@ -645,6 +733,7 @@ int main(int argc, char *argv[]) g_test_add_func("/keyval/visit/bool", test_keyval_visit_bool); g_test_add_func("/keyval/visit/number", test_keyval_visit_number); g_test_add_func("/keyval/visit/size", test_keyval_visit_size); +g_test_add_func("/keyval/visit/time", test_keyval_visit_time); g_test_add_func("/keyval/visit/dict", test_keyval_visit_dict); g_test_add_func("/keyval/visit/list", test_keyval_visit_list); g_test_add_func("/keyval/visit/optional", test_keyval_visit_optional); diff --git a/tests/test-qobject-input-visitor.c b/tests/test-qobject-input-visitor.c index 6bacabf063..55138042b8 100644 --- a/tests/test-qobject-input-visitor.c +++ b/tests/test-qobject-input-visitor.c @@ -366,6 +366,31 @@ static void test_visitor_in_size_str_fail(TestInputVisitorData *data, error_free_or_abort(); } +static void test_visitor_in_time_str_keyval(TestInputVisitorData *data, +const void *unused) +{ +uint64_t res, value = 265 * 1000 * 1000; +Visitor *v; + +v = visitor_input_test_init_full(data, true,
[PATCH v16 05/14] qapi: Add builtin type time
Add optional builtin type time, fallback is uint64. This type use qemu_strtotime_ns() for pre-converting time suffix to numbers. Signed-off-by: Tao Xu --- No changes in v16. Changes in v14: - Drop time unit picosecond (Eric) --- include/qapi/visitor-impl.h | 4 include/qapi/visitor.h | 8 qapi/opts-visitor.c | 22 ++ qapi/qapi-visit-core.c | 12 qapi/qobject-input-visitor.c | 18 ++ qapi/trace-events| 1 + scripts/qapi/schema.py | 1 + 7 files changed, 66 insertions(+) diff --git a/include/qapi/visitor-impl.h b/include/qapi/visitor-impl.h index 8ccb3b6c20..e0979563c7 100644 --- a/include/qapi/visitor-impl.h +++ b/include/qapi/visitor-impl.h @@ -88,6 +88,10 @@ struct Visitor void (*type_size)(Visitor *v, const char *name, uint64_t *obj, Error **errp); +/* Optional; fallback is type_uint64() */ +void (*type_time)(Visitor *v, const char *name, uint64_t *obj, + Error **errp); + /* Must be set */ void (*type_bool)(Visitor *v, const char *name, bool *obj, Error **errp); diff --git a/include/qapi/visitor.h b/include/qapi/visitor.h index c5b23851a1..22242e706f 100644 --- a/include/qapi/visitor.h +++ b/include/qapi/visitor.h @@ -554,6 +554,14 @@ void visit_type_int64(Visitor *v, const char *name, int64_t *obj, void visit_type_size(Visitor *v, const char *name, uint64_t *obj, Error **errp); +/* + * Visit a uint64_t value. + * Like visit_type_uint64(), except that some visitors may choose to + * recognize numbers with timeunit suffix, such as "ns", "us" "ms" and "s". + */ +void visit_type_time(Visitor *v, const char *name, uint64_t *obj, + Error **errp); + /* * Visit a boolean value. * diff --git a/qapi/opts-visitor.c b/qapi/opts-visitor.c index 5fe0276c1c..59b575f0fc 100644 --- a/qapi/opts-visitor.c +++ b/qapi/opts-visitor.c @@ -526,6 +526,27 @@ opts_type_size(Visitor *v, const char *name, uint64_t *obj, Error **errp) processed(ov, name); } +static void +opts_type_time(Visitor *v, const char *name, uint64_t *obj, Error **errp) +{ +OptsVisitor *ov = to_ov(v); +const QemuOpt *opt; +int err; + +opt = lookup_scalar(ov, name, errp); +if (!opt) { +return; +} + +err = qemu_strtotime_ns(opt->str ? opt->str : "", NULL, obj); +if (err < 0) { +error_setg(errp, QERR_INVALID_PARAMETER_VALUE, opt->name, + "a time value"); +return; +} + +processed(ov, name); +} static void opts_optional(Visitor *v, const char *name, bool *present) @@ -573,6 +594,7 @@ opts_visitor_new(const QemuOpts *opts) ov->visitor.type_int64 = _type_int64; ov->visitor.type_uint64 = _type_uint64; ov->visitor.type_size = _type_size; +ov->visitor.type_time = _type_time; ov->visitor.type_bool = _type_bool; ov->visitor.type_str= _type_str; diff --git a/qapi/qapi-visit-core.c b/qapi/qapi-visit-core.c index 5365561b07..ac8896455c 100644 --- a/qapi/qapi-visit-core.c +++ b/qapi/qapi-visit-core.c @@ -277,6 +277,18 @@ void visit_type_size(Visitor *v, const char *name, uint64_t *obj, } } +void visit_type_time(Visitor *v, const char *name, uint64_t *obj, + Error **errp) +{ +assert(obj); +trace_visit_type_time(v, name, obj); +if (v->type_time) { +v->type_time(v, name, obj, errp); +} else { +v->type_uint64(v, name, obj, errp); +} +} + void visit_type_bool(Visitor *v, const char *name, bool *obj, Error **errp) { assert(obj); diff --git a/qapi/qobject-input-visitor.c b/qapi/qobject-input-visitor.c index 32236cbcb1..e476fe0d16 100644 --- a/qapi/qobject-input-visitor.c +++ b/qapi/qobject-input-visitor.c @@ -627,6 +627,23 @@ static void qobject_input_type_size_keyval(Visitor *v, const char *name, } } +static void qobject_input_type_time_keyval(Visitor *v, const char *name, + uint64_t *obj, Error **errp) +{ +QObjectInputVisitor *qiv = to_qiv(v); +const char *str = qobject_input_get_keyval(qiv, name, errp); + +if (!str) { +return; +} + +if (qemu_strtotime_ns(str, NULL, obj) < 0) { +/* TODO report -ERANGE more nicely */ +error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + full_name(qiv, name), "time"); +} +} + static void qobject_input_optional(Visitor *v, const char *name, bool *present) { QObjectInputVisitor *qiv = to_qiv(v); @@ -708,6 +725,7 @@ Visitor *qobject_input_visitor_new_keyval(QObject *obj) v->visitor.type_any = qobject_input_type_any; v->visitor.type_null = qobject_input_type_null; v->visitor.type_size = qobject_input_type_size_keyval; +v->visitor.type_time = qobject_input_type_time_keyval; v->keyval = true; return >visitor; diff --git a/qapi/trace-events b/qapi/trace-events index
[PATCH v16 03/14] util/cutils: refactor do_strtosz() to support suffixes list
Add do_strtomul() to convert string according to different suffixes. Reviewed-by: Eduardo Habkost Signed-off-by: Tao Xu --- No changes in v16. Changes in v15: - Add a new patch to refactor do_strtosz() (Eduardo) --- util/cutils.c | 72 ++- 1 file changed, 42 insertions(+), 30 deletions(-) diff --git a/util/cutils.c b/util/cutils.c index d94a468954..ffef92338a 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -181,41 +181,37 @@ int fcntl_setfl(int fd, int flag) } #endif -static int64_t suffix_mul(char suffix, int64_t unit) +static int64_t suffix_mul(const char *suffixes[], int num_suffix, + const char *endptr, int *offset, int64_t unit) { -switch (qemu_toupper(suffix)) { -case 'B': -return 1; -case 'K': -return unit; -case 'M': -return unit * unit; -case 'G': -return unit * unit * unit; -case 'T': -return unit * unit * unit * unit; -case 'P': -return unit * unit * unit * unit * unit; -case 'E': -return unit * unit * unit * unit * unit * unit; +int i, suffix_len; +int64_t mul = 1; + +for (i = 0; i < num_suffix; i++) { +suffix_len = strlen(suffixes[i]); +if (g_ascii_strncasecmp(suffixes[i], endptr, suffix_len) == 0) { +*offset = suffix_len; +return mul; +} +mul *= unit; } + return -1; } /* - * Convert string to bytes, allowing either B/b for bytes, K/k for KB, - * M/m for MB, G/g for GB or T/t for TB. End pointer will be returned - * in *end, if not NULL. Return -ERANGE on overflow, and -EINVAL on - * other error. + * Convert string according to different suffixes. End pointer will be returned + * in *end, if not NULL. Return -ERANGE on overflow, and -EINVAL on other error. */ -static int do_strtosz(const char *nptr, const char **end, - const char default_suffix, int64_t unit, +static int do_strtomul(const char *nptr, const char **end, + const char *suffixes[], int num_suffix, + const char *default_suffix, int64_t unit, uint64_t *result) { int retval; const char *endptr; -unsigned char c; int mul_required = 0; +int offset = 0; long double val, mul, integral, fraction; retval = qemu_strtold_finite(nptr, , ); @@ -226,12 +222,12 @@ static int do_strtosz(const char *nptr, const char **end, if (fraction != 0) { mul_required = 1; } -c = *endptr; -mul = suffix_mul(c, unit); + +mul = suffix_mul(suffixes, num_suffix, endptr, , unit); if (mul >= 0) { -endptr++; +endptr += offset; } else { -mul = suffix_mul(default_suffix, unit); +mul = suffix_mul(suffixes, num_suffix, default_suffix, , unit); assert(mul >= 0); } if (mul == 1 && mul_required) { @@ -256,19 +252,35 @@ out: return retval; } +/* + * Convert string to bytes, allowing either B/b for bytes, K/k for KB, + * M/m for MB, G/g for GB or T/t for TB. End pointer will be returned + * in *end, if not NULL. Return -ERANGE on overflow, and -EINVAL on + * other error. + */ +static int do_strtosz(const char *nptr, const char **end, + const char *default_suffix, int64_t unit, + uint64_t *result) +{ +static const char *suffixes[] = { "B", "K", "M", "G", "T", "P", "E" }; + +return do_strtomul(nptr, end, suffixes, ARRAY_SIZE(suffixes), + default_suffix, unit, result); +} + int qemu_strtosz(const char *nptr, const char **end, uint64_t *result) { -return do_strtosz(nptr, end, 'B', 1024, result); +return do_strtosz(nptr, end, "B", 1024, result); } int qemu_strtosz_MiB(const char *nptr, const char **end, uint64_t *result) { -return do_strtosz(nptr, end, 'M', 1024, result); +return do_strtosz(nptr, end, "M", 1024, result); } int qemu_strtosz_metric(const char *nptr, const char **end, uint64_t *result) { -return do_strtosz(nptr, end, 'B', 1000, result); +return do_strtosz(nptr, end, "B", 1000, result); } /** -- 2.20.1
[PATCH v16 02/14] util/cutils: Use qemu_strtold_finite to parse size
Support full 64bit precision, modify related test cases. Signed-off-by: Tao Xu --- New patch in v16 --- tests/test-cutils.c| 41 +--- tests/test-keyval.c| 47 +++--- tests/test-qemu-opts.c | 39 +++ util/cutils.c | 13 +--- 4 files changed, 24 insertions(+), 116 deletions(-) diff --git a/tests/test-cutils.c b/tests/test-cutils.c index 1aa8351520..465514b85f 100644 --- a/tests/test-cutils.c +++ b/tests/test-cutils.c @@ -1970,40 +1970,19 @@ static void test_qemu_strtosz_simple(void) g_assert_cmpint(err, ==, 0); g_assert_cmpint(res, ==, 12345); -/* Note: precision is 53 bits since we're parsing with strtod() */ +/* Note: precision is 64 bits (UINT64_MAX) */ -str = "9007199254740991"; /* 2^53-1 */ +str = "18446744073709551614"; /* UINT64_MAX - 1 */ err = qemu_strtosz(str, , ); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x1f); -g_assert(endptr == str + 16); - -str = "9007199254740992"; /* 2^53 */ -err = qemu_strtosz(str, , ); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x20); -g_assert(endptr == str + 16); - -str = "9007199254740993"; /* 2^53+1 */ -err = qemu_strtosz(str, , ); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x20); /* rounded to 53 bits */ -g_assert(endptr == str + 16); - -str = "18446744073709549568"; /* 0xf800 (53 msbs set) */ -err = qemu_strtosz(str, , ); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0xf800); +g_assert_cmpint(res, ==, 0xfffe); g_assert(endptr == str + 20); -str = "18446744073709550591"; /* 0xfbff */ +str = "18446744073709551615"; /* UINT64_MAX */ err = qemu_strtosz(str, , ); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0xf800); /* rounded to 53 bits */ +g_assert_cmpint(res, ==, 0x); g_assert(endptr == str + 20); - -/* 0x7e00..0x7fff get rounded to - * 0x8000, thus -ERANGE; see test_qemu_strtosz_erange() */ } static void test_qemu_strtosz_units(void) @@ -2145,16 +2124,6 @@ static void test_qemu_strtosz_erange(void) g_assert_cmpint(err, ==, -ERANGE); g_assert(endptr == str + 2); -str = "18446744073709550592"; /* 0xfc00 */ -err = qemu_strtosz(str, , ); -g_assert_cmpint(err, ==, -ERANGE); -g_assert(endptr == str + 20); - -str = "18446744073709551615"; /* 2^64-1 */ -err = qemu_strtosz(str, , ); -g_assert_cmpint(err, ==, -ERANGE); -g_assert(endptr == str + 20); - str = "18446744073709551616"; /* 2^64 */ err = qemu_strtosz(str, , ); g_assert_cmpint(err, ==, -ERANGE); diff --git a/tests/test-keyval.c b/tests/test-keyval.c index 09b0ae3c68..fad941fcb8 100644 --- a/tests/test-keyval.c +++ b/tests/test-keyval.c @@ -383,59 +383,26 @@ static void test_keyval_visit_size(void) visit_end_struct(v, NULL); visit_free(v); -/* Note: precision is 53 bits since we're parsing with strtod() */ +/* Note: precision is 64 bits (UINT64_MAX) */ -/* Around limit of precision: 2^53-1, 2^53, 2^53+1 */ -qdict = keyval_parse("sz1=9007199254740991," - "sz2=9007199254740992," - "sz3=9007199254740993", +/* Around limit of precision: UINT64_MAX - 1, UINT64_MAX */ +qdict = keyval_parse("sz1=18446744073709551614," + "sz2=18446744073709551615", NULL, _abort); v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); qobject_unref(qdict); visit_start_struct(v, NULL, NULL, 0, _abort); visit_type_size(v, "sz1", , _abort); -g_assert_cmphex(sz, ==, 0x1f); +g_assert_cmphex(sz, ==, 0xfffe); visit_type_size(v, "sz2", , _abort); -g_assert_cmphex(sz, ==, 0x20); -visit_type_size(v, "sz3", , _abort); -g_assert_cmphex(sz, ==, 0x20); -visit_check_struct(v, _abort); -visit_end_struct(v, NULL); -visit_free(v); - -/* Close to signed upper limit 0x7c00 (53 msbs set) */ -qdict = keyval_parse("sz1=9223372036854774784," /* 7c00 */ - "sz2=9223372036854775295", /* 7dff */ - NULL, _abort); -v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); -qobject_unref(qdict); -visit_start_struct(v, NULL, NULL, 0, _abort); -visit_type_size(v, "sz1", , _abort); -g_assert_cmphex(sz, ==, 0x7c00); -visit_type_size(v, "sz2", , _abort); -g_assert_cmphex(sz, ==, 0x7c00); -visit_check_struct(v, _abort); -visit_end_struct(v, NULL); -visit_free(v); - -/* Close to actual upper limit 0xf800 (53 msbs set) */ -
[PATCH v16 04/14] util/cutils: Add qemu_strtotime_ns()
To convert strings with time suffixes to numbers, support time unit are "ns" for nanosecond, "us" for microsecond, "ms" for millisecond or "s" for second. Add test for qemu_strtotime_ns, test the input of basic, time suffixes, float, invaild, trailing and overflow. Reviewed-by: Eduardo Habkost Signed-off-by: Tao Xu --- Changes in v16: - Update the test because precision is 64 bits Changes in v15: - Add a new patch to refactor do_strtosz() (Eduardo) - use ARRAY_SIZE(suffixes) instead of hardcoding the suffixes number (Eduardo) --- include/qemu/cutils.h | 1 + tests/test-cutils.c | 173 ++ util/cutils.c | 14 3 files changed, 188 insertions(+) diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h index 48cf9bf776..befa94f2d4 100644 --- a/include/qemu/cutils.h +++ b/include/qemu/cutils.h @@ -185,5 +185,6 @@ int uleb128_decode_small(const uint8_t *in, uint32_t *n); * *str1 is <, == or > than *str2. */ int qemu_pstrcmp0(const char **str1, const char **str2); +int qemu_strtotime_ns(const char *nptr, const char **end, uint64_t *result); #endif diff --git a/tests/test-cutils.c b/tests/test-cutils.c index 465514b85f..0ff1d816f1 100644 --- a/tests/test-cutils.c +++ b/tests/test-cutils.c @@ -2148,6 +2148,167 @@ static void test_qemu_strtosz_metric(void) g_assert(endptr == str + 6); } +static void test_qemu_strtotime_ns_simple(void) +{ +const char *str; +const char *endptr; +int err; +uint64_t res = 0xbaadf00d; + +str = "0"; +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 0); +g_assert(endptr == str + 1); + +str = "56789"; +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 56789); +g_assert(endptr == str + 5); + +err = qemu_strtotime_ns(str, NULL, ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 56789); + +/* Note: precision is 64 bits (UINT64_MAX) */ + +str = "18446744073709551614"; /* UINT64_MAX - 1 */ +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 0xfffe); +g_assert(endptr == str + 20); + +str = "18446744073709551615"; /* UINT64_MAX */ +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 0x); +g_assert(endptr == str + 20); +} + +static void test_qemu_strtotime_ns_units(void) +{ +const char *ns = "1ns"; +const char *us = "1us"; +const char *ms = "1ms"; +const char *s = "1s"; +int err; +const char *endptr; +uint64_t res = 0xbaadf00d; + +/* default time unit is ns */ +err = qemu_strtotime_ns(ns, , ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 1); +g_assert(endptr == ns + 3); + +err = qemu_strtotime_ns(us, , ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 1000); +g_assert(endptr == us + 3); + +err = qemu_strtotime_ns(ms, , ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 100); +g_assert(endptr == ms + 3); + +err = qemu_strtotime_ns(s, , ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 10LL); +g_assert(endptr == s + 2); +} + +static void test_qemu_strtotime_ns_float(void) +{ +const char *str = "56.789us"; +int err; +const char *endptr; +uint64_t res = 0xbaadf00d; + +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 56.789 * 1000); +g_assert(endptr == str + 8); +} + +static void test_qemu_strtotime_ns_invalid(void) +{ +const char *str; +const char *endptr; +int err; +uint64_t res = 0xbaadf00d; + +str = ""; +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, -EINVAL); +g_assert(endptr == str); + +str = " \t "; +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, -EINVAL); +g_assert(endptr == str); + +str = "crap"; +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, -EINVAL); +g_assert(endptr == str); + +str = "inf"; +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, -EINVAL); +g_assert(endptr == str); + +str = "NaN"; +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, -EINVAL); +g_assert(endptr == str); +} + +static void test_qemu_strtotime_ns_trailing(void) +{ +const char *str; +const char *endptr; +int err; +uint64_t res = 0xbaadf00d; + +str = "123xxx"; + +err = qemu_strtotime_ns(str, NULL, ); +g_assert_cmpint(err, ==, -EINVAL); + +str = "1msxxx"; +err = qemu_strtotime_ns(str, , ); +g_assert_cmpint(err, ==, 0); +g_assert_cmpint(res, ==, 100); +g_assert(endptr == str + 3); + +err = qemu_strtotime_ns(str, NULL, ); +g_assert_cmpint(err, ==, -EINVAL); +} + +static void
[PATCH v16 01/14] util/cutils: Add Add qemu_strtold and qemu_strtold_finite
Work like qemu_strtod() and qemu_strtold_finite, except store long double. Signed-off-by: Tao Xu --- New patch in v16. --- include/qemu/cutils.h | 3 +++ util/cutils.c | 48 ++- 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/include/qemu/cutils.h b/include/qemu/cutils.h index b54c847e0f..48cf9bf776 100644 --- a/include/qemu/cutils.h +++ b/include/qemu/cutils.h @@ -146,6 +146,9 @@ int qemu_strtou64(const char *nptr, const char **endptr, int base, uint64_t *result); int qemu_strtod(const char *nptr, const char **endptr, double *result); int qemu_strtod_finite(const char *nptr, const char **endptr, double *result); +int qemu_strtold(const char *nptr, const char **endptr, long double *result); +int qemu_strtold_finite(const char *nptr, const char **endptr, +long double *result); int parse_uint(const char *s, unsigned long long *value, char **endptr, int base); diff --git a/util/cutils.c b/util/cutils.c index fd591cadf0..5db3b2add5 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -553,7 +553,7 @@ int qemu_strtou64(const char *nptr, const char **endptr, int base, /** * Convert string @nptr to a double. - * + * * This is a wrapper around strtod() that is harder to misuse. * Semantics of @nptr and @endptr match strtod() with differences * noted below. @@ -616,6 +616,52 @@ int qemu_strtod_finite(const char *nptr, const char **endptr, double *result) return ret; } +/* + * Convert string @nptr to a long double. + * + * Works like qemu_strtod(), except it stores long double. + */ +int qemu_strtold(const char *nptr, const char **endptr, long double *result) +{ +char *ep; + +if (!nptr) { +if (endptr) { +*endptr = nptr; +} +return -EINVAL; +} + +errno = 0; +*result = strtold(nptr, ); +return check_strtox_error(nptr, ep, endptr, errno); +} + +/* + * Convert string @nptr to a finite long double. + * + * Works like qemu_strtod_finite(), except it stores long double. + */ +int qemu_strtold_finite(const char *nptr, const char **endptr, +long double *result) +{ +long double tmp; +int ret; + +ret = qemu_strtold(nptr, endptr, ); +if (!ret && !isfinite(tmp)) { +if (endptr) { +*endptr = nptr; +} +ret = -EINVAL; +} + +if (ret != -EINVAL) { +*result = tmp; +} +return ret; +} + /** * Searches for the first occurrence of 'c' in 's', and returns a pointer * to the trailing null byte if none was found. -- 2.20.1
[PATCH v16 00/14] Build ACPI Heterogeneous Memory Attribute Table (HMAT)
This series of patches will build Heterogeneous Memory Attribute Table (HMAT) according to the command line. The ACPI HMAT describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use HMAT information as hint for optimization. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. The V15 patches link: https://patchwork.kernel.org/cover/11231971/ Changelog: v16: - Add and use qemu_strtold_finite to parse size, support full 64bit precision, modify related test cases (Eduardo and Markus) - Simplify struct HMAT_LB_Info and related code, unify latency and bandwidth (Igor) - Add cross check with hmat_lb data (Igor) - Fields in Cache Attributes are promoted to uint32_t before shifting (Igor) - Add case for QMP build HMAT (Igor) v15: - Add a new patch to refactor do_strtosz() (Eduardo) - Make tests without breaking CI (Michael) v14: - Reuse the codes of do_strtosz to build qemu_strtotime_ns (Eduardo) - Squash patch v13 01/12 and 02/12 together (Daniel and Eduardo) - Drop time unit picosecond (Eric) - Use qemu ctz64 and clz64 instead of builtin function v13: - Modify some text description - Drop "initiator_valid" field in struct NodeInfo - Reuse Garray to store the raw bandwidth and bandwidth data - Calculate common base unit using range bitmap - Add a patch to alculate hmat latency and bandwidth entry list - Drop the total_levels option and use readable cache size - Remove the unnecessary head file - Use decimal notation with appropriate suffix for cache size v12: - Fix a bug that a memory-only node without initiator setting doesn't report error. (reported by Danmei Wei) - Fix a bug that if HMAT is enabled and without hmat-lb setting, QEMU will crash. (reported by Danmei Wei) v11: - Move numa option patches forward. - Add num_initiator in Numa_state to record the number of initiators. - Simplify struct HMAT_LB_Info, use uint64_t array to store data. - Drop hmat_get_base(). - Calculate base in build_hmat_lb(). v10: - Add qemu_strtotime_ps() to convert strings with time suffixes to numbers, and add some tests for it. - Add qapi buildin type time, and add some tests for it. - Add machine oprion properties "-machine hmat=on|off" for enabling or disabling HMAT in QEMU. Liu Jingqi (5): numa: Extend CLI to provide memory latency and bandwidth information numa: Extend CLI to provide memory side cache information hmat acpi: Build Memory Proximity Domain Attributes Structure(s) hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s) hmat acpi: Build Memory Side Cache Information Structure(s) Tao Xu (9): util/cutils: Add Add qemu_strtold and qemu_strtold_finite util/cutils: Use qemu_strtold_finite to parse size util/cutils: refactor do_strtosz() to support suffixes list util/cutils: Add qemu_strtotime_ns() qapi: Add builtin type time tests: Add test for QAPI builtin type time numa: Extend CLI to provide initiator information for numa nodes tests/numa: Add case for QMP build HMAT tests/bios-tables-test: add test cases for ACPI HMAT hw/acpi/Kconfig | 7 +- hw/acpi/Makefile.objs | 1 + hw/acpi/hmat.c| 269 + hw/acpi/hmat.h| 42 hw/core/machine.c | 64 ++ hw/core/numa.c| 270 ++ hw/i386/acpi-build.c | 5 + include/qapi/visitor-impl.h | 4 + include/qapi/visitor.h| 8 + include/qemu/cutils.h | 4 + include/sysemu/numa.h | 84 qapi/machine.json | 178 - qapi/opts-visitor.c | 22 +++ qapi/qapi-visit-core.c| 12 ++ qapi/qobject-input-visitor.c | 18 ++ qapi/trace-events | 1 + qemu-options.hx | 96 - scripts/qapi/schema.py| 1 + tests/bios-tables-test-allowed-diff.h | 8 + tests/bios-tables-test.c | 44 + tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 tests/numa-test.c | 51 + tests/test-cutils.c | 214 tests/test-keyval.c | 136 + tests/test-qemu-opts.c| 39 +---
Re: [PATCH] Modify tests to work with clang
On 15/11/2019 05.38, Taylor Simpson wrote: > Signed-off-by: Taylor Simpson > --- > tests/tcg/multiarch/float_helpers.c | 13 - > tests/tcg/multiarch/linux-test.c| 2 +- > 2 files changed, 13 insertions(+), 2 deletions(-) > > diff --git a/tests/tcg/multiarch/float_helpers.c > b/tests/tcg/multiarch/float_helpers.c > index 8ee7903..437247c 100644 > --- a/tests/tcg/multiarch/float_helpers.c > +++ b/tests/tcg/multiarch/float_helpers.c > @@ -26,6 +26,17 @@ > > #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) > > +/* > +| The macro QEMU_GNUC_PREREQ tests for minimum version of the GNU C compiler. > +| The code is a copy of SOFTFLOAT_GNUC_PREREQ, see softfloat-macros.h. > +**/ > +#if defined(__GNUC__) && defined(__GNUC_MINOR__) > +# define QEMU_GNUC_PREREQ(maj, min) \ > + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) > +#else > +# define QEMU_GNUC_PREREQ(maj, min) 0 > +#endif > + > /* > * Half Precision Numbers > * > @@ -79,7 +90,7 @@ char *fmt_16(uint16_t num) > > #ifndef SNANF > /* Signaling NaN macros, if supported. */ > -# if __GNUC_PREREQ(3, 3) > +# if defined(__clang__) || QEMU_GNUC_PREREQ(3, 3) IIRC clearly, Clang reports itself as GCC 4.2, so you certainly don't need the defined(__clang__) here. But additionally, we require at least GCC 4.8 to compile QEMU these days (see the check in the configure script), so you can even remove this #if statement completely, so that the following #defines are simply always used. Thomas
Re: [SeaBIOS] Re: [PATCH] ahci: zero-initialize port struct
On Thu, Nov 14, 2019 at 11:08:59AM +0200, Sam Eiderman wrote: > Do you want a single commit with the changes? That is the idea, unless it is too messy. I don't have v4 any more, looks like I've deleted v4 instead of v3 while cleaning up my mail folders, so I can't easily check. Do you have v3 and v4 as git branches somewhere? > I am afraid it will be slightly unreadable when looking at file histories. > The only commit that didn't change was: > [SeaBIOS] [PATCH v4 2/5] boot: Reorder functions in boot.c Hmm, looks like there have been more changes than I remember. Maybe it's better to revert and re-apply the changed patches then. cheers, Gerd
Re: [PATCH] Implement backend program convention command for vhost-user-blk
Patchew URL: https://patchew.org/QEMU/20191115060925.12346-1-mic...@redhat.com/ Hi, This series seems to have some coding style problems. See output below for more information: Subject: [PATCH] Implement backend program convention command for vhost-user-blk Type: series Message-id: 20191115060925.12346-1-mic...@redhat.com === TEST SCRIPT BEGIN === #!/bin/bash git rev-parse base > /dev/null || exit 0 git config --local diff.renamelimit 0 git config --local diff.renames True git config --local diff.algorithm histogram ./scripts/checkpatch.pl --mailback base.. === TEST SCRIPT END === Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384 Switched to a new branch 'test' 2a8f5a3 Implement backend program convention command for vhost-user-blk === OUTPUT BEGIN === ERROR: trailing whitespace #44: FILE: contrib/vhost-user-blk/vhost-user-blk.c:600: +{ $ ERROR: suspect code indent for conditional statements (8, 11) #96: FILE: contrib/vhost-user-blk/vhost-user-blk.c:631: +if (lsock < 0) { + exit(EXIT_FAILURE); total: 2 errors, 0 warnings, 135 lines checked Commit 2a8f5a38e630 (Implement backend program convention command for vhost-user-blk) has style problems, please review. If any of these errors are false positives report them to the maintainer, see CHECKPATCH in MAINTAINERS. === OUTPUT END === Test command exited with code: 1 The full log is available at http://patchew.org/logs/20191115060925.12346-1-mic...@redhat.com/testing.checkpatch/?type=message. --- Email generated automatically by Patchew [https://patchew.org/]. Please send your feedback to patchew-de...@redhat.com
[PATCH] Implement backend program convention command for vhost-user-blk
From: michan Signed-off-by: Micky Yun Chan (michiboo) --- contrib/vhost-user-blk/vhost-user-blk.c | 95 +++-- 1 file changed, 57 insertions(+), 38 deletions(-) diff --git a/contrib/vhost-user-blk/vhost-user-blk.c b/contrib/vhost-user-blk/vhost-user-blk.c index ae61034656..435ddc46b2 100644 --- a/contrib/vhost-user-blk/vhost-user-blk.c +++ b/contrib/vhost-user-blk/vhost-user-blk.c @@ -576,70 +576,89 @@ vub_new(char *blk_file) return vdev_blk; } +static int opt_fdnum = -1; +static char *opt_socket_path; +static char *opt_blk_file; +static gboolean opt_print_caps; +static gboolean opt_read_only; + + +static GOptionEntry entries[] = { +{ "print-capabilities", 'c', 0, G_OPTION_ARG_NONE, _print_caps, + "Print capabilities", NULL }, +{ "fd", 'f', 0, G_OPTION_ARG_INT, _fdnum, + "Use inherited fd socket", "FDNUM" }, +{ "socket-path", 's', 0, G_OPTION_ARG_FILENAME, _socket_path, + "Use UNIX socket path", "PATH" }, +{"blk-file", 'b', 0, G_OPTION_ARG_FILENAME, _blk_file, + "block device or file path", "PATH"}, +{ "read-only", 'r', 0, G_OPTION_ARG_NONE, _read_only, + "Enable read-only", NULL } +}; + int main(int argc, char **argv) -{ -int opt; -char *unix_socket = NULL; -char *blk_file = NULL; -bool enable_ro = false; +{ int lsock = -1, csock = -1; VubDev *vdev_blk = NULL; +GError *error = NULL; +GOptionContext *context; -while ((opt = getopt(argc, argv, "b:rs:h")) != -1) { -switch (opt) { -case 'b': -blk_file = g_strdup(optarg); -break; -case 's': -unix_socket = g_strdup(optarg); -break; -case 'r': -enable_ro = true; -break; -case 'h': -default: -printf("Usage: %s [ -b block device or file, -s UNIX domain socket" - " | -r Enable read-only ] | [ -h ]\n", argv[0]); -return 0; -} +context = g_option_context_new(NULL); +g_option_context_add_main_entries(context, entries, NULL); +if (!g_option_context_parse(context, , , )) { +g_printerr("Option parsing failed: %s\n", error->message); +exit(EXIT_FAILURE); +} +if (opt_print_caps) { +g_print("{\n"); +g_print(" \"type\": \"blk\",\n"); +g_print(" \"features\": [\n"); +g_print("\"blk-file\",\n"); +g_print("\"read-only\"\n"); +g_print(" ]\n"); +g_print("}\n"); +exit(EXIT_SUCCESS); } -if (!unix_socket || !blk_file) { +if (!opt_blk_file) { printf("Usage: %s [ -b block device or file, -s UNIX domain socket" " | -r Enable read-only ] | [ -h ]\n", argv[0]); return -1; } -lsock = unix_sock_new(unix_socket); -if (lsock < 0) { -goto err; +if (opt_socket_path) { +lsock = unix_sock_new(opt_socket_path); +if (lsock < 0) { + exit(EXIT_FAILURE); +} +} else { +lsock = opt_fdnum; } -csock = accept(lsock, (void *)0, (void *)0); +csock = accept(lsock, NULL, NULL); if (csock < 0) { -fprintf(stderr, "Accept error %s\n", strerror(errno)); -goto err; +g_printerr("Accept error %s\n", strerror(errno)); +exit(EXIT_FAILURE); } -vdev_blk = vub_new(blk_file); +vdev_blk = vub_new(opt_blk_file); if (!vdev_blk) { -goto err; +exit(EXIT_FAILURE); } -if (enable_ro) { +if (opt_read_only) { vdev_blk->enable_ro = true; } if (!vug_init(_blk->parent, VHOST_USER_BLK_MAX_QUEUES, csock, vub_panic_cb, _iface)) { -fprintf(stderr, "Failed to initialized libvhost-user-glib\n"); -goto err; +g_printerr("Failed to initialized libvhost-user-glib\n"); +exit(EXIT_FAILURE); } g_main_loop_run(vdev_blk->loop); +g_main_loop_unref(vdev_blk->loop); vug_deinit(_blk->parent); - -err: vub_free(vdev_blk); if (csock >= 0) { close(csock); @@ -647,8 +666,8 @@ err: if (lsock >= 0) { close(lsock); } -g_free(unix_socket); -g_free(blk_file); +g_free(opt_socket_path); +g_free(opt_blk_file); return 0; } -- 2.21.0
Re: [PATCH] Modify tests to work with clang
Patchew URL: https://patchew.org/QEMU/1573792691-398-1-git-send-email-tsimp...@quicinc.com/ Hi, This series seems to have some coding style problems. See output below for more information: Subject: [PATCH] Modify tests to work with clang Type: series Message-id: 1573792691-398-1-git-send-email-tsimp...@quicinc.com === TEST SCRIPT BEGIN === #!/bin/bash git rev-parse base > /dev/null || exit 0 git config --local diff.renamelimit 0 git config --local diff.renames True git config --local diff.algorithm histogram ./scripts/checkpatch.pl --mailback base.. === TEST SCRIPT END === Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384 Switched to a new branch 'test' 40fa5c0 Modify tests to work with clang === OUTPUT BEGIN === WARNING: Block comments use a leading /* on a separate line #18: FILE: tests/tcg/multiarch/float_helpers.c:29: +/* WARNING: Block comments use * on subsequent lines #19: FILE: tests/tcg/multiarch/float_helpers.c:30: +/* +| The macro QEMU_GNUC_PREREQ tests for minimum version of the GNU C compiler. WARNING: Block comments use a trailing */ on a separate line #21: FILE: tests/tcg/multiarch/float_helpers.c:32: +**/ WARNING: architecture specific defines should be avoided #22: FILE: tests/tcg/multiarch/float_helpers.c:33: +#if defined(__GNUC__) && defined(__GNUC_MINOR__) WARNING: architecture specific defines should be avoided #37: FILE: tests/tcg/multiarch/float_helpers.c:93: +# if defined(__clang__) || QEMU_GNUC_PREREQ(3, 3) ERROR: Use of volatile is usually wrong, please add a comment #50: FILE: tests/tcg/multiarch/linux-test.c:488: +*(volatile uint8_t *)0 = 0; total: 1 errors, 5 warnings, 33 lines checked Commit 40fa5c0d7a1b (Modify tests to work with clang) has style problems, please review. If any of these errors are false positives report them to the maintainer, see CHECKPATCH in MAINTAINERS. === OUTPUT END === Test command exited with code: 1 The full log is available at http://patchew.org/logs/1573792691-398-1-git-send-email-tsimp...@quicinc.com/testing.checkpatch/?type=message. --- Email generated automatically by Patchew [https://patchew.org/]. Please send your feedback to patchew-de...@redhat.com
RE: [PATCH v9 Kernel 2/5] vfio iommu: Add ioctl defination to get dirty pages bitmap.
> From: Alex Williamson > Sent: Friday, November 15, 2019 11:22 AM > > On Thu, 14 Nov 2019 21:40:35 -0500 > Yan Zhao wrote: > > > On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote: > > > On Fri, 15 Nov 2019 00:26:07 +0530 > > > Kirti Wankhede wrote: > > > > > > > On 11/14/2019 1:37 AM, Alex Williamson wrote: > > > > > On Thu, 14 Nov 2019 01:07:21 +0530 > > > > > Kirti Wankhede wrote: > > > > > > > > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote: > > > > >>> On Tue, 12 Nov 2019 22:33:37 +0530 > > > > >>> Kirti Wankhede wrote: > > > > >>> > > > > All pages pinned by vendor driver through vfio_pin_pages API > should be > > > > considered as dirty during migration. IOMMU container > maintains a list of > > > > all such pinned pages. Added an ioctl defination to get bitmap of > such > > > > >>> > > > > >>> definition > > > > >>> > > > > pinned pages for requested IO virtual address range. > > > > >>> > > > > >>> Additionally, all mapped pages are considered dirty when > physically > > > > >>> mapped through to an IOMMU, modulo we discussed devices > opting in to > > > > >>> per page pinning to indicate finer granularity with a TBD > mechanism to > > > > >>> figure out if any non-opt-in devices remain. > > > > >>> > > > > >> > > > > >> You mean, in case of device direct assignment (device pass > through)? > > > > > > > > > > Yes, or IOMMU backed mdevs. If vfio_dmas in the container are > fully > > > > > pinned and mapped, then the correct dirty page set is all mapped > pages. > > > > > We discussed using the vpfn list as a mechanism for vendor drivers > to > > > > > reduce their migration footprint, but we also discussed that we > would > > > > > need a way to determine that all participants in the container have > > > > > explicitly pinned their working pages or else we must consider the > > > > > entire potential working set as dirty. > > > > > > > > > > > > > How can vendor driver tell this capability to iommu module? Any > suggestions? > > > > > > I think it does so by pinning pages. Is it acceptable that if the > > > vendor driver pins any pages, then from that point forward we consider > > > the IOMMU group dirty page scope to be limited to pinned pages? > There > > > are complications around non-singleton IOMMU groups, but I think > we're > > > already leaning towards that being a non-worthwhile problem to solve. > > > So if we require that only singleton IOMMU groups can pin pages and > we > > > pass the IOMMU group as a parameter to > > > vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a > > > flag on its local vfio_group struct to indicate dirty page scope is > > > limited to pinned pages. We might want to keep a flag on the > > > vfio_iommu struct to indicate if all of the vfio_groups for each > > > vfio_domain in the vfio_iommu.domain_list dirty page scope limited to > > > pinned pages as an optimization to avoid walking lists too often. Then > > > we could test if vfio_iommu.domain_list is not empty and this new flag > > > does not limit the dirty page scope, then everything within each > > > vfio_dma is considered dirty. > > > > > > > hi Alex > > could you help clarify whether my understandings below are right? > > In future, > > 1. for mdev and for passthrough device withoug hardware ability to track > > dirty pages, the vendor driver has to explicitly call > > vfio_pin_pages()/vfio_unpin_pages() + a flag to tell vfio its dirty page > > set. > > For non-IOMMU backed mdevs without hardware dirty page tracking, > there's no change to the vendor driver currently. Pages pinned by the > vendor driver are marked as dirty. What about the vendor driver can figure out, in software means, which pinned pages are actually dirty? In that case, would a separate mark_dirty interface make more sense? Or introduce read/write flag to the pin_pages interface similar to DMA API? Existing drivers always set both r/w flags but just in case then a specific driver may set read-only or write-only... > > For any IOMMU backed device, mdev or direct assignment, all mapped > memory would be considered dirty unless there are explicit calls to pin > pages on top of the IOMMU page pinning and mapping. These would likely > be enabled only when the device is in the _SAVING device_state. > > > 2. for those devices with hardware ability to track dirty pages, will still > > provide a callback to vendor driver to get dirty pages. (as for those > devices, > > it is hard to explicitly call vfio_pin_pages()/vfio_unpin_pages()) > > > > 3. for devices relying on dirty bit info in physical IOMMU, there > > will be a callback to physical IOMMU driver to get dirty page set from > > vfio. > > The proposal here does not cover exactly how these would be > implemented, it only establishes the container as the point of user > interaction with the dirty bitmap and hopefully allows us to maintain > that interface regardless of whether we have dirty tracking at the > device
[PULL 4/4] riscv/virt: Increase flash size
From: Alistair Francis Coreboot developers have requested that they have at least 32MB of flash to load binaries. We currently have 32MB of flash, but it is split in two to allow loading two flash binaries. Let's increase the flash size from 32MB to 64MB to ensure we have a single region that is 32MB. No QEMU release has include flash in the RISC-V virt machine, so this isn't a breaking change. Signed-off-by: Alistair Francis Signed-off-by: Palmer Dabbelt --- hw/riscv/virt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hw/riscv/virt.c b/hw/riscv/virt.c index cc8f311e6bbb..23f340df193e 100644 --- a/hw/riscv/virt.c +++ b/hw/riscv/virt.c @@ -62,7 +62,7 @@ static const struct MemmapEntry { [VIRT_PLIC] ={ 0xc00, 0x400 }, [VIRT_UART0] = { 0x1000, 0x100 }, [VIRT_VIRTIO] = { 0x10001000,0x1000 }, -[VIRT_FLASH] = { 0x2000, 0x200 }, +[VIRT_FLASH] = { 0x2000, 0x400 }, [VIRT_DRAM] ={ 0x8000, 0x0 }, [VIRT_PCIE_MMIO] = { 0x4000,0x4000 }, [VIRT_PCIE_PIO] ={ 0x0300,0x0001 }, -- 2.21.0
[PULL 2/4] target/riscv: Remove atomic accesses to MIP CSR
From: Alistair Francis Instead of relying on atomics to access the MIP register let's update our helper function to instead just lock the IO mutex thread before writing. This follows the same concept as used in PPC for handling interrupts Signed-off-by: Alistair Francis Reviewed-by: Richard Henderson Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- target/riscv/cpu.c| 5 ++-- target/riscv/cpu.h| 9 target/riscv/cpu_helper.c | 48 +++ target/riscv/csr.c| 2 +- 4 files changed, 21 insertions(+), 43 deletions(-) diff --git a/target/riscv/cpu.c b/target/riscv/cpu.c index 3939963b71ab..d37861a4305b 100644 --- a/target/riscv/cpu.c +++ b/target/riscv/cpu.c @@ -224,8 +224,7 @@ static void riscv_cpu_dump_state(CPUState *cs, FILE *f, int flags) #ifndef CONFIG_USER_ONLY qemu_fprintf(f, " %s " TARGET_FMT_lx "\n", "mhartid ", env->mhartid); qemu_fprintf(f, " %s " TARGET_FMT_lx "\n", "mstatus ", env->mstatus); -qemu_fprintf(f, " %s " TARGET_FMT_lx "\n", "mip ", - (target_ulong)atomic_read(>mip)); +qemu_fprintf(f, " %s 0x%x\n", "mip ", env->mip); qemu_fprintf(f, " %s " TARGET_FMT_lx "\n", "mie ", env->mie); qemu_fprintf(f, " %s " TARGET_FMT_lx "\n", "mideleg ", env->mideleg); qemu_fprintf(f, " %s " TARGET_FMT_lx "\n", "medeleg ", env->medeleg); @@ -275,7 +274,7 @@ static bool riscv_cpu_has_work(CPUState *cs) * Definition of the WFI instruction requires it to ignore the privilege * mode and delegation registers, but respect individual enables */ -return (atomic_read(>mip) & env->mie) != 0; +return (env->mip & env->mie) != 0; #else return true; #endif diff --git a/target/riscv/cpu.h b/target/riscv/cpu.h index 8c64c68538d8..e59343e13c02 100644 --- a/target/riscv/cpu.h +++ b/target/riscv/cpu.h @@ -121,15 +121,6 @@ struct CPURISCVState { target_ulong mhartid; target_ulong mstatus; -/* - * CAUTION! Unlike the rest of this struct, mip is accessed asynchonously - * by I/O threads. It should be read with atomic_read. It should be updated - * using riscv_cpu_update_mip with the iothread mutex held. The iothread - * mutex must be held because mip must be consistent with the CPU inturrept - * state. riscv_cpu_update_mip calls cpu_interrupt or cpu_reset_interrupt - * wuth the invariant that CPU_INTERRUPT_HARD is set iff mip is non-zero. - * mip is 32-bits to allow atomic_read on 32-bit hosts. - */ uint32_t mip; uint32_t miclaim; diff --git a/target/riscv/cpu_helper.c b/target/riscv/cpu_helper.c index f13131a51bde..767c8762ac42 100644 --- a/target/riscv/cpu_helper.c +++ b/target/riscv/cpu_helper.c @@ -19,6 +19,7 @@ #include "qemu/osdep.h" #include "qemu/log.h" +#include "qemu/main-loop.h" #include "cpu.h" #include "exec/exec-all.h" #include "tcg-op.h" @@ -38,7 +39,7 @@ static int riscv_cpu_local_irq_pending(CPURISCVState *env) { target_ulong mstatus_mie = get_field(env->mstatus, MSTATUS_MIE); target_ulong mstatus_sie = get_field(env->mstatus, MSTATUS_SIE); -target_ulong pending = atomic_read(>mip) & env->mie; +target_ulong pending = env->mip & env->mie; target_ulong mie = env->priv < PRV_M || (env->priv == PRV_M && mstatus_mie); target_ulong sie = env->priv < PRV_S || (env->priv == PRV_S && mstatus_sie); target_ulong irqs = (pending & ~env->mideleg & -mie) | @@ -92,42 +93,29 @@ int riscv_cpu_claim_interrupts(RISCVCPU *cpu, uint32_t interrupts) } } -struct CpuAsyncInfo { -uint32_t new_mip; -}; - -static void riscv_cpu_update_mip_irqs_async(CPUState *target_cpu_state, -run_on_cpu_data data) -{ -struct CpuAsyncInfo *info = (struct CpuAsyncInfo *) data.host_ptr; - -if (info->new_mip) { -cpu_interrupt(target_cpu_state, CPU_INTERRUPT_HARD); -} else { -cpu_reset_interrupt(target_cpu_state, CPU_INTERRUPT_HARD); -} - -g_free(info); -} - uint32_t riscv_cpu_update_mip(RISCVCPU *cpu, uint32_t mask, uint32_t value) { CPURISCVState *env = >env; CPUState *cs = CPU(cpu); -struct CpuAsyncInfo *info; -uint32_t old, new, cmp = atomic_read(>mip); +uint32_t old = env->mip; +bool locked = false; + +if (!qemu_mutex_iothread_locked()) { +locked = true; +qemu_mutex_lock_iothread(); +} -do { -old = cmp; -new = (old & ~mask) | (value & mask); -cmp = atomic_cmpxchg(>mip, old, new); -} while (old != cmp); +env->mip = (env->mip & ~mask) | (value & mask); -info = g_new(struct CpuAsyncInfo, 1); -info->new_mip = new; +if (env->mip) { +cpu_interrupt(cs, CPU_INTERRUPT_HARD); +} else { +cpu_reset_interrupt(cs, CPU_INTERRUPT_HARD); +} -async_run_on_cpu(cs, riscv_cpu_update_mip_irqs_async, -
[PULL 3/4] opensbi: Upgrade from v0.4 to v0.5
From: Alistair Francis This release has: Lot of critical fixes Hypervisor extension support SBI v0.2 base extension support Debug prints support Handle traps when doing unpriv load/store Allow compiling without FP support Use git describe to generate boot-time banner Andes AE350 platform support ShortLog: Anup Patel (14): platform: sifive/fu540: Move FDT further up lib: Allow compiling without FP support lib: Introduce sbi_dprintf() API lib: Use sbi_dprintf() for invalid CSRs lib: Handle traps when doing unpriv load/store in get_insn() lib: Delegate supervisor ecall to HS-mode when H extension available lib: Extend sbi_hart_switch_mode() to support hypervisor extension lib: Extend sbi_trap_redirect() for hypervisor extension lib: Redirect WFI trapped from VS/VU mode to HS-mode include: Extend get_insn() to read instruction from VS/VU mode lib: Emulate HTIMEDELTA CSR for platforms not having TIME CSR Makefile: Minor fix in OPENSBI_VERSION_GIT lib: Fix coldboot race condition observed on emulators/simulators include: Bump-up version to 0.5 Atish Patra (16): lib: Provide an atomic exchange function unsigned long lib: Fix race conditions in tlb fifo access. platform: Remove the ipi_sync method from all platforms. lib: Fix timer for 32 bit lib: Support atomic swap instructions lib: Upgrade to full flush if size is at least threshold docs: Update the fu540 platform guide as per U-Boot documents. lib: Change tlb range flush threshold to 4k page instead of 1G lib: provide a platform specific tlb range flush threshold lib: Fix tlb flush range limit value Test: Move test payload related code out of interface header lib: Align error codes as per SBI specification. lib: Rename existing SBI implementation as 0.1. lib: Remove redundant variable assignment lib: Implement SBI v0.2 lib: Provide a platform hook to implement vendor specific SBI extensions. Bin Meng (6): platform: sifive: fu540: Use standard value string for cpu node status README: Document 32-bit / 64-bit images build treewide: Use conventional names for 32-bit and 64-bit platform: sifive: fu540: Expand FDT size before any patching firmware: Use macro instead of magic number for boot status docs: platform: Update descriptions for qemu/sifive_u support Damien Le Moal (4): kendryte/k210: Use sifive UART driver kendryte/k210: remove sysctl code README: Update license information kendryte/k210: remove unused file Georg Kotheimer (1): utils: Use cpu_to_fdt32() when writing to fdt Jacob Garber (4): lib: Use bitwise & instead of boolean && lib: Use correct type for return value lib: Prevent unintended sign extensions lib: Correct null pointer check Lukas Auer (1): firmware: do not use relocated _boot_status before it is valid Nylon Chen (3): firmware: Fix the loop condition of _wait_relocate_copy_done section platform: Add Andes AE350 initial support scripts: Add AE350 to platform list in the binary archive script Palmer Dabbelt (1): Include `git describe` in OpenSBI Zong Li (1): Write MSIP by using memory-mapped control register Signed-off-by: Alistair Francis Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- pc-bios/opensbi-riscv32-virt-fw_jump.bin | Bin 36888 -> 40984 bytes pc-bios/opensbi-riscv64-sifive_u-fw_jump.bin | Bin 45064 -> 49160 bytes pc-bios/opensbi-riscv64-virt-fw_jump.bin | Bin 40968 -> 45064 bytes roms/opensbi | 2 +- 4 files changed, 1 insertion(+), 1 deletion(-) diff --git a/pc-bios/opensbi-riscv32-virt-fw_jump.bin b/pc-bios/opensbi-riscv32-virt-fw_jump.bin index f5bcaa56954c860d09acc0cb337c1666d88b6efb..6c5b7b89f676392b687d9835ac9fbdc34f3052cd 100644 GIT binary patch delta 14873 zcmcJ030M@@w(hQ~uBvWCX>DY1N=ri{#9%89(L`lxTF~GS(O?`Y5k1LOBQc35Cuy*# zZW{;GEjA=3;snY$F~p7%eaC=EgP25#5z*YJL_{<>2~LP8$X@1-@EU755M*8 zu08zw-)j%G*4kxmKAl@eo6`nG+!CTtPdkkeUR@sH%OIp-m`+`OGIHmQ7kAP~|3? z=on;#^`JiMtpS-s9DX0iOyPE7bNLkh|e`FUN)tBes4jeD|tCI3p;0Ay?C*$4i3o zLnOSq6gl_HIr|^@dO?ZStJ$-w=)!)%XogD5(5(W{XrQV$ZSOy3*9gJ+DiVG; zPA%Cl_Z{8hYVDQHca=mI2*!EP_%!v!;l{YG0ua7HqnwHH!wqS+FJR-Moj$43_G{ zd(4gK^Qn};e9j=s^{*lRawok(9{1|O@|eZJPsS`hPsZ$6ogDG$W;UAW$O{>mjEIh~ za0QDTnJfxdamZogP`C$=f(C}OiV)HquE1!1~De=POoNzYZ@ zy3i|KEDU%C9Z2Ve;lI)T!6l^FRBMM6TTMUdv^?s`H@wpN=;=;70`WiT!Sv68_!qh$ z^jK(`I>G=lTpm_x->zY$Jr65u*)1{5oYAe--YJw)u6|J>F_HZm7(;hJp;=0 z@@4|#SSCcphO2C}y#nZD#{3fxR>XFTY7NPE&=k^8k@y2eA7AI40JQ<1 zS%L8FUPzE=bSD4}^>N~@idZ^*3cjR>)jnIJC@O6uJ@iaeKx7)y8YqMZvmepQtwZAwPf@?2t?Qvd$aoHRLLZ;cpA4lq`D!)sF#KgdEqxI_LQWTnhhK2v5f+y zQi!p6920Bf7}GvZG#$pBTzKfJ=X?K5>NlgB!K+F8T8wt>^W}IlUDQs$#AN zJ`~U91E1gX!m2jb4IA;^k``#Foic98ErI@b@FC`mD%QB#KDI-GE}S7_+eq4sZVnvL
[PULL 1/4] remove unnecessary ifdef TARGET_RISCV64
From: "hiroyuki.obinata" Signed-off-by: Hiroyuki Obinata Signed-off-by: Palmer Dabbelt --- target/riscv/translate.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/target/riscv/translate.c b/target/riscv/translate.c index b26533d4fd78..ab6a891dc381 100644 --- a/target/riscv/translate.c +++ b/target/riscv/translate.c @@ -64,12 +64,10 @@ static const int tcg_memop_lookup[8] = { [0] = MO_SB, [1] = MO_TESW, [2] = MO_TESL, +[3] = MO_TEQ, [4] = MO_UB, [5] = MO_TEUW, -#ifdef TARGET_RISCV64 -[3] = MO_TEQ, [6] = MO_TEUL, -#endif }; #endif -- 2.21.0
[PULL] RISC-V Fixes for 4.2-rc2
The following changes since commit aa464db69b40b4b695be31085e6d2f1e90956c89: Update version for v4.2.0-rc1 release (2019-11-12 18:40:02 +) are available in the Git repository at: g...@github.com:palmer-dabbelt/qemu.git tags/riscv-for-master-4.2-rc2 for you to fetch changes up to 6911fde41006b2afe3510755c4cff259ca56c1d9: riscv/virt: Increase flash size (2019-11-14 09:53:28 -0800) RISC-V Fixes for 4.2-rc2 This contains a handful of patches that I'd like to target for 4.2: * OpenSBI upgrade to 0.5 * Increase in the flash size of the virt board. * A non-functional cleanup. * A cleanup to our MIP handling that avoids atomics. This passes "make check" and boots OpenEmbedded for me. Alistair Francis (3): target/riscv: Remove atomic accesses to MIP CSR opensbi: Upgrade from v0.4 to v0.5 riscv/virt: Increase flash size hiroyuki.obinata (1): remove unnecessary ifdef TARGET_RISCV64 hw/riscv/virt.c | 2 +- pc-bios/opensbi-riscv32-virt-fw_jump.bin | Bin 36888 -> 40984 bytes pc-bios/opensbi-riscv64-sifive_u-fw_jump.bin | Bin 45064 -> 49160 bytes pc-bios/opensbi-riscv64-virt-fw_jump.bin | Bin 40968 -> 45064 bytes roms/opensbi | 2 +- target/riscv/cpu.c | 5 ++- target/riscv/cpu.h | 9 - target/riscv/cpu_helper.c| 48 ++- target/riscv/csr.c | 2 +- target/riscv/translate.c | 4 +-- 10 files changed, 24 insertions(+), 48 deletions(-)
[PATCH] Modify tests to work with clang
Signed-off-by: Taylor Simpson --- tests/tcg/multiarch/float_helpers.c | 13 - tests/tcg/multiarch/linux-test.c| 2 +- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/tests/tcg/multiarch/float_helpers.c b/tests/tcg/multiarch/float_helpers.c index 8ee7903..437247c 100644 --- a/tests/tcg/multiarch/float_helpers.c +++ b/tests/tcg/multiarch/float_helpers.c @@ -26,6 +26,17 @@ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +/* +| The macro QEMU_GNUC_PREREQ tests for minimum version of the GNU C compiler. +| The code is a copy of SOFTFLOAT_GNUC_PREREQ, see softfloat-macros.h. +**/ +#if defined(__GNUC__) && defined(__GNUC_MINOR__) +# define QEMU_GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +#else +# define QEMU_GNUC_PREREQ(maj, min) 0 +#endif + /* * Half Precision Numbers * @@ -79,7 +90,7 @@ char *fmt_16(uint16_t num) #ifndef SNANF /* Signaling NaN macros, if supported. */ -# if __GNUC_PREREQ(3, 3) +# if defined(__clang__) || QEMU_GNUC_PREREQ(3, 3) # define SNANF (__builtin_nansf ("")) # define SNAN (__builtin_nans ("")) # define SNANL (__builtin_nansl ("")) diff --git a/tests/tcg/multiarch/linux-test.c b/tests/tcg/multiarch/linux-test.c index 673d7c8..edfc02c 100644 --- a/tests/tcg/multiarch/linux-test.c +++ b/tests/tcg/multiarch/linux-test.c @@ -485,7 +485,7 @@ static void test_signal(void) act.sa_flags = SA_SIGINFO; chk_error(sigaction(SIGSEGV, , NULL)); if (setjmp(jmp_env) == 0) { -*(uint8_t *)0 = 0; +*(volatile uint8_t *)0 = 0; } act.sa_handler = SIG_DFL; -- 2.7.4
[RFC PATCH v1 7/8] vfio-ccw: Refactor ccw irq handler
Make it easier to add new ones in the future. Signed-off-by: Eric Farman --- hw/vfio/ccw.c | 55 --- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 2b1a83b94c..b16526d5de 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -334,22 +334,35 @@ read_err: css_inject_io_interrupt(sch); } -static void vfio_ccw_register_io_notifier(VFIOCCWDevice *vcdev, Error **errp) +static void vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, int irq, + Error **errp) { VFIODevice *vdev = >vdev; struct vfio_irq_info *irq_info; size_t argsz; int fd; +EventNotifier *notifier; +IOHandler *fd_read; + +switch (irq) { +case VFIO_CCW_IO_IRQ_INDEX: +notifier = >io_notifier; +fd_read = vfio_ccw_io_notifier_handler; +break; +default: +error_setg(errp, "vfio: Unsupported device irq(%d) fd: %m", irq); +return; +} -if (vdev->num_irqs < VFIO_CCW_IO_IRQ_INDEX + 1) { -error_setg(errp, "vfio: unexpected number of io irqs %u", +if (vdev->num_irqs < irq + 1) { +error_setg(errp, "vfio: unexpected number of irqs %u", vdev->num_irqs); return; } argsz = sizeof(*irq_info); irq_info = g_malloc0(argsz); -irq_info->index = VFIO_CCW_IO_IRQ_INDEX; +irq_info->index = irq; irq_info->argsz = argsz; if (ioctl(vdev->fd, VFIO_DEVICE_GET_IRQ_INFO, irq_info) < 0 || irq_info->count < 1) { @@ -357,37 +370,47 @@ static void vfio_ccw_register_io_notifier(VFIOCCWDevice *vcdev, Error **errp) goto out_free_info; } -if (event_notifier_init(>io_notifier, 0)) { +if (event_notifier_init(notifier, 0)) { error_setg_errno(errp, errno, - "vfio: Unable to init event notifier for IO"); + "vfio: Unable to init event notifier for irq (%d)", irq); goto out_free_info; } -fd = event_notifier_get_fd(>io_notifier); -qemu_set_fd_handler(fd, vfio_ccw_io_notifier_handler, NULL, vcdev); +fd = event_notifier_get_fd(notifier); +qemu_set_fd_handler(fd, fd_read, NULL, vcdev); -if (vfio_set_irq_signaling(vdev, VFIO_CCW_IO_IRQ_INDEX, 0, +if (vfio_set_irq_signaling(vdev, irq, 0, VFIO_IRQ_SET_ACTION_TRIGGER, fd, errp)) { qemu_set_fd_handler(fd, NULL, NULL, vcdev); -event_notifier_cleanup(>io_notifier); +event_notifier_cleanup(notifier); } out_free_info: g_free(irq_info); } -static void vfio_ccw_unregister_io_notifier(VFIOCCWDevice *vcdev) +static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev, int irq) { Error *err = NULL; +EventNotifier *notifier; + +switch (irq) { +case VFIO_CCW_IO_IRQ_INDEX: +notifier = >io_notifier; +break; +default: +error_report("vfio: Unsupported device irq(%d) fd: %m", irq); +return; +} -if (vfio_set_irq_signaling(>vdev, VFIO_CCW_IO_IRQ_INDEX, 0, +if (vfio_set_irq_signaling(>vdev, irq, 0, VFIO_IRQ_SET_ACTION_TRIGGER, -1, )) { error_reportf_err(err, VFIO_MSG_PREFIX, vcdev->vdev.name); } -qemu_set_fd_handler(event_notifier_get_fd(>io_notifier), +qemu_set_fd_handler(event_notifier_get_fd(notifier), NULL, NULL, vcdev); -event_notifier_cleanup(>io_notifier); +event_notifier_cleanup(notifier); } static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) @@ -590,7 +613,7 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) goto out_region_err; } -vfio_ccw_register_io_notifier(vcdev, ); +vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX, ); if (err) { goto out_notifier_err; } @@ -619,7 +642,7 @@ static void vfio_ccw_unrealize(DeviceState *dev, Error **errp) S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev); VFIOGroup *group = vcdev->vdev.group; -vfio_ccw_unregister_io_notifier(vcdev); +vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); vfio_ccw_put_region(vcdev); vfio_ccw_put_device(vcdev); vfio_put_group(group); -- 2.17.1
[RFC PATCH v1 5/8] vfio-ccw: Add support for the schib region
From: Farhan Ali The schib region can be used to obtain the latest SCHIB from the host passthrough subchannel. Since the guest SCHIB is virtualized, we currently only update the path related information so that the guest is aware of any path related changes when it issues the 'stsch' instruction. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman --- Notes: v0->v1: [EF] - Change various incarnations of "update chp status" to "handle_store", to reflect the STSCH instruction that will drive this code - Remove temporary variable for casting/testing purposes in s390_ccw_store(), and add a block comment of WHY its there. - Add a few comments to vfio_ccw_handle_store() hw/s390x/css.c | 8 -- hw/s390x/s390-ccw.c | 20 + hw/vfio/ccw.c | 57 + include/hw/s390x/css.h | 3 +- include/hw/s390x/s390-ccw.h | 1 + target/s390x/ioinst.c | 3 +- 6 files changed, 87 insertions(+), 5 deletions(-) diff --git a/hw/s390x/css.c b/hw/s390x/css.c index 844caab408..6ee6a96d75 100644 --- a/hw/s390x/css.c +++ b/hw/s390x/css.c @@ -1335,11 +1335,15 @@ static void copy_schib_to_guest(SCHIB *dest, const SCHIB *src) } } -int css_do_stsch(SubchDev *sch, SCHIB *schib) +IOInstEnding css_do_stsch(SubchDev *sch, SCHIB *schib) { +int ret = IOINST_CC_EXPECTED; + +ret = s390_ccw_store(sch); + /* Use current status. */ copy_schib_to_guest(schib, >curr_status); -return 0; +return ret; } static void copy_pmcw_from_guest(PMCW *dest, const PMCW *src) diff --git a/hw/s390x/s390-ccw.c b/hw/s390x/s390-ccw.c index 0c5a5b60bd..be0b379338 100644 --- a/hw/s390x/s390-ccw.c +++ b/hw/s390x/s390-ccw.c @@ -51,6 +51,26 @@ int s390_ccw_clear(SubchDev *sch) return cdc->handle_clear(sch); } +IOInstEnding s390_ccw_store(SubchDev *sch) +{ +S390CCWDeviceClass *cdc = NULL; +int ret = IOINST_CC_EXPECTED; + +/* + * This only applies to passthrough devices, so we can't unconditionally + * set this variable like we would for halt/clear. + */ +if (object_dynamic_cast(OBJECT(sch->driver_data), TYPE_S390_CCW)) { +cdc = S390_CCW_DEVICE_GET_CLASS(sch->driver_data); +} + +if (cdc && cdc->handle_store) { +ret = cdc->handle_store(sch); +} + +return ret; +} + static void s390_ccw_get_dev_info(S390CCWDevice *cdev, char *sysfsdev, Error **errp) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 3e32bc1819..2ff223470f 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -41,6 +41,9 @@ struct VFIOCCWDevice { uint64_t async_cmd_region_size; uint64_t async_cmd_region_offset; struct ccw_cmd_region *async_cmd_region; +uint64_t schib_region_size; +uint64_t schib_region_offset; +struct ccw_schib_region *schib_region; EventNotifier io_notifier; bool force_orb_pfch; bool warned_orb_pfch; @@ -124,6 +127,45 @@ again: } } +static IOInstEnding vfio_ccw_handle_store(SubchDev *sch) +{ +S390CCWDevice *cdev = sch->driver_data; +VFIOCCWDevice *vcdev = DO_UPCAST(VFIOCCWDevice, cdev, cdev); +SCHIB *schib = >curr_status; +struct ccw_schib_region *region = vcdev->schib_region; +SCHIB *s = (SCHIB *)region->schib_area; +int ret; + +/* schib region not available so nothing else to do */ +if (!region) { +return IOINST_CC_EXPECTED; +} + +memset(region, 0, sizeof(*region)); +ret = pread(vcdev->vdev.fd, region, vcdev->schib_region_size, +vcdev->schib_region_offset); + +if (ret == -1) { +/* Assume device is damaged, regardless of errno */ +return IOINST_CC_NOT_OPERATIONAL; +} + +/* + * Selectively copy path-related bits of the SCHIB, + * rather than copying the entire struct. + */ +schib->pmcw.pnom = s->pmcw.pnom; +schib->pmcw.lpum = s->pmcw.lpum; +schib->pmcw.pam = s->pmcw.pam; +schib->pmcw.pom = s->pmcw.pom; + +if (s->scsw.flags & SCSW_FLAGS_MASK_PNO) { +schib->scsw.flags |= SCSW_FLAGS_MASK_PNO; +} + +return IOINST_CC_EXPECTED; +} + static int vfio_ccw_handle_clear(SubchDev *sch) { S390CCWDevice *cdev = sch->driver_data; @@ -395,10 +437,23 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) vcdev->async_cmd_region = g_malloc0(info->size); } +ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, + VFIO_REGION_SUBTYPE_CCW_SCHIB, ); +if (!ret) { +vcdev->schib_region_size = info->size; +if (sizeof(*vcdev->schib_region) != vcdev->schib_region_size) { +error_setg(errp, "vfio: Unexpected size of the schib region"); +goto out_err; +} +vcdev->schib_region_offset = info->offset; +vcdev->schib_region = g_malloc(info->size); +} + g_free(info);
[RFC PATCH v1 8/8] vfio-ccw: Add support for the CRW irq
From: Farhan Ali The CRW irq will be used by vfio-ccw to notify the userspace about any CRWs the userspace needs to handle. Let's add support for it. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman --- Notes: v0->v1: [EF] - Check vcdev->crw_region before registering the irq, in case host kernel does not have matching support - Split the refactoring changes to an earlier (new) patch (and don't remove the "num_irqs" check in the register routine, but adjust it to the check the input variable) - Don't revert the cool vfio_set_irq_signaling() stuff - Unregister CRW IRQ before IO IRQ in unrealize - s/crw1/crw0/ hw/vfio/ccw.c | 45 + 1 file changed, 45 insertions(+) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index b16526d5de..b3f4120118 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -48,6 +48,7 @@ struct VFIOCCWDevice { uint64_t crw_region_offset; struct ccw_crw_region *crw_region; EventNotifier io_notifier; +EventNotifier crw_notifier; bool force_orb_pfch; bool warned_orb_pfch; }; @@ -259,6 +260,34 @@ static void vfio_ccw_reset(DeviceState *dev) ioctl(vcdev->vdev.fd, VFIO_DEVICE_RESET); } +static void vfio_ccw_crw_notifier_handler(void *opaque) +{ +VFIOCCWDevice *vcdev = opaque; +struct ccw_crw_region *region = vcdev->crw_region; +CRW crw; +int size; +uint8_t erc; +uint16_t rsid; + +if (!event_notifier_test_and_clear(>crw_notifier)) { +return; +} + +memset(region, 0, sizeof(*region)); +size = pread(vcdev->vdev.fd, region, vcdev->crw_region_size, + vcdev->crw_region_offset); + +if (size == -1) { +error_report("vfio-ccw: Read crw region failed with errno=%d", errno); +return; +} + +memcpy(, >crw0, sizeof(CRW)); +erc = crw.flags & 0x003f; +rsid = crw.rsid; +css_queue_crw(CRW_RSC_CHP, erc, 0, 0, rsid); +} + static void vfio_ccw_io_notifier_handler(void *opaque) { VFIOCCWDevice *vcdev = opaque; @@ -349,6 +378,10 @@ static void vfio_ccw_register_irq_notifier(VFIOCCWDevice *vcdev, int irq, notifier = >io_notifier; fd_read = vfio_ccw_io_notifier_handler; break; +case VFIO_CCW_CRW_IRQ_INDEX: +notifier = >crw_notifier; +fd_read = vfio_ccw_crw_notifier_handler; +break; default: error_setg(errp, "vfio: Unsupported device irq(%d) fd: %m", irq); return; @@ -398,6 +431,9 @@ static void vfio_ccw_unregister_irq_notifier(VFIOCCWDevice *vcdev, int irq) case VFIO_CCW_IO_IRQ_INDEX: notifier = >io_notifier; break; +case VFIO_CCW_CRW_IRQ_INDEX: +notifier = >crw_notifier; +break; default: error_report("vfio: Unsupported device irq(%d) fd: %m", irq); return; @@ -618,6 +654,14 @@ static void vfio_ccw_realize(DeviceState *dev, Error **errp) goto out_notifier_err; } +if (vcdev->crw_region) { +vfio_ccw_register_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX, ); +if (err) { +vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); +goto out_notifier_err; +} +} + return; out_notifier_err: @@ -642,6 +686,7 @@ static void vfio_ccw_unrealize(DeviceState *dev, Error **errp) S390CCWDeviceClass *cdc = S390_CCW_DEVICE_GET_CLASS(cdev); VFIOGroup *group = vcdev->vdev.group; +vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_CRW_IRQ_INDEX); vfio_ccw_unregister_irq_notifier(vcdev, VFIO_CCW_IO_IRQ_INDEX); vfio_ccw_put_region(vcdev); vfio_ccw_put_device(vcdev); -- 2.17.1
[RFC PATCH v1 4/8] vfio-ccw: Refactor cleanup of regions
While we're at it, add a g_free() for the async_cmd_region that is the last thing currently created. g_free() knows how to handle NULL pointers, so this makes it easier to remember what cleanups need to be performed when new regions are added. Signed-off-by: Eric Farman --- hw/vfio/ccw.c | 14 +- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 0590a6f512..3e32bc1819 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -376,8 +376,7 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) vcdev->io_region_size = info->size; if (sizeof(*vcdev->io_region) != vcdev->io_region_size) { error_setg(errp, "vfio: Unexpected size of the I/O region"); -g_free(info); -return; +goto out_err; } vcdev->io_region_offset = info->offset; @@ -390,15 +389,20 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) vcdev->async_cmd_region_size = info->size; if (sizeof(*vcdev->async_cmd_region) != vcdev->async_cmd_region_size) { error_setg(errp, "vfio: Unexpected size of the async cmd region"); -g_free(vcdev->io_region); -g_free(info); -return; +goto out_err; } vcdev->async_cmd_region_offset = info->offset; vcdev->async_cmd_region = g_malloc0(info->size); } g_free(info); +return; + +out_err: +g_free(vcdev->async_cmd_region); +g_free(vcdev->io_region); +g_free(info); +return; } static void vfio_ccw_put_region(VFIOCCWDevice *vcdev) -- 2.17.1
[RFC PATCH v1 1/8] vfio-ccw: Return IOINST_CC_NOT_OPERATIONAL for EIO
From: Farhan Ali EIO is returned by vfio-ccw mediated device when the backing host subchannel is not operational anymore. So return cc=3 back to the guest, rather than returning a unit check. This way the guest can take appropriate action such as issue an 'stsch'. Signed-off-by: Farhan Ali --- hw/vfio/ccw.c | 1 + 1 file changed, 1 insertion(+) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 6863f6c69f..0919ddbeb8 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -114,6 +114,7 @@ again: return IOINST_CC_BUSY; case -ENODEV: case -EACCES: +case -EIO: return IOINST_CC_NOT_OPERATIONAL; case -EFAULT: default: -- 2.17.1
[RFC PATCH v1 0/8] s390x/vfio-ccw: Channel Path Handling
Here is a first pass at the channel-path handling code for vfio-ccw, to take advantage of the corresponding kernel patches posted here: https://lore.kernel.org/kvm/20191115025620.19593-1-far...@linux.ibm.com/ As with the KVM patches, these were originally written by Farhan Ali this past summer, and my git notes log the changes I've made since picking up this work. There are two commits at the front of this series that seem to be pre-reqs to the actual changes for this, which is why the linux-headers update seems lost in the middle of this. I've tried to be mindful of testing permutations of new/old kernel with either new/old QEMU, and it seems to be in good shape. Eric Farman (2): vfio-ccw: Refactor cleanup of regions vfio-ccw: Refactor ccw irq handler Farhan Ali (6): vfio-ccw: Return IOINST_CC_NOT_OPERATIONAL for EIO vfio-ccw: Don't inject an I/O interrupt if the subchannel is not enabled linux-headers: update vfio-ccw: Add support for the schib region vfio-ccw: Add support for the crw region vfio-ccw: Add support for the CRW irq hw/s390x/css.c | 8 +- hw/s390x/s390-ccw.c| 20 hw/vfio/ccw.c | 195 + include/hw/s390x/css.h | 3 +- include/hw/s390x/s390-ccw.h| 1 + linux-headers/linux/vfio.h | 3 + linux-headers/linux/vfio_ccw.h | 10 ++ target/s390x/ioinst.c | 3 +- 8 files changed, 217 insertions(+), 26 deletions(-) -- 2.17.1
[RFC PATCH v1 6/8] vfio-ccw: Add support for the crw region
From: Farhan Ali The crw region can be used to obtain information about Channel Report Words (CRW) from vfio-ccw driver. Currently only channel path related CRWs are passed to QEMU from vfio-ccw driver. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman --- Notes: v0->v1: [EF] - Fixed copy/paste error in error message (s/schib/CRW) hw/vfio/ccw.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 2ff223470f..2b1a83b94c 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -44,6 +44,9 @@ struct VFIOCCWDevice { uint64_t schib_region_size; uint64_t schib_region_offset; struct ccw_schib_region *schib_region; +uint64_t crw_region_size; +uint64_t crw_region_offset; +struct ccw_crw_region *crw_region; EventNotifier io_notifier; bool force_orb_pfch; bool warned_orb_pfch; @@ -449,10 +452,24 @@ static void vfio_ccw_get_region(VFIOCCWDevice *vcdev, Error **errp) vcdev->schib_region = g_malloc(info->size); } +ret = vfio_get_dev_region_info(vdev, VFIO_REGION_TYPE_CCW, + VFIO_REGION_SUBTYPE_CCW_CRW, ); + +if (!ret) { +vcdev->crw_region_size = info->size; +if (sizeof(*vcdev->crw_region) != vcdev->crw_region_size) { +error_setg(errp, "vfio: Unexpected size of the CRW region"); +goto out_err; +} +vcdev->crw_region_offset = info->offset; +vcdev->crw_region = g_malloc(info->size); +} + g_free(info); return; out_err: +g_free(vcdev->crw_region); g_free(vcdev->schib_region); g_free(vcdev->async_cmd_region); g_free(vcdev->io_region); @@ -462,6 +479,7 @@ out_err: static void vfio_ccw_put_region(VFIOCCWDevice *vcdev) { +g_free(vcdev->crw_region); g_free(vcdev->schib_region); g_free(vcdev->async_cmd_region); g_free(vcdev->io_region); -- 2.17.1
[RFC PATCH v1 3/8] linux-headers: update
From: Farhan Ali Signed-off-by: Farhan Ali Signed-off-by: Eric Farman --- Notes: v0->v1: [EF] - Run scripts/update-linux-headers.sh properly, but do not add resulting changes to linux-headers/asm-mips/ linux-headers/linux/vfio.h | 3 +++ linux-headers/linux/vfio_ccw.h | 10 ++ 2 files changed, 13 insertions(+) diff --git a/linux-headers/linux/vfio.h b/linux-headers/linux/vfio.h index fb10370d29..9e227348b3 100644 --- a/linux-headers/linux/vfio.h +++ b/linux-headers/linux/vfio.h @@ -378,6 +378,8 @@ struct vfio_region_gfx_edid { /* sub-types for VFIO_REGION_TYPE_CCW */ #define VFIO_REGION_SUBTYPE_CCW_ASYNC_CMD (1) +#define VFIO_REGION_SUBTYPE_CCW_SCHIB (2) +#define VFIO_REGION_SUBTYPE_CCW_CRW(3) /* * The MSIX mappable capability informs that MSIX data of a BAR can be mmapped @@ -577,6 +579,7 @@ enum { enum { VFIO_CCW_IO_IRQ_INDEX, + VFIO_CCW_CRW_IRQ_INDEX, VFIO_CCW_NUM_IRQS }; diff --git a/linux-headers/linux/vfio_ccw.h b/linux-headers/linux/vfio_ccw.h index fcc3e69ef5..8654c9afca 100644 --- a/linux-headers/linux/vfio_ccw.h +++ b/linux-headers/linux/vfio_ccw.h @@ -34,4 +34,14 @@ struct ccw_cmd_region { __u32 ret_code; } __attribute__((packed)); +struct ccw_schib_region { +#define SCHIB_AREA_SIZE 52 + __u8 schib_area[SCHIB_AREA_SIZE]; +} __attribute__((packed)); + +struct ccw_crw_region { + __u32 crw0; + __u32 crw1; +} __attribute__((packed)); + #endif -- 2.17.1
[RFC PATCH v1 2/8] vfio-ccw: Don't inject an I/O interrupt if the subchannel is not enabled
From: Farhan Ali According to PoPs, when the SCHIB.PMCW bit 8 is 0 status presented by the device is not made available to the program. So don't inject an interrupt in the guest if the guest OS has not enabled the subchannel. Signed-off-by: Farhan Ali Signed-off-by: Eric Farman --- Notes: v0->v1: [EF] - Update commit message, as the bit in question is bit 8 not 15 hw/vfio/ccw.c | 5 + 1 file changed, 5 insertions(+) diff --git a/hw/vfio/ccw.c b/hw/vfio/ccw.c index 0919ddbeb8..0590a6f512 100644 --- a/hw/vfio/ccw.c +++ b/hw/vfio/ccw.c @@ -230,6 +230,11 @@ static void vfio_ccw_io_notifier_handler(void *opaque) return; } +/* Virtual subchannel is not enabled */ +if (!(schib->pmcw.flags & PMCW_FLAGS_MASK_ENA)) { +return; +} + size = pread(vcdev->vdev.fd, region, vcdev->io_region_size, vcdev->io_region_offset); if (size == -1) { -- 2.17.1
Re: [PATCH v5] iotests: Test NBD client reconnection
On 14/11/2019 18:09, Eric Blake wrote: > On 11/11/19 9:39 PM, Andrey Shinkevich wrote: >> The test for an NBD client. The NBD server is disconnected after the >> client write request. The NBD client should reconnect and complete >> the write operation. >> >> Suggested-by: Denis V. Lunev >> Suggested-by: Vladimir Sementsov-Ogievskiy >> Signed-off-by: Andrey Shinkevich >> Reviewed-by: Eric Blake >> Tested-by: Eric Blake >> --- >> v5: "" were replaced with '' in the test except the function comments. >> The 'quick' word was added to the 'qroup' file next to the test >> #277. > > Queuing for 4.2-rc2 through my NBD tree. > > Eric, Thank you very much. -- With the best regards, Andrey Shinkevich
Re: [PATCH v9 Kernel 2/5] vfio iommu: Add ioctl defination to get dirty pages bitmap.
On Thu, 14 Nov 2019 21:40:35 -0500 Yan Zhao wrote: > On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote: > > On Fri, 15 Nov 2019 00:26:07 +0530 > > Kirti Wankhede wrote: > > > > > On 11/14/2019 1:37 AM, Alex Williamson wrote: > > > > On Thu, 14 Nov 2019 01:07:21 +0530 > > > > Kirti Wankhede wrote: > > > > > > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote: > > > >>> On Tue, 12 Nov 2019 22:33:37 +0530 > > > >>> Kirti Wankhede wrote: > > > >>> > > > All pages pinned by vendor driver through vfio_pin_pages API should > > > be > > > considered as dirty during migration. IOMMU container maintains a > > > list of > > > all such pinned pages. Added an ioctl defination to get bitmap of > > > such > > > >>> > > > >>> definition > > > >>> > > > pinned pages for requested IO virtual address range. > > > >>> > > > >>> Additionally, all mapped pages are considered dirty when physically > > > >>> mapped through to an IOMMU, modulo we discussed devices opting in to > > > >>> per page pinning to indicate finer granularity with a TBD mechanism to > > > >>> figure out if any non-opt-in devices remain. > > > >>> > > > >> > > > >> You mean, in case of device direct assignment (device pass through)? > > > >> > > > > > > > > Yes, or IOMMU backed mdevs. If vfio_dmas in the container are fully > > > > pinned and mapped, then the correct dirty page set is all mapped pages. > > > > We discussed using the vpfn list as a mechanism for vendor drivers to > > > > reduce their migration footprint, but we also discussed that we would > > > > need a way to determine that all participants in the container have > > > > explicitly pinned their working pages or else we must consider the > > > > entire potential working set as dirty. > > > > > > > > > > How can vendor driver tell this capability to iommu module? Any > > > suggestions? > > > > I think it does so by pinning pages. Is it acceptable that if the > > vendor driver pins any pages, then from that point forward we consider > > the IOMMU group dirty page scope to be limited to pinned pages? There > > are complications around non-singleton IOMMU groups, but I think we're > > already leaning towards that being a non-worthwhile problem to solve. > > So if we require that only singleton IOMMU groups can pin pages and we > > pass the IOMMU group as a parameter to > > vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a > > flag on its local vfio_group struct to indicate dirty page scope is > > limited to pinned pages. We might want to keep a flag on the > > vfio_iommu struct to indicate if all of the vfio_groups for each > > vfio_domain in the vfio_iommu.domain_list dirty page scope limited to > > pinned pages as an optimization to avoid walking lists too often. Then > > we could test if vfio_iommu.domain_list is not empty and this new flag > > does not limit the dirty page scope, then everything within each > > vfio_dma is considered dirty. > > > > hi Alex > could you help clarify whether my understandings below are right? > In future, > 1. for mdev and for passthrough device withoug hardware ability to track > dirty pages, the vendor driver has to explicitly call > vfio_pin_pages()/vfio_unpin_pages() + a flag to tell vfio its dirty page set. For non-IOMMU backed mdevs without hardware dirty page tracking, there's no change to the vendor driver currently. Pages pinned by the vendor driver are marked as dirty. For any IOMMU backed device, mdev or direct assignment, all mapped memory would be considered dirty unless there are explicit calls to pin pages on top of the IOMMU page pinning and mapping. These would likely be enabled only when the device is in the _SAVING device_state. > 2. for those devices with hardware ability to track dirty pages, will still > provide a callback to vendor driver to get dirty pages. (as for those devices, > it is hard to explicitly call vfio_pin_pages()/vfio_unpin_pages()) > > 3. for devices relying on dirty bit info in physical IOMMU, there > will be a callback to physical IOMMU driver to get dirty page set from > vfio. The proposal here does not cover exactly how these would be implemented, it only establishes the container as the point of user interaction with the dirty bitmap and hopefully allows us to maintain that interface regardless of whether we have dirty tracking at the device or the system IOMMU. Ideally devices with dirty tracking would make use of page pinning and we'd extend the interface to allow vendor drivers the ability to indicate the clean/dirty state of those pinned pages. For system IOMMU dirty page tracking, that potentially might mean that we support IOMMU page faults and the container manages those faults such that the container is the central record of dirty pages. Until these interfaces are designed, we can only speculate, but the goal is to design a user interface compatible with how
Re: [PATCH v9 Kernel 2/5] vfio iommu: Add ioctl defination to get dirty pages bitmap.
On Fri, Nov 15, 2019 at 05:06:25AM +0800, Alex Williamson wrote: > On Fri, 15 Nov 2019 00:26:07 +0530 > Kirti Wankhede wrote: > > > On 11/14/2019 1:37 AM, Alex Williamson wrote: > > > On Thu, 14 Nov 2019 01:07:21 +0530 > > > Kirti Wankhede wrote: > > > > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote: > > >>> On Tue, 12 Nov 2019 22:33:37 +0530 > > >>> Kirti Wankhede wrote: > > >>> > > All pages pinned by vendor driver through vfio_pin_pages API should be > > considered as dirty during migration. IOMMU container maintains a list > > of > > all such pinned pages. Added an ioctl defination to get bitmap of such > > > > >>> > > >>> definition > > >>> > > pinned pages for requested IO virtual address range. > > >>> > > >>> Additionally, all mapped pages are considered dirty when physically > > >>> mapped through to an IOMMU, modulo we discussed devices opting in to > > >>> per page pinning to indicate finer granularity with a TBD mechanism to > > >>> figure out if any non-opt-in devices remain. > > >>> > > >> > > >> You mean, in case of device direct assignment (device pass through)? > > > > > > Yes, or IOMMU backed mdevs. If vfio_dmas in the container are fully > > > pinned and mapped, then the correct dirty page set is all mapped pages. > > > We discussed using the vpfn list as a mechanism for vendor drivers to > > > reduce their migration footprint, but we also discussed that we would > > > need a way to determine that all participants in the container have > > > explicitly pinned their working pages or else we must consider the > > > entire potential working set as dirty. > > > > > > > How can vendor driver tell this capability to iommu module? Any suggestions? > > I think it does so by pinning pages. Is it acceptable that if the > vendor driver pins any pages, then from that point forward we consider > the IOMMU group dirty page scope to be limited to pinned pages? There > are complications around non-singleton IOMMU groups, but I think we're > already leaning towards that being a non-worthwhile problem to solve. > So if we require that only singleton IOMMU groups can pin pages and we > pass the IOMMU group as a parameter to > vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a > flag on its local vfio_group struct to indicate dirty page scope is > limited to pinned pages. We might want to keep a flag on the > vfio_iommu struct to indicate if all of the vfio_groups for each > vfio_domain in the vfio_iommu.domain_list dirty page scope limited to > pinned pages as an optimization to avoid walking lists too often. Then > we could test if vfio_iommu.domain_list is not empty and this new flag > does not limit the dirty page scope, then everything within each > vfio_dma is considered dirty. > hi Alex could you help clarify whether my understandings below are right? In future, 1. for mdev and for passthrough device withoug hardware ability to track dirty pages, the vendor driver has to explicitly call vfio_pin_pages()/vfio_unpin_pages() + a flag to tell vfio its dirty page set. 2. for those devices with hardware ability to track dirty pages, will still provide a callback to vendor driver to get dirty pages. (as for those devices, it is hard to explicitly call vfio_pin_pages()/vfio_unpin_pages()) 3. for devices relying on dirty bit info in physical IOMMU, there will be a callback to physical IOMMU driver to get dirty page set from vfio. Thanks Yan > > Signed-off-by: Kirti Wankhede > > Reviewed-by: Neo Jia > > --- > > include/uapi/linux/vfio.h | 23 +++ > > 1 file changed, 23 insertions(+) > > > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > > index 35b09427ad9f..6fd3822aa610 100644 > > --- a/include/uapi/linux/vfio.h > > +++ b/include/uapi/linux/vfio.h > > @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap { > > #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) > > #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) > > > > +/** > > + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17, > > + * struct > > vfio_iommu_type1_dirty_bitmap) > > + * > > + * IOCTL to get dirty pages bitmap for IOMMU container during > > migration. > > + * Get dirty pages bitmap of given IO virtual addresses range using > > + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is > > size of > > + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory > > to get > > + * bitmap and should set size of allocated memory in bitmap_size > > field. > > + * One bit is used to represent per page consecutively starting from > > iova > > + * offset. Bit set indicates page at that offset from iova is dirty. > > + */ > > +struct
Re: virtio,iommu_platform=on
On 14/11/2019 19:42, Michael S. Tsirkin wrote: > On Thu, Nov 14, 2019 at 09:58:47AM +1100, Alexey Kardashevskiy wrote: >> >> >> On 13/11/2019 21:00, Michael S. Tsirkin wrote: >>> On Wed, Nov 13, 2019 at 03:44:28PM +1100, Alexey Kardashevskiy wrote: On 12/11/2019 18:08, Michael S. Tsirkin wrote: > On Tue, Nov 12, 2019 at 02:53:49PM +1100, Alexey Kardashevskiy wrote: >> Hi! >> >> I am enabling IOMMU for virtio in the pseries firmware (SLOF) and seeing >> problems, one of them is SLOF does SCSI bus scan, then it stops the >> virtio-scsi by clearing MMIO|IO|BUSMASTER from PCI_COMMAND (as SLOF >> stopped using the devices) and when this happens, I see unassigned >> memory access (see below) which happens because disabling busmaster >> disables IOMMU and QEMU cannot access the rings to do some shutdown. And >> when this happens, the device does not come back even if SLOF re-enables >> it. > > In fact clearing bus master should disable ring access even > without the IOMMU. > Once you do this you should not wait for rings to be processed, > it is safe to assume they won't be touched again and just > free up any buffers that have not been used. > > Why don't you see this without IOMMU? Because without IOMMU, virtio can always access rings, it does not need bus master address space for that. >>> >>> Right and that's a bug in virtio scsi. E.g. virtio net checks >>> bus mastering before each access. >> >> You have to be specific - virtio scsi in the guest or in QEMU? > > If a device accesses memory with bus master on, it's buggy. > >> >>> Which is all well and good, but we can't just break the world >>> so I guess we first need to fix SLOF, and then add >>> a compat property. And maybe keep it broken for >>> legacy ... >>> > It's a bug I think, probably there to work around buggy guests. > > So pls fix this in SLOF and then hopefully we can drop the > work arounds and have clearing bus master actually block DMA. Laszlo suggested writing 0 to the status but this does not seem helping, with both ioeventfd=true/false. It looks like setting/clearing busmaster bit confused memory region caches in QEMU's virtio. I am confused which direction to keep digging to, any suggestions? Thanks, >>> >>> to clarify you reset after setting bus master? right? >> >> >> I was talking about clearing the bus master, and where I call that >> virtio reset does not matter. Thanks, >> >> > > so > > bus master =0 > reset > bus master =1 > > device does not recover? No, it does not. With my recent changes, it seems that only virtio-block cannot recover; virtio-net and virtio-scsi are fine. Thanks, > > >>> >>> > >> Hacking SLOF to not clear BUSMASTER makes virtio-scsi work but it is >> hardly a right fix. >> >> Is this something expected? Thanks, >> >> >> Here is the exact command line: >> >> /home/aik/pbuild/qemu-garrison2-ppc64/ppc64-softmmu/qemu-system-ppc64 \ >> >> -nodefaults \ >> >> -chardev stdio,id=STDIO0,signal=off,mux=on \ >> >> -device spapr-vty,id=svty0,reg=0x71000110,chardev=STDIO0 \ >> >> -mon id=MON0,chardev=STDIO0,mode=readline \ >> >> -nographic \ >> >> -vga none \ >> >> -enable-kvm \ >> -m 2G \ >> >> -device >> virtio-scsi-pci,id=vscsi0,iommu_platform=on,disable-modern=off,disable-legacy=on >> \ >> -drive id=DRIVE0,if=none,file=img/u1804-64le.qcow2,format=qcow2 \ >> >> -device scsi-disk,id=scsi-disk0,drive=DRIVE0 \ >> >> -snapshot \ >> >> -smp 1 \ >> >> -machine pseries \ >> >> -L /home/aik/t/qemu-ppc64-bios/ \ >> >> -trace events=qemu_trace_events \ >> >> -d guest_errors \ >> >> -chardev socket,id=SOCKET0,server,nowait,path=qemu.mon.ssh59518 \ >> >> -mon chardev=SOCKET0,mode=control >> >> >> >> Here is the backtrace: >> >> Thread 5 "qemu-system-ppc" hit Breakpoint 8, unassigned_mem_accepts >> (opaque=0x0, addr=0x5802, size=0x2, is_write=0x0, attrs=...) at /home/ >> aik/p/qemu/memory.c:1275 >> 1275return false; >> #0 unassigned_mem_accepts (opaque=0x0, addr=0x5802, size=0x2, >> is_write=0x0, attrs=...) at /home/aik/p/qemu/memory.c:1275 >> #1 0x100a8ac8 in memory_region_access_valid (mr=0x1105c230 >> , addr=0x5802, size=0x2, is_write=0x0, attrs=...) at >> /home/aik/p/qemu/memory.c:1377 >> #2 0x100a8c88 in memory_region_dispatch_read (mr=0x1105c230 >> , addr=0x5802, pval=0x7550d410, op=MO_16, >> attrs=...) at /home/aik/p/qemu/memory.c:1418 >> #3 0x1001cad4 in address_space_lduw_internal_cached_slow >> (cache=0x7fff68036fa0, addr=0x2, attrs=..., result=0x0, >> endian=DEVICE_LITTLE_ENDIAN) at /home/aik/p/qemu/memory_ldst.inc.c:211 >> #4
9p: requests efficiency
I'm currently reading up on how client requests (T messages) are currently dispatched in general by 9pfs, to understand where potential inefficiencies are that I am encountering. I mean 9pfs is pretty fast on raw I/O (read/write requests), provided that the message payload on guest side was chosen large enough (e.g. trans=virtio,version=9p2000.L,msize=4194304,...), where I already come close to my test disk's therotical maximum performance on read/write tests. But obviously these are huge 9p requests. However when there are a large number of (i.e. small) 9p requests, no matter what the actual request type is, then I am encountering severe performance issues with 9pfs and I try to understand whether this could be improved with reasonable effort. If I understand it correctly, each incoming request (T message) is dispatched to its own qemu coroutine queue. So individual requests should already be processed in parallel, right? Best regards, Christian Schoenebeck
RE: QEMU for Qualcomm Hexagon - KVM Forum talk and code available
OK, I changed the code in github and will upstream it that way. -Original Message- From: Richard Henderson Sent: Wednesday, November 13, 2019 3:10 PM To: Taylor Simpson ; Alex Bennée ; qemu-devel@nongnu.org Cc: Alessandro Di Federico ; ni...@rev.ng; Niccolò Izzo ; Aleksandar Markovic Subject: Re: QEMU for Qualcomm Hexagon - KVM Forum talk and code available - CAUTION: This email originated from outside of the organization. - On 11/13/19 8:31 PM, Taylor Simpson wrote: > [Taylor] Currently, I have the generator and the generated code sitting in > the source tree. I'm flexible on this if the decision is to regenerate it > every time. I would prefer to regenerate every time, and not store the generated code in the source tree at all. It makes it a no-brainer to modify the source and not have to remember how to regenerate, because the rules are right there in the makefile. r~
[Bug 1846427] Re: 4.1.0: qcow2 corruption on savevm/quit/loadvm cycle
I tried the ArchLinux package that includes three patches applied to qemu 4.1 ( see https://git.archlinux.org/svntogit/packages.git/commit/trunk/PKGBUILD?h=packages/qemu=e9707066408de26aa04f8d0ddebe5556aa87e662 ). My Windows 10 qcow2 image got corrupted again after a short time of use. Host filesystem is ext4. "OFLAG_COPIED data cluster: l2_entry=382c7 refcount=1" The Windows installation seems to be fine after repair. At least Windows did not found anything wrong. -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. https://bugs.launchpad.net/bugs/1846427 Title: 4.1.0: qcow2 corruption on savevm/quit/loadvm cycle Status in QEMU: Fix Committed Bug description: I'm seeing massive corruption of qcow2 images with qemu 4.1.0 and git master as of 7f21573c822805a8e6be379d9bcf3ad9effef3dc after a few savevm/quit/loadvm cycles. I've narrowed it down to the following reproducer (further notes below): # qemu-img check debian.qcow2 No errors were found on the image. 251601/327680 = 76.78% allocated, 1.63% fragmented, 0.00% compressed clusters Image end offset: 18340446208 # bin/qemu/bin/qemu-system-x86_64 -machine pc-q35-4.0.1,accel=kvm -m 4096 -chardev stdio,id=charmonitor -mon chardev=charmonitor -drive file=debian.qcow2,id=d -S qemu-system-x86_64: warning: dbind: Couldn't register with accessibility bus: Did not receive a reply. Possible causes include: the remote application did not send a reply, the message bus security policy blocked the reply, the reply timeout expired, or the network connection was broken. QEMU 4.1.50 monitor - type 'help' for more information (qemu) loadvm foo (qemu) c (qemu) qcow2_free_clusters failed: Invalid argument qcow2_free_clusters failed: Invalid argument qcow2_free_clusters failed: Invalid argument qcow2_free_clusters failed: Invalid argument quit [m@nargothrond:~] qemu-img check debian.qcow2 Leaked cluster 85179 refcount=2 reference=1 Leaked cluster 85180 refcount=2 reference=1 ERROR cluster 266150 refcount=0 reference=2 [...] ERROR OFLAG_COPIED data cluster: l2_entry=42284 refcount=1 9493 errors were found on the image. Data may be corrupted, or further writes to the image may corrupt it. 2 leaked clusters were found on the image. This means waste of disk space, but no harm to data. 259266/327680 = 79.12% allocated, 1.67% fragmented, 0.00% compressed clusters Image end offset: 18340446208 This is on a x86_64 Linux 5.3.1 Gentoo host with qemu-system-x86_64 and accel=kvm. The compiler is gcc-9.2.0 with the rest of the system similarly current. Reproduced with qemu-4.1.0 from distribution package as well as vanilla git checkout of tag v4.1.0 and commit 7f21573c822805a8e6be379d9bcf3ad9effef3dc (today's master). Does not happen with qemu compiled from vanilla checkout of tag v4.0.0. Build sequence: ./configure --prefix=$HOME/bin/qemu-bisect --target-list=x86_64-softmmu --disable-werror --disable-docs [...] CFLAGS-O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -g [...] (can provide full configure output if helpful) make -j8 install The kind of guest OS does not matter: seen with Debian testing 64bit, Windows 7 x86/x64 BIOS and Windows 7 x64 EFI. The virtual storage controller does not seem to matter: seen with VirtIO SCSI, emulated SCSI and emulated SATA AHCI. Caching modes (none, directsync, writeback), aio mode (threads, native) or discard (ignore, unmap) or detect-zeroes (off, unmap) does not influence occurence either. Having more RAM in the guest seems to increase odds of corruption: With 512MB to the Debian guest problem hardly occurs at all, with 4GB RAM it happens almost instantly. An automated reproducer works as follows: - the guest *does* mount its root fs and swap with option discard and my testing leaves me with the impression that file deletion rather than reading is causing the issue - foo is a snapshot of the running Debian VM which is already running command # while true ; do dd if=/dev/zero of=foo bs=10240k count=400 ; done to produce some I/O to the disk (4GB file with 4GB of RAM). - on the host a loop continuously resumes and saves the guest state and quits qemu inbetween: # while true ; do (echo loadvm foo ; echo c ; sleep 10 ; echo stop ; echo savevm foo ; echo quit ) | bin/qemu-bisect/bin/qemu-system-x86_64 -machine pc-q35-3.1,accel=kvm -m 4096 -chardev stdio,id=charmonitor -mon chardev=charmonitor -drive file=debian.qcow2,id=d -S -display none ; done - quitting qemu inbetween saves and loads seems to be necessary for the problem to occur. Just continusouly in one session saving and loading guest state does not trigger it. - For me, after about 2 to 6 iterations of above loop the image is corrupted. - corruption manifests with other messages from qemu as well, e.g.: (qemu) loadvm foo Error: Device 'd'
Re: [PATCH] Semihost SYS_READC implementation (v4)
Richard Henderson writes: > For semi-hosting, it seems even better if the semi-hosting syscall instruction > is not "real", because you're explicitly requesting services from "unreal" > hardware. It should be specified to generate a SIGILL type of exception > anywhere semi-hosting is not enabled. In the QEMU case, yes, it's virtual hardware. However, the other common case for semihosting is when doing hardware bringup using JTAG or other remote debugging link -- having an instruction which safely traps to the debugger is required to make semihosting usable there. Reading through the history of the current RISC-V semihosting mechanism, there were many designs considered and rejected because they wouldn't work in the JTAG environment. > With that in mind, it may be simpler to handle all of this not in the > translator, but in the function that delivers the ebreak exception. At that > point one can arrange to read memory without raising additional > exceptions. I'll go explore and see if I can figure any of this out. I'd still like to get the non-RISC-V SYS_READC patch landed someday :-) -- -keith signature.asc Description: PGP signature
[ANNOUNCE] QEMU 4.1.1 Stable released
Hi everyone, I am pleased to announce that the QEMU v4.1.1 stable release is now available: You can grab the tarball from our download page here: https://www.qemu.org/download/#source v4.1.1 is now tagged in the official qemu.git repository, and the stable-4.1 branch has been updated accordingly: https://git.qemu.org/?p=qemu.git;a=shortlog;h=refs/heads/stable-4.1 This update contains a number of fixes relating qcow2 and XFS corruption, a security fix for LSI SCSI emulation (CVE-2019-12068), and general fixes for various other subsystems. Please see the changelog for additional details and update accordingly. Thank you to everyone involved! CHANGELOG: 99c5874a9b: Update version for 4.1.1 release (Michael Roth) e092a17d38: mirror: Keep mirror_top_bs drained after dropping permissions (Kevin Wolf) 088f1e8fd9: block/create: Do not abort if a block driver is not available (Philippe Mathieu-Daudé) 145b562990: vhost: Fix memory region section comparison (Dr. David Alan Gilbert) 42b6571357: memory: Provide an equality function for MemoryRegionSections (Dr. David Alan Gilbert) c0aca9352d: memory: Align MemoryRegionSections fields (Dr. David Alan Gilbert) 54c130493c: tests: make filemonitor test more robust to event ordering (Daniel P. Berrangé) 3d018ff3bd: block: posix: Always allocate the first block (Nir Soffer) f0d3fa265d: file-posix: Handle undetectable alignment (Nir Soffer) 7db05c8a73: block/file-posix: Let post-EOF fallocate serialize (Max Reitz) d9b88f7e0d: block: Add bdrv_co_get_self_request() (Max Reitz) 590cff8230: block: Make wait/mark serialising requests public (Max Reitz) 2e2ad02f2c: block/io: refactor padding (Vladimir Sementsov-Ogievskiy) b3b76fc643: util/iov: improve qemu_iovec_is_zero (Vladimir Sementsov-Ogievskiy) cff024fe85: util/iov: introduce qemu_iovec_init_extended (Vladimir Sementsov-Ogievskiy) 40df4a1bf7: qcow2-bitmap: Fix uint64_t left-shift overflow (Tuguoyi) b156178553: iotests: Add peek_file* functions (Max Reitz) 15f5e8c367: iotests: Add test for 4G+ compressed qcow2 write (Max Reitz) 405deba14f: qcow2: Fix QCOW2_COMPRESSED_SECTOR_MASK (Max Reitz) 01be50603b: virtio-blk: Cancel the pending BH when the dataplane is reset (Philippe Mathieu-Daudé) 051c9b3cbc: scsi: lsi: exit infinite loop while executing script (CVE-2019-12068) (Paolo Bonzini) b387531323: target/xtensa: regenerate and re-import test_mmuhifi_c3 core (Max Filippov) cdc6896659: target/arm: Allow reading flags from FPSCR for M-profile (Christophe Lyon) c0b35d87de: hbitmap: handle set/reset with zero length (Vladimir Sementsov-Ogievskiy) fcd7cba6ac: util/hbitmap: strict hbitmap_reset (Vladimir Sementsov-Ogievskiy) aea18ef938: COLO-compare: Fix incorrect `if` logic (Fan Yang) 4887acf574: virtio-net: prevent offloads reset on migration (Mikhail Sennikovsky) 8010d3fce0: virtio: new post_load hook (Michael S. Tsirkin) 6705b9344f: ui: Fix hanging up Cocoa display on macOS 10.15 (Catalina) (Hikaru Nishida) c0e2fbf124: mirror: Do not dereference invalid pointers (Max Reitz) b077ac637d: iotests: Test large write request to qcow2 file (Max Reitz) 9e51c5306c: qcow2: Limit total allocation range to INT_MAX (Max Reitz) aae0faa5d3: hw/core/loader: Fix possible crash in rom_copy() (Thomas Huth) 7b404cae7f: vhost-user: save features if the char dev is closed (Adrian Moreno) d868d30db6: iotests: Test internal snapshots with -blockdev (Kevin Wolf) 7a8aa6c734: block/snapshot: Restrict set of snapshot nodes (Kevin Wolf) 331c08d300: s390: PCI: fix IOMMU region init (Matthew Rosato) fc5afb1a92: roms/Makefile.edk2: don't pull in submodules when building from tarball (Michael Roth) c5c9b1362d: make-release: pull in edk2 submodules so we can build it from tarballs (Michael Roth) 220816989c: hw/arm/boot.c: Set NSACR.{CP11,CP10} for NS kernel boots (Peter Maydell) 783e7eb52c: block/backup: fix backup_cow_with_offload for last cluster (Vladimir Sementsov-Ogievskiy) e01ed1a1ae: block/backup: fix max_transfer handling for copy_range (Vladimir Sementsov-Ogievskiy) 416a692e51: qcow2: Fix corruption bug in qcow2_detect_metadata_preallocation() (Kevin Wolf) e9bb3d942e: coroutine: Add qemu_co_mutex_assert_locked() (Kevin Wolf) 84f22c7285: block/qcow2: Fix corruption introduced by commit 8ac0f15f335 (Maxim Levitsky) 86b0f4022b: blockjob: update nodes head while removing all bdrv (Sergio Lopez) 2d86df1f78: curl: Handle success in multi_check_completion (Max Reitz) 18e1b71937: curl: Report only ready sockets (Max Reitz) 0888ddac8e: curl: Pass CURLSocket to curl_multi_do() (Max Reitz) 4be97ef966: curl: Check completion in curl_multi_do() (Max Reitz) 78ea94e389: curl: Keep *socket until the end of curl_sock_cb() (Max Reitz) 3648493495: curl: Keep pointer to the CURLState in CURLSocket (Max Reitz) 0694c489cd: block/nfs: tear down aio before nfs_close (Peter Lieven) c9ffb12754: qcow2: Fix the calculation of the maximum L2 cache size (Alberto Garcia) 28a9a3558a: libvhost-user: fix SLAVE_SEND_FD handling (Johannes Berg) 9027d3fba6:
[PATCH v3 4/4] tests: More iotest 223 improvements
Run the core of the test twice, once without iothreads, and again with, for more coverage of both setups. Suggested-by: Nir Soffer Signed-off-by: Eric Blake Reviewed-by: Max Reitz --- tests/qemu-iotests/223 | 16 ++- tests/qemu-iotests/223.out | 85 +- 2 files changed, 97 insertions(+), 4 deletions(-) diff --git a/tests/qemu-iotests/223 b/tests/qemu-iotests/223 index b5a80e50bbc1..ea69cd4b8b5e 100755 --- a/tests/qemu-iotests/223 +++ b/tests/qemu-iotests/223 @@ -117,10 +117,19 @@ _send_qemu_cmd $QEMU_HANDLE '{"execute":"qmp_capabilities"}' "return" _send_qemu_cmd $QEMU_HANDLE '{"execute":"blockdev-add", "arguments":{"driver":"qcow2", "node-name":"n", "file":{"driver":"file", "filename":"'"$TEST_IMG"'"}}}' "return" -_send_qemu_cmd $QEMU_HANDLE '{"execute":"x-blockdev-set-iothread", - "arguments":{"node-name":"n", "iothread":"io0"}}' "return" _send_qemu_cmd $QEMU_HANDLE '{"execute":"block-dirty-bitmap-disable", "arguments":{"node":"n", "name":"b"}}' "return" + +for attempt in normal iothread; do + +echo +echo "=== Set up NBD with $attempt access ===" +echo +if [ $attempt = iothread ]; then +_send_qemu_cmd $QEMU_HANDLE '{"execute":"x-blockdev-set-iothread", + "arguments":{"node-name":"n", "iothread":"io0"}}' "return" +fi + _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add", "arguments":{"device":"n"}}' "error" # Attempt add without server _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-start", @@ -180,6 +189,9 @@ _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-remove", "arguments":{"name":"n2"}}' "error" # Attempt duplicate clean _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-stop"}' "return" _send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-stop"}' "error" # Again + +done + _send_qemu_cmd $QEMU_HANDLE '{"execute":"quit"}' "return" wait=yes _cleanup_qemu diff --git a/tests/qemu-iotests/223.out b/tests/qemu-iotests/223.out index 861ddbd9e0a4..f17559880268 100644 --- a/tests/qemu-iotests/223.out +++ b/tests/qemu-iotests/223.out @@ -28,10 +28,91 @@ wrote 2097152/2097152 bytes at offset 2097152 {"return": {}} {"execute":"blockdev-add", "arguments":{"driver":"IMGFMT", "node-name":"n", "file":{"driver":"file", "filename":"TEST_DIR/t.IMGFMT"}}} {"return": {}} -{"execute":"x-blockdev-set-iothread", "arguments":{"node-name":"n", "iothread":"io0"}} -{"return": {}} {"execute":"block-dirty-bitmap-disable", "arguments":{"node":"n", "name":"b"}} {"return": {}} + +=== Set up NBD with normal access === + +{"execute":"nbd-server-add", "arguments":{"device":"n"}} +{"error": {"class": "GenericError", "desc": "NBD server not running"}} +{"execute":"nbd-server-start", "arguments":{"addr":{"type":"unix", "data":{"path":"SOCK_DIR/nbd" +{"return": {}} +{"execute":"nbd-server-start", "arguments":{"addr":{"type":"unix", "data":{"path":"SOCK_DIR/nbd1" +{"error": {"class": "GenericError", "desc": "NBD server already running"}} +exports available: 0 +{"execute":"nbd-server-add", "arguments":{"device":"n", "bitmap":"b"}} +{"return": {}} +{"execute":"nbd-server-add", "arguments":{"device":"nosuch"}} +{"error": {"class": "GenericError", "desc": "Cannot find device=nosuch nor node_name=nosuch"}} +{"execute":"nbd-server-add", "arguments":{"device":"n"}} +{"error": {"class": "GenericError", "desc": "NBD server already has export named 'n'"}} +{"execute":"nbd-server-add", "arguments":{"device":"n", "name":"n2", "bitmap":"b2"}} +{"error": {"class": "GenericError", "desc": "Enabled bitmap 'b2' incompatible with readonly export"}} +{"execute":"nbd-server-add", "arguments":{"device":"n", "name":"n2", "bitmap":"b3"}} +{"error": {"class": "GenericError", "desc": "Bitmap 'b3' is not found"}} +{"execute":"nbd-server-add", "arguments":{"device":"n", "name":"n2", "writable":true, "bitmap":"b2"}} +{"return": {}} +exports available: 2 + export: 'n' + size: 4194304 + flags: 0x58f ( readonly flush fua df multi cache ) + min block: 1 + opt block: 4096 + max block: 33554432 + available meta contexts: 2 + base:allocation + qemu:dirty-bitmap:b + export: 'n2' + size: 4194304 + flags: 0xced ( flush fua trim zeroes df cache fast-zero ) + min block: 1 + opt block: 4096 + max block: 33554432 + available meta contexts: 2 + base:allocation + qemu:dirty-bitmap:b2 + +=== Contrast normal status to large granularity dirty-bitmap === + +read 512/512 bytes at offset 512 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 524288/524288 bytes at offset 524288 +512 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 1048576/1048576 bytes at offset 1048576 +1 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +read 2097152/2097152 bytes at offset 2097152 +2 MiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +[{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, +{ "start": 4096, "length": 1044480, "depth": 0, "zero": true, "data": false, "offset": OFFSET}, +{ "start":
[PATCH v3 2/4] iotests: Switch nbd tests to use Unix rather than TCP
Up to now, all it took to cause a lot of iotest failures was to have a background process such as 'nbdkit -p 10810 null' running, because we hard-coded the TCP port. Switching to a Unix socket eliminates this contention. We still have TCP coverage in test 233, and that test is more careful to not pick a hard-coded port. Signed-off-by: Eric Blake --- tests/qemu-iotests/common.filter | 6 -- tests/qemu-iotests/common.rc | 8 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter index f870e00e4421..5367deea398e 100644 --- a/tests/qemu-iotests/common.filter +++ b/tests/qemu-iotests/common.filter @@ -127,7 +127,8 @@ _filter_img_create() -e "s#$TEST_DIR#TEST_DIR#g" \ -e "s#$SOCK_DIR#SOCK_DIR#g" \ -e "s#$IMGFMT#IMGFMT#g" \ --e 's#nbd:127.0.0.1:10810#TEST_DIR/t.IMGFMT#g' \ +-e 's#nbd:127.0.0.1:[0-9]\\+#TEST_DIR/t.IMGFMT#g' \ +-e 's#nbd+unix:///\??socket=SOCK_DIR/nbd#TEST_DIR/t.IMGFMT#g' \ -e "s# encryption=off##g" \ -e "s# cluster_size=[0-9]\\+##g" \ -e "s# table_size=[0-9]\\+##g" \ @@ -164,7 +165,8 @@ _filter_img_info() -e "s#$TEST_DIR#TEST_DIR#g" \ -e "s#$SOCK_DIR#SOCK_DIR#g" \ -e "s#$IMGFMT#IMGFMT#g" \ --e 's#nbd://127.0.0.1:10810$#TEST_DIR/t.IMGFMT#g' \ +-e 's#nbd://127.0.0.1:[0-9]\\+$#TEST_DIR/t.IMGFMT#g' \ +-e 's#nbd+unix:///\??socket=SOCK_DIR/nbd#TEST_DIR/t.IMGFMT#g' \ -e 's#json.*vdisk-id.*vxhs"}}#TEST_DIR/t.IMGFMT#' \ -e "/encrypted: yes/d" \ -e "/cluster_size: [0-9]\\+/d" \ diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc index fa7bae24226a..f772dcb67322 100644 --- a/tests/qemu-iotests/common.rc +++ b/tests/qemu-iotests/common.rc @@ -217,7 +217,7 @@ if [ "$IMGOPTSSYNTAX" = "true" ]; then TEST_IMG="$DRIVER,file.filename=$TEST_DIR/t.$IMGFMT" elif [ "$IMGPROTO" = "nbd" ]; then TEST_IMG_FILE=$TEST_DIR/t.$IMGFMT -TEST_IMG="$DRIVER,file.driver=nbd,file.host=127.0.0.1,file.port=10810" + TEST_IMG="$DRIVER,file.driver=nbd,file.type=unix,file.path=$SOCKDIR/$IMGFMT" elif [ "$IMGPROTO" = "ssh" ]; then TEST_IMG_FILE=$TEST_DIR/t.$IMGFMT TEST_IMG="$DRIVER,file.driver=ssh,file.host=127.0.0.1,file.path=$TEST_IMG_FILE" @@ -233,7 +233,7 @@ else TEST_IMG=$TEST_DIR/t.$IMGFMT elif [ "$IMGPROTO" = "nbd" ]; then TEST_IMG_FILE=$TEST_DIR/t.$IMGFMT -TEST_IMG="nbd:127.0.0.1:10810" +TEST_IMG="nbd+unix:///?socket=$SOCK_DIR/nbd" elif [ "$IMGPROTO" = "ssh" ]; then TEST_IMG_FILE=$TEST_DIR/t.$IMGFMT REMOTE_TEST_DIR="ssh://\\($USER@\\)\\?127.0.0.1\\(:[0-9]\\+\\)\\?$TEST_DIR" @@ -293,7 +293,7 @@ _stop_nbd_server() local QEMU_NBD_PID read QEMU_NBD_PID < "${QEMU_TEST_DIR}/qemu-nbd.pid" kill ${QEMU_NBD_PID} -rm -f "${QEMU_TEST_DIR}/qemu-nbd.pid" +rm -f "${QEMU_TEST_DIR}/qemu-nbd.pid" "$SOCK_DIR/nbd" fi } @@ -353,7 +353,7 @@ _make_test_img() if [ $IMGPROTO = "nbd" ]; then # Pass a sufficiently high number to -e that should be enough for all # tests -eval "$QEMU_NBD -v -t -b 127.0.0.1 -p 10810 -f $IMGFMT -e 42 -x '' $TEST_IMG_FILE >/dev/null &" +eval "$QEMU_NBD -v -t -k '$SOCK_DIR/nbd' -f $IMGFMT -e 42 -x '' $TEST_IMG_FILE >/dev/null &" sleep 1 # FIXME: qemu-nbd needs to be listening before we continue fi -- 2.21.0
[PATCH v3 1/4] iotests: Fix 173
This test has been broken since 3.0. It used TEST_IMG to influence the name of a file created during _make_test_img, but commit 655ae6bb changed things so that the wrong file name is being created, which then caused _launch_qemu to fail. In the meantime, the set of events issued for the actions of the test has increased. Why haven't we noticed the failure? Because the test rarely gets run: './check -qcow2 173' is insufficient (that defaults to using file protocol) './check -nfs 173' is insufficient (that defaults to using raw format) so the test is only run with: ./check -qcow2 -nfs 173 Note that we already have a number of other problems with -nfs: ./check -nfs (fails 18/30) ./check -qcow2 -nfs (fails 45/76 after this patch, if exports does not permit 'insecure') and it's not on my priority list to fix those. Rather, I found this because of my next patch's work on tests using _send_qemu_cmd. Fixes: 655ae6b Signed-off-by: Eric Blake Reviewed-by: Max Reitz --- tests/qemu-iotests/173 | 4 ++-- tests/qemu-iotests/173.out | 6 +- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/tests/qemu-iotests/173 b/tests/qemu-iotests/173 index 9e2fa2e73cb9..29dcaa1960df 100755 --- a/tests/qemu-iotests/173 +++ b/tests/qemu-iotests/173 @@ -47,9 +47,9 @@ size=100M BASE_IMG="${TEST_DIR}/image.base" TOP_IMG="${TEST_DIR}/image.snp1" -TEST_IMG="${BASE_IMG}" _make_test_img $size +TEST_IMG_FILE="${BASE_IMG}" _make_test_img $size -TEST_IMG="${TOP_IMG}" _make_test_img $size +TEST_IMG_FILE="${TOP_IMG}" _make_test_img $size echo echo === Running QEMU, using block-stream to find backing image === diff --git a/tests/qemu-iotests/173.out b/tests/qemu-iotests/173.out index f477a0099a32..e83d17ec2f64 100644 --- a/tests/qemu-iotests/173.out +++ b/tests/qemu-iotests/173.out @@ -7,6 +7,10 @@ Formatting 'TEST_DIR/image.snp1', fmt=IMGFMT size=104857600 {"return": {}} {"return": {}} {"return": {}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "disk2"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "disk2"}} {"return": {}} -{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "disk2", "len": 104857600, "offset": 104857600, "speed": 0, "type": "stream"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "disk2"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "JOB_STATUS_CHANGE", "data": {"status": "pending", "id": "disk2"}} +{"timestamp": {"seconds": TIMESTAMP, "microseconds": TIMESTAMP}, "event": "BLOCK_JOB_COMPLETED", "data": {"device": "disk2", "len": 0, "offset": 0, "speed": 0, "type": "stream"}} *** done -- 2.21.0
[PATCH v3 3/4] iotests: Include QMP input in .out files
We generally include relevant HMP input in .out files, by virtue of the fact that HMP echoes its input. But QMP does not, so we have to explicitly inject it in the output stream (appropriately filtered to keep the tests passing), in order to make it easier to read .out files to see what behavior is being tested (especially true where the output file is a sequence of {'return': {}}). Suggested-by: Max Reitz Signed-off-by: Eric Blake --- tests/qemu-iotests/common.qemu | 9 tests/qemu-iotests/085.out | 26 ++ tests/qemu-iotests/094.out | 4 ++ tests/qemu-iotests/095.out | 2 + tests/qemu-iotests/109.out | 88 ++ tests/qemu-iotests/117.out | 5 ++ tests/qemu-iotests/127.out | 4 ++ tests/qemu-iotests/140.out | 5 ++ tests/qemu-iotests/141.out | 26 ++ tests/qemu-iotests/143.out | 3 ++ tests/qemu-iotests/144.out | 5 ++ tests/qemu-iotests/153.out | 11 + tests/qemu-iotests/156.out | 11 + tests/qemu-iotests/161.out | 8 tests/qemu-iotests/173.out | 4 ++ tests/qemu-iotests/182.out | 8 tests/qemu-iotests/183.out | 11 + tests/qemu-iotests/185.out | 18 +++ tests/qemu-iotests/191.out | 8 tests/qemu-iotests/200.out | 1 + tests/qemu-iotests/223.out | 19 tests/qemu-iotests/229.out | 3 ++ tests/qemu-iotests/249.out | 6 +++ 23 files changed, 285 insertions(+) diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu index 8d2021a7eb0c..de680cf1c7c9 100644 --- a/tests/qemu-iotests/common.qemu +++ b/tests/qemu-iotests/common.qemu @@ -123,6 +123,9 @@ _timed_wait_for() # until either timeout, or a response. If it is not set, or <=0, # then the command is only sent once. # +# If neither $silent nor $mismatch_only is set, and $cmd begins with '{', +# echo the command before sending it the first time. +# # If $qemu_error_no_exit is set, then even if the expected response # is not seen, we will not exit. $QEMU_STATUS[$1] will be set it -1 in # that case. @@ -152,6 +155,12 @@ _send_qemu_cmd() shift $(($# - 2)) fi +# Display QMP being sent, but not HMP (since HMP already echoes its +# input back to output); decide based on leading '{' +if [ -z "$silent" ] && [ -z "$mismatch_only" ] && +[ "$cmd" != "${cmd#\{}" ]; then +echo "${cmd}" | _filter_testdir | _filter_imgfmt +fi while [ ${count} -gt 0 ] do echo "${cmd}" >&${QEMU_IN[${h}]} diff --git a/tests/qemu-iotests/085.out b/tests/qemu-iotests/085.out index 2a5f256cd3ec..bb50227b8265 100644 --- a/tests/qemu-iotests/085.out +++ b/tests/qemu-iotests/085.out @@ -7,48 +7,61 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 === Sending capabilities === +{ 'execute': 'qmp_capabilities' } {"return": {}} === Create a single snapshot on virtio0 === +{ 'execute': 'blockdev-snapshot-sync', 'arguments': { 'device': 'virtio0', 'snapshot-file':'TEST_DIR/1-snapshot-v0.IMGFMT', 'format': 'IMGFMT' } } Formatting 'TEST_DIR/1-snapshot-v0.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/t.qcow2.1 backing_fmt=qcow2 cluster_size=65536 lazy_refcounts=off refcount_bits=16 {"return": {}} === Invalid command - missing device and nodename === +{ 'execute': 'blockdev-snapshot-sync', 'arguments': { 'snapshot-file':'TEST_DIR/1-snapshot-v0.IMGFMT', 'format': 'IMGFMT' } } {"error": {"class": "GenericError", "desc": "Cannot find device= nor node_name="}} === Invalid command - missing snapshot-file === +{ 'execute': 'blockdev-snapshot-sync', 'arguments': { 'device': 'virtio0', 'format': 'IMGFMT' } } {"error": {"class": "GenericError", "desc": "Parameter 'snapshot-file' is missing"}} === Create several transactional group snapshots === +{ 'execute': 'transaction', 'arguments': {'actions': [ { 'type': 'blockdev-snapshot-sync', 'data' : { 'device': 'virtio0', 'snapshot-file': 'TEST_DIR/2-snapshot-v0.IMGFMT' } }, { 'type': 'blockdev-snapshot-sync', 'data' : { 'device': 'virtio1', 'snapshot-file': 'TEST_DIR/2-snapshot-v1.IMGFMT' } } ] } } Formatting 'TEST_DIR/2-snapshot-v0.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/1-snapshot-v0.qcow2 backing_fmt=qcow2 cluster_size=65536 lazy_refcounts=off refcount_bits=16 Formatting 'TEST_DIR/2-snapshot-v1.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/t.qcow2.2 backing_fmt=qcow2 cluster_size=65536 lazy_refcounts=off refcount_bits=16 {"return": {}} +{ 'execute': 'transaction', 'arguments': {'actions': [ { 'type': 'blockdev-snapshot-sync', 'data' : { 'device': 'virtio0', 'snapshot-file': 'TEST_DIR/3-snapshot-v0.IMGFMT' } }, { 'type': 'blockdev-snapshot-sync', 'data' : { 'device': 'virtio1', 'snapshot-file': 'TEST_DIR/3-snapshot-v1.IMGFMT' } } ] } } Formatting 'TEST_DIR/3-snapshot-v0.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/2-snapshot-v0.qcow2 backing_fmt=qcow2 cluster_size=65536
[PATCH v3 for-4.2 0/4] tests: More iotest 223 improvements
Since v2: - rebase to SOCK_DIR changes - new patch 2 to avoid TCP port 10810 contention [Max] - add imgfmt filtering [Max] As this is limited to iotests, I think it is fair game for -rc2. Eric Blake (4): iotests: Fix 173 iotests: Switch nbd tests to use Unix rather than TCP iotests: Include QMP input in .out files tests: More iotest 223 improvements tests/qemu-iotests/common.filter | 6 +- tests/qemu-iotests/common.qemu | 9 +++ tests/qemu-iotests/common.rc | 8 +-- tests/qemu-iotests/085.out | 26 tests/qemu-iotests/094.out | 4 ++ tests/qemu-iotests/095.out | 2 + tests/qemu-iotests/109.out | 88 +++ tests/qemu-iotests/117.out | 5 ++ tests/qemu-iotests/127.out | 4 ++ tests/qemu-iotests/140.out | 5 ++ tests/qemu-iotests/141.out | 26 tests/qemu-iotests/143.out | 3 + tests/qemu-iotests/144.out | 5 ++ tests/qemu-iotests/153.out | 11 tests/qemu-iotests/156.out | 11 tests/qemu-iotests/161.out | 8 +++ tests/qemu-iotests/173 | 4 +- tests/qemu-iotests/173.out | 10 +++- tests/qemu-iotests/182.out | 8 +++ tests/qemu-iotests/183.out | 11 tests/qemu-iotests/185.out | 18 ++ tests/qemu-iotests/191.out | 8 +++ tests/qemu-iotests/200.out | 1 + tests/qemu-iotests/223 | 16 - tests/qemu-iotests/223.out | 100 +++ tests/qemu-iotests/229.out | 3 + tests/qemu-iotests/249.out | 6 ++ 27 files changed, 395 insertions(+), 11 deletions(-) -- 2.21.0
Re: [PATCH v2 1/3] hw/block/pflash: Remove dynamic field width from trace events
Hi Eric, On 11/8/19 4:56 PM, Eric Blake wrote: On 11/8/19 8:40 AM, Philippe Mathieu-Daudé wrote: Since not all trace backends support dynamic field width in format (dtrace via stap does not), replace by a static field width instead. Reported-by: Eric Blake Buglink: https://bugs.launchpad.net/qemu/+bug/1844817 Signed-off-by: Philippe Mathieu-Daudé --- hw/block/pflash_cfi01.c | 8 hw/block/pflash_cfi02.c | 8 hw/block/trace-events | 8 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/hw/block/pflash_cfi01.c b/hw/block/pflash_cfi01.c index 566c0acb77..787d1196f2 100644 --- a/hw/block/pflash_cfi01.c +++ b/hw/block/pflash_cfi01.c @@ -276,7 +276,7 @@ static uint32_t pflash_data_read(PFlashCFI01 *pfl, hwaddr offset, DPRINTF("BUG in %s\n", __func__); abort(); } - trace_pflash_data_read(offset, width << 1, ret); + trace_pflash_data_read(offset, width << 3, ret); Umm, why is width changing? That's not mentioned in the commit message. Previously it was used to set the format width: [1, 2, 4] -> [2, 4, 8]. We usually log the width in byte (accessed at memory location) or bits (used by the bus). When using this device I'm custom to think in bus access width. Regardless whichever format we prefer, a change is needed. @@ -389,7 +389,7 @@ static uint32_t pflash_read(PFlashCFI01 *pfl, hwaddr offset, break; } - trace_pflash_io_read(offset, width, width << 1, ret, pfl->cmd, pfl->wcycle); + trace_pflash_io_read(offset, width << 3, ret, pfl->cmd, pfl->wcycle); And even this one is odd. Matching up to the trace messages: -pflash_io_read(uint64_t offset, int width, int fmt_width, uint32_t value, uint8_t cmd, uint8_t wcycle) "offset:0x%04"PRIx64" width:%d value:0x%0*x cmd:0x%02x wcycle:%u" +pflash_io_read(uint64_t offset, int width, uint32_t value, uint8_t cmd, uint8_t wcycle) "offset:0x%04"PRIx64" width:%d value:0x%04x cmd:0x%02x wcycle:%u" you are changing from: "%04"PRIx64" %d %0*x...", offset, width, width << 1, ret,... (where width<<1, ret matches *x) into "%04"PRIx64" %d %04x...", offset, width << 3, ret,... where you are now printing a different value for width. Do you prefer using a "-bit" suffix? As "offset:0x%04"PRIx64" width:%d-bit value:0x%04x cmd:0x%02x wcycle:%u" I can also simply remove this information. Ideally I'd revert this patch once the we get this format parsable by the SystemTap backend.
Re: [PATCH v2 3/5] MAINTAINERS: Adjust maintainership for Malta board
+Paul Burton On 11/13/19 2:47 PM, Aleksandar Markovic wrote: From: Aleksandar Markovic Change the maintainership for Malta board to improve its quality. Acked-by: Aurelien Jarno Signed-off-by: Aleksandar Markovic --- MAINTAINERS | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3bf2144..6afec32 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -955,8 +955,9 @@ F: hw/display/jazz_led.c F: hw/dma/rc4030.c Malta -M: Aurelien Jarno > +R: Aurelien Jarno Aurelien, do you want to stay co-maintainer with this one? Else, thanks for staying listed as designated reviewer :) -R: Aleksandar Rikalo +M: Philippe Mathieu-Daudé I'm happy to continue Aurelien work with this board :) Similarly to the Fuloong board, I mostly use the Malta as a hobby, so I'll use my personal email. Paul, do you mind being co-maintainer or at least listed as designated reviewer here? +R: Hervé Poussineau I don't see commits from Hervé with this board, so he is probably not interested. S: Maintained Until Paul is interested in co-maintenance, let's change this board status to 'Odd Fixes'. F: hw/mips/mips_malta.c F: hw/mips/gt64xxx_pci.c
Re: [PATCH v2 2/3] hw/mips/gt64xxx: Remove dynamic field width from trace events
On 11/8/19 4:58 PM, Eric Blake wrote: On 11/8/19 8:40 AM, Philippe Mathieu-Daudé wrote: Since not all trace backends support dynamic field width in format (dtrace via stap does not), replace by a static field width instead. Reported-by: Eric Blake Buglink: https://bugs.launchpad.net/qemu/+bug/1844817 Signed-off-by: Philippe Mathieu-Daudé --- v2: Do not update qemu_log_mask() --- hw/mips/gt64xxx_pci.c | 16 hw/mips/trace-events | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/hw/mips/gt64xxx_pci.c b/hw/mips/gt64xxx_pci.c index 5cab9c1ee1..6743e7c929 100644 --- a/hw/mips/gt64xxx_pci.c +++ b/hw/mips/gt64xxx_pci.c @@ -642,19 +642,19 @@ static void gt64120_writel(void *opaque, hwaddr addr, /* not really implemented */ s->regs[saddr] = ~(~(s->regs[saddr]) | ~(val & 0xfffe)); s->regs[saddr] |= !!(s->regs[saddr] & 0xfffe); - trace_gt64120_write("INTRCAUSE", size << 1, val); + trace_gt64120_write("INTRCAUSE", size << 3, val); Again, this isn't mentioned in the commit message. Why are you changing parameter values? +++ b/hw/mips/trace-events @@ -1,4 +1,4 @@ # gt64xxx.c -gt64120_read(const char *regname, int width, uint64_t value) "gt64120 read %s value:0x%0*" PRIx64 -gt64120_write(const char *regname, int width, uint64_t value) "gt64120 write %s value:0x%0*" PRIx64 +gt64120_read(const char *regname, int width, uint64_t value) "gt64120 read %s width:%d value:0x%08" PRIx64 +gt64120_write(const char *regname, int width, uint64_t value) "gt64120 write %s width:%d value:0x%08" PRIx64 Huh, we were really broken - the old code (if passed to printf) would try to parse 4 parameters, even though it was only passed 3. But it looks like you still need a v3. Oops. I am surprise the compiler doesn't emit a warning here...
Re: [PATCH v9 Kernel 3/5] vfio iommu: Add ioctl defination to unmap IOVA and return dirty bitmap
On Fri, 15 Nov 2019 00:26:26 +0530 Kirti Wankhede wrote: > On 11/14/2019 1:52 AM, Alex Williamson wrote: > > On Thu, 14 Nov 2019 01:22:39 +0530 > > Kirti Wankhede wrote: > > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote: > >>> On Tue, 12 Nov 2019 22:33:38 +0530 > >>> Kirti Wankhede wrote: > >>> > With vIOMMU, during pre-copy phase of migration, while CPUs are still > running, IO virtual address unmap can happen while device still keeping > reference of guest pfns. Those pages should be reported as dirty before > unmap, so that VFIO user space application can copy content of those > pages > from source to destination. > > IOCTL defination added here add bitmap pointer, size and flag. If flag > >>> > >>> definition, adds > >>> > VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is set and bitmap memory is > allocated > and bitmap_size of set, then ioctl will create bitmap of pinned pages > and > >>> > >>> s/of/is/ > >>> > then unmap those. > > Signed-off-by: Kirti Wankhede > Reviewed-by: Neo Jia > --- > include/uapi/linux/vfio.h | 33 + > 1 file changed, 33 insertions(+) > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 6fd3822aa610..72fd297baf52 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -925,6 +925,39 @@ struct vfio_iommu_type1_dirty_bitmap { > > #define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, > VFIO_BASE + 17) > > +/** > + * VFIO_IOMMU_UNMAP_DMA_GET_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 18, > + *struct > vfio_iommu_type1_dma_unmap_bitmap) > + * > + * Unmap IO virtual addresses using the provided struct > + * vfio_iommu_type1_dma_unmap_bitmap. Caller sets argsz. > + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get dirty > bitmap > + * before unmapping IO virtual addresses. If this flag is not set, only > IO > + * virtual address are unmapped without creating pinned pages bitmap, > that > + * is, behave same as VFIO_IOMMU_UNMAP_DMA ioctl. > + * User should allocate memory to get bitmap and should set size of > allocated > + * memory in bitmap_size field. One bit in bitmap is used to represent > per page > + * consecutively starting from iova offset. Bit set indicates page at > that > + * offset from iova is dirty. > + * The actual unmapped size is returned in the size field and bitmap of > pages > + * in the range of unmapped size is returned in bitmap if flag > + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is set. > + * > + * No guarantee is made to the user that arbitrary unmaps of iova or > size > + * different from those used in the original mapping call will succeed. > + */ > +struct vfio_iommu_type1_dma_unmap_bitmap { > +__u32argsz; > +__u32flags; > +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) > +__u64iova;/* IO virtual address > */ > +__u64size;/* Size of mapping > (bytes) */ > +__u64bitmap_size; /* in bytes */ > +void __user *bitmap; /* one bit per page */ > +}; > + > +#define VFIO_IOMMU_UNMAP_DMA_GET_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 18) > + > >>> > >>> Why not extend VFIO_IOMMU_UNMAP_DMA to support this rather than add an > >>> ioctl that duplicates the functionality and extends it?? > >> > >> We do want old userspace applications to work with new kernel and > >> vice-versa, right? > >> > >> If I try to change existing VFIO_IOMMU_UNMAP_DMA ioctl structure, say if > >> add 'bitmap_size' and 'bitmap' after 'size', with below code in old > >> kernel, old kernel & new userspace will work. > >> > >> minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); > >> > >> if (copy_from_user(, (void __user *)arg, minsz)) > >> return -EFAULT; > >> > >> if (unmap.argsz < minsz || unmap.flags) > >> return -EINVAL; > >> > >> > >> With new kernel it would change to: > >> minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, bitmap); > > > > No, the minimum structure size still ends at size, we interpret flags > > and argsz to learn if the user understands those fields and optionally > > include them. Therefore old userspace on new kernel continues to work. > > > >> if (copy_from_user(, (void __user *)arg, minsz)) > >> return -EFAULT; > >> > >> if (unmap.argsz < minsz || unmap.flags) > >> return -EINVAL; > >> > >> Then old
Re: [PATCH v9 Kernel 2/5] vfio iommu: Add ioctl defination to get dirty pages bitmap.
On Fri, 15 Nov 2019 00:26:07 +0530 Kirti Wankhede wrote: > On 11/14/2019 1:37 AM, Alex Williamson wrote: > > On Thu, 14 Nov 2019 01:07:21 +0530 > > Kirti Wankhede wrote: > > > >> On 11/13/2019 4:00 AM, Alex Williamson wrote: > >>> On Tue, 12 Nov 2019 22:33:37 +0530 > >>> Kirti Wankhede wrote: > >>> > All pages pinned by vendor driver through vfio_pin_pages API should be > considered as dirty during migration. IOMMU container maintains a list of > all such pinned pages. Added an ioctl defination to get bitmap of such > >>> > >>> definition > >>> > pinned pages for requested IO virtual address range. > >>> > >>> Additionally, all mapped pages are considered dirty when physically > >>> mapped through to an IOMMU, modulo we discussed devices opting in to > >>> per page pinning to indicate finer granularity with a TBD mechanism to > >>> figure out if any non-opt-in devices remain. > >>> > >> > >> You mean, in case of device direct assignment (device pass through)? > > > > Yes, or IOMMU backed mdevs. If vfio_dmas in the container are fully > > pinned and mapped, then the correct dirty page set is all mapped pages. > > We discussed using the vpfn list as a mechanism for vendor drivers to > > reduce their migration footprint, but we also discussed that we would > > need a way to determine that all participants in the container have > > explicitly pinned their working pages or else we must consider the > > entire potential working set as dirty. > > > > How can vendor driver tell this capability to iommu module? Any suggestions? I think it does so by pinning pages. Is it acceptable that if the vendor driver pins any pages, then from that point forward we consider the IOMMU group dirty page scope to be limited to pinned pages? There are complications around non-singleton IOMMU groups, but I think we're already leaning towards that being a non-worthwhile problem to solve. So if we require that only singleton IOMMU groups can pin pages and we pass the IOMMU group as a parameter to vfio_iommu_driver_ops.pin_pages(), then the type1 backend can set a flag on its local vfio_group struct to indicate dirty page scope is limited to pinned pages. We might want to keep a flag on the vfio_iommu struct to indicate if all of the vfio_groups for each vfio_domain in the vfio_iommu.domain_list dirty page scope limited to pinned pages as an optimization to avoid walking lists too often. Then we could test if vfio_iommu.domain_list is not empty and this new flag does not limit the dirty page scope, then everything within each vfio_dma is considered dirty. > Signed-off-by: Kirti Wankhede > Reviewed-by: Neo Jia > --- > include/uapi/linux/vfio.h | 23 +++ > 1 file changed, 23 insertions(+) > > diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h > index 35b09427ad9f..6fd3822aa610 100644 > --- a/include/uapi/linux/vfio.h > +++ b/include/uapi/linux/vfio.h > @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap { > #define VFIO_IOMMU_ENABLE _IO(VFIO_TYPE, VFIO_BASE + 15) > #define VFIO_IOMMU_DISABLE_IO(VFIO_TYPE, VFIO_BASE + 16) > > +/** > + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17, > + * struct > vfio_iommu_type1_dirty_bitmap) > + * > + * IOCTL to get dirty pages bitmap for IOMMU container during migration. > + * Get dirty pages bitmap of given IO virtual addresses range using > + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is > size of > + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to > get > + * bitmap and should set size of allocated memory in bitmap_size field. > + * One bit is used to represent per page consecutively starting from > iova > + * offset. Bit set indicates page at that offset from iova is dirty. > + */ > +struct vfio_iommu_type1_dirty_bitmap { > +__u32argsz; > +__u32flags; > +__u64iova; /* IO virtual address */ > +__u64size; /* Size of iova range */ > +__u64bitmap_size; /* in bytes */ > >>> > >>> This seems redundant. We can calculate the size of the bitmap based on > >>> the iova size. > >>> > >> > >> But in kernel space, we need to validate the size of memory allocated by > >> user instead of assuming user is always correct, right? > > > > What does it buy us for the user to tell us the size? They could be > > wrong, they could be malicious. The argsz field on the ioctl is mostly > > for the handshake that the user is competent, we should get faults from > > the copy-user operation if it's incorrect. > > > > It is to mainly fail safe. > > +void
Re: [PATCH] Semihost SYS_READC implementation (v4)
On Thu, 14 Nov 2019 at 20:52, Richard Henderson wrote: > Yet another reason why I prefer any semi-hosting call to use an encoding that > is otherwise reserved illegal. > > For this, you have to make up your mind: is it important to execute the > instructions as specified by the ISA, or as specified by the semi-hosting > spec? > > In this case, semi-hosting defines an "entry nop" that begins the sequence, > and > I think that we are well within our rights to ignore the validity of "insn1 > insn2 || other-insn". Perhaps. I think you get the same issue with insn1 || insn2 vs insn1 || some-other-insn though. (And the spec has wording that explicitly wants the latter to be handled with the normal "I'm a hint instruction" behaviour of insn1.) -- PMM
Re: [PATCH] pl031: Expose RTCICR as proper WC register
On Thu, 14 Nov 2019 at 20:45, Alexander Graf wrote: > On 14.11.19 15:42, Peter Maydell wrote: > > Is that OK? > > It's much better. Will you just fix it up inline for me please? :) Sure :-) -- PMM
Re: [PATCH] Semihost SYS_READC implementation (v4)
On 11/14/19 8:29 PM, Peter Maydell wrote: > On Thu, 14 Nov 2019 at 19:18, Richard Henderson > wrote: >> - If the sequence crosses a page, then so be it. Because of >> step 1, this only happens when we *must* cross a page, and >> will have recognized any paging exception anyway. >> The generic parts of qemu will handle proper invalidation of >> a TB that crosses a page boundary. > > I'm not sure this would work. If you have > insn1 insn2 || other-insn > (where || is the page boundary and page 2 is non-executable) > then the required behaviour is "execute insn1 and insn2 with > normal behaviour, then fault trying to read other-insn, with > the fault address being that of other-insn". > Whereas for > insn1 insn2 || insn3 > you want to treat it as a semihosting sequence. But you can't distinguish > the two because trying to read the word in page 2 will cause us to > generate a fault with the fault address being that of insn1. Or > have I forgotten how the page-crossing handling works ? Yet another reason why I prefer any semi-hosting call to use an encoding that is otherwise reserved illegal. For this, you have to make up your mind: is it important to execute the instructions as specified by the ISA, or as specified by the semi-hosting spec? In this case, semi-hosting defines an "entry nop" that begins the sequence, and I think that we are well within our rights to ignore the validity of "insn1 insn2 || other-insn". r~
Re: [PATCH] pl031: Expose RTCICR as proper WC register
On 14.11.19 15:42, Peter Maydell wrote: On Tue, 12 Nov 2019 at 11:57, Peter Maydell wrote: On Tue, 12 Nov 2019 at 07:28, Alexander Graf wrote: I still think that being consistent with the actual PL031 spec is preferable though. If any real world guest breaks because of this, we can still revert this patch and document the exact breakage in the comment instead. Yeah, I agree; I'm essentially just gathering material for the commit message here. (The gold standard would be to go find some hardware with a real pl031 and prod it to confirm behaviour, but that's more effort than really seems justified to me.) I propose to put this in for 4.2 with an updated commit message: ===begin=== pl031: Expose RTCICR as proper WC register The current PL031 RTCICR register implementation always clears the IRQ pending status on a register write, regardless of the value the guest writes. To justify that behavior, it references the ARM926EJ-S Development Chip Reference Manual (DDI0287B) and indicates that said document states that any write clears the internal IRQ state. It is indeed true that in section 11.1 this document says: "The interrupt is cleared by writing any data value to the interrupt clear register RTCICR". However, later in section 11.2.2 it contradicts itself by saying: "Writing 1 to bit 0 of RTCICR clears the RTCINTR flag." The latter statement matches the PL031 TRM (DDI0224C), which says: "Writing 1 to bit position 0 clears the corresponding interrupt. Writing 0 has no effect." Let's assume that the self-contradictory DDI0287B is in error, and follow the reference manual for the device itself, by making the register write-one-to-clear. ===endit=== Is that OK? It's much better. Will you just fix it up inline for me please? :) Thanks, Alex Amazon Development Center Germany GmbH Krausenstr. 38 10117 Berlin Geschaeftsfuehrung: Christian Schlaeger, Ralf Herbrich Eingetragen am Amtsgericht Charlottenburg unter HRB 149173 B Sitz: Berlin Ust-ID: DE 289 237 879
Re: [PATCH] Semihost SYS_READC implementation (v4)
On Thu, 14 Nov 2019 at 19:18, Richard Henderson wrote: > - If the sequence crosses a page, then so be it. Because of > step 1, this only happens when we *must* cross a page, and > will have recognized any paging exception anyway. > The generic parts of qemu will handle proper invalidation of > a TB that crosses a page boundary. I'm not sure this would work. If you have insn1 insn2 || other-insn (where || is the page boundary and page 2 is non-executable) then the required behaviour is "execute insn1 and insn2 with normal behaviour, then fault trying to read other-insn, with the fault address being that of other-insn". Whereas for insn1 insn2 || insn3 you want to treat it as a semihosting sequence. But you can't distinguish the two because trying to read the word in page 2 will cause us to generate a fault with the fault address being that of insn1. Or have I forgotten how the page-crossing handling works ? thanks -- PMM
Re: [PATCH] Semihost SYS_READC implementation (v4)
On Thu, 14 Nov 2019 at 17:47, Peter Maydell wrote: > The ARM semihosting ABI also has a number of warts > which are basically historical legacy. With a clean > sheet you get to avoid some of them. (Notably you could > skip the whole 'negotiate presence of extensions' business > by just getting those parts right from the start ...better still, if you can define (a) a mandatory "return version and feature bit info" call right from the start and (b) the required behaviour for attempts to make a semihosting call with an unknown/unimplemented function number -- then you can avoid the nasty kludge "magic behaviour by opening a magic filename" we were stuck with specifying in arm semihosting 2.0. thanks -- PMM
Re: [PATCH] Semihost SYS_READC implementation (v4)
On 11/14/19 5:14 PM, Peter Maydell wrote: > On Fri, 25 Oct 2019 at 20:15, Keith Packard wrote: >> There seems to be convergence on a pretty simple interface which uses >> ebreak surrounded by a couple of specific no-ops: >> >> slli x0, x0, 0x1f >> ebreak >> srai x0, x0, 0x7 >> >> There are implementations in rust and openocd, and I've got one for >> picolibc. The risc-v semihosting code is sitting on a branch in my repo >> on github: >> >> https://github.com/keith-packard/qemu/tree/riscv-semihost > > I had an idle glance at this implementation, and this: > >uint32_t pre = opcode_at(>base, ctx->base.pc_next - 4); >uint32_t ebreak = opcode_at(>base, ctx->base.pc_next); >uint32_t post = opcode_at(>base, ctx->base.pc_next + 4); > > (where opcode_at() is a wrapper for cpu_ldl_code()) has > some unfortunate side effects: if the previous instruction > is in the previous MMU page, or the following instruction > is in the next MMU page, you might incorrectly trigger > an exception (where QEMU will just longjmp straight out of > the cpu_ldl_code()) if that other page isn't actually mapped > in the guest's page table. You need to be careful not to access > code outside the page you're actually on unless you're really > going to execute it and are OK with it faulting. > > I'm not sure what the best way on QEMU to identify this kind > of multiple-instruction-sequence is -- Richard might have an > idea. One approach would be to always take an exception > (or call a helper function) for the 'ebreak', and then > distinguish semihosting from other kinds of ebreak by doing > the load of the preceding/succeeding instructions at runtime > rather than translate time. It's irritating that this 3 insn sequence is already a thing, baked into other sim/emulators. Seems to me that it would have been easier to abscond with ebreak + func3 field != 0. For semi-hosting, it seems even better if the semi-hosting syscall instruction is not "real", because you're explicitly requesting services from "unreal" hardware. It should be specified to generate a SIGILL type of exception anywhere semi-hosting is not enabled. That said, it is possible to do this in the translator, by considering this sequence to be one 12-byte instruction. You'd build in a recognizer into the decoder such that, when semi-hosting is enabled, seeing the entry nop starts looking ahead: - If this is not the first insn in the TB, and there are not 8 more bytes remaining on the page, terminate the current TB. This will re-start the machine in the next TB. - If the complete sequence is not found, then advance the pc past the entry nop and let nature take its course wrt the subsequent instructions. - If the sequence crosses a page, then so be it. Because of step 1, this only happens when we *must* cross a page, and will have recognized any paging exception anyway. The generic parts of qemu will handle proper invalidation of a TB that crosses a page boundary. - This is kinda similar to the target/sh4 code to handle generic user-space atomic sequences: c.f. decode_gusa(). - This implementation does mean that you can't jump directly to the ebreak insn and have the sequence be recognized. Control must flow through the entry nop. This is a bug or a feature depending on the reviewer. ;-) With that in mind, it may be simpler to handle all of this not in the translator, but in the function that delivers the ebreak exception. At that point one can arrange to read memory without raising additional exceptions. r~
Re: [PATCH v9 Kernel 3/5] vfio iommu: Add ioctl defination to unmap IOVA and return dirty bitmap
On 11/14/2019 1:52 AM, Alex Williamson wrote: On Thu, 14 Nov 2019 01:22:39 +0530 Kirti Wankhede wrote: On 11/13/2019 4:00 AM, Alex Williamson wrote: On Tue, 12 Nov 2019 22:33:38 +0530 Kirti Wankhede wrote: With vIOMMU, during pre-copy phase of migration, while CPUs are still running, IO virtual address unmap can happen while device still keeping reference of guest pfns. Those pages should be reported as dirty before unmap, so that VFIO user space application can copy content of those pages from source to destination. IOCTL defination added here add bitmap pointer, size and flag. If flag definition, adds VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is set and bitmap memory is allocated and bitmap_size of set, then ioctl will create bitmap of pinned pages and s/of/is/ then unmap those. Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia --- include/uapi/linux/vfio.h | 33 + 1 file changed, 33 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 6fd3822aa610..72fd297baf52 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -925,6 +925,39 @@ struct vfio_iommu_type1_dirty_bitmap { #define VFIO_IOMMU_GET_DIRTY_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 17) +/** + * VFIO_IOMMU_UNMAP_DMA_GET_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 18, + * struct vfio_iommu_type1_dma_unmap_bitmap) + * + * Unmap IO virtual addresses using the provided struct + * vfio_iommu_type1_dma_unmap_bitmap. Caller sets argsz. + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP should be set to get dirty bitmap + * before unmapping IO virtual addresses. If this flag is not set, only IO + * virtual address are unmapped without creating pinned pages bitmap, that + * is, behave same as VFIO_IOMMU_UNMAP_DMA ioctl. + * User should allocate memory to get bitmap and should set size of allocated + * memory in bitmap_size field. One bit in bitmap is used to represent per page + * consecutively starting from iova offset. Bit set indicates page at that + * offset from iova is dirty. + * The actual unmapped size is returned in the size field and bitmap of pages + * in the range of unmapped size is returned in bitmap if flag + * VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP is set. + * + * No guarantee is made to the user that arbitrary unmaps of iova or size + * different from those used in the original mapping call will succeed. + */ +struct vfio_iommu_type1_dma_unmap_bitmap { + __u32argsz; + __u32flags; +#define VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP (1 << 0) + __u64iova;/* IO virtual address */ + __u64size;/* Size of mapping (bytes) */ + __u64bitmap_size; /* in bytes */ + void __user *bitmap; /* one bit per page */ +}; + +#define VFIO_IOMMU_UNMAP_DMA_GET_BITMAP _IO(VFIO_TYPE, VFIO_BASE + 18) + Why not extend VFIO_IOMMU_UNMAP_DMA to support this rather than add an ioctl that duplicates the functionality and extends it?? We do want old userspace applications to work with new kernel and vice-versa, right? If I try to change existing VFIO_IOMMU_UNMAP_DMA ioctl structure, say if add 'bitmap_size' and 'bitmap' after 'size', with below code in old kernel, old kernel & new userspace will work. minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size); if (copy_from_user(, (void __user *)arg, minsz)) return -EFAULT; if (unmap.argsz < minsz || unmap.flags) return -EINVAL; With new kernel it would change to: minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, bitmap); No, the minimum structure size still ends at size, we interpret flags and argsz to learn if the user understands those fields and optionally include them. Therefore old userspace on new kernel continues to work. if (copy_from_user(, (void __user *)arg, minsz)) return -EFAULT; if (unmap.argsz < minsz || unmap.flags) return -EINVAL; Then old userspace app will fail because unmap.argsz < minsz and might be copy_from_user would cause seg fault because userspace sdk doesn't contain new member variables. We can't change the sequence to keep 'size' as last member, because then new userspace app on old kernel will interpret it wrong. If we have new userspace on old kernel, that userspace needs to be able to learn that this feature exists (new flag in the vfio_iommu_type1_info struct as suggested below) and only make use of it when available. This is why the old kernel checks argsz against minsz. So long as the user passes something at least minsz in size, we have compatibility. The old kernel doesn't understand the GET_DIRTY_BITMAP flag and will return an error if the user attempts to use it. Thanks, Ok. So then VFIO_IOMMU_UNMAP_DMA_GET_BITMAP
Re: [PATCH v9 Kernel 2/5] vfio iommu: Add ioctl defination to get dirty pages bitmap.
On 11/14/2019 1:37 AM, Alex Williamson wrote: On Thu, 14 Nov 2019 01:07:21 +0530 Kirti Wankhede wrote: On 11/13/2019 4:00 AM, Alex Williamson wrote: On Tue, 12 Nov 2019 22:33:37 +0530 Kirti Wankhede wrote: All pages pinned by vendor driver through vfio_pin_pages API should be considered as dirty during migration. IOMMU container maintains a list of all such pinned pages. Added an ioctl defination to get bitmap of such definition pinned pages for requested IO virtual address range. Additionally, all mapped pages are considered dirty when physically mapped through to an IOMMU, modulo we discussed devices opting in to per page pinning to indicate finer granularity with a TBD mechanism to figure out if any non-opt-in devices remain. You mean, in case of device direct assignment (device pass through)? Yes, or IOMMU backed mdevs. If vfio_dmas in the container are fully pinned and mapped, then the correct dirty page set is all mapped pages. We discussed using the vpfn list as a mechanism for vendor drivers to reduce their migration footprint, but we also discussed that we would need a way to determine that all participants in the container have explicitly pinned their working pages or else we must consider the entire potential working set as dirty. How can vendor driver tell this capability to iommu module? Any suggestions? Signed-off-by: Kirti Wankhede Reviewed-by: Neo Jia --- include/uapi/linux/vfio.h | 23 +++ 1 file changed, 23 insertions(+) diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h index 35b09427ad9f..6fd3822aa610 100644 --- a/include/uapi/linux/vfio.h +++ b/include/uapi/linux/vfio.h @@ -902,6 +902,29 @@ struct vfio_iommu_type1_dma_unmap { #define VFIO_IOMMU_ENABLE_IO(VFIO_TYPE, VFIO_BASE + 15) #define VFIO_IOMMU_DISABLE _IO(VFIO_TYPE, VFIO_BASE + 16) +/** + * VFIO_IOMMU_GET_DIRTY_BITMAP - _IOWR(VFIO_TYPE, VFIO_BASE + 17, + * struct vfio_iommu_type1_dirty_bitmap) + * + * IOCTL to get dirty pages bitmap for IOMMU container during migration. + * Get dirty pages bitmap of given IO virtual addresses range using + * struct vfio_iommu_type1_dirty_bitmap. Caller sets argsz, which is size of + * struct vfio_iommu_type1_dirty_bitmap. User should allocate memory to get + * bitmap and should set size of allocated memory in bitmap_size field. + * One bit is used to represent per page consecutively starting from iova + * offset. Bit set indicates page at that offset from iova is dirty. + */ +struct vfio_iommu_type1_dirty_bitmap { + __u32argsz; + __u32flags; + __u64iova; /* IO virtual address */ + __u64size; /* Size of iova range */ + __u64bitmap_size; /* in bytes */ This seems redundant. We can calculate the size of the bitmap based on the iova size. But in kernel space, we need to validate the size of memory allocated by user instead of assuming user is always correct, right? What does it buy us for the user to tell us the size? They could be wrong, they could be malicious. The argsz field on the ioctl is mostly for the handshake that the user is competent, we should get faults from the copy-user operation if it's incorrect. It is to mainly fail safe. + void __user *bitmap;/* one bit per page */ Should we define that as a __u64* to (a) help with the size calculation, and (b) assure that we can use 8-byte ops on it? However, who defines page size? Is it necessarily the processor page size? A physical IOMMU may support page sizes other than the CPU page size. It might be more important to indicate the expected page size than the bitmap size. Thanks, I see in QEMU and in vfio_iommu_type1 module, page sizes considered for mapping are CPU page size, 4K. Do we still need to have such argument? That assumption exists for backwards compatibility prior to supporting the iova_pgsizes field in vfio_iommu_type1_info. AFAIK the current interface has no page size assumptions and we should not add any. So userspace has iova_pgsizes information, which can be input to this ioctl. Bitmap should be considering smallest page size. Does that makes sense? Thanks, Kirti Thanks, Alex
Re: [PATCH v9 Kernel 1/5] vfio: KABI for migration interface for device state
+ * Vendor driver should decide whether to partition data section and how to + * partition the data section. Vendor driver should return data_offset + * accordingly. + * + * Sequence to be followed for _SAVING|_RUNNING device state or pre-copy phase + * and for _SAVING device state or stop-and-copy phase: + * a. read pending_bytes. If pending_bytes > 0, go through below steps. + * b. read data_offset, indicates kernel driver to write data to staging buffer. + *Kernel driver should return this read operation only after writing data to + *staging buffer is done. May I know under what condition this data_offset will be changed per each iteration from a-f ? Its upto vendor driver, if vendor driver maintains multiple partitions in data section. So, do you mean each time before doing b (reading data_offset), step a (reading pending_bytes) is mandatory, otherwise the vendor driver does not know which data_offset is? pending_bytes will only tell bytes remaining to transfer from vendor driver. On read operation on data_offset, vendor driver should decide what to send depending on where he is making data available to userspace. Then, any lock to wrap step a and b to ensure atomic? With current QEMU implementation, where migration is single thread, there is not need of lock yet. But when we add multi-threaded support may be in future then locks will be required in userspace side. Thanks, Kirti
Re: [PATCH v2 00/10] Further bitmaps improvements
14.11.2019 21:47, Eric Blake wrote: > On 10/22/19 7:58 AM, Vladimir Sementsov-Ogievskiy wrote: >> Hi! >> >> The main feature here is improvement of _next_dirty_area API, which I'm >> going to use then for backup / block-copy. >> >> v2: >> 01: just use INT64_MAX instead of adding new constant >> 08: add separate function nbd_extent_array_convert_to_be and converted >> state of NBDExtentArray, to make these things explicit, and avoid >> extra memdup. >> 09: Save part of comment for bitmap_to_extents(), add Eric's r-b > > Is any of this series a bug fix important to get into -rc2? Nothing > Or is it safe to defer to the 5.0 timeframe? Yes, no doubts. > >> >> Vladimir Sementsov-Ogievskiy (10): >> hbitmap: assert that we don't create bitmap larger than INT64_MAX >> hbitmap: move hbitmap_iter_next_word to hbitmap.c >> hbitmap: unpublish hbitmap_iter_skip_words >> hbitmap: drop meta bitmaps as they are unused >> block/dirty-bitmap: switch _next_dirty_area and _next_zero to int64_t >> block/dirty-bitmap: add _next_dirty API >> block/dirty-bitmap: improve _next_dirty_area API >> nbd/server: introduce NBDExtentArray >> nbd/server: use bdrv_dirty_bitmap_next_dirty_area >> block/qcow2-bitmap: use bdrv_dirty_bitmap_next_dirty >> > -- Best regards, Vladimir
Re: [PATCH v9 Kernel 1/5] vfio: KABI for migration interface for device state
On 11/14/2019 2:10 AM, Alex Williamson wrote: On Thu, 14 Nov 2019 01:47:04 +0530 Kirti Wankhede wrote: On 11/14/2019 1:18 AM, Alex Williamson wrote: On Thu, 14 Nov 2019 00:59:52 +0530 Kirti Wankhede wrote: On 11/13/2019 11:57 PM, Alex Williamson wrote: On Wed, 13 Nov 2019 11:24:17 +0100 Cornelia Huck wrote: On Tue, 12 Nov 2019 15:30:05 -0700 Alex Williamson wrote: On Tue, 12 Nov 2019 22:33:36 +0530 Kirti Wankhede wrote: - Defined MIGRATION region type and sub-type. - Used 3 bits to define VFIO device states. Bit 0 => _RUNNING Bit 1 => _SAVING Bit 2 => _RESUMING Combination of these bits defines VFIO device's state during migration _RUNNING => Normal VFIO device running state. When its reset, it indicates _STOPPED state. when device is changed to _STOPPED, driver should stop device before write() returns. _SAVING | _RUNNING => vCPUs are running, VFIO device is running but start saving state of device i.e. pre-copy state _SAVING => vCPUs are stopped, VFIO device should be stopped, and s/should/must/ save device state,i.e. stop-n-copy state _RESUMING => VFIO device resuming state. _SAVING | _RESUMING and _RUNNING | _RESUMING => Invalid states A table might be useful here and in the uapi header to indicate valid states: I like that. | _RESUMING | _SAVING | _RUNNING | Description +---+-+--+-- | 0 |0| 0| Stopped, not saving or resuming (a) +---+-+--+-- | 0 |0| 1| Running, default state +---+-+--+-- | 0 |1| 0| Stopped, migration interface in save mode +---+-+--+-- | 0 |1| 1| Running, save mode interface, iterative +---+-+--+-- | 1 |0| 0| Stopped, migration resume interface active +---+-+--+-- | 1 |0| 1| Invalid (b) +---+-+--+-- | 1 |1| 0| Invalid (c) +---+-+--+-- | 1 |1| 1| Invalid (d) I think we need to consider whether we define (a) as generally available, for instance we might want to use it for diagnostics or a fatal error condition outside of migration. Are there hidden assumptions between state transitions here or are there specific next possible state diagrams that we need to include as well? Some kind of state-change diagram might be useful in addition to the textual description anyway. Let me try, just to make sure I understand this correctly: During User application initialization, there is one more state change: 0) 0/0/0 stop to running -> 0/0/1 0/0/0 cannot be the initial state of the device, that would imply that a device supporting this migration interface breaks backwards compatibility with all existing vfio userspace code and that code needs to learn to set the device running as part of its initialization. That's absolutely unacceptable. The initial device state must be 0/0/1. There isn't any device state for all existing vfio userspace code right now. So default its assumed to be always running. Exactly, there is no representation of device state, therefore it's assumed to be running, therefore when adding a representation of device state it must default to running. With migration support, device states are explicitly getting added. For example, in case of QEMU, while device is getting initialized, i.e. from vfio_realize(), device_state is set to 0/0/0, but not required to convey it to vendor driver. But we have a 0/0/0 state, why would we intentionally keep an internal state that's inconsistent with the device? Then with vfio_vmstate_change() notifier, device state is changed to 0/0/1 when VM/vCPU are transitioned to running, at this moment device state is conveyed to vendor driver. So vendor driver doesn't see 0/0/0 state. But the running state is the state of the device, not the VM or the vCPU. Sure we might want to stop the device if the VM/vCPU state is stopped, but we must accept that the device is running when it's opened and we shouldn't intentionally maintain inconsistent state. While resuming, for userspace, for example QEMU, device state change is from 0/0/0 to 1/0/0, vendor driver see 1/0/0 after device basic initialization is done. I don't see why this matters, all device_state transitions are written directly to the
Re: [PATCH v2 00/10] Further bitmaps improvements
On 10/22/19 7:58 AM, Vladimir Sementsov-Ogievskiy wrote: Hi! The main feature here is improvement of _next_dirty_area API, which I'm going to use then for backup / block-copy. v2: 01: just use INT64_MAX instead of adding new constant 08: add separate function nbd_extent_array_convert_to_be and converted state of NBDExtentArray, to make these things explicit, and avoid extra memdup. 09: Save part of comment for bitmap_to_extents(), add Eric's r-b Is any of this series a bug fix important to get into -rc2? Or is it safe to defer to the 5.0 timeframe? Vladimir Sementsov-Ogievskiy (10): hbitmap: assert that we don't create bitmap larger than INT64_MAX hbitmap: move hbitmap_iter_next_word to hbitmap.c hbitmap: unpublish hbitmap_iter_skip_words hbitmap: drop meta bitmaps as they are unused block/dirty-bitmap: switch _next_dirty_area and _next_zero to int64_t block/dirty-bitmap: add _next_dirty API block/dirty-bitmap: improve _next_dirty_area API nbd/server: introduce NBDExtentArray nbd/server: use bdrv_dirty_bitmap_next_dirty_area block/qcow2-bitmap: use bdrv_dirty_bitmap_next_dirty -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH] Semihost SYS_READC implementation (v4)
On Thu, 14 Nov 2019 at 18:05, Keith Packard wrote: > > Peter Maydell writes: > > > I had an idle glance at this implementation, and this: > > > >uint32_t pre = opcode_at(>base, ctx->base.pc_next - 4); > >uint32_t ebreak = opcode_at(>base, ctx->base.pc_next); > >uint32_t post = opcode_at(>base, ctx->base.pc_next + 4); > > > > (where opcode_at() is a wrapper for cpu_ldl_code()) has > > some unfortunate side effects: if the previous instruction > > is in the previous MMU page, or the following instruction > > is in the next MMU page, you might incorrectly trigger > > an exception (where QEMU will just longjmp straight out of > > the cpu_ldl_code()) if that other page isn't actually mapped > > in the guest's page table. You need to be careful not to access > > code outside the page you're actually on unless you're really > > going to execute it and are OK with it faulting. > > I can't even find the implementation of cpu_ldl_code; the qemu source > code is somewhat obscure in this area. But, longjmp'ing out of the > middle of that seems like a bad idea. It's the way QEMU works -- generally load/store operations that work on virtual addresses are expected to only return in the success case; on failure they longjmp out to cause the guest exception. (Load/stores on physical addresses generally return a memory transaction status for the caller to check and handle.) I agree that within the translation code it's a bit weird and it might be nicer for the translate.c code to explicitly handle failures to load an insn, but it would be a bit of an upheaval to try to rewrite it at this point. cpu_ldl_code() is provided by include/exec/cpu_ldst.h, incidentally (via preprocessor macros and repeated inclusion of some template .h files, which is why a grep for the function name misses it). > > Does your semihosting spec expect to have the semihosting > > call work if the sequence crosses a page boundary, the > > code is being executed by a userspace process, and one of > > the two pages has been paged out by the OS ? > > You've seen the entirety of the RISC-V semihosting spec already. For > now, perhaps we should limit RISC-V semihosting support to devices > without paging support and await a more complete spec. > > As you suggest, disallowing the sequence from crossing a page boundary > would be a simple fix, but that would require wording changes to the > spec. Yeah, I'm implicitly suggesting that a bit more thought/revision of the spec might not be a bad idea. Things that are effectively supposed to be treated like a single instruction but which can cross cacheline or page boundaries turn out to be a fertile source of implementation bugs. (The arm translate.c code has to be quite careful about handling 32-bit Thumb insns that cross pages. The x86 translate.c code is less careful and may well be buggy in this area.) thanks -- PMM
Re: [PATCH] Semihost SYS_READC implementation (v4)
Peter Maydell writes: > I had an idle glance at this implementation, and this: > >uint32_t pre = opcode_at(>base, ctx->base.pc_next - 4); >uint32_t ebreak = opcode_at(>base, ctx->base.pc_next); >uint32_t post = opcode_at(>base, ctx->base.pc_next + 4); > > (where opcode_at() is a wrapper for cpu_ldl_code()) has > some unfortunate side effects: if the previous instruction > is in the previous MMU page, or the following instruction > is in the next MMU page, you might incorrectly trigger > an exception (where QEMU will just longjmp straight out of > the cpu_ldl_code()) if that other page isn't actually mapped > in the guest's page table. You need to be careful not to access > code outside the page you're actually on unless you're really > going to execute it and are OK with it faulting. I can't even find the implementation of cpu_ldl_code; the qemu source code is somewhat obscure in this area. But, longjmp'ing out of the middle of that seems like a bad idea. > Does your semihosting spec expect to have the semihosting > call work if the sequence crosses a page boundary, the > code is being executed by a userspace process, and one of > the two pages has been paged out by the OS ? You've seen the entirety of the RISC-V semihosting spec already. For now, perhaps we should limit RISC-V semihosting support to devices without paging support and await a more complete spec. As you suggest, disallowing the sequence from crossing a page boundary would be a simple fix, but that would require wording changes to the spec. -- -keith signature.asc Description: PGP signature
Re: [PATCH] Semihost SYS_READC implementation (v4)
On Thu, 14 Nov 2019 at 17:39, Keith Packard wrote: > > Peter Maydell writes: > > > That defines the instruction sequence used to make a semihosting > > call, but not the specification of what the calls are: > > * what call numbers perform which functions > > * how arguments are passed to the call (registers? parameter > >blocks in memory? other?) > > * the semantics of each function supported (number of arguments, > >behaviour, error handling) > > > > That's really what I had in mind by the overall semihosting spec. > > There isn't anything more provided by the RISC-V foundation at this > point. I'm not sure what else they *should* provide as the goal is to > match the ARM design, which does specify all of the above. This isn't obvious if you just say "semihosting". QEMU currently provides 'semihosting' ABIs for: * arm * lm32 * m68k * mips * nios2 * xtensa m68k and nios2 provide basically the same set of calls, but all the other architectures differe from each other. "Semihosting" just means "the concept of an ABI from guest to the debugger or emulator", not a particular ABI. The ARM semihosting ABI also has a number of warts which are basically historical legacy. With a clean sheet you get to avoid some of them. (Notably you could skip the whole 'negotiate presence of extensions' business by just getting those parts right from the start. Actually specifying errno values would also be sensible and is something the ARM spec sadly does not do and can't really at this point with multiple implementations in play.) thanks -- PMM
Re: [PATCH] Semihost SYS_READC implementation (v4)
Alistair Francis writes: > This sounds like something that the platform spec should contain. I'm frankly happy with it specifying the semantics by reference to the ARM docs -- that way we can easily share existing code without concern about subtle semantic differences. The only thing that would be useful is to clarify the low-level ABI details about argument passing, but given that the ARM semihosting spec passes arguments in the standard registers, extrapolating that to RISC-V is not exactly difficult. > +Atish +Palmer who are in charge of that. > > @Keith maybe you should contact the platform spec group and ask them > to look into this? I can certainly ask to have the argument passing details clarified; I doubt that group would be interested in adopting the whole semihosting spec and publishing it separately. -- -keith signature.asc Description: PGP signature
Re: [PATCH v1] s390x: kvm-unit-tests: a PONG device for Sub Channels tests
On 2019-11-14 14:19, Cornelia Huck wrote: On Thu, 14 Nov 2019 14:02:35 +0100 Halil Pasic wrote: On Thu, 14 Nov 2019 11:38:23 +0100 Cornelia Huck wrote: On Wed, 13 Nov 2019 20:02:33 +0100 Pierre Morel wrote: ...snip... We made some different design decisions, while aiming essentially for the same. Maybe it's due to different scope, maybe not. For instance one can't test IDA with PONG, I guess. Now that I saw this again, I also recall the discussion of comparing it with the "testdev" for pci/isa. Anybody knows if these are used by kvm-unit-tests? Only by X. Regards, Pierre -- Pierre Morel IBM Lab Boeblingen
Re: [PATCH] Semihost SYS_READC implementation (v4)
Peter Maydell writes: > That defines the instruction sequence used to make a semihosting > call, but not the specification of what the calls are: > * what call numbers perform which functions > * how arguments are passed to the call (registers? parameter >blocks in memory? other?) > * the semantics of each function supported (number of arguments, >behaviour, error handling) > > That's really what I had in mind by the overall semihosting spec. There isn't anything more provided by the RISC-V foundation at this point. I'm not sure what else they *should* provide as the goal is to match the ARM design, which does specify all of the above. -- -keith signature.asc Description: PGP signature
Re: [PATCH v1] s390x: kvm-unit-tests: a PONG device for Sub Channels tests
On 2019-11-14 14:02, Halil Pasic wrote: On Thu, 14 Nov 2019 11:38:23 +0100 Cornelia Huck wrote: On Wed, 13 Nov 2019 20:02:33 +0100 Pierre Morel wrote: Minor nit for $SUBJECT: this isn't a kvm-unit-tests patch, that's just one consumer :) And subchannel is one word in s390-speak. OK, surely better for grep [..] Some questions regarding this device and its intended usage: - What are you trying to test? Basic ccw processing, or something more specific? Is there any way you can use the kvm-unit-test infrastructure to test basic processing with an existing device? I'm also curious about the big picture (what is in scope and what out of scope). Your design should be evaluated in the light of intended usage. BTW have you had a look at this abandoned patch-set of mine: https://lists.gnu.org/archive/html/qemu-devel/2017-11/msg04220.html No, but now yes. Yes, it is something similar, I can remember as you did it. We made some different design decisions, while aiming essentially for the same. Maybe it's due to different scope, maybe not. For instance one can't test IDA with PONG, I guess. No not now. Regards, Pierre -- Pierre Morel IBM Lab Boeblingen
Re: [QEMU-SECURITY] ide: fix assertion in ide_dma_cb() to prevent qemu DoS from quest
On 07.11.2019 01:05, Alexander Popov wrote: > On 06.11.2019 15:05, Michael S. Tsirkin wrote: >> Do you want to cook up a patch like this then? > > Yes, I will take this task and return with a patch. > > Thanks! I've just sent the v2 of the patch. Looking forward to your feedback. Best regards, Alexander
[PATCH v2 1/1] ide: check DMA transfer size in ide_dma_cb() to prevent qemu DoS from quests
The commit a718978ed58a from July 2015 introduced the assertion which implies that the size of successful DMA transfers handled in ide_dma_cb() should be multiple of 512 (the size of a sector). But guest systems can initiate DMA transfers that don't fit this requirement. PoC for Linux that uses SCSI_IOCTL_SEND_COMMAND to perform such an ATA command and crash qemu: #include #include #include #include #include #include #include #include #include #include #define CMD_SIZE 2048 struct scsi_ioctl_cmd_6 { unsigned int inlen; unsigned int outlen; unsigned char cmd[6]; unsigned char data[]; }; int main(void) { intptr_t fd = 0; struct scsi_ioctl_cmd_6 *cmd = NULL; cmd = malloc(CMD_SIZE); if (!cmd) { perror("[-] malloc"); return 1; } memset(cmd, 0, CMD_SIZE); cmd->inlen = 1337; cmd->cmd[0] = READ_6; fd = open("/dev/sg0", O_RDONLY); if (fd == -1) { perror("[-] opening sg"); return 1; } printf("[+] sg0 is opened\n"); printf("[.] qemu should break here:\n"); fflush(stdout); ioctl(fd, SCSI_IOCTL_SEND_COMMAND, cmd); printf("[-] qemu didn't break\n"); free(cmd); return 1; } For fixing that let's check the number of bytes prepared for the transfer by the prepare_buf() handler. If it is not a multiple of 512 then end the DMA transfer with an error. That also fixes the I/O stall in guests after a DMA transfer request for less than the size of a sector. Signed-off-by: Alexander Popov --- hw/ide/core.c | 11 +-- 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/hw/ide/core.c b/hw/ide/core.c index 754ff4dc34..85aac614f0 100644 --- a/hw/ide/core.c +++ b/hw/ide/core.c @@ -849,6 +849,7 @@ static void ide_dma_cb(void *opaque, int ret) int64_t sector_num; uint64_t offset; bool stay_active = false; +int32_t prepared = 0; if (ret == -EINVAL) { ide_dma_error(s); @@ -892,12 +893,10 @@ static void ide_dma_cb(void *opaque, int ret) n = s->nsector; s->io_buffer_index = 0; s->io_buffer_size = n * 512; -if (s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size) < 512) { -/* The PRDs were too short. Reset the Active bit, but don't raise an - * interrupt. */ -s->status = READY_STAT | SEEK_STAT; -dma_buf_commit(s, 0); -goto eot; +prepared = s->bus->dma->ops->prepare_buf(s->bus->dma, s->io_buffer_size); +if (prepared % 512) { +ide_dma_error(s); +return; } trace_ide_dma_cb(s, sector_num, n, IDE_DMA_CMD_str(s->dma_cmd)); -- 2.21.0
Re: Convert VMDK to RAW
On 14.11.19 17:12, janine.schnei...@fau.de wrote: > Ladies and Gentlemen, > > > > I am a PhD student at the Friedrich-Alexander-University > Erlangen-Nuremberg in Bavaria, Germany and am currently working on a > forensic reconstruction tool. The tool can be used to analyze physical > and virtual hard disks and to reconstruct files. I would now like to > extend the tool so that it is able to analyze VMDK files and convert > them to raw. Unfortunately I have not been able to understand how to > correctly unpack and assemble VMDK containers. Since qemu is able to > convert VMDK to raw, I wanted to ask you if you could explain to me how > to put the grains together? Hi, I’m not quite sure what you mean by a “VMDK container”. VMDK disk images can consist of multiple files that are linked together by a descriptor file. In theory all you need to do is tell qemu-img to convert that descriptor file into a raw image. For example: (Sorry, I don’t know much about VMware, so all I can do is use qemu tools to demonstrate) $ qemu-img create -f vmdk -o subformat=twoGbMaxExtentSparse foo.vmdk 4G Formatting 'foo.vmdk', fmt=vmdk size=4294967296 compat6=off hwversion=undefined subformat=twoGbMaxExtentSparse $ ls foo-s001.vmdk foo-s002.vmdk foo.vmdk $ In this example, foo.vmdk is the descriptor file and it points to the other two (data) files: $ cat foo.vmdk # Disk DescriptorFile version=1 CID=6d8d65ed parentCID= createType="twoGbMaxExtentSparse" # Extent description RW 4194304 SPARSE "foo-s001.vmdk" RW 4194304 SPARSE "foo-s002.vmdk" # The Disk Data Base #DDB ddb.virtualHWVersion = "4" ddb.geometry.cylinders = "8322" ddb.geometry.heads = "16" ddb.geometry.sectors = "63" ddb.adapterType = "ide" $ So to convert this VMDK disk image to a raw image, you’d simply do this: $ qemu-img convert -f vmdk -O raw -p foo.vmdk foo.img (100.00/100%) $ Max signature.asc Description: OpenPGP digital signature
Re: [PATCH for-4.2 v2 3/3] block/file-posix: Let post-EOF fallocate serialize
On 14.11.19 18:15, Max Reitz wrote: > On 14.11.19 17:27, Christoph Hellwig wrote: >> On Fri, Nov 01, 2019 at 04:25:10PM +0100, Max Reitz wrote: >>> The XFS kernel driver has a bug that may cause data corruption for qcow2 >>> images as of qemu commit c8bb23cbdbe32f. We can work around it by >>> treating post-EOF fallocates as serializing up until infinity (INT64_MAX >>> in practice). >> >> This has been fixed in the kernel a while ago. I don't think it makes >> sense to work around it in qemu. > > Has it? It was my understanding that this is fixed by > https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git/commit/?h=for-next=249bd9087a5264d2b8a974081870e2e27671b4dcwhich Sorry, broke the link. Let me try again: https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git/commit/?h=for-next=249bd9087a5264d2b8a974081870e2e27671b4dc Max > has been merged only very recently and is on track to be part of Linux > 5.5, as far as I understand. > > Max > signature.asc Description: OpenPGP digital signature
Re: [PATCH RFC] virtio-pci: disable vring processing when bus-mastering is disabled
Quoting Michael S. Tsirkin (2019-11-14 03:12:00) > On Thu, Nov 14, 2019 at 04:10:36AM -0500, Michael S. Tsirkin wrote: > > On Wed, Nov 13, 2019 at 07:07:36PM -0600, Michael Roth wrote: > > > Quoting Michael S. Tsirkin (2019-11-13 04:09:02) > > > > On Tue, Nov 12, 2019 at 11:43:01PM -0600, Michael Roth wrote: > > > > > Currently the SLOF firmware for pseries guests will disable/re-enable > > > > > a PCI device multiple times via IO/MEM/MASTER bits of PCI_COMMAND > > > > > register after the initial probe/feature negotiation, as it tends to > > > > > work with a single device at a time at various stages like probing > > > > > and running block/network bootloaders without doing a full reset > > > > > in-between. > > > > > > > > > > In QEMU, when PCI_COMMAND_MASTER is disabled we disable the > > > > > corresponding IOMMU memory region, so DMA accesses (including to vring > > > > > fields like idx/flags) will no longer undergo the necessary > > > > > translation. Normally we wouldn't expect this to happen since it would > > > > > be misbehavior on the driver side to continue driving DMA requests. > > > > > > > > > > However, in the case of pseries, with iommu_platform=on, we trigger > > > > > the > > > > > following sequence when tearing down the virtio-blk dataplane > > > > > ioeventfd > > > > > in response to the guest unsetting PCI_COMMAND_MASTER: > > > > > > > > > > #2 0x55922651 in virtqueue_map_desc > > > > > (vdev=vdev@entry=0x56dbcfb0, > > > > > p_num_sg=p_num_sg@entry=0x7fffe657e1a8, > > > > > addr=addr@entry=0x7fffe657e240, iov=iov@entry=0x7fffe6580240, > > > > > max_num_sg=max_num_sg@entry=1024, is_write=is_write@entry=false, > > > > > pa=0, sz=0) > > > > > at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:757 > > > > > #3 0x55922a89 in virtqueue_pop > > > > > (vq=vq@entry=0x56dc8660, sz=sz@entry=184) > > > > > at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:950 > > > > > #4 0x558d3eca in virtio_blk_get_request > > > > > (vq=0x56dc8660, s=0x56dbcfb0) > > > > > at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:255 > > > > > #5 0x558d3eca in virtio_blk_handle_vq (s=0x56dbcfb0, > > > > > vq=0x56dc8660) > > > > > at /home/mdroth/w/qemu.git/hw/block/virtio-blk.c:776 > > > > > #6 0x5591dd66 in virtio_queue_notify_aio_vq > > > > > (vq=vq@entry=0x56dc8660) > > > > > at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1550 > > > > > #7 0x5591ecef in virtio_queue_notify_aio_vq > > > > > (vq=0x56dc8660) > > > > > at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:1546 > > > > > #8 0x5591ecef in virtio_queue_host_notifier_aio_poll > > > > > (opaque=0x56dc86c8) > > > > > at /home/mdroth/w/qemu.git/hw/virtio/virtio.c:2527 > > > > > #9 0x55d02164 in run_poll_handlers_once > > > > > (ctx=ctx@entry=0x5688bfc0, timeout=timeout@entry=0x7fffe65844a8) > > > > > at /home/mdroth/w/qemu.git/util/aio-posix.c:520 > > > > > #10 0x55d02d1b in try_poll_mode (timeout=0x7fffe65844a8, > > > > > ctx=0x5688bfc0) > > > > > at /home/mdroth/w/qemu.git/util/aio-posix.c:607 > > > > > #11 0x55d02d1b in aio_poll (ctx=ctx@entry=0x5688bfc0, > > > > > blocking=blocking@entry=true) > > > > > at /home/mdroth/w/qemu.git/util/aio-posix.c:639 > > > > > #12 0x55d0004d in aio_wait_bh_oneshot (ctx=0x5688bfc0, > > > > > cb=cb@entry=0x558d5130 , > > > > > opaque=opaque@entry=0x56de86f0) > > > > > at /home/mdroth/w/qemu.git/util/aio-wait.c:71 > > > > > #13 0x558d59bf in virtio_blk_data_plane_stop > > > > > (vdev=) > > > > > at /home/mdroth/w/qemu.git/hw/block/dataplane/virtio-blk.c:288 > > > > > #14 0x55b906a1 in virtio_bus_stop_ioeventfd > > > > > (bus=bus@entry=0x56dbcf38) > > > > > at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:245 > > > > > #15 0x55b90dbb in virtio_bus_stop_ioeventfd > > > > > (bus=bus@entry=0x56dbcf38) > > > > > at /home/mdroth/w/qemu.git/hw/virtio/virtio-bus.c:237 > > > > > #16 0x55b92a8e in virtio_pci_stop_ioeventfd > > > > > (proxy=0x56db4e40) > > > > > at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:292 > > > > > #17 0x55b92a8e in virtio_write_config > > > > > (pci_dev=0x56db4e40, address=, val=1048832, > > > > > len=) > > > > > at /home/mdroth/w/qemu.git/hw/virtio/virtio-pci.c:613 > > > > > > > > > > I.e. the calling code is only scheduling a one-shot BH for > > > > > virtio_blk_data_plane_stop_bh, but somehow we end up trying to process > > > > > an additional virtqueue entry before we get there. This is likely due > > > > > to the following check in virtio_queue_host_notifier_aio_poll: > > > > > > > > > > static bool virtio_queue_host_notifier_aio_poll(void *opaque) > > > > > { > > > > > EventNotifier *n = opaque; > > > > > VirtQueue *vq = container_of(n,
Re: [PATCH for-4.2 v2 3/3] block/file-posix: Let post-EOF fallocate serialize
On 14.11.19 17:27, Christoph Hellwig wrote: > On Fri, Nov 01, 2019 at 04:25:10PM +0100, Max Reitz wrote: >> The XFS kernel driver has a bug that may cause data corruption for qcow2 >> images as of qemu commit c8bb23cbdbe32f. We can work around it by >> treating post-EOF fallocates as serializing up until infinity (INT64_MAX >> in practice). > > This has been fixed in the kernel a while ago. I don't think it makes > sense to work around it in qemu. Has it? It was my understanding that this is fixed by https://git.kernel.org/pub/scm/fs/xfs/xfs-linux.git/commit/?h=for-next=249bd9087a5264d2b8a974081870e2e27671b4dcwhich has been merged only very recently and is on track to be part of Linux 5.5, as far as I understand. Max signature.asc Description: OpenPGP digital signature
Re: [PATCH v1] s390x: kvm-unit-tests: a PONG device for Sub Channels tests
On 2019-11-14 13:33, Thomas Huth wrote: On 14/11/2019 11.38, Cornelia Huck wrote: On Wed, 13 Nov 2019 20:02:33 +0100 Pierre Morel wrote: Minor nit for $SUBJECT: this isn't a kvm-unit-tests patch, that's just one consumer :) The PONG device accept two commands: PONG_READ and PONG_WRITE which allow to read from and write to an internal buffer of 1024 bytes. The QEMU device is named ccw-pong. Signed-off-by: Pierre Morel --- hw/s390x/Makefile.objs | 1 + hw/s390x/ccw-pong.c | 186 include/hw/s390x/pong.h | 47 3 files changed, 234 insertions(+) create mode 100644 hw/s390x/ccw-pong.c create mode 100644 include/hw/s390x/pong.h diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs index ee91152..3a83438 100644 --- a/hw/s390x/Makefile.objs +++ b/hw/s390x/Makefile.objs @@ -32,6 +32,7 @@ obj-$(CONFIG_KVM) += tod-kvm.o obj-$(CONFIG_KVM) += s390-skeys-kvm.o obj-$(CONFIG_KVM) += s390-stattrib-kvm.o s390-mchk.o obj-y += s390-ccw.o +obj-y += ccw-pong.o Not sure if unconditionally introducing a test device is a good idea. This definitely needs a CONFIG switch (which can be "y" by default, but still we should provide a possibility to disable it) yes, clearly obj-y += ap-device.o obj-y += ap-bridge.o obj-y += s390-sei.o diff --git a/hw/s390x/ccw-pong.c b/hw/s390x/ccw-pong.c new file mode 100644 index 000..e7439d5 --- /dev/null +++ b/hw/s390x/ccw-pong.c @@ -0,0 +1,186 @@ +/* + * CCW PING-PONG Please add a short description here what this device is all about. yes + * Copyright 2019 IBM Corp. + * Author(s): Pierre Morel + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "cpu.h" +#include "exec/address-spaces.h" +#include "hw/s390x/css.h" +#include "hw/s390x/css-bridge.h" +#include "hw/qdev-properties.h" +#include "hw/s390x/pong.h" + +#define PONG_BUF_SIZE 0x1000 +static char buf[PONG_BUF_SIZE] = "Hello world\n"; + +static inline int pong_rw(CCW1 *ccw, char *p, int len, bool dir) +{ +int ret; + +ret = address_space_rw(_space_memory, ccw->cda, + MEMTXATTRS_UNSPECIFIED, + (unsigned char *)buf, len, dir); + +return (ret == MEMTX_OK) ? -EIO : 0; If return code was OK, then you return an EIO error? That looks weird? Totally weird. it is of course the oposite. This explain the comment from Connie on the unit check. +} + +/* Handle READ ccw commands from guest */ ...snip... + +static Property pong_ccw_properties[] = { +DEFINE_PROP_END_OF_LIST(), +}; + +static void pong_ccw_class_init(ObjectClass *klass, void *data) +{ +DeviceClass *dc = DEVICE_CLASS(klass); + +dc->props = pong_ccw_properties; As long as there are no properties, I think you can simply drop that line. Thomas Yes, right. Thanks for the comments, Regards, Pierre -- Pierre Morel IBM Lab Boeblingen
Convert VMDK to RAW
Ladies and Gentlemen, I am a PhD student at the Friedrich-Alexander-University Erlangen-Nuremberg in Bavaria, Germany and am currently working on a forensic reconstruction tool. The tool can be used to analyze physical and virtual hard disks and to reconstruct files. I would now like to extend the tool so that it is able to analyze VMDK files and convert them to raw. Unfortunately I have not been able to understand how to correctly unpack and assemble VMDK containers. Since qemu is able to convert VMDK to raw, I wanted to ask you if you could explain to me how to put the grains together? Many thanks and greetings Janine Schneider
Re: [PATCH v1] s390x: kvm-unit-tests: a PONG device for Sub Channels tests
On 2019-11-14 11:38, Cornelia Huck wrote: On Wed, 13 Nov 2019 20:02:33 +0100 Pierre Morel wrote: Minor nit for $SUBJECT: this isn't a kvm-unit-tests patch, that's just one consumer :) yes, right. The PONG device accept two commands: PONG_READ and PONG_WRITE which allow to read from and write to an internal buffer of 1024 bytes. The QEMU device is named ccw-pong. Signed-off-by: Pierre Morel --- hw/s390x/Makefile.objs | 1 + hw/s390x/ccw-pong.c | 186 include/hw/s390x/pong.h | 47 3 files changed, 234 insertions(+) create mode 100644 hw/s390x/ccw-pong.c create mode 100644 include/hw/s390x/pong.h diff --git a/hw/s390x/Makefile.objs b/hw/s390x/Makefile.objs index ee91152..3a83438 100644 --- a/hw/s390x/Makefile.objs +++ b/hw/s390x/Makefile.objs @@ -32,6 +32,7 @@ obj-$(CONFIG_KVM) += tod-kvm.o obj-$(CONFIG_KVM) += s390-skeys-kvm.o obj-$(CONFIG_KVM) += s390-stattrib-kvm.o s390-mchk.o obj-y += s390-ccw.o +obj-y += ccw-pong.o Not sure if unconditionally introducing a test device is a good idea. sure not. obj-y += ap-device.o obj-y += ap-bridge.o obj-y += s390-sei.o diff --git a/hw/s390x/ccw-pong.c b/hw/s390x/ccw-pong.c new file mode 100644 index 000..e7439d5 --- /dev/null +++ b/hw/s390x/ccw-pong.c @@ -0,0 +1,186 @@ +/* + * CCW PING-PONG + * + * Copyright 2019 IBM Corp. + * Author(s): Pierre Morel + * + * This work is licensed under the terms of the GNU GPL, version 2 or (at + * your option) any later version. See the COPYING file in the top-level + * directory. + */ + +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/module.h" +#include "cpu.h" +#include "exec/address-spaces.h" +#include "hw/s390x/css.h" +#include "hw/s390x/css-bridge.h" +#include "hw/qdev-properties.h" +#include "hw/s390x/pong.h" + +#define PONG_BUF_SIZE 0x1000 +static char buf[PONG_BUF_SIZE] = "Hello world\n"; + +static inline int pong_rw(CCW1 *ccw, char *p, int len, bool dir) +{ +int ret; + +ret = address_space_rw(_space_memory, ccw->cda, + MEMTXATTRS_UNSPECIFIED, + (unsigned char *)buf, len, dir); + +return (ret == MEMTX_OK) ? -EIO : 0; +} + +/* Handle READ ccw commands from guest */ +static int handle_payload_read(CcwPONGDevice *dev, CCW1 *ccw) +{ +CcwDevice *ccw_dev = CCW_DEVICE(dev); +int len; + +if (!ccw->cda) { +return -EFAULT; +} + +if (ccw->count > PONG_BUF_SIZE) { +len = PONG_BUF_SIZE; +ccw_dev->sch->curr_status.scsw.count = ccw->count - PONG_BUF_SIZE; +} else { +len = ccw->count; +ccw_dev->sch->curr_status.scsw.count = 0; +} + +return pong_rw(ccw, buf, len, 1); +} + +/* + * Handle WRITE ccw commands to write data to client + * The SCSW count is set to the number of bytes not transfered. + */ +static int handle_payload_write(CcwPONGDevice *dev, CCW1 *ccw) +{ +CcwDevice *ccw_dev = CCW_DEVICE(dev); +int len; + +if (!ccw->cda) { +ccw_dev->sch->curr_status.scsw.count = ccw->count; +return -EFAULT; +} + +if (ccw->count > PONG_BUF_SIZE) { +len = PONG_BUF_SIZE; +ccw_dev->sch->curr_status.scsw.count = ccw->count - PONG_BUF_SIZE; +} else { +len = ccw->count; +ccw_dev->sch->curr_status.scsw.count = 0; +} + +return pong_rw(ccw, buf, len, 0); Can you please use the dstream infrastructure for read/write handling? You also seem to miss some basic checks e.g. for short reads/writes with and without SLI set. OK, +} + +static int pong_ccw_cb(SubchDev *sch, CCW1 ccw) +{ +int rc = 0; +CcwPONGDevice *dev = sch->driver_data; + +switch (ccw.cmd_code) { +case PONG_WRITE: +rc = handle_payload_write(dev, ); +break; +case PONG_READ: +rc = handle_payload_read(dev, ); +break; +default: +rc = -ENOSYS; +break; +} + +if (rc == -EIO) { +/* I/O error, specific devices generate specific conditions */ +SCHIB *schib = >curr_status; + +sch->curr_status.scsw.dstat = SCSW_DSTAT_UNIT_CHECK; +sch->sense_data[0] = 0x40;/* intervention-req */ This is really odd. If it succeeds, you generate a unit check with intervention required? Confused. At the very least, this requires some description as to how this device is supposed to interact with the driver. The unit check is only done in case of I/O error. I thought it was right, since it should never happen. Yes it needs some documentation. +schib->scsw.ctrl &= ~SCSW_ACTL_START_PEND; +schib->scsw.ctrl &= ~SCSW_CTRL_MASK_STCTL; +schib->scsw.ctrl |= SCSW_STCTL_PRIMARY | SCSW_STCTL_SECONDARY | + SCSW_STCTL_ALERT | SCSW_STCTL_STATUS_PEND; +} +return rc; +} + +static void pong_ccw_realize(DeviceState *ds, Error **errp) +{ +uint16_t chpid; +CcwPONGDevice *dev = CCW_PONG(ds); +CcwDevice *cdev =
Re: [RFC PATCH v2 20/26] qcow2: Update L2 bitmap in qcow2_alloc_cluster_link_l2()
On Tue 05 Nov 2019 12:43:16 PM CET, Max Reitz wrote: > Speaking of handle_copied(); both elements of Qcow2COWRegion are of > type unsigned. handle_copied() doesn’t look like it takes any > precautions to limit the range to even UINT_MAX (and it should > probably limit it to INT_MAX). Or rather, both handle_copied() and handle_alloc() should probably limit it to BDRV_REQUEST_MAX_BYTES. Berto
Re: [PATCH for-4.2 v2 3/3] block/file-posix: Let post-EOF fallocate serialize
On Fri, Nov 01, 2019 at 04:25:10PM +0100, Max Reitz wrote: > The XFS kernel driver has a bug that may cause data corruption for qcow2 > images as of qemu commit c8bb23cbdbe32f. We can work around it by > treating post-EOF fallocates as serializing up until infinity (INT64_MAX > in practice). This has been fixed in the kernel a while ago. I don't think it makes sense to work around it in qemu.
Re: [RFC PATCH v2 16/26] qcow2: Add subcluster support to discard_in_l2_slice()
On 14.11.19 16:33, Alberto Garcia wrote: > On Mon 04 Nov 2019 04:07:35 PM CET, Max Reitz wrote: >>> /* First remove L2 entries */ >>> qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); >>> -if (!full_discard && s->qcow_version >= 3) { >>> +if (has_subclusters(s)) { >>> +set_l2_entry(s, l2_slice, l2_index + i, 0); >>> +set_l2_bitmap(s, l2_slice, l2_index + i, >>> + full_discard ? 0 : QCOW_L2_BITMAP_ALL_ZEROES); >> >> But only for !full_discard, right? > > Hence the conditional operator in my patch, maybe you didn't see it? Oops, yep, I was just hung up on the if conditional. Max signature.asc Description: OpenPGP digital signature
Re: [PATCH] Semihost SYS_READC implementation (v4)
On Fri, 25 Oct 2019 at 20:15, Keith Packard wrote: > There seems to be convergence on a pretty simple interface which uses > ebreak surrounded by a couple of specific no-ops: > > slli x0, x0, 0x1f > ebreak > srai x0, x0, 0x7 > > There are implementations in rust and openocd, and I've got one for > picolibc. The risc-v semihosting code is sitting on a branch in my repo > on github: > > https://github.com/keith-packard/qemu/tree/riscv-semihost I had an idle glance at this implementation, and this: uint32_t pre = opcode_at(>base, ctx->base.pc_next - 4); uint32_t ebreak = opcode_at(>base, ctx->base.pc_next); uint32_t post = opcode_at(>base, ctx->base.pc_next + 4); (where opcode_at() is a wrapper for cpu_ldl_code()) has some unfortunate side effects: if the previous instruction is in the previous MMU page, or the following instruction is in the next MMU page, you might incorrectly trigger an exception (where QEMU will just longjmp straight out of the cpu_ldl_code()) if that other page isn't actually mapped in the guest's page table. You need to be careful not to access code outside the page you're actually on unless you're really going to execute it and are OK with it faulting. I'm not sure what the best way on QEMU to identify this kind of multiple-instruction-sequence is -- Richard might have an idea. One approach would be to always take an exception (or call a helper function) for the 'ebreak', and then distinguish semihosting from other kinds of ebreak by doing the load of the preceding/succeeding instructions at runtime rather than translate time. Does your semihosting spec expect to have the semihosting call work if the sequence crosses a page boundary, the code is being executed by a userspace process, and one of the two pages has been paged out by the OS ? thanks -- PMM
Re: [Qemu-devel] [PATCH v4 03/14] qapi: Introduce default values for struct members
Less than thorough review, because I expect the necessary rebase will require a bit of rewriting here and there. Max Reitz writes: > With this change, it is possible to give default values for struct > members, as follows: > > What you had to do so far: > > # @member: Some description, defaults to 42. > { 'struct': 'Foo', > 'data': { '*member': 'int' } } > > What you can do now: > > { 'struct': 'Foo', > 'data': { '*member': { 'type': 'int', 'default': 42 } } > > On the C side, this change would remove Foo.has_member, because > Foo.member is always valid now. The input visitor deals with setting > it. (Naturally, this means that such defaults are useful only for input > parameters.) > > At least three things are left unimplemented: > > First, support for alternate data types. This is because supporting > them would mean having to allocate the object in the input visitor, and > then potentially going through multiple levels of nested types. In any > case, it would have been difficult and I do not think there is need for > such support at this point. I don't mind restricting the 'default' feature to uses we actually have, at least initially. I'm afraid I don't fully understand the difficulties you describe, but I guess that's okay. > Second, support for null. The most important reason for this is that > introspection already uses "'default': null" for "no default, but this > field is optional". The second reason is that without support for > alternate data types, there is not really a point in supporting null. Also, commit 9d55380b5a "qapi: Remove null from schema language" :) > Third, full support for default lists. This has a similar reason to the > lack of support for alternate data types: Allocating a default list is > not trivial -- unless the list is empty, which is exactly what we have > support for. Your commit message says "for struct members". What about union members? Cases: * Flat union 'base' members: 'base' is a a struct, possibly implicit. Do defaults work in implicit bases, like BlockdevOption's? * Flat union branch members: these are always struct types, so there's nothing for me to ask. I think. * Simple union branch members: these are each wrapped in an implicit struct type. Do defaults work? I'd be totally fine with "nope, not implemented, not going to implement it" here. > Signed-off-by: Max Reitz > --- > qapi/introspect.json | 9 +- > scripts/qapi/commands.py | 2 +- > scripts/qapi/common.py | 167 +++-- > scripts/qapi/doc.py| 20 - > scripts/qapi/introspect.py | 2 +- > scripts/qapi/types.py | 2 +- > scripts/qapi/visit.py | 38 - > 7 files changed, 217 insertions(+), 23 deletions(-) Missing: docs/devel/qapi-code-gen.txt update. > > diff --git a/qapi/introspect.json b/qapi/introspect.json > index 1843c1cb17..db703135f9 100644 > --- a/qapi/introspect.json > +++ b/qapi/introspect.json > @@ -198,11 +198,10 @@ > # > # @default: default when used as command parameter. > # If absent, the parameter is mandatory. > -# If present, the value must be null. The parameter is > -# optional, and behavior when it's missing is not specified > -# here. > -# Future extension: if present and non-null, the parameter > -# is optional, and defaults to this value. > +# If present and null, the parameter is optional, and > +# behavior when it's missing is not specified here. > +# If present and non-null, the parameter is optional, and > +# defaults to this value. > # > # Since: 2.5 > ## > diff --git a/scripts/qapi/commands.py b/scripts/qapi/commands.py > index b929e07be4..6c407cd4ba 100644 > --- a/scripts/qapi/commands.py > +++ b/scripts/qapi/commands.py > @@ -35,7 +35,7 @@ def gen_call(name, arg_type, boxed, ret_type): > elif arg_type: > assert not arg_type.variants > for memb in arg_type.members: > -if memb.optional: > +if memb.optional and memb.default is None: > argstr += 'arg.has_%s, ' % c_name(memb.name) > argstr += 'arg.%s, ' % c_name(memb.name) > > diff --git a/scripts/qapi/common.py b/scripts/qapi/common.py > index c6754a5856..8c57d0c67a 100644 > --- a/scripts/qapi/common.py > +++ b/scripts/qapi/common.py > @@ -14,6 +14,7 @@ > from __future__ import print_function > from contextlib import contextmanager > import errno > +import math > import os > import re > import string > @@ -800,6 +801,136 @@ def check_if(expr, info): > check_if_str(ifcond, info) > > > +def check_value_str(info, value): > +return 'g_strdup(%s)' % to_c_string(value) if type(value) is str else > False What's wrong with isinstance(value, str)? I'm a happy user of ternaries myself, but this one results in a rather long line. Easier to read, I think: if isinstance(value,
Re: [RFC 0/4] POC: Generating realistic block errors
On 9/20/19 12:28 PM, Tony Asleson wrote: > On 9/20/19 4:22 AM, Stefan Hajnoczi wrote: >> blkdebug is purely at the QEMU block layer level. It is not aware of >> storage controller-specific error information or features. If you want >> to inject NVMe- or SCSI-specific errors that make no sense in QEMU's >> block layer, then trying to do it in blkdebug becomes a layering >> violation. This justifies adding a new error injection feature directly >> into AHCI, virtio-scsi, NVMe, etc devices. > > Good discussion point... > > In my opening use case for this POC I'm generically trying to create an > unrecoverable media error for a specific sector. For each of the > different device types it's different on how that error is conveyed and > the associated data in transfer. > I would like to get some additional clarification on this point. Should I be investing more time integrating my proposed functionality into blkdebug or other? Sorry for the long response time, got sidetracked with other stuff. Thanks, Tony
Re: [PATCH] Semihost SYS_READC implementation (v4)
On Mon, Nov 11, 2019 at 6:51 AM Peter Maydell wrote: > > On Tue, 5 Nov 2019 at 05:10, Keith Packard wrote: > > > > Peter Maydell writes: > > > > > I'm going to push for somebody actually writing out a > > > document and putting it somewhere that we can point to > > > and say "that's the authoritative spec", please... > > > it doesn't have to be a big formal thing, but I do > > > think you want it written down, because the whole point > > > is for multiple implementations and users to interoperate. > > > > That happened in June -- I was just looking at the wrong version of the > > spec. In the current version, which can be found here: > > > > https://riscv.org/specifications/ > > > >The RISC-V Instruction Set Manual > >Volume I: Unprivileged ISA > > Document Version 20190608-Base-Ratified > > > > Section 2.8 says: > > > > Another use of EBREAK is to support “semihosting”, where the > > execution environment includes a debugger that can provide > > services over an alternate system call interface built around > > the EBREAK instruction. Because the RISC-V base ISA does not > > provide more than one EBREAK instruction, RISC-V semihosting > > uses a special sequence of instructions to distinguish a > > semihosting EBREAK from a debugger inserted EBREAK. > > > > slli x0, x0, 0x1f # Entry NOP > > ebreak # Break to debugger > > srai x0, x0, 7 # NOP encoding the semihosting call > > number 7 > > > > Note that these three instructions must be 32-bit-wide > > instructions, i.e., they mustn’t be among the compressed 16-bit > > instructions described in Chapter 16. > > > > The shift NOP instructions are still considered available for > > use as HINTS. > > > > Semihosting is a form of service call and would be more > > naturally encoded as an ECALL using an existing ABI, but this > > would require the debugger to be able to intercept ECALLs, which > > is a newer addition to the debug standard. We intend to move > > over to using ECALLs with a standard ABI, in which case, > > semihosting can share a service ABI with an existing standard. > > > > We note that ARM processors have also moved to using SVC instead > > of BKPT for semihosting calls in newer designs. > > That defines the instruction sequence used to make a semihosting > call, but not the specification of what the calls are: > * what call numbers perform which functions > * how arguments are passed to the call (registers? parameter >blocks in memory? other?) > * the semantics of each function supported (number of arguments, >behaviour, error handling) > > That's really what I had in mind by the overall semihosting spec. This sounds like something that the platform spec should contain. +Atish +Palmer who are in charge of that. @Keith maybe you should contact the platform spec group and ask them to look into this? Alistair > > PS: the parenthetical about ARM semihosting at the bottom of > the text you quote is wrong, incidentally. The traditional insn > for semihosting on A-profile devices has always been SWI/SVC; it > is BKPT only on M-profile devices; and the latest revision of the > semihosting spec recommends the HLT instruction for both A- and M-. > > thanks > -- PMM >
Re: [RFC PATCH v2 18/26] qcow2: Add subcluster support to expand_zero_clusters_in_l1()
On Tue 05 Nov 2019 12:05:02 PM CET, Max Reitz wrote: >> @@ -2102,6 +2103,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState >> *bs, uint64_t *l1_table, >> } else { >> set_l2_entry(s, l2_slice, j, offset); >> } >> +set_l2_bitmap(s, l2_slice, j, QCOW_L2_BITMAP_ALL_ALLOC); >> l2_dirty = true; >> } > > Technically this isn’t needed because this function is only called > when downgrading v3 to v2 images (which can’t have subclusters), but > of course it won’t hurt. Right, but we need to change the function anyway to either do this or assert that there are no subclusters. I prefer to do this because it's trivial but I won't oppose if someone prefers the alternative. Berto
Re: [PATCH] virtio: fix IO request length in virtio SCSI/block #PSBM-78839
On 13.11.2019 16:18, Michael S. Tsirkin wrote: > On Wed, Nov 13, 2019 at 12:38:48PM +, Denis Plotnikov wrote: >> >> On 06.11.2019 15:03, Michael S. Tsirkin wrote: >>> On Thu, Oct 24, 2019 at 11:34:34AM +, Denis Lunev wrote: On 10/24/19 12:28 AM, Michael S. Tsirkin wrote: > On Fri, Oct 18, 2019 at 02:55:47PM +0300, Denis Plotnikov wrote: >> From: "Denis V. Lunev" >> >> Linux guests submit IO requests no longer than PAGE_SIZE * max_seg >> field reported by SCSI controler. Thus typical sequential read with >> 1 MB size results in the following pattern of the IO from the guest: >> 8,16 115754 2.766095122 2071 D R 2095104 + 1008 [dd] >> 8,16 115755 2.766108785 2071 D R 2096112 + 1008 [dd] >> 8,16 115756 2.766113486 2071 D R 2097120 + 32 [dd] >> 8,16 115757 2.767668961 0 C R 2095104 + 1008 [0] >> 8,16 115758 2.768534315 0 C R 2096112 + 1008 [0] >> 8,16 115759 2.768539782 0 C R 2097120 + 32 [0] >> The IO was generated by >> dd if=/dev/sda of=/dev/null bs=1024 iflag=direct >> >> This effectively means that on rotational disks we will observe 3 IOPS >> for each 2 MBs processed. This definitely negatively affects both >> guest and host IO performance. >> >> The cure is relatively simple - we should report lengthy scatter-gather >> ability of the SCSI controller. Fortunately the situation here is very >> good. VirtIO transport layer can accomodate 1024 items in one request >> while we are using only 128. This situation is present since almost >> very beginning. 2 items are dedicated for request metadata thus we >> should publish VIRTQUEUE_MAX_SIZE - 2 as max_seg. >> >> The following pattern is observed after the patch: >> 8,16 1 9921 2.662721340 2063 D R 2095104 + 1024 [dd] >> 8,16 1 9922 2.662737585 2063 D R 2096128 + 1024 [dd] >> 8,16 1 9923 2.665188167 0 C R 2095104 + 1024 [0] >> 8,16 1 9924 2.665198777 0 C R 2096128 + 1024 [0] >> which is much better. >> >> The dark side of this patch is that we are tweaking guest visible >> parameter, though this should be relatively safe as above transport >> layer support is present in QEMU/host Linux for a very long time. >> The patch adds configurable property for VirtIO SCSI with a new default >> and hardcode option for VirtBlock which does not provide good >> configurable framework. >> >> Unfortunately the commit can not be applied as is. For the real cure we >> need guest to be fixed to accomodate that queue length, which is done >> only in the latest 4.14 kernel. Thus we are going to expose the property >> and tweak it on machine type level. >> >> The problem with the old kernels is that they have >> max_segments <= virtqueue_size restriction which cause the guest >> crashing in the case of violation. > This isn't just in the guests: virtio spec also seems to imply this, > or at least be vague on this point. > > So I think it'll need a feature bit. > Doing that in a safe way will also allow being compatible with old guests. > > The only downside is it's a bit more work as we need to > spec this out and add guest support. > >> To fix the case described above in the old kernels we can increase >> virtqueue_size to 256 and max_segments to 254. The pitfall here is >> that seabios allows the virtqueue_size-s < 128, however, the seabios >> patch extending that value to 256 is pending. > And the fix here is just to limit large vq size to virtio 1.0. > In that mode it's fine I think: > > > /* check if the queue is available */ > if (vp->use_modern) { > num = vp_read(>common, virtio_pci_common_cfg, queue_size); > if (num > MAX_QUEUE_NUM) { > vp_write(>common, virtio_pci_common_cfg, queue_size, > MAX_QUEUE_NUM); > num = vp_read(>common, virtio_pci_common_cfg, > queue_size); > } > } else { > num = vp_read(>legacy, virtio_pci_legacy, queue_num); > } you mean to put the code like this into virtio_pci_realize() inside QEMU? If no, can you pls clarify which component should be touched. Den >>> I mean: >>>- add an API to change the default queue size >>>- add a validate features callback, in there check and for modern >>> flag set in features increase the queue size >>> >>> maybe all this is too much work, we could block this >>> for transitional devices, but your patch does not do it, >>> you need to check that legacy is enabled not that modern >>> is not disabled. >> To develop the idea of how to adjust queue size further I'd
Re: [RFC PATCH v2 16/26] qcow2: Add subcluster support to discard_in_l2_slice()
On Mon 04 Nov 2019 04:07:35 PM CET, Max Reitz wrote: >> /* First remove L2 entries */ >> qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); >> -if (!full_discard && s->qcow_version >= 3) { >> +if (has_subclusters(s)) { >> +set_l2_entry(s, l2_slice, l2_index + i, 0); >> +set_l2_bitmap(s, l2_slice, l2_index + i, >> + full_discard ? 0 : QCOW_L2_BITMAP_ALL_ZEROES); > > But only for !full_discard, right? Hence the conditional operator in my patch, maybe you didn't see it? Berto
Re: [RFC PATCH v2 15/26] qcow2: Add subcluster support to zero_in_l2_slice()
On Mon 04 Nov 2019 04:10:58 PM CET, Max Reitz wrote: >>> qcow2_cache_entry_mark_dirty(s->l2_table_cache, l2_slice); >>> if (cluster_type == QCOW2_CLUSTER_COMPRESSED || unmap) { >>> -set_l2_entry(s, l2_slice, l2_index + i, QCOW_OFLAG_ZERO); >>> qcow2_free_any_clusters(bs, old_offset, 1, >>> QCOW2_DISCARD_REQUEST); >> >> It feels wrong to me to free the cluster before updating the L2 >> entry. > > (Although it’s pre-existing, as set_l2_entry() is just an in-cache > operation anyway :-/) Yes, I think that if you want to do it afterwards you need to add another loop after the qcow2_cache_put() call. Berto
Re: [PATCH] MAINTAINERS: add more bitmap-related to Dirty Bitmaps section
On 10/26/19 11:56 AM, Vladimir Sementsov-Ogievskiy wrote: Let's add bitmaps persistence qcow2 feature and postcopy bitmaps migration to Dirty Bitmaps section. Signed-off-by: Vladimir Sementsov-Ogievskiy --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) I see no reason why this can't go in -rc2, so I'll queue it through my NBD tree. diff --git a/MAINTAINERS b/MAINTAINERS index 556ce0bfe3..51f31b4011 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1829,6 +1829,8 @@ F: util/hbitmap.c F: block/dirty-bitmap.c F: include/qemu/hbitmap.h F: include/block/dirty-bitmap.h +F: qcow2-bitmap.c +F: migration/block-dirty-bitmap.c F: tests/test-hbitmap.c F: docs/interop/bitmaps.rst T: git https://github.com/jnsnow/qemu.git bitmaps -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Braille device (chardev/baum.c) is unable to detect the TTY correctly and does not act on graphic console connect/disconnect
Hello, As a blind developer I would be very happy to use QEMU's baum chardev for a braille display. Unfortunately, this device fails to detect the tty in which the spice client is running. I would like to improve this device but I don't yet know how to achieve a better solution. The current code calls qemu_console_get_window_id() to get the tty. This function returns zero, which causes the code to skip even the default behavior of brlapi's brlapi__enterTtyMode() (including checcking some env variables such as CONTROLVT). Furthermore, window id sounds like something different than a tty number, maybe a number of X display? The code does not currently consider the fact that the lifetime of the connected graphical consoles is not the same as the lifetime of the VM. I hardcoded a tty number to the sources of QEMU version 3.0.0 to gladly verify that the device is working besides these problems. So, I have a handful of open questions to start with: Is it possible to get callbacks for connect and disconnect of a graphical console (like spice and vnc)? How? Is it further possible to get any information of the connected client to determine the tty, and possibly sub-windows too (see brlapi__enterTtyModeWithPath), in which the client is running? Such events should lead to calls of brlapi__EnterTtyMode() and brlapi__leaveTtyMode(). To get this device working properly would be a life changing lifehack for me so I highly appreciate any help! It would allow the use of Linux side by side with Windows and ChromiumOS, which is a big leap in this rather poorly accessible world. If this is successful, I'm looking forward to spreading the word and helping others too! With kind regards and informally on behalf of the Finnish braille authority, -- Teemu Kuusisto
Re: [EXTERNAL]Re: [PATCH v2 2/5] MAINTAINERS: Adjust maintainership for Fulong 2E board
Hi, all, On Thu, Nov 14, 2019 at 8:34 PM Aleksandar Markovic wrote: > > Hi, Philippe, > > > From: Philippe Mathieu-Daudé > > > > Hi Aleksandar, > > > > On 11/13/19 2:47 PM, Aleksandar Markovic wrote: > > > From: Aleksandar Markovic > > > > > > Change the maintainership for Fulong 2E board to improve its quality. > > > > IIRC you told me once this board is named Fuloong, and its CPU is a > > Loongson, both with 2x 'o' :) I have a patch renaming the various > > occurrences. > > > > I still think that the oficial name is "Fuloong 2E", however, it is > shortened to "Fulong 2E" quite often in communication, and, it seems, > sometimes even in various docs/app notes etc. > > Can perhaps Huacai Chen enlighten us regarding the right spelling? The right spelling is Fuloong. > > > > Signed-off-by: Aleksandar Markovic > > > --- > > > MAINTAINERS | 7 --- > > > 1 file changed, 4 insertions(+), 3 deletions(-) > > > > > > diff --git a/MAINTAINERS b/MAINTAINERS > > > index fd9ba32..3bf2144 100644 > > > --- a/MAINTAINERS > > > +++ b/MAINTAINERS > > > @@ -976,9 +976,10 @@ S: Maintained > > > F: hw/mips/mips_r4k.c > > > > > > Fulong 2E > > > -M: Aleksandar Markovic > > > -R: Aleksandar Rikalo > > > -S: Odd Fixes > > > +M: Philippe Mathieu-Daudé > > > > I am happy to keep the Fuloong working, but this will be on my personal > > (hobbyist) time. This area is not a priority for my employer, so I'll > > use my personal email: f4...@amsat.org. > > > > OK. > > > The original author is active on the Linux kernel, let ask him if he'd > > be OK to keep an eye on his work. > > > > Huacai, would you agree to be listed as a reviewer on the Fuloong > > related QEMU files? You don't have to worry about the generic QEMU code > > plate (like keeping API up to date) I'll manage that, but I'd like to > > have you listed to assert the hardware is properly modeled. > > > > You would appear as: > > R: Huacai Chen > > > > That is a great idea! > > Please, Chen, get involved, you would be very welcomed, there is a place > for you in QEMU community! I'm sorry that I'm busy now, but I think I will do something in QEMU in the next year. > > > > > +R: Hervé Poussineau > > > > I don't think Hervé is interested by this board, he has not modified the > > code. > > > > > +R: Aleksandar Markovic > > > +S: Maintained > > > > Let keep it as "Odd Fixes" :) > > > > OK. > > >Odd Fixes: It has a maintainer but they don't have > > time to do much other than throw the odd > > patch in. > > > > > F: hw/mips/mips_fulong2e.c > > > F: hw/isa/vt82c686.c > > > F: hw/pci-host/bonito.c > > > > > > > So the patch would be: > > > > -- 8< -- > > Fulong 2E > > -M: Aleksandar Markovic > > -R: Aleksandar Rikalo > > +M: Philippe Mathieu-Daudé > > +R: Aleksandar Markovic > > +R: Huacai Chen > > S: Odd Fixes > > F: hw/mips/mips_fulong2e.c > > F: hw/isa/vt82c686.c > > Plus possible s/Fulong 2E/Fuloong 2E/ > > > --- > > > > But let's wait to see what Huacai Chen thinks of it, before posting it. > > > > Thanks for taking care of those changes! > > > > Phil. > > > > > > Thank you! > > Aleksandar Thanks, Huacai
[PATCH v2 for 5.0 5/6] linux-user: Add support for get/set RTC PLL correction using ioctls
This patch implements functionalities of following ioctls: RTC_PLL_GET - Get PLL correction Read the PLL correction for RTCs that support PLL. The PLL correction is returned in the following structure: struct rtc_pll_info { int pll_ctrl;/* placeholder for fancier control */ int pll_value; /* get/set correction value */ int pll_max; /* max +ve (faster) adjustment value */ int pll_min; /* max -ve (slower) adjustment value */ int pll_posmult; /* factor for +ve correction */ int pll_negmult; /* factor for -ve correction */ long pll_clock; /* base PLL frequency */ }; A pointer to this structure should be passed as the third ioctl's argument. RTC_PLL_SET - Set PLL correction Sets the PLL correction for RTCs that support PLL. The PLL correction that is set is specified by the rtc_pll_info structure pointed to by the third ioctl's' argument. Implementation notes: All ioctls in this patch have pointer to a structure rtc_pll_info as their third argument. All elements of this structure are of type 'int', except the last one that is of type 'long'. That is the reason why a separate target structure (target_rtc_pll_info) is defined in linux-user/syscall_defs. The rest of the implementation is straightforward. Signed-off-by: Filip Bozuta --- linux-user/ioctls.h| 2 ++ linux-user/syscall_defs.h | 14 ++ linux-user/syscall_types.h | 9 + 3 files changed, 25 insertions(+) diff --git a/linux-user/ioctls.h b/linux-user/ioctls.h index e4d89c2..a8dd235 100644 --- a/linux-user/ioctls.h +++ b/linux-user/ioctls.h @@ -87,6 +87,8 @@ IOCTL(RTC_EPOCH_SET, IOC_W, MK_PTR(TYPE_ULONG)) IOCTL(RTC_WKALM_RD, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_wkalrm))) IOCTL(RTC_WKALM_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_wkalrm))) + IOCTL(RTC_PLL_GET, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_pll_info))) + IOCTL(RTC_PLL_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_pll_info))) IOCTL(BLKROSET, IOC_W, MK_PTR(TYPE_INT)) IOCTL(BLKROGET, IOC_R, MK_PTR(TYPE_INT)) diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h index 37504a2..8370f41 100644 --- a/linux-user/syscall_defs.h +++ b/linux-user/syscall_defs.h @@ -763,6 +763,16 @@ struct target_pollfd { #define TARGET_KDSETLED0x4B32 /* set led state [lights, not flags] */ #define TARGET_KDSIGACCEPT 0x4B4E +struct target_rtc_pll_info { +int pll_ctrl; +int pll_value; +int pll_max; +int pll_min; +int pll_posmult; +int pll_negmult; +abi_long pll_clock; +}; + /* real time clock ioctls */ #define TARGET_RTC_AIE_ON TARGET_IO('p', 0x01) #define TARGET_RTC_AIE_OFF TARGET_IO('p', 0x02) @@ -782,6 +792,10 @@ struct target_pollfd { #define TARGET_RTC_EPOCH_SETTARGET_IOW('p', 0x0e, abi_ulong) #define TARGET_RTC_WKALM_RD TARGET_IOR('p', 0x10, struct rtc_wkalrm) #define TARGET_RTC_WKALM_SETTARGET_IOW('p', 0x0f, struct rtc_wkalrm) +#define TARGET_RTC_PLL_GET TARGET_IOR('p', 0x11, \ + struct target_rtc_pll_info) +#define TARGET_RTC_PLL_SET TARGET_IOW('p', 0x12, \ + struct target_rtc_pll_info) #if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_SH4) || \ defined(TARGET_XTENSA) diff --git a/linux-user/syscall_types.h b/linux-user/syscall_types.h index 820bc8e..4027272 100644 --- a/linux-user/syscall_types.h +++ b/linux-user/syscall_types.h @@ -271,6 +271,15 @@ STRUCT(rtc_wkalrm, TYPE_CHAR, /* pending */ MK_STRUCT(STRUCT_rtc_time)) /* time */ +STRUCT(rtc_pll_info, + TYPE_INT, /* pll_ctrl */ + TYPE_INT, /* pll_value */ + TYPE_INT, /* pll_max */ + TYPE_INT, /* pll_min */ + TYPE_INT, /* pll_posmult */ + TYPE_INT, /* pll_negmult */ + TYPE_LONG) /* pll_clock */ + STRUCT(blkpg_ioctl_arg, TYPE_INT, /* op */ TYPE_INT, /* flags */ -- 2.7.4
[PATCH v2 for 5.0 1/6] linux-user: Add support for enable/disable RTC features using ioctls
This patch implements functionalities of following ioctls: RTC_AIE_ON, RTC_AIE_OFF - Alarm interrupt enable on/off Enable or disable the alarm interrupt, for RTCs that support alarms. The third ioctl's argument is ignored. RTC_UIE_ON, RTC_UIE_OFF - Update interrupt enable on/off Enable or disable the interrupt on every clock update, for RTCs that support this once-per-second interrupt. The third ioctl's argument is ignored. RTC_PIE_ON, RTC_PIE_OFF - Periodic interrupt enable on/off Enable or disable the periodic interrupt, for RTCs that sup‐ port these periodic interrupts. The third ioctl's argument is ignored. Only a privileged process (i.e., one having the CAP_SYS_RESOURCE capability) can enable the periodic interrupt if the frequency is currently set above the value specified in /proc/sys/dev/rtc/max-user-freq. RTC_WIE_ON, RTC_WIE_OFF - Watchdog interrupt enable on/off Enable or disable the Watchdog interrupt, for RTCs that sup- port this Watchdog interrupt. The third ioctl's argument is ignored. Implementation notes: Since all of involved ioctls have NULL as their third argument, their implementation was straightforward. The line '#include ' was added to recognize preprocessor definitions for these ioctls. This needs to be done only once in this series of commits. Also, the content of this file (with respect to ioctl definitions) remained unchanged for a long time, therefore there is no need to worry about supporting older Linux kernel version. Signed-off-by: Filip Bozuta --- linux-user/ioctls.h | 9 + linux-user/syscall.c | 1 + linux-user/syscall_defs.h | 10 ++ 3 files changed, 20 insertions(+) diff --git a/linux-user/ioctls.h b/linux-user/ioctls.h index c6b9d6a..97741c7 100644 --- a/linux-user/ioctls.h +++ b/linux-user/ioctls.h @@ -69,6 +69,15 @@ IOCTL(KDSETLED, 0, TYPE_INT) IOCTL_SPECIAL(KDSIGACCEPT, 0, do_ioctl_kdsigaccept, TYPE_INT) + IOCTL(RTC_AIE_ON, 0, TYPE_NULL) + IOCTL(RTC_AIE_OFF, 0, TYPE_NULL) + IOCTL(RTC_UIE_ON, 0, TYPE_NULL) + IOCTL(RTC_UIE_OFF, 0, TYPE_NULL) + IOCTL(RTC_PIE_ON, 0, TYPE_NULL) + IOCTL(RTC_PIE_OFF, 0, TYPE_NULL) + IOCTL(RTC_WIE_ON, 0, TYPE_NULL) + IOCTL(RTC_WIE_OFF, 0, TYPE_NULL) + IOCTL(BLKROSET, IOC_W, MK_PTR(TYPE_INT)) IOCTL(BLKROGET, IOC_R, MK_PTR(TYPE_INT)) IOCTL(BLKRRPART, 0, TYPE_NULL) diff --git a/linux-user/syscall.c b/linux-user/syscall.c index ce399a5..74c3c08 100644 --- a/linux-user/syscall.c +++ b/linux-user/syscall.c @@ -107,6 +107,7 @@ #include #include #include +#include #include "linux_loop.h" #include "uname.h" diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h index 98c2119..f91579a 100644 --- a/linux-user/syscall_defs.h +++ b/linux-user/syscall_defs.h @@ -763,6 +763,16 @@ struct target_pollfd { #define TARGET_KDSETLED0x4B32 /* set led state [lights, not flags] */ #define TARGET_KDSIGACCEPT 0x4B4E +/* real time clock ioctls */ +#define TARGET_RTC_AIE_ON TARGET_IO('p', 0x01) +#define TARGET_RTC_AIE_OFF TARGET_IO('p', 0x02) +#define TARGET_RTC_UIE_ON TARGET_IO('p', 0x03) +#define TARGET_RTC_UIE_OFF TARGET_IO('p', 0x04) +#define TARGET_RTC_PIE_ON TARGET_IO('p', 0x05) +#define TARGET_RTC_PIE_OFF TARGET_IO('p', 0x06) +#define TARGET_RTC_WIE_ON TARGET_IO('p', 0x0f) +#define TARGET_RTC_WIE_OFF TARGET_IO('p', 0x10) + #if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_SH4) || \ defined(TARGET_XTENSA) #define TARGET_FIOGETOWN TARGET_IOR('f', 123, int) -- 2.7.4
[PATCH v2 for 5.0 6/6] linux-user: Add support for read/clear RTC voltage low detector using ioctls
RTC_VL_READ - Read voltage low detection information Read the voltage low for RTCs that support voltage low. The third ioctl's' argument points to an int in which the voltage low is returned. RTC_VL_CLR - Clear voltage low information Clear the information about voltage low for RTCs that support voltage low. The third ioctl(2) argument is ignored. Implementation notes: Since one ioctl has a pointer to 'int' as its third agrument, and another ioctl has NULL as its third argument, their implementation was straightforward. Signed-off-by: Filip Bozuta --- linux-user/ioctls.h | 2 ++ linux-user/syscall_defs.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/linux-user/ioctls.h b/linux-user/ioctls.h index a8dd235..371c25e 100644 --- a/linux-user/ioctls.h +++ b/linux-user/ioctls.h @@ -89,6 +89,8 @@ IOCTL(RTC_WKALM_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_wkalrm))) IOCTL(RTC_PLL_GET, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_pll_info))) IOCTL(RTC_PLL_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_pll_info))) + IOCTL(RTC_VL_READ, IOC_R, MK_PTR(TYPE_INT)) + IOCTL(RTC_VL_CLR, 0, TYPE_NULL) IOCTL(BLKROSET, IOC_W, MK_PTR(TYPE_INT)) IOCTL(BLKROGET, IOC_R, MK_PTR(TYPE_INT)) diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h index 8370f41..af4f366 100644 --- a/linux-user/syscall_defs.h +++ b/linux-user/syscall_defs.h @@ -796,6 +796,8 @@ struct target_rtc_pll_info { struct target_rtc_pll_info) #define TARGET_RTC_PLL_SET TARGET_IOW('p', 0x12, \ struct target_rtc_pll_info) +#define TARGET_RTC_VL_READ TARGET_IOR('p', 0x13, int) +#define TARGET_RTC_VL_CLR TARGET_IO('p', 0x14) #if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_SH4) || \ defined(TARGET_XTENSA) -- 2.7.4
[PATCH v2 for 5.0 2/6] linux-user: Add support for read/set RTC time and alarm using ioctls
This patch implements functionalities of following ioctls: RTC_RD_TIME - Read RTC time Returns this RTC's time in the following structure: struct rtc_time { int tm_sec; int tm_min; int tm_hour; int tm_mday; int tm_mon; int tm_year; int tm_wday; /* unused */ int tm_yday; /* unused */ int tm_isdst;/* unused */ }; The fields in this structure have the same meaning and ranges as for the tm structure described in gmtime man page. A pointer to this structure should be passed as the third ioctl's argument. RTC_SET_TIME - Set RTC time Sets this RTC's time to the time specified by the rtc_time structure pointed to by the third ioctl's argument. To set the RTC's time the process must be privileged (i.e., have the CAP_SYS_TIME capability). RTC_ALM_READ, RTC_ALM_SET - Read/Set alarm time Read and set the alarm time, for RTCs that support alarms. The alarm interrupt must be separately enabled or disabled using the RTC_AIE_ON, RTC_AIE_OFF requests. The third ioctl's argument is a pointer to an rtc_time structure. Only the tm_sec, tm_min, and tm_hour fields of this structure are used. Implementation notes: All ioctls in this patch have pointer to a structure rtc_time as their third argument. That is the reason why corresponding definition is added in linux-user/syscall_types.h. Since all elements of this structure are of type 'int', the rest of the implementation is straightforward. Signed-off-by: Filip Bozuta --- linux-user/ioctls.h| 4 linux-user/syscall_defs.h | 4 linux-user/syscall_types.h | 11 +++ 3 files changed, 19 insertions(+) diff --git a/linux-user/ioctls.h b/linux-user/ioctls.h index 97741c7..f472794 100644 --- a/linux-user/ioctls.h +++ b/linux-user/ioctls.h @@ -77,6 +77,10 @@ IOCTL(RTC_PIE_OFF, 0, TYPE_NULL) IOCTL(RTC_WIE_ON, 0, TYPE_NULL) IOCTL(RTC_WIE_OFF, 0, TYPE_NULL) + IOCTL(RTC_ALM_READ, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) + IOCTL(RTC_ALM_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) + IOCTL(RTC_RD_TIME, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) + IOCTL(RTC_SET_TIME, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) IOCTL(BLKROSET, IOC_W, MK_PTR(TYPE_INT)) IOCTL(BLKROGET, IOC_R, MK_PTR(TYPE_INT)) diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h index f91579a..f0bf09d 100644 --- a/linux-user/syscall_defs.h +++ b/linux-user/syscall_defs.h @@ -772,6 +772,10 @@ struct target_pollfd { #define TARGET_RTC_PIE_OFF TARGET_IO('p', 0x06) #define TARGET_RTC_WIE_ON TARGET_IO('p', 0x0f) #define TARGET_RTC_WIE_OFF TARGET_IO('p', 0x10) +#define TARGET_RTC_ALM_READ TARGET_IOR('p', 0x08, struct rtc_time) +#define TARGET_RTC_ALM_SET TARGET_IOW('p', 0x07, struct rtc_time) +#define TARGET_RTC_RD_TIME TARGET_IOR('p', 0x09, struct rtc_time) +#define TARGET_RTC_SET_TIME TARGET_IOW('p', 0x0a, struct rtc_time) #if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_SH4) || \ defined(TARGET_XTENSA) diff --git a/linux-user/syscall_types.h b/linux-user/syscall_types.h index 4e36983..a35072a 100644 --- a/linux-user/syscall_types.h +++ b/linux-user/syscall_types.h @@ -255,6 +255,17 @@ STRUCT(blkpg_partition, MK_ARRAY(TYPE_CHAR, BLKPG_DEVNAMELTH), /* devname */ MK_ARRAY(TYPE_CHAR, BLKPG_VOLNAMELTH)) /* volname */ +STRUCT(rtc_time, + TYPE_INT, /* tm_sec */ + TYPE_INT, /* tm_min */ + TYPE_INT, /* tm_hour */ + TYPE_INT, /* tm_mday */ + TYPE_INT, /* tm_mon */ + TYPE_INT, /* tm_year */ + TYPE_INT, /* tm_wday */ + TYPE_INT, /* tm_yday */ + TYPE_INT) /* tm_isdst */ + STRUCT(blkpg_ioctl_arg, TYPE_INT, /* op */ TYPE_INT, /* flags */ -- 2.7.4
[PATCH v2 for 5.0 0/6] linux-user: Add support for real time clock ioctls
Add ioctls for all RTC features that are currently supported in linux kernel. This series covers following iocts: * RTC_AIE_ON * RTC_AIE_OFF * RTC_UIE_ON * RTC_UIE_OFF * RTC_PIE_ON * RTC_PIE_OFF * RTC_WIE_ON * RTC_WIE_OFF * RTC_ALM_SET * RTC_ALM_READ * RTC_RD_TIME * RTC_SET_TIME * RTC_IRQP_READ * RTC_IRQP_SET * RTC_EPOCH_READ * RTC_EPOCH_SET * RTC_WKALM_SET * RTC_WKALM_RD * RTC_PLL_GET * RTC_PLL_SET * RTC_VL_READ * RTC_VL_CLR The functionalities of individual ioctls were described in this series patch commit messages. Testing method: Mini test programs were written for each ioctl. Those programs were compiled (sometimes using cross-compilers) for the following architectures: * Intel 64-bit (little endian) * Power pc 32-bit (big endian) * Power pc 64-bit (big endian) The corresponding native programs were executed without using QEMU on following hosts: * Intel(R) Core(TM) i7-4790K CPU @ 4.00GHz *.7447A, (ppc32 host) All applicable compiled programs were in turn executed through QEMU and the results obtained were the same ones gotten for native execution. Example of a test program: For ioctl RTC_RD_TIME we have used the following test program: #include #include #include #include #include #include #include #define ERROR -1 int main() { int fd = open("/dev/rtc", O_RDWR | O_NONBLOCK); if(fd == ERROR) { perror("open"); return -1; } struct rtc_time cur_time; if(ioctl(fd, RTC_RD_TIME, _time) < 0) { perror("ioctl"); return -1; } printf("Second: %d, Minute: %d, Hour: %d, Day: %d, Month: %d, Year: %d,", cur_time.tm_sec, cur_time.tm_min, cur_time.tm_hour, cur_time.tm_mday, cur_time.tm_mon, cur_time.tm_year); return 0; } Filip Bozuta (6): linux-user: Add support for enable/disable RTC features using ioctls linux-user: Add support for read/set RTC time and alarm using ioctls linux-user: Add support for read/set RTC periodic interrupt and epoch using ioctls linux-user: Add support for get/set RTC wakeup alarm using ioctls linux-user: Add support for get/set RTC PLL correction using ioctls linux-user: Add support for read/clear RTC voltage low detector using ioctls linux-user/ioctls.h| 23 +++ linux-user/syscall.c | 1 + linux-user/syscall_defs.h | 36 linux-user/syscall_types.h | 25 + 4 files changed, 85 insertions(+) -- 2.7.4
[PATCH v2 for 5.0 3/6] linux-user: Add support for read/set RTC periodic interrupt and epoch using ioctls
This patch implements functionalities of following ioctls: RTC_IRQP_READ, RTC_IRQP_SET - Read/Set IRQ rate Read and set the frequency for periodic interrupts, for RTCs that support periodic interrupts. The periodic interrupt must be separately enabled or disabled using the RTC_PIE_ON, RTC_PIE_OFF requests. The third ioctl's argument is an unsigned long * or an unsigned long, respectively. The value is the frequency in interrupts per second. The set of allow‐ able frequencies is the multiples of two in the range 2 to 8192. Only a privileged process (i.e., one having the CAP_SYS_RESOURCE capability) can set frequencies above the value specified in /proc/sys/dev/rtc/max-user-freq. (This file contains the value 64 by default.) RTC_EPOCH_READ, RTC_EPOCH_SET - Read/Set epoch Many RTCs encode the year in an 8-bit register which is either interpreted as an 8-bit binary number or as a BCD number. In both cases, the number is interpreted relative to this RTC's Epoch. The RTC's Epoch is initialized to 1900 on most systems but on Alpha and MIPS it might also be initialized to 1952, 1980, or 2000, depending on the value of an RTC register for the year. With some RTCs, these operations can be used to read or to set the RTC's Epoch, respectively. The third ioctl's argument is an unsigned long * or an unsigned long, respectively, and the value returned (or assigned) is the Epoch. To set the RTC's Epoch the process must be privileged (i.e., have the CAP_SYS_TIME capability). Implementation notes: All ioctls in this patch have a pointer to 'ulong' as their third argument. That is the reason why corresponding parts of added code in linux-user/syscall_defs.h contain special handling related to 'ulong' type: they use 'abi_ulong' type to make sure that ioctl's code is calculated correctly for both 32-bit and 64-bit targets. Also, 'MK_PTR(TYPE_ULONG)' is used for the similar reason in linux-user/ioctls.h. Signed-off-by: Filip Bozuta --- linux-user/ioctls.h | 4 linux-user/syscall_defs.h | 4 2 files changed, 8 insertions(+) diff --git a/linux-user/ioctls.h b/linux-user/ioctls.h index f472794..fa2fe7f 100644 --- a/linux-user/ioctls.h +++ b/linux-user/ioctls.h @@ -81,6 +81,10 @@ IOCTL(RTC_ALM_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) IOCTL(RTC_RD_TIME, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) IOCTL(RTC_SET_TIME, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_time))) + IOCTL(RTC_IRQP_READ, IOC_R, MK_PTR(TYPE_ULONG)) + IOCTL(RTC_IRQP_SET, IOC_W, MK_PTR(TYPE_ULONG)) + IOCTL(RTC_EPOCH_READ, IOC_R, MK_PTR(TYPE_ULONG)) + IOCTL(RTC_EPOCH_SET, IOC_W, MK_PTR(TYPE_ULONG)) IOCTL(BLKROSET, IOC_W, MK_PTR(TYPE_INT)) IOCTL(BLKROGET, IOC_R, MK_PTR(TYPE_INT)) diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h index f0bf09d..bbfa935 100644 --- a/linux-user/syscall_defs.h +++ b/linux-user/syscall_defs.h @@ -776,6 +776,10 @@ struct target_pollfd { #define TARGET_RTC_ALM_SET TARGET_IOW('p', 0x07, struct rtc_time) #define TARGET_RTC_RD_TIME TARGET_IOR('p', 0x09, struct rtc_time) #define TARGET_RTC_SET_TIME TARGET_IOW('p', 0x0a, struct rtc_time) +#define TARGET_RTC_IRQP_READTARGET_IOR('p', 0x0b, abi_ulong) +#define TARGET_RTC_IRQP_SET TARGET_IOW('p', 0x0c, abi_ulong) +#define TARGET_RTC_EPOCH_READ TARGET_IOR('p', 0x0d, abi_ulong) +#define TARGET_RTC_EPOCH_SETTARGET_IOW('p', 0x0e, abi_ulong) #if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_SH4) || \ defined(TARGET_XTENSA) -- 2.7.4
[PATCH v2 for 5.0 4/6] linux-user: Add support for get/set RTC wakeup alarm using ioctls
This patch implements functionalities of following ioctls: RTC_WKALM_SET, RTC_WKALM_GET - Get/Set wakeup alarm Some RTCs support a more powerful alarm interface, using these ioctls to read or write the RTC's alarm time (respectively) with this structure: struct rtc_wkalrm { unsigned char enabled; unsigned char pending; struct rtc_time time; }; The enabled flag is used to enable or disable the alarm inter‐ rupt, or to read its current status; when using these calls, RTC_AIE_ON and RTC_AIE_OFF are not used. The pending flag is used by RTC_WKALM_RD to report a pending interrupt (so it's mostly useless on Linux, except when talking to the RTC man‐ aged by EFI firmware). The time field is as used with RTC_ALM_READ and RTC_ALM_SET except that the tm_mday, tm_mon, and tm_year fields are also valid. A pointer to this struc‐ ture should be passed as the third ioctl's argument. Implementation notes: All ioctls in this patch have pointer to a structure rtc_wkalrm as their third argument. That is the reason why corresponding definition is added in linux-user/syscall_types.h. Since all elements of this structure are either of type 'unsigned char' or 'struct rtc_time' (that was covered in one of previous patches), the rest of the implementation is straightforward. Signed-off-by: Filip Bozuta --- linux-user/ioctls.h| 2 ++ linux-user/syscall_defs.h | 2 ++ linux-user/syscall_types.h | 5 + 3 files changed, 9 insertions(+) diff --git a/linux-user/ioctls.h b/linux-user/ioctls.h index fa2fe7f..e4d89c2 100644 --- a/linux-user/ioctls.h +++ b/linux-user/ioctls.h @@ -85,6 +85,8 @@ IOCTL(RTC_IRQP_SET, IOC_W, MK_PTR(TYPE_ULONG)) IOCTL(RTC_EPOCH_READ, IOC_R, MK_PTR(TYPE_ULONG)) IOCTL(RTC_EPOCH_SET, IOC_W, MK_PTR(TYPE_ULONG)) + IOCTL(RTC_WKALM_RD, IOC_R, MK_PTR(MK_STRUCT(STRUCT_rtc_wkalrm))) + IOCTL(RTC_WKALM_SET, IOC_W, MK_PTR(MK_STRUCT(STRUCT_rtc_wkalrm))) IOCTL(BLKROSET, IOC_W, MK_PTR(TYPE_INT)) IOCTL(BLKROGET, IOC_R, MK_PTR(TYPE_INT)) diff --git a/linux-user/syscall_defs.h b/linux-user/syscall_defs.h index bbfa935..37504a2 100644 --- a/linux-user/syscall_defs.h +++ b/linux-user/syscall_defs.h @@ -780,6 +780,8 @@ struct target_pollfd { #define TARGET_RTC_IRQP_SET TARGET_IOW('p', 0x0c, abi_ulong) #define TARGET_RTC_EPOCH_READ TARGET_IOR('p', 0x0d, abi_ulong) #define TARGET_RTC_EPOCH_SETTARGET_IOW('p', 0x0e, abi_ulong) +#define TARGET_RTC_WKALM_RD TARGET_IOR('p', 0x10, struct rtc_wkalrm) +#define TARGET_RTC_WKALM_SETTARGET_IOW('p', 0x0f, struct rtc_wkalrm) #if defined(TARGET_ALPHA) || defined(TARGET_MIPS) || defined(TARGET_SH4) || \ defined(TARGET_XTENSA) diff --git a/linux-user/syscall_types.h b/linux-user/syscall_types.h index a35072a..820bc8e 100644 --- a/linux-user/syscall_types.h +++ b/linux-user/syscall_types.h @@ -266,6 +266,11 @@ STRUCT(rtc_time, TYPE_INT, /* tm_yday */ TYPE_INT) /* tm_isdst */ +STRUCT(rtc_wkalrm, + TYPE_CHAR, /* enabled */ + TYPE_CHAR, /* pending */ + MK_STRUCT(STRUCT_rtc_time)) /* time */ + STRUCT(blkpg_ioctl_arg, TYPE_INT, /* op */ TYPE_INT, /* flags */ -- 2.7.4
Re: [PATCH v5] iotests: Test NBD client reconnection
On 11/11/19 9:39 PM, Andrey Shinkevich wrote: The test for an NBD client. The NBD server is disconnected after the client write request. The NBD client should reconnect and complete the write operation. Suggested-by: Denis V. Lunev Suggested-by: Vladimir Sementsov-Ogievskiy Signed-off-by: Andrey Shinkevich Reviewed-by: Eric Blake Tested-by: Eric Blake --- v5: "" were replaced with '' in the test except the function comments. The 'quick' word was added to the 'qroup' file next to the test #277. Queuing for 4.2-rc2 through my NBD tree. -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3226 Virtualization: qemu.org | libvirt.org
Re: [PATCH v5 00/20] Add virtual device fuzzing support
On 11/14/19 5:55 AM, Darren Kenny wrote: Hi Alexander, A quick comment on the fact that you omitted any Reviewed-by's that you have received so far. Was that intentional? No - I'll find a way to add them. sorry about that -Alex Thanks, Darren. On Wed, Nov 13, 2019 at 10:50:41PM +, Oleinik, Alexander wrote: This series adds a framework for coverage-guided fuzzing of virtual-devices. Fuzzing targets are based on qtest and can make use of the libqos abstractions. V5: * misc fixes addressing V4 comments * cleanup in-process handlers/globals in libqtest.c * small fixes to fork-based fuzzing and support for multiple workers * changes to the virtio-net fuzzer to kick after each vq add V4: * add/transfer license headers to new files * restructure the added QTestClientTransportOps struct * restructure the FuzzTarget struct and fuzzer skeleton * fork-based fuzzer now directly mmaps shm over the coverage bitmaps * fixes to i440 and virtio-net fuzz targets * undo the changes to qtest_memwrite * possible to build /fuzz and /all in the same build-dir * misc fixes to address V3 comments V3: * rebased onto v4.1.0+ * add the fuzzer as a new build-target type in the build-system * add indirection to qtest client/server communication functions * remove ramfile and snapshot-based fuzzing support * add i440fx fuzz-target as a reference for developers. * add linker-script to assist with fork-based fuzzer V2: * split off changes to qos virtio-net and qtest server to other patches * move vl:main initialization into new func: qemu_init * moved useful functions from qos-test.c to a separate object * use struct of function pointers for add_fuzz_target(), instead of arguments * move ramfile to migration/qemu-file * rewrite fork-based fuzzer pending patch to libfuzzer * pass check-patch Alexander Bulekov (20): softmmu: split off vl.c:main() into main.c libqos: Rename i2c_send and i2c_recv fuzz: Add FUZZ_TARGET module type qtest: add qtest_server_send abstraction libqtest: Add a layer of abstraciton to send/recv module: check module wasn't already initialized qtest: add in-process incoming command handler tests: provide test variables to other targets libqos: split qos-test and libqos makefile vars libqos: move useful qos-test funcs to qos_external libqtest: make bufwrite rely on the TransportOps libqtest: add in-process qtest.c tx/rx handlers fuzz: add configure flag --enable-fuzzing fuzz: Add target/fuzz makefile rules fuzz: add fuzzer skeleton fuzz: add support for fork-based fuzzing. fuzz: add support for qos-assisted fuzz targets fuzz: add i440fx fuzz targets fuzz: add virtio-net fuzz target fuzz: add documentation to docs/devel/ Makefile | 16 ++- Makefile.objs | 4 + Makefile.target | 18 ++- configure | 39 ++ docs/devel/fuzzing.txt | 119 ++ exec.c | 12 +- include/qemu/module.h | 4 +- include/sysemu/qtest.h | 4 + include/sysemu/sysemu.h | 4 + main.c | 53 qtest.c | 31 - tests/Makefile.include | 75 +-- tests/fuzz/Makefile.include | 11 ++ tests/fuzz/fork_fuzz.c | 55 + tests/fuzz/fork_fuzz.h | 23 tests/fuzz/fork_fuzz.ld | 37 ++ tests/fuzz/fuzz.c | 179 +++ tests/fuzz/fuzz.h | 94 ++ tests/fuzz/i440fx_fuzz.c | 176 ++ tests/fuzz/qos_fuzz.c | 232 +++ tests/fuzz/qos_fuzz.h | 33 + tests/fuzz/virtio_net_fuzz.c | 100 +++ tests/libqos/i2c.c | 10 +- tests/libqos/i2c.h | 4 +- tests/libqos/qos_external.c | 168 + tests/libqos/qos_external.h | 28 + tests/libqtest.c | 108 ++-- tests/libqtest.h | 4 + tests/pca9552-test.c | 10 +- tests/qos-test.c | 140 + util/module.c | 7 ++ vl.c | 38 ++ 32 files changed, 1607 insertions(+), 229 deletions(-) create mode 100644 docs/devel/fuzzing.txt create mode 100644 main.c create mode 100644 tests/fuzz/Makefile.include create mode 100644 tests/fuzz/fork_fuzz.c create mode 100644 tests/fuzz/fork_fuzz.h create mode 100644 tests/fuzz/fork_fuzz.ld create mode 100644 tests/fuzz/fuzz.c create mode 100644 tests/fuzz/fuzz.h create mode 100644 tests/fuzz/i440fx_fuzz.c create mode 100644 tests/fuzz/qos_fuzz.c create mode 100644 tests/fuzz/qos_fuzz.h create mode 100644 tests/fuzz/virtio_net_fuzz.c create mode 100644 tests/libqos/qos_external.c create mode 100644 tests/libqos/qos_external.h -- 2.23.0 -- === I recently changed my last name from Oleinik to Bulekov ===