[PATCH v2] net/colo: check vnet_hdr_support flag when using virtio-net
When COLO use only one vnet_hdr_support parameter between COLO network filter(filter-mirror, filter-redirector or filter-rewriter and colo-compare, packet will not be parsed correctly. Acquire network driver related to COLO, if it is nirtio-net, check vnet_hdr_support flag of COLO network filter and colo-compare. Signed-off-by: Tao Xu Signed-off-by: Zhang Chen --- Changelog: v2: Detect virtio-net driver and apply vnet_hdr_support automatically. (Jason) --- net/colo-compare.c| 57 +++ net/colo.c| 20 +++ net/colo.h| 4 +++ net/filter-mirror.c | 21 net/filter-rewriter.c | 10 qapi/qom.json | 6 + qemu-options.hx | 6 +++-- 7 files changed, 122 insertions(+), 2 deletions(-) diff --git a/net/colo-compare.c b/net/colo-compare.c index b100e7b51f..870bd05a41 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -110,6 +110,7 @@ struct CompareState { char *sec_indev; char *outdev; char *notify_dev; +char *netdev; CharBackend chr_pri_in; CharBackend chr_sec_in; CharBackend chr_out; @@ -838,6 +839,28 @@ static int compare_chr_can_read(void *opaque) return COMPARE_READ_LEN_MAX; } +static int colo_set_default_netdev(void *opaque, QemuOpts *opts, Error **errp) +{ +const char *colo_obj_type, *netdev_from_filter; +char **netdev = (char **)opaque; + +colo_obj_type = qemu_opt_get(opts, "qom-type"); + +if (colo_obj_type && +(strcmp(colo_obj_type, "filter-mirror") == 0 || + strcmp(colo_obj_type, "filter-redirector") == 0 || + strcmp(colo_obj_type, "filter-rewriter") == 0)) { +netdev_from_filter = qemu_opt_get(opts, "netdev"); +if (*netdev == NULL) { +*netdev = g_strdup(netdev_from_filter); +} else if (strcmp(*netdev, netdev_from_filter) != 0) { +warn_report("%s is using a different netdev from other COLO " +"component", colo_obj_type); +} +} +return 0; +} + /* * Called from the main thread on the primary for packets * arriving over the socket from the primary. 
@@ -1050,6 +1073,21 @@ static void compare_set_vnet_hdr(Object *obj, s->vnet_hdr = value; } +static char *compare_get_netdev(Object *obj, Error **errp) +{ +CompareState *s = COLO_COMPARE(obj); + +return g_strdup(s->netdev); +} + +static void compare_set_netdev(Object *obj, const char *value, Error **errp) +{ +CompareState *s = COLO_COMPARE(obj); + +g_free(s->netdev); +s->netdev = g_strdup(value); +} + static char *compare_get_notify_dev(Object *obj, Error **errp) { CompareState *s = COLO_COMPARE(obj); @@ -1274,6 +1312,12 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) max_queue_size = MAX_QUEUE_SIZE; } +if (!s->netdev) { +/* Set default netdev as the first colo netfilter found */ +qemu_opts_foreach(qemu_find_opts("object"), + colo_set_default_netdev, &s->netdev, NULL); +} + if (find_and_check_chardev(&chr, s->pri_indev, errp) || !qemu_chr_fe_init(&s->chr_pri_in, chr, errp)) { return; @@ -1289,6 +1333,16 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) return; } +if (!s->vnet_hdr && +qemu_opts_foreach(qemu_find_opts("device"), + vnet_driver_check, s->netdev, NULL)) { +/* + * colo compare needs 'vnet_hdr_support' when it works on virtio-net, + * add 'vnet_hdr_support' automatically + */ +s->vnet_hdr = true; +} + net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize, s->vnet_hdr); net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize, s->vnet_hdr); @@ -1400,6 +1454,9 @@ static void colo_compare_init(Object *obj) s->vnet_hdr = false; object_property_add_bool(obj, "vnet_hdr_support", compare_get_vnet_hdr, compare_set_vnet_hdr); +/* colo compare can't varify that netdev is correct */ +object_property_add_str(obj, "netdev", compare_get_netdev, +compare_set_netdev); } void colo_compare_cleanup(void) diff --git a/net/colo.c b/net/colo.c index 3a3e6e89a0..4a03780f45 100644 --- a/net/colo.c +++ b/net/colo.c @@ -243,3 +243,23 @@ bool connection_has_tracked(GHashTable *connection_track_table, return conn ? 
true : false; } + +/* check the network driver related to COLO, return 1 if it is virtio-net */ +int vnet_driver_check(void *opaque, QemuOpts *opts, Error **errp) +{ +const char *driver_type, *netdev_from_driver; +char *netdev_from_filter = (char *)opaque; + +driver_
Re: [PATCH] net/colo: check vnet_hdr_support flag when using virtio-net
On 8/17/2021 2:01 PM, Tao Xu wrote: On 8/16/2021 10:58 AM, Jason Wang wrote: 在 2021/8/6 下午2:08, Tao Xu 写道: When COLO use only one vnet_hdr_support parameter between COLO network filter(filter-mirror, filter-redirector or filter-rewriter and colo-compare, packet will not be parsed correctly. Acquire network driver related to COLO, if it is nirtio-net, check vnet_hdr_support flag of COLO network filter and colo-compare. Signed-off-by: Tao Xu Signed-off-by: Zhang Chen --- net/colo-compare.c | 25 + net/colo.c | 20 net/colo.h | 4 net/filter-mirror.c | 17 + net/filter-rewriter.c | 9 + 5 files changed, 75 insertions(+) diff --git a/net/colo-compare.c b/net/colo-compare.c index b100e7b51f..bc1cc951c0 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -838,6 +838,23 @@ static int compare_chr_can_read(void *opaque) return COMPARE_READ_LEN_MAX; } +/* check vnet_hdr_support flag through COLO filter modules */ +static int colo_vnet_driver_check(void *opaque, QemuOpts *opts, Error **errp) +{ + const char *colo_obj_type; + + colo_obj_type = qemu_opt_get(opts, "qom-type"); + + if (strcmp(colo_obj_type, "filter-mirror") == 0 || + strcmp(colo_obj_type, "filter-redirector") == 0 || + strcmp(colo_obj_type, "filter-rewriter") == 0) { + if (qemu_opt_get(opts, "vnet_hdr_support")) { + return 1; + } + } + return 0; +} + /* * Called from the main thread on the primary for packets * arriving over the socket from the primary. @@ -1289,6 +1306,14 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) return; } + if (!s->vnet_hdr && + qemu_opts_foreach(qemu_find_opts("object"), + colo_vnet_driver_check, NULL, NULL)) { + error_setg(errp, "colo compare needs 'vnet_hdr_support' " + "when colo filter modules work on virtio-net"); + return; + } I wonder if we can detect virtio-net and apply vnet_hdr automatically. 
Thanks For filter-mirror, filter-redirector and filter-rewriter, we can detect virtio-net and add vnet_hdr_support automatically, because these netfilters are attached to a netdev, for example: if (!s->vnet_hdr && qemu_opts_foreach(qemu_find_opts("device"), vnet_driver_check, nf->netdev_id, NULL)) { s->vnet_hdr = true; } But colo-compare isn't attached to a netdev; it can only inspect the COLO netfilters to check vnet_hdr_support. In this situation, if vnet_hdr_support is missing from all of the netfilters, colo_vnet_driver_check() will return 0 and cannot detect that vnet_hdr_support is missing. So can we apply vnet_hdr automatically for filter-mirror, filter-redirector and filter-rewriter, and keep reporting an error for colo-compare? Sorry, I have found a way for colo-compare to apply vnet_hdr automatically as well; I will submit v2 later.
Re: [PATCH] net/colo: check vnet_hdr_support flag when using virtio-net
On 8/16/2021 10:58 AM, Jason Wang wrote: 在 2021/8/6 下午2:08, Tao Xu 写道: When COLO use only one vnet_hdr_support parameter between COLO network filter(filter-mirror, filter-redirector or filter-rewriter and colo-compare, packet will not be parsed correctly. Acquire network driver related to COLO, if it is nirtio-net, check vnet_hdr_support flag of COLO network filter and colo-compare. Signed-off-by: Tao Xu Signed-off-by: Zhang Chen --- net/colo-compare.c| 25 + net/colo.c| 20 net/colo.h| 4 net/filter-mirror.c | 17 + net/filter-rewriter.c | 9 + 5 files changed, 75 insertions(+) diff --git a/net/colo-compare.c b/net/colo-compare.c index b100e7b51f..bc1cc951c0 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -838,6 +838,23 @@ static int compare_chr_can_read(void *opaque) return COMPARE_READ_LEN_MAX; } +/* check vnet_hdr_support flag through COLO filter modules */ +static int colo_vnet_driver_check(void *opaque, QemuOpts *opts, Error **errp) +{ +const char *colo_obj_type; + +colo_obj_type = qemu_opt_get(opts, "qom-type"); + +if (strcmp(colo_obj_type, "filter-mirror") == 0 || +strcmp(colo_obj_type, "filter-redirector") == 0 || +strcmp(colo_obj_type, "filter-rewriter") == 0) { +if (qemu_opt_get(opts, "vnet_hdr_support")) { +return 1; +} +} +return 0; +} + /* * Called from the main thread on the primary for packets * arriving over the socket from the primary. @@ -1289,6 +1306,14 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) return; } +if (!s->vnet_hdr && +qemu_opts_foreach(qemu_find_opts("object"), + colo_vnet_driver_check, NULL, NULL)) { +error_setg(errp, "colo compare needs 'vnet_hdr_support' " + "when colo filter modules work on virtio-net"); +return; +} I wonder if we can detect virtio-net and apply vnet_hdr automatically. 
Thanks For filter-mirror, filter-redirector and filter-rewriter, we can detect virtio-net and add vnet_hdr_support automatically, because these netfilters are attached to a netdev, for example: if (!s->vnet_hdr && qemu_opts_foreach(qemu_find_opts("device"), vnet_driver_check, nf->netdev_id, NULL)) { s->vnet_hdr = true; } But colo-compare isn't attached to a netdev; it can only inspect the COLO netfilters to check vnet_hdr_support. In this situation, if vnet_hdr_support is missing from all of the netfilters, colo_vnet_driver_check() will return 0 and cannot detect that vnet_hdr_support is missing. So can we apply vnet_hdr automatically for filter-mirror, filter-redirector and filter-rewriter, and keep reporting an error for colo-compare?
Re: [PATCH] net/colo: check vnet_hdr_support flag when using virtio-net
Hi Jason, Do you have any comments on this patch? Thank you! On 8/6/2021 2:08 PM, Xu, Tao3 wrote: When COLO use only one vnet_hdr_support parameter between COLO network filter(filter-mirror, filter-redirector or filter-rewriter and colo-compare, packet will not be parsed correctly. Acquire network driver related to COLO, if it is nirtio-net, check vnet_hdr_support flag of COLO network filter and colo-compare. Signed-off-by: Tao Xu Signed-off-by: Zhang Chen --- net/colo-compare.c| 25 + net/colo.c| 20 net/colo.h| 4 net/filter-mirror.c | 17 + net/filter-rewriter.c | 9 + 5 files changed, 75 insertions(+) diff --git a/net/colo-compare.c b/net/colo-compare.c index b100e7b51f..bc1cc951c0 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -838,6 +838,23 @@ static int compare_chr_can_read(void *opaque) return COMPARE_READ_LEN_MAX; } +/* check vnet_hdr_support flag through COLO filter modules */ +static int colo_vnet_driver_check(void *opaque, QemuOpts *opts, Error **errp) +{ +const char *colo_obj_type; + +colo_obj_type = qemu_opt_get(opts, "qom-type"); + +if (strcmp(colo_obj_type, "filter-mirror") == 0 || +strcmp(colo_obj_type, "filter-redirector") == 0 || +strcmp(colo_obj_type, "filter-rewriter") == 0) { +if (qemu_opt_get(opts, "vnet_hdr_support")) { +return 1; +} +} +return 0; +} + /* * Called from the main thread on the primary for packets * arriving over the socket from the primary. 
@@ -1289,6 +1306,14 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) return; } +if (!s->vnet_hdr && +qemu_opts_foreach(qemu_find_opts("object"), + colo_vnet_driver_check, NULL, NULL)) { +error_setg(errp, "colo compare needs 'vnet_hdr_support' " + "when colo filter modules work on virtio-net"); +return; +} + net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize, s->vnet_hdr); net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize, s->vnet_hdr); diff --git a/net/colo.c b/net/colo.c index 3a3e6e89a0..4a03780f45 100644 --- a/net/colo.c +++ b/net/colo.c @@ -243,3 +243,23 @@ bool connection_has_tracked(GHashTable *connection_track_table, return conn ? true : false; } + +/* check the network driver related to COLO, return 1 if it is virtio-net */ +int vnet_driver_check(void *opaque, QemuOpts *opts, Error **errp) +{ +const char *driver_type, *netdev_from_driver; +char *netdev_from_filter = (char *)opaque; + +driver_type = qemu_opt_get(opts, "driver"); +netdev_from_driver = qemu_opt_get(opts, "netdev"); + +if (!driver_type || !netdev_from_driver || !netdev_from_filter) { +return 0; +} + +if (g_str_has_prefix(driver_type, "virtio-net") && +strcmp(netdev_from_driver, netdev_from_filter) == 0) { +return 1; +} +return 0; +} diff --git a/net/colo.h b/net/colo.h index d91cd245c4..d401fc76b6 100644 --- a/net/colo.h +++ b/net/colo.h @@ -18,6 +18,9 @@ #include "qemu/jhash.h" #include "qemu/timer.h" #include "net/eth.h" +#include "qemu/option.h" +#include "qemu/option_int.h" +#include "qemu/config-file.h" #define HASHTABLE_MAX_SIZE 16384 @@ -104,5 +107,6 @@ Packet *packet_new(const void *data, int size, int vnet_hdr_len); Packet *packet_new_nocopy(void *data, int size, int vnet_hdr_len); void packet_destroy(void *opaque, void *user_data); void packet_destroy_partial(void *opaque, void *user_data); +int vnet_driver_check(void *opaque, QemuOpts *opts, Error **errp); #endif /* NET_COLO_H */ diff --git a/net/filter-mirror.c b/net/filter-mirror.c index 
f20240cc9f..b8b3f2fe1d 100644 --- a/net/filter-mirror.c +++ b/net/filter-mirror.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "net/filter.h" #include "net/net.h" +#include "net/colo.h" #include "qapi/error.h" #include "qom/object.h" #include "qemu/main-loop.h" @@ -224,6 +225,14 @@ static void filter_mirror_setup(NetFilterState *nf, Error **errp) return; } +if (!s->vnet_hdr && +qemu_opts_foreach(qemu_find_opts("device"), + vnet_driver_check, nf->netdev_id, NULL)) { +error_setg(errp, "filter mirror needs 'vnet_hdr_support' " + "when network driver is virtio-net"); +return; +} + qemu_chr_fe_init(&s->chr_out, chr, errp); } @@ -252,6 +261,14 @@ static void filter_redirector_setup(N
[PATCH] net/colo: check vnet_hdr_support flag when using virtio-net
When COLO use only one vnet_hdr_support parameter between COLO network filter(filter-mirror, filter-redirector or filter-rewriter and colo-compare, packet will not be parsed correctly. Acquire network driver related to COLO, if it is nirtio-net, check vnet_hdr_support flag of COLO network filter and colo-compare. Signed-off-by: Tao Xu Signed-off-by: Zhang Chen --- net/colo-compare.c| 25 + net/colo.c| 20 net/colo.h| 4 net/filter-mirror.c | 17 + net/filter-rewriter.c | 9 + 5 files changed, 75 insertions(+) diff --git a/net/colo-compare.c b/net/colo-compare.c index b100e7b51f..bc1cc951c0 100644 --- a/net/colo-compare.c +++ b/net/colo-compare.c @@ -838,6 +838,23 @@ static int compare_chr_can_read(void *opaque) return COMPARE_READ_LEN_MAX; } +/* check vnet_hdr_support flag through COLO filter modules */ +static int colo_vnet_driver_check(void *opaque, QemuOpts *opts, Error **errp) +{ +const char *colo_obj_type; + +colo_obj_type = qemu_opt_get(opts, "qom-type"); + +if (strcmp(colo_obj_type, "filter-mirror") == 0 || +strcmp(colo_obj_type, "filter-redirector") == 0 || +strcmp(colo_obj_type, "filter-rewriter") == 0) { +if (qemu_opt_get(opts, "vnet_hdr_support")) { +return 1; +} +} +return 0; +} + /* * Called from the main thread on the primary for packets * arriving over the socket from the primary. @@ -1289,6 +1306,14 @@ static void colo_compare_complete(UserCreatable *uc, Error **errp) return; } +if (!s->vnet_hdr && +qemu_opts_foreach(qemu_find_opts("object"), + colo_vnet_driver_check, NULL, NULL)) { +error_setg(errp, "colo compare needs 'vnet_hdr_support' " + "when colo filter modules work on virtio-net"); +return; +} + net_socket_rs_init(&s->pri_rs, compare_pri_rs_finalize, s->vnet_hdr); net_socket_rs_init(&s->sec_rs, compare_sec_rs_finalize, s->vnet_hdr); diff --git a/net/colo.c b/net/colo.c index 3a3e6e89a0..4a03780f45 100644 --- a/net/colo.c +++ b/net/colo.c @@ -243,3 +243,23 @@ bool connection_has_tracked(GHashTable *connection_track_table, return conn ? 
true : false; } + +/* check the network driver related to COLO, return 1 if it is virtio-net */ +int vnet_driver_check(void *opaque, QemuOpts *opts, Error **errp) +{ +const char *driver_type, *netdev_from_driver; +char *netdev_from_filter = (char *)opaque; + +driver_type = qemu_opt_get(opts, "driver"); +netdev_from_driver = qemu_opt_get(opts, "netdev"); + +if (!driver_type || !netdev_from_driver || !netdev_from_filter) { +return 0; +} + +if (g_str_has_prefix(driver_type, "virtio-net") && +strcmp(netdev_from_driver, netdev_from_filter) == 0) { +return 1; +} +return 0; +} diff --git a/net/colo.h b/net/colo.h index d91cd245c4..d401fc76b6 100644 --- a/net/colo.h +++ b/net/colo.h @@ -18,6 +18,9 @@ #include "qemu/jhash.h" #include "qemu/timer.h" #include "net/eth.h" +#include "qemu/option.h" +#include "qemu/option_int.h" +#include "qemu/config-file.h" #define HASHTABLE_MAX_SIZE 16384 @@ -104,5 +107,6 @@ Packet *packet_new(const void *data, int size, int vnet_hdr_len); Packet *packet_new_nocopy(void *data, int size, int vnet_hdr_len); void packet_destroy(void *opaque, void *user_data); void packet_destroy_partial(void *opaque, void *user_data); +int vnet_driver_check(void *opaque, QemuOpts *opts, Error **errp); #endif /* NET_COLO_H */ diff --git a/net/filter-mirror.c b/net/filter-mirror.c index f20240cc9f..b8b3f2fe1d 100644 --- a/net/filter-mirror.c +++ b/net/filter-mirror.c @@ -12,6 +12,7 @@ #include "qemu/osdep.h" #include "net/filter.h" #include "net/net.h" +#include "net/colo.h" #include "qapi/error.h" #include "qom/object.h" #include "qemu/main-loop.h" @@ -224,6 +225,14 @@ static void filter_mirror_setup(NetFilterState *nf, Error **errp) return; } +if (!s->vnet_hdr && +qemu_opts_foreach(qemu_find_opts("device"), + vnet_driver_check, nf->netdev_id, NULL)) { +error_setg(errp, "filter mirror needs 'vnet_hdr_support' " + "when network driver is virtio-net"); +return; +} + qemu_chr_fe_init(&s->chr_out, chr, errp); } @@ -252,6 +261,14 @@ static void 
filter_redirector_setup(NetFilterState *nf, Error **errp) } } +if (!s->vnet_hdr && +qemu_opts_foreach(qemu_find_opts("device"), +
[PATCH] iotests: Fix typo in iotest 051
There is a typo in iotest 051; correct it. Signed-off-by: Tao Xu --- tests/qemu-iotests/051| 2 +- tests/qemu-iotests/051.pc.out | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/qemu-iotests/051 b/tests/qemu-iotests/051 index f92161d8ef..1595babe82 100755 --- a/tests/qemu-iotests/051 +++ b/tests/qemu-iotests/051 @@ -209,7 +209,7 @@ case "$QEMU_DEFAULT_MACHINE" in # virtio-blk enables the iothread only when the driver initialises the # device, so a second virtio-blk device can't be added even with the # same iothread. virtio-scsi allows this. -run_qemu $iothread -device virtio-blk-pci,drive=disk,iohtread=iothread0,share-rw=on +run_qemu $iothread -device virtio-blk-pci,drive=disk,iothread=iothread0,share-rw=on run_qemu $iothread -device virtio-scsi,id=virtio-scsi1,iothread=thread0 -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on ;; *) diff --git a/tests/qemu-iotests/051.pc.out b/tests/qemu-iotests/051.pc.out index a28e3fc124..a43086bb41 100644 --- a/tests/qemu-iotests/051.pc.out +++ b/tests/qemu-iotests/051.pc.out @@ -183,9 +183,9 @@ Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id QEMU X.Y.Z monitor - type 'help' for more information (qemu) QEMU_PROG: -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on: Cannot change iothread of active block backend -Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-blk-pci,drive=disk,iohtread=iothread0,share-rw=on +Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-blk-pci,drive=disk,iothread=iothread0,share-rw=on QEMU X.Y.Z monitor - type 'help' for more information -(qemu) QEMU_PROG: -device 
virtio-blk-pci,drive=disk,iohtread=iothread0,share-rw=on: Cannot change iothread of active block backend +(qemu) QEMU_PROG: -device virtio-blk-pci,drive=disk,iothread=iothread0,share-rw=on: Cannot change iothread of active block backend Testing: -drive file=TEST_DIR/t.qcow2,if=none,node-name=disk -object iothread,id=thread0 -device virtio-scsi,iothread=thread0,id=virtio-scsi0 -device scsi-hd,bus=virtio-scsi0.0,drive=disk,share-rw=on -device virtio-scsi,id=virtio-scsi1,iothread=thread0 -device scsi-hd,bus=virtio-scsi1.0,drive=disk,share-rw=on QEMU X.Y.Z monitor - type 'help' for more information -- 2.25.1
[Bug 1920871] [NEW] netperf UDP_STREAM high packet loss on QEMU tap network
Public bug reported: Hi, I boot a guest with "-netdev tap,id=hn0,vhost=off,br=br0,helper=/usr/local/libexec/qemu-bridge- helper" network option, and using "netperf -H IP -t UDP_STREAM" to test guest UDP performance, I got the following output: Socket Message Elapsed Messages SizeSize Time Okay Errors Throughput bytes bytessecs# # 10^6bits/sec 212992 65507 10.00 144710 07583.56 212992 10.00 32 1.68 We can find most of UDP packets are lost. But I test another host machine or use "-netdev usr,x". I can got: Socket Message Elapsed Messages SizeSize Time Okay Errors Throughput bytes bytessecs# # 10^6bits/sec 212992 65507 10.00 18351 0 961.61 212992 10.00 18350961.56 most of UDP packets are recived. And If we check the tap qemu used, we can see: ifconfig tap0 tap0: flags=4419 mtu 1500 inet6 fe80::ecc6:21ff:fe6f:b174 prefixlen 64 scopeid 0x20 ether ee:c6:21:6f:b1:74 txqueuelen 1000 (Ethernet) RX packets 282 bytes 30097 (29.3 KiB) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 9086214 bytes 12731596673 (11.8 GiB) TX errors 0 dropped 16349024 overruns 0 carrier 0 collisions 0 lots of TX packets are dropped. list other packet size: ➜ boot netperf -H 192.168.199.200 -t UDP_STREAM -- -m 1 MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.199.200 () port 0 AF_INET Socket Message Elapsed Messages SizeSize Time Okay Errors Throughput bytes bytessecs# # 10^6bits/sec 212992 1 10.00 2297941 0 1.84 212992 10.00 1462024 1.17 ➜ boot netperf -H 192.168.199.200 -t UDP_STREAM -- -m 128 MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.199.200 () port 0 AF_INET Socket Message Elapsed Messages SizeSize Time Okay Errors Throughput bytes bytessecs# # 10^6bits/sec 212992 128 10.00 2311547 0 236.70 212992 10.00 1359834139.25 ** Affects: qemu Importance: Undecided Status: New -- You received this bug notification because you are a member of qemu- devel-ml, which is subscribed to QEMU. 
https://bugs.launchpad.net/bugs/1920871 Title: netperf UDP_STREAM high packet loss on QEMU tap network Status in QEMU: New Bug description: Hi, I boot a guest with "-netdev tap,id=hn0,vhost=off,br=br0,helper=/usr/local/libexec/qemu-bridge- helper" network option, and using "netperf -H IP -t UDP_STREAM" to test guest UDP performance, I got the following output: Socket Message Elapsed Messages SizeSize Time Okay Errors Throughput bytes bytessecs# # 10^6bits/sec 212992 65507 10.00 144710 07583.56 212992 10.00 32 1.68 We can find most of UDP packets are lost. But I test another host machine or use "-netdev usr,x". I can got: Socket Message Elapsed Messages SizeSize Time Okay Errors Throughput bytes bytessecs# # 10^6bits/sec 212992 65507 10.00 18351 0 961.61 212992 10.00 18350961.56 most of UDP packets are recived. And If we check the tap qemu used, we can see: ifconfig tap0 tap0: flags=4419 mtu 1500 inet6 fe80::ecc6:21ff:fe6f:b174 prefixlen 64 scopeid 0x20 ether ee:c6:21:6f:b1:74 txqueuelen 1000 (Ethernet) RX packets 282 bytes 30097 (29.3 KiB) RX errors 0 dropped 0 overruns 0 frame 0 TX packets 9086214 bytes 12731596673 (11.8 GiB) TX errors 0 dropped 16349024 overruns 0 carrier 0 collisions 0 lots of TX packets are dropped. list other packet size: ➜ boot netperf -H 192.168.199.200 -t UDP_STREAM -- -m 1 MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.199.200 () port 0 AF_INET Socket Message Elapsed Messages SizeSize Time Okay Errors Throughput bytes bytessecs# # 10^6bits/sec 212992 1 10.00 2297941 0 1.84 212992 10.00 1462024 1.17 ➜ boot netperf -H 192.168.199.200 -t UDP_STREAM -- -m 128 MIGRATED UDP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to 192.168.199.200 () port 0 AF_INET Socket Message Elapsed Messages SizeSize Time Okay Errors Throughput bytes bytessecs# # 10^6bits/sec 212992 128 10.00 2311547 0 236.70 212992 10.00 1359834
Re: [PATCH] quorum: Implement bdrv_co_block_status()
I tested this patch with COLO; it resolves the issue of the qcow2 image becoming larger after drive-mirror. Thank you! Tested-by: Tao Xu On 11/5/2020 2:04 AM, Alberto Garcia wrote: The quorum driver does not implement bdrv_co_block_status() and because of that it always reports to contain data even if all its children are known to be empty. One consequence of this is that if we for example create a quorum with a size of 10GB and we mirror it to a new image the operation will write 10GB of actual zeroes to the destination image wasting a lot of time and disk space. Since a quorum has an arbitrary number of children of potentially different formats there is no way to report all possible allocation status flags in a way that makes sense, so this implementation only reports when a given region is known to contain zeroes (BDRV_BLOCK_ZERO) or not (BDRV_BLOCK_DATA). If all children agree that a region contains zeroes then we can return BDRV_BLOCK_ZERO using the smallest size reported by the children (because all agree that a region of at least that size contains zeroes). If at least one child disagrees we have to return BDRV_BLOCK_DATA. In this case we use the largest of the sizes reported by the children that didn't return BDRV_BLOCK_ZERO (because we know that there won't be an agreement for at least that size). 
Signed-off-by: Alberto Garcia --- block/quorum.c | 49 tests/qemu-iotests/312 | 148 + tests/qemu-iotests/312.out | 67 + tests/qemu-iotests/group | 1 + 4 files changed, 265 insertions(+) create mode 100755 tests/qemu-iotests/312 create mode 100644 tests/qemu-iotests/312.out diff --git a/block/quorum.c b/block/quorum.c index e846a7e892..29cee42705 100644 --- a/block/quorum.c +++ b/block/quorum.c @@ -18,6 +18,7 @@ #include "qemu/module.h" #include "qemu/option.h" #include "block/block_int.h" +#include "block/coroutines.h" #include "block/qdict.h" #include "qapi/error.h" #include "qapi/qapi-events-block.h" @@ -1174,6 +1175,53 @@ static void quorum_child_perm(BlockDriverState *bs, BdrvChild *c, | DEFAULT_PERM_UNCHANGED; } +/* + * Each one of the children can report different status flags even + * when they contain the same data, so what this function does is + * return BDRV_BLOCK_ZERO if *all* children agree that a certain + * region contains zeroes, and BDRV_BLOCK_DATA otherwise. + */ +static int coroutine_fn quorum_co_block_status(BlockDriverState *bs, + bool want_zero, + int64_t offset, int64_t count, + int64_t *pnum, int64_t *map, + BlockDriverState **file) +{ +BDRVQuorumState *s = bs->opaque; +int i, ret; +int64_t pnum_zero = count; +int64_t pnum_data = 0; + +for (i = 0; i < s->num_children; i++) { +int64_t bytes; +ret = bdrv_co_common_block_status_above(s->children[i]->bs, NULL, false, +want_zero, offset, count, +&bytes, NULL, NULL, NULL); +if (ret < 0) { +return ret; +} +/* + * Even if all children agree about whether there are zeroes + * or not at @offset they might disagree on the size, so use + * the smallest when reporting BDRV_BLOCK_ZERO and the largest + * when reporting BDRV_BLOCK_DATA. 
+ */ +if (ret & BDRV_BLOCK_ZERO) { +pnum_zero = MIN(pnum_zero, bytes); +} else { +pnum_data = MAX(pnum_data, bytes); +} +} + +if (pnum_data) { +*pnum = pnum_data; +return BDRV_BLOCK_DATA; +} else { +*pnum = pnum_zero; +return BDRV_BLOCK_ZERO; +} +} + static const char *const quorum_strong_runtime_opts[] = { QUORUM_OPT_VOTE_THRESHOLD, QUORUM_OPT_BLKVERIFY, @@ -1192,6 +1240,7 @@ static BlockDriver bdrv_quorum = { .bdrv_close = quorum_close, .bdrv_gather_child_options = quorum_gather_child_options, .bdrv_dirname = quorum_dirname, +.bdrv_co_block_status = quorum_co_block_status, .bdrv_co_flush_to_disk = quorum_co_flush, diff --git a/tests/qemu-iotests/312 b/tests/qemu-iotests/312 new file mode 100755 index 00..1b08f1552f --- /dev/null +++ b/tests/qemu-iotests/312 @@ -0,0 +1,148 @@ +#!/usr/bin/env bash +# +# Test drive-mirror with quorum +# +# The goal of this test is to check how the quorum driver reports +# regions that are known to read as zeroes (BDRV_BLOCK_ZERO). The idea +# is that drive-mirror will try the efficient representation of zeroes +# in the destinati
Re: [PATCH 3/3] numa: Initialize node initiator with respect to .has_cpu
On 6/3/20 5:16 PM, Michal Privoznik wrote: On 6/2/20 10:00 AM, Tao Xu wrote: On 6/1/2020 4:10 PM, Michal Privoznik wrote: On 5/29/20 5:09 PM, Igor Mammedov wrote: On Fri, 29 May 2020 15:33:48 +0200 Michal Privoznik wrote: The initiator attribute of a NUMA node is documented as the 'NUMA node that has best performance to given NUMA node'. If a NUMA node has at least one CPU there can hardly be a different node with better performance and thus all NUMA nodes which have a CPU are initiators to themselves. Reflect this fact when initializing the attribute. That is not true if the node is memory-less. Are you saying that if there's a memory-less NUMA node, then it needs to have initiator set too? Asking mostly out of curiosity because we don't allow memory-less NUMA nodes in Libvirt just yet. Nor cpu-less, but my patches that I'm referring to in cover letter will allow at least cpu-less nodes. Should I allow both? QEMU does not currently support memory-less NUMA nodes, although hardware may support them, so we reserve this type of NUMA node for future use. QEMU can already support cpu-less NUMA nodes, for emulating some "slow" memory (like some NVDIMMs). Oh yeah, I understand that. But it doesn't explain why initiator needs to be specified for NUMA nodes with cpus and memory, or does it? Maybe I'm still misunderstanding what the initiator is. Yes, the initiator of a NUMA node with cpus and memory should be the node itself. In the ACPI 6.3 spec, the initiator is defined as: This field is valid only if the memory controller responsible for satisfying the access to memory belonging to the specified memory proximity domain is directly attached to an initiator that belongs to a proximity domain. In that case, this field contains the integer that represents the proximity domain to which the initiator (Generic Initiator or Processor) belongs. 
This number shall match the corresponding entry in the SRAT table’s processor affinity structure (e.g., Processor Local APIC/SAPIC Affinity Structure, Processor Local x2APIC Affinity Structure, GICC Affinity Structure) if the initiator is a processor, or the Generic Initiator Affinity Structure if the initiator is a generic initiator. Note: this field provides additional information as to the initiator node that is closest (as in directly attached) to the memory address ranges within the specified memory proximity domain, and therefore should provide the best performance. And if, in the future, there is a memory-less NUMA node: because the HMAT describes the "Memory" Proximity Domain Attributes Structure, I think we should not add memory-less NUMA nodes to the HMAT. Also, can you shed more light into why machine_set_cpu_numa_node() did not override the .initiator? And this one is still unanswered too. Because from user's perspective, initiator has to be set on all NUMA nodes (if HMAT is enabled) and it seems like this auto assignment code is not run/not working. Michal So we check the HMAT configuration in numa_validate_initiator(NumaState *numa_state) in hw/core/machine.c, because the initiator of a NUMA node with cpus and memory should be the node itself. And in machine_set_cpu_numa_node() we do not auto-assign the initiator; we just use the user's setting from the command line (although there is only one right choice for NUMA nodes with cpus and memory). But I don't know whether it is appropriate to auto-assign the initiator for NUMA nodes with cpus and memory.
Re: [PATCH 3/3] numa: Initialize node initiator with respect to .has_cpu
On 6/1/2020 4:10 PM, Michal Privoznik wrote: On 5/29/20 5:09 PM, Igor Mammedov wrote: On Fri, 29 May 2020 15:33:48 +0200 Michal Privoznik wrote: The initiator attribute of a NUMA node is documented as the 'NUMA node that has best performance to given NUMA node'. If a NUMA node has at least one CPU there can hardly be a different node with better performance and thus all NUMA nodes which have a CPU are initiators to themselves. Reflect this fact when initializing the attribute. That is not true if the node is memory-less. Are you saying that if there's a memory-less NUMA node, then it needs to have initiator set too? Asking mostly out of curiosity because we don't allow memory-less NUMA nodes in Libvirt just yet. Nor cpu-less, but my patches that I'm referring to in cover letter will allow at least cpu-less nodes. Should I allow both? QEMU does not currently support memory-less NUMA nodes, although hardware may support them, so we reserve this type of NUMA node for future use. QEMU can already support cpu-less NUMA nodes, for emulating some "slow" memory (like some NVDIMMs). Also, can you shed more light into why machine_set_cpu_numa_node() did not override the .initiator? Thanks, Michal
Re: [PATCH v4] target/i386: Add notes for versioned CPU models
Hi Eduardo Could you review this patch? Tao Xu On 3/24/2020 1:10 PM, Xu, Tao3 wrote: Add which features are added or removed in this version. Signed-off-by: Tao Xu --- The output is as follows: qemu-system-x86_64 -cpu help | grep "\[" x86 Cascadelake-Server-v2 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES] x86 Cascadelake-Server-v3 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES, no TSX] x86 Denverton-v2 Intel Atom Processor (Denverton) [no MPX, no MONITOR] x86 Icelake-Client-v2 Intel Core Processor (Icelake) [no TSX] x86 Icelake-Server-v2 Intel Xeon Processor (Icelake) [no TSX] Changes in v3: - Keep the existing custom model-id (Eduardo) Changes in v2: - correct the note of Cascadelake v3 (Xiaoyao) --- target/i386/cpu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 34b511f078..1c7690baa0 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3192,6 +3192,7 @@ static X86CPUDefinition builtin_x86_defs[] = { .versions = (X86CPUVersionDefinition[]) { { .version = 1 }, { .version = 2, + .note = "ARCH_CAPABILITIES", .props = (PropValue[]) { { "arch-capabilities", "on" }, { "rdctl-no", "on" }, @@ -3203,6 +3204,7 @@ static X86CPUDefinition builtin_x86_defs[] = { }, { .version = 3, .alias = "Cascadelake-Server-noTSX", + .note = "ARCH_CAPABILITIES, no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, @@ -3424,6 +3426,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, +.note = "no TSX", .alias = "Icelake-Client-noTSX", .props = (PropValue[]) { { "hle", "off" }, @@ -3541,6 +3544,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, +.note = "no TSX", .alias = "Icelake-Server-noTSX", .props = (PropValue[]) { { "hle", "off" }, @@ -3648,6 +3652,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, +.note = "no MPX, no MONITOR", .props = (PropValue[]) { { "monitor", "off" }, { "mpx", "off" },
Re: Migration with ``drive-mirror`` + NBD will let quorum qcow2 image become larger
On 5/19/2020 10:49 PM, Alberto Garcia wrote: On Tue 19 May 2020 11:15:44 AM CEST, Kevin Wolf wrote: But maybe it could return a limited set of flags at least so that the mirror job can get the BDRV_BLOCK_ZERO information if the quorum children agree on it. Yeah, maybe it is possible to implement a conservative version of that function and fall back to BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED (or something like that) in the cases where there's no clear alternative. Berto Thank you Kevin and Berto for your suggestion.
Migration with ``drive-mirror`` + NBD will let quorum qcow2 image become larger
Hi, I am using ``drive-mirror`` + NBD for live storage migration. But I find that if I use a qcow2 image(virtual size: 10 GiB, disk size: 1.8 GiB) as a child of quorum, then the destination image become larger(virtual size: 10 GiB, disk size: 10 GiB). However if I use a qcow2 image directly, then the destination image(virtual size: 10 GiB, disk size: 1.8 GiB) will be equal to the source. So I am wondering if my usage is wrong or it is expected with quorum+drive-mirror? P.S. Detail: 1) [On *destination* Host]: qemu-img create -f qcow2 fedora32.qcow2 10G Formatting 'fedora32.qcow2', fmt=qcow2 size=10737418240 cluster_size=65536 lazy_refcounts=off refcount_bits=16 qemu-img info fedora32.qcow2 image: fedora32.qcow2 file format: qcow2 virtual size: 10 GiB (10737418240 bytes) disk size: 196 KiB cluster_size: 65536 Format specific information: compat: 1.1 lazy refcounts: false refcount bits: 16 corrupt: false Boot the QEMU using: disk_path=fedora32.qcow2 net_param="-netdev tap,id=hn0,vhost=off,br=br0,helper=/usr/local/libexec/qemu-bridge-helper -device rtl8139,id=e0,netdev=hn0" cmdline="qemu-system-x86_64 \ -enable-kvm \ -m 2G -smp 4 -qmp stdio -bios OVMF.fd \ -monitor telnet:127.0.0.1:,nowait,server -vnc :7 -rtc base=utc \ -cpu host -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 \ -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \ -device usb-tablet,id=input0,bus=usb.0,port=1 $net_param \ -drive if=none,id=parent0,file.filename=$disk_path,driver=qcow2 \ -incoming tcp:0:" exec $cmdline [On *destination* QEMU]: {'execute':'qmp_capabilities'} {'execute': 'nbd-server-start', 'arguments': {'addr': {'type': 'inet', 'data': {'host': '192.168.0.33', 'port': '8889'} } } } {'execute': 'nbd-server-add', 'arguments': {'device': 'parent0', 'writable': true } } 2) [On *source* Host]: Boot the QEMU using: disk_path=fedora32.qcow2 net_param="-netdev tap,id=hn0,vhost=off,br=br0,helper=/usr/local/libexec/qemu-bridge-helper -device rtl8139,id=e0,netdev=hn0" cmdline="qemu-system-x86_64 
\ -enable-kvm \ -m 2G -smp 4 -qmp stdio -bios OVMF.fd \ -monitor telnet:127.0.0.1:,nowait,server -vnc :7 -rtc base=utc \ -cpu host -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 \ -device piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \ -device usb-tablet,id=input0,bus=usb.0,port=1 $net_param \ -drive if=virtio,id=colo-disk0,driver=quorum,vote-threshold=1,children.0.file.filename=$disk_path,children.0.driver=qcow2" exec $cmdline [On *source* QEMU]: {'execute':'qmp_capabilities'} {'execute': 'drive-mirror', 'arguments':{ 'device': 'colo-disk0', 'job-id': 'resync', 'target': 'nbd://192.168.0.33:8889/parent0', 'mode': 'existing', 'format': 'nbd', 'sync': 'full'} } {"timestamp": {"seconds": 1589902560, "microseconds": 107418}, "event": "JOB_STATUS_CHANGE", "data": {"status": "created", "id": "resync"}} {"timestamp": {"seconds": 1589902560, "microseconds": 107487}, "event": "JOB_STATUS_CHANGE", "data": {"status": "running", "id": "resync"}} {"return": {}} {"timestamp": {"seconds": 1589902721, "microseconds": 439095}, "event": "JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "resync"}} {"timestamp": {"seconds": 1589902721, "microseconds": 439194}, "event": "BLOCK_JOB_READY", "data": {"device": "resync", "len": 10739253248, "offset": 10739253248, "speed": 0, "type": "mirror"}} 3)[On *destination* Host]: qemu-img info fedora32.qcow2 image: fedora32.qcow2 file format: qcow2 virtual size: 10 GiB (10737418240 bytes) disk size: 10 GiB cluster_size: 65536 Format specific information: compat: 1.1 lazy refcounts: false refcount bits: 16 corrupt: false 4)But if [On *source* Host] boot qemu using: disk_path=fedora32.qcow2 net_param="-netdev tap,id=hn0,vhost=off,br=br0,helper=/usr/local/libexec/qemu-bridge-helper -device rtl8139,id=e0,netdev=hn0" cmdline="qemu-system-x86_64 \ -enable-kvm \ -m 2G -smp 4 -qmp stdio -bios OVMF.fd \ -monitor telnet:127.0.0.1:,nowait,server -vnc :7 -rtc base=utc \ -cpu host -device cirrus-vga,id=video0,bus=pci.0,addr=0x2 \ -device 
piix3-usb-uhci,id=usb,bus=pci.0,addr=0x1.0x2 \ -device usb-tablet,id=input0,bus=usb.0,port=1 $net_param \ -drive if=virtio,id=parent0,file.filename=$disk_path,driver=qcow2" exec $cmdline Then [On *destination* Host]: qemu-img info fedora32.qcow2 image: fedora32.qcow2 file format: qcow2 virtual size: 10 GiB (10737418240 bytes) disk size: 1.8 GiB cluster_size: 65536 Format specific information: compat: 1.1 lazy refcounts: false refcount bits: 16 corrupt: false
Re: [PATCH v4] target/i386: Add notes for versioned CPU models
Ping for comments On 3/24/2020 1:10 PM, Xu, Tao3 wrote: Add which features are added or removed in this version. Signed-off-by: Tao Xu --- The output is as follows: qemu-system-x86_64 -cpu help | grep "\[" x86 Cascadelake-Server-v2 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES] x86 Cascadelake-Server-v3 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES, no TSX] x86 Denverton-v2 Intel Atom Processor (Denverton) [no MPX, no MONITOR] x86 Icelake-Client-v2 Intel Core Processor (Icelake) [no TSX] x86 Icelake-Server-v2 Intel Xeon Processor (Icelake) [no TSX] Changes in v3: - Keep the existing custom model-id (Eduardo) Changes in v2: - correct the note of Cascadelake v3 (Xiaoyao) --- target/i386/cpu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 34b511f078..1c7690baa0 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3192,6 +3192,7 @@ static X86CPUDefinition builtin_x86_defs[] = { .versions = (X86CPUVersionDefinition[]) { { .version = 1 }, { .version = 2, + .note = "ARCH_CAPABILITIES", .props = (PropValue[]) { { "arch-capabilities", "on" }, { "rdctl-no", "on" }, @@ -3203,6 +3204,7 @@ static X86CPUDefinition builtin_x86_defs[] = { }, { .version = 3, .alias = "Cascadelake-Server-noTSX", + .note = "ARCH_CAPABILITIES, no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, @@ -3424,6 +3426,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, +.note = "no TSX", .alias = "Icelake-Client-noTSX", .props = (PropValue[]) { { "hle", "off" }, @@ -3541,6 +3544,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, +.note = "no TSX", .alias = "Icelake-Server-noTSX", .props = (PropValue[]) { { "hle", "off" }, @@ -3648,6 +3652,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, +.note = "no MPX, no MONITOR", .props = (PropValue[]) { { "monitor", "off" }, { "mpx", "off" },
[PATCH v4] target/i386: Add notes for versioned CPU models
Add which features are added or removed in this version. Signed-off-by: Tao Xu --- The output is as follows: qemu-system-x86_64 -cpu help | grep "\[" x86 Cascadelake-Server-v2 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES] x86 Cascadelake-Server-v3 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES, no TSX] x86 Denverton-v2 Intel Atom Processor (Denverton) [no MPX, no MONITOR] x86 Icelake-Client-v2 Intel Core Processor (Icelake) [no TSX] x86 Icelake-Server-v2 Intel Xeon Processor (Icelake) [no TSX] Changes in v3: - Keep the existing custom model-id (Eduardo) Changes in v2: - correct the note of Cascadelake v3 (Xiaoyao) --- target/i386/cpu.c | 5 + 1 file changed, 5 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 34b511f078..1c7690baa0 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3192,6 +3192,7 @@ static X86CPUDefinition builtin_x86_defs[] = { .versions = (X86CPUVersionDefinition[]) { { .version = 1 }, { .version = 2, + .note = "ARCH_CAPABILITIES", .props = (PropValue[]) { { "arch-capabilities", "on" }, { "rdctl-no", "on" }, @@ -3203,6 +3204,7 @@ static X86CPUDefinition builtin_x86_defs[] = { }, { .version = 3, .alias = "Cascadelake-Server-noTSX", + .note = "ARCH_CAPABILITIES, no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, @@ -3424,6 +3426,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, +.note = "no TSX", .alias = "Icelake-Client-noTSX", .props = (PropValue[]) { { "hle", "off" }, @@ -3541,6 +3544,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, +.note = "no TSX", .alias = "Icelake-Server-noTSX", .props = (PropValue[]) { { "hle", "off" }, @@ -3648,6 +3652,7 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 1 }, { .version = 2, +.note = "no MPX, no MONITOR", .props = (PropValue[]) { { "monitor", "off" }, { "mpx", "off" }, -- 2.20.1
Re: [PATCH v2] target/i386: Add ARCH_CAPABILITIES related bits into Icelake-Server CPU model
On 3/24/2020 2:39 AM, Eduardo Habkost wrote: On Mon, Mar 23, 2020 at 10:58:16AM +0800, Xiaoyao Li wrote: On 3/23/2020 10:32 AM, Tao Xu wrote: Hi Xiaoyao, Maybe you can add .note for this new version. for example: + .version = 3, + .note = "ARCH_CAPABILITIES", + .props = (PropValue[]) { Hi Paolo and Eduardo, Do I need to spin a new version to add the .note? Maybe you can add it when queuing? Please send a follow up patch so we don't hold a bug fix because of something that's just cosmetic. I will queue this patch. We still need a new version of "target/i386: Add notes for versioned CPU models"[1], don't we? [1] https://lore.kernel.org/qemu-devel/20200228215253.gb494...@habkost.net/ I am sorry for misunderstanding your comments in that patch[1]. I will submit a new version of this patch.
Re: [PATCH v2] target/i386: Add ARCH_CAPABILITIES related bits into Icelake-Server CPU model
Hi Xiaoyao, May be you can add .note for this new version. for example: +.version = 3, +.note = "ARCH_CAPABILITIES", +.props = (PropValue[]) { On 3/16/2020 5:56 PM, Xiaoyao Li wrote: Current Icelake-Server CPU model lacks all the features enumerated by MSR_IA32_ARCH_CAPABILITIES. Add them, so that guest of "Icelake-Server" can see all of them. Signed-off-by: Xiaoyao Li --- v2: - Add it as a new version. --- target/i386/cpu.c | 13 + 1 file changed, 13 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 92fafa265914..5fba6a2ad6b3 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3496,6 +3496,19 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } }, }, +{ +.version = 3, +.props = (PropValue[]) { +{ "arch-capabilities", "on" }, +{ "rdctl-no", "on" }, +{ "ibrs-all", "on" }, +{ "skip-l1dfl-vmentry", "on" }, +{ "mds-no", "on" }, +{ "pschange-mc-no", "on" }, +{ "taa-no", "on" }, +{ /* end of list */ } +}, +}, { /* end of list */ } } }, -- 2.20.1
Re: [PATCH v3 2/4] target/i386: Remove monitor from some CPU models
On 3/3/2020 1:19 AM, Eduardo Habkost wrote: On Mon, Mar 02, 2020 at 07:47:28PM +0800, Tao Xu wrote: On 2/29/2020 5:39 AM, Eduardo Habkost wrote: On Wed, Feb 12, 2020 at 04:13:26PM +0800, Tao Xu wrote: Add new version of Snowridge, Denverton, Opteron_G3, EPYC, and Dhyana CPU model to remove MONITOR/MWAIT feature. After QEMU/KVM use "-overcommit cpu-pm=on" to expose MONITOR/MWAIT (commit id 6f131f13e68d648a8e4f083c667ab1acd88ce4cd), the MONITOR/MWAIT feature in these CPU models is unused. Signed-off-by: Tao Xu What exactly is the problem you are trying to fix? No CPU model will ever have monitor=on set by default with KVM, because kvm_default_props has a monitor=off element. Maybe it is not a fix. For example, when we boot a guest with Denverton cpu model, guest cannot detect MONITOR/MWAIT and boot with no warning, because of "monitor=off" by default. The MONITOR/MWAIT feature in these CPU models is unused, but does no harm. I am wondering if we should remove it from existing CPU models. As monitor=off is on kvm_default_props, changing the CPU model table will only affect other accelerators (e.g. TCG, where MONITOR/MWAIT support is advertised as supported). We shouldn't be dictating policy for other accelerators just because KVM doesn't support it. Removing the feature on kvm_default_props is sufficient. I understand, thanks.
Re: [PATCH v3 4/4] target/i386: Add notes for versioned CPU models
On 2/29/2020 5:52 AM, Eduardo Habkost wrote: On Wed, Feb 12, 2020 at 04:13:28PM +0800, Tao Xu wrote: Add which features are added or removed in this version. Remove the changed model-id in versioned CPU models, to keep the model name unchanged at /proc/cpuinfo inside the VM. Signed-off-by: Tao Xu --- Changes in v2: - correct the note of Cascadelake v3 (Xiaoyao) --- target/i386/cpu.c | 54 ++- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 81a039beb6..739ef4ce91 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -2278,10 +2278,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Nehalem-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Core i7 9xx (Nehalem Core i7, IBRS update)" }, { /* end of list */ } Changing model-id is guest-visible, so we can't do this. The same applies to the other models where model-id is being removed. I suggest using the .note property only on the CPU model versions that don't have custom model-id set yet, or when existing information on model-id is incomplete. For future CPU model versions, we can start using only .note and stop changing model-id. Got it,thanks!
Re: [PATCH v3 2/4] target/i386: Remove monitor from some CPU models
On 2/29/2020 5:39 AM, Eduardo Habkost wrote: On Wed, Feb 12, 2020 at 04:13:26PM +0800, Tao Xu wrote: Add new version of Snowridge, Denverton, Opteron_G3, EPYC, and Dhyana CPU model to remove MONITOR/MWAIT feature. After QEMU/KVM use "-overcommit cpu-pm=on" to expose MONITOR/MWAIT (commit id 6f131f13e68d648a8e4f083c667ab1acd88ce4cd), the MONITOR/MWAIT feature in these CPU models is unused. Signed-off-by: Tao Xu What exactly is the problem you are trying to fix? No CPU model will ever have monitor=on set by default with KVM, because kvm_default_props has a monitor=off element. Maybe it is not a fix. For example, when we boot a guest with Denverton cpu model, guest cannot detect MONITOR/MWAIT and boot with no warning, because of "monitor=off" by default. The MONITOR/MWAIT feature in these CPU models is unused, but does no harm. I am wondering if we should remove it from existing CPU models.
Re: [PATCH v3 4/4] target/i386: Add notes for versioned CPU models
On 2/12/2020 5:00 PM, Igor Mammedov wrote: On Wed, 12 Feb 2020 16:13:28 +0800 Tao Xu wrote: Add which features are added or removed in this version. Remove the changed model-id in versioned CPU models, to keep the model name unchanged at /proc/cpuinfo inside the VM. Signed-off-by: Tao Xu --- Changes in v2: - correct the note of Cascadelake v3 (Xiaoyao) --- target/i386/cpu.c | 54 ++- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 81a039beb6..739ef4ce91 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c [...] @@ -3142,6 +3130,7 @@ static X86CPUDefinition builtin_x86_defs[] = { .versions = (X86CPUVersionDefinition[]) { { .version = 1 }, { .version = 2, + .note = "ARCH_CAPABILITIES", what's ARCH_CAPABILITIES? These are some features exposed by MSR_IA32_ARCH_CAPABILITIES. For Cascadelake, these are "rdctl-no" "ibrs-all" "skip-l1dfl-vmentry" "mds-no"
[PATCH v3 4/4] target/i386: Add notes for versioned CPU models
Add which features are added or removed in this version. Remove the changed model-id in versioned CPU models, to keep the model name unchanged at /proc/cpuinfo inside the VM. Signed-off-by: Tao Xu --- Changes in v2: - correct the note of Cascadelake v3 (Xiaoyao) --- target/i386/cpu.c | 54 ++- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 81a039beb6..739ef4ce91 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -2278,10 +2278,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Nehalem-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Core i7 9xx (Nehalem Core i7, IBRS update)" }, { /* end of list */ } } }, @@ -2359,10 +2358,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Westmere-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Westmere E56xx/L56xx/X56xx (IBRS update)" }, { /* end of list */ } } }, @@ -2445,10 +2443,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "SandyBridge-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Xeon E312xx (Sandy Bridge, IBRS update)" }, { /* end of list */ } } }, @@ -2537,10 +2534,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "IvyBridge-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Xeon E3-12xx v2 (Ivy Bridge, IBRS)" }, { /* end of list */ } } }, @@ -2634,17 +2630,18 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Haswell-noTSX", +.note = "no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, { "stepping", "1" }, -{ "model-id", "Intel Core Processor (Haswell, no TSX)", }, { /* end of list */ } }, }, { .version = 3, .alias = "Haswell-IBRS", +.note = "IBRS", .props = (PropValue[]) { /* Restore TSX features removed by -v2 above */ { 
"hle", "on" }, @@ -2655,21 +2652,18 @@ static X86CPUDefinition builtin_x86_defs[] = { */ { "stepping", "4" }, { "spec-ctrl", "on" }, -{ "model-id", - "Intel Core Processor (Haswell, IBRS)" }, { /* end of list */ } } }, { .version = 4, .alias = "Haswell-noTSX-IBRS", +.note = "no TSX, IBRS", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, /* spec-ctrl was already enabled by -v3 above */ { "stepping", "1" }, -{ "model-id", - "Intel Core Processor (Haswell, no TSX, IBRS)" }, { /* end of list */ } } }, @@ -2765,35 +2759,33 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Broadwell-noTSX", +.note = "no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", &quo
[PATCH v3 1/4] target/i386: Add Denverton-v2 (no MPX) CPU model
Because MPX is being removed from the linux kernel, remove MPX feature from Denverton. Signed-off-by: Tao Xu --- target/i386/cpu.c | 12 1 file changed, 12 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 32efa46852..848c992cd3 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3592,6 +3592,18 @@ static X86CPUDefinition builtin_x86_defs[] = { .features[FEAT_VMX_VMFUNC] = MSR_VMX_VMFUNC_EPT_SWITCHING, .xlevel = 0x8008, .model_id = "Intel Atom Processor (Denverton)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ "mpx", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Snowridge", -- 2.20.1
[PATCH v3 0/4] Add extra information to versioned CPU models
This series of patches removes MPX from Denverton, removes monitor from some CPU models, and adds additional information to -cpu help to indicate the changes in each version of a CPU model. The output is as follows: ./x86_64-softmmu/qemu-system-x86_64 -cpu help | grep "\[" x86 Broadwell-v2 Intel Core Processor (Broadwell) [no TSX] x86 Broadwell-v3 Intel Core Processor (Broadwell) [IBRS] x86 Broadwell-v4 Intel Core Processor (Broadwell) [no TSX, IBRS] x86 Cascadelake-Server-v2 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES] x86 Cascadelake-Server-v3 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES, no TSX] x86 Denverton-v2 Intel Atom Processor (Denverton) [no MPX, no MONITOR] x86 Dhyana-v2 Hygon Dhyana Processor [no MONITOR] x86 EPYC-v2 AMD EPYC Processor [IBPB] x86 EPYC-v3 AMD EPYC Processor [IBPB, no MONITOR] x86 Haswell-v2 Intel Core Processor (Haswell) [no TSX] x86 Haswell-v3 Intel Core Processor (Haswell) [IBRS] x86 Haswell-v4 Intel Core Processor (Haswell) [no TSX, IBRS] x86 Icelake-Client-v2 Intel Core Processor (Icelake) [no TSX] x86 Icelake-Server-v2 Intel Xeon Processor (Icelake) [no TSX] x86 IvyBridge-v2 Intel Xeon E3-12xx v2 (Ivy Bridge) [IBRS] x86 Nehalem-v2 Intel Core i7 9xx (Nehalem Class Core i7) [IBRS] x86 Opteron_G3-v2 AMD Opteron 23xx (Gen 3 Class Opteron) [no MONITOR] x86 SandyBridge-v2 Intel Xeon E312xx (Sandy Bridge) [IBRS] x86 Skylake-Client-v2 Intel Core Processor (Skylake) [IBRS] x86 Skylake-Client-v3 Intel Core Processor (Skylake) [no TSX, IBRS] x86 Skylake-Server-v2 Intel Xeon Processor (Skylake) [IBRS] x86 Skylake-Server-v3 Intel Xeon Processor (Skylake) [no TSX, IBRS] x86 Snowridge-v2 Intel Atom Processor (SnowRidge) [no MPX] x86 Snowridge-v3 Intel Atom Processor (SnowRidge) [no MPX, no MONITOR] x86 Westmere-v2 Westmere E56xx/L56xx/X56xx (Nehalem-C) [IBRS] Changes in v2: - Rebase - correct the note of Cascadelake v3 (Xiaoyao) Tao Xu (4): target/i386: Add Denverton-v2 (no MPX) CPU model target/i386: Remove monitor from some CPU
models target/i386: Add new property note to versioned CPU models target/i386: Add notes for versioned CPU models target/i386/cpu.c | 115 +- 1 file changed, 84 insertions(+), 31 deletions(-) -- 2.20.1
[PATCH v3 3/4] target/i386: Add new property note to versioned CPU models
Add additional information for -cpu help to indicate the changes in this version of CPU model. Suggested-by: Eduardo Habkost Signed-off-by: Tao Xu --- target/i386/cpu.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 6905e4eabd..81a039beb6 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1690,6 +1690,7 @@ typedef struct PropValue { typedef struct X86CPUVersionDefinition { X86CPUVersion version; const char *alias; +const char *note; PropValue *props; } X86CPUVersionDefinition; @@ -1720,6 +1721,7 @@ struct X86CPUModel { X86CPUDefinition *cpudef; /* CPU model version */ X86CPUVersion version; +const char *note; /* * If true, this is an alias CPU model. * This matters only for "-cpu help" and query-cpu-definitions @@ -4899,6 +4901,7 @@ static void x86_cpu_list_entry(gpointer data, gpointer user_data) g_autofree char *name = x86_cpu_class_get_model_name(cc); g_autofree char *desc = g_strdup(cc->model_description); g_autofree char *alias_of = x86_cpu_class_get_alias_of(cc); +g_autofree char *model_id = x86_cpu_class_get_model_id(cc); if (!desc && alias_of) { if (cc->model && cc->model->version == CPU_VERSION_AUTO) { @@ -4907,11 +4910,14 @@ static void x86_cpu_list_entry(gpointer data, gpointer user_data) desc = g_strdup_printf("(alias of %s)", alias_of); } } +if (!desc && cc->model && cc->model->note) { +desc = g_strdup_printf("%s [%s]", model_id, cc->model->note); +} if (!desc) { -desc = x86_cpu_class_get_model_id(cc); +desc = g_strdup_printf("%s", model_id); } -qemu_printf("x86 %-20s %-48s\n", name, desc); +qemu_printf("x86 %-20s %-58s\n", name, desc); } /* list available CPU models and flags */ @@ -5388,6 +5394,7 @@ static void x86_register_cpudef_types(X86CPUDefinition *def) x86_cpu_versioned_model_name(def, vdef->version); m->cpudef = def; m->version = vdef->version; +m->note = vdef->note; x86_register_cpu_model_type(name, m); if (vdef->alias) { -- 2.20.1
[PATCH v3 2/4] target/i386: Remove monitor from some CPU models
Add new version of Snowridge, Denverton, Opteron_G3, EPYC, and Dhyana CPU model to remove MONITOR/MWAIT feature. After QEMU/KVM use "-overcommit cpu-pm=on" to expose MONITOR/MWAIT (commit id 6f131f13e68d648a8e4f083c667ab1acd88ce4cd), the MONITOR/MWAIT feature in these CPU model is unused. Signed-off-by: Tao Xu --- target/i386/cpu.c | 38 ++ 1 file changed, 38 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 848c992cd3..6905e4eabd 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3731,6 +3731,14 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ }, }, }, +{ +.version = 3, +.props = (PropValue[]) { +/* mpx was already removed by -v2 above */ +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, { /* end of list */ }, }, }, @@ -3842,6 +3850,17 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_EXT3_ABM | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, .xlevel = 0x8008, .model_id = "AMD Opteron 23xx (Gen 3 Class Opteron)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Opteron_G4", @@ -3966,6 +3985,14 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } } }, +{ +.version = 3, +.props = (PropValue[]) { +/* ibpb was already enabled by -v2 above */ +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, { /* end of list */ } } }, @@ -4018,6 +4045,17 @@ static X86CPUDefinition builtin_x86_defs[] = { .xlevel = 0x801E, .model_id = "Hygon Dhyana Processor", .cache_info = &epyc_cache_info, +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, }; -- 2.20.1
[PATCH RESEND v2 4/4] target/i386: Add notes for versioned CPU models
Add which features are added or removed in this version. Remove the changed model-id in versioned CPU models. Signed-off-by: Tao Xu --- Changes in v2: - correct the note of Cascadelake v3 (Xiaoyao) --- target/i386/cpu.c | 50 +++ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 736b4c7326..4daa153bfa 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -2278,10 +2278,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Nehalem-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Core i7 9xx (Nehalem Core i7, IBRS update)" }, { /* end of list */ } } }, @@ -2359,10 +2358,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Westmere-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Westmere E56xx/L56xx/X56xx (IBRS update)" }, { /* end of list */ } } }, @@ -2445,10 +2443,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "SandyBridge-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Xeon E312xx (Sandy Bridge, IBRS update)" }, { /* end of list */ } } }, @@ -2537,10 +2534,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "IvyBridge-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Xeon E3-12xx v2 (Ivy Bridge, IBRS)" }, { /* end of list */ } } }, @@ -2634,17 +2630,18 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Haswell-noTSX", +.note = "no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, { "stepping", "1" }, -{ "model-id", "Intel Core Processor (Haswell, no TSX)", }, { /* end of list */ } }, }, { .version = 3, .alias = "Haswell-IBRS", +.note = "IBRS", .props = (PropValue[]) { /* Restore TSX features removed by -v2 above */ { "hle", "on" }, @@ -2655,21 +2652,18 @@ static X86CPUDefinition 
builtin_x86_defs[] = { */ { "stepping", "4" }, { "spec-ctrl", "on" }, -{ "model-id", - "Intel Core Processor (Haswell, IBRS)" }, { /* end of list */ } } }, { .version = 4, .alias = "Haswell-noTSX-IBRS", +.note = "no TSX, IBRS", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, /* spec-ctrl was already enabled by -v3 above */ { "stepping", "1" }, -{ "model-id", - "Intel Core Processor (Haswell, no TSX, IBRS)" }, { /* end of list */ } } }, @@ -2765,35 +2759,33 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Broadwell-noTSX", +.note = "no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, -{ "model-id", "Inte
[PATCH RESEND v2 0/4] Add extra information to versioned CPU models
This series of patches removes MPX from Denverton, removes monitor from some CPU models, and adds additional information to -cpu help to indicate the changes in each version of a CPU model. The output is as follows: ./x86_64-softmmu/qemu-system-x86_64 -cpu help | grep "\[" x86 Broadwell-v2 Intel Core Processor (Broadwell) [no TSX] x86 Broadwell-v3 Intel Core Processor (Broadwell) [IBRS] x86 Broadwell-v4 Intel Core Processor (Broadwell) [no TSX, IBRS] x86 Cascadelake-Server-v2 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES] x86 Cascadelake-Server-v3 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES, no TSX] x86 Denverton-v2 Intel Atom Processor (Denverton) [no MPX, no MONITOR] x86 Dhyana-v2 Hygon Dhyana Processor [no MONITOR] x86 EPYC-v2 AMD EPYC Processor [IBPB] x86 EPYC-v3 AMD EPYC Processor [IBPB, no MONITOR] x86 Haswell-v2 Intel Core Processor (Haswell) [no TSX] x86 Haswell-v3 Intel Core Processor (Haswell) [IBRS] x86 Haswell-v4 Intel Core Processor (Haswell) [no TSX, IBRS] x86 Icelake-Client-v2 Intel Core Processor (Icelake) [no TSX] x86 Icelake-Server-v2 Intel Xeon Processor (Icelake) [no TSX] x86 IvyBridge-v2 Intel Xeon E3-12xx v2 (Ivy Bridge) [IBRS] x86 Nehalem-v2 Intel Core i7 9xx (Nehalem Class Core i7) [IBRS] x86 Opteron_G3-v2 AMD Opteron 23xx (Gen 3 Class Opteron) [no MONITOR] x86 SandyBridge-v2 Intel Xeon E312xx (Sandy Bridge) [IBRS] x86 Skylake-Client-v2 Intel Core Processor (Skylake) [IBRS] x86 Skylake-Client-v3 Intel Core Processor (Skylake) [no TSX, IBRS] x86 Skylake-Server-v2 Intel Xeon Processor (Skylake) [IBRS] x86 Skylake-Server-v3 Intel Xeon Processor (Skylake) [no TSX, IBRS] x86 Snowridge-v2 Intel Atom Processor (SnowRidge) [no MPX] x86 Snowridge-v3 Intel Atom Processor (SnowRidge) [no MPX, no MONITOR] x86 Westmere-v2 Westmere E56xx/L56xx/X56xx (Nehalem-C) [IBRS] Changes in v2: - Rebase - correct the note of Cascadelake v3 (Xiaoyao) Tao Xu (4): target/i386: Add Denverton-v2 (no MPX) CPU model target/i386: Remove monitor from some CPU
models target/i386: Add new property note to versioned CPU models target/i386: Add notes for versioned CPU models target/i386/cpu.c | 111 +++--- 1 file changed, 84 insertions(+), 27 deletions(-) -- 2.20.1
[PATCH RESEND v2 3/4] target/i386: Add new property note to versioned CPU models
Add additional information for -cpu help to indicate the changes in this version of CPU model. Suggested-by: Eduardo Habkost Signed-off-by: Tao Xu --- target/i386/cpu.c | 11 +-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index a6eb1b81fd..736b4c7326 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1690,6 +1690,7 @@ typedef struct PropValue { typedef struct X86CPUVersionDefinition { X86CPUVersion version; const char *alias; +const char *note; PropValue *props; } X86CPUVersionDefinition; @@ -1720,6 +1721,7 @@ struct X86CPUModel { X86CPUDefinition *cpudef; /* CPU model version */ X86CPUVersion version; +const char *note; /* * If true, this is an alias CPU model. * This matters only for "-cpu help" and query-cpu-definitions @@ -4846,6 +4848,7 @@ static void x86_cpu_list_entry(gpointer data, gpointer user_data) g_autofree char *name = x86_cpu_class_get_model_name(cc); g_autofree char *desc = g_strdup(cc->model_description); g_autofree char *alias_of = x86_cpu_class_get_alias_of(cc); +g_autofree char *model_id = x86_cpu_class_get_model_id(cc); if (!desc && alias_of) { if (cc->model && cc->model->version == CPU_VERSION_AUTO) { @@ -4854,11 +4857,14 @@ static void x86_cpu_list_entry(gpointer data, gpointer user_data) desc = g_strdup_printf("(alias of %s)", alias_of); } } +if (!desc && cc->model && cc->model->note) { +desc = g_strdup_printf("%s [%s]", model_id, cc->model->note); +} if (!desc) { -desc = x86_cpu_class_get_model_id(cc); +desc = g_strdup_printf("%s", model_id); } -qemu_printf("x86 %-20s %-48s\n", name, desc); +qemu_printf("x86 %-20s %-58s\n", name, desc); } /* list available CPU models and flags */ @@ -5335,6 +5341,7 @@ static void x86_register_cpudef_types(X86CPUDefinition *def) x86_cpu_versioned_model_name(def, vdef->version); m->cpudef = def; m->version = vdef->version; +m->note = vdef->note; x86_register_cpu_model_type(name, m); if (vdef->alias) { -- 2.20.1
[PATCH RESEND v2 1/4] target/i386: Add Denverton-v2 (no MPX) CPU model
Because MPX is being removed from the linux kernel, remove MPX feature from Denverton. Signed-off-by: Tao Xu --- target/i386/cpu.c | 12 1 file changed, 12 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 31556b7ec4..6981aa2a34 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3539,6 +3539,18 @@ static X86CPUDefinition builtin_x86_defs[] = { .features[FEAT_VMX_VMFUNC] = MSR_VMX_VMFUNC_EPT_SWITCHING, .xlevel = 0x8008, .model_id = "Intel Atom Processor (Denverton)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ "mpx", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Snowridge", -- 2.20.1
[PATCH RESEND v2 2/4] target/i386: Remove monitor from some CPU models
Add new version of Snowridge, Denverton, Opteron_G3, EPYC, and Dhyana CPU model to remove MONITOR/MWAIT feature. After QEMU/KVM use "-overcommit cpu-pm=on" to expose MONITOR/MWAIT (commit id 6f131f13e68d648a8e4f083c667ab1acd88ce4cd), the MONITOR/MWAIT feature in these CPU model is unused. Signed-off-by: Tao Xu --- target/i386/cpu.c | 38 ++ 1 file changed, 38 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 6981aa2a34..a6eb1b81fd 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3678,6 +3678,14 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ }, }, }, +{ +.version = 3, +.props = (PropValue[]) { +/* mpx was already removed by -v2 above */ +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, { /* end of list */ }, }, }, @@ -3789,6 +3797,17 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_EXT3_ABM | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, .xlevel = 0x8008, .model_id = "AMD Opteron 23xx (Gen 3 Class Opteron)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Opteron_G4", @@ -3913,6 +3932,14 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } } }, +{ +.version = 3, +.props = (PropValue[]) { +/* ibpb was already enabled by -v2 above */ +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, { /* end of list */ } } }, @@ -3965,6 +3992,17 @@ static X86CPUDefinition builtin_x86_defs[] = { .xlevel = 0x801E, .model_id = "Hygon Dhyana Processor", .cache_info = &epyc_cache_info, +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, }; -- 2.20.1
Re: [PATCH] util/cutils: Expand do_strtosz parsing precision to 64 bits
On 12/19/2019 2:26 AM, Markus Armbruster wrote: Tao Xu writes: On 12/18/2019 9:33 AM, Tao Xu wrote: On 12/17/2019 6:25 PM, Markus Armbruster wrote: [...] Also fun: for "0123", we use uint64_t 83, not double 123.0. But for "0123.", we use 123.0, not 83. Do we really want to accept octal and hexadecimal integers? Thank you for reminding me. Octal and hexadecimal may bring more confusion. I will use qemu_strtou64(nptr, &suffixu, 10, &valu) and add test for input like "0123". Hi Markus, After I use qemu_strtou64(nptr, &suffixu, 10, &valu), it causes another question. Because qemu_strtod_finite supports hexadecimal input, in this situation it will be parsed as a double. It will also let large hexadecimal integers be rounded. So there may be two solutions: 1: use qemu_strtou64(nptr, &suffixu, 0, &valu) and parse octal as decimal. This will keep hexadecimal valid as now. "0123" --> 123; "0x123" --> 291 How would you make qemu_strtou64() parse octal as decimal? How about this solution: set @base as a variable; if we detect hexadecimal, we use 0, then can parse decimal as u64, else we use 10, then can parse octal as decimal, because the 0 prefix will be ignored in qemu_strtou64(nptr, &suffixu, 10, &valu); const char *p = nptr; while (qemu_isspace(*p)) { p++; } if (*p == '0' && qemu_toupper(*(p + 1)) == 'X') { base = 0; } else { base = 10; } retd = qemu_strtod_finite(nptr, &suffixd, &vald); retu = qemu_strtou64(nptr, &suffixu, base, &valu); use_strtod = strlen(suffixd) < strlen(suffixu); if (use_strtod) { endptr = suffixd; retval = retd; } else { endptr = suffixu; retval = retu; } 2: use qemu_strtou64(nptr, &suffixu, 10, &valu) and reject octal and hexadecimal. "0123" --> Error; "0x123" --> Error How would you reject the 0x prefix? How about checking whether the first and second characters are '0' and 'x', and then returning -EINVAL.
Re: [PATCH RESEND v20 0/8] Build ACPI Heterogeneous Memory Attribute Table (HMAT)
On 12/13/2019 6:06 PM, Michael S. Tsirkin wrote: On Fri, Dec 13, 2019 at 09:19:21AM +0800, Tao Xu wrote: This series of patches will build Heterogeneous Memory Attribute Table (HMAT) according to the command line. The ACPI HMAT describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use HMAT information as hint for optimization. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. The V19 patches link: https://patchwork.kernel.org/cover/11265525/ Looks good to me, I'll queue it for merge after the release. If possible please ping me after the release to help make sure it didn't get dropped. Hi Michael, I am wondering if these patches can be merged this week, because QEMU 5.0 developing tree is open and next week may be the holidays. Thank you very much! Tao Xu
Re: [PATCH] util/cutils: Expand do_strtosz parsing precision to 64 bits
On 12/18/2019 9:33 AM, Tao Xu wrote: On 12/17/2019 6:25 PM, Markus Armbruster wrote: Tao Xu writes: On 12/5/19 11:29 PM, Markus Armbruster wrote: Tao Xu writes: Parse input string both as a double and as a uint64_t, then use the method which consumes more characters. Update the related test cases. Signed-off-by: Tao Xu --- [...] diff --git a/util/cutils.c b/util/cutils.c index 77acadc70a..b08058c57c 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -212,24 +212,43 @@ static int do_strtosz(const char *nptr, const char **end, const char default_suffix, int64_t unit, uint64_t *result) { - int retval; - const char *endptr; + int retval, retd, retu; + const char *suffix, *suffixd, *suffixu; unsigned char c; int mul_required = 0; - double val, mul, integral, fraction; + bool use_strtod; + uint64_t valu; + double vald, mul, integral, fraction; Note for later: @mul is double. + + retd = qemu_strtod_finite(nptr, &suffixd, &vald); + retu = qemu_strtou64(nptr, &suffixu, 0, &valu); Note for later: passing 0 to base accepts octal and hexadecimal integers. + use_strtod = strlen(suffixd) < strlen(suffixu); + + /* + * Parse @nptr both as a double and as a uint64_t, then use the method + * which consumes more characters. + */ The comment is in a funny place. I'd put it right before the qemu_strtod_finite() line. + if (use_strtod) { + suffix = suffixd; + retval = retd; + } else { + suffix = suffixu; + retval = retu; + } - retval = qemu_strtod_finite(nptr, &endptr, &val); if (retval) { goto out; } This is even more subtle than it looks. A close reading of the function contracts leads to three cases for each conversion: * parse error (including infinity and NaN) @retu / @retd is -EINVAL @valu / @vald is uninitialized @suffixu / @suffixd is @nptr * range error @retu / @retd is -ERANGE @valu / @vald is our best approximation of the conversion result @suffixu / @suffixd points to the first character not consumed by the conversion. 
Sub-cases: - uint64_t overflow We know the conversion result exceeds UINT64_MAX. - double overflow we know the conversion result's magnitude exceeds the largest representable finite double DBL_MAX. - double underflow we know the conversion result is close to zero (closer than DBL_MIN, the smallest normalized positive double). * success @retu / @retd is 0 @valu / @vald is the conversion result @suffixu / @suffixd points to the first character not consumed by the conversion. This leads to a matrix (parse error, uint64_t overflow, success) x (parse error, double overflow, double underflow, success). We need to check the code does what we want for each element of this matrix, and document any behavior that's not perfectly obvious. (success, success): we pick uint64_t if qemu_strtou64() consumed more characters than qemu_strtod_finite(), else double. "More" is important here; when they consume the same characters, we *need* to use the uint64_t result. Example: for "18446744073709551615", we need to use uint64_t 18446744073709551615, not double 18446744073709551616.0. But for "18446744073709551616.", we need to use the double. Good. Also fun: for "0123", we use uint64_t 83, not double 123.0. But for "0123.", we use 123.0, not 83. Do we really want to accept octal and hexadecimal integers? Thank you for reminding me. Octal and hexadecimal may bring more confusion. I will use qemu_strtou64(nptr, &suffixu, 10, &valu) and add test for input like "0123". Hi Markus, After I use qemu_strtou64(nptr, &suffixu, 10, &valu), it cause another question. Because qemu_strtod_finite support hexadecimal input, so in this situation, it will parsed as double. It will also let large hexadecimal integers be rounded. So there may be two solution: 1: use qemu_strtou64(nptr, &suffixu, 0, &valu) and parse octal as decimal. This will keep hexadecimal valid as now. "0123" --> 123; "0x123" --> 291 2: use qemu_strtou64(nptr, &suffixu, 10, &valu) and reject octal and decimal. 
"0123" --> Error; "0x123" --> Error
Re: [PATCH] util/cutils: Expand do_strtosz parsing precision to 64 bits
On 12/17/2019 11:01 PM, Markus Armbruster wrote: Christophe de Dinechin writes: On 17 Dec 2019, at 15:08, Markus Armbruster wrote: Christophe de Dinechin writes: On 5 Dec 2019, at 16:29, Markus Armbruster wrote: Tao Xu writes: Parse input string both as a double and as a uint64_t, then use the method which consumes more characters. Update the related test cases. Signed-off-by: Tao Xu --- [...] diff --git a/util/cutils.c b/util/cutils.c index 77acadc70a..b08058c57c 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -212,24 +212,43 @@ static int do_strtosz(const char *nptr, const char **end, const char default_suffix, int64_t unit, uint64_t *result) { -int retval; -const char *endptr; +int retval, retd, retu; +const char *suffix, *suffixd, *suffixu; unsigned char c; int mul_required = 0; -double val, mul, integral, fraction; +bool use_strtod; +uint64_t valu; +double vald, mul, integral, fraction; Note for later: @mul is double. + +retd = qemu_strtod_finite(nptr, &suffixd, &vald); +retu = qemu_strtou64(nptr, &suffixu, 0, &valu); +use_strtod = strlen(suffixd) < strlen(suffixu); + +/* + * Parse @nptr both as a double and as a uint64_t, then use the method + * which consumes more characters. + */ The comment is in a funny place. I'd put it right before the qemu_strtod_finite() line. +if (use_strtod) { +suffix = suffixd; +retval = retd; +} else { +suffix = suffixu; +retval = retu; +} -retval = qemu_strtod_finite(nptr, &endptr, &val); if (retval) { goto out; } This is even more subtle than it looks. But why it is even necessary? The “contract” for the function used to be that it returned rounded values beyond 2^53, which in itself is curious. But now it’s a 6-dimensional matrix of hell with NaNs and barfnots, when the name implies it’s simply doing a text to u64 conversion… There is certainly a reason, but I’m really curious what it is :-) It all goes back to commit 9f9b17a4f0 "Introduce strtosz() library function to convert a string to a byte count.". 
To support "convenient" usage like "1.5G", it parses the number part with strtod(). This limits us to 53 bits of precision. Larger sizes get rounded. I guess the excuse for this was that when you're dealing with sizes that large (petabytes!), your least significant bits are zero anyway. Regardless, the interface is *awful*. We should've forced the author to spell it out in all its glory in a proper function contract. That tends to cool the enthusiasm for "convenient" syntax amazingly fast. The awful interface has been confusing people for close to a decade now. What to do? I see. Thanks for the rationale. I knew it had to make sense :-) For a value of "sense"... I’d probably avoid strtod even with the convenient syntax above. Do you want 1.33e-6M to be allowed? Do we want to ever accept or generate NaN or Inf values? NaN or Inf definitely not. That's why we use qemu_strtod_finite() before and after the patch. No sane person should ever use 1.33e-6M. Or even 1.1k (which yields 1126, rounded silently from machine number 1126.4000000000001, which approximates the true value 1126.4). Certain fractions are actually sane. 1.5k denotes a perfectly fine integer, which the code manages not to screw up. I'd recommend against using fractions regardless. What usage are we prepared to break? What kind of confusion are we willing to bear? Those are the questions. Tao Xu's patch tries to make the function do what its users expect, namely parse a bleepin' 64 bit integer, without breaking any of the "convenience" syntax. Turns out that's amazingly subtle. Are we making things less confusing or more? Thanks for your explanation. I think another reason is that the built-in 'size' is really commonly used. Maybe someone uses '-m 1.5G' to boot QEMU or writes it to a config file.
Re: [PATCH RESEND v2] util/cutils: Expand do_strtosz parsing precision to 64 bits
On 12/17/2019 7:44 PM, Christophe de Dinechin wrote: On 9 Dec 2019, at 09:30, Tao Xu wrote: Parse input string both as a double and as a uint64_t, then use the method which consumes more characters. Update the related test cases. Signed-off-by: Tao Xu --- Changes in v2: - Resend to use double small than DBL_MIN - Add more test case for double overflow and underflow. - Set mul as int64_t (Markus) - Restore endptr (Markus) --- tests/test-cutils.c| 37 +++ tests/test-keyval.c| 47 + tests/test-qemu-opts.c | 39 +--- util/cutils.c | 67 +++--- 4 files changed, 75 insertions(+), 115 deletions(-) [...] +/* + * Parse @nptr both as a double and as a uint64_t, then use the method + * which consumes more characters. + */ Why do ever need to parse as double if you have uint64? Because we want to keep do_strtosz Compatible with double input (such as 1.5k). +retd = qemu_strtod_finite(nptr, &suffixd, &vald); +retu = qemu_strtou64(nptr, &suffixu, 0, &valu); +use_strtod = strlen(suffixd) < strlen(suffixu); You could simply compare suffixd and suffixu: use_strtod = suffixd > suffixu; Thank you for your suggestion. + +if (use_strtod) { +endptr = suffixd; +retval = retd; +} else { +endptr = suffixu; +retval = retu; +} -retval = qemu_strtod_finite(nptr, &endptr, &val); if (retval) { goto out; } -fraction = modf(val, &integral); -if (fraction != 0) { -mul_required = 1; +if (use_strtod) { +fraction = modf(vald, &integral); +if (fraction != 0) { +mul_required = 1; +} } c = *endptr; mul = suffix_mul(c, unit); @@ -238,17 +258,30 @@ static int do_strtosz(const char *nptr, const char **end, retval = -EINVAL; goto out; } -/* - * Values near UINT64_MAX overflow to 2**64 when converting to double - * precision. Compare against the maximum representable double precision - * value below 2**64, computed as "the next value after 2**64 (0x1p64) in - * the direction of 0". 
- */ -if ((val * mul > nextafter(0x1p64, 0)) || val < 0) { -retval = -ERANGE; -goto out; + +if (use_strtod) { +/* + * Values near UINT64_MAX overflow to 2**64 when converting to double + * precision. Compare against the maximum representable double precision + * value below 2**64, computed as "the next value after 2**64 (0x1p64) + * in the direction of 0". + */ +if ((vald * mul > nextafter(0x1p64, 0)) || vald < 0) { +retval = -ERANGE; +goto out; +} +*result = vald * mul; +} else { +/* Reject negative input and overflow output */ +while (qemu_isspace(*nptr)) { +nptr++; +} +if (*nptr == '-' || UINT64_MAX / mul < valu) { +retval = -ERANGE; +goto out; +} +*result = valu * mul; } -*result = val * mul; retval = 0; out: -- 2.20.1
Re: [PATCH] util/cutils: Expand do_strtosz parsing precision to 64 bits
On 12/17/2019 6:25 PM, Markus Armbruster wrote: Tao Xu writes: On 12/5/19 11:29 PM, Markus Armbruster wrote: Tao Xu writes: Parse input string both as a double and as a uint64_t, then use the method which consumes more characters. Update the related test cases. Signed-off-by: Tao Xu --- [...] diff --git a/util/cutils.c b/util/cutils.c index 77acadc70a..b08058c57c 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -212,24 +212,43 @@ static int do_strtosz(const char *nptr, const char **end, const char default_suffix, int64_t unit, uint64_t *result) { -int retval; -const char *endptr; +int retval, retd, retu; +const char *suffix, *suffixd, *suffixu; unsigned char c; int mul_required = 0; -double val, mul, integral, fraction; +bool use_strtod; +uint64_t valu; +double vald, mul, integral, fraction; Note for later: @mul is double. + +retd = qemu_strtod_finite(nptr, &suffixd, &vald); +retu = qemu_strtou64(nptr, &suffixu, 0, &valu); Note for later: passing 0 to base accepts octal and hexadecimal integers. +use_strtod = strlen(suffixd) < strlen(suffixu); + +/* + * Parse @nptr both as a double and as a uint64_t, then use the method + * which consumes more characters. + */ The comment is in a funny place. I'd put it right before the qemu_strtod_finite() line. +if (use_strtod) { +suffix = suffixd; +retval = retd; +} else { +suffix = suffixu; +retval = retu; +} -retval = qemu_strtod_finite(nptr, &endptr, &val); if (retval) { goto out; } This is even more subtle than it looks. A close reading of the function contracts leads to three cases for each conversion: * parse error (including infinity and NaN) @retu / @retd is -EINVAL @valu / @vald is uninitialized @suffixu / @suffixd is @nptr * range error @retu / @retd is -ERANGE @valu / @vald is our best approximation of the conversion result @suffixu / @suffixd points to the first character not consumed by the conversion. Sub-cases: - uint64_t overflow We know the conversion result exceeds UINT64_MAX. 
- double overflow we know the conversion result's magnitude exceeds the largest representable finite double DBL_MAX. - double underflow we know the conversion result is close to zero (closer than DBL_MIN, the smallest normalized positive double). * success @retu / @retd is 0 @valu / @vald is the conversion result @suffixu / @suffixd points to the first character not consumed by the conversion. This leads to a matrix (parse error, uint64_t overflow, success) x (parse error, double overflow, double underflow, success). We need to check the code does what we want for each element of this matrix, and document any behavior that's not perfectly obvious. (success, success): we pick uint64_t if qemu_strtou64() consumed more characters than qemu_strtod_finite(), else double. "More" is important here; when they consume the same characters, we *need* to use the uint64_t result. Example: for "18446744073709551615", we need to use uint64_t 18446744073709551615, not double 18446744073709551616.0. But for "18446744073709551616.", we need to use the double. Good. Also fun: for "0123", we use uint64_t 83, not double 123.0. But for "0123.", we use 123.0, not 83. Do we really want to accept octal and hexadecimal integers? Thank you for reminding me. Octal and hexadecimal may bring more confusion. I will use qemu_strtou64(nptr, &suffixu, 10, &valu) and add test for input like "0123". (success, parse error) and (parse error, success): we pick the one that succeeds, because success consumes characters, and failure to parse does not. Good. (parse error, parse error): neither consumes characters, so we pick uint64_t. Good. (parse error, double overflow), (parse error, double underflow) and (uint64_t overflow, parse error): we pick the range error, because it consumes characters. Good. These are the simple combinations. The remainder are hairier: (success, double overflow), (success, double underflow), (uint64_t overflow, success). I lack the time to analyze them today. 
Must be done before we take this patch. Any takers? (success, double overflow), (success, double underflow): we pick the double range error and return -ERANGE, because it consumes characters. Example: for "1.79769e+309", qemu_strtou64 consumes "1" and parses it as uint64_t; but qemu_strtod_finite returns -ERANGE and consumes all characters. It is OK. The only way to have double overflow when uint64_t succeeds is an exponent. Double consumes the characters making up the exponent, uint64_t does not. We use double. The only way to have double underflo
Re: [PATCH RESEND v2] util/cutils: Expand do_strtosz parsing precision to 64 bits
Gentle ping. On 12/9/2019 4:30 PM, Xu, Tao3 wrote: Parse input string both as a double and as a uint64_t, then use the method which consumes more characters. Update the related test cases. Signed-off-by: Tao Xu --- Changes in v2: - Resend to use double small than DBL_MIN - Add more test case for double overflow and underflow. - Set mul as int64_t (Markus) - Restore endptr (Markus) --- tests/test-cutils.c| 37 +++ tests/test-keyval.c| 47 + tests/test-qemu-opts.c | 39 +--- util/cutils.c | 67 +++--- 4 files changed, 75 insertions(+), 115 deletions(-) diff --git a/tests/test-cutils.c b/tests/test-cutils.c index 1aa8351520..49e495b8ba 100644 --- a/tests/test-cutils.c +++ b/tests/test-cutils.c @@ -1970,40 +1970,25 @@ static void test_qemu_strtosz_simple(void) g_assert_cmpint(err, ==, 0); g_assert_cmpint(res, ==, 12345); -/* Note: precision is 53 bits since we're parsing with strtod() */ - -str = "9007199254740991"; /* 2^53-1 */ -err = qemu_strtosz(str, &endptr, &res); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x1f); -g_assert(endptr == str + 16); - -str = "9007199254740992"; /* 2^53 */ -err = qemu_strtosz(str, &endptr, &res); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x20); -g_assert(endptr == str + 16); +/* Note: precision is 64 bits (UINT64_MAX) */ str = "9007199254740993"; /* 2^53+1 */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x20); /* rounded to 53 bits */ +g_assert_cmpint(res, ==, 0x21); g_assert(endptr == str + 16); -str = "18446744073709549568"; /* 0xf800 (53 msbs set) */ +str = "18446744073709550591"; /* 0xfbff */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0xf800); +g_assert_cmpint(res, ==, 0xfbff); g_assert(endptr == str + 20); -str = "18446744073709550591"; /* 0xfbff */ +str = "18446744073709551615"; /* 2^64-1 (UINT64_MAX) */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0xf800); /* 
rounded to 53 bits */ +g_assert_cmpint(res, ==, 0x); g_assert(endptr == str + 20); - -/* 0x7e00..0x7fff get rounded to - * 0x8000, thus -ERANGE; see test_qemu_strtosz_erange() */ } static void test_qemu_strtosz_units(void) @@ -2145,20 +2130,20 @@ static void test_qemu_strtosz_erange(void) g_assert_cmpint(err, ==, -ERANGE); g_assert(endptr == str + 2); -str = "18446744073709550592"; /* 0xfc00 */ +str = "18446744073709551616"; /* 2^64 */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, -ERANGE); g_assert(endptr == str + 20); -str = "18446744073709551615"; /* 2^64-1 */ +str = "1.7976931348623158e+308"; /* DBL_MAX, double overflows */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, -ERANGE); -g_assert(endptr == str + 20); +g_assert(endptr == str + 23); -str = "18446744073709551616"; /* 2^64 */ +str = "2.225e-308"; /* Small than DBL_MIN, double underflows */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, -ERANGE); -g_assert(endptr == str + 20); +g_assert(endptr == str + 10); str = "20E"; err = qemu_strtosz(str, &endptr, &res); diff --git a/tests/test-keyval.c b/tests/test-keyval.c index 09b0ae3c68..fad941fcb8 100644 --- a/tests/test-keyval.c +++ b/tests/test-keyval.c @@ -383,59 +383,26 @@ static void test_keyval_visit_size(void) visit_end_struct(v, NULL); visit_free(v); -/* Note: precision is 53 bits since we're parsing with strtod() */ +/* Note: precision is 64 bits (UINT64_MAX) */ -/* Around limit of precision: 2^53-1, 2^53, 2^53+1 */ -qdict = keyval_parse("sz1=9007199254740991," - "sz2=9007199254740992," - "sz3=9007199254740993", +/* Around limit of precision: UINT64_MAX - 1, UINT64_MAX */ +qdict = keyval_parse("sz1=18446744073709551614," + "sz2=18446744073709551615", NULL, &error_abort); v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); qobject_unref(qdict); visit_start_struct(v, NULL, NULL, 0, &error_abort);
Re: [PATCH v2 0/4] Add extra information to versioned CPU models
Ping for comments. On 12/9/2019 3:12 PM, Tao Xu wrote: This series of patches will remove MPX from Denverton and remove MONITOR from some CPU models. It also adds additional information to -cpu help to indicate the changes in each version of a CPU model. The output is as follows: ./x86_64-softmmu/qemu-system-x86_64 -cpu help | grep "\[" x86 Broadwell-v2 Intel Core Processor (Broadwell) [no TSX] x86 Broadwell-v3 Intel Core Processor (Broadwell) [IBRS] x86 Broadwell-v4 Intel Core Processor (Broadwell) [no TSX, IBRS] x86 Cascadelake-Server-v2 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES] x86 Cascadelake-Server-v3 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES, no TSX] x86 Denverton-v2 Intel Atom Processor (Denverton) [no MPX, no MONITOR] x86 Dhyana-v2 Hygon Dhyana Processor [no MONITOR] x86 EPYC-v2 AMD EPYC Processor [IBPB] x86 EPYC-v3 AMD EPYC Processor [IBPB, no MONITOR] x86 Haswell-v2 Intel Core Processor (Haswell) [no TSX] x86 Haswell-v3 Intel Core Processor (Haswell) [IBRS] x86 Haswell-v4 Intel Core Processor (Haswell) [no TSX, IBRS] x86 Icelake-Client-v2 Intel Core Processor (Icelake) [no TSX] x86 Icelake-Server-v2 Intel Xeon Processor (Icelake) [no TSX] x86 IvyBridge-v2 Intel Xeon E3-12xx v2 (Ivy Bridge) [IBRS] x86 Nehalem-v2 Intel Core i7 9xx (Nehalem Class Core i7) [IBRS] x86 Opteron_G3-v2 AMD Opteron 23xx (Gen 3 Class Opteron) [no MONITOR] x86 SandyBridge-v2 Intel Xeon E312xx (Sandy Bridge) [IBRS] x86 Skylake-Client-v2 Intel Core Processor (Skylake) [IBRS] x86 Skylake-Client-v3 Intel Core Processor (Skylake) [no TSX, IBRS] x86 Skylake-Server-v2 Intel Xeon Processor (Skylake) [IBRS] x86 Skylake-Server-v3 Intel Xeon Processor (Skylake) [no TSX, IBRS] x86 Snowridge-v2 Intel Atom Processor (SnowRidge) [no MPX] x86 Snowridge-v3 Intel Atom Processor (SnowRidge) [no MPX, no MONITOR] x86 Westmere-v2 Westmere E56xx/L56xx/X56xx (Nehalem-C) [IBRS] Changes in v2: - correct the note of Cascadelake v3 (Xiaoyao) Tao Xu (4): target/i386: Add Denverton-v2 (no MPX) CPU 
model target/i386: Remove monitor from some CPU models target/i386: Add new property note to versioned CPU models target/i386: Add notes for versioned CPU models target/i386/cpu.c | 112 +++--- 1 file changed, 85 insertions(+), 27 deletions(-) -- 2.20.1
Re: [PATCH 2/2] numa: properly check if numa is supported
On 12/13/2019 5:12 PM, Igor Mammedov wrote: On Fri, 13 Dec 2019 09:33:10 +0800 Tao Xu wrote: On 12/12/2019 8:48 PM, Igor Mammedov wrote: Commit aa57020774b, by mistake used MachineClass::numa_mem_supported to check if NUMA is supported by machine and also as unrelated change set it to true for sbsa-ref board. Luckily change didn't break machines that support NUMA, as the field is set to true for them. But the field is not intended for checking if NUMA is supported and will be flipped to false within this release for new machine types. Fix it: - by using previously used condition !mc->cpu_index_to_instance_props || !mc->get_default_cpu_node_id the first time and then use MachineState::numa_state down the road to check if NUMA is supported - dropping stray sbsa-ref chunk Fixes: aa57020774b690a22be72453b8e91c9b5a68c516 Signed-off-by: Igor Mammedov --- CC: Radoslaw Biernacki CC: Peter Maydell CC: Leif Lindholm CC: qemu-...@nongnu.org CC: qemu-sta...@nongnu.org hw/arm/sbsa-ref.c | 1 - hw/core/machine.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c index 27046cc..c6261d4 100644 --- a/hw/arm/sbsa-ref.c +++ b/hw/arm/sbsa-ref.c @@ -791,7 +791,6 @@ static void sbsa_ref_class_init(ObjectClass *oc, void *data) mc->possible_cpu_arch_ids = sbsa_ref_possible_cpu_arch_ids; mc->cpu_index_to_instance_props = sbsa_ref_cpu_index_to_props; mc->get_default_cpu_node_id = sbsa_ref_get_default_cpu_node_id; -mc->numa_mem_supported = true; } static const TypeInfo sbsa_ref_info = { diff --git a/hw/core/machine.c b/hw/core/machine.c index 1689ad3..aa63231 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -958,7 +958,7 @@ static void machine_initfn(Object *obj) NULL); } -if (mc->numa_mem_supported) { +if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) { ms->numa_state = g_new0(NumaState, 1); } I am wondering if @numa_mem_supported is unused here, it is unused for QEMU, because the only usage of 
@numa_mem_supported is to initialize @numa_state. Or is there another usage? So should it be removed from struct MachineClass? You are wrong, it's not intended for numa_state initialization; read the doc comment for it in include/hw/boards.h (for the full story, look at commit cd5ff8333a3) I understand.
Re: [PATCH RESEND v20 0/8] Build ACPI Heterogeneous Memory Attribute Table (HMAT)
On 12/13/2019 6:06 PM, Michael S. Tsirkin wrote: On Fri, Dec 13, 2019 at 09:19:21AM +0800, Tao Xu wrote: This series of patches will build Heterogeneous Memory Attribute Table (HMAT) according to the command line. The ACPI HMAT describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use HMAT information as hint for optimization. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. The V19 patches link: https://patchwork.kernel.org/cover/11265525/ Looks good to me, I'll queue it for merge after the release. If possible please ping me after the release to help make sure it didn't get dropped. Thank you! Changelog: v20: - Resend to fix the wrong target in pc_hmat_erange_cfg() - Use g_assert_true and g_assert_false to replace g_assert (Thomas and Markus) - Rename assoc as associativity, update the QAPI description (Markus) - Disable cache level 0 in hmat-cache option (Igor) - Keep base and bitmap unchanged when latency or bandwidth out of range - Fix the broken CI case when user input latency or bandwidth less than required. v19: - Add description about the machine property 'hmat' in commit message (Markus) - Update the QAPI comments - Add a check for no memory side cache - Add some fail cases for hmat-cache when level=0 v18: - Defer patches 01/14~06/14 of V17, use qapi type uint64 and only nanosecond for latency (Markus) - Rewrite the lines over 80 characters(Igor) v17: - Add check when user input latency or bandwidth 0, the lb_info_provided should also be 0. Because in ACPI 6.3 5.2.27.4, 0 means the corresponding latency or bandwidth information is not provided. - Fix the infinite loop when node->latency is 0. 
- Use NumaHmatCacheOptions to replace HMAT_Cache_Info (Igor) - Add check for unordered cache level input (Igor) - Add some fail test cases (Igor) v16: - Add and use qemu_strtold_finite to parse size, support full 64bit precision, modify related test cases (Eduardo and Markus) - Simplify struct HMAT_LB_Info and related code, unify latency and bandwidth (Igor) - Add cross check with hmat_lb data (Igor) - Fields in Cache Attributes are promoted to uint32_t before shifting (Igor) - Add case for QMP build HMAT (Igor) v15: - Add a new patch to refactor do_strtosz() (Eduardo) - Make tests without breaking CI (Michael) v14: - Reuse the codes of do_strtosz to build qemu_strtotime_ns (Eduardo) - Squash patch v13 01/12 and 02/12 together (Daniel and Eduardo) - Drop time unit picosecond (Eric) - Use qemu ctz64 and clz64 instead of builtin function v13: - Modify some text description - Drop "initiator_valid" field in struct NodeInfo - Reuse GArray to store the raw bandwidth and bandwidth data - Calculate common base unit using range bitmap - Add a patch to calculate hmat latency and bandwidth entry list - Drop the total_levels option and use readable cache size - Remove the unnecessary header file - Use decimal notation with appropriate suffix for cache size Liu Jingqi (5): numa: Extend CLI to provide memory latency and bandwidth information numa: Extend CLI to provide memory side cache information hmat acpi: Build Memory Proximity Domain Attributes Structure(s) hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s) hmat acpi: Build Memory Side Cache Information Structure(s) Tao Xu (3): numa: Extend CLI to provide initiator information for numa nodes tests/numa: Add case for QMP build HMAT tests/bios-tables-test: add test cases for ACPI HMAT hw/acpi/Kconfig | 7 +- hw/acpi/Makefile.objs | 1 + hw/acpi/hmat.c| 268 +++ hw/acpi/hmat.h| 42 hw/core/machine.c | 64 ++ hw/core/numa.c| 297 ++ hw/i386/acpi-build.c | 5 + include/sysemu/numa.h | 63 ++ qapi/machine.json 
| 180 +++- qemu-options.hx | 95 +++- tests/bios-tables-test-allowed-diff.h | 8 + tests/bios-tables-test.c | 44 tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 tests/numa-test.c
Re: [PATCH 2/2] numa: properly check if numa is supported
On 12/12/2019 8:48 PM, Igor Mammedov wrote: Commit aa57020774b, by mistake used MachineClass::numa_mem_supported to check if NUMA is supported by machine and also as unrelated change set it to true for sbsa-ref board. Luckily change didn't break machines that support NUMA, as the field is set to true for them. But the field is not intended for checking if NUMA is supported and will be flipped to false within this release for new machine types. Fix it: - by using previously used condition !mc->cpu_index_to_instance_props || !mc->get_default_cpu_node_id the first time and then use MachineState::numa_state down the road to check if NUMA is supported - dropping stray sbsa-ref chunk Fixes: aa57020774b690a22be72453b8e91c9b5a68c516 Signed-off-by: Igor Mammedov --- CC: Radoslaw Biernacki CC: Peter Maydell CC: Leif Lindholm CC: qemu-...@nongnu.org CC: qemu-sta...@nongnu.org hw/arm/sbsa-ref.c | 1 - hw/core/machine.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/hw/arm/sbsa-ref.c b/hw/arm/sbsa-ref.c index 27046cc..c6261d4 100644 --- a/hw/arm/sbsa-ref.c +++ b/hw/arm/sbsa-ref.c @@ -791,7 +791,6 @@ static void sbsa_ref_class_init(ObjectClass *oc, void *data) mc->possible_cpu_arch_ids = sbsa_ref_possible_cpu_arch_ids; mc->cpu_index_to_instance_props = sbsa_ref_cpu_index_to_props; mc->get_default_cpu_node_id = sbsa_ref_get_default_cpu_node_id; -mc->numa_mem_supported = true; } static const TypeInfo sbsa_ref_info = { diff --git a/hw/core/machine.c b/hw/core/machine.c index 1689ad3..aa63231 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -958,7 +958,7 @@ static void machine_initfn(Object *obj) NULL); } -if (mc->numa_mem_supported) { +if (mc->cpu_index_to_instance_props && mc->get_default_cpu_node_id) { ms->numa_state = g_new0(NumaState, 1); } I am wondering if @numa_mem_supported is unused here, it is unused for QEMU, because the only usage of @numa_mem_supported is to initialize @numa_state. Or there is other usage? 
So should it be removed from struct MachineClass?
[PATCH RESEND v20 6/8] hmat acpi: Build Memory Side Cache Information Structure(s)
From: Liu Jingqi This structure describes memory side cache information for memory proximity domains if the memory side cache is present and the physical device forms the memory side cache. The software could use this information to effectively place the data in memory to maximize the performance of the system memory that use the memory side cache. Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jonathan Cameron Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- No changes in v20. Changes in v16: - Use checks and assert to replace masks (Igor) - Fields in Cache Attributes are promoted to uint32_t before shifting (Igor) - Drop cpu_to_le32() (Igor) Changes in v13: - rename level as cache_level --- hw/acpi/hmat.c | 69 +- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 4635d45dee..7c24bb5371 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -143,14 +143,62 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, g_free(entry_list); } +/* ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: Table 5-147 */ +static void build_hmat_cache(GArray *table_data, uint8_t total_levels, + NumaHmatCacheOptions *hmat_cache) +{ +/* + * Cache Attributes: Bits [3:0] – Total Cache Levels + * for this Memory Proximity Domain + */ +uint32_t cache_attr = total_levels; + +/* Bits [7:4] : Cache Level described in this structure */ +cache_attr |= (uint32_t) hmat_cache->level << 4; + +/* Bits [11:8] - Cache Associativity */ +cache_attr |= (uint32_t) hmat_cache->associativity << 8; + +/* Bits [15:12] - Write Policy */ +cache_attr |= (uint32_t) hmat_cache->policy << 12; + +/* Bits [31:16] - Cache Line size in bytes */ +cache_attr |= (uint32_t) hmat_cache->line << 16; + +/* Type */ +build_append_int_noprefix(table_data, 2, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Length */ +build_append_int_noprefix(table_data, 32, 4); +/* Proximity Domain for the 
Memory */ +build_append_int_noprefix(table_data, hmat_cache->node_id, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 4); +/* Memory Side Cache Size */ +build_append_int_noprefix(table_data, hmat_cache->size, 8); +/* Cache Attributes */ +build_append_int_noprefix(table_data, cache_attr, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* + * Number of SMBIOS handles (n) + * Linux kernel uses Memory Side Cache Information Structure + * without SMBIOS entries for now, so set Number of SMBIOS handles + * as 0. + */ +build_append_int_noprefix(table_data, 0, 2); +} + /* Build HMAT sub table structures */ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) { uint16_t flags; uint32_t num_initiator = 0; uint32_t initiator_list[MAX_NODES]; -int i, hierarchy, type; +int i, hierarchy, type, cache_level, total_levels; HMAT_LB_Info *hmat_lb; +NumaHmatCacheOptions *hmat_cache; for (i = 0; i < numa_state->num_nodes; i++) { flags = 0; @@ -184,6 +232,25 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) } } } + +/* + * ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: + * Table 5-147 + */ +for (i = 0; i < numa_state->num_nodes; i++) { +total_levels = 0; +for (cache_level = 1; cache_level < HMAT_LB_LEVELS; cache_level++) { +if (numa_state->hmat_cache[i][cache_level]) { +total_levels++; +} +} +for (cache_level = 0; cache_level <= total_levels; cache_level++) { +hmat_cache = numa_state->hmat_cache[i][cache_level]; +if (hmat_cache) { +build_hmat_cache(table_data, total_levels, hmat_cache); +} +} +} } void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) -- 2.20.1
[PATCH RESEND v20 5/8] hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s)
From: Liu Jingqi This structure describes the memory access latency and bandwidth information from various memory access initiator proximity domains. The latency and bandwidth numbers represented in this structure correspond to rated latency and bandwidth for the platform. The software could use this information as hint for optimization. Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v20: - Fix the broken CI case when user input latency or bandwidth less than required Changes in v17: - Remove unnecessary header file (Igor) Changes in v16: - Add more description for lb_length (Igor) - Drop entry_list and calculate entries in this patch (Igor) Changes in v13: - Calculate the entries in a new patch. --- hw/acpi/hmat.c | 104 - 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 9ff79308a4..4635d45dee 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -25,6 +25,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "sysemu/numa.h" #include "hw/acpi/hmat.h" @@ -67,11 +68,89 @@ static void build_hmat_mpda(GArray *table_data, uint16_t flags, build_append_int_noprefix(table_data, 0, 8); } +/* + * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information + * Structure: Table 5-146 + */ +static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, + uint32_t num_initiator, uint32_t num_target, + uint32_t *initiator_list) +{ +int i, index; +HMAT_LB_Data *lb_data; +uint16_t *entry_list; +uint32_t base; +/* Length in bytes for entire structure */ +uint32_t lb_length += 32 /* Table length upto and including Entry Base Unit */ ++ 4 * num_initiator /* Initiator Proximity Domain List */ ++ 4 * num_target /* Target Proximity Domain List */ ++ 2 * num_initiator * num_target; /* Latency or Bandwidth Entries */ + +/* Type */ +build_append_int_noprefix(table_data, 1, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* 
Length */ +build_append_int_noprefix(table_data, lb_length, 4); +/* Flags: Bits [3:0] Memory Hierarchy, Bits[7:4] Reserved */ +assert(!(hmat_lb->hierarchy >> 4)); +build_append_int_noprefix(table_data, hmat_lb->hierarchy, 1); +/* Data Type */ +build_append_int_noprefix(table_data, hmat_lb->data_type, 1); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Number of Initiator Proximity Domains (s) */ +build_append_int_noprefix(table_data, num_initiator, 4); +/* Number of Target Proximity Domains (t) */ +build_append_int_noprefix(table_data, num_target, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 4); + +/* Entry Base Unit */ +if (hmat_lb->data_type <= HMAT_LB_DATA_WRITE_LATENCY) { +/* Convert latency base from nanoseconds to picosecond */ +base = hmat_lb->base * 1000; +} else { +/* Convert bandwidth base from Byte to Megabyte */ +base = hmat_lb->base / MiB; +} +build_append_int_noprefix(table_data, base, 8); + +/* Initiator Proximity Domain List */ +for (i = 0; i < num_initiator; i++) { +build_append_int_noprefix(table_data, initiator_list[i], 4); +} + +/* Target Proximity Domain List */ +for (i = 0; i < num_target; i++) { +build_append_int_noprefix(table_data, i, 4); +} + +/* Latency or Bandwidth Entries */ +entry_list = g_malloc0(num_initiator * num_target * sizeof(uint16_t)); +for (i = 0; i < hmat_lb->list->len; i++) { +lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); +index = lb_data->initiator * num_target + lb_data->target; + +entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); +} + +for (i = 0; i < num_initiator * num_target; i++) { +build_append_int_noprefix(table_data, entry_list[i], 2); +} + +g_free(entry_list); +} + /* Build HMAT sub table structures */ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) { uint16_t flags; -int i; +uint32_t num_initiator = 0; +uint32_t initiator_list[MAX_NODES]; +int i, hierarchy, type; +HMAT_LB_Info *hmat_lb; for (i = 0; i < 
numa_state->num_nodes; i++) { flags = 0; @@ -82,6 +161,29 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); } + +for (i = 0; i < numa_state->num_nodes; i++) { +if (numa_state->nodes[i].has_cpu) { +initiator_list[num_initiator++] = i; +} +} + +/* +
[PATCH RESEND v20 7/8] tests/numa: Add case for QMP build HMAT
Check configuring HMAT usecase Acked-by: Markus Armbruster Suggested-by: Igor Mammedov Signed-off-by: Tao Xu --- Changes in v20: - Fix the wrong target in pc_hmat_erange_cfg - Use g_assert_true and g_assert_false to replace g_assert (Thomas and Markus) Changes in v19: - Add some fail cases for hmat-cache when level=0 Changes in v18: - Rewrite the lines over 80 characters Changes in v17: - Add some fail test cases (Igor) --- tests/numa-test.c | 213 ++ 1 file changed, 213 insertions(+) diff --git a/tests/numa-test.c b/tests/numa-test.c index 8de8581231..17dd807d2a 100644 --- a/tests/numa-test.c +++ b/tests/numa-test.c @@ -327,6 +327,216 @@ static void pc_dynamic_cpu_cfg(const void *data) qtest_quit(qs); } +static void pc_hmat_build_cfg(const void *data) +{ +QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " + "-smp 2,sockets=2 " + "-m 128M,slots=2,maxmem=1G " + "-object memory-backend-ram,size=64M,id=m0 " + "-object memory-backend-ram,size=64M,id=m1 " + "-numa node,nodeid=0,memdev=m0 " + "-numa node,nodeid=1,memdev=m1,initiator=0 " + "-numa cpu,node-id=0,socket-id=0 " + "-numa cpu,node-id=0,socket-id=1", + data ? 
(char *)data : ""); + +/* Fail: Initiator should be less than the number of nodes */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 2, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + +/* Fail: Target should be less than the number of nodes */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 2," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + +/* Fail: Initiator should contain cpu */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 1, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + +/* Fail: Data-type mismatch */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"write-latency\"," +" 'bandwidth': 524288000 } }"))); +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"read-bandwidth\"," +" 'latency': 5 } }"))); + +/* Fail: Bandwidth should be 1MB (1048576) aligned */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," +" 'bandwidth': 1048575 } }"))); + +/* Configuring HMAT bandwidth and latency details */ +g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\"," +" 'latency': 1 } }")));/* 1 ns */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," 
+" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy&
[PATCH RESEND v20 3/8] numa: Extend CLI to provide memory side cache information
From: Liu Jingqi Add -numa hmat-cache option to provide Memory Side Cache Information. These memory attributes help to build Memory Side Cache Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-cache option, enable HMAT with -machine hmat=on. Acked-by: Markus Armbruster Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v20: - Disable cache level 0 in hmat-cache option (Igor) - Update the QAPI description (Markus) Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) - Update the QAPI comments - Add a check for no memory side cache Changes in v18: - Update the error message (Igor) Changes in v17: - Use NumaHmatCacheOptions to replace HMAT_Cache_Info (Igor) - Add check for unordered cache level input (Igor) Changes in v16: - Add cross check with hmat_lb data (Igor) - Drop total_levels in struct HMAT_Cache_Info (Igor) - Correct the error table number (Igor) --- hw/core/numa.c| 80 ++ include/sysemu/numa.h | 5 +++ qapi/machine.json | 81 +-- qemu-options.hx | 17 +++-- 4 files changed, 179 insertions(+), 4 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index 34eb413f5d..33fda31a4c 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -379,6 +379,73 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, g_array_append_val(hmat_lb->list, lb_data); } +void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, + Error **errp) +{ +int nb_numa_nodes = ms->numa_state->num_nodes; +NodeInfo *numa_info = ms->numa_state->nodes; +NumaHmatCacheOptions *hmat_cache = NULL; + +if (node->node_id >= nb_numa_nodes) { +error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less " + "than %d", node->node_id, nb_numa_nodes); +return; +} + +if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) { +error_setg(errp, "The latency and bandwidth information of " + "node-id=%" PRIu32 " should be provided before memory side " + "cache 
attributes", node->node_id); +return; +} + +if (node->level < 1 || node->level >= HMAT_LB_LEVELS) { +error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 " + "and less than or equal to %d", node->level, + HMAT_LB_LEVELS - 1); +return; +} + +assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX); +assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX); +if (ms->numa_state->hmat_cache[node->node_id][node->level]) { +error_setg(errp, "Duplicate configuration of the side cache for " + "node-id=%" PRIu32 " and level=%" PRIu8, + node->node_id, node->level); +return; +} + +if ((node->level > 1) && +ms->numa_state->hmat_cache[node->node_id][node->level - 1] && +(node->size >= +ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) { +error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be less than the size(%" PRIu64 ") of " + "level=%" PRIu8, node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level - 1]->size, + node->level - 1); +return; +} + +if ((node->level < HMAT_LB_LEVELS - 1) && +ms->numa_state->hmat_cache[node->node_id][node->level + 1] && +(node->size <= +ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) { +error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be larger than the size(%" PRIu64 ") of " + "level=%" PRIu8, node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level + 1]->size, + node->level + 1); +return; +} + +hmat_cache = g_malloc0(sizeof(*hmat_cache)); +memcpy(hmat_cache, node, sizeof(*hmat_cache)); +ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache; +} + void set_numa_options(MachineState *ms, NumaOpti
[PATCH RESEND v20 8/8] tests/bios-tables-test: add test cases for ACPI HMAT
ACPI table HMAT has been introduced, QEMU now builds HMAT tables for Heterogeneous Memory with boot option '-numa node'. Add test cases on PC and Q35 machines with 2 numa nodes. Because HMAT is generated when system enable numa, the following tables need to be added for this test: tests/data/acpi/pc/APIC.acpihmat tests/data/acpi/pc/SRAT.acpihmat tests/data/acpi/pc/HMAT.acpihmat tests/data/acpi/pc/DSDT.acpihmat tests/data/acpi/q35/APIC.acpihmat tests/data/acpi/q35/SRAT.acpihmat tests/data/acpi/q35/HMAT.acpihmat tests/data/acpi/q35/DSDT.acpihmat Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jingqi Liu Suggested-by: Igor Mammedov Signed-off-by: Tao Xu --- No changes in v20. Changes in v18: - Remove unit "ns". Changes in v17: - Update the latency and bandwidth Changes in v15: - Make tests without breaking CI (Michael) Changes in v13: - Use decimal notation with appropriate suffix for cache size --- tests/bios-tables-test-allowed-diff.h | 8 + tests/bios-tables-test.c | 44 +++ tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 10 files changed, 52 insertions(+) create mode 100644 tests/data/acpi/pc/APIC.acpihmat create mode 100644 tests/data/acpi/pc/DSDT.acpihmat create mode 100644 tests/data/acpi/pc/HMAT.acpihmat create mode 100644 tests/data/acpi/pc/SRAT.acpihmat create mode 100644 tests/data/acpi/q35/APIC.acpihmat create mode 100644 tests/data/acpi/q35/DSDT.acpihmat create mode 100644 tests/data/acpi/q35/HMAT.acpihmat create mode 100644 tests/data/acpi/q35/SRAT.acpihmat diff --git a/tests/bios-tables-test-allowed-diff.h b/tests/bios-tables-test-allowed-diff.h index dfb8523c8b..3c9e0c979b 100644 --- a/tests/bios-tables-test-allowed-diff.h +++ 
b/tests/bios-tables-test-allowed-diff.h @@ -1 +1,9 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/pc/APIC.acpihmat", +"tests/data/acpi/pc/SRAT.acpihmat", +"tests/data/acpi/pc/HMAT.acpihmat", +"tests/data/acpi/pc/DSDT.acpihmat", +"tests/data/acpi/q35/APIC.acpihmat", +"tests/data/acpi/q35/SRAT.acpihmat", +"tests/data/acpi/q35/HMAT.acpihmat", +"tests/data/acpi/q35/DSDT.acpihmat", diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c index 79f5da092f..9823820043 100644 --- a/tests/bios-tables-test.c +++ b/tests/bios-tables-test.c @@ -947,6 +947,48 @@ static void test_acpi_virt_tcg_numamem(void) } +static void test_acpi_tcg_acpi_hmat(const char *machine) +{ +test_data data; + +memset(&data, 0, sizeof(data)); +data.machine = machine; +data.variant = ".acpihmat"; +test_acpi_one(" -machine hmat=on" + " -smp 2,sockets=2" + " -m 128M,slots=2,maxmem=1G" + " -object memory-backend-ram,size=64M,id=m0" + " -object memory-backend-ram,size=64M,id=m1" + " -numa node,nodeid=0,memdev=m0" + " -numa node,nodeid=1,memdev=m1,initiator=0" + " -numa cpu,node-id=0,socket-id=0" + " -numa cpu,node-id=0,socket-id=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-latency,latency=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=65534M" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-latency,latency=65534" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=32767M" + " -numa hmat-cache,node-id=0,size=10K,level=1," + "associativity=direct,policy=write-back,line=8" + " -numa hmat-cache,node-id=1,size=10K,level=1," + "associativity=direct,policy=write-back,line=8", + &data); +free_test_data(&data); +} + +static void test_acpi_q35_tcg_acpi_hmat(void) +{ +test_acpi_tcg_acpi_hmat(MACHINE_Q35); +} + +static void test_acpi_piix4_tcg_acpi_hmat(void) +{ +test_acpi_tcg_acpi_hmat(MACHINE_PC); +} + static void 
test_acpi_virt_tcg(void) { test_data data = { @@ -991,6 +1033,8 @@ int main(int argc, char *argv[]) qt
[PATCH RESEND v20 1/8] numa: Extend CLI to provide initiator information for numa nodes
In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT), The initiator represents processor which access to memory. And in 5.2.27.3 Memory Proximity Domain Attributes Structure, the attached initiator is defined as where the memory controller responsible for a memory proximity domain. With attached initiator information, the topology of heterogeneous memory can be described. Add new machine property 'hmat' to enable all HMAT specific options. Extend CLI of "-numa node" option to indicate the initiator numa node-id. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. Before using initiator option, enable HMAT with -machine hmat=on. Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Jingqi Liu Suggested-by: Dan Williams Signed-off-by: Tao Xu --- No changes in v20. Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) Changes in v15: - Change the QAPI version tag to 5.0 (Eric) --- hw/core/machine.c | 64 +++ hw/core/numa.c| 23 include/sysemu/numa.h | 5 qapi/machine.json | 10 ++- qemu-options.hx | 35 +++ 5 files changed, 131 insertions(+), 6 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 1689ad3bf8..d7d2cfa66d 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -518,6 +518,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp) ms->nvdimms_state->is_enabled = value; } +static bool machine_get_hmat(Object *obj, Error **errp) +{ +MachineState *ms = MACHINE(obj); + +return ms->numa_state->hmat_enabled; +} + +static void machine_set_hmat(Object *obj, bool value, Error **errp) +{ +MachineState *ms = MACHINE(obj); + +ms->numa_state->hmat_enabled = value; +} + static char *machine_get_nvdimm_persistence(Object *obj, Error **errp) { MachineState *ms = MACHINE(obj); @@ -645,6 +659,7 @@ void machine_set_cpu_numa_node(MachineState *machine, const CpuInstanceProperties *props, Error **errp) { MachineClass *mc = 
MACHINE_GET_CLASS(machine); +NodeInfo *numa_info = machine->numa_state->nodes; bool match = false; int i; @@ -714,6 +729,17 @@ void machine_set_cpu_numa_node(MachineState *machine, match = true; slot->props.node_id = props->node_id; slot->props.has_node_id = props->has_node_id; + +if (machine->numa_state->hmat_enabled) { +if ((numa_info[props->node_id].initiator < MAX_NODES) && +(props->node_id != numa_info[props->node_id].initiator)) { +error_setg(errp, "The initiator of CPU NUMA node %" PRId64 +" should be itself", props->node_id); +return; +} +numa_info[props->node_id].has_cpu = true; +numa_info[props->node_id].initiator = props->node_id; +} } if (!match) { @@ -960,6 +986,13 @@ static void machine_initfn(Object *obj) if (mc->numa_mem_supported) { ms->numa_state = g_new0(NumaState, 1); +object_property_add_bool(obj, "hmat", + machine_get_hmat, machine_set_hmat, + &error_abort); +object_property_set_description(obj, "hmat", +"Set on/off to enable/disable " +"ACPI Heterogeneous Memory Attribute " +"Table (HMAT)", NULL); } /* Register notifier when init is done for sysbus sanity checks */ @@ -1048,6 +1081,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu) return g_string_free(s, false); } +static void numa_validate_initiator(NumaState *numa_state) +{ +int i; +NodeInfo *numa_info = numa_state->nodes; + +for (i = 0; i < numa_state->num_nodes; i++) { +if (numa_info[i].initiator == MAX_NODES) { +error_report("The initiator of NUMA node %d is missing, use " + "'-numa node,initiator' option to declare it", i); +exit(1); +} + +if (!numa_info[numa_info[i].initiator].present) { +error_report("NUMA node %" PRIu16 " is missing, use " + "'-numa node' option to declare it first", + numa_info[i].initiator); +exit(1); +} + +if (!numa_info[numa_info[i].initiator].has_cpu) { +error_report("The initiator of NUMA node %d is invalid", i); +exit(1); +}
[PATCH RESEND v20 2/8] numa: Extend CLI to provide memory latency and bandwidth information
From: Liu Jingqi Add -numa hmat-lb option to provide System Locality Latency and Bandwidth Information. These memory attributes help to build System Locality Latency and Bandwidth Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-lb option, enable HMAT with -machine hmat=on. Acked-by: Markus Armbruster Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v20: - Update the QAPI description (Markus) - Keep base and bitmap unchanged when latency or bandwidth out of range Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) Changes in v18: - Use qapi type uint64 and only nanosecond for latency (Markus) Changes in v17: - Add check when user input latency or bandwidth 0, the lb_info_provided should also be 0. Because in ACPI 6.3 5.2.27.4, 0 means the corresponding latency or bandwidth information is not provided. - Fix the infinite loop when node->latency is 0. --- hw/core/numa.c| 194 ++ include/sysemu/numa.h | 53 qapi/machine.json | 93 +++- qemu-options.hx | 47 +- 4 files changed, 384 insertions(+), 3 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index e60da99293..34eb413f5d 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "sysemu/hostmem.h" #include "sysemu/numa.h" #include "sysemu/sysemu.h" @@ -198,6 +199,186 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp) ms->numa_state->have_numa_distance = true; } +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, +Error **errp) +{ +int i, first_bit, last_bit; +uint64_t max_entry, temp_base, bitmap_copy; +NodeInfo *numa_info = numa_state->nodes; +HMAT_LB_Info *hmat_lb = +numa_state->hmat_lb[node->hierarchy][node->data_type]; +HMAT_LB_Data lb_data = {}; +HMAT_LB_Data *lb_temp; + +/* Error checking */ +if (node->initiator > numa_state->num_nodes) { +error_setg(errp, "Invalid initiator=%d, it 
should be less than %d", + node->initiator, numa_state->num_nodes); +return; +} +if (node->target > numa_state->num_nodes) { +error_setg(errp, "Invalid target=%d, it should be less than %d", + node->target, numa_state->num_nodes); +return; +} +if (!numa_info[node->initiator].has_cpu) { +error_setg(errp, "Invalid initiator=%d, it isn't an " + "initiator proximity domain", node->initiator); +return; +} +if (!numa_info[node->target].present) { +error_setg(errp, "The target=%d should point to an existing node", + node->target); +return; +} + +if (!hmat_lb) { +hmat_lb = g_malloc0(sizeof(*hmat_lb)); +numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb; +hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data)); +} +hmat_lb->hierarchy = node->hierarchy; +hmat_lb->data_type = node->data_type; +lb_data.initiator = node->initiator; +lb_data.target = node->target; + +if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) { +/* Input latency data */ + +if (!node->has_latency) { +error_setg(errp, "Missing 'latency' option"); +return; +} +if (node->has_bandwidth) { +error_setg(errp, "Invalid option 'bandwidth' since " + "the data type is latency"); +return; +} + +/* Detect duplicate configuration */ +for (i = 0; i < hmat_lb->list->len; i++) { +lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); + +if (node->initiator == lb_temp->initiator && +node->target == lb_temp->target) { +error_setg(errp, "Duplicate configuration of the latency for " +"initiator=%d and target=%d", node->initiator, +node->target); +return; +} +} + +hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX; + +if (node->latency) { +/* Calculate the temporary base and compressed latency */ +max_entry = node->latency; +temp_base = 1; +while (QEMU_IS_ALIGNED(max_entry, 10)) { +max_entry /= 10; +temp_base *=
[PATCH RESEND v20 4/8] hmat acpi: Build Memory Proximity Domain Attributes Structure(s)
From: Liu Jingqi HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table (HMAT). The specification references below link: http://www.uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf It describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use this information as hint for optimization. This structure describes Memory Proximity Domain Attributes by memory subsystem and its associativity with processor proximity domain as well as hint for memory usage. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jonathan Cameron Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- No changes in v20. Changes in v16: - Use uint32_t for initiator and mem_node Changes in v13: - Remove the unnecessary head file. --- hw/acpi/Kconfig | 7 ++- hw/acpi/Makefile.objs | 1 + hw/acpi/hmat.c| 99 +++ hw/acpi/hmat.h| 42 ++ hw/i386/acpi-build.c | 5 +++ 5 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 hw/acpi/hmat.c create mode 100644 hw/acpi/hmat.h diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig index 12e3f1e86e..54209c6f2f 100644 --- a/hw/acpi/Kconfig +++ b/hw/acpi/Kconfig @@ -7,6 +7,7 @@ config ACPI_X86 select ACPI_NVDIMM select ACPI_CPU_HOTPLUG select ACPI_MEMORY_HOTPLUG +select ACPI_HMAT config ACPI_X86_ICH bool @@ -23,6 +24,10 @@ config ACPI_NVDIMM bool depends on ACPI +config ACPI_HMAT +bool +depends on ACPI + config ACPI_PCI bool depends on ACPI && PCI @@ -33,5 +38,3 @@ config ACPI_VMGENID depends on PC config ACPI_HW_REDUCED -bool -depends on ACPI diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs index 655a9c1973..517bd88704 100644 --- a/hw/acpi/Makefile.objs +++ b/hw/acpi/Makefile.objs @@ -7,6 +7,7 @@ common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o 
common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o common-obj-$(CONFIG_ACPI_HW_REDUCED) += generic_event_device.o +common-obj-$(CONFIG_ACPI_HMAT) += hmat.o common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o common-obj-y += acpi_interface.o diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c new file mode 100644 index 00..9ff79308a4 --- /dev/null +++ b/hw/acpi/hmat.c @@ -0,0 +1,99 @@ +/* + * HMAT ACPI Implementation + * + * Copyright(C) 2019 Intel Corporation. + * + * Author: + * Liu jingqi + * Tao Xu + * + * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table + * (HMAT) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/> + */ + +#include "qemu/osdep.h" +#include "sysemu/numa.h" +#include "hw/acpi/hmat.h" + +/* + * ACPI 6.3: + * 5.2.27.3 Memory Proximity Domain Attributes Structure: Table 5-145 + */ +static void build_hmat_mpda(GArray *table_data, uint16_t flags, +uint32_t initiator, uint32_t mem_node) +{ + +/* Memory Proximity Domain Attributes Structure */ +/* Type */ +build_append_int_noprefix(table_data, 0, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Length */ +build_append_int_noprefix(table_data, 40, 4); +/* Flags */ +build_append_int_noprefix(table_data, flags, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Proximity Domain for the Attached Initiator */ +build_append_int_noprefix(table_data, initiator, 4); +/* Proximity Domain for the Memory */ +build_append_int_noprefix(table_data, mem_node, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 4); +/* + * Reserved: + * Previously defined as the Start Address of the System Physical + * Address Range. Deprecated since ACPI Spec 6.3. + */ +build_append_int_noprefix(table_data, 0, 8); +/* + * Reserved: + * Previously defined as the Range Length of the region in bytes. + * Deprecated since ACPI Spec 6.3. + */ +build_append_int_noprefix(table_data, 0, 8); +} + +/
[PATCH RESEND v20 0/8] Build ACPI Heterogeneous Memory Attribute Table (HMAT)
This series of patches will build Heterogeneous Memory Attribute Table (HMAT) according to the command line. The ACPI HMAT describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use HMAT information as hint for optimization. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. The V19 patches link: https://patchwork.kernel.org/cover/11265525/ Changelog: v20: - Resend to fix the wrong target in pc_hmat_erange_cfg() - Use g_assert_true and g_assert_false to replace g_assert (Thomas and Markus) - Rename assoc as associativity, update the QAPI description (Markus) - Disable cache level 0 in hmat-cache option (Igor) - Keep base and bitmap unchanged when latency or bandwidth out of range - Fix the broken CI case when user input latency or bandwidth less than required. v19: - Add description about the machine property 'hmat' in commit message (Markus) - Update the QAPI comments - Add a check for no memory side cache - Add some fail cases for hmat-cache when level=0 v18: - Defer patches 01/14~06/14 of V17, use qapi type uint64 and only nanosecond for latency (Markus) - Rewrite the lines over 80 characters(Igor) v17: - Add check when user input latency or bandwidth 0, the lb_info_provided should also be 0. Because in ACPI 6.3 5.2.27.4, 0 means the corresponding latency or bandwidth information is not provided. - Fix the infinite loop when node->latency is 0. 
- Use NumaHmatCacheOptions to replace HMAT_Cache_Info (Igor) - Add check for unordered cache level input (Igor) - Add some fail test cases (Igor) v16: - Add and use qemu_strtold_finite to parse size, support full 64bit precision, modify related test cases (Eduardo and Markus) - Simplify struct HMAT_LB_Info and related code, unify latency and bandwidth (Igor) - Add cross check with hmat_lb data (Igor) - Fields in Cache Attributes are promoted to uint32_t before shifting (Igor) - Add case for QMP build HMAT (Igor) v15: - Add a new patch to refactor do_strtosz() (Eduardo) - Make tests without breaking CI (Michael) v14: - Reuse the codes of do_strtosz to build qemu_strtotime_ns (Eduardo) - Squash patch v13 01/12 and 02/12 together (Daniel and Eduardo) - Drop time unit picosecond (Eric) - Use qemu ctz64 and clz64 instead of builtin function v13: - Modify some text description - Drop "initiator_valid" field in struct NodeInfo - Reuse Garray to store the raw bandwidth and bandwidth data - Calculate common base unit using range bitmap - Add a patch to calculate hmat latency and bandwidth entry list - Drop the total_levels option and use readable cache size - Remove the unnecessary header file - Use decimal notation with appropriate suffix for cache size Liu Jingqi (5): numa: Extend CLI to provide memory latency and bandwidth information numa: Extend CLI to provide memory side cache information hmat acpi: Build Memory Proximity Domain Attributes Structure(s) hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s) hmat acpi: Build Memory Side Cache Information Structure(s) Tao Xu (3): numa: Extend CLI to provide initiator information for numa nodes tests/numa: Add case for QMP build HMAT tests/bios-tables-test: add test cases for ACPI HMAT hw/acpi/Kconfig | 7 +- hw/acpi/Makefile.objs | 1 + hw/acpi/hmat.c| 268 +++ hw/acpi/hmat.h| 42 hw/core/machine.c | 64 ++ hw/core/numa.c| 297 ++ hw/i386/acpi-build.c | 5 + include/sysemu/numa.h | 63 ++ qapi/machine.json 
| 180 +++- qemu-options.hx | 95 +++- tests/bios-tables-test-allowed-diff.h | 8 + tests/bios-tables-test.c | 44 tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 tests/numa-test.c | 213 ++ 21 files changed, 1276 insertions(+), 11 deletions(-) create mode 100644 hw/acpi/hmat.c create mode 100644 hw/acpi/hmat.h create mode 100644 tests/data/acpi/pc/APIC.acpihmat create mode 100644 tests/data/acpi/pc/DSDT.acpihmat create mode 100644 tests/data/acpi/pc/HMAT.acpihmat create mode 100644 tests/data/acpi/pc/SRAT.
[PATCH RESEND v2] util/cutils: Expand do_strtosz parsing precision to 64 bits
Parse input string both as a double and as a uint64_t, then use the method which consumes more characters. Update the related test cases. Signed-off-by: Tao Xu --- Changes in v2: - Resend to use double small than DBL_MIN - Add more test case for double overflow and underflow. - Set mul as int64_t (Markus) - Restore endptr (Markus) --- tests/test-cutils.c| 37 +++ tests/test-keyval.c| 47 + tests/test-qemu-opts.c | 39 +--- util/cutils.c | 67 +++--- 4 files changed, 75 insertions(+), 115 deletions(-) diff --git a/tests/test-cutils.c b/tests/test-cutils.c index 1aa8351520..49e495b8ba 100644 --- a/tests/test-cutils.c +++ b/tests/test-cutils.c @@ -1970,40 +1970,25 @@ static void test_qemu_strtosz_simple(void) g_assert_cmpint(err, ==, 0); g_assert_cmpint(res, ==, 12345); -/* Note: precision is 53 bits since we're parsing with strtod() */ - -str = "9007199254740991"; /* 2^53-1 */ -err = qemu_strtosz(str, &endptr, &res); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x1f); -g_assert(endptr == str + 16); - -str = "9007199254740992"; /* 2^53 */ -err = qemu_strtosz(str, &endptr, &res); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x20); -g_assert(endptr == str + 16); +/* Note: precision is 64 bits (UINT64_MAX) */ str = "9007199254740993"; /* 2^53+1 */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x20); /* rounded to 53 bits */ +g_assert_cmpint(res, ==, 0x21); g_assert(endptr == str + 16); -str = "18446744073709549568"; /* 0xf800 (53 msbs set) */ +str = "18446744073709550591"; /* 0xfbff */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0xf800); +g_assert_cmpint(res, ==, 0xfbff); g_assert(endptr == str + 20); -str = "18446744073709550591"; /* 0xfbff */ +str = "18446744073709551615"; /* 2^64-1 (UINT64_MAX) */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0xf800); /* rounded to 53 bits */ +g_assert_cmpint(res, ==, 
0x); g_assert(endptr == str + 20); - -/* 0x7e00..0x7fff get rounded to - * 0x8000, thus -ERANGE; see test_qemu_strtosz_erange() */ } static void test_qemu_strtosz_units(void) @@ -2145,20 +2130,20 @@ static void test_qemu_strtosz_erange(void) g_assert_cmpint(err, ==, -ERANGE); g_assert(endptr == str + 2); -str = "18446744073709550592"; /* 0xfc00 */ +str = "18446744073709551616"; /* 2^64 */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, -ERANGE); g_assert(endptr == str + 20); -str = "18446744073709551615"; /* 2^64-1 */ +str = "1.7976931348623158e+308"; /* DBL_MAX, double overflows */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, -ERANGE); -g_assert(endptr == str + 20); +g_assert(endptr == str + 23); -str = "18446744073709551616"; /* 2^64 */ +str = "2.225e-308"; /* Small than DBL_MIN, double underflows */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, -ERANGE); -g_assert(endptr == str + 20); +g_assert(endptr == str + 10); str = "20E"; err = qemu_strtosz(str, &endptr, &res); diff --git a/tests/test-keyval.c b/tests/test-keyval.c index 09b0ae3c68..fad941fcb8 100644 --- a/tests/test-keyval.c +++ b/tests/test-keyval.c @@ -383,59 +383,26 @@ static void test_keyval_visit_size(void) visit_end_struct(v, NULL); visit_free(v); -/* Note: precision is 53 bits since we're parsing with strtod() */ +/* Note: precision is 64 bits (UINT64_MAX) */ -/* Around limit of precision: 2^53-1, 2^53, 2^53+1 */ -qdict = keyval_parse("sz1=9007199254740991," - "sz2=9007199254740992," - "sz3=9007199254740993", +/* Around limit of precision: UINT64_MAX - 1, UINT64_MAX */ +qdict = keyval_parse("sz1=18446744073709551614," + "sz2=18446744073709551615", NULL, &error_abort); v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); qobject_unref(qdict); visit_start_struct(v, NULL, NULL, 0, &error_abort); visit_type_size(v, "sz1", &sz, &error_abort); -g_assert_cmphex(sz, ==, 0x1f
[PATCH v2 4/4] target/i386: Add notes for versioned CPU models
Add which features are added or removed in this version. Remove the changed model-id in versioned CPU models. Signed-off-by: Tao Xu --- Changes in v2: - correct the note of Cascadelake v3 (Xiaoyao) --- target/i386/cpu.c | 50 +++ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 7b3bd6d4db..4717862cee 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -2281,10 +2281,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Nehalem-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Core i7 9xx (Nehalem Core i7, IBRS update)" }, { /* end of list */ } } }, @@ -2362,10 +2361,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Westmere-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Westmere E56xx/L56xx/X56xx (IBRS update)" }, { /* end of list */ } } }, @@ -2448,10 +2446,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "SandyBridge-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Xeon E312xx (Sandy Bridge, IBRS update)" }, { /* end of list */ } } }, @@ -2540,10 +2537,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "IvyBridge-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Xeon E3-12xx v2 (Ivy Bridge, IBRS)" }, { /* end of list */ } } }, @@ -2637,17 +2633,18 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Haswell-noTSX", +.note = "no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, { "stepping", "1" }, -{ "model-id", "Intel Core Processor (Haswell, no TSX)", }, { /* end of list */ } }, }, { .version = 3, .alias = "Haswell-IBRS", +.note = "IBRS", .props = (PropValue[]) { /* Restore TSX features removed by -v2 above */ { "hle", "on" }, @@ -2658,21 +2655,18 @@ static X86CPUDefinition 
builtin_x86_defs[] = { */ { "stepping", "4" }, { "spec-ctrl", "on" }, -{ "model-id", - "Intel Core Processor (Haswell, IBRS)" }, { /* end of list */ } } }, { .version = 4, .alias = "Haswell-noTSX-IBRS", +.note = "no TSX, IBRS", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, /* spec-ctrl was already enabled by -v3 above */ { "stepping", "1" }, -{ "model-id", - "Intel Core Processor (Haswell, no TSX, IBRS)" }, { /* end of list */ } } }, @@ -2768,35 +2762,33 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Broadwell-noTSX", +.note = "no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, -{ "model-id", "Inte
[PATCH v2 1/4] target/i386: Add Denverton-v2 (no MPX) CPU model
Because MPX is being removed from the Linux kernel, remove the MPX feature from Denverton. Signed-off-by: Tao Xu --- target/i386/cpu.c | 12 1 file changed, 12 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 69f518a21a..06a3077f95 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3482,6 +3482,18 @@ static X86CPUDefinition builtin_x86_defs[] = { .features[FEAT_VMX_VMFUNC] = MSR_VMX_VMFUNC_EPT_SWITCHING, .xlevel = 0x8008, .model_id = "Intel Atom Processor (Denverton)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ "mpx", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Snowridge", -- 2.20.1
[PATCH v2 0/4] Add extra information to versioned CPU models
This series of patches will remove MPX from Denverton and remove MONITOR from some CPU models. It also adds additional information to -cpu help to indicate the changes in each version of a CPU model. The output is as follows: ./x86_64-softmmu/qemu-system-x86_64 -cpu help | grep "\[" x86 Broadwell-v2 Intel Core Processor (Broadwell) [no TSX] x86 Broadwell-v3 Intel Core Processor (Broadwell) [IBRS] x86 Broadwell-v4 Intel Core Processor (Broadwell) [no TSX, IBRS] x86 Cascadelake-Server-v2 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES] x86 Cascadelake-Server-v3 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES, no TSX] x86 Denverton-v2 Intel Atom Processor (Denverton) [no MPX, no MONITOR] x86 Dhyana-v2 Hygon Dhyana Processor [no MONITOR] x86 EPYC-v2 AMD EPYC Processor [IBPB] x86 EPYC-v3 AMD EPYC Processor [IBPB, no MONITOR] x86 Haswell-v2 Intel Core Processor (Haswell) [no TSX] x86 Haswell-v3 Intel Core Processor (Haswell) [IBRS] x86 Haswell-v4 Intel Core Processor (Haswell) [no TSX, IBRS] x86 Icelake-Client-v2 Intel Core Processor (Icelake) [no TSX] x86 Icelake-Server-v2 Intel Xeon Processor (Icelake) [no TSX] x86 IvyBridge-v2 Intel Xeon E3-12xx v2 (Ivy Bridge) [IBRS] x86 Nehalem-v2 Intel Core i7 9xx (Nehalem Class Core i7) [IBRS] x86 Opteron_G3-v2 AMD Opteron 23xx (Gen 3 Class Opteron) [no MONITOR] x86 SandyBridge-v2 Intel Xeon E312xx (Sandy Bridge) [IBRS] x86 Skylake-Client-v2 Intel Core Processor (Skylake) [IBRS] x86 Skylake-Client-v3 Intel Core Processor (Skylake) [no TSX, IBRS] x86 Skylake-Server-v2 Intel Xeon Processor (Skylake) [IBRS] x86 Skylake-Server-v3 Intel Xeon Processor (Skylake) [no TSX, IBRS] x86 Snowridge-v2 Intel Atom Processor (SnowRidge) [no MPX] x86 Snowridge-v3 Intel Atom Processor (SnowRidge) [no MPX, no MONITOR] x86 Westmere-v2 Westmere E56xx/L56xx/X56xx (Nehalem-C) [IBRS] Changes in v2: - correct the note of Cascadelake v3 (Xiaoyao) Tao Xu (4): target/i386: Add Denverton-v2 (no MPX) CPU model target/i386: Remove monitor from some CPU models 
target/i386: Add new property note to versioned CPU models target/i386: Add notes for versioned CPU models target/i386/cpu.c | 112 +++--- 1 file changed, 85 insertions(+), 27 deletions(-) -- 2.20.1
[PATCH v2 2/4] target/i386: Remove monitor from some CPU models
Add new versions of the Snowridge, Denverton, Opteron_G3, EPYC, and Dhyana CPU models to remove the MONITOR/MWAIT feature. Since QEMU/KVM uses "-overcommit cpu-pm=on" to expose MONITOR/MWAIT (commit id 6f131f13e68d648a8e4f083c667ab1acd88ce4cd), the MONITOR/MWAIT feature in these CPU models is unused. Signed-off-by: Tao Xu --- target/i386/cpu.c | 38 ++ 1 file changed, 38 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 06a3077f95..b09ac38409 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3621,6 +3621,14 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ }, }, }, +{ +.version = 3, +.props = (PropValue[]) { +/* mpx was already removed by -v2 above */ +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, { /* end of list */ }, }, }, @@ -3732,6 +3740,17 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_EXT3_ABM | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, .xlevel = 0x8008, .model_id = "AMD Opteron 23xx (Gen 3 Class Opteron)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Opteron_G4", @@ -3856,6 +3875,14 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } } }, +{ +.version = 3, +.props = (PropValue[]) { +/* ibpb was already enabled by -v2 above */ +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, { /* end of list */ } } }, @@ -3908,6 +3935,17 @@ static X86CPUDefinition builtin_x86_defs[] = { .xlevel = 0x801E, .model_id = "Hygon Dhyana Processor", .cache_info = &epyc_cache_info, +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, }; -- 2.20.1
[PATCH v2 3/4] target/i386: Add new property note to versioned CPU models
Add additional information for -cpu help to indicate the changes in this version of CPU model. Suggested-by: Eduardo Habkost Signed-off-by: Tao Xu --- target/i386/cpu.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index b09ac38409..7b3bd6d4db 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1693,6 +1693,7 @@ typedef struct PropValue { typedef struct X86CPUVersionDefinition { X86CPUVersion version; const char *alias; +const char *note; PropValue *props; } X86CPUVersionDefinition; @@ -1723,6 +1724,7 @@ struct X86CPUModel { X86CPUDefinition *cpudef; /* CPU model version */ X86CPUVersion version; +const char *note; /* * If true, this is an alias CPU model. * This matters only for "-cpu help" and query-cpu-definitions @@ -4788,6 +4790,7 @@ static void x86_cpu_list_entry(gpointer data, gpointer user_data) char *name = x86_cpu_class_get_model_name(cc); char *desc = g_strdup(cc->model_description); char *alias_of = x86_cpu_class_get_alias_of(cc); +char *model_id = x86_cpu_class_get_model_id(cc); if (!desc && alias_of) { if (cc->model && cc->model->version == CPU_VERSION_AUTO) { @@ -4796,14 +4799,18 @@ static void x86_cpu_list_entry(gpointer data, gpointer user_data) desc = g_strdup_printf("(alias of %s)", alias_of); } } +if (!desc && cc->model && cc->model->note) { +desc = g_strdup_printf("%s [%s]", model_id, cc->model->note); +} if (!desc) { -desc = x86_cpu_class_get_model_id(cc); +desc = g_strdup_printf("%s", model_id); } -qemu_printf("x86 %-20s %-48s\n", name, desc); +qemu_printf("x86 %-20s %-58s\n", name, desc); g_free(name); g_free(desc); g_free(alias_of); +g_free(model_id); } /* list available CPU models and flags */ @@ -5280,6 +5287,7 @@ static void x86_register_cpudef_types(X86CPUDefinition *def) X86CPUModel *m = g_new0(X86CPUModel, 1); m->cpudef = def; m->version = vdef->version; +m->note = vdef->note; name = x86_cpu_versioned_model_name(def, vdef->version); 
x86_register_cpu_model_type(name, m); g_free(name); -- 2.20.1
[PATCH v2] util/cutils: Expand do_strtosz parsing precision to 64 bits
Parse input string both as a double and as a uint64_t, then use the method which consumes more characters. Update the related test cases. Signed-off-by: Tao Xu --- Changes in v2: - Add more test case for double overflow and underflow. - Set mul as int64_t (Markus) - Restore endptr (Markus) --- tests/test-cutils.c| 37 +++ tests/test-keyval.c| 47 + tests/test-qemu-opts.c | 39 +--- util/cutils.c | 67 +++--- 4 files changed, 75 insertions(+), 115 deletions(-) diff --git a/tests/test-cutils.c b/tests/test-cutils.c index 1aa8351520..6fa9f88488 100644 --- a/tests/test-cutils.c +++ b/tests/test-cutils.c @@ -1970,40 +1970,25 @@ static void test_qemu_strtosz_simple(void) g_assert_cmpint(err, ==, 0); g_assert_cmpint(res, ==, 12345); -/* Note: precision is 53 bits since we're parsing with strtod() */ - -str = "9007199254740991"; /* 2^53-1 */ -err = qemu_strtosz(str, &endptr, &res); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x1f); -g_assert(endptr == str + 16); - -str = "9007199254740992"; /* 2^53 */ -err = qemu_strtosz(str, &endptr, &res); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x20); -g_assert(endptr == str + 16); +/* Note: precision is 64 bits (UINT64_MAX) */ str = "9007199254740993"; /* 2^53+1 */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x20); /* rounded to 53 bits */ +g_assert_cmpint(res, ==, 0x21); g_assert(endptr == str + 16); -str = "18446744073709549568"; /* 0xf800 (53 msbs set) */ +str = "18446744073709550591"; /* 0xfbff */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0xf800); +g_assert_cmpint(res, ==, 0xfbff); g_assert(endptr == str + 20); -str = "18446744073709550591"; /* 0xfbff */ +str = "18446744073709551615"; /* 2^64-1 (UINT64_MAX) */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0xf800); /* rounded to 53 bits */ +g_assert_cmpint(res, ==, 0x); g_assert(endptr == str + 20); - -/* 
0x7e00..0x7fff get rounded to - * 0x8000, thus -ERANGE; see test_qemu_strtosz_erange() */ } static void test_qemu_strtosz_units(void) @@ -2145,20 +2130,20 @@ static void test_qemu_strtosz_erange(void) g_assert_cmpint(err, ==, -ERANGE); g_assert(endptr == str + 2); -str = "18446744073709550592"; /* 0xfc00 */ +str = "18446744073709551616"; /* 2^64 */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, -ERANGE); g_assert(endptr == str + 20); -str = "18446744073709551615"; /* 2^64-1 */ +str = "1.7976931348623158e+308"; /* DBL_MAX */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, -ERANGE); -g_assert(endptr == str + 20); +g_assert(endptr == str + 23); -str = "18446744073709551616"; /* 2^64 */ +str = "2.2250738585072014e-308"; /* DBL_MIN */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, -ERANGE); -g_assert(endptr == str + 20); +g_assert(endptr == str + 23); str = "20E"; err = qemu_strtosz(str, &endptr, &res); diff --git a/tests/test-keyval.c b/tests/test-keyval.c index 09b0ae3c68..fad941fcb8 100644 --- a/tests/test-keyval.c +++ b/tests/test-keyval.c @@ -383,59 +383,26 @@ static void test_keyval_visit_size(void) visit_end_struct(v, NULL); visit_free(v); -/* Note: precision is 53 bits since we're parsing with strtod() */ +/* Note: precision is 64 bits (UINT64_MAX) */ -/* Around limit of precision: 2^53-1, 2^53, 2^53+1 */ -qdict = keyval_parse("sz1=9007199254740991," - "sz2=9007199254740992," - "sz3=9007199254740993", +/* Around limit of precision: UINT64_MAX - 1, UINT64_MAX */ +qdict = keyval_parse("sz1=18446744073709551614," + "sz2=18446744073709551615", NULL, &error_abort); v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); qobject_unref(qdict); visit_start_struct(v, NULL, NULL, 0, &error_abort); visit_type_size(v, "sz1", &sz, &error_abort); -g_assert_cmphex(sz, ==, 0x1f); +g_assert_cmphex(sz, ==, 0xfffe); visit_type_size(v, "s
Re: [PATCH] util/cutils: Expand do_strtosz parsing precision to 64 bits
On 12/5/19 11:29 PM, Markus Armbruster wrote: Tao Xu writes: Parse input string both as a double and as a uint64_t, then use the method which consumes more characters. Update the related test cases. Signed-off-by: Tao Xu --- [...] diff --git a/util/cutils.c b/util/cutils.c index 77acadc70a..b08058c57c 100644 --- a/util/cutils.c +++ b/util/cutils.c @@ -212,24 +212,43 @@ static int do_strtosz(const char *nptr, const char **end, const char default_suffix, int64_t unit, uint64_t *result) { -int retval; -const char *endptr; +int retval, retd, retu; +const char *suffix, *suffixd, *suffixu; unsigned char c; int mul_required = 0; -double val, mul, integral, fraction; +bool use_strtod; +uint64_t valu; +double vald, mul, integral, fraction; Note for later: @mul is double. + +retd = qemu_strtod_finite(nptr, &suffixd, &vald); +retu = qemu_strtou64(nptr, &suffixu, 0, &valu); +use_strtod = strlen(suffixd) < strlen(suffixu); + +/* + * Parse @nptr both as a double and as a uint64_t, then use the method + * which consumes more characters. + */ The comment is in a funny place. I'd put it right before the qemu_strtod_finite() line. +if (use_strtod) { +suffix = suffixd; +retval = retd; +} else { +suffix = suffixu; +retval = retu; +} -retval = qemu_strtod_finite(nptr, &endptr, &val); if (retval) { goto out; } This is even more subtle than it looks. A close reading of the function contracts leads to three cases for each conversion: * parse error (including infinity and NaN) @retu / @retd is -EINVAL @valu / @vald is uninitialized @suffixu / @suffixd is @nptr * range error @retu / @retd is -ERANGE @valu / @vald is our best approximation of the conversion result @suffixu / @suffixd points to the first character not consumed by the conversion. Sub-cases: - uint64_t overflow We know the conversion result exceeds UINT64_MAX. - double overflow we know the conversion result's magnitude exceeds the largest representable finite double DBL_MAX. 
- double underflow we know the conversion result is close to zero (closer than DBL_MIN, the smallest normalized positive double). * success @retu / @retd is 0 @valu / @vald is the conversion result @suffixu / @suffixd points to the first character not consumed by the conversion. This leads to a matrix (parse error, uint64_t overflow, success) x (parse error, double overflow, double underflow, success). We need to check the code does what we want for each element of this matrix, and document any behavior that's not perfectly obvious. (success, success): we pick double if qemu_strtod_finite() consumed more characters than qemu_strtou64(), else uint64_t. "More" is important here; when they consume the same characters, we *need* to use the uint64_t result. Example: for "18446744073709551615", we need to use uint64_t 18446744073709551615, not double 18446744073709551616.0. But for "18446744073709551616.", we need to use the double. Good. (success, parse error) and (parse error, success): we pick the one that succeeds, because success consumes characters, and failure to parse does not. Good. (parse error, parse error): neither consumes characters, so we pick uint64_t. Good. (parse error, double overflow), (parse error, double underflow) and (uint64_t overflow, parse error): we pick the range error, because it consumes characters. Good. These are the simple combinations. The remainder are hairier: (success, double overflow), (success, double underflow), (uint64_t overflow, success). I lack the time to analyze them today. Must be done before we take this patch. Any takers? (success, double overflow), (success, double underflow): we pick the double range error and return -ERANGE, because it consumes more characters. Example: for "1.79769e+309", qemu_strtou64() consumes "1" and parses it as a uint64_t, but qemu_strtod_finite() returns -ERANGE and consumes all characters. It is OK. (uint64_t overflow, success): both consume the same characters, so we use the uint64_t result and return -ERANGE. 
Note that even if qemu_strtod_finite can parse these cases such as "18446744073709551617", but the result is uint64_t so we also need to return -ERANGE. It is OK. Thank you for your analysis and suggestion. I will add more test cases to cover some of these analysis. -fraction = modf(val, &integral); -if (fraction != 0) { -mul_required = 1; +if (use_strtod) { +fraction = modf(vald, &integral); +if (fraction != 0) { +mul_required = 1; +} } Here, @suffix points to the suffix character, if any. -c = *endptr; +c = *suffix; mul = suffix_mul(c, unit
Re: [PATCH RESEND 4/4] target/i386: Add notes for versioned CPU models
On 12/5/2019 4:44 PM, Xiaoyao Li wrote: On 12/2/2019 2:32 PM, Tao Xu wrote: Add which features are added or removed in this version. Remove the changed model-id in versioned CPU models. Signed-off-by: Tao Xu --- target/i386/cpu.c | 50 +++ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 7b3bd6d4db..c82fbfd02e 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c [...] @@ -3141,6 +3133,7 @@ static X86CPUDefinition builtin_x86_defs[] = { .versions = (X86CPUVersionDefinition[]) { { .version = 1 }, { .version = 2, + .note = "ARCH_CAPABILITIES", Here ARCH_CAPABILITIES doesn't tell what bits in MSR_IA32_ARCH_CAPABILITIES this version has, which makes it meaningless. Maybe .note = "ARCH_CAPABILITIES(rdctl-no, ibrs-all, skip-l1dfl-vmentry, mds-no)", is better? But it is too long for -cpu help, break the info into 2 lines.
Re: [PATCH RESEND 0/4] Add extra information to versioned CPU models
On 12/5/2019 4:55 PM, Xiaoyao Li wrote: On 12/2/2019 2:32 PM, Tao Xu wrote: This series of patches will remove MPX from Denverton, remove monitor from some CPU models. Add additional information for -cpu help to indicate the changes in this version of CPU model. The output is as follows: x86_64-softmmu/qemu-system-x86_64 -cpu help | grep "\[" x86 Broadwell-v2 Intel Core Processor (Broadwell) [no TSX] x86 Broadwell-v3 Intel Core Processor (Broadwell) [IBRS] x86 Broadwell-v4 Intel Core Processor (Broadwell) [no TSX, IBRS] Above, the changes of each Broadwell-v{2,3,4} are based on Broadwell-v1. x86 Cascadelake-Server-v2 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES] x86 Cascadelake-Server-v3 Intel Xeon Processor (Cascadelake) [no TSX] But in the code, Cascadelake-Server-v3 inherits all the features in Cascadelake-Server-v2 and removes TSX related hle & rtm. So if we keep the same rule based on v1, it should be x86 Cascadelake-Server-v3 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES, no TSX] Thank you for your suggestion. I will correct this.
[PATCH] util/cutils: Expand do_strtosz parsing precision to 64 bits
Parse input string both as a double and as a uint64_t, then use the method which consumes more characters. Update the related test cases. Signed-off-by: Tao Xu --- tests/test-cutils.c| 37 - tests/test-keyval.c| 47 --- tests/test-qemu-opts.c | 39 -- util/cutils.c | 74 ++ 4 files changed, 73 insertions(+), 124 deletions(-) diff --git a/tests/test-cutils.c b/tests/test-cutils.c index 1aa8351520..4a7030c611 100644 --- a/tests/test-cutils.c +++ b/tests/test-cutils.c @@ -1970,40 +1970,25 @@ static void test_qemu_strtosz_simple(void) g_assert_cmpint(err, ==, 0); g_assert_cmpint(res, ==, 12345); -/* Note: precision is 53 bits since we're parsing with strtod() */ - -str = "9007199254740991"; /* 2^53-1 */ -err = qemu_strtosz(str, &endptr, &res); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x1f); -g_assert(endptr == str + 16); - -str = "9007199254740992"; /* 2^53 */ -err = qemu_strtosz(str, &endptr, &res); -g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x20); -g_assert(endptr == str + 16); +/* Note: precision is 64 bits (UINT64_MAX) */ str = "9007199254740993"; /* 2^53+1 */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0x20); /* rounded to 53 bits */ +g_assert_cmpint(res, ==, 0x21); g_assert(endptr == str + 16); -str = "18446744073709549568"; /* 0xf800 (53 msbs set) */ +str = "18446744073709550591"; /* 0xfbff */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0xf800); +g_assert_cmpint(res, ==, 0xfbff); g_assert(endptr == str + 20); -str = "18446744073709550591"; /* 0xfbff */ +str = "18446744073709551615"; /* 2^64-1 (UINT64_MAX) */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, 0); -g_assert_cmpint(res, ==, 0xf800); /* rounded to 53 bits */ +g_assert_cmpint(res, ==, 0x); g_assert(endptr == str + 20); - -/* 0x7e00..0x7fff get rounded to - * 0x8000, thus -ERANGE; see test_qemu_strtosz_erange() */ } static void test_qemu_strtosz_units(void) @@ 
-2145,16 +2130,6 @@ static void test_qemu_strtosz_erange(void) g_assert_cmpint(err, ==, -ERANGE); g_assert(endptr == str + 2); -str = "18446744073709550592"; /* 0xfc00 */ -err = qemu_strtosz(str, &endptr, &res); -g_assert_cmpint(err, ==, -ERANGE); -g_assert(endptr == str + 20); - -str = "18446744073709551615"; /* 2^64-1 */ -err = qemu_strtosz(str, &endptr, &res); -g_assert_cmpint(err, ==, -ERANGE); -g_assert(endptr == str + 20); - str = "18446744073709551616"; /* 2^64 */ err = qemu_strtosz(str, &endptr, &res); g_assert_cmpint(err, ==, -ERANGE); diff --git a/tests/test-keyval.c b/tests/test-keyval.c index 09b0ae3c68..fad941fcb8 100644 --- a/tests/test-keyval.c +++ b/tests/test-keyval.c @@ -383,59 +383,26 @@ static void test_keyval_visit_size(void) visit_end_struct(v, NULL); visit_free(v); -/* Note: precision is 53 bits since we're parsing with strtod() */ +/* Note: precision is 64 bits (UINT64_MAX) */ -/* Around limit of precision: 2^53-1, 2^53, 2^53+1 */ -qdict = keyval_parse("sz1=9007199254740991," - "sz2=9007199254740992," - "sz3=9007199254740993", +/* Around limit of precision: UINT64_MAX - 1, UINT64_MAX */ +qdict = keyval_parse("sz1=18446744073709551614," + "sz2=18446744073709551615", NULL, &error_abort); v = qobject_input_visitor_new_keyval(QOBJECT(qdict)); qobject_unref(qdict); visit_start_struct(v, NULL, NULL, 0, &error_abort); visit_type_size(v, "sz1", &sz, &error_abort); -g_assert_cmphex(sz, ==, 0x1f); +g_assert_cmphex(sz, ==, 0xfffe); visit_type_size(v, "sz2", &sz, &error_abort); -g_assert_cmphex(sz, ==, 0x20); -visit_type_size(v, "sz3", &sz, &error_abort); -g_assert_cmphex(sz, ==, 0x20); -visit_check_struct(v, &error_abort); -visit_end_struct(v, NULL); -visit_free(v); - -/* Close to signed upper limit 0x7c00 (53 msbs set) */ -qdict = keyval_parse("sz1=9223372036854774784," /* 7c00 */ - "sz2=9223
Re: [PATCH v20 0/8] Build ACPI Heterogeneous Memory Attribute Table (HMAT)
On 12/3/2019 2:25 PM, Michael S. Tsirkin wrote: On Tue, Dec 03, 2019 at 07:00:53AM +0100, Markus Armbruster wrote: "Michael S. Tsirkin" writes: On Tue, Dec 03, 2019 at 08:53:30AM +0800, Tao Xu wrote: Hi Michael, Could this patch series be queued? Thank you very much! Tao QEMU is in freeze, so not yet. Please ping after the release. Just to avoid confusion: it's Michael's personal preference not to process patches for the next version during freeze. Other maintainers do, and that's actually the project's policy: Subject: QEMU Summit 2017: minutes Message-ID: https://lists.nongnu.org/archive/html/qemu-devel/2017-11/msg04453.html qemu-next: * Problem 1: Contributors cannot get patches merged during freeze (bad experience) [...] * Markus Armbruster: Problem 1 is solved if maintainers keep their own -next trees * Paolo Bonzini: Maintaining -next could slow down or create work for -freeze (e.g. who does backports) * Action: Maintainers mustn't tell submitters to go away just because we're in a release freeze (it's up to them whether they prefer to maintain a "-next" tree for their subsystem with patches queued for the following release, or track which patches they've accepted some other way) * We're not going to have an official project-wide "-next" tree, though Michael, would queuing up patches in a -next branch really be too much trouble for you? Thanks for pointing this out! I stopped asking for re-post since awhile ago. I don't queue patches in a public tree but I do review and do keep track of pending patches. I tend to ask contributors to also ping because sometimes there's a problem with rebase, I drop the patch but forget to tell the contributor, and it tends to happen more with big patchsets posted during freeze as there's a rush to merge changes right after that. I usually don't bother people with this for small patches though. I'll try to be clearer in my communication so contributors don't feel stressed. 
Would something like: "I'll queue it for merge after the release. If possible please ping me after the release to help make sure it didn't get dropped." be clearer? Hopefully Windows CI efforts will soon bear fruit to the point where they stress PCI enough to make maintaining next worth the effort. I see. Thanks for Markus and Michael's kind responses. I feel happy rather than stressed in the QEMU community :)
Re: [PATCH v20 0/8] Build ACPI Heterogeneous Memory Attribute Table (HMAT)
On 12/3/2019 1:35 PM, Michael S. Tsirkin wrote: On Tue, Dec 03, 2019 at 08:53:30AM +0800, Tao Xu wrote: Hi Michael, Could this patch series be queued? Thank you very much! Tao QEMU is in freeze, so not yet. Please ping after the release. OK, Thank you!
Re: [PATCH v20 0/8] Build ACPI Heterogeneous Memory Attribute Table (HMAT)
Hi Michael, Could this patch series be queued? Thank you very much! Tao On 11/29/2019 3:56 PM, Xu, Tao3 wrote: This series of patches will build Heterogeneous Memory Attribute Table (HMAT) according to the command line. The ACPI HMAT describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use HMAT information as hint for optimization. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. The V19 patches link: https://patchwork.kernel.org/cover/11265525/ Changelog: v20: - Use g_assert_true and g_assert_false to replace g_assert (Thomas and Markus) - Rename assoc as associativity, update the QAPI description (Markus) - Disable cache level 0 in hmat-cache option (Igor) - Keep base and bitmap unchanged when latency or bandwidth out of range - Fix the broken CI case when user input latency or bandwidth less than required. v19: - Add description about the machine property 'hmat' in commit message (Markus) - Update the QAPI comments - Add a check for no memory side cache - Add some fail cases for hmat-cache when level=0 v18: - Defer patches 01/14~06/14 of V17, use qapi type uint64 and only nanosecond for latency (Markus) - Rewrite the lines over 80 characters(Igor) v17: - Add check when user input latency or bandwidth 0, the lb_info_provided should also be 0. Because in ACPI 6.3 5.2.27.4, 0 means the corresponding latency or bandwidth information is not provided. - Fix the infinite loop when node->latency is 0. 
- Use NumaHmatCacheOptions to replace HMAT_Cache_Info (Igor) - Add check for unordered cache level input (Igor) - Add some fail test cases (Igor) v16: - Add and use qemu_strtold_finite to parse size, support full 64bit precision, modify related test cases (Eduardo and Markus) - Simplify struct HMAT_LB_Info and related code, unify latency and bandwidth (Igor) - Add cross check with hmat_lb data (Igor) - Fields in Cache Attributes are promoted to uint32_t before shifting (Igor) - Add case for QMP build HMAT (Igor) v15: - Add a new patch to refactor do_strtosz() (Eduardo) - Make tests without breaking CI (Michael) v14: - Reuse the codes of do_strtosz to build qemu_strtotime_ns (Eduardo) - Squash patch v13 01/12 and 02/12 together (Daniel and Eduardo) - Drop time unit picosecond (Eric) - Use qemu ctz64 and clz64 instead of builtin function v13: - Modify some text description - Drop "initiator_valid" field in struct NodeInfo - Reuse GArray to store the raw bandwidth and bandwidth data - Calculate common base unit using range bitmap - Add a patch to calculate hmat latency and bandwidth entry list - Drop the total_levels option and use readable cache size - Remove the unnecessary header file - Use decimal notation with appropriate suffix for cache size Liu Jingqi (5): numa: Extend CLI to provide memory latency and bandwidth information numa: Extend CLI to provide memory side cache information hmat acpi: Build Memory Proximity Domain Attributes Structure(s) hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s) hmat acpi: Build Memory Side Cache Information Structure(s) Tao Xu (3): numa: Extend CLI to provide initiator information for numa nodes tests/numa: Add case for QMP build HMAT tests/bios-tables-test: add test cases for ACPI HMAT hw/acpi/Kconfig | 7 +- hw/acpi/Makefile.objs | 1 + hw/acpi/hmat.c| 268 +++ hw/acpi/hmat.h| 42 hw/core/machine.c | 64 ++ hw/core/numa.c| 297 ++ hw/i386/acpi-build.c | 5 + include/sysemu/numa.h | 63 ++ qapi/machine.json 
| 180 +++- qemu-options.hx | 95 +++- tests/bios-tables-test-allowed-diff.h | 8 + tests/bios-tables-test.c | 44 tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 tests/numa-test.c | 213 ++ 21 files changed, 1276 insertions(+), 11 deletions(-) create mode 100644 hw/acpi/hmat.c create mode 100644 hw/acpi/hmat.h create mode 100644 tests/data/acpi/pc/APIC.acpihmat create mo
Re: [PATCH] target/i386: Remove monitor from some CPU model
I am so sorry for sending this old version of the patch by mistake. Please ignore this patch. On 12/2/2019 2:28 PM, Xu, Tao3 wrote: Add new version of Snowridge, Denverton, Opteron_G3, EPYC, and Dhyana CPU model to remove MONITOR/MWAIT feature. After QEMU/KVM use "-overcommit cpu-pm=on" to expose MONITOR/MWAIT (commit id 6f131f13e68d648a8e4f083c667ab1acd88ce4cd), the MONITOR/MWAIT feature in these CPU model is unused. Signed-off-by: Tao Xu --- target/i386/cpu.c | 58 +++ 1 file changed, 58 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index a624163ac2..7c5f1e8fe0 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -2770,6 +2770,19 @@ static X86CPUDefinition builtin_x86_defs[] = { MSR_ARCH_CAP_RDCL_NO | MSR_ARCH_CAP_SKIP_L1DFL_VMENTRY, .xlevel = 0x8008, .model_id = "Intel Atom Processor (Denverton)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ "model-id", + "Intel Atom Processor (Denverton, no MONITOR)" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Snowridge", @@ -2850,6 +2863,16 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ }, }, }, +{ +.version = 3, +.props = (PropValue[]) { +/* mpx was already removed by -v2 above */ +{ "monitor", "off" }, +{ "model-id", + "Intel Atom Processor (Snowridge, no MPX, no MONITOR)" }, +{ /* end of list */ }, +}, +}, { /* end of list */ }, }, }, @@ -2961,6 +2984,19 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_EXT3_ABM | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, .xlevel = 0x8008, .model_id = "AMD Opteron 23xx (Gen 3 Class Opteron)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ "model-id", + "AMD Opteron 23xx (Gen 3 Class Opteron, no MONITOR)" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Opteron_G4", @@ -3085,6 +3121,16 @@ static X86CPUDefinition 
builtin_x86_defs[] = { { /* end of list */ } } }, +{ +.version = 3, +.props = (PropValue[]) { +/* ibpb was already enabled by -v2 above */ +{ "monitor", "off" }, +{ "model-id", + "AMD EPYC Processor (with IBPB, no MONITOR)" }, +{ /* end of list */ }, +}, +}, { /* end of list */ } } }, @@ -3137,6 +3183,18 @@ static X86CPUDefinition builtin_x86_defs[] = { .xlevel = 0x801E, .model_id = "Hygon Dhyana Processor", .cache_info = &epyc_cache_info, +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ "model-id", "Hygon Dhyana Processor (no MONITOR)" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, };
[PATCH RESEND 2/4] target/i386: Remove monitor from some CPU models
Add new version of Snowridge, Denverton, Opteron_G3, EPYC, and Dhyana CPU model to remove MONITOR/MWAIT feature. After QEMU/KVM use "-overcommit cpu-pm=on" to expose MONITOR/MWAIT (commit id 6f131f13e68d648a8e4f083c667ab1acd88ce4cd), the MONITOR/MWAIT feature in these CPU model is unused. Signed-off-by: Tao Xu --- target/i386/cpu.c | 38 ++ 1 file changed, 38 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 06a3077f95..b09ac38409 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3621,6 +3621,14 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ }, }, }, +{ +.version = 3, +.props = (PropValue[]) { +/* mpx was already removed by -v2 above */ +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, { /* end of list */ }, }, }, @@ -3732,6 +3740,17 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_EXT3_ABM | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, .xlevel = 0x8008, .model_id = "AMD Opteron 23xx (Gen 3 Class Opteron)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Opteron_G4", @@ -3856,6 +3875,14 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } } }, +{ +.version = 3, +.props = (PropValue[]) { +/* ibpb was already enabled by -v2 above */ +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, { /* end of list */ } } }, @@ -3908,6 +3935,17 @@ static X86CPUDefinition builtin_x86_defs[] = { .xlevel = 0x801E, .model_id = "Hygon Dhyana Processor", .cache_info = &epyc_cache_info, +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, }; -- 2.20.1
[PATCH RESEND 1/4] target/i386: Add Denverton-v2 (no MPX) CPU model
Because MPX is being removed from the linux kernel, remove MPX feature from Denverton. Signed-off-by: Tao Xu --- target/i386/cpu.c | 12 1 file changed, 12 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 69f518a21a..06a3077f95 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3482,6 +3482,18 @@ static X86CPUDefinition builtin_x86_defs[] = { .features[FEAT_VMX_VMFUNC] = MSR_VMX_VMFUNC_EPT_SWITCHING, .xlevel = 0x8008, .model_id = "Intel Atom Processor (Denverton)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ "mpx", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Snowridge", -- 2.20.1
[PATCH RESEND 0/4] Add extra information to versioned CPU models
This series of patches will remove MPX from Denverton, remove monitor from some CPU models. Add additional information for -cpu help to indicate the changes in this version of CPU model. The output is as follows: x86_64-softmmu/qemu-system-x86_64 -cpu help | grep "\[" x86 Broadwell-v2 Intel Core Processor (Broadwell) [no TSX] x86 Broadwell-v3 Intel Core Processor (Broadwell) [IBRS] x86 Broadwell-v4 Intel Core Processor (Broadwell) [no TSX, IBRS] x86 Cascadelake-Server-v2 Intel Xeon Processor (Cascadelake) [ARCH_CAPABILITIES] x86 Cascadelake-Server-v3 Intel Xeon Processor (Cascadelake) [no TSX] x86 Denverton-v2 Intel Atom Processor (Denverton) [no MPX, no MONITOR] x86 Dhyana-v2 Hygon Dhyana Processor [no MONITOR] x86 EPYC-v2 AMD EPYC Processor [IBPB] x86 EPYC-v3 AMD EPYC Processor [IBPB, no MONITOR] x86 Haswell-v2 Intel Core Processor (Haswell) [no TSX] x86 Haswell-v3 Intel Core Processor (Haswell) [IBRS] x86 Haswell-v4 Intel Core Processor (Haswell) [no TSX, IBRS] x86 Icelake-Client-v2 Intel Core Processor (Icelake) [no TSX] x86 Icelake-Server-v2 Intel Xeon Processor (Icelake) [no TSX] x86 IvyBridge-v2 Intel Xeon E3-12xx v2 (Ivy Bridge) [IBRS] x86 Nehalem-v2 Intel Core i7 9xx (Nehalem Class Core i7) [IBRS] x86 Opteron_G3-v2 AMD Opteron 23xx (Gen 3 Class Opteron) [no MONITOR] x86 SandyBridge-v2 Intel Xeon E312xx (Sandy Bridge) [IBRS] x86 Skylake-Client-v2 Intel Core Processor (Skylake) [IBRS] x86 Skylake-Client-v3 Intel Core Processor (Skylake) [no TSX, IBRS] x86 Skylake-Server-v2 Intel Xeon Processor (Skylake) [IBRS] x86 Skylake-Server-v3 Intel Xeon Processor (Skylake) [no TSX, IBRS] x86 Snowridge-v2 Intel Atom Processor (SnowRidge) [no MPX] x86 Snowridge-v3 Intel Atom Processor (SnowRidge) [no MPX, no MONITOR] x86 Westmere-v2 Westmere E56xx/L56xx/X56xx (Nehalem-C) [IBRS] Tao Xu (4): target/i386: Add Denverton-v2 (no MPX) CPU model target/i386: Remove monitor from some CPU models target/i386: Add new property note to versioned CPU models target/i386: Add notes 
for versioned CPU models target/i386/cpu.c | 112 +++--- 1 file changed, 85 insertions(+), 27 deletions(-) -- 2.20.1
[PATCH RESEND 3/4] target/i386: Add new property note to versioned CPU models
Add additional information for -cpu help to indicate the changes in this version of CPU model. Suggested-by: Eduardo Habkost Signed-off-by: Tao Xu --- target/i386/cpu.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index b09ac38409..7b3bd6d4db 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1693,6 +1693,7 @@ typedef struct PropValue { typedef struct X86CPUVersionDefinition { X86CPUVersion version; const char *alias; +const char *note; PropValue *props; } X86CPUVersionDefinition; @@ -1723,6 +1724,7 @@ struct X86CPUModel { X86CPUDefinition *cpudef; /* CPU model version */ X86CPUVersion version; +const char *note; /* * If true, this is an alias CPU model. * This matters only for "-cpu help" and query-cpu-definitions @@ -4788,6 +4790,7 @@ static void x86_cpu_list_entry(gpointer data, gpointer user_data) char *name = x86_cpu_class_get_model_name(cc); char *desc = g_strdup(cc->model_description); char *alias_of = x86_cpu_class_get_alias_of(cc); +char *model_id = x86_cpu_class_get_model_id(cc); if (!desc && alias_of) { if (cc->model && cc->model->version == CPU_VERSION_AUTO) { @@ -4796,14 +4799,18 @@ static void x86_cpu_list_entry(gpointer data, gpointer user_data) desc = g_strdup_printf("(alias of %s)", alias_of); } } +if (!desc && cc->model && cc->model->note) { +desc = g_strdup_printf("%s [%s]", model_id, cc->model->note); +} if (!desc) { -desc = x86_cpu_class_get_model_id(cc); +desc = g_strdup_printf("%s", model_id); } -qemu_printf("x86 %-20s %-48s\n", name, desc); +qemu_printf("x86 %-20s %-58s\n", name, desc); g_free(name); g_free(desc); g_free(alias_of); +g_free(model_id); } /* list available CPU models and flags */ @@ -5280,6 +5287,7 @@ static void x86_register_cpudef_types(X86CPUDefinition *def) X86CPUModel *m = g_new0(X86CPUModel, 1); m->cpudef = def; m->version = vdef->version; +m->note = vdef->note; name = x86_cpu_versioned_model_name(def, vdef->version); 
x86_register_cpu_model_type(name, m); g_free(name); -- 2.20.1
[PATCH RESEND 4/4] target/i386: Add notes for versioned CPU models
Add which features are added or removed in this version. Remove the changed model-id in versioned CPU models. Signed-off-by: Tao Xu --- target/i386/cpu.c | 50 +++ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 7b3bd6d4db..c82fbfd02e 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -2281,10 +2281,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Nehalem-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Core i7 9xx (Nehalem Core i7, IBRS update)" }, { /* end of list */ } } }, @@ -2362,10 +2361,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Westmere-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Westmere E56xx/L56xx/X56xx (IBRS update)" }, { /* end of list */ } } }, @@ -2448,10 +2446,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "SandyBridge-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Xeon E312xx (Sandy Bridge, IBRS update)" }, { /* end of list */ } } }, @@ -2540,10 +2537,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "IvyBridge-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Xeon E3-12xx v2 (Ivy Bridge, IBRS)" }, { /* end of list */ } } }, @@ -2637,17 +2633,18 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Haswell-noTSX", +.note = "no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, { "stepping", "1" }, -{ "model-id", "Intel Core Processor (Haswell, no TSX)", }, { /* end of list */ } }, }, { .version = 3, .alias = "Haswell-IBRS", +.note = "IBRS", .props = (PropValue[]) { /* Restore TSX features removed by -v2 above */ { "hle", "on" }, @@ -2658,21 +2655,18 @@ static X86CPUDefinition builtin_x86_defs[] = { */ { "stepping", "4" }, { "spec-ctrl", "on" }, 
-{ "model-id", - "Intel Core Processor (Haswell, IBRS)" }, { /* end of list */ } } }, { .version = 4, .alias = "Haswell-noTSX-IBRS", +.note = "no TSX, IBRS", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, /* spec-ctrl was already enabled by -v3 above */ { "stepping", "1" }, -{ "model-id", - "Intel Core Processor (Haswell, no TSX, IBRS)" }, { /* end of list */ } } }, @@ -2768,35 +2762,33 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Broadwell-noTSX", +.note = "no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, -{ "model-id", "Inte
[PATCH 4/4] target/i386: Add notes for versioned CPU models
Add which features are added or removed in this version. Remove the changed model-id in versioned CPU models. Signed-off-by: Tao Xu --- target/i386/cpu.c | 50 +++ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 7b3bd6d4db..c82fbfd02e 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -2281,10 +2281,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Nehalem-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Core i7 9xx (Nehalem Core i7, IBRS update)" }, { /* end of list */ } } }, @@ -2362,10 +2361,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Westmere-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Westmere E56xx/L56xx/X56xx (IBRS update)" }, { /* end of list */ } } }, @@ -2448,10 +2446,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "SandyBridge-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Xeon E312xx (Sandy Bridge, IBRS update)" }, { /* end of list */ } } }, @@ -2540,10 +2537,9 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "IvyBridge-IBRS", +.note = "IBRS", .props = (PropValue[]) { { "spec-ctrl", "on" }, -{ "model-id", - "Intel Xeon E3-12xx v2 (Ivy Bridge, IBRS)" }, { /* end of list */ } } }, @@ -2637,17 +2633,18 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Haswell-noTSX", +.note = "no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, { "stepping", "1" }, -{ "model-id", "Intel Core Processor (Haswell, no TSX)", }, { /* end of list */ } }, }, { .version = 3, .alias = "Haswell-IBRS", +.note = "IBRS", .props = (PropValue[]) { /* Restore TSX features removed by -v2 above */ { "hle", "on" }, @@ -2658,21 +2655,18 @@ static X86CPUDefinition builtin_x86_defs[] = { */ { "stepping", "4" }, { "spec-ctrl", "on" }, 
-{ "model-id", - "Intel Core Processor (Haswell, IBRS)" }, { /* end of list */ } } }, { .version = 4, .alias = "Haswell-noTSX-IBRS", +.note = "no TSX, IBRS", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, /* spec-ctrl was already enabled by -v3 above */ { "stepping", "1" }, -{ "model-id", - "Intel Core Processor (Haswell, no TSX, IBRS)" }, { /* end of list */ } } }, @@ -2768,35 +2762,33 @@ static X86CPUDefinition builtin_x86_defs[] = { { .version = 2, .alias = "Broadwell-noTSX", +.note = "no TSX", .props = (PropValue[]) { { "hle", "off" }, { "rtm", "off" }, -{ "model-id", "Inte
[PATCH 2/4] target/i386: Remove monitor from some CPU models
Add new version of Snowridge, Denverton, Opteron_G3, EPYC, and Dhyana CPU model to remove MONITOR/MWAIT feature. After QEMU/KVM use "-overcommit cpu-pm=on" to expose MONITOR/MWAIT (commit id 6f131f13e68d648a8e4f083c667ab1acd88ce4cd), the MONITOR/MWAIT feature in these CPU model is unused. Signed-off-by: Tao Xu --- target/i386/cpu.c | 38 ++ 1 file changed, 38 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 06a3077f95..b09ac38409 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3621,6 +3621,14 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ }, }, }, +{ +.version = 3, +.props = (PropValue[]) { +/* mpx was already removed by -v2 above */ +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, { /* end of list */ }, }, }, @@ -3732,6 +3740,17 @@ static X86CPUDefinition builtin_x86_defs[] = { CPUID_EXT3_ABM | CPUID_EXT3_SVM | CPUID_EXT3_LAHF_LM, .xlevel = 0x8008, .model_id = "AMD Opteron 23xx (Gen 3 Class Opteron)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Opteron_G4", @@ -3856,6 +3875,14 @@ static X86CPUDefinition builtin_x86_defs[] = { { /* end of list */ } } }, +{ +.version = 3, +.props = (PropValue[]) { +/* ibpb was already enabled by -v2 above */ +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, { /* end of list */ } } }, @@ -3908,6 +3935,17 @@ static X86CPUDefinition builtin_x86_defs[] = { .xlevel = 0x801E, .model_id = "Hygon Dhyana Processor", .cache_info = &epyc_cache_info, +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, }; -- 2.20.1
[PATCH 1/4] target/i386: Add Denverton-v2 (no MPX) CPU model
Because MPX is being removed from the linux kernel, remove MPX feature from Denverton. Signed-off-by: Tao Xu --- target/i386/cpu.c | 12 1 file changed, 12 insertions(+) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index 69f518a21a..06a3077f95 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -3482,6 +3482,18 @@ static X86CPUDefinition builtin_x86_defs[] = { .features[FEAT_VMX_VMFUNC] = MSR_VMX_VMFUNC_EPT_SWITCHING, .xlevel = 0x8008, .model_id = "Intel Atom Processor (Denverton)", +.versions = (X86CPUVersionDefinition[]) { +{ .version = 1 }, +{ +.version = 2, +.props = (PropValue[]) { +{ "monitor", "off" }, +{ "mpx", "off" }, +{ /* end of list */ }, +}, +}, +{ /* end of list */ }, +}, }, { .name = "Snowridge", -- 2.20.1
[PATCH 3/4] target/i386: Add new property note to versioned CPU models
Add additional information for -cpu help to indicate the changes in this version of CPU model. Suggested-by: Eduardo Habkost Signed-off-by: Tao Xu --- target/i386/cpu.c | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/target/i386/cpu.c b/target/i386/cpu.c index b09ac38409..7b3bd6d4db 100644 --- a/target/i386/cpu.c +++ b/target/i386/cpu.c @@ -1693,6 +1693,7 @@ typedef struct PropValue { typedef struct X86CPUVersionDefinition { X86CPUVersion version; const char *alias; +const char *note; PropValue *props; } X86CPUVersionDefinition; @@ -1723,6 +1724,7 @@ struct X86CPUModel { X86CPUDefinition *cpudef; /* CPU model version */ X86CPUVersion version; +const char *note; /* * If true, this is an alias CPU model. * This matters only for "-cpu help" and query-cpu-definitions @@ -4788,6 +4790,7 @@ static void x86_cpu_list_entry(gpointer data, gpointer user_data) char *name = x86_cpu_class_get_model_name(cc); char *desc = g_strdup(cc->model_description); char *alias_of = x86_cpu_class_get_alias_of(cc); +char *model_id = x86_cpu_class_get_model_id(cc); if (!desc && alias_of) { if (cc->model && cc->model->version == CPU_VERSION_AUTO) { @@ -4796,14 +4799,18 @@ static void x86_cpu_list_entry(gpointer data, gpointer user_data) desc = g_strdup_printf("(alias of %s)", alias_of); } } +if (!desc && cc->model && cc->model->note) { +desc = g_strdup_printf("%s [%s]", model_id, cc->model->note); +} if (!desc) { -desc = x86_cpu_class_get_model_id(cc); +desc = g_strdup_printf("%s", model_id); } -qemu_printf("x86 %-20s %-48s\n", name, desc); +qemu_printf("x86 %-20s %-58s\n", name, desc); g_free(name); g_free(desc); g_free(alias_of); +g_free(model_id); } /* list available CPU models and flags */ @@ -5280,6 +5287,7 @@ static void x86_register_cpudef_types(X86CPUDefinition *def) X86CPUModel *m = g_new0(X86CPUModel, 1); m->cpudef = def; m->version = vdef->version; +m->note = vdef->note; name = x86_cpu_versioned_model_name(def, vdef->version); 
x86_register_cpu_model_type(name, m); g_free(name); -- 2.20.1
[PATCH v20 7/8] tests/numa: Add case for QMP build HMAT
Check the HMAT configuration use case. Acked-by: Markus Armbruster Suggested-by: Igor Mammedov Signed-off-by: Tao Xu --- Changes in v20: - Use g_assert_true and g_assert_false to replace g_assert (Thomas and Markus) Changes in v19: - Add some fail cases for hmat-cache when level=0 Changes in v18: - Rewrite the lines over 80 characters Changes in v17: - Add some fail test cases (Igor) --- tests/numa-test.c | 213 ++ 1 file changed, 213 insertions(+) diff --git a/tests/numa-test.c b/tests/numa-test.c index 8de8581231..da1c19ef74 100644 --- a/tests/numa-test.c +++ b/tests/numa-test.c @@ -327,6 +327,216 @@ static void pc_dynamic_cpu_cfg(const void *data) qtest_quit(qs); } +static void pc_hmat_build_cfg(const void *data) +{ +QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " + "-smp 2,sockets=2 " + "-m 128M,slots=2,maxmem=1G " + "-object memory-backend-ram,size=64M,id=m0 " + "-object memory-backend-ram,size=64M,id=m1 " + "-numa node,nodeid=0,memdev=m0 " + "-numa node,nodeid=1,memdev=m1,initiator=0 " + "-numa cpu,node-id=0,socket-id=0 " + "-numa cpu,node-id=0,socket-id=1", + data ? 
(char *)data : ""); + +/* Fail: Initiator should be less than the number of nodes */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 2, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + +/* Fail: Target should be less than the number of nodes */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 2," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + +/* Fail: Initiator should contain cpu */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 1, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + +/* Fail: Data-type mismatch */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"write-latency\"," +" 'bandwidth': 524288000 } }"))); +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"read-bandwidth\"," +" 'latency': 5 } }"))); + +/* Fail: Bandwidth should be 1MB (1048576) aligned */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," +" 'bandwidth': 1048575 } }"))); + +/* Configuring HMAT bandwidth and latency details */ +g_assert_false(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\"," +" 'latency': 1 } }")));/* 1 ns */ +g_assert_true(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," 
+" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type':
[PATCH v20 5/8] hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s)
From: Liu Jingqi This structure describes the memory access latency and bandwidth information from various memory access initiator proximity domains. The latency and bandwidth numbers represented in this structure correspond to rated latency and bandwidth for the platform. The software could use this information as hint for optimization. Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v20: - Fix the broken CI case when user input latency or bandwidth less than required Changes in v17: - Remove unnecessary header file (Igor) Changes in v16: - Add more description for lb_length (Igor) - Drop entry_list and calculate entries in this patch (Igor) Changes in v13: - Calculate the entries in a new patch. --- hw/acpi/hmat.c | 104 - 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 9ff79308a4..4635d45dee 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -25,6 +25,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "sysemu/numa.h" #include "hw/acpi/hmat.h" @@ -67,11 +68,89 @@ static void build_hmat_mpda(GArray *table_data, uint16_t flags, build_append_int_noprefix(table_data, 0, 8); } +/* + * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information + * Structure: Table 5-146 + */ +static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, + uint32_t num_initiator, uint32_t num_target, + uint32_t *initiator_list) +{ +int i, index; +HMAT_LB_Data *lb_data; +uint16_t *entry_list; +uint32_t base; +/* Length in bytes for entire structure */ +uint32_t lb_length += 32 /* Table length upto and including Entry Base Unit */ ++ 4 * num_initiator /* Initiator Proximity Domain List */ ++ 4 * num_target /* Target Proximity Domain List */ ++ 2 * num_initiator * num_target; /* Latency or Bandwidth Entries */ + +/* Type */ +build_append_int_noprefix(table_data, 1, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* 
Length */ +build_append_int_noprefix(table_data, lb_length, 4); +/* Flags: Bits [3:0] Memory Hierarchy, Bits[7:4] Reserved */ +assert(!(hmat_lb->hierarchy >> 4)); +build_append_int_noprefix(table_data, hmat_lb->hierarchy, 1); +/* Data Type */ +build_append_int_noprefix(table_data, hmat_lb->data_type, 1); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Number of Initiator Proximity Domains (s) */ +build_append_int_noprefix(table_data, num_initiator, 4); +/* Number of Target Proximity Domains (t) */ +build_append_int_noprefix(table_data, num_target, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 4); + +/* Entry Base Unit */ +if (hmat_lb->data_type <= HMAT_LB_DATA_WRITE_LATENCY) { +/* Convert latency base from nanoseconds to picosecond */ +base = hmat_lb->base * 1000; +} else { +/* Convert bandwidth base from Byte to Megabyte */ +base = hmat_lb->base / MiB; +} +build_append_int_noprefix(table_data, base, 8); + +/* Initiator Proximity Domain List */ +for (i = 0; i < num_initiator; i++) { +build_append_int_noprefix(table_data, initiator_list[i], 4); +} + +/* Target Proximity Domain List */ +for (i = 0; i < num_target; i++) { +build_append_int_noprefix(table_data, i, 4); +} + +/* Latency or Bandwidth Entries */ +entry_list = g_malloc0(num_initiator * num_target * sizeof(uint16_t)); +for (i = 0; i < hmat_lb->list->len; i++) { +lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); +index = lb_data->initiator * num_target + lb_data->target; + +entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); +} + +for (i = 0; i < num_initiator * num_target; i++) { +build_append_int_noprefix(table_data, entry_list[i], 2); +} + +g_free(entry_list); +} + /* Build HMAT sub table structures */ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) { uint16_t flags; -int i; +uint32_t num_initiator = 0; +uint32_t initiator_list[MAX_NODES]; +int i, hierarchy, type; +HMAT_LB_Info *hmat_lb; for (i = 0; i < 
numa_state->num_nodes; i++) { flags = 0; @@ -82,6 +161,29 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); } + +for (i = 0; i < numa_state->num_nodes; i++) { +if (numa_state->nodes[i].has_cpu) { +initiator_list[num_initiator++] = i; +} +} + +/* +
[PATCH v20 0/8] Build ACPI Heterogeneous Memory Attribute Table (HMAT)
This series of patches builds the Heterogeneous Memory Attribute Table (HMAT) according to the command line. The ACPI HMAT describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use HMAT information as a hint for optimization. In the Linux kernel, the code in drivers/acpi/hmat/hmat.c parses and reports the platform's HMAT tables. Link to the v19 patches: https://patchwork.kernel.org/cover/11265525/ Changelog: v20: - Use g_assert_true and g_assert_false to replace g_assert (Thomas and Markus) - Rename assoc as associativity, update the QAPI description (Markus) - Disable cache level 0 in hmat-cache option (Igor) - Keep base and bitmap unchanged when latency or bandwidth is out of range - Fix the broken CI case when the user-supplied latency or bandwidth is less than required. v19: - Add description about the machine property 'hmat' in commit message (Markus) - Update the QAPI comments - Add a check for no memory side cache - Add some fail cases for hmat-cache when level=0 v18: - Defer patches 01/14~06/14 of V17, use qapi type uint64 and only nanosecond for latency (Markus) - Rewrite the lines over 80 characters (Igor) v17: - Add a check: when the user inputs latency or bandwidth 0, lb_info_provided should also be 0, because in ACPI 6.3 5.2.27.4, 0 means the corresponding latency or bandwidth information is not provided. - Fix the infinite loop when node->latency is 0. 
- Use NumaHmatCacheOptions to replace HMAT_Cache_Info (Igor) - Add check for unordered cache level input (Igor) - Add some fail test cases (Igor) v16: - Add and use qemu_strtold_finite to parse size, support full 64bit precision, modify related test cases (Eduardo and Markus) - Simplify struct HMAT_LB_Info and related code, unify latency and bandwidth (Igor) - Add cross check with hmat_lb data (Igor) - Fields in Cache Attributes are promoted to uint32_t before shifting (Igor) - Add case for QMP build HMAT (Igor) v15: - Add a new patch to refactor do_strtosz() (Eduardo) - Make tests without breaking CI (Michael) v14: - Reuse the code of do_strtosz to build qemu_strtotime_ns (Eduardo) - Squash patch v13 01/12 and 02/12 together (Daniel and Eduardo) - Drop time unit picosecond (Eric) - Use qemu ctz64 and clz64 instead of builtin function v13: - Modify some text description - Drop "initiator_valid" field in struct NodeInfo - Reuse GArray to store the raw bandwidth and bandwidth data - Calculate common base unit using range bitmap - Add a patch to calculate hmat latency and bandwidth entry list - Drop the total_levels option and use readable cache size - Remove the unnecessary header file - Use decimal notation with appropriate suffix for cache size Liu Jingqi (5): numa: Extend CLI to provide memory latency and bandwidth information numa: Extend CLI to provide memory side cache information hmat acpi: Build Memory Proximity Domain Attributes Structure(s) hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s) hmat acpi: Build Memory Side Cache Information Structure(s) Tao Xu (3): numa: Extend CLI to provide initiator information for numa nodes tests/numa: Add case for QMP build HMAT tests/bios-tables-test: add test cases for ACPI HMAT hw/acpi/Kconfig | 7 +- hw/acpi/Makefile.objs | 1 + hw/acpi/hmat.c| 268 +++ hw/acpi/hmat.h| 42 hw/core/machine.c | 64 ++ hw/core/numa.c| 297 ++ hw/i386/acpi-build.c | 5 + include/sysemu/numa.h | 63 ++ qapi/machine.json 
| 180 +++- qemu-options.hx | 95 +++- tests/bios-tables-test-allowed-diff.h | 8 + tests/bios-tables-test.c | 44 tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 tests/numa-test.c | 213 ++ 21 files changed, 1276 insertions(+), 11 deletions(-) create mode 100644 hw/acpi/hmat.c create mode 100644 hw/acpi/hmat.h create mode 100644 tests/data/acpi/pc/APIC.acpihmat create mode 100644 tests/data/acpi/pc/DSDT.acpihmat create mode 100644 tests/data/acpi/pc/HMAT.acpihmat create mode 100644 tests/data/acpi/pc/SRAT.acpihmat create mode 100644 tests/data/acpi/q35/APIC.acpihmat c
[PATCH v20 8/8] tests/bios-tables-test: add test cases for ACPI HMAT
The ACPI HMAT table has been introduced; QEMU now builds HMAT tables for heterogeneous memory with the boot option '-numa node'. Add test cases on PC and Q35 machines with 2 NUMA nodes. Because HMAT is generated when the system enables NUMA, the following tables need to be added for this test: tests/data/acpi/pc/APIC.acpihmat tests/data/acpi/pc/SRAT.acpihmat tests/data/acpi/pc/HMAT.acpihmat tests/data/acpi/pc/DSDT.acpihmat tests/data/acpi/q35/APIC.acpihmat tests/data/acpi/q35/SRAT.acpihmat tests/data/acpi/q35/HMAT.acpihmat tests/data/acpi/q35/DSDT.acpihmat Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jingqi Liu Suggested-by: Igor Mammedov Signed-off-by: Tao Xu --- No changes in v20. Changes in v18: - Remove unit "ns". Changes in v17: - Update the latency and bandwidth Changes in v15: - Make tests without breaking CI (Michael) Changes in v13: - Use decimal notation with appropriate suffix for cache size --- tests/bios-tables-test-allowed-diff.h | 8 + tests/bios-tables-test.c | 44 +++ tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 10 files changed, 52 insertions(+) create mode 100644 tests/data/acpi/pc/APIC.acpihmat create mode 100644 tests/data/acpi/pc/DSDT.acpihmat create mode 100644 tests/data/acpi/pc/HMAT.acpihmat create mode 100644 tests/data/acpi/pc/SRAT.acpihmat create mode 100644 tests/data/acpi/q35/APIC.acpihmat create mode 100644 tests/data/acpi/q35/DSDT.acpihmat create mode 100644 tests/data/acpi/q35/HMAT.acpihmat create mode 100644 tests/data/acpi/q35/SRAT.acpihmat diff --git a/tests/bios-tables-test-allowed-diff.h b/tests/bios-tables-test-allowed-diff.h index dfb8523c8b..3c9e0c979b 100644 --- a/tests/bios-tables-test-allowed-diff.h +++ 
b/tests/bios-tables-test-allowed-diff.h @@ -1 +1,9 @@ /* List of comma-separated changed AML files to ignore */ +"tests/data/acpi/pc/APIC.acpihmat", +"tests/data/acpi/pc/SRAT.acpihmat", +"tests/data/acpi/pc/HMAT.acpihmat", +"tests/data/acpi/pc/DSDT.acpihmat", +"tests/data/acpi/q35/APIC.acpihmat", +"tests/data/acpi/q35/SRAT.acpihmat", +"tests/data/acpi/q35/HMAT.acpihmat", +"tests/data/acpi/q35/DSDT.acpihmat", diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c index 79f5da092f..9823820043 100644 --- a/tests/bios-tables-test.c +++ b/tests/bios-tables-test.c @@ -947,6 +947,48 @@ static void test_acpi_virt_tcg_numamem(void) } +static void test_acpi_tcg_acpi_hmat(const char *machine) +{ +test_data data; + +memset(&data, 0, sizeof(data)); +data.machine = machine; +data.variant = ".acpihmat"; +test_acpi_one(" -machine hmat=on" + " -smp 2,sockets=2" + " -m 128M,slots=2,maxmem=1G" + " -object memory-backend-ram,size=64M,id=m0" + " -object memory-backend-ram,size=64M,id=m1" + " -numa node,nodeid=0,memdev=m0" + " -numa node,nodeid=1,memdev=m1,initiator=0" + " -numa cpu,node-id=0,socket-id=0" + " -numa cpu,node-id=0,socket-id=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-latency,latency=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=65534M" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-latency,latency=65534" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=32767M" + " -numa hmat-cache,node-id=0,size=10K,level=1," + "associativity=direct,policy=write-back,line=8" + " -numa hmat-cache,node-id=1,size=10K,level=1," + "associativity=direct,policy=write-back,line=8", + &data); +free_test_data(&data); +} + +static void test_acpi_q35_tcg_acpi_hmat(void) +{ +test_acpi_tcg_acpi_hmat(MACHINE_Q35); +} + +static void test_acpi_piix4_tcg_acpi_hmat(void) +{ +test_acpi_tcg_acpi_hmat(MACHINE_PC); +} + static void 
test_acpi_virt_tcg(void) { test_data data = { @@ -991,6 +1033,8 @@ int main(int argc, char *argv[]) qt
[PATCH v20 2/8] numa: Extend CLI to provide memory latency and bandwidth information
From: Liu Jingqi Add -numa hmat-lb option to provide System Locality Latency and Bandwidth Information. These memory attributes help to build System Locality Latency and Bandwidth Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-lb option, enable HMAT with -machine hmat=on. Acked-by: Markus Armbruster Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v20: - Update the QAPI description (Markus) - Keep base and bitmap unchanged when latency or bandwidth out of range Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) Changes in v18: - Use qapi type uint64 and only nanosecond for latency (Markus) Changes in v17: - Add check when user input latency or bandwidth 0, the lb_info_provided should also be 0. Because in ACPI 6.3 5.2.27.4, 0 means the corresponding latency or bandwidth information is not provided. - Fix the infinite loop when node->latency is 0. --- hw/core/numa.c| 194 ++ include/sysemu/numa.h | 53 qapi/machine.json | 93 +++- qemu-options.hx | 47 +- 4 files changed, 384 insertions(+), 3 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index e60da99293..34eb413f5d 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "sysemu/hostmem.h" #include "sysemu/numa.h" #include "sysemu/sysemu.h" @@ -198,6 +199,186 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp) ms->numa_state->have_numa_distance = true; } +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, +Error **errp) +{ +int i, first_bit, last_bit; +uint64_t max_entry, temp_base, bitmap_copy; +NodeInfo *numa_info = numa_state->nodes; +HMAT_LB_Info *hmat_lb = +numa_state->hmat_lb[node->hierarchy][node->data_type]; +HMAT_LB_Data lb_data = {}; +HMAT_LB_Data *lb_temp; + +/* Error checking */ +if (node->initiator > numa_state->num_nodes) { +error_setg(errp, "Invalid initiator=%d, it 
should be less than %d", + node->initiator, numa_state->num_nodes); +return; +} +if (node->target > numa_state->num_nodes) { +error_setg(errp, "Invalid target=%d, it should be less than %d", + node->target, numa_state->num_nodes); +return; +} +if (!numa_info[node->initiator].has_cpu) { +error_setg(errp, "Invalid initiator=%d, it isn't an " + "initiator proximity domain", node->initiator); +return; +} +if (!numa_info[node->target].present) { +error_setg(errp, "The target=%d should point to an existing node", + node->target); +return; +} + +if (!hmat_lb) { +hmat_lb = g_malloc0(sizeof(*hmat_lb)); +numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb; +hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data)); +} +hmat_lb->hierarchy = node->hierarchy; +hmat_lb->data_type = node->data_type; +lb_data.initiator = node->initiator; +lb_data.target = node->target; + +if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) { +/* Input latency data */ + +if (!node->has_latency) { +error_setg(errp, "Missing 'latency' option"); +return; +} +if (node->has_bandwidth) { +error_setg(errp, "Invalid option 'bandwidth' since " + "the data type is latency"); +return; +} + +/* Detect duplicate configuration */ +for (i = 0; i < hmat_lb->list->len; i++) { +lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); + +if (node->initiator == lb_temp->initiator && +node->target == lb_temp->target) { +error_setg(errp, "Duplicate configuration of the latency for " +"initiator=%d and target=%d", node->initiator, +node->target); +return; +} +} + +hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX; + +if (node->latency) { +/* Calculate the temporary base and compressed latency */ +max_entry = node->latency; +temp_base = 1; +while (QEMU_IS_ALIGNED(max_entry, 10)) { +max_entry /= 10; +temp_base *=
[PATCH v20 4/8] hmat acpi: Build Memory Proximity Domain Attributes Structure(s)
From: Liu Jingqi HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table (HMAT). The specification references below link: http://www.uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf It describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use this information as hint for optimization. This structure describes Memory Proximity Domain Attributes by memory subsystem and its associativity with processor proximity domain as well as hint for memory usage. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jonathan Cameron Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- No changes in v20. Changes in v16: - Use uint32_t for initiator and mem_node Changes in v13: - Remove the unnecessary head file. --- hw/acpi/Kconfig | 7 ++- hw/acpi/Makefile.objs | 1 + hw/acpi/hmat.c| 99 +++ hw/acpi/hmat.h| 42 ++ hw/i386/acpi-build.c | 5 +++ 5 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 hw/acpi/hmat.c create mode 100644 hw/acpi/hmat.h diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig index 12e3f1e86e..54209c6f2f 100644 --- a/hw/acpi/Kconfig +++ b/hw/acpi/Kconfig @@ -7,6 +7,7 @@ config ACPI_X86 select ACPI_NVDIMM select ACPI_CPU_HOTPLUG select ACPI_MEMORY_HOTPLUG +select ACPI_HMAT config ACPI_X86_ICH bool @@ -23,6 +24,10 @@ config ACPI_NVDIMM bool depends on ACPI +config ACPI_HMAT +bool +depends on ACPI + config ACPI_PCI bool depends on ACPI && PCI @@ -33,5 +38,3 @@ config ACPI_VMGENID depends on PC config ACPI_HW_REDUCED -bool -depends on ACPI diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs index 655a9c1973..517bd88704 100644 --- a/hw/acpi/Makefile.objs +++ b/hw/acpi/Makefile.objs @@ -7,6 +7,7 @@ common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o 
common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o common-obj-$(CONFIG_ACPI_HW_REDUCED) += generic_event_device.o +common-obj-$(CONFIG_ACPI_HMAT) += hmat.o common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o common-obj-y += acpi_interface.o diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c new file mode 100644 index 00..9ff79308a4 --- /dev/null +++ b/hw/acpi/hmat.c @@ -0,0 +1,99 @@ +/* + * HMAT ACPI Implementation + * + * Copyright(C) 2019 Intel Corporation. + * + * Author: + * Liu jingqi + * Tao Xu + * + * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table + * (HMAT) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/> + */ + +#include "qemu/osdep.h" +#include "sysemu/numa.h" +#include "hw/acpi/hmat.h" + +/* + * ACPI 6.3: + * 5.2.27.3 Memory Proximity Domain Attributes Structure: Table 5-145 + */ +static void build_hmat_mpda(GArray *table_data, uint16_t flags, +uint32_t initiator, uint32_t mem_node) +{ + +/* Memory Proximity Domain Attributes Structure */ +/* Type */ +build_append_int_noprefix(table_data, 0, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Length */ +build_append_int_noprefix(table_data, 40, 4); +/* Flags */ +build_append_int_noprefix(table_data, flags, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Proximity Domain for the Attached Initiator */ +build_append_int_noprefix(table_data, initiator, 4); +/* Proximity Domain for the Memory */ +build_append_int_noprefix(table_data, mem_node, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 4); +/* + * Reserved: + * Previously defined as the Start Address of the System Physical + * Address Range. Deprecated since ACPI Spec 6.3. + */ +build_append_int_noprefix(table_data, 0, 8); +/* + * Reserved: + * Previously defined as the Range Length of the region in bytes. + * Deprecated since ACPI Spec 6.3. + */ +build_append_int_noprefix(table_data, 0, 8); +} + +/
[PATCH v20 1/8] numa: Extend CLI to provide initiator information for numa nodes
In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT), the initiator represents a processor that accesses memory. In 5.2.27.3 Memory Proximity Domain Attributes Structure, the attached initiator is defined as the location of the memory controller responsible for a memory proximity domain. With attached initiator information, the topology of heterogeneous memory can be described. Add new machine property 'hmat' to enable all HMAT specific options. Extend CLI of "-numa node" option to indicate the initiator numa node-id. In the Linux kernel, the code in drivers/acpi/hmat/hmat.c parses and reports the platform's HMAT tables. Before using initiator option, enable HMAT with -machine hmat=on. Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Jingqi Liu Suggested-by: Dan Williams Signed-off-by: Tao Xu --- No changes in v20. Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) Changes in v15: - Change the QAPI version tag to 5.0 (Eric) --- hw/core/machine.c | 64 +++ hw/core/numa.c| 23 include/sysemu/numa.h | 5 qapi/machine.json | 10 ++- qemu-options.hx | 35 +++ 5 files changed, 131 insertions(+), 6 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 1689ad3bf8..d7d2cfa66d 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -518,6 +518,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp) ms->nvdimms_state->is_enabled = value; } +static bool machine_get_hmat(Object *obj, Error **errp) +{ +MachineState *ms = MACHINE(obj); + +return ms->numa_state->hmat_enabled; +} + +static void machine_set_hmat(Object *obj, bool value, Error **errp) +{ +MachineState *ms = MACHINE(obj); + +ms->numa_state->hmat_enabled = value; +} + static char *machine_get_nvdimm_persistence(Object *obj, Error **errp) { MachineState *ms = MACHINE(obj); @@ -645,6 +659,7 @@ void machine_set_cpu_numa_node(MachineState *machine, const CpuInstanceProperties *props, Error **errp) { MachineClass *mc = 
MACHINE_GET_CLASS(machine); +NodeInfo *numa_info = machine->numa_state->nodes; bool match = false; int i; @@ -714,6 +729,17 @@ void machine_set_cpu_numa_node(MachineState *machine, match = true; slot->props.node_id = props->node_id; slot->props.has_node_id = props->has_node_id; + +if (machine->numa_state->hmat_enabled) { +if ((numa_info[props->node_id].initiator < MAX_NODES) && +(props->node_id != numa_info[props->node_id].initiator)) { +error_setg(errp, "The initiator of CPU NUMA node %" PRId64 +" should be itself", props->node_id); +return; +} +numa_info[props->node_id].has_cpu = true; +numa_info[props->node_id].initiator = props->node_id; +} } if (!match) { @@ -960,6 +986,13 @@ static void machine_initfn(Object *obj) if (mc->numa_mem_supported) { ms->numa_state = g_new0(NumaState, 1); +object_property_add_bool(obj, "hmat", + machine_get_hmat, machine_set_hmat, + &error_abort); +object_property_set_description(obj, "hmat", +"Set on/off to enable/disable " +"ACPI Heterogeneous Memory Attribute " +"Table (HMAT)", NULL); } /* Register notifier when init is done for sysbus sanity checks */ @@ -1048,6 +1081,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu) return g_string_free(s, false); } +static void numa_validate_initiator(NumaState *numa_state) +{ +int i; +NodeInfo *numa_info = numa_state->nodes; + +for (i = 0; i < numa_state->num_nodes; i++) { +if (numa_info[i].initiator == MAX_NODES) { +error_report("The initiator of NUMA node %d is missing, use " + "'-numa node,initiator' option to declare it", i); +exit(1); +} + +if (!numa_info[numa_info[i].initiator].present) { +error_report("NUMA node %" PRIu16 " is missing, use " + "'-numa node' option to declare it first", + numa_info[i].initiator); +exit(1); +} + +if (!numa_info[numa_info[i].initiator].has_cpu) { +error_report("The initiator of NUMA node %d is invalid", i); +exit(1); +}
[PATCH v20 6/8] hmat acpi: Build Memory Side Cache Information Structure(s)
From: Liu Jingqi This structure describes memory side cache information for memory proximity domains if the memory side cache is present and the physical device forms the memory side cache. The software could use this information to effectively place the data in memory to maximize the performance of the system memory that use the memory side cache. Acked-by: Markus Armbruster Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jonathan Cameron Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- No changes in v20. Changes in v16: - Use checks and assert to replace masks (Igor) - Fields in Cache Attributes are promoted to uint32_t before shifting (Igor) - Drop cpu_to_le32() (Igor) Changes in v13: - rename level as cache_level --- hw/acpi/hmat.c | 69 +- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 4635d45dee..7c24bb5371 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -143,14 +143,62 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, g_free(entry_list); } +/* ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: Table 5-147 */ +static void build_hmat_cache(GArray *table_data, uint8_t total_levels, + NumaHmatCacheOptions *hmat_cache) +{ +/* + * Cache Attributes: Bits [3:0] – Total Cache Levels + * for this Memory Proximity Domain + */ +uint32_t cache_attr = total_levels; + +/* Bits [7:4] : Cache Level described in this structure */ +cache_attr |= (uint32_t) hmat_cache->level << 4; + +/* Bits [11:8] - Cache Associativity */ +cache_attr |= (uint32_t) hmat_cache->associativity << 8; + +/* Bits [15:12] - Write Policy */ +cache_attr |= (uint32_t) hmat_cache->policy << 12; + +/* Bits [31:16] - Cache Line size in bytes */ +cache_attr |= (uint32_t) hmat_cache->line << 16; + +/* Type */ +build_append_int_noprefix(table_data, 2, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Length */ +build_append_int_noprefix(table_data, 32, 4); +/* Proximity Domain for the 
Memory */ +build_append_int_noprefix(table_data, hmat_cache->node_id, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 4); +/* Memory Side Cache Size */ +build_append_int_noprefix(table_data, hmat_cache->size, 8); +/* Cache Attributes */ +build_append_int_noprefix(table_data, cache_attr, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* + * Number of SMBIOS handles (n) + * Linux kernel uses Memory Side Cache Information Structure + * without SMBIOS entries for now, so set Number of SMBIOS handles + * as 0. + */ +build_append_int_noprefix(table_data, 0, 2); +} + /* Build HMAT sub table structures */ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) { uint16_t flags; uint32_t num_initiator = 0; uint32_t initiator_list[MAX_NODES]; -int i, hierarchy, type; +int i, hierarchy, type, cache_level, total_levels; HMAT_LB_Info *hmat_lb; +NumaHmatCacheOptions *hmat_cache; for (i = 0; i < numa_state->num_nodes; i++) { flags = 0; @@ -184,6 +232,25 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) } } } + +/* + * ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: + * Table 5-147 + */ +for (i = 0; i < numa_state->num_nodes; i++) { +total_levels = 0; +for (cache_level = 1; cache_level < HMAT_LB_LEVELS; cache_level++) { +if (numa_state->hmat_cache[i][cache_level]) { +total_levels++; +} +} +for (cache_level = 0; cache_level <= total_levels; cache_level++) { +hmat_cache = numa_state->hmat_cache[i][cache_level]; +if (hmat_cache) { +build_hmat_cache(table_data, total_levels, hmat_cache); +} +} +} } void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) -- 2.20.1
[PATCH v20 3/8] numa: Extend CLI to provide memory side cache information
From: Liu Jingqi Add -numa hmat-cache option to provide Memory Side Cache Information. These memory attributes help to build Memory Side Cache Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-cache option, enable HMAT with -machine hmat=on. Acked-by: Markus Armbruster Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v20: - Disable cache level 0 in hmat-cache option (Igor) - Update the QAPI description (Markus) Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) - Update the QAPI comments - Add a check for no memory side cache Changes in v18: - Update the error message (Igor) Changes in v17: - Use NumaHmatCacheOptions to replace HMAT_Cache_Info (Igor) - Add check for unordered cache level input (Igor) Changes in v16: - Add cross check with hmat_lb data (Igor) - Drop total_levels in struct HMAT_Cache_Info (Igor) - Correct the error table number (Igor) --- hw/core/numa.c| 80 ++ include/sysemu/numa.h | 5 +++ qapi/machine.json | 81 +-- qemu-options.hx | 17 +++-- 4 files changed, 179 insertions(+), 4 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index 34eb413f5d..33fda31a4c 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -379,6 +379,73 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, g_array_append_val(hmat_lb->list, lb_data); } +void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, + Error **errp) +{ +int nb_numa_nodes = ms->numa_state->num_nodes; +NodeInfo *numa_info = ms->numa_state->nodes; +NumaHmatCacheOptions *hmat_cache = NULL; + +if (node->node_id >= nb_numa_nodes) { +error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less " + "than %d", node->node_id, nb_numa_nodes); +return; +} + +if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) { +error_setg(errp, "The latency and bandwidth information of " + "node-id=%" PRIu32 " should be provided before memory side " + "cache 
attributes", node->node_id); +return; +} + +if (node->level < 1 || node->level >= HMAT_LB_LEVELS) { +error_setg(errp, "Invalid level=%" PRIu8 ", it should be larger than 0 " + "and less than or equal to %d", node->level, + HMAT_LB_LEVELS - 1); +return; +} + +assert(node->associativity < HMAT_CACHE_ASSOCIATIVITY__MAX); +assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX); +if (ms->numa_state->hmat_cache[node->node_id][node->level]) { +error_setg(errp, "Duplicate configuration of the side cache for " + "node-id=%" PRIu32 " and level=%" PRIu8, + node->node_id, node->level); +return; +} + +if ((node->level > 1) && +ms->numa_state->hmat_cache[node->node_id][node->level - 1] && +(node->size >= +ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) { +error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be less than the size(%" PRIu64 ") of " + "level=%" PRIu8, node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level - 1]->size, + node->level - 1); +return; +} + +if ((node->level < HMAT_LB_LEVELS - 1) && +ms->numa_state->hmat_cache[node->node_id][node->level + 1] && +(node->size <= +ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) { +error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be larger than the size(%" PRIu64 ") of " + "level=%" PRIu8, node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level + 1]->size, + node->level + 1); +return; +} + +hmat_cache = g_malloc0(sizeof(*hmat_cache)); +memcpy(hmat_cache, node, sizeof(*hmat_cache)); +ms->numa_state->hmat_cache[node->node_id][node->level] = hmat_cache; +} + void set_numa_options(MachineState *ms, NumaOpti
Re: [PATCH v19 7/8] tests/numa: Add case for QMP build HMAT
On 11/28/2019 7:53 PM, Thomas Huth wrote: On 28/11/2019 12.49, Markus Armbruster wrote: Tao Xu writes: Check configuring HMAT usecase Reviewed-by: Igor Mammedov Suggested-by: Igor Mammedov Signed-off-by: Tao Xu --- Changes in v19: - Add some fail cases for hmat-cache when level=0 Changes in v18: - Rewrite the lines over 80 characters Changes in v17: - Add some fail test cases (Igor) --- tests/numa-test.c | 213 ++ 1 file changed, 213 insertions(+) diff --git a/tests/numa-test.c b/tests/numa-test.c index 8de8581231..aed7b2f31b 100644 --- a/tests/numa-test.c +++ b/tests/numa-test.c @@ -327,6 +327,216 @@ static void pc_dynamic_cpu_cfg(const void *data) qtest_quit(qs); } +static void pc_hmat_build_cfg(const void *data) +{ +QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " + "-smp 2,sockets=2 " + "-m 128M,slots=2,maxmem=1G " + "-object memory-backend-ram,size=64M,id=m0 " + "-object memory-backend-ram,size=64M,id=m1 " + "-numa node,nodeid=0,memdev=m0 " + "-numa node,nodeid=1,memdev=m1,initiator=0 " + "-numa cpu,node-id=0,socket-id=0 " + "-numa cpu,node-id=0,socket-id=1", + data ? (char *)data : ""); + +/* Fail: Initiator should be less than the number of nodes */ +g_assert(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 2, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); Code smell: side effect within assert(). Harmless here, because compiling tests with NDEBUG is pointless. Still, it sets a bad example. Not your idea, the pattern seems to go back to commit c35665e1ee3 and fb1e58f72ba. ... maybe best to use g_assert_true() which can't be disabled and thus should be used in tests. See: https://developer.gnome.org/glib/unstable/glib-Testing.html#g-assert-true Thomas Thank you for your suggestion. I will use g_assert_true and g_assert_false to replace g_assert
Re: [PATCH v19 2/8] numa: Extend CLI to provide memory latency and bandwidth information
On 11/28/2019 7:50 PM, Markus Armbruster wrote: Tao Xu writes: From: Liu Jingqi Add -numa hmat-lb option to provide System Locality Latency and Bandwidth Information. These memory attributes help to build System Locality Latency and Bandwidth Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-lb option, enable HMAT with -machine hmat=on. Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- [...] diff --git a/qapi/machine.json b/qapi/machine.json index 27d0e37534..cf9851fcd1 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -426,10 +426,12 @@ # # @cpu: property based CPU(s) to node mapping (Since: 2.10) # +# @hmat-lb: memory latency and bandwidth information (Since: 5.0) +# # Since: 2.1 ## { 'enum': 'NumaOptionsType', - 'data': [ 'node', 'dist', 'cpu' ] } + 'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] } ## # @NumaOptions: @@ -444,7 +446,8 @@ 'data': { 'node': 'NumaNodeOptions', 'dist': 'NumaDistOptions', -'cpu': 'NumaCpuOptions' }} +'cpu': 'NumaCpuOptions', +'hmat-lb': 'NumaHmatLBOptions' }} ## # @NumaNodeOptions: @@ -557,6 +560,92 @@ 'base': 'CpuInstanceProperties', 'data' : {} } +## +# @HmatLBMemoryHierarchy: +# +# The memory hierarchy in the System Locality Latency and Bandwidth +# Information Structure of HMAT (Heterogeneous Memory Attribute Table) +# +# For more information about @HmatLBMemoryHierarchy see chapter @HmatLBMemoryHierarchy, see +# 5.2.27.4: Table 5-146: Field "Flags" of ACPI 6.3 spec. 
+# +# @memory: the structure represents the memory performance +# +# @first-level: first level of memory side cache +# +# @second-level: second level of memory side cache +# +# @third-level: third level of memory side cache +# +# Since: 5.0 +## +{ 'enum': 'HmatLBMemoryHierarchy', + 'data': [ 'memory', 'first-level', 'second-level', 'third-level' ] } + +## +# @HmatLBDataType: +# +# Data type in the System Locality Latency and Bandwidth +# Information Structure of HMAT (Heterogeneous Memory Attribute Table) +# +# For more information about @HmatLBDataType see chapter @HmatLBDataType, see +# 5.2.27.4: Table 5-146: Field "Data Type" of ACPI 6.3 spec. +# +# @access-latency: access latency (nanoseconds) +# +# @read-latency: read latency (nanoseconds) +# +# @write-latency: write latency (nanoseconds) +# +# @access-bandwidth: access bandwidth (Bytes per second) +# +# @read-bandwidth: read bandwidth (Bytes per second) +# +# @write-bandwidth: write bandwidth (Bytes per second) +# +# Since: 5.0 +## +{ 'enum': 'HmatLBDataType', + 'data': [ 'access-latency', 'read-latency', 'write-latency', +'access-bandwidth', 'read-bandwidth', 'write-bandwidth' ] } + +## +# @NumaHmatLBOptions: +# +# Set the system locality latency and bandwidth information +# between Initiator and Target proximity Domains. +# +# For more information about @NumaHmatLBOptions see chapter @NumaHmatLBOptions, see +# 5.2.27.4: Table 5-146 of ACPI 6.3 spec. +# +# @initiator: the Initiator Proximity Domain. +# +# @target: the Target Proximity Domain. +# +# @hierarchy: the Memory Hierarchy. Indicates the performance +# of memory or side cache. +# +# @data-type: presents the type of data, access/read/write +# latency or hit latency. +# +# @latency: the value of latency from @initiator to @target +# proximity domain, the latency unit is "ns(nanosecond)". +# +# @bandwidth: the value of bandwidth between @initiator and @target +# proximity domain, the bandwidth unit is +# "Bytes per second". 
+# +# Since: 5.0 +## +{ 'struct': 'NumaHmatLBOptions', +'data': { +'initiator': 'uint16', +'target': 'uint16', +'hierarchy': 'HmatLBMemoryHierarchy', +'data-type': 'HmatLBDataType', +'*latency': 'uint64', +'*bandwidth': 'size' }} + ## # @HostMemPolicy: # diff --git a/qemu-options.hx b/qemu-options.hx index 63f6b33322..23303fc7d7 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -168,16 +168,19 @@ DEF("numa", HAS_ARG, QEMU_OPTION_numa, "-numa node[,mem=size][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" "-numa node[,memdev=id][,cpus=firstcpu[-lastcpu]][,nodeid=node][,initiator=node]\n" "-numa dist,src=source,dst=destination,val=distance\n" -"-nu
Re: [PATCH v19 3/8] numa: Extend CLI to provide memory side cache information
On 11/28/2019 7:50 PM, Markus Armbruster wrote: Tao Xu writes: From: Liu Jingqi Add -numa hmat-cache option to provide Memory Side Cache Information. These memory attributes help to build Memory Side Cache Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-cache option, enable HMAT with -machine hmat=on. Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) - Update the QAPI comments - Add a check for no memory side cache Changes in v18: - Update the error message (Igor) Changes in v17: - Use NumaHmatCacheOptions to replace HMAT_Cache_Info (Igor) - Add check for unordered cache level input (Igor) Changes in v16: - Add cross check with hmat_lb data (Igor) - Drop total_levels in struct HMAT_Cache_Info (Igor) - Correct the error table number (Igor) Changes in v15: - Change the QAPI version tag to 5.0 (Eric) --- hw/core/numa.c| 86 +++ include/sysemu/numa.h | 5 +++ qapi/machine.json | 81 +++- qemu-options.hx | 16 +++- 4 files changed, 184 insertions(+), 4 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index 2183c8df1f..664b44ad68 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -366,6 +366,79 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, g_array_append_val(hmat_lb->list, lb_data); } +void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, + Error **errp) +{ +int nb_numa_nodes = ms->numa_state->num_nodes; +NodeInfo *numa_info = ms->numa_state->nodes; +NumaHmatCacheOptions *hmat_cache = NULL; + +if (node->node_id >= nb_numa_nodes) { +error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less " + "than %d", node->node_id, nb_numa_nodes); +return; +} + +if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) { +error_setg(errp, "The latency and bandwidth information of " + "node-id=%" PRIu32 " should be provided before memory side " + "cache attributes", 
node->node_id); +return; +} + +if (node->level >= HMAT_LB_LEVELS) { +error_setg(errp, "Invalid level=%" PRIu8 ", it should be less than or " + "equal to %d", node->level, HMAT_LB_LEVELS - 1); +return; +} + +if (!node->level && (node->assoc || node->policy || node->line)) { +error_setg(errp, "Assoc and policy options should be 'none', line " + "should be 0. If cache level is 0, which means no memory " + "side cache in node-id=%" PRIu32, node->node_id); Error messages should be a phrase, not a paragraph; see error_setg()'s function comment. I think you want something like "be 0 when cache level is 0". I'm not sure the error message should explain what level 0 means, but I'm happy to defer to the NUMA maintainers there. +return; +} + +assert(node->assoc < HMAT_CACHE_ASSOCIATIVITY__MAX); +assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX); +if (ms->numa_state->hmat_cache[node->node_id][node->level]) { +error_setg(errp, "Duplicate configuration of the side cache for " + "node-id=%" PRIu32 " and level=%" PRIu8, + node->node_id, node->level); +return; +} + +if ((node->level > 1) && +ms->numa_state->hmat_cache[node->node_id][node->level - 1] && +(node->size >= +ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) { +error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be less than the size(%" PRIu64 ") of " + "level=%" PRIu8, node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level - 1]->size, + node->level - 1); +return; +} + +if ((node->level < HMAT_LB_LEVELS - 1) && +ms->numa_state->hmat_cache[node->node_id][node->level + 1] && +(node->size <= +ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) { +error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 +
Re: [PATCH v19 3/8] numa: Extend CLI to provide memory side cache information
On 11/28/2019 9:57 PM, Igor Mammedov wrote: On Thu, 28 Nov 2019 12:50:36 +0100 Markus Armbruster wrote: Tao Xu writes: From: Liu Jingqi Add -numa hmat-cache option to provide Memory Side Cache Information. These memory attributes help to build Memory Side Cache Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-cache option, enable HMAT with -machine hmat=on. Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) - Update the QAPI comments - Add a check for no memory side cache Changes in v18: - Update the error message (Igor) Changes in v17: - Use NumaHmatCacheOptions to replace HMAT_Cache_Info (Igor) - Add check for unordered cache level input (Igor) Changes in v16: - Add cross check with hmat_lb data (Igor) - Drop total_levels in struct HMAT_Cache_Info (Igor) - Correct the error table number (Igor) Changes in v15: - Change the QAPI version tag to 5.0 (Eric) --- hw/core/numa.c| 86 +++ include/sysemu/numa.h | 5 +++ qapi/machine.json | 81 +++- qemu-options.hx | 16 +++- 4 files changed, 184 insertions(+), 4 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index 2183c8df1f..664b44ad68 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -366,6 +366,79 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, g_array_append_val(hmat_lb->list, lb_data); } +void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, + Error **errp) +{ +int nb_numa_nodes = ms->numa_state->num_nodes; +NodeInfo *numa_info = ms->numa_state->nodes; +NumaHmatCacheOptions *hmat_cache = NULL; + +if (node->node_id >= nb_numa_nodes) { +error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less " + "than %d", node->node_id, nb_numa_nodes); +return; +} + +if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) { +error_setg(errp, "The latency and bandwidth information of " + "node-id=%" PRIu32 " should be 
provided before memory side " + "cache attributes", node->node_id); +return; +} + +if (node->level >= HMAT_LB_LEVELS) { +error_setg(errp, "Invalid level=%" PRIu8 ", it should be less than or " + "equal to %d", node->level, HMAT_LB_LEVELS - 1); +return; +} + +if (!node->level && (node->assoc || node->policy || node->line)) { +error_setg(errp, "Assoc and policy options should be 'none', line " + "should be 0. If cache level is 0, which means no memory " + "side cache in node-id=%" PRIu32, node->node_id); Do we have to describe node->level == 0 in side-cache table (spec isn't clear on this usecase)? Can we just tell user that "RAM (level 0) should not be used with 'hmat-cache' option? Yes we can. I will do that. Error messages should be a phrase, not a paragraph; see error_setg()'s function comment. I think you want something like "be 0 when cache level is 0". I'm not sure the error message should explain what level 0 means, but I'm happy to defer to the NUMA maintainers there. +return; +} + +assert(node->assoc < HMAT_CACHE_ASSOCIATIVITY__MAX); +assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX); +if (ms->numa_state->hmat_cache[node->node_id][node->level]) { +error_setg(errp, "Duplicate configuration of the side cache for " + "node-id=%" PRIu32 " and level=%" PRIu8, + node->node_id, node->level); +return; +} + +if ((node->level > 1) && +ms->numa_state->hmat_cache[node->node_id][node->level - 1] && +(node->size >= +ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) { +error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be less than the size(%" PRIu64 ") of " + "level=%" PRIu8, node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level - 1]->size, + node->level - 1); +return; +} + +if ((node->level < HMAT_LB_LEVELS - 1)
[PATCH v19 5/8] hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s)
From: Liu Jingqi This structure describes the memory access latency and bandwidth information from various memory access initiator proximity domains. The latency and bandwidth numbers represented in this structure correspond to rated latency and bandwidth for the platform. The software could use this information as a hint for optimization. Reviewed-by: Igor Mammedov Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- No changes in v19. Changes in v17: - Remove unnecessary header file (Igor) Changes in v16: - Add more description for lb_length (Igor) - Drop entry_list and calculate entries in this patch (Igor) Changes in v13: - Calculate the entries in a new patch. --- hw/acpi/hmat.c | 104 - 1 file changed, 103 insertions(+), 1 deletion(-) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index 9ff79308a4..e5ee8b4317 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -25,6 +25,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "sysemu/numa.h" #include "hw/acpi/hmat.h" @@ -67,11 +68,89 @@ static void build_hmat_mpda(GArray *table_data, uint16_t flags, build_append_int_noprefix(table_data, 0, 8); } +/* + * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information + * Structure: Table 5-146 + */ +static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, + uint32_t num_initiator, uint32_t num_target, + uint32_t *initiator_list) +{ +int i, index; +HMAT_LB_Data *lb_data; +uint16_t *entry_list; +uint32_t base; +/* Length in bytes for entire structure */ +uint32_t lb_length += 32 /* Table length upto and including Entry Base Unit */ ++ 4 * num_initiator /* Initiator Proximity Domain List */ ++ 4 * num_target /* Target Proximity Domain List */ ++ 2 * num_initiator * num_target; /* Latency or Bandwidth Entries */ + +/* Type */ +build_append_int_noprefix(table_data, 1, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Length */ +build_append_int_noprefix(table_data, lb_length, 4); +/* Flags: Bits [3:0] Memory Hierarchy,
Bits[7:4] Reserved */ +assert(!(hmat_lb->hierarchy >> 4)); +build_append_int_noprefix(table_data, hmat_lb->hierarchy, 1); +/* Data Type */ +build_append_int_noprefix(table_data, hmat_lb->data_type, 1); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Number of Initiator Proximity Domains (s) */ +build_append_int_noprefix(table_data, num_initiator, 4); +/* Number of Target Proximity Domains (t) */ +build_append_int_noprefix(table_data, num_target, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 4); + +/* Entry Base Unit */ +if (hmat_lb->data_type <= HMAT_LB_DATA_WRITE_LATENCY) { +/* Convert latency base from nanoseconds to picosecond */ +base = hmat_lb->base * 1000; +} else { +/* Convert bandwidth base from Byte to Megabyte */ +base = hmat_lb->base / MiB; +} +build_append_int_noprefix(table_data, base, 8); + +/* Initiator Proximity Domain List */ +for (i = 0; i < num_initiator; i++) { +build_append_int_noprefix(table_data, initiator_list[i], 4); +} + +/* Target Proximity Domain List */ +for (i = 0; i < num_target; i++) { +build_append_int_noprefix(table_data, i, 4); +} + +/* Latency or Bandwidth Entries */ +entry_list = g_malloc0(hmat_lb->list->len * sizeof(uint16_t)); +for (i = 0; i < hmat_lb->list->len; i++) { +lb_data = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); +index = lb_data->initiator * num_target + lb_data->target; + +entry_list[index] = (uint16_t)(lb_data->data / hmat_lb->base); +} + +for (i = 0; i < num_initiator * num_target; i++) { +build_append_int_noprefix(table_data, entry_list[i], 2); +} + +g_free(entry_list); +} + /* Build HMAT sub table structures */ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) { uint16_t flags; -int i; +uint32_t num_initiator = 0; +uint32_t initiator_list[MAX_NODES]; +int i, hierarchy, type; +HMAT_LB_Info *hmat_lb; for (i = 0; i < numa_state->num_nodes; i++) { flags = 0; @@ -82,6 +161,29 @@ static void hmat_build_table_structs(GArray *table_data, NumaState 
*numa_state) build_hmat_mpda(table_data, flags, numa_state->nodes[i].initiator, i); } + +for (i = 0; i < numa_state->num_nodes; i++) { +if (numa_state->nodes[i].has_cpu) { +initiator_list[num_initiator++] = i; +} +} + +/* + * ACPI 6.3: 5.2.27.4 System Locality Latency and Bandwidth Information + * Structure: Table 5-146 + */
[PATCH v19 4/8] hmat acpi: Build Memory Proximity Domain Attributes Structure(s)
From: Liu Jingqi HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table (HMAT). The specification is available at the link below: http://www.uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf It describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use this information as a hint for optimization. This structure describes Memory Proximity Domain Attributes by memory subsystem and its associativity with processor proximity domain as well as hints for memory usage. In the Linux kernel, the code in drivers/acpi/hmat/hmat.c parses and reports the platform's HMAT tables. Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jonathan Cameron Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- No changes in v19. Changes in v16: - Use uint32_t for initiator and mem_node Changes in v13: - Remove the unnecessary header file. --- hw/acpi/Kconfig | 7 ++- hw/acpi/Makefile.objs | 1 + hw/acpi/hmat.c| 99 +++ hw/acpi/hmat.h| 42 ++ hw/i386/acpi-build.c | 5 +++ 5 files changed, 152 insertions(+), 2 deletions(-) create mode 100644 hw/acpi/hmat.c create mode 100644 hw/acpi/hmat.h diff --git a/hw/acpi/Kconfig b/hw/acpi/Kconfig index 12e3f1e86e..54209c6f2f 100644 --- a/hw/acpi/Kconfig +++ b/hw/acpi/Kconfig @@ -7,6 +7,7 @@ config ACPI_X86 select ACPI_NVDIMM select ACPI_CPU_HOTPLUG select ACPI_MEMORY_HOTPLUG +select ACPI_HMAT config ACPI_X86_ICH bool @@ -23,6 +24,10 @@ config ACPI_NVDIMM bool depends on ACPI +config ACPI_HMAT +bool +depends on ACPI + config ACPI_PCI bool depends on ACPI && PCI @@ -33,5 +38,3 @@ config ACPI_VMGENID depends on PC config ACPI_HW_REDUCED -bool -depends on ACPI diff --git a/hw/acpi/Makefile.objs b/hw/acpi/Makefile.objs index 655a9c1973..517bd88704 100644 --- a/hw/acpi/Makefile.objs +++ b/hw/acpi/Makefile.objs @@ -7,6 +7,7 @@ common-obj-$(CONFIG_ACPI_CPU_HOTPLUG) += cpu.o common-obj-$(CONFIG_ACPI_NVDIMM) += nvdimm.o
common-obj-$(CONFIG_ACPI_VMGENID) += vmgenid.o common-obj-$(CONFIG_ACPI_HW_REDUCED) += generic_event_device.o +common-obj-$(CONFIG_ACPI_HMAT) += hmat.o common-obj-$(call lnot,$(CONFIG_ACPI_X86)) += acpi-stub.o common-obj-y += acpi_interface.o diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c new file mode 100644 index 00..9ff79308a4 --- /dev/null +++ b/hw/acpi/hmat.c @@ -0,0 +1,99 @@ +/* + * HMAT ACPI Implementation + * + * Copyright(C) 2019 Intel Corporation. + * + * Author: + * Liu jingqi + * Tao Xu + * + * HMAT is defined in ACPI 6.3: 5.2.27 Heterogeneous Memory Attribute Table + * (HMAT) + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/> + */ + +#include "qemu/osdep.h" +#include "sysemu/numa.h" +#include "hw/acpi/hmat.h" + +/* + * ACPI 6.3: + * 5.2.27.3 Memory Proximity Domain Attributes Structure: Table 5-145 + */ +static void build_hmat_mpda(GArray *table_data, uint16_t flags, +uint32_t initiator, uint32_t mem_node) +{ + +/* Memory Proximity Domain Attributes Structure */ +/* Type */ +build_append_int_noprefix(table_data, 0, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Length */ +build_append_int_noprefix(table_data, 40, 4); +/* Flags */ +build_append_int_noprefix(table_data, flags, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Proximity Domain for the Attached Initiator */ +build_append_int_noprefix(table_data, initiator, 4); +/* Proximity Domain for the Memory */ +build_append_int_noprefix(table_data, mem_node, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 4); +/* + * Reserved: + * Previously defined as the Start Address of the System Physical + * Address Range. Deprecated since ACPI Spec 6.3. + */ +build_append_int_noprefix(table_data, 0, 8); +/* + * Reserved: + * Previously defined as the Range Length of the region in bytes. + * Deprecated since ACPI Spec 6.3. + */ +build_append_int_noprefix(table_data, 0, 8); +} + +/* Build HMAT sub table structures */ +static
[PATCH v19 8/8] tests/bios-tables-test: add test cases for ACPI HMAT
ACPI table HMAT has been introduced; QEMU now builds HMAT tables for Heterogeneous Memory with boot option '-numa node'. Add test cases on PC and Q35 machines with 2 NUMA nodes. Because HMAT is generated when the system enables NUMA, the following tables need to be added for this test: tests/data/acpi/pc/APIC.acpihmat tests/data/acpi/pc/SRAT.acpihmat tests/data/acpi/pc/HMAT.acpihmat tests/data/acpi/pc/DSDT.acpihmat tests/data/acpi/q35/APIC.acpihmat tests/data/acpi/q35/SRAT.acpihmat tests/data/acpi/q35/HMAT.acpihmat tests/data/acpi/q35/DSDT.acpihmat Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jingqi Liu Suggested-by: Igor Mammedov Signed-off-by: Tao Xu --- No changes in v19. Changes in v18: - Remove unit "ns". Changes in v17: - Update the latency and bandwidth Changes in v15: - Make tests without breaking CI (Michael) Changes in v13: - Use decimal notation with appropriate suffix for cache size --- tests/bios-tables-test-allowed-diff.h | 8 + tests/bios-tables-test.c | 44 +++ tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 10 files changed, 52 insertions(+) create mode 100644 tests/data/acpi/pc/APIC.acpihmat create mode 100644 tests/data/acpi/pc/DSDT.acpihmat create mode 100644 tests/data/acpi/pc/HMAT.acpihmat create mode 100644 tests/data/acpi/pc/SRAT.acpihmat create mode 100644 tests/data/acpi/q35/APIC.acpihmat create mode 100644 tests/data/acpi/q35/DSDT.acpihmat create mode 100644 tests/data/acpi/q35/HMAT.acpihmat create mode 100644 tests/data/acpi/q35/SRAT.acpihmat diff --git a/tests/bios-tables-test-allowed-diff.h b/tests/bios-tables-test-allowed-diff.h index dfb8523c8b..3c9e0c979b 100644 --- a/tests/bios-tables-test-allowed-diff.h +++ b/tests/bios-tables-test-allowed-diff.h @@ -1 +1,9 @@ /* List
of comma-separated changed AML files to ignore */ +"tests/data/acpi/pc/APIC.acpihmat", +"tests/data/acpi/pc/SRAT.acpihmat", +"tests/data/acpi/pc/HMAT.acpihmat", +"tests/data/acpi/pc/DSDT.acpihmat", +"tests/data/acpi/q35/APIC.acpihmat", +"tests/data/acpi/q35/SRAT.acpihmat", +"tests/data/acpi/q35/HMAT.acpihmat", +"tests/data/acpi/q35/DSDT.acpihmat", diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c index 79f5da092f..cb1de58053 100644 --- a/tests/bios-tables-test.c +++ b/tests/bios-tables-test.c @@ -947,6 +947,48 @@ static void test_acpi_virt_tcg_numamem(void) } +static void test_acpi_tcg_acpi_hmat(const char *machine) +{ +test_data data; + +memset(&data, 0, sizeof(data)); +data.machine = machine; +data.variant = ".acpihmat"; +test_acpi_one(" -machine hmat=on" + " -smp 2,sockets=2" + " -m 128M,slots=2,maxmem=1G" + " -object memory-backend-ram,size=64M,id=m0" + " -object memory-backend-ram,size=64M,id=m1" + " -numa node,nodeid=0,memdev=m0" + " -numa node,nodeid=1,memdev=m1,initiator=0" + " -numa cpu,node-id=0,socket-id=0" + " -numa cpu,node-id=0,socket-id=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-latency,latency=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=65534M" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-latency,latency=65534" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=32767M" + " -numa hmat-cache,node-id=0,size=10K,level=1,assoc=direct," + "policy=write-back,line=8" + " -numa hmat-cache,node-id=1,size=10K,level=1,assoc=direct," + "policy=write-back,line=8", + &data); +free_test_data(&data); +} + +static void test_acpi_q35_tcg_acpi_hmat(void) +{ +test_acpi_tcg_acpi_hmat(MACHINE_Q35); +} + +static void test_acpi_piix4_tcg_acpi_hmat(void) +{ +test_acpi_tcg_acpi_hmat(MACHINE_PC); +} + static void test_acpi_virt_tcg(void) { test_data data = { @@ -991,6 +1033,8 @@ int 
main(int argc, char *argv[]) qtest_add_func("acpi/q35/numamem", test_acpi_q35_tcg
[PATCH v19 2/8] numa: Extend CLI to provide memory latency and bandwidth information
From: Liu Jingqi Add -numa hmat-lb option to provide System Locality Latency and Bandwidth Information. These memory attributes help to build System Locality Latency and Bandwidth Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-lb option, enable HMAT with -machine hmat=on. Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) Changes in v18: - Use qapi type uint64 and only nanosecond for latency (Markus) Changes in v17: - Add check when user input latency or bandwidth 0, the lb_info_provided should also be 0. Because in ACPI 6.3 5.2.27.4, 0 means the corresponding latency or bandwidth information is not provided. - Fix the infinite loop when node->latency is 0. Changes in v16: - Initialize HMAT_LB_Data lb_data (Igor) - Remove punctuation from error_setg (Igor) - Correct some description (Igor) - Drop statement about max value (Igor) - Simplify struct HMAT_LB_Info and related code, unify latency and bandwidth (Igor) --- hw/core/numa.c| 181 ++ include/sysemu/numa.h | 53 + qapi/machine.json | 93 +- qemu-options.hx | 48 ++- 4 files changed, 372 insertions(+), 3 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index e60da99293..2183c8df1f 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -23,6 +23,7 @@ */ #include "qemu/osdep.h" +#include "qemu/units.h" #include "sysemu/hostmem.h" #include "sysemu/numa.h" #include "sysemu/sysemu.h" @@ -198,6 +199,173 @@ void parse_numa_distance(MachineState *ms, NumaDistOptions *dist, Error **errp) ms->numa_state->have_numa_distance = true; } +void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, +Error **errp) +{ +int i, first_bit, last_bit; +uint64_t max_entry, temp_base_la; +NodeInfo *numa_info = numa_state->nodes; +HMAT_LB_Info *hmat_lb = +numa_state->hmat_lb[node->hierarchy][node->data_type]; +HMAT_LB_Data lb_data = {}; +HMAT_LB_Data *lb_temp; + +/* Error checking 
*/ +if (node->initiator > numa_state->num_nodes) { +error_setg(errp, "Invalid initiator=%d, it should be less than %d", + node->initiator, numa_state->num_nodes); +return; +} +if (node->target > numa_state->num_nodes) { +error_setg(errp, "Invalid target=%d, it should be less than %d", + node->target, numa_state->num_nodes); +return; +} +if (!numa_info[node->initiator].has_cpu) { +error_setg(errp, "Invalid initiator=%d, it isn't an " + "initiator proximity domain", node->initiator); +return; +} +if (!numa_info[node->target].present) { +error_setg(errp, "The target=%d should point to an existing node", + node->target); +return; +} + +if (!hmat_lb) { +hmat_lb = g_malloc0(sizeof(*hmat_lb)); +numa_state->hmat_lb[node->hierarchy][node->data_type] = hmat_lb; +hmat_lb->list = g_array_new(false, true, sizeof(HMAT_LB_Data)); +} +hmat_lb->hierarchy = node->hierarchy; +hmat_lb->data_type = node->data_type; +lb_data.initiator = node->initiator; +lb_data.target = node->target; + +if (node->data_type <= HMATLB_DATA_TYPE_WRITE_LATENCY) { +/* Input latency data */ + +if (!node->has_latency) { +error_setg(errp, "Missing 'latency' option"); +return; +} +if (node->has_bandwidth) { +error_setg(errp, "Invalid option 'bandwidth' since " + "the data type is latency"); +return; +} + +/* Detect duplicate configuration */ +for (i = 0; i < hmat_lb->list->len; i++) { +lb_temp = &g_array_index(hmat_lb->list, HMAT_LB_Data, i); + +if (node->initiator == lb_temp->initiator && +node->target == lb_temp->target) { +error_setg(errp, "Duplicate configuration of the latency for " +"initiator=%d and target=%d", node->initiator, +node->target); +return; +} +} + +hmat_lb->base = hmat_lb->base ? hmat_lb->base : UINT64_MAX; + +if (node->latency) { +/* Calculate the temporary base and compressed latency */ +max_entry = node->latency; +temp_base_la = 1; +while
[PATCH v19 6/8] hmat acpi: Build Memory Side Cache Information Structure(s)
From: Liu Jingqi This structure describes memory side cache information for memory proximity domains if the memory side cache is present and the physical device forms the memory side cache. The software could use this information to effectively place the data in memory to maximize the performance of the system memory that use the memory side cache. Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jonathan Cameron Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- No changes in v19. Changes in v16: - Use checks and assert to replace masks (Igor) - Fields in Cache Attributes are promoted to uint32_t before shifting (Igor) - Drop cpu_to_le32() (Igor) Changes in v13: - rename level as cache_level --- hw/acpi/hmat.c | 69 +- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/hw/acpi/hmat.c b/hw/acpi/hmat.c index e5ee8b4317..bb6adb0ccf 100644 --- a/hw/acpi/hmat.c +++ b/hw/acpi/hmat.c @@ -143,14 +143,62 @@ static void build_hmat_lb(GArray *table_data, HMAT_LB_Info *hmat_lb, g_free(entry_list); } +/* ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: Table 5-147 */ +static void build_hmat_cache(GArray *table_data, uint8_t total_levels, + NumaHmatCacheOptions *hmat_cache) +{ +/* + * Cache Attributes: Bits [3:0] – Total Cache Levels + * for this Memory Proximity Domain + */ +uint32_t cache_attr = total_levels; + +/* Bits [7:4] : Cache Level described in this structure */ +cache_attr |= (uint32_t) hmat_cache->level << 4; + +/* Bits [11:8] - Cache Associativity */ +cache_attr |= (uint32_t) hmat_cache->assoc << 8; + +/* Bits [15:12] - Write Policy */ +cache_attr |= (uint32_t) hmat_cache->policy << 12; + +/* Bits [31:16] - Cache Line size in bytes */ +cache_attr |= (uint32_t) hmat_cache->line << 16; + +/* Type */ +build_append_int_noprefix(table_data, 2, 2); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* Length */ +build_append_int_noprefix(table_data, 32, 4); +/* Proximity Domain for the Memory */ 
+build_append_int_noprefix(table_data, hmat_cache->node_id, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 4); +/* Memory Side Cache Size */ +build_append_int_noprefix(table_data, hmat_cache->size, 8); +/* Cache Attributes */ +build_append_int_noprefix(table_data, cache_attr, 4); +/* Reserved */ +build_append_int_noprefix(table_data, 0, 2); +/* + * Number of SMBIOS handles (n) + * Linux kernel uses Memory Side Cache Information Structure + * without SMBIOS entries for now, so set Number of SMBIOS handles + * as 0. + */ +build_append_int_noprefix(table_data, 0, 2); +} + /* Build HMAT sub table structures */ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) { uint16_t flags; uint32_t num_initiator = 0; uint32_t initiator_list[MAX_NODES]; -int i, hierarchy, type; +int i, hierarchy, type, cache_level, total_levels; HMAT_LB_Info *hmat_lb; +NumaHmatCacheOptions *hmat_cache; for (i = 0; i < numa_state->num_nodes; i++) { flags = 0; @@ -184,6 +232,25 @@ static void hmat_build_table_structs(GArray *table_data, NumaState *numa_state) } } } + +/* + * ACPI 6.3: 5.2.27.5 Memory Side Cache Information Structure: + * Table 5-147 + */ +for (i = 0; i < numa_state->num_nodes; i++) { +total_levels = 0; +for (cache_level = 1; cache_level < HMAT_LB_LEVELS; cache_level++) { +if (numa_state->hmat_cache[i][cache_level]) { +total_levels++; +} +} +for (cache_level = 0; cache_level <= total_levels; cache_level++) { +hmat_cache = numa_state->hmat_cache[i][cache_level]; +if (hmat_cache) { +build_hmat_cache(table_data, total_levels, hmat_cache); +} +} +} } void build_hmat(GArray *table_data, BIOSLinker *linker, NumaState *numa_state) -- 2.20.1
[PATCH v19 0/8] Build ACPI Heterogeneous Memory Attribute Table (HMAT)
This series of patches builds the Heterogeneous Memory Attribute Table (HMAT) according to the command line. The ACPI HMAT describes the memory attributes, such as memory side cache attributes and bandwidth and latency details, related to the Memory Proximity Domain. The software is expected to use HMAT information as a hint for optimization. In the Linux kernel, the code in drivers/acpi/hmat/hmat.c parses and reports the platform's HMAT tables. The V18 patches link: https://patchwork.kernel.org/cover/11263551/ Changelog: v19: - Add description about the machine property 'hmat' in commit message (Markus) - Update the QAPI comments - Add a check for no memory side cache - Add some fail cases for hmat-cache when level=0 v18: - Defer patches 01/14~06/14 of V17, use qapi type uint64 and only nanosecond for latency (Markus) - Rewrite the lines over 80 characters (Igor) v17: - Add check when user input latency or bandwidth 0, the lb_info_provided should also be 0. Because in ACPI 6.3 5.2.27.4, 0 means the corresponding latency or bandwidth information is not provided. - Fix the infinite loop when node->latency is 0. 
- Use NumaHmatCacheOptions to replace HMAT_Cache_Info (Igor) - Add check for unordered cache level input (Igor) - Add some fail test cases (Igor) v16: - Add and use qemu_strtold_finite to parse size, support full 64bit precision, modify related test cases (Eduardo and Markus) - Simplify struct HMAT_LB_Info and related code, unify latency and bandwidth (Igor) - Add cross check with hmat_lb data (Igor) - Fields in Cache Attributes are promoted to uint32_t before shifting (Igor) - Add case for QMP build HMAT (Igor) v15: - Add a new patch to refactor do_strtosz() (Eduardo) - Make tests without breaking CI (Michael) v14: - Reuse the codes of do_strtosz to build qemu_strtotime_ns (Eduardo) - Squash patch v13 01/12 and 02/12 together (Daniel and Eduardo) - Drop time unit picosecond (Eric) - Use qemu ctz64 and clz64 instead of builtin function v13: - Modify some text description - Drop "initiator_valid" field in struct NodeInfo - Reuse GArray to store the raw bandwidth and bandwidth data - Calculate common base unit using range bitmap - Add a patch to calculate hmat latency and bandwidth entry list - Drop the total_levels option and use readable cache size - Remove the unnecessary header file - Use decimal notation with appropriate suffix for cache size v12: - Fix a bug that a memory-only node without initiator setting doesn't report error. (reported by Danmei Wei) - Fix a bug that if HMAT is enabled and without hmat-lb setting, QEMU will crash. 
(reported by Danmei Wei) Liu Jingqi (5): numa: Extend CLI to provide memory latency and bandwidth information numa: Extend CLI to provide memory side cache information hmat acpi: Build Memory Proximity Domain Attributes Structure(s) hmat acpi: Build System Locality Latency and Bandwidth Information Structure(s) hmat acpi: Build Memory Side Cache Information Structure(s) Tao Xu (3): numa: Extend CLI to provide initiator information for numa nodes tests/numa: Add case for QMP build HMAT tests/bios-tables-test: add test cases for ACPI HMAT hw/acpi/Kconfig | 7 +- hw/acpi/Makefile.objs | 1 + hw/acpi/hmat.c| 268 hw/acpi/hmat.h| 42 hw/core/machine.c | 64 ++ hw/core/numa.c| 290 ++ hw/i386/acpi-build.c | 5 + include/sysemu/numa.h | 63 ++ qapi/machine.json | 180 +++- qemu-options.hx | 95 - tests/bios-tables-test-allowed-diff.h | 8 + tests/bios-tables-test.c | 44 tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 tests/numa-test.c | 213 +++ 21 files changed, 1269 insertions(+), 11 deletions(-) create mode 100644 hw/acpi/hmat.c create mode 100644 hw/acpi/hmat.h create mode 100644 tests/data/acpi/pc/APIC.acpihmat create mode 100644 tests/data/acpi/pc/DSDT.acpihmat create mode 100644 tests/data/acpi/pc/HMAT.acpihmat create mode 100644 tests/data/acpi/pc/SRAT.acpihmat create mode 100644 tests/data/acpi/q35/APIC.acpihmat create mode 100644 tests/data/acpi/q35/DSDT.acpihmat create mode 100644 tests/data/acpi/q35/HMAT.acpihmat create mode 100644 tests/data/acpi/q35/SRAT.acpihmat -- 2.20.1
[PATCH v19 7/8] tests/numa: Add case for QMP build HMAT
Check configuring HMAT usecase Reviewed-by: Igor Mammedov Suggested-by: Igor Mammedov Signed-off-by: Tao Xu --- Changes in v19: - Add some fail cases for hmat-cache when level=0 Changes in v18: - Rewrite the lines over 80 characters Chenges in v17: - Add some fail test cases (Igor) --- tests/numa-test.c | 213 ++ 1 file changed, 213 insertions(+) diff --git a/tests/numa-test.c b/tests/numa-test.c index 8de8581231..aed7b2f31b 100644 --- a/tests/numa-test.c +++ b/tests/numa-test.c @@ -327,6 +327,216 @@ static void pc_dynamic_cpu_cfg(const void *data) qtest_quit(qs); } +static void pc_hmat_build_cfg(const void *data) +{ +QTestState *qs = qtest_initf("%s -nodefaults --preconfig -machine hmat=on " + "-smp 2,sockets=2 " + "-m 128M,slots=2,maxmem=1G " + "-object memory-backend-ram,size=64M,id=m0 " + "-object memory-backend-ram,size=64M,id=m1 " + "-numa node,nodeid=0,memdev=m0 " + "-numa node,nodeid=1,memdev=m1,initiator=0 " + "-numa cpu,node-id=0,socket-id=0 " + "-numa cpu,node-id=0,socket-id=1", + data ? 
(char *)data : ""); + +/* Fail: Initiator should be less than the number of nodes */ +g_assert(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 2, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + +/* Fail: Target should be less than the number of nodes */ +g_assert(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 2," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + +/* Fail: Initiator should contain cpu */ +g_assert(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 1, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\" } }"))); + +/* Fail: Data-type mismatch */ +g_assert(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"write-latency\"," +" 'bandwidth': 524288000 } }"))); +g_assert(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"read-bandwidth\"," +" 'latency': 5 } }"))); + +/* Fail: Bandwidth should be 1MB (1048576) aligned */ +g_assert(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-bandwidth\"," +" 'bandwidth': 1048575 } }"))); + +/* Configuring HMAT bandwidth and latency details */ +g_assert(!qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\"," +" 'latency': 1 } }")));/* 1 ns */ +g_assert(qmp_rsp_is_err(qtest_qmp(qs, "{ 'execute': 'set-numa-node'," +" 'arguments': { 'type': 'hmat-lb', 
'initiator': 0, 'target': 0," +" 'hierarchy': \"memory\", 'data-type': \"access-latency\"," +" 'latency': 5 } }")));/* Fail: Duplicate configuration */ +g_assert(!qmp_rsp_
[PATCH v19 1/8] numa: Extend CLI to provide initiator information for numa nodes
In ACPI 6.3 chapter 5.2.27 Heterogeneous Memory Attribute Table (HMAT), The initiator represents processor which access to memory. And in 5.2.27.3 Memory Proximity Domain Attributes Structure, the attached initiator is defined as where the memory controller responsible for a memory proximity domain. With attached initiator information, the topology of heterogeneous memory can be described. Add new machine property 'hmat' to enable all HMAT specific options. Extend CLI of "-numa node" option to indicate the initiator numa node-id. In the linux kernel, the codes in drivers/acpi/hmat/hmat.c parse and report the platform's HMAT tables. Before using initiator option, enable HMAT with -machine hmat=on. Reviewed-by: Igor Mammedov Reviewed-by: Jingqi Liu Suggested-by: Dan Williams Signed-off-by: Tao Xu --- Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) Changes in v15: - Change the QAPI version tag to 5.0 (Eric) --- hw/core/machine.c | 64 +++ hw/core/numa.c| 23 include/sysemu/numa.h | 5 qapi/machine.json | 10 ++- qemu-options.hx | 35 +++ 5 files changed, 131 insertions(+), 6 deletions(-) diff --git a/hw/core/machine.c b/hw/core/machine.c index 1689ad3bf8..d7d2cfa66d 100644 --- a/hw/core/machine.c +++ b/hw/core/machine.c @@ -518,6 +518,20 @@ static void machine_set_nvdimm(Object *obj, bool value, Error **errp) ms->nvdimms_state->is_enabled = value; } +static bool machine_get_hmat(Object *obj, Error **errp) +{ +MachineState *ms = MACHINE(obj); + +return ms->numa_state->hmat_enabled; +} + +static void machine_set_hmat(Object *obj, bool value, Error **errp) +{ +MachineState *ms = MACHINE(obj); + +ms->numa_state->hmat_enabled = value; +} + static char *machine_get_nvdimm_persistence(Object *obj, Error **errp) { MachineState *ms = MACHINE(obj); @@ -645,6 +659,7 @@ void machine_set_cpu_numa_node(MachineState *machine, const CpuInstanceProperties *props, Error **errp) { MachineClass *mc = MACHINE_GET_CLASS(machine); +NodeInfo 
*numa_info = machine->numa_state->nodes; bool match = false; int i; @@ -714,6 +729,17 @@ void machine_set_cpu_numa_node(MachineState *machine, match = true; slot->props.node_id = props->node_id; slot->props.has_node_id = props->has_node_id; + +if (machine->numa_state->hmat_enabled) { +if ((numa_info[props->node_id].initiator < MAX_NODES) && +(props->node_id != numa_info[props->node_id].initiator)) { +error_setg(errp, "The initiator of CPU NUMA node %" PRId64 +" should be itself", props->node_id); +return; +} +numa_info[props->node_id].has_cpu = true; +numa_info[props->node_id].initiator = props->node_id; +} } if (!match) { @@ -960,6 +986,13 @@ static void machine_initfn(Object *obj) if (mc->numa_mem_supported) { ms->numa_state = g_new0(NumaState, 1); +object_property_add_bool(obj, "hmat", + machine_get_hmat, machine_set_hmat, + &error_abort); +object_property_set_description(obj, "hmat", +"Set on/off to enable/disable " +"ACPI Heterogeneous Memory Attribute " +"Table (HMAT)", NULL); } /* Register notifier when init is done for sysbus sanity checks */ @@ -1048,6 +1081,32 @@ static char *cpu_slot_to_string(const CPUArchId *cpu) return g_string_free(s, false); } +static void numa_validate_initiator(NumaState *numa_state) +{ +int i; +NodeInfo *numa_info = numa_state->nodes; + +for (i = 0; i < numa_state->num_nodes; i++) { +if (numa_info[i].initiator == MAX_NODES) { +error_report("The initiator of NUMA node %d is missing, use " + "'-numa node,initiator' option to declare it", i); +exit(1); +} + +if (!numa_info[numa_info[i].initiator].present) { +error_report("NUMA node %" PRIu16 " is missing, use " + "'-numa node' option to declare it first", + numa_info[i].initiator); +exit(1); +} + +if (!numa_info[numa_info[i].initiator].has_cpu) { +error_report("The initiator of NUMA node %d is invalid", i); +exit(1); +} +} +} + static void machine_nu
[PATCH v19 3/8] numa: Extend CLI to provide memory side cache information
From: Liu Jingqi Add -numa hmat-cache option to provide Memory Side Cache Information. These memory attributes help to build Memory Side Cache Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Before using hmat-cache option, enable HMAT with -machine hmat=on. Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu --- Changes in v19: - Add description about the machine property 'hmat' in commit message (Markus) - Update the QAPI comments - Add a check for no memory side cache Changes in v18: - Update the error message (Igor) Changes in v17: - Use NumaHmatCacheOptions to replace HMAT_Cache_Info (Igor) - Add check for unordered cache level input (Igor) Changes in v16: - Add cross check with hmat_lb data (Igor) - Drop total_levels in struct HMAT_Cache_Info (Igor) - Correct the error table number (Igor) Changes in v15: - Change the QAPI version tag to 5.0 (Eric) --- hw/core/numa.c| 86 +++ include/sysemu/numa.h | 5 +++ qapi/machine.json | 81 +++- qemu-options.hx | 16 +++- 4 files changed, 184 insertions(+), 4 deletions(-) diff --git a/hw/core/numa.c b/hw/core/numa.c index 2183c8df1f..664b44ad68 100644 --- a/hw/core/numa.c +++ b/hw/core/numa.c @@ -366,6 +366,79 @@ void parse_numa_hmat_lb(NumaState *numa_state, NumaHmatLBOptions *node, g_array_append_val(hmat_lb->list, lb_data); } +void parse_numa_hmat_cache(MachineState *ms, NumaHmatCacheOptions *node, + Error **errp) +{ +int nb_numa_nodes = ms->numa_state->num_nodes; +NodeInfo *numa_info = ms->numa_state->nodes; +NumaHmatCacheOptions *hmat_cache = NULL; + +if (node->node_id >= nb_numa_nodes) { +error_setg(errp, "Invalid node-id=%" PRIu32 ", it should be less " + "than %d", node->node_id, nb_numa_nodes); +return; +} + +if (numa_info[node->node_id].lb_info_provided != (BIT(0) | BIT(1))) { +error_setg(errp, "The latency and bandwidth information of " + "node-id=%" PRIu32 " should be provided before memory side " + "cache attributes", node->node_id); +return; +} + +if (node->level >= HMAT_LB_LEVELS) { 
+error_setg(errp, "Invalid level=%" PRIu8 ", it should be less than or " + "equal to %d", node->level, HMAT_LB_LEVELS - 1); +return; +} + +if (!node->level && (node->assoc || node->policy || node->line)) { +error_setg(errp, "Assoc and policy options should be 'none', line " + "should be 0. If cache level is 0, which means no memory " + "side cache in node-id=%" PRIu32, node->node_id); +return; +} + +assert(node->assoc < HMAT_CACHE_ASSOCIATIVITY__MAX); +assert(node->policy < HMAT_CACHE_WRITE_POLICY__MAX); +if (ms->numa_state->hmat_cache[node->node_id][node->level]) { +error_setg(errp, "Duplicate configuration of the side cache for " + "node-id=%" PRIu32 " and level=%" PRIu8, + node->node_id, node->level); +return; +} + +if ((node->level > 1) && +ms->numa_state->hmat_cache[node->node_id][node->level - 1] && +(node->size >= +ms->numa_state->hmat_cache[node->node_id][node->level - 1]->size)) { +error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be less than the size(%" PRIu64 ") of " + "level=%" PRIu8, node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level - 1]->size, + node->level - 1); +return; +} + +if ((node->level < HMAT_LB_LEVELS - 1) && +ms->numa_state->hmat_cache[node->node_id][node->level + 1] && +(node->size <= +ms->numa_state->hmat_cache[node->node_id][node->level + 1]->size)) { +error_setg(errp, "Invalid size=%" PRIu64 ", the size of level=%" PRIu8 + " should be larger than the size(%" PRIu64 ") of " + "level=%" PRIu8, node->size, node->level, + ms->numa_state->hmat_cache[node->node_id] + [node->level + 1]->size, + node->level + 1); +return; +} + +hmat_
Re: [PATCH v18 3/8] numa: Extend CLI to provide memory side cache information
On 11/28/2019 10:46 AM, Tao Xu wrote: On 11/27/2019 5:56 PM, Markus Armbruster wrote: Tao Xu writes: From: Liu Jingqi Add -numa hmat-cache option to provide Memory Side Cache Information. These memory attributes help to build Memory Side Cache Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Please mention this requires -machine hmat=on. OK I will add these for 3 related patches. Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu [...] diff --git a/qapi/machine.json b/qapi/machine.json index c741649d7b..3d0ba226a9 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -428,10 +428,12 @@ # # @hmat-lb: memory latency and bandwidth information (Since: 5.0) # +# @hmat-cache: memory side cache information (Since: 5.0) +# # Since: 2.1 ## { 'enum': 'NumaOptionsType', - 'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] } + 'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] } ## # @NumaOptions: @@ -447,7 +449,8 @@ 'node': 'NumaNodeOptions', 'dist': 'NumaDistOptions', 'cpu': 'NumaCpuOptions', - 'hmat-lb': 'NumaHmatLBOptions' }} + 'hmat-lb': 'NumaHmatLBOptions', + 'hmat-cache': 'NumaHmatCacheOptions' }} ## # @NumaNodeOptions: @@ -647,6 +650,77 @@ '*latency': 'uint64', '*bandwidth': 'size' }} +## +# @HmatCacheAssociativity: +# +# Cache associativity in the Memory Side Cache +# Information Structure of HMAT +# +# For more information of @HmatCacheAssociativity see +# the chapter 5.2.27.5: Table 5-147 of ACPI 6.3 spec. # Cache associativity in the Memory Side Cache Information Structure # of HMAT # # For more information of @HmatCacheAssociativity, see chapter # 5.2.27.5: Table 5-147 of ACPI 6.3 spec. +# +# @none: None What does cache associativity @none mean? A none-associative cache? I guess it makes sense to people familiar with the ACPI spec... This means this proximity domain has no memory cache, thus none for Cache associativity, I will add more description about this. 
Having read the ACPI spec again, I found no description of 'none'. In the Linux kernel HMAT code, this value is handled as "other", which may mean it is not provided. I will also add a check so that when level is none, the associativity, policy and line_size must be none or 0.
Re: [PATCH v18 3/8] numa: Extend CLI to provide memory side cache information
On 11/27/2019 5:56 PM, Markus Armbruster wrote: Tao Xu writes: From: Liu Jingqi Add -numa hmat-cache option to provide Memory Side Cache Information. These memory attributes help to build Memory Side Cache Information Structure(s) in ACPI Heterogeneous Memory Attribute Table (HMAT). Please mention this requires -machine hmat=on. OK I will add these for 3 related patches. Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Signed-off-by: Liu Jingqi Signed-off-by: Tao Xu [...] diff --git a/qapi/machine.json b/qapi/machine.json index c741649d7b..3d0ba226a9 100644 --- a/qapi/machine.json +++ b/qapi/machine.json @@ -428,10 +428,12 @@ # # @hmat-lb: memory latency and bandwidth information (Since: 5.0) # +# @hmat-cache: memory side cache information (Since: 5.0) +# # Since: 2.1 ## { 'enum': 'NumaOptionsType', - 'data': [ 'node', 'dist', 'cpu', 'hmat-lb' ] } + 'data': [ 'node', 'dist', 'cpu', 'hmat-lb', 'hmat-cache' ] } ## # @NumaOptions: @@ -447,7 +449,8 @@ 'node': 'NumaNodeOptions', 'dist': 'NumaDistOptions', 'cpu': 'NumaCpuOptions', -'hmat-lb': 'NumaHmatLBOptions' }} +'hmat-lb': 'NumaHmatLBOptions', +'hmat-cache': 'NumaHmatCacheOptions' }} ## # @NumaNodeOptions: @@ -647,6 +650,77 @@ '*latency': 'uint64', '*bandwidth': 'size' }} +## +# @HmatCacheAssociativity: +# +# Cache associativity in the Memory Side Cache +# Information Structure of HMAT +# +# For more information of @HmatCacheAssociativity see +# the chapter 5.2.27.5: Table 5-147 of ACPI 6.3 spec. # Cache associativity in the Memory Side Cache Information Structure # of HMAT # # For more information of @HmatCacheAssociativity, see chapter # 5.2.27.5: Table 5-147 of ACPI 6.3 spec. +# +# @none: None What does cache associativity @none mean? A none-associative cache? I guess it makes sense to people familiar with the ACPI spec... This means this proximity domain has no memory cache, thus none for Cache associativity, I will add more description about this. 
+# +# @direct: Direct Mapped +# +# @complex: Complex Cache Indexing (implementation specific) +# +# Since: 5.0 +## +{ 'enum': 'HmatCacheAssociativity', + 'data': [ 'none', 'direct', 'complex' ] } + +## +# @HmatCacheWritePolicy: +# +# Cache write policy in the Memory Side Cache +# Information Structure of HMAT +# +# For more information of @HmatCacheWritePolicy see +# the chapter 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec. Break lines around column 70, please. +# +# @none: None What does cache write policy @none mean? This means this proximity domain has no memory cache, thus none for cache write policy. +# +# @write-back: Write Back (WB) +# +# @write-through: Write Through (WT) +# +# Since: 5.0 +## +{ 'enum': 'HmatCacheWritePolicy', + 'data': [ 'none', 'write-back', 'write-through' ] } + +## +# @NumaHmatCacheOptions: +# +# Set the memory side cache information for a given memory domain. +# +# For more information of @NumaHmatCacheOptions see +# the chapter 5.2.27.5: Table 5-147: Field "Cache Attributes" of ACPI 6.3 spec. +# +# @node-id: the memory proximity domain to which the memory belongs. +# +# @size: the size of memory side cache in bytes. +# +# @level: the cache level described in this structure. +# +# @assoc: the cache associativity, none/direct-mapped/complex(complex cache indexing). +# +# @policy: the write policy, none/write-back/write-through. +# +# @line: the cache Line size in bytes. +# +# Since: 5.0 +## +{ 'struct': 'NumaHmatCacheOptions', + 'data': { + 'node-id': 'uint32', Ignorant question: you use 'uint16' for other "proximity domains". Is 'uint32' intentional here? Yes, because ACPI 6.3 spec define the domain as 4 byte(32 bit), and for HmatCacheOptions we directly use this QAPI struct when building HMAT. But for other "proximity domains" we use local variable (such as "uint32_t initiator")
[PATCH v18 8/8] tests/bios-tables-test: add test cases for ACPI HMAT
ACPI table HMAT has been introduced, QEMU now builds HMAT tables for Heterogeneous Memory with boot option '-numa node'. Add test cases on PC and Q35 machines with 2 numa nodes. Because HMAT is generated when system enable numa, the following tables need to be added for this test: tests/data/acpi/pc/APIC.acpihmat tests/data/acpi/pc/SRAT.acpihmat tests/data/acpi/pc/HMAT.acpihmat tests/data/acpi/pc/DSDT.acpihmat tests/data/acpi/q35/APIC.acpihmat tests/data/acpi/q35/SRAT.acpihmat tests/data/acpi/q35/HMAT.acpihmat tests/data/acpi/q35/DSDT.acpihmat Reviewed-by: Igor Mammedov Reviewed-by: Daniel Black Reviewed-by: Jingqi Liu Suggested-by: Igor Mammedov Signed-off-by: Tao Xu --- Changes in v18: - Remove unit "ns". Changes in v17: - Update the latency and bandwidth Changes in v15: - Make tests without breaking CI (Michael) Changes in v13: - Use decimal notation with appropriate suffix for cache size --- tests/bios-tables-test-allowed-diff.h | 8 + tests/bios-tables-test.c | 44 +++ tests/data/acpi/pc/APIC.acpihmat | 0 tests/data/acpi/pc/DSDT.acpihmat | 0 tests/data/acpi/pc/HMAT.acpihmat | 0 tests/data/acpi/pc/SRAT.acpihmat | 0 tests/data/acpi/q35/APIC.acpihmat | 0 tests/data/acpi/q35/DSDT.acpihmat | 0 tests/data/acpi/q35/HMAT.acpihmat | 0 tests/data/acpi/q35/SRAT.acpihmat | 0 10 files changed, 52 insertions(+) create mode 100644 tests/data/acpi/pc/APIC.acpihmat create mode 100644 tests/data/acpi/pc/DSDT.acpihmat create mode 100644 tests/data/acpi/pc/HMAT.acpihmat create mode 100644 tests/data/acpi/pc/SRAT.acpihmat create mode 100644 tests/data/acpi/q35/APIC.acpihmat create mode 100644 tests/data/acpi/q35/DSDT.acpihmat create mode 100644 tests/data/acpi/q35/HMAT.acpihmat create mode 100644 tests/data/acpi/q35/SRAT.acpihmat diff --git a/tests/bios-tables-test-allowed-diff.h b/tests/bios-tables-test-allowed-diff.h index dfb8523c8b..3c9e0c979b 100644 --- a/tests/bios-tables-test-allowed-diff.h +++ b/tests/bios-tables-test-allowed-diff.h @@ -1 +1,9 @@ /* List of comma-separated 
changed AML files to ignore */ +"tests/data/acpi/pc/APIC.acpihmat", +"tests/data/acpi/pc/SRAT.acpihmat", +"tests/data/acpi/pc/HMAT.acpihmat", +"tests/data/acpi/pc/DSDT.acpihmat", +"tests/data/acpi/q35/APIC.acpihmat", +"tests/data/acpi/q35/SRAT.acpihmat", +"tests/data/acpi/q35/HMAT.acpihmat", +"tests/data/acpi/q35/DSDT.acpihmat", diff --git a/tests/bios-tables-test.c b/tests/bios-tables-test.c index 79f5da092f..cb1de58053 100644 --- a/tests/bios-tables-test.c +++ b/tests/bios-tables-test.c @@ -947,6 +947,48 @@ static void test_acpi_virt_tcg_numamem(void) } +static void test_acpi_tcg_acpi_hmat(const char *machine) +{ +test_data data; + +memset(&data, 0, sizeof(data)); +data.machine = machine; +data.variant = ".acpihmat"; +test_acpi_one(" -machine hmat=on" + " -smp 2,sockets=2" + " -m 128M,slots=2,maxmem=1G" + " -object memory-backend-ram,size=64M,id=m0" + " -object memory-backend-ram,size=64M,id=m1" + " -numa node,nodeid=0,memdev=m0" + " -numa node,nodeid=1,memdev=m1,initiator=0" + " -numa cpu,node-id=0,socket-id=0" + " -numa cpu,node-id=0,socket-id=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-latency,latency=1" + " -numa hmat-lb,initiator=0,target=0,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=65534M" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-latency,latency=65534" + " -numa hmat-lb,initiator=0,target=1,hierarchy=memory," + "data-type=access-bandwidth,bandwidth=32767M" + " -numa hmat-cache,node-id=0,size=10K,level=1,assoc=direct," + "policy=write-back,line=8" + " -numa hmat-cache,node-id=1,size=10K,level=1,assoc=direct," + "policy=write-back,line=8", + &data); +free_test_data(&data); +} + +static void test_acpi_q35_tcg_acpi_hmat(void) +{ +test_acpi_tcg_acpi_hmat(MACHINE_Q35); +} + +static void test_acpi_piix4_tcg_acpi_hmat(void) +{ +test_acpi_tcg_acpi_hmat(MACHINE_PC); +} + static void test_acpi_virt_tcg(void) { test_data data = { @@ -991,6 +1033,8 @@ int main(int argc, char 
*argv[]) qtest_add_func("acpi/q35/numamem", test_acpi_q35_tcg_numamem);