Re: Xen network domain performance for 10Gb NIC

2020-04-27 Thread tosher 1
> Driver domains with passthrough devices need to perform IOMMU
operations in order to add/remove page table entries when doing grant
maps (ie: IOMMU TLB flushes), while dom0 doesn't need to because it
has the whole RAM identity mapped in the IOMMU tables. Depending on
how fast your IOMMU is and what capabilities it has doing such
operations can introduce a significant amount of overhead.

It makes sense to me. Do you know, in general, how to measure IOMMU 
performance, and what features/properties of IOMMU can contribute to getting a 
better network throughput? Please let me know. 

Thanks!



[PATCH v5 21/21] tools: Clean up vchan-socket-proxy socket

2020-04-27 Thread Jason Andryuk
To avoid socket files lingering in /run/xen, have vchan-socket-proxy
clean up the sockets it creates.  Use a signal handler as well as atexit
to handle both means of termination.

Signed-off-by: Jason Andryuk 
---
 tools/libvchan/vchan-socket-proxy.c | 22 ++
 1 file changed, 22 insertions(+)

diff --git a/tools/libvchan/vchan-socket-proxy.c 
b/tools/libvchan/vchan-socket-proxy.c
index 13700c5d67..0fb42964b5 100644
--- a/tools/libvchan/vchan-socket-proxy.c
+++ b/tools/libvchan/vchan-socket-proxy.c
@@ -33,6 +33,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -88,6 +89,22 @@ char outbuf[BUFSIZE];
 int insiz = 0;
 int outsiz = 0;
 int verbose = 0;
+char *cleanup_socket;
+
+static void cleanup(void)
+{
+if (cleanup_socket) {
+unlink(cleanup_socket);
+free(cleanup_socket);
+cleanup_socket = NULL;
+}
+}
+
+static void cleanup_exit(int signum)
+{
+cleanup();
+exit(0);
+}
 
 static void vchan_wr(struct libxenvchan *ctrl) {
 int ret;
@@ -394,6 +411,9 @@ int main(int argc, char **argv)
 vchan_path = argv[optind+1];
 socket_path = argv[optind+2];
 
+signal(SIGHUP, cleanup_exit);
+signal(SIGTERM, cleanup_exit);
+
 if (is_server) {
 ctrl = libxenvchan_server_init(NULL, domid, vchan_path, 0, 0);
 if (!ctrl) {
@@ -410,6 +430,8 @@ int main(int argc, char **argv)
 perror("listen socket");
 return 1;
 }
+cleanup_socket = strdup(socket_path);
+atexit(cleanup);
 }
 }
 
-- 
2.20.1




[PATCH v5 17/21] docs: Add device-model-domid to xenstore-paths

2020-04-27 Thread Jason Andryuk
Document device-model-domid for when using a device model stubdomain.

Signed-off-by: Jason Andryuk 
---
 docs/misc/xenstore-paths.pandoc | 5 +
 1 file changed, 5 insertions(+)

diff --git a/docs/misc/xenstore-paths.pandoc b/docs/misc/xenstore-paths.pandoc
index a152f5ea68..766e8008dc 100644
--- a/docs/misc/xenstore-paths.pandoc
+++ b/docs/misc/xenstore-paths.pandoc
@@ -148,6 +148,11 @@ The domain's own ID.
 The process ID of the device model associated with this domain, if it
 has one.
 
+ ~/image/device-model-domid = INTEGER   [INTERNAL]
+
+The domain ID of the device model stubdomain associated with this domain,
+if it has one.
+
  ~/cpu/[0-9]+/availability = ("online"|"offline") [PV]
 
 One node for each virtual CPU up to the guest's configured
-- 
2.20.1




[PATCH v5 14/21] libxl: require qemu in dom0 even if stubdomain is in use

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Until xenconsoled learns how to handle multiple consoles, this is needed
for save/restore support (qemu state is transferred over secondary
consoles).
Additionally, Linux-based stubdomain waits for all the backends to
initialize during boot. Lack of some console backends results in
stubdomain startup timeout.

This is a temporary patch until xenconsoled will be improved.

Signed-off-by: Marek Marczykowski-Górecki 
Signed-off-by: Jason Andryuk 
---
 tools/libxl/libxl_dm.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index e420c3fc7b..5e5e7a27b3 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -2484,7 +2484,11 @@ static void spawn_stub_launch_dm(libxl__egc *egc,
 }
 }
 
-need_qemu = libxl__need_xenpv_qemu(gc, dm_config);
+/*
+ * Until xenconsoled learns how to handle multiple consoles, require qemu
+ * in dom0 to serve consoles for a stubdomain - it require at least 3 of 
them.
+ */
+need_qemu = 1 || libxl__need_xenpv_qemu(gc, >dm_config);
 
 for (i = 0; i < num_console; i++) {
 libxl__device device;
@@ -2616,7 +2620,11 @@ static void qmp_proxy_spawn_outcome(libxl__egc *egc,
 int rc)
 {
 STATE_AO_GC(sdss->qmp_proxy_spawn.ao);
-int need_pvqemu = libxl__need_xenpv_qemu(gc, >dm_config);
+/*
+ * Until xenconsoled learns how to handle multiple consoles, require qemu
+ * in dom0 to serve consoles for a stubdomain - it require at least 3 of 
them.
+ */
+int need_pvqemu = 1 || libxl__need_xenpv_qemu(gc, >dm_config);
 
 if (rc) goto out;
 
-- 
2.20.1




[PATCH v5 20/21] libxl: Kill vchan-socket-proxy when cleaning up qmp

2020-04-27 Thread Jason Andryuk
We need to kill the vchan-socket-proxy so we don't leak the daemonized
processes.  libxl__stubdomain_is_linux_running works against the
guest_domid, but the xenstore path is beneath the stubdomain.  This
leads to the use of libxl_is_stubdom in addition to
libxl__stubdomain_is_linux_running so that the stubdomain calls kill for
the qmp-proxy

Signed-off-by: Jason Andryuk 
---
libxl__qmp_cleanup was considered, but it is not called for guests with
a stubdomain.
---
 tools/libxl/libxl_domain.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/tools/libxl/libxl_domain.c b/tools/libxl/libxl_domain.c
index fef2cd4e13..3b66e25aa7 100644
--- a/tools/libxl/libxl_domain.c
+++ b/tools/libxl/libxl_domain.c
@@ -1260,10 +1260,17 @@ static void dm_destroy_cb(libxl__egc *egc,
 libxl__destroy_domid_state *dis = CONTAINER_OF(ddms, *dis, ddms);
 STATE_AO_GC(dis->ao);
 uint32_t domid = dis->domid;
+uint32_t target_domid;
 
 if (rc < 0)
 LOGD(ERROR, domid, "libxl__destroy_device_model failed");
 
+if (libxl_is_stubdom(CTX, domid, _domid) &&
+libxl__stubdomain_is_linux_running(gc, target_domid)) {
+char *path = GCSPRINTF("/local/domain/%d/image/qmp-proxy-pid", domid);
+libxl__kill_xs_path(gc, path, "QMP Proxy");
+}
+
 dis->drs.ao = ao;
 dis->drs.domid = domid;
 dis->drs.callback = devices_destroy_cb;
-- 
2.20.1




[PATCH v5 15/21] libxl: ignore emulated IDE disks beyond the first 4

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Qemu supports only 4 emulated IDE disks, when given more (or with higher
indexes), it will fail to start. Since the disks can still be accessible
using PV interface, just ignore emulated path and log a warning, instead
of rejecting the configuration altogether.

Signed-off-by: Marek Marczykowski-Górecki 
Reviewed-by: Jason Andryuk 
Signed-off-by: Jason Andryuk 
---
 tools/libxl/libxl_dm.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index 5e5e7a27b3..03d7a38f1f 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -1891,6 +1891,13 @@ static int libxl__build_device_model_args_new(libxl__gc 
*gc,
 }
 
 if (disks[i].is_cdrom) {
+if (disk > 4) {
+LOGD(WARN, guest_domid, "Emulated CDROM can be only one of 
the first 4 disks.\n"
+ "Disk %s will be available via PV drivers but not as 
an "
+ "emulated disk.",
+ disks[i].vdev);
+continue;
+}
 drive = libxl__sprintf(gc,
  "if=ide,index=%d,readonly=on,media=cdrom,id=ide-%i",
  disk, dev_number);
@@ -1968,6 +1975,10 @@ static int libxl__build_device_model_args_new(libxl__gc 
*gc,
[i],
colo_mode);
 } else {
+LOGD(WARN, guest_domid, "Only 4 emulated IDE disks are 
supported.\n"
+ "Disk %s will be available via PV drivers but not as 
an "
+ "emulated disk.",
+ disks[i].vdev);
 continue; /* Do not emulate this disk */
 }
 
-- 
2.20.1




[PATCH v5 12/21] libxl: use vchan for QMP access with Linux stubdomain

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Access to QMP of QEMU in Linux stubdomain is possible over vchan
connection. Handle the actual vchan connection in a separate process
(vchan-socket-proxy). This simplified integration with QMP (already
quite complex), but also allows preliminary filtering of (potentially
malicious) QMP input.
Since only one client can be connected to vchan server at the same time
and it is not enforced by the libxenvchan itself, additional client-side
locking is needed. It is implicitly implemented by vchan-socket-proxy,
as it handle only one connection at a time. Note that qemu supports only
one simultaneous client on a control socket anyway (but in UNIX socket
case, it enforce it server-side), so it doesn't add any extra
limitation.

Signed-off-by: Marek Marczykowski-Górecki 
Signed-off-by: Jason Andryuk 
---
Changes in v4:
 - new patch, in place of both "libxl: use vchan for QMP access ..."
Changes in v5:
 - Use device-model/%u/qmp-proxy-state xenstore path
 - Rephrase comment
---
 tools/configure.ac   |   9 ++
 tools/libxl/libxl_dm.c   | 163 +--
 tools/libxl/libxl_internal.h |   1 +
 3 files changed, 165 insertions(+), 8 deletions(-)

diff --git a/tools/configure.ac b/tools/configure.ac
index ea0272766f..1c0ed4c3b1 100644
--- a/tools/configure.ac
+++ b/tools/configure.ac
@@ -192,6 +192,15 @@ AC_SUBST(qemu_xen)
 AC_SUBST(qemu_xen_path)
 AC_SUBST(qemu_xen_systemd)
 
+AC_ARG_WITH([stubdom-qmp-proxy],
+AC_HELP_STRING([--stubdom-qmp-proxy@<:@=PATH@:>@],
+[Use supplied binary PATH as a QMP proxy into stubdomain]),[
+stubdom_qmp_proxy="$withval"
+],[
+stubdom_qmp_proxy="$bindir/vchan-socket-proxy"
+])
+AC_DEFINE_UNQUOTED([STUBDOM_QMP_PROXY_PATH], ["$stubdom_qmp_proxy"], [QMP 
proxy path])
+
 AC_ARG_WITH([system-seabios],
 AS_HELP_STRING([--with-system-seabios@<:@=PATH@:>@],
[Use system supplied seabios PATH instead of building and installing
diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index 45d0dd56f5..e420c3fc7b 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -1200,7 +1200,11 @@ static int libxl__build_device_model_args_new(libxl__gc 
*gc,
   GCSPRINTF("%d", guest_domid), NULL);
 flexarray_append(dm_args, "-no-shutdown");
 
-/* There is currently no way to access the QMP socket in the stubdom */
+/*
+ * QMP access to qemu running in stubdomain is done over vchan. The
+ * stubdomain init script adds the appropriate monitor options for
+ * vchan-socket-proxy.
+ */
 if (!is_stubdom) {
 flexarray_append(dm_args, "-chardev");
 if (state->dm_monitor_fd >= 0) {
@@ -2194,6 +2198,23 @@ static void stubdom_pvqemu_unpaused(libxl__egc *egc,
 static void stubdom_xswait_cb(libxl__egc *egc, libxl__xswait_state *xswait,
   int rc, const char *p);
 
+static void spawn_qmp_proxy(libxl__egc *egc,
+libxl__stub_dm_spawn_state *sdss);
+
+static void qmp_proxy_confirm(libxl__egc *egc, libxl__spawn_state *spawn,
+  const char *xsdata);
+
+static void qmp_proxy_startup_failed(libxl__egc *egc,
+ libxl__spawn_state *spawn,
+ int rc);
+
+static void qmp_proxy_detached(libxl__egc *egc,
+   libxl__spawn_state *spawn);
+
+static void qmp_proxy_spawn_outcome(libxl__egc *egc,
+libxl__stub_dm_spawn_state *sdss,
+int rc);
+
 char *libxl__stub_dm_name(libxl__gc *gc, const char *guest_name)
 {
 return GCSPRINTF("%s-dm", guest_name);
@@ -2476,24 +2497,150 @@ static void spawn_stub_launch_dm(libxl__egc *egc,
 goto out;
 }
 
+sdss->qmp_proxy_spawn.ao = ao;
+if (libxl__stubdomain_is_linux(_config->b_info)) {
+spawn_qmp_proxy(egc, sdss);
+} else {
+qmp_proxy_spawn_outcome(egc, sdss, 0);
+}
+
+return;
+
+out:
+assert(ret);
+qmp_proxy_spawn_outcome(egc, sdss, ret);
+}
+
+static void spawn_qmp_proxy(libxl__egc *egc,
+libxl__stub_dm_spawn_state *sdss)
+{
+STATE_AO_GC(sdss->qmp_proxy_spawn.ao);
+const uint32_t guest_domid = sdss->dm.guest_domid;
+const uint32_t dm_domid = sdss->pvqemu.guest_domid;
+const char *dom_path = libxl__xs_get_dompath(gc, dm_domid);
+char **args;
+int nr = 0;
+int rc, logfile_w, null;
+
+if (access(STUBDOM_QMP_PROXY_PATH, X_OK) < 0) {
+LOGED(ERROR, guest_domid, "qmp proxy %s is not executable", 
STUBDOM_QMP_PROXY_PATH);
+rc = ERROR_FAIL;
+goto out;
+}
+
+sdss->qmp_proxy_spawn.what = GCSPRINTF("domain %d device model qmp proxy", 
guest_domid);
+sdss->qmp_proxy_spawn.pidpath = GCSPRINTF("%s/image/qmp-proxy-pid", 
dom_path);
+sdss->qmp_proxy_spawn.xspath = DEVICE_MODEL_XS_PATH(gc, 
LIBXL_TOOLSTACK_DOMID,
+

[PATCH v5 16/21] libxl: consider also qemu in stubdomain in libxl__dm_active check

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Since qemu-xen can now run in stubdomain too, handle this case when
checking it's state too.

Signed-off-by: Marek Marczykowski-Górecki 
Reviewed-by: Jason Andryuk 
Signed-off-by: Jason Andryuk 
---
 tools/libxl/libxl_dm.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index 03d7a38f1f..5d61da1de8 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -3749,12 +3749,18 @@ out:
 
 int libxl__dm_active(libxl__gc *gc, uint32_t domid)
 {
-char *pid, *path;
+char *pid, *dm_domid, *path;
 
 path = GCSPRINTF("/local/domain/%d/image/device-model-pid", domid);
 pid = libxl__xs_read(gc, XBT_NULL, path);
 
-return pid != NULL;
+if (pid)
+return true;
+
+path = GCSPRINTF("/local/domain/%d/image/device-model-domid", domid);
+dm_domid = libxl__xs_read(gc, XBT_NULL, path);
+
+return dm_domid != NULL;
 }
 
 int libxl__dm_check_start(libxl__gc *gc, libxl_domain_config *d_config,
-- 
2.20.1




[PATCH v5 13/21] Regenerate autotools files

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Since we have those generated files committed to the repo (why?!),
update them after changing configure.ac.

Signed-off-by: Marek Marczykowski-Górecki 
Signed-off-by: Jason Andryuk 
---
 configure | 14 +-
 docs/configure| 14 +-
 stubdom/configure | 14 +-
 tools/config.h.in |  3 +++
 tools/configure   | 46 --
 5 files changed, 34 insertions(+), 57 deletions(-)

diff --git a/configure b/configure
index 9da3970cef..8af54e8a5a 100755
--- a/configure
+++ b/configure
@@ -644,7 +644,6 @@ infodir
 docdir
 oldincludedir
 includedir
-runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -723,7 +722,6 @@ datadir='${datarootdir}'
 sysconfdir='${prefix}/etc'
 sharedstatedir='${prefix}/com'
 localstatedir='${prefix}/var'
-runstatedir='${localstatedir}/run'
 includedir='${prefix}/include'
 oldincludedir='/usr/include'
 docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -976,15 +974,6 @@ do
   | -silent | --silent | --silen | --sile | --sil)
 silent=yes ;;
 
-  -runstatedir | --runstatedir | --runstatedi | --runstated \
-  | --runstate | --runstat | --runsta | --runst | --runs \
-  | --run | --ru | --r)
-ac_prev=runstatedir ;;
-  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
-  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
-  | --run=* | --ru=* | --r=*)
-runstatedir=$ac_optarg ;;
-
   -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
 ac_prev=sbindir ;;
   -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1122,7 +,7 @@ fi
 for ac_var in  exec_prefix prefix bindir sbindir libexecdir datarootdir \
datadir sysconfdir sharedstatedir localstatedir includedir \
oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
-   libdir localedir mandir runstatedir
+   libdir localedir mandir
 do
   eval ac_val=\$$ac_var
   # Remove trailing slashes.
@@ -1275,7 +1264,6 @@ Fine tuning of the installation directories:
   --sysconfdir=DIRread-only single-machine data [PREFIX/etc]
   --sharedstatedir=DIRmodifiable architecture-independent data [PREFIX/com]
   --localstatedir=DIR modifiable single-machine data [PREFIX/var]
-  --runstatedir=DIR   modifiable per-process data [LOCALSTATEDIR/run]
   --libdir=DIRobject code libraries [EPREFIX/lib]
   --includedir=DIRC header files [PREFIX/include]
   --oldincludedir=DIR C header files for non-gcc [/usr/include]
diff --git a/docs/configure b/docs/configure
index 9e3ed60462..93e9dcf404 100755
--- a/docs/configure
+++ b/docs/configure
@@ -634,7 +634,6 @@ infodir
 docdir
 oldincludedir
 includedir
-runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -711,7 +710,6 @@ datadir='${datarootdir}'
 sysconfdir='${prefix}/etc'
 sharedstatedir='${prefix}/com'
 localstatedir='${prefix}/var'
-runstatedir='${localstatedir}/run'
 includedir='${prefix}/include'
 oldincludedir='/usr/include'
 docdir='${datarootdir}/doc/${PACKAGE_TARNAME}'
@@ -964,15 +962,6 @@ do
   | -silent | --silent | --silen | --sile | --sil)
 silent=yes ;;
 
-  -runstatedir | --runstatedir | --runstatedi | --runstated \
-  | --runstate | --runstat | --runsta | --runst | --runs \
-  | --run | --ru | --r)
-ac_prev=runstatedir ;;
-  -runstatedir=* | --runstatedir=* | --runstatedi=* | --runstated=* \
-  | --runstate=* | --runstat=* | --runsta=* | --runst=* | --runs=* \
-  | --run=* | --ru=* | --r=*)
-runstatedir=$ac_optarg ;;
-
   -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
 ac_prev=sbindir ;;
   -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
@@ -1110,7 +1099,7 @@ fi
 for ac_var in  exec_prefix prefix bindir sbindir libexecdir datarootdir \
datadir sysconfdir sharedstatedir localstatedir includedir \
oldincludedir docdir infodir htmldir dvidir pdfdir psdir \
-   libdir localedir mandir runstatedir
+   libdir localedir mandir
 do
   eval ac_val=\$$ac_var
   # Remove trailing slashes.
@@ -1263,7 +1252,6 @@ Fine tuning of the installation directories:
   --sysconfdir=DIRread-only single-machine data [PREFIX/etc]
   --sharedstatedir=DIRmodifiable architecture-independent data [PREFIX/com]
   --localstatedir=DIR modifiable single-machine data [PREFIX/var]
-  --runstatedir=DIR   modifiable per-process data [LOCALSTATEDIR/run]
   --libdir=DIRobject code libraries [EPREFIX/lib]
   --includedir=DIRC header files [PREFIX/include]
   --oldincludedir=DIR C header files for non-gcc [/usr/include]
diff --git a/stubdom/configure b/stubdom/configure
index da03da535a..f7604a37f7 100755
--- a/stubdom/configure
+++ b/stubdom/configure
@@ -661,7 +661,6 @@ infodir
 docdir
 oldincludedir
 includedir
-runstatedir
 localstatedir
 sharedstatedir
 sysconfdir
@@ -751,7 +750,6 @@ 

[PATCH v5 18/21] libxl: Check stubdomain kernel & ramdisk presence

2020-04-27 Thread Jason Andryuk
Just out of context is the following comment for libxl__domain_make:
/* fixme: this function can leak the stubdom if it fails */

When the stubdomain kernel or ramdisk is not present, the domid and
stubdomain name will indeed be leaked.  Avoid the leak by checking the
file presence and erroring out when absent.  It doesn't fix all cases,
but it avoids a big one when using a linux device model stubdomain.

Signed-off-by: Jason Andryuk 
---
 tools/libxl/libxl_dm.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index 5d61da1de8..a57c13bdf4 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -2316,6 +2316,22 @@ void libxl__spawn_stub_dm(libxl__egc *egc, 
libxl__stub_dm_spawn_state *sdss)
 dm_config->num_vkbs = 1;
 }
 
+if (guest_config->b_info.stubdomain_kernel &&
+access(guest_config->b_info.stubdomain_kernel, R_OK) != 0) {
+LOGED(ERROR, guest_domid, "could not access stubdomain kernel %s",
+  guest_config->b_info.stubdomain_kernel);
+ret = ERROR_INVAL;
+goto out;
+}
+
+if (guest_config->b_info.stubdomain_ramdisk &&
+access(guest_config->b_info.stubdomain_ramdisk, R_OK) != 0) {
+LOGED(ERROR, guest_domid, "could not access stubdomain ramdisk %s",
+  guest_config->b_info.stubdomain_ramdisk);
+ret = ERROR_INVAL;
+goto out;
+}
+
 stubdom_state->pv_kernel.path = guest_config->b_info.stubdomain_kernel;
 stubdom_state->pv_ramdisk.path = guest_config->b_info.stubdomain_ramdisk;
 
-- 
2.20.1




[PATCH v5 19/21] libxl: Refactor kill_device_model to libxl__kill_xs_path

2020-04-27 Thread Jason Andryuk
Move kill_device_model to libxl__kill_xs_path so we have a helper to
kill a process from a pid stored in xenstore.  We'll be using it to kill
vchan-qmp-proxy.

libxl__kill_xs_path takes a "what" string for use in printing error
messages.  kill_device_model is retained in libxl_dm.c to provide the
string.

Signed-off-by: Jason Andryuk 
---
 tools/libxl/libxl_aoutils.c  | 32 
 tools/libxl/libxl_dm.c   | 27 +--
 tools/libxl/libxl_internal.h |  3 +++
 3 files changed, 36 insertions(+), 26 deletions(-)

diff --git a/tools/libxl/libxl_aoutils.c b/tools/libxl/libxl_aoutils.c
index 1be858c93c..c4c095a5ba 100644
--- a/tools/libxl/libxl_aoutils.c
+++ b/tools/libxl/libxl_aoutils.c
@@ -626,6 +626,38 @@ void libxl__kill(libxl__gc *gc, pid_t pid, int sig, const 
char *what)
 what, (unsigned long)pid, sig);
 }
 
+/* Generic function to signal (HUP) a pid stored in xenstore */
+int libxl__kill_xs_path(libxl__gc *gc, const char *xs_path_pid,
+const char *what)
+{
+const char *xs_pid;
+int ret, pid;
+
+ret = libxl__xs_read_checked(gc, XBT_NULL, xs_path_pid, _pid);
+if (ret || !xs_pid) {
+LOG(ERROR, "unable to find %s pid in %s", what, xs_path_pid);
+ret = ret ? : ERROR_FAIL;
+goto out;
+}
+pid = atoi(xs_pid);
+
+ret = kill(pid, SIGHUP);
+if (ret < 0 && errno == ESRCH) {
+LOG(ERROR, "%s already exited", what);
+ret = 0;
+} else if (ret == 0) {
+LOG(DEBUG, "%s signaled", what);
+ret = 0;
+} else {
+LOGE(ERROR, "failed to kill %s [%d]", what, pid);
+ret = ERROR_FAIL;
+goto out;
+}
+
+out:
+return ret;
+}
+
 /*
  * Local variables:
  * mode: C
diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index a57c13bdf4..10ca4226ba 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -3397,32 +3397,7 @@ out:
 /* Generic function to signal a Qemu instance to exit */
 static int kill_device_model(libxl__gc *gc, const char *xs_path_pid)
 {
-const char *xs_pid;
-int ret, pid;
-
-ret = libxl__xs_read_checked(gc, XBT_NULL, xs_path_pid, _pid);
-if (ret || !xs_pid) {
-LOG(ERROR, "unable to find device model pid in %s", xs_path_pid);
-ret = ret ? : ERROR_FAIL;
-goto out;
-}
-pid = atoi(xs_pid);
-
-ret = kill(pid, SIGHUP);
-if (ret < 0 && errno == ESRCH) {
-LOG(ERROR, "Device Model already exited");
-ret = 0;
-} else if (ret == 0) {
-LOG(DEBUG, "Device Model signaled");
-ret = 0;
-} else {
-LOGE(ERROR, "failed to kill Device Model [%d]", pid);
-ret = ERROR_FAIL;
-goto out;
-}
-
-out:
-return ret;
+return libxl__kill_xs_path(gc, xs_path_pid, "Device Model");
 }
 
 /* Helper to destroy a Qdisk backend */
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 00cfbe1fac..32aa797714 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2707,6 +2707,9 @@ int libxl__async_exec_start(libxl__async_exec_state *aes);
 bool libxl__async_exec_inuse(const libxl__async_exec_state *aes);
 
 _hidden void libxl__kill(libxl__gc *gc, pid_t pid, int sig, const char *what);
+/* kill SIGHUP a pid stored in xenstore */
+_hidden int libxl__kill_xs_path(libxl__gc *gc, const char *xs_path_pid,
+const char *what);
 
 /*- device addition/removal -*/
 
-- 
2.20.1




[PATCH v5 09/21] libxl: add save/restore support for qemu-xen in stubdomain

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Rely on a wrapper script in stubdomain to attach FD 3/4 of qemu to
relevant consoles.

Signed-off-by: Marek Marczykowski-Górecki 
Address TODO in dm_state_save_to_fdset: Only remove savefile for
non-stubdom.

Signed-off-by: Jason Andryuk 
---
Changes in v3:
 - adjust for qmp_ev*
 - assume specific fdset id in qemu set in stubdomain
Changes in v5:
 - Only remove savefile for non-stubdom
---
 tools/libxl/libxl_dm.c  | 23 +++
 tools/libxl/libxl_qmp.c | 27 +--
 2 files changed, 36 insertions(+), 14 deletions(-)

diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index bdc23554eb..45d0dd56f5 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -1744,10 +1744,17 @@ static int libxl__build_device_model_args_new(libxl__gc 
*gc,
 }
 
 if (state->saved_state) {
-/* This file descriptor is meant to be used by QEMU */
-*dm_state_fd = open(state->saved_state, O_RDONLY);
-flexarray_append(dm_args, "-incoming");
-flexarray_append(dm_args, GCSPRINTF("fd:%d",*dm_state_fd));
+if (is_stubdom) {
+/* Linux stubdomain connects specific FD to STUBDOM_CONSOLE_RESTORE
+ */
+flexarray_append(dm_args, "-incoming");
+flexarray_append(dm_args, "fd:3");
+} else {
+/* This file descriptor is meant to be used by QEMU */
+*dm_state_fd = open(state->saved_state, O_RDONLY);
+flexarray_append(dm_args, "-incoming");
+flexarray_append(dm_args, GCSPRINTF("fd:%d",*dm_state_fd));
+}
 }
 for (i = 0; b_info->extra && b_info->extra[i] != NULL; i++)
 flexarray_append(dm_args, b_info->extra[i]);
@@ -2218,14 +2225,6 @@ void libxl__spawn_stub_dm(libxl__egc *egc, 
libxl__stub_dm_spawn_state *sdss)
 
 assert(libxl_defbool_val(guest_config->b_info.device_model_stubdomain));
 
-if (libxl__stubdomain_is_linux(_config->b_info)) {
-if (d_state->saved_state) {
-LOG(ERROR, "Save/Restore not supported yet with Linux Stubdom.");
-ret = -1;
-goto out;
-}
-}
-
 sdss->pvqemu.guest_domid = INVALID_DOMID;
 
 libxl_domain_create_info_init(_config->c_info);
diff --git a/tools/libxl/libxl_qmp.c b/tools/libxl/libxl_qmp.c
index efaba91086..c394000ea9 100644
--- a/tools/libxl/libxl_qmp.c
+++ b/tools/libxl/libxl_qmp.c
@@ -962,6 +962,7 @@ static void dm_stopped(libxl__egc *egc, libxl__ev_qmp *ev,
const libxl__json_object *response, int rc);
 static void dm_state_fd_ready(libxl__egc *egc, libxl__ev_qmp *ev,
   const libxl__json_object *response, int rc);
+static void dm_state_save_to_fdset(libxl__egc *egc, libxl__ev_qmp *ev, int 
fdset);
 static void dm_state_saved(libxl__egc *egc, libxl__ev_qmp *ev,
const libxl__json_object *response, int rc);
 
@@ -994,10 +995,17 @@ static void dm_stopped(libxl__egc *egc, libxl__ev_qmp *ev,
 EGC_GC;
 libxl__domain_suspend_state *dsps = CONTAINER_OF(ev, *dsps, qmp);
 const char *const filename = dsps->dm_savefile;
+uint32_t dm_domid = libxl_get_stubdom_id(CTX, dsps->domid);
 
 if (rc)
 goto error;
 
+if (dm_domid) {
+/* see Linux stubdom interface in docs/stubdom.txt */
+dm_state_save_to_fdset(egc, ev, 1);
+return;
+}
+
 ev->payload_fd = open(filename, O_WRONLY | O_CREAT, 0600);
 if (ev->payload_fd < 0) {
 LOGED(ERROR, ev->domid,
@@ -1028,7 +1036,6 @@ static void dm_state_fd_ready(libxl__egc *egc, 
libxl__ev_qmp *ev,
 EGC_GC;
 int fdset;
 const libxl__json_object *o;
-libxl__json_object *args = NULL;
 libxl__domain_suspend_state *dsps = CONTAINER_OF(ev, *dsps, qmp);
 
 close(ev->payload_fd);
@@ -1043,6 +1050,21 @@ static void dm_state_fd_ready(libxl__egc *egc, 
libxl__ev_qmp *ev,
 goto error;
 }
 fdset = libxl__json_object_get_integer(o);
+dm_state_save_to_fdset(egc, ev, fdset);
+return;
+
+error:
+assert(rc);
+libxl__remove_file(gc, dsps->dm_savefile);
+dsps->callback_device_model_done(egc, dsps, rc);
+}
+
+static void dm_state_save_to_fdset(libxl__egc *egc, libxl__ev_qmp *ev, int 
fdset)
+{
+EGC_GC;
+int rc;
+libxl__json_object *args = NULL;
+libxl__domain_suspend_state *dsps = CONTAINER_OF(ev, *dsps, qmp);
 
 ev->callback = dm_state_saved;
 
@@ -1060,7 +1082,8 @@ static void dm_state_fd_ready(libxl__egc *egc, 
libxl__ev_qmp *ev,
 
 error:
 assert(rc);
-libxl__remove_file(gc, dsps->dm_savefile);
+if (!libxl_get_stubdom_id(CTX, dsps->domid))
+libxl__remove_file(gc, dsps->dm_savefile);
 dsps->callback_device_model_done(egc, dsps, rc);
 }
 
-- 
2.20.1




[PATCH v5 08/21] tools/libvchan: notify server when client is connected

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Let the server know when the client is connected. Otherwise server will
notice only when client send some data.
This change does not break existing clients, as libvchan user should
handle spurious notifications anyway (for example acknowledge of remote
side reading the data).

Signed-off-by: Marek Marczykowski-Górecki 
Replace spaces with tabs to match the file's whitespace.
Signed-off-by: Jason Andryuk 
---
Marek: I had this patch in Qubes for a long time and totally forgot it
wasn't upstream thing...
---
 tools/libvchan/init.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/libvchan/init.c b/tools/libvchan/init.c
index 180833dc2f..ad4b64fbe3 100644
--- a/tools/libvchan/init.c
+++ b/tools/libvchan/init.c
@@ -447,6 +447,9 @@ struct libxenvchan *libxenvchan_client_init(struct 
xentoollog_logger *logger,
ctrl->ring->cli_live = 1;
ctrl->ring->srv_notify = VCHAN_NOTIFY_WRITE;
 
+   /* wake up the server */
+   xenevtchn_notify(ctrl->event, ctrl->event_port);
+
  out:
if (xs)
xs_daemon_close(xs);
-- 
2.20.1




[PATCH v5 10/21] tools: add missing libxenvchan cflags

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

libxenvchan.h include xenevtchn.h and xengnttab.h, so applications built
with it needs applicable -I in CFLAGS too.

Signed-off-by: Marek Marczykowski-Górecki 
Reviewed-by: Jason Andryuk 
Signed-off-by: Jason Andryuk 
---
 tools/Rules.mk | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/Rules.mk b/tools/Rules.mk
index 9bac15c8d1..078eb7230a 100644
--- a/tools/Rules.mk
+++ b/tools/Rules.mk
@@ -159,7 +159,7 @@ SHDEPS_libxenstat  = $(SHLIB_libxenctrl) 
$(SHLIB_libxenstore)
 LDLIBS_libxenstat  = $(SHDEPS_libxenstat) 
$(XEN_LIBXENSTAT)/libxenstat$(libextension)
 SHLIB_libxenstat   = $(SHDEPS_libxenstat) -Wl,-rpath-link=$(XEN_LIBXENSTAT)
 
-CFLAGS_libxenvchan = -I$(XEN_LIBVCHAN)
+CFLAGS_libxenvchan = -I$(XEN_LIBVCHAN) $(CFLAGS_libxengnttab) 
$(CFLAGS_libxenevtchn)
 SHDEPS_libxenvchan = $(SHLIB_libxentoollog) $(SHLIB_libxenstore) 
$(SHLIB_libxenevtchn) $(SHLIB_libxengnttab)
 LDLIBS_libxenvchan = $(SHDEPS_libxenvchan) 
$(XEN_LIBVCHAN)/libxenvchan$(libextension)
 SHLIB_libxenvchan  = $(SHDEPS_libxenvchan) -Wl,-rpath-link=$(XEN_LIBVCHAN)
-- 
2.20.1




[PATCH v5 03/21] libxl: fix qemu-trad cmdline for no sdl/vnc case

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

When qemu is running in stubdomain, any attempt to initialize vnc/sdl
there will crash it (on failed attempt to load a keymap from a file). If
vfb is present, all those cases are skipped. But since
b053f0c4c9e533f3d97837cf897eb920b8355ed3 "libxl: do not start dom0 qemu
for stubdomain when not needed" it is possible to create a stubdomain
without vfb and contrary to the comment -vnc none do trigger VNC
initialization code (just skips exposing it externally).
Change the implicit SDL avoiding method to -nographics option, used when
none of SDL or VNC is enabled.

Signed-off-by: Marek Marczykowski-Górecki 
Reviewed-by: Jason Andryuk 
Acked-by: Ian Jackson 
Acked-by: Wei Liu 
Signed-off-by: Jason Andryuk 
---
Changes in v2:
 - typo in qemu option
Changes in v3:
 - add missing { }
---
 tools/libxl/libxl_dm.c | 11 ++-
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index f4007bbe50..b91e63db6f 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -734,14 +734,15 @@ static int libxl__build_device_model_args_old(libxl__gc 
*gc,
 if (libxl_defbool_val(vnc->findunused)) {
 flexarray_append(dm_args, "-vncunused");
 }
-} else
+} else if (!sdl) {
 /*
  * VNC is not enabled by default by qemu-xen-traditional,
- * however passing -vnc none causes SDL to not be
- * (unexpectedly) enabled by default. This is overridden by
- * explicitly passing -sdl below as required.
+ * however skipping -vnc causes SDL to be
+ * (unexpectedly) enabled by default. If undesired, disable graphics at
+ * all.
  */
-flexarray_append_pair(dm_args, "-vnc", "none");
+flexarray_append(dm_args, "-nographic");
+}
 
 if (sdl) {
 flexarray_append(dm_args, "-sdl");
-- 
2.20.1




[PATCH v5 05/21] libxl: Handle Linux stubdomain specific QEMU options.

2020-04-27 Thread Jason Andryuk
From: Eric Shelton 

This patch creates an appropriate command line for the QEMU instance
running in a Linux-based stubdomain.

NOTE: a number of items are not currently implemented for Linux-based
stubdomains, such as:
- save/restore
- QMP socket
- graphics output (e.g., VNC)

Signed-off-by: Eric Shelton 

Simon:
 * fix disk path
 * fix cdrom path and "format"

Signed-off-by: Simon Gaiser 
[drop Qubes-specific parts]
Signed-off-by: Marek Marczykowski-Górecki 

Allow setting stubdomain_ramdisk independently from stubdomain_kernel
Add a qemu- prefix for qemu-stubdom-linux-{kernel,rootfs} since stubdom
doesn't convey device-model.  Use qemu- since this code is qemu specific.

Signed-off-by: Jason Andryuk 
---
Changes in v2:
 - fix serial specified with serial=[ ... ] syntax
 - error out on multiple consoles (incompatible with stubdom)
 - drop erroneous chunk about cdrom
Changes in v3:
 - change to use libxl__stubdomain_is_linux instead of
   b_info->stubdomain_version
 - drop libxl__stubdomain_version_running, prefer
   libxl__stubdomain_is_linux_running introduced by previous patch
 - drop ifup/ifdown script - stubdomain will handle that with qemu
   events itself
 - slightly simplify -serial argument
 - add support for multiple serial consoles, do not ignore
   b_info.u.serial(_list)
 - add error checking for more than 26 emulated disks ("/dev/xvd%c"
   format string)
Changes in v5:
 - commit message fixup to match patch contents - Marek
 - file names are now qemu-stubdom-linux-{kernel,rootfs} - Jason
 - allow setting ramdisk independently of kernel - Jason
---
 tools/libxl/libxl_create.c   |  45 +
 tools/libxl/libxl_dm.c   | 190 ---
 tools/libxl/libxl_internal.h |   1 +
 tools/libxl/libxl_mem.c  |   6 +-
 tools/libxl/libxl_types.idl  |   3 +
 5 files changed, 184 insertions(+), 61 deletions(-)

diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index 7423ee8e57..3b5535f2c8 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -171,6 +171,40 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
 }
 }
 
+if (b_info->type == LIBXL_DOMAIN_TYPE_HVM &&
+libxl_defbool_val(b_info->device_model_stubdomain)) {
+if (!b_info->stubdomain_kernel) {
+switch (b_info->device_model_version) {
+case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL:
+b_info->stubdomain_kernel =
+libxl__abs_path(NOGC, "ioemu-stubdom.gz", 
libxl__xenfirmwaredir_path());
+break;
+case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
+b_info->stubdomain_kernel =
+libxl__abs_path(NOGC,
+"qemu-stubdom-linux-kernel",
+libxl__xenfirmwaredir_path());
+break;
+default:
+abort();
+}
+}
+if (!b_info->stubdomain_ramdisk) {
+switch (b_info->device_model_version) {
+case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL:
+break;
+case LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN:
+b_info->stubdomain_ramdisk =
+libxl__abs_path(NOGC,
+"qemu-stubdom-linux-rootfs",
+libxl__xenfirmwaredir_path());
+break;
+default:
+abort();
+}
+}
+}
+
 if (!b_info->max_vcpus)
 b_info->max_vcpus = 1;
 if (!b_info->avail_vcpus.size) {
@@ -206,6 +240,17 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
 if (b_info->target_memkb == LIBXL_MEMKB_DEFAULT)
 b_info->target_memkb = b_info->max_memkb;
 
+if (b_info->stubdomain_memkb == LIBXL_MEMKB_DEFAULT) {
+if (libxl_defbool_val(b_info->device_model_stubdomain)) {
+if (libxl__stubdomain_is_linux(b_info))
+b_info->stubdomain_memkb = LIBXL_LINUX_STUBDOM_MEM * 1024;
+else
+b_info->stubdomain_memkb = 28 * 1024; // MiniOS
+} else {
+b_info->stubdomain_memkb = 0; // no stubdomain
+}
+}
+
 libxl_defbool_setdefault(_info->claim_mode, false);
 
 libxl_defbool_setdefault(_info->localtime, false);
diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index b91e63db6f..5a7d55686f 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -1188,6 +1188,7 @@ static int libxl__build_device_model_args_new(libxl__gc 
*gc,
 int i, connection, devid;
 uint64_t ram_size;
 const char *path, *chardev;
+bool is_stubdom = libxl_defbool_val(b_info->device_model_stubdomain);
 
 dm_args = flexarray_make(gc, 16, 1);
 dm_envs = flexarray_make(gc, 16, 1);
@@ -1197,39 +1198,42 @@ static int libxl__build_device_model_args_new(libxl__gc 
*gc,
 

[PATCH v5 04/21] libxl: Allow running qemu-xen in stubdomain

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Do not prohibit anymore using stubdomain with qemu-xen.
To help distingushing MiniOS and Linux stubdomain, add helper inline
functions libxl__stubdomain_is_linux() and
libxl__stubdomain_is_linux_running(). Those should be used where really
the difference is about MiniOS/Linux, not qemu-xen/qemu-xen-traditional.

Signed-off-by: Marek Marczykowski-Górecki 
Signed-off-by: Jason Andryuk 

---
Changes in v3:
 - new patch, instead of "libxl: Add "stubdomain_version" to
 domain_build_info"
 - helper functions as suggested by Ian Jackson
---
 tools/libxl/libxl_create.c   |  9 -
 tools/libxl/libxl_internal.h | 17 +
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/tools/libxl/libxl_create.c b/tools/libxl/libxl_create.c
index e7cb2dbc2b..7423ee8e57 100644
--- a/tools/libxl/libxl_create.c
+++ b/tools/libxl/libxl_create.c
@@ -171,15 +171,6 @@ int libxl__domain_build_info_setdefault(libxl__gc *gc,
 }
 }
 
-if (b_info->type == LIBXL_DOMAIN_TYPE_HVM &&
-b_info->device_model_version !=
-LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN_TRADITIONAL &&
-libxl_defbool_val(b_info->device_model_stubdomain)) {
-LOG(ERROR,
-"device model stubdomains require \"qemu-xen-traditional\"");
-return ERROR_INVAL;
-}
-
 if (!b_info->max_vcpus)
 b_info->max_vcpus = 1;
 if (!b_info->avail_vcpus.size) {
diff --git a/tools/libxl/libxl_internal.h b/tools/libxl/libxl_internal.h
index 5f39e44cb9..ebbf926897 100644
--- a/tools/libxl/libxl_internal.h
+++ b/tools/libxl/libxl_internal.h
@@ -2320,6 +2320,23 @@ _hidden int 
libxl__device_model_version_running(libxl__gc *gc, uint32_t domid);
   /* Return the system-wide default device model */
 _hidden libxl_device_model_version libxl__default_device_model(libxl__gc *gc);
 
+static inline
+bool libxl__stubdomain_is_linux_running(libxl__gc *gc, uint32_t domid)
+{
+/* same logic as in libxl__stubdomain_is_linux */
+return libxl__device_model_version_running(gc, domid)
+== LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN;
+}
+
+static inline
+bool libxl__stubdomain_is_linux(libxl_domain_build_info *b_info)
+{
+/* right now qemu-tranditional implies MiniOS stubdomain and qemu-xen
+ * implies Linux stubdomain */
+return libxl_defbool_val(b_info->device_model_stubdomain) &&
+b_info->device_model_version == LIBXL_DEVICE_MODEL_VERSION_QEMU_XEN;
+}
+
 #define DEVICE_MODEL_XS_PATH(gc, dm_domid, domid, fmt, _a...)  \
 libxl__sprintf(gc, "/local/domain/%u/device-model/%u" fmt, dm_domid,   \
domid, ##_a)
-- 
2.20.1




[PATCH v5 02/21] Document ioemu Linux stubdomain protocol

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Add documentation for upcoming Linux stubdomain for qemu-upstream.

Signed-off-by: Marek Marczykowski-Górecki 
Signed-off-by: Jason Andryuk 
---
 docs/misc/stubdom.txt | 50 +++
 1 file changed, 50 insertions(+)

diff --git a/docs/misc/stubdom.txt b/docs/misc/stubdom.txt
index 64c77d9b64..bb5e87dfb9 100644
--- a/docs/misc/stubdom.txt
+++ b/docs/misc/stubdom.txt
@@ -75,6 +75,56 @@ Defined commands:
- "running" - success
 
 
+Toolstack to Linux ioemu stubdomain protocol
+
+
+This section describe communication protocol between toolstack and
+qemu-upstream running in Linux stubdomain. The protocol include
+expectations of both stubdomain, and qemu.
+
+Setup (done by toolstack, expected by stubdomain):
+ - Block devices for target domain are connected as PV disks to stubdomain,
+   according to configuration order, starting with xvda
+ - Network devices for target domain are connected as PV nics to stubdomain,
+   according to configuration order, starting with 0
+ - [not implemented] if graphics output is expected, VFB and VKB devices are 
set for stubdomain
+   (its backend is responsible for exposing them using appropriate protocol
+   like VNC or Spice)
+ - other target domain's devices are not connected at this point to stubdomain
+   (may be hot-plugged later)
+ - QEMU command line is stored in
+   /vm//image/dmargs xenstore dir, each argument as separate key
+   in form /vm//image/dmargs/NNN, where NNN is 0-padded argument
+   number
+ - target domain id is stored in /local/domain//target xenstore 
path
+?? - bios type is stored in /local/domain//hvmloader/bios
+ - stubdomain's console 0 is connected to qemu log file
+ - stubdomain's console 1 is connected to qemu save file (for saving state)
+ - stubdomain's console 2 is connected to qemu save file (for restoring state)
+ - next consoles are connected according to target guest's serial console 
configuration
+
+Environment exposed by stubdomain to qemu (needed to construct appropriate 
qemu command line and later interact with qmp):
+ - target domain's disks are available as /dev/xvd[a-z]
+ - console 2 (incoming domain state) is connected with FD 3
+ - console 1 (saving domain state) is added over QMP to qemu as "fdset-id 1" 
(done by stubdomain, toolstack doesn't need to care about it)
+ - nics are connected to relevant stubdomain PV vifs when available (qemu 
-netdev should specify ifname= explicitly)
+
+Startup:
+1. toolstack starts PV stubdomain with stubdom-linux-kernel kernel and 
stubdom-linux-initrd initrd
+2. stubdomain initialize relevant devices
+3. stubdomain starts qemu with requested command line, plus few stubdomain 
specific ones - including local qmp access options
+4. stubdomain starts vchan server on 
/local/domain//device-model//qmp-vchan, exposing qmp 
socket to the toolstack
+5. qemu signal readiness by writing "running" to 
/local/domain//device-model//state xenstore path
+6. now device model is considered running
+
+QEMU can be controlled using QMP over vchan at 
/local/domain//device-model//qmp-vchan. Only one 
simultaneous connection is supported and toolstack needs to ensure that.
+
+Limitations:
+ - PCI passthrough require permissive mode
+ - only one nic is supported
+ - at most 26 emulated disks are supported (more are still available as PV 
disks)
+ - graphics output (VNC/SDL/Spice) not supported
+
 
PV-GRUB
===
-- 
2.20.1




[PATCH v5 01/21] Document ioemu MiniOS stubdomain protocol

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Add documentation based on reverse-engineered toolstack-ioemu stubdomain
protocol.

Signed-off-by: Marek Marczykowski-Górecki 
Signed-off-by: Jason Andryuk 
---
 docs/misc/stubdom.txt | 53 +++
 1 file changed, 53 insertions(+)

diff --git a/docs/misc/stubdom.txt b/docs/misc/stubdom.txt
index 882a18cab4..64c77d9b64 100644
--- a/docs/misc/stubdom.txt
+++ b/docs/misc/stubdom.txt
@@ -23,6 +23,59 @@ and https://wiki.xen.org/wiki/Device_Model_Stub_Domains for 
more
 information on device model stub domains
 
 
+Toolstack to MiniOS ioemu stubdomain protocol
+-
+
+This section describe communication protocol between toolstack and
+qemu-traditional running in MiniOS stubdomain. The protocol include
+expectations of both qemu and stubdomain itself.
+
+Setup (done by toolstack, expected by stubdomain):
+ - Block devices for target domain are connected as PV disks to stubdomain,
+   according to configuration order, starting with xvda
+ - Network devices for target domain are connected as PV nics to stubdomain,
+   according to configuration order, starting with 0
+ - if graphics output is expected, VFB and VKB devices are set for stubdomain
+   (its backend is responsible for exposing them using appropriate protocol
+   like VNC or Spice)
+ - other target domain's devices are not connected at this point to stubdomain
+   (may be hot-plugged later)
+ - QEMU command line (space separated arguments) is stored in
+   /vm//image/dmargs xenstore path
+ - target domain id is stored in /local/domain//target xenstore 
path
+?? - bios type is stored in /local/domain//hvmloader/bios
+ - stubdomain's console 0 is connected to qemu log file
+ - stubdomain's console 1 is connected to qemu save file (for saving state)
+ - stubdomain's console 2 is connected to qemu save file (for restoring state)
+ - next consoles are connected according to target guest's serial console 
configuration
+
+Startup:
+1. PV stubdomain is started with ioemu-stubdom.gz kernel and no initrd
+2. stubdomain initialize relevant devices
+3. stubdomain signal readiness by writing "running" to 
/local/domain//device-model//state xenstore path
+4. now stubdomain is considered running
+
+Runtime control (hotplug etc):
+Toolstack can issue command through xenstore. The sequence is (from toolstack 
POV):
+1. Write parameter to 
/local/domain//device-model//parameter.
+2. Write command to 
/local/domain//device-model//command.
+3. Wait for command result in 
/local/domain//device-model//state (command specific 
value).
+4. Write "running" back to 
/local/domain//device-model//state.
+
+Defined commands:
+ - "pci-ins" - PCI hot plug, results:
+   - "pci-inserted" - success
+   - "pci-insert-failed" - failure
+ - "pci-rem" - PCI hot remove, results:
+   - "pci-removed" - success
+   - ??
+ - "save" - save domain state to console 1, results:
+   - "paused" - success
+ - "continue" - resume domain execution, after loading state from console 2 
(require -loadvm command argument), results:
+   - "running" - success
+
+
+
PV-GRUB
===
 
-- 
2.20.1




[PATCH v5 11/21] tools: add simple vchan-socket-proxy

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Add a simple proxy for tunneling socket connection over vchan. This is
based on existing vchan-node* applications, but extended with socket
support. vchan-socket-proxy serves both as a client and as a server,
depending on parameters. It can be used to transparently communicate
with an application in another domian that normally expose UNIX socket
interface. Specifically, it's written to communicate with qemu running
within stubdom.

Server mode listens for vchan connections and when one is opened,
connects to a pointed UNIX socket.  Client mode listens on UNIX
socket and when someone connects, opens a vchan connection.  Only
a single connection at a time is supported.

Additionally, socket can be provided as a number - in which case it's
interpreted as already open FD (in case of UNIX listening socket -
listen() needs to be already called). Or "-" meaning stdin/stdout - in
which case it is reduced to vchan-node2 functionality.

Example usage:

1. (in dom0) vchan-socket-proxy --mode=client 
/local/domain//data/vchan/1234 /run/qemu.(DOMID)

2. (in DOMID) vchan-socket-proxy --mode=server 0
   /local/domain//data/vchan/1234 /run/qemu.(DOMID)

This will listen on /run/qemu.(DOMID) in dom0 and whenever connection is
made, it will connect to DOMID, where server process will connect to
/run/qemu.(DOMID) there. When client disconnects, vchan connection is
terminated and server vchan-socket-proxy process also disconnects from
qemu.

Signed-off-by: Marek Marczykowski-Górecki 
Reviewed-by: Jason Andryuk 
Signed-off-by: Jason Andryuk 
---
Changes on v5:
 - Ensure bindir directory is present
 - String and comment fixes
---
 .gitignore  |   1 +
 tools/libvchan/Makefile |   8 +-
 tools/libvchan/vchan-socket-proxy.c | 478 
 3 files changed, 486 insertions(+), 1 deletion(-)
 create mode 100644 tools/libvchan/vchan-socket-proxy.c

diff --git a/.gitignore b/.gitignore
index 4ca679ddbc..1760e54464 100644
--- a/.gitignore
+++ b/.gitignore
@@ -368,6 +368,7 @@ tools/misc/xenwatchdogd
 tools/misc/xen-hvmcrash
 tools/misc/xen-lowmemd
 tools/libvchan/vchan-node[12]
+tools/libvchan/vchan-socket-proxy
 tools/ocaml/*/.ocamldep.make
 tools/ocaml/*/*.cm[ixao]
 tools/ocaml/*/*.cmxa
diff --git a/tools/libvchan/Makefile b/tools/libvchan/Makefile
index 7892750c3e..913bcc8884 100644
--- a/tools/libvchan/Makefile
+++ b/tools/libvchan/Makefile
@@ -13,6 +13,7 @@ LIBVCHAN_PIC_OBJS = $(patsubst %.o,%.opic,$(LIBVCHAN_OBJS))
 LIBVCHAN_LIBS = $(LDLIBS_libxenstore) $(LDLIBS_libxengnttab) 
$(LDLIBS_libxenevtchn)
 $(LIBVCHAN_OBJS) $(LIBVCHAN_PIC_OBJS): CFLAGS += $(CFLAGS_libxenstore) 
$(CFLAGS_libxengnttab) $(CFLAGS_libxenevtchn)
 $(NODE_OBJS) $(NODE2_OBJS): CFLAGS += $(CFLAGS_libxengnttab) 
$(CFLAGS_libxenevtchn)
+vchan-socket-proxy.o: CFLAGS += $(CFLAGS_libxenstore) $(CFLAGS_libxenctrl) 
$(CFLAGS_libxengnttab) $(CFLAGS_libxenevtchn)
 
 MAJOR = 4.14
 MINOR = 0
@@ -39,7 +40,7 @@ $(PKG_CONFIG_LOCAL): PKG_CONFIG_LIBDIR = $(CURDIR)
 $(PKG_CONFIG_LOCAL): PKG_CONFIG_CFLAGS_LOCAL = $(CFLAGS_xeninclude)
 
 .PHONY: all
-all: libxenvchan.so vchan-node1 vchan-node2 libxenvchan.a $(PKG_CONFIG_INST) 
$(PKG_CONFIG_LOCAL)
+all: libxenvchan.so vchan-node1 vchan-node2 vchan-socket-proxy libxenvchan.a 
$(PKG_CONFIG_INST) $(PKG_CONFIG_LOCAL)
 
 libxenvchan.so: libxenvchan.so.$(MAJOR)
ln -sf $< $@
@@ -59,13 +60,18 @@ vchan-node1: $(NODE_OBJS) libxenvchan.so
 vchan-node2: $(NODE2_OBJS) libxenvchan.so
$(CC) $(LDFLAGS) -o $@ $(NODE2_OBJS) $(LDLIBS_libxenvchan) 
$(APPEND_LDFLAGS)
 
+vchan-socket-proxy: vchan-socket-proxy.o libxenvchan.so
+   $(CC) $(LDFLAGS) -o $@ $< $(LDLIBS_libxenvchan) $(LDLIBS_libxenstore) 
$(LDLIBS_libxenctrl) $(APPEND_LDFLAGS)
+
 .PHONY: install
 install: all
$(INSTALL_DIR) $(DESTDIR)$(libdir)
$(INSTALL_DIR) $(DESTDIR)$(includedir)
+   $(INSTALL_DIR) $(DESTDIR)$(bindir)
$(INSTALL_PROG) libxenvchan.so.$(MAJOR).$(MINOR) $(DESTDIR)$(libdir)
ln -sf libxenvchan.so.$(MAJOR).$(MINOR) 
$(DESTDIR)$(libdir)/libxenvchan.so.$(MAJOR)
ln -sf libxenvchan.so.$(MAJOR) $(DESTDIR)$(libdir)/libxenvchan.so
+   $(INSTALL_PROG) vchan-socket-proxy $(DESTDIR)$(bindir)
$(INSTALL_DATA) libxenvchan.h $(DESTDIR)$(includedir)
$(INSTALL_DATA) libxenvchan.a $(DESTDIR)$(libdir)
$(INSTALL_DATA) xenvchan.pc $(DESTDIR)$(PKG_INSTALLDIR)
diff --git a/tools/libvchan/vchan-socket-proxy.c 
b/tools/libvchan/vchan-socket-proxy.c
new file mode 100644
index 00..13700c5d67
--- /dev/null
+++ b/tools/libvchan/vchan-socket-proxy.c
@@ -0,0 +1,478 @@
+/**
+ * @file
+ * @section AUTHORS
+ *
+ * Copyright (C) 2010  Rafal Wojtczuk  
+ *
+ *  Authors:
+ *   Rafal Wojtczuk  
+ *   Daniel De Graaf 
+ *   Marek Marczykowski-Górecki  
+ *
+ * @section LICENSE
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Lesser General Public
+ *  

[PATCH v5 06/21] libxl: write qemu arguments into separate xenstore keys

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

This allows using arguments with spaces, like -append, without
nominating any special "separator" character.

Signed-off-by: Marek Marczykowski-Górecki 
Reviewed-by: Jason Andryuk 
Signed-off-by: Jason Andryuk 
---
Changes in v3:
 - previous version of this patch "libxl: use \x1b to separate qemu
   arguments for linux stubdomain" used specific non-printable
   separator, but it was rejected as xenstore doesn't cope well with
   non-printable chars
---
 tools/libxl/libxl_dm.c | 39 ++-
 1 file changed, 38 insertions(+), 1 deletion(-)

diff --git a/tools/libxl/libxl_dm.c b/tools/libxl/libxl_dm.c
index 5a7d55686f..bdc23554eb 100644
--- a/tools/libxl/libxl_dm.c
+++ b/tools/libxl/libxl_dm.c
@@ -2065,6 +2065,40 @@ static int 
libxl__vfb_and_vkb_from_hvm_guest_config(libxl__gc *gc,
 return 0;
 }
 
+static int libxl__write_stub_linux_dmargs(libxl__gc *gc,
+int dm_domid, int guest_domid,
+char **args)
+{
+libxl_ctx *ctx = libxl__gc_owner(gc);
+int i;
+char *vm_path;
+char *path;
+struct xs_permissions roperm[2];
+xs_transaction_t t;
+
+roperm[0].id = 0;
+roperm[0].perms = XS_PERM_NONE;
+roperm[1].id = dm_domid;
+roperm[1].perms = XS_PERM_READ;
+
+vm_path = libxl__xs_read(gc, XBT_NULL, GCSPRINTF("/local/domain/%d/vm", 
guest_domid));
+path = GCSPRINTF("%s/image/dmargs", vm_path);
+
+retry_transaction:
+t = xs_transaction_start(ctx->xsh);
+xs_write(ctx->xsh, t, path, "", 0);
+xs_set_permissions(ctx->xsh, t, path, roperm, ARRAY_SIZE(roperm));
+i = 1;
+for (i=1; args[i] != NULL; i++)
+xs_write(ctx->xsh, t, GCSPRINTF("%s/%03d", path, i), args[i], 
strlen(args[i]));
+
+xs_set_permissions(ctx->xsh, t, GCSPRINTF("%s/rtc/timeoffset", vm_path), 
roperm, ARRAY_SIZE(roperm));
+if (!xs_transaction_end(ctx->xsh, t, 0))
+if (errno == EAGAIN)
+goto retry_transaction;
+return 0;
+}
+
 static int libxl__write_stub_dmargs(libxl__gc *gc,
 int dm_domid, int guest_domid,
 char **args)
@@ -2274,7 +2308,10 @@ void libxl__spawn_stub_dm(libxl__egc *egc, 
libxl__stub_dm_spawn_state *sdss)
 
 libxl__store_libxl_entry(gc, guest_domid, "dm-version",
 
libxl_device_model_version_to_string(dm_config->b_info.device_model_version));
-libxl__write_stub_dmargs(gc, dm_domid, guest_domid, args);
+if (libxl__stubdomain_is_linux(_config->b_info))
+libxl__write_stub_linux_dmargs(gc, dm_domid, guest_domid, args);
+else
+libxl__write_stub_dmargs(gc, dm_domid, guest_domid, args);
 libxl__xs_printf(gc, XBT_NULL,
  GCSPRINTF("%s/image/device-model-domid",
libxl__xs_get_dompath(gc, guest_domid)),
-- 
2.20.1




[PATCH v5 00/21] Add support for qemu-xen runnning in a Linux-based stubdomain

2020-04-27 Thread Jason Andryuk
Hi,

In coordination with Marek, I'm making a submission of his patches for Linux
stubdomain device-model support.  I made a few of my own additions, but Marek
did the heavy lifting.  Thank you, Marek.

Below is mostly the v4 cover leter with a few additions.

General idea is to allow freely set device_model_version and
device_model_stubdomain_override and choose the right options based on this
choice.  Also, allow to specific path to stubdomain kernel/ramdisk, for greater
flexibility.

First two patches add documentation about expected toolstack-stubdomain-qemu
interface, both for MiniOS stubdomain and Linux stubdomain.

Initial version has no QMP support - in initial patches it is completely
disabled, which means no suspend/restore and no PCI passthrough.

Later patches add QMP over libvchan connection support. The actual connection
is made in a separate process. As discussed on Xen Summit 2019, this allows to
apply some basic checks and/or filtering (not part of this series), to limit
libxl exposure for potentially malicious stubdomain.

Jason's additions ensure the qmp-proxy (vchan-socket-proxy) processes and
sockets are cleaned up and add some documentation.

The actual stubdomain implementation is here:

https://github.com/marmarek/qubes-vmm-xen-stubdom-linux
(branch for-upstream, tag for-upstream-v3)

See readme there for build instructions.  Marek's version requires dracut.  I
have hacked up a version usable install with initramfs-tools:

   https://github.com/jandryuk/qubes-vmm-xen-stubdom-linux
   (branch initramfs-tools)

Few comments/questions about the stubdomain code:

1. There are extra patches for qemu that are necessary to run it in stubdomain.
While it is desirable to upstream them, I think it can be done after merging
libxl part. Stubdomain's qemu build will in most cases be separate anyway, to
limit qemu's dependencies (so the stubdomain size).

2. By default Linux hvc-xen console frontend is unreliable for data transfer
(qemu state save/restore) - it drops data sent faster than client is reading
it. To fix it, console device needs to be switched into raw mode (`stty raw
/dev/hvc1`). Especially for restoring qemu state it is tricky, as it would need
to be done before opening the device, but stty (obviously) needs to open the
device first. To solve this problem, for now the repository contains kernel
patch which changes the default for all hvc consoles. Again, this isn't
practical problem, as the kernel for stubdomain is built separately. But it
would be nice to have something working with vanilla kernel. I see those
options:
  - convert it to kernel cmdline parameter (hvc_console_raw=1 ?)
  - use channels instead of consoles (and on the kernel side change the default
to "raw" only for channels); while in theory better design, libxl part will
be more complex, as channels can be connected to sockets but not files, so
libxl would need to read/write to it exactly when qemu write/read the data,
not before/after as it is done now

3. Mini-OS stubdoms use dmargs xenstore key as a string.  Linux stubdoms use
dmargs as a directory for numbered entries.  Should they be different names?

Remaining parts for eliminating dom0's instance of qemu:
 - do not force QDISK backend for CDROM
 - multiple consoles support in xenconsoled

Changes in v2:
 - apply review comments by Jason Andryuk
Changes in v3:
 - rework qemu arguments handling (separate xenstore keys, instead of \x1b 
separator)
 - add QMP over libvchan, instead of console
 - add protocol documentation
 - a lot of minor changes, see individual patches for full changes list
 - split xenconsoled patches into separate series
Changes in v4:
 - extract vchan connection into a separate process
 - rebase on master
 - various fixes
Changes in v5:
 - Marek: apply review comments from Jason Andryuk
 - Jason: Clean up qmp-proxy processes and sockets

Cc: Marek Marczykowski-Górecki 
Cc: Simon Gaiser 
Cc: Eric Shelton 
Cc: Ian Jackson 
Cc: Wei Liu 

Eric Shelton (1):
  libxl: Handle Linux stubdomain specific QEMU options.

Jason Andryuk (5):
  docs: Add device-model-domid to xenstore-paths
  libxl: Check stubdomain kernel & ramdisk presence
  libxl: Refactor kill_device_model to libxl__kill_xs_path
  libxl: Kill vchan-socket-proxy when cleaning up qmp
  tools: Clean up vchan-socket-proxy socket

Marek Marczykowski-Górecki (15):
  Document ioemu MiniOS stubdomain protocol
  Document ioemu Linux stubdomain protocol
  libxl: fix qemu-trad cmdline for no sdl/vnc case
  libxl: Allow running qemu-xen in stubdomain
  libxl: write qemu arguments into separate xenstore keys
  xl: add stubdomain related options to xl config parser
  tools/libvchan: notify server when client is connected
  libxl: add save/restore support for qemu-xen in stubdomain
  tools: add missing libxenvchan cflags
  tools: add simple vchan-socket-proxy
  libxl: use vchan for QMP access with Linux stubdomain
  Regenerate autotools files
  libxl: require qemu in dom0 even 

[PATCH v5 07/21] xl: add stubdomain related options to xl config parser

2020-04-27 Thread Jason Andryuk
From: Marek Marczykowski-Górecki 

Signed-off-by: Marek Marczykowski-Górecki 
Reviewed-by: Jason Andryuk 
Signed-off-by: Jason Andryuk 
---
 docs/man/xl.cfg.5.pod.in | 27 +++
 tools/xl/xl_parse.c  |  7 +++
 2 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/docs/man/xl.cfg.5.pod.in b/docs/man/xl.cfg.5.pod.in
index 0e9e58a41a..c9bc181a95 100644
--- a/docs/man/xl.cfg.5.pod.in
+++ b/docs/man/xl.cfg.5.pod.in
@@ -2733,10 +2733,29 @@ model which they were installed with.
 
 =item B
 
-Override the path to the binary to be used as the device-model. The
-binary provided here MUST be consistent with the
-B which you have specified. You should not
-normally need to specify this option.
+Override the path to the binary to be used as the device-model running in
+toolstack domain. The binary provided here MUST be consistent with the
+B which you have specified. You should not normally need
+to specify this option.
+
+=item B
+
+Override the path to the kernel image used as device-model stubdomain.
+The binary provided here MUST be consistent with the
+B which you have specified.
+In case of B it is expected to be MiniOS-based stubdomain
+image, in case of B it is expected to be Linux-based stubdomain
+kernel.
+
+=item B
+
+Override the path to the ramdisk image used as device-model stubdomain.
+The binary provided here is to be used by a kernel pointed by 
B.
+It is known to be used only by Linux-based stubdomain kernel.
+
+=item B
+
+Start the stubdomain with MBYTES megabytes of RAM. Default is 128.
 
 =item B
 
diff --git a/tools/xl/xl_parse.c b/tools/xl/xl_parse.c
index 4450d59f16..61b4ef7b7e 100644
--- a/tools/xl/xl_parse.c
+++ b/tools/xl/xl_parse.c
@@ -2525,6 +2525,13 @@ skip_usbdev:
 xlu_cfg_replace_string(config, "device_model_user",
_info->device_model_user, 0);
 
+xlu_cfg_replace_string (config, "stubdomain_kernel",
+_info->stubdomain_kernel, 0);
+xlu_cfg_replace_string (config, "stubdomain_ramdisk",
+_info->stubdomain_ramdisk, 0);
+if (!xlu_cfg_get_long (config, "stubdomain_memory", , 0))
+b_info->stubdomain_memkb = l * 1024;
+
 #define parse_extra_args(type)\
 e = xlu_cfg_get_list_as_string_list(config, "device_model_args"#type, \
 _info->extra##type, 0);\
-- 
2.20.1




[xen-4.12-testing test] 149838: regressions - FAIL

2020-04-27 Thread osstest service owner
flight 149838 xen-4.12-testing real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149838/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-armhf-armhf-xl-credit2   7 xen-boot fail REGR. vs. 149646

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-qcow217 guest-localmigrate/x10   fail  like 149646
 test-amd64-i386-xl-pvshim12 guest-start  fail   never pass
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-xsm  13 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  14 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt  13 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-arm64-arm64-xl  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 14 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 12 migrate-support-checkfail   never pass
 test-amd64-amd64-qemuu-nested-amd 17 debian-hvm-install/l1/l2  fail never pass
 test-arm64-arm64-xl-credit1  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 13 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 14 saverestore-support-checkfail  never pass
 test-amd64-i386-xl-qemuu-win7-amd64 17 guest-stop  fail never pass
 test-armhf-armhf-xl-vhd  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  13 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt 13 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 13 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 14 saverestore-support-checkfail never pass
 test-amd64-amd64-xl-qemut-win7-amd64 17 guest-stop fail never pass
 test-amd64-amd64-xl-qemuu-ws16-amd64 17 guest-stop fail never pass
 test-amd64-i386-xl-qemut-win7-amd64 17 guest-stop  fail never pass
 test-amd64-i386-xl-qemuu-ws16-amd64 17 guest-stop  fail never pass
 test-armhf-armhf-libvirt-raw 12 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 13 saverestore-support-checkfail   never pass
 test-amd64-amd64-xl-qemuu-win7-amd64 17 guest-stop fail never pass
 test-amd64-amd64-xl-qemut-ws16-amd64 17 guest-stop fail never pass
 test-amd64-i386-xl-qemut-ws16-amd64 17 guest-stop  fail never pass

version targeted for testing:
 xen  e6a2681148382e9227f54b70a5df8e895914c877
baseline version:
 xen  3536f8dc39cc7311715340b87a04a89fac468406

Last test of basis   149646  2020-04-14 13:05:53 Z   13 days
Testing same since   149838  2020-04-27 14:06:02 Z0 days1 attempts


People who touched revisions under test:
  Andrew Cooper 
  Anthony PERARD 
  Harsha Shamsundara Havanur 
  Jan Beulich 
  Julien Grall 
  Roger Pau Monné 
  Wei Liu 

jobs:
 build-amd64-xsm  pass
 build-arm64-xsm 

[xen-unstable-smoke test] 149839: tolerable all pass - PUSHED

2020-04-27 Thread osstest service owner
flight 149839 xen-unstable-smoke real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149839/

Failures :-/ but no regressions.

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass

version targeted for testing:
 xen  dd0882f912005b56653013025ebd160862e360ad
baseline version:
 xen  df669de074c395a3b2eeb975fddd3da4c148da13

Last test of basis   149837  2020-04-27 14:00:43 Z0 days
Testing same since   149839  2020-04-27 21:01:50 Z0 days1 attempts


People who touched revisions under test:
  Andrew Cooper 
  Jan Beulich 

jobs:
 build-arm64-xsm  pass
 build-amd64  pass
 build-armhf  pass
 build-amd64-libvirt  pass
 test-armhf-armhf-xl  pass
 test-arm64-arm64-xl-xsm  pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64pass
 test-amd64-amd64-libvirt pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Explanation of these reports, and of osstest in general, is at
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README.email;hb=master
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README;hb=master

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

To xenbits.xen.org:/home/xen/git/xen.git
   df669de074..dd0882f912  dd0882f912005b56653013025ebd160862e360ad -> smoke



Re: arm: DomU Networking enable leads to Dom0 kernel oops

2020-04-27 Thread Stefano Stabellini
On Sat, 25 Apr 2020, Julien Grall wrote:
> On Sat, 25 Apr 2020 at 10:49, Julien Grall  wrote:
> >
> > On Sat, 25 Apr 2020 at 03:01, Stefano Stabellini  
> > wrote:
> > > [   86.900974] [ cut here ]
> > > [   86.905134] Interrupt for port 6, but apparently not enabled; per-user 
> > > (ptrval)
> > > [   86.913228] WARNING: CPU: 0 PID: 2437 at drivers/xen/evtchn.c:167 
> > > evtchn_interrupt+0xfc/0x108
> >
> > The implementation of the evtchn_interrupt() is relying to be called
> > in the top-half. On RT, interrupts handlers are forced to be threaded
> > and use the IRQF_ONESHOT semantics if they were not threaded before.
> >
> > However, IRQF_ONESHOT is completely broken for event channels (this is
> > not RT's fault) and hence why you see the warning here.
> >
> > Note that you can't force to run evtchn_interrupt() in the top-half
> > because it relies on functions that may sleep.
> >
> > See https://lkml.org/lkml/2019/2/19/642.
> 
> Here at better link with the full conversation:
> 
> https://lore.kernel.org/lkml/5e256d9a-572c-e01e-7706-407f99245...@arm.com/

Many thanks for pointing it out to me. I think I know what the problem
is. I replied to that thread with a patch that fixes my LinuxRT issue on ARM:

https://marc.info/?l=xen-devel=158802965821440=2



Re: [Xen-devel] xen/evtchn and forced threaded irq

2020-04-27 Thread Stefano Stabellini
On Thu, 21 Feb 2019, Julien Grall wrote:
> Hi Roger,
> 
> On Thu, 21 Feb 2019, 08:08 Roger Pau Monné,  wrote:
>   FWIW, you can also mask the interrupt while waiting for the thread to
>   execute the interrupt handler. Ie:
> 
> 
> Thank you for providing steps, however where would the masking be done? By 
> the irqchip or a custom solution?
> 
> 
>   1. Interrupt injected
>   2. Execute guest event channel callback
>   3. Scan for pending interrupts
>   4. Mask interrupt
>   5. Clear pending field
>   6. Queue threaded handler
>   7. Go to 3 until all interrupts are drained
>   [...]
>   8. Execute interrupt handler in thread
>   9. Unmask interrupt
> 
>   That should prevent you from stacking interrupts?

Sorry for coming late to the thread, and thanks Julien for pointing it
out to me. I am afraid I was the one to break the flow back in 2011 with
the following commit:

  7e186bdd0098 xen: do not clear and mask evtchns in __xen_evtchn_do_upcall

Oops :-)


Xen event channels have their own workflow; the one Roger wrote above.
They used to be handled using handle_fasteoi_irq until 7e186bdd0098,
then I switched (almost) all of them to handle_edge_irq.

Looking closely at irq handling again, it doesn't look like we can do
what we need with handle_edge_irq today: we can't mask the event channel
before clearing it. But we can do that if we go back to using
handle_fasteoi_irq.

In fact, I managed to verify that LinuxRT works fine as dom0 with the
attached dynamic.patch that switches back xen_dynamic_chip IRQs to
handle_fasteoi_irq.


>From the rest of this thread, it looks like the issue might appear with
PIRQs as well. Thus, I wrote a second patch pirqs.patch to switch back
to handle_fasteoi_irq PIRQs as well. However, Xen on ARM does not use
PIRQs so I couldn't test it at all. I would appreciate if Boris/Juegen
tested it. Let me know what you want me to do with the second patch.From ce26c371a8ff7b49c98a3b8c7b57199154cbca59 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini 
Date: Mon, 27 Apr 2020 16:15:26 -0700
Subject: [PATCH] xen: use handle_fasteoi_irq to handle xen events

When handling Xen events, we need to make sure the following sequence is
followed:

- mask event
- handle event and clear event (the order does not matter)
- unmask event

It is not possible to implement this flow with handle_edge_irq, so
switch back to handle_fasteoi_irq. Please note that Xen event irqs are
ONESHOT. Also note that handle_fasteoi_irq was in-use before the
following commit, that is partially reverted by this patch:

7e186bdd0098 xen: do not clear and mask evtchns in __xen_evtchn_do_upcall

PIRQ handling is left unchanged.

This patch fixes a domU hang observed when using LinuxRT as dom0 kernel.

Link: https://lore.kernel.org/lkml/5e256d9a-572c-e01e-7706-407f99245...@arm.com/
Signed-off-by: Stefano Stabellini 
---
 drivers/xen/events/events_base.c | 13 +++--
 1 file changed, 3 insertions(+), 10 deletions(-)

diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 499eff7d3f65..5f9b8104dbcf 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -845,7 +845,7 @@ int bind_evtchn_to_irq(unsigned int evtchn)
 			goto out;
 
 		irq_set_chip_and_handler_name(irq, _dynamic_chip,
-	  handle_edge_irq, "event");
+	  handle_fasteoi_irq, "event");
 
 		ret = xen_irq_info_evtchn_setup(irq, evtchn);
 		if (ret < 0) {
@@ -978,7 +978,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu, bool percpu)
 		  handle_percpu_irq, "virq");
 		else
 			irq_set_chip_and_handler_name(irq, _dynamic_chip,
-		  handle_edge_irq, "virq");
+		  handle_fasteoi_irq, "virq");
 
 		bind_virq.virq = virq;
 		bind_virq.vcpu = xen_vcpu_nr(cpu);
@@ -1377,12 +1377,6 @@ static void ack_dynirq(struct irq_data *data)
 		clear_evtchn(evtchn);
 }
 
-static void mask_ack_dynirq(struct irq_data *data)
-{
-	disable_dynirq(data);
-	ack_dynirq(data);
-}
-
 static int retrigger_dynirq(struct irq_data *data)
 {
 	unsigned int evtchn = evtchn_from_irq(data->irq);
@@ -1585,8 +1579,7 @@ static struct irq_chip xen_dynamic_chip __read_mostly = {
 	.irq_mask		= disable_dynirq,
 	.irq_unmask		= enable_dynirq,
 
-	.irq_ack		= ack_dynirq,
-	.irq_mask_ack		= mask_ack_dynirq,
+	.irq_eoi		= ack_dynirq,
 
 	.irq_set_affinity	= set_affinity_irq,
 	.irq_retrigger		= retrigger_dynirq,
-- 
2.17.1

diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c
index 5f9b8104dbcf..57a29c94fefc 100644
--- a/drivers/xen/events/events_base.c
+++ b/drivers/xen/events/events_base.c
@@ -498,12 +498,6 @@ static void eoi_pirq(struct irq_data *data)
 	}
 }
 
-static void mask_ack_pirq(struct irq_data *data)
-{
-	disable_dynirq(data);
-	eoi_pirq(data);
-}
-
 static unsigned int __startup_pirq(unsigned int irq)
 {
 	struct evtchn_bind_pirq bind_pirq;
@@ -684,13 +678,9 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi,
 	}
 

[xen-4.13-testing test] 149836: tolerable FAIL - PUSHED

2020-04-27 Thread osstest service owner
flight 149836 xen-4.13-testing real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149836/

Failures :-/ but no regressions.

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-rtds 18 guest-localmigrate/x10   fail  like 149664
 test-armhf-armhf-xl-rtds 16 guest-start/debian.repeatfail  like 149664
 test-amd64-i386-xl-pvshim12 guest-start  fail   never pass
 test-amd64-i386-libvirt  13 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-i386-libvirt-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  14 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-amd64-libvirt-vhd 12 migrate-support-checkfail   never pass
 test-amd64-amd64-qemuu-nested-amd 17 debian-hvm-install/l1/l2  fail never pass
 test-arm64-arm64-xl  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  14 saverestore-support-checkfail   never pass
 test-amd64-amd64-xl-qemuu-win7-amd64 17 guest-stop fail never pass
 test-armhf-armhf-libvirt-raw 12 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 13 saverestore-support-checkfail   never pass
 test-amd64-i386-xl-qemuu-win7-amd64 17 guest-stop  fail never pass
 test-amd64-i386-xl-qemut-win7-amd64 17 guest-stop  fail never pass
 test-armhf-armhf-xl-rtds 13 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt 13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 13 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 14 saverestore-support-checkfail never pass
 test-armhf-armhf-xl-multivcpu 13 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 14 saverestore-support-checkfail  never pass
 test-amd64-i386-xl-qemut-ws16-amd64 17 guest-stop  fail never pass
 test-armhf-armhf-xl-vhd  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  13 saverestore-support-checkfail   never pass
 test-amd64-amd64-xl-qemut-win7-amd64 17 guest-stop fail never pass
 test-amd64-i386-xl-qemuu-ws16-amd64 17 guest-stop  fail never pass
 test-amd64-amd64-xl-qemuu-ws16-amd64 17 guest-stop fail never pass
 test-amd64-amd64-xl-qemut-ws16-amd64 17 guest-stop fail never pass

version targeted for testing:
 xen  35b80b2a011416383466f21e32cb72cf73df491b
baseline version:
 xen  b66ce5058ec9ce84418cedd39b2bf07b7c5a1f65

Last test of basis   149664  2020-04-14 23:36:10 Z   12 days
Testing same since   149836  2020-04-27 13:36:20 Z0 days1 attempts


People who touched revisions under test:
  Andrew Cooper 
  Anthony PERARD 
  Dario Faggioli 
  Harsha Shamsundara Havanur 
  Jan Beulich 
  Jeff Kubascik 
  Julien Grall 

[xen-unstable test] 149835: regressions - FAIL

2020-04-27 Thread osstest service owner
flight 149835 xen-unstable real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149835/

Regressions :-(

Tests which did not succeed and are blocking,
including tests which could not be run:
 test-amd64-amd64-examine  4 memdisk-try-append   fail REGR. vs. 149831

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-rtds 18 guest-localmigrate/x10  fail blocked in 149831
 test-amd64-amd64-xl-qemuu-ws16-amd64 17 guest-stopfail like 149831
 test-amd64-i386-xl-qemut-win7-amd64 17 guest-stop fail like 149831
 test-amd64-amd64-xl-qemut-win7-amd64 17 guest-stopfail like 149831
 test-armhf-armhf-libvirt-raw 13 saverestore-support-checkfail  like 149831
 test-armhf-armhf-libvirt 14 saverestore-support-checkfail  like 149831
 test-amd64-i386-xl-qemuu-win7-amd64 17 guest-stop fail like 149831
 test-amd64-amd64-xl-qemut-ws16-amd64 17 guest-stopfail like 149831
 test-amd64-amd64-xl-qemuu-win7-amd64 17 guest-stopfail like 149831
 test-amd64-i386-xl-qemuu-ws16-amd64 17 guest-stop fail like 149831
 test-amd64-i386-xl-pvshim12 guest-start  fail   never pass
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-xsm  13 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  14 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-amd64-qemuu-nested-amd 17 debian-hvm-install/l1/l2  fail never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  13 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 14 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 13 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 14 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl-credit2  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 13 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 14 saverestore-support-checkfail never pass
 test-amd64-i386-xl-qemut-ws16-amd64 17 guest-stop  fail never pass
 test-armhf-armhf-xl-vhd  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  13 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 12 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt 13 migrate-support-checkfail   never pass

version targeted for testing:
 xen  8d4c17a90b0a9b3d82628461090a54a2c7897154
baseline version:
 xen  f093b08c47b39da6019421a2b61d40745b3e573b

Last test of basis   149831  2020-04-27 01:52:33 Z0 days
Testing same since   149835  2020-04-27 12:07:17 Z0 days1 attempts


People who touched revisions under test:
  Anthony PERARD 
  Jan Beulich 
  Julien Grall 
  Stefano 

Re: [PATCH] x86/ioemul: Rewrite stub generation

2020-04-27 Thread Andrew Cooper
On 27/04/2020 16:28, Jan Beulich wrote:
> On 27.04.2020 14:20, Andrew Cooper wrote:
>> The logic is completely undocumented and almost impossible to follow.  It
>> actually uses return oriented programming.  Rewrite it to conform to more
>> normal call mechanics, and leave a big comment explaining thing.  As well as
>> the code being easier to follow, it will execute faster as it isn't fighting
>> the branch predictor.
>>
>> Move the ioemul_handle_quirk() function pointer from traps.c to
>> ioport_emulate.c.  There is no reason for it to be in neither of the two
>> translation units which use it.  Alter the behaviour to return the number of
>> bytes written into the stub.
>>
>> Access the addresses of the host/guest helpers with extern const char arrays.
>> Nothing good will come of C thinking they are regular functions.
> I agree with the C aspect, but imo the assembly routines should,
> with the changes you make, be marked as being ordinary functions.

Despite the changes, they are still very much not ordinary functions,
and cannot be used by C.

I have no objection to marking them as STT_FUNCTION (as this doesn't
mean C function), but...

> A reasonable linker would then warn about the C file wanting an
> STT_OBJECT while the assembly file provides an STT_FUNC. I'd
> therefore prefer, along with marking the functions as such, to
> have them also declared as functions in C.

... there is literally nothing safe which C can do with them other than
manipulate their address.

Writing it like this is specifically prevents something from compiling
which will explode at runtime, is someone is naive enough to try using
the function from C.

>> --- a/xen/arch/x86/ioport_emulate.c
>> +++ b/xen/arch/x86/ioport_emulate.c
>> @@ -8,7 +8,10 @@
>>  #include 
>>  #include 
>>  
>> -static bool ioemul_handle_proliant_quirk(
>> +unsigned int (*ioemul_handle_quirk)(
>> +u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
> Would you mind adding __read_mostly on this occasion?
>
>> @@ -19,18 +22,16 @@ static bool ioemul_handle_proliant_quirk(
>>  0xa8, 0x80, /*test $0x80, %al */
>>  0x75, 0xfb, /*jnz 1b  */
>>  0x9d,   /*popf*/
>> -0xc3,   /*ret */
>>  };
>>  uint16_t port = regs->dx;
>>  uint8_t value = regs->al;
>>  
>>  if ( (opcode != 0xee) || (port != 0xcd4) || !(value & 0x80) )
>> -return false;
>> +return 0;
>>  
>>  memcpy(io_emul_stub, stub, sizeof(stub));
>> -BUILD_BUG_ON(IOEMUL_QUIRK_STUB_BYTES < sizeof(stub));
> So you treat a build failure for a runtime crash.

I presume you mean s/treat/trade/ here, and the answer is no, not really.

There is nothing which actually forced a connection between the build
time checks and runtime behaviour, so it was only a facade of safety,
not real safety.

>  I can see the
> advantages of the new approach, but the original got away with
> less stub space.

Stub space doesn't matter, so long as it fits.  In particular, ...

> If our L1_CACHE_SHIFT wasn't hard coded to 7
> just to cover some unusual CPUs, your new approach would, if I
> got the counting and calculations right, not work (with a value
> resulting in a 64-byte cache line size).

... the SYSCALL stubs use 64 bytes so Xen cannot function with a shift
less than 7.

> Introducing a Kconfig
> option for this should imo not come with a need to re-work the
> logic here again. Therefore I'd like us to think about a way
> to make the space needed not exceed 32 bytes.

And why would we ever want an option like that?  (I know how you're
going to answer this, so I'm going to pre-emptively point out that there
are hundreds of kilobytes of easier-to-shrink per-cpu data structures
than this one).

Honestly, this suggestion is a total waste of time and effort.  It is an
enormous amount of complexity to micro-optimise a problem which doesn't
exist in the first place.

The stubs are already 128 bytes per CPU and cannot be made smaller. 
Attempting to turn this particular stub into <32 has no benefit (the
stubs don't actually get smaller), and major costs.

~Andrew



Re: [PATCH] x86: refine guest_mode()

2020-04-27 Thread Andrew Cooper
On 27/04/2020 16:15, Jan Beulich wrote:
> On 27.04.2020 16:35, Andrew Cooper wrote:
>> On 27/04/2020 09:03, Jan Beulich wrote:
>>> The 2nd of the assertions as well as the macro's return value have been
>>> assuming we're on the primary stack. While for most IST exceptions we
>>> eventually switch back to the main one,
>> "we switch to the main one when interrupting user mode".
>>
>> "eventually" isn't accurate as it is before we enter C.
> Right, will change.
>
>>> --- a/xen/include/asm-x86/regs.h
>>> +++ b/xen/include/asm-x86/regs.h
>>> @@ -10,9 +10,10 @@
>>>  /* Frame pointer must point into current CPU stack. */ 
>>>\
>>>  ASSERT(diff < STACK_SIZE); 
>>>\
>>>  /* If not a guest frame, it must be a hypervisor frame. */ 
>>>\
>>> -ASSERT((diff == 0) || (r->cs == __HYPERVISOR_CS)); 
>>>\
>>> +if ( diff < PRIMARY_STACK_SIZE )   
>>>\
>>> +ASSERT(!diff || ((r)->cs == __HYPERVISOR_CS)); 
>>>\
>>>  /* Return TRUE if it's a guest frame. */   
>>>\
>>> -(diff == 0);   
>>>\
>>> +!diff || ((r)->cs != __HYPERVISOR_CS); 
>>>\
>> The (diff == 0) already worried me before because it doesn't fail safe,
>> but this makes things more problematic.  Consider the case back when we
>> had __HYPERVISOR_CS32.
> Yes - if __HYPERVISOR_CS32 would ever have been to be used for
> anything, it would have needed checking for here.
>
>> Guest mode is strictly "(r)->cs & 3".
> As long as CS (a) gets properly saved (it's a "manual" step for
> SYSCALL/SYSRET as well as #VMEXIT) and (b) didn't get clobbered. I
> didn't write this code, I don't think, so I can only guess that
> there were intentions behind this along these lines.

Hmm - the VMExit case might be problematic here, due to the variability
in the poison used.

>
>> Everything else is expectations about how things ought to be laid out,
>> but for safety in release builds, the final judgement should not depend
>> on the expectations evaluating true.
> Well, I can switch to a purely CS.RPL based approach, as long as
> we're happy to live with the possible downside mentioned above.
> Of course this would then end up being a more intrusive change
> than originally intended ...

I'd certainly prefer to go for something which is more robust, even if
it is a larger change.

~Andrew



Re: [PATCH 1/3] x86/pv: Options to disable and/or compile out 32bit PV support

2020-04-27 Thread Andrew Cooper
On 24/04/2020 06:28, Jürgen Groß wrote:
> On 23.04.20 19:35, Andrew Cooper wrote:
>> On 21/04/2020 07:02, Jan Beulich wrote:
>>> On 20.04.2020 20:05, Andrew Cooper wrote:
 On 20/04/2020 15:05, Jan Beulich wrote:
> I'm in particular
> concerned that we may gain a large number of such printk()s over
> time, if we added them in such cases.
 The printk() was a bit of an afterthought, but deliberately
 avoiding the
 -EINVAL path was specifically not.

 In the case that the user tries to use `pv=no-32` without CONFIG_PV32,
 they should see something other than

 (XEN) parameter "pv=no-32" unknown!
>>> Why - to this specific build of Xen the parameter is unknown.
>>
>> Because it is unnecessarily problematic and borderline obnoxious to
>> users, as well as occasional Xen developers.
>>
>> "you've not got the correct CONFIG_$X for that to be meaningful" is
>> specifically useful to separate from "I've got no idea".
>>
 I don't think overloading the return value is a clever move, because
 then every parse function has to take care of ensuring that
 -EOPNOTSUPP
 (or ENODEV?) never clobbers -EINVAL.
>>> I didn't suggest overloading the return value. Instead I
>>> specifically want this to go the -EINVAL path.
>>>
 We could have a generic helper which looks like:

 static inline void ignored_param(const char *cfg, const char *name,
 const char *s, const char *ss)
 {
  printk(XENLOG_INFO "%s disabled - ignoring '%s=%*.s' setting\n",
 cfg, name, s, (int)(ss - s));
 }

 which at least would keep all the users consistent.
>>> Further bloating the binary with (almost) useless string literals.
>>> I'd specifically like to avoid this.
>>
>> I don't accept that as a valid argument.
>>
>> We're talking about literally tens of bytes (which will merge anyway, so
>> 0 in practice), and a resulting UI which helps people get out of
>> problems rather than penalises them for having gotten into a problem to
>> begin with.
>>
>> I will absolutely prioritise a more helpful UI over a handful of bytes.
>
> What about a kconfig option (defaulting to "yes") enabling this feature?

IMO, that's literally not worth the bytes taken to implement.

> That way an embedded variant can be made smaller while a server one is
> more user friendly.

There is far lower hanging fruit for an embedded usecase, and its not at
all a foregone conclusion that such a usecase would want the less user
friendly version.  (Embedded very definitely doesn't mean that it won't
have users interacting with the command line).

I would certainly recommend against attempting to speculatively fix
something which isn't a problem, based on guesswork about what a
hypothetical group of people might want while totally ignoring the far
larger savings to be gained by e.g. making CONFIG_INTEL/AMD work.

~Andrew



Re: [PATCH 01/12] libxc/save: Shrink code volume where possible

2020-04-27 Thread Andrew Cooper
On 27/04/2020 20:55, Wei Liu wrote:
> On Mon, Apr 27, 2020 at 06:19:37PM +0100, Ian Jackson wrote:
>> Andrew Cooper writes ("Re: [PATCH 01/12] libxc/save: Shrink code volume 
>> where possible"):
>>> On 14/01/2020 16:48, Ian Jackson wrote:
 Andrew Cooper writes ("[PATCH 01/12] libxc/save: Shrink code volume where 
 possible"):
> A property of how the error handling (0 on success, nonzero otherwise)
> allows these calls to be chained together with the ternary operatior.
 I'm quite surprised to find a suggestion like this coming from you in
 particular.
>>> What probably is relevant is that ?: is a common construct in the
>>> hypervisor, which I suppose does colour my expectation of people knowing
>>> exactly what it means and how it behaves.
>> I expect other C programmers to know what ?: does, too.  But I think
>> using it to implement the error monad is quite unusual.  I asked
>> around a bit and my feeling is still that this isn't an improvement.
>>
 Or just to permit
rc = write_one_vcpu_basic(ctx, i);if (rc) goto error;
 (ie on a single line).
>>> OTOH, it should come as no surprise that I'd rather drop this patch
>>> entirely than go with these alternatives, both of which detract from
>>> code clarity. The former for hiding control flow, and the latter for
>>> being atypical layout which unnecessary cognitive load to follow.
>> I think, then, that it would be best to drop this patch, unless Wei
>> (or someone else) disagrees with me.
> I don't feel strongly either way.

I'm confused... I dropped this 3 and a half months ago, because it was
blindingly obvious it was going nowhere.

This is the v1 series which was totally superseded by the v2 series also
posted in January.

~Andrew



Re: [PATCH 01/12] libxc/save: Shrink code volume where possible

2020-04-27 Thread Wei Liu
On Mon, Apr 27, 2020 at 06:19:37PM +0100, Ian Jackson wrote:
> Andrew Cooper writes ("Re: [PATCH 01/12] libxc/save: Shrink code volume where 
> possible"):
> > On 14/01/2020 16:48, Ian Jackson wrote:
> > > Andrew Cooper writes ("[PATCH 01/12] libxc/save: Shrink code volume where 
> > > possible"):
> > >> A property of how the error handling (0 on success, nonzero otherwise)
> > >> allows these calls to be chained together with the ternary operatior.
> > > I'm quite surprised to find a suggestion like this coming from you in
> > > particular.
> > 
> > What probably is relevant is that ?: is a common construct in the
> > hypervisor, which I suppose does colour my expectation of people knowing
> > exactly what it means and how it behaves.
> 
> I expect other C programmers to know what ?: does, too.  But I think
> using it to implement the error monad is quite unusual.  I asked
> around a bit and my feeling is still that this isn't an improvement.
> 
> > > Or just to permit
> > >rc = write_one_vcpu_basic(ctx, i);if (rc) goto error;
> > > (ie on a single line).
> > 
> > OTOH, it should come as no surprise that I'd rather drop this patch
> > entirely than go with these alternatives, both of which detract from
> > code clarity. The former for hiding control flow, and the latter for
> > being atypical layout which unnecessary cognitive load to follow.
> 
> I think, then, that it would be best to drop this patch, unless Wei
> (or someone else) disagrees with me.

I don't feel strongly either way.

Wei.



Re: [PATCH 2/2] x86: drop high compat r/o M2P table address range

2020-04-27 Thread Wei Liu
On Wed, Apr 15, 2020 at 10:23:58AM +0200, Jan Beulich wrote:
> Now that we don't properly hook things up into the page tables anymore
> we also don't need to set aside an address range. Drop it, using
> compat_idle_pg_table_l2[] simply (explicitly) from slot 0.
> 
> While doing the re-arrangement, which is accompanied by the dropping or
> replacing of some local variables, restrict the scopes of some further
> ones at the same time.
> 
> Signed-off-by: Jan Beulich 

Reviewed-by: Wei Liu 

> ---
> TBD: With the changed usage perhaps we want to also rename
>  compat_idle_pg_table_l2[] (to e.g. compat_idle_l2_entries[])?

No opinion on this.



Re: Xen network domain performance for 10Gb NIC

2020-04-27 Thread tosher 1
The driver domain is HVM. Both the driver domain and 






On Monday, April 27, 2020, 1:28:13 AM EDT, Jürgen Groß  wrote: 




> Is the driver domain PV or HVM?

The driver domain is HVM.

> How many vcpus do dom0, the driver domain and the guest have?

Dom0 has 12 vcpus, pinned. Both the driver domain and the guest have 4 vcpus, 
pinned as well.


Juergen



Re: [PATCH v2 1/1] golang/xenlight: add NameToDomid and DomidToName util functions

2020-04-27 Thread Nick Rosbrook
On Mon, Apr 27, 2020 at 8:59 AM George Dunlap  wrote:
>
>
>
> > On Apr 24, 2020, at 4:02 AM, Nick Rosbrook  wrote:
> >
> > Many exported functions in xenlight require a domid as an argument. Make
> > it easier for package users to use these functions by adding wrappers
> > for the libxl utility functions libxl_name_to_domid and
> > libxl_domid_to_name.
> >
> > Signed-off-by: Nick Rosbrook 
> > ---
> > tools/golang/xenlight/xenlight.go | 38 ++-
> > 1 file changed, 37 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/golang/xenlight/xenlight.go 
> > b/tools/golang/xenlight/xenlight.go
> > index 6b4f492550..d1d30b63e1 100644
> > --- a/tools/golang/xenlight/xenlight.go
> > +++ b/tools/golang/xenlight/xenlight.go
> > @@ -21,13 +21,13 @@ package xenlight
> > #cgo LDFLAGS: -lxenlight -lyajl -lxentoollog
> > #include 
> > #include 
> > +#include 
> >
> > static const libxl_childproc_hooks childproc_hooks = { .chldowner = 
> > libxl_sigchld_owner_mainloop };
> >
> > void xenlight_set_chldproc(libxl_ctx *ctx) {
> >   libxl_childproc_setmode(ctx, _hooks, NULL);
> > }
> > -
> > */
> > import "C"
> >
> > @@ -75,6 +75,10 @@ var libxlErrors = map[Error]string{
> >   ErrorFeatureRemoved:   "Feature removed",
> > }
> >
> > +const (
> > + DomidInvalid Domid = ^Domid(0)
>
> Not to be a stickler, but are we positive that this will always result in the 
> same value as C.INVALID_DOMID?
>
> There are some functions that will return INVALID_DOMID, or accept 
> INVALID_DOMID as an argument.  Users of the `xenlight` package will 
> presumably need to compare against this value and/or pass it in.
>
> It seems like we could:
>
> 1. Rely on language lawyering to justify our assumption that ^Domid(0) will 
> always be the same as C “~0”
>
> 2. On marshalling into / out of C, convert C.INVALID_DOMID to DomidInvalid
>
> 3. Set Domid = C.INVALID_DOMID

I just tested this way, and it does not work as expected. Since
C.INVALID_DOMID is #define'd, cgo does not know it is intended to be
used as uint32_t, and decides to declare C.INVALID_DOMID as int. So,
C.INVALID_DOMID = ^int(0) = -1, which overflows Domid.

I tried a few things in the cgo preamble without any luck.
Essentially, one cannot define a const uint32_t in C space, and use
that to define a const in Go space.

Any ideas?

-NR



[xen-unstable-smoke test] 149837: tolerable all pass - PUSHED

2020-04-27 Thread osstest service owner
flight 149837 xen-unstable-smoke real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149837/

Failures :-/ but no regressions.

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass

version targeted for testing:
 xen  df669de074c395a3b2eeb975fddd3da4c148da13
baseline version:
 xen  8d4c17a90b0a9b3d82628461090a54a2c7897154

Last test of basis   149834  2020-04-27 08:01:51 Z0 days
Testing same since   149837  2020-04-27 14:00:43 Z0 days1 attempts


People who touched revisions under test:
  George Dunlap 
  Nick Rosbrook 
  Nick Rosbrook 
  Wei Liu 

jobs:
 build-arm64-xsm  pass
 build-amd64  pass
 build-armhf  pass
 build-amd64-libvirt  pass
 test-armhf-armhf-xl  pass
 test-arm64-arm64-xl-xsm  pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64pass
 test-amd64-amd64-libvirt pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Explanation of these reports, and of osstest in general, is at
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README.email;hb=master
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README;hb=master

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

To xenbits.xen.org:/home/xen/git/xen.git
   8d4c17a90b..df669de074  df669de074c395a3b2eeb975fddd3da4c148da13 -> smoke



Re: [PATCH 01/12] libxc/save: Shrink code volume where possible

2020-04-27 Thread Ian Jackson
Andrew Cooper writes ("Re: [PATCH 01/12] libxc/save: Shrink code volume where 
possible"):
> On 14/01/2020 16:48, Ian Jackson wrote:
> > Andrew Cooper writes ("[PATCH 01/12] libxc/save: Shrink code volume where 
> > possible"):
> >> A property of how the error handling (0 on success, nonzero otherwise)
> >> allows these calls to be chained together with the ternary operatior.
> > I'm quite surprised to find a suggestion like this coming from you in
> > particular.
> 
> What probably is relevant is that ?: is a common construct in the
> hypervisor, which I suppose does colour my expectation of people knowing
> exactly what it means and how it behaves.

I expect other C programmers to know what ?: does, too.  But I think
using it to implement the error monad is quite unusual.  I asked
around a bit and my feeling is still that this isn't an improvement.

> > Or just to permit
> >rc = write_one_vcpu_basic(ctx, i);if (rc) goto error;
> > (ie on a single line).
> 
> OTOH, it should come as no surprise that I'd rather drop this patch
> entirely than go with these alternatives, both of which detract from
> code clarity. The former for hiding control flow, and the latter for
> being atypical layout which unnecessary cognitive load to follow.

I think, then, that it would be best to drop this patch, unless Wei
(or someone else) disagrees with me.

Sorry,
Ian.



[PATCH] mem_sharing: map shared_info page to same gfn during fork

2020-04-27 Thread Tamas K Lengyel
During a VM fork we copy the shared_info page; however, we also need to ensure
that the page is mapped into the same GFN in the fork as its in the parent.

Signed-off-by: Tamas K Lengyel 
Suggested-by: Roger Pau Monne 
---
 xen/arch/x86/mm/mem_sharing.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/xen/arch/x86/mm/mem_sharing.c b/xen/arch/x86/mm/mem_sharing.c
index 344a5bfb3d..acbf21b22c 100644
--- a/xen/arch/x86/mm/mem_sharing.c
+++ b/xen/arch/x86/mm/mem_sharing.c
@@ -1656,6 +1656,7 @@ static void copy_tsc(struct domain *cd, struct domain *d)
 static int copy_special_pages(struct domain *cd, struct domain *d)
 {
 mfn_t new_mfn, old_mfn;
+gfn_t old_gfn;
 struct p2m_domain *p2m = p2m_get_hostp2m(cd);
 static const unsigned int params[] =
 {
@@ -1701,6 +1702,34 @@ static int copy_special_pages(struct domain *cd, struct 
domain *d)
 new_mfn = _mfn(virt_to_mfn(cd->shared_info));
 copy_domain_page(new_mfn, old_mfn);
 
+old_gfn = _gfn(get_gpfn_from_mfn(mfn_x(old_mfn)));
+
+if ( !gfn_eq(old_gfn, INVALID_GFN) )
+{
+/* let's make sure shared_info is mapped to the same gfn */
+gfn_t new_gfn = _gfn(get_gpfn_from_mfn(mfn_x(new_mfn)));
+
+if ( !gfn_eq(new_gfn, INVALID_GFN) && !gfn_eq(old_gfn, new_gfn) )
+{
+/* if shared info is mapped to a different gfn just remove it */
+rc = p2m->set_entry(p2m, new_gfn, INVALID_MFN, PAGE_ORDER_4K,
+p2m_invalid, p2m->default_access, -1);
+if ( rc )
+return rc;
+
+new_gfn = INVALID_GFN;
+}
+
+if ( gfn_eq(new_gfn, INVALID_GFN) )
+{
+/* if shared info is not currently mapped then map it */
+rc = p2m->set_entry(p2m, old_gfn, new_mfn, PAGE_ORDER_4K,
+p2m_ram_rw, p2m->default_access, -1);
+if ( rc )
+return rc;
+}
+}
+
 return 0;
 }
 
-- 
2.20.1




Re: [PATCH v7 08/12] xen: add /buildinfo/config entry to hypervisor filesystem

2020-04-27 Thread George Dunlap


> On Apr 27, 2020, at 4:40 PM, Jürgen Groß  wrote:
> 
> Stefano,
> 
> On 06.04.20 14:29, Jan Beulich wrote:
>> On 03.04.2020 17:45, Jürgen Groß wrote:
>>> On 03.04.20 17:33, Jan Beulich wrote:
 On 03.04.2020 17:12, Jürgen Groß wrote:
> On 03.04.20 16:31, Jan Beulich wrote:
>> On 02.04.2020 17:46, Juergen Gross wrote:
>>> --- a/xen/common/Kconfig
>>> +++ b/xen/common/Kconfig
>>> @@ -353,6 +353,16 @@ config DOM0_MEM
>>>Leave empty if you are not sure what to specify.
>>>+config HYPFS_CONFIG
>>> +bool "Provide hypervisor .config via hypfs entry"
>>> +default y
>> 
>> My initial reaction was to ask for "depends on HYPFS", but then
>> I noticed the earlier patch doesn't introduce such. Am I
>> mis-remembering that it was agreed to make the whole thing
>> possible to disable at least in EXPERT mode?
> 
> No, I don't remember that agreement.
> 
> And TBH I'm not sure this is a good idea, as that would at once make the
> plan to replace at least some sysctl and/or domctl interfaces impossible
> (like e.g. the last 3 patches of the series are doing already), or at
> least would tie the related functionality to CONFIG_HYPFS.
 
 I think that would be fine - that's what config setting are for.
 Someone caring about space may not care about runtime setting of
 parameters.
>>> 
>>> So right now it would start with a plain hypfs available or not.
>>> 
>>> The next step would be in patch 12 to tell the user he will lose the
>>> capability of setting runtime parameters.
>>> 
>>> Another planned extension would be to control per-cpupool settings,
>>> which would the go away (possibly functionality being unconditionally
>>> available today).
>>> 
>>> Next would be the lack of being able to control per-domain mitigations
>>> like XPTI or L1TF, which I'd like to add.
>>> 
>>> Another thing I wanted to add is some debugging stuff (e.g. to switch
>>> lock profiling using hypfs).
>>> 
>>> And the list will go on.
>> Understood.
>>> Does it really make sense to make a central control and information
>>> interface conditional?
>> None of the above may be of interest to e.g. embedded use cases.
>>> I'd like at least a second opinion on that topic.
>> Yes, further opinions would surely help.
> 
> Any opinion on making hypfs configurable?
> 
> It would be quite some code churn and I want to avoid it in case you
> see no benefit in configuring it out for embedded.

Just to reply with what I said on IRC:

First of all, if it were free, I think that having CONFIG_HYPFS would be 
valuable.  Sub-options should be made available on a case-by-case basis; I 
think CONFIG_HYPFS_CONFIG would be valuable, but I don’t see any point in 
having CONFIG_HYPFS_RUNTIME_PARAMETER.

However,  Juergen seems to think it would require a lot of churn to the current 
series.  I don’t quite understand why; but supposing that’s so: In general, the 
people who want a feature should be the ones who do the work to make it.  I 
think it would be perfectly reasonable for Juergen to say, “This is a lot of 
work that I don’t have time to do before the 4.14 release; if people want to be 
able to disable this feature, they can post their own patches.”  (It’s also 
perfectly reasonable for him to do the work himself just to be helpful.)

The discussion about CONFIG_EXPERT is sort of the same.  If we have 
CONFIG_HYPFS, it would obviously be more valuable if it were *not* in 
CONFIG_EXPERT, so that people could change it while still being security 
supported.

If Jan is OK with it simply being outside CONFIG_EXPERT, then great.  But if he 
insists on some kind of testing for it to be outside of CONFIG_EXPERT, then 
again, the people who want it to be security supported should be the ones who 
do the work to make it happen.

Hope that makes sense. :-)

 -George

Re: [PATCH] x86/ioemul: Rewrite stub generation

2020-04-27 Thread Roger Pau Monné
On Mon, Apr 27, 2020 at 05:18:52PM +0100, Andrew Cooper wrote:
> On 27/04/2020 16:18, Roger Pau Monné wrote:
> > On Mon, Apr 27, 2020 at 01:20:41PM +0100, Andrew Cooper wrote:
> >> +/* Helpers - Read outer scope but only modify p. */
> >> +#define APPEND_BUFF(b) ({ memcpy(p, b, sizeof(b)); p += sizeof(b); })
> >> +#define APPEND_CALL(f)  \
> >> +({  \
> >> +long disp = (long)(f) - (stub_va + p - ctxt->io_emul_stub + 5); \
> >> +BUG_ON((int32_t)disp != disp);  \
> > I'm not sure I get the point of using signed integers instead of
> > unsigned ones, AFAICT you just want to check that the displacement is
> > < 4GB so that a relative call can be used?
> 
> Displacements are +/- 2G, not <4G.
> 
> Using unsigned here would be buggy.

Right, sorry for the noise:

Reviewed-by: Roger Pau Monné 

With the minor nits pointed above in the ioemul_handle_quirk.

Thanks, Roger.



Re: [PATCH] x86/ioemul: Rewrite stub generation

2020-04-27 Thread Andrew Cooper
On 27/04/2020 16:18, Roger Pau Monné wrote:
> On Mon, Apr 27, 2020 at 01:20:41PM +0100, Andrew Cooper wrote:
>> The logic is completely undocumented and almost impossible to follow.  It
>> actually uses return oriented programming.  Rewrite it to conform to more
>> normal call mechanics, and leave a big comment explaining thing.  As well as
>> the code being easier to follow, it will execute faster as it isn't fighting
>> the branch predictor.
>>
>> Move the ioemul_handle_quirk() function pointer from traps.c to
>> ioport_emulate.c.
> Seeing as the newest quirk was added in 2008, I wonder if such quirks
> are still relevant?
>
> Maybe they are also used by newer boxes, I really have no idea.

Its not something which I'd consider altering in this patch anyway.

>
>> +
>> +static unsigned int ioemul_handle_proliant_quirk(
>>  u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs)
>>  {
>>  static const char stub[] = {
>> @@ -19,18 +22,16 @@ static bool ioemul_handle_proliant_quirk(
>>  0xa8, 0x80, /*test $0x80, %al */
>>  0x75, 0xfb, /*jnz 1b  */
>>  0x9d,   /*popf*/
>> -0xc3,   /*ret */
>>  };
>>  uint16_t port = regs->dx;
>>  uint8_t value = regs->al;
>>  
>>  if ( (opcode != 0xee) || (port != 0xcd4) || !(value & 0x80) )
>> -return false;
>> +return 0;
>>  
>>  memcpy(io_emul_stub, stub, sizeof(stub));
>> -BUILD_BUG_ON(IOEMUL_QUIRK_STUB_BYTES < sizeof(stub));
>>  
>> -return true;
>> +return sizeof(stub);
>>  }
>>  
>>  /* This table is the set of system-specific I/O emulation hooks. */
>> diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
>> index e24b84f46a..f150886711 100644
>> --- a/xen/arch/x86/pv/emul-priv-op.c
>> +++ b/xen/arch/x86/pv/emul-priv-op.c
>> @@ -54,51 +54,96 @@ struct priv_op_ctxt {
>>  unsigned int bpmatch;
>>  };
>>  
>> -/* I/O emulation support. Helper routines for, and type of, the stack stub. 
>> */
>> -void host_to_guest_gpr_switch(struct cpu_user_regs *);
>> -unsigned long guest_to_host_gpr_switch(unsigned long);
>> +/* I/O emulation helpers.  Use non-standard calling conventions. */
>> +extern const char load_guest_gprs[], save_guest_gprs[];
>>  
>>  typedef void io_emul_stub_t(struct cpu_user_regs *);
>>  
>>  static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 
>> opcode,
>>unsigned int port, unsigned int 
>> bytes)
>>  {
>> +/*
>> + * Construct a stub for IN/OUT emulation.
>> + *
>> + * Some platform drivers communicate with the SMM handler using GPRs as 
>> a
>> + * mailbox.  Therefore, we must perform the emulation with the hardware
>> + * domain's registers in view.
>> + *
>> + * We write a stub of the following form, using the guest load/save
>> + * helpers (abnormal calling conventions), and one of several possible
>> + * stubs performing the real I/O.
>> + */
>> +static const char prologue[] = {
>> +0x53,   /* push %rbx */
>> +0x55,   /* push %rbp */
>> +0x41, 0x54, /* push %r12 */
>> +0x41, 0x55, /* push %r13 */
>> +0x41, 0x56, /* push %r14 */
>> +0x41, 0x57, /* push %r15 */
>> +0x57,   /* push %rdi (param for save_guest_gprs) */
>> +};  /* call load_guest_gprs */
>> +/*  */
>> +/* call save_guest_gprs */
>> +static const char epilogue[] = {
>> +0x5f,   /* pop %rdi  */
>> +0x41, 0x5f, /* pop %r15  */
>> +0x41, 0x5e, /* pop %r14  */
>> +0x41, 0x5d, /* pop %r13  */
>> +0x41, 0x5c, /* pop %r12  */
>> +0x5d,   /* pop %rbp  */
>> +0x5b,   /* pop %rbx  */
>> +0xc3,   /* ret   */
>> +};
>> +
>>  struct stubs *this_stubs = _cpu(stubs);
>>  unsigned long stub_va = this_stubs->addr + STUB_BUF_SIZE / 2;
>> -long disp;
>> -bool use_quirk_stub = false;
>> +unsigned int quirk_bytes = 0;
>> +char *p;
>> +
>> +/* Helpers - Read outer scope but only modify p. */
>> +#define APPEND_BUFF(b) ({ memcpy(p, b, sizeof(b)); p += sizeof(b); })
>> +#define APPEND_CALL(f)  \
>> +({  \
>> +long disp = (long)(f) - (stub_va + p - ctxt->io_emul_stub + 5); \
>> +BUG_ON((int32_t)disp != disp);  \
> I'm not sure I get the point of using signed integers instead of
> unsigned ones, AFAICT you just want to check that the displacement is
> < 4GB so that a relative call can be used?

Displacements are +/- 2G, not <4G.

Using unsigned here would be buggy.

~Andrew



Re: [PATCH v2 14/17] libxc/save: Write X86_{CPUID,MSR}_DATA records

2020-04-27 Thread Ian Jackson
Andrew Cooper writes ("[PATCH v2 14/17] libxc/save: Write X86_{CPUID,MSR}_DATA 
records"):
> With all other plumbing in place, obtain the CPU Policy from Xen and
> write it into the migration stream.

Thanks for your earlier explanation in the thread:

  In all cases with migration development, the receive side logic
  (previous patch) has to come before the save side logic (this patch), or
  the result will break bisection with the receive side choking on an
  unknown record type.

  From the "whole series" point of view, compatibility is also the
  destination side discarding the data because libxl still needs its order
  of CPUID handling shuffling to cope.

I still would have some comments about the compatibility implications
*in the commit message*; maybe an abbreviated version the text above.

Nevertheless,

Acked-by: Ian Jackson 



Re: [PATCH v2 03/17] tools/migration: Drop IHDR_VERSION constant from libxc and python [and 1 more messages]

2020-04-27 Thread Ian Jackson
Andrew Cooper writes ("Re: [PATCH v2 03/17] tools/migration: Drop IHDR_VERSION 
constant from libxc and python"):
> I presume you mean here 2x send IHDR and 2x receive IHDR, one C and one
> Python in each case.
> 
> I understand what you're suggesting.  I completely disagree with it, as
> it obfuscates the logic and introduces a source of bugs for zero gain.
...
> The only thing IHDR_VERSION_* usefully gets you is the ability to get
> the defines accidentally wrong, and introduce bugs that way.

Andrew Cooper writes ("Re: [PATCH v2 07/17] libxc/restore: STATIC_DATA_END 
inference for v2 compatibility"):
> On 05/03/2020 16:24, Ian Jackson wrote:
> > You could handle that with a small bit of code around one of the call
> > sites to adjust the error handling.  (Also, what a mess, but I guess
> > we're here now...)
> 
> ... which is the majority the code you're trying to abstract away.
...
> tl;dr I could put an #ifdef x86'd static inline in the root common
> header (xc_sr_common.h), but the overall complexity is greater than what
> is presented here.

I still don't agree with you I'm afraid.  I went back through our
messages, and looked at the code again, and I think we are probably
still not communicating well.  However, I thought about how best to
deal with this disagreement.  As the actual author of much of this
code, and certainly the person putting a lot of effort in, you should
get quite a bit of leeway.

I considered taking your branch and showing you what I meant in code
terms.  But ultimately I think this is a waste of our time and I don't
want us to get into a pointless argument.  I don't think these issues
matter enough to be worth the debate.  I don't think there are actual
bugs here - we're talking about matters of style and taste.

So,

Acked-by: Ian Jackson 

It would probably have been better for me to have got to this point
earlier.

Sorry,
Ian.



Re: [PATCH v3 10/17] tools/libxl: Plumb a restore boolean into libxl__domain_build_state

2020-04-27 Thread Ian Jackson
Andrew Cooper writes ("[PATCH v3 10/17] tools/libxl: Plumb a restore boolean 
into libxl__domain_build_state"):
> To fix CPUID handling, libxl__build_pre() is going to have to distinguish
> between a brand new VM vs one which is being migrated-in/resumed.
> 
> Transcribe dcs->restore_fd into dbs->restore in initiate_domain_create()
> only (specifically avoiding the stubdom state in libxl__spawn_stub_dm()).
> 
> While tweaking initiate_domain_create(), make a new dbs alias and simplify
> later code, and drop the local restore_fd alias as the new dbs->restore is
> more intuitive in context.
> 
> No functional change.

Acked-by: Ian Jackson 



Re: [PATCH] x86: refine guest_mode()

2020-04-27 Thread Roger Pau Monné
On Mon, Apr 27, 2020 at 04:08:59PM +0200, Jan Beulich wrote:
> On 27.04.2020 11:59, Roger Pau Monné wrote:
> > On Mon, Apr 27, 2020 at 10:03:05AM +0200, Jan Beulich wrote:
> >> --- a/xen/include/asm-x86/regs.h
> >> +++ b/xen/include/asm-x86/regs.h
> >> @@ -10,9 +10,10 @@
> >>  /* Frame pointer must point into current CPU stack. */
> >> \
> >>  ASSERT(diff < STACK_SIZE);
> >> \
> >>  /* If not a guest frame, it must be a hypervisor frame. */
> >> \
> >> -ASSERT((diff == 0) || (r->cs == __HYPERVISOR_CS));
> >> \
> >> +if ( diff < PRIMARY_STACK_SIZE )  
> >> \
> >> +ASSERT(!diff || ((r)->cs == __HYPERVISOR_CS));
> >> \
> > 
> > Why not use:
> > 
> > ASSERT(diff >= PRIMARY_STACK_SIZE || !diff || ((r)->cs == __HYPERVISOR_CS));
> 
> Except for the longer (without being helpful imo) string reported if
> the assertion triggers, I see not difference.

Wanted to avoid the empty if on non-debug builds, but I assume the
compiler will already optimize it out.

> > I'm not sure I fully understand this layout, is it possible that you
> > also need to account for the size of cpu_info?
> 
> Depends on how paranoid we want the checking here to be, but in going
> further I wouldn't want this to become sub-page fine-grained if we
> aren't first doing e.g. what I'm mentioning in the post-commit-message
> remark.

Right, leaving it as-is is fine, just wanted to be sure I fully
understand the layout.

Thanks, Roger.



Re: [PATCH v4] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Jürgen Groß

On 27.04.20 17:50, Paul Durrant wrote:

From: Paul Durrant 

... to specify a separate migration stream that will also be suitable for
live update.

The original scope of the document was to support non-cooperative migration
of guests [1] but, since then, live update of xenstored has been brought into
scope. Thus it makes more sense to define a separate image format for
serializing xenstore state that is suitable for both purposes.

The document has been limited to specifying a new image format. The mechanism
for acquiring the image for live update or migration is not covered as that
is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
also expected that, when the first implementation of live update or migration
making use of this specification is committed, that the document is moved from
docs/designs into docs/specs.

NOTE: It will only be necessary to save and restore state for active xenstore
   connections, but the documentation for 'RESUME' in xenstore.txt implies
   otherwise. That command is unused so this patch deletes it from the
   specification.

[1] See 
https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/designs/non-cooperative-migration.md

Signed-off-by: Paul Durrant 


Reviewed-by: Juergen Gross 


Juergen



[PATCH v4] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Paul Durrant
From: Paul Durrant 

... to specify a separate migration stream that will also be suitable for
live update.

The original scope of the document was to support non-cooperative migration
of guests [1] but, since then, live update of xenstored has been brought into
scope. Thus it makes more sense to define a separate image format for
serializing xenstore state that is suitable for both purposes.

The document has been limited to specifying a new image format. The mechanism
for acquiring the image for live update or migration is not covered as that
is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
also expected that, when the first implementation of live update or migration
making use of this specification is committed, that the document is moved from
docs/designs into docs/specs.

NOTE: It will only be necessary to save and restore state for active xenstore
  connections, but the documentation for 'RESUME' in xenstore.txt implies
  otherwise. That command is unused so this patch deletes it from the
  specification.

[1] See 
https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/designs/non-cooperative-migration.md

Signed-off-by: Paul Durrant 
---
Cc: Juergen Gross 
Cc: Andrew Cooper 
Cc: George Dunlap 
Cc: Ian Jackson 
Cc: Jan Beulich 
Cc: Julien Grall 
Cc: Stefano Stabellini 
Cc: Wei Liu 

v4:
 - move path-len and value-len earlier in NODE_DATA

v3:
 - Address missed comments from Juergen

v2:
 - Address comments from Juergen
---
 docs/designs/xenstore-migration.md | 470 +++--
 docs/misc/xenstore.txt |  17 --
 2 files changed, 308 insertions(+), 179 deletions(-)

diff --git a/docs/designs/xenstore-migration.md 
b/docs/designs/xenstore-migration.md
index 6ab351e8fe..51d8b85171 100644
--- a/docs/designs/xenstore-migration.md
+++ b/docs/designs/xenstore-migration.md
@@ -3,254 +3,400 @@
 ## Background
 
 The design for *Non-Cooperative Migration of Guests*[1] explains that extra
-save records are required in the migrations stream to allow a guest running
-PV drivers to be migrated without its co-operation. Moreover the save
-records must include details of registered xenstore watches as well as
-content; information that cannot currently be recovered from `xenstored`,
-and hence some extension to the xenstore protocol[2] will also be required.
-
-The *libxenlight Domain Image Format* specification[3] already defines a
-record type `EMULATOR_XENSTORE_DATA` but this is not suitable for
-transferring xenstore data pertaining to the domain directly as it is
-specified such that keys are relative to the path
-`/local/domain/$dm_domid/device-model/$domid`. Thus it is necessary to
-define at least one new save record type.
+save records are required in the migrations stream to allow a guest running PV
+drivers to be migrated without its co-operation. Moreover the save records must
+include details of registered xenstore watches as well ascontent; information
+that cannot currently be recovered from `xenstored`, and hence some extension
+to the xenstored implementations will also be required.
+
+As a similar set of data is needed for transferring xenstore data from one
+instance to another when live updating xenstored this document proposes an
+image format for a 'migration stream' suitable for both purposes.
 
 ## Proposal
 
-### New Save Record
+The image format consists of a _header_ followed by 1 or more _records_. Each
+record consists of a type and length field, followed by any data mandated by
+the record type. At minimum there will be a single record of type `END`
+(defined below).
 
-A new mandatory record type should be defined within the libxenlight Domain
-Image Format:
+### Header
 
-`0x0007: DOMAIN_XENSTORE_DATA`
+The header identifies the stream as a `xenstore` stream, including the version
+of the specification that it complies with.
 
-An arbitrary number of these records may be present in the migration
-stream and may appear in any order. The format of each record should be as
-follows:
+All fields in this header must be in _big-endian_ byte order, regardless of
+the setting of the endianness bit.
 
 
 ```
 0   1   2   3   4   5   6   7octet
 +---+---+---+---+---+---+---+---+
-| type  | record specific data  |
-+---+   |
-...
-+---+
+| ident |
++---+---|
+| version   | flags |
++---+---+
 ```
 
-where type is one of the following values
 
+| Field | Description   |
+|---|---|
+| `ident`   | 

RE: [PATCH v3] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Paul Durrant
> -Original Message-
> From: Jürgen Groß 
> Sent: 27 April 2020 16:14
> To: Paul Durrant ; xen-devel@lists.xenproject.org
> Cc: Paul Durrant ; Andrew Cooper 
> ; George Dunlap
> ; Ian Jackson ; Jan 
> Beulich ;
> Julien Grall ; Stefano Stabellini ; 
> Wei Liu 
> Subject: Re: [PATCH v3] docs/designs: re-work the xenstore migration 
> document...
> 
> On 27.04.20 17:06, Paul Durrant wrote:
> > From: Paul Durrant 
> >
> > ... to specify a separate migration stream that will also be suitable for
> > live update.
> >
> > The original scope of the document was to support non-cooperative migration
> > of guests [1] but, since then, live update of xenstored has been brought 
> > into
> > scope. Thus it makes more sense to define a separate image format for
> > serializing xenstore state that is suitable for both purposes.
> >
> > The document has been limited to specifying a new image format. The 
> > mechanism
> > for acquiring the image for live update or migration is not covered as that
> > is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
> > also expected that, when the first implementation of live update or 
> > migration
> > making use of this specification is committed, that the document is moved 
> > from
> > docs/designs into docs/specs.
> >
> > NOTE: It will only be necessary to save and restore state for active 
> > xenstore
> >connections, but the documentation for 'RESUME' in xenstore.txt 
> > implies
> >otherwise. That command is unused so this patch deletes it from the
> >specification.
> >
> > [1] See 
> > https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/designs/non-cooperative-migration.md
> >
> > Signed-off-by: Paul Durrant 
> > ---
> > Cc: Juergen Gross 
> > Cc: Andrew Cooper 
> > Cc: George Dunlap 
> > Cc: Ian Jackson 
> > Cc: Jan Beulich 
> > Cc: Julien Grall 
> > Cc: Stefano Stabellini 
> > Cc: Wei Liu 
> >
> > v3:
> >   - Address missed comments from Juergen
> 
> Not all :-(
> 
> >
> > v2:
> >   - Address comments from Juergen
> > ---
> 
> ...
> 
> > +### NODE_DATA
> > +
> > +For live update the image format will contain a `NODE_DATA` record for each
> > +node in xenstore. For migration it will only contain a record for the nodes
> > +relating to the domain being migrated. The `NODE_DATA` may be related to
> > +a _committed_ node (globally visible in xenstored) or a _pending_ node 
> > (created
> > +or modified by a transaction for which there is also a `TRANSACTION_DATA`
> > +record previously present).
> >
> > -The `TRANSACTION_START` operation does not allow specification of a
> > -``; it is assumed that the transaction pertains to the domain that
> > -owns the shared ring over which the operation is passed. Neither does it
> > -allow a `` to be specified; it is always chosen by xenstored.
> > -Hence, for the tool-stack to be able to open a transaction on behalf of a
> > -domain a new operation is needed:
> >
> >   ```
> > -START_DOMAIN_TRANSACTION||
> > +0   1   2   3octet
> > ++---+---+---+---+
> > +| conn-id   |
> > ++---+
> > +| tx-id |
> > ++---+---+
> > +| access| perm-count|
> > ++---+---+
> > +| perm1 |
> > ++---+
> > +...
> > ++---+
> > +| permN |
> > ++---+---+
> > +| path-len  | value-len |
> > ++---+---+
> 
> 
> path-len and value-len are still after perm1...permN, which makes it
> impossible to include them in a struct.
> 
> Could you please either move them before the perm array or tell me why
> not?

Gah, sorry, I meant to move them and then forgot again. I'll send v4.

  Paul

> 
> 
> Juergen




[linux-linus test] 149832: tolerable FAIL - PUSHED

2020-04-27 Thread osstest service owner
flight 149832 linux-linus real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149832/

Failures :-/ but no regressions.

Regressions which are regarded as allowable (not blocking):
 test-amd64-amd64-xl-rtds 16 guest-localmigrate   fail REGR. vs. 149830

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-qemut-win7-amd64 17 guest-stopfail like 149830
 test-armhf-armhf-libvirt 14 saverestore-support-checkfail  like 149830
 test-amd64-amd64-xl-qemuu-win7-amd64 17 guest-stopfail like 149830
 test-amd64-amd64-xl-qemut-ws16-amd64 17 guest-stopfail like 149830
 test-amd64-i386-xl-qemuu-win7-amd64 17 guest-stop fail like 149830
 test-armhf-armhf-libvirt-raw 13 saverestore-support-checkfail  like 149830
 test-amd64-i386-xl-qemut-win7-amd64 17 guest-stop fail like 149830
 test-amd64-amd64-xl-qemuu-ws16-amd64 17 guest-stopfail like 149830
 test-amd64-i386-xl-pvshim12 guest-start  fail   never pass
 test-amd64-amd64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  14 saverestore-support-checkfail   never pass
 test-amd64-i386-libvirt-xsm  13 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  13 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-amd64-qemuu-nested-amd 17 debian-hvm-install/l1/l2  fail never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 14 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  14 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt 13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 13 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 14 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  14 saverestore-support-checkfail   never pass
 test-amd64-i386-xl-qemuu-ws16-amd64 17 guest-stop  fail never pass
 test-armhf-armhf-libvirt-raw 12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 13 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 14 saverestore-support-checkfail never pass
 test-armhf-armhf-xl-vhd  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  13 saverestore-support-checkfail   never pass
 test-amd64-i386-xl-qemut-ws16-amd64 17 guest-stop  fail never pass

version targeted for testing:
 linux6a8b55ed4056ea5559ebe4f6a4b247f627870d4c
baseline version:
 linuxe9a61afb69f07b1c5880984d45e5cc232ec1bf6f

Last test of basis   149830  2020-04-26 18:38:51 Z0 days
Testing same since   149832  2020-04-27 03:24:02 Z0 days1 attempts


People who touched revisions under test:
  Linus Torvalds 
  Paulo Alcantara (SUSE) 
  Paulo Alcantara 
  Ronnie Sahlberg 
  Steve French 

jobs:
 build-amd64-xsm  

Re: [PATCH v7 08/12] xen: add /buildinfo/config entry to hypervisor filesystem

2020-04-27 Thread Jürgen Groß

Stefano,

On 06.04.20 14:29, Jan Beulich wrote:

On 03.04.2020 17:45, Jürgen Groß wrote:

On 03.04.20 17:33, Jan Beulich wrote:

On 03.04.2020 17:12, Jürgen Groß wrote:

On 03.04.20 16:31, Jan Beulich wrote:

On 02.04.2020 17:46, Juergen Gross wrote:

--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -353,6 +353,16 @@ config DOM0_MEM
        Leave empty if you are not sure what to specify.
    +config HYPFS_CONFIG
+    bool "Provide hypervisor .config via hypfs entry"
+    default y


My initial reaction was to ask for "depends on HYPFS", but then
I noticed the earlier patch doesn't introduce such. Am I
mis-remembering that it was agreed to make the whole thing
possible to disable at least in EXPERT mode?


No, I don't remember that agreement.

And TBH I'm not sure this is a good idea, as that would at once make the
plan to replace at least some sysctl and/or domctl interfaces impossible
(like e.g. the last 3 patches of the series are doing already), or at
least would tie the related functionality to CONFIG_HYPFS.


I think that would be fine - that's what config setting are for.
Someone caring about space may not care about runtime setting of
parameters.


So right now it would start with a plain hypfs available or not.

The next step would be in patch 12 to tell the user he will lose the
capability of setting runtime parameters.

Another planned extension would be to control per-cpupool settings,
which would the go away (possibly functionality being unconditionally
available today).

Next would be the lack of being able to control per-domain mitigations
like XPTI or L1TF, which I'd like to add.

Another thing I wanted to add is some debugging stuff (e.g. to switch
lock profiling using hypfs).

And the list will go on.


Understood.


Does it really make sense to make a central control and information
interface conditional?


None of the above may be of interest to e.g. embedded use cases.


I'd like at least a second opinion on that topic.


Yes, further opinions would surely help.


Any opinion on making hypfs configurable?

It would be quite some code churn and I want to avoid it in case you
see no benefit in configuring it out for embedded.


Juergen




Re: [PATCH] x86/ioemul: Rewrite stub generation

2020-04-27 Thread Jan Beulich
On 27.04.2020 14:20, Andrew Cooper wrote:
> The logic is completely undocumented and almost impossible to follow.  It
> actually uses return oriented programming.  Rewrite it to conform to more
> normal call mechanics, and leave a big comment explaining thing.  As well as
> the code being easier to follow, it will execute faster as it isn't fighting
> the branch predictor.
> 
> Move the ioemul_handle_quirk() function pointer from traps.c to
> ioport_emulate.c.  There is no reason for it to be in neither of the two
> translation units which use it.  Alter the behaviour to return the number of
> bytes written into the stub.
> 
> Access the addresses of the host/guest helpers with extern const char arrays.
> Nothing good will come of C thinking they are regular functions.

I agree with the C aspect, but imo the assembly routines should,
with the changes you make, be marked as being ordinary functions.
A reasonable linker would then warn about the C file wanting an
STT_OBJECT while the assembly file provides an STT_FUNC. I'd
therefore prefer, along with marking the functions as such, to
have them also declared as functions in C.

> --- a/xen/arch/x86/ioport_emulate.c
> +++ b/xen/arch/x86/ioport_emulate.c
> @@ -8,7 +8,10 @@
>  #include 
>  #include 
>  
> -static bool ioemul_handle_proliant_quirk(
> +unsigned int (*ioemul_handle_quirk)(
> +u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);

Would you mind adding __read_mostly on this occasion?

> @@ -19,18 +22,16 @@ static bool ioemul_handle_proliant_quirk(
>  0xa8, 0x80, /*test $0x80, %al */
>  0x75, 0xfb, /*jnz 1b  */
>  0x9d,   /*popf*/
> -0xc3,   /*ret */
>  };
>  uint16_t port = regs->dx;
>  uint8_t value = regs->al;
>  
>  if ( (opcode != 0xee) || (port != 0xcd4) || !(value & 0x80) )
> -return false;
> +return 0;
>  
>  memcpy(io_emul_stub, stub, sizeof(stub));
> -BUILD_BUG_ON(IOEMUL_QUIRK_STUB_BYTES < sizeof(stub));

So you treat a build failure for a runtime crash. I can see the
advantages of the new approach, but the original got away with
less stub space. If our L1_CACHE_SHIFT wasn't hard coded to 7
just to cover some unusual CPUs, your new approach would, if I
got the counting and calculations right, not work (with a value
resulting in a 64-byte cache line size). Introducing a Kconfig
option for this should imo not come with a need to re-work the
logic here again. Therefore I'd like us to think about a way
to make the space needed not exceed 32 bytes.

One option might be to arrange for some or all of R12-R15 to
not need spilling. And of course there then still ought to be
a BUILD_BUG_ON() identifying ahead of any testing that yet
smaller cache line sizes would indeed require re-work here.

Jan



Re: [PATCH] x86/ioemul: Rewrite stub generation

2020-04-27 Thread Roger Pau Monné
On Mon, Apr 27, 2020 at 01:20:41PM +0100, Andrew Cooper wrote:
> The logic is completely undocumented and almost impossible to follow.  It
> actually uses return oriented programming.  Rewrite it to conform to more
> normal call mechanics, and leave a big comment explaining thing.  As well as
> the code being easier to follow, it will execute faster as it isn't fighting
> the branch predictor.
> 
> Move the ioemul_handle_quirk() function pointer from traps.c to
> ioport_emulate.c.

Seeing as the newest quirk was added in 2008, I wonder if such quirks
are still relevant?

Maybe they are also used by newer boxes, I really have no idea.

> There is no reason for it to be in neither of the two
> translation units which use it.  Alter the behaviour to return the number of
> bytes written into the stub.
> 
> Access the addresses of the host/guest helpers with extern const char arrays.
> Nothing good will come of C thinking they are regular functions.
> 
> Signed-off-by: Andrew Cooper 
> --
> CC: Jan Beulich 
> CC: Wei Liu 
> CC: Roger Pau Monné 
> ---
>  xen/arch/x86/ioport_emulate.c  | 11 ++---
>  xen/arch/x86/pv/emul-priv-op.c | 91 
> +++---
>  xen/arch/x86/pv/gpr_switch.S   | 37 +
>  xen/arch/x86/traps.c   |  3 --
>  xen/include/asm-x86/io.h   |  3 +-
>  5 files changed, 85 insertions(+), 60 deletions(-)
> 
> diff --git a/xen/arch/x86/ioport_emulate.c b/xen/arch/x86/ioport_emulate.c
> index 499c1f6056..f7511a9c49 100644
> --- a/xen/arch/x86/ioport_emulate.c
> +++ b/xen/arch/x86/ioport_emulate.c
> @@ -8,7 +8,10 @@
>  #include 
>  #include 
>  
> -static bool ioemul_handle_proliant_quirk(
> +unsigned int (*ioemul_handle_quirk)(
> +u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);

uint8_t for opcode and also I think regs can be made const?

(at least looking at the only implementation from
ioemul_handle_proliant_quirk). I'm not familiar with this area
however.

> +
> +static unsigned int ioemul_handle_proliant_quirk(
>  u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs)
>  {
>  static const char stub[] = {
> @@ -19,18 +22,16 @@ static bool ioemul_handle_proliant_quirk(
>  0xa8, 0x80, /*test $0x80, %al */
>  0x75, 0xfb, /*jnz 1b  */
>  0x9d,   /*popf*/
> -0xc3,   /*ret */
>  };
>  uint16_t port = regs->dx;
>  uint8_t value = regs->al;
>  
>  if ( (opcode != 0xee) || (port != 0xcd4) || !(value & 0x80) )
> -return false;
> +return 0;
>  
>  memcpy(io_emul_stub, stub, sizeof(stub));
> -BUILD_BUG_ON(IOEMUL_QUIRK_STUB_BYTES < sizeof(stub));
>  
> -return true;
> +return sizeof(stub);
>  }
>  
>  /* This table is the set of system-specific I/O emulation hooks. */
> diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
> index e24b84f46a..f150886711 100644
> --- a/xen/arch/x86/pv/emul-priv-op.c
> +++ b/xen/arch/x86/pv/emul-priv-op.c
> @@ -54,51 +54,96 @@ struct priv_op_ctxt {
>  unsigned int bpmatch;
>  };
>  
> -/* I/O emulation support. Helper routines for, and type of, the stack stub. 
> */
> -void host_to_guest_gpr_switch(struct cpu_user_regs *);
> -unsigned long guest_to_host_gpr_switch(unsigned long);
> +/* I/O emulation helpers.  Use non-standard calling conventions. */
> +extern const char load_guest_gprs[], save_guest_gprs[];
>  
>  typedef void io_emul_stub_t(struct cpu_user_regs *);
>  
>  static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 
> opcode,
>unsigned int port, unsigned int 
> bytes)
>  {
> +/*
> + * Construct a stub for IN/OUT emulation.
> + *
> + * Some platform drivers communicate with the SMM handler using GPRs as a
> + * mailbox.  Therefore, we must perform the emulation with the hardware
> + * domain's registers in view.
> + *
> + * We write a stub of the following form, using the guest load/save
> + * helpers (abnormal calling conventions), and one of several possible
> + * stubs performing the real I/O.
> + */
> +static const char prologue[] = {
> +0x53,   /* push %rbx */
> +0x55,   /* push %rbp */
> +0x41, 0x54, /* push %r12 */
> +0x41, 0x55, /* push %r13 */
> +0x41, 0x56, /* push %r14 */
> +0x41, 0x57, /* push %r15 */
> +0x57,   /* push %rdi (param for save_guest_gprs) */
> +};  /* call load_guest_gprs */
> +/*  */
> +/* call save_guest_gprs */
> +static const char epilogue[] = {
> +0x5f,   /* pop %rdi  */
> +0x41, 0x5f, /* pop %r15  */
> +0x41, 0x5e, /* pop %r14  */
> +0x41, 0x5d, /* pop %r13  */
> +0x41, 0x5c, /* pop %r12  */
> +0x5d,   /* pop %rbp  */
> +0x5b,   /* pop %rbx  */
> +0xc3,   /* ret   */
> +};
> +

Re: [PATCH] x86: refine guest_mode()

2020-04-27 Thread Jan Beulich
On 27.04.2020 16:35, Andrew Cooper wrote:
> On 27/04/2020 09:03, Jan Beulich wrote:
>> The 2nd of the assertions as well as the macro's return value have been
>> assuming we're on the primary stack. While for most IST exceptions we
>> eventually switch back to the main one,
> 
> "we switch to the main one when interrupting user mode".
> 
> "eventually" isn't accurate as it is before we enter C.

Right, will change.

>> --- a/xen/include/asm-x86/regs.h
>> +++ b/xen/include/asm-x86/regs.h
>> @@ -10,9 +10,10 @@
>>  /* Frame pointer must point into current CPU stack. */  
>>   \
>>  ASSERT(diff < STACK_SIZE);  
>>   \
>>  /* If not a guest frame, it must be a hypervisor frame. */  
>>   \
>> -ASSERT((diff == 0) || (r->cs == __HYPERVISOR_CS));  
>>   \
>> +if ( diff < PRIMARY_STACK_SIZE )
>>   \
>> +ASSERT(!diff || ((r)->cs == __HYPERVISOR_CS));  
>>   \
>>  /* Return TRUE if it's a guest frame. */
>>   \
>> -(diff == 0);
>>   \
>> +!diff || ((r)->cs != __HYPERVISOR_CS);  
>>   \
> 
> The (diff == 0) already worried me before because it doesn't fail safe,
> but this makes things more problematic.  Consider the case back when we
> had __HYPERVISOR_CS32.

Yes - if __HYPERVISOR_CS32 would ever have been to be used for
anything, it would have needed checking for here.

> Guest mode is strictly "(r)->cs & 3".

As long as CS (a) gets properly saved (it's a "manual" step for
SYSCALL/SYSRET as well as #VMEXIT) and (b) didn't get clobbered. I
didn't write this code, I don't think, so I can only guess that
there were intentions behind this along these lines.

> Everything else is expectations about how things ought to be laid out,
> but for safety in release builds, the final judgement should not depend
> on the expectations evaluating true.

Well, I can switch to a purely CS.RPL based approach, as long as
we're happy to live with the possible downside mentioned above.
Of course this would then end up being a more intrusive change
than originally intended ...

Jan



Re: [PATCH v3] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Jürgen Groß

On 27.04.20 17:06, Paul Durrant wrote:

From: Paul Durrant 

... to specify a separate migration stream that will also be suitable for
live update.

The original scope of the document was to support non-cooperative migration
of guests [1] but, since then, live update of xenstored has been brought into
scope. Thus it makes more sense to define a separate image format for
serializing xenstore state that is suitable for both purposes.

The document has been limited to specifying a new image format. The mechanism
for acquiring the image for live update or migration is not covered as that
is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
also expected that, when the first implementation of live update or migration
making use of this specification is committed, that the document is moved from
docs/designs into docs/specs.

NOTE: It will only be necessary to save and restore state for active xenstore
   connections, but the documentation for 'RESUME' in xenstore.txt implies
   otherwise. That command is unused so this patch deletes it from the
   specification.

[1] See 
https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/designs/non-cooperative-migration.md

Signed-off-by: Paul Durrant 
---
Cc: Juergen Gross 
Cc: Andrew Cooper 
Cc: George Dunlap 
Cc: Ian Jackson 
Cc: Jan Beulich 
Cc: Julien Grall 
Cc: Stefano Stabellini 
Cc: Wei Liu 

v3:
  - Address missed comments from Juergen


Not all :-(



v2:
  - Address comments from Juergen
---


...


+### NODE_DATA
+
+For live update the image format will contain a `NODE_DATA` record for each
+node in xenstore. For migration it will only contain a record for the nodes
+relating to the domain being migrated. The `NODE_DATA` may be related to
+a _committed_ node (globally visible in xenstored) or a _pending_ node (created
+or modified by a transaction for which there is also a `TRANSACTION_DATA`
+record previously present).
  
-The `TRANSACTION_START` operation does not allow specification of a

-``; it is assumed that the transaction pertains to the domain that
-owns the shared ring over which the operation is passed. Neither does it
-allow a `` to be specified; it is always chosen by xenstored.
-Hence, for the tool-stack to be able to open a transaction on behalf of a
-domain a new operation is needed:
  
  ```

-START_DOMAIN_TRANSACTION||
+0   1   2   3octet
++---+---+---+---+
+| conn-id   |
++---+
+| tx-id |
++---+---+
+| access| perm-count|
++---+---+
+| perm1 |
++---+
+...
++---+
+| permN |
++---+---+
+| path-len  | value-len |
++---+---+



path-len and value-len are still after perm1...permN, which makes it
impossible to include them in a struct.

Could you please either move them before the perm array or tell me why
not?


Juergen



[PATCH v3] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Paul Durrant
From: Paul Durrant 

... to specify a separate migration stream that will also be suitable for
live update.

The original scope of the document was to support non-cooperative migration
of guests [1] but, since then, live update of xenstored has been brought into
scope. Thus it makes more sense to define a separate image format for
serializing xenstore state that is suitable for both purposes.

The document has been limited to specifying a new image format. The mechanism
for acquiring the image for live update or migration is not covered as that
is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
also expected that, when the first implementation of live update or migration
making use of this specification is committed, that the document is moved from
docs/designs into docs/specs.

NOTE: It will only be necessary to save and restore state for active xenstore
  connections, but the documentation for 'RESUME' in xenstore.txt implies
  otherwise. That command is unused so this patch deletes it from the
  specification.

[1] See 
https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/designs/non-cooperative-migration.md

Signed-off-by: Paul Durrant 
---
Cc: Juergen Gross 
Cc: Andrew Cooper 
Cc: George Dunlap 
Cc: Ian Jackson 
Cc: Jan Beulich 
Cc: Julien Grall 
Cc: Stefano Stabellini 
Cc: Wei Liu 

v3:
 - Address missed comments from Juergen

v2:
 - Address comments from Juergen
---
 docs/designs/xenstore-migration.md | 470 +++--
 docs/misc/xenstore.txt |  17 --
 2 files changed, 308 insertions(+), 179 deletions(-)

diff --git a/docs/designs/xenstore-migration.md 
b/docs/designs/xenstore-migration.md
index 6ab351e8fe..61db212507 100644
--- a/docs/designs/xenstore-migration.md
+++ b/docs/designs/xenstore-migration.md
@@ -3,254 +3,400 @@
 ## Background
 
 The design for *Non-Cooperative Migration of Guests*[1] explains that extra
-save records are required in the migrations stream to allow a guest running
-PV drivers to be migrated without its co-operation. Moreover the save
-records must include details of registered xenstore watches as well as
-content; information that cannot currently be recovered from `xenstored`,
-and hence some extension to the xenstore protocol[2] will also be required.
-
-The *libxenlight Domain Image Format* specification[3] already defines a
-record type `EMULATOR_XENSTORE_DATA` but this is not suitable for
-transferring xenstore data pertaining to the domain directly as it is
-specified such that keys are relative to the path
-`/local/domain/$dm_domid/device-model/$domid`. Thus it is necessary to
-define at least one new save record type.
+save records are required in the migrations stream to allow a guest running PV
+drivers to be migrated without its co-operation. Moreover the save records must
+include details of registered xenstore watches as well ascontent; information
+that cannot currently be recovered from `xenstored`, and hence some extension
+to the xenstored implementations will also be required.
+
+As a similar set of data is needed for transferring xenstore data from one
+instance to another when live updating xenstored this document proposes an
+image format for a 'migration stream' suitable for both purposes.
 
 ## Proposal
 
-### New Save Record
+The image format consists of a _header_ followed by 1 or more _records_. Each
+record consists of a type and length field, followed by any data mandated by
+the record type. At minimum there will be a single record of type `END`
+(defined below).
 
-A new mandatory record type should be defined within the libxenlight Domain
-Image Format:
+### Header
 
-`0x0007: DOMAIN_XENSTORE_DATA`
+The header identifies the stream as a `xenstore` stream, including the version
+of the specification that it complies with.
 
-An arbitrary number of these records may be present in the migration
-stream and may appear in any order. The format of each record should be as
-follows:
+All fields in this header must be in _big-endian_ byte order, regardless of
+the setting of the endianness bit.
 
 
 ```
 0   1   2   3   4   5   6   7octet
 +---+---+---+---+---+---+---+---+
-| type  | record specific data  |
-+---+   |
-...
-+---+
+| ident |
++---+---|
+| version   | flags |
++---+---+
 ```
 
-where type is one of the following values
 
+| Field | Description   |
+|---|---|
+| `ident`   | 0x78656e73746f7265 ('xenstore' in ASCII)  |
+|   |

Re: [PATCH v7 05/12] libs: add libxenhypfs

2020-04-27 Thread George Dunlap


> On Apr 2, 2020, at 4:46 PM, Juergen Gross  wrote:
> 
> Add the new library libxenhypfs for access to the hypervisor filesystem.
> 
> Signed-off-by: Juergen Gross 
> Acked-by: Wei Liu 

Just a few questions...

> +/* Returned buffer and dirent should be freed via free(). */
> +void *xenhypfs_read_raw(xenhypfs_handle *fshdl, const char *path,
> +struct xenhypfs_dirent **dirent);
> +
> +/* Returned buffer should be freed via free(). */
> +char *xenhypfs_read(xenhypfs_handle *fshdl, const char *path);

What’s the difference between these two?  And what’s the `dirent` argument to 
xenhypfs_read_raw() for?

 -George



Re: [PATCH] x86: refine guest_mode()

2020-04-27 Thread Andrew Cooper
On 27/04/2020 09:03, Jan Beulich wrote:
> The 2nd of the assertions as well as the macro's return value have been
> assuming we're on the primary stack. While for most IST exceptions we
> eventually switch back to the main one,

"we switch to the main one when interrupting user mode".

"eventually" isn't accurate as it is before we enter C.

>  for #DF we intentionally never
> do, and hence a #DF actually triggering on a user mode insn (which then
> is still a Xen bug) would in turn trigger this assertion, rather than
> cleanly logging state.
>
> Reported-by: Andrew Cooper 
> Signed-off-by: Jan Beulich 
> ---
> While we could go further and also assert we're on the correct IST
> stack in an "else" ti the "if()" added, I'm not fully convinced this
> would be generally helpful. I'll be happy to adjust accordingly if
> others think differently; at such a point though I think this should
> then no longer be a macro.
>
> --- a/xen/include/asm-x86/regs.h
> +++ b/xen/include/asm-x86/regs.h
> @@ -10,9 +10,10 @@
>  /* Frame pointer must point into current CPU stack. */   
>  \
>  ASSERT(diff < STACK_SIZE);   
>  \
>  /* If not a guest frame, it must be a hypervisor frame. */   
>  \
> -ASSERT((diff == 0) || (r->cs == __HYPERVISOR_CS));   
>  \
> +if ( diff < PRIMARY_STACK_SIZE ) 
>  \
> +ASSERT(!diff || ((r)->cs == __HYPERVISOR_CS));   
>  \
>  /* Return TRUE if it's a guest frame. */ 
>  \
> -(diff == 0); 
>  \
> +!diff || ((r)->cs != __HYPERVISOR_CS);   
>  \

The (diff == 0) already worried me before because it doesn't fail safe,
but this makes things more problematic.  Consider the case back when we
had __HYPERVISOR_CS32.

Guest mode is strictly "(r)->cs & 3".

Everything else is expectations about how things ought to be laid out,
but for safety in release builds, the final judgement should not depend
on the expectations evaluating true.

~Andrew



Re: [PATCH] x86/IST: Fix comment about stack layout

2020-04-27 Thread Jan Beulich
On 27.04.2020 15:19, Wei Liu wrote:
>> --- a/xen/include/asm-x86/current.h
>> +++ b/xen/include/asm-x86/current.h
>> @@ -18,7 +18,7 @@
>>   * 6 - Primary stack
>>   * 5 - Optionally not present (MEMORY_GUARD)
>>   * 4 - Unused; optionally not present (MEMORY_GUARD)
>> - * 3 - Unused; optionally not present (MEMORY_GUARD)
>> + * 3 -  DB IST stack
> 
> There seems to be an extraneous space before "DB".

I guess #DB was meant. With that

Acked-by: Jan Beulich 

Jan



Re: [PATCH] x86/IST: Fix comment about stack layout

2020-04-27 Thread Andrew Cooper
On 27/04/2020 14:19, Wei Liu wrote:
> On Mon, Apr 27, 2020 at 02:17:02PM +0100, Andrew Cooper wrote:
>> This was an oversight in c/s 5d37af364dc "x86/traps: Use an Interrupt Stack
>> Table for #DB" which introduced the #DB IST to begin with.
>>
>> Reported-by: Jan Beulich 
>> Signed-off-by: Andrew Cooper 
>> ---
>> CC: Jan Beulich 
>> CC: Wei Liu 
>> CC: Roger Pau Monné 
>> ---
>>  xen/include/asm-x86/current.h | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
>> index 0b47485337..39c2c6cbf8 100644
>> --- a/xen/include/asm-x86/current.h
>> +++ b/xen/include/asm-x86/current.h
>> @@ -18,7 +18,7 @@
>>   * 6 - Primary stack
>>   * 5 - Optionally not present (MEMORY_GUARD)
>>   * 4 - Unused; optionally not present (MEMORY_GUARD)
>> - * 3 - Unused; optionally not present (MEMORY_GUARD)
>> + * 3 -  DB IST stack
> There seems to be an extraneous space before "DB".

Yes.  The alternative would be for misaligned tabulation with the lower
"IST stack".

~Andrew

>
> Wei.
>
>>   * 2 - MCE IST stack
>>   * 1 - NMI IST stack
>>   * 0 - Double Fault IST stack
>> -- 
>> 2.11.0
>>




Re: [PATCH] x86: refine guest_mode()

2020-04-27 Thread Jan Beulich
On 27.04.2020 11:59, Roger Pau Monné wrote:
> On Mon, Apr 27, 2020 at 10:03:05AM +0200, Jan Beulich wrote:
>> --- a/xen/include/asm-x86/regs.h
>> +++ b/xen/include/asm-x86/regs.h
>> @@ -10,9 +10,10 @@
>>  /* Frame pointer must point into current CPU stack. */  
>>   \
>>  ASSERT(diff < STACK_SIZE);  
>>   \
>>  /* If not a guest frame, it must be a hypervisor frame. */  
>>   \
>> -ASSERT((diff == 0) || (r->cs == __HYPERVISOR_CS));  
>>   \
>> +if ( diff < PRIMARY_STACK_SIZE )
>>   \
>> +ASSERT(!diff || ((r)->cs == __HYPERVISOR_CS));  
>>   \
> 
> Why not use:
> 
> ASSERT(diff >= PRIMARY_STACK_SIZE || !diff || ((r)->cs == __HYPERVISOR_CS));

Except for the longer (without being helpful imo) string reported if
the assertion triggers, I see not difference.

> I'm not sure I fully understand this layout, is it possible that you
> also need to account for the size of cpu_info?

Depends on how paranoid we want the checking here to be, but in going
further I wouldn't want this to become sub-page fine-grained if we
aren't first doing e.g. what I'm mentioning in the post-commit-message
remark.

Jan



Re: [PATCH v7 03/12] docs: add feature document for Xen hypervisor sysfs-like support

2020-04-27 Thread George Dunlap

> On Apr 2, 2020, at 4:46 PM, Juergen Gross  wrote:
[snip]
> +* {VALUE, VALUE, ... } -- a list of possible values separated by "," and
> +  enclosed in "{" and "}".
[snip]
> +So an entry could look like this:
> +
> +/cpu-bugs/active-pv/xpti = ("No"|{"dom0", "domU", "PCID on"}) [w,X86,PV]
> +
> +Possible values would be "No" or a list of "dom0", "domU", and "PCID on".

One thing that wasn’t clear to me here:  Does the “list” look like

“dom0", “domU", “PCID on”

or 

{dom0, domU, PCID on}

or

{“dom0”, “domU”, “PCID on”}

?

Another question that occurs to me from asking this question: in a case like 
this, are we generally expecting to have options with spaces in them (like 
“PCID on”)? And/or, are we expecting the strings themselves to have quotes in 
them? If so, commands to manipulate these will start to look a little gnarly:

xenhypfs write  “{\”dom0\”, \”PCID on\”}”

It seems like it would be nicer to be able to write:

xenhypfs write  "{dom0, PCID-on}”

(Maybe this will be made more clear later in the series, just thought I’d share 
my thoughts / confusion here.)

 -George

Cpu on/offlining crash with core scheduling

2020-04-27 Thread Sergey Dyasli
Hi Juergen,

When I'm testing vcpu pinning with something like:

 # xl vcpu-pin 0 0 2
 # xen-hptool cpu-offline 3

 (offline / online CPUs {2,3} if the above is successful)

I'm reliably seeing the following crash on the latest staging:

(XEN) Watchdog timer detects that CPU1 is stuck!
(XEN) [ Xen-4.14-unstable  x86_64  debug=y   Not tainted ]
(XEN) CPU:1
(XEN) RIP:e008:[] 
common/sched/core.c#sched_wait_rendezvous_in+0x16c/0x385
(XEN) RFLAGS: 0002   CONTEXT: hypervisor
(XEN) rax: f001   rbx: 82d0805c9118   rcx: 83085e750301
(XEN) rdx: 0001   rsi: 83086499b972   rdi: 83085e7503a6
(XEN) rbp: 83085e7dfe28   rsp: 83085e7dfdd8   r8:  830864985440
(XEN) r9:  83085e714068   r10: 0014   r11: 0056b6a1aab2
(XEN) r12: 83086499e490   r13: 82d0805f26e0   r14: 83085e7503a0
(XEN) r15: 0001   cr0: 80050033   cr4: 00362660
(XEN) cr3: 000823a8e000   cr2: 6026000f6fc0
(XEN) fsb:    gsb: 888138dc   gss: 
(XEN) ds: 002b   es: 002b   fs:    gs:    ss: e010   cs: e008
(XEN) Xen code around  
(common/sched/core.c#sched_wait_rendezvous_in+0x16c/0x385):
(XEN)  4c 89 f7 e8 dc a5 fd ff <4b> 8b 44 fd 00 48 8b 04 18 4c 3b 70 10 0f 85 3f
(XEN) Xen stack trace from rsp=83085e7dfdd8:
(XEN)0056b42128a6 83086499ff30 83086498a000 83085e7dfe48
(XEN)00010001 0056b42128a6 83086499e490 
(XEN)0001 0001 83085e7dfe78 82d080252ae8
(XEN)83086498a000 000180230434 83085e7503a0 82d0805ceb00
(XEN) 82d0805cea80  82d0805dea80
(XEN)83085e7dfeb0 82d08022c232 0001 82d0805ceb00
(XEN)0001 0001 0001 83085e7dfec0
(XEN)82d08022c2cd 83085e7dfef0 82d08031cae9 83086498a000
(XEN)83086498a000 0001 0001 83085e7dfde8
(XEN)88813021d700 88813021d700  
(XEN)0007 88813021d700 0246 7ff0
(XEN) 0001ca00  810013aa
(XEN)8203d210 deadbeefdeadf00d deadbeefdeadf00d 0100
(XEN)810013aa e033 0246 c900400dfeb0
(XEN)e02b   
(XEN) e011 83086498a000 0037e43bd000
(XEN)00362660  800864980002 0601
(XEN)
(XEN) Xen call trace:
(XEN)[] R 
common/sched/core.c#sched_wait_rendezvous_in+0x16c/0x385
(XEN)[] F common/sched/core.c#sched_slave+0x262/0x31e
(XEN)[] F common/softirq.c#__do_softirq+0x8a/0xbc
(XEN)[] F do_softirq+0x13/0x15
(XEN)[] F arch/x86/domain.c#idle_loop+0x57/0xa7
(XEN)
(XEN) CPU0 @ e008:82d08022c2b7 (process_pending_softirqs+0x53/0x56)
(XEN) CPU4 @ e008:82d08022bc40 
(common/rcupdate.c#rcu_process_callbacks+0x22e/0x24b)
(XEN) CPU2 @ e008:82d08022c26f (process_pending_softirqs+0xb/0x56)
(XEN) CPU7 @ e008:82d08022bc40 
(common/rcupdate.c#rcu_process_callbacks+0x22e/0x24b)
(XEN) CPU3 @ e008:82d08022bc40 
(common/rcupdate.c#rcu_process_callbacks+0x22e/0x24b)
(XEN) CPU5 @ e008:82d08022cc34 (_spin_lock+0x4d/0x62)
(XEN) CPU6 @ e008:82d08022c264 (process_pending_softirqs+0/0x56)
(XEN)
(XEN) 
(XEN) Panic on CPU 1:
(XEN) FATAL TRAP: vector = 2 (nmi)
(XEN) [error_code=] , IN INTERRUPT CONTEXT
(XEN) 
(XEN)
(XEN) Reboot in five seconds...
(XEN) Executing kexec image on cpu1
(XEN) Shot down all CPUs


Is this something you can reproduce?

--
Thanks,
Sergey



Re: [PATCH] mini-os: Avoid segfaults in tc{g,s}etattr

2020-04-27 Thread Jason Andryuk
On Mon, Apr 27, 2020 at 3:54 AM Samuel Thibault
 wrote:
>
> Jason Andryuk, le dim. 26 avril 2020 23:40:19 -0400, a ecrit:
> > Commit c96c22f1d94 "mini-os: minimal implementations of some termios
> > functions" introduced implementations of tcgetattr and tcsetattr.
> > However, they do not check if files[fildes].cons.dev is non-NULL before
> > dereferencing.  This is not a problem for FDs allocated through
> > alloc_fd, but the files array pre-allocates FDs 0-2 for stdio.  Those
> > entries have a NULL .dev, so tc{g,s}etattr on them segfault.
> >
> > ioemu-stubdom segfaults when term_init() calls tcgetattr on FD 0.
> >
> > Restore tcgetattr and tcsetattr behavior when .dev is NULL equivalent to
> > unsupported_function as it was before c96c22f1d94.
> >
> > Signed-off-by: Jason Andryuk 
>
> Reviewed-by: Samuel Thibault 
>
> Thanks!

Thank you!

> > ---
> > I can't get ioemu-stubdom to start without this.  With this, the guest
> > just reboots immediately, but it does that with a non-stubdom
> > device_model_version="qemu-xen-traditional" .  The same guest disk image
> > (cirros 0.5.1) boots with a linux stubdom or non-stubdom Ubuntu
> > qemu-system-x86_64.

Ubuntu gcc-9 adds -fcf-protection by default.  Somehow that flag
caused rombios (I think) to restart.  Setting -fcf-protection=none
like below (probably just the EMBEDDED_EXTRA_CFLAGS part) lets rombios
start properly.  The hypervisor needs it as well via
EXTRA_CFLAGS_XEN_CORE=-fcf-protection=none and maybe also added to
xen/arch/x86/boot/build32.mk .

diff --git a/Config.mk b/Config.mk
index 0f303c79b2..efb3d42bc4 100644
--- a/Config.mk
+++ b/Config.mk
@@ -205,6 +205,7 @@ APPEND_CFLAGS += $(foreach i, $(APPEND_INCLUDES), -I$(i))

 EMBEDDED_EXTRA_CFLAGS := -nopie -fno-stack-protector -fno-stack-protector-all
 EMBEDDED_EXTRA_CFLAGS += -fno-exceptions
+EMBEDDED_EXTRA_CFLAGS += -fcf-protection=none

 XEN_EXTFILES_URL ?= http://xenbits.xen.org/xen-extfiles
 # All the files at that location were downloaded from elsewhere on
diff --git a/tools/firmware/Rules.mk b/tools/firmware/Rules.mk
index 26bbddccd4..0d33514d53 100644
--- a/tools/firmware/Rules.mk
+++ b/tools/firmware/Rules.mk
@@ -17,3 +17,4 @@ $(call cc-options-add,CFLAGS,CC,$(EMBEDDED_EXTRA_CFLAGS))

 # Extra CFLAGS suitable for an embedded type of environment.
 CFLAGS += -fno-builtin -msoft-float
+CFLAGS += -fcf-protection=none



[PATCH] x86/IST: Fix comment about stack layout

2020-04-27 Thread Andrew Cooper
This was an oversight in c/s 5d37af364dc "x86/traps: Use an Interrupt Stack
Table for #DB" which introduced the #DB IST to begin with.

Reported-by: Jan Beulich 
Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Wei Liu 
CC: Roger Pau Monné 
---
 xen/include/asm-x86/current.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xen/include/asm-x86/current.h b/xen/include/asm-x86/current.h
index 0b47485337..39c2c6cbf8 100644
--- a/xen/include/asm-x86/current.h
+++ b/xen/include/asm-x86/current.h
@@ -18,7 +18,7 @@
  * 6 - Primary stack
  * 5 - Optionally not present (MEMORY_GUARD)
  * 4 - Unused; optionally not present (MEMORY_GUARD)
- * 3 - Unused; optionally not present (MEMORY_GUARD)
+ * 3 -  DB IST stack
  * 2 - MCE IST stack
  * 1 - NMI IST stack
  * 0 - Double Fault IST stack
-- 
2.11.0




Re: [PATCH] x86/pvh: Override opt_console_xen earlier

2020-04-27 Thread Wei Liu
On Mon, Apr 27, 2020 at 01:19:44PM +0100, Andrew Cooper wrote:
> This allows printk() to work from the start of day, and backtraces from as
> early as the IDT is set up.
> 
> Signed-off-by: Andrew Cooper 

Reviewed-by: Wei Liu 



Re: [PATCH v2 1/1] golang/xenlight: add NameToDomid and DomidToName util functions

2020-04-27 Thread George Dunlap


> On Apr 24, 2020, at 4:02 AM, Nick Rosbrook  wrote:
> 
> Many exported functions in xenlight require a domid as an argument. Make
> it easier for package users to use these functions by adding wrappers
> for the libxl utility functions libxl_name_to_domid and
> libxl_domid_to_name.
> 
> Signed-off-by: Nick Rosbrook 
> ---
> tools/golang/xenlight/xenlight.go | 38 ++-
> 1 file changed, 37 insertions(+), 1 deletion(-)
> 
> diff --git a/tools/golang/xenlight/xenlight.go 
> b/tools/golang/xenlight/xenlight.go
> index 6b4f492550..d1d30b63e1 100644
> --- a/tools/golang/xenlight/xenlight.go
> +++ b/tools/golang/xenlight/xenlight.go
> @@ -21,13 +21,13 @@ package xenlight
> #cgo LDFLAGS: -lxenlight -lyajl -lxentoollog
> #include 
> #include 
> +#include 
> 
> static const libxl_childproc_hooks childproc_hooks = { .chldowner = 
> libxl_sigchld_owner_mainloop };
> 
> void xenlight_set_chldproc(libxl_ctx *ctx) {
>   libxl_childproc_setmode(ctx, _hooks, NULL);
> }
> -
> */
> import "C"
> 
> @@ -75,6 +75,10 @@ var libxlErrors = map[Error]string{
>   ErrorFeatureRemoved:   "Feature removed",
> }
> 
> +const (
> + DomidInvalid Domid = ^Domid(0)

Not to be a stickler, but are we positive that this will always result in the 
same value as C.INVALID_DOMID?

There are some functions that will return INVALID_DOMID, or accept 
INVALID_DOMID as an argument.  Users of the `xenlight` package will presumably 
need to compare against this value and/or pass it in.

It seems like we could:

1. Rely on language lawyering to justify our assumption that ^Domid(0) will 
always be the same as C “~0”

2. On marshalling into / out of C, convert C.INVALID_DOMID to DomidInvalid

3. Set Domid = C.INVALID_DOMID

If you’re confident in your language lawyering skills, I could accept #1; but 
I’d prefer a comment with some sort of reference.  But if it were me I’d just 
go with #3; particularly given that this value could theoretically change 
(libxl has a stable API, not a stable ABI).

Everything else looks good.

If you want I could s/~Domid(0)/C.INVALID_DOMID/; myself, add my R-b and check 
it in.

 -George



[PATCH] x86/pvh: Override opt_console_xen earlier

2020-04-27 Thread Andrew Cooper
This allows printk() to work from the start of day, and backtraces from as
early as the IDT is set up.

Signed-off-by: Andrew Cooper 
---
CC: Jan Beulich 
CC: Wei Liu 
CC: Roger Pau Monné 
---
 xen/arch/x86/boot/head.S | 3 +++
 xen/arch/x86/setup.c | 5 -
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/xen/arch/x86/boot/head.S b/xen/arch/x86/boot/head.S
index 153a53f250..150f7f90a2 100644
--- a/xen/arch/x86/boot/head.S
+++ b/xen/arch/x86/boot/head.S
@@ -402,6 +402,9 @@ __pvh_start:
 
 mov %ebx, sym_esi(pvh_start_info_pa)
 
+/* Force xen console.  Will revert to user choice in init code. */
+movb$-1, sym_esi(opt_console_xen)
+
 /* Prepare gdt and segments */
 add %esi, sym_esi(gdt_boot_base)
 lgdtsym_esi(gdt_boot_descr)
diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c
index 885919d5c3..eb56d78c2f 100644
--- a/xen/arch/x86/setup.c
+++ b/xen/arch/x86/setup.c
@@ -728,11 +728,6 @@ void __init noreturn __start_xen(unsigned long mbi_p)
 
 if ( pvh_boot )
 {
-/*
- * Force xen console to be enabled. We will reset it later in console
- * initialisation code.
- */
-opt_console_xen = -1;
 ASSERT(mbi_p == 0);
 pvh_init(, );
 }
-- 
2.11.0




[PATCH] x86/ioemul: Rewrite stub generation

2020-04-27 Thread Andrew Cooper
The logic is completely undocumented and almost impossible to follow.  It
actually uses return oriented programming.  Rewrite it to conform to more
normal call mechanics, and leave a big comment explaining thing.  As well as
the code being easier to follow, it will execute faster as it isn't fighting
the branch predictor.

Move the ioemul_handle_quirk() function pointer from traps.c to
ioport_emulate.c.  There is no reason for it to be in neither of the two
translation units which use it.  Alter the behaviour to return the number of
bytes written into the stub.

Access the addresses of the host/guest helpers with extern const char arrays.
Nothing good will come of C thinking they are regular functions.

Signed-off-by: Andrew Cooper 
--
CC: Jan Beulich 
CC: Wei Liu 
CC: Roger Pau Monné 
---
 xen/arch/x86/ioport_emulate.c  | 11 ++---
 xen/arch/x86/pv/emul-priv-op.c | 91 +++---
 xen/arch/x86/pv/gpr_switch.S   | 37 +
 xen/arch/x86/traps.c   |  3 --
 xen/include/asm-x86/io.h   |  3 +-
 5 files changed, 85 insertions(+), 60 deletions(-)

diff --git a/xen/arch/x86/ioport_emulate.c b/xen/arch/x86/ioport_emulate.c
index 499c1f6056..f7511a9c49 100644
--- a/xen/arch/x86/ioport_emulate.c
+++ b/xen/arch/x86/ioport_emulate.c
@@ -8,7 +8,10 @@
 #include 
 #include 
 
-static bool ioemul_handle_proliant_quirk(
+unsigned int (*ioemul_handle_quirk)(
+u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
+
+static unsigned int ioemul_handle_proliant_quirk(
 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs)
 {
 static const char stub[] = {
@@ -19,18 +22,16 @@ static bool ioemul_handle_proliant_quirk(
 0xa8, 0x80, /*test $0x80, %al */
 0x75, 0xfb, /*jnz 1b  */
 0x9d,   /*popf*/
-0xc3,   /*ret */
 };
 uint16_t port = regs->dx;
 uint8_t value = regs->al;
 
 if ( (opcode != 0xee) || (port != 0xcd4) || !(value & 0x80) )
-return false;
+return 0;
 
 memcpy(io_emul_stub, stub, sizeof(stub));
-BUILD_BUG_ON(IOEMUL_QUIRK_STUB_BYTES < sizeof(stub));
 
-return true;
+return sizeof(stub);
 }
 
 /* This table is the set of system-specific I/O emulation hooks. */
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index e24b84f46a..f150886711 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -54,51 +54,96 @@ struct priv_op_ctxt {
 unsigned int bpmatch;
 };
 
-/* I/O emulation support. Helper routines for, and type of, the stack stub. */
-void host_to_guest_gpr_switch(struct cpu_user_regs *);
-unsigned long guest_to_host_gpr_switch(unsigned long);
+/* I/O emulation helpers.  Use non-standard calling conventions. */
+extern const char load_guest_gprs[], save_guest_gprs[];
 
 typedef void io_emul_stub_t(struct cpu_user_regs *);
 
 static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
   unsigned int port, unsigned int 
bytes)
 {
+/*
+ * Construct a stub for IN/OUT emulation.
+ *
+ * Some platform drivers communicate with the SMM handler using GPRs as a
+ * mailbox.  Therefore, we must perform the emulation with the hardware
+ * domain's registers in view.
+ *
+ * We write a stub of the following form, using the guest load/save
+ * helpers (abnormal calling conventions), and one of several possible
+ * stubs performing the real I/O.
+ */
+static const char prologue[] = {
+0x53,   /* push %rbx */
+0x55,   /* push %rbp */
+0x41, 0x54, /* push %r12 */
+0x41, 0x55, /* push %r13 */
+0x41, 0x56, /* push %r14 */
+0x41, 0x57, /* push %r15 */
+0x57,   /* push %rdi (param for save_guest_gprs) */
+};  /* call load_guest_gprs */
+/*  */
+/* call save_guest_gprs */
+static const char epilogue[] = {
+0x5f,   /* pop %rdi  */
+0x41, 0x5f, /* pop %r15  */
+0x41, 0x5e, /* pop %r14  */
+0x41, 0x5d, /* pop %r13  */
+0x41, 0x5c, /* pop %r12  */
+0x5d,   /* pop %rbp  */
+0x5b,   /* pop %rbx  */
+0xc3,   /* ret   */
+};
+
 struct stubs *this_stubs = _cpu(stubs);
 unsigned long stub_va = this_stubs->addr + STUB_BUF_SIZE / 2;
-long disp;
-bool use_quirk_stub = false;
+unsigned int quirk_bytes = 0;
+char *p;
+
+/* Helpers - Read outer scope but only modify p. */
+#define APPEND_BUFF(b) ({ memcpy(p, b, sizeof(b)); p += sizeof(b); })
+#define APPEND_CALL(f)  \
+({  \
+long disp = (long)(f) - (stub_va + p - ctxt->io_emul_stub + 5); \
+BUG_ON((int32_t)disp != disp);  \
+

[xen-unstable test] 149831: tolerable FAIL

2020-04-27 Thread osstest service owner
flight 149831 xen-unstable real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149831/

Failures :-/ but no regressions.

Tests which are failing intermittently (not blocking):
 test-amd64-amd64-xl-rtds15 guest-saverestore fail in 149829 pass in 149831
 test-armhf-armhf-xl-vhd  11 guest-start  fail in 149829 pass in 149831
 test-armhf-armhf-xl-rtds 16 guest-start/debian.repeat  fail pass in 149829

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-xl-rtds 16 guest-localmigrate   fail  like 149824
 test-amd64-amd64-xl-qemuu-ws16-amd64 17 guest-stopfail like 149829
 test-amd64-i386-xl-qemut-win7-amd64 17 guest-stop fail like 149829
 test-amd64-amd64-xl-qemut-win7-amd64 17 guest-stopfail like 149829
 test-armhf-armhf-libvirt-raw 13 saverestore-support-checkfail  like 149829
 test-armhf-armhf-libvirt 14 saverestore-support-checkfail  like 149829
 test-amd64-i386-xl-qemuu-win7-amd64 17 guest-stop fail like 149829
 test-amd64-amd64-xl-qemut-ws16-amd64 17 guest-stopfail like 149829
 test-amd64-amd64-xl-qemuu-win7-amd64 17 guest-stopfail like 149829
 test-amd64-i386-xl-qemuu-ws16-amd64 17 guest-stop fail like 149829
 test-amd64-i386-xl-pvshim12 guest-start  fail   never pass
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt-xsm  13 migrate-support-checkfail   never pass
 test-amd64-amd64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-amd64-i386-libvirt  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-seattle  14 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-i386-libvirt-qemuu-debianhvm-amd64-xsm 11 migrate-support-check 
fail never pass
 test-amd64-amd64-qemuu-nested-amd 17 debian-hvm-install/l1/l2  fail never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit1  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  13 migrate-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-credit2  14 saverestore-support-checkfail   never pass
 test-arm64-arm64-libvirt-xsm 14 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-thunderx 14 saverestore-support-checkfail   never pass
 test-amd64-amd64-libvirt-vhd 12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-multivcpu 13 migrate-support-checkfail  never pass
 test-armhf-armhf-xl-multivcpu 14 saverestore-support-checkfail  never pass
 test-armhf-armhf-xl-credit2  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-credit1  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-credit2  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-rtds 14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-cubietruck 13 migrate-support-checkfail never pass
 test-armhf-armhf-xl-cubietruck 14 saverestore-support-checkfail never pass
 test-amd64-i386-xl-qemut-ws16-amd64 17 guest-stop  fail never pass
 test-armhf-armhf-xl-vhd  12 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-vhd  13 saverestore-support-checkfail   never pass
 test-arm64-arm64-xl  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl-arndale  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-libvirt-raw 12 migrate-support-checkfail   never pass
 test-armhf-armhf-libvirt 13 migrate-support-checkfail   never pass

version targeted for testing:
 xen  f093b08c47b39da6019421a2b61d40745b3e573b
baseline version:
 xen  f093b08c47b39da6019421a2b61d40745b3e573b

Last test of basis   149831  2020-04-27 01:52:33 Z0 days
Testing same since  (not found) 0 attempts

jobs:
 

RE: [PATCH v2] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Paul Durrant
> -Original Message-
> From: Jürgen Groß 
> Sent: 27 April 2020 12:13
> To: p...@xen.org; xen-devel@lists.xenproject.org
> Cc: 'Paul Durrant' 
> Subject: Re: [PATCH v2] docs/designs: re-work the xenstore migration 
> document...
> 
> On 27.04.20 12:45, Paul Durrant wrote:
> >> -Original Message-
> >> From: Jürgen Groß 
> >> Sent: 27 April 2020 11:37
> >> To: Paul Durrant ; xen-devel@lists.xenproject.org
> >> Cc: Paul Durrant 
> >> Subject: Re: [PATCH v2] docs/designs: re-work the xenstore migration 
> >> document...
> >>
> >> On 27.04.20 09:53, Paul Durrant wrote:
> >>> From: Paul Durrant 
> >>>
> >>> ... to specify a separate migration stream that will also be suitable for
> >>> live update.
> >>>
> >>> The original scope of the document was to support non-cooperative 
> >>> migration
> >>> of guests [1] but, since then, live update of xenstored has been brought 
> >>> into
> >>> scope. Thus it makes more sense to define a separate image format for
> >>> serializing xenstore state that is suitable for both purposes.
> >>>
> >>> The document has been limited to specifying a new image format. The 
> >>> mechanism
> >>> for acquiring the image for live update or migration is not covered as 
> >>> that
> >>> is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It 
> >>> is
> >>> also expected that, when the first implementation of live update or 
> >>> migration
> >>> making use of this specification is committed, that the document is moved 
> >>> from
> >>> docs/designs into docs/specs.
> >>>
> >>> [1] See 
> >>> https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/designs/non-cooperative-
> migration.md
> >>>
> >>> Signed-off-by: Paul Durrant 
> >>> ---
> >>> Juergen Gross 
> >>> Andrew Cooper 
> >>> George Dunlap 
> >>> Ian Jackson 
> >>> Jan Beulich 
> >>> Julien Grall 
> >>> Stefano Stabellini 
> >>> Wei Liu 
> >>
> >> Mind adding CC: before those mail addresses in order to let git add
> >> those to the recipients list?
> >>
> >
> > D'oh... good spot.
> >
> >>>
> >>> v2:
> >>>- Address comments from Juergen
> >>
> >> Not all unfortunately. :-(
> >>
> >
> > OK.
> >
> >>> +### CONNECTION_DATA
> >>>
> >>> -Each WATCH_DATA record specifies a registered watch and is formatted as
> >>> -follows:
> >>> +For live update the image format will contain a `CONNECTION_DATA` record 
> >>> for
> >>> +each connection to xenstore. For migration it will only contain a record 
> >>> for
> >>> +the domain being migrated.
> >>>
> >>>
> >>>```
> >>> -0   1   2   3 octet
> >>> -+---+---+---+---+
> >>> -| WATCH_DATA|
> >>> -+---+
> >>> -| wpath length  |
> >>> -+---+
> >>> -| wpath data|
> >>> -...
> >>> -| pad (0 to 3 octets)   |
> >>> -+---+
> >>> +0   1   2   3   4   5   6   7octet
> >>> ++---+---+---+---+---+---+---+---+
> >>> +| conn-id   | pad   |
> >>> ++---+---+
> >>> +| conn-type | conn-spec
> >>>...
> >>
> >> I asked whether it wouldn't be better to drop the pad and move conn-type
> >> and a 2-byte (unified) flag field at its position. This together ...
> >>
> >>> ++---+---+
> >>> +| data-len  | data
> >>>+---+
> >>> -| token length  |
> >>> -+---+
> >>> -| token data|
> >>>...
> >>> -| pad (0 to 3 octets)   |
> >>> -+---+
> >>>```
> >>>
> >>> -wpath length and token length are specified in octets (excluding the NUL
> >>> -terminator). The wpath should be as described for the `WATCH` operation 
> >>> in
> >>> -[2]. The token is an arbitrary string of octets not containing any NUL
> >>> -values.
> >>>
> >>> +| Field   | Description |
> >>> +|-|-|
> >>> +| `conn-id`   | A non-zero number used to identify this |
> >>> +| | connection in subsequent connection-specific|
> >>> +| | records |
> >>> +| | |
> >>> +| `conn-type` | 0x: shared ring |
> >>> +| | 0x0001: socket  |
> >>> +| | |
> >>> +| `conn-spec` | See below   |
> >>> +| | |
> >>> +| `data-len`  | The length (in octets) of any pending data not  |
> >>> +| | yet written to the 

[PATCH v7 10/11] x86emul: support RDPRU

2020-04-27 Thread Jan Beulich
While the PM doesn't say so, this assumes that the MPERF value read this
way gets scaled similarly to its reading through RDMSR.

Also introduce the SVM related constants at this occasion.

Signed-off-by: Jan Beulich 
---
v6: Re-base.
v5: The CPUID field used is just 8 bits wide.
v4: Add GENERAL2_INTERCEPT_RDPRU and VMEXIT_RDPRU enumerators. Fold
handling of out of bounds indexes into switch(). Avoid
recalculate_misc() clobbering what recalculate_cpu_policy() has
done. Re-base.
v3: New.
---
RFC: Andrew promised to take care of the CPUID side of this; re-base
 over his work once available.

--- a/tools/libxl/libxl_cpuid.c
+++ b/tools/libxl/libxl_cpuid.c
@@ -264,6 +264,7 @@ int libxl_cpuid_parse_config(libxl_cpuid
 
 {"clzero",   0x8008, NA, CPUID_REG_EBX,  0,  1},
 {"rstr-fp-err-ptrs", 0x8008, NA, CPUID_REG_EBX, 2, 1},
+{"rdpru",0x8008, NA, CPUID_REG_EBX,  4,  1},
 {"wbnoinvd", 0x8008, NA, CPUID_REG_EBX,  9,  1},
 {"ibpb", 0x8008, NA, CPUID_REG_EBX, 12,  1},
 {"ppin", 0x8008, NA, CPUID_REG_EBX, 23,  1},
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -148,6 +148,8 @@ static const char *const str_e8b[32] =
 [ 0] = "clzero",
 [ 2] = "rstr-fp-err-ptrs",
 
+[ 4] = "rdpru",
+
 /* [ 8] */[ 9] = "wbnoinvd",
 
 [12] = "ibpb",
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -683,6 +683,13 @@ static int read_msr(
 {
 switch ( reg )
 {
+case 0x00e8: /* APERF */
+case 0xc0e8: /* APERF_RD_ONLY */
+#define APERF_LO_VALUE 0xAEAEAEAE
+#define APERF_HI_VALUE 0xEAEAEAEA
+*val = ((uint64_t)APERF_HI_VALUE << 32) | APERF_LO_VALUE;
+return X86EMUL_OKAY;
+
 case 0xc080: /* EFER */
 *val = ctxt->addr_size > 32 ? 0x500 /* LME|LMA */ : 0;
 return X86EMUL_OKAY;
@@ -2421,6 +2428,30 @@ int main(int argc, char **argv)
 else
 printf("skipped\n");
 
+printf("%-40s", "Testing rdpru...");
+instr[0] = 0x0f; instr[1] = 0x01; instr[2] = 0xfd;
+regs.eip = (unsigned long)[0];
+regs.ecx = 1;
+regs.eflags = EFLAGS_ALWAYS_SET;
+rc = x86_emulate(, );
+if ( (rc != X86EMUL_OKAY) ||
+ (regs.eax != APERF_LO_VALUE) || (regs.edx != APERF_HI_VALUE) ||
+ !(regs.eflags & X86_EFLAGS_CF) ||
+ (regs.eip != (unsigned long)[3]) )
+goto fail;
+if ( ctxt.cpuid->extd.rdpru_max < 0x )
+{
+regs.eip = (unsigned long)[0];
+regs.ecx = ctxt.cpuid->extd.rdpru_max + 1;
+regs.eflags = EFLAGS_ALWAYS_SET | X86_EFLAGS_CF;
+rc = x86_emulate(, );
+if ( (rc != X86EMUL_OKAY) || regs.eax || regs.edx ||
+ (regs.eflags & X86_EFLAGS_CF) ||
+ (regs.eip != (unsigned long)[3]) )
+goto fail;
+}
+printf("okay\n");
+
 printf("%-40s", "Testing fnstenv 4(%ecx)...");
 if ( stack_exec && cpu_has_fpu )
 {
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -84,6 +84,8 @@ bool emul_test_init(void)
 cp.feat.avx512pf = cp.feat.avx512f;
 cp.feat.rdpid = true;
 cp.extd.clzero = true;
+cp.extd.rdpru = true;
+cp.extd.rdpru_max = 1;
 
 if ( cpu_has_xsave )
 {
@@ -156,11 +158,11 @@ int emul_test_cpuid(
 }
 
 /*
- * The emulator doesn't itself use CLZERO, so we can always run the
+ * The emulator doesn't itself use CLZERO/RDPRU, so we can always run the
  * respective test(s).
  */
 if ( leaf == 0x8008 )
-res->b |= 1U << 0;
+res->b |= (1U << 0) | (1U << 4);
 
 return X86EMUL_OKAY;
 }
--- a/xen/arch/x86/cpuid.c
+++ b/xen/arch/x86/cpuid.c
@@ -243,8 +243,6 @@ static void recalculate_misc(struct cpui
 /* Most of Power/RAS hidden from guests. */
 p->extd.raw[0x7].a = p->extd.raw[0x7].b = p->extd.raw[0x7].c = 0;
 
-p->extd.raw[0x8].d = 0;
-
 switch ( p->x86_vendor )
 {
 case X86_VENDOR_INTEL:
@@ -263,6 +261,7 @@ static void recalculate_misc(struct cpui
 
 p->extd.raw[0x8].a &= 0x;
 p->extd.raw[0x8].c = 0;
+p->extd.raw[0x8].d = 0;
 break;
 
 case X86_VENDOR_AMD:
@@ -281,6 +280,7 @@ static void recalculate_misc(struct cpui
 
 p->extd.raw[0x8].a &= 0x; /* GuestMaxPhysAddr hidden. */
 p->extd.raw[0x8].c &= 0x0003f0ff;
+p->extd.raw[0x8].d &= 0x;
 
 p->extd.raw[0x9] = EMPTY_LEAF;
 
@@ -643,6 +643,11 @@ void recalculate_cpuid_policy(struct dom
 
 p->extd.maxlinaddr = p->extd.lm ? 48 : 32;
 
+if ( p->extd.rdpru )
+p->extd.rdpru_max = min(p->extd.rdpru_max, max->extd.rdpru_max);
+else
+p->extd.rdpru_max = 0;
+
 recalculate_xstate(p);
 recalculate_misc(p);
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1967,6 +1967,7 @@ amd_like(const struct 

[PATCH v7 11/11] x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads

2020-04-27 Thread Jan Beulich
If the hardware can handle accesses, we should allow it to do so. This
way we can expose EFRO to HVM guests, and "all" that's left for exposing
APERF/MPERF is to figure out how to handle writes to these MSRs. (Note
that the leaf 6 guest CPUID checks will evaluate to false for now, as
recalculate_misc() zaps the entire leaf.)

For TSC the intercepts are made mirror the RDTSC ones.

Signed-off-by: Jan Beulich 
Reviewed-by: Kevin Tian 
---
v4: Make TSC intercepts mirror RDTSC ones. Re-base.
v3: New.

--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -595,6 +595,7 @@ static void svm_cpuid_policy_changed(str
 struct vmcb_struct *vmcb = svm->vmcb;
 const struct cpuid_policy *cp = v->domain->arch.cpuid;
 u32 bitmap = vmcb_get_exception_intercepts(vmcb);
+unsigned int mode;
 
 if ( opt_hvm_fep ||
  (v->domain->arch.cpuid->x86_vendor != boot_cpu_data.x86_vendor) )
@@ -607,6 +608,17 @@ static void svm_cpuid_policy_changed(str
 /* Give access to MSR_PRED_CMD if the guest has been told about it. */
 svm_intercept_msr(v, MSR_PRED_CMD,
   cp->extd.ibpb ? MSR_INTERCEPT_NONE : MSR_INTERCEPT_RW);
+
+/* Allow direct reads from APERF/MPERF if permitted by the policy. */
+mode = cp->basic.raw[6].c & CPUID6_ECX_APERFMPERF_CAPABILITY
+   ? MSR_INTERCEPT_WRITE : MSR_INTERCEPT_RW;
+svm_intercept_msr(v, MSR_IA32_APERF, mode);
+svm_intercept_msr(v, MSR_IA32_MPERF, mode);
+
+/* Allow direct access to their r/o counterparts if permitted. */
+mode = cp->extd.efro ? MSR_INTERCEPT_NONE : MSR_INTERCEPT_RW;
+svm_intercept_msr(v, MSR_APERF_RD_ONLY, mode);
+svm_intercept_msr(v, MSR_MPERF_RD_ONLY, mode);
 }
 
 void svm_sync_vmcb(struct vcpu *v, enum vmcb_sync_state new_state)
@@ -860,7 +872,10 @@ static void svm_set_rdtsc_exiting(struct
 {
 general1_intercepts |= GENERAL1_INTERCEPT_RDTSC;
 general2_intercepts |= GENERAL2_INTERCEPT_RDTSCP;
+svm_enable_intercept_for_msr(v, MSR_IA32_TSC);
 }
+else
+svm_intercept_msr(v, MSR_IA32_TSC, MSR_INTERCEPT_WRITE);
 
 vmcb_set_general1_intercepts(vmcb, general1_intercepts);
 vmcb_set_general2_intercepts(vmcb, general2_intercepts);
--- a/xen/arch/x86/hvm/svm/vmcb.c
+++ b/xen/arch/x86/hvm/svm/vmcb.c
@@ -108,6 +108,7 @@ static int construct_vmcb(struct vcpu *v
 {
 vmcb->_general1_intercepts |= GENERAL1_INTERCEPT_RDTSC;
 vmcb->_general2_intercepts |= GENERAL2_INTERCEPT_RDTSCP;
+svm_intercept_msr(v, MSR_IA32_TSC, MSR_INTERCEPT_WRITE);
 }
 
 /* Guest segment limits. */
--- a/xen/arch/x86/hvm/vmx/vmcs.c
+++ b/xen/arch/x86/hvm/vmx/vmcs.c
@@ -1141,8 +1141,13 @@ static int construct_vmcs(struct vcpu *v
 vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_CS, VMX_MSR_RW);
 vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_ESP, VMX_MSR_RW);
 vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_EIP, VMX_MSR_RW);
+
+if ( !(v->arch.hvm.vmx.exec_control & CPU_BASED_RDTSC_EXITING) )
+vmx_clear_msr_intercept(v, MSR_IA32_TSC, VMX_MSR_R);
+
 if ( paging_mode_hap(d) && (!is_iommu_enabled(d) || iommu_snoop) )
 vmx_clear_msr_intercept(v, MSR_IA32_CR_PAT, VMX_MSR_RW);
+
 if ( (vmexit_ctl & VM_EXIT_CLEAR_BNDCFGS) &&
  (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) )
 vmx_clear_msr_intercept(v, MSR_IA32_BNDCFGS, VMX_MSR_RW);
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -585,6 +585,18 @@ static void vmx_cpuid_policy_changed(str
 vmx_clear_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
 else
 vmx_set_msr_intercept(v, MSR_FLUSH_CMD, VMX_MSR_RW);
+
+/* Allow direct reads from APERF/MPERF if permitted by the policy. */
+if ( cp->basic.raw[6].c & CPUID6_ECX_APERFMPERF_CAPABILITY )
+{
+vmx_clear_msr_intercept(v, MSR_IA32_APERF, VMX_MSR_R);
+vmx_clear_msr_intercept(v, MSR_IA32_MPERF, VMX_MSR_R);
+}
+else
+{
+vmx_set_msr_intercept(v, MSR_IA32_APERF, VMX_MSR_R);
+vmx_set_msr_intercept(v, MSR_IA32_MPERF, VMX_MSR_R);
+}
 }
 
 int vmx_guest_x86_mode(struct vcpu *v)
@@ -1250,7 +1262,12 @@ static void vmx_set_rdtsc_exiting(struct
 vmx_vmcs_enter(v);
 v->arch.hvm.vmx.exec_control &= ~CPU_BASED_RDTSC_EXITING;
 if ( enable )
+{
 v->arch.hvm.vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
+vmx_set_msr_intercept(v, MSR_IA32_TSC, VMX_MSR_R);
+}
+else
+vmx_clear_msr_intercept(v, MSR_IA32_TSC, VMX_MSR_R);
 vmx_update_cpu_exec_control(v);
 vmx_vmcs_exit(v);
 }
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -245,7 +245,7 @@ XEN_CPUFEATURE(ENQCMD,6*32+29) /
 
 /* AMD-defined CPU features, CPUID level 0x8007.edx, word 7 */
 XEN_CPUFEATURE(ITSC,  7*32+ 8) /*   Invariant TSC */
-XEN_CPUFEATURE(EFRO,  7*32+10) /*   APERF/MPERF Read Only interface */

[PATCH v7 09/11] x86/HVM: scale MPERF values reported to guests (on AMD)

2020-04-27 Thread Jan Beulich
AMD's PM specifies that MPERF (and its r/o counterpart) reads are
affected by the TSC ratio. Hence when processing such reads in software
we too should scale the values. While we don't currently (yet) expose
the underlying feature flags, besides us allowing the MSRs to be read
nevertheless, RDPRU is going to expose the values even to user space.

Furthermore, due to the not exposed feature flags, this change has the
effect of making properly inaccessible (for reads) the two MSRs.

Note that writes to MPERF (and APERF) continue to be unsupported.

Signed-off-by: Jan Beulich 
---
v3: New.
---
I did consider whether to put the code in guest_rdmsr() instead, but
decided that it's better to have it next to TSC handling.

--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -3478,6 +3478,22 @@ int hvm_msr_read_intercept(unsigned int
 *msr_content = v->arch.hvm.msr_tsc_adjust;
 break;
 
+case MSR_MPERF_RD_ONLY:
+if ( !d->arch.cpuid->extd.efro )
+{
+goto gp_fault;
+
+case MSR_IA32_MPERF:
+if ( !(d->arch.cpuid->basic.raw[6].c &
+   CPUID6_ECX_APERFMPERF_CAPABILITY) )
+goto gp_fault;
+}
+if ( rdmsr_safe(msr, *msr_content) )
+goto gp_fault;
+if ( d->arch.cpuid->x86_vendor & (X86_VENDOR_AMD | X86_VENDOR_HYGON) )
+*msr_content = hvm_get_guest_tsc_fixed(v, *msr_content);
+break;
+
 case MSR_APIC_BASE:
 *msr_content = vcpu_vlapic(v)->hw.apic_base_msr;
 break;
--- a/xen/include/asm-x86/msr-index.h
+++ b/xen/include/asm-x86/msr-index.h
@@ -405,6 +405,9 @@
 #define MSR_IA32_MPERF 0x00e7
 #define MSR_IA32_APERF 0x00e8
 
+#define MSR_MPERF_RD_ONLY  0xc0e7
+#define MSR_APERF_RD_ONLY  0xc0e8
+
 #define MSR_IA32_THERM_CONTROL 0x019a
 #define MSR_IA32_THERM_INTERRUPT   0x019b
 #define MSR_IA32_THERM_STATUS  0x019c




[PATCH v7 08/11] x86emul: support FXSAVE/FXRSTOR

2020-04-27 Thread Jan Beulich
Note that FPU selector handling as well as MXCSR mask saving for now
does not honor differences between host and guest visible featuresets.

Since operation of the insns on CR4.OSFXSR is implementation dependent,
the easiest solution is used: Simply don't look at the bit in the first
place.

Signed-off-by: Jan Beulich 
---
v7: New.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -767,6 +767,12 @@ static void zap_fpsel(unsigned int *env,
 }
 }
 
+static void zap_xfpsel(unsigned int *env)
+{
+env[3] &= ~0x;
+env[5] &= ~0x;
+}
+
 #ifdef __x86_64__
 # define STKVAL_DISP 64
 static const struct {
@@ -2517,6 +2523,91 @@ int main(int argc, char **argv)
 else
 printf("skipped\n");
 
+printf("%-40s", "Testing fxsave 4(%ecx)...");
+if ( stack_exec && cpu_has_fxsr )
+{
+const uint16_t nine = 9;
+
+memset(res + 0x80, 0xcc, 0x400);
+if ( cpu_has_sse2 )
+asm volatile ( "pcmpeqd %xmm7, %xmm7\n\t"
+   "pxor %xmm6, %xmm6\n\t"
+   "psubw %xmm7, %xmm6" );
+asm volatile ( "fninit\n\t"
+   "fld1\n\t"
+   "fidivs %1\n\t"
+   "fxsave %0"
+   : "=m" (res[0x100]) : "m" (nine) : "memory" );
+zap_xfpsel([0x100]);
+instr[0] = 0x0f; instr[1] = 0xae; instr[2] = 0x41; instr[3] = 0x04;
+regs.eip = (unsigned long)[0];
+regs.ecx = (unsigned long)(res + 0x7f);
+memset(res + 0x100 + 0x74, 0x33, 0x30);
+memset(res + 0x80 + 0x74, 0x33, 0x30);
+rc = x86_emulate(, );
+zap_xfpsel([0x80]);
+if ( (rc != X86EMUL_OKAY) ||
+ memcmp(res + 0x80, res + 0x100, 0x200) ||
+ (regs.eip != (unsigned long)[4]) )
+goto fail;
+printf("okay\n");
+}
+else
+printf("skipped\n");
+
+printf("%-40s", "Testing fxrstor -4(%ecx)...");
+if ( stack_exec && cpu_has_fxsr )
+{
+const uint16_t eleven = 11;
+
+memset(res + 0x80, 0xcc, 0x400);
+asm volatile ( "fxsave %0" : "=m" (res[0x80]) :: "memory" );
+zap_xfpsel([0x80]);
+if ( cpu_has_sse2 )
+asm volatile ( "pxor %xmm7, %xmm6\n\t"
+   "pxor %xmm7, %xmm3\n\t"
+   "pxor %xmm7, %xmm0\n\t"
+   "pxor %xmm7, %xmm7" );
+asm volatile ( "fninit\n\t"
+   "fld1\n\t"
+   "fidivs %0\n\t"
+   :: "m" (eleven) );
+instr[0] = 0x0f; instr[1] = 0xae; instr[2] = 0x49; instr[3] = 0xfc;
+regs.eip = (unsigned long)[0];
+regs.ecx = (unsigned long)(res + 0x81);
+rc = x86_emulate(, );
+asm volatile ( "fxsave %0" : "=m" (res[0x100]) :: "memory" );
+if ( (rc != X86EMUL_OKAY) ||
+ memcmp(res + 0x100, res + 0x80, 0x200) ||
+ (regs.eip != (unsigned long)[4]) )
+goto fail;
+printf("okay\n");
+}
+else
+printf("skipped\n");
+
+#ifdef __x86_64__
+printf("%-40s", "Testing fxsaveq 8(%edx)...");
+if ( stack_exec && cpu_has_fxsr )
+{
+memset(res + 0x80, 0xcc, 0x400);
+asm volatile ( "fxsaveq %0" : "=m" (res[0x100]) :: "memory" );
+instr[0] = 0x48; instr[1] = 0x0f; instr[2] = 0xae; instr[3] = 0x42; 
instr[4] = 0x08;
+regs.eip = (unsigned long)[0];
+regs.edx = (unsigned long)(res + 0x7e);
+memset(res + 0x100 + 0x74, 0x33, 0x30);
+memset(res + 0x80 + 0x74, 0x33, 0x30);
+rc = x86_emulate(, );
+if ( (rc != X86EMUL_OKAY) ||
+ memcmp(res + 0x80, res + 0x100, 0x200) ||
+ (regs.eip != (unsigned long)[5]) )
+goto fail;
+printf("okay\n");
+}
+else
+printf("skipped\n");
+#endif
+
 printf("%-40s", "Testing movq %mm3,(%ecx)...");
 if ( stack_exec && cpu_has_mmx )
 {
--- a/tools/tests/x86_emulator/x86-emulate.c
+++ b/tools/tests/x86_emulator/x86-emulate.c
@@ -30,6 +30,13 @@ struct cpuid_policy cp;
 static char fpu_save_area[4096] __attribute__((__aligned__((64;
 static bool use_xsave;
 
+/*
+ * Re-use the area above also as scratch space for the emulator itself.
+ * (When debugging the emulator, care needs to be taken when inserting
+ * printf() or alike function calls into regions using this.)
+ */
+#define FXSAVE_AREA ((struct x86_fxsr *)fpu_save_area)
+
 void emul_save_fpu_state(void)
 {
 if ( use_xsave )
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -860,6 +860,11 @@ struct x86_emulate_state {
 blk_fld, /* FLDENV, FRSTOR */
 blk_fst, /* FNSTENV, FNSAVE */
 #endif
+#if !defined(X86EMUL_NO_FPU) && !defined(X86EMUL_NO_MMX) && \
+!defined(X86EMUL_NO_SIMD)
+blk_fxrstor,
+blk_fxsave,
+#endif
 blk_movdir,
 } blk;
 

[PATCH v7 07/11] x86emul: support FLDENV and FRSTOR

2020-04-27 Thread Jan Beulich
While the Intel SDM claims that FRSTOR itself may raise #MF upon
completion, this was confirmed by Intel to be a doc error which will be
corrected in due course; behavior is like FLDENV, and like old hard copy
manuals describe it. Otherwise we'd have to emulate the insn by filling
st(N) in suitable order, followed by FLDENV.

Signed-off-by: Jan Beulich 
---
v7: New.

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -2442,6 +2442,27 @@ int main(int argc, char **argv)
 else
 printf("skipped\n");
 
+printf("%-40s", "Testing fldenv 8(%edx)...");
+if ( stack_exec && cpu_has_fpu )
+{
+asm volatile ( "fnstenv %0\n\t"
+   "fninit"
+   : "=m" (res[2]) :: "memory" );
+zap_fpsel([2], true);
+instr[0] = 0xd9; instr[1] = 0x62; instr[2] = 0x08;
+regs.eip = (unsigned long)[0];
+regs.edx = (unsigned long)res;
+rc = x86_emulate(, );
+asm volatile ( "fnstenv %0" : "=m" (res[9]) :: "memory" );
+if ( (rc != X86EMUL_OKAY) ||
+ memcmp(res + 2, res + 9, 28) ||
+ (regs.eip != (unsigned long)[3]) )
+goto fail;
+printf("okay\n");
+}
+else
+printf("skipped\n");
+
 printf("%-40s", "Testing 16-bit fnsave (%ecx)...");
 if ( stack_exec && cpu_has_fpu )
 {
@@ -2468,6 +2489,31 @@ int main(int argc, char **argv)
 goto fail;
 printf("okay\n");
 }
+else
+printf("skipped\n");
+
+printf("%-40s", "Testing frstor (%edx)...");
+if ( stack_exec && cpu_has_fpu )
+{
+const uint16_t seven = 7;
+
+asm volatile ( "fninit\n\t"
+   "fld1\n\t"
+   "fidivs %1\n\t"
+   "fnsave %0\n\t"
+   : "=" (res[0]) : "m" (seven) : "memory" );
+zap_fpsel([0], true);
+instr[0] = 0xdd; instr[1] = 0x22;
+regs.eip = (unsigned long)[0];
+regs.edx = (unsigned long)res;
+rc = x86_emulate(, );
+asm volatile ( "fnsave %0" : "=m" (res[27]) :: "memory" );
+if ( (rc != X86EMUL_OKAY) ||
+ memcmp(res, res + 27, 108) ||
+ (regs.eip != (unsigned long)[2]) )
+goto fail;
+printf("okay\n");
+}
 else
 printf("skipped\n");
 
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -857,6 +857,7 @@ struct x86_emulate_state {
 blk_NONE,
 blk_enqcmd,
 #ifndef X86EMUL_NO_FPU
+blk_fld, /* FLDENV, FRSTOR */
 blk_fst, /* FNSTENV, FNSAVE */
 #endif
 blk_movdir,
@@ -4948,21 +4949,14 @@ x86_emulate(
 dst.bytes = 4;
 emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
 break;
-case 4: /* fldenv - TODO */
-state->fpu_ctrl = true;
-goto unimplemented_insn;
-case 5: /* fldcw m2byte */
-state->fpu_ctrl = true;
-fpu_memsrc16:
-if ( (rc = ops->read(ea.mem.seg, ea.mem.off, ,
- 2, ctxt)) != X86EMUL_OKAY )
-goto done;
-emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
-break;
+case 4: /* fldenv */
+/* Raise #MF now if there are pending unmasked exceptions. */
+emulate_fpu_insn_stub(0xd9, 0xd0 /* fnop */);
+/* fall through */
 case 6: /* fnstenv */
 fail_if(!ops->blk);
-state->blk = blk_fst;
-/* REX is meaningless for this insn by this point. */
+state->blk = modrm_reg & 2 ? blk_fst : blk_fld;
+/* REX is meaningless for these insns by this point. */
 rex_prefix = in_protmode(ctxt, ops);
 if ( (rc = ops->blk(ea.mem.seg, ea.mem.off, NULL,
 op_bytes > 2 ? sizeof(struct x87_env32)
@@ -4972,6 +4966,14 @@ x86_emulate(
 goto done;
 state->fpu_ctrl = true;
 break;
+case 5: /* fldcw m2byte */
+state->fpu_ctrl = true;
+fpu_memsrc16:
+if ( (rc = ops->read(ea.mem.seg, ea.mem.off, ,
+ 2, ctxt)) != X86EMUL_OKAY )
+goto done;
+emulate_fpu_insn_memsrc(b, modrm_reg & 7, src.val);
+break;
 case 7: /* fnstcw m2byte */
 state->fpu_ctrl = true;
 fpu_memdst16:
@@ -5124,13 +5126,14 @@ x86_emulate(
 dst.bytes = 8;
 emulate_fpu_insn_memdst(b, modrm_reg & 7, dst.val);
 break;
-case 4: /* frstor - TODO */
-state->fpu_ctrl = true;
-goto unimplemented_insn;
+case 4: /* frstor 

[PATCH v7 06/11] x86emul: support FNSTENV and FNSAVE

2020-04-27 Thread Jan Beulich
To avoid introducing another boolean into emulator state, the
rex_prefix field gets (ab)used to convey the real/VM86 vs protected mode
info (affecting structure layout, albeit not size) to x86_emul_blk().

Signed-off-by: Jan Beulich 
---
TBD: The full 16-bit padding fields in the 32-bit structures get filled
 with all ones by modern CPUs (i.e. other than the comment says for
 FIP and FDP). We may want to mirror this as well (for the real mode
 variant), even if those fields' contents are unspecified.
---
v7: New.

--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -120,6 +120,7 @@ static inline bool xcr0_mask(uint64_t ma
 }
 
 #define cache_line_size() (cp.basic.clflush_size * 8)
+#define cpu_has_fpucp.basic.fpu
 #define cpu_has_mmxcp.basic.mmx
 #define cpu_has_fxsr   cp.basic.fxsr
 #define cpu_has_ssecp.basic.sse
--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -748,6 +748,25 @@ static struct x86_emulate_ops emulops =
 
 #define MMAP_ADDR 0x10
 
+/*
+ * 64-bit OSes may not (be able to) properly restore the two selectors in
+ * the FPU environment. Zap them so that memcmp() on two saved images will
+ * work regardless of whether a context switch occurred in the middle.
+ */
+static void zap_fpsel(unsigned int *env, bool is_32bit)
+{
+if ( is_32bit )
+{
+env[4] &= ~0x;
+env[6] &= ~0x;
+}
+else
+{
+env[2] &= ~0x;
+env[3] &= ~0x;
+}
+}
+
 #ifdef __x86_64__
 # define STKVAL_DISP 64
 static const struct {
@@ -2394,6 +2413,62 @@ int main(int argc, char **argv)
 printf("okay\n");
 }
 else
+printf("skipped\n");
+
+printf("%-40s", "Testing fnstenv 4(%ecx)...");
+if ( stack_exec && cpu_has_fpu )
+{
+const uint16_t three = 3;
+
+asm volatile ( "fninit\n\t"
+   "fld1\n\t"
+   "fidivs %1\n\t"
+   "fstenv %0"
+   : "=m" (res[9]) : "m" (three) : "memory" );
+zap_fpsel([9], true);
+instr[0] = 0xd9; instr[1] = 0x71; instr[2] = 0x04;
+regs.eip = (unsigned long)[0];
+regs.ecx = (unsigned long)res;
+res[8] = 0xaa55aa55;
+rc = x86_emulate(, );
+zap_fpsel([1], true);
+if ( (rc != X86EMUL_OKAY) ||
+ memcmp(res + 1, res + 9, 28) ||
+ res[8] != 0xaa55aa55 ||
+ (regs.eip != (unsigned long)[3]) )
+goto fail;
+printf("okay\n");
+}
+else
+printf("skipped\n");
+
+printf("%-40s", "Testing 16-bit fnsave (%ecx)...");
+if ( stack_exec && cpu_has_fpu )
+{
+const uint16_t five = 5;
+
+asm volatile ( "fninit\n\t"
+   "fld1\n\t"
+   "fidivs %1\n\t"
+   "fsaves %0"
+   : "=m" (res[25]) : "m" (five) : "memory" );
+zap_fpsel([25], false);
+asm volatile ( "frstors %0" :: "m" (res[25]) : "memory" );
+instr[0] = 0x66; instr[1] = 0xdd; instr[2] = 0x31;
+regs.eip = (unsigned long)[0];
+regs.ecx = (unsigned long)res;
+res[23] = 0xaa55aa55;
+res[24] = 0xaa55aa55;
+rc = x86_emulate(, );
+if ( (rc != X86EMUL_OKAY) ||
+ memcmp(res, res + 25, 94) ||
+ (res[23] >> 16) != 0xaa55 ||
+ res[24] != 0xaa55aa55 ||
+ (regs.eip != (unsigned long)[3]) )
+goto fail;
+printf("okay\n");
+}
+else
 printf("skipped\n");
 
 printf("%-40s", "Testing movq %mm3,(%ecx)...");
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -856,6 +856,9 @@ struct x86_emulate_state {
 enum {
 blk_NONE,
 blk_enqcmd,
+#ifndef X86EMUL_NO_FPU
+blk_fst, /* FNSTENV, FNSAVE */
+#endif
 blk_movdir,
 } blk;
 uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -897,6 +900,50 @@ struct x86_emulate_state {
 #define PTR_POISON NULL /* 32-bit builds are for user-space, so NULL is OK. */
 #endif
 
+#ifndef X86EMUL_NO_FPU
+struct x87_env16 {
+uint16_t fcw;
+uint16_t fsw;
+uint16_t ftw;
+union {
+struct {
+uint16_t fip_lo;
+uint16_t fop:11, :1, fip_hi:4;
+uint16_t fdp_lo;
+uint16_t :12, fdp_hi:4;
+} real;
+struct {
+uint16_t fip;
+uint16_t fcs;
+uint16_t fdp;
+uint16_t fds;
+} prot;
+} mode;
+};
+
+struct x87_env32 {
+uint32_t fcw:16, :16;
+uint32_t fsw:16, :16;
+uint32_t ftw:16, :16;
+union {
+struct {
+/* some CPUs/FPUs also store the full FIP here */
+uint32_t fip_lo:16, :16;
+uint32_t fop:11, :1, fip_hi:16, :4;
+/* some CPUs/FPUs also store the full FDP here */
+   

[PATCH v7 05/11] x86emul: support X{SUS,RES}LDTRK

2020-04-27 Thread Jan Beulich
There's nothing to be done by the emulator, as we unconditionally abort
any XBEGIN.

Signed-off-by: Jan Beulich 
---
v6: New.

--- a/tools/libxl/libxl_cpuid.c
+++ b/tools/libxl/libxl_cpuid.c
@@ -208,6 +208,7 @@ int libxl_cpuid_parse_config(libxl_cpuid
 {"avx512-vnni",  0x0007,  0, CPUID_REG_ECX, 11,  1},
 {"avx512-bitalg",0x0007,  0, CPUID_REG_ECX, 12,  1},
 {"avx512-vpopcntdq",0x0007,0,CPUID_REG_ECX, 14,  1},
+{"tsxldtrk", 0x0007,  0, CPUID_REG_ECX, 16,  1},
 {"rdpid",0x0007,  0, CPUID_REG_ECX, 22,  1},
 {"cldemote", 0x0007,  0, CPUID_REG_ECX, 25,  1},
 
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -128,6 +128,7 @@ static const char *const str_7c0[32] =
 [10] = "vpclmulqdq",   [11] = "avx512_vnni",
 [12] = "avx512_bitalg",
 [14] = "avx512_vpopcntdq",
+[16] = "tsxldtrk",
 
 [22] = "rdpid",
 /* 24 */   [25] = "cldemote",
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1921,6 +1921,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_avx512_vnni() (ctxt->cpuid->feat.avx512_vnni)
 #define vcpu_has_avx512_bitalg() (ctxt->cpuid->feat.avx512_bitalg)
 #define vcpu_has_avx512_vpopcntdq() (ctxt->cpuid->feat.avx512_vpopcntdq)
+#define vcpu_has_tsxldtrk()(ctxt->cpuid->feat.tsxldtrk)
 #define vcpu_has_rdpid()   (ctxt->cpuid->feat.rdpid)
 #define vcpu_has_movdiri() (ctxt->cpuid->feat.movdiri)
 #define vcpu_has_movdir64b()   (ctxt->cpuid->feat.movdir64b)
@@ -5668,6 +5669,20 @@ x86_emulate(
 host_and_vcpu_must_have(serialize);
 asm volatile ( ".byte 0x0f, 0x01, 0xe8" );
 break;
+case vex_f2: /* xsusldtrk */
+vcpu_must_have(tsxldtrk);
+break;
+default:
+goto unimplemented_insn;
+}
+break;
+
+case 0xe9:
+switch ( vex.pfx )
+{
+case vex_f2: /* xresldtrk */
+vcpu_must_have(tsxldtrk);
+break;
 default:
 goto unimplemented_insn;
 }
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -236,6 +236,7 @@ XEN_CPUFEATURE(VPCLMULQDQ,6*32+10) /
 XEN_CPUFEATURE(AVX512_VNNI,   6*32+11) /*A  Vector Neural Network Instrs */
 XEN_CPUFEATURE(AVX512_BITALG, 6*32+12) /*A  Support for VPOPCNT[B,W] and 
VPSHUFBITQMB */
 XEN_CPUFEATURE(AVX512_VPOPCNTDQ, 6*32+14) /*A  POPCNT for vectors of DW/QW */
+XEN_CPUFEATURE(TSXLDTRK,  6*32+16) /*A  TSX load tracking suspend/resume 
insns */
 XEN_CPUFEATURE(RDPID, 6*32+22) /*A  RDPID instruction */
 XEN_CPUFEATURE(CLDEMOTE,  6*32+25) /*A  CLDEMOTE instruction */
 XEN_CPUFEATURE(MOVDIRI,   6*32+27) /*A  MOVDIRI instruction */
--- a/xen/tools/gen-cpuid.py
+++ b/xen/tools/gen-cpuid.py
@@ -284,6 +284,9 @@ def crunch_numbers(state):
 # as dependent features simplifies Xen's logic, and prevents the guest
 # from seeing implausible configurations.
 IBRSB: [STIBP, SSBD],
+
+# In principle the TSXLDTRK insns could also be considered independent.
+RTM: [TSXLDTRK],
 }
 
 deep_features = tuple(sorted(deps.keys()))




[PATCH v7 04/11] x86emul: support SERIALIZE

2020-04-27 Thread Jan Beulich
... enabling its use by all guest kinds at the same time.

Signed-off-by: Jan Beulich 
---
v7: Re-base.
v6: New.

--- a/tools/libxl/libxl_cpuid.c
+++ b/tools/libxl/libxl_cpuid.c
@@ -214,6 +214,7 @@ int libxl_cpuid_parse_config(libxl_cpuid
 {"avx512-4vnniw",0x0007,  0, CPUID_REG_EDX,  2,  1},
 {"avx512-4fmaps",0x0007,  0, CPUID_REG_EDX,  3,  1},
 {"md-clear", 0x0007,  0, CPUID_REG_EDX, 10,  1},
+{"serialize",0x0007,  0, CPUID_REG_EDX, 14,  1},
 {"cet-ibt",  0x0007,  0, CPUID_REG_EDX, 20,  1},
 {"ibrsb",0x0007,  0, CPUID_REG_EDX, 26,  1},
 {"stibp",0x0007,  0, CPUID_REG_EDX, 27,  1},
--- a/tools/misc/xen-cpuid.c
+++ b/tools/misc/xen-cpuid.c
@@ -161,6 +161,7 @@ static const char *const str_7d0[32] =
 
 [10] = "md-clear",
 /* 12 */[13] = "tsx-force-abort",
+[14] = "serialize",
 
 [18] = "pconfig",
 [20] = "cet-ibt",
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -158,6 +158,7 @@ static inline bool xcr0_mask(uint64_t ma
 #define cpu_has_movdir64b  cp.feat.movdir64b
 #define cpu_has_avx512_4vnniw (cp.feat.avx512_4vnniw && xcr0_mask(0xe6))
 #define cpu_has_avx512_4fmaps (cp.feat.avx512_4fmaps && xcr0_mask(0xe6))
+#define cpu_has_serialize  cp.feat.serialize
 #define cpu_has_avx512_bf16 (cp.feat.avx512_bf16 && xcr0_mask(0xe6))
 
 #define cpu_has_xgetbv1   (cpu_has_xsave && cp.xstate.xgetbv1)
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -1927,6 +1927,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_enqcmd()  (ctxt->cpuid->feat.enqcmd)
 #define vcpu_has_avx512_4vnniw() (ctxt->cpuid->feat.avx512_4vnniw)
 #define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
+#define vcpu_has_serialize()   (ctxt->cpuid->feat.serialize)
 #define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16)
 
 #define vcpu_must_have(feat) \
@@ -5660,6 +5661,18 @@ x86_emulate(
 goto done;
 break;
 
+case 0xe8:
+switch ( vex.pfx )
+{
+case vex_none: /* serialize */
+host_and_vcpu_must_have(serialize);
+asm volatile ( ".byte 0x0f, 0x01, 0xe8" );
+break;
+default:
+goto unimplemented_insn;
+}
+break;
+
 case 0xf8: /* swapgs */
 generate_exception_if(!mode_64bit(), EXC_UD);
 generate_exception_if(!mode_ring0(), EXC_GP, 0);
--- a/xen/include/asm-x86/cpufeature.h
+++ b/xen/include/asm-x86/cpufeature.h
@@ -129,6 +129,7 @@
 #define cpu_has_avx512_4vnniw   boot_cpu_has(X86_FEATURE_AVX512_4VNNIW)
 #define cpu_has_avx512_4fmaps   boot_cpu_has(X86_FEATURE_AVX512_4FMAPS)
 #define cpu_has_tsx_force_abort boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)
+#define cpu_has_serialize   boot_cpu_has(X86_FEATURE_SERIALIZE)
 
 /* CPUID level 0x0007:1.eax */
 #define cpu_has_avx512_bf16 boot_cpu_has(X86_FEATURE_AVX512_BF16)
--- a/xen/include/public/arch-x86/cpufeatureset.h
+++ b/xen/include/public/arch-x86/cpufeatureset.h
@@ -258,6 +258,7 @@ XEN_CPUFEATURE(AVX512_4VNNIW, 9*32+ 2) /
 XEN_CPUFEATURE(AVX512_4FMAPS, 9*32+ 3) /*A  AVX512 Multiply Accumulation 
Single Precision */
 XEN_CPUFEATURE(MD_CLEAR,  9*32+10) /*A  VERW clears microarchitectural 
buffers */
 XEN_CPUFEATURE(TSX_FORCE_ABORT, 9*32+13) /* MSR_TSX_FORCE_ABORT.RTM_ABORT */
+XEN_CPUFEATURE(SERIALIZE, 9*32+14) /*A  SERIALIZE insn */
 XEN_CPUFEATURE(CET_IBT,   9*32+20) /*   CET - Indirect Branch Tracking */
 XEN_CPUFEATURE(IBRSB, 9*32+26) /*A  IBRS and IBPB support (used by 
Intel) */
 XEN_CPUFEATURE(STIBP, 9*32+27) /*A  STIBP */




Re: [PATCH v2] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Jürgen Groß

On 27.04.20 12:45, Paul Durrant wrote:

-Original Message-
From: Jürgen Groß 
Sent: 27 April 2020 11:37
To: Paul Durrant ; xen-devel@lists.xenproject.org
Cc: Paul Durrant 
Subject: Re: [PATCH v2] docs/designs: re-work the xenstore migration document...

On 27.04.20 09:53, Paul Durrant wrote:

From: Paul Durrant 

... to specify a separate migration stream that will also be suitable for
live update.

The original scope of the document was to support non-cooperative migration
of guests [1] but, since then, live update of xenstored has been brought into
scope. Thus it makes more sense to define a separate image format for
serializing xenstore state that is suitable for both purposes.

The document has been limited to specifying a new image format. The mechanism
for acquiring the image for live update or migration is not covered as that
is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
also expected that, when the first implementation of live update or migration
making use of this specification is committed, that the document is moved from
docs/designs into docs/specs.

[1] See 
https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/designs/non-cooperative-migration.md

Signed-off-by: Paul Durrant 
---
Juergen Gross 
Andrew Cooper 
George Dunlap 
Ian Jackson 
Jan Beulich 
Julien Grall 
Stefano Stabellini 
Wei Liu 


Mind adding CC: before those mail addresses in order to let git add
those to the recipients list?



D'oh... good spot.



v2:
   - Address comments from Juergen


Not all unfortunately. :-(



OK.


+### CONNECTION_DATA

-Each WATCH_DATA record specifies a registered watch and is formatted as
-follows:
+For live update the image format will contain a `CONNECTION_DATA` record for
+each connection to xenstore. For migration it will only contain a record for
+the domain being migrated.


   ```
-0   1   2   3 octet
-+---+---+---+---+
-| WATCH_DATA|
-+---+
-| wpath length  |
-+---+
-| wpath data|
-...
-| pad (0 to 3 octets)   |
-+---+
+0   1   2   3   4   5   6   7octet
++---+---+---+---+---+---+---+---+
+| conn-id   | pad   |
++---+---+
+| conn-type | conn-spec
   ...


I asked whether it wouldn't be better to drop the pad and move conn-type
and a 2-byte (unified) flag field at its position. This together ...


++---+---+
+| data-len  | data
   +---+
-| token length  |
-+---+
-| token data|
   ...
-| pad (0 to 3 octets)   |
-+---+
   ```

-wpath length and token length are specified in octets (excluding the NUL
-terminator). The wpath should be as described for the `WATCH` operation in
-[2]. The token is an arbitrary string of octets not containing any NUL
-values.

+| Field   | Description |
+|-|-|
+| `conn-id`   | A non-zero number used to identify this |
+| | connection in subsequent connection-specific|
+| | records |
+| | |
+| `conn-type` | 0x: shared ring |
+| | 0x0001: socket  |
+| | |
+| `conn-spec` | See below   |
+| | |
+| `data-len`  | The length (in octets) of any pending data not  |
+| | yet written to the connection   |
+| | |
+| `data`  | Pending data (may be empty) |

-**TRANSACTION_DATA**
+The format of `conn-spec` is dependent upon `conn-type`.

+\pagebreak

-Each TRANSACTION_DATA record specifies an open transaction and is formatted
-as follows:
+For `shared ring` connections it is as follows:


   ```
-0   1   2   3 octet
-+---+---+---+---+
-| TRANSACTION_DATA  |
-+---+
-| tx_id |
-+---+
+0   1   2   3   4   5   6   7octet
++---+---+---+---+---+---+
+| domid | tdomid| flags |
++---+---+---+---+
+| revtchn   | levtchn

[PATCH v7 02/11] x86emul: support MOVDIR{I,64B} insns

2020-04-27 Thread Jan Beulich
Introduce a new blk() hook, paralleling the rmw() one in a certain way,
but being intended for larger data sizes, and hence its HVM intermediate
handling function doesn't fall back to splitting the operation if the
requested virtual address can't be mapped.

Note that SDM revision 071 doesn't specify exception behavior for
ModRM.mod == 0b11; assuming #UD here.

Signed-off-by: Jan Beulich 
Reviewed-by: Paul Durrant 
---
v7: Add blk_NONE. Move  harness'es setting of .blk. Correct indentation.
Re-base.
v6: Fold MOVDIRI and MOVDIR64B changes again. Use blk() for both. All
tags dropped.
v5: Introduce/use ->blk() hook. Correct asm() operands.
v4: Split MOVDIRI and MOVDIR64B and move this one ahead. Re-base.
v3: Update description.
---
(SDE: -tnt)

--- a/tools/tests/x86_emulator/test_x86_emulator.c
+++ b/tools/tests/x86_emulator/test_x86_emulator.c
@@ -652,6 +652,18 @@ static int cmpxchg(
 return X86EMUL_OKAY;
 }
 
+static int blk(
+enum x86_segment seg,
+unsigned long offset,
+void *p_data,
+unsigned int bytes,
+uint32_t *eflags,
+struct x86_emulate_state *state,
+struct x86_emulate_ctxt *ctxt)
+{
+return x86_emul_blk((void *)offset, p_data, bytes, eflags, state, ctxt);
+}
+
 static int read_segment(
 enum x86_segment seg,
 struct segment_register *reg,
@@ -721,6 +733,7 @@ static struct x86_emulate_ops emulops =
 .insn_fetch = fetch,
 .write  = write,
 .cmpxchg= cmpxchg,
+.blk= blk,
 .read_segment = read_segment,
 .cpuid  = emul_test_cpuid,
 .read_cr= emul_test_read_cr,
@@ -2339,6 +2352,50 @@ int main(int argc, char **argv)
 goto fail;
 printf("okay\n");
 
+printf("%-40s", "Testing movdiri %edx,(%ecx)...");
+if ( stack_exec && cpu_has_movdiri )
+{
+instr[0] = 0x0f; instr[1] = 0x38; instr[2] = 0xf9; instr[3] = 0x11;
+
+regs.eip = (unsigned long)[0];
+regs.ecx = (unsigned long)memset(res, -1, 16);
+regs.edx = 0x44332211;
+
+rc = x86_emulate(, );
+if ( (rc != X86EMUL_OKAY) ||
+ (regs.eip != (unsigned long)[4]) ||
+ res[0] != 0x44332211 || ~res[1] )
+goto fail;
+printf("okay\n");
+}
+else
+printf("skipped\n");
+
+printf("%-40s", "Testing movdir64b 144(%edx),%ecx...");
+if ( stack_exec && cpu_has_movdir64b )
+{
+instr[0] = 0x66; instr[1] = 0x0f; instr[2] = 0x38; instr[3] = 0xf8;
+instr[4] = 0x8a; instr[5] = 0x90; instr[8] = instr[7] = instr[6] = 0;
+
+regs.eip = (unsigned long)[0];
+for ( i = 0; i < 64; ++i )
+res[i] = i - 20;
+regs.edx = (unsigned long)res;
+regs.ecx = (unsigned long)(res + 16);
+
+rc = x86_emulate(, );
+if ( (rc != X86EMUL_OKAY) ||
+ (regs.eip != (unsigned long)[9]) ||
+ res[15] != -5 || res[32] != 12 )
+goto fail;
+for ( i = 16; i < 32; ++i )
+if ( res[i] != i )
+goto fail;
+printf("okay\n");
+}
+else
+printf("skipped\n");
+
 printf("%-40s", "Testing movq %mm3,(%ecx)...");
 if ( stack_exec && cpu_has_mmx )
 {
--- a/tools/tests/x86_emulator/x86-emulate.h
+++ b/tools/tests/x86_emulator/x86-emulate.h
@@ -154,6 +154,8 @@ static inline bool xcr0_mask(uint64_t ma
 #define cpu_has_avx512_vnni (cp.feat.avx512_vnni && xcr0_mask(0xe6))
 #define cpu_has_avx512_bitalg (cp.feat.avx512_bitalg && xcr0_mask(0xe6))
 #define cpu_has_avx512_vpopcntdq (cp.feat.avx512_vpopcntdq && xcr0_mask(0xe6))
+#define cpu_has_movdiricp.feat.movdiri
+#define cpu_has_movdir64b  cp.feat.movdir64b
 #define cpu_has_avx512_4vnniw (cp.feat.avx512_4vnniw && xcr0_mask(0xe6))
 #define cpu_has_avx512_4fmaps (cp.feat.avx512_4fmaps && xcr0_mask(0xe6))
 #define cpu_has_avx512_bf16 (cp.feat.avx512_bf16 && xcr0_mask(0xe6))
--- a/xen/arch/x86/arch.mk
+++ b/xen/arch/x86/arch.mk
@@ -47,6 +47,7 @@ $(call as-option-add,CFLAGS,CC,"rdseed %
 $(call as-option-add,CFLAGS,CC,"clwb (%rax)",-DHAVE_AS_CLWB)
 $(call as-option-add,CFLAGS,CC,".equ \"x\"$$(comma)1",-DHAVE_AS_QUOTED_SYM)
 $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
+$(call as-option-add,CFLAGS,CC,"movdiri %rax$$(comma)(%rax)",-DHAVE_AS_MOVDIR)
 
 # GAS's idea of true is -1.  Clang's idea is 1
 $(call as-option-add,CFLAGS,CC,\
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -1441,6 +1441,44 @@ static int hvmemul_rmw(
 return rc;
 }
 
+static int hvmemul_blk(
+enum x86_segment seg,
+unsigned long offset,
+void *p_data,
+unsigned int bytes,
+uint32_t *eflags,
+struct x86_emulate_state *state,
+struct x86_emulate_ctxt *ctxt)
+{
+struct hvm_emulate_ctxt *hvmemul_ctxt =
+container_of(ctxt, struct hvm_emulate_ctxt, ctxt);
+unsigned long addr;
+uint32_t pfec = PFEC_page_present | PFEC_write_access;
+int rc;
+void *mapping = NULL;
+
+rc = 

[PATCH v7 03/11] x86emul: support ENQCMD insn

2020-04-27 Thread Jan Beulich
Note that the ISA extensions document revision 037 doesn't specify
exception behavior for ModRM.mod == 0b11; assuming #UD here.

No tests are being added to the harness - this would be quite hard,
we can't just issue the insns against RAM. Their similarity with
MOVDIR64B should have the test case there be god enough to cover any
fundamental flaws.

Signed-off-by: Jan Beulich 
---
TBD: This doesn't (can't) consult PASID translation tables yet, as we
 have no VMX code for this so far. I guess for this we will want to
 replace the direct ->read_msr(MSR_IA32_PASID, ...) with a new
 ->read_pasid() hook.
---
v7: Re-base.
v6: Re-base.
v5: New.

--- a/xen/arch/x86/arch.mk
+++ b/xen/arch/x86/arch.mk
@@ -48,6 +48,7 @@ $(call as-option-add,CFLAGS,CC,"clwb (%r
 $(call as-option-add,CFLAGS,CC,".equ \"x\"$$(comma)1",-DHAVE_AS_QUOTED_SYM)
 $(call as-option-add,CFLAGS,CC,"invpcid (%rax)$$(comma)%rax",-DHAVE_AS_INVPCID)
 $(call as-option-add,CFLAGS,CC,"movdiri %rax$$(comma)(%rax)",-DHAVE_AS_MOVDIR)
+$(call as-option-add,CFLAGS,CC,"enqcmd (%rax)$$(comma)%rax",-DHAVE_AS_ENQCMD)
 
 # GAS's idea of true is -1.  Clang's idea is 1
 $(call as-option-add,CFLAGS,CC,\
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -855,6 +855,7 @@ struct x86_emulate_state {
 } rmw;
 enum {
 blk_NONE,
+blk_enqcmd,
 blk_movdir,
 } blk;
 uint8_t modrm, modrm_mod, modrm_reg, modrm_rm;
@@ -901,6 +902,7 @@ typedef union {
 uint64_t __attribute__ ((aligned(16))) xmm[2];
 uint64_t __attribute__ ((aligned(32))) ymm[4];
 uint64_t __attribute__ ((aligned(64))) zmm[8];
+uint32_t data32[16];
 } mmval_t;
 
 /*
@@ -1922,6 +1924,7 @@ amd_like(const struct x86_emulate_ctxt *
 #define vcpu_has_rdpid()   (ctxt->cpuid->feat.rdpid)
 #define vcpu_has_movdiri() (ctxt->cpuid->feat.movdiri)
 #define vcpu_has_movdir64b()   (ctxt->cpuid->feat.movdir64b)
+#define vcpu_has_enqcmd()  (ctxt->cpuid->feat.enqcmd)
 #define vcpu_has_avx512_4vnniw() (ctxt->cpuid->feat.avx512_4vnniw)
 #define vcpu_has_avx512_4fmaps() (ctxt->cpuid->feat.avx512_4fmaps)
 #define vcpu_has_avx512_bf16() (ctxt->cpuid->feat.avx512_bf16)
@@ -10200,6 +10203,36 @@ x86_emulate(
 state->simd_size = simd_none;
 break;
 
+case X86EMUL_OPC_F2(0x0f38, 0xf8): /* enqcmd r,m512 */
+case X86EMUL_OPC_F3(0x0f38, 0xf8): /* enqcmds r,m512 */
+host_and_vcpu_must_have(enqcmd);
+generate_exception_if(ea.type != OP_MEM, EXC_UD);
+generate_exception_if(vex.pfx != vex_f2 && !mode_ring0(), EXC_GP, 0);
+src.val = truncate_ea(*dst.reg);
+generate_exception_if(!is_aligned(x86_seg_es, src.val, 64, ctxt, ops),
+  EXC_GP, 0);
+fail_if(!ops->blk);
+BUILD_BUG_ON(sizeof(*mmvalp) < 64);
+if ( (rc = ops->read(ea.mem.seg, ea.mem.off, mmvalp, 64,
+ ctxt)) != X86EMUL_OKAY )
+goto done;
+if ( vex.pfx == vex_f2 ) /* enqcmd */
+{
+fail_if(!ops->read_msr);
+if ( (rc = ops->read_msr(MSR_IA32_PASID,
+ _val, ctxt)) != X86EMUL_OKAY )
+goto done;
+generate_exception_if(!(msr_val & PASID_VALID), EXC_GP, 0);
+mmvalp->data32[0] = MASK_EXTR(msr_val, PASID_PASID_MASK);
+}
+mmvalp->data32[0] &= ~0x7ff0;
+state->blk = blk_enqcmd;
+if ( (rc = ops->blk(x86_seg_es, src.val, mmvalp, 64, &_regs.eflags,
+state, ctxt)) != X86EMUL_OKAY )
+goto done;
+state->simd_size = simd_none;
+break;
+
 case X86EMUL_OPC(0x0f38, 0xf9): /* movdiri mem,r */
 host_and_vcpu_must_have(movdiri);
 generate_exception_if(dst.type != OP_MEM, EXC_UD);
@@ -11480,11 +11513,36 @@ int x86_emul_blk(
 {
 switch ( state->blk )
 {
+bool zf;
+
 /*
  * Throughout this switch(), memory clobbers are used to compensate
  * that other operands may not properly express the (full) memory
  * ranges covered.
  */
+case blk_enqcmd:
+ASSERT(bytes == 64);
+if ( ((unsigned long)ptr & 0x3f) )
+{
+ASSERT_UNREACHABLE();
+return X86EMUL_UNHANDLEABLE;
+}
+*eflags &= ~EFLAGS_MASK;
+#ifdef HAVE_AS_ENQCMD
+asm ( "enqcmds (%[src]), %[dst]" ASM_FLAG_OUT(, "; setz %0")
+  : [zf] ASM_FLAG_OUT("=@ccz", "=qm") (zf)
+  : [src] "r" (data), [dst] "r" (ptr) : "memory" );
+#else
+/* enqcmds (%rsi), %rdi */
+asm ( ".byte 0xf3, 0x0f, 0x38, 0xf8, 0x3e"
+  ASM_FLAG_OUT(, "; setz %[zf]")
+  : [zf] ASM_FLAG_OUT("=@ccz", "=qm") (zf)
+  : "S" (data), "D" (ptr) : "memory" );
+#endif
+if ( zf )
+*eflags |= X86_EFLAGS_ZF;
+break;
+
 case blk_movdir:
 switch ( bytes )
 {
--- 

[PATCH v7 01/11] x86emul: disable FPU/MMX/SIMD insn emulation when !HVM

2020-04-27 Thread Jan Beulich
In a pure PV environment (the PV shim in particular) we don't really
need emulation of all these. To limit #ifdef-ary utilize some of the
CASE_*() macros we have, by providing variants expanding to
(effectively) nothing (really a label, which in turn requires passing
-Wno-unused-label to the compiler when build such configurations).

Due to the mixture of macro and #ifdef use, the placement of some of
the #ifdef-s is a little arbitrary.

The resulting object file's .text is less than half the size of the
original, and looks to also be compiling a little more quickly.

This is meant as a first step; more parts can likely be disabled down
the road.

Suggested-by: Andrew Cooper 
Signed-off-by: Jan Beulich 
---
v7: Integrate into this series. Re-base.
---
I'll be happy to take suggestions allowing to avoid -Wno-unused-label.

--- a/xen/arch/x86/Makefile
+++ b/xen/arch/x86/Makefile
@@ -73,6 +73,9 @@ obj-y += vm_event.o
 obj-y += xstate.o
 extra-y += asm-macros.i
 
+ifneq ($(CONFIG_HVM),y)
+x86_emulate.o: CFLAGS-y += -Wno-unused-label
+endif
 x86_emulate.o: x86_emulate/x86_emulate.c x86_emulate/x86_emulate.h
 
 efi-y := $(shell if [ ! -r $(BASEDIR)/include/xen/compile.h -o \
--- a/xen/arch/x86/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate.c
@@ -42,6 +42,12 @@
 }  \
 })
 
+#ifndef CONFIG_HVM
+# define X86EMUL_NO_FPU
+# define X86EMUL_NO_MMX
+# define X86EMUL_NO_SIMD
+#endif
+
 #include "x86_emulate/x86_emulate.c"
 
 int x86emul_read_xcr(unsigned int reg, uint64_t *val,
--- a/xen/arch/x86/x86_emulate/x86_emulate.c
+++ b/xen/arch/x86/x86_emulate/x86_emulate.c
@@ -3492,6 +3492,7 @@ x86_decode(
 op_bytes = 4;
 break;
 
+#ifndef X86EMUL_NO_SIMD
 case simd_packed_int:
 switch ( vex.pfx )
 {
@@ -3557,6 +3558,7 @@ x86_decode(
 case simd_256:
 op_bytes = 32;
 break;
+#endif /* !X86EMUL_NO_SIMD */
 
 default:
 op_bytes = 0;
@@ -3711,6 +3713,7 @@ x86_emulate(
 break;
 }
 
+#ifndef X86EMUL_NO_SIMD
 /* With a memory operand, fetch the mask register in use (if any). */
 if ( ea.type == OP_MEM && evex.opmsk &&
  _get_fpu(fpu_type = X86EMUL_FPU_opmask, ctxt, ops) == X86EMUL_OKAY )
@@ -3741,6 +3744,7 @@ x86_emulate(
 put_fpu(X86EMUL_FPU_opmask, false, state, ctxt, ops);
 fpu_type = X86EMUL_FPU_none;
 }
+#endif /* !X86EMUL_NO_SIMD */
 
 /* Decode (but don't fetch) the destination operand: register or memory. */
 switch ( d & DstMask )
@@ -4386,11 +4390,13 @@ x86_emulate(
 singlestep = _regs.eflags & X86_EFLAGS_TF;
 break;
 
+#ifndef X86EMUL_NO_FPU
 case 0x9b:  /* wait/fwait */
 host_and_vcpu_must_have(fpu);
 get_fpu(X86EMUL_FPU_wait);
 emulate_fpu_insn_stub(b);
 break;
+#endif
 
 case 0x9c: /* pushf */
 if ( (_regs.eflags & X86_EFLAGS_VM) &&
@@ -4800,6 +4806,7 @@ x86_emulate(
 break;
 }
 
+#ifndef X86EMUL_NO_FPU
 case 0xd8: /* FPU 0xd8 */
 host_and_vcpu_must_have(fpu);
 get_fpu(X86EMUL_FPU_fpu);
@@ -5134,6 +5141,7 @@ x86_emulate(
 }
 }
 break;
+#endif /* !X86EMUL_NO_FPU */
 
 case 0xe0 ... 0xe2: /* loop{,z,nz} */ {
 unsigned long count = get_loop_count(&_regs, ad_bytes);
@@ -6079,6 +6087,8 @@ x86_emulate(
 case X86EMUL_OPC(0x0f, 0x19) ... X86EMUL_OPC(0x0f, 0x1f): /* nop */
 break;
 
+#ifndef X86EMUL_NO_MMX
+
 case X86EMUL_OPC(0x0f, 0x0e): /* femms */
 host_and_vcpu_must_have(3dnow);
 asm volatile ( "femms" );
@@ -6099,39 +6109,71 @@ x86_emulate(
 state->simd_size = simd_other;
 goto simd_0f_imm8;
 
-#define CASE_SIMD_PACKED_INT(pfx, opc)   \
+#endif /* !X86EMUL_NO_MMX */
+
+#if !defined(X86EMUL_NO_SIMD) && !defined(X86EMUL_NO_MMX)
+# define CASE_SIMD_PACKED_INT(pfx, opc)  \
 case X86EMUL_OPC(pfx, opc):  \
 case X86EMUL_OPC_66(pfx, opc)
-#define CASE_SIMD_PACKED_INT_VEX(pfx, opc)   \
+#elif !defined(X86EMUL_NO_SIMD)
+# define CASE_SIMD_PACKED_INT(pfx, opc)  \
+case X86EMUL_OPC_66(pfx, opc)
+#elif !defined(X86EMUL_NO_MMX)
+# define CASE_SIMD_PACKED_INT(pfx, opc)  \
+case X86EMUL_OPC(pfx, opc)
+#else
+# define CASE_SIMD_PACKED_INT(pfx, opc) C##pfx##_##opc
+#endif
+
+#ifndef X86EMUL_NO_SIMD
+
+# define CASE_SIMD_PACKED_INT_VEX(pfx, opc)  \
 CASE_SIMD_PACKED_INT(pfx, opc):  \
 case X86EMUL_OPC_VEX_66(pfx, opc)
 
-#define CASE_SIMD_ALL_FP(kind, pfx, opc) \
+# define CASE_SIMD_ALL_FP(kind, pfx, opc)\
 CASE_SIMD_PACKED_FP(kind, pfx, opc): \
 CASE_SIMD_SCALAR_FP(kind, pfx, opc)
-#define CASE_SIMD_PACKED_FP(kind, pfx, opc)  \
+# define CASE_SIMD_PACKED_FP(kind, pfx, opc) \
 case X86EMUL_OPC##kind(pfx, opc):\
 case X86EMUL_OPC##kind##_66(pfx, opc)
-#define CASE_SIMD_SCALAR_FP(kind, pfx, opc)  \
+# define CASE_SIMD_SCALAR_FP(kind, pfx, opc) \
 case X86EMUL_OPC##kind##_F3(pfx, opc):   

[PATCH v7 00/11] x86emul: further work

2020-04-27 Thread Jan Beulich
At least the RDPRU patches is still at least partly RFC. I'd
appreciate though if at least some of the series could go in
rather sooner than later. Note in particular that there's no
strong ordering throughout the entire series, i.e. certain
later parts could be arranged for to go in earlier. This is
also specifically the case for what is now the last patch.

 1: x86emul: disable FPU/MMX/SIMD insn emulation when !HVM
 2: x86emul: support MOVDIR{I,64B} insns
 3: x86emul: support ENQCMD insn
 4: x86emul: support SERIALIZE
 5: x86emul: support X{SUS,RES}LDTRK
 6: x86emul: support FNSTENV and FNSAVE
 7: x86emul: support FLDENV and FRSTOR
 8: x86emul: support FXSAVE/FXRSTOR
 9: x86/HVM: scale MPERF values reported to guests (on AMD)
10: x86emul: support RDPRU
11: x86/HVM: don't needlessly intercept APERF/MPERF/TSC MSR reads

Main changes from v6 are, besides three new patches and some
re-ordering, the integration into this series of what is now
patch 1 (for later patches now depending even more heavily on
it), and the dropping (for the time being) of the MCOMMIT one.

Jan



[xen-unstable-smoke test] 149834: tolerable all pass - PUSHED

2020-04-27 Thread osstest service owner
flight 149834 xen-unstable-smoke real [real]
http://logs.test-lab.xenproject.org/osstest/logs/149834/

Failures :-/ but no regressions.

Tests which did not succeed, but are not blocking:
 test-amd64-amd64-libvirt 13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  13 migrate-support-checkfail   never pass
 test-arm64-arm64-xl-xsm  14 saverestore-support-checkfail   never pass
 test-armhf-armhf-xl  13 migrate-support-checkfail   never pass
 test-armhf-armhf-xl  14 saverestore-support-checkfail   never pass

version targeted for testing:
 xen  8d4c17a90b0a9b3d82628461090a54a2c7897154
baseline version:
 xen  f093b08c47b39da6019421a2b61d40745b3e573b

Last test of basis   149822  2020-04-25 18:00:23 Z1 days
Testing same since   149834  2020-04-27 08:01:51 Z0 days1 attempts


People who touched revisions under test:
  Anthony PERARD 
  Jan Beulich 
  Julien Grall 
  Stefano Stabellini 

jobs:
 build-arm64-xsm  pass
 build-amd64  pass
 build-armhf  pass
 build-amd64-libvirt  pass
 test-armhf-armhf-xl  pass
 test-arm64-arm64-xl-xsm  pass
 test-amd64-amd64-xl-qemuu-debianhvm-amd64pass
 test-amd64-amd64-libvirt pass



sg-report-flight on osstest.test-lab.xenproject.org
logs: /home/logs/logs
images: /home/logs/images

Logs, config files, etc. are available at
http://logs.test-lab.xenproject.org/osstest/logs

Explanation of these reports, and of osstest in general, is at
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README.email;hb=master
http://xenbits.xen.org/gitweb/?p=osstest.git;a=blob;f=README;hb=master

Test harness code can be found at
http://xenbits.xen.org/gitweb?p=osstest.git;a=summary


Pushing revision :

To xenbits.xen.org:/home/xen/git/xen.git
   f093b08c47..8d4c17a90b  8d4c17a90b0a9b3d82628461090a54a2c7897154 -> smoke



RE: [PATCH v2] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Paul Durrant
> -Original Message-
> From: Jürgen Groß 
> Sent: 27 April 2020 11:37
> To: Paul Durrant ; xen-devel@lists.xenproject.org
> Cc: Paul Durrant 
> Subject: Re: [PATCH v2] docs/designs: re-work the xenstore migration 
> document...
> 
> On 27.04.20 09:53, Paul Durrant wrote:
> > From: Paul Durrant 
> >
> > ... to specify a separate migration stream that will also be suitable for
> > live update.
> >
> > The original scope of the document was to support non-cooperative migration
> > of guests [1] but, since then, live update of xenstored has been brought 
> > into
> > scope. Thus it makes more sense to define a separate image format for
> > serializing xenstore state that is suitable for both purposes.
> >
> > The document has been limited to specifying a new image format. The 
> > mechanism
> > for acquiring the image for live update or migration is not covered as that
> > is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
> > also expected that, when the first implementation of live update or 
> > migration
> > making use of this specification is committed, that the document is moved 
> > from
> > docs/designs into docs/specs.
> >
> > [1] See 
> > https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/designs/non-cooperative-migration.md
> >
> > Signed-off-by: Paul Durrant 
> > ---
> > Juergen Gross 
> > Andrew Cooper 
> > George Dunlap 
> > Ian Jackson 
> > Jan Beulich 
> > Julien Grall 
> > Stefano Stabellini 
> > Wei Liu 
> 
> Mind adding CC: before those mail addresses in order to let git add
> those to the recipients list?
> 

D'oh... good spot.

> >
> > v2:
> >   - Address comments from Juergen
> 
> Not all unfortunately. :-(
> 

OK.

> > +### CONNECTION_DATA
> >
> > -Each WATCH_DATA record specifies a registered watch and is formatted as
> > -follows:
> > +For live update the image format will contain a `CONNECTION_DATA` record 
> > for
> > +each connection to xenstore. For migration it will only contain a record 
> > for
> > +the domain being migrated.
> >
> >
> >   ```
> > -0   1   2   3 octet
> > -+---+---+---+---+
> > -| WATCH_DATA|
> > -+---+
> > -| wpath length  |
> > -+---+
> > -| wpath data|
> > -...
> > -| pad (0 to 3 octets)   |
> > -+---+
> > +0   1   2   3   4   5   6   7octet
> > ++---+---+---+---+---+---+---+---+
> > +| conn-id   | pad   |
> > ++---+---+
> > +| conn-type | conn-spec
> >   ...
> 
> I asked whether it wouldn't be better to drop the pad and move conn-type
> and a 2-byte (unified) flag field at its position. This together ...
> 
> > ++---+---+
> > +| data-len  | data
> >   +---+
> > -| token length  |
> > -+---+
> > -| token data|
> >   ...
> > -| pad (0 to 3 octets)   |
> > -+---+
> >   ```
> >
> > -wpath length and token length are specified in octets (excluding the NUL
> > -terminator). The wpath should be as described for the `WATCH` operation in
> > -[2]. The token is an arbitrary string of octets not containing any NUL
> > -values.
> >
> > +| Field   | Description |
> > +|-|-|
> > +| `conn-id`   | A non-zero number used to identify this |
> > +| | connection in subsequent connection-specific|
> > +| | records |
> > +| | |
> > +| `conn-type` | 0x: shared ring |
> > +| | 0x0001: socket  |
> > +| | |
> > +| `conn-spec` | See below   |
> > +| | |
> > +| `data-len`  | The length (in octets) of any pending data not  |
> > +| | yet written to the connection   |
> > +| | |
> > +| `data`  | Pending data (may be empty) |
> >
> > -**TRANSACTION_DATA**
> > +The format of `conn-spec` is dependent upon `conn-type`.
> >
> > +\pagebreak
> >
> > -Each TRANSACTION_DATA record specifies an open transaction and is formatted
> > -as follows:
> > +For `shared ring` connections it is as follows:
> >
> >
> >   ```
> > -0   1   2   3 octet
> > -+---+---+---+---+
> > -| TRANSACTION_DATA  |
> > 

Re: [PATCH v2] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Jürgen Groß

On 27.04.20 09:53, Paul Durrant wrote:

From: Paul Durrant 

... to specify a separate migration stream that will also be suitable for
live update.

The original scope of the document was to support non-cooperative migration
of guests [1] but, since then, live update of xenstored has been brought into
scope. Thus it makes more sense to define a separate image format for
serializing xenstore state that is suitable for both purposes.

The document has been limited to specifying a new image format. The mechanism
for acquiring the image for live update or migration is not covered as that
is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
also expected that, when the first implementation of live update or migration
making use of this specification is committed, that the document is moved from
docs/designs into docs/specs.

[1] See 
https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/designs/non-cooperative-migration.md

Signed-off-by: Paul Durrant 
---
Juergen Gross 
Andrew Cooper 
George Dunlap 
Ian Jackson 
Jan Beulich 
Julien Grall 
Stefano Stabellini 
Wei Liu 


Mind adding CC: before those mail addresses in order to let git add
those to the recipients list?



v2:
  - Address comments from Juergen


Not all unfortunately. :-(


+### CONNECTION_DATA
  
-Each WATCH_DATA record specifies a registered watch and is formatted as

-follows:
+For live update the image format will contain a `CONNECTION_DATA` record for
+each connection to xenstore. For migration it will only contain a record for
+the domain being migrated.
  
  
  ```

-0   1   2   3 octet
-+---+---+---+---+
-| WATCH_DATA|
-+---+
-| wpath length  |
-+---+
-| wpath data|
-...
-| pad (0 to 3 octets)   |
-+---+
+0   1   2   3   4   5   6   7octet
++---+---+---+---+---+---+---+---+
+| conn-id   | pad   |
++---+---+
+| conn-type | conn-spec
  ...


I asked whether it wouldn't be better to drop the pad and move conn-type
and a 2-byte (unified) flag field at its position. This together ...


++---+---+
+| data-len  | data
  +---+
-| token length  |
-+---+
-| token data|
  ...
-| pad (0 to 3 octets)   |
-+---+
  ```
  
-wpath length and token length are specified in octets (excluding the NUL

-terminator). The wpath should be as described for the `WATCH` operation in
-[2]. The token is an arbitrary string of octets not containing any NUL
-values.
  
+| Field   | Description |

+|-|-|
+| `conn-id`   | A non-zero number used to identify this |
+| | connection in subsequent connection-specific|
+| | records |
+| | |
+| `conn-type` | 0x: shared ring |
+| | 0x0001: socket  |
+| | |
+| `conn-spec` | See below   |
+| | |
+| `data-len`  | The length (in octets) of any pending data not  |
+| | yet written to the connection   |
+| | |
+| `data`  | Pending data (may be empty) |
  
-**TRANSACTION_DATA**

+The format of `conn-spec` is dependent upon `conn-type`.
  
+\pagebreak
  
-Each TRANSACTION_DATA record specifies an open transaction and is formatted

-as follows:
+For `shared ring` connections it is as follows:
  
  
  ```

-0   1   2   3 octet
-+---+---+---+---+
-| TRANSACTION_DATA  |
-+---+
-| tx_id |
-+---+
+0   1   2   3   4   5   6   7octet
++---+---+---+---+---+---+
+| domid | tdomid| flags |
++---+---+---+---+
+| revtchn   | levtchn   |
++---+---+
+| mfn   |
++---+


... with dropping levtchn (which isn't needed 

Re: [PATCH v11 1/3] x86/tlb: introduce a flush HVM ASIDs flag

2020-04-27 Thread Roger Pau Monné
On Mon, Apr 27, 2020 at 11:12:35AM +0100, Wei Liu wrote:
> On Thu, Apr 23, 2020 at 06:33:49PM +0200, Jan Beulich wrote:
> > On 23.04.2020 16:56, Roger Pau Monne wrote:
> > > Introduce a specific flag to request a HVM guest linear TLB flush,
> > > which is an ASID/VPID tickle that forces a guest linear to guest
> > > physical TLB flush for all HVM guests.
> > > 
> > > This was previously unconditionally done in each pre_flush call, but
> > > that's not required: HVM guests not using shadow don't require linear
> > > TLB flushes as Xen doesn't modify the pages tables the guest runs on
> > > in that case (ie: when using HAP). Note that shadow paging code
> > > already takes care of issuing the necessary flushes when the shadow
> > > page tables are modified.
> > > 
> > > In order to keep the previous behavior modify all shadow code TLB
> > > flushes to also flush the guest linear to physical TLB if the guest is
> > > HVM. I haven't looked at each specific shadow code TLB flush in order
> > > to figure out whether it actually requires a guest TLB flush or not,
> > > so there might be room for improvement in that regard.
> > > 
> > > Also perform ASID/VPID flushes when modifying the p2m tables as it's a
> > > requirement for AMD hardware. Finally keep the flush in
> > > switch_cr3_cr4, as it's not clear whether code could rely on
> > > switch_cr3_cr4 also performing a guest linear TLB flush. A following
> > > patch can remove the ASID/VPID tickle from switch_cr3_cr4 if found to
> > > not be necessary.
> > > 
> > > Signed-off-by: Roger Pau Monné 
> > 
> > Reviewed-by: Jan Beulich 
> > 
> 
> Tim, ICYMI, this patch needs your ack.

Let me put Tim on the To: field, more likely to raise attention.

Roger.



Re: [PATCH v14 3/3] xen/tools: VM forking toolstack side

2020-04-27 Thread Wei Liu
On Fri, Apr 10, 2020 at 10:20:49AM -0600, Tamas K Lengyel wrote:
> On Fri, Apr 10, 2020 at 10:19 AM Wei Liu  wrote:
> >
> > On Fri, Apr 10, 2020 at 10:05:50AM -0600, Tamas K Lengyel wrote:
> > > On Thu, Apr 9, 2020 at 11:30 AM Tamas K Lengyel
> > >  wrote:
> > > >
> > > > On Thu, Apr 9, 2020 at 11:11 AM Wei Liu  wrote:
> > > > >
> > > > > On Thu, Apr 09, 2020 at 10:59:55AM -0600, Tamas K Lengyel wrote:
> > > > > [...]
> > > > > > >
> > > > > > > >
> > > > > > > > > >
> > > > > > > > > > +/*
> > > > > > > > > > + * The parent domain is expected to be created with 
> > > > > > > > > > default settings for
> > > > > > > > > > + * - max_evtch_port
> > > > > > > > > > + * - max_grant_frames
> > > > > > > > > > + * - max_maptrack_frames
> > > > > > > > > > + */
> > > > > > > > > > +int libxl_domain_fork_vm(libxl_ctx *ctx, uint32_t pdomid, 
> > > > > > > > > > uint32_t max_vcpus, uint32_t *domid)
> > > > > > > > > > +{
> > > > > > > > > > +int rc;
> > > > > > > > > > +struct xen_domctl_createdomain create = {0};
> > > > > > > > > > +create.flags |= XEN_DOMCTL_CDF_hvm;
> > > > > > > > > > +create.flags |= XEN_DOMCTL_CDF_hap;
> > > > > > > > > > +create.flags |= XEN_DOMCTL_CDF_oos_off;
> > > > > > > > > > +create.arch.emulation_flags = (XEN_X86_EMU_ALL & 
> > > > > > > > > > ~XEN_X86_EMU_VPCI);
> > > > > > > > > > +create.ssidref = SECINITSID_DOMU;
> > > > > > > > > > +create.max_vcpus = max_vcpus;
> > > > > > > > >
> > > > > > > > >
> > > > > > > > > Some questions:
> > > > > > > > >
> > > > > > > > > Why does the caller need to specify the number of vcpus?
> > > > > > > > >
> > > > > > > > > Since the parent (source) domain is around, can you retrieve 
> > > > > > > > > its domain
> > > > > > > > > configuration such that you know its max_vcpus and other 
> > > > > > > > > information
> > > > > > > > > including max_evtchn_port and maptrack frames?
> > > > > > > >
> > > > > > > > Because we want to avoid having to issue an extra hypercall for 
> > > > > > > > these.
> > > > > > > > Normally these pieces of information will be available for the 
> > > > > > > > user
> > > > > > > > and won't change, so we save time by not querying for it every 
> > > > > > > > time a
> > > > > > > > fork is created. Remember, we might be creating thousands of 
> > > > > > > > forks in
> > > > > > > > a very short time, so those extra hypercalls add up.
> > > > > > >
> > > > > > > I see. Speed is a big concern to you.
> > > > > > >
> > > > > > > What I was referring to doesn't require issuing hypercalls but 
> > > > > > > requires
> > > > > > > calling libxl_retrieve_domain_configuration. That's likely to be 
> > > > > > > even
> > > > > > > slower than making a hypercall.
> > > > > >
> > > > > > Right. We only want to parse the domain config if the device model 
> > > > > > is
> > > > > > being launched.
> > > > > >
> > > > > > >
> > > > > > > I'm afraid the current incarnation of libxl_domain_fork_vm cannot 
> > > > > > > become
> > > > > > > supported (as in Xen's support statement) going forward, because 
> > > > > > > it is
> > > > > > > leaky.
> > > > > >
> > > > > > What do you mean by leaky?
> > > > >
> > > > > It requires the caller to specify the number of max_vcpus. The reason
> > > > > for doing that is because you want to avoid extra hypercall(s). There 
> > > > > is
> > > > > nothing that stops someone from coming along in the future claiming 
> > > > > some
> > > > > other parameters are required because of the same concern you have
> > > > > today. It is an optimisation, not a must-have in terms of 
> > > > > functionality.
> > > > >
> > > > > To me the number shouldn't be specified by the caller in the first
> > > > > place. It is like forking a process somehow requires you to specify 
> > > > > how
> > > > > many threads it will have.
> > > >
> > > > I agree. It's not how I wanted to have the interface work but
> > > > unfortunately this was the least "ugly" of the possible solutions
> > > > given the circumstances.
> > >
> > > By the way, it would be trivial to query the parent in case max_vcpus
> > > is not provided by the user. But I would still like to keep the option
> > > available to skip that step if the number is known already. Let me
> > > know if you would like me to add that.
> >
> > I'm still waiting for Ian and Anthony to chime in to see whether they
> > agree to keep this interface unstable.
> >
> > At the end of the day, if it is unstable, I don't really care that much.
> > I'm happy to let you (the developer and user) have more say in that
> > case.  I will instead divert my (limited) time to code that your patch
> > touches which is also used by other stable functions.
> 
> SGTM

Ian and Anthony, your opinions?

Wei.



Re: [PATCH v11 1/3] x86/tlb: introduce a flush HVM ASIDs flag

2020-04-27 Thread Wei Liu
On Thu, Apr 23, 2020 at 06:33:49PM +0200, Jan Beulich wrote:
> On 23.04.2020 16:56, Roger Pau Monne wrote:
> > Introduce a specific flag to request a HVM guest linear TLB flush,
> > which is an ASID/VPID tickle that forces a guest linear to guest
> > physical TLB flush for all HVM guests.
> > 
> > This was previously unconditionally done in each pre_flush call, but
> > that's not required: HVM guests not using shadow don't require linear
> > TLB flushes as Xen doesn't modify the pages tables the guest runs on
> > in that case (ie: when using HAP). Note that shadow paging code
> > already takes care of issuing the necessary flushes when the shadow
> > page tables are modified.
> > 
> > In order to keep the previous behavior modify all shadow code TLB
> > flushes to also flush the guest linear to physical TLB if the guest is
> > HVM. I haven't looked at each specific shadow code TLB flush in order
> > to figure out whether it actually requires a guest TLB flush or not,
> > so there might be room for improvement in that regard.
> > 
> > Also perform ASID/VPID flushes when modifying the p2m tables as it's a
> > requirement for AMD hardware. Finally keep the flush in
> > switch_cr3_cr4, as it's not clear whether code could rely on
> > switch_cr3_cr4 also performing a guest linear TLB flush. A following
> > patch can remove the ASID/VPID tickle from switch_cr3_cr4 if found to
> > not be necessary.
> > 
> > Signed-off-by: Roger Pau Monné 
> 
> Reviewed-by: Jan Beulich 
> 

Tim, ICYMI, this patch needs your ack.

Wei.



Re: [PATCH] x86: refine guest_mode()

2020-04-27 Thread Roger Pau Monné
On Mon, Apr 27, 2020 at 10:03:05AM +0200, Jan Beulich wrote:
> The 2nd of the assertions as well as the macro's return value have been
> assuming we're on the primary stack. While for most IST exceptions we
> eventually switch back to the main one, for #DF we intentionally never
> do, and hence a #DF actually triggering on a user mode insn (which then
> is still a Xen bug) would in turn trigger this assertion, rather than
> cleanly logging state.
> 
> Reported-by: Andrew Cooper 
> Signed-off-by: Jan Beulich 
> ---
> While we could go further and also assert we're on the correct IST
> stack in an "else" ti the "if()" added, I'm not fully convinced this
> would be generally helpful. I'll be happy to adjust accordingly if
> others think differently; at such a point though I think this should
> then no longer be a macro.
> 
> --- a/xen/include/asm-x86/regs.h
> +++ b/xen/include/asm-x86/regs.h
> @@ -10,9 +10,10 @@
>  /* Frame pointer must point into current CPU stack. */   
>  \
>  ASSERT(diff < STACK_SIZE);   
>  \
>  /* If not a guest frame, it must be a hypervisor frame. */   
>  \
> -ASSERT((diff == 0) || (r->cs == __HYPERVISOR_CS));   
>  \
> +if ( diff < PRIMARY_STACK_SIZE ) 
>  \
> +ASSERT(!diff || ((r)->cs == __HYPERVISOR_CS));   
>  \

Why not use:

ASSERT(diff >= PRIMARY_STACK_SIZE || !diff || ((r)->cs == __HYPERVISOR_CS));

I'm not sure I fully understand this layout, is it possible that you
also need to account for the size of cpu_info?

Roger.



Re: [PATCH 2/2] golang/xenlight: stop tracking generated files

2020-04-27 Thread George Dunlap

> On Apr 23, 2020, at 1:25 AM, Nick Rosbrook  wrote:
> 
> The generated go files were tracked temporarily while the initial
> implementation of gengotypes.py was in progress. They can now be removed
> and ignored by git and hg.
> 
> While here, make sure generated files are removed by make clean.
> 
> Signed-off-by: Nick Rosbrook 

Just to be clear, to onlookers (particularly committers): it’s looking like 
we’re going to leave the generated files in the tree; so this patch is probably 
obsolete.

 -George

Re: [PATCH v17 1/2] mem_sharing: fix sharability check during fork reset

2020-04-27 Thread Roger Pau Monné
On Sat, Apr 25, 2020 at 06:31:46AM -0600, Tamas K Lengyel wrote:
> On Sat, Apr 25, 2020 at 3:01 AM Roger Pau Monné  wrote:
> >
> > On Fri, Apr 24, 2020 at 06:18:24AM -0600, Tamas K Lengyel wrote:
> > > On Fri, Apr 24, 2020 at 3:44 AM Roger Pau Monné  
> > > wrote:
> > > >
> > > > On Thu, Apr 23, 2020 at 08:30:06AM -0700, Tamas K Lengyel wrote:
> > > > > When resetting a VM fork we ought to only remove pages that were 
> > > > > allocated for
> > > > > the fork during it's execution and the contents copied over from the 
> > > > > parent.
> > > > > This can be determined if the page is sharable as special pages used 
> > > > > by the
> > > > > fork for other purposes will not pass this test. Unfortunately during 
> > > > > the fork
> > > > > reset loop we only partially check whether that's the case. A page's 
> > > > > type may
> > > > > indicate it is sharable (pass p2m_is_sharable) but that's not a 
> > > > > sufficient
> > > > > check by itself. All checks that are normally performed before a page 
> > > > > is
> > > > > converted to the sharable type need to be performed to avoid removing 
> > > > > pages
> > > > > from the p2m that may be used for other purposes. For example, 
> > > > > currently the
> > > > > reset loop also removes the vcpu info pages from the p2m, potentially 
> > > > > putting
> > > > > the guest into infinite page-fault loops.
> > > > >
> > > > > Signed-off-by: Tamas K Lengyel 
> > > >
> > > > Reviewed-by: Roger Pau Monné 
> > >
> > > Thanks!
> > >
> > > >
> > > > I've been looking however and I'm not able to spot where you copy the
> > > > shared_info data, which I think it's also quite critical for the
> > > > domain, as it contains the info about event channels (when using L2).
> > > > Also for HVM forks the shared info should be mapped at the same gfn as
> > > > the parent, or else the child trying to access it will trigger an EPT
> > > > fault that won't be able to populate such gfn, because the shared_info
> > > > on the parent is already shared between Xen and the parent.
> > >
> > > Pages that cause an EPT fault but can't be made shared get a new page
> > > allocated for them and copied in mem_sharing_fork_page. There are many
> > > pages like that, mostly due to QEMU mapping them and thus holding an
> > > extra reference. That said, shared_info is manually copied during
> > > forking in copy_special_pages, at the end of the function you will
> > > find:
> > >
> > > old_mfn = _mfn(virt_to_mfn(d->shared_info));
> > > new_mfn = _mfn(virt_to_mfn(cd->shared_info));
> > >
> > > copy_domain_page(new_mfn, old_mfn);
> >
> > Yes, that's indeed fine, you need to copy the contents of the shared
> > info page, but for HVM you also need to make sure the shared info page
> > is mapped at the same gfn as the parent. HVM guest issue a hypercall
> > to request the mapping of the shared info page to a specific gfn,
> > since it's not added to the guest physmap by default.
> > XENMAPSPACE_shared_info is used in order to request the shared info
> > page to be mapped at a specific gfn.
> >
> > AFAICT during fork such shared info mapping is not replicated to the
> > child, and hence the child trying to access the gfn of the shared info
> > page would just result in EPT faults that won't be fixed (because the
> > parent shared info page cannot be shared with the child)?
> >
> > You should be able to use get_gpfn_from_mfn in order to get the parent
> > gfn where the shared info mfn is mapped (if any), and then replicate
> > this in the child using it's own shared info.
> >
> > On fork reset you should check if the child shared info gfn != parent
> > shared info gfn and reinstate the parent state if different from the
> > child.
> 
> OK, I see what you mean. In the way things set up currently the EPT
> fault-loop problem doesn't happen since the page does get copied, it
> just gets copied to a new mfn not the one d->shared_info points to. So
> whatever issue that may bring it must be subtle because we haven't
> noticed any instability.

In any case I think it would be good if you could add such mapping on
domain fork and reset, as you already partially handle the shared info
page by copying the data from the parent into the child. Or at least
add a FIXME comment to note the fact that a child trying to access the
shared info page won't work.

> Also, looking at the save/restore code in libxc it seems to me that
> shared_info is actually a PV specific page and it doesn't get
> saved/restored with an HVM domain. The hvm code paths don't process
> REC_TYPE_SHARED_INFO at all. So since forks are exclusively for HVM
> domains, do we really need it and if so, why doesn't HVM save/restore
> need it?

HVM domains on suspend/resume are aware of the need to re-instate the
shared info mapping, as it's part of the protocol and the guest is
actively cooperating on such actions. The same happens with the vcpu
info pages or the PV timers for example: they are not saved during a
suspend/resume and the guest 

[PATCH] x86: refine guest_mode()

2020-04-27 Thread Jan Beulich
The 2nd of the assertions as well as the macro's return value have been
assuming we're on the primary stack. While for most IST exceptions we
eventually switch back to the main one, for #DF we intentionally never
do, and hence a #DF actually triggering on a user mode insn (which then
is still a Xen bug) would in turn trigger this assertion, rather than
cleanly logging state.

Reported-by: Andrew Cooper 
Signed-off-by: Jan Beulich 
---
While we could go further and also assert we're on the correct IST
stack in an "else" ti the "if()" added, I'm not fully convinced this
would be generally helpful. I'll be happy to adjust accordingly if
others think differently; at such a point though I think this should
then no longer be a macro.

--- a/xen/include/asm-x86/regs.h
+++ b/xen/include/asm-x86/regs.h
@@ -10,9 +10,10 @@
 /* Frame pointer must point into current CPU stack. */\
 ASSERT(diff < STACK_SIZE);\
 /* If not a guest frame, it must be a hypervisor frame. */\
-ASSERT((diff == 0) || (r->cs == __HYPERVISOR_CS));\
+if ( diff < PRIMARY_STACK_SIZE )  \
+ASSERT(!diff || ((r)->cs == __HYPERVISOR_CS));\
 /* Return TRUE if it's a guest frame. */  \
-(diff == 0);  \
+!diff || ((r)->cs != __HYPERVISOR_CS);\
 })
 
 #endif /* __X86_REGS_H__ */



[PATCH v2] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Paul Durrant
From: Paul Durrant 

... to specify a separate migration stream that will also be suitable for
live update.

The original scope of the document was to support non-cooperative migration
of guests [1] but, since then, live update of xenstored has been brought into
scope. Thus it makes more sense to define a separate image format for
serializing xenstore state that is suitable for both purposes.

The document has been limited to specifying a new image format. The mechanism
for acquiring the image for live update or migration is not covered as that
is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
also expected that, when the first implementation of live update or migration
making use of this specification is committed, that the document is moved from
docs/designs into docs/specs.

[1] See 
https://xenbits.xen.org/gitweb/?p=xen.git;a=blob;f=docs/designs/non-cooperative-migration.md

Signed-off-by: Paul Durrant 
---
Juergen Gross 
Andrew Cooper 
George Dunlap 
Ian Jackson 
Jan Beulich 
Julien Grall 
Stefano Stabellini 
Wei Liu 

v2:
 - Address comments from Juergen
---
 docs/designs/xenstore-migration.md | 475 +++--
 1 file changed, 312 insertions(+), 163 deletions(-)

diff --git a/docs/designs/xenstore-migration.md 
b/docs/designs/xenstore-migration.md
index 6ab351e8fe..815aaeee59 100644
--- a/docs/designs/xenstore-migration.md
+++ b/docs/designs/xenstore-migration.md
@@ -3,254 +3,403 @@
 ## Background
 
 The design for *Non-Cooperative Migration of Guests*[1] explains that extra
-save records are required in the migrations stream to allow a guest running
-PV drivers to be migrated without its co-operation. Moreover the save
-records must include details of registered xenstore watches as well as
-content; information that cannot currently be recovered from `xenstored`,
-and hence some extension to the xenstore protocol[2] will also be required.
-
-The *libxenlight Domain Image Format* specification[3] already defines a
-record type `EMULATOR_XENSTORE_DATA` but this is not suitable for
-transferring xenstore data pertaining to the domain directly as it is
-specified such that keys are relative to the path
-`/local/domain/$dm_domid/device-model/$domid`. Thus it is necessary to
-define at least one new save record type.
+save records are required in the migrations stream to allow a guest running PV
+drivers to be migrated without its co-operation. Moreover the save records must
+include details of registered xenstore watches as well ascontent; information
+that cannot currently be recovered from `xenstored`, and hence some extension
+to the xenstored implementations will also be required.
+
+As a similar set of data is needed for transferring xenstore data from one
+instance to another when live updating xenstored this document proposes an
+image format for a 'migration stream' suitable for both purposes.
 
 ## Proposal
 
-### New Save Record
+The image format consists of a _header_ followed by 1 or more _records_. Each
+record consists of a type and length field, followed by any data mandated by
+the record type. At minimum there will be a single record of type `END`
+(defined below).
 
-A new mandatory record type should be defined within the libxenlight Domain
-Image Format:
+### Header
 
-`0x0007: DOMAIN_XENSTORE_DATA`
+The header identifies the stream as a `xenstore` stream, including the version
+of the specification that it complies with.
 
-An arbitrary number of these records may be present in the migration
-stream and may appear in any order. The format of each record should be as
-follows:
+All fields in this header must be in _big-endian_ byte order, regardless of
+the setting of the endianness bit.
 
 
 ```
 0   1   2   3   4   5   6   7octet
 +---+---+---+---+---+---+---+---+
-| type  | record specific data  |
-+---+   |
-...
-+---+
+| ident |
++---+---|
+| version   | flags |
++---+---+
 ```
 
-where type is one of the following values
 
+| Field | Description   |
+|---|---|
+| `ident`   | 0x78656e73746f7265 ('xenstore' in ASCII)  |
+|   |   |
+| `version` | 0x0001 (the version of the specification) |
+|   |   |
+| `flags`   | 0 (LSB): Endianness: 0 = little, 1 = big  |
+|   |   |
+|   | 1-31: Reserved (must be zero)  

Re: [PATCH] mini-os: Avoid segfaults in tc{g,s}etattr

2020-04-27 Thread Samuel Thibault
Jason Andryuk, le dim. 26 avril 2020 23:40:19 -0400, a ecrit:
> Commit c96c22f1d94 "mini-os: minimal implementations of some termios
> functions" introduced implementations of tcgetattr and tcsetattr.
> However, they do not check if files[fildes].cons.dev is non-NULL before
> dereferencing.  This is not a problem for FDs allocated through
> alloc_fd, but the files array pre-allocates FDs 0-2 for stdio.  Those
> entries have a NULL .dev, so tc{g,s}etattr on them segfault.
> 
> ioemu-stubdom segfaults when term_init() calls tcgetattr on FD 0.
> 
> Restore tcgetattr and tcsetattr behavior when .dev is NULL equivalent to
> unsupported_function as it was before c96c22f1d94.
> 
> Signed-off-by: Jason Andryuk 

Reviewed-by: Samuel Thibault 

Thanks!

> ---
> I can't get ioemu-stubdom to start without this.  With this, the guest
> just reboots immediately, but it does that with a non-stubdom
> device_model_version="qemu-xen-traditional" .  The same guest disk image
> (cirros 0.5.1) boots with a linux stubdom or non-stubdom Ubuntu
> qemu-system-x86_64.
> 
>  lib/sys.c | 10 ++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/lib/sys.c b/lib/sys.c
> index da434fc..c6a7b9f 100644
> --- a/lib/sys.c
> +++ b/lib/sys.c
> @@ -1472,6 +1472,11 @@ int tcsetattr(int fildes, int action, const struct 
> termios *tios)
>  return -1;
>  }
>  
> +if (files[fildes].cons.dev == NULL) {
> +errno = ENOSYS;
> +return -1;
> +}
> +
>  if (tios->c_oflag & OPOST)
>  files[fildes].cons.dev->is_raw = false;
>  else
> @@ -1492,6 +1497,11 @@ int tcgetattr(int fildes, struct termios *tios)
>  return -1;
>  }
>  
> +if (files[fildes].cons.dev == NULL) {
> +errno = ENOSYS;
> +return 0;
> +}
> +
>  if (tios == NULL) {
>  errno = EINVAL;
>  return -1;
> -- 
> 2.20.1
> 



RE: [PATCH] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Paul Durrant
> -Original Message-
> From: Jürgen Groß 
> Sent: 27 April 2020 08:42
> To: Paul Durrant ; xen-devel@lists.xenproject.org
> Cc: Paul Durrant 
> Subject: Re: [PATCH] docs/designs: re-work the xenstore migration document...
> 
> On 24.04.20 15:37, Paul Durrant wrote:
> > From: Paul Durrant 
> >
> > ... to specify a separate migration stream that will also be suitable for
> > live update.
> >
> > The original scope of the document was to support non-cooperative migration
> > of guests [1] but, since then, live update of xenstored has been brought 
> > into
> > scope. Thus it makes more sense to define a separate image format for
> > serializing xenstore state that is suitable for both purposes.
> >
> > The document has been limited to specifying a new image format. The 
> > mechanism
> > for acquiring the image for live update or migration is not covered as that
> > is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
> > also expected that, when the first implementation of live update or 
> > migration
> > making use of this specification is committed, that the document is moved 
> > from
> > docs/designs into docs/specs.
> 
> Can we please add a general remark:
> 
> Records depending other records (e.g. watch records referencing a
> connection record via a connection-id, or node records for a node
> being a child of another node in the stream) must always be located
> after the record they depend on.
> 

Sure. I'll call that out at the beginning of the records section.

  Paul

> 
> Juergen




Re: [PATCH] docs/designs: re-work the xenstore migration document...

2020-04-27 Thread Jürgen Groß

On 24.04.20 15:37, Paul Durrant wrote:

From: Paul Durrant 

... to specify a separate migration stream that will also be suitable for
live update.

The original scope of the document was to support non-cooperative migration
of guests [1] but, since then, live update of xenstored has been brought into
scope. Thus it makes more sense to define a separate image format for
serializing xenstore state that is suitable for both purposes.

The document has been limited to specifying a new image format. The mechanism
for acquiring the image for live update or migration is not covered as that
is more appropriately dealt with by a patch to docs/misc/xenstore.txt. It is
also expected that, when the first implementation of live update or migration
making use of this specification is committed, that the document is moved from
docs/designs into docs/specs.


Can we please add a general remark:

Records depending other records (e.g. watch records referencing a
connection record via a connection-id, or node records for a node
being a child of another node in the stream) must always be located
after the record they depend on.


Juergen



RE: [PATCH 02/11] xen: Fix and improve handling of device_add usb-host errors

2020-04-27 Thread Paul Durrant
> -Original Message-
> From: Markus Armbruster 
> Sent: 24 April 2020 20:20
> To: qemu-de...@nongnu.org
> Cc: Stefano Stabellini ; Anthony Perard 
> ; Paul
> Durrant ; Gerd Hoffmann ; 
> xen-devel@lists.xenproject.org
> Subject: [PATCH 02/11] xen: Fix and improve handling of device_add usb-host 
> errors
> 
> usbback_portid_add() leaks the error when qdev_device_add() fails.
> Fix that.  While there, use the error to improve the error message.
> 
> The qemu_opts_from_qdict() similarly leaks on failure.  But any
> failure there is a programming error.  Pass _abort.
> 
> Fixes: 816ac92ef769f9ffc534e49a1bb6177bddce7aa2
> Cc: Stefano Stabellini 
> Cc: Anthony Perard 
> Cc: Paul Durrant 
> Cc: Gerd Hoffmann 
> Cc: xen-devel@lists.xenproject.org
> Signed-off-by: Markus Armbruster 
> ---
>  hw/usb/xen-usb.c | 18 --
>  1 file changed, 8 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/usb/xen-usb.c b/hw/usb/xen-usb.c
> index 961190d0f7..42643c3390 100644
> --- a/hw/usb/xen-usb.c
> +++ b/hw/usb/xen-usb.c
> @@ -30,6 +30,7 @@
>  #include "hw/usb.h"
>  #include "hw/xen/xen-legacy-backend.h"
>  #include "monitor/qdev.h"
> +#include "qapi/error.h"
>  #include "qapi/qmp/qdict.h"
>  #include "qapi/qmp/qstring.h"
> 
> @@ -755,13 +756,15 @@ static void usbback_portid_add(struct usbback_info 
> *usbif, unsigned port,
>  qdict_put_int(qdict, "port", port);
>  qdict_put_int(qdict, "hostbus", atoi(busid));
>  qdict_put_str(qdict, "hostport", portname);
> -opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict, _err);
> -if (local_err) {
> -goto err;
> -}
> +opts = qemu_opts_from_qdict(qemu_find_opts("device"), qdict,
> +_abort);
>  usbif->ports[port - 1].dev = USB_DEVICE(qdev_device_add(opts, 
> _err));
>  if (!usbif->ports[port - 1].dev) {
> -goto err;
> +qobject_unref(qdict);
> +xen_pv_printf(>xendev, 0,
> +  "device %s could not be opened: %s\n",
> +  busid, error_get_pretty(local_err));
> +error_free(local_err);

Previously the goto caused the function to bail out. Should there not be a 
'return' here?

>  }
>  qobject_unref(qdict);
>  speed = usbif->ports[port - 1].dev->speed;
> @@ -793,11 +796,6 @@ static void usbback_portid_add(struct usbback_info 
> *usbif, unsigned port,
>  usbback_hotplug_enq(usbif, port);
> 
>  TR_BUS(>xendev, "port %d attached\n", port);
> -return;
> -
> -err:
> -qobject_unref(qdict);
> -xen_pv_printf(>xendev, 0, "device %s could not be opened\n", 
> busid);
>  }
> 
>  static void usbback_process_port(struct usbback_info *usbif, unsigned port)
> --
> 2.21.1





  1   2   >