Re: [Qemu-devel] [PATCH 5/7] sockets: Rename SocketAddressFlat to SocketAddress

2017-04-26 Thread Prasanna Kalever
On Wed, Apr 26, 2017 at 1:06 PM, Markus Armbruster  wrote:
> Signed-off-by: Markus Armbruster 
> ---
>  block/gluster.c| 48 
>  block/nbd.c| 26 +-
>  block/sheepdog.c   |  6 +++---
>  include/qemu/sockets.h |  4 ++--
>  qapi-schema.json   | 14 +++---
>  qapi/block-core.json   |  6 +++---
>  util/qemu-sockets.c| 10 +-
>  7 files changed, 57 insertions(+), 57 deletions(-)
>
> diff --git a/block/gluster.c b/block/gluster.c
> index cf29b5f..93eac87 100644
> --- a/block/gluster.c
> +++ b/block/gluster.c
> @@ -321,7 +321,7 @@ static int parse_volume_options(BlockdevOptionsGluster 
> *gconf, char *path)
>  static int qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf,
>const char *filename)
>  {
> -SocketAddressFlat *gsconf;
> +SocketAddress *gsconf;
>  URI *uri;
>  QueryParams *qp = NULL;
>  bool is_unix = false;
> @@ -332,19 +332,19 @@ static int 
> qemu_gluster_parse_uri(BlockdevOptionsGluster *gconf,
>  return -EINVAL;
>  }
>
> -gconf->server = g_new0(SocketAddressFlatList, 1);
> -gconf->server->value = gsconf = g_new0(SocketAddressFlat, 1);
> +gconf->server = g_new0(SocketAddressList, 1);
> +gconf->server->value = gsconf = g_new0(SocketAddress, 1);
>
>  /* transport */
>  if (!uri->scheme || !strcmp(uri->scheme, "gluster")) {
> -gsconf->type = SOCKET_ADDRESS_FLAT_TYPE_INET;
> +gsconf->type = SOCKET_ADDRESS_TYPE_INET;
>  } else if (!strcmp(uri->scheme, "gluster+tcp")) {
> -gsconf->type = SOCKET_ADDRESS_FLAT_TYPE_INET;
> +gsconf->type = SOCKET_ADDRESS_TYPE_INET;
>  } else if (!strcmp(uri->scheme, "gluster+unix")) {
> -gsconf->type = SOCKET_ADDRESS_FLAT_TYPE_UNIX;
> +gsconf->type = SOCKET_ADDRESS_TYPE_UNIX;
>  is_unix = true;
>  } else if (!strcmp(uri->scheme, "gluster+rdma")) {
> -gsconf->type = SOCKET_ADDRESS_FLAT_TYPE_INET;
> +gsconf->type = SOCKET_ADDRESS_TYPE_INET;
>  error_report("Warning: rdma feature is not supported, falling "
>   "back to tcp");
>  } else {
> @@ -396,7 +396,7 @@ static struct glfs 
> *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
>  struct glfs *glfs;
>  int ret;
>  int old_errno;
> -SocketAddressFlatList *server;
> +SocketAddressList *server;
>  unsigned long long port;
>
>  glfs = glfs_find_preopened(gconf->volume);
> @@ -413,11 +413,11 @@ static struct glfs 
> *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
>
>  for (server = gconf->server; server; server = server->next) {
>  switch (server->value->type) {
> -case SOCKET_ADDRESS_FLAT_TYPE_UNIX:
> +case SOCKET_ADDRESS_TYPE_UNIX:
>  ret = glfs_set_volfile_server(glfs, "unix",
> server->value->u.q_unix.path, 0);
>  break;
> -case SOCKET_ADDRESS_FLAT_TYPE_INET:
> +case SOCKET_ADDRESS_TYPE_INET:
>  if (parse_uint_full(server->value->u.inet.port, , 10) < 0 ||
>  port > 65535) {
>  error_setg(errp, "'%s' is not a valid port number",
> @@ -429,8 +429,8 @@ static struct glfs 
> *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
> server->value->u.inet.host,
> (int)port);
>  break;
> -case SOCKET_ADDRESS_FLAT_TYPE_VSOCK:
> -case SOCKET_ADDRESS_FLAT_TYPE_FD:
> +case SOCKET_ADDRESS_TYPE_VSOCK:
> +case SOCKET_ADDRESS_TYPE_FD:
>  default:
>  abort();
>  }
> @@ -450,7 +450,7 @@ static struct glfs 
> *qemu_gluster_glfs_init(BlockdevOptionsGluster *gconf,
>  error_setg(errp, "Gluster connection for volume %s, path %s failed"
>   " to connect", gconf->volume, gconf->path);
>  for (server = gconf->server; server; server = server->next) {
> -if (server->value->type  == SOCKET_ADDRESS_FLAT_TYPE_UNIX) {
> +if (server->value->type  == SOCKET_ADDRESS_TYPE_UNIX) {
>  error_append_hint(errp, "hint: failed on socket %s ",
>server->value->u.q_unix.path);
>  } else {
> @@ -487,8 +487,8 @@ static int qemu_gluster_parse_json(BlockdevOptionsGluster 
> *gconf,
>QDict *options, Error **errp)
>  {
>  QemuOpts *opts;
> -SocketAddressFlat *gsconf = NULL;
> -SocketAddressFlatList *curr = NULL;
> +SocketAddress *gsconf = NULL;
> +SocketAddressList *curr = NULL;
>  QDict *backing_options = NULL;
>  Error *local_err = NULL;
>  char *str = NULL;
> @@ -542,14 +542,14 @@ static int 
> qemu_gluster_parse_json(BlockdevOptionsGluster *gconf,
>  goto out;
>
>  }
> -

[Qemu-devel] [PULL v2 6/8] qga: improve fsfreeze documentations

2017-04-26 Thread Michael Roth
From: Marc-André Lureau 

Some users find the fsfreeze behaviour confusing. Add some notes about
invalid mount points and Windows usage.

Related to:
https://bugzilla.redhat.com/show_bug.cgi?id=1436976

Signed-off-by: Marc-André Lureau 
Reviewed-by: Vinzenz Feenstra 
Signed-off-by: Michael Roth 
---
 qga/qapi-schema.json | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index 6307ae2..0f4cba5 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -426,7 +426,13 @@
 ##
 # @guest-fsfreeze-freeze:
 #
-# Sync and freeze all freezable, local guest filesystems
+# Sync and freeze all freezable, local guest filesystems. If this
+# command succeeded, you may call @guest-fsfreeze-thaw later to
+# unfreeze.
+#
+# Note: On Windows, the command is implemented with the help of a
+# Volume Shadow-copy Service DLL helper. The frozen state is limited
+# for up to 10 seconds by VSS.
 #
 # Returns: Number of file systems currently frozen. On error, all filesystems
 # will be thawed.
@@ -439,10 +445,12 @@
 ##
 # @guest-fsfreeze-freeze-list:
 #
-# Sync and freeze specified guest filesystems
+# Sync and freeze specified guest filesystems.
+# See also @guest-fsfreeze-freeze.
 #
 # @mountpoints: an array of mountpoints of filesystems to be frozen.
 #   If omitted, every mounted filesystem is frozen.
+#   Invalid mount points are ignored.
 #
 # Returns: Number of file systems currently frozen. On error, all filesystems
 # will be thawed.
-- 
2.7.4




[Qemu-devel] [PULL v2 1/8] qemu-ga: Make QGA VSS provider service run only when needed

2017-04-26 Thread Michael Roth
From: Sameeh Jubran 

Currently the service runs in background on boot even though it is not
needed and once it is running it never stops. The service needs to be
running only during freeze operation and it should be stopped after
executing thaw.

Signed-off-by: Sameeh Jubran 
Signed-off-by: Michael Roth 
---
 qga/vss-win32/install.cpp   | 28 ++--
 qga/vss-win32/install.h | 20 
 qga/vss-win32/requester.cpp |  2 ++
 3 files changed, 48 insertions(+), 2 deletions(-)
 create mode 100644 qga/vss-win32/install.h

diff --git a/qga/vss-win32/install.cpp b/qga/vss-win32/install.cpp
index f4160a3..f41fcdf 100644
--- a/qga/vss-win32/install.cpp
+++ b/qga/vss-win32/install.cpp
@@ -14,7 +14,7 @@
 
 #include "vss-common.h"
 #include 
-#include 
+#include "install.h"
 #include 
 #include 
 #include 
@@ -276,7 +276,7 @@ STDAPI COMRegister(void)
 
 chk(pCatalog->CreateServiceForApplication(
 _bstr_t(QGA_PROVIDER_LNAME), _bstr_t(QGA_PROVIDER_LNAME),
-_bstr_t(L"SERVICE_AUTO_START"), _bstr_t(L"SERVICE_ERROR_NORMAL"),
+_bstr_t(L"SERVICE_DEMAND_START"), _bstr_t(L"SERVICE_ERROR_NORMAL"),
 _bstr_t(L""), _bstr_t(L".\\localsystem"), _bstr_t(L""), FALSE));
 chk(pCatalog->InstallComponent(_bstr_t(QGA_PROVIDER_LNAME),
_bstr_t(dllPath), _bstr_t(tlbPath),
@@ -461,3 +461,27 @@ namespace _com_util
 return bstr;
 }
 }
+
+/* Stop QGA VSS provider service from COM+ Application Admin Catalog */
+
+STDAPI StopService(void)
+{
+HRESULT hr;
+COMInitializer initializer;
+COMPointer pUnknown;
+COMPointer pCatalog;
+
+int count = 0;
+
+chk(QGAProviderFind(QGAProviderCount, (void *)));
+if (count) {
+chk(CoCreateInstance(CLSID_COMAdminCatalog, NULL, CLSCTX_INPROC_SERVER,
+IID_IUnknown, (void **)pUnknown.replace()));
+chk(pUnknown->QueryInterface(IID_ICOMAdminCatalog2,
+(void **)pCatalog.replace()));
+chk(pCatalog->ShutdownApplication(_bstr_t(QGA_PROVIDER_LNAME)));
+}
+
+out:
+return hr;
+}
diff --git a/qga/vss-win32/install.h b/qga/vss-win32/install.h
new file mode 100644
index 000..35364af
--- /dev/null
+++ b/qga/vss-win32/install.h
@@ -0,0 +1,20 @@
+/*
+ * QEMU Guest Agent VSS requester declarations
+ *
+ * Copyright Hitachi Data Systems Corp. 2013
+ *
+ * Authors:
+ *  Tomoki Sekiyama   
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef INSTALL_H
+#define INSTALL_H
+
+#include 
+
+STDAPI StopService(void);
+
+#endif
diff --git a/qga/vss-win32/requester.cpp b/qga/vss-win32/requester.cpp
index 0cd2f0e..301762d 100644
--- a/qga/vss-win32/requester.cpp
+++ b/qga/vss-win32/requester.cpp
@@ -13,6 +13,7 @@
 #include "qemu/osdep.h"
 #include "vss-common.h"
 #include "requester.h"
+#include "install.h"
 #include 
 #include 
 
@@ -501,4 +502,5 @@ void requester_thaw(int *num_vols, ErrorSet *errset)
 requester_cleanup();
 
 CoUninitialize();
+StopService();
 }
-- 
2.7.4




[Qemu-devel] [PULL v2 5/8] qga: Add 'guest-get-host-name' command

2017-04-26 Thread Michael Roth
From: Vinzenz Feenstra 

Retrieving the guest host name is a very useful feature for virtual management
systems. This information can help to have more user friendly VM access
details, instead of an IP there would be the host name. Also the host name
reported can be used to have automated checks for valid SSL certificates.

virsh # qemu-agent-command F25 '{ "execute": "guest-get-host-name" }'
{"return":{"host-name":"F25.lab.evilissimo.net"}}

Signed-off-by: Vinzenz Feenstra 
* minor whitespace fix-ups
Signed-off-by: Michael Roth 
---
 qga/commands.c   | 11 +++
 qga/qapi-schema.json | 26 ++
 2 files changed, 37 insertions(+)

diff --git a/qga/commands.c b/qga/commands.c
index 4d92946..57a31bb 100644
--- a/qga/commands.c
+++ b/qga/commands.c
@@ -499,3 +499,14 @@ int ga_parse_whence(GuestFileWhence *whence, Error **errp)
 error_setg(errp, "invalid whence code %"PRId64, whence->u.value);
 return -1;
 }
+
+GuestHostName *qmp_guest_get_host_name(Error **err)
+{
+GuestHostName *result = NULL;
+gchar const *hostname = g_get_host_name();
+if (hostname != NULL) {
+result = g_new0(GuestHostName, 1);
+result->host_name = g_strdup(hostname);
+}
+return result;
+}
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index a02dbf2..6307ae2 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -1042,3 +1042,29 @@
   'data':{ 'path': 'str', '*arg': ['str'], '*env': ['str'],
'*input-data': 'str', '*capture-output': 'bool' },
   'returns': 'GuestExec' }
+
+
+##
+# @GuestHostName:
+# @host-name: Host name of the guest OS (not necessarily fully qualified)
+#
+# Since: 2.10
+##
+{ 'struct': 'GuestHostName',
+  'data':   { 'host-name': 'str' } }
+
+##
+# @guest-get-host-name:
+#
+# Return a name for the machine.
+#
+# The returned name is not necessarily a fully-qualified domain name, or even
+# present in DNS or some other name service at all. It need not even be unique
+# on your local network or site, but usually it is.
+#
+# Returns: the host name of the machine on success
+#
+# Since: 2.10
+##
+{ 'command': 'guest-get-host-name',
+  'returns': 'GuestHostName' }
-- 
2.7.4




[Qemu-devel] [PULL v2 4/8] qga-win: Fix Event Viewer errors caused by qemu-ga

2017-04-26 Thread Michael Roth
From: Sameeh Jubran 

When the command "guest-fsfreeze-freeze" is executed it causes
the VSS service to log the error below in the Event Viewer. This
error is caused by an issue in the function "CommitSnapshots" in
provider.cpp:

* When VSS_TIMEOUT_MSEC expires the function returns E_ABORT. This causes
the error #12293.

|event id|   error   |
* 12293  : Volume Shadow Copy Service error: Error calling a routine on a
   Shadow Copy Provider {----}.
   Routine details CommitSnapshots [hr = 0x80004004, Operation
   aborted.

Signed-off-by: Sameeh Jubran 
Signed-off-by: Michael Roth 
---
 qga/vss-win32/provider.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/qga/vss-win32/provider.cpp b/qga/vss-win32/provider.cpp
index ef94669..72d8b0e 100644
--- a/qga/vss-win32/provider.cpp
+++ b/qga/vss-win32/provider.cpp
@@ -377,7 +377,6 @@ STDMETHODIMP CQGAVssProvider::CommitSnapshots(VSS_ID 
SnapshotSetId)
 if (WaitForSingleObject(hEventThaw, VSS_TIMEOUT_MSEC) != WAIT_OBJECT_0) {
 /* Send event to qemu-ga to notify the provider is timed out */
 SetEvent(hEventTimeout);
-hr = E_ABORT;
 }
 
 CloseHandle(hEventThaw);
-- 
2.7.4




[Qemu-devel] [PULL v2 8/8] qga: Add `guest-get-timezone` command

2017-04-26 Thread Michael Roth
From: Vinzenz Feenstra 

Adds a new command `guest-get-timezone` reporting the currently
configured timezone on the system. The information on what timezone is
currently configured is useful in case of Windows VMs where the
offset of the hardware clock is required to have the same offset. This
can be used for management systems like `oVirt` to detect the timezone
difference and warn administrators of the misconfiguration.

Signed-off-by: Vinzenz Feenstra 
Reviewed-by: Sameeh Jubran 
Tested-by: Sameeh Jubran 
* moved stub implementation to end of function for consistency
* document that timezone names are for informational use only.
Signed-off-by: Michael Roth 
---
 qga/commands.c   | 38 ++
 qga/qapi-schema.json | 25 +
 2 files changed, 63 insertions(+)

diff --git a/qga/commands.c b/qga/commands.c
index 57a31bb..ed5 100644
--- a/qga/commands.c
+++ b/qga/commands.c
@@ -510,3 +510,41 @@ GuestHostName *qmp_guest_get_host_name(Error **err)
 }
 return result;
 }
+
+GuestTimezone *qmp_guest_get_timezone(Error **errp)
+{
+#if GLIB_CHECK_VERSION(2, 28, 0)
+GuestTimezone *info = NULL;
+GTimeZone *tz = NULL;
+gint64 now = 0;
+gint32 intv = 0;
+gchar const *name = NULL;
+
+info = g_new0(GuestTimezone, 1);
+tz = g_time_zone_new_local();
+if (tz == NULL) {
+error_setg(errp, QERR_QGA_COMMAND_FAILED,
+   "Couldn't retrieve local timezone");
+goto error;
+}
+
+now = g_get_real_time() / G_USEC_PER_SEC;
+intv = g_time_zone_find_interval(tz, G_TIME_TYPE_UNIVERSAL, now);
+info->offset = g_time_zone_get_offset(tz, intv);
+name = g_time_zone_get_abbreviation(tz, intv);
+if (name != NULL) {
+info->has_zone = true;
+info->zone = g_strdup(name);
+}
+g_time_zone_unref(tz);
+
+return info;
+
+error:
+g_free(info);
+return NULL;
+#else
+error_setg(errp, QERR_UNSUPPORTED);
+return NULL;
+#endif
+}
diff --git a/qga/qapi-schema.json b/qga/qapi-schema.json
index f25467a..03743ab 100644
--- a/qga/qapi-schema.json
+++ b/qga/qapi-schema.json
@@ -1101,3 +1101,28 @@
 ##
 { 'command': 'guest-get-users',
   'returns': ['GuestUser'] }
+
+##
+# @GuestTimezone:
+#
+# @zone:Timezone name. These values may differ depending on guest/OS and
+#   should only be used for informational purposes.
+# @offset:  Offset to UTC in seconds, negative numbers for time zones west of
+#   GMT, positive numbers for east
+#
+# Since: 2.10
+##
+{ 'struct': 'GuestTimezone',
+  'data':   { '*zone': 'str', 'offset': 'int' } }
+
+##
+# @guest-get-timezone:
+#
+# Retrieves the timezone information from the guest.
+#
+# Returns: A GuestTimezone dictionary.
+#
+# Since: 2.10
+##
+{ 'command': 'guest-get-timezone',
+  'returns': 'GuestTimezone' }
-- 
2.7.4




[Qemu-devel] [PULL v2 3/8] qga-win: Fix a bug where qemu-ga service is stuck during stop operation

2017-04-26 Thread Michael Roth
From: Sameeh Jubran 

After triggering a freeze command without any following thaw command,
qemu-ga will not respond to stop operation. This behaviour is wanted on Linux
as there is no time limit for a freeze command and we want to prevent
quitting in the middle of freeze. On Windows, on the other hand, the time
limit for freeze is 10 seconds, so we should wait for the timeout, thaw
the file system and quit.

Signed-off-by: Sameeh Jubran 
Signed-off-by: Michael Roth 
---
 qga/main.c  | 23 +++
 qga/vss-win32.h |  1 +
 qga/vss-win32/vss-common.h  | 11 +--
 qga/vss-win32/vss-handles.h | 14 ++
 4 files changed, 39 insertions(+), 10 deletions(-)
 create mode 100644 qga/vss-win32/vss-handles.h

diff --git a/qga/main.c b/qga/main.c
index 07c2953..ad6f68f 100644
--- a/qga/main.c
+++ b/qga/main.c
@@ -131,9 +131,32 @@ static void quit_handler(int sig)
  * unless all log/pid files are on unfreezable filesystems. there's
  * also a very likely chance killing the agent before unfreezing
  * the filesystems is a mistake (or will be viewed as one later).
+ * On Windows the freeze interval is limited to 10 seconds, so
+ * we should quit, but first we should wait for the timeout, thaw
+ * the filesystem and quit.
  */
 if (ga_is_frozen(ga_state)) {
+#ifdef _WIN32
+int i = 0;
+Error *err = NULL;
+HANDLE hEventTimeout;
+
+g_debug("Thawing filesystems before exiting");
+
+hEventTimeout = OpenEvent(EVENT_ALL_ACCESS, FALSE, EVENT_NAME_TIMEOUT);
+if (hEventTimeout) {
+WaitForSingleObject(hEventTimeout, 0);
+CloseHandle(hEventTimeout);
+}
+qga_vss_fsfreeze(, false, );
+if (err) {
+g_debug("Error unfreezing filesystems prior to exiting: %s",
+error_get_pretty(err));
+error_free(err);
+}
+#else
 return;
+#endif
 }
 g_debug("received signal num %d, quitting", sig);
 
diff --git a/qga/vss-win32.h b/qga/vss-win32.h
index 51d303a..4f8e39a 100644
--- a/qga/vss-win32.h
+++ b/qga/vss-win32.h
@@ -13,6 +13,7 @@
 #ifndef VSS_WIN32_H
 #define VSS_WIN32_H
 
+#include "qga/vss-win32/vss-handles.h"
 
 bool vss_init(bool init_requester);
 void vss_deinit(bool deinit_requester);
diff --git a/qga/vss-win32/vss-common.h b/qga/vss-win32/vss-common.h
index c81a856..61c170b 100644
--- a/qga/vss-win32/vss-common.h
+++ b/qga/vss-win32/vss-common.h
@@ -51,21 +51,12 @@
  * http://www.microsoft.com/en-us/download/details.aspx?id=23490
  */
 #include 
+#include "vss-handles.h"
 
 /* Macros to convert char definitions to wchar */
 #define _L(a) L##a
 #define L(a) _L(a)
 
-/* Constants for QGA VSS Provider */
-
-#define QGA_PROVIDER_NAME "QEMU Guest Agent VSS Provider"
-#define QGA_PROVIDER_LNAME L(QGA_PROVIDER_NAME)
-#define QGA_PROVIDER_VERSION L(QEMU_VERSION)
-
-#define EVENT_NAME_FROZEN  "Global\\QGAVSSEvent-frozen"
-#define EVENT_NAME_THAW"Global\\QGAVSSEvent-thaw"
-#define EVENT_NAME_TIMEOUT "Global\\QGAVSSEvent-timeout"
-
 const GUID g_gProviderId = { 0x3629d4ed, 0xee09, 0x4e0e,
 {0x9a, 0x5c, 0x6d, 0x8b, 0xa2, 0x87, 0x2a, 0xef} };
 const GUID g_gProviderVersion = { 0x11ef8b15, 0xcac6, 0x40d6,
diff --git a/qga/vss-win32/vss-handles.h b/qga/vss-win32/vss-handles.h
new file mode 100644
index 000..ff399dd
--- /dev/null
+++ b/qga/vss-win32/vss-handles.h
@@ -0,0 +1,14 @@
+#ifndef VSS_HANDLES
+#define VSS_HANDLES
+
+/* Constants for QGA VSS Provider */
+
+#define QGA_PROVIDER_NAME "QEMU Guest Agent VSS Provider"
+#define QGA_PROVIDER_LNAME L(QGA_PROVIDER_NAME)
+#define QGA_PROVIDER_VERSION L(QEMU_VERSION)
+
+#define EVENT_NAME_FROZEN  "Global\\QGAVSSEvent-frozen"
+#define EVENT_NAME_THAW"Global\\QGAVSSEvent-thaw"
+#define EVENT_NAME_TIMEOUT "Global\\QGAVSSEvent-timeout"
+
+#endif
-- 
2.7.4




[Qemu-devel] [PULL v2 2/8] qga-win: Enable 'can-offline' field in 'guest-get-vcpus' reply

2017-04-26 Thread Michael Roth
From: Sameeh Jubran 

The QGA schema states:

@can-offline: Whether offlining the VCPU is possible. This member
   is always filled in by the guest agent when the structure
   is returned, and always ignored on input (hence it can be
   omitted then).

Currently 'can-offline' is missing entirely from the reply. This causes
errors in libvirt which is expecting the reply to be compliant with the
schema docs.

BZ#1438735: https://bugzilla.redhat.com/show_bug.cgi?id=1438735

Signed-off-by: Sameeh Jubran 
Reviewed-by: Eric Blake 
Cc: qemu-sta...@nongnu.org
Signed-off-by: Michael Roth 
---
 qga/commands-win32.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index 04026ee..9fec1fb 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -1344,7 +1344,7 @@ GuestLogicalProcessorList *qmp_guest_get_vcpus(Error 
**errp)
 vcpu = g_malloc0(sizeof *vcpu);
 vcpu->logical_id = current++;
 vcpu->online = true;
-vcpu->has_can_offline = false;
+vcpu->has_can_offline = true;
 
 entry = g_malloc0(sizeof *entry);
 entry->value = vcpu;
-- 
2.7.4




[Qemu-devel] [PULL v2 7/8] qga: Add 'guest-get-users' command

2017-04-26 Thread Michael Roth
From: Vinzenz Feenstra 

A command that will list all currently logged in users, and the time
at which each of them logged in.

Examples:

virsh # qemu-agent-command F25 '{ "execute": "guest-get-users" }'
{"return":[{"login-time":1490622289.903835,"user":"root"}]}

virsh # qemu-agent-command Win2k12r2 '{ "execute": "guest-get-users" }'
{"return":[{"login-time":1490351044.670552,"domain":"LADIDA",
"user":"Administrator"}]}

Signed-off-by: Vinzenz Feenstra 
* make g_hash_table_contains compat func inline to avoid
  unused warnings
Signed-off-by: Michael Roth 
---
 configure |   2 +-
 include/glib-compat.h |   6 +++
 qga/commands-posix.c  |  60 +
 qga/commands-win32.c  | 103 ++
 qga/qapi-schema.json  |  25 
 5 files changed, 195 insertions(+), 1 deletion(-)

diff --git a/configure b/configure
index c35acf1..80d17f3 100755
--- a/configure
+++ b/configure
@@ -743,7 +743,7 @@ if test "$mingw32" = "yes" ; then
   sysconfdir="\${prefix}"
   local_statedir=
   confsuffix=""
-  libs_qga="-lws2_32 -lwinmm -lpowrprof -liphlpapi -lnetapi32 $libs_qga"
+  libs_qga="-lws2_32 -lwinmm -lpowrprof -lwtsapi32 -liphlpapi -lnetapi32 
$libs_qga"
 fi
 
 werror=""
diff --git a/include/glib-compat.h b/include/glib-compat.h
index 863c8cf..fcffcd3 100644
--- a/include/glib-compat.h
+++ b/include/glib-compat.h
@@ -217,6 +217,12 @@ static inline void g_hash_table_add(GHashTable 
*hash_table, gpointer key)
 {
 g_hash_table_replace(hash_table, key, key);
 }
+
+static inline gboolean g_hash_table_contains(GHashTable *hash_table,
+ gpointer key)
+{
+return g_hash_table_lookup_extended(hash_table, key, NULL, NULL);
+}
 #endif
 
 #ifndef g_assert_true
diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 915df9e..ba06be4 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "qga/guest-agent-core.h"
 #include "qga-qmp-commands.h"
 #include "qapi/qmp/qerror.h"
@@ -2517,3 +2518,62 @@ void ga_command_state_init(GAState *s, GACommandState 
*cs)
 ga_command_state_add(cs, NULL, guest_fsfreeze_cleanup);
 #endif
 }
+
+#define QGA_MICRO_SECOND_TO_SECOND 100
+
+static double ga_get_login_time(struct utmpx *user_info)
+{
+double seconds = (double)user_info->ut_tv.tv_sec;
+double useconds = (double)user_info->ut_tv.tv_usec;
+useconds /= QGA_MICRO_SECOND_TO_SECOND;
+return seconds + useconds;
+}
+
+GuestUserList *qmp_guest_get_users(Error **err)
+{
+GHashTable *cache = NULL;
+GuestUserList *head = NULL, *cur_item = NULL;
+struct utmpx *user_info = NULL;
+gpointer value = NULL;
+GuestUser *user = NULL;
+GuestUserList *item = NULL;
+double login_time = 0;
+
+cache = g_hash_table_new(g_str_hash, g_str_equal);
+setutxent();
+
+for (;;) {
+user_info = getutxent();
+if (user_info == NULL) {
+break;
+} else if (user_info->ut_type != USER_PROCESS) {
+continue;
+} else if (g_hash_table_contains(cache, user_info->ut_user)) {
+value = g_hash_table_lookup(cache, user_info->ut_user);
+user = (GuestUser *)value;
+login_time = ga_get_login_time(user_info);
+/* We're ensuring the earliest login time to be sent */
+if (login_time < user->login_time) {
+user->login_time = login_time;
+}
+continue;
+}
+
+item = g_new0(GuestUserList, 1);
+item->value = g_new0(GuestUser, 1);
+item->value->user = g_strdup(user_info->ut_user);
+item->value->login_time = ga_get_login_time(user_info);
+
+g_hash_table_insert(cache, item->value->user, item->value);
+
+if (!cur_item) {
+head = cur_item = item;
+} else {
+cur_item->next = item;
+cur_item = item;
+}
+}
+endutxent();
+g_hash_table_destroy(cache);
+return head;
+}
diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index 9fec1fb..439d229 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -11,6 +11,9 @@
  * See the COPYING file in the top-level directory.
  */
 
+#ifndef _WIN32_WINNT
+#   define _WIN32_WINNT 0x0600
+#endif
 #include "qemu/osdep.h"
 #include 
 #include 
@@ -25,6 +28,7 @@
 #include 
 #endif
 #include 
+#include 
 
 #include "qga/guest-agent-core.h"
 #include "qga/vss-win32.h"
@@ -1536,3 +1540,102 @@ void ga_command_state_init(GAState *s, GACommandState 
*cs)
 ga_command_state_add(cs, NULL, guest_fsfreeze_cleanup);
 }
 }
+
+/* MINGW is missing two fields: IncomingFrames & OutgoingFrames */
+typedef struct _GA_WTSINFOA {
+WTS_CONNECTSTATE_CLASS State;
+DWORD SessionId;
+DWORD IncomingBytes;
+DWORD OutgoingBytes;
+DWORD IncomingFrames;
+  

[Qemu-devel] [PULL v2 0/8] qemu-ga patch queue

2017-04-26 Thread Michael Roth
The following changes since commit 81b2d5ceb0cfb4cdc2163492e3169ed714b0cda9:

  Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20170426' into 
staging (2017-04-26 20:50:49 +0100)

are available in the git repository at:

  git://github.com/mdroth/qemu.git tags/qga-pull-2017-04-25-v2-tag

for you to fetch changes up to 53c58e64d0a27c59d763778faa2b5a522c544719:

  qga: Add `guest-get-timezone` command (2017-04-27 00:40:25 -0500)


qemu-ga patch queue

* new commands: guest-get-timezone, guest-get-users, guest-get-host-name
* fix hang on w32 when stopping qemu-ga service while fs frozen
* fix missing setting of can-offline in guest-get-vcpus
* make qemu-ga VSS w32 service on-demand rather than on-startup
* fix unnecessary errors to EventLog on w32
* improvements to fsfreeze documentation

v2:
 * document 'zone' field of guest-get-timezone as informational-only
   (Daniel, Eric)
 * fix build error for glib < 2.32 (Peter)


Marc-André Lureau (1):
  qga: improve fsfreeze documentations

Sameeh Jubran (4):
  qemu-ga: Make QGA VSS provider service run only when needed
  qga-win: Enable 'can-offline' field in 'guest-get-vcpus' reply
  qga-win: Fix a bug where qemu-ga service is stuck during stop operation
  qga-win: Fix Event Viewer errors caused by qemu-ga

Vinzenz Feenstra (3):
  qga: Add 'guest-get-host-name' command
  qga: Add 'guest-get-users' command
  qga: Add `guest-get-timezone` command

 configure   |   2 +-
 include/glib-compat.h   |   6 +++
 qga/commands-posix.c|  60 +
 qga/commands-win32.c| 105 +++-
 qga/commands.c  |  49 +
 qga/main.c  |  23 ++
 qga/qapi-schema.json|  88 -
 qga/vss-win32.h |   1 +
 qga/vss-win32/install.cpp   |  28 +++-
 qga/vss-win32/install.h |  20 +
 qga/vss-win32/provider.cpp  |   1 -
 qga/vss-win32/requester.cpp |   2 +
 qga/vss-win32/vss-common.h  |  11 +
 qga/vss-win32/vss-handles.h |  14 ++
 14 files changed, 393 insertions(+), 17 deletions(-)
 create mode 100644 qga/vss-win32/install.h
 create mode 100644 qga/vss-win32/vss-handles.h




Re: [Qemu-devel] dns server not working in QEMU using usermode networking (SLIRP)

2017-04-26 Thread FONNEMANN Mark
-Original Message-
From: Stefan Weil [mailto:s...@weilnetz.de] 
Sent: Friday, April 21, 2017 15:58
To: FONNEMANN Mark ; qemu-devel@nongnu.org; Samuel 
Thibault 
Subject: Re: [Qemu-devel] dns server not working in QEMU using usermode 
networking (SLIRP)

>See report on
>http://stackoverflow.com/questions/43308310/dns-server-not-working-in-qemu-usermode-networking.

>Mark, did you also get that problem with QEMU running on a Linux host, or is 
>it specific for QEMU running on Windows?

I just verified the problem exists on Linux host as well. I cannot ping the DNS 
or SMB server and the DNS server does not respond to nslookup requests.



[Qemu-devel] [PATCH v3 6/6] target/ppc: do not reset reserve_addr in exec_enter

2017-04-26 Thread Nikunj A Dadhania
When an atomic operation is not supported, exit_atomic is called
and we stop the world and execute the atomic operation. This results
in the following call chain:

tcg_gen_atomic_cmpxchg_tl()
  -> gen_helper_exit_atomic()
 -> HELPER(exit_atomic)
-> cpu_loop_exit_atomic() -> EXCP_ATOMIC
   -> qemu_tcg_cpu_thread_fn() => case EXCP_ATOMIC
  -> cpu_exec_step_atomic()
 -> cpu_step_atomic()
-> cc->cpu_exec_enter() = ppc_cpu_exec_enter()
   Sets env->reserve_addr = -1;

But by the time it returns, the reservation has been erased and the code
fails; this continues forever and the lock is never taken.

Instead set this in powerpc_excp()

Now that ppc_cpu_exec_enter() doesn't have anything meaningful to do,
let us get rid of the function.

Signed-off-by: Nikunj A Dadhania 
---
 target/ppc/excp_helper.c| 3 +++
 target/ppc/translate_init.c | 9 -
 2 files changed, 3 insertions(+), 9 deletions(-)

diff --git a/target/ppc/excp_helper.c b/target/ppc/excp_helper.c
index f4ee7aa..a6bcb47 100644
--- a/target/ppc/excp_helper.c
+++ b/target/ppc/excp_helper.c
@@ -728,6 +728,9 @@ static inline void powerpc_excp(PowerPCCPU *cpu, int 
excp_model, int excp)
 cs->exception_index = POWERPC_EXCP_NONE;
 env->error_code = 0;
 
+/* Reset the reservation */
+env->reserve_addr = -1;
+
 /* Any interrupt is context synchronizing, check if TCG TLB
  * needs a delayed flush on ppc64
  */
diff --git a/target/ppc/translate_init.c b/target/ppc/translate_init.c
index e82e3e6..9b048cd 100644
--- a/target/ppc/translate_init.c
+++ b/target/ppc/translate_init.c
@@ -10436,14 +10436,6 @@ static bool ppc_cpu_has_work(CPUState *cs)
 return msr_ee && (cs->interrupt_request & CPU_INTERRUPT_HARD);
 }
 
-static void ppc_cpu_exec_enter(CPUState *cs)
-{
-PowerPCCPU *cpu = POWERPC_CPU(cs);
-CPUPPCState *env = >env;
-
-env->reserve_addr = -1;
-}
-
 /* CPUClass::reset() */
 static void ppc_cpu_reset(CPUState *s)
 {
@@ -10660,7 +10652,6 @@ static void ppc_cpu_class_init(ObjectClass *oc, void 
*data)
 cc->get_phys_page_debug = ppc_cpu_get_phys_page_debug;
 cc->vmsd = _ppc_cpu;
 #endif
-cc->cpu_exec_enter = ppc_cpu_exec_enter;
 #if defined(CONFIG_SOFTMMU)
 cc->write_elf64_note = ppc64_cpu_write_elf64_note;
 cc->write_elf32_note = ppc32_cpu_write_elf32_note;
-- 
2.9.3




[Qemu-devel] [PATCH v3 3/6] target/ppc: Generate fence operations

2017-04-26 Thread Nikunj A Dadhania
Signed-off-by: Nikunj A Dadhania 
Reviewed-by: Richard Henderson 
---
 target/ppc/translate.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index 50b6d4d..4a1f24a 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -2971,6 +2971,7 @@ static void gen_stswx(DisasContext *ctx)
 /* eieio */
 static void gen_eieio(DisasContext *ctx)
 {
+tcg_gen_mb(TCG_MO_LD_ST | TCG_BAR_SC);
 }
 
 #if !defined(CONFIG_USER_ONLY)
@@ -3008,6 +3009,7 @@ static void gen_isync(DisasContext *ctx)
 if (!ctx->pr) {
 gen_check_tlb_flush(ctx, false);
 }
+tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
 gen_stop_exception(ctx);
 }
 
@@ -3028,6 +3030,7 @@ static void gen_##name(DisasContext *ctx) 
   \
 tcg_gen_qemu_ld_tl(gpr, t0, ctx->mem_idx, memop);\
 tcg_gen_mov_tl(cpu_reserve, t0); \
 tcg_gen_mov_tl(cpu_reserve_val, gpr);\
+tcg_gen_mb(TCG_MO_ALL | TCG_BAR_LDAQ);   \
 tcg_temp_free(t0);   \
 }
 
@@ -3177,6 +3180,10 @@ static void gen_conditional_store(DisasContext *ctx, 
TCGv EA,
 tcg_gen_br(l2);
 
 gen_set_label(l1);
+
+/* Address mismatch implies failure.  But we still need to provide the
+   memory barrier semantics of the instruction.  */
+tcg_gen_mb(TCG_MO_ALL | TCG_BAR_STRL);
 tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
 
 gen_set_label(l2);
@@ -3308,6 +3315,7 @@ static void gen_sync(DisasContext *ctx)
 if (((l == 2) || !(ctx->insns_flags & PPC_64B)) && !ctx->pr) {
 gen_check_tlb_flush(ctx, true);
 }
+tcg_gen_mb(TCG_MO_ALL | TCG_BAR_SC);
 }
 
 /* wait */
-- 
2.9.3




[Qemu-devel] [PATCH v3] Issue a deprecation warning if the user specifies the "-hdachs" option.

2017-04-26 Thread Thomas Huth
If the user needs to specify the disk geometry, the corresponding
parameters of the "-device ide-hd" option should be used instead.
"-hdachs" is considered as deprecated and might be removed soon.

Signed-off-by: Thomas Huth 
---
 v3:
 - Recommend to use the parameters of -device ide-hd instead
 - Also add the deprecation warning to the documentation

 qemu-options.hx | 4 ++--
 vl.c| 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/qemu-options.hx b/qemu-options.hx
index 787b9c3..f68829f 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -803,8 +803,8 @@ STEXI
 Force hard disk 0 physical geometry (1 <= @var{c} <= 16383, 1 <=
 @var{h} <= 16, 1 <= @var{s} <= 63) and optionally force the BIOS
 translation mode (@var{t}=none, lba or auto). Usually QEMU can guess
-all those parameters. This option is useful for old MS-DOS disk
-images.
+all those parameters. This option is deprecated, please use
+@code{-device ide-hd,cyls=c,heads=h,secs=s,...} instead.
 ETEXI
 
 DEF("fsdev", HAS_ARG, QEMU_OPTION_fsdev,
diff --git a/vl.c b/vl.c
index f46e070..42d4bce 100644
--- a/vl.c
+++ b/vl.c
@@ -3231,6 +3231,8 @@ int main(int argc, char **argv, char **envp)
 }
 }
 }
+error_report("'-hdachs' is deprecated, please use '-device"
+ " ide-hd,cyls=c,heads=h,secs=s,...' instead");
 break;
 case QEMU_OPTION_numa:
 opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
-- 
1.8.3.1




[Qemu-devel] [PATCH v3 4/6] cpus: Fix CPU unplug for MTTCG

2017-04-26 Thread Nikunj A Dadhania
From: Bharata B Rao 

Ensure that the unplugged CPU thread is destroyed and the waiting
thread is notified about it. This is needed for CPU unplug to work
correctly in MTTCG mode.

Signed-off-by: Bharata B Rao 
Signed-off-by: Nikunj A Dadhania 
---
 cpus.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/cpus.c b/cpus.c
index 740b8dc..79f780b 100644
--- a/cpus.c
+++ b/cpus.c
@@ -1483,6 +1483,12 @@ static void *qemu_tcg_cpu_thread_fn(void *arg)
 /* Ignore everything else? */
 break;
 }
+} else if (cpu->unplug) {
+qemu_tcg_destroy_vcpu(cpu);
+cpu->created = false;
+qemu_cond_signal(&qemu_cpu_cond);
+qemu_mutex_unlock_iothread();
+return NULL;
 }
 
 atomic_mb_set(&cpu->exit_request, 0);
-- 
2.9.3




[Qemu-devel] [PATCH v3 1/6] target/ppc: Emulate LL/SC using cmpxchg helpers

2017-04-26 Thread Nikunj A Dadhania
Emulating LL/SC with cmpxchg is not correct, since it can suffer from
the ABA problem. However, portable parallel code is written assuming
only cmpxchg which means that in practice this is a viable alternative.

Signed-off-by: Nikunj A Dadhania 
Reviewed-by: Richard Henderson 
---
 target/ppc/translate.c | 29 +++--
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/target/ppc/translate.c b/target/ppc/translate.c
index f40b5a1..50b6d4d 100644
--- a/target/ppc/translate.c
+++ b/target/ppc/translate.c
@@ -73,6 +73,7 @@ static TCGv cpu_cfar;
 #endif
 static TCGv cpu_xer, cpu_so, cpu_ov, cpu_ca, cpu_ov32, cpu_ca32;
 static TCGv cpu_reserve;
+static TCGv cpu_reserve_val;
 static TCGv cpu_fpscr;
 static TCGv_i32 cpu_access_type;
 
@@ -181,6 +182,9 @@ void ppc_translate_init(void)
 cpu_reserve = tcg_global_mem_new(cpu_env,
  offsetof(CPUPPCState, reserve_addr),
  "reserve_addr");
+cpu_reserve_val = tcg_global_mem_new(cpu_env,
+ offsetof(CPUPPCState, reserve_val),
+ "reserve_val");
 
 cpu_fpscr = tcg_global_mem_new(cpu_env,
offsetof(CPUPPCState, fpscr), "fpscr");
@@ -3023,7 +3027,7 @@ static void gen_##name(DisasContext *ctx) 
   \
 }\
 tcg_gen_qemu_ld_tl(gpr, t0, ctx->mem_idx, memop);\
 tcg_gen_mov_tl(cpu_reserve, t0); \
-tcg_gen_st_tl(gpr, cpu_env, offsetof(CPUPPCState, reserve_val)); \
+tcg_gen_mov_tl(cpu_reserve_val, gpr);\
 tcg_temp_free(t0);   \
 }
 
@@ -3155,14 +3159,27 @@ static void gen_conditional_store(DisasContext *ctx, 
TCGv EA,
 static void gen_conditional_store(DisasContext *ctx, TCGv EA,
   int reg, int memop)
 {
-TCGLabel *l1;
+TCGLabel *l1 = gen_new_label();
+TCGLabel *l2 = gen_new_label();
+TCGv t0;
 
-tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
-l1 = gen_new_label();
 tcg_gen_brcond_tl(TCG_COND_NE, EA, cpu_reserve, l1);
-tcg_gen_ori_i32(cpu_crf[0], cpu_crf[0], CRF_EQ);
-tcg_gen_qemu_st_tl(cpu_gpr[reg], EA, ctx->mem_idx, memop);
+
+t0 = tcg_temp_new();
+tcg_gen_atomic_cmpxchg_tl(t0, cpu_reserve, cpu_reserve_val,
+  cpu_gpr[reg], ctx->mem_idx,
+  DEF_MEMOP(memop) | MO_ALIGN);
+tcg_gen_setcond_tl(TCG_COND_EQ, t0, t0, cpu_reserve_val);
+tcg_gen_shli_tl(t0, t0, CRF_EQ_BIT);
+tcg_gen_or_tl(t0, t0, cpu_so);
+tcg_gen_trunc_tl_i32(cpu_crf[0], t0);
+tcg_temp_free(t0);
+tcg_gen_br(l2);
+
 gen_set_label(l1);
+tcg_gen_trunc_tl_i32(cpu_crf[0], cpu_so);
+
+gen_set_label(l2);
 tcg_gen_movi_tl(cpu_reserve, -1);
 }
 #endif
-- 
2.9.3




[Qemu-devel] [PATCH v3 2/6] cputlb: handle first atomic write to the page

2017-04-26 Thread Nikunj A Dadhania
In the case where the conditional write is the first write to the page,
TLB_NOTDIRTY will be set and stop_the_world is triggered. Handle this as
a special case and set the dirty bit. After that, fall through to the
actual atomic instruction below.

Signed-off-by: Nikunj A Dadhania 
Reviewed-by: Richard Henderson 
---
 cputlb.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/cputlb.c b/cputlb.c
index f5d056c..743776a 100644
--- a/cputlb.c
+++ b/cputlb.c
@@ -930,7 +930,13 @@ static void *atomic_mmu_lookup(CPUArchState *env, 
target_ulong addr,
 tlb_addr = tlbe->addr_write;
 }
 
-/* Notice an IO access, or a notdirty page.  */
+/* Check notdirty */
+if (unlikely(tlb_addr & TLB_NOTDIRTY)) {
+tlb_set_dirty(ENV_GET_CPU(env), addr);
+tlb_addr = tlb_addr & ~TLB_NOTDIRTY;
+}
+
+/* Notice an IO access  */
 if (unlikely(tlb_addr & ~TARGET_PAGE_MASK)) {
 /* There's really nothing that can be done to
support this apart from stop-the-world.  */
-- 
2.9.3




[Qemu-devel] [PATCH v3 5/6] tcg: enable MTTCG by default for PPC64 on x86

2017-04-26 Thread Nikunj A Dadhania
This enables the multi-threaded system emulation by default for PPC64
guests using the x86_64 TCG back-end.

Signed-off-by: Nikunj A Dadhania 
Reviewed-by: Alex Bennée 
---
 configure| 2 ++
 target/ppc/cpu.h | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/configure b/configure
index c35acf1..3814617 100755
--- a/configure
+++ b/configure
@@ -6090,12 +6090,14 @@ case "$target_name" in
   ppc64)
 TARGET_BASE_ARCH=ppc
 TARGET_ABI_DIR=ppc
+mttcg=yes
 gdb_xml_files="power64-core.xml power-fpu.xml power-altivec.xml 
power-spe.xml power-vsx.xml"
   ;;
   ppc64le)
 TARGET_ARCH=ppc64
 TARGET_BASE_ARCH=ppc
 TARGET_ABI_DIR=ppc
+mttcg=yes
 gdb_xml_files="power64-core.xml power-fpu.xml power-altivec.xml 
power-spe.xml power-vsx.xml"
   ;;
   ppc64abi32)
diff --git a/target/ppc/cpu.h b/target/ppc/cpu.h
index e0ff041..ece535d 100644
--- a/target/ppc/cpu.h
+++ b/target/ppc/cpu.h
@@ -30,6 +30,8 @@
 #define TARGET_LONG_BITS 64
 #define TARGET_PAGE_BITS 12
 
+#define TCG_GUEST_DEFAULT_MO 0
+
 /* Note that the official physical address space bits is 62-M where M
is implementation dependent.  I've not looked up M for the set of
cpus we emulate at the system level.  */
-- 
2.9.3




[Qemu-devel] [PATCH v3 0/6] The series enables Multi-Threaded TCG on PPC64

2017-04-26 Thread Nikunj A Dadhania
Patch 01: Use atomic_cmpxchg in store conditional
  02: Handle first write to page during atomic operation
  03: Generate memory barriers for sync/isync and load/store conditional
  04: Fix CPU unplug in MTTCG
  05: Enable MTTCG by default on PPC64
  06: Fixes a bug in PPC where the reservation is reset 
  causing atomic operations to never succeed 

Patches are based on ppc-for-2.10

Changelog:
v2: 
* David found a problem related to clang and "make check" that was 
  root-caused to a tcg bug; the fix is in mainline:

  79b1af9 tcg: Initialize return value after exit_atomic

* Fixed a bug in ppc_cpu_exec_enter(), which was resetting the 
  reserve_addr; this should be done in powerpc_excp() instead

v1:
* Rewrote store_conditional as suggested by Richard

Bharata B Rao (1):
  cpus: Fix CPU unplug for MTTCG

Nikunj A Dadhania (5):
  target/ppc: Emulate LL/SC using cmpxchg helpers
  cputlb: handle first atomic write to the page
  target/ppc: Generate fence operations
  tcg: enable MTTCG by default for PPC64 on x86
  target/ppc: do not reset reserve_addr in exec_enter

 configure   |  2 ++
 cpus.c  |  6 ++
 cputlb.c|  8 +++-
 target/ppc/cpu.h|  2 ++
 target/ppc/excp_helper.c|  3 +++
 target/ppc/translate.c  | 37 +++--
 target/ppc/translate_init.c |  9 -
 7 files changed, 51 insertions(+), 16 deletions(-)

-- 
2.9.3




Re: [Qemu-devel] [PATCH v3 3/3] tests: Add a tester for HMP commands

2017-04-26 Thread Thomas Huth
On 25.04.2017 15:22, Eric Blake wrote:
> On 04/25/2017 12:16 AM, Thomas Huth wrote:
>> HMP commands do not get any automatic testing yet, so on certain
>> QEMU machines, some HMP commands were causing crashes in the past.
>> Thus we should test HMP commands in our test suite, too, to avoid
>> that such problems creep in again in the future.
>>
>> Signed-off-by: Thomas Huth 
>> ---
>>  v3:
>>  - Fixed the stupid "!strcmp() == 0" problem
> 
> That's why libvirt prefers the use of a STREQ() macro (it's easier to
> read STREQ(a,b) than it is to remember if !strcmp(a,b) has the right
> semantics).  But switching the code base to use such a nice wrapper
> would be a task for Coccinelle, if we wanted it.

I just saw that glib seems also to have a g_str_equal() function ... I
guess we could use that in QEMU instead. I'll try to remember that for
future code that I write ...

 Thomas




signature.asc
Description: OpenPGP digital signature


[Qemu-devel] [Bug 1081416] Re: Qemu 1.2.0 crashes when using tcp serial console and GRUB boots

2017-04-26 Thread Thomas Huth
Can you still reproduce this issue somehow with the latest version of
QEMU (currently v2.9.0)? Otherwise, I think we can close this ticket
nowadays...

-- 
You received this bug notification because you are a member of qemu-
devel-ml, which is subscribed to QEMU.
https://bugs.launchpad.net/bugs/1081416

Title:
  Qemu 1.2.0 crashes when using tcp serial console and GRUB boots

Status in QEMU:
  New

Bug description:
  When booting OpenWRT Attitude Adjustment ( 
http://downloads.openwrt.org/attitude_adjustment/12.09-beta2/x86/generic/openwrt-x86-generic-combined-ext4.img.gz
 ) with this command line:
  qemu-system-x86_64 -serial tcp:127.0.0.1: -hda 
openwrt-x86-generic-combined-ext4.img

  Qemu crashes as soon as GRUB starts, after network cards start.

  *** buffer overflow detected ***: /usr/bin/qemu-system-x86_64 terminated
  === Backtrace: =
  /usr/lib/libc.so.6(__fortify_fail+0x37)[0x745f2ad7]
  /usr/lib/libc.so.6(+0xf9bb0)[0x745f0bb0]
  /usr/lib/libc.so.6(+0xfba47)[0x745f2a47]
  /usr/bin/qemu-system-x86_64[0x46a628]
  /usr/bin/qemu-system-x86_64[0x4e8a14]
  /usr/bin/qemu-system-x86_64[0x4e802b]
  /usr/lib/libc.so.6(__libc_start_main+0xf5)[0x74518725]
  /usr/bin/qemu-system-x86_64[0x40d949]

  
  Here is a GDB backtrace:

  Program received signal SIGABRT, Aborted.
  0x7452bfa5 in raise () from /usr/lib/libc.so.6
  (gdb) bt
  #0  0x7452bfa5 in raise () from /usr/lib/libc.so.6
  #1  0x7452d428 in abort () from /usr/lib/libc.so.6
  #2  0x7456acfb in __libc_message () from /usr/lib/libc.so.6
  #3  0x745f2ad7 in __fortify_fail () from /usr/lib/libc.so.6
  #4  0x745f0bb0 in __chk_fail () from /usr/lib/libc.so.6
  #5  0x745f2a47 in __fdelt_warn () from /usr/lib/libc.so.6
  #6  0x0046a628 in qemu_iohandler_poll (readfds=0xdb7da0 <rfds>, 
  writefds=0xdb7e20 <wfds>, xfds=0x6, xfds@entry=0xdb7ea0 <xfds>, ret=-1, 
  ret@entry=1) at iohandler.c:121
  #7  0x004e8a14 in main_loop_wait (nonblocking=<optimized out>)
  at main-loop.c:497
  #8  0x004e802b in main_loop ()
  at /usr/src/aur/qemu/src/qemu-1.2.0/vl.c:1643
  #9  main (argc=<optimized out>, argv=<optimized out>, envp=<optimized out>)
  at /usr/src/aur/qemu/src/qemu-1.2.0/vl.c:3755
  (gdb) 

  Here is a more useless dump...

To manage notifications about this bug go to:
https://bugs.launchpad.net/qemu/+bug/1081416/+subscriptions



[Qemu-devel] [PATCH] COLO-compare: Improve tcp compare trace event readability

2017-04-26 Thread Zhang Chen
The trace arguments in the previous patch exceeded the limit of the
UST backend, so this patch reworks the trace events to stay within it.

Signed-off-by: Zhang Chen 
---
 net/colo-compare.c | 33 ++---
 net/trace-events   |  3 +--
 2 files changed, 23 insertions(+), 13 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 4ab80b1..2639c7f 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -265,17 +265,28 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
 }
 
 if (res != 0 && trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
-trace_colo_compare_pkt_info_src(inet_ntoa(ppkt->ip->ip_src),
-ntohl(stcp->th_seq),
-ntohl(stcp->th_ack),
-res, stcp->th_flags,
-spkt->size);
-
-trace_colo_compare_pkt_info_dst(inet_ntoa(ppkt->ip->ip_dst),
-ntohl(ptcp->th_seq),
-ntohl(ptcp->th_ack),
-res, ptcp->th_flags,
-ppkt->size);
+char pri_ip_src[20], pri_ip_dst[20], sec_ip_src[20], sec_ip_dst[20];
+
+strcpy(pri_ip_src, inet_ntoa(ppkt->ip->ip_src));
+strcpy(pri_ip_dst, inet_ntoa(ppkt->ip->ip_dst));
+strcpy(sec_ip_src, inet_ntoa(spkt->ip->ip_src));
+strcpy(sec_ip_dst, inet_ntoa(spkt->ip->ip_dst));
+
+trace_colo_compare_ip_info(ppkt->size, pri_ip_src,
+   pri_ip_dst, spkt->size,
+   sec_ip_src, sec_ip_dst);
+
+trace_colo_compare_tcp_info("pri tcp packet",
+ntohl(ptcp->th_seq),
+ntohl(ptcp->th_ack),
+res, ptcp->th_flags,
+ppkt->size);
+
+trace_colo_compare_tcp_info("sec tcp packet",
+ntohl(stcp->th_seq),
+ntohl(stcp->th_ack),
+res, stcp->th_flags,
+spkt->size);
 
 qemu_hexdump((char *)ppkt->data, stderr,
  "colo-compare ppkt", ppkt->size);
diff --git a/net/trace-events b/net/trace-events
index 35198bc..247e5c0 100644
--- a/net/trace-events
+++ b/net/trace-events
@@ -13,8 +13,7 @@ colo_compare_icmp_miscompare(const char *sta, int size) ": %s 
= %d"
 colo_compare_ip_info(int psize, const char *sta, const char *stb, int ssize, 
const char *stc, const char *std) "ppkt size = %d, ip_src = %s, ip_dst = %s, 
spkt size = %d, ip_src = %s, ip_dst = %s"
 colo_old_packet_check_found(int64_t old_time) "%" PRId64
 colo_compare_miscompare(void) ""
-colo_compare_pkt_info_src(const char *src, uint32_t sseq, uint32_t sack, int 
res, uint32_t sflag, int ssize) "src/dst: %s s: seq/ack=%u/%u res=%d flags=%x 
spkt_size: %d\n"
-colo_compare_pkt_info_dst(const char *dst, uint32_t dseq, uint32_t dack, int 
res, uint32_t dflag, int dsize) "src/dst: %s d: seq/ack=%u/%u res=%d flags=%x 
dpkt_size: %d\n"
+colo_compare_tcp_info(const char *pkt, uint32_t seq, uint32_t ack, int res, 
uint32_t flag, int size) "side: %s seq/ack= %u/%u res= %d flags= %x pkt_size: 
%d\n"
 
 # net/filter-rewriter.c
 colo_filter_rewriter_debug(void) ""
-- 
2.7.4






Re: [Qemu-devel] [PATCH v3] qemu-img: use blk_co_pwrite_zeroes for zero sectors when compressed

2017-04-26 Thread no-reply
Hi,

This series failed automatic build test. Please find the testing commands and
their output below. If you have docker installed, you can probably reproduce it
locally.

Message-id: 1493261907-18734-1-git-send-email-lidongc...@tencent.com
Subject: [Qemu-devel] [PATCH v3] qemu-img: use blk_co_pwrite_zeroes for zero 
sectors when compressed
Type: series

=== TEST SCRIPT BEGIN ===
#!/bin/bash
set -e
git submodule update --init dtc
# Let docker tests dump environment info
export SHOW_ENV=1
export J=8
make docker-test-quick@centos6
make docker-test-mingw@fedora
make docker-test-build@min-glib
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
07c1956 qemu-img: use blk_co_pwrite_zeroes for zero sectors when compressed

=== OUTPUT BEGIN ===
Submodule 'dtc' (git://git.qemu-project.org/dtc.git) registered for path 'dtc'
Cloning into '/var/tmp/patchew-tester-tmp-xk8dup4f/src/dtc'...
Submodule path 'dtc': checked out '558cd81bdd432769b59bff01240c44f82cfb1a9d'
  BUILD   centos6
make[1]: Entering directory '/var/tmp/patchew-tester-tmp-xk8dup4f/src'
  ARCHIVE qemu.tgz
  ARCHIVE dtc.tgz
  COPYRUNNER
RUN test-quick in qemu:centos6 
Packages installed:
SDL-devel-1.2.14-7.el6_7.1.x86_64
ccache-3.1.6-2.el6.x86_64
epel-release-6-8.noarch
gcc-4.4.7-17.el6.x86_64
git-1.7.1-4.el6_7.1.x86_64
glib2-devel-2.28.8-5.el6.x86_64
libfdt-devel-1.4.0-1.el6.x86_64
make-3.81-23.el6.x86_64
package g++ is not installed
pixman-devel-0.32.8-1.el6.x86_64
tar-1.23-15.el6_8.x86_64
zlib-devel-1.2.3-29.el6.x86_64

Environment variables:
PACKAGES=libfdt-devel ccache tar git make gcc g++ zlib-devel 
glib2-devel SDL-devel pixman-devel epel-release
HOSTNAME=528984ae5192
TERM=xterm
MAKEFLAGS= -j8
HISTSIZE=1000
J=8
USER=root
CCACHE_DIR=/var/tmp/ccache
EXTRA_CONFIGURE_OPTS=
V=
SHOW_ENV=1
MAIL=/var/spool/mail/root
PATH=/usr/lib/ccache:/usr/lib64/ccache:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
PWD=/
LANG=en_US.UTF-8
TARGET_LIST=
HISTCONTROL=ignoredups
SHLVL=1
HOME=/root
TEST_DIR=/tmp/qemu-test
LOGNAME=root
LESSOPEN=||/usr/bin/lesspipe.sh %s
FEATURES= dtc
DEBUG=
G_BROKEN_FILENAMES=1
CCACHE_HASHDIR=
_=/usr/bin/env

Configure options:
--enable-werror --target-list=x86_64-softmmu,aarch64-softmmu 
--prefix=/var/tmp/qemu-build/install
grep: scripts/tracetool/backend/*.py: No such file or directory
No C++ compiler available; disabling C++ specific optional code
Install prefix/var/tmp/qemu-build/install
BIOS directory/var/tmp/qemu-build/install/share/qemu
binary directory  /var/tmp/qemu-build/install/bin
library directory /var/tmp/qemu-build/install/lib
module directory  /var/tmp/qemu-build/install/lib/qemu
libexec directory /var/tmp/qemu-build/install/libexec
include directory /var/tmp/qemu-build/install/include
config directory  /var/tmp/qemu-build/install/etc
local state directory   /var/tmp/qemu-build/install/var
Manual directory  /var/tmp/qemu-build/install/share/man
ELF interp prefix /usr/gnemul/qemu-%M
Source path   /tmp/qemu-test/src
C compilercc
Host C compiler   cc
C++ compiler  
Objective-C compiler cc
ARFLAGS   rv
CFLAGS-O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -g 
QEMU_CFLAGS   -I/usr/include/pixman-1   -I$(SRC_PATH)/dtc/libfdt -pthread 
-I/usr/include/glib-2.0 -I/usr/lib64/glib-2.0/include   -fPIE -DPIE -m64 -mcx16 
-D_GNU_SOURCE -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -Wstrict-prototypes 
-Wredundant-decls -Wall -Wundef -Wwrite-strings -Wmissing-prototypes 
-fno-strict-aliasing -fno-common -fwrapv  -Wendif-labels 
-Wno-missing-include-dirs -Wempty-body -Wnested-externs -Wformat-security 
-Wformat-y2k -Winit-self -Wignored-qualifiers -Wold-style-declaration 
-Wold-style-definition -Wtype-limits -fstack-protector-all
LDFLAGS   -Wl,--warn-common -Wl,-z,relro -Wl,-z,now -pie -m64 -g 
make  make
install   install
pythonpython -B
smbd  /usr/sbin/smbd
module supportno
host CPU  x86_64
host big endian   no
target list   x86_64-softmmu aarch64-softmmu
tcg debug enabled no
gprof enabled no
sparse enabledno
strip binariesyes
profiler  no
static build  no
pixmansystem
SDL support   yes (1.2.14)
GTK support   no 
GTK GL supportno
VTE support   no 
TLS priority  NORMAL
GNUTLS supportno
GNUTLS rndno
libgcrypt no
libgcrypt kdf no
nettleno 
nettle kdfno
libtasn1  no
curses supportno
virgl support no
curl support  no
mingw32 support   no
Audio drivers oss
Block whitelist (rw) 
Block whitelist (ro) 
VirtFS supportno
VNC support   yes
VNC SASL support  no
VNC JPEG support  no
VNC PNG support   no
xen support   no
brlapi supportno
bluez  supportno
Documentation no
PIE   yes
vde support   no
netmap supportno
Linux AIO support no
ATTR/XATTR support yes
Install blobs yes
KVM support   

[Qemu-devel] [PATCH v4 11/11] tb-hash: improve tb_jmp_cache hash function in user mode

2017-04-26 Thread Emilio G. Cota
Optimizations to cross-page chaining and indirect branches make
performance more sensitive to the hit rate of tb_jmp_cache.
The constraint of reserving some bits for the page number
lowers the achievable quality of the hashing function.

However, user-mode does not have this requirement. Thus,
with this change we use for user-mode a hashing function that
is both faster and of better quality than the previous one.

Measurements:

Note: baseline (i.e. speedup == 1x) is QEMU v2.9.0.

-   SPECint06 (test set), x86_64-linux-user. Host: 
Intel i7-6700K @ 4.00GHz

 2.2x 
+-+--+-+
  | 
 |
  | jr  
 |
   2x +jr+multhash
++...+-+
  |jr+hash  
|$$$ |
  | 
|$+$ |
  |
### $ |
 1.8x 
+-+..#|#.$...+-+
  |  
++#+# $ |
  |   
|# # $ |
 1.6x 
+-+***.#.$++$$$..+-+
  | $$$  
*+* # $ |$+$|
  |   ++$$$   ### $  * 
* # $  +++|$ $|
  | ++###+$   # # $  * 
* # $   ###   ## $|
 1.4x 
+-+...***+#.$.***.#.$..*.*.#.$...#+#$$.*++*|#.$..+-+
  | *+* # $ * * # $  * 
* # $   # # $ *  *+# $|
  | * * # $   + * * # $  * 
* # $ *** # $ *  * # $   ###$$|
 1.2x 
+-+...*.*.#.$.***##$$.*.*.#.$..*.*.#.$.*.*.#.$.*..*.#.$.***+#+$..+-+
  | * * # $ *+* # $ * * # $   +++* 
* # $ ++###$$ * * # $ *  * # $ * * # $|
  |***##$$  * * # $ * * # $ * * # $ ***##$$  ++###   * 
* # $ *** #+$ * * # $ *  * # $ * * # $|
  |*+*+#+$ ***##$$$ * * # $ * * # $ * * # $ *+* # $ ++$$ ***+#   * 
* # $ * * # $ * * # $ *  * # $ * * # $|
   1x 
+-++-*+*+#+$+*+*+#-+$+*+*-#+$+*+*+#+$+*+*+#+$+*-*+#+$+***++#+$+*+*+#$$+*+*+#+$+*+*+#+$+*+*-#+$+*+-*+#+$+*+*+#+$-++-+
  |* * # $ * * #  $ * * # $ * * # $ * * # $ * * # $ * *  # $ * * # $ * 
* # $ * * # $ * * # $ *  * # $ * * # $|
  |* * # $ * * #  $ * * # $ * * # $ * * # $ * * # $ * *  # $ * * # $ * 
* # $ * * # $ * * # $ *  * # $ * * # $|
 0.8x 
+-+--***##$$-***##$$$-***##$$-***##$$-***##$$-***##$$-***###$$-***##$$-***##$$-***##$$-***##$$-##$$-***##$$--+-+
 astar   bzip2  gcc   gobmk h264ref   hmmlibquantum  mcf 
omnetpperlbench   sjengxalancbmk   hmean
  png: http://imgur.com/4UXTrEc

Here I also tried the hash function suggested by Paolo ("multhash"):

  return ((uint64_t) (pc * 2654435761) >> 32) & (TB_JMP_CACHE_SIZE - 1);

As you can see it is just as good as the other new function ("hash"),
which is what I ended up going with.

-  SPECint06 (train set), x86_64-linux-user. Host: 
Intel i7-6700K @ 4.00GHz

 2.6x 
+-+--+-+
  | 
 |
  | jr  
 ### |
 2.4x 
+jr+hash...#.#...+-+
  | 
 # # |
  | 
 # # |
 2.2x 

[Qemu-devel] [PATCH v4 07/11] target/arm: optimize indirect branches

2017-04-26 Thread Emilio G. Cota
Speed up indirect branches by jumping to the target if it is valid.

Softmmu measurements (see later commit for user-mode results):

Note: baseline (i.e. speedup == 1x) is QEMU v2.9.0.

- Impact on Boot time

| setup  | ARM debian jessie boot+shutdown time | stddev |
|+--+|
| v2.9.0 | 8.84 |   0.07 |
| +cross | 8.85 |   0.03 |
| +jr| 8.83 |   0.06 |

-NBench, arm-softmmu (debian jessie guest). Host: 
Intel i7-4790K @ 4.00GHz

  1.3x 
+-+-+-+
   |
 |
   |   cross    
 |
 1.25x 
+cross+jr..#++#.+-+
   |  #  #  
 |
   | +++#  #  #  #  
 |
   |  +++  #  #  #  
 |
  1.2x 
+-+...*..*..#..#..#.+-+
   |    #*  *  #  #  #  
     |
   |  *  *  #*  *  #  #  #  
   #  #  |
 1.15x 
+-+*..*..#*..*..#..#..#.#..#+-+
   |  *  *  #*  *  #  #  #  
   #  #  |
   |  *  *  #    *  *  #  #  #  
   #  #  |
   |  *  *  #  #  #  *  *  #  #  #  
   #  #  |
  1.1x 
+-+*..*..#..#..#..*..*..#..#..#.#..#.#..#...+-+
   |  *  *  #  #  #  *  *  #  #  #  
   #  # #  # |
   |  *  *  #  #  #  *  *  #  #  #  
   #  # #  # |
 1.05x 
+-+....*..*..#..#..#..*..*..#..#..#.#..#..+++*..#...+-+
   |*  #  *  *  #  #  #  *  *  #  *  #  
   #  #   +++ |###  *   *  # |
   |*+++*  #  *  *  #  #  #  *  *  #  *+++*  #  
  #  *###  *  *  #  *   *  # |
   | *###  +++  *   *  #  *  *  #  *  #  *  *  #  *   *  #  
*  *  #  * | *++#  *  *  #  *   *  # |
1x 
+-++-+*+++*-+#++++#++*+-+*++#+-*++*++#-+*+++*-+#++*++*++#++*+-+*++#+-*++*++#-+*+++*-+#++*++*++#++*+-+*++#+-++-+
   | *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  
*  *  #  *   *  #  *  *  #  *   *  # |
   | *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  *  *  #  *   *  #  
*  *  #  *   *  #  *  *  #  *   *  # |
 0.95x 
+-+---*###--###--*###--###--*###--###--*###--###--*###--###--*###---+-+
   ASSIGNMENT BITFIELD   FOURFP EMULATION   HUFFMAN   LU DECOMPOSITIONEURAL 
NNUMERIC SOSTRING SORT hmean
  png: http://imgur.com/eOLmZNR

NB. 'cross' represents the previous commit.

Signed-off-by: Emilio G. Cota 
---
 target/arm/translate.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index 02cad96..d46a576 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -65,6 +65,7 @@ static TCGv_i32 cpu_R[16];
 TCGv_i32 cpu_CF, cpu_NF, cpu_VF, cpu_ZF;
 TCGv_i64 cpu_exclusive_addr;
 TCGv_i64 cpu_exclusive_val;
+static bool gen_jr;
 
 /* FIXME:  These should be removed.  */
 static TCGv_i32 cpu_F0s, cpu_F1s;
@@ -221,6 +222,7 @@ static void store_reg(DisasContext *s, int reg, TCGv_i32 
var)
  */
 tcg_gen_andi_i32(var, var, s->thumb ? ~1 : ~3);
 s->is_jmp = DISAS_JUMP;
+gen_jr = true;
 }
 tcg_gen_mov_i32(cpu_R[reg], var);
 tcg_temp_free_i32(var);
@@ -893,6 +895,7 @@ static inline void gen_bx_im(DisasContext *s, uint32_t addr)
 tcg_temp_free_i32(tmp);
 }
 tcg_gen_movi_i32(cpu_R[15], addr & ~1);
+gen_jr = true;
 }
 
 /* Set PC and Thumb state from var.  var is marked as dead.  */
@@ -902,6 +905,7 @@ static 

Re: [Qemu-devel] [PATCH v4 00/11] TCG optimizations for 2.10

2017-04-26 Thread Emilio G. Cota
Just to avoid confusion,

On Wed, Apr 26, 2017 at 23:29:13 -0400, Emilio G. Cota wrote:
> I have *not* re-run all experiments, because it takes several hours and
> performance hasn't changed much from v3, as can be seen in these two charts:
> * spec06int user-mode, test input, v2.9.0 baseline: http://imgur.com/ME2eMq1

Here v4 doesn't have the htable lookup. (v4+htable has it)

> * spec06int softmmu, test input, v3 baseline: http://imgur.com/Clolu9Z

Here v4 does have it.

E.



[Qemu-devel] [PATCH v4 03/11] tcg: introduce goto_ptr opcode

2017-04-26 Thread Emilio G. Cota
Signed-off-by: Emilio G. Cota 
---
 tcg/aarch64/tcg-target.h | 1 +
 tcg/arm/tcg-target.h | 1 +
 tcg/i386/tcg-target.h| 1 +
 tcg/ia64/tcg-target.h| 1 +
 tcg/mips/tcg-target.h| 1 +
 tcg/ppc/tcg-target.h | 1 +
 tcg/s390/tcg-target.h| 1 +
 tcg/sparc/tcg-target.h   | 1 +
 tcg/tcg-opc.h| 1 +
 tcg/tci/tcg-target.h | 1 +
 10 files changed, 10 insertions(+)

diff --git a/tcg/aarch64/tcg-target.h b/tcg/aarch64/tcg-target.h
index 1a5ea23..b82eac4 100644
--- a/tcg/aarch64/tcg-target.h
+++ b/tcg/aarch64/tcg-target.h
@@ -77,6 +77,7 @@ typedef enum {
 #define TCG_TARGET_HAS_mulsh_i320
 #define TCG_TARGET_HAS_extrl_i64_i320
 #define TCG_TARGET_HAS_extrh_i64_i320
+#define TCG_TARGET_HAS_goto_ptr 0
 
 #define TCG_TARGET_HAS_div_i64  1
 #define TCG_TARGET_HAS_rem_i64  1
diff --git a/tcg/arm/tcg-target.h b/tcg/arm/tcg-target.h
index 09a19c6..2f3ecfd 100644
--- a/tcg/arm/tcg-target.h
+++ b/tcg/arm/tcg-target.h
@@ -123,6 +123,7 @@ extern bool use_idiv_instructions;
 #define TCG_TARGET_HAS_mulsh_i320
 #define TCG_TARGET_HAS_div_i32  use_idiv_instructions
 #define TCG_TARGET_HAS_rem_i32  0
+#define TCG_TARGET_HAS_goto_ptr 0
 
 enum {
 TCG_AREG0 = TCG_REG_R6,
diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 4275787..59d9835 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -107,6 +107,7 @@ extern bool have_popcnt;
 #define TCG_TARGET_HAS_muls2_i321
 #define TCG_TARGET_HAS_muluh_i320
 #define TCG_TARGET_HAS_mulsh_i320
+#define TCG_TARGET_HAS_goto_ptr 0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_extrl_i64_i320
diff --git a/tcg/ia64/tcg-target.h b/tcg/ia64/tcg-target.h
index 42aea03..901bb75 100644
--- a/tcg/ia64/tcg-target.h
+++ b/tcg/ia64/tcg-target.h
@@ -173,6 +173,7 @@ typedef enum {
 #define TCG_TARGET_HAS_mulsh_i640
 #define TCG_TARGET_HAS_extrl_i64_i320
 #define TCG_TARGET_HAS_extrh_i64_i320
+#define TCG_TARGET_HAS_goto_ptr 0
 
 #define TCG_TARGET_deposit_i32_valid(ofs, len) ((len) <= 16)
 #define TCG_TARGET_deposit_i64_valid(ofs, len) ((len) <= 16)
diff --git a/tcg/mips/tcg-target.h b/tcg/mips/tcg-target.h
index f46d64a..e3240cf 100644
--- a/tcg/mips/tcg-target.h
+++ b/tcg/mips/tcg-target.h
@@ -130,6 +130,7 @@ extern bool use_mips32r2_instructions;
 #define TCG_TARGET_HAS_muluh_i321
 #define TCG_TARGET_HAS_mulsh_i321
 #define TCG_TARGET_HAS_bswap32_i32  1
+#define TCG_TARGET_HAS_goto_ptr 0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_add2_i32 0
diff --git a/tcg/ppc/tcg-target.h b/tcg/ppc/tcg-target.h
index abd8b3d..a9aa974 100644
--- a/tcg/ppc/tcg-target.h
+++ b/tcg/ppc/tcg-target.h
@@ -82,6 +82,7 @@ extern bool have_isa_3_00;
 #define TCG_TARGET_HAS_muls2_i320
 #define TCG_TARGET_HAS_muluh_i321
 #define TCG_TARGET_HAS_mulsh_i321
+#define TCG_TARGET_HAS_goto_ptr 0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_add2_i32 0
diff --git a/tcg/s390/tcg-target.h b/tcg/s390/tcg-target.h
index cbdd2a6..6b7bcfb 100644
--- a/tcg/s390/tcg-target.h
+++ b/tcg/s390/tcg-target.h
@@ -92,6 +92,7 @@ extern uint64_t s390_facilities;
 #define TCG_TARGET_HAS_mulsh_i32  0
 #define TCG_TARGET_HAS_extrl_i64_i32  0
 #define TCG_TARGET_HAS_extrh_i64_i32  0
+#define TCG_TARGET_HAS_goto_ptr   0
 
 #define TCG_TARGET_HAS_div2_i64   1
 #define TCG_TARGET_HAS_rot_i641
diff --git a/tcg/sparc/tcg-target.h b/tcg/sparc/tcg-target.h
index b8b74f9..9348ddd 100644
--- a/tcg/sparc/tcg-target.h
+++ b/tcg/sparc/tcg-target.h
@@ -123,6 +123,7 @@ extern bool use_vis3_instructions;
 #define TCG_TARGET_HAS_muls2_i321
 #define TCG_TARGET_HAS_muluh_i320
 #define TCG_TARGET_HAS_mulsh_i320
+#define TCG_TARGET_HAS_goto_ptr 0
 
 #define TCG_TARGET_HAS_extrl_i64_i321
 #define TCG_TARGET_HAS_extrh_i64_i321
diff --git a/tcg/tcg-opc.h b/tcg/tcg-opc.h
index f06f894..956fb1e 100644
--- a/tcg/tcg-opc.h
+++ b/tcg/tcg-opc.h
@@ -193,6 +193,7 @@ DEF(insn_start, 0, 0, TLADDR_ARGS * TARGET_INSN_START_WORDS,
 TCG_OPF_NOT_PRESENT)
 DEF(exit_tb, 0, 0, 1, TCG_OPF_BB_END)
 DEF(goto_tb, 0, 0, 1, TCG_OPF_BB_END)
+DEF(goto_ptr, 0, 1, 0, TCG_OPF_BB_END | IMPL(TCG_TARGET_HAS_goto_ptr))
 
 DEF(qemu_ld_i32, 1, TLADDR_ARGS, 1,
 TCG_OPF_CALL_CLOBBER | TCG_OPF_SIDE_EFFECTS)
diff --git a/tcg/tci/tcg-target.h b/tcg/tci/tcg-target.h
index 838bf3a..0696328 100644
--- a/tcg/tci/tcg-target.h
+++ b/tcg/tci/tcg-target.h
@@ -85,6 +85,7 @@
 #define TCG_TARGET_HAS_muls2_i320
 #define TCG_TARGET_HAS_muluh_i320
 #define TCG_TARGET_HAS_mulsh_i320
+#define TCG_TARGET_HAS_goto_ptr 0
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_extrl_i64_i320
-- 
2.7.4




[Qemu-devel] [PATCH v4 10/11] target/i386: optimize indirect branches

2017-04-26 Thread Emilio G. Cota
Speed up indirect branches by jumping to the target if it is valid.

Softmmu measurements (see later commit for user-mode numbers):

Note: baseline (i.e. speedup == 1x) is QEMU v2.9.0.

-  SPECint06 (test set), x86_64-softmmu (Ubuntu 16.04 guest). 
Host: Intel i7-4790K @ 4.00GHz

 2.4x 
+-+--+-+
  | 
 |
  |   cross 
 |
 2.2x 
+cross+jr..+++...+-+
  | 
  |  |
  | 
  +++ |  |
   2x 
+-+..|..|+-+
  | 
   |  |  |
  | 
   |  |  |
 1.8x 
+-+..|...+-+
  | 
   |# |# |
  | 
  |# |
 1.6x 
+-+*.|*.|#...+-+
  | 
 * |* |# |
  | 
 * |* |# |
 1.4x 
+-+...+++..*.|*.|#...+-+
  |  ++ 
 * |*++# +++ |
  |+++|  |  
#++# *++*  #  +++ |  |
 1.2x 
+-+..###.+++|..|.....#.*..*..#...|.###...+-+
  |+++   #    #  ***###  
*++*  # *  *  ##++#  |#  +++#++#|
  |### +++  *++* #  *++*  #  ++#  #  *|* |# +++  *  
*  # *  *  #  ***  #  *| *|#    #|
   1x 
+-++-*++*++#++***###++*++*+#++*+-*++#+++#++***++#+-*+*++#-+##++*++*-+#+*++*-+#++*+*++#++*-+*+#++*++*++#-++-+
  |*  *  #  * *  #  *  * #  *  *  # *  *  #  * *  #  *|* |#  *++* #  *  
*  # *  *  #  * *  #  *  * #  *  *  #|
  |*  *  #  * *  #  *  * #  *  *  # *  *  #  * *  #  *+*++#  *  * #  *  
*  # *  *  #  * *  #  *  * #  *  *  #|
 0.8x 
+-+--###--***###--##--###-###--***###--***###--##--###-###--***###--##--###--+-+
 astar   bzip2  gcc   gobmk h264ref   hmmlibquantum  mcf 
omnetpperlbench   sjengxalancbmk   hmean
  png: http://imgur.com/DU36YFU

NB. 'cross' represents the previous commit.

Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 target/i386/translate.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/target/i386/translate.c b/target/i386/translate.c
index ea113fe..674ec96 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -4996,7 +4996,7 @@ static target_ulong disas_insn(CPUX86State *env, 
DisasContext *s,
 gen_push_v(s, cpu_T1);
 gen_op_jmp_v(cpu_T0);
 gen_bnd_jmp(s);
-gen_eob(s);
+gen_jr(s, cpu_T0);
 break;
 case 3: /* lcall Ev */
 gen_op_ld_v(s, ot, cpu_T1, cpu_A0);
@@ -5014,7 +5014,8 @@ static target_ulong disas_insn(CPUX86State *env, 
DisasContext *s,
   tcg_const_i32(dflag - 1),
   tcg_const_i32(s->pc - s->cs_base));
 }
-gen_eob(s);
+tcg_gen_ld_tl(cpu_tmp4, cpu_env, offsetof(CPUX86State, eip));
+gen_jr(s, cpu_tmp4);
 break;
 case 4: /* jmp Ev */
 if (dflag == MO_16) {
@@ -5022,7 +5023,7 @@ static target_ulong disas_insn(CPUX86State *env, 
DisasContext *s,
 }
 gen_op_jmp_v(cpu_T0);
 gen_bnd_jmp(s);
-gen_eob(s);
+gen_jr(s, cpu_T0);
 break;
 

[Qemu-devel] [PATCH v4 08/11] target/i386: introduce gen_jr helper to generate lookup_and_goto_ptr

2017-04-26 Thread Emilio G. Cota
This helper will be used by subsequent changes.

Signed-off-by: Emilio G. Cota 
---
 target/i386/translate.c | 25 -
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/target/i386/translate.c b/target/i386/translate.c
index 1d1372f..f0e48dc 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -141,6 +141,7 @@ typedef struct DisasContext {
 } DisasContext;
 
 static void gen_eob(DisasContext *s);
+static void gen_jr(DisasContext *s, TCGv dest);
 static void gen_jmp(DisasContext *s, target_ulong eip);
 static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num);
 static void gen_op(DisasContext *s1, int op, TCGMemOp ot, int d);
@@ -2509,7 +2510,8 @@ static void gen_bnd_jmp(DisasContext *s)
If INHIBIT, set HF_INHIBIT_IRQ_MASK if it isn't already set.
If RECHECK_TF, emit a rechecking helper for #DB, ignoring the state of
S->TF.  This is used by the syscall/sysret insns.  */
-static void gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf)
+static void
+do_gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf, TCGv jr)
 {
 gen_update_cc_op(s);
 
@@ -2530,12 +2532,27 @@ static void gen_eob_worker(DisasContext *s, bool 
inhibit, bool recheck_tf)
 tcg_gen_exit_tb(0);
 } else if (s->tf) {
 gen_helper_single_step(cpu_env);
+} else if (!TCGV_IS_UNUSED(jr)) {
+TCGv vaddr = tcg_temp_new();
+
+tcg_gen_add_tl(vaddr, jr, cpu_seg_base[R_CS]);
+tcg_gen_lookup_and_goto_ptr(vaddr);
+tcg_temp_free(vaddr);
 } else {
 tcg_gen_exit_tb(0);
 }
 s->is_jmp = DISAS_TB_JUMP;
 }
 
+static inline void
+gen_eob_worker(DisasContext *s, bool inhibit, bool recheck_tf)
+{
+TCGv unused;
+
+TCGV_UNUSED(unused);
+do_gen_eob_worker(s, inhibit, recheck_tf, unused);
+}
+
 /* End of block.
If INHIBIT, set HF_INHIBIT_IRQ_MASK if it isn't already set.  */
 static void gen_eob_inhibit_irq(DisasContext *s, bool inhibit)
@@ -2549,6 +2566,12 @@ static void gen_eob(DisasContext *s)
 gen_eob_worker(s, false, false);
 }
 
+/* Jump to register */
+static void gen_jr(DisasContext *s, TCGv dest)
+{
+do_gen_eob_worker(s, false, false, dest);
+}
+
 /* generate a jump to eip. No segment change must happen before as a
direct call to the next block may occur */
 static void gen_jmp_tb(DisasContext *s, target_ulong eip, int tb_num)
-- 
2.7.4




[Qemu-devel] [PATCH v4 05/11] tcg/i386: implement goto_ptr op

2017-04-26 Thread Emilio G. Cota
Suggested-by: Richard Henderson 
Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 tcg/i386/tcg-target.h |  2 +-
 tcg/i386/tcg-target.inc.c | 15 +++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/tcg/i386/tcg-target.h b/tcg/i386/tcg-target.h
index 59d9835..73a15f7 100644
--- a/tcg/i386/tcg-target.h
+++ b/tcg/i386/tcg-target.h
@@ -107,7 +107,7 @@ extern bool have_popcnt;
 #define TCG_TARGET_HAS_muls2_i321
 #define TCG_TARGET_HAS_muluh_i320
 #define TCG_TARGET_HAS_mulsh_i320
-#define TCG_TARGET_HAS_goto_ptr 0
+#define TCG_TARGET_HAS_goto_ptr 1
 
 #if TCG_TARGET_REG_BITS == 64
 #define TCG_TARGET_HAS_extrl_i64_i320
diff --git a/tcg/i386/tcg-target.inc.c b/tcg/i386/tcg-target.inc.c
index 5918008..d0bf53a 100644
--- a/tcg/i386/tcg-target.inc.c
+++ b/tcg/i386/tcg-target.inc.c
@@ -1906,6 +1906,10 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 }
 s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
 break;
+case INDEX_op_goto_ptr:
+/* jmp to the given host address (could be epilogue) */
+tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
+break;
 case INDEX_op_br:
 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
 break;
@@ -2277,6 +2281,7 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode 
opc,
 
 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
 {
+static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
@@ -2299,6 +2304,9 @@ static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode 
op)
 = { .args_ct_str = { "L", "L", "L", "L" } };
 
 switch (op) {
+case INDEX_op_goto_ptr:
+return &r;
+
 case INDEX_op_ld8u_i32:
 case INDEX_op_ld8u_i64:
 case INDEX_op_ld8s_i32:
@@ -2567,6 +2575,13 @@ static void tcg_target_qemu_prologue(TCGContext *s)
 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
 #endif
 
+/*
+ * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
+ * and fall through to the rest of the epilogue.
+ */
+s->code_gen_epilogue = s->code_ptr;
+tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
+
 /* TB epilogue */
 tb_ret_addr = s->code_ptr;
 
-- 
2.7.4




[Qemu-devel] [PATCH v4 02/11] tcg-runtime: add lookup_tb_ptr helper

2017-04-26 Thread Emilio G. Cota
This paves the way for upcoming work.

Reviewed-by: Richard Henderson 
Reviewed-by: Alex Bennée 
Signed-off-by: Emilio G. Cota 
---
 tcg-runtime.c | 24 
 tcg/tcg-runtime.h |  2 ++
 tcg/tcg.h |  1 +
 3 files changed, 27 insertions(+)

diff --git a/tcg-runtime.c b/tcg-runtime.c
index 4c60c96..8a24bdd 100644
--- a/tcg-runtime.c
+++ b/tcg-runtime.c
@@ -27,6 +27,7 @@
 #include "exec/helper-proto.h"
 #include "exec/cpu_ldst.h"
 #include "exec/exec-all.h"
+#include "exec/tb-hash.h"
 
 /* 32-bit helpers */
 
@@ -141,6 +142,29 @@ uint64_t HELPER(ctpop_i64)(uint64_t arg)
 return ctpop64(arg);
 }
 
+void *HELPER(lookup_tb_ptr)(CPUArchState *env, target_ulong addr)
+{
+CPUState *cpu = ENV_GET_CPU(env);
+TranslationBlock *tb;
+target_ulong cs_base, pc;
+uint32_t flags;
+
+tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
+if (likely(tb)) {
+cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+if (likely(tb->pc == addr && tb->cs_base == cs_base &&
+   tb->flags == flags)) {
+return tb->tc_ptr;
+}
+tb = tb_htable_lookup(cpu, pc, cs_base, flags);
+if (likely(tb)) {
+atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)], tb);
+return tb->tc_ptr;
+}
+}
+return tcg_ctx.code_gen_epilogue;
+}
+
 void HELPER(exit_atomic)(CPUArchState *env)
 {
 cpu_loop_exit_atomic(ENV_GET_CPU(env), GETPC());
diff --git a/tcg/tcg-runtime.h b/tcg/tcg-runtime.h
index 114ea6f..c41d38a 100644
--- a/tcg/tcg-runtime.h
+++ b/tcg/tcg-runtime.h
@@ -24,6 +24,8 @@ DEF_HELPER_FLAGS_1(clrsb_i64, TCG_CALL_NO_RWG_SE, i64, i64)
 DEF_HELPER_FLAGS_1(ctpop_i32, TCG_CALL_NO_RWG_SE, i32, i32)
 DEF_HELPER_FLAGS_1(ctpop_i64, TCG_CALL_NO_RWG_SE, i64, i64)
 
+DEF_HELPER_FLAGS_2(lookup_tb_ptr, TCG_CALL_NO_WG_SE, ptr, env, tl)
+
 DEF_HELPER_FLAGS_1(exit_atomic, TCG_CALL_NO_WG, noreturn, env)
 
 #ifdef CONFIG_SOFTMMU
diff --git a/tcg/tcg.h b/tcg/tcg.h
index 6c216bb..5ec48d1 100644
--- a/tcg/tcg.h
+++ b/tcg/tcg.h
@@ -699,6 +699,7 @@ struct TCGContext {
extension that allows arithmetic on void*.  */
 int code_gen_max_blocks;
 void *code_gen_prologue;
+void *code_gen_epilogue;
 void *code_gen_buffer;
 size_t code_gen_buffer_size;
 void *code_gen_ptr;
-- 
2.7.4




[Qemu-devel] [PATCH v4 04/11] tcg: export tcg_gen_lookup_and_goto_ptr

2017-04-26 Thread Emilio G. Cota
Instead of exporting goto_ptr directly to TCG frontends, export
tcg_gen_lookup_and_goto_ptr(), which calls goto_ptr with the pointer
returned by the lookup_tb_ptr() helper. This is the only use case
we have for goto_ptr and lookup_tb_ptr, so having this function is
very convenient. Furthermore, it trivially allows us to avoid calling
the lookup helper if goto_ptr is not implemented by the backend.

Suggested-by: Richard Henderson 
Reviewed-by: Richard Henderson 
Reviewed-by: Alex Bennée 
Signed-off-by: Emilio G. Cota 
---
 tcg/README   |  8 
 tcg/tcg-op.c | 13 +
 tcg/tcg-op.h | 11 +++
 3 files changed, 32 insertions(+)

diff --git a/tcg/README b/tcg/README
index a9858c2..bf49e82 100644
--- a/tcg/README
+++ b/tcg/README
@@ -477,6 +477,14 @@ current TB was linked to this TB. Otherwise execute the 
next
 instructions. Only indices 0 and 1 are valid and tcg_gen_goto_tb may be issued
 at most once with each slot index per TB.
 
+* lookup_and_goto_ptr tb_addr
+
+Look up a TB address ('tb_addr') and jump to it if valid. If not valid,
+jump to the TCG epilogue to go back to the exec loop.
+
+This operation is optional. If the TCG backend does not implement the
+goto_ptr opcode, emitting this op is equivalent to emitting exit_tb(0).
+
 * qemu_ld_i32/i64 t0, t1, flags, memidx
 * qemu_st_i32/i64 t0, t1, flags, memidx
 
diff --git a/tcg/tcg-op.c b/tcg/tcg-op.c
index 95a39b7..8ff1eaf 100644
--- a/tcg/tcg-op.c
+++ b/tcg/tcg-op.c
@@ -2587,6 +2587,19 @@ void tcg_gen_goto_tb(unsigned idx)
 tcg_gen_op1i(INDEX_op_goto_tb, idx);
 }
 
+void tcg_gen_lookup_and_goto_ptr(TCGv addr)
+{
+if (TCG_TARGET_HAS_goto_ptr) {
+TCGv_ptr ptr = tcg_temp_new_ptr();
+
+gen_helper_lookup_tb_ptr(ptr, tcg_ctx.tcg_env, addr);
+tcg_gen_op1i(INDEX_op_goto_ptr, GET_TCGV_PTR(ptr));
+tcg_temp_free_ptr(ptr);
+} else {
+tcg_gen_exit_tb(0);
+}
+}
+
 static inline TCGMemOp tcg_canonicalize_memop(TCGMemOp op, bool is64, bool st)
 {
 /* Trigger the asserts within as early as possible.  */
diff --git a/tcg/tcg-op.h b/tcg/tcg-op.h
index c68e300..5d3278f 100644
--- a/tcg/tcg-op.h
+++ b/tcg/tcg-op.h
@@ -796,6 +796,17 @@ static inline void tcg_gen_exit_tb(uintptr_t val)
  */
 void tcg_gen_goto_tb(unsigned idx);
 
+/**
+ * tcg_gen_lookup_and_goto_ptr() - look up a TB and jump to it if valid
+ * @addr: Guest address of the target TB
+ *
+ * If the TB is not valid, jump to the epilogue.
+ *
+ * This operation is optional. If the TCG backend does not implement goto_ptr,
+ * this op is equivalent to calling tcg_gen_exit_tb() with 0 as the argument.
+ */
+void tcg_gen_lookup_and_goto_ptr(TCGv addr);
+
 #if TARGET_LONG_BITS == 32
 #define tcg_temp_new() tcg_temp_new_i32()
 #define tcg_global_reg_new tcg_global_reg_new_i32
-- 
2.7.4




[Qemu-devel] [PATCH v4 06/11] target/arm: optimize cross-page direct jumps in softmmu

2017-04-26 Thread Emilio G. Cota
Instead of unconditionally exiting to the exec loop, use the
lookup_and_goto_ptr helper to jump to the target if it is valid.

Perf impact: see next commit's log.

Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 target/arm/translate.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/target/arm/translate.c b/target/arm/translate.c
index e32e38c..02cad96 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -4085,8 +4085,12 @@ static inline void gen_goto_tb(DisasContext *s, int n, 
target_ulong dest)
 gen_set_pc_im(s, dest);
 tcg_gen_exit_tb((uintptr_t)s->tb + n);
 } else {
+TCGv addr = tcg_temp_new();
+
 gen_set_pc_im(s, dest);
-tcg_gen_exit_tb(0);
+tcg_gen_extu_i32_tl(addr, cpu_R[15]);
+tcg_gen_lookup_and_goto_ptr(addr);
+tcg_temp_free(addr);
 }
 }
 
-- 
2.7.4




[Qemu-devel] [PATCH v4 01/11] exec-all: export tb_htable_lookup

2017-04-26 Thread Emilio G. Cota
Signed-off-by: Emilio G. Cota 
---
 cpu-exec.c  | 6 ++
 include/exec/exec-all.h | 2 ++
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpu-exec.c b/cpu-exec.c
index 63a56d0..5b181c1 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -309,10 +309,8 @@ static bool tb_cmp(const void *p, const void *d)
 return false;
 }
 
-static TranslationBlock *tb_htable_lookup(CPUState *cpu,
-  target_ulong pc,
-  target_ulong cs_base,
-  uint32_t flags)
+TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
+   target_ulong cs_base, uint32_t flags)
 {
 tb_page_addr_t phys_pc;
 struct tb_desc desc;
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index bcde1e6..87ae10b 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -368,6 +368,8 @@ struct TranslationBlock {
 void tb_free(TranslationBlock *tb);
 void tb_flush(CPUState *cpu);
 void tb_phys_invalidate(TranslationBlock *tb, tb_page_addr_t page_addr);
+TranslationBlock *tb_htable_lookup(CPUState *cpu, target_ulong pc,
+   target_ulong cs_base, uint32_t flags);
 
 #if defined(USE_DIRECT_JUMP)
 
-- 
2.7.4




[Qemu-devel] [PATCH v4 09/11] target/i386: optimize cross-page direct jumps in softmmu

2017-04-26 Thread Emilio G. Cota
Instead of unconditionally exiting to the exec loop, use the
gen_jr helper to jump to the target if it is valid.

Perf impact: see next commit's log.

Reviewed-by: Richard Henderson 
Signed-off-by: Emilio G. Cota 
---
 target/i386/translate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/target/i386/translate.c b/target/i386/translate.c
index f0e48dc..ea113fe 100644
--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -2154,9 +2154,9 @@ static inline void gen_goto_tb(DisasContext *s, int 
tb_num, target_ulong eip)
 gen_jmp_im(eip);
 tcg_gen_exit_tb((uintptr_t)s->tb + tb_num);
 } else {
-/* jump to another page: currently not optimized */
+/* jump to another page */
 gen_jmp_im(eip);
-gen_eob(s);
+gen_jr(s, cpu_tmp0);
 }
 }
 
-- 
2.7.4




[Qemu-devel] [PATCH v4 00/11] TCG optimizations for 2.10

2017-04-26 Thread Emilio G. Cota
v3 for context: 
https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg04795.html

Changes from v3:

- Added reviewed-by tags.

- Added a couple of suggested-by tags that I forgot to add in v3
  regarding lookup_and_goto_ptr and i386's implementation of goto_ptr.

- lookup_tb_ptr
  + Dropped the unnecessary exit_request check, as suggested by Paolo and
Richard.
  + Only get the CPU state if we get a tb from the jmp_cache, as suggested
by Richard.
  + Added tb_htable_lookup if we miss in tb_jmp_cache, as suggested by
Richard. This requires an extra patch to export tb_htable_lookup.

- goto_ptr: add IMPL(has_goto_ptr), as pointed out by Richard.

- target/arm: added a comment about gen_jr. See the v3 thread for why
  it is needed.

- target/i386: use TCGV_UNUSED instead of (ab)using NULL on a TCGv,
  as suggested by Richard. Also took his suggestion to simplify
  the addition of jr + cs_base.
  To minimize churn I renamed gen_eob_worker to do_gen_eob_worker,
  which takes the newly added argument.

I have *not* re-run all experiments, because it takes several hours and
performance hasn't changed much from v3, as can be seen in these two charts:
* spec06int user-mode, test input, v2.9.0 baseline: http://imgur.com/ME2eMq1
* spec06int softmmu, test input, v3 baseline: http://imgur.com/Clolu9Z
The perf differences are mostly due to adding the htable check. Note that
its impact is small, since tb_jmp_cache has a %hit rate in the high 90's.

You can inspect/fetch the changes at:
  https://github.com/cota/qemu/tree/tcg-opt-v4

Thanks,

Emilio




[Qemu-devel] qemu integrator hardware error: icp_control_write: Bad offset 789688

2017-04-26 Thread heavybird
Hi Peter, 

I am using your qemu integrator to debug my application but it encounters the 
exception below, and I have no idea how to interpret the message. Could you give 
me some suggestions or point me to some documentation to resolve this issue? I noticed 
there is a trace option in qemu, but I don't know how to use it 
efficiently.

thanks.
jason

qemu: hardware error: icp_control_write: Bad offset 789688

CPU #0:
R00= R01=cb789688 R02= R03=
R04=00109a0c R05=0505 R06=0606 R07=0707
R08=0808 R09=0909 R10=1010 R11=0010c46c
R12=000a R13=0007e000 R14=0028 R15=0004
PSR=61db -ZC- A und32
s00= s01= d00=
s02= s03= d01=
s04= s05= d02=
s06= s07= d03=
s08= s09= d04=
s10= s11= d05=
s12= s13= d06=
s14= s15= d07=
s16= s17= d08=
s18= s19= d09=
s20= s21= d10=
s22= s23= d11=
s24= s25= d12=
s26= s27= d13=
s28= s29= d14=
s30= s31= d15=
FPSCR: 
Aborted (core dumped)

Re: [Qemu-devel] [PATCH 1/2] Postcopy: Force allocation of all-zero precopy pages

2017-04-26 Thread Peter Xu
On Wed, Apr 26, 2017 at 09:37:43PM +0200, Andrea Arcangeli wrote:
> Hello,
> 
> On Wed, Apr 26, 2017 at 08:04:43PM +0100, Dr. David Alan Gilbert wrote:
> > * Christian Borntraeger (borntrae...@de.ibm.com) wrote:
> > > On 04/26/2017 08:37 PM, Dr. David Alan Gilbert (git) wrote:
> > > > From: "Dr. David Alan Gilbert" 
> > > > 
> > > > When an all-zero page is received during the precopy
> > > > phase of a postcopy-enabled migration we must force
> > > > allocation otherwise accesses to the page will still
> > > > get blocked by userfault.
> > > > 
> > > > Symptom:
> > > >   a) If the page is accessed by a device during device-load
> > > > then we get a deadlock as the source finishes sending
> > > > all its pages but the destination device-load is still
> > > > paused and so doesn't clean up.
> > > > 
> > > >   b) If the page is accessed later, then the thread will stay
> > > > paused until the end of migration rather than carrying on
> > > > running, until we release userfault at the end.
> > > > 
> > > > Signed-off-by: Dr. David Alan Gilbert 
> > > > Reported-by: Christian Borntraeger 
> > > 
> > > CC stable? after all the guest hangs on both sides
> > > 
> > > Has survived 40 migrations (usually failed at the 2nd)
> > > Tested-by: Christian Borntraeger 
> > 
> > Great...but.
> > Andrea (added to the mail) says this shouldn't be necessary.
> > The read we were doing in the is_zero_range() should have been sufficient
> > to get the page mapped and that zero page should have survived.
> > 
> > So - I guess that's back a step, we need to figure out why the
> > page disappears for you.
> 
> Yes reading during precopy is enough to fill the hole and prevent
> userfault missing faults to trigger.
> 
> Somehow the pagetable must be mapped by a zeropage or a hugezeropage
> or a regular page allocated during a previous precopy pass or a
> pre-zeroed subpage part of a THP.
> 
> Even if the hugezeropage is split later by a MADV_DONTNEED when
> postcopy starts, it will become 4k zeropages.
> 
> After a read succeeds, nothing (except MADV_DONTNEED or other explicit
> syscalls which qemu would need to invoke explicitly between
> is_zero_range and UFFDIO_REGISTER) should be able to bring the
> pagetable back to its "pte_none/pmd_none" state that will then trigger
> missing userfaults during postcopy later.

Whatever the final solution turns out to be (after seeing Juan's
comment, I am curious whether is_zero_page() behaves differently
on power now)... Dave, would it be worth mentioning this read
side-effect in ram_handle_compressed()? Otherwise imho it
might be hard for many people to notice this quickly.

Thanks,

-- 
Peter Xu



Re: [Qemu-devel] [PATCH v3 06/10] target/arm: optimize indirect branches

2017-04-26 Thread Emilio G. Cota
On Wed, Apr 26, 2017 at 09:54:07 +0200, Richard Henderson wrote:
> On 04/26/2017 08:23 AM, Emilio G. Cota wrote:
> >+static bool gen_jr;...
> > case DISAS_JUMP:
> >+if (gen_jr) {
> 
> Why the variable?  Why not just try the goto_ptr for any DISAS_JUMP?

We have code that assumes DISAS_JUMP implies "go to exec loop", e.g.:
case 6: /* isb */
/* We need to break the TB after this insn to execute
 * self-modifying code correctly and also to take
 * any pending interrupts immediately.
 */
gen_lookup_tb(s);
where gen_lookup_tb does:

/* Force a TB lookup after an instruction that changes the CPU state.  */
static inline void gen_lookup_tb(DisasContext *s)
{
tcg_gen_movi_i32(cpu_R[15], s->pc & ~1);
s->is_jmp = DISAS_JUMP;
}

Also, the gen_exception_* functions set DISAS_JUMP. I suspect we want to go
to the exec loop with those as well.

Testing shows that I'm onto something; if I remove the variable,
and note that I make sure DISAS_UPDATE is not falling through, I get
easily reproducible (~1 out of 5) freezes and other instability
(e.g. RCU lockup warnings) when booting + shutting down debian jessie
in the guest.

In v4 I've added a comment about this.

Thanks,

E.



Re: [Qemu-devel] [PATCH] Issue a deprecation warning if the user specifies the "-hdachs" option.

2017-04-26 Thread Thomas Huth
On 26.04.2017 15:44, Paolo Bonzini wrote:
> 
> 
> On 25/04/2017 10:08, Thomas Huth wrote:
>> If the user needs to specify the disk geometry, the corresponding
>> parameters of the "-drive" option should be used instead. "-hdachs"
>> is considered as deprecated and might be removed soon.
>>
>> Signed-off-by: Thomas Huth 
>> ---
>>  vl.c | 2 ++
>>  1 file changed, 2 insertions(+)
>>
>> diff --git a/vl.c b/vl.c
>> index 0b4ed52..0980213 100644
>> --- a/vl.c
>> +++ b/vl.c
>> @@ -3230,6 +3230,8 @@ int main(int argc, char **argv, char **envp)
>>  }
>>  }
>>  }
>> +error_report("'-hdachs' is deprecated. Please use '-drive"
>> + " ...,cyls=c,heads=h,secs=s,trans=t' 
>> instead.");
>>  break;
>>  case QEMU_OPTION_numa:
>>  opts = qemu_opts_parse_noisily(qemu_find_opts("numa"),
>>
> 
> Not even this, which should also be deprecated; please suggest using
> "-device ide-hd" instead.

D'oh, you're right, of course ... I'll send a v3 ...

 Thomas




Re: [Qemu-devel] [PATCH v2] qemu-img: use blk_co_pwrite_zeroes for zero sectors when compressed

2017-04-26 Thread 858585 jemmy
On Thu, Apr 27, 2017 at 4:25 AM, Max Reitz  wrote:
> On 21.04.2017 11:57, jemmy858...@gmail.com wrote:
>> From: Lidong Chen 
>>
>> when the buffer is zero, blk_co_pwrite_zeroes is more effectively than
>
> s/when/When/, s/effectively/effective/
>
>> blk_co_pwritev with BDRV_REQ_WRITE_COMPRESSED. this patch can reduces
>
> s/this/This/, s/reduces/reduce/
>
>> the time when converts the qcow2 image with lots of zero.
>
> s/when converts the qcow2 image/for converting qcow2 images/,
> s/zero/zero data/
>
>>
>> Signed-off-by: Lidong Chen 
>> ---
>> v2 changelog:
>> unify the compressed and non-compressed code paths
>> ---
>>  qemu-img.c | 41 +++--
>>  1 file changed, 11 insertions(+), 30 deletions(-)
>
> Functionally, looks good to me. Just some stylistic nit picks:
>
>>
>> diff --git a/qemu-img.c b/qemu-img.c
>> index b220cf7..60c9adf 100644
>> --- a/qemu-img.c
>> +++ b/qemu-img.c
>> @@ -1661,6 +1661,8 @@ static int coroutine_fn 
>> convert_co_write(ImgConvertState *s, int64_t sector_num,
>>
>>  while (nb_sectors > 0) {
>>  int n = nb_sectors;
>> +BdrvRequestFlags flags = s->compressed ? BDRV_REQ_WRITE_COMPRESSED 
>> : 0;
>> +
>>  switch (status) {
>>  case BLK_BACKING_FILE:
>>  /* If we have a backing file, leave clusters unallocated that 
>> are
>> @@ -1670,43 +1672,21 @@ static int coroutine_fn 
>> convert_co_write(ImgConvertState *s, int64_t sector_num,
>>  break;
>>
>>  case BLK_DATA:
>> -/* We must always write compressed clusters as a whole, so don't
>> - * try to find zeroed parts in the buffer. We can only save the
>> - * write if the buffer is completely zeroed and we're allowed to
>> - * keep the target sparse. */
>> -if (s->compressed) {
>> -if (s->has_zero_init && s->min_sparse &&
>> -buffer_is_zero(buf, n * BDRV_SECTOR_SIZE))
>> -{
>> -assert(!s->target_has_backing);
>> -break;
>> -}
>> -
>> -iov.iov_base = buf;
>> -iov.iov_len = n << BDRV_SECTOR_BITS;
>> -qemu_iovec_init_external(, , 1);
>> -
>> -ret = blk_co_pwritev(s->target, sector_num << 
>> BDRV_SECTOR_BITS,
>> - n << BDRV_SECTOR_BITS, ,
>> - BDRV_REQ_WRITE_COMPRESSED);
>> -if (ret < 0) {
>> -return ret;
>> -}
>> -break;
>> -}
>> -
>> -/* If there is real non-zero data or we're told to keep the 
>> target
>> - * fully allocated (-S 0), we must write it. Otherwise we can 
>> treat
>> +/* If we're told to keep the target fully allocated (-S 0) or 
>> there
>> + * is real non-zero data, we must write it. Otherwise we can 
>> treat
>>   * it as zero sectors. */
>
> I think we should still mention why there is a difference depending on
> s->compressed. Maybe like this:
>
> /* If we're told to keep the target fully allocated (-S 0) or there
>  * is real non-zero data, we must write it. Otherwise we can treat
>  * it as zero sectors.
>  * Compressed clusters need to be written as a whole, so in that
>  * case we can only save the write if the buffer is completely
>  * zeroed. */
>
>>  if (!s->min_sparse ||
>> -is_allocated_sectors_min(buf, n, , s->min_sparse))
>> -{
>> +(!s->compressed &&
>> + is_allocated_sectors_min(buf, n, , s->min_sparse)) ||
>> +(s->compressed &&
>> + !buffer_is_zero(buf, n * BDRV_SECTOR_SIZE))) {
>> +
>
> This newline is a bit weird. Normally we don't have newlines at the
> start of a block.
>
> If you (like me) think there should be a visual separation between the
> if condition and the block, I'd suggest keeping the opening brace { on
> its own line (as it is now).
>
Thanks for your review.

> Max
>
>>  iov.iov_base = buf;
>>  iov.iov_len = n << BDRV_SECTOR_BITS;
>>  qemu_iovec_init_external(, , 1);
>>
>>  ret = blk_co_pwritev(s->target, sector_num << 
>> BDRV_SECTOR_BITS,
>> - n << BDRV_SECTOR_BITS, , 0);
>> + n << BDRV_SECTOR_BITS, , flags);
>>  if (ret < 0) {
>>  return ret;
>>  }
>> @@ -1716,6 +1696,7 @@ static int coroutine_fn 
>> convert_co_write(ImgConvertState *s, int64_t sector_num,
>>
>>  case BLK_ZERO:
>>  if (s->has_zero_init) {
>> +assert(!s->target_has_backing);
>>  break;
>>  }
>>  ret = blk_co_pwrite_zeroes(s->target,
>>
>
>



[Qemu-devel] [PATCH v3] qemu-img: use blk_co_pwrite_zeroes for zero sectors when compressed

2017-04-26 Thread jemmy858585
From: Lidong Chen 

When the buffer is zero, blk_co_pwrite_zeroes is more effective than
blk_co_pwritev with BDRV_REQ_WRITE_COMPRESSED. This patch can reduce
the time for converting qcow2 images with lots of zero data.

Signed-off-by: Lidong Chen 
---
   v3 changelog:
   fix some spelling errors and code style.
   v2 changelog:
   unify the compressed and non-compressed code paths.
---
 qemu-img.c | 44 ++--
 1 file changed, 14 insertions(+), 30 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index bbe1574..872a91b7 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1661,6 +1661,8 @@ static int coroutine_fn convert_co_write(ImgConvertState 
*s, int64_t sector_num,
 
 while (nb_sectors > 0) {
 int n = nb_sectors;
+BdrvRequestFlags flags = s->compressed ? BDRV_REQ_WRITE_COMPRESSED : 0;
+
 switch (status) {
 case BLK_BACKING_FILE:
 /* If we have a backing file, leave clusters unallocated that are
@@ -1670,43 +1672,24 @@ static int coroutine_fn 
convert_co_write(ImgConvertState *s, int64_t sector_num,
 break;
 
 case BLK_DATA:
-/* We must always write compressed clusters as a whole, so don't
- * try to find zeroed parts in the buffer. We can only save the
- * write if the buffer is completely zeroed and we're allowed to
- * keep the target sparse. */
-if (s->compressed) {
-if (s->has_zero_init && s->min_sparse &&
-buffer_is_zero(buf, n * BDRV_SECTOR_SIZE))
-{
-assert(!s->target_has_backing);
-break;
-}
-
-iov.iov_base = buf;
-iov.iov_len = n << BDRV_SECTOR_BITS;
-qemu_iovec_init_external(&qiov, &iov, 1);
-
-ret = blk_co_pwritev(s->target, sector_num << BDRV_SECTOR_BITS,
- n << BDRV_SECTOR_BITS, &qiov,
- BDRV_REQ_WRITE_COMPRESSED);
-if (ret < 0) {
-return ret;
-}
-break;
-}
-
-/* If there is real non-zero data or we're told to keep the target
- * fully allocated (-S 0), we must write it. Otherwise we can treat
- * it as zero sectors. */
+/* If we're told to keep the target fully allocated (-S 0) or there
+ * is real non-zero data, we must write it. Otherwise we can treat
+ * it as zero sectors.
+ * Compressed clusters need to be written as a whole, so in that
+ * case we can only save the write if the buffer is completely
+ * zeroed. */
 if (!s->min_sparse ||
-is_allocated_sectors_min(buf, n, &n, s->min_sparse))
+(!s->compressed &&
+ is_allocated_sectors_min(buf, n, &n, s->min_sparse)) ||
+(s->compressed &&
+ !buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)))
 {
 iov.iov_base = buf;
 iov.iov_len = n << BDRV_SECTOR_BITS;
 qemu_iovec_init_external(&qiov, &iov, 1);
 
 ret = blk_co_pwritev(s->target, sector_num << BDRV_SECTOR_BITS,
- n << BDRV_SECTOR_BITS, &qiov, 0);
+ n << BDRV_SECTOR_BITS, &qiov, flags);
 if (ret < 0) {
 return ret;
 }
@@ -1716,6 +1699,7 @@ static int coroutine_fn convert_co_write(ImgConvertState 
*s, int64_t sector_num,
 
 case BLK_ZERO:
 if (s->has_zero_init) {
+assert(!s->target_has_backing);
 break;
 }
 ret = blk_co_pwrite_zeroes(s->target,
-- 
1.8.3.1




Re: [Qemu-devel] [RFC PATCH 12/20] Memory: Add func to fire pasidt_bind notifier

2017-04-26 Thread Liu, Yi L
On Wed, Apr 26, 2017 at 03:50:16PM +0200, Paolo Bonzini wrote:
> 
> 
> On 26/04/2017 12:06, Liu, Yi L wrote:
> > +void memory_region_notify_iommu_svm_bind(MemoryRegion *mr,
> > + void *data)
> > +{
> > +IOMMUNotifier *iommu_notifier;
> > +IOMMUNotifierFlag request_flags;
> > +
> > +assert(memory_region_is_iommu(mr));
> > +
> > +/*TODO: support other bind requests with smaller gran,
> > + * e.g. bind signle pasid entry
> > + */
> > +request_flags = IOMMU_NOTIFIER_SVM_PASIDT_BIND;
> > +
> > +QLIST_FOREACH(iommu_notifier, &mr->iommu_notify, node) {
> > +if (iommu_notifier->notifier_flags & request_flags) {
> > +iommu_notifier->notify(iommu_notifier, data);
> > +break;
> > +}
> > +}
> 
> Peter,
> 
> should this reuse ->notify, or should it be different function pointer
> in IOMMUNotifier?

Hi Paolo,

Thx for your review.

I think it should be “->notify” here. In this patchset, the new notifier
is registered with the existing notifier registration API, so all the
notifiers are in the mr->iommu_notify list. Notifiers are labeled
by notify flag, which makes it possible to differentiate the IOMMUNotifier nodes.
When the flag matches, the notifier is triggered via “->notify”. The diagram below shows
my understanding; I hope it helps to make this clear.

VFIOContainer
   |
   giommu_list(VFIOGuestIOMMU)
\
 VFIOGuestIOMMU1 ->   VFIOGuestIOMMU2 -> VFIOGuestIOMMU3 ...
| | |
mr->iommu_notify: IOMMUNotifier   ->IOMMUNotifier  ->  IOMMUNotifier
  (Flag:MAP/UNMAP) (Flag:SVM bind)  (Flag:tlb invalidate)


Actually, compared with the MAP/UNMAP notifier, the newly added notifier has
no start/end check, and there may be other types of bind notifier flags in the
future, so I added a separate fire func for the SVM bind notifier.

Thanks,
Yi L

> Paolo
> 



Re: [Qemu-devel] [PULL V2 8/8] COLO-compare: Optimize tcp compare trace event

2017-04-26 Thread Zhang Chen



On 04/27/2017 10:28 AM, Jason Wang wrote:



On 2017年04月26日 23:55, Peter Maydell wrote:
On 26 April 2017 at 16:18, Peter Maydell  
wrote:

On 25 April 2017 at 12:26, Jason Wang  wrote:

From: Zhang Chen 

Optimize two trace events as one, adjust print format make
it easy to read. rename trace_colo_compare_pkt_info_src/dst
to trace_colo_compare_tcp_info.

Signed-off-by: Zhang Chen 
Signed-off-by: Jason Wang 

Hi. This commit has broken building with the UST backend,
because it is effectively reverting commit 2dfe5113b11,
which split this trace line into two events because the
UST backend has a limit on the number of arguments that
a UST event can take.

Could you revert it, please?

...actually it looks like it reverts cleanly on master, so
I'll just revert it directly.

thanks
-- PMM



Zhang Chen, considering the UST limitation, I think we can just leave the 
two tracepoints as they are (of course you can improve readability).


OK, I will send an independent patch to improve readability.

Thanks
Zhang Chen



Thanks





--
Thanks
Zhang Chen






Re: [Qemu-devel] About QEMU BQL and dirty log switch in Migration

2017-04-26 Thread Yang Hongyang
Hi Paolo, Dave,

On 2017/4/26 23:46, Paolo Bonzini wrote:
> 
> 
> On 24/04/2017 18:42, Dr. David Alan Gilbert wrote:
>> I suppose there's a few questions;
>>   a) Do we actually need the BQL - and if so why

Enable/disable dirty log tracking are operations on memory regions.
That's why they need to be in BQL I think.

>>   b) What actually takes 13s?  It's probably worth figuring
>> out where it goes,  the whole bitmap is only 1GB isn't it
>> even on a 4TB machine, and even the simplest way to fill
>> that takes way less than 13s.

I found two time consuming operations in KVM module,

- one is kvm_mmu_slot_apply_flags(), when enabling dirty log tracking
kvm_vm_ioctl_set_memory_region
  |->kvm_set_memory_region
 |->__kvm_set_memory_region
|->kvm_arch_commit_memory_region
   |->kvm_mmu_slot_apply_flags
  ...

- the other is kvm_mmu_zap_collapsible_sptes(), when disabling dirty log tracking
kvm_vm_ioctl_set_memory_region
  |->kvm_set_memory_region
 |->__kvm_set_memory_region
|->kvm_arch_commit_memory_region
   |->kvm_mmu_zap_collapsible_sptes
  ...

Any ideas on how to reduce the time spent in these operations?

> 
> It's more likely that it is the migration_bitmap_sync immediately after
> that.

It's not, it's enable/disable dirty log tracking that costs time.

> 
> I think it is possible to move migration_bitmap_sync outside BQL.  It
> should be simpler to evaluate that after Juan's cleanups go in.
> 
> Paolo
> 

-- 
Thanks,
Yang



[Qemu-devel] [PATCH v9] Allow setting NUMA distance for different NUMA nodes

2017-04-26 Thread He Chen
This patch adds SLIT table support to QEMU and provides an
additional option `dist` for the `-numa` command to allow users to set vNUMA
distances on the QEMU command line.

With this patch, when a user wants to create a guest that contains
several vNUMA nodes and also wants to set the distances among those nodes,
the QEMU command would look like:

```
-numa node,nodeid=0,cpus=0 \
-numa node,nodeid=1,cpus=1 \
-numa node,nodeid=2,cpus=2 \
-numa node,nodeid=3,cpus=3 \
-numa dist,src=0,dst=1,val=21 \
-numa dist,src=0,dst=2,val=31 \
-numa dist,src=0,dst=3,val=41 \
-numa dist,src=1,dst=2,val=21 \
-numa dist,src=1,dst=3,val=31 \
-numa dist,src=2,dst=3,val=21 \
```

Signed-off-by: He Chen 

---
Changes since v8:
* numa_{node, distance}_parse --> parse_numa_{node, distance}
* Comments refinement.

Changes since v7:
* Remove unnecessary node present check.
* Minor improvement on prompt message.

Changes since v6:
* Split validate_numa_distance into 2 separate functions.
* Add comments before validate and complete numa distance functions.

Changes since v5:
* Made the generation of the SLIT dependent on `have_numa_distance`.
* Doc refinement.
---
 hw/acpi/aml-build.c |  26 +
 hw/i386/acpi-build.c|   4 ++
 include/hw/acpi/aml-build.h |   1 +
 include/sysemu/numa.h   |   2 +
 include/sysemu/sysemu.h |   4 ++
 numa.c  | 137 +++-
 qapi-schema.json|  30 +-
 qemu-options.hx |  16 +-
 8 files changed, 215 insertions(+), 5 deletions(-)

diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index c6f2032..be496c8 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -24,6 +24,7 @@
 #include "hw/acpi/aml-build.h"
 #include "qemu/bswap.h"
 #include "qemu/bitops.h"
+#include "sysemu/numa.h"
 
 static GArray *build_alloc_array(void)
 {
@@ -1609,3 +1610,28 @@ void build_srat_memory(AcpiSratMemoryAffinity *numamem, 
uint64_t base,
 numamem->base_addr = cpu_to_le64(base);
 numamem->range_length = cpu_to_le64(len);
 }
+
+/*
+ * ACPI spec 5.2.17 System Locality Distance Information Table
+ * (Revision 2.0 or later)
+ */
+void build_slit(GArray *table_data, BIOSLinker *linker)
+{
+int slit_start, i, j;
+slit_start = table_data->len;
+
+acpi_data_push(table_data, sizeof(AcpiTableHeader));
+
+build_append_int_noprefix(table_data, nb_numa_nodes, 8);
+for (i = 0; i < nb_numa_nodes; i++) {
+for (j = 0; j < nb_numa_nodes; j++) {
+assert(numa_info[i].distance[j]);
+build_append_int_noprefix(table_data, numa_info[i].distance[j], 1);
+}
+}
+
+build_header(linker, table_data,
+ (void *)(table_data->data + slit_start),
+ "SLIT",
+ table_data->len - slit_start, 1, NULL, NULL);
+}
diff --git a/hw/i386/acpi-build.c b/hw/i386/acpi-build.c
index 2073108..2458ebc 100644
--- a/hw/i386/acpi-build.c
+++ b/hw/i386/acpi-build.c
@@ -2678,6 +2678,10 @@ void acpi_build(AcpiBuildTables *tables, MachineState 
*machine)
 if (pcms->numa_nodes) {
 acpi_add_table(table_offsets, tables_blob);
 build_srat(tables_blob, tables->linker, machine);
+if (have_numa_distance) {
+acpi_add_table(table_offsets, tables_blob);
+build_slit(tables_blob, tables->linker);
+}
 }
 if (acpi_get_mcfg()) {
 acpi_add_table(table_offsets, tables_blob);
diff --git a/include/hw/acpi/aml-build.h b/include/hw/acpi/aml-build.h
index 00c21f1..329a0d0 100644
--- a/include/hw/acpi/aml-build.h
+++ b/include/hw/acpi/aml-build.h
@@ -389,4 +389,5 @@ GCC_FMT_ATTR(2, 3);
 void build_srat_memory(AcpiSratMemoryAffinity *numamem, uint64_t base,
uint64_t len, int node, MemoryAffinityFlags flags);
 
+void build_slit(GArray *table_data, BIOSLinker *linker);
 #endif
diff --git a/include/sysemu/numa.h b/include/sysemu/numa.h
index 8f09dcf..0ea1bc0 100644
--- a/include/sysemu/numa.h
+++ b/include/sysemu/numa.h
@@ -8,6 +8,7 @@
 #include "hw/boards.h"
 
 extern int nb_numa_nodes;   /* Number of NUMA nodes */
+extern bool have_numa_distance;
 
 struct numa_addr_range {
 ram_addr_t mem_start;
@@ -21,6 +22,7 @@ typedef struct node_info {
 struct HostMemoryBackend *node_memdev;
 bool present;
 QLIST_HEAD(, numa_addr_range) addr; /* List to store address ranges */
+uint8_t distance[MAX_NODES];
 } NodeInfo;
 
 extern NodeInfo numa_info[MAX_NODES];
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index 576c7ce..6999545 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -169,6 +169,10 @@ extern int mem_prealloc;
 
 #define MAX_NODES 128
 #define NUMA_NODE_UNASSIGNED MAX_NODES
+#define NUMA_DISTANCE_MIN 10
+#define NUMA_DISTANCE_DEFAULT 20
+#define NUMA_DISTANCE_MAX 254
+#define NUMA_DISTANCE_UNREACHABLE 255
 
 #define MAX_OPTION_ROMS 16
 typedef struct QEMUOptionRom {
diff --git a/numa.c 

Re: [Qemu-devel] [PATCH v15 16/21] file-posix: Add 'locking' option

2017-04-26 Thread Fam Zheng
On Wed, 04/26 14:41, Kevin Wolf wrote:
> Am 26.04.2017 um 05:34 hat Fam Zheng geschrieben:
> > Making this option available even before implementing it will make
> > converting tests easier: in coming patches they can specify the option
> > already when necessary, before we actually write code to lock the
> > images.
> > 
> > Signed-off-by: Fam Zheng 
> > ---
> >  block/file-posix.c | 5 +
> >  1 file changed, 5 insertions(+)
> > 
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index ade71db..2114720 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -392,6 +392,11 @@ static QemuOptsList raw_runtime_opts = {
> >  .type = QEMU_OPT_STRING,
> >  .help = "host AIO implementation (threads, native)",
> >  },
> > +{
> > +.name = "locking",
> > +.type = QEMU_OPT_BOOL,
> > +.help = "lock the file",
> > +},
> >  { /* end of list */ }
> >  },
> >  };
> 
> I don't mind whether it happens in this patch or when the option is
> actually implemented, but this option needs to be added to the QAPI
> schema.

I will add it.

Fam



Re: [Qemu-devel] [PULL V2 8/8] COLO-compare: Optimize tcp compare trace event

2017-04-26 Thread Jason Wang



On 2017年04月26日 23:55, Peter Maydell wrote:

On 26 April 2017 at 16:18, Peter Maydell  wrote:

On 25 April 2017 at 12:26, Jason Wang  wrote:

From: Zhang Chen 

Merge two trace events into one and adjust the print format to make
it easier to read. Rename trace_colo_compare_pkt_info_src/dst
to trace_colo_compare_tcp_info.

Signed-off-by: Zhang Chen 
Signed-off-by: Jason Wang 

Hi. This commit has broken building with the UST backend,
because it is effectively reverting commit 2dfe5113b11,
which split this trace line into two events because the
UST backend has a limit on the number of arguments that
a UST event can take.

Could you revert it, please?

...actually it looks like it reverts cleanly on master, so
I'll just revert it directly.

thanks
-- PMM



Zhang Chen, considering the UST limitation, I think we can just leave the
two tracepoints as they are (of course you can still improve readability).


Thanks



Re: [Qemu-devel] [PATCH 2/2] ramblock: add new hmp command "info ramblock"

2017-04-26 Thread Peter Xu
On Wed, Apr 26, 2017 at 02:10:16PM +0200, Markus Armbruster wrote:
> Peter Xu  writes:
> 
> > To dump information about ramblocks. It looks like:
> >
> > (qemu) info ramblock
> >   Block NamePSize  Offset   Used
> >   Total
> > /objects/mem   2M  0x 0x8000 
> > 0x8000
> > vga.vram   4K  0x8006 0x0100 
> > 0x0100
> > /rom@etc/acpi/tables   4K  0x810b 0x0002 
> > 0x0020
> >  pc.bios   4K  0x8000 0x0004 
> > 0x0004
> >   :00:03.0/e1000.rom   4K  0x8107 0x0004 
> > 0x0004
> >   pc.rom   4K  0x8004 0x0002 
> > 0x0002
> > :00:02.0/vga.rom   4K  0x8106 0x0001 
> > 0x0001
> >/rom@etc/table-loader   4K  0x812b 0x1000 
> > 0x1000
> >   /rom@etc/acpi/rsdp   4K  0x812b1000 0x1000 
> > 0x1000
> >
> > Signed-off-by: Peter Xu 
> 
> You implemented this just for HMP.
> 
> In general, functionality available in HMP should also available in QMP.
> Exceptions include functionality that makes no sense in QMP, or is of
> use only for human users.  If you think your command is an exception,
> please explain why in the commit message.
> 
> If it isn't, you need to implement it for QMP, then rewrite the HMP
> version to reuse either the QMP command or a common core.

I see. I think it should belong to the first condition, which should
only be used by human users, especially developers. Maybe I can repost
with some more comments in the commit message to emphasize that.

But, before that, I would also like to know whether anyone thinks
this would be a useful thing even for QMP... If so, I would be glad to
provide a QMP interface as well.

Thanks,

-- 
Peter Xu



Re: [Qemu-devel] [PATCH v2 0/2] ramblock: add hmp command "info ramblock"

2017-04-26 Thread Peter Xu
On Thu, Apr 27, 2017 at 09:12:43AM +0800, Fam Zheng wrote:
> On Wed, 04/26 19:13, Peter Xu wrote:
> > v2:
> > - replace "lx" with "PRIx64" in three places
> 
> ram_addr_t doesn't have a fixed bit width, to keep compilers happy, maybe you
> can stick to PRIx64 in the format string, and cast the parameters.

Will do when there is new version. Thanks!

-- 
Peter Xu



Re: [Qemu-devel] [PATCH v15 02/21] block: Define BLK_PERM_MAX

2017-04-26 Thread Fam Zheng
On Wed, 04/26 11:36, Kevin Wolf wrote:
> Am 26.04.2017 um 05:33 hat Fam Zheng geschrieben:
> > This is the order of the largest possible permission.
> > 
> > Signed-off-by: Fam Zheng 
> > ---
> >  include/block/block.h | 2 ++
> >  1 file changed, 2 insertions(+)
> > 
> > diff --git a/include/block/block.h b/include/block/block.h
> > index eb0565d..a798f10 100644
> > --- a/include/block/block.h
> > +++ b/include/block/block.h
> > @@ -224,6 +224,8 @@ enum {
> >  BLK_PERM_ALL= 0x1f,
> >  };
> >  
> > +#define BLK_PERM_MAX (64 - clz64((uint64_t)BLK_PERM_ALL))
> 
> Contrary to the commit message, this is the number of permission bits in
> use (i.e. one more than the largest possible permission). You're using
> it correctly, though, because your loop condition is i < BLK_PERM_MAX.
> 
> This could use an updated commit message and a comment at the #define at
> least. Ideally a less ambiguous name instead of the commit (because _MAX
> seems to imply what the commit message currently says, not what it
> really is), but I can't think of one.

Good point.

Giving it another thought, using BLK_PERM_ALL in the loop condition is just as easy.
I'll drop this patch.

Fam



[Qemu-devel] [PATCH v10 11/17] qcow2: Discard/zero clusters by byte count

2017-04-26 Thread Eric Blake
Passing a byte offset, but sector count, when we ultimately
want to operate on cluster granularity, is madness.  Clean up
the external interfaces to take both offset and count as bytes,
while still keeping the assertion added previously that the
caller must align the values to a cluster.  Then rename things
to make sure backports don't get confused by changed units:
instead of qcow2_discard_clusters() and qcow2_zero_clusters(),
we now have qcow2_cluster_discard() and qcow2_cluster_zeroize().

The internal functions still operate on clusters at a time, and
return an int for number of cleared clusters; but on an image
with 2M clusters, a single L2 table holds 256k entries that each
represent a 2M cluster, totalling well over INT_MAX bytes if we
ever had a request for that many bytes at once.  All our callers
currently limit themselves to 32-bit bytes (and therefore fewer
clusters), but by making this function 64-bit clean, we have one
less place to clean up if we later improve the block layer to
support 64-bit bytes through all operations (with the block layer
auto-fragmenting on behalf of more-limited drivers), rather than
the current state where some interfaces are artificially limited
to INT_MAX at a time.

Signed-off-by: Eric Blake 

---
v10: squash in fixup accounting for unaligned file end
v9: rebase to earlier changes, drop R-b
v7, v8: only earlier half of series submitted for 2.9
v6: rebase due to context
v5: s/count/byte/ to make the units obvious, and rework the math
to ensure no 32-bit integer overflow on large clusters
v4: improve function names, split assertion additions into earlier patch
[no v3 or v2]
v1: https://lists.gnu.org/archive/html/qemu-devel/2016-12/msg00339.html
---
 block/qcow2.h  |  9 +
 block/qcow2-cluster.c  | 40 +---
 block/qcow2-snapshot.c |  7 +++
 block/qcow2.c  | 21 +
 4 files changed, 38 insertions(+), 39 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index 8731f24..03f23bf 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -547,10 +547,11 @@ uint64_t 
qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs,
  int compressed_size);

 int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m);
-int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
-int nb_sectors, enum qcow2_discard_type type, bool full_discard);
-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
-int flags);
+int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
+  uint64_t bytes, enum qcow2_discard_type type,
+  bool full_discard);
+int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
+  uint64_t bytes, int flags);

 int qcow2_expand_zero_clusters(BlockDriverState *bs,
BlockDriverAmendStatusCB *status_cb,
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 4f641a9..a47aadc 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1562,16 +1562,16 @@ static int discard_single_l2(BlockDriverState *bs, 
uint64_t offset,
 return nb_clusters;
 }

-int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset,
-int nb_sectors, enum qcow2_discard_type type, bool full_discard)
+int qcow2_cluster_discard(BlockDriverState *bs, uint64_t offset,
+  uint64_t bytes, enum qcow2_discard_type type,
+  bool full_discard)
 {
 BDRVQcow2State *s = bs->opaque;
-uint64_t end_offset;
+uint64_t end_offset = offset + bytes;
 uint64_t nb_clusters;
+int64_t cleared;
 int ret;

-end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
-
 /* Caller must pass aligned values, except at image end */
 assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
 assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
@@ -1583,13 +1583,15 @@ int qcow2_discard_clusters(BlockDriverState *bs, 
uint64_t offset,

 /* Each L2 table is handled by its own loop iteration */
 while (nb_clusters > 0) {
-ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard);
-if (ret < 0) {
+cleared = discard_single_l2(bs, offset, nb_clusters, type,
+full_discard);
+if (cleared < 0) {
+ret = cleared;
 goto fail;
 }

-nb_clusters -= ret;
-offset += (ret * s->cluster_size);
+nb_clusters -= cleared;
+offset += (cleared * s->cluster_size);
 }

 ret = 0;
@@ -1652,16 +1654,15 @@ static int zero_single_l2(BlockDriverState *bs, 
uint64_t offset,
 return nb_clusters;
 }

-int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors,
-int flags)
+int qcow2_cluster_zeroize(BlockDriverState *bs, uint64_t offset,
+   

[Qemu-devel] [PATCH v10 09/17] qcow2: Optimize write zero of unaligned tail cluster

2017-04-26 Thread Eric Blake
We've already improved discards to operate efficiently on the tail
of an unaligned qcow2 image; it's time to make a similar improvement
to write zeroes.  The special case is only valid at the tail
cluster of a file, where we must recognize that any sectors beyond
the image end would implicitly read as zero, and therefore should
not penalize our logic for widening a partial cluster into writing
the whole cluster as zero.

Update test 154 to cover the new scenarios, using two images of
intentionally differing length.

While at it, fix the test to gracefully skip when run as
./check -qcow2 -o compat=0.10 154
since the older format lacks zero clusters already required earlier
in the test.

Signed-off-by: Eric Blake 

---
v10: rebase to better reporting of preallocated zero clusters
v9: new patch
---
 block/qcow2.c  |   7 +++
 tests/qemu-iotests/154 | 112 -
 tests/qemu-iotests/154.out |  83 +
 3 files changed, 200 insertions(+), 2 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 5c1573c..8038793 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -2449,6 +2449,10 @@ static bool is_zero_sectors(BlockDriverState *bs, 
int64_t start,
 BlockDriverState *file;
 int64_t res;

+if (start + count > bs->total_sectors) {
+count = bs->total_sectors - start;
+}
+
 if (!count) {
 return true;
 }
@@ -2467,6 +2471,9 @@ static coroutine_fn int 
qcow2_co_pwrite_zeroes(BlockDriverState *bs,
 uint32_t tail = (offset + count) % s->cluster_size;

 trace_qcow2_pwrite_zeroes_start_req(qemu_coroutine_self(), offset, count);
+if (offset + count == bs->total_sectors * BDRV_SECTOR_SIZE) {
+tail = 0;
+}

 if (head || tail) {
 int64_t cl_start = (offset - head) >> BDRV_SECTOR_BITS;
diff --git a/tests/qemu-iotests/154 b/tests/qemu-iotests/154
index 7ca7219..005f09f 100755
--- a/tests/qemu-iotests/154
+++ b/tests/qemu-iotests/154
@@ -2,7 +2,7 @@
 #
 # qcow2 specific bdrv_pwrite_zeroes tests with backing files (complements 034)
 #
-# Copyright (C) 2016 Red Hat, Inc.
+# Copyright (C) 2016-2017 Red Hat, Inc.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -42,7 +42,10 @@ _supported_proto file
 _supported_os Linux

 CLUSTER_SIZE=4k
-size=128M
+size=$((128 * 1024 * 1024))
+
+# This test requires zero clusters, added in v3 images
+_unsupported_imgopts compat=0.10

 echo
 echo == backing file contains zeros ==
@@ -299,6 +302,111 @@ $QEMU_IO -c "read -P 0 75k 1k" "$TEST_IMG" | 
_filter_qemu_io

 $QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map

+echo
+echo == unaligned image tail cluster, no allocation needed ==
+
+CLUSTER_SIZE=1024 TEST_IMG="$TEST_IMG.base" _make_test_img $((size + 1024))
+_make_test_img -b "$TEST_IMG.base" $((size + 2048))
+
+# With no backing file, write to all or part of unallocated partial cluster
+
+# Backing file: 128m: ... | 00 --
+$QEMU_IO -c "write -z $size 512" "$TEST_IMG.base" | _filter_qemu_io
+
+# Backing file: 128m: ... | -- 00
+$QEMU_IO -c "write -z $size 512" "$TEST_IMG.base" | _filter_qemu_io
+
+# Backing file: 128m: ... | 00 00
+$QEMU_IO -c "write -z $size 1024" "$TEST_IMG.base" | _filter_qemu_io
+
+# No offset should be allocated, although the cluster itself is considered
+# allocated due to the zero flag
+$QEMU_IO -c "alloc $size 1024" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG.base" | _filter_qemu_img_map
+
+# With backing file that reads as zero, write to all or part of entire cluster
+
+# Backing file: 128m: ... | 00 00
+# Active layer: 128m: ... | 00 00 00 00
+$QEMU_IO -c "write -z $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Backing file: 128m: ... | 00 00
+# Active layer: 128m: ... | 00 00 -- --
+$QEMU_IO -c "write -z $((size)) 1024" "$TEST_IMG" | _filter_qemu_io
+
+# Backing file: 128m: ... | 00 00
+# Active layer: 128m: ... | -- -- 00 00
+$QEMU_IO -c "write -z $((size + 1024)) 1024" "$TEST_IMG" | _filter_qemu_io
+
+# Backing file: 128m: ... | 00 00
+# Active layer: 128m: ... | -- 00 00 --
+$QEMU_IO -c "write -z $((size + 512)) 1024" "$TEST_IMG" | _filter_qemu_io
+
+# No offset should be allocated, although the cluster itself is considered
+# allocated due to the zero flag
+$QEMU_IO -c "alloc $size 2048" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map
+
+# Preallocated cluster is not freed when unmap is not requested
+
+# Backing file: 128m: ... | XX -- => ... | XX 00
+$QEMU_IO -c "write $((size)) 512" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z $((size + 512)) 512" "$TEST_IMG.base" | _filter_qemu_io
+
+# Backing file: 128m: ... | XX 00 => ... | 00 00
+$QEMU_IO -c "write -z $((size)) 1024" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IMG map 

[Qemu-devel] [PATCH v10 15/17] blkdebug: Simplify override logic

2017-04-26 Thread Eric Blake
Rather than store into a local variable, then copy to the struct
if the value is valid, and report errors otherwise, it is
simpler to just store into the struct and report errors if the
value is invalid.  This however requires that the struct store
a 64-bit number, rather than a narrower type.  Move the errno
assignment into a label that will be reused from more places in
the next patch.

Signed-off-by: Eric Blake 
Reviewed-by: Max Reitz 
Reviewed-by: Kevin Wolf 

---
v10: no change
v9: no change
v7-v8: not submitted (earlier half of series sent for 2.9)
v6: tweak error message, but R-b kept
v5: no change
v4: fix typo in commit message, move errno assignment
v3: new patch
---
 block/blkdebug.c | 14 ++
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 62d113c..a1501eb 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -38,7 +38,7 @@
 typedef struct BDRVBlkdebugState {
 int state;
 int new_state;
-int align;
+uint64_t align;

 /* For blkdebug_refresh_filename() */
 char *config_file;
@@ -353,7 +353,6 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
 BDRVBlkdebugState *s = bs->opaque;
 QemuOpts *opts;
 Error *local_err = NULL;
-uint64_t align;
 int ret;

 opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
@@ -389,12 +388,10 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
 bs->file->bs->supported_zero_flags;

 /* Set request alignment */
-align = qemu_opt_get_size(opts, "align", 0);
-if (align < INT_MAX && is_power_of_2(align)) {
-s->align = align;
-} else if (align) {
-error_setg(errp, "Invalid alignment");
-ret = -EINVAL;
+s->align = qemu_opt_get_size(opts, "align", 0);
+if (s->align && (s->align >= INT_MAX || !is_power_of_2(s->align))) {
+error_setg(errp, "Cannot meet constraints with align %" PRIu64,
+   s->align);
 goto fail_unref;
 }

@@ -402,6 +399,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
 goto out;

 fail_unref:
+ret = -EINVAL;
 bdrv_unref_child(bs, bs->file);
 out:
 if (ret < 0) {
-- 
2.9.3




[Qemu-devel] [PATCH v10 13/17] blkdebug: Refactor error injection

2017-04-26 Thread Eric Blake
Rather than repeat the logic at each caller of checking if a Rule
exists that warrants an error injection, fold that logic into
inject_error(); and rename it to rule_check() for legibility.
This will help the next patch, which adds two more callers that
need to check rules for the potential of injecting errors.

Signed-off-by: Eric Blake 
Reviewed-by: Max Reitz 

---
v10: no change
v9: no change
v7-v8: not submitted (earlier half of series sent for 2.9)
v6: new patch
---
 block/blkdebug.c | 74 +---
 1 file changed, 33 insertions(+), 41 deletions(-)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index 14d3fc5..db58db0 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -405,11 +405,30 @@ out:
 return ret;
 }

-static int inject_error(BlockDriverState *bs, BlkdebugRule *rule)
+static int rule_check(BlockDriverState *bs, uint64_t offset, uint64_t bytes)
 {
 BDRVBlkdebugState *s = bs->opaque;
-int error = rule->options.inject.error;
-bool immediately = rule->options.inject.immediately;
+BlkdebugRule *rule = NULL;
+int error;
+bool immediately;
+
+QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
+uint64_t inject_offset = rule->options.inject.offset;
+
+if (inject_offset == -1 ||
+(bytes && inject_offset >= offset &&
+ inject_offset < offset + bytes))
+{
+break;
+}
+}
+
+if (!rule || !rule->options.inject.error) {
+return 0;
+}
+
+immediately = rule->options.inject.immediately;
+error = rule->options.inject.error;

 if (rule->options.inject.once) {
 QSIMPLEQ_REMOVE(&s->active_rules, rule, BlkdebugRule, active_next);
@@ -428,8 +447,7 @@ static int coroutine_fn
 blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
QEMUIOVector *qiov, int flags)
 {
-BDRVBlkdebugState *s = bs->opaque;
-BlkdebugRule *rule = NULL;
+int err;

 /* Sanity check block layer guarantees */
 assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
@@ -438,18 +456,9 @@ blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, 
uint64_t bytes,
 assert(bytes <= bs->bl.max_transfer);
 }

-QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
-uint64_t inject_offset = rule->options.inject.offset;
-
-if (inject_offset == -1 ||
-(inject_offset >= offset && inject_offset < offset + bytes))
-{
-break;
-}
-}
-
-if (rule && rule->options.inject.error) {
-return inject_error(bs, rule);
+err = rule_check(bs, offset, bytes);
+if (err) {
+return err;
 }

 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
@@ -459,8 +468,7 @@ static int coroutine_fn
 blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes,
 QEMUIOVector *qiov, int flags)
 {
-BDRVBlkdebugState *s = bs->opaque;
-BlkdebugRule *rule = NULL;
+int err;

 /* Sanity check block layer guarantees */
 assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
@@ -469,18 +477,9 @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, 
uint64_t bytes,
 assert(bytes <= bs->bl.max_transfer);
 }

-QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
-uint64_t inject_offset = rule->options.inject.offset;
-
-if (inject_offset == -1 ||
-(inject_offset >= offset && inject_offset < offset + bytes))
-{
-break;
-}
-}
-
-if (rule && rule->options.inject.error) {
-return inject_error(bs, rule);
+err = rule_check(bs, offset, bytes);
+if (err) {
+return err;
 }

 return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
@@ -488,17 +487,10 @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t 
offset, uint64_t bytes,

 static int blkdebug_co_flush(BlockDriverState *bs)
 {
-BDRVBlkdebugState *s = bs->opaque;
-BlkdebugRule *rule = NULL;
+int err = rule_check(bs, 0, 0);

-QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
-if (rule->options.inject.offset == -1) {
-break;
-}
-}
-
-if (rule && rule->options.inject.error) {
-return inject_error(bs, rule);
+if (err) {
+return err;
 }

 return bdrv_co_flush(bs->file->bs);
-- 
2.9.3




[Qemu-devel] [PATCH v10 14/17] blkdebug: Add pass-through write_zero and discard support

2017-04-26 Thread Eric Blake
In order to test the effects of artificial geometry constraints
on operations like write zero or discard, we first need blkdebug
to manage these actions.  It also allows us to inject errors on
those operations, just like we can for read/write/flush.

We can also test the contract promised by the block layer; namely,
if a device has specified limits on alignment or maximum size,
then those limits must be obeyed (for now, the blkdebug driver
merely inherits limits from whatever it is wrapping, but the next
patch will further enhance it to allow specific limit overrides).

This patch intentionally refuses to service requests smaller than
the requested alignments; this is because an upcoming patch adds
a qemu-iotest to prove that the block layer is correctly handling
fragmentation, but the test only works if there is a way to tell
the difference at artificial alignment boundaries when blkdebug is
using a larger-than-default alignment.  If we let the blkdebug
layer always defer to the underlying layer, which potentially has
a smaller granularity, the iotest will be thwarted.

Tested by setting up an NBD server with export 'foo', then invoking:
$ ./qemu-io
qemu-io> open -o driver=blkdebug blkdebug::nbd://localhost:10809/foo
qemu-io> d 0 15M
qemu-io> w -z 0 15M

Pre-patch, the server never sees the discard (it was silently
eaten by the block layer); post-patch it is passed across the
wire.  Likewise, pre-patch the write is always passed with
NBD_WRITE (with 15M of zeroes on the wire), while post-patch
it can utilize NBD_WRITE_ZEROES (for less traffic).

Signed-off-by: Eric Blake 
Reviewed-by: Max Reitz 

---
v10: no change
v9: no change
v7-v8: not submitted (earlier half of series sent for 2.9)
v6: tighten check of unaligned requests, rebase on rule check
refactoring, drop R-b
v5: include 2017 copyright
v4: correct error injection to respect byte range, tweak formatting
v3: rebase to byte-based read/write, improve docs on why no
partial write zero passthrough
v2: new patch
---
 block/blkdebug.c | 74 
 1 file changed, 74 insertions(+)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index db58db0..62d113c 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -1,6 +1,7 @@
 /*
  * Block protocol for I/O error injection
  *
+ * Copyright (C) 2016-2017 Red Hat, Inc.
  * Copyright (c) 2010 Kevin Wolf 
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -382,6 +383,11 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
 goto out;
 }

+bs->supported_write_flags = BDRV_REQ_FUA &
+bs->file->bs->supported_write_flags;
+bs->supported_zero_flags = (BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP) &
+bs->file->bs->supported_zero_flags;
+
 /* Set request alignment */
 align = qemu_opt_get_size(opts, "align", 0);
 if (align < INT_MAX && is_power_of_2(align)) {
@@ -496,6 +502,72 @@ static int blkdebug_co_flush(BlockDriverState *bs)
 return bdrv_co_flush(bs->file->bs);
 }

+static int coroutine_fn blkdebug_co_pwrite_zeroes(BlockDriverState *bs,
+  int64_t offset, int count,
+  BdrvRequestFlags flags)
+{
+uint32_t align = MAX(bs->bl.request_alignment,
+ bs->bl.pwrite_zeroes_alignment);
+int err;
+
+/* Only pass through requests that are larger than requested
+ * preferred alignment (so that we test the fallback to writes on
+ * unaligned portions), and check that the block layer never hands
+ * us anything unaligned that crosses an alignment boundary.  */
+if (count < align) {
+assert(QEMU_IS_ALIGNED(offset, align) ||
+   QEMU_IS_ALIGNED(offset + count, align) ||
+   DIV_ROUND_UP(offset, align) ==
+   DIV_ROUND_UP(offset + count, align));
+return -ENOTSUP;
+}
+assert(QEMU_IS_ALIGNED(offset, align));
+assert(QEMU_IS_ALIGNED(count, align));
+if (bs->bl.max_pwrite_zeroes) {
+assert(count <= bs->bl.max_pwrite_zeroes);
+}
+
+err = rule_check(bs, offset, count);
+if (err) {
+return err;
+}
+
+return bdrv_co_pwrite_zeroes(bs->file, offset, count, flags);
+}
+
+static int coroutine_fn blkdebug_co_pdiscard(BlockDriverState *bs,
+ int64_t offset, int count)
+{
+uint32_t align = bs->bl.pdiscard_alignment;
+int err;
+
+/* Only pass through requests that are larger than requested
+ * minimum alignment, and ensure that unaligned requests do not
+ * cross optimum discard boundaries. */
+if (count < bs->bl.request_alignment) {
+assert(QEMU_IS_ALIGNED(offset, align) ||
+   QEMU_IS_ALIGNED(offset + count, align) ||
+   DIV_ROUND_UP(offset, align) ==
+   DIV_ROUND_UP(offset + 

[Qemu-devel] [PATCH v10 08/17] qemu-io: Switch 'map' output to byte-based reporting

2017-04-26 Thread Eric Blake
Mixing byte offset and sector allocation counts is a bit
confusing.  Also, reporting n/m sectors, where m decreases
according to the remaining size of the file, isn't really
adding any useful information.  Update the output to use
byte counts, and adjust the affected tests (./check -qcow2 102,
./check -vpc 146).

Signed-off-by: Eric Blake 

---
v10: rebase to updated test 179
v9: new patch
---
 qemu-io-cmds.c |  5 ++---
 tests/qemu-iotests/102.out |  4 ++--
 tests/qemu-iotests/146.out | 30 +++---
 tests/qemu-iotests/179.out | 22 +++---
 4 files changed, 30 insertions(+), 31 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 34f6707..40a3f2d 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1895,9 +1895,8 @@ static int map_f(BlockBackend *blk, int argc, char **argv)

 retstr = ret ? "allocated" : "not allocated";
 cvtstr(offset << 9ULL, s1, sizeof(s1));
-printf("[% 24" PRId64 "] % 8" PRId64 "/% 8" PRId64 " sectors %s "
-   "at offset %s (%d)\n",
-   offset << 9ULL, num, nb_sectors, retstr, s1, ret);
+printf("[% 24" PRId64 "] % 16" PRId64 " bytes %s at offset %s (%d)\n",
+   offset << 9ULL, num << 9ULL, retstr, s1, ret);

 offset += num;
 nb_sectors -= num;
diff --git a/tests/qemu-iotests/102.out b/tests/qemu-iotests/102.out
index eecde16..0e0ae4c 100644
--- a/tests/qemu-iotests/102.out
+++ b/tests/qemu-iotests/102.out
@@ -6,7 +6,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=65536
 wrote 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 Image resized.
-[   0]  128/ 128 sectors allocated at offset 0 
bytes (1)
+[   0]65536 bytes allocated at offset 0 
bytes (1)
 Offset  Length  Mapped to   File

 === Testing map on an image file truncated outside of qemu ===
@@ -17,5 +17,5 @@ wrote 65536/65536 bytes at offset 0
 Image resized.
 QEMU X.Y.Z monitor - type 'help' for more information
 (qemu) qemu-io drv0 map
-[   0]  128/ 128 sectors allocated at offset 0 
bytes (1)
+[   0]65536 bytes allocated at offset 0 
bytes (1)
 *** done
diff --git a/tests/qemu-iotests/146.out b/tests/qemu-iotests/146.out
index 4f334d8..d4e2f62 100644
--- a/tests/qemu-iotests/146.out
+++ b/tests/qemu-iotests/146.out
@@ -2,39 +2,39 @@ QA output created by 146

 === Testing VPC Autodetect ===

-[   0]  266334240/ 266334240 sectors not allocated at 
offset 0 bytes (0)
+[   0] 136363130880 bytes not allocated at offset 0 
bytes (0)

 === Testing VPC with current_size force ===

-[   0]  266338304/ 266338304 sectors not allocated at 
offset 0 bytes (0)
+[   0] 136365211648 bytes not allocated at offset 0 
bytes (0)

 === Testing VPC with chs force ===

-[   0]  266334240/ 266334240 sectors not allocated at 
offset 0 bytes (0)
+[   0] 136363130880 bytes not allocated at offset 0 
bytes (0)

 === Testing Hyper-V Autodetect ===

-[   0]  266338304/ 266338304 sectors not allocated at 
offset 0 bytes (0)
+[   0] 136365211648 bytes not allocated at offset 0 
bytes (0)

 === Testing Hyper-V with current_size force ===

-[   0]  266338304/ 266338304 sectors not allocated at 
offset 0 bytes (0)
+[   0] 136365211648 bytes not allocated at offset 0 
bytes (0)

 === Testing Hyper-V with chs force ===

-[   0]  266334240/ 266334240 sectors not allocated at 
offset 0 bytes (0)
+[   0] 136363130880 bytes not allocated at offset 0 
bytes (0)

 === Testing d2v Autodetect ===

-[   0]   514560/  514560 sectors allocated at offset 0 
bytes (1)
+[   0]263454720 bytes allocated at offset 0 
bytes (1)

 === Testing d2v with current_size force ===

-[   0]   514560/  514560 sectors allocated at offset 0 
bytes (1)
+[   0]263454720 bytes allocated at offset 0 
bytes (1)

 === Testing d2v with chs force ===

-[   0]   514560/  514560 sectors allocated at offset 0 
bytes (1)
+[   0]263454720 bytes allocated at offset 0 
bytes (1)

 === Testing Image create, default ===

@@ -42,15 +42,15 @@ Formatting 'TEST_DIR/IMGFMT-create-test.IMGFMT', fmt=IMGFMT 
size=4294967296

 === Read created image, default opts 

-[   0]  8389584/ 8389584 sectors not allocated at offset 0 
bytes (0)
+[   0]   4295467008 bytes not allocated at offset 0 
bytes (0)

 === Read created image, force_size_calc=chs 

-[   0]  8389584/ 

[Qemu-devel] [PATCH v10 16/17] blkdebug: Add ability to override unmap geometries

2017-04-26 Thread Eric Blake
Make it easier to simulate various unusual hardware setups (for
example, recent commits 3482b9b and b8d0a98 affect the Dell
Equallogic iSCSI with its 15M preferred and maximum unmap and
write zero sizing, or b2f95fe deals with the Linux loopback
block device having a max_transfer of 64k), by allowing blkdebug
to wrap any other device with further restrictions on various
alignments.

Signed-off-by: Eric Blake 
Reviewed-by: Max Reitz 

---
v10: no change
v9: rebase to master (dropped #optional tags)
v7-v8: not submitted (earlier half of series sent for 2.9)
v6: more tweaks in docs and error messages
v5: tweak docs regarding max-transfer minimum
v4: relax 512 byte minimum now that blkdebug is byte-based, fix doc typo
v3: improve legibility of bounds checking, improve docs
v2: new patch
---
 qapi/block-core.json | 33 --
 block/blkdebug.c | 96 +++-
 2 files changed, 125 insertions(+), 4 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 87fb747..2082242 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2430,8 +2430,33 @@
 #
 # @config:  filename of the configuration file
 #
-# @align:   required alignment for requests in bytes,
-#   must be power of 2, or 0 for default
+# @align:   required alignment for requests in bytes, must be
+#   positive power of 2, or 0 for default
+#
+# @max-transfer:maximum size for I/O transfers in bytes, must be
+#   positive multiple of @align and of the underlying
+#   file's request alignment (but need not be a power of
+#   2), or 0 for default (since 2.10)
+#
+# @opt-write-zero:  preferred alignment for write zero requests in bytes,
+#   must be positive multiple of @align and of the
+#   underlying file's request alignment (but need not be a
+#   power of 2), or 0 for default (since 2.10)
+#
+# @max-write-zero:  maximum size for write zero requests in bytes, must be
+#   positive multiple of @align, of @opt-write-zero, and of
+#   the underlying file's request alignment (but need not
+#   be a power of 2), or 0 for default (since 2.10)
+#
+# @opt-discard: preferred alignment for discard requests in bytes, must
+#   be positive multiple of @align and of the underlying
+#   file's request alignment (but need not be a power of
+#   2), or 0 for default (since 2.10)
+#
+# @max-discard: maximum size for discard requests in bytes, must be
+#   positive multiple of @align, of @opt-discard, and of
+#   the underlying file's request alignment (but need not
+#   be a power of 2), or 0 for default (since 2.10)
 #
 # @inject-error:array of error injection descriptions
 #
@@ -2442,7 +2467,9 @@
 { 'struct': 'BlockdevOptionsBlkdebug',
   'data': { 'image': 'BlockdevRef',
 '*config': 'str',
-'*align': 'int',
+'*align': 'int', '*max-transfer': 'int32',
+'*opt-write-zero': 'int32', '*max-write-zero': 'int32',
+'*opt-discard': 'int32', '*max-discard': 'int32',
 '*inject-error': ['BlkdebugInjectErrorOptions'],
 '*set-state': ['BlkdebugSetStateOptions'] } }

diff --git a/block/blkdebug.c b/block/blkdebug.c
index a1501eb..4ccdceb 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -39,6 +39,11 @@ typedef struct BDRVBlkdebugState {
 int state;
 int new_state;
 uint64_t align;
+uint64_t max_transfer;
+uint64_t opt_write_zero;
+uint64_t max_write_zero;
+uint64_t opt_discard;
+uint64_t max_discard;

 /* For blkdebug_refresh_filename() */
 char *config_file;
@@ -343,6 +348,31 @@ static QemuOptsList runtime_opts = {
 .type = QEMU_OPT_SIZE,
 .help = "Required alignment in bytes",
 },
+{
+.name = "max-transfer",
+.type = QEMU_OPT_SIZE,
+.help = "Maximum transfer size in bytes",
+},
+{
+.name = "opt-write-zero",
+.type = QEMU_OPT_SIZE,
+.help = "Optimum write zero alignment in bytes",
+},
+{
+.name = "max-write-zero",
+.type = QEMU_OPT_SIZE,
+.help = "Maximum write zero size in bytes",
+},
+{
+.name = "opt-discard",
+.type = QEMU_OPT_SIZE,
+.help = "Optimum discard alignment in bytes",
+},
+{
+.name = "max-discard",
+.type = QEMU_OPT_SIZE,
+.help = "Maximum discard size in bytes",
+},
 { /* end of list */ }
 },
 };
@@ -354,6 +384,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict 
*options, int flags,
 QemuOpts *opts;

[Qemu-devel] [PATCH v10 05/17] iotests: Add test 179 to cover write zeroes with unmap

2017-04-26 Thread Eric Blake
No tests were covering write zeroes with unmap.  Additionally,
I needed to prove that my previous patches for correct status
reporting and write zeroes optimizations actually had an impact.

The test works for cluster_size between 8k and 2M (for smaller
sizes, it fails because our allocation patterns are not contiguous
with small clusters).

Signed-off-by: Eric Blake 

---
v10: drop any changes to v2 files, rewrite test to work with updates
earlier in the series, add a blkdebug probe
v9: new patch
---
 tests/qemu-iotests/179 | 123 +
 tests/qemu-iotests/179.out |  90 +
 tests/qemu-iotests/group   |   1 +
 3 files changed, 214 insertions(+)
 create mode 100755 tests/qemu-iotests/179
 create mode 100644 tests/qemu-iotests/179.out

diff --git a/tests/qemu-iotests/179 b/tests/qemu-iotests/179
new file mode 100755
index 0000000..4e4ffce
--- /dev/null
+++ b/tests/qemu-iotests/179
@@ -0,0 +1,123 @@
+#!/bin/bash
+#
+# Test case for write zeroes with unmap
+#
+# Copyright (C) 2017 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=ebl...@redhat.com
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+
+here="$PWD"
+status=1   # failure is the default!
+
+_cleanup()
+{
+   _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto file
+_supported_os Linux
+
+# v2 images can't mark clusters as zero
+_unsupported_imgopts compat=0.10
+
+echo
+echo '=== Testing write zeroes with unmap ==='
+echo
+
+TEST_IMG="$TEST_IMG.base" _make_test_img 64M
+_make_test_img -b "$TEST_IMG.base"
+
+# Offsets chosen at or near 2M boundaries so test works at any cluster size
+
+# Aligned writes to unallocated cluster should not allocate mapping, but must
+# mark cluster as zero, whether or not unmap was requested
+$QEMU_IO -c "write -z -u 2M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 6M 2M" "$TEST_IMG.base" | _filter_qemu_io
+
+# Unaligned writes need not allocate mapping if the cluster already reads
+# as zero, but must mark cluster as zero, whether or not unmap was requested
+$QEMU_IO -c "write -z -u 10485761 2097150" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 14680065 2097150" "$TEST_IMG.base" | _filter_qemu_io
+
+# Requesting unmap of normal data must deallocate; omitting unmap should
+# preserve the mapping
+$QEMU_IO -c "write 18M 14M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z -u 20M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 24M 6M" "$TEST_IMG.base" | _filter_qemu_io
+
+# Likewise when writing on already-mapped zero data
+$QEMU_IO -c "write -z -u 26M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 28M 2M" "$TEST_IMG.base" | _filter_qemu_io
+
+# Writing on unmapped zeroes does not allocate
+$QEMU_IO -c "write -z 32M 8M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z -u 34M 2M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z 36M 2M" "$TEST_IMG.base" | _filter_qemu_io
+
+# Writing zero overrides a backing file, regardless of backing cluster type
+$QEMU_IO -c "write -z 40M 8M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write 48M 8M" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IO -c "write -z -u 42M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z 44M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z -u 50M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z 52M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z -u 58M 2M" "$TEST_IMG" | _filter_qemu_io
+$QEMU_IO -c "write -z 60M 2M" "$TEST_IMG" | _filter_qemu_io
+
+# Final check that mappings are correct and images are still sane
+$QEMU_IO -c "map" "$TEST_IMG.base" | _filter_qemu_io
+$QEMU_IMG map --output=json "$TEST_IMG" | _filter_qemu_img_map |
+sed 's/"offset": [0-9]*/"offset": OFFSET/g'
+TEST_IMG="$TEST_IMG.base" _check_test_img
+_check_test_img
+
+echo
+echo '=== Testing cache optimization ==='
+echo
+
+BLKDBG_TEST_IMG="blkdebug:$TEST_DIR/blkdebug.conf:$TEST_IMG.base"
+
+cat > "$TEST_DIR/blkdebug.conf" <

[Qemu-devel] [PATCH v10 17/17] tests: Add coverage for recent block geometry fixes

2017-04-26 Thread Eric Blake
Use blkdebug's new geometry constraints to emulate setups that
have caused recent regression fixes: write zeroes asserting
when running through a loopback block device with max-transfer
smaller than cluster size, and discard rounding away portions
of requests not aligned to preferred boundaries.  Also, add
coverage that the block layer is honoring max transfer limits.

For now, a single iotest performs all actions, with the idea
that we can add future blkdebug constraint test cases in the
same file; but it can be split into multiple iotests if we find
reason to run one portion of the test in more setups than what
are possible in the other.

For reference, the final portion of the test (checking whether
discard passes as much as possible to the lowest layers of the
stack) works as follows:

qemu-io: discard 30M at 80000001, passed to blkdebug
  blkdebug: discard 511 bytes at 80000001, -ENOTSUP (smaller than
            blkdebug's 512 align)
  blkdebug: discard 14371328 bytes at 80000512, passed to qcow2
    qcow2: discard 739840 bytes at 80000512, -ENOTSUP (smaller than
           qcow2's 1M align)
    qcow2: discard 13M bytes at 77M, succeeds
  blkdebug: discard 15M bytes at 90M, passed to qcow2
    qcow2: discard 15M bytes at 90M, succeeds
  blkdebug: discard 1356800 bytes at 105M, passed to qcow2
    qcow2: discard 1M at 105M, succeeds
    qcow2: discard 308224 bytes at 106M, -ENOTSUP (smaller than qcow2's
           1M align)
  blkdebug: discard 1 byte at 111457280, -ENOTSUP (smaller than
            blkdebug's 512 align)

Signed-off-by: Eric Blake 
Reviewed-by: Max Reitz 

---

[yes, I added tests 177 and 179 out of order during this series.
Oh well; I'm tired of renumbering this one.]

v10: no change, rebase to context
v9: no change
v7-v8: not submitted (earlier half of series sent for 2.9)
v6: rebase to master by renumbering s/175/177/
v5: rebase to master by renumbering s/173/175/
v4: clean up some comments, nicer backing file creation, more commit message
v3: make comments tied more to test at hand, rather than the
particular hardware that led to the earlier patches being tested
v2: new patch
---
 tests/qemu-iotests/177 | 114 +
 tests/qemu-iotests/177.out |  49 +++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 164 insertions(+)
 create mode 100755 tests/qemu-iotests/177
 create mode 100644 tests/qemu-iotests/177.out

diff --git a/tests/qemu-iotests/177 b/tests/qemu-iotests/177
new file mode 100755
index 0000000..e4ddec7
--- /dev/null
+++ b/tests/qemu-iotests/177
@@ -0,0 +1,114 @@
+#!/bin/bash
+#
+# Test corner cases with unusual block geometries
+#
+# Copyright (C) 2016-2017 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=ebl...@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+status=1   # failure is the default!
+
+_cleanup()
+{
+   _cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+_supported_fmt qcow2
+_supported_proto file
+
+CLUSTER_SIZE=1M
+size=128M
+options=driver=blkdebug,image.driver=qcow2
+
+echo
+echo "== setting up files =="
+
+TEST_IMG="$TEST_IMG.base" _make_test_img $size
+$QEMU_IO -c "write -P 11 0 $size" "$TEST_IMG.base" | _filter_qemu_io
+_make_test_img -b "$TEST_IMG.base"
+$QEMU_IO -c "write -P 22 0 $size" "$TEST_IMG" | _filter_qemu_io
+
+# Limited to 64k max-transfer
+echo
+echo "== constrained alignment and max-transfer =="
+limits=align=4k,max-transfer=64k
+$QEMU_IO -c "open -o $options,$limits blkdebug::$TEST_IMG" \
+ -c "write -P 33 1000 128k" -c "read -P 33 1000 128k" | _filter_qemu_io
+
+echo
+echo "== write zero with constrained max-transfer =="
+limits=align=512,max-transfer=64k,opt-write-zero=$CLUSTER_SIZE
+$QEMU_IO -c "open -o $options,$limits blkdebug::$TEST_IMG" \
+ -c "write -z 8003584 2093056" | _filter_qemu_io
+
+# non-power-of-2 write-zero/discard alignments
+echo
+echo "== non-power-of-2 write zeroes limits =="
+
+limits=align=512,opt-write-zero=15M,max-write-zero=15M,opt-discard=15M,max-discard=15M
+$QEMU_IO -c "open -o $options,$limits blkdebug::$TEST_IMG" \
+ -c "write -z 32M 32M" | _filter_qemu_io
+
+echo
+echo "== non-power-of-2 discard limits =="
+

[Qemu-devel] [PATCH v10 07/17] qemu-io: Switch 'alloc' command to byte-based length

2017-04-26 Thread Eric Blake
For the 'alloc' command, accepting an offset in bytes but a length
in sectors, and reporting output in sectors, is confusing.  Do
everything in bytes, and adjust the expected output accordingly.

Signed-off-by: Eric Blake 
Reviewed-by: Philippe Mathieu-Daudé 

---
v10: rebase to code cleanup
v9: new patch
---
 qemu-io-cmds.c| 30 ++
 tests/qemu-iotests/019.out|  8 
 tests/qemu-iotests/common.pattern |  2 +-
 3 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index fabc394..34f6707 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1760,7 +1760,7 @@ out:
 static int alloc_f(BlockBackend *blk, int argc, char **argv)
 {
 BlockDriverState *bs = blk_bs(blk);
-int64_t offset, sector_num, nb_sectors, remaining;
+int64_t offset, sector_num, nb_sectors, remaining, bytes;
 char s1[64];
 int num, ret;
 int64_t sum_alloc;
@@ -1776,18 +1776,24 @@ static int alloc_f(BlockBackend *blk, int argc, char 
**argv)
 }

 if (argc == 3) {
-nb_sectors = cvtnum(argv[2]);
-if (nb_sectors < 0) {
-print_cvtnum_err(nb_sectors, argv[2]);
+bytes = cvtnum(argv[2]);
+if (bytes < 0) {
+print_cvtnum_err(bytes, argv[2]);
 return 0;
-} else if (nb_sectors > INT_MAX) {
-printf("length argument cannot exceed %d, given %s\n",
-   INT_MAX, argv[2]);
+} else if (bytes > INT_MAX * BDRV_SECTOR_SIZE) {
+printf("length argument cannot exceed %llu, given %s\n",
+   INT_MAX * BDRV_SECTOR_SIZE, argv[2]);
 return 0;
 }
 } else {
-nb_sectors = 1;
+bytes = BDRV_SECTOR_SIZE;
 }
+if (!QEMU_IS_ALIGNED(bytes, BDRV_SECTOR_SIZE)) {
+printf("bytes %" PRId64 " is not sector aligned\n",
+   bytes);
+return 0;
+}
+nb_sectors = bytes >> BDRV_SECTOR_BITS;

 remaining = nb_sectors;
 sum_alloc = 0;
@@ -1811,8 +1817,8 @@ static int alloc_f(BlockBackend *blk, int argc, char 
**argv)

 cvtstr(offset, s1, sizeof(s1));

-printf("%"PRId64"/%"PRId64" sectors allocated at offset %s\n",
-   sum_alloc, nb_sectors, s1);
+printf("%"PRId64"/%"PRId64" bytes allocated at offset %s\n",
+   sum_alloc << BDRV_SECTOR_BITS, nb_sectors << BDRV_SECTOR_BITS, s1);
 return 0;
 }

@@ -1822,8 +1828,8 @@ static const cmdinfo_t alloc_cmd = {
 .argmin = 1,
 .argmax = 2,
 .cfunc  = alloc_f,
-.args   = "off [sectors]",
-.oneline= "checks if a sector is present in the file",
+.args   = "offset [bytes]",
+.oneline= "checks if offset is allocated in the file",
 };


diff --git a/tests/qemu-iotests/019.out b/tests/qemu-iotests/019.out
index 0124264..17a7c03 100644
--- a/tests/qemu-iotests/019.out
+++ b/tests/qemu-iotests/019.out
@@ -542,8 +542,8 @@ Testing conversion with -B TEST_DIR/t.IMGFMT.base

 Checking if backing clusters are allocated when they shouldn't

-0/128 sectors allocated at offset 1 MiB
-0/128 sectors allocated at offset 4.001 GiB
+0/65536 bytes allocated at offset 1 MiB
+0/65536 bytes allocated at offset 4.001 GiB
 Reading

 === IO: pattern 42
@@ -1086,8 +1086,8 @@ Testing conversion with -o 
backing_file=TEST_DIR/t.IMGFMT.base

 Checking if backing clusters are allocated when they shouldn't

-0/128 sectors allocated at offset 1 MiB
-0/128 sectors allocated at offset 4.001 GiB
+0/65536 bytes allocated at offset 1 MiB
+0/65536 bytes allocated at offset 4.001 GiB
 Reading

 === IO: pattern 42
diff --git a/tests/qemu-iotests/common.pattern 
b/tests/qemu-iotests/common.pattern
index ddfbca1..34f4a8d 100644
--- a/tests/qemu-iotests/common.pattern
+++ b/tests/qemu-iotests/common.pattern
@@ -18,7 +18,7 @@

 function do_is_allocated() {
 local start=$1
-local size=$(( $2 / 512))
+local size=$2
 local step=$3
 local count=$4

-- 
2.9.3




[Qemu-devel] [PATCH v10 04/17] qcow2: Optimize zero_single_l2() to minimize L2 churn

2017-04-26 Thread Eric Blake
Similar to discard_single_l2(), we should try to avoid dirtying
the L2 cache when the cluster we are changing already has the
right characteristics.

Note that by the time we get to zero_single_l2(), BDRV_REQ_MAY_UNMAP
is a requirement to unallocate a cluster (this is because the block
layer clears that flag if discard.* flags during open requested that
we never punch holes - see the conversation around commit 170f4b2e,
https://lists.gnu.org/archive/html/qemu-devel/2016-09/msg07306.html).
Therefore, this patch can only reuse a zero cluster as-is if either
unmapping is not requested, or if the zero cluster was not associated
with an allocation.

Technically, there are some cases where an unallocated cluster
already reads as all zeroes (namely, when there is no backing file
[easy: check bs->backing], or when the backing file also reads as
zeroes [harder: we can't check bdrv_get_block_status since we are
already holding the lock]), where the guest would not immediately see
a difference if we left that cluster unallocated.  But if the user
did not request unmapping, leaving an unallocated cluster is wrong;
and even if the user DID request unmapping, keeping a cluster
unallocated risks a subtle semantic change of guest-visible contents
if a backing file is later added, and it is not worth auditing
whether all internal uses such as mirror properly avoid an unmap
request.  Thus, this patch is intentionally limited to just clusters
that are already marked as zero.

Signed-off-by: Eric Blake 

---
v10: new patch, replacing earlier attempt to use unallocated clusters,
and ditching any optimization of v2 files
---
 block/qcow2-cluster.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index db3d937..d542894 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1614,6 +1614,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t 
offset,
 int l2_index;
 int ret;
 int i;
+bool unmap = !!(flags & BDRV_REQ_MAY_UNMAP);

 ret = get_cluster_table(bs, offset, &l2_table, &l2_index);
 if (ret < 0) {
@@ -1629,9 +1630,17 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t 
offset,

 old_offset = be64_to_cpu(l2_table[l2_index + i]);

-/* Update L2 entries */
+/*
+ * Minimize L2 changes if the cluster already reads back as
+ * zeroes with correct allocation.
+ */
+if (qcow2_get_cluster_type(old_offset) == QCOW2_CLUSTER_ZERO &&
+!(unmap && old_offset & L2E_OFFSET_MASK)) {
+continue;
+}
+
 qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table);
-if (old_offset & QCOW_OFLAG_COMPRESSED || flags & BDRV_REQ_MAY_UNMAP) {
+if (old_offset & QCOW_OFLAG_COMPRESSED || unmap) {
 l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO);
 qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST);
 } else {
-- 
2.9.3




[Qemu-devel] [PATCH v10 06/17] qemu-io: Don't open-code QEMU_IS_ALIGNED

2017-04-26 Thread Eric Blake
Manual comparison against 0x1ff is not as clean as using our
alignment macros from osdep.h.

Suggested-by: Philippe Mathieu-Daudé 
Signed-off-by: Eric Blake 

---
v10: new patch
---
 qemu-io-cmds.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 21af9e6..fabc394 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -740,12 +740,12 @@ static int read_f(BlockBackend *blk, int argc, char 
**argv)
 }

 if (bflag) {
-if (offset & 0x1ff) {
+if (!QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)) {
 printf("offset %" PRId64 " is not sector aligned\n",
offset);
 return 0;
 }
-if (count & 0x1ff) {
+if (!QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE)) {
 printf("count %"PRId64" is not sector aligned\n",
count);
 return 0;
@@ -1050,13 +1050,13 @@ static int write_f(BlockBackend *blk, int argc, char 
**argv)
 }

 if (bflag || cflag) {
-if (offset & 0x1ff) {
+if (!QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)) {
 printf("offset %" PRId64 " is not sector aligned\n",
offset);
 return 0;
 }

-if (count & 0x1ff) {
+if (!QEMU_IS_ALIGNED(count, BDRV_SECTOR_SIZE)) {
 printf("count %"PRId64" is not sector aligned\n",
count);
 return 0;
@@ -1769,7 +1769,7 @@ static int alloc_f(BlockBackend *blk, int argc, char 
**argv)
 if (offset < 0) {
 print_cvtnum_err(offset, argv[1]);
 return 0;
-} else if (offset & 0x1ff) {
+} else if (!QEMU_IS_ALIGNED(offset, BDRV_SECTOR_SIZE)) {
 printf("offset %" PRId64 " is not sector aligned\n",
offset);
 return 0;
-- 
2.9.3




Re: [Qemu-devel] [PATCH v15 18/21] block: Reuse bs as backing hd for drive-backup sync=none

2017-04-26 Thread Fam Zheng
On Wed, 04/26 16:34, Kevin Wolf wrote:
> Am 26.04.2017 um 15:15 hat Fam Zheng geschrieben:
> > On Wed, 04/26 14:52, Kevin Wolf wrote:
> > > Am 26.04.2017 um 05:34 hat Fam Zheng geschrieben:
> > > > Signed-off-by: Fam Zheng 
> > > 
> > > The commit message is a bit terse. :-)
> > > 
> > > So I think this means that instead of opening the backing file of the
> > > backup (which is probably the active file of the VM) a second time, we
> > > instead take a reference on the existing node. Very good idea.
> > > 
> > > In fact, my question is: How did this ever work? Did we just neglect to
> > > test this? The backup backing file will use stale qcow2 metadata when we
> > > continue to write to the source.
> > 
> > Reading from the target BDS would be a problem, but I guess it's relatively
> > uncommon:
> > 
> > - In QEMU code, COW is always done manually in backup_do_cow, so no reading 
> > from
> >   target->backing in block layer.
> > 
> > - Writing to target is done in target cluster granularity so no COW too.
> > 
> > But it's still a fully valid point, for example it can be exported to NBD, 
> > etc.
> 
> Which I believe is the exact setup for fleecing.

Hmm, yes, you must be right. That said, I've always considered blockdev-add with
a backing reference to be the only valid way to do fleecing; this one is IMHO a
hack. And when it was added, there was no way to export it over NBD because it
didn't have a device id.

> 
> And actually, if we were sure that we never read from the file, we
> wouldn't even have to attach the backing file in the first place.

OK, fair enough.

> 
> > > Unless I'm missing something, I'd propose this for qemu-stable.
> > 
> > Yes, sounds good.
> > 
> > > 
> > > Also, mirror with MIRROR_OPEN_BACKING_CHAIN probably has a similar
> > > problem with opening the images in the backing chain a second time.
> > 
> > Seems so. But if there was such a test case, image locking would have
> > complained.
> > 
> > Is that a practical use case?  Mirror target image uses the active image as
> > backing file? (It's a bit late in the evening, please remind me :-)
> 
> I believe storage migration uses a mode like this, but to be honest I
> don't remember the details at the moment either.
> 
> But I do see that NEW_IMAGE_MODE_ABSOLUTE_PATHS enters the source as the
> backing file of the target and opens it (a second instance of it) when
> the mirror completes. Hopefully, by that time the source doesn't have
> another user that keeps it alive (and I think op blockers should
> actually ensure this), but it looks a bit fishy. Probably worth a closer
> look.

Mirror has a lot of graph manipulation magic, which seems even darker than commit.
We should perhaps allocate some time to see if there are cleaner ways to do it,
with all the new infrastructure we have. Hopefully it's not hard to do, like
this one.

Fam



[Qemu-devel] [PATCH v10 12/17] blkdebug: Sanity check block layer guarantees

2017-04-26 Thread Eric Blake
Commits 04ed95f4 and 1a62d0ac updated the block layer to auto-fragment
any I/O to fit within device boundaries. Additionally, when using a
minimum alignment of 4k, we want to ensure the block layer does proper
read-modify-write rather than requesting I/O on a slice of a sector.
Let's enforce that the contract is obeyed when using blkdebug.  For
now, blkdebug only allows alignment overrides, and just inherits other
limits from whatever device it is wrapping, but a future patch will
further enhance things.

Signed-off-by: Eric Blake 
Reviewed-by: Kevin Wolf 
Reviewed-by: Max Reitz 

---
v10: no change
v5-v9: no change
v4: no change
v3: rebase to byte-based interfaces
v2: new patch
---
 block/blkdebug.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/block/blkdebug.c b/block/blkdebug.c
index c795ae9..14d3fc5 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -431,6 +431,13 @@ blkdebug_co_preadv(BlockDriverState *bs, uint64_t offset, 
uint64_t bytes,
 BDRVBlkdebugState *s = bs->opaque;
 BlkdebugRule *rule = NULL;

+/* Sanity check block layer guarantees */
+assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
+assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment));
+if (bs->bl.max_transfer) {
+assert(bytes <= bs->bl.max_transfer);
+}
+
 QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
 uint64_t inject_offset = rule->options.inject.offset;

@@ -455,6 +462,13 @@ blkdebug_co_pwritev(BlockDriverState *bs, uint64_t offset, 
uint64_t bytes,
 BDRVBlkdebugState *s = bs->opaque;
 BlkdebugRule *rule = NULL;

+/* Sanity check block layer guarantees */
+assert(QEMU_IS_ALIGNED(offset, bs->bl.request_alignment));
+assert(QEMU_IS_ALIGNED(bytes, bs->bl.request_alignment));
+if (bs->bl.max_transfer) {
+assert(bytes <= bs->bl.max_transfer);
+}
+
 QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) {
 uint64_t inject_offset = rule->options.inject.offset;

-- 
2.9.3




[Qemu-devel] [PATCH v10 00/17] add blkdebug tests

2017-04-26 Thread Eric Blake
Available as a tag at:
git fetch git://repo.or.cz/qemu/ericb.git nbd-blkdebug-v10

Prerequisites: Max's block-next tree:
https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg01298.html
Fam's fix for test 109:
https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg03195.html

v9 was:
https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg01723.html

Since then:
- Rebase to master
- Add even more qcow2 patches to make tracking of preallocated
zero clusters correct
- replace v9 1/13 with 4/17 that is not as invasive (no change to
v2 format, and no questionable use of unallocated clusters)
- rewrite v9 2/13 (iotests 179) with 5/17, as something more robust

001/17:[down] 'block: Update comments on BDRV_BLOCK_* meanings'
002/17:[down] 'qcow2: Correctly report status of preallocated zero clusters'
003/17:[down] 'qcow2: Reuse preallocated zero clusters'
004/17:[down] 'qcow2: Optimize zero_single_l2() to minimize L2 churn'
005/17:[0187] [FC] 'iotests: Add test 179 to cover write zeroes with unmap'
006/17:[down] 'qemu-io: Don't open-code QEMU_IS_ALIGNED'
007/17:[0010] [FC] 'qemu-io: Switch 'alloc' command to byte-based length'
008/17:[0022] [FC] 'qemu-io: Switch 'map' output to byte-based reporting'
009/17:[0006] [FC] 'qcow2: Optimize write zero of unaligned tail cluster'
010/17:[----] [-C] 'qcow2: Assert that cluster operations are aligned'
011/17:[0005] [FC] 'qcow2: Discard/zero clusters by byte count'
012/17:[----] [--] 'blkdebug: Sanity check block layer guarantees'
013/17:[----] [--] 'blkdebug: Refactor error injection'
014/17:[----] [--] 'blkdebug: Add pass-through write_zero and discard support'
015/17:[----] [--] 'blkdebug: Simplify override logic'
016/17:[----] [--] 'blkdebug: Add ability to override unmap geometries'
017/17:[----] [-C] 'tests: Add coverage for recent block geometry fixes'

Eric Blake (16):
  block: Update comments on BDRV_BLOCK_* meanings
  qcow2: Correctly report status of preallocated zero clusters
  qcow2: Optimize zero_single_l2() to minimize L2 churn
  iotests: Add test 179 to cover write zeroes with unmap
  qemu-io: Don't open-code QEMU_IS_ALIGNED
  qemu-io: Switch 'alloc' command to byte-based length
  qemu-io: Switch 'map' output to byte-based reporting
  qcow2: Optimize write zero of unaligned tail cluster
  qcow2: Assert that cluster operations are aligned
  qcow2: Discard/zero clusters by byte count
  blkdebug: Sanity check block layer guarantees
  blkdebug: Refactor error injection
  blkdebug: Add pass-through write_zero and discard support
  blkdebug: Simplify override logic
  blkdebug: Add ability to override unmap geometries
  tests: Add coverage for recent block geometry fixes

Max Reitz (1):
  qcow2: Reuse preallocated zero clusters

 qapi/block-core.json  |  33 -
 block/qcow2.h |  12 +-
 include/block/block.h |  13 +-
 block/blkdebug.c  | 264 +++---
 block/qcow2-cluster.c | 177 +
 block/qcow2-snapshot.c|   7 +-
 block/qcow2.c |  28 ++--
 qemu-io-cmds.c|  45 ---
 tests/qemu-iotests/019.out|   8 +-
 tests/qemu-iotests/102.out|   4 +-
 tests/qemu-iotests/146.out|  30 ++---
 tests/qemu-iotests/154| 112 +++-
 tests/qemu-iotests/154.out|  83 
 tests/qemu-iotests/177| 114 
 tests/qemu-iotests/177.out|  49 +++
 tests/qemu-iotests/179| 123 ++
 tests/qemu-iotests/179.out|  90 +
 tests/qemu-iotests/common.pattern |   2 +-
 tests/qemu-iotests/group  |   2 +
 19 files changed, 1024 insertions(+), 172 deletions(-)
 create mode 100755 tests/qemu-iotests/177
 create mode 100644 tests/qemu-iotests/177.out
 create mode 100755 tests/qemu-iotests/179
 create mode 100644 tests/qemu-iotests/179.out

-- 
2.9.3




[Qemu-devel] [PATCH v10 10/17] qcow2: Assert that cluster operations are aligned

2017-04-26 Thread Eric Blake
We already audited (in commit 0c1bd469) that qcow2_discard_clusters()
is only passed cluster-aligned start values; but we can further
tighten the assertion that the only unaligned end value is at EOF.

Recent commits have taken advantage of an unaligned tail cluster,
for both discard and write zeroes.

Signed-off-by: Eric Blake 

---
v10: rebase to context
v9: rebase to master, by asserting that only tail cluster is unaligned
v7, v8: only earlier half of series submitted for 2.9
v6: avoid assertion on non-cluster-aligned image, use s->cluster_sectors
to avoid a shift, drop R-b
v5: no change
v4: new patch
---
 block/qcow2-cluster.c | 15 +++
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d542894..4f641a9 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1572,11 +1572,10 @@ int qcow2_discard_clusters(BlockDriverState *bs, 
uint64_t offset,

 end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);

-/* The caller must cluster-align start; round end down except at EOF */
+/* Caller must pass aligned values, except at image end */
 assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
-if (end_offset != bs->total_sectors * BDRV_SECTOR_SIZE) {
-end_offset = start_of_cluster(s, end_offset);
-}
+assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
+   end_offset == bs->total_sectors << BDRV_SECTOR_BITS);

 nb_clusters = size_to_clusters(s, end_offset - offset);

@@ -1657,9 +1656,17 @@ int qcow2_zero_clusters(BlockDriverState *bs, uint64_t 
offset, int nb_sectors,
 int flags)
 {
 BDRVQcow2State *s = bs->opaque;
+uint64_t end_offset;
 uint64_t nb_clusters;
 int ret;

+end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS);
+
+/* Caller must pass aligned values, except at image end */
+assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
+assert(QEMU_IS_ALIGNED(end_offset, s->cluster_size) ||
+   end_offset == bs->total_sectors << BDRV_SECTOR_BITS);
+
 /* The zero flag is only supported by version 3 and newer */
 if (s->qcow_version < 3) {
 return -ENOTSUP;
-- 
2.9.3




[Qemu-devel] [PATCH v10 03/17] qcow2: Reuse preallocated zero clusters

2017-04-26 Thread Eric Blake
From: Max Reitz 

Instead of just freeing preallocated zero clusters and completely
allocating them from scratch, reuse them.

We cannot do this in handle_copied(), however, since this is a COW
operation. Therefore, we have to add the new logic to handle_alloc() and
simply return the existing offset if it exists. The only catch is that
we have to convince qcow2_alloc_cluster_link_l2() not to free the old
clusters (because we have reused them).

Reported-by: Eric Blake 
Signed-off-by: Max Reitz 
Signed-off-by: Eric Blake 

---
v10: new patch. Max hasn't posted the patch directly on list, but
did mention it here:
https://lists.gnu.org/archive/html/qemu-devel/2017-04/msg03936.html
and that he would post it once he has tests. Well, my later patches
add a test that requires this one :)  The other two patches that
he mentioned there are also good, but not essential for my series
(and I didn't want to write tests to expose the behavior difference,
because it would deprive Max of that fun).
---
 block/qcow2.h |  3 ++
 block/qcow2-cluster.c | 83 +++
 2 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index f8aeb08..8731f24 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -322,6 +322,9 @@ typedef struct QCowL2Meta
 /** Number of newly allocated clusters */
 int nb_clusters;

+/** Do not free the old clusters */
+bool keep_old_clusters;
+
 /**
  * Requests that overlap with this allocation and wait to be restarted
  * when the allocating request has completed.
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d1063df..db3d937 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -309,14 +309,20 @@ static int count_contiguous_clusters(int nb_clusters, int 
cluster_size,
 uint64_t *l2_table, uint64_t stop_flags)
 {
 int i;
+int first_cluster_type;
 uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED;
 uint64_t first_entry = be64_to_cpu(l2_table[0]);
 uint64_t offset = first_entry & mask;

-if (!offset)
+if (!offset) {
 return 0;
+}

-assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL);
+/* must be allocated */
+first_cluster_type = qcow2_get_cluster_type(first_entry);
+assert(first_cluster_type == QCOW2_CLUSTER_NORMAL ||
+   (first_cluster_type == QCOW2_CLUSTER_ZERO &&
+(first_entry & L2E_OFFSET_MASK) != 0));

 for (i = 0; i < nb_clusters; i++) {
 uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
@@ -857,7 +863,7 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, 
QCowL2Meta *m)
  * Don't discard clusters that reach a refcount of 0 (e.g. compressed
  * clusters), the next write will reuse them anyway.
  */
-if (j != 0) {
+if (!m->keep_old_clusters && j != 0) {
 for (i = 0; i < j; i++) {
 qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1,
 QCOW2_DISCARD_NEVER);
@@ -1154,8 +1160,9 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
 uint64_t entry;
 uint64_t nb_clusters;
 int ret;
+bool keep_old_clusters = false;

-uint64_t alloc_cluster_offset;
+uint64_t alloc_cluster_offset = 0;

 trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset,
  *bytes);
@@ -1192,31 +1199,53 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
  * wrong with our code. */
 assert(nb_clusters > 0);

+if (!*host_offset && qcow2_get_cluster_type(entry) == QCOW2_CLUSTER_ZERO &&
+(entry & L2E_OFFSET_MASK) != 0 && (entry & QCOW_OFLAG_COPIED))
+{
+/* Try to reuse preallocated zero clusters; contiguous normal clusters
+ * would be fine, too, but count_cow_clusters() above has limited
+ * nb_clusters already to a range of COW clusters */
+int preallocated_nb_clusters =
+count_contiguous_clusters(nb_clusters, s->cluster_size, l2_table,
+  QCOW_OFLAG_COPIED);
+
+if (preallocated_nb_clusters) {
+nb_clusters = preallocated_nb_clusters;
+alloc_cluster_offset = entry & L2E_OFFSET_MASK;
+
+/* We want to reuse these clusters, so 
qcow2_alloc_cluster_link_l2()
+ * should not free them. */
+keep_old_clusters = true;
+}
+}
+
 qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table);

-/* Allocate, if necessary at a given offset in the image file */
-alloc_cluster_offset = start_of_cluster(s, *host_offset);
-ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset,
-  &nb_clusters);
-if (ret < 0) {
-goto fail;
-}
-
-/* Can't extend contiguous 

[Qemu-devel] [PATCH v10 02/17] qcow2: Correctly report status of preallocated zero clusters

2017-04-26 Thread Eric Blake
We were throwing away the preallocation information associated with
zero clusters.  But we should be matching the well-defined semantics
in bdrv_get_block_status(), where (BDRV_BLOCK_ZERO |
BDRV_BLOCK_OFFSET_VALID) informs the user which offset is reserved,
while still reminding the user that reading from that offset is
likely to read garbage.

Making this change lets us see which portions of an image are zero
but preallocated, when using qemu-img map --output=json.  The
--output=human side intentionally ignores all zero clusters, whether
or not they are preallocated.

The fact that there is no change to qemu-iotests './check -qcow2'
merely means that we aren't yet testing this aspect of qemu-img;
a later patch will add a test.

Signed-off-by: Eric Blake 

---
v10: new patch
---
 block/qcow2-cluster.c | 32 +++-
 1 file changed, 27 insertions(+), 5 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 100398c..d1063df 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -328,6 +328,10 @@ static int count_contiguous_clusters(int nb_clusters, int 
cluster_size,
return i;
 }

+/*
+ * Checks how many consecutive clusters in a given L2 table have the same
+ * cluster type with no corresponding allocation.
+ */
 static int count_contiguous_clusters_by_type(int nb_clusters,
  uint64_t *l2_table,
  int wanted_type)
@@ -335,9 +339,10 @@ static int count_contiguous_clusters_by_type(int 
nb_clusters,
 int i;

 for (i = 0; i < nb_clusters; i++) {
-int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
+uint64_t entry = be64_to_cpu(l2_table[i]);
+int type = qcow2_get_cluster_type(entry);

-if (type != wanted_type) {
+if (type != wanted_type || entry & L2E_OFFSET_MASK) {
 break;
 }
 }
@@ -559,9 +564,26 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, 
uint64_t offset,
 ret = -EIO;
 goto fail;
 }
-c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
-  QCOW2_CLUSTER_ZERO);
-*cluster_offset = 0;
+/* Distinguish between pure zero clusters and pre-allocated ones */
+if (*cluster_offset & L2E_OFFSET_MASK) {
+c = count_contiguous_clusters(nb_clusters, s->cluster_size,
+  &l2_table[l2_index], QCOW_OFLAG_ZERO);
+*cluster_offset &= L2E_OFFSET_MASK;
+if (offset_into_cluster(s, *cluster_offset)) {
+qcow2_signal_corruption(bs, true, -1, -1,
+"Preallocated zero cluster offset %#"
+PRIx64 " unaligned (L2 offset: %#"
+PRIx64 ", L2 index: %#x)",
+*cluster_offset, l2_offset, l2_index);
+ret = -EIO;
+goto fail;
+}
+} else {
+c = count_contiguous_clusters_by_type(nb_clusters,
+  &l2_table[l2_index],
+  QCOW2_CLUSTER_ZERO);
+*cluster_offset = 0;
+}
 break;
 case QCOW2_CLUSTER_UNALLOCATED:
 /* how many empty clusters ? */
-- 
2.9.3




[Qemu-devel] [PATCH v10 01/17] block: Update comments on BDRV_BLOCK_* meanings

2017-04-26 Thread Eric Blake
We had some conflicting documentation: a nice 8-way table that
described all possible combinations of DATA, ZERO, and
OFFSET_VALID, coupled with text that implied that OFFSET_VALID
always meant raw data could be read directly.  As the 8-way
table is the intended semantics, simplify the rest of the text
to get rid of the confusion.

Suggested-by: Max Reitz 
Signed-off-by: Eric Blake 

---
v10: new patch
---
 include/block/block.h | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 862eb56..04fcbd0 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -121,21 +121,22 @@ typedef struct HDGeometry {

 /*
  * Allocation status flags
- * BDRV_BLOCK_DATA: data is read from a file returned by bdrv_get_block_status.
+ * BDRV_BLOCK_DATA: data is read from a file returned by bdrv_get_block_status
  * BDRV_BLOCK_ZERO: sectors read as zero
- * BDRV_BLOCK_OFFSET_VALID: sector stored as raw data in a file returned by
- *  bdrv_get_block_status.
+ * BDRV_BLOCK_OFFSET_VALID: an associated offset exists for accessing raw data
  * BDRV_BLOCK_ALLOCATED: the content of the block is determined by this
  *   layer (as opposed to the backing file)
  * BDRV_BLOCK_RAW: used internally to indicate that the request
  * was answered by the raw driver and that one
  * should look in bs->file directly.
  *
- * If BDRV_BLOCK_OFFSET_VALID is set, bits 9-62 represent the offset in
- * bs->file where sector data can be read from as raw data.
- *
  * DATA == 0 && ZERO == 0 means that data is read from backing_hd if present.
  *
+ * If BDRV_BLOCK_OFFSET_VALID is set, bits 9-62 represent the offset
+ * in bs->file that is allocated for the corresponding raw data;
+ * however, whether that offset actually contains data also depends on
+ * BDRV_BLOCK_DATA, as follows:
+ *
  * DATA ZERO OFFSET_VALID
  *  ttt   sectors read as zero, bs->file is zero at offset
  *  tft   sectors read as valid from bs->file at offset
-- 
2.9.3




Re: [Qemu-devel] [PATCH v15 19/21] osdep: Add qemu_lock_fd and qemu_unlock_fd

2017-04-26 Thread Fam Zheng
On Wed, 04/26 15:29, Daniel P. Berrange wrote:
> On Wed, Apr 26, 2017 at 09:20:28PM +0800, Fam Zheng wrote:
> > On Wed, 04/26 14:57, Kevin Wolf wrote:
> > > Am 26.04.2017 um 05:34 hat Fam Zheng geschrieben:
> > > > They are wrappers of POSIX fcntl "file private locking", with a
> > > > convenient "try lock" wrapper implemented with F_OFD_GETLK.
> > > > 
> > > > Signed-off-by: Fam Zheng 
> > > > Reviewed-by: Max Reitz 
> > > > ---
> > > >  include/qemu/osdep.h |  3 +++
> > > >  util/osdep.c | 48 
> > > > 
> > > >  2 files changed, 51 insertions(+)
> > > > 
> > > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
> > > > index 122ff06..1c9f5e2 100644
> > > > --- a/include/qemu/osdep.h
> > > > +++ b/include/qemu/osdep.h
> > > > @@ -341,6 +341,9 @@ int qemu_close(int fd);
> > > >  #ifndef _WIN32
> > > >  int qemu_dup(int fd);
> > > >  #endif
> > > > +int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive);
> > > > +int qemu_unlock_fd(int fd, int64_t start, int64_t len);
> > > > +int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool 
> > > > exclusive);
> > > 
> > > For the record: On IRC, I proposed adding something like the following:
> > > 
> > > #ifndef F_OFD_SETLK
> > > #define F_OFD_SETLK F_SETLK
> > > #define F_OFD_GETLK F_GETLK
> > > #endif
> > > 
> > > F_OFD_* are still relatively new and e.g. RHEL 7 doesn't support it yet.
> > > Using process-based locks is suboptimal because we can easily lose them
> > > earlier than we want, but it's still better than nothing and covers the
> > > common simple cases.
> > 
> > Yes, we should add that. But I'd prefer:
> > 
> > #ifdef F_OFD_SETLK
> > #define QEMU_SETLK F_OFD_SETLK
> > #define QEMU_GETLK F_OFD_GETLK
> > #else
> > #define QEMU_SETLK F_SETLK
> > #define QEMU_GETLK F_GETLK
> > #endif
> > 
> > to avoid "abusing" the macro name.
> > 
> > Another question is whether we should print a warning to make users aware? 
> > Even
> > the test case in patch 21 can see three "lock losses" on RHEL with posix 
> > lock,
> > and there are way more corner cases, I believe.
> 
> Is it possible to use F_GETLK to detect when we have a missing lock, then
> add assertions in various key places that call F_GETLK to validate that
> the lock still exists & print a warning.

Possibly, but it's ugly to do, and probably isn't worth the effort in finding
all the places where losses happen and where locks should be checked against
losses. Besides (maybe I'm missing something), it's also hard to use F_GETLK to
find lost locks, because whether the lock is lost or not, it can return F_UNLCK.

> 
> I don't like the idea that downstream would be running with locking enabled,
> but silently losing locks with no indication at all that this happened,
> especially when upstream development won't be testing this since those devs
> will use the F_OFD_SETLK instead.

Yes, see my reply to Kevin on the last patch: we could default to disabled if
there is no OFD lock (via documented locking=auto, no magic), and avoid
surprising users.

Fam



Re: [Qemu-devel] [PATCH v15 21/21] qemu-iotests: Add test case 153 for image locking

2017-04-26 Thread Fam Zheng
On Wed, 04/26 16:49, Kevin Wolf wrote:
> Am 26.04.2017 um 05:34 hat Fam Zheng geschrieben:
> > Signed-off-by: Fam Zheng 
> 
> I'll do the actual review of the test case later, but I already have a
> question... Can we find a way to accept the expected lock losses on
> systems without OFD? I'm using RHEL 7 as my main development OS, so I
> wouldn't like having to deal with false positives there.

Split the lock loss cases to a separate case and _not_run? :)

> 
> Hm... And actually, printing a warning will probably invalidate all the
> other reference outputs, too. After all, every single invocation of
> qemu/qemu-img/qemu-io will print a warning. Maybe that's a bit too much
> even for manual users when they can't do anything about it (which is
> unlike the format probing warning that you can get rid of by simply
> changing your command line).

An idea: make locking=on/off/auto, and auto is on iff OFD is usable, otherwise
off. When user specifies locking=on but OFD is unusable, we print a warning.

Only in this one test case will the warning be printed on RHEL, and it's easy to
filter it out.

Fam



Re: [Qemu-devel] [PATCH v2 0/2] ramblock: add hmp command "info ramblock"

2017-04-26 Thread Fam Zheng
On Wed, 04/26 19:13, Peter Xu wrote:
> v2:
> - replace "lx" with "PRIx64" in three places

ram_addr_t doesn't have a fixed bit width, to keep compilers happy, maybe you
can stick to PRIx64 in the format string, and cast the parameters.

Fam



Re: [Qemu-devel] [PATCH 1/2] target/openrisc: Implement EVBAR register

2017-04-26 Thread Tim Ansell
I'm about to add support for disabling the inbuilt or1k timer peripheral
(as our SoC does not have it enabled). That isn't really a CPU feature so I
think it still makes sense to have some type of feature field? Maybe CPU
features should just be a separate category and set directly on that
register?

I also think it makes sense to maybe define a couple of new CPU
configurations. I think we might want,

 * or1k-all - OpenRISC CPU with all emulated features enabled.
 * or1k-minimal - Only features enabled which can't be disabled.
 * mor1k-default - Configured to match the defaults in the mor1k verilog
repo.

Then I think we also want configs for the "major users" of mor1k like,

 * mor1k-litex - Configured to match the default config used by litex/misoc
(maybe mor1k-misoc as an alias?)
 * mor1k-fusesoc - Configured to match the default of FuseSoC
 * mor1k-minsoc - Configured to match the default of minsoc (which is
different to misoc)

Thoughts?

Tim 'mithro' Ansell

On Apr 18, 2017 10:47 PM, "Stafford Horne"  wrote:

On Tue, Apr 18, 2017 at 04:15:50PM +1000, Tim 'mithro' Ansell wrote:
> Exception Vector Base Address Register (EVBAR) - This optional register
> can be used to apply an offset to the exception vector addresses.
>
> The significant bits (31-12) of the vector offset address for each
> exception depend on the setting of the Supervision Register (SR)'s EPH
> bit and the Exception Vector Base Address Register (EVBAR).
>
> Its presence is indicated by the EVBARP bit in the CPU Configuration
> Register (CPUCFGR).
>
> Signed-off-by: Tim 'mithro' Ansell 
> ---
>  target/openrisc/cpu.c| 2 ++
>  target/openrisc/cpu.h| 7 +++
>  target/openrisc/interrupt.c  | 6 +-
>  target/openrisc/sys_helper.c | 7 +++
>  4 files changed, 21 insertions(+), 1 deletion(-)
>
> diff --git a/target/openrisc/cpu.c b/target/openrisc/cpu.c
> index 7fd2b9a216..1524ed981a 100644
> --- a/target/openrisc/cpu.c
> +++ b/target/openrisc/cpu.c
> @@ -134,6 +134,7 @@ static void or1200_initfn(Object *obj)
>
>  set_feature(cpu, OPENRISC_FEATURE_OB32S);
>  set_feature(cpu, OPENRISC_FEATURE_OF32S);
> +set_feature(cpu, OPENRISC_FEATURE_EVBAR);
>  }
>
>  static void openrisc_any_initfn(Object *obj)
> @@ -141,6 +142,7 @@ static void openrisc_any_initfn(Object *obj)
>  OpenRISCCPU *cpu = OPENRISC_CPU(obj);
>
>  set_feature(cpu, OPENRISC_FEATURE_OB32S);
> +set_feature(cpu, OPENRISC_FEATURE_EVBAR);
>  }
>
>  typedef struct OpenRISCCPUInfo {
> diff --git a/target/openrisc/cpu.h b/target/openrisc/cpu.h
> index 418a0e6960..1958b72718 100644
> --- a/target/openrisc/cpu.h
> +++ b/target/openrisc/cpu.h
> @@ -111,6 +111,11 @@ enum {
>  CPUCFGR_OF32S = (1 << 7),
>  CPUCFGR_OF64S = (1 << 8),
>  CPUCFGR_OV64S = (1 << 9),
> +/* CPUCFGR_ND = (1 << 10), */
> +/* CPUCFGR_AVRP = (1 << 11), */
> +CPUCFGR_EVBARP = (1 << 12),
> +/* CPUCFGR_ISRP = (1 << 13), */
> +/* CPUCFGR_AECSRP = (1 << 14), */
>  };
>
>  /* DMMU configure register */
> @@ -200,6 +205,7 @@ enum {
>  OPENRISC_FEATURE_OF32S = (1 << 7),
>  OPENRISC_FEATURE_OF64S = (1 << 8),
>  OPENRISC_FEATURE_OV64S = (1 << 9),
> +OPENRISC_FEATURE_EVBAR = (1 << 12),

Does anyone know why we have to add both Features and CPUCFG bits?

It seems like duplication to me.

>  };
>
>  /* Tick Timer Mode Register */
> @@ -289,6 +295,7 @@ typedef struct CPUOpenRISCState {
>  uint32_t dmmucfgr;/* DMMU configure register */
>  uint32_t immucfgr;/* IMMU configure register */
>  uint32_t esr; /* Exception supervisor register */
> +uint32_t evbar;   /* Exception vector base address register
*/

If we add something to CPUOpenRISCState, we ideally need to also add it to
machine.c vmstate definition.  However, I currently have a patch out to
rework the vmstate serialization.  I can add this to that patch.

>  uint32_t fpcsr;   /* Float register */
>  float_status fp_status;
>
> diff --git a/target/openrisc/interrupt.c b/target/openrisc/interrupt.c
> index a2eec6fb32..78f0ba9421 100644
> --- a/target/openrisc/interrupt.c
> +++ b/target/openrisc/interrupt.c
> @@ -65,7 +65,11 @@ void openrisc_cpu_do_interrupt(CPUState *cs)
>  env->lock_addr = -1;
>
>  if (cs->exception_index > 0 && cs->exception_index < EXCP_NR) {
> -env->pc = (cs->exception_index << 8);
> +hwaddr vect_pc = cs->exception_index << 8;
> +if (env->cpucfgr & CPUCFGR_EVBARP) {
> +vect_pc |= env->evbar;
> +}
> +env->pc = vect_pc;
>  } else {
>  cpu_abort(cs, "Unhandled exception 0x%x\n", cs->exception_index);
>  }
> diff --git a/target/openrisc/sys_helper.c b/target/openrisc/sys_helper.c
> index 60c3193656..6ba816249b 100644
> --- a/target/openrisc/sys_helper.c
> +++ b/target/openrisc/sys_helper.c
> @@ -39,6 +39,10 @@ void HELPER(mtspr)(CPUOpenRISCState *env,
>  env->vr = rb;
>  

Re: [Qemu-devel] [PATCH] virtio: allow broken device to notify guest

2017-04-26 Thread Michael S. Tsirkin
On Wed, Apr 26, 2017 at 03:29:46PM +0200, Greg Kurz wrote:
> On Wed, 26 Apr 2017 15:15:48 +0200
> Cornelia Huck  wrote:
> 
> > On Wed, 26 Apr 2017 14:51:17 +0200
> > Greg Kurz  wrote:
> > 
> > > According to section 2.1.2 of the virtio-1 specification:
> > > 
> > > "The device SHOULD set DEVICE_NEEDS_RESET when it enters an error state 
> > > that
> > > a reset is needed. If DRIVER_OK is set, after it sets DEVICE_NEEDS_RESET,
> > > the device MUST send a device configuration change notification to the
> > > driver."
> > > 
> > > Commit "f5ed36635d8f virtio: stop virtqueue processing if device is 
> > > broken"
> > > introduced a virtio_error() call that just does that:
> > > 
> > > - internally mark the device as broken
> > > - set the DEVICE_NEEDS_RESET bit in the status
> > > - send a configuration change notification
> > > 
> > > Unfortunately, virtio_notify_vector(), called by virtio_notify_config(),
> > > returns right away when the device is marked as broken and the 
> > > notification
> > > isn't sent in this case.
> > > 
> > > The spec doesn't say whether a broken device can send notifications
> > > in other situations or not. But since the driver isn't supposed to do
> > > anything but to reset the device, it makes sense to keep the check in
> > > virtio_notify_config().
> > > 
> > > Marking the device as broken AFTER the configuration change notification 
> > > was
> > > sent is enough to fix the issue.
> > > 
> > > Signed-off-by: Greg Kurz 
> > > ---
> > >  hw/virtio/virtio.c |4 ++--
> > >  1 file changed, 2 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> > > index 03592c542a55..890b4d7eb751 100644
> > > --- a/hw/virtio/virtio.c
> > > +++ b/hw/virtio/virtio.c
> > > @@ -2451,12 +2451,12 @@ void GCC_FMT_ATTR(2, 3) virtio_error(VirtIODevice 
> > > *vdev, const char *fmt, ...)
> > >  error_vreport(fmt, ap);
> > >  va_end(ap);
> > > 
> > > -vdev->broken = true;
> > > -
> > >  if (virtio_vdev_has_feature(vdev, VIRTIO_F_VERSION_1)) {
> > >  virtio_set_status(vdev, vdev->status | 
> > > VIRTIO_CONFIG_S_NEEDS_RESET);
> > >  virtio_notify_config(vdev);
> > >  }
> > > +
> > > +vdev->broken = true;
> > >  }
> > > 
> > >  static void virtio_memory_listener_commit(MemoryListener *listener)
> > >   
> > 
> > Good catch.
> > 
> > Reviewed-by: Cornelia Huck 
> > 
> > Should this be cc:stable, as it's a spec violation?
> > 
> 
> I don't know if this qualifies for stable, but if it does then it affects
> all versions >= 2.8.0.


It's a SHOULD so not a violation, just a quality of implementation
issue. Seems a bit too intrusive for stable and we are yet to
have drivers actually handling these errors, so let's wait a bit
and see.

I'll apply this to master for now.


-- 
MST



Re: [Qemu-devel] [PATCH v3 01/10] tcg-runtime: add lookup_tb_ptr helper

2017-04-26 Thread Emilio G. Cota
On Wed, Apr 26, 2017 at 19:11:32 -0400, Emilio G. Cota wrote:
> On Wed, Apr 26, 2017 at 18:45:31 -0400, Emilio G. Cota wrote:
> > On Thu, Apr 27, 2017 at 00:29:49 +0200, Richard Henderson wrote:
> > > On 04/26/2017 11:56 PM, Emilio G. Cota wrote:
> > > >On Wed, Apr 26, 2017 at 10:40:45 +0200, Richard Henderson wrote:
> > > >>On 04/26/2017 08:23 AM, Emilio G. Cota wrote:
> > > >(snip)
> > > > >>>+cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> > > > >>>+tb = 
> > > > >>>atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
> > > >>>+if (likely(tb && tb->pc == addr && tb->cs_base == cs_base &&
> > > >>>+   tb->flags == flags)) {
> > > >>
> > > >>This comparison is wrong.  It will incorrectly reject a TB for i386 
> > > >>guest
> > > >>when CS_BASE != 0.  You really want
> > > >>
> > > > >>   tb = 
> > > > >> atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
> > > > >>   if (tb) {
> > > > >> cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> > > >> if (tb->pc == pc && tb->cs_base == cs_base && tb->flags == flags) {
> > > >>   return tb->tc_ptr;
> > > >> }
> > > >>   }
> > > >>   return tcg_ctx.code_gen_epilogue;
> > > >
> > > >wrt the comparison, the only change I notice in your suggested change is
> > > >   tb->pc == pc
> > > >
> > > >instead of
> > > >   tb->pc == addr
> > > >
> > > >, which seems innocuous to me (since tb->pc == addr).
> > > >
> > > >I fail to see how this relates to your "CS_BASE != 0" comment.
> > > >What am I missing?
> > > 
> > > Recall how you computed vaddr for target/i386:
> > > 
> > >   addr = pc + cs_base
> > 
> > I see, thanks!
> 
> Hmm TB's are added to tb_jmp_cache by pc, not by pc + cs_base:
> 
>   atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);
> 
> Shouldn't we then pass just the pc (without adding cs_base) to
> lookup_ptr, then? i.e.
> 
> --- a/target/i386/translate.c
> +++ b/target/i386/translate.c
> @@ -2533,11 +2533,7 @@ gen_eob_worker(DisasContext *s, bool inhibit, bool 
> recheck_tf, TCGv jr)
>  } else if (s->tf) {
>  gen_helper_single_step(cpu_env);
>  } else if (!TCGV_IS_UNUSED(jr)) {
> -TCGv vaddr = tcg_temp_new();
> -
> -tcg_gen_add_tl(vaddr, jr, cpu_seg_base[R_CS]);
> -tcg_gen_lookup_and_goto_ptr(vaddr);
> -tcg_temp_free(vaddr);
> +tcg_gen_lookup_and_goto_ptr(jr);
>  } else {
>  tcg_gen_exit_tb(0);
>  }
> 
> And while at it, rename the "addr" argument in lookup_ptr to "pc". Hmm?

Answering to myself again..

target/i386/cpu.c:
static inline void cpu_get_tb_cpu_state(CPUX86State *env, target_ulong *pc,
target_ulong *cs_base, uint32_t *flags)
{
*cs_base = env->segs[R_CS].base;
*pc = *cs_base + env->eip;
*flags = env->hflags |
(env->eflags & (IOPL_MASK | TF_MASK | RF_MASK | VM_MASK | AC_MASK));
}

cpu-exec.c:
/* We add the TB in the virtual pc hash table for the fast lookup */
atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);

So in lookup_and_goto_ptr, checking tb->pc == pc or tb->pc == addr,
where addr was passed from 'jr + cpu_seg_base[R_CS]', are both correct.
FWIW, I just checked with an assertion in full-system mode.

E.



Re: [Qemu-devel] [virtio-dev] Re: [PATCH v9 2/5] virtio-balloon: VIRTIO_BALLOON_F_BALLOON_CHUNKS

2017-04-26 Thread Michael S. Tsirkin
On Wed, Apr 26, 2017 at 11:03:34AM +, Wang, Wei W wrote:
> Hi Michael, could you please give some feedback?

I'm sorry, I'm not sure what you are requesting feedback on.

The interface looks reasonable now, even though there's
a way to make it even simpler if we can limit chunk size
to 2G (in fact 4G - 1). Do you think we can live with this
limitation?

But the code still needs some cleanup.

-- 
MST



Re: [Qemu-devel] [PATCH v3 01/10] tcg-runtime: add lookup_tb_ptr helper

2017-04-26 Thread Emilio G. Cota
On Wed, Apr 26, 2017 at 10:40:45 +0200, Richard Henderson wrote:
> On 04/26/2017 08:23 AM, Emilio G. Cota wrote:
> >This paves the way for upcoming work.
> >
> >Reviewed-by: Richard Henderson 
> >Signed-off-by: Emilio G. Cota 
> >---
> >  tcg-runtime.c | 21 +
> >  tcg/tcg-runtime.h |  2 ++
> >  tcg/tcg.h |  1 +
> >  3 files changed, 24 insertions(+)
> >
> >diff --git a/tcg-runtime.c b/tcg-runtime.c
> >index 4c60c96..90d2d4b 100644
> >--- a/tcg-runtime.c
> >+++ b/tcg-runtime.c
> >@@ -27,6 +27,7 @@
> >  #include "exec/helper-proto.h"
> >  #include "exec/cpu_ldst.h"
> >  #include "exec/exec-all.h"
> >+#include "exec/tb-hash.h"
> >  /* 32-bit helpers */
> >@@ -141,6 +142,26 @@ uint64_t HELPER(ctpop_i64)(uint64_t arg)
> >  return ctpop64(arg);
> >  }
> >+cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> >+tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
> >+if (likely(tb && tb->pc == addr && tb->cs_base == cs_base &&
> >+   tb->flags == flags)) {
> 
> This comparison is wrong.  It will incorrectly reject a TB for i386 guest
> when CS_BASE != 0.  You really want
> 
>   tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
>   if (tb) {
> cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> if (tb->pc == pc && tb->cs_base == cs_base && tb->flags == flags) {
>   return tb->tc_ptr;
> }
>   }
>   return tcg_ctx.code_gen_epilogue;
> 
> where you don't even load the cpu state if there isn't a preliminary hit in
> the cache.

Yes, I like this.

> (Note to self: That minor optimization would also apply to tb_find.)

FWIW I looked at tb_find -- you need the pc though, which comes from
loading the CPU state:

cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
                           ^^
tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)]);
                                                               ^^

If we wanted to really avoid getting all the state I guess we'd have to add
another function that returned just the pc.

E.



Re: [Qemu-devel] [PATCH v3 01/10] tcg-runtime: add lookup_tb_ptr helper

2017-04-26 Thread Emilio G. Cota
On Wed, Apr 26, 2017 at 18:45:31 -0400, Emilio G. Cota wrote:
> On Thu, Apr 27, 2017 at 00:29:49 +0200, Richard Henderson wrote:
> > On 04/26/2017 11:56 PM, Emilio G. Cota wrote:
> > >On Wed, Apr 26, 2017 at 10:40:45 +0200, Richard Henderson wrote:
> > >>On 04/26/2017 08:23 AM, Emilio G. Cota wrote:
> > >(snip)
> > > >>>+cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> > > >>>+tb = 
> > > >>>atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
> > >>>+if (likely(tb && tb->pc == addr && tb->cs_base == cs_base &&
> > >>>+   tb->flags == flags)) {
> > >>
> > >>This comparison is wrong.  It will incorrectly reject a TB for i386 guest
> > >>when CS_BASE != 0.  You really want
> > >>
> > > >>   tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
> > > >>   if (tb) {
> > > >> cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> > >> if (tb->pc == pc && tb->cs_base == cs_base && tb->flags == flags) {
> > >>   return tb->tc_ptr;
> > >> }
> > >>   }
> > >>   return tcg_ctx.code_gen_epilogue;
> > >
> > >wrt the comparison, the only change I notice in your suggested change is
> > >   tb->pc == pc
> > >
> > >instead of
> > >   tb->pc == addr
> > >
> > >, which seems innocuous to me (since tb->pc == addr).
> > >
> > >I fail to see how this relates to your "CS_BASE != 0" comment.
> > >What am I missing?
> > 
> > Recall how you computed vaddr for target/i386:
> > 
> >   addr = pc + cs_base
> 
> I see, thanks!

Hmm TB's are added to tb_jmp_cache by pc, not by pc + cs_base:

  atomic_set(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(pc)], tb);

Shouldn't we then pass just the pc (without adding cs_base) to
lookup_ptr, then? i.e.

--- a/target/i386/translate.c
+++ b/target/i386/translate.c
@@ -2533,11 +2533,7 @@ gen_eob_worker(DisasContext *s, bool inhibit, bool 
recheck_tf, TCGv jr)
 } else if (s->tf) {
 gen_helper_single_step(cpu_env);
 } else if (!TCGV_IS_UNUSED(jr)) {
-TCGv vaddr = tcg_temp_new();
-
-tcg_gen_add_tl(vaddr, jr, cpu_seg_base[R_CS]);
-tcg_gen_lookup_and_goto_ptr(vaddr);
-tcg_temp_free(vaddr);
+tcg_gen_lookup_and_goto_ptr(jr);
 } else {
 tcg_gen_exit_tb(0);
 }

And while at it, rename the "addr" argument in lookup_ptr to "pc". Hmm?

E.



Re: [Qemu-devel] [PATCH v3 01/10] tcg-runtime: add lookup_tb_ptr helper

2017-04-26 Thread Emilio G. Cota
On Thu, Apr 27, 2017 at 00:29:49 +0200, Richard Henderson wrote:
> On 04/26/2017 11:56 PM, Emilio G. Cota wrote:
> >On Wed, Apr 26, 2017 at 10:40:45 +0200, Richard Henderson wrote:
> >>On 04/26/2017 08:23 AM, Emilio G. Cota wrote:
> >(snip)
> >>>+cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> >>>+tb = 
> >>>atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
> >>>+if (likely(tb && tb->pc == addr && tb->cs_base == cs_base &&
> >>>+   tb->flags == flags)) {
> >>
> >>This comparison is wrong.  It will incorrectly reject a TB for i386 guest
> >>when CS_BASE != 0.  You really want
> >>
> >>   tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
> >>   if (tb) {
> >> cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> >> if (tb->pc == pc && tb->cs_base == cs_base && tb->flags == flags) {
> >>   return tb->tc_ptr;
> >> }
> >>   }
> >>   return tcg_ctx.code_gen_epilogue;
> >
> >wrt the comparison, the only change I notice in your suggested change is
> >   tb->pc == pc
> >
> >instead of
> >   tb->pc == addr
> >
> >, which seems innocuous to me (since tb->pc == addr).
> >
> >I fail to see how this relates to your "CS_BASE != 0" comment.
> >What am I missing?
> 
> Recall how you computed vaddr for target/i386:
> 
>   addr = pc + cs_base

I see, thanks!

Emilio



Re: [Qemu-devel] [RFC PATCH 1/8] iommu: Introduce bind_pasid_table API function

2017-04-26 Thread jacob pan
On Wed, 26 Apr 2017 17:56:45 +0100
Jean-Philippe Brucker  wrote:

> Hi Yi, Jacob,
> 
> On 26/04/17 11:11, Liu, Yi L wrote:
> > From: Jacob Pan 
> > 
> > Virtual IOMMU was proposed to support Shared Virtual Memory (SVM)
> > use case in the guest:
> > https://lists.gnu.org/archive/html/qemu-devel/2016-11/msg05311.html
> > 
> > As part of the proposed architecture, when a SVM capable PCI
> > device is assigned to a guest, nested mode is turned on. Guest owns
> > the first level page tables (request with PASID) and performs
> > GVA->GPA translation. Second level page tables are owned by the
> > host for GPA->HPA translation for both request with and without
> > PASID.
> > 
> > A new IOMMU driver interface is therefore needed to perform tasks as
> > follows:
> > * Enable nested translation and appropriate translation type
> > * Assign guest PASID table pointer (in GPA) and size to host IOMMU
> > 
> > This patch introduces new functions called
> > iommu_(un)bind_pasid_table() to IOMMU APIs. Architecture specific
> > IOMMU function can be added later to perform the specific steps for
> > binding pasid table of assigned devices.
> > 
> > This patch also adds model definition in iommu.h. It would be used
> > to check if the bind request is from a compatible entity. e.g. a
> > bind request from an intel_iommu emulator may not be supported by
> > an ARM SMMU driver.
> > 
> > Signed-off-by: Jacob Pan 
> > Signed-off-by: Liu, Yi L 
> > ---
> >  drivers/iommu/iommu.c | 19 +++
> >  include/linux/iommu.h | 31 +++
> >  2 files changed, 50 insertions(+)
> > 
> > diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
> > index dbe7f65..f2da636 100644
> > --- a/drivers/iommu/iommu.c
> > +++ b/drivers/iommu/iommu.c
> > @@ -1134,6 +1134,25 @@ int iommu_attach_device(struct iommu_domain
> > *domain, struct device *dev) }
> >  EXPORT_SYMBOL_GPL(iommu_attach_device);
> >  
> > +int iommu_bind_pasid_table(struct iommu_domain *domain, struct
> > device *dev,
> > +   struct pasid_table_info *pasidt_binfo)
> 
> I guess that domain can always be deduced from dev using
> iommu_get_domain_for_dev, and doesn't need to be passed as argument?
> 
true. device should have attached a domain before calling this API.
> For the next version of my SVM series, I was thinking of passing group
> instead of device to iommu_bind. Since all devices in a group are
> expected to share the same mappings (whether they want it or not),
> users will have to do iommu_group_for_each_dev anyway (as you do in
> patch 6/8). So it might be simpler to let the IOMMU core take the
> group lock and do group->domain->ops->bind_task(dev...) for each
> device. The question also holds for iommu_do_invalidate in patch 3/8.
> 
> This way the prototypes would be:
> int iommu_bind...(struct iommu_group *group, struct ... *info)
> int iommu_unbind...(struct iommu_group *group, struct ...*info)
> int iommu_invalidate...(struct iommu_group *group, struct ...*info)
> 
Just to understand the implication of this change for the granularity of
fault notification (e.g. page requests): PRI will be enabled for all
devices in the group. The IOMMU driver receives page requests per device,
with the same PASID bound to the group. There can be two scenarios:
1. If iommu_bind() binds to a task, the IOMMU driver handles page faults
internally per device, so there is no need to do it at group level, true?
2. If iommu_bind_pasid_table() is called for the device, then we propagate
the PRQ to VFIO per device.


> For PASID table binding it might not matter much, as VFIO will most
> likely be the only user. But task binding will be called by device
> drivers, which by now should be encouraged to do things at
> iommu_group granularity. Alternatively it could be done implicitly
> like in iommu_attach_device, with "iommu_bind_device_x" calling
> "iommu_bind_group_x".
> 
> 
> Extending this reasoning, since groups in a domain are also supposed
> to have the same mappings, then similarly to map/unmap,
> bind/unbind/invalidate should really be done with an iommu_domain (and
> nothing else) as target argument. However this requires the IOMMU
> core to keep a group list in each domain, which might complicate
> things a little too much.
> 
> But "all devices in a domain share the same PASID table" is the
> paradigm I'm currently using in the guts of arm-smmu-v3. And I wonder
> if, as with iommu_group, it should be made more explicit to users, so
> they don't assume that devices within a domain are isolated from each
> others with regard to PASID DMA.
> 
> > +{
> > +   if (unlikely(!domain->ops->bind_pasid_table))
> > +   return -EINVAL;
> > +
> > +   return domain->ops->bind_pasid_table(domain, dev,
> > pasidt_binfo); +}
> > +EXPORT_SYMBOL_GPL(iommu_bind_pasid_table);
> > +
> > +int iommu_unbind_pasid_table(struct iommu_domain *domain, struct
> > device *dev) +{

Re: [Qemu-devel] [PATCH v3 01/10] tcg-runtime: add lookup_tb_ptr helper

2017-04-26 Thread Richard Henderson

On 04/26/2017 11:56 PM, Emilio G. Cota wrote:

On Wed, Apr 26, 2017 at 10:40:45 +0200, Richard Henderson wrote:

On 04/26/2017 08:23 AM, Emilio G. Cota wrote:

(snip)

+cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
+tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
+if (likely(tb && tb->pc == addr && tb->cs_base == cs_base &&
+   tb->flags == flags)) {


This comparison is wrong.  It will incorrectly reject a TB for i386 guest
when CS_BASE != 0.  You really want

   tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
   if (tb) {
 cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
 if (tb->pc == pc && tb->cs_base == cs_base && tb->flags == flags) {
   return tb->tc_ptr;
 }
   }
   return tcg_ctx.code_gen_epilogue;


wrt the comparison, the only change I notice in your suggested change is
   tb->pc == pc

instead of
   tb->pc == addr

, which seems innocuous to me (since tb->pc == addr).

I fail to see how this relates to your "CS_BASE != 0" comment.
What am I missing?


Recall how you computed vaddr for target/i386:

  addr = pc + cs_base


r~



[Qemu-devel] [PATCH] iotests: Fix typo in 026

2017-04-26 Thread Eric Blake
s/refcout/refcount/

CC: qemu-triv...@nongnu.org
Signed-off-by: Eric Blake 
---
 tests/qemu-iotests/026 | 2 +-
 tests/qemu-iotests/026.out | 2 +-
 tests/qemu-iotests/026.out.nocache | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/026 b/tests/qemu-iotests/026
index f5a7f02..7fadfba 100755
--- a/tests/qemu-iotests/026
+++ b/tests/qemu-iotests/026
@@ -119,7 +119,7 @@ done


 echo
-echo === Refcout table growth tests ===
+echo === Refcount table growth tests ===
 echo
 CLUSTER_SIZE=512

diff --git a/tests/qemu-iotests/026.out b/tests/qemu-iotests/026.out
index 59b8f74..86a50a2 100644
--- a/tests/qemu-iotests/026.out
+++ b/tests/qemu-iotests/026.out
@@ -462,7 +462,7 @@ Event: cluster_alloc; errno: 28; imm: off; once: off; write 
-b
 write failed: No space left on device
 No errors were found on the image.

-=== Refcout table growth tests ===
+=== Refcount table growth tests ===

 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824

diff --git a/tests/qemu-iotests/026.out.nocache 
b/tests/qemu-iotests/026.out.nocache
index b4aeebc..ea2e166 100644
--- a/tests/qemu-iotests/026.out.nocache
+++ b/tests/qemu-iotests/026.out.nocache
@@ -470,7 +470,7 @@ Event: cluster_alloc; errno: 28; imm: off; once: off; write 
-b
 write failed: No space left on device
 No errors were found on the image.

-=== Refcout table growth tests ===
+=== Refcount table growth tests ===

 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 

-- 
2.9.3




Re: [Qemu-devel] [PATCH 1/4] hw/ppc: setting spapr_drc_detach_cb in spapr_dr_connector_new

2017-04-26 Thread Michael Roth
Quoting Daniel Henrique Barboza (2017-04-26 16:31:21)
> The idea of moving the detach callback functions to the constructor
> of the dr_connector is to set them statically at init time, avoiding
> any post-load hooks to restore it (after a migration, for example).
> 
> Summary of changes:
> 
> - hw/ppc/spapr_drc.c and include/hw/ppc/spapr_drc.h:
> *  spapr_dr_connector_new() now has an additional parameter,
> spapr_drc_detach_cb *detach_cb
> *  'spapr_drc_detach_cb *detach_cb' parameter was removed from
> the detach function pointer in sPAPRDRConnectorClass
> 
> - hw/ppc/spapr_pci.c:
> * the callback 'spapr_phb_remove_pci_device_cb' is now passed
> as a parameter in 'spapr_dr_connector_new' instead of 'drck->detach()'
> 
> - hw/ppc/spapr.c:
> * 'spapr_create_lmb_dr_connectors' now passes the callback
> 'spapr_lmb_release' to 'spapr_dr_connector_new' instead of 'drck->detach()'
> * 'spapr_init_cpus' now passes the callback 'spapr_core_release'
> to 'spapr_dr_connector_new' instead of 'drck->detach()'
> * moved the callback functions up in the code so they can be referenced
> by 'spapr_create_lmb_dr_connectors' and 'spapr_init_cpus'
> 
> Signed-off-by: Daniel Henrique Barboza 

Patch looks good, but we need a follow-up afterward that removes the
need for the void *opaque argument as I mentioned in my last email.
Otherwise we'd still need to restore the opaque argument in post-load
which isn't really viable.

> ---
>  hw/ppc/spapr.c | 71 
> +++---
>  hw/ppc/spapr_drc.c | 17 ++-
>  hw/ppc/spapr_pci.c |  5 ++--
>  include/hw/ppc/spapr_drc.h |  4 +--
>  4 files changed, 49 insertions(+), 48 deletions(-)
> 
> diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
> index 80d12d0..bc11757 100644
> --- a/hw/ppc/spapr.c
> +++ b/hw/ppc/spapr.c
> @@ -1887,6 +1887,29 @@ static void spapr_drc_reset(void *opaque)
>  }
>  }
> 
> +typedef struct sPAPRDIMMState {
> +uint32_t nr_lmbs;
> +} sPAPRDIMMState;
> +
> +static void spapr_lmb_release(DeviceState *dev, void *opaque)
> +{
> +sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
> +HotplugHandler *hotplug_ctrl;
> +
> +if (--ds->nr_lmbs) {
> +return;
> +}
> +
> +g_free(ds);
> +
> +/*
> + * Now that all the LMBs have been removed by the guest, call the
> + * pc-dimm unplug handler to cleanup up the pc-dimm device.
> + */
> +hotplug_ctrl = qdev_get_hotplug_handler(dev);
> +hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
> +}
> +
>  static void spapr_create_lmb_dr_connectors(sPAPRMachineState *spapr)
>  {
>  MachineState *machine = MACHINE(spapr);
> @@ -1900,7 +1923,7 @@ static void 
> spapr_create_lmb_dr_connectors(sPAPRMachineState *spapr)
> 
>  addr = i * lmb_size + spapr->hotplug_memory.base;
>  drc = spapr_dr_connector_new(OBJECT(spapr), 
> SPAPR_DR_CONNECTOR_TYPE_LMB,
> - addr/lmb_size);
> + (addr / lmb_size), spapr_lmb_release);
>  qemu_register_reset(spapr_drc_reset, drc);
>  }
>  }
> @@ -1956,6 +1979,14 @@ static CPUArchId *spapr_find_cpu_slot(MachineState 
> *ms, uint32_t id, int *idx)
>  return &ms->possible_cpus->cpus[index];
>  }
> 
> +static void spapr_core_release(DeviceState *dev, void *opaque)
> +{
> +HotplugHandler *hotplug_ctrl;
> +
> +hotplug_ctrl = qdev_get_hotplug_handler(dev);
> +hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
> +}
> +
>  static void spapr_init_cpus(sPAPRMachineState *spapr)
>  {
>  MachineState *machine = MACHINE(spapr);
> @@ -1998,7 +2029,8 @@ static void spapr_init_cpus(sPAPRMachineState *spapr)
>  sPAPRDRConnector *drc =
>  spapr_dr_connector_new(OBJECT(spapr),
> SPAPR_DR_CONNECTOR_TYPE_CPU,
> -   (core_id / smp_threads) * smt);
> +   (core_id / smp_threads) * smt,
> +   spapr_core_release);
> 
>  qemu_register_reset(spapr_drc_reset, drc);
>  }
> @@ -2596,29 +2628,6 @@ out:
>  error_propagate(errp, local_err);
>  }
> 
> -typedef struct sPAPRDIMMState {
> -uint32_t nr_lmbs;
> -} sPAPRDIMMState;
> -
> -static void spapr_lmb_release(DeviceState *dev, void *opaque)
> -{
> -sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
> -HotplugHandler *hotplug_ctrl;
> -
> -if (--ds->nr_lmbs) {
> -return;
> -}
> -
> -g_free(ds);
> -
> -/*
> - * Now that all the LMBs have been removed by the guest, call the
> - * pc-dimm unplug handler to cleanup up the pc-dimm device.
> - */
> -hotplug_ctrl = qdev_get_hotplug_handler(dev);
> -hotplug_handler_unplug(hotplug_ctrl, dev, &error_abort);
> -}
> -
>  static void spapr_del_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t 
> size,
> Error 

Re: [Qemu-devel] [PATCH v3 01/10] tcg-runtime: add lookup_tb_ptr helper

2017-04-26 Thread Emilio G. Cota
On Wed, Apr 26, 2017 at 10:40:45 +0200, Richard Henderson wrote:
> On 04/26/2017 08:23 AM, Emilio G. Cota wrote:
(snip)
> >+cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> >+tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
> >+if (likely(tb && tb->pc == addr && tb->cs_base == cs_base &&
> >+   tb->flags == flags)) {
> 
> This comparison is wrong.  It will incorrectly reject a TB for i386 guest
> when CS_BASE != 0.  You really want
> 
>   tb = atomic_rcu_read(&cpu->tb_jmp_cache[tb_jmp_cache_hash_func(addr)]);
>   if (tb) {
> cpu_get_tb_cpu_state(env, &pc, &cs_base, &flags);
> if (tb->pc == pc && tb->cs_base == cs_base && tb->flags == flags) {
>   return tb->tc_ptr;
> }
>   }
>   return tcg_ctx.code_gen_epilogue;

wrt the comparison, the only change I notice in your suggested change is
  tb->pc == pc

instead of
  tb->pc == addr

, which seems innocuous to me (since tb->pc == addr).

I fail to see how this relates to your "CS_BASE != 0" comment.
What am I missing?

E.




Re: [Qemu-devel] [PATCH 02/31] block: Make bdrv_round_to_clusters() signature more useful

2017-04-26 Thread John Snow


On 04/26/2017 05:47 PM, Eric Blake wrote:
> On 04/26/2017 04:41 PM, John Snow wrote:
>>
>>
>> On 04/17/2017 09:33 PM, Eric Blake wrote:
>>> In the process of converting sector-based interfaces to bytes,
>>> I'm finding it easier to represent a byte count as a 64-bit
>>> integer at the block layer (even if we are internally capped
>>> by SIZE_MAX or even INT_MAX for individual transactions, it's
>>> still nicer to not have to worry about truncation/overflow
>>> issues on as many variables).  Update the signature of
>>> bdrv_round_to_clusters() to uniformly use uint64_t, matching
>> 
>>
>> While we're here, since you went with int64_t in the end, what steered
>> you away from uint64_t, or was that just a thinko?
> 
> Later patches were made easier with signed (the compiler complained when
> I mixed signed and unsigned pointers).
> 
>>
>> (AFAICT: off_t is usually something like int64_t, so your choice makes
>> sense to me, generally.)
> 
> Indeed, and that's something I should update my commit message to mention.
> 
>>
>> --js
>>
>>> the signature already chosen for bdrv_is_allocated, and
>>> adjust clients according to the required fallout.
> 
> If you want me to try and use uint64_t *pnum instead of int64_t *pnum
> throughout both my series 1  (the changes to bdrv_is_allocated) and this
> one, it will take more effort.  I'll do it if there's a reason, but I'd
> rather not if the signed version is good enough.
> 

No, I didn't mean to imply you should, I was just pointing out the
commit message typo. int64_t is likely the correct choice for a number
of reasons, chief among them being able to return -1 from functions
that return a byte offset.

--js



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 02/31] block: Make bdrv_round_to_clusters() signature more useful

2017-04-26 Thread Eric Blake
On 04/26/2017 04:41 PM, John Snow wrote:
> 
> 
> On 04/17/2017 09:33 PM, Eric Blake wrote:
>> In the process of converting sector-based interfaces to bytes,
>> I'm finding it easier to represent a byte count as a 64-bit
>> integer at the block layer (even if we are internally capped
>> by SIZE_MAX or even INT_MAX for individual transactions, it's
>> still nicer to not have to worry about truncation/overflow
>> issues on as many variables).  Update the signature of
>> bdrv_round_to_clusters() to uniformly use uint64_t, matching
> 
> 
> While we're here, since you went with int64_t in the end, what steered
> you away from uint64_t, or was that just a thinko?

Later patches were made easier with signed (the compiler complained when
I mixed signed and unsigned pointers).

> 
> (AFAICT: off_t is usually something like int64_t, so your choice makes
> sense to me, generally.)

Indeed, and that's something I should update my commit message to mention.

> 
> --js
> 
>> the signature already chosen for bdrv_is_allocated, and
>> adjust clients according to the required fallout.

If you want me to try and use uint64_t *pnum instead of int64_t *pnum
throughout both my series 1  (the changes to bdrv_is_allocated) and this
one, it will take more effort.  I'll do it if there's a reason, but I'd
rather not if the signed version is good enough.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 01/31] block: Drop unused bdrv_round_sectors_to_clusters()

2017-04-26 Thread John Snow


On 04/17/2017 09:33 PM, Eric Blake wrote:
> Now that the last user [mirror_iteration()] has converted to using
> bytes, we no longer need a function to round sectors to clusters.
> 
> Signed-off-by: Eric Blake 

Reviewed-by: John Snow 

> ---
>  include/block/block.h |  4 
>  block/io.c| 21 -
>  2 files changed, 25 deletions(-)
> 
> diff --git a/include/block/block.h b/include/block/block.h
> index 740cb86..86ad511 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -467,10 +467,6 @@ const char *bdrv_get_device_or_node_name(const 
> BlockDriverState *bs);
>  int bdrv_get_flags(BlockDriverState *bs);
>  int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
>  ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs);
> -void bdrv_round_sectors_to_clusters(BlockDriverState *bs,
> -int64_t sector_num, int nb_sectors,
> -int64_t *cluster_sector_num,
> -int *cluster_nb_sectors);
>  void bdrv_round_to_clusters(BlockDriverState *bs,
>  int64_t offset, unsigned int bytes,
>  int64_t *cluster_offset,
> diff --git a/block/io.c b/block/io.c
> index d22d35f..d61a906 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -419,27 +419,6 @@ static void mark_request_serialising(BdrvTrackedRequest 
> *req, uint64_t align)
>  }
> 
>  /**
> - * Round a region to cluster boundaries (sector-based)
> - */
> -void bdrv_round_sectors_to_clusters(BlockDriverState *bs,
> -int64_t sector_num, int nb_sectors,
> -int64_t *cluster_sector_num,
> -int *cluster_nb_sectors)
> -{
> -BlockDriverInfo bdi;
> -
> -if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
> -*cluster_sector_num = sector_num;
> -*cluster_nb_sectors = nb_sectors;
> -} else {
> -int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE;
> -*cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c);
> -*cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num 
> +
> -nb_sectors, c);
> -}
> -}
> -
> -/**
>   * Round a region to cluster boundaries
>   */
>  void bdrv_round_to_clusters(BlockDriverState *bs,
> 

-- 
—js



Re: [Qemu-devel] [PATCH 02/31] block: Make bdrv_round_to_clusters() signature more useful

2017-04-26 Thread John Snow


On 04/17/2017 09:33 PM, Eric Blake wrote:
> In the process of converting sector-based interfaces to bytes,
> I'm finding it easier to represent a byte count as a 64-bit
> integer at the block layer (even if we are internally capped
> by SIZE_MAX or even INT_MAX for individual transactions, it's
> still nicer to not have to worry about truncation/overflow
> issues on as many variables).  Update the signature of
> bdrv_round_to_clusters() to uniformly use uint64_t, matching


While we're here, since you went with int64_t in the end, what steered
you away from uint64_t, or was that just a thinko?

(AFAICT: off_t is usually something like int64_t, so your choice makes
sense to me, generally.)

--js

> the signature already chosen for bdrv_is_allocated, and
> adjust clients according to the required fallout.
> 
> Signed-off-by: Eric Blake 
> ---
>  include/block/block.h | 4 ++--
>  block/io.c| 7 ---
>  block/mirror.c| 9 -
>  block/trace-events| 2 +-
>  4 files changed, 11 insertions(+), 11 deletions(-)
> 
> diff --git a/include/block/block.h b/include/block/block.h
> index 86ad511..eed1330 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -468,9 +468,9 @@ int bdrv_get_flags(BlockDriverState *bs);
>  int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo *bdi);
>  ImageInfoSpecific *bdrv_get_specific_info(BlockDriverState *bs);
>  void bdrv_round_to_clusters(BlockDriverState *bs,
> -int64_t offset, unsigned int bytes,
> +int64_t offset, int64_t bytes,
>  int64_t *cluster_offset,
> -unsigned int *cluster_bytes);
> +int64_t *cluster_bytes);
> 
>  const char *bdrv_get_encrypted_filename(BlockDriverState *bs);
>  void bdrv_get_backing_filename(BlockDriverState *bs,
> diff --git a/block/io.c b/block/io.c
> index d61a906..07165dc 100644
> --- a/block/io.c
> +++ b/block/io.c
> @@ -422,9 +422,9 @@ static void mark_request_serialising(BdrvTrackedRequest 
> *req, uint64_t align)
>   * Round a region to cluster boundaries
>   */
>  void bdrv_round_to_clusters(BlockDriverState *bs,
> -int64_t offset, unsigned int bytes,
> +int64_t offset, int64_t bytes,
>  int64_t *cluster_offset,
> -unsigned int *cluster_bytes)
> +int64_t *cluster_bytes)
>  {
>  BlockDriverInfo bdi;
> 
> @@ -920,7 +920,7 @@ static int coroutine_fn 
> bdrv_co_do_copy_on_readv(BdrvChild *child,
>  struct iovec iov;
>  QEMUIOVector bounce_qiov;
>  int64_t cluster_offset;
> -unsigned int cluster_bytes;
> +int64_t cluster_bytes;
>  size_t skip_bytes;
>  int ret;
> 
> @@ -941,6 +941,7 @@ static int coroutine_fn 
> bdrv_co_do_copy_on_readv(BdrvChild *child,
>  trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
> cluster_offset, cluster_bytes);
> 
> +assert(cluster_bytes < SIZE_MAX);
>  iov.iov_len = cluster_bytes;
>  iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len);
>  if (bounce_buffer == NULL) {
> diff --git a/block/mirror.c b/block/mirror.c
> index 846e392..2510793 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -177,7 +177,7 @@ static void mirror_read_complete(void *opaque, int ret)
>  /* Clip bytes relative to offset to not exceed end-of-file */
>  static inline void mirror_clip_bytes(MirrorBlockJob *s,
>   int64_t offset,
> - unsigned int *bytes)
> + int64_t *bytes)
>  {
>  *bytes = MIN(*bytes, s->bdev_length - offset);
>  }
> @@ -190,10 +190,9 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t 
> *offset,
>  bool need_cow;
>  int ret = 0;
>  int64_t align_offset = *offset;
> -unsigned int align_bytes = *bytes;
> +int64_t align_bytes = *bytes;
>  int max_bytes = s->granularity * s->max_iov;
> 
> -assert(*bytes < INT_MAX);
>  need_cow = !test_bit(*offset / s->granularity, s->cow_bitmap);
>  need_cow |= !test_bit((*offset + *bytes - 1) / s->granularity,
>s->cow_bitmap);
> @@ -384,7 +383,7 @@ static uint64_t coroutine_fn 
> mirror_iteration(MirrorBlockJob *s)
>  while (nb_chunks > 0 && offset < s->bdev_length) {
>  int64_t ret;
>  int io_sectors;
> -unsigned int io_bytes;
> +int64_t io_bytes;
>  int64_t io_bytes_acct;
>  BlockDriverState *file;
>  enum MirrorMethod {
> @@ -410,7 +409,7 @@ static uint64_t coroutine_fn 
> mirror_iteration(MirrorBlockJob *s)
>  io_bytes = s->granularity;
>  } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) {
>  int64_t target_offset;
> -  

[Qemu-devel] [PATCH 4/4] migration: spapr: migrate pending_events of spapr state

2017-04-26 Thread Daniel Henrique Barboza
From: Jianjun Duan 

When a hotplug event races with a migration operation, an RTAS hotplug
event might not yet have been delivered to the source guest when
migration starts. In this case the pending_events of the spapr state
need to be transmitted to the target so that the hotplug event can be
finished on the target.

All the different fields of the events are encoded as defined by
PAPR. We can migrate them as a uint8_t binary stream without any
concerns about data padding or endianness.

pending_events is put in a subsection in the spapr state VMSD to make
sure migration across different versions is not broken.

Signed-off-by: Jianjun Duan 
Signed-off-by: Daniel Henrique Barboza 
---
 hw/ppc/spapr.c | 33 +
 hw/ppc/spapr_events.c  | 24 +---
 include/hw/ppc/spapr.h |  3 ++-
 3 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 2a788dc..bc4ca91 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1468,6 +1468,38 @@ static const VMStateDescription vmstate_spapr_ccs_list = 
{
 },
 };
 
+static bool spapr_pending_events_needed(void *opaque)
+{
+sPAPRMachineState *spapr = (sPAPRMachineState *)opaque;
+return !QTAILQ_EMPTY(&spapr->pending_events);
+}
+
+static const VMStateDescription vmstate_spapr_event_entry = {
+.name = "spapr_event_log_entry",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_INT32(log_type, sPAPREventLogEntry),
+VMSTATE_BOOL(exception, sPAPREventLogEntry),
+VMSTATE_UINT32(data_size, sPAPREventLogEntry),
+VMSTATE_VARRAY_UINT32_ALLOC(data, sPAPREventLogEntry, data_size,
+0, vmstate_info_uint8, uint8_t),
+VMSTATE_END_OF_LIST()
+},
+};
+
+static const VMStateDescription vmstate_spapr_pending_events = {
+.name = "spapr_pending_events",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = spapr_pending_events_needed,
+.fields = (VMStateField[]) {
+VMSTATE_QTAILQ_V(pending_events, sPAPRMachineState, 1,
+ vmstate_spapr_event_entry, sPAPREventLogEntry, next),
+VMSTATE_END_OF_LIST()
+},
+};
+
 static bool spapr_ov5_cas_needed(void *opaque)
 {
 sPAPRMachineState *spapr = opaque;
@@ -1567,6 +1599,7 @@ static const VMStateDescription vmstate_spapr = {
 &vmstate_spapr_ov5_cas,
 &vmstate_spapr_patb_entry,
 &vmstate_spapr_ccs_list,
+&vmstate_spapr_pending_events,
 NULL
 }
 };
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index f0b28d8..70c7cfc 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -342,7 +342,8 @@ static int rtas_event_log_to_irq(sPAPRMachineState *spapr, 
int log_type)
 return source->irq;
 }
 
-static void rtas_event_log_queue(int log_type, void *data, bool exception)
+static void rtas_event_log_queue(int log_type, void *data, bool exception,
+ int data_size)
 {
 sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
 sPAPREventLogEntry *entry = g_new(sPAPREventLogEntry, 1);
@@ -351,6 +352,7 @@ static void rtas_event_log_queue(int log_type, void *data, 
bool exception)
 entry->log_type = log_type;
 entry->exception = exception;
 entry->data = data;
+entry->data_size = data_size;
 QTAILQ_INSERT_TAIL(&spapr->pending_events, entry, next);
 }
 
@@ -445,6 +447,7 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
 struct rtas_event_log_v6_mainb *mainb;
 struct rtas_event_log_v6_epow *epow;
 struct epow_log_full *new_epow;
+uint32_t data_size;
 
 new_epow = g_malloc0(sizeof(*new_epow));
 hdr = &new_epow->hdr;
@@ -453,14 +456,13 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
 mainb = &new_epow->mainb;
 epow = &new_epow->epow;
 
+data_size = sizeof(*new_epow);
 hdr->summary = cpu_to_be32(RTAS_LOG_VERSION_6
| RTAS_LOG_SEVERITY_EVENT
| RTAS_LOG_DISPOSITION_NOT_RECOVERED
| RTAS_LOG_OPTIONAL_PART_PRESENT
| RTAS_LOG_TYPE_EPOW);
-hdr->extended_length = cpu_to_be32(sizeof(*new_epow)
-   - sizeof(new_epow->hdr));
-
+hdr->extended_length = cpu_to_be32(data_size - sizeof(new_epow->hdr));
 spapr_init_v6hdr(v6hdr);
 spapr_init_maina(maina, 3 /* Main-A, Main-B and EPOW */);
 
@@ -479,7 +481,7 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
 epow->event_modifier = RTAS_LOG_V6_EPOW_MODIFIER_NORMAL;
 epow->extended_modifier = RTAS_LOG_V6_EPOW_XMODIFIER_PARTITION_SPECIFIC;
 
-rtas_event_log_queue(RTAS_LOG_TYPE_EPOW, new_epow, true);
+rtas_event_log_queue(RTAS_LOG_TYPE_EPOW, new_epow, true, data_size);
 
 qemu_irq_pulse(xics_get_qirq(XICS_FABRIC(spapr),
   

[Qemu-devel] [PATCH 2/4] hw/ppc: migrating the DRC state of hotplugged devices

2017-04-26 Thread Daniel Henrique Barboza
In pseries, a firmware abstraction called Dynamic Reconfiguration
Connector (DRC) is used to assign a particular dynamic resource
to the guest and provide an interface to manage configuration/removal
of the resource associated with it. In other words, DRC is the
'plugged state' of a device.

Before this patch, DRC wasn't being migrated. This causes
post-migration problems due to DRC state mismatch between source and
target. The DRC state of a device X in the source might
change, while in the target the DRC state of X is still fresh. When
migrating the guest, X will not have the same hotplugged state as it
did in the source. This means that we can't hot unplug X in the
target after migration is completed because its DRC state is not consistent.
https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/1677552 is one
bug that is caused by this DRC state mismatch between source and
target.

To migrate the DRC state, we defined the VMStateDescription struct for
spapr_drc to enable the transmission of spapr_drc state in migration.
Not all the elements in the DRC state are migrated - only those
that can be modified by guest actions or device add/remove
operations:

- 'isolation_state', 'allocation_state' and 'configured' are involved
in the DR state transition diagram from PAPR+ 2.7, 13.4;

- 'configured' and 'signalled' are needed in attaching and detaching
devices;

- 'indicator_state' provides users with hardware state information.

These are the DRC elements that are migrated.

In this patch the DRC state is migrated for PCI, LMB and CPU
connector types. At this moment there is no support to migrate
DRC for the PHB (PCI Host Bridge) type.

In the 'realize' function the DRC is registered using vmstate_register,
similar to what hw/ppc/spapr_iommu.c does in 'spapr_tce_table_realize'.
This approach works because  DRCs are bus-less and do not sit
on a BusClass that implements bc->get_dev_path, so as a fallback the
VMSD gets identified via "spapr_drc"/get_index(drc).

Signed-off-by: Daniel Henrique Barboza 
---
 hw/ppc/spapr_drc.c | 61 ++
 1 file changed, 61 insertions(+)

diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index afe5d82..13d18f5 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -512,6 +512,64 @@ static void reset(DeviceState *d)
 }
 }
 
+static bool spapr_drc_needed(void *opaque)
+{
+sPAPRDRConnector *drc = (sPAPRDRConnector *)opaque;
+sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+bool rc = false;
+sPAPRDREntitySense value;
+drck->entity_sense(drc, );
+/* If no dev is plugged in there is no need to migrate the DRC state */
+if (value != SPAPR_DR_ENTITY_SENSE_PRESENT) {
+return false;
+}
+
+/*
+ * If there is dev plugged in, we need to migrate the DRC state when
+ * it is different from cold-plugged state
+ */
+switch (drc->type) {
+
+case SPAPR_DR_CONNECTOR_TYPE_PCI:
+rc = !((drc->isolation_state == SPAPR_DR_ISOLATION_STATE_UNISOLATED) &&
+   (drc->allocation_state == SPAPR_DR_ALLOCATION_STATE_USABLE) &&
+   drc->configured && drc->signalled && !drc->awaiting_release);
+break;
+
+case SPAPR_DR_CONNECTOR_TYPE_LMB:
+rc = !((drc->isolation_state == SPAPR_DR_ISOLATION_STATE_ISOLATED) &&
+   (drc->allocation_state == SPAPR_DR_ALLOCATION_STATE_UNUSABLE) &&
+   drc->configured && drc->signalled && !drc->awaiting_release);
+break;
+
+case SPAPR_DR_CONNECTOR_TYPE_CPU:
+rc = !((drc->isolation_state == SPAPR_DR_ISOLATION_STATE_ISOLATED) &&
+   (drc->allocation_state == SPAPR_DR_ALLOCATION_STATE_UNUSABLE) &&
+drc->configured && drc->signalled && !drc->awaiting_release);
+break;
+
+default:
+;
+}
+return rc;
+}
+
+static const VMStateDescription vmstate_spapr_drc = {
+.name = "spapr_drc",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = spapr_drc_needed,
+.fields  = (VMStateField []) {
+VMSTATE_UINT32(isolation_state, sPAPRDRConnector),
+VMSTATE_UINT32(allocation_state, sPAPRDRConnector),
+VMSTATE_UINT32(indicator_state, sPAPRDRConnector),
+VMSTATE_BOOL(configured, sPAPRDRConnector),
+VMSTATE_BOOL(awaiting_release, sPAPRDRConnector),
+VMSTATE_BOOL(signalled, sPAPRDRConnector),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static void realize(DeviceState *d, Error **errp)
 {
 sPAPRDRConnector *drc = SPAPR_DR_CONNECTOR(d);
@@ -540,6 +598,8 @@ static void realize(DeviceState *d, Error **errp)
 object_unref(OBJECT(drc));
 }
 g_free(child_name);
+vmstate_register(DEVICE(drc), drck->get_index(drc), _spapr_drc,
+ drc);
 trace_spapr_drc_realize_complete(drck->get_index(drc));
 }
 
@@ -658,6 +718,7 @@ static void spapr_dr_connector_class_init(ObjectClass *k, 
void *data)
 

[Qemu-devel] [PATCH 3/4] migration: spapr: migrate ccs_list in spapr state

2017-04-26 Thread Daniel Henrique Barboza
From: Jianjun Duan 

ccs_list in spapr state maintains the device tree related
information on the RTAS side for hotplugged devices. In racing
situations between hotplug events and a migration operation, an RTAS
hotplug event could be migrated from the source guest to the target
guest, or the source guest might not yet have finished fetching
the device tree when migration is started. In either case the target
will try to finish fetching the device tree. By migrating ccs_list, the
target can fetch the device tree properly.

In theory there would be other alternatives besides migrating the
ccs_list to fix this. For example, we could add a flag that indicates
whether a device is in the middle of the configure_connector during the
migration process, in the post_load we can detect if this flag
is active and then return an error informing the guest to restart the
hotplug process. However, the DRC state can still be modified outside of
hotplug. Using:

   drmgr -c pci -s  -r
   drmgr -c pci -s  -a

it is possible to return a device to firmware and then later take it
back and reconfigure it. This is not a common case but it's not prohibited,
and performing a migration between these 2 operations would fail because
the default coldplug state on target assumes a configured state in
the source*. Migrating ccs_list is one solution that covers this
case as well.

ccs_list is put in a subsection in the spapr state VMSD to make
sure migration across different versions is not broken.

* see http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg01763.html
for more information on this discussion.

Signed-off-by: Jianjun Duan 
Signed-off-by: Daniel Henrique Barboza 
---
 hw/ppc/spapr.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index bc11757..2a788dc 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1437,6 +1437,37 @@ static bool version_before_3(void *opaque, int 
version_id)
 return version_id < 3;
 }
 
+static bool spapr_ccs_list_needed(void *opaque)
+{
+sPAPRMachineState *spapr = (sPAPRMachineState *)opaque;
+return !QTAILQ_EMPTY(>ccs_list);
+}
+
+static const VMStateDescription vmstate_spapr_ccs = {
+.name = "spapr_configure_connector_state",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_UINT32(drc_index, sPAPRConfigureConnectorState),
+VMSTATE_INT32(fdt_offset, sPAPRConfigureConnectorState),
+VMSTATE_INT32(fdt_depth, sPAPRConfigureConnectorState),
+VMSTATE_END_OF_LIST()
+},
+};
+
+static const VMStateDescription vmstate_spapr_ccs_list = {
+.name = "spapr_ccs_list",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = spapr_ccs_list_needed,
+.fields = (VMStateField[]) {
+VMSTATE_QTAILQ_V(ccs_list, sPAPRMachineState, 1,
+ vmstate_spapr_ccs, sPAPRConfigureConnectorState,
+ next),
+VMSTATE_END_OF_LIST()
+},
+};
+
 static bool spapr_ov5_cas_needed(void *opaque)
 {
 sPAPRMachineState *spapr = opaque;
@@ -1535,6 +1566,7 @@ static const VMStateDescription vmstate_spapr = {
 .subsections = (const VMStateDescription*[]) {
 _spapr_ov5_cas,
 _spapr_patb_entry,
+_spapr_ccs_list,
 NULL
 }
 };
-- 
2.9.3




[Qemu-devel] [PATCH 0/4 v7] migration/ppc: migrating DRC, ccs_list and pending_events

2017-04-26 Thread Daniel Henrique Barboza
v7:
- removed the first patch. DRC registration is now done by vmstate_register
in patch 2.
- added a new patch that changes spapr_dr_connector_new to receive as argument
the detach_cb.
- removed the callback logic of patch 2 since there is no need to restore the
detach_cb on post-load due to the detach_cb on spapr_dr_connector_new change.
- added word separators in the VMSD names of patch 3 and 4.

v6: - Rebased with QEMU master after 6+ months.
- Simplified the logic in patch 1.
- Reworked patch 2: added CPU DRC migration, removed a function pointer 
from DRC
class and minor improvements.
- Added clarifications from the previous v5 discussions in the commit 
messages.

v5: - Rebased to David's ppc-for-2.8.

v4: - Introduce a way to set customized instance_id in SaveStateEntry. Use it
  to set instance_id for DRC using its unique index to address David 
  Gibson's concern.
- Rename VMS_CSTM to VMS_LINKED based on Paolo Bonzini's suggestions.
- Clean up qjson stuff in put_qtailq. 
- Add trace for put_qtailq and get_qtailq based on David Gilbert's 
  suggestion.

- Based on David's ppc-for-2.7. 

v3: - Simplify overall design following discussion with Paolo. No longer need
  metadata to migrate QTAILQ.
- Extend VMStateInfo instead of adding similar fields to VMStateField.
- Clean up macros in qemu/queue.h.
(link: https://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg05695.html)

v2: - Introduce a general approach to migrate QTAILQ in qemu/queue.h.
- Migrate signalled field in the DRC state.
- Put the newly added migrating fields in subsections so that backward 
  migration is not broken.  
- Set detach_cb field right after migration so that a migrated hot-unplug
  event could finish its course.
(link: https://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg04188.html)

v1: - Initial version.
(link: https://lists.nongnu.org/archive/html/qemu-devel/2016-04/msg02601.html)


To make guest devices (PCI, CPU and memory) hotplug work together 
with guest migration, spapr drc state needs to be transmitted in
migration. This patch defines the VMStateDescription struct for
spapr drc state to enable it.

To fix the potential racing between hotplug events on guest and 
guest migration, ccs_list and pending_events of spapr state need to be
transmitted in migration. This patch set also takes care of it.




Daniel Henrique Barboza (2):
  hw/ppc: setting spapr_drc_detach_cb in spapr_dr_connector_new
  hw/ppc: migrating the DRC state of hotplugged devices

Jianjun Duan (2):
  migration: spapr: migrate ccs_list in spapr state
  migration: spapr: migrate pending_events of spapr state

 hw/ppc/spapr.c | 136 +
 hw/ppc/spapr_drc.c |  78 +++---
 hw/ppc/spapr_events.c  |  24 
 hw/ppc/spapr_pci.c |   5 +-
 include/hw/ppc/spapr.h |   3 +-
 include/hw/ppc/spapr_drc.h |   4 +-
 6 files changed, 190 insertions(+), 60 deletions(-)

-- 
2.9.3




[Qemu-devel] [PATCH 1/4] hw/ppc: setting spapr_drc_detach_cb in spapr_dr_connector_new

2017-04-26 Thread Daniel Henrique Barboza
The idea of moving the detach callback functions to the constructor
of the dr_connector is to set them statically at init time, avoiding
any post-load hooks to restore it (after a migration, for example).

Summary of changes:

- hw/ppc/spapr_drc.c and include/hw/ppc/spapr_drc.h:
*  spapr_dr_connector_new() now has an additional parameter,
spapr_drc_detach_cb *detach_cb
*  'spapr_drc_detach_cb *detach_cb' parameter was removed from
the detach function pointer in sPAPRDRConnectorClass

- hw/ppc/spapr_pci.c:
* the callback 'spapr_phb_remove_pci_device_cb' is now passed
as a parameter in 'spapr_dr_connector_new' instead of 'drck->detach()'

- hw/ppc/spapr.c:
* 'spapr_create_lmb_dr_connectors' now passes the callback
'spapr_lmb_release' to 'spapr_dr_connector_new' instead of 'drck->detach()'
* 'spapr_init_cpus' now passes the callback 'spapr_core_release'
to 'spapr_dr_connector_new' instead of 'drck->detach()'
* moved the callback functions up in the code so they can be referenced
by 'spapr_create_lmb_dr_connectors' and 'spapr_init_cpus'

Signed-off-by: Daniel Henrique Barboza 
---
 hw/ppc/spapr.c | 71 +++---
 hw/ppc/spapr_drc.c | 17 ++-
 hw/ppc/spapr_pci.c |  5 ++--
 include/hw/ppc/spapr_drc.h |  4 +--
 4 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 80d12d0..bc11757 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1887,6 +1887,29 @@ static void spapr_drc_reset(void *opaque)
 }
 }
 
+typedef struct sPAPRDIMMState {
+uint32_t nr_lmbs;
+} sPAPRDIMMState;
+
+static void spapr_lmb_release(DeviceState *dev, void *opaque)
+{
+sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
+HotplugHandler *hotplug_ctrl;
+
+if (--ds->nr_lmbs) {
+return;
+}
+
+g_free(ds);
+
+/*
+ * Now that all the LMBs have been removed by the guest, call the
+ * pc-dimm unplug handler to cleanup up the pc-dimm device.
+ */
+hotplug_ctrl = qdev_get_hotplug_handler(dev);
+hotplug_handler_unplug(hotplug_ctrl, dev, _abort);
+}
+
 static void spapr_create_lmb_dr_connectors(sPAPRMachineState *spapr)
 {
 MachineState *machine = MACHINE(spapr);
@@ -1900,7 +1923,7 @@ static void 
spapr_create_lmb_dr_connectors(sPAPRMachineState *spapr)
 
 addr = i * lmb_size + spapr->hotplug_memory.base;
 drc = spapr_dr_connector_new(OBJECT(spapr), 
SPAPR_DR_CONNECTOR_TYPE_LMB,
- addr/lmb_size);
+ (addr / lmb_size), spapr_lmb_release);
 qemu_register_reset(spapr_drc_reset, drc);
 }
 }
@@ -1956,6 +1979,14 @@ static CPUArchId *spapr_find_cpu_slot(MachineState *ms, 
uint32_t id, int *idx)
 return >possible_cpus->cpus[index];
 }
 
+static void spapr_core_release(DeviceState *dev, void *opaque)
+{
+HotplugHandler *hotplug_ctrl;
+
+hotplug_ctrl = qdev_get_hotplug_handler(dev);
+hotplug_handler_unplug(hotplug_ctrl, dev, _abort);
+}
+
 static void spapr_init_cpus(sPAPRMachineState *spapr)
 {
 MachineState *machine = MACHINE(spapr);
@@ -1998,7 +2029,8 @@ static void spapr_init_cpus(sPAPRMachineState *spapr)
 sPAPRDRConnector *drc =
 spapr_dr_connector_new(OBJECT(spapr),
SPAPR_DR_CONNECTOR_TYPE_CPU,
-   (core_id / smp_threads) * smt);
+   (core_id / smp_threads) * smt,
+   spapr_core_release);
 
 qemu_register_reset(spapr_drc_reset, drc);
 }
@@ -2596,29 +2628,6 @@ out:
 error_propagate(errp, local_err);
 }
 
-typedef struct sPAPRDIMMState {
-uint32_t nr_lmbs;
-} sPAPRDIMMState;
-
-static void spapr_lmb_release(DeviceState *dev, void *opaque)
-{
-sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
-HotplugHandler *hotplug_ctrl;
-
-if (--ds->nr_lmbs) {
-return;
-}
-
-g_free(ds);
-
-/*
- * Now that all the LMBs have been removed by the guest, call the
- * pc-dimm unplug handler to cleanup up the pc-dimm device.
- */
-hotplug_ctrl = qdev_get_hotplug_handler(dev);
-hotplug_handler_unplug(hotplug_ctrl, dev, _abort);
-}
-
 static void spapr_del_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t 
size,
Error **errp)
 {
@@ -2636,7 +2645,7 @@ static void spapr_del_lmbs(DeviceState *dev, uint64_t 
addr_start, uint64_t size,
 g_assert(drc);
 
 drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
-drck->detach(drc, dev, spapr_lmb_release, ds, errp);
+drck->detach(drc, dev, ds, errp);
 addr += SPAPR_MEMORY_BLOCK_SIZE;
 }
 
@@ -2712,14 +2721,6 @@ static void spapr_core_unplug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 object_unparent(OBJECT(dev));
 }
 
-static void spapr_core_release(DeviceState *dev, void *opaque)
-{
-  

Re: [Qemu-devel] [PATCH v1 3/8] dmg: Limit the output buffer size to a max of 2MB

2017-04-26 Thread John Snow


On 04/25/2017 03:59 PM, Ashijeet Acharya wrote:
> The size of the output buffer is limited to a maximum of 2MB so that
> QEMU doesn't end up allocating huge amounts of memory while
> decompressing compressed input streams.
> 
> 2MB is an appropriate size because "qemu-img convert" has the same I/O
> buffer size and the most important use case for DMG files is to be
> compatible with qemu-img convert.
> 
> Signed-off-by: Ashijeet Acharya 
> ---

Patch 1 adds a new structure and patch 2 starts using it, but in a
store-only manner and only with placeholder variables that are difficult
to authenticate, so there's still "insufficient data" to review either
patch meaningfully.

This patch seems unrelated to either of those, so the ordering is strange.

>  block/dmg.c | 12 ++--
>  1 file changed, 6 insertions(+), 6 deletions(-)
> 
> diff --git a/block/dmg.c b/block/dmg.c
> index c6fe8b0..7ae30e3 100644
> --- a/block/dmg.c
> +++ b/block/dmg.c
> @@ -37,8 +37,8 @@ enum {
>  /* Limit chunk sizes to prevent unreasonable amounts of memory being used
>   * or truncating when converting to 32-bit types
>   */
> -DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */
> -DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512,
> +DMG_MAX_OUTPUT = 2 * 1024 * 1024, /* 2 MB */

why "MAX OUTPUT" ? Aren't we using this for buffering on reads?

> +DMG_SECTOR_MAX = DMG_MAX_OUTPUT / 512,
>  };
>  
>  static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename)
> @@ -260,10 +260,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, 
> DmgHeaderState *ds,
>  
>  /* all-zeroes sector (type 2) does not need to be "uncompressed" and 
> can
>   * therefore be unbounded. */
> -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) {
> +if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) {
>  error_report("sector count %" PRIu64 " for chunk %" PRIu32
>   " is larger than max (%u)",
> - s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX);
> + s->sectorcounts[i], i, DMG_SECTOR_MAX);
>  ret = -EINVAL;
>  goto fail;
>  }
> @@ -275,10 +275,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, 
> DmgHeaderState *ds,
>  /* length in (compressed) data fork */
>  s->lengths[i] = buff_read_uint64(buffer, offset + 0x20);
>  
> -if (s->lengths[i] > DMG_LENGTHS_MAX) {
> +if (s->lengths[i] > DMG_MAX_OUTPUT) {
>  error_report("length %" PRIu64 " for chunk %" PRIu32
>   " is larger than max (%u)",
> - s->lengths[i], i, DMG_LENGTHS_MAX);
> + s->lengths[i], i, DMG_MAX_OUTPUT);
>  ret = -EINVAL;
>  goto fail;
>  }
> 

Seems OK otherwise, but I would normally expect you to fix the buffering
problems first, and then reduce the size of the buffer -- not the other
way around. This version introduces new limitations that didn't exist
previously (As of this commit, QEMU can't open DMG files with chunks
larger than 2MB now, right?)

--js



[Qemu-devel] [PATCH 3/4] migration: spapr: migrate ccs_list in spapr state

2017-04-26 Thread Daniel Henrique Barboza
From: Jianjun Duan 

ccs_list in spapr state maintains the device tree related
information on the RTAS side for hotplugged devices. In racing
situations between hotplug events and a migration operation, an RTAS
hotplug event could be migrated from the source guest to the target
guest, or the source guest might not yet have finished fetching
the device tree when migration is started. In either case the target
will try to finish fetching the device tree. By migrating ccs_list, the
target can fetch the device tree properly.

In theory there would be other alternatives besides migrating the
ccs_list to fix this. For example, we could add a flag that indicates
whether a device is in the middle of the configure_connector during the
migration process, in the post_load we can detect if this flag
is active and then return an error informing the guest to restart the
hotplug process. However, the DRC state can still be modified outside of
hotplug. Using:

   drmgr -c pci -s  -r
   drmgr -c pci -s  -a

it is possible to return a device to firmware and then later take it
back and reconfigure it. This is not a common case but it's not prohibited,
and performing a migration between these 2 operations would fail because
the default coldplug state on target assumes a configured state in
the source*. Migrating ccs_list is one solution that covers this
case as well.

ccs_list is put in a subsection in the spapr state VMSD to make
sure migration across different versions is not broken.

* see http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg01763.html
for more information on this discussion.

Signed-off-by: Jianjun Duan 
Signed-off-by: Daniel Henrique Barboza 
---
 hw/ppc/spapr.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index bc11757..2a788dc 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1437,6 +1437,37 @@ static bool version_before_3(void *opaque, int 
version_id)
 return version_id < 3;
 }
 
+static bool spapr_ccs_list_needed(void *opaque)
+{
+sPAPRMachineState *spapr = (sPAPRMachineState *)opaque;
+return !QTAILQ_EMPTY(>ccs_list);
+}
+
+static const VMStateDescription vmstate_spapr_ccs = {
+.name = "spapr_configure_connector_state",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_UINT32(drc_index, sPAPRConfigureConnectorState),
+VMSTATE_INT32(fdt_offset, sPAPRConfigureConnectorState),
+VMSTATE_INT32(fdt_depth, sPAPRConfigureConnectorState),
+VMSTATE_END_OF_LIST()
+},
+};
+
+static const VMStateDescription vmstate_spapr_ccs_list = {
+.name = "spapr_ccs_list",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = spapr_ccs_list_needed,
+.fields = (VMStateField[]) {
+VMSTATE_QTAILQ_V(ccs_list, sPAPRMachineState, 1,
+ vmstate_spapr_ccs, sPAPRConfigureConnectorState,
+ next),
+VMSTATE_END_OF_LIST()
+},
+};
+
 static bool spapr_ov5_cas_needed(void *opaque)
 {
 sPAPRMachineState *spapr = opaque;
@@ -1535,6 +1566,7 @@ static const VMStateDescription vmstate_spapr = {
 .subsections = (const VMStateDescription*[]) {
 _spapr_ov5_cas,
 _spapr_patb_entry,
+_spapr_ccs_list,
 NULL
 }
 };
-- 
2.9.3




[Qemu-devel] [PATCH 2/4] hw/ppc: migrating the DRC state of hotplugged devices

2017-04-26 Thread Daniel Henrique Barboza
In pseries, a firmware abstraction called Dynamic Reconfiguration
Connector (DRC) is used to assign a particular dynamic resource
to the guest and provide an interface to manage configuration/removal
of the resource associated with it. In other words, DRC is the
'plugged state' of a device.

Before this patch, DRC wasn't being migrated. This causes
post-migration problems due to DRC state mismatch between source and
target. The DRC state of a device X in the source might
change, while in the target the DRC state of X is still fresh. When
migrating the guest, X will not have the same hotplugged state as it
did in the source. This means that we can't hot unplug X in the
target after migration is completed because its DRC state is not consistent.
https://bugs.launchpad.net/ubuntu/+source/qemu/+bug/1677552 is one
bug that is caused by this DRC state mismatch between source and
target.

To migrate the DRC state, we defined the VMStateDescription struct for
spapr_drc to enable the transmission of spapr_drc state in migration.
Not all the elements in the DRC state are migrated - only those
that can be modified by guest actions or device add/remove
operations:

- 'isolation_state', 'allocation_state' and 'configured' are involved
in the DR state transition diagram from PAPR+ 2.7, 13.4;

- 'configured' and 'signalled' are needed in attaching and detaching
devices;

- 'indicator_state' provides users with hardware state information.

These are the DRC elements that are migrated.

In this patch the DRC state is migrated for PCI, LMB and CPU
connector types. At this moment there is no support to migrate
DRC for the PHB (PCI Host Bridge) type.

In the 'realize' function the DRC is registered using vmstate_register,
similar to what hw/ppc/spapr_iommu.c does in 'spapr_tce_table_realize'.
This approach works because DRCs are bus-less and do not sit
on a BusClass that implements bc->get_dev_path, so as a fallback the
VMSD gets identified via "spapr_drc"/get_index(drc).

Signed-off-by: Daniel Henrique Barboza 
---
 hw/ppc/spapr_drc.c | 61 ++
 1 file changed, 61 insertions(+)

diff --git a/hw/ppc/spapr_drc.c b/hw/ppc/spapr_drc.c
index afe5d82..13d18f5 100644
--- a/hw/ppc/spapr_drc.c
+++ b/hw/ppc/spapr_drc.c
@@ -512,6 +512,64 @@ static void reset(DeviceState *d)
 }
 }
 
+static bool spapr_drc_needed(void *opaque)
+{
+sPAPRDRConnector *drc = (sPAPRDRConnector *)opaque;
+sPAPRDRConnectorClass *drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
+bool rc = false;
+sPAPRDREntitySense value;
+drck->entity_sense(drc, );
+/* If no dev is plugged in there is no need to migrate the DRC state */
+if (value != SPAPR_DR_ENTITY_SENSE_PRESENT) {
+return false;
+}
+
+/*
+ * If there is dev plugged in, we need to migrate the DRC state when
+ * it is different from cold-plugged state
+ */
+switch (drc->type) {
+
+case SPAPR_DR_CONNECTOR_TYPE_PCI:
+rc = !((drc->isolation_state == SPAPR_DR_ISOLATION_STATE_UNISOLATED) &&
+   (drc->allocation_state == SPAPR_DR_ALLOCATION_STATE_USABLE) &&
+   drc->configured && drc->signalled && !drc->awaiting_release);
+break;
+
+case SPAPR_DR_CONNECTOR_TYPE_LMB:
+rc = !((drc->isolation_state == SPAPR_DR_ISOLATION_STATE_ISOLATED) &&
+   (drc->allocation_state == SPAPR_DR_ALLOCATION_STATE_UNUSABLE) &&
+   drc->configured && drc->signalled && !drc->awaiting_release);
+break;
+
+case SPAPR_DR_CONNECTOR_TYPE_CPU:
+rc = !((drc->isolation_state == SPAPR_DR_ISOLATION_STATE_ISOLATED) &&
+   (drc->allocation_state == SPAPR_DR_ALLOCATION_STATE_UNUSABLE) &&
+drc->configured && drc->signalled && !drc->awaiting_release);
+break;
+
+default:
+;
+}
+return rc;
+}
+
+static const VMStateDescription vmstate_spapr_drc = {
+.name = "spapr_drc",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = spapr_drc_needed,
+.fields  = (VMStateField []) {
+VMSTATE_UINT32(isolation_state, sPAPRDRConnector),
+VMSTATE_UINT32(allocation_state, sPAPRDRConnector),
+VMSTATE_UINT32(indicator_state, sPAPRDRConnector),
+VMSTATE_BOOL(configured, sPAPRDRConnector),
+VMSTATE_BOOL(awaiting_release, sPAPRDRConnector),
+VMSTATE_BOOL(signalled, sPAPRDRConnector),
+VMSTATE_END_OF_LIST()
+}
+};
+
 static void realize(DeviceState *d, Error **errp)
 {
 sPAPRDRConnector *drc = SPAPR_DR_CONNECTOR(d);
@@ -540,6 +598,8 @@ static void realize(DeviceState *d, Error **errp)
 object_unref(OBJECT(drc));
 }
 g_free(child_name);
+vmstate_register(DEVICE(drc), drck->get_index(drc), _spapr_drc,
+ drc);
 trace_spapr_drc_realize_complete(drck->get_index(drc));
 }
 
@@ -658,6 +718,7 @@ static void spapr_dr_connector_class_init(ObjectClass *k, 
void *data)
 

[Qemu-devel] [PATCH 1/4] hw/ppc: setting spapr_drc_detach_cb in spapr_dr_connector_new

2017-04-26 Thread Daniel Henrique Barboza
The idea of moving the detach callback functions to the constructor
of the dr_connector is to set them statically at init time, avoiding
any post-load hooks to restore it (after a migration, for example).

Summary of changes:

- hw/ppc/spapr_drc.c and include/hw/ppc/spapr_drc.h:
*  spapr_dr_connector_new() now has an additional parameter,
spapr_drc_detach_cb *detach_cb
*  'spapr_drc_detach_cb *detach_cb' parameter was removed from
the detach function pointer in sPAPRDRConnectorClass

- hw/ppc/spapr_pci.c:
* the callback 'spapr_phb_remove_pci_device_cb' is now passed
as a parameter in 'spapr_dr_connector_new' instead of 'drck->detach()'

- hw/ppc/spapr.c:
* 'spapr_create_lmb_dr_connectors' now passes the callback
'spapr_lmb_release' to 'spapr_dr_connector_new' instead of 'drck->detach()'
* 'spapr_init_cpus' now passes the callback 'spapr_core_release'
to 'spapr_dr_connector_new' instead of 'drck->detach()'
* moved the callback functions up in the code so they can be referenced
by 'spapr_create_lmb_dr_connectors' and 'spapr_init_cpus'

Signed-off-by: Daniel Henrique Barboza 
---
 hw/ppc/spapr.c | 71 +++---
 hw/ppc/spapr_drc.c | 17 ++-
 hw/ppc/spapr_pci.c |  5 ++--
 include/hw/ppc/spapr_drc.h |  4 +--
 4 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 80d12d0..bc11757 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1887,6 +1887,29 @@ static void spapr_drc_reset(void *opaque)
 }
 }
 
+typedef struct sPAPRDIMMState {
+uint32_t nr_lmbs;
+} sPAPRDIMMState;
+
+static void spapr_lmb_release(DeviceState *dev, void *opaque)
+{
+sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
+HotplugHandler *hotplug_ctrl;
+
+if (--ds->nr_lmbs) {
+return;
+}
+
+g_free(ds);
+
+/*
+ * Now that all the LMBs have been removed by the guest, call the
+ * pc-dimm unplug handler to cleanup up the pc-dimm device.
+ */
+hotplug_ctrl = qdev_get_hotplug_handler(dev);
+hotplug_handler_unplug(hotplug_ctrl, dev, _abort);
+}
+
 static void spapr_create_lmb_dr_connectors(sPAPRMachineState *spapr)
 {
 MachineState *machine = MACHINE(spapr);
@@ -1900,7 +1923,7 @@ static void 
spapr_create_lmb_dr_connectors(sPAPRMachineState *spapr)
 
 addr = i * lmb_size + spapr->hotplug_memory.base;
 drc = spapr_dr_connector_new(OBJECT(spapr), 
SPAPR_DR_CONNECTOR_TYPE_LMB,
- addr/lmb_size);
+ (addr / lmb_size), spapr_lmb_release);
 qemu_register_reset(spapr_drc_reset, drc);
 }
 }
@@ -1956,6 +1979,14 @@ static CPUArchId *spapr_find_cpu_slot(MachineState *ms, 
uint32_t id, int *idx)
 return >possible_cpus->cpus[index];
 }
 
+static void spapr_core_release(DeviceState *dev, void *opaque)
+{
+HotplugHandler *hotplug_ctrl;
+
+hotplug_ctrl = qdev_get_hotplug_handler(dev);
+hotplug_handler_unplug(hotplug_ctrl, dev, _abort);
+}
+
 static void spapr_init_cpus(sPAPRMachineState *spapr)
 {
 MachineState *machine = MACHINE(spapr);
@@ -1998,7 +2029,8 @@ static void spapr_init_cpus(sPAPRMachineState *spapr)
 sPAPRDRConnector *drc =
 spapr_dr_connector_new(OBJECT(spapr),
SPAPR_DR_CONNECTOR_TYPE_CPU,
-   (core_id / smp_threads) * smt);
+   (core_id / smp_threads) * smt,
+   spapr_core_release);
 
 qemu_register_reset(spapr_drc_reset, drc);
 }
@@ -2596,29 +2628,6 @@ out:
 error_propagate(errp, local_err);
 }
 
-typedef struct sPAPRDIMMState {
-uint32_t nr_lmbs;
-} sPAPRDIMMState;
-
-static void spapr_lmb_release(DeviceState *dev, void *opaque)
-{
-sPAPRDIMMState *ds = (sPAPRDIMMState *)opaque;
-HotplugHandler *hotplug_ctrl;
-
-if (--ds->nr_lmbs) {
-return;
-}
-
-g_free(ds);
-
-/*
- * Now that all the LMBs have been removed by the guest, call the
- * pc-dimm unplug handler to cleanup up the pc-dimm device.
- */
-hotplug_ctrl = qdev_get_hotplug_handler(dev);
-hotplug_handler_unplug(hotplug_ctrl, dev, _abort);
-}
-
 static void spapr_del_lmbs(DeviceState *dev, uint64_t addr_start, uint64_t 
size,
Error **errp)
 {
@@ -2636,7 +2645,7 @@ static void spapr_del_lmbs(DeviceState *dev, uint64_t 
addr_start, uint64_t size,
 g_assert(drc);
 
 drck = SPAPR_DR_CONNECTOR_GET_CLASS(drc);
-drck->detach(drc, dev, spapr_lmb_release, ds, errp);
+drck->detach(drc, dev, ds, errp);
 addr += SPAPR_MEMORY_BLOCK_SIZE;
 }
 
@@ -2712,14 +2721,6 @@ static void spapr_core_unplug(HotplugHandler 
*hotplug_dev, DeviceState *dev,
 object_unparent(OBJECT(dev));
 }
 
-static void spapr_core_release(DeviceState *dev, void *opaque)
-{
-  

[Qemu-devel] [PATCH 0/4 v7] migration/ppc: migrating DRC, ccs_list and pending_events

2017-04-26 Thread Daniel Henrique Barboza
v7:
- removed the first patch. DRC registration is now done by vmstate_register
in patch 2.
- added a new patch that changes spapr_dr_connector_new to receive as argument
the detach_cb.
- removed the callback logic of patch 2 since there is no need to restore the
detach_cb on post-load due to the detach_cb on spapr_dr_connector_new change.
- added word separators in the VMSD names of patch 3 and 4.

v6: - Rebased with QEMU master after 6+ months.
- Simplified the logic in patch 1.
- Reworked patch 2: added CPU DRC migration, removed a function pointer 
from DRC
class and minor improvements.
- Added clarifications from the previous v5 discussions in the commit 
messages.

v5: - Rebased to David's ppc-for-2.8.

v4: - Introduce a way to set customized instance_id in SaveStateEntry. Use it
  to set instance_id for DRC using its unique index to address David 
  Gibson's concern.
- Rename VMS_CSTM to VMS_LINKED based on Paolo Bonzini's suggestions.
- Clean up qjson stuff in put_qtailq. 
- Add trace for put_qtailq and get_qtailq based on David Gilbert's 
  suggestion.

- Based on David's ppc-for-2.7. 

v3: - Simplify overall design following discussion with Paolo. No longer need
  metadata to migrate QTAILQ.
- Extend VMStateInfo instead of adding similar fields to VMStateField.
- Clean up macros in qemu/queue.h.
(link: https://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg05695.html)

v2: - Introduce a general approach to migrate QTAILQ in qemu/queue.h.
- Migrate signalled field in the DRC state.
- Put the newly added migrating fields in subsections so that backward 
  migration is not broken.  
- Set detach_cb field right after migration so that a migrated hot-unplug
  event could finish its course.
(link: https://lists.nongnu.org/archive/html/qemu-devel/2016-05/msg04188.html)

v1: - Initial version.
(link: https://lists.nongnu.org/archive/html/qemu-devel/2016-04/msg02601.html)


To make guest devices (PCI, CPU and memory) hotplug work together 
with guest migration, spapr drc state needs to be transmitted in
migration. This patch defines the VMStateDescription struct for
spapr drc state to enable it.

To fix the potential racing between hotplug events on guest and 
guest migration, ccs_list and pending_events of spapr state need to be
transmitted in migration. This patch set also takes care of it.




Daniel Henrique Barboza (2):
  hw/ppc: setting spapr_drc_detach_cb in spapr_dr_connector_new
  hw/ppc: migrating the DRC state of hotplugged devices

Jianjun Duan (2):
  migration: spapr: migrate ccs_list in spapr state
  migration: spapr: migrate pending_events of spapr state

 hw/ppc/spapr.c | 136 +
 hw/ppc/spapr_drc.c |  78 +++---
 hw/ppc/spapr_events.c  |  24 
 hw/ppc/spapr_pci.c |   5 +-
 include/hw/ppc/spapr.h |   3 +-
 include/hw/ppc/spapr_drc.h |   4 +-
 6 files changed, 190 insertions(+), 60 deletions(-)

-- 
2.9.3




[Qemu-devel] [PATCH 4/4] migration: spapr: migrate pending_events of spapr state

2017-04-26 Thread Daniel Henrique Barboza
From: Jianjun Duan 

In racing situations between hotplug events and a migration operation,
an RTAS hotplug event might not yet have been delivered to the source
guest when migration is started. In this case the pending_events of
spapr state need to be transmitted to the target so that the hotplug
event can be finished on the target.

All the different fields of the events are encoded as defined by
PAPR. We can migrate them as uint8_t binary stream without any
concerns about data padding or endianness.

pending_events is put in a subsection in the spapr state VMSD to make
sure migration across different versions is not broken.

Signed-off-by: Jianjun Duan 
Signed-off-by: Daniel Henrique Barboza 
---
 hw/ppc/spapr.c | 33 +
 hw/ppc/spapr_events.c  | 24 +---
 include/hw/ppc/spapr.h |  3 ++-
 3 files changed, 48 insertions(+), 12 deletions(-)

diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 2a788dc..bc4ca91 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1468,6 +1468,38 @@ static const VMStateDescription vmstate_spapr_ccs_list = 
{
 },
 };
 
+static bool spapr_pending_events_needed(void *opaque)
+{
+sPAPRMachineState *spapr = (sPAPRMachineState *)opaque;
+return !QTAILQ_EMPTY(&spapr->pending_events);
+}
+
+static const VMStateDescription vmstate_spapr_event_entry = {
+.name = "spapr_event_log_entry",
+.version_id = 1,
+.minimum_version_id = 1,
+.fields = (VMStateField[]) {
+VMSTATE_INT32(log_type, sPAPREventLogEntry),
+VMSTATE_BOOL(exception, sPAPREventLogEntry),
+VMSTATE_UINT32(data_size, sPAPREventLogEntry),
+VMSTATE_VARRAY_UINT32_ALLOC(data, sPAPREventLogEntry, data_size,
+0, vmstate_info_uint8, uint8_t),
+VMSTATE_END_OF_LIST()
+},
+};
+
+static const VMStateDescription vmstate_spapr_pending_events = {
+.name = "spapr_pending_events",
+.version_id = 1,
+.minimum_version_id = 1,
+.needed = spapr_pending_events_needed,
+.fields = (VMStateField[]) {
+VMSTATE_QTAILQ_V(pending_events, sPAPRMachineState, 1,
+ vmstate_spapr_event_entry, sPAPREventLogEntry, next),
+VMSTATE_END_OF_LIST()
+},
+};
+
 static bool spapr_ov5_cas_needed(void *opaque)
 {
 sPAPRMachineState *spapr = opaque;
@@ -1567,6 +1599,7 @@ static const VMStateDescription vmstate_spapr = {
 &vmstate_spapr_ov5_cas,
 &vmstate_spapr_patb_entry,
 &vmstate_spapr_ccs_list,
+&vmstate_spapr_pending_events,
 NULL
 }
 };
diff --git a/hw/ppc/spapr_events.c b/hw/ppc/spapr_events.c
index f0b28d8..70c7cfc 100644
--- a/hw/ppc/spapr_events.c
+++ b/hw/ppc/spapr_events.c
@@ -342,7 +342,8 @@ static int rtas_event_log_to_irq(sPAPRMachineState *spapr, 
int log_type)
 return source->irq;
 }
 
-static void rtas_event_log_queue(int log_type, void *data, bool exception)
+static void rtas_event_log_queue(int log_type, void *data, bool exception,
+ int data_size)
 {
 sPAPRMachineState *spapr = SPAPR_MACHINE(qdev_get_machine());
 sPAPREventLogEntry *entry = g_new(sPAPREventLogEntry, 1);
@@ -351,6 +352,7 @@ static void rtas_event_log_queue(int log_type, void *data, 
bool exception)
 entry->log_type = log_type;
 entry->exception = exception;
 entry->data = data;
+entry->data_size = data_size;
 QTAILQ_INSERT_TAIL(&spapr->pending_events, entry, next);
 }
 
@@ -445,6 +447,7 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
 struct rtas_event_log_v6_mainb *mainb;
 struct rtas_event_log_v6_epow *epow;
 struct epow_log_full *new_epow;
+uint32_t data_size;
 
 new_epow = g_malloc0(sizeof(*new_epow));
 hdr = &new_epow->hdr;
@@ -453,14 +456,13 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
 mainb = &new_epow->mainb;
 epow = &new_epow->epow;
 
+data_size = sizeof(*new_epow);
 hdr->summary = cpu_to_be32(RTAS_LOG_VERSION_6
| RTAS_LOG_SEVERITY_EVENT
| RTAS_LOG_DISPOSITION_NOT_RECOVERED
| RTAS_LOG_OPTIONAL_PART_PRESENT
| RTAS_LOG_TYPE_EPOW);
-hdr->extended_length = cpu_to_be32(sizeof(*new_epow)
-   - sizeof(new_epow->hdr));
-
+hdr->extended_length = cpu_to_be32(data_size - sizeof(new_epow->hdr));
 spapr_init_v6hdr(v6hdr);
 spapr_init_maina(maina, 3 /* Main-A, Main-B and EPOW */);
 
@@ -479,7 +481,7 @@ static void spapr_powerdown_req(Notifier *n, void *opaque)
 epow->event_modifier = RTAS_LOG_V6_EPOW_MODIFIER_NORMAL;
 epow->extended_modifier = RTAS_LOG_V6_EPOW_XMODIFIER_PARTITION_SPECIFIC;
 
-rtas_event_log_queue(RTAS_LOG_TYPE_EPOW, new_epow, true);
+rtas_event_log_queue(RTAS_LOG_TYPE_EPOW, new_epow, true, data_size);
 
 qemu_irq_pulse(xics_get_qirq(XICS_FABRIC(spapr),
   

  1   2   3   4   5   6   >