[Qemu-block] [PATCH v2] raw_bsd: add offset and size options

2016-10-17 Thread Tomáš Golembiovský
Added two new options 'offset' and 'size'. This makes it possible to use
only part of the file as a device. This can be used e.g. to limit the
access only to single partition in a disk image or use a disk inside a
tar archive (like OVA).

When 'size' is specified we do our best to honour it.

Signed-off-by: Tomáš Golembiovský 
---
 block/raw_bsd.c  | 169 ++-
 qapi/block-core.json |  16 -
 2 files changed, 181 insertions(+), 4 deletions(-)

diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index 588d408..3fb3f13 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -31,6 +31,30 @@
 #include "qapi/error.h"
 #include "qemu/option.h"
 
+typedef struct BDRVRawState {
+uint64_t offset;
+uint64_t size;
+bool has_size;
+} BDRVRawState;
+
+static QemuOptsList raw_runtime_opts = {
+.name = "raw",
+.head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head),
+.desc = {
+{
+.name = "offset",
+.type = QEMU_OPT_SIZE,
+.help = "offset in the disk where the image starts",
+},
+{
+.name = "size",
+.type = QEMU_OPT_SIZE,
+.help = "virtual disk size",
+},
+{ /* end of list */ }
+},
+};
+
 static QemuOptsList raw_create_opts = {
 .name = "raw-create-opts",
 .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head),
@@ -44,17 +68,107 @@ static QemuOptsList raw_create_opts = {
 }
 };
 
+static int raw_read_options(QDict *options, BlockDriverState *bs,
+BDRVRawState *s, Error **errp)
+{
+Error *local_err = NULL;
+QemuOpts *opts = NULL;
+int64_t real_size = 0;
+int ret;
+
+real_size = bdrv_getlength(bs->file->bs);
+if (real_size < 0) {
+error_setg_errno(errp, -real_size, "Could not get image size");
+return real_size;
+}
+
+opts = qemu_opts_create(_runtime_opts, NULL, 0, _abort);
+qemu_opts_absorb_qdict(opts, options, _err);
+if (local_err) {
+error_propagate(errp, local_err);
+ret = -EINVAL;
+goto fail;
+}
+
+s->offset = qemu_opt_get_size(opts, "offset", 0);
+if (qemu_opt_find(opts, "size") != NULL) {
+s->size = qemu_opt_get_size(opts, "size", 0);
+s->has_size = true;
+} else {
+s->has_size = false;
+s->size = real_size;
+}
+
+/* Check size and offset */
+if (real_size < s->offset || (real_size - s->offset) < s->size) {
+error_setg(errp, "The sum of offset (%"PRIu64") and size "
+"(%"PRIu64") has to be smaller or equal to the "
+" actual size of the containing file (%"PRId64").",
+s->offset, s->size, real_size);
+ret = -EINVAL;
+goto fail;
+}
+
+/* Make sure size is multiple of BDRV_SECTOR_SIZE to prevent rounding
+ * up and leaking out of the specified area. */
+if (s->size != QEMU_ALIGN_DOWN(s->size, BDRV_SECTOR_SIZE)) {
+s->size = QEMU_ALIGN_DOWN(s->size, BDRV_SECTOR_SIZE);
+fprintf(stderr,
+"WARNING: Specified size is not multiple of %llu! "
+"Rounding down to %"PRIu64 ". (End of the image will be "
+"ignored.)\n",
+BDRV_SECTOR_SIZE, s->size);
+}
+
+ret = 0;
+
+fail:
+
+qemu_opts_del(opts);
+
+return ret;
+}
+
 static int raw_reopen_prepare(BDRVReopenState *reopen_state,
   BlockReopenQueue *queue, Error **errp)
 {
-return 0;
+assert(reopen_state != NULL);
+assert(reopen_state->bs != NULL);
+
+reopen_state->opaque = g_new0(BDRVRawState, 1);
+
+return raw_read_options(
+reopen_state->options,
+reopen_state->bs,
+reopen_state->opaque,
+errp);
+}
+
+static void raw_reopen_commit(BDRVReopenState *state)
+{
+BDRVRawState *new_s = state->opaque;
+BDRVRawState *s = state->bs->opaque;
+
+memcpy(s, new_s, sizeof(BDRVRawState));
+
+g_free(state->opaque);
+state->opaque = NULL;
+}
+
+static void raw_reopen_abort(BDRVReopenState *state)
+{
+g_free(state->opaque);
+state->opaque = NULL;
 }
 
 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
   uint64_t bytes, QEMUIOVector *qiov,
   int flags)
 {
+BDRVRawState *s = bs->opaque;
+
 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+offset += s->offset;
 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
 }
 
@@ -62,11 +176,18 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState 
*bs, uint64_t offset,
uint64_t bytes, QEMUIOVector *qiov,
int flags)
 {
+BDRVRawState *s = bs->opaque;
 void *buf = NULL;
 BlockDriver *drv;
 QEMUIOVector local_qiov;
 int ret;
 
+if (s->has_size && (offset > s->size || bytes > (s->size - offset))) {
+/* There's not 

[Qemu-block] [PATCH v2] Add 'offset' and 'size' options

2016-10-17 Thread Tomáš Golembiovský
This is a follow-up to the patch:
[PATCH] raw-posix: add 'offset' and 'size' options

The main changes are:
 -  options were moved from 'file' driver into 'raw' driver as suggested
 -  added support for writing, reopen and truncate when possible

If I forgot to address somebody's comments feel free to raise them again,
please.

Some general notes to the code:

1)  The size is rounded *down* to the 512 byte boundary. It's not that
the raw driver really cares about this, but if we don't do it then 
bdrv_getlength() will do that instead of us. The problem is that
bdrv_getlength() does round *up* and this can lead to reads/writes
outside the specified 'size'.

2)  We don't provide '.bdrv_get_allocated_file_size' function. As a
result the information about allocated disk size reports size of the
whole file. This is, rather confusingly, larger than the provided
'size'. But I don't think this matters much. Note that we don't have
any easy way how to get the correct information here apart from
checking all the block with bdrv_co_get_block_status() (as suggested
by Kevin Wolf).

3)  No options for raw_create(). The 'size' and 'offset' options were
added only to open/reopen. In my opinion there is no real reason for
them there. AFAIK you cannot create embeded QCOW2/VMDK/etc. image
that way anyway.


Tomáš Golembiovský (1):
  raw_bsd: add offset and size options

 block/raw_bsd.c  | 169 ++-
 qapi/block-core.json |  16 -
 2 files changed, 181 insertions(+), 4 deletions(-)

-- 
2.10.0




Re: [Qemu-block] [Qemu-devel] block/nfs: Fine grained runtime options in nfs

2016-10-17 Thread Ashijeet Acharya
On Tue, Oct 18, 2016 at 12:59 AM, Eric Blake  wrote:
> On 10/17/2016 01:00 PM, Ashijeet Acharya wrote:
>
>> One more relatively easy question though, will we include @port as an
>> option in runtime_opts while converting NFS to use several
>> runtime_opts? The reason I ask this because the uri syntax for NFS in
>> QEMU looks like this:
>>
>>nfs:[?param=value[=value2[&...]]]
>
> It's actually nfs://[:port]/...
>
> so the URI syntax already supports port.

But the commit message which added support for NFS had the uri which I
mentioned above and the code for NFS does not make use of 'port'
anywhere either, which is why I am a bit confused.

Ashijeet
>
> --
> Eric Blake   eblake redhat com+1-919-301-3266
> Libvirt virtualization library http://libvirt.org
>



Re: [Qemu-block] [PATCH v10 00/10] Dirty bitmap changes for migration/persistence work

2016-10-17 Thread Max Reitz
On 13.10.2016 23:58, John Snow wrote:
> Key:
> [] : patches are identical
> [] : number of functional differences between upstream/downstream patch
> [down] : patch is downstream-only
> The flags [FC] indicate (F)unctional and (C)ontextual differences, 
> respectively
> 
> 001/10:[] [--] 'block: Hide HBitmap in block dirty bitmap interface'
> 002/10:[] [--] 'HBitmap: Introduce "meta" bitmap to track bit changes'
> 003/10:[] [--] 'tests: Add test code for meta bitmap'
> 004/10:[] [--] 'block: Support meta dirty bitmap'
> 005/10:[] [--] 'block: Add two dirty bitmap getters'
> 006/10:[] [--] 'block: Assert that bdrv_release_dirty_bitmap succeeded'
> 007/10:[] [--] 'hbitmap: serialization'
> 008/10:[] [--] 'block: BdrvDirtyBitmap serialization interface'
> 009/10:[0005] [FC] 'tests: Add test code for hbitmap serialization'
> 010/10:[] [--] 'block: More operations for meta dirty bitmap'
> 
> ===
> v10: Now with less bits

Thanks, I've applied the series to my block branch:

https://github.com/XanClic/qemu/commits/block

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] block/nfs: Fine grained runtime options in nfs

2016-10-17 Thread Eric Blake
On 10/17/2016 01:00 PM, Ashijeet Acharya wrote:

> One more relatively easy question though, will we include @port as an
> option in runtime_opts while converting NFS to use several
> runtime_opts? The reason I ask this because the uri syntax for NFS in
> QEMU looks like this:
> 
>nfs:[?param=value[=value2[&...]]]

It's actually nfs://[:port]/...

so the URI syntax already supports port.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH v3 0/5] Allow blockdev-add for SSH

2016-10-17 Thread no-reply
Hi,

Your series failed automatic build test. Please find the testing commands and
their output below. If you have docker installed, you can probably reproduce it
locally.

Type: series
Message-id: 1476725535-3350-1-git-send-email-ashijeetacha...@gmail.com
Subject: [Qemu-devel] [PATCH v3 0/5] Allow blockdev-add for SSH

=== TEST SCRIPT BEGIN ===
#!/bin/bash
set -e
git submodule update --init dtc
# Let docker tests dump environment info
export SHOW_ENV=1
export J=16
make docker-test-quick@centos6
make docker-test-mingw@fedora
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 - [tag update]  
patchew/1476416631-2870-1-git-send-email-ppan...@redhat.com -> 
patchew/1476416631-2870-1-git-send-email-ppan...@redhat.com
 * [new tag] 
patchew/1476725535-3350-1-git-send-email-ashijeetacha...@gmail.com -> 
patchew/1476725535-3350-1-git-send-email-ashijeetacha...@gmail.com
 * [new tag] patchew/20161017180939.27912-1-mre...@redhat.com -> 
patchew/20161017180939.27912-1-mre...@redhat.com
Switched to a new branch 'test'
32a50b2 qapi: allow blockdev-add for ssh
e1f6a0a block/ssh: Use InetSocketAddress options
02358bf block/ssh: Add InetSocketAddress and accept it
d115bdf util/qemu-sockets: Make inet_connect_saddr() public
585c083 block/ssh: Add ssh_has_filename_options_conflict()

=== OUTPUT BEGIN ===
Submodule 'dtc' (git://git.qemu-project.org/dtc.git) registered for path 'dtc'
Cloning into 'dtc'...
Submodule path 'dtc': checked out '65cc4d2748a2c2e6f27f1cf39e07a5dbabd80ebf'
  BUILD   centos6
make[1]: Entering directory '/var/tmp/patchew-tester-tmp-_3aeudre/src'
  ARCHIVE qemu.tgz
  ARCHIVE dtc.tgz
  COPYRUNNER
RUN test-quick in qemu:centos6 
Packages installed:
SDL-devel-1.2.14-7.el6_7.1.x86_64
ccache-3.1.6-2.el6.x86_64
epel-release-6-8.noarch
gcc-4.4.7-17.el6.x86_64
git-1.7.1-4.el6_7.1.x86_64
glib2-devel-2.28.8-5.el6.x86_64
libfdt-devel-1.4.0-1.el6.x86_64
make-3.81-23.el6.x86_64
package g++ is not installed
pixman-devel-0.32.8-1.el6.x86_64
tar-1.23-15.el6_8.x86_64
zlib-devel-1.2.3-29.el6.x86_64

Environment variables:
PACKAGES=libfdt-devel ccache tar git make gcc g++ zlib-devel 
glib2-devel SDL-devel pixman-devel epel-release
HOSTNAME=171aa48e5c62
TERM=xterm
MAKEFLAGS= -j16
HISTSIZE=1000
J=16
USER=root
CCACHE_DIR=/var/tmp/ccache
EXTRA_CONFIGURE_OPTS=
V=
SHOW_ENV=1
MAIL=/var/spool/mail/root
PATH=/usr/lib/ccache:/usr/lib64/ccache:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
PWD=/
LANG=en_US.UTF-8
TARGET_LIST=
HISTCONTROL=ignoredups
SHLVL=1
HOME=/root
TEST_DIR=/tmp/qemu-test
LOGNAME=root
LESSOPEN=||/usr/bin/lesspipe.sh %s
FEATURES= dtc
DEBUG=
G_BROKEN_FILENAMES=1
CCACHE_HASHDIR=
_=/usr/bin/env

Configure options:
--enable-werror --target-list=x86_64-softmmu,aarch64-softmmu 
--prefix=/var/tmp/qemu-build/install
No C++ compiler available; disabling C++ specific optional code
Install prefix/var/tmp/qemu-build/install
BIOS directory/var/tmp/qemu-build/install/share/qemu
binary directory  /var/tmp/qemu-build/install/bin
library directory /var/tmp/qemu-build/install/lib
module directory  /var/tmp/qemu-build/install/lib/qemu
libexec directory /var/tmp/qemu-build/install/libexec
include directory /var/tmp/qemu-build/install/include
config directory  /var/tmp/qemu-build/install/etc
local state directory   /var/tmp/qemu-build/install/var
Manual directory  /var/tmp/qemu-build/install/share/man
ELF interp prefix /usr/gnemul/qemu-%M
Source path   /tmp/qemu-test/src
C compilercc
Host C compiler   cc
C++ compiler  
Objective-C compiler cc
ARFLAGS   rv
CFLAGS-O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 -g 
QEMU_CFLAGS   -I/usr/include/pixman-1-pthread -I/usr/include/glib-2.0 
-I/usr/lib64/glib-2.0/include   -fPIE -DPIE -m64 -D_GNU_SOURCE 
-D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE -Wstrict-prototypes 
-Wredundant-decls -Wall -Wundef -Wwrite-strings -Wmissing-prototypes 
-fno-strict-aliasing -fno-common -fwrapv  -Wendif-labels -Wmissing-include-dirs 
-Wempty-body -Wnested-externs -Wformat-security -Wformat-y2k -Winit-self 
-Wignored-qualifiers -Wold-style-declaration -Wold-style-definition 
-Wtype-limits -fstack-protector-all
LDFLAGS   -Wl,--warn-common -Wl,-z,relro -Wl,-z,now -pie -m64 -g 
make  make
install   install
pythonpython -B
smbd  /usr/sbin/smbd
module supportno
host CPU  x86_64
host big endian   no
target list   x86_64-softmmu aarch64-softmmu
tcg debug enabled no
gprof enabled no
sparse enabledno
strip binariesyes
profiler  no
static build  no
pixmansystem
SDL support   yes (1.2.14)
GTK support   no 
GTK GL supportno
VTE support   no 
TLS priority  NORMAL
GNUTLS supportno
GNUTLS rndno
libgcrypt no
libgcrypt kdf no
nettleno 
nettle kdfno
libtasn1  no
curses supportno
virgl 

Re: [Qemu-block] [Qemu-devel] [PATCH] iotests: Always use -machine accel=qtest

2016-10-17 Thread Eric Blake
On 10/17/2016 01:39 PM, Max Reitz wrote:
> Currently, we only use -machine accel=qtest when qemu is invoked through
> the common.qemu functions. However, we always want to use it, so move it
> from common.qemu directly into QEMU_OPTIONS.
> 
> Signed-off-by: Max Reitz 
> ---
>  tests/qemu-iotests/common  |  2 +-
>  tests/qemu-iotests/common.qemu | 12 +---
>  2 files changed, 6 insertions(+), 8 deletions(-)

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-block] DROP THIS Re: [PATCH 11/22] qcow2-bitmap: add qcow2_store_persistent_bitmaps()

2016-10-17 Thread Vladimir Sementsov-Ogievskiy

Sorry, this was an accidental reply.

On 17.10.2016 20:57, Vladimir Sementsov-Ogievskiy wrote:

On 30.09.2016 12:53, Vladimir Sementsov-Ogievskiy wrote:

Realize block bitmap stroing interface, to allow qcow2 images store
persistent bitmaps.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2-bitmap.c | 241 
+++

 block/qcow2.c|   2 +
 block/qcow2.h|   2 +
 3 files changed, 245 insertions(+)

diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c
index 81520cd..a5be25a 100644
--- a/block/qcow2-bitmap.c
+++ b/block/qcow2-bitmap.c
@@ -27,6 +27,7 @@

 #include "qemu/osdep.h"
 #include "qapi/error.h"
+#include "qemu/cutils.h"

 #include "block/block_int.h"
 #include "block/qcow2.h"
@@ -96,6 +97,15 @@ static inline void bitmap_table_to_cpu(uint64_t 
*bitmap_table, size_t size)

 }
 }

+static inline void bitmap_table_to_be(uint64_t *bitmap_table, size_t 
size)

+{
+size_t i;
+
+for (i = 0; i < size; ++i) {
+cpu_to_be64s(_table[i]);
+}
+}
+
 static inline int calc_dir_entry_size(size_t name_size, size_t 
extra_data_size)

 {
 return align_offset(sizeof(Qcow2BitmapDirEntry) +
@@ -564,3 +574,234 @@ out:

 return ret;
 }
+
+/* store_bitmap_data()
+ * Store bitmap to image, filling bitamp table accordingly.




+ */
+static int store_bitmap_data(BlockDriverState *bs, BdrvDirtyBitmap 
*bitmap,
+ uint64_t *bitmap_table, uint32_t 
bitmap_table_size)

+{
+int ret;
+BDRVQcow2State *s = bs->opaque;
+uint64_t sector, dsc;
+uint64_t bm_size = bdrv_dirty_bitmap_size(bitmap);
+int cl_size = s->cluster_size;





+uint8_t *buf = NULL;
+uint32_t tb_size =
+size_to_clusters(s,
+bdrv_dirty_bitmap_serialization_size(bitmap, 0, 
bm_size));




+
+BdrvDirtyBitmapIter *dbi;
+
+if (tb_size != bitmap_table_size) {
+return -EINVAL;
+}
+
+memset(bitmap_table, 0, bitmap_table_size * 
sizeof(bitmap_table[0]));




+
+dbi = bdrv_dirty_iter_new(bitmap, 0);
+buf = g_malloc(cl_size);
+dsc = dirty_sectors_in_cluster(s, bitmap);
+
+while ((sector = bdrv_dirty_iter_next(dbi)) != -1) {




+uint64_t cluster = sector / dsc;
+sector = cluster * dsc;





+uint64_t end = MIN(bm_size, sector + dsc);
+uint64_t write_size =
+bdrv_dirty_bitmap_serialization_size(bitmap, sector, end 
- sector);

+
+int64_t off = qcow2_alloc_clusters(bs, cl_size);
+if (off < 0) {
+ret = off;
+goto finish;
+}
+bitmap_table[cluster] = off;
+
+bdrv_dirty_bitmap_serialize_part(bitmap, buf, sector, end);





+if (write_size < cl_size) {
+memset(buf + write_size, 0, cl_size - write_size);
+}
+




+ret = bdrv_pwrite(bs->file, off, buf, cl_size);
+if (ret < 0) {
+goto finish;
+}
+
+if (end >= bm_size) {
+break;
+}
+
+bdrv_set_dirty_iter(dbi, end);
+}
+ret = 0; /* writes */


What is that comment supposed to mean?


+
+finish:
+if (ret < 0) {
+clear_bitmap_table(bs, bitmap_table, bitmap_table_size);
+}
+g_free(buf);
+bdrv_dirty_iter_free(dbi);
+
+return ret;


In case you decide to keep BME_MAX_PHYS_SIZE, this function should check
somewhere that the physical size of the bitmap does not exceed that 
value.



+}
+
+/* store_bitmap()
+ * Store bitmap to qcow2 and set bitmap_table. bitmap_table itself 
is not

+ * stored to qcow2.


First of all, there is no parameter called "bitmap_table", and second,
yes, the bitmap table is written to the qcow2 file.


+ */
+static int store_bitmap(BlockDriverState *bs,
+BdrvDirtyBitmap *bitmap,
+Qcow2BitmapDirEntry *entry)
+{
+int ret;
+BDRVQcow2State *s = bs->opaque;
+uint64_t bm_size = bdrv_dirty_bitmap_size(bitmap);
+const char *bm_name = bdrv_dirty_bitmap_name(bitmap);
+
+uint64_t *tb;
+int64_t tb_offset;
+uint32_t tb_size =
+size_to_clusters(s,
+bdrv_dirty_bitmap_serialization_size(bitmap, 0, 
bm_size));


As above, this variable should be of type uint64_t.

Also, you have to check that it does not exceed BME_MAX_TABLE_SIZE.


+
+tb = g_try_new(uint64_t, tb_size);
+if (tb == NULL) {
+return -ENOMEM;
+}
+
+ret = store_bitmap_data(bs, bitmap, tb, tb_size);
+if (ret < 0) {
+g_free(tb);
+return ret;
+}
+
+tb_offset = qcow2_alloc_clusters(bs, tb_size * sizeof(tb[0]));


If you don't limit tb_size, then this multiplication can overflow on 32
bit machines.


+if (tb_offset < 0) {
+ret = tb_offset;
+goto fail;
+}
+


There should be a metadata overlap check here.


+bitmap_table_to_be(tb, tb_size);
+ret = bdrv_pwrite(bs->file, tb_offset, tb, tb_size * 
sizeof(tb[0]));

Re: [Qemu-block] [PATCH] iotests: drop thread spun work-around

2016-10-17 Thread Max Reitz
On 10.10.2016 04:57, Michael S. Tsirkin wrote:
> We've disabled the warning, no need for test to work
> around it.
> 
> Signed-off-by: Michael S. Tsirkin 
> ---
>  tests/qemu-iotests/common.filter | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/tests/qemu-iotests/common.filter 
> b/tests/qemu-iotests/common.filter
> index cfdb633..49217b0 100644
> --- a/tests/qemu-iotests/common.filter
> +++ b/tests/qemu-iotests/common.filter
> @@ -164,7 +164,6 @@ _filter_qemu()
>  {
>  sed -e "s#\\(^\\|(qemu) \\)$(basename $QEMU_PROG):#\1QEMU_PROG:#" \
>  -e 's#^QEMU [0-9]\+\.[0-9]\+\.[0-9]\+ monitor#QEMU X.Y.Z monitor#' \
> --e '/main-loop: WARNING: I\/O thread spun for [0-9]\+ iterations/d' \
>  -e $'s#\r##' # QEMU monitor uses \r\n line endings
>  }

I'll have to drop this patch from my queue for now because I noticed the
warnings reappearing. This is because the iotests only use
-machine accel=qtest, but don't create any -qtest character device.

Commit 21a24302e85024dd7b2a151158adbc1f5dc5c4dd changed the behavior so
that the warning will only be emitted if there is a -qtest character
device, reasoning that there are tests that do not use the qtest
accelerator but just the character device. However, this has the
downside of actually printing the warning if qemu is run only under the
qtest accelerator, without a -qtest chardev.

I've sent a patch to remedy this, but until that is merged, I'll have to
hold this patch.

(Also, some iotests do not yet use -machine accel=qtest at all, I'll
have to fix that first, too.)

Max



signature.asc
Description: OpenPGP digital signature


[Qemu-block] [PATCH] iotests: Always use -machine accel=qtest

2016-10-17 Thread Max Reitz
Currently, we only use -machine accel=qtest when qemu is invoked through
the common.qemu functions. However, we always want to use it, so move it
from common.qemu directly into QEMU_OPTIONS.

Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/common  |  2 +-
 tests/qemu-iotests/common.qemu | 12 +---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/tests/qemu-iotests/common b/tests/qemu-iotests/common
index d60ea2c..b6274be 100644
--- a/tests/qemu-iotests/common
+++ b/tests/qemu-iotests/common
@@ -51,7 +51,7 @@ export IMGOPTS=""
 export CACHEMODE="writeback"
 export QEMU_IO_OPTIONS=""
 export CACHEMODE_IS_DEFAULT=true
-export QEMU_OPTIONS="-nodefaults"
+export QEMU_OPTIONS="-nodefaults -machine accel=qtest"
 export VALGRIND_QEMU=
 export IMGKEYSECRET=
 export IMGOPTSSYNTAX=false
diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu
index 2548a87..e657361 100644
--- a/tests/qemu-iotests/common.qemu
+++ b/tests/qemu-iotests/common.qemu
@@ -155,15 +155,13 @@ function _launch_qemu()
 
 if [ -z "$keep_stderr" ]; then
 QEMU_NEED_PID='y'\
-${QEMU} -nographic -serial none ${comm} -machine accel=qtest "${@}" \
->"${fifo_out}" 
\
-2>&1 \
-<"${fifo_in}" &
+${QEMU} -nographic -serial none ${comm} "${@}" >"${fifo_out}" \
+   2>&1 \
+   <"${fifo_in}" &
 elif [ "$keep_stderr" = "y" ]; then
 QEMU_NEED_PID='y'\
-${QEMU} -nographic -serial none ${comm} -machine accel=qtest "${@}" \
->"${fifo_out}" 
\
-<"${fifo_in}" &
+${QEMU} -nographic -serial none ${comm} "${@}" >"${fifo_out}" \
+   <"${fifo_in}" &
 else
 exit 1
 fi
-- 
2.10.0




Re: [Qemu-block] [PATCH v2 09/11] blockjob: add block_job_start

2016-10-17 Thread John Snow



On 10/06/2016 06:44 PM, John Snow wrote:



On 10/05/2016 11:17 AM, Kevin Wolf wrote:

Am 01.10.2016 um 00:00 hat John Snow geschrieben:

Instead of automatically starting jobs at creation time via backup_start
et al, we'd like to return a job object pointer that can be started
manually at later point in time.

For now, add the block_job_start mechanism and start the jobs
automatically as we have been doing, with conversions job-by-job coming
in later patches.

Of note: cancellation of unstarted jobs will perform all the normal
cleanup as if the job had started, particularly abort and clean. The
only difference is that we will not emit any events, because the job
never actually started.

Signed-off-by: John Snow 


Should we make sure that jobs are only added to the block_jobs list once
they are started? It doesn't sound like a good idea to make a job
without a coroutine user-accessible.

Kevin



That would certainly help limit exposure to a potentially dangerous
object, but some operations on these un-started jobs are still perfectly
reasonable, like cancellation. Even things like "set speed" are
perfectly reasonable on an unstarted job.

I'd rather just verify that having an accessible job cannot be operated
on unsafely via the public interface, even though that's more work.

So here's the list:

-block_job_next: Harmless.

-block_job_get: Harmless.

-block_job_set_speed: Depends on the set_speed implementation, but
backup_set_speed makes no assumptions and that's the only job I am
attempting to convert in this series.

-block_job_cancel: Edited to specifically support pre-started jobs in
this patch.

-block_job_complete: Edited to check for a coroutine specifically, but
even without, a job will not be able to become ready without running first.

-block_job_query: Harmless* (*I could probably expose a 'started'
variable so that libvirt didn't get confused as to why there are jobs
that exist but are not busy nor paused.)

-block_job_pause: Harmless**

-block_job_user_pause: Harmless**

-block_job_user_paused: Harmless, if meaningless.

-block_job_resume: **We will attempt to call block_job_enter, but
conditional on job->co, we do nothing, so it's harmless if not
misleading that you can pause/resume to your heart's content.

-block_job_user_resume: ^ http://i.imgur.com/2zYxrIe.png ^

-block_job_cancel_sync: Safe, due to edits to block_job_cancel. Polling
loop WILL complete as normal, because all jobs will finish through
block_job_completed one way or another.

-block_job_cancel_sync_all: As safe as the above.

-block_job_complete_sync: Safe, complete will return error for unstarted
jobs.

-block_job_iostatus_reset: Harmless, I think -- backup does not
implement this method. (Huh, *no* jobs implement iostatus_reset at the
moment...)

-block_job_txn_new: Doesn't operate on jobs.

-block_job_txn_unref: Also doesn't operate on jobs.

-block_job_get_aio_context: Harmless.

-block_job_txn_add_job: Safe and intended! There is trickery here,
though, as once a job is introduced into transactions it opens it up to
the private interface. This adds the following functions to considerations:

-block_job_completed: Safe, does not assume a coroutine anywhere.

-block_job_completed_single: Specifically updated to be context-aware of
if we are pre-started or not. This is the "real" completion mechanism
for BlockJobs that gets run for transactional OR individual jobs.

-block_job_completed_txn_abort: ***BUG***! The problem with the patch as
I've sent it is that cancel calls completed (under the premise that
nobody else would ever get to be able to), but we call both cancel AND
completed from this function, which will cause us to call completed
twice on pre-started jobs. I will fix this for the next version.



Actually, I was mistaken. The way I wrote it is fine, and it will work 
like this:


qmp_transaction encounters an error during prepare and calls the abort 
method for each action.


- The abort method calls block_job_cancel on the not-yet-started job,
- Which in turn calls block_job_completed with ret = -ECANCELED

which then either calls:
- block_job_completed_single with a failure mode (which calls .abort and 
.clean) if we are not in a transaction, or

- block_job_completed_txn_abort

txn_abort is going to:

1) Set txn->aborting to true, and
2) Cancel every other job in the transaction using block_job_cancel_sync.

These jobs will also come to txn_abort, but since txn->aborting is true, 
will become a NOP.


Then, finally, we will call block_job_completed_single on every job in 
the transaction which will perform our proper cleanup by calling .abort 
and .clean for each job.


No duplication, I was mistaken...!


-block_job_completed_txn_success: Should never be called; if it IS, the
presence of an unstarted job in the transaction will cause an early
return. And even if I am utterly wrong and every job in the transaction
completes successfully (somehow?) calling block_job_completed_single is

Re: [Qemu-block] block/nfs: Fine grained runtime options in nfs

2016-10-17 Thread Ashijeet Acharya
On Fri, Oct 14, 2016 at 9:16 PM, Stefan Hajnoczi  wrote:
> On Mon, Oct 10, 2016 at 10:39:30AM +0530, Ashijeet Acharya wrote:
>> Hi all,
>>
>> I was working on trying to add blockdev-add compatibility for the nfs
>> block driver but before that runtime options need to be separated into
>> various options rather than just a simple "filename" option.
>>
>> I have added the following until now:
>> a) host
>> b) port (not sure about this one, do we just use a default port number?)
>> c) export
>> d) path (path to the file)
>>
>> I have matched these with the URI but still let me know if i have
>> missed anyone :)
>>
>> Now, in order to parse the uri for different runtime options, I have
>> made two new functions nfs_parse_filename() and nfs_parse_uri() which
>> is pretty similar to the way how other network block drivers do it.
>>
>> Currently we parse the uri in a nfs_client_open() function which takes
>> 'const char *filename' as one of its parameters but I dont think
>> that's the right way anymore because we pass 'qemu_opt_get(opts,
>> "filename")' which is invalid due to no runtime option named
>> "filename" available anymore. Right?
>>
>> While parsing uri we check for the query parameters inside a 'for
>> loop', so I have moved that too inside
>>
>> nfs_parse_uri(const char *filename, QDict *options, Error **errp)
>>
>> but the problem is there is no struct NFSClient parameter here, so I
>> cannot fill up its important fields while parsing the query
>> parameters. I cannot do the same inside nfs_client_open() because I no
>> longer parse the uri over there.OR CAN I? A completely different
>> approach will work too :)
>>
>> I can attach a pastebin link containing a raw patch if you want to get
>> a clear view but I am afraid it doesn't compile at the moment due to
>> the problems mentioned above.
>
> Please post the code and annotate the relevant places where you are
> stuck.

I have solved the issues I was facing earlier (thanks to Max!).
One more relatively easy question though, will we include @port as an
option in runtime_opts while converting NFS to use several
runtime_opts? The reason I ask this because the uri syntax for NFS in
QEMU looks like this:

   nfs:[?param=value[=value2[&...]]]

At the moment my runtime_opts looks like this:

static QemuOptsList runtime_opts = {
.name = "nfs",
.head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
.desc = {
{
.name = "host",
.type = QEMU_OPT_STRING,
.help = "Host to connect to",
},
{
.name = "export",
.type = QEMU_OPT_STRING,
.help = "Name of the NFS export to open",
},
{
.name = "path",
.type = QEMU_OPT_STRING,
.help = "Path of the image on the host",
},
{
.name = "uid",
.type = QEMU_OPT_NUMBER,
.help = "UID value to use when talking to the server",
},
{
.name = "gid",
.type = QEMU_OPT_NUMBER,
.help = "GID value to use when talking to the server",
},
{
.name = "tcp-syncnt",
.type = QEMU_OPT_NUMBER,
.help = "Number of SYNs to send during the session establish",
},
{
.name = "readahead",
.type = QEMU_OPT_NUMBER,
.help = "Set the readahead size in bytes",
},
{
.name = "pagecache",
.type = QEMU_OPT_NUMBER,
.help = "Set the pagecache size in bytes",
},
{
.name = "debug",
.type = QEMU_OPT_NUMBER,
.help = "Set the NFS debug level (max 2)",
},
{ /* end of list */ }
},
};

Any comment on including several query params of the uri in
runtime_opts will be helpful too.

Ashijeet

> Stefan



Re: [Qemu-block] [PATCH 11/22] qcow2-bitmap: add qcow2_store_persistent_bitmaps()

2016-10-17 Thread Vladimir Sementsov-Ogievskiy

On 30.09.2016 12:53, Vladimir Sementsov-Ogievskiy wrote:

Realize block bitmap stroing interface, to allow qcow2 images store
persistent bitmaps.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2-bitmap.c | 241 +++
 block/qcow2.c|   2 +
 block/qcow2.h|   2 +
 3 files changed, 245 insertions(+)

diff --git a/block/qcow2-bitmap.c b/block/qcow2-bitmap.c
index 81520cd..a5be25a 100644
--- a/block/qcow2-bitmap.c
+++ b/block/qcow2-bitmap.c
@@ -27,6 +27,7 @@

 #include "qemu/osdep.h"
 #include "qapi/error.h"
+#include "qemu/cutils.h"

 #include "block/block_int.h"
 #include "block/qcow2.h"
@@ -96,6 +97,15 @@ static inline void bitmap_table_to_cpu(uint64_t 
*bitmap_table, size_t size)
 }
 }

+static inline void bitmap_table_to_be(uint64_t *bitmap_table, size_t size)
+{
+size_t i;
+
+for (i = 0; i < size; ++i) {
+cpu_to_be64s(_table[i]);
+}
+}
+
 static inline int calc_dir_entry_size(size_t name_size, size_t extra_data_size)
 {
 return align_offset(sizeof(Qcow2BitmapDirEntry) +
@@ -564,3 +574,234 @@ out:

 return ret;
 }
+
+/* store_bitmap_data()
+ * Store bitmap to image, filling bitamp table accordingly.




+ */
+static int store_bitmap_data(BlockDriverState *bs, BdrvDirtyBitmap *bitmap,
+ uint64_t *bitmap_table, uint32_t 
bitmap_table_size)
+{
+int ret;
+BDRVQcow2State *s = bs->opaque;
+uint64_t sector, dsc;
+uint64_t bm_size = bdrv_dirty_bitmap_size(bitmap);
+int cl_size = s->cluster_size;





+uint8_t *buf = NULL;
+uint32_t tb_size =
+size_to_clusters(s,
+bdrv_dirty_bitmap_serialization_size(bitmap, 0, bm_size));




+
+BdrvDirtyBitmapIter *dbi;
+
+if (tb_size != bitmap_table_size) {
+return -EINVAL;
+}
+
+memset(bitmap_table, 0, bitmap_table_size * sizeof(bitmap_table[0]));




+
+dbi = bdrv_dirty_iter_new(bitmap, 0);
+buf = g_malloc(cl_size);
+dsc = dirty_sectors_in_cluster(s, bitmap);
+
+while ((sector = bdrv_dirty_iter_next(dbi)) != -1) {




+uint64_t cluster = sector / dsc;
+sector = cluster * dsc;





+uint64_t end = MIN(bm_size, sector + dsc);
+uint64_t write_size =
+bdrv_dirty_bitmap_serialization_size(bitmap, sector, end - sector);
+
+int64_t off = qcow2_alloc_clusters(bs, cl_size);
+if (off < 0) {
+ret = off;
+goto finish;
+}
+bitmap_table[cluster] = off;
+
+bdrv_dirty_bitmap_serialize_part(bitmap, buf, sector, end);





+if (write_size < cl_size) {
+memset(buf + write_size, 0, cl_size - write_size);
+}
+




+ret = bdrv_pwrite(bs->file, off, buf, cl_size);
+if (ret < 0) {
+goto finish;
+}
+
+if (end >= bm_size) {
+break;
+}
+
+bdrv_set_dirty_iter(dbi, end);
+}
+ret = 0; /* writes */


What is that comment supposed to mean?


+
+finish:
+if (ret < 0) {
+clear_bitmap_table(bs, bitmap_table, bitmap_table_size);
+}
+g_free(buf);
+bdrv_dirty_iter_free(dbi);
+
+return ret;


In case you decide to keep BME_MAX_PHYS_SIZE, this function should check
somewhere that the physical size of the bitmap does not exceed that value.


+}
+
+/* store_bitmap()
+ * Store bitmap to qcow2 and set bitmap_table. bitmap_table itself is not
+ * stored to qcow2.


First of all, there is no parameter called "bitmap_table", and second,
yes, the bitmap table is written to the qcow2 file.


+ */
+static int store_bitmap(BlockDriverState *bs,
+BdrvDirtyBitmap *bitmap,
+Qcow2BitmapDirEntry *entry)
+{
+int ret;
+BDRVQcow2State *s = bs->opaque;
+uint64_t bm_size = bdrv_dirty_bitmap_size(bitmap);
+const char *bm_name = bdrv_dirty_bitmap_name(bitmap);
+
+uint64_t *tb;
+int64_t tb_offset;
+uint32_t tb_size =
+size_to_clusters(s,
+bdrv_dirty_bitmap_serialization_size(bitmap, 0, bm_size));


As above, this variable should be of type uint64_t.

Also, you have to check that it does not exceed BME_MAX_TABLE_SIZE.


+
+tb = g_try_new(uint64_t, tb_size);
+if (tb == NULL) {
+return -ENOMEM;
+}
+
+ret = store_bitmap_data(bs, bitmap, tb, tb_size);
+if (ret < 0) {
+g_free(tb);
+return ret;
+}
+
+tb_offset = qcow2_alloc_clusters(bs, tb_size * sizeof(tb[0]));


If you don't limit tb_size, then this multiplication can overflow on 32
bit machines.


+if (tb_offset < 0) {
+ret = tb_offset;
+goto fail;
+}
+


There should be a metadata overlap check here.


+bitmap_table_to_be(tb, tb_size);
+ret = bdrv_pwrite(bs->file, tb_offset, tb, tb_size * sizeof(tb[0]));
+if (ret < 0) {
+goto fail;
+}
+
+g_free(tb);
+
+entry->bitmap_table_offset = tb_offset;
+ 

Re: [Qemu-block] [Qemu-devel] [PATCH v14 13/21] qdict: allow qdict_crumple to accept compound types as values

2016-10-17 Thread Markus Armbruster
Paolo Bonzini  writes:

>> For me, a replacement should do structured values by providing suitable
>> *value* syntax instead of hacking it into the keys:
>> 
>> { "driver": "quorum",
>>   "child": [ { "driver": "file", "filename": "disk1.img" },
>>  { "driver": "host_device", "filename=/dev/sdb" },
>>  { "driver": "nbd", "host": "localhost" } ] }
>> 
>> Note how the structure is obvious.  It isn't with dotted keys, not even
>> if you order them sensibly (which users inevitably won't).
>> 
>> Also not that the value needs to be parsed by QemuOpts!  You can't leave
>> it to the consumer of the QemuOpts, because if you did, you'd have to
>> escape the commas.
>> 
>> If you'd rather invent syntax closer to QemuOpts than reuse JSON, you
>> could try
>> 
>> driver=quorum,
>> child=[{ driver=file, filename=disk1.img },
>>{ driver=host_device, filename=/dev/sdb },
>>{ driver=nbd, host=localhost } ]
>> 
>> I'd go with some existing syntax, though.  The one we already use is
>> JSON.
>
> In fact there is already "filename=json:{...}" support in the block layer.

Yes.

> By the way, abuse of QemuOpts dates back to 
> http://wiki.qemu.org/Features/QCFG.

That page is from 2011, when QAPI was being implemented.  The idea to
bring the power of the QAPI schema to bear on the command line is sound,
but QCFG never made it past the ideas stage, I'm afraid.  It uses either
dotted keys or braces for nested structs.  It doesn't cover lists.

>> Your dotted key convention requires two rules: 1. names must not look
>> like integers, and 2. names must not contain '.'.
>> 
>> We can avoid rule 2 by requiring '.' to be escaped.  Dan's
>> qdict_crumple() currently does that, to your surprise.  Adding the
>> escaping to existing options is a compatibility break, however.  So, if
>> names with '.' already exist, we can either break compatibility by
>> renaming them, or break it by requiring the '.' to be escaped.
>
>> * "device", qemu_device_opts in qdev-monitor.c
>> 
>>   This one pulls in qdev properties.  Properties violating rule 2 exist.
>
> Which are they?  Only bus names?

Finding properties is difficult, because we (foolishly!) define them in
code rather than data.  My testing with "info qdm" for all targets
finds:

= xlnx.axi-dma =
xlnx.axi-dma[0]
= xlnx.xps-ethernetlite =
xlnx.xps-ethernetlite[0]
= xlnx.xps-intc =
xlnx.xps-intc[0]
= xlnx.xps-uartlite =
xlnx.xps-uartlite[0]
= xlnx.axi-dma =
xlnx.axi-dma[0]
= xlnx.xps-ethernetlite =
xlnx.xps-ethernetlite[0]
= xlnx.xps-intc =
xlnx.xps-intc[0]
= xlnx.xps-uartlite =
xlnx.xps-uartlite[0]
= cuda =
adb.0
= raven-pcihost =
pci.0
= macio-ide =
ide.0
= mpc8544-guts =
mpc8544.guts[0]
= xlnx.xps-ethernetlite =
xlnx.xps-ethernetlite[0]
= xlnx.xps-intc =
xlnx.xps-intc[0]
= xlnx.xps-uartlite =
xlnx.xps-uartlite[0]
= cuda =
adb.0
= raven-pcihost =
pci.0
= macio-ide =
ide.0
= mpc8544-guts =
mpc8544.guts[0]
= xlnx.xps-ethernetlite =
xlnx.xps-ethernetlite[0]
= xlnx.xps-intc =
xlnx.xps-intc[0]
= xlnx.xps-uartlite =
xlnx.xps-uartlite[0]
= s390-sclp-event-facility =
s390-sclp-events-bus.0
= cgthree =
cg3.reg[0]
cg3.prom[0]
= SUNW,tcx =
tcx.thc[0]
tcx.rblit[0]
tcx.dac[0]
tcx.alt[0]
tcx.tec[0]
tcx.rstip[0]
tcx.blit[0]
tcx.dhc[0]
tcx.prom[0]
tcx.stip[0]

>> * "object", qemu_object_opts in vl.c
>> 
>>   This one pulls in properties of user-creatable objects.
>> 
>> * "machine", qemu_machine_opts in vl.c
>> 
>>   This one pulls in machine properties.
>
>> > > I know we have keys containing '.' elsewhere, e.g. device "macio-ide"
>> > > property "ide.0".  Our chronic inability to consistently restrict names
>> > > in ABI to something sane is beyond foolish.
>> >
>> > I wanted to have a look at this example, but I can only find the string
>> > "ide.0" used as a bus name in the sources, that is, a value rather than
>> > a key.
>> >
>> > Do you have a pointer to the property definition that you mean?
>> 
>> We've gotten better at hiding property definitions...
>> 
>> "qemu-system-ppc -device macio-ide,help" shows the property:
>> 
>> macio-ide.ide.0=child
>
> It is a bug that this property is shown in the help, because it's not
> assignable (same for all other child<> properties).

Let's fix it then.

>  I'd rather declare
> other occurrences of "." in user-accessible property names to be bugs,
> and break the ABI if there are any.

I propose to adopt QAPI's rules on permissible names: "All names must
begin with a letter, and contain only ASCII letters, digits, dash, and
underscore" (docs/qapi-code-gen.txt).  QAPI's naming rules get adopted
anyway on QAPIfication, so why wait? :)

Note that this may affect names generated by 

[Qemu-block] [PATCH v3 5/5] qapi: allow blockdev-add for ssh

2016-10-17 Thread Ashijeet Acharya
Introduce new object 'BlockdevOptionsSsh' in qapi/block-core.json to
support blockdev-add for SSH network protocol driver. Use only 'struct
InetSocketAddress' since SSH only supports connection over TCP.

Signed-off-by: Ashijeet Acharya 
---
 qapi/block-core.json | 26 --
 1 file changed, 24 insertions(+), 2 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 9d797b8..689dc0a 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1716,7 +1716,8 @@
 'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
 'host_device', 'http', 'https', 'luks', 'null-aio', 'null-co',
 'parallels', 'qcow', 'qcow2', 'qed', 'quorum', 'raw',
-   'replication', 'tftp', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+'replication', 'ssh', 'tftp', 'vdi', 'vhdx', 'vmdk', 'vpc',
+'vvfat' ] }
 
 ##
 # @BlockdevOptionsFile
@@ -1953,6 +1954,27 @@
 '*vport': 'int',
 '*segment': 'str' } }
 
+##
+# @BlockdevOptionsSsh
+#
+# @server:  host address
+#
+# @path:path to the image on the host
+#
+# @user:#optional user as which to connect, defaults to current
+#   local user name
+#
+# @host_key_check   #optional defines how and what to check the host
+#   key against, defaults to "yes"
+#
+# Since 2.8
+##
+{ 'struct': 'BlockdevOptionsSsh',
+  'data': { 'server': 'InetSocketAddress',
+'path': 'str',
+'*user': 'str',
+'*host_key_check': 'str' } }
+
 
 ##
 # @BlkdebugEvent
@@ -2281,7 +2303,7 @@
 # TODO rbd: Wait for structured options
   'replication':'BlockdevOptionsReplication',
 # TODO sheepdog: Wait for structured options
-# TODO ssh: Should take InetSocketAddress for 'host'?
+  'ssh':'BlockdevOptionsSsh',
   'tftp':   'BlockdevOptionsCurl',
   'vdi':'BlockdevOptionsGenericFormat',
   'vhdx':   'BlockdevOptionsGenericFormat',
-- 
2.6.2




[Qemu-block] [PATCH v3 4/5] block/ssh: Use InetSocketAddress options

2016-10-17 Thread Ashijeet Acharya
Drop the use of legacy options in favour of the InetSocketAddress
options.

Signed-off-by: Ashijeet Acharya 
Reviewed-by: Max Reitz 
---
 block/ssh.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/ssh.c b/block/ssh.c
index 7963b48..9b693f6 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -199,6 +199,7 @@ static int parse_uri(const char *filename, QDict *options, 
Error **errp)
 {
 URI *uri = NULL;
 QueryParams *qp;
+char *port_str;
 int i;
 
 uri = uri_parse(filename);
@@ -231,11 +232,11 @@ static int parse_uri(const char *filename, QDict 
*options, Error **errp)
 qdict_put(options, "user", qstring_from_str(uri->user));
 }
 
-qdict_put(options, "host", qstring_from_str(uri->server));
+qdict_put(options, "server.host", qstring_from_str(uri->server));
 
-if (uri->port) {
-qdict_put(options, "port", qint_from_int(uri->port));
-}
+port_str = g_strdup_printf("%d", uri->port ?: 22);
+qdict_put(options, "server.port", qstring_from_str(port_str));
+g_free(port_str);
 
 qdict_put(options, "path", qstring_from_str(uri->path));
 
-- 
2.6.2




[Qemu-block] [PATCH v3 0/5] Allow blockdev-add for SSH

2016-10-17 Thread Ashijeet Acharya
Previously posted series patches:
v2: https://lists.gnu.org/archive/html/qemu-devel/2016-10/msg03403.html
v1: http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg02137.html

This series adds blockdev-add support for SSH block driver.

Patch 1 prepares the code for the addition of a new option prefix,
which is "server.". This is accomplished by adding a
ssh_has_filename_options_conflict() function which helps to iterate
over the various options and check for conflict.

Patch 2 makes inet_connect_saddr() in util/qemu-sockets.c public.

Patch 3 first adds InetSocketAddress compatibility to SSH block driver
and then makes it accept a InetSocketAddress under the "server" option.
The old options "host" and "port" are supported as legacy options and
then translated to the respective InetSocketAddress representation.

Patch 4 drops the usage of "host" and "port" outside of
ssh_has_filename_options_conflict() and
ssh_process_legacy_socket_options() functions in order to make them
legacy options completely.

Patch 5 helps to allow blockdev-add support for the SSH block driver
by making the SSH option available.


*** This series depends on the following patch: ***
"qdict: implement a qdict_crumple method for un-flattening a dict"
from Daniel's "QAPI/QOM work for non-scalar object properties"
series.

Changes in v3:
- reorder patch 2 and 3 from v2 (Max)
- fix coding-style issue in patch 2 (Max)
- drop testing to check for "server" as an object itself (Max)
- fix strstart() bug (Max)
- fix a segfault bug when host gets set to NULL (Max)
- revert back to using qobject_input_visitor_new() (Max & Kevin)
- use qemu_strtol() instead of atoi() for better error handling (Kevin)
- make @user an optional argument in qapi/block-core.json (Max)
- update documentation for BlockdevOptionsSsh (Max)

Changes in v2:
- Use strstart() instead of strcmp() (Kevin)
- Use qobject_input_visitor_new_autocast() instead of
  qmp_input_visitor_new() and change header files accordingly (Kevin)
- Use inet_connect_saddr() instead of inet_connect() (Kevin)
- Drop the contruction of : string (Kevin)
- Fix the bug in ssh_process_legacy_socket_options()


Ashijeet Acharya (5):
  block/ssh: Add ssh_has_filename_options_conflict()
  util/qemu-sockets: Make inet_connect_saddr() public
  block/ssh: Add InetSocketAddress and accept it
  block/ssh: Use InetSocketAddress options
  qapi: allow blockdev-add for ssh

 block/ssh.c| 128 -
 include/qemu/sockets.h |   2 +
 qapi/block-core.json   |  26 +-
 util/qemu-sockets.c|   4 +-
 4 files changed, 134 insertions(+), 26 deletions(-)

-- 
2.6.2




Re: [Qemu-block] [Qemu-devel] [PATCH v14 13/21] qdict: allow qdict_crumple to accept compound types as values

2016-10-17 Thread Eric Blake
On 10/17/2016 09:50 AM, Markus Armbruster wrote:
>> But even if I realised that QemuOpts support this syntax, I think we
>> would still have to use the dotted syntax because it's explicit about
>> the index and we need that because the list can contains dicts.
>>
>> Compare this:
>>
>> driver=quorum,
>> child.0.driver=file,child.0.filename=disk1.img,
>> child.1.driver=host_device,child.1.filename=/dev/sdb,
>> child.2.driver=nbd,child.2.host=localhost
>>
>> And this:
>>
>> driver=quorum,
>> child.driver=file,child.filename=disk1.img,
>> child.driver=host_device,child.filename=/dev/sdb,
>> child.driver=nbd,child.host=localhost
> 
> Aside: both are about equally illegible, to be honest.

> Permit me to digress.
> 
> QemuOpts wasn't designed for list-values keys.  Doing lists by
> repetition was clever.
> 
> QemuOpts wasn't designed for structured values.  Doing structured values
> by a dotted key convention plus repetition was clever.
> 
> And there's the problem: too much cleverness, not enough "this is being
> pushed way beyond its design limits, time to replace it".
> 
> For me, a replacement should do structured values by providing suitable
> *value* syntax instead of hacking it into the keys:
> 
> { "driver": "quorum",
>   "child": [ { "driver": "file", "filename": "disk1.img" },
>  { "driver": "host_device", "filename=/dev/sdb" },
>  { "driver": "nbd", "host": "localhost" } ] }

Possible hack solution:

QemuOpts already special-cases id=.  What if we ALSO make it
special-case a leading json=?  Shown here with shell quoting, the above
example of creating a Quorum -drive argument could then be:

-drive json='
{ "driver": "quorum",
  "child": [ { "driver": "file", "filename": "disk1.img" },
 { "driver": "host_device", "filename=/dev/sdb" },
 { "driver": "nbd", "host": "localhost" } ] }
'

As far as I know, we don't have 'json' as any existing QemuOpts key (do
we? A full audit may be better than my quick git grep '"json"').  Thus,
if QemuOpts sees a leading json=, it hands off the rest of the string to
the same parser as we use for QMP, where we no longer have to escape
commas (even nicer than the drive hack where we support
filename=json:{...} but have to double up all commas to make it through
the QemuOpts layer).  Encountering json= as anything other than the
first option would be an error, and you would be unable to combine a
json= option with any other old-style option.  In other words, the use
of leading json= would be the switch for whether to do old-style parsing
or to use a saner syntax for everything else that needs structure, on a
per-argument basis.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-block] [PATCH v3 3/5] block/ssh: Add InetSocketAddress and accept it

2016-10-17 Thread Ashijeet Acharya
Add InetSocketAddress compatibility to SSH driver.

Add a new option "server" to the SSH block driver which then accepts
a InetSocketAddress.

"host" and "port" are supported as legacy options and are mapped to
their InetSocketAddress representation.

Signed-off-by: Ashijeet Acharya 
---
 block/ssh.c | 94 -
 1 file changed, 81 insertions(+), 13 deletions(-)

diff --git a/block/ssh.c b/block/ssh.c
index 75cb7bc..7963b48 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -30,10 +30,14 @@
 #include "block/block_int.h"
 #include "qapi/error.h"
 #include "qemu/error-report.h"
+#include "qemu/cutils.h"
 #include "qemu/sockets.h"
 #include "qemu/uri.h"
+#include "qapi-visit.h"
 #include "qapi/qmp/qint.h"
 #include "qapi/qmp/qstring.h"
+#include "qapi/qobject-input-visitor.h"
+#include "qapi/qobject-output-visitor.h"
 
 /* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements in
  * this block driver code.
@@ -74,6 +78,8 @@ typedef struct BDRVSSHState {
  */
 LIBSSH2_SFTP_ATTRIBUTES attrs;
 
+InetSocketAddress *inet;
+
 /* Used to warn if 'flush' is not supported. */
 char *hostport;
 bool unsafe_flush_warning;
@@ -263,7 +269,8 @@ static bool ssh_has_filename_options_conflict(QDict 
*options, Error **errp)
 !strcmp(qe->key, "port") ||
 !strcmp(qe->key, "path") ||
 !strcmp(qe->key, "user") ||
-!strcmp(qe->key, "host_key_check"))
+!strcmp(qe->key, "host_key_check") ||
+strstart(qe->key, "server.", NULL))
 {
 error_setg(errp, "Option '%s' cannot be used with a file name",
qe->key);
@@ -555,14 +562,69 @@ static QemuOptsList ssh_runtime_opts = {
 },
 };
 
+static bool ssh_process_legacy_socket_options(QDict *output_opts,
+  QemuOpts *legacy_opts,
+  Error **errp)
+{
+const char *host = qemu_opt_get(legacy_opts, "host");
+const char *port = qemu_opt_get(legacy_opts, "port");
+
+if (!host && port) {
+error_setg(errp, "port may not be used without host");
+return false;
+}
+
+if (host) {
+qdict_put(output_opts, "server.host", qstring_from_str(host));
+qdict_put(output_opts, "server.port",
+  qstring_from_str(port ?: stringify(22)));
+}
+
+return true;
+}
+
+static InetSocketAddress *ssh_config(BDRVSSHState *s, QDict *options,
+ Error **errp)
+{
+InetSocketAddress *inet = NULL;
+QDict *addr = NULL;
+QObject *crumpled_addr = NULL;
+Visitor *iv = NULL;
+Error *local_error = NULL;
+
+qdict_extract_subqdict(options, , "server.");
+if (!qdict_size(addr)) {
+error_setg(errp, "SSH server address missing");
+goto out;
+}
+
+crumpled_addr = qdict_crumple(addr, true, errp);
+if (!crumpled_addr) {
+goto out;
+}
+
+iv = qobject_input_visitor_new(crumpled_addr, true);
+visit_type_InetSocketAddress(iv, NULL, , _error);
+if (local_error) {
+error_propagate(errp, local_error);
+goto out;
+}
+
+out:
+QDECREF(addr);
+qobject_decref(crumpled_addr);
+visit_free(iv);
+return inet;
+}
+
 static int connect_to_ssh(BDRVSSHState *s, QDict *options,
   int ssh_flags, int creat_mode, Error **errp)
 {
 int r, ret;
 QemuOpts *opts = NULL;
 Error *local_err = NULL;
-const char *host, *user, *path, *host_key_check;
-int port;
+const char *user, *path, *host_key_check;
+long port = 0;
 
 opts = qemu_opts_create(_runtime_opts, NULL, 0, _abort);
 qemu_opts_absorb_qdict(opts, options, _err);
@@ -572,15 +634,11 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
 goto err;
 }
 
-host = qemu_opt_get(opts, "host");
-if (!host) {
+if (!ssh_process_legacy_socket_options(options, opts, errp)) {
 ret = -EINVAL;
-error_setg(errp, "No hostname was specified");
 goto err;
 }
 
-port = qemu_opt_get_number(opts, "port", 22);
-
 path = qemu_opt_get(opts, "path");
 if (!path) {
 ret = -EINVAL;
@@ -603,12 +661,21 @@ static int connect_to_ssh(BDRVSSHState *s, QDict *options,
 host_key_check = "yes";
 }
 
-/* Construct the host:port name for inet_connect. */
-g_free(s->hostport);
-s->hostport = g_strdup_printf("%s:%d", host, port);
+/* Pop the config into our state object, Exit if invalid */
+s->inet = ssh_config(s, options, errp);
+if (!s->inet) {
+ret = -EINVAL;
+goto err;
+}
+
+if (qemu_strtol(s->inet->port, NULL, 10, ) < 0) {
+error_setg(errp, "Use only numeric port value");
+ret = -EINVAL;
+goto err;
+}
 
 /* Open the socket and connect. */
-s->sock = inet_connect(s->hostport, errp);
+

[Qemu-block] [PATCH v3 1/5] block/ssh: Add ssh_has_filename_options_conflict()

2016-10-17 Thread Ashijeet Acharya
We have 5 options plus ("server") option which is added in the next
patch that conflict with specifying a SSH filename. We need to iterate
over all the options to check whether its key has an "server." prefix.

This iteration will help us adding the new option "server" easily.

Signed-off-by: Ashijeet Acharya 
Reviewed-by: Max Reitz 
---
 block/ssh.c | 27 +--
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/block/ssh.c b/block/ssh.c
index 5ce12b6..75cb7bc 100644
--- a/block/ssh.c
+++ b/block/ssh.c
@@ -254,15 +254,30 @@ static int parse_uri(const char *filename, QDict 
*options, Error **errp)
 return -EINVAL;
 }
 
+static bool ssh_has_filename_options_conflict(QDict *options, Error **errp)
+{
+const QDictEntry *qe;
+
+for (qe = qdict_first(options); qe; qe = qdict_next(options, qe)) {
+if (!strcmp(qe->key, "host") ||
+!strcmp(qe->key, "port") ||
+!strcmp(qe->key, "path") ||
+!strcmp(qe->key, "user") ||
+!strcmp(qe->key, "host_key_check"))
+{
+error_setg(errp, "Option '%s' cannot be used with a file name",
+   qe->key);
+return true;
+}
+}
+
+return false;
+}
+
 static void ssh_parse_filename(const char *filename, QDict *options,
Error **errp)
 {
-if (qdict_haskey(options, "user") ||
-qdict_haskey(options, "host") ||
-qdict_haskey(options, "port") ||
-qdict_haskey(options, "path") ||
-qdict_haskey(options, "host_key_check")) {
-error_setg(errp, "user, host, port, path, host_key_check cannot be 
used at the same time as a file option");
+if (ssh_has_filename_options_conflict(options, errp)) {
 return;
 }
 
-- 
2.6.2




[Qemu-block] [PATCH v3 2/5] util/qemu-sockets: Make inet_connect_saddr() public

2016-10-17 Thread Ashijeet Acharya
Make inet_connect_saddr() in util/qemu-sockets.c public in order to be
able to use it with InetSocketAddress sockets outside of
util/qemu-sockets.c independently.

Signed-off-by: Ashijeet Acharya 
---
 include/qemu/sockets.h | 2 ++
 util/qemu-sockets.c| 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/include/qemu/sockets.h b/include/qemu/sockets.h
index 9eb2470..5589e68 100644
--- a/include/qemu/sockets.h
+++ b/include/qemu/sockets.h
@@ -34,6 +34,8 @@ typedef void NonBlockingConnectHandler(int fd, Error *err, 
void *opaque);
 
 InetSocketAddress *inet_parse(const char *str, Error **errp);
 int inet_connect(const char *str, Error **errp);
+int inet_connect_saddr(InetSocketAddress *saddr, Error **errp,
+   NonBlockingConnectHandler *callback, void *opaque);
 
 NetworkAddressFamily inet_netfamily(int family);
 
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
index 4cef549..31f7fc6 100644
--- a/util/qemu-sockets.c
+++ b/util/qemu-sockets.c
@@ -412,8 +412,8 @@ static struct addrinfo 
*inet_parse_connect_saddr(InetSocketAddress *saddr,
  * function succeeds, callback will be called when the connection
  * completes, with the file descriptor on success, or -1 on error.
  */
-static int inet_connect_saddr(InetSocketAddress *saddr, Error **errp,
-  NonBlockingConnectHandler *callback, void 
*opaque)
+int inet_connect_saddr(InetSocketAddress *saddr, Error **errp,
+   NonBlockingConnectHandler *callback, void *opaque)
 {
 Error *local_err = NULL;
 struct addrinfo *res, *e;
-- 
2.6.2




Re: [Qemu-block] [PATCH 11/22] qcow2-bitmap: add qcow2_store_persistent_bitmaps()

2016-10-17 Thread Vladimir Sementsov-Ogievskiy

On 07.10.2016 22:24, Max Reitz wrote:

On 30.09.2016 12:53, Vladimir Sementsov-Ogievskiy wrote:

Realize block bitmap stroing interface, to allow qcow2 images store
persistent bitmaps.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/qcow2-bitmap.c | 241 +++


[...]



+ret = bdrv_pwrite(bs->file, off, buf, cl_size);
+if (ret < 0) {
+goto finish;
+}
+
+if (end >= bm_size) {
+break;
+}
+
+bdrv_set_dirty_iter(dbi, end);
+}
+ret = 0; /* writes */

What is that comment supposed to mean?




Now I think I can drop this assignment, as bdrv_aligned_preadv have 
'return ret < 0 ? ret : 0;' in the end...  Am I right? Can bdrv_pwrite 
and friends return positive value on success?


--
Best regards,
Vladimir




Re: [Qemu-block] [PATCH v4 0/3] iotests: Fix test 162

2016-10-17 Thread Max Reitz
On 28.09.2016 22:46, Max Reitz wrote:
> 162 is potentially racy and makes some invalid assumptions about what
> should happen when connecting to a non-existing domain name. This series
> fixes both issues.
> 
> 
> v4:
> - Added documentation for the new --fork option [Kevin]
> 
> 
> git-backport-diff against v3:
> 
> Key:
> [] : patches are identical
> [] : number of functional differences between upstream/downstream
> patch
> [down] : patch is downstream-only
> The flags [FC] indicate (F)unctional and (C)ontextual differences,
> respectively
> 
> 001/3:[0004] [FC] 'qemu-nbd: Add --fork option'
> 002/3:[] [--] 'iotests: Remove raciness from 162'
> 003/3:[] [--] 'iotests: Do not rely on unavailable domains in 162'
> 
> 
> Max Reitz (3):
>   qemu-nbd: Add --fork option
>   iotests: Remove raciness from 162
>   iotests: Do not rely on unavailable domains in 162
> 
>  qemu-nbd.c | 17 -
>  qemu-nbd.texi  |  2 ++
>  tests/qemu-iotests/162 | 22 --
>  tests/qemu-iotests/162.out |  2 +-
>  4 files changed, 35 insertions(+), 8 deletions(-)

Ping



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH 0/2] quorum: cleanup and optimization for FIFO case

2016-10-17 Thread Max Reitz
On 05.10.2016 18:35, Paolo Bonzini wrote:
> A couple patches I had lying in the huge multiqueue series but are
> not really related.  So let's flush 'em...
> 
> Paolo Bonzini (2):
>   quorum: change child_iter to children_read
>   quorum: do not allocate multiple iovecs for FIFO strategy
> 
>  block/quorum.c | 93 
> --
>  1 file changed, 45 insertions(+), 48 deletions(-)

Thanks, I've applied the series to my block branch:

https://github.com/XanClic/qemu/commits/block

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH v2] rbd: make the code more readable

2016-10-17 Thread John Snow



On 10/15/2016 04:26 AM, Xiubo Li wrote:

Make it a bit clearer and more readable.

Signed-off-by: Xiubo Li 
CC: John Snow 
---

V2:
- Advice from John Snow. Thanks.


 block/rbd.c | 25 -
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/block/rbd.c b/block/rbd.c
index 0a5840d..d0d4b39 100644
--- a/block/rbd.c
+++ b/block/rbd.c
@@ -366,45 +366,44 @@ static int qemu_rbd_create(const char *filename, QemuOpts 
*opts, Error **errp)
 rados_conf_read_file(cluster, NULL);
 } else if (conf[0] != '\0' &&
qemu_rbd_set_conf(cluster, conf, true, _err) < 0) {
-rados_shutdown(cluster);
 error_propagate(errp, local_err);
-return -EIO;
+ret = -EIO;
+goto shutdown;
 }

 if (conf[0] != '\0' &&
 qemu_rbd_set_conf(cluster, conf, false, _err) < 0) {
-rados_shutdown(cluster);
 error_propagate(errp, local_err);
-return -EIO;
+ret = -EIO;
+goto shutdown;
 }

 if (qemu_rbd_set_auth(cluster, secretid, errp) < 0) {
-rados_shutdown(cluster);
-return -EIO;
+ret = -EIO;
+goto shutdown;
 }

 ret = rados_connect(cluster);
 if (ret < 0) {
 error_setg_errno(errp, -ret, "error connecting");
-rados_shutdown(cluster);
-return ret;
+goto shutdown;
 }

 ret = rados_ioctx_create(cluster, pool, _ctx);
 if (ret < 0) {
 error_setg_errno(errp, -ret, "error opening pool %s", pool);
-rados_shutdown(cluster);
-return ret;
+goto shutdown;
 }

 ret = rbd_create(io_ctx, name, bytes, _order);
-rados_ioctx_destroy(io_ctx);
-rados_shutdown(cluster);
 if (ret < 0) {
 error_setg_errno(errp, -ret, "error rbd create");
-return ret;
 }

+rados_ioctx_destroy(io_ctx);
+
+shutdown:
+rados_shutdown(cluster);
 return ret;
 }




Reviewed-by: John Snow 



Re: [Qemu-block] [v2 2/5] block/ssh: Add InetSocketAddress and accept it

2016-10-17 Thread Ashijeet Acharya
On Mon, Oct 17, 2016 at 9:29 PM, Eric Blake  wrote:
> On 10/17/2016 10:44 AM, Ashijeet Acharya wrote:
>
>>
>> I think, its better to keep using atoi() and check if it returns a '0'
>
> Please not atoi(), as it lacks sane error checking. It cannot tell the
> difference between '1' and '1garbage'.  It's obvious that you want to
> treat both '0' and 'name' as an error, but that is not the only error
> you want to flag, thus atoi() is insufficient to flag all the errors you
> want.
>
Okay, will using qemu_strtol() be any good as I think it has better
error handling support? Otherwise I will resort to passing -ve value
as Kevin suggested earlier.

Ashijeet
>
> --
> Eric Blake   eblake redhat com+1-919-301-3266
> Libvirt virtualization library http://libvirt.org
>



Re: [Qemu-block] [v2 2/5] block/ssh: Add InetSocketAddress and accept it

2016-10-17 Thread Eric Blake
On 10/17/2016 10:44 AM, Ashijeet Acharya wrote:

> 
> I think, its better to keep using atoi() and check if it returns a '0'

Please not atoi(), as it lacks sane error checking. It cannot tell the
difference between '1' and '1garbage'.  It's obvious that you want to
treat both '0' and 'name' as an error, but that is not the only error
you want to flag, thus atoi() is insufficient to flag all the errors you
want.


-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH 1/2] throttle: Correct access to wrong BlockBackendPublic structures

2016-10-17 Thread Paolo Bonzini


On 17/10/2016 17:46, Alberto Garcia wrote:
> In 27ccdd52598290f0f8b58be56e235aff7aebfaf3 the throttling fields were
> moved from BlockDriverState to BlockBackend. However in a few cases
> the code started using throttling fields from the active BlockBackend
> instead of the round-robin token, making the algorithm behave
> incorrectly.
> 
> This can cause starvation if there's a throttling group with several
> drives but only one of them has I/O.
> 
> Reported-by: Paolo Bonzini 
> Signed-off-by: Alberto Garcia 
> ---
>  block/throttle-groups.c | 27 +++
>  1 file changed, 23 insertions(+), 4 deletions(-)
> 
> diff --git a/block/throttle-groups.c b/block/throttle-groups.c
> index 59545e2..17b2efb 100644
> --- a/block/throttle-groups.c
> +++ b/block/throttle-groups.c
> @@ -168,6 +168,22 @@ static BlockBackend 
> *throttle_group_next_blk(BlockBackend *blk)
>  return blk_by_public(next);
>  }
>  
> +/*
> + * Return whether a BlockBackend has pending requests.
> + *
> + * This assumes that tg->lock is held.
> + *
> + * @blk: the BlockBackend
> + * @is_write:  the type of operation (read/write)
> + * @ret:   whether the BlockBackend has pending requests.
> + */
> +static inline bool blk_has_pending_reqs(BlockBackend *blk,
> +bool is_write)
> +{
> +const BlockBackendPublic *blkp = blk_get_public(blk);
> +return blkp->pending_reqs[is_write];
> +}
> +
>  /* Return the next BlockBackend in the round-robin sequence with pending I/O
>   * requests.
>   *
> @@ -188,7 +204,7 @@ static BlockBackend *next_throttle_token(BlockBackend 
> *blk, bool is_write)
>  
>  /* get next bs round in round robin style */
>  token = throttle_group_next_blk(token);
> -while (token != start && !blkp->pending_reqs[is_write]) {
> +while (token != start && !blk_has_pending_reqs(token, is_write)) {
>  token = throttle_group_next_blk(token);
>  }
>  
> @@ -196,10 +212,13 @@ static BlockBackend *next_throttle_token(BlockBackend 
> *blk, bool is_write)
>   * then decide the token is the current bs because chances are
>   * the current bs get the current request queued.
>   */
> -if (token == start && !blkp->pending_reqs[is_write]) {
> +if (token == start && !blk_has_pending_reqs(token, is_write)) {
>  token = blk;
>  }
>  
> +/* Either we return the original BB, or one with pending requests */
> +assert(token == blk || blk_has_pending_reqs(token, is_write));

Nice. :-)

>  return token;
>  }
>  
> @@ -257,7 +276,7 @@ static void schedule_next_request(BlockBackend *blk, bool 
> is_write)
>  
>  /* Check if there's any pending request to schedule next */
>  token = next_throttle_token(blk, is_write);
> -if (!blkp->pending_reqs[is_write]) {
> +if (!blk_has_pending_reqs(token, is_write)) {
>  return;
>  }
>  
> @@ -271,7 +290,7 @@ static void schedule_next_request(BlockBackend *blk, bool 
> is_write)
>  qemu_co_queue_next(>throttled_reqs[is_write])) {
>  token = blk;
>  } else {
> -ThrottleTimers *tt = >throttle_timers;
> +ThrottleTimers *tt = _get_public(token)->throttle_timers;
>  int64_t now = qemu_clock_get_ns(tt->clock_type);
>  timer_mod(tt->timers[is_write], now + 1);
>  tg->any_timer_armed[is_write] = true;
> 

Reviewed-by: Paolo Bonzini 



Re: [Qemu-block] [v2 2/5] block/ssh: Add InetSocketAddress and accept it

2016-10-17 Thread Ashijeet Acharya
On Mon, Oct 17, 2016 at 9:23 PM, Kevin Wolf  wrote:
> Am 17.10.2016 um 17:44 hat Ashijeet Acharya geschrieben:
>> On Mon, Oct 17, 2016 at 6:27 PM, Kevin Wolf  wrote:
>> > Am 17.10.2016 um 14:33 hat Ashijeet Acharya geschrieben:
>> >> On Sun, Oct 16, 2016 at 4:00 AM, Max Reitz  wrote:
>> >> > On 15.10.2016 11:04, Ashijeet Acharya wrote:
>> >> >> Add InetSocketAddress compatibility to SSH driver.
>> >> >>
>> >> >> Add a new option "server" to the SSH block driver which then accepts
>> >> >> a InetSocketAddress.
>> >> >>
>> >> >> "host" and "port" are supported as legacy options and are mapped to
>> >> >> their InetSocketAddress representation.
>> >> >>
>> >> >> Signed-off-by: Ashijeet Acharya 
>> >> >> ---
>> >> >>  block/ssh.c | 83 
>> >> >> ++---
>> >> >>  1 file changed, 74 insertions(+), 9 deletions(-)
>> >> >>
>> >> >>
>> >> >>  /* Open the socket and connect. */
>> >> >>  s->sock = inet_connect(s->hostport, errp);
>> >> >> @@ -634,7 +698,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict 
>> >> >> *options,
>> >> >>  }
>> >> >>
>> >> >>  /* Check the remote host's key against known_hosts. */
>> >> >> -ret = check_host_key(s, host, port, host_key_check, errp);
>> >> >> +ret = check_host_key(s, s->inet->host, port, host_key_check,
>> >> >
>> >> > But then you're still using the port here... And I can't come up with a
>> >> > way (not even a bad one) to get the numeric port. Maybe interpret the
>> >> > addrinfo in inet_connect_saddr()? But getting that information out would
>> >> > be ugly, if even possible...
>> >> >
>> >> > So maybe the best is to keep it this way and put a FIXME above the
>> >> > atoi() call. :-/
>> >>
>> >> Kevin, I believe (after talking with Max) that regarding the atoi()
>> >> issue, I can't use any string to integer function since it won't
>> >> succeed for cases like port = 'ssh' and putting a FIXME over it seems
>> >> to be the only option. But Max did warn me, though, to get everybody's
>> >> opinion before I do so. So I am awaiting your response on this one.
>> >> Much better will be if you have a workaround solution in mind!! :-)
>> >
>> > The integer port is only needed for libssh2_knownhost_checkp(). One
>> > option could be to consider passing -1 instead:
>> >
>> > port is the port number used by the host (or a negative number to
>> > check the generic host). If the port number is given, libssh2 will
>> > check the key for the specific host + port number combination in
>> > addition to the plain host name only check.
>> >
>> > In 99% of the cases, this shouldn't make any difference.
>>
>> I think, its better to keep using atoi() and check if it returns a '0'
>>  value and display the error to the user to give the input as numeric.
>> This is possible since this will not clash with the possibility that
>> user gives the port input as port = '0' for no such port number exists
>> as far as I know. Will this work?
>
> It's fair enough. We will have a little inconsistency between ssh and
> other users of SocketAddress, but the driver never supported service
> names, so it isn't a regression either.

Great, I will do this change and post the patches shortly.

Ashijeet
>
> Kevin



[Qemu-block] [PATCH 2/2] qemu-iotests: Test I/O in a single drive from a throttling group

2016-10-17 Thread Alberto Garcia
iotest 093 contains a test that creates a throttling group with
several drives and performs I/O in all of them. This patch adds a new
test that creates a similar setup but only performs I/O in one of the
drives at the same time.

This is useful to test that the round robin algorithm is behaving
properly in these scenarios, and is specifically written using the
regression introduced in 27ccdd52598290f0f8b58be56e as an example.

Signed-off-by: Alberto Garcia 
---
 tests/qemu-iotests/093 | 33 -
 tests/qemu-iotests/093.out |  4 ++--
 2 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/tests/qemu-iotests/093 b/tests/qemu-iotests/093
index ffcb271..2ed393a 100755
--- a/tests/qemu-iotests/093
+++ b/tests/qemu-iotests/093
@@ -53,7 +53,7 @@ class ThrottleTestCase(iotests.QMPTestCase):
 result = self.vm.qmp("block_set_io_throttle", conv_keys=False, 
**params)
 self.assert_qmp(result, 'return', {})
 
-def do_test_throttle(self, ndrives, seconds, params):
+def do_test_throttle(self, ndrives, seconds, params, first_drive = 0):
 def check_limit(limit, num):
 # IO throttling algorithm is discrete, allow 10% error so the test
 # is more robust
@@ -85,12 +85,14 @@ class ThrottleTestCase(iotests.QMPTestCase):
 # Send I/O requests to all drives
 for i in range(rd_nr):
 for drive in range(0, ndrives):
-self.vm.hmp_qemu_io("drive%d" % drive, "aio_read %d %d" %
+idx = first_drive + drive
+self.vm.hmp_qemu_io("drive%d" % idx, "aio_read %d %d" %
 (i * rq_size, rq_size))
 
 for i in range(wr_nr):
 for drive in range(0, ndrives):
-self.vm.hmp_qemu_io("drive%d" % drive, "aio_write %d %d" %
+idx = first_drive + drive
+self.vm.hmp_qemu_io("drive%d" % idx, "aio_write %d %d" %
 (i * rq_size, rq_size))
 
 # We'll store the I/O stats for each drive in these arrays
@@ -105,15 +107,17 @@ class ThrottleTestCase(iotests.QMPTestCase):
 
 # Read the stats before advancing the clock
 for i in range(0, ndrives):
+idx = first_drive + i
 start_rd_bytes[i], start_rd_iops[i], start_wr_bytes[i], \
-start_wr_iops[i] = self.blockstats('drive%d' % i)
+start_wr_iops[i] = self.blockstats('drive%d' % idx)
 
 self.vm.qtest("clock_step %d" % ns)
 
 # Read the stats after advancing the clock
 for i in range(0, ndrives):
+idx = first_drive + i
 end_rd_bytes[i], end_rd_iops[i], end_wr_bytes[i], \
-end_wr_iops[i] = self.blockstats('drive%d' % i)
+end_wr_iops[i] = self.blockstats('drive%d' % idx)
 
 # Check that the I/O is within the limits and evenly distributed
 for i in range(0, ndrives):
@@ -129,6 +133,7 @@ class ThrottleTestCase(iotests.QMPTestCase):
 self.assertTrue(check_limit(params['iops_rd'], rd_iops))
 self.assertTrue(check_limit(params['iops_wr'], wr_iops))
 
+# Connect N drives to a VM and test I/O in all of them
 def test_all(self):
 params = {"bps": 4096,
   "bps_rd": 4096,
@@ -146,6 +151,24 @@ class ThrottleTestCase(iotests.QMPTestCase):
 self.configure_throttle(ndrives, limits)
 self.do_test_throttle(ndrives, 5, limits)
 
+# Connect N drives to a VM and test I/O in just one of them a time
+def test_one(self):
+params = {"bps": 4096,
+  "bps_rd": 4096,
+  "bps_wr": 4096,
+  "iops": 10,
+  "iops_rd": 10,
+  "iops_wr": 10,
+ }
+# Repeat the test for each one of the drives
+for drive in range(0, self.max_drives):
+# Pick each out of all possible params and test
+for tk in params:
+limits = dict([(k, 0) for k in params])
+limits[tk] = params[tk] * self.max_drives
+self.configure_throttle(self.max_drives, limits)
+self.do_test_throttle(1, 5, limits, drive)
+
 def test_burst(self):
 params = {"bps": 4096,
   "bps_rd": 4096,
diff --git a/tests/qemu-iotests/093.out b/tests/qemu-iotests/093.out
index 914e373..2f7d390 100644
--- a/tests/qemu-iotests/093.out
+++ b/tests/qemu-iotests/093.out
@@ -1,5 +1,5 @@
-.
+...
 --
-Ran 5 tests
+Ran 7 tests
 
 OK
-- 
2.9.3




[Qemu-block] [PATCH 1/2] throttle: Correct access to wrong BlockBackendPublic structures

2016-10-17 Thread Alberto Garcia
In 27ccdd52598290f0f8b58be56e235aff7aebfaf3 the throttling fields were
moved from BlockDriverState to BlockBackend. However in a few cases
the code started using throttling fields from the active BlockBackend
instead of the round-robin token, making the algorithm behave
incorrectly.

This can cause starvation if there's a throttling group with several
drives but only one of them has I/O.

Reported-by: Paolo Bonzini 
Signed-off-by: Alberto Garcia 
---
 block/throttle-groups.c | 27 +++
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index 59545e2..17b2efb 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -168,6 +168,22 @@ static BlockBackend *throttle_group_next_blk(BlockBackend 
*blk)
 return blk_by_public(next);
 }
 
+/*
+ * Return whether a BlockBackend has pending requests.
+ *
+ * This assumes that tg->lock is held.
+ *
+ * @blk: the BlockBackend
+ * @is_write:  the type of operation (read/write)
+ * @ret:   whether the BlockBackend has pending requests.
+ */
+static inline bool blk_has_pending_reqs(BlockBackend *blk,
+bool is_write)
+{
+const BlockBackendPublic *blkp = blk_get_public(blk);
+return blkp->pending_reqs[is_write];
+}
+
 /* Return the next BlockBackend in the round-robin sequence with pending I/O
  * requests.
  *
@@ -188,7 +204,7 @@ static BlockBackend *next_throttle_token(BlockBackend *blk, 
bool is_write)
 
 /* get next bs round in round robin style */
 token = throttle_group_next_blk(token);
-while (token != start && !blkp->pending_reqs[is_write]) {
+while (token != start && !blk_has_pending_reqs(token, is_write)) {
 token = throttle_group_next_blk(token);
 }
 
@@ -196,10 +212,13 @@ static BlockBackend *next_throttle_token(BlockBackend 
*blk, bool is_write)
  * then decide the token is the current bs because chances are
  * the current bs get the current request queued.
  */
-if (token == start && !blkp->pending_reqs[is_write]) {
+if (token == start && !blk_has_pending_reqs(token, is_write)) {
 token = blk;
 }
 
+/* Either we return the original BB, or one with pending requests */
+assert(token == blk || blk_has_pending_reqs(token, is_write));
+
 return token;
 }
 
@@ -257,7 +276,7 @@ static void schedule_next_request(BlockBackend *blk, bool 
is_write)
 
 /* Check if there's any pending request to schedule next */
 token = next_throttle_token(blk, is_write);
-if (!blkp->pending_reqs[is_write]) {
+if (!blk_has_pending_reqs(token, is_write)) {
 return;
 }
 
@@ -271,7 +290,7 @@ static void schedule_next_request(BlockBackend *blk, bool 
is_write)
 qemu_co_queue_next(>throttled_reqs[is_write])) {
 token = blk;
 } else {
-ThrottleTimers *tt = >throttle_timers;
+ThrottleTimers *tt = _get_public(token)->throttle_timers;
 int64_t now = qemu_clock_get_ns(tt->clock_type);
 timer_mod(tt->timers[is_write], now + 1);
 tg->any_timer_armed[is_write] = true;
-- 
2.9.3




[Qemu-block] [PATCH 0/2] Correct access to wrong BlockBackendPublic structures

2016-10-17 Thread Alberto Garcia
Hi all,

Paolo found that commit 27ccdd52598290f introduced a regression in the
throttling code.

It can be easily reproduced in scenarios where you have a throttling
group with several drives but you only write to one of them. In that
case the round-robin algorithm can select the wrong drive all the time
and the actual requests are never completed.

QEMU 2.7 is affected, here's the patch to fix it, plus a test case.

Thanks,

Berto

Alberto Garcia (2):
  throttle: Correct access to wrong BlockBackendPublic structures
  qemu-iotests: Test I/O in a single drive from a throttling group

 block/throttle-groups.c| 27 +++
 tests/qemu-iotests/093 | 33 -
 tests/qemu-iotests/093.out |  4 ++--
 3 files changed, 53 insertions(+), 11 deletions(-)

-- 
2.9.3




Re: [Qemu-block] [v2 2/5] block/ssh: Add InetSocketAddress and accept it

2016-10-17 Thread Kevin Wolf
Am 17.10.2016 um 17:44 hat Ashijeet Acharya geschrieben:
> On Mon, Oct 17, 2016 at 6:27 PM, Kevin Wolf  wrote:
> > Am 17.10.2016 um 14:33 hat Ashijeet Acharya geschrieben:
> >> On Sun, Oct 16, 2016 at 4:00 AM, Max Reitz  wrote:
> >> > On 15.10.2016 11:04, Ashijeet Acharya wrote:
> >> >> Add InetSocketAddress compatibility to SSH driver.
> >> >>
> >> >> Add a new option "server" to the SSH block driver which then accepts
> >> >> a InetSocketAddress.
> >> >>
> >> >> "host" and "port" are supported as legacy options and are mapped to
> >> >> their InetSocketAddress representation.
> >> >>
> >> >> Signed-off-by: Ashijeet Acharya 
> >> >> ---
> >> >>  block/ssh.c | 83 
> >> >> ++---
> >> >>  1 file changed, 74 insertions(+), 9 deletions(-)
> >> >>
> >> >>
> >> >>  /* Open the socket and connect. */
> >> >>  s->sock = inet_connect(s->hostport, errp);
> >> >> @@ -634,7 +698,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict 
> >> >> *options,
> >> >>  }
> >> >>
> >> >>  /* Check the remote host's key against known_hosts. */
> >> >> -ret = check_host_key(s, host, port, host_key_check, errp);
> >> >> +ret = check_host_key(s, s->inet->host, port, host_key_check,
> >> >
> >> > But then you're still using the port here... And I can't come up with a
> >> > way (not even a bad one) to get the numeric port. Maybe interpret the
> >> > addrinfo in inet_connect_saddr()? But getting that information out would
> >> > be ugly, if even possible...
> >> >
> >> > So maybe the best is to keep it this way and put a FIXME above the
> >> > atoi() call. :-/
> >>
> >> Kevin, I believe (after talking with Max) that regarding the atoi()
> >> issue, I can't use any string to integer function since it won't
> >> succeed for cases like port = 'ssh' and putting a FIXME over it seems
> >> to be the only option. But Max did warn me, though, to get everybody's
> >> opinion before I do so. So I am awaiting your response on this one.
> >> Much better will be if you have a workaround solution in mind!! :-)
> >
> > The integer port is only needed for libssh2_knownhost_checkp(). One
> > option could be to consider passing -1 instead:
> >
> > port is the port number used by the host (or a negative number to
> > check the generic host). If the port number is given, libssh2 will
> > check the key for the specific host + port number combination in
> > addition to the plain host name only check.
> >
> > In 99% of the cases, this shouldn't make any difference.
> 
> I think, its better to keep using atoi() and check if it returns a '0'
>  value and display the error to the user to give the input as numeric.
> This is possible since this will not clash with the possibility that
> user gives the port input as port = '0' for no such port number exists
> as far as I know. Will this work?

It's fair enough. We will have a little inconsistency between ssh and
other users of SocketAddress, but the driver never supported service
names, so it isn't a regression either.

Kevin

> Ashijeet
> > Alternatively it could be possible to use getservbyname() to get the
> > port number from the name, but maybe that's a bit too much for a feature
> > that most people don't even know of.
> >
> > I'm also not completely opposed to simply requiring a numeric argument
> > for SSH. There is no real use to support service names here other than
> > being consistent with other places in qemu.
> >
> > Kevin



Re: [Qemu-block] [v2 2/5] block/ssh: Add InetSocketAddress and accept it

2016-10-17 Thread Ashijeet Acharya
On Mon, Oct 17, 2016 at 6:27 PM, Kevin Wolf  wrote:
> Am 17.10.2016 um 14:33 hat Ashijeet Acharya geschrieben:
>> On Sun, Oct 16, 2016 at 4:00 AM, Max Reitz  wrote:
>> > On 15.10.2016 11:04, Ashijeet Acharya wrote:
>> >> Add InetSocketAddress compatibility to SSH driver.
>> >>
>> >> Add a new option "server" to the SSH block driver which then accepts
>> >> a InetSocketAddress.
>> >>
>> >> "host" and "port" are supported as legacy options and are mapped to
>> >> their InetSocketAddress representation.
>> >>
>> >> Signed-off-by: Ashijeet Acharya 
>> >> ---
>> >>  block/ssh.c | 83 
>> >> ++---
>> >>  1 file changed, 74 insertions(+), 9 deletions(-)
>> >>
>> >>
>> >>  /* Open the socket and connect. */
>> >>  s->sock = inet_connect(s->hostport, errp);
>> >> @@ -634,7 +698,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict 
>> >> *options,
>> >>  }
>> >>
>> >>  /* Check the remote host's key against known_hosts. */
>> >> -ret = check_host_key(s, host, port, host_key_check, errp);
>> >> +ret = check_host_key(s, s->inet->host, port, host_key_check,
>> >
>> > But then you're still using the port here... And I can't come up with a
>> > way (not even a bad one) to get the numeric port. Maybe interpret the
>> > addrinfo in inet_connect_saddr()? But getting that information out would
>> > be ugly, if even possible...
>> >
>> > So maybe the best is to keep it this way and put a FIXME above the
>> > atoi() call. :-/
>>
>> Kevin, I believe (after talking with Max) that regarding the atoi()
>> issue, I can't use any string to integer function since it won't
>> succeed for cases like port = 'ssh' and putting a FIXME over it seems
>> to be the only option. But Max did warn me, though, to get everybody's
>> opinion before I do so. So I am awaiting your response on this one.
>> Much better will be if you have a workaround solution in mind!! :-)
>
> The integer port is only needed for libssh2_knownhost_checkp(). One
> option could be to consider passing -1 instead:
>
> port is the port number used by the host (or a negative number to
> check the generic host). If the port number is given, libssh2 will
> check the key for the specific host + port number combination in
> addition to the plain host name only check.
>
> In 99% of the cases, this shouldn't make any difference.

I think, its better to keep using atoi() and check if it returns a '0'
 value and display the error to the user to give the input as numeric.
This is possible since this will not clash with the possibility that
user gives the port input as port = '0' for no such port number exists
as far as I know. Will this work?

Ashijeet
> Alternatively it could be possible to use getservbyname() to get the
> port number from the name, but maybe that's a bit too much for a feature
> that most people don't even know of.
>
> I'm also not completely opposed to simply requiring a numeric argument
> for SSH. There is no real use to support service names here other than
> being consistent with other places in qemu.
>
> Kevin



Re: [Qemu-block] [Qemu-devel] [PATCH v14 13/21] qdict: allow qdict_crumple to accept compound types as values

2016-10-17 Thread Paolo Bonzini
> For me, a replacement should do structured values by providing suitable
> *value* syntax instead of hacking it into the keys:
> 
> { "driver": "quorum",
>   "child": [ { "driver": "file", "filename": "disk1.img" },
>  { "driver": "host_device", "filename=/dev/sdb" },
>  { "driver": "nbd", "host": "localhost" } ] }
> 
> Note how the structure is obvious.  It isn't with dotted keys, not even
> if you order them sensibly (which users inevitably won't).
> 
> Also not that the value needs to be parsed by QemuOpts!  You can't leave
> it to the consumer of the QemuOpts, because if you did, you'd have to
> escape the commas.
> 
> If you'd rather invent syntax closer to QemuOpts than reuse JSON, you
> could try
> 
> driver=quorum,
> child=[{ driver=file, filename=disk1.img },
>{ driver=host_device, filename=/dev/sdb },
>{ driver=nbd, host=localhost } ]
> 
> I'd go with some existing syntax, though.  The one we already use is
> JSON.

In fact there is already "filename=json:{...}" support in the block layer.
By the way, abuse of QemuOpts dates back to http://wiki.qemu.org/Features/QCFG.

> Your dotted key convention requires two rules: 1. names must not look
> like integers, and 2. names must not contain '.'.
> 
> We can avoid rule 2 by requiring '.' to be escaped.  Dan's
> qdict_crumple() currently does that, to your surprise.  Adding the
> escaping to existing options is a compatibility break, however.  So, if
> names with '.' already exist, we can either break compatibility by
> renaming them, or break it by requiring the '.' to be escaped.

> * "device", qemu_device_opts in qdev-monitor.c
> 
>   This one pulls in qdev properties.  Properties violating rule 2 exist.

Which are they?  Only bus names?

> * "object", qemu_object_opts in vl.c
> 
>   This one pulls in properties of user-creatable objects.
> 
> * "machine", qemu_machine_opts in vl.c
> 
>   This one pulls in machine properties.

> > > I know we have keys containing '.' elsewhere, e.g. device "macio-ide"
> > > property "ide.0".  Our chronic inability to consistently restrict names
> > > in ABI to something sane is beyond foolish.
> >
> > I wanted to have a look at this example, but I can only find the string
> > "ide.0" used as a bus name in the sources, that is, a value rather than
> > a key.
> >
> > Do you have a pointer to the property definition that you mean?
> 
> We've gotten better at hiding property definitions...
> 
> "qemu-system-ppc -device macio-ide,help" shows the property:
> 
> macio-ide.ide.0=child

It is a bug that this property is shown in the help, because it's not
assignable (same for all other child<> properties).  I'd rather declare
other occurrences of "." in user-accessible property names to be bugs,
and break the ABI if there are any.

Paolo



Re: [Qemu-block] [Qemu-devel] [PATCH v14 13/21] qdict: allow qdict_crumple to accept compound types as values

2016-10-17 Thread Markus Armbruster
Kevin Wolf  writes:

> Am 13.10.2016 um 14:35 hat Markus Armbruster geschrieben:
>> Cc: Kevin for discussion of QemuOpts dotted key convention
>> 
>> "Daniel P. Berrange"  writes:
>> 
>> > Currently qdict_crumple requires a totally flat QDict as its
>> > input. i.e. all values in the QDict must be scalar types.
>> >
>> > In order to have backwards compatibility with the OptsVisitor,
>> > qemu_opt_to_qdict() has a new mode where it may return a QList
>> > for values in the QDict, if there was a repeated key. We thus
>> > need to allow compound types to appear as values in the input
>> > dict given to qdict_crumple().
>> >
>> > To avoid confusion, we sanity check that the user has not mixed
>> > the old and new syntax at the same time. e.g. these are allowed
>> >
>> >foo=hello,foo=world,foo=wibble
>> >foo.0=hello,foo.1=world,foo.2=wibble
>> >
>> > but this is forbidden
>> >
>> >foo=hello,foo=world,foo.2=wibble
>> 
>> I understand the need for foo.bar=val.  It makes it possible to specify
>> nested dictionaries with QemuOpts.
>> 
>> The case for foo.0=val is less clear.  QemuOpts already supports lists,
>> by repeating keys.  Why do we need a second, wordier way to specify
>> them?
>
> Probably primarily because someone didn't realise this when introducing
> the dotted syntax.

Can't even blame "someone" for that; it's an obscure, underdocumented
feature of an interface that's collapsing under its load of obscure,
underdocumented features.

On the other hand, that's not exactly a state that allows for *more*
obscure features.

>Also because flat QDicts can't represent this.

Explain?

> But even if I realised that QemuOpts support this syntax, I think we
> would still have to use the dotted syntax because it's explicit about
> the index and we need that because the list can contains dicts.
>
> Compare this:
>
> driver=quorum,
> child.0.driver=file,child.0.filename=disk1.img,
> child.1.driver=host_device,child.1.filename=/dev/sdb,
> child.2.driver=nbd,child.2.host=localhost
>
> And this:
>
> driver=quorum,
> child.driver=file,child.filename=disk1.img,
> child.driver=host_device,child.filename=/dev/sdb,
> child.driver=nbd,child.host=localhost

Aside: both are about equally illegible, to be honest.

> For starters, can we really trust the order in QemuOpts so that the
> right driver and filename are associated with each other?

The order is trustworthy, but...

> We would also have code to associate the third child.driver with the
> first child.host (because file and host_device don't have a host
> property). And this isn't even touching optional arguments yet. Would
> you really want to implement or review this?

... you're right, doing lists by repeating keys breaks down when
combined with the dotted key convention's use of repetition to do
structured values.

Permit me to digress.

QemuOpts wasn't designed for list-values keys.  Doing lists by
repetition was clever.

QemuOpts wasn't designed for structured values.  Doing structured values
by a dotted key convention plus repetition was clever.

And there's the problem: too much cleverness, not enough "this is being
pushed way beyond its design limits, time to replace it".

For me, a replacement should do structured values by providing suitable
*value* syntax instead of hacking it into the keys:

{ "driver": "quorum",
  "child": [ { "driver": "file", "filename": "disk1.img" },
 { "driver": "host_device", "filename=/dev/sdb" },
 { "driver": "nbd", "host": "localhost" } ] }

Note how the structure is obvious.  It isn't with dotted keys, not even
if you order them sensibly (which users inevitably won't).

Also not that the value needs to be parsed by QemuOpts!  You can't leave
it to the consumer of the QemuOpts, because if you did, you'd have to
escape the commas.

If you'd rather invent syntax closer to QemuOpts than reuse JSON, you
could try

driver=quorum,
child=[{ driver=file, filename=disk1.img },
   { driver=host_device, filename=/dev/sdb },
   { driver=nbd, host=localhost } ]

I'd go with some existing syntax, though.  The one we already use is
JSON.

End of digression.

>> Note that this second way creates entirely new failure modes and
>> restrictions.  Let me show using an example derived from one in
>> qdict_crumple()'s contract:
>> 
>> foo.0.bar=bla,foo.eek.bar=blubb
>> 
>> Without the dotted key convention, this is perfectly fine: key
>> "foo.0.bar" has the single value "bla", and key "foo.eek.bar" has
>> the single value "blubb".  Equivalent JSON would be
>> 
>>   { "foo.0.bar": "bla", "foo.eek.bar": "blubb" }
>> 
>> With just the struct convention, it's still fine: it obviously means
>> the same as JSON
>> 
>>   { "foo": { "0": { "bar": "bla" }, "eek": { "bar": "blubb" } } }
>> 
>> Adding the list convention makes it invalid.  It also outlaws a
>>   

[Qemu-block] [PATCH 18/20] iothread: release AioContext around aio_poll

2016-10-17 Thread Paolo Bonzini
This is the first step towards having fine-grained critical sections in
dataplane threads, which will resolve lock ordering problems between
address_space_* functions (which need the BQL when doing MMIO, even
after we complete RCU-based dispatch) and the AioContext.

Because AioContext does not use contention callbacks anymore, the
unit test has to be changed.

Previously applied as a0710f7995f914e3044e5899bd8ff6c43c62f916 and
then reverted.

Reviewed-by: Fam Zheng 
Signed-off-by: Paolo Bonzini 
---
 async.c | 22 +++---
 docs/multiple-iothreads.txt | 40 +++-
 include/block/aio.h |  3 ---
 iothread.c  | 11 ++-
 tests/test-aio.c| 22 ++
 5 files changed, 42 insertions(+), 56 deletions(-)

diff --git a/async.c b/async.c
index fb37b03..27db772 100644
--- a/async.c
+++ b/async.c
@@ -107,8 +107,8 @@ int aio_bh_poll(AioContext *ctx)
  * aio_notify again if necessary.
  */
 if (atomic_xchg(>scheduled, 0)) {
-/* Idle BHs and the notify BH don't count as progress */
-if (!bh->idle && bh != ctx->notify_dummy_bh) {
+/* Idle BHs don't count as progress */
+if (!bh->idle) {
 ret = 1;
 }
 bh->idle = 0;
@@ -260,7 +260,6 @@ aio_ctx_finalize(GSource *source)
 {
 AioContext *ctx = (AioContext *) source;
 
-qemu_bh_delete(ctx->notify_dummy_bh);
 thread_pool_free(ctx->thread_pool);
 
 #ifdef CONFIG_LINUX_AIO
@@ -346,19 +345,6 @@ static void aio_timerlist_notify(void *opaque)
 aio_notify(opaque);
 }
 
-static void aio_rfifolock_cb(void *opaque)
-{
-AioContext *ctx = opaque;
-
-/* Kick owner thread in case they are blocked in aio_poll() */
-qemu_bh_schedule(ctx->notify_dummy_bh);
-}
-
-static void notify_dummy_bh(void *opaque)
-{
-/* Do nothing, we were invoked just to force the event loop to iterate */
-}
-
 static void event_notifier_dummy_cb(EventNotifier *e)
 {
 }
@@ -386,11 +372,9 @@ AioContext *aio_context_new(Error **errp)
 #endif
 ctx->thread_pool = NULL;
 qemu_mutex_init(>bh_lock);
-rfifolock_init(>lock, aio_rfifolock_cb, ctx);
+rfifolock_init(>lock, NULL, NULL);
 timerlistgroup_init(>tlg, aio_timerlist_notify, ctx);
 
-ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL);
-
 return ctx;
 fail:
 g_source_destroy(>source);
diff --git a/docs/multiple-iothreads.txt b/docs/multiple-iothreads.txt
index 40b8419..0e7cdb2 100644
--- a/docs/multiple-iothreads.txt
+++ b/docs/multiple-iothreads.txt
@@ -105,13 +105,10 @@ a BH in the target AioContext beforehand and then call 
qemu_bh_schedule().  No
 acquire/release or locking is needed for the qemu_bh_schedule() call.  But be
 sure to acquire the AioContext for aio_bh_new() if necessary.
 
-The relationship between AioContext and the block layer

-The AioContext originates from the QEMU block layer because it provides a
-scoped way of running event loop iterations until all work is done.  This
-feature is used to complete all in-flight block I/O requests (see
-bdrv_drain_all()).  Nowadays AioContext is a generic event loop that can be
-used by any QEMU subsystem.
+AioContext and the block layer
+--
+The AioContext originates from the QEMU block layer, even though nowadays
+AioContext is a generic event loop that can be used by any QEMU subsystem.
 
 The block layer has support for AioContext integrated.  Each BlockDriverState
 is associated with an AioContext using bdrv_set_aio_context() and
@@ -122,13 +119,22 @@ Block layer code must therefore expect to run in an 
IOThread and avoid using
 old APIs that implicitly use the main loop.  See the "How to program for
 IOThreads" above for information on how to do that.
 
-If main loop code such as a QMP function wishes to access a BlockDriverState it
-must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the
-IOThread does not run in parallel.
-
-Long-running jobs (usually in the form of coroutines) are best scheduled in the
-BlockDriverState's AioContext to avoid the need to acquire/release around each
-bdrv_*() call.  Be aware that there is currently no mechanism to get notified
-when bdrv_set_aio_context() moves this BlockDriverState to a different
-AioContext (see bdrv_detach_aio_context()/bdrv_attach_aio_context()), so you
-may need to add this if you want to support long-running jobs.
+If main loop code such as a QMP function wishes to access a BlockDriverState
+it must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure
+that callbacks in the IOThread do not run in parallel.
+
+Code running in the monitor typically needs to ensure that past
+requests from the guest are completed.  When a block device is running
+in an IOThread, the IOThread can also process 

[Qemu-block] [PATCH 17/20] block: only call aio_poll on the current thread's AioContext

2016-10-17 Thread Paolo Bonzini
aio_poll is not thread safe; for example bdrv_drain can hang if
the last in-flight I/O operation is completed in the I/O thread after
the main thread has checked bs->in_flight.

The bug remains latent as long as all of it is called within
aio_context_acquire/aio_context_release, but this will change soon.

To fix this, if bdrv_drain is called from outside the I/O thread,
signal the main AioContext through a dummy bottom half.  The event
loop then only runs in the I/O thread.

Reviewed-by: Fam Zheng 
Signed-off-by: Paolo Bonzini 
---
v1->v2: bdrv_wakeup moved here and documented [Stefan]

 async.c |  1 +
 block.c |  2 ++
 block/io.c  | 12 
 block/nfs.c |  1 +
 block/sheepdog.c|  3 +++
 hw/scsi/virtio-scsi-dataplane.c |  4 +---
 include/block/block.h   | 24 +---
 include/block/block_int.h   | 17 +
 8 files changed, 58 insertions(+), 6 deletions(-)

diff --git a/async.c b/async.c
index f30d011..fb37b03 100644
--- a/async.c
+++ b/async.c
@@ -61,6 +61,7 @@ void aio_bh_schedule_oneshot(AioContext *ctx, QEMUBHFunc *cb, 
void *opaque)
 smp_wmb();
 ctx->first_bh = bh;
 qemu_mutex_unlock(>bh_lock);
+aio_notify(ctx);
 }
 
 QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
diff --git a/block.c b/block.c
index fbe485c..a17baab 100644
--- a/block.c
+++ b/block.c
@@ -2090,7 +2090,9 @@ int bdrv_reopen_multiple(AioContext *ctx, 
BlockReopenQueue *bs_queue, Error **er
 
 assert(bs_queue != NULL);
 
+aio_context_release(ctx);
 bdrv_drain_all();
+aio_context_acquire(ctx);
 
 QSIMPLEQ_FOREACH(bs_entry, bs_queue, entry) {
 if (bdrv_reopen_prepare(_entry->state, bs_queue, _err)) {
diff --git a/block/io.c b/block/io.c
index f0682dd..91f8a08 100644
--- a/block/io.c
+++ b/block/io.c
@@ -474,9 +474,21 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
 atomic_inc(>in_flight);
 }
 
+static void dummy_bh_cb(void *opaque)
+{
+}
+
+void bdrv_wakeup(BlockDriverState *bs)
+{
+if (bs->wakeup) {
+aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
+}
+}
+
 void bdrv_dec_in_flight(BlockDriverState *bs)
 {
 atomic_dec(>in_flight);
+bdrv_wakeup(bs);
 }
 
 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
diff --git a/block/nfs.c b/block/nfs.c
index 7474fbc..88c60a9 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -505,6 +505,7 @@ nfs_get_allocated_file_size_cb(int ret, struct nfs_context 
*nfs, void *data,
 error_report("NFS Error: %s", nfs_get_error(nfs));
 }
 task->complete = 1;
+bdrv_wakeup(task->bs);
 }
 
 static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
diff --git a/block/sheepdog.c b/block/sheepdog.c
index 16a5c1c..1fb9173 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -702,6 +702,9 @@ out:
 
 srco->ret = ret;
 srco->finished = true;
+if (srco->bs) {
+bdrv_wakeup(srco->bs);
+}
 }
 
 /*
diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c
index b173b94..9424f0e 100644
--- a/hw/scsi/virtio-scsi-dataplane.c
+++ b/hw/scsi/virtio-scsi-dataplane.c
@@ -189,13 +189,11 @@ void virtio_scsi_dataplane_stop(VirtIOSCSI *s)
 assert(s->ctx == iothread_get_aio_context(vs->conf.iothread));
 
 aio_context_acquire(s->ctx);
-
 virtio_scsi_clear_aio(s);
+aio_context_release(s->ctx);
 
 blk_drain_all(); /* ensure there are no in-flight requests */
 
-aio_context_release(s->ctx);
-
 for (i = 0; i < vs->conf.num_queues + 2; i++) {
 virtio_bus_set_host_notifier(VIRTIO_BUS(qbus), i, false);
 }
diff --git a/include/block/block.h b/include/block/block.h
index fb86611..df25665 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -343,9 +343,27 @@ void bdrv_drain_all(void);
 #define BDRV_POLL_WHILE(bs, cond) ({   \
 bool waited_ = false;  \
 BlockDriverState *bs_ = (bs);  \
-while ((cond)) {   \
-aio_poll(bdrv_get_aio_context(bs_), true); \
-waited_ = true;\
+AioContext *ctx_ = bdrv_get_aio_context(bs_);  \
+if (aio_context_in_iothread(ctx_)) {   \
+while ((cond)) {   \
+aio_poll(ctx_, true);  \
+waited_ = true;\
+}  \
+} else {   \
+assert(qemu_get_current_aio_context() ==   \
+   qemu_get_aio_context());\
+/* Ask bdrv_dec_in_flight to wake up the main  \
+ * QEMU AioContext.   

[Qemu-block] [PATCH 15/20] qemu-io: acquire AioContext

2016-10-17 Thread Paolo Bonzini
Signed-off-by: Paolo Bonzini 
---
v1->v2: new [qemu-iotests]

 qemu-io-cmds.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index dbe0413..4750e9a 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -2216,6 +2216,7 @@ static const cmdinfo_t help_cmd = {
 
 bool qemuio_command(BlockBackend *blk, const char *cmd)
 {
+AioContext *ctx;
 char *input;
 const cmdinfo_t *ct;
 char **v;
@@ -2227,7 +2228,10 @@ bool qemuio_command(BlockBackend *blk, const char *cmd)
 if (c) {
 ct = find_command(v[0]);
 if (ct) {
+ctx = blk ? blk_get_aio_context(blk) : qemu_get_aio_context();
+aio_context_acquire(ctx);
 done = command(blk, ct, c, v);
+aio_context_release(ctx);
 } else {
 fprintf(stderr, "command \"%s\" not found\n", v[0]);
 }
-- 
2.7.4





[Qemu-block] [PATCH 12/20] iothread: detach all block devices before stopping them

2016-10-17 Thread Paolo Bonzini
Soon bdrv_drain will not call aio_poll itself on iothreads.  If block
devices are left hanging off the iothread's AioContext, there will be no
one to do I/O for those poor devices.

Signed-off-by: Paolo Bonzini 
---
v1->v2: add "if ... continue" [Fam]

 iothread.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/iothread.c b/iothread.c
index 62c8796..fdfb440 100644
--- a/iothread.c
+++ b/iothread.c
@@ -16,6 +16,7 @@
 #include "qom/object_interfaces.h"
 #include "qemu/module.h"
 #include "block/aio.h"
+#include "block/block.h"
 #include "sysemu/iothread.h"
 #include "qmp-commands.h"
 #include "qemu/error-report.h"
@@ -199,6 +200,18 @@ IOThreadInfoList *qmp_query_iothreads(Error **errp)
 void iothread_stop_all(void)
 {
 Object *container = object_get_objects_root();
+BlockDriverState *bs;
+BdrvNextIterator it;
+
+for (bs = bdrv_first(); bs; bs = bdrv_next()) {
+AioContext *ctx = bdrv_get_aio_context(bs);
+if (ctx == qemu_get_aio_context()) {
+continue;
+}
+aio_context_acquire(ctx);
+bdrv_set_aio_context(bs, qemu_get_aio_context());
+aio_context_release(ctx);
+}
 
 object_child_foreach(container, iothread_stop, NULL);
 }
-- 
2.7.4





[Qemu-block] [PATCH 05/20] block: change drain to look only at one child at a time

2016-10-17 Thread Paolo Bonzini
bdrv_requests_pending is checking children to also wait until internal
requests (such as metadata writes) have completed.  However, checking
children is in general overkill.  Children requests can be of two kinds:

- requests caused by an operation on bs, e.g. a bdrv_aio_write to bs
causing a write to bs->file->bs.  In this case, the parent's in_flight
count will always be incremented by at least one for every request in
the child.

- asynchronous metadata writes or flushes.  Such writes can be started
even if bs's in_flight count is zero, but not after the .bdrv_drain
callback has been invoked.

This patch therefore changes bdrv_drain to finish I/O in the parent
(after which the parent's in_flight will be locked to zero), call
bdrv_drain (after which the parent will not generate I/O on the child
anymore), and then wait for internal I/O in the children to complete.

Signed-off-by: Paolo Bonzini 
---
 block/io.c | 47 ++-
 1 file changed, 26 insertions(+), 21 deletions(-)

diff --git a/block/io.c b/block/io.c
index 8d46d8b..afec968 100644
--- a/block/io.c
+++ b/block/io.c
@@ -156,16 +156,33 @@ bool bdrv_requests_pending(BlockDriverState *bs)
 return false;
 }
 
-static void bdrv_drain_recurse(BlockDriverState *bs)
+static bool bdrv_drain_poll(BlockDriverState *bs)
+{
+bool waited = false;
+
+while (atomic_read(>in_flight) > 0) {
+aio_poll(bdrv_get_aio_context(bs), true);
+waited = true;
+}
+return waited;
+}
+
+static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
 BdrvChild *child;
+bool waited;
+
+waited = bdrv_drain_poll(bs);
 
 if (bs->drv && bs->drv->bdrv_drain) {
 bs->drv->bdrv_drain(bs);
 }
+
 QLIST_FOREACH(child, >children, next) {
-bdrv_drain_recurse(child->bs);
+waited |= bdrv_drain_recurse(child->bs);
 }
+
+return waited;
 }
 
 typedef struct {
@@ -174,14 +191,6 @@ typedef struct {
 bool done;
 } BdrvCoDrainData;
 
-static void bdrv_drain_poll(BlockDriverState *bs)
-{
-while (bdrv_requests_pending(bs)) {
-/* Keep iterating */
-aio_poll(bdrv_get_aio_context(bs), true);
-}
-}
-
 static void bdrv_co_drain_bh_cb(void *opaque)
 {
 BdrvCoDrainData *data = opaque;
@@ -189,7 +198,7 @@ static void bdrv_co_drain_bh_cb(void *opaque)
 BlockDriverState *bs = data->bs;
 
 bdrv_dec_in_flight(bs);
-bdrv_drain_poll(bs);
+bdrv_drained_begin(bs);
 data->done = true;
 qemu_coroutine_enter(co);
 }
@@ -220,6 +229,11 @@ static void coroutine_fn 
bdrv_co_yield_to_drain(BlockDriverState *bs)
 
 void bdrv_drained_begin(BlockDriverState *bs)
 {
+if (qemu_in_coroutine()) {
+bdrv_co_yield_to_drain(bs);
+return;
+}
+
 if (!bs->quiesce_counter++) {
 aio_disable_external(bdrv_get_aio_context(bs));
 bdrv_parent_drained_begin(bs);
@@ -227,11 +241,6 @@ void bdrv_drained_begin(BlockDriverState *bs)
 
 bdrv_io_unplugged_begin(bs);
 bdrv_drain_recurse(bs);
-if (qemu_in_coroutine()) {
-bdrv_co_yield_to_drain(bs);
-} else {
-bdrv_drain_poll(bs);
-}
 bdrv_io_unplugged_end(bs);
 }
 
@@ -299,7 +308,6 @@ void bdrv_drain_all(void)
 aio_context_acquire(aio_context);
 bdrv_parent_drained_begin(bs);
 bdrv_io_unplugged_begin(bs);
-bdrv_drain_recurse(bs);
 aio_context_release(aio_context);
 
 if (!g_slist_find(aio_ctxs, aio_context)) {
@@ -322,10 +330,7 @@ void bdrv_drain_all(void)
 aio_context_acquire(aio_context);
 for (bs = bdrv_first(); bs; bs = bdrv_next()) {
 if (aio_context == bdrv_get_aio_context(bs)) {
-if (bdrv_requests_pending(bs)) {
-aio_poll(aio_context, true);
-waited = true;
-}
+waited |= bdrv_drain_recurse(bs);
 }
 }
 aio_context_release(aio_context);
-- 
2.7.4





[Qemu-block] [PATCH 20/20] aio: convert from RFifoLock to QemuRecMutex

2016-10-17 Thread Paolo Bonzini
It is simpler and a bit faster, and QEMU does not need the contention
callbacks (and thus the fairness) anymore.

Reviewed-by: Fam Zheng 
Signed-off-by: Paolo Bonzini 
---
 async.c  |  8 ++---
 include/block/aio.h  |  3 +-
 include/qemu/rfifolock.h | 54 
 tests/.gitignore |  1 -
 tests/Makefile.include   |  2 --
 tests/test-rfifolock.c   | 91 
 util/Makefile.objs   |  1 -
 util/rfifolock.c | 78 -
 8 files changed, 5 insertions(+), 233 deletions(-)
 delete mode 100644 include/qemu/rfifolock.h
 delete mode 100644 tests/test-rfifolock.c
 delete mode 100644 util/rfifolock.c

diff --git a/async.c b/async.c
index 27db772..b2de360 100644
--- a/async.c
+++ b/async.c
@@ -284,7 +284,7 @@ aio_ctx_finalize(GSource *source)
 
 aio_set_event_notifier(ctx, >notifier, false, NULL);
 event_notifier_cleanup(>notifier);
-rfifolock_destroy(>lock);
+qemu_rec_mutex_destroy(>lock);
 qemu_mutex_destroy(>bh_lock);
 timerlistgroup_deinit(>tlg);
 }
@@ -372,7 +372,7 @@ AioContext *aio_context_new(Error **errp)
 #endif
 ctx->thread_pool = NULL;
 qemu_mutex_init(>bh_lock);
-rfifolock_init(>lock, NULL, NULL);
+qemu_rec_mutex_init(>lock);
 timerlistgroup_init(>tlg, aio_timerlist_notify, ctx);
 
 return ctx;
@@ -393,10 +393,10 @@ void aio_context_unref(AioContext *ctx)
 
 void aio_context_acquire(AioContext *ctx)
 {
-rfifolock_lock(>lock);
+qemu_rec_mutex_lock(>lock);
 }
 
 void aio_context_release(AioContext *ctx)
 {
-rfifolock_unlock(>lock);
+qemu_rec_mutex_unlock(>lock);
 }
diff --git a/include/block/aio.h b/include/block/aio.h
index 8e1885b..c7ae27c 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -18,7 +18,6 @@
 #include "qemu/queue.h"
 #include "qemu/event_notifier.h"
 #include "qemu/thread.h"
-#include "qemu/rfifolock.h"
 #include "qemu/timer.h"
 
 typedef struct BlockAIOCB BlockAIOCB;
@@ -54,7 +53,7 @@ struct AioContext {
 GSource source;
 
 /* Protects all fields from multi-threaded access */
-RFifoLock lock;
+QemuRecMutex lock;
 
 /* The list of registered AIO handlers */
 QLIST_HEAD(, AioHandler) aio_handlers;
diff --git a/include/qemu/rfifolock.h b/include/qemu/rfifolock.h
deleted file mode 100644
index b23ab53..000
--- a/include/qemu/rfifolock.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/*
- * Recursive FIFO lock
- *
- * Copyright Red Hat, Inc. 2013
- *
- * Authors:
- *  Stefan Hajnoczi   
- *
- * This work is licensed under the terms of the GNU GPL, version 2 or later.
- * See the COPYING file in the top-level directory.
- *
- */
-
-#ifndef QEMU_RFIFOLOCK_H
-#define QEMU_RFIFOLOCK_H
-
-#include "qemu/thread.h"
-
-/* Recursive FIFO lock
- *
- * This lock provides more features than a plain mutex:
- *
- * 1. Fairness - enforces FIFO order.
- * 2. Nesting - can be taken recursively.
- * 3. Contention callback - optional, called when thread must wait.
- *
- * The recursive FIFO lock is heavyweight so prefer other synchronization
- * primitives if you do not need its features.
- */
-typedef struct {
-QemuMutex lock; /* protects all fields */
-
-/* FIFO order */
-unsigned int head;  /* active ticket number */
-unsigned int tail;  /* waiting ticket number */
-QemuCond cond;  /* used to wait for our ticket number */
-
-/* Nesting */
-QemuThread owner_thread;/* thread that currently has ownership */
-unsigned int nesting;   /* amount of nesting levels */
-
-/* Contention callback */
-void (*cb)(void *); /* called when thread must wait, with ->lock
- * held so it may not recursively lock/unlock
- */
-void *cb_opaque;
-} RFifoLock;
-
-void rfifolock_init(RFifoLock *r, void (*cb)(void *), void *opaque);
-void rfifolock_destroy(RFifoLock *r);
-void rfifolock_lock(RFifoLock *r);
-void rfifolock_unlock(RFifoLock *r);
-
-#endif /* QEMU_RFIFOLOCK_H */
diff --git a/tests/.gitignore b/tests/.gitignore
index 9f3d2ee..2cd897e 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -66,7 +66,6 @@ test-qmp-marshal.c
 test-qmp-output-visitor
 test-rcu-list
 test-replication
-test-rfifolock
 test-string-input-visitor
 test-string-output-visitor
 test-thread-pool
diff --git a/tests/Makefile.include b/tests/Makefile.include
index a7c..26e7e90 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -43,7 +43,6 @@ check-unit-y += tests/test-visitor-serialization$(EXESUF)
 check-unit-y += tests/test-iov$(EXESUF)
 gcov-files-test-iov-y = util/iov.c
 check-unit-y += tests/test-aio$(EXESUF)
-check-unit-$(CONFIG_POSIX) += tests/test-rfifolock$(EXESUF)
 check-unit-y += tests/test-throttle$(EXESUF)
 gcov-files-test-aio-$(CONFIG_WIN32) = aio-win32.c
 

[Qemu-block] [PATCH 14/20] block: prepare bdrv_reopen_multiple to release AioContext

2016-10-17 Thread Paolo Bonzini
After the next patch bdrv_drain_all will have to be called without holding any
AioContext.  Prepare to do this by adding an AioContext argument to
bdrv_reopen_multiple.

Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Paolo Bonzini 
---
 block.c   | 4 ++--
 block/commit.c| 2 +-
 block/replication.c   | 3 ++-
 include/block/block.h | 2 +-
 qemu-io-cmds.c| 2 +-
 5 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index 7f3e7bc..fbe485c 100644
--- a/block.c
+++ b/block.c
@@ -2082,7 +2082,7 @@ BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue 
*bs_queue,
  * to all devices.
  *
  */
-int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp)
+int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error 
**errp)
 {
 int ret = -1;
 BlockReopenQueueEntry *bs_entry, *next;
@@ -2131,7 +2131,7 @@ int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, 
Error **errp)
 Error *local_err = NULL;
 BlockReopenQueue *queue = bdrv_reopen_queue(NULL, bs, NULL, bdrv_flags);
 
-ret = bdrv_reopen_multiple(queue, _err);
+ret = bdrv_reopen_multiple(bdrv_get_aio_context(bs), queue, _err);
 if (local_err != NULL) {
 error_propagate(errp, local_err);
 }
diff --git a/block/commit.c b/block/commit.c
index 9f67a8b..499ecca 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -251,7 +251,7 @@ void commit_start(const char *job_id, BlockDriverState *bs,
  orig_overlay_flags | BDRV_O_RDWR);
 }
 if (reopen_queue) {
-bdrv_reopen_multiple(reopen_queue, _err);
+bdrv_reopen_multiple(bdrv_get_aio_context(bs), reopen_queue, 
_err);
 if (local_err != NULL) {
 error_propagate(errp, local_err);
 block_job_unref(>common);
diff --git a/block/replication.c b/block/replication.c
index af47479..c6962e2 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -355,7 +355,8 @@ static void reopen_backing_file(BlockDriverState *bs, bool 
writable,
 }
 
 if (reopen_queue) {
-bdrv_reopen_multiple(reopen_queue, _err);
+bdrv_reopen_multiple(bdrv_get_aio_context(bs),
+ reopen_queue, _err);
 error_propagate(errp, local_err);
 }
 }
diff --git a/include/block/block.h b/include/block/block.h
index 3ba9e34..fb86611 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -218,7 +218,7 @@ BlockDriverState *bdrv_open(const char *filename, const 
char *reference,
 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
 BlockDriverState *bs,
 QDict *options, int flags);
-int bdrv_reopen_multiple(BlockReopenQueue *bs_queue, Error **errp);
+int bdrv_reopen_multiple(AioContext *ctx, BlockReopenQueue *bs_queue, Error 
**errp);
 int bdrv_reopen(BlockDriverState *bs, int bdrv_flags, Error **errp);
 int bdrv_reopen_prepare(BDRVReopenState *reopen_state,
 BlockReopenQueue *queue, Error **errp);
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 3a3838a..dbe0413 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1956,7 +1956,7 @@ static int reopen_f(BlockBackend *blk, int argc, char 
**argv)
 qemu_opts_reset(_opts);
 
 brq = bdrv_reopen_queue(NULL, bs, opts, flags);
-bdrv_reopen_multiple(brq, _err);
+bdrv_reopen_multiple(bdrv_get_aio_context(bs), brq, _err);
 if (local_err) {
 error_report_err(local_err);
 } else {
-- 
2.7.4





[Qemu-block] [PATCH 16/20] qemu-img: call aio_context_acquire/release around block job

2016-10-17 Thread Paolo Bonzini
This will be needed by bdrv_reopen_multiple, which calls
bdrv_drain_all and thus will *release* the AioContext.

Signed-off-by: Paolo Bonzini 
---
v1->v2: new [qemu-iotests]

 qemu-img.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/qemu-img.c b/qemu-img.c
index 02c07b9..ad7c964 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -795,6 +795,7 @@ static void run_block_job(BlockJob *job, Error **errp)
 {
 AioContext *aio_context = blk_get_aio_context(job->blk);
 
+aio_context_acquire(aio_context);
 do {
 aio_poll(aio_context, true);
 qemu_progress_print(job->len ?
@@ -802,6 +803,7 @@ static void run_block_job(BlockJob *job, Error **errp)
 } while (!job->ready);
 
 block_job_complete_sync(job, errp);
+aio_context_release(aio_context);
 
 /* A block job may finish instantaneously without publishing any progress,
  * so just signal completion here */
@@ -819,6 +821,7 @@ static int img_commit(int argc, char **argv)
 Error *local_err = NULL;
 CommonBlockJobCBInfo cbi;
 bool image_opts = false;
+AioContext *aio_context;
 
 fmt = NULL;
 cache = BDRV_DEFAULT_CACHE;
@@ -928,8 +931,11 @@ static int img_commit(int argc, char **argv)
 .bs   = bs,
 };
 
+aio_context = bdrv_get_aio_context(bs);
+aio_context_acquire(aio_context);
 commit_active_start("commit", bs, base_bs, 0, BLOCKDEV_ON_ERROR_REPORT,
 common_block_job_cb, , _err, false);
+aio_context_release(aio_context);
 if (local_err) {
 goto done;
 }
-- 
2.7.4





[Qemu-block] [PATCH 19/20] qemu-thread: introduce QemuRecMutex

2016-10-17 Thread Paolo Bonzini
GRecMutex is new in glib 2.32, so we cannot use it.  Introduce
a recursive mutex in qemu-thread instead, which will be used
instead of RFifoLock.

Reviewed-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Paolo Bonzini 
---
 include/qemu/thread-posix.h |  6 ++
 include/qemu/thread-win32.h | 10 ++
 include/qemu/thread.h   |  3 +++
 util/qemu-thread-posix.c| 14 ++
 util/qemu-thread-win32.c| 25 +
 5 files changed, 58 insertions(+)

diff --git a/include/qemu/thread-posix.h b/include/qemu/thread-posix.h
index aa03567..09d1e15 100644
--- a/include/qemu/thread-posix.h
+++ b/include/qemu/thread-posix.h
@@ -4,6 +4,12 @@
 #include 
 #include 
 
+typedef QemuMutex QemuRecMutex;
+#define qemu_rec_mutex_destroy qemu_mutex_destroy
+#define qemu_rec_mutex_lock qemu_mutex_lock
+#define qemu_rec_mutex_try_lock qemu_mutex_try_lock
+#define qemu_rec_mutex_unlock qemu_mutex_unlock
+
 struct QemuMutex {
 pthread_mutex_t lock;
 };
diff --git a/include/qemu/thread-win32.h b/include/qemu/thread-win32.h
index c7ce8dc..5fb6541 100644
--- a/include/qemu/thread-win32.h
+++ b/include/qemu/thread-win32.h
@@ -8,6 +8,16 @@ struct QemuMutex {
 LONG owner;
 };
 
+typedef struct QemuRecMutex QemuRecMutex;
+struct QemuRecMutex {
+CRITICAL_SECTION lock;
+};
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex);
+void qemu_rec_mutex_lock(QemuRecMutex *mutex);
+int qemu_rec_mutex_trylock(QemuRecMutex *mutex);
+void qemu_rec_mutex_unlock(QemuRecMutex *mutex);
+
 struct QemuCond {
 LONG waiters, target;
 HANDLE sema;
diff --git a/include/qemu/thread.h b/include/qemu/thread.h
index 31237e9..e8e665f 100644
--- a/include/qemu/thread.h
+++ b/include/qemu/thread.h
@@ -25,6 +25,9 @@ void qemu_mutex_lock(QemuMutex *mutex);
 int qemu_mutex_trylock(QemuMutex *mutex);
 void qemu_mutex_unlock(QemuMutex *mutex);
 
+/* Prototypes for other functions are in thread-posix.h/thread-win32.h.  */
+void qemu_rec_mutex_init(QemuRecMutex *mutex);
+
 void qemu_cond_init(QemuCond *cond);
 void qemu_cond_destroy(QemuCond *cond);
 
diff --git a/util/qemu-thread-posix.c b/util/qemu-thread-posix.c
index 74a3023..1bb00a8 100644
--- a/util/qemu-thread-posix.c
+++ b/util/qemu-thread-posix.c
@@ -80,6 +80,20 @@ void qemu_mutex_unlock(QemuMutex *mutex)
 error_exit(err, __func__);
 }
 
+void qemu_rec_mutex_init(QemuRecMutex *mutex)
+{
+int err;
+pthread_mutexattr_t attr;
+
+pthread_mutexattr_init();
+pthread_mutexattr_settype(, PTHREAD_MUTEX_RECURSIVE);
+err = pthread_mutex_init(>lock, );
+pthread_mutexattr_destroy();
+if (err) {
+error_exit(err, __func__);
+}
+}
+
 void qemu_cond_init(QemuCond *cond)
 {
 int err;
diff --git a/util/qemu-thread-win32.c b/util/qemu-thread-win32.c
index 98a5ddf..171d83c 100644
--- a/util/qemu-thread-win32.c
+++ b/util/qemu-thread-win32.c
@@ -79,6 +79,31 @@ void qemu_mutex_unlock(QemuMutex *mutex)
 LeaveCriticalSection(>lock);
 }
 
+void qemu_rec_mutex_init(QemuRecMutex *mutex)
+{
+InitializeCriticalSection(>lock);
+}
+
+void qemu_rec_mutex_destroy(QemuRecMutex *mutex)
+{
+DeleteCriticalSection(>lock);
+}
+
+void qemu_rec_mutex_lock(QemuRecMutex *mutex)
+{
+EnterCriticalSection(>lock);
+}
+
+int qemu_rec_mutex_trylock(QemuRecMutex *mutex)
+{
+return !TryEnterCriticalSection(>lock);
+}
+
+void qemu_rec_mutex_unlock(QemuRecMutex *mutex)
+{
+LeaveCriticalSection(>lock);
+}
+
 void qemu_cond_init(QemuCond *cond)
 {
 memset(cond, 0, sizeof(*cond));
-- 
2.7.4





[Qemu-block] [PATCH 07/20] block: introduce BDRV_POLL_WHILE

2016-10-17 Thread Paolo Bonzini
We want the BDS event loop to run exclusively in the iothread that
owns the BDS's AioContext.  This macro will provide the synchronization
between the two event loops; for now it just wraps the common idiom
of a while loop around aio_poll.

Signed-off-by: Paolo Bonzini 
---
v1->v2: make macro name uppercase [Fam]
leave bdrv_wakeup for later [Stefan]

 block/block-backend.c |  7 +--
 block/io.c| 42 ++
 block/qed-table.c | 16 
 include/block/block.h |  9 +
 4 files changed, 20 insertions(+), 54 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 234df1e..27cb030 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -878,7 +878,6 @@ static int blk_prw(BlockBackend *blk, int64_t offset, 
uint8_t *buf,
int64_t bytes, CoroutineEntry co_entry,
BdrvRequestFlags flags)
 {
-AioContext *aio_context;
 QEMUIOVector qiov;
 struct iovec iov;
 Coroutine *co;
@@ -900,11 +899,7 @@ static int blk_prw(BlockBackend *blk, int64_t offset, 
uint8_t *buf,
 
 co = qemu_coroutine_create(co_entry, );
 qemu_coroutine_enter(co);
-
-aio_context = blk_get_aio_context(blk);
-while (rwco.ret == NOT_DONE) {
-aio_poll(aio_context, true);
-}
+BDRV_POLL_WHILE(blk_bs(blk), rwco.ret == NOT_DONE);
 
 return rwco.ret;
 }
diff --git a/block/io.c b/block/io.c
index afec968..f0682dd 100644
--- a/block/io.c
+++ b/block/io.c
@@ -156,23 +156,12 @@ bool bdrv_requests_pending(BlockDriverState *bs)
 return false;
 }
 
-static bool bdrv_drain_poll(BlockDriverState *bs)
-{
-bool waited = false;
-
-while (atomic_read(>in_flight) > 0) {
-aio_poll(bdrv_get_aio_context(bs), true);
-waited = true;
-}
-return waited;
-}
-
 static bool bdrv_drain_recurse(BlockDriverState *bs)
 {
 BdrvChild *child;
 bool waited;
 
-waited = bdrv_drain_poll(bs);
+waited = BDRV_POLL_WHILE(bs, atomic_read(>in_flight) > 0);
 
 if (bs->drv && bs->drv->bdrv_drain) {
 bs->drv->bdrv_drain(bs);
@@ -597,13 +586,9 @@ static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
 /* Fast-path if already in coroutine context */
 bdrv_rw_co_entry();
 } else {
-AioContext *aio_context = bdrv_get_aio_context(child->bs);
-
 co = qemu_coroutine_create(bdrv_rw_co_entry, );
 qemu_coroutine_enter(co);
-while (rwco.ret == NOT_DONE) {
-aio_poll(aio_context, true);
-}
+BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
 }
 return rwco.ret;
 }
@@ -1845,14 +1830,10 @@ int64_t bdrv_get_block_status_above(BlockDriverState 
*bs,
 /* Fast-path if already in coroutine context */
 bdrv_get_block_status_above_co_entry();
 } else {
-AioContext *aio_context = bdrv_get_aio_context(bs);
-
 co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry,
);
 qemu_coroutine_enter(co);
-while (!data.done) {
-aio_poll(aio_context, true);
-}
+BDRV_POLL_WHILE(bs, !data.done);
 }
 return data.ret;
 }
@@ -2411,13 +2392,9 @@ int bdrv_flush(BlockDriverState *bs)
 /* Fast-path if already in coroutine context */
 bdrv_flush_co_entry(_co);
 } else {
-AioContext *aio_context = bdrv_get_aio_context(bs);
-
 co = qemu_coroutine_create(bdrv_flush_co_entry, _co);
 qemu_coroutine_enter(co);
-while (flush_co.ret == NOT_DONE) {
-aio_poll(aio_context, true);
-}
+BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
 }
 
 return flush_co.ret;
@@ -2543,13 +2520,9 @@ int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, 
int count)
 /* Fast-path if already in coroutine context */
 bdrv_pdiscard_co_entry();
 } else {
-AioContext *aio_context = bdrv_get_aio_context(bs);
-
 co = qemu_coroutine_create(bdrv_pdiscard_co_entry, );
 qemu_coroutine_enter(co);
-while (rwco.ret == NOT_DONE) {
-aio_poll(aio_context, true);
-}
+BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
 }
 
 return rwco.ret;
@@ -2608,11 +2581,8 @@ int bdrv_ioctl(BlockDriverState *bs, unsigned long int 
req, void *buf)
 bdrv_co_ioctl_entry();
 } else {
 Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry, );
-
 qemu_coroutine_enter(co);
-while (data.ret == -EINPROGRESS) {
-aio_poll(bdrv_get_aio_context(bs), true);
-}
+BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
 }
 return data.ret;
 }
diff --git a/block/qed-table.c b/block/qed-table.c
index 1a731df..ed443e2 100644
--- a/block/qed-table.c
+++ b/block/qed-table.c
@@ -174,9 +174,7 @@ int qed_read_l1_table_sync(BDRVQEDState *s)
 
 qed_read_table(s, s->header.l1_table_offset,
 

[Qemu-block] [PATCH 10/20] sheepdog: use BDRV_POLL_WHILE

2016-10-17 Thread Paolo Bonzini
This is important when the sheepdog driver works on a BlockDriverState
that is attached to an I/O thread other than the main thread.

Signed-off-by: Paolo Bonzini 
---
v1->v2: no bdrv_wakeup yet

 block/sheepdog.c | 64 +++-
 1 file changed, 35 insertions(+), 29 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index ccbf7e1..16a5c1c 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -641,6 +641,7 @@ static void restart_co_req(void *opaque)
 
 typedef struct SheepdogReqCo {
 int sockfd;
+BlockDriverState *bs;
 AioContext *aio_context;
 SheepdogReq *hdr;
 void *data;
@@ -708,13 +709,14 @@ out:
  *
  * Return 0 on success, -errno in case of error.
  */
-static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr,
+static int do_req(int sockfd, BlockDriverState *bs, SheepdogReq *hdr,
   void *data, unsigned int *wlen, unsigned int *rlen)
 {
 Coroutine *co;
 SheepdogReqCo srco = {
 .sockfd = sockfd,
-.aio_context = aio_context,
+.aio_context = bs ? bdrv_get_aio_context(bs) : qemu_get_aio_context(),
+.bs = bs,
 .hdr = hdr,
 .data = data,
 .wlen = wlen,
@@ -727,9 +729,14 @@ static int do_req(int sockfd, AioContext *aio_context, 
SheepdogReq *hdr,
 do_co_req();
 } else {
 co = qemu_coroutine_create(do_co_req, );
-qemu_coroutine_enter(co);
-while (!srco.finished) {
-aio_poll(aio_context, true);
+if (bs) {
+qemu_coroutine_enter(co);
+BDRV_POLL_WHILE(bs, !srco.finished);
+} else {
+qemu_coroutine_enter(co);
+while (!srco.finished) {
+aio_poll(qemu_get_aio_context(), true);
+}
 }
 }
 
@@ -1125,7 +1132,7 @@ static int find_vdi_name(BDRVSheepdogState *s, const char 
*filename,
 hdr.snapid = snapid;
 hdr.flags = SD_FLAG_CMD_WRITE;
 
-ret = do_req(fd, s->aio_context, (SheepdogReq *), buf, , );
+ret = do_req(fd, s->bs, (SheepdogReq *), buf, , );
 if (ret) {
 error_setg_errno(errp, -ret, "cannot get vdi info");
 goto out;
@@ -1240,7 +1247,7 @@ out:
 qemu_co_mutex_unlock(>lock);
 }
 
-static int read_write_object(int fd, AioContext *aio_context, char *buf,
+static int read_write_object(int fd, BlockDriverState *bs, char *buf,
  uint64_t oid, uint8_t copies,
  unsigned int datalen, uint64_t offset,
  bool write, bool create, uint32_t cache_flags)
@@ -1274,7 +1281,7 @@ static int read_write_object(int fd, AioContext 
*aio_context, char *buf,
 hdr.offset = offset;
 hdr.copies = copies;
 
-ret = do_req(fd, aio_context, (SheepdogReq *), buf, , );
+ret = do_req(fd, bs, (SheepdogReq *), buf, , );
 if (ret) {
 error_report("failed to send a request to the sheep");
 return ret;
@@ -1289,22 +1296,22 @@ static int read_write_object(int fd, AioContext 
*aio_context, char *buf,
 }
 }
 
-static int read_object(int fd, AioContext *aio_context, char *buf,
+static int read_object(int fd, BlockDriverState *bs, char *buf,
uint64_t oid, uint8_t copies,
unsigned int datalen, uint64_t offset,
uint32_t cache_flags)
 {
-return read_write_object(fd, aio_context, buf, oid, copies,
+return read_write_object(fd, bs, buf, oid, copies,
  datalen, offset, false,
  false, cache_flags);
 }
 
-static int write_object(int fd, AioContext *aio_context, char *buf,
+static int write_object(int fd, BlockDriverState *bs, char *buf,
 uint64_t oid, uint8_t copies,
 unsigned int datalen, uint64_t offset, bool create,
 uint32_t cache_flags)
 {
-return read_write_object(fd, aio_context, buf, oid, copies,
+return read_write_object(fd, bs, buf, oid, copies,
  datalen, offset, true,
  create, cache_flags);
 }
@@ -1331,7 +1338,7 @@ static int reload_inode(BDRVSheepdogState *s, uint32_t 
snapid, const char *tag)
 goto out;
 }
 
-ret = read_object(fd, s->aio_context, (char *)inode, vid_to_vdi_oid(vid),
+ret = read_object(fd, s->bs, (char *)inode, vid_to_vdi_oid(vid),
   s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0,
   s->cache_flags);
 if (ret < 0) {
@@ -1489,7 +1496,7 @@ static int sd_open(BlockDriverState *bs, QDict *options, 
int flags,
 }
 
 buf = g_malloc(SD_INODE_SIZE);
-ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid),
+ret = read_object(fd, s->bs, buf, vid_to_vdi_oid(vid),
   0, SD_INODE_SIZE, 0, s->cache_flags);
 
 closesocket(fd);
@@ -1618,7 +1625,7 @@ static int 

[Qemu-block] [PATCH 11/20] aio: introduce qemu_get_current_aio_context

2016-10-17 Thread Paolo Bonzini
This will be used by BDRV_POLL_WHILE (and thus by bdrv_drain)
to choose how to wait for I/O completion.

Reviewed-by: Fam Zheng 
Signed-off-by: Paolo Bonzini 
---
v1->v2: adjust documentation according to Stefan's suggestion

 include/block/aio.h | 18 ++
 iothread.c  |  9 +
 stubs/Makefile.objs |  1 +
 stubs/iothread.c|  8 
 4 files changed, 36 insertions(+)
 create mode 100644 stubs/iothread.c

diff --git a/include/block/aio.h b/include/block/aio.h
index b9fe2cb..040b3b1 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -453,6 +453,24 @@ static inline bool aio_node_check(AioContext *ctx, bool 
is_external)
 }
 
 /**
+ * Return the AioContext whose event loop runs in the current thread.
+ *
+ * If called from an IOThread this will be the IOThread's AioContext.  If
+ * called from another thread it will be the main loop AioContext.
+ */
+AioContext *qemu_get_current_aio_context(void);
+
+/**
+ * @ctx: the aio context
+ *
+ * Return whether we are running in the I/O thread that manages @ctx.
+ */
+static inline bool aio_context_in_iothread(AioContext *ctx)
+{
+return ctx == qemu_get_current_aio_context();
+}
+
+/**
  * aio_context_setup:
  * @ctx: the aio context
  *
diff --git a/iothread.c b/iothread.c
index fbeb8de..62c8796 100644
--- a/iothread.c
+++ b/iothread.c
@@ -20,6 +20,7 @@
 #include "qmp-commands.h"
 #include "qemu/error-report.h"
 #include "qemu/rcu.h"
+#include "qemu/main-loop.h"
 
 typedef ObjectClass IOThreadClass;
 
@@ -28,6 +29,13 @@ typedef ObjectClass IOThreadClass;
 #define IOTHREAD_CLASS(klass) \
OBJECT_CLASS_CHECK(IOThreadClass, klass, TYPE_IOTHREAD)
 
+static __thread IOThread *my_iothread;
+
+AioContext *qemu_get_current_aio_context(void)
+{
+return my_iothread ? my_iothread->ctx : qemu_get_aio_context();
+}
+
 static void *iothread_run(void *opaque)
 {
 IOThread *iothread = opaque;
@@ -35,6 +43,7 @@ static void *iothread_run(void *opaque)
 
 rcu_register_thread();
 
+my_iothread = iothread;
 qemu_mutex_lock(>init_done_lock);
 iothread->thread_id = qemu_get_thread_id();
 qemu_cond_signal(>init_done_cond);
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index c5850e8..84b9d9e 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -17,6 +17,7 @@ stub-obj-y += gdbstub.o
 stub-obj-y += get-fd.o
 stub-obj-y += get-next-serial.o
 stub-obj-y += get-vm-name.o
+stub-obj-y += iothread.o
 stub-obj-y += iothread-lock.o
 stub-obj-y += is-daemonized.o
 stub-obj-y += machine-init-done.o
diff --git a/stubs/iothread.c b/stubs/iothread.c
new file mode 100644
index 000..8cc9e28
--- /dev/null
+++ b/stubs/iothread.c
@@ -0,0 +1,8 @@
+#include "qemu/osdep.h"
+#include "block/aio.h"
+#include "qemu/main-loop.h"
+
+AioContext *qemu_get_current_aio_context(void)
+{
+return qemu_get_aio_context();
+}
-- 
2.7.4





[Qemu-block] [PATCH 08/20] nfs: move nfs_set_events out of the while loops

2016-10-17 Thread Paolo Bonzini
nfs_set_events only needs to be called once before entering the
while loop; afterwards, nfs_process_read and nfs_process_write
take care of it.

Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Paolo Bonzini 
---
 block/nfs.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/nfs.c b/block/nfs.c
index c3db2ec..c8df8d8 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -149,8 +149,8 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
 return -ENOMEM;
 }
 
+nfs_set_events(client);
 while (!task.complete) {
-nfs_set_events(client);
 qemu_coroutine_yield();
 }
 
@@ -191,8 +191,8 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,
 return -ENOMEM;
 }
 
+nfs_set_events(client);
 while (!task.complete) {
-nfs_set_events(client);
 qemu_coroutine_yield();
 }
 
@@ -217,8 +217,8 @@ static int coroutine_fn nfs_co_flush(BlockDriverState *bs)
 return -ENOMEM;
 }
 
+nfs_set_events(client);
 while (!task.complete) {
-nfs_set_events(client);
 qemu_coroutine_yield();
 }
 
@@ -513,8 +513,8 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState 
*bs)
 return -ENOMEM;
 }
 
+nfs_set_events(client);
 while (!task.complete) {
-nfs_set_events(client);
 aio_poll(client->aio_context, true);
 }
 
-- 
2.7.4





[Qemu-block] [PATCH 09/20] nfs: use BDRV_POLL_WHILE

2016-10-17 Thread Paolo Bonzini
This will make it possible to use nfs_get_allocated_file_size on
a file that is not in the main AioContext.

Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Paolo Bonzini 
---
v1->v2: no bdrv_wakeup yet

 block/nfs.c | 46 --
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/block/nfs.c b/block/nfs.c
index c8df8d8..7474fbc 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -52,6 +52,7 @@ typedef struct NFSClient {
 } NFSClient;
 
 typedef struct NFSRPC {
+BlockDriverState *bs;
 int ret;
 int complete;
 QEMUIOVector *iov;
@@ -90,11 +91,12 @@ static void nfs_process_write(void *arg)
 nfs_set_events(client);
 }
 
-static void nfs_co_init_task(NFSClient *client, NFSRPC *task)
+static void nfs_co_init_task(BlockDriverState *bs, NFSRPC *task)
 {
 *task = (NFSRPC) {
 .co = qemu_coroutine_self(),
-.client = client,
+.bs = bs,
+.client = bs->opaque,
 };
 }
 
@@ -111,6 +113,7 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void 
*data,
 {
 NFSRPC *task = private_data;
 task->ret = ret;
+assert(!task->st);
 if (task->ret > 0 && task->iov) {
 if (task->ret <= task->iov->size) {
 qemu_iovec_from_buf(task->iov, 0, data, task->ret);
@@ -118,18 +121,11 @@ nfs_co_generic_cb(int ret, struct nfs_context *nfs, void 
*data,
 task->ret = -EIO;
 }
 }
-if (task->ret == 0 && task->st) {
-memcpy(task->st, data, sizeof(struct stat));
-}
 if (task->ret < 0) {
 error_report("NFS Error: %s", nfs_get_error(nfs));
 }
-if (task->co) {
-aio_bh_schedule_oneshot(task->client->aio_context,
-nfs_co_generic_bh_cb, task);
-} else {
-task->complete = 1;
-}
+aio_bh_schedule_oneshot(task->client->aio_context,
+nfs_co_generic_bh_cb, task);
 }
 
 static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
@@ -139,7 +135,7 @@ static int coroutine_fn nfs_co_readv(BlockDriverState *bs,
 NFSClient *client = bs->opaque;
 NFSRPC task;
 
-nfs_co_init_task(client, );
+nfs_co_init_task(bs, );
 task.iov = iov;
 
 if (nfs_pread_async(client->context, client->fh,
@@ -174,7 +170,7 @@ static int coroutine_fn nfs_co_writev(BlockDriverState *bs,
 NFSRPC task;
 char *buf = NULL;
 
-nfs_co_init_task(client, );
+nfs_co_init_task(bs, );
 
 buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE);
 if (nb_sectors && buf == NULL) {
@@ -210,7 +206,7 @@ static int coroutine_fn nfs_co_flush(BlockDriverState *bs)
 NFSClient *client = bs->opaque;
 NFSRPC task;
 
-nfs_co_init_task(client, );
+nfs_co_init_task(bs, );
 
 if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb,
 ) != 0) {
@@ -496,6 +492,21 @@ static int nfs_has_zero_init(BlockDriverState *bs)
 return client->has_zero_init;
 }
 
+static void
+nfs_get_allocated_file_size_cb(int ret, struct nfs_context *nfs, void *data,
+   void *private_data)
+{
+NFSRPC *task = private_data;
+task->ret = ret;
+if (task->ret == 0) {
+memcpy(task->st, data, sizeof(struct stat));
+}
+if (task->ret < 0) {
+error_report("NFS Error: %s", nfs_get_error(nfs));
+}
+task->complete = 1;
+}
+
 static int64_t nfs_get_allocated_file_size(BlockDriverState *bs)
 {
 NFSClient *client = bs->opaque;
@@ -507,16 +518,15 @@ static int64_t 
nfs_get_allocated_file_size(BlockDriverState *bs)
 return client->st_blocks * 512;
 }
 
+task.bs = bs;
 task.st = 
-if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb,
+if (nfs_fstat_async(client->context, client->fh, 
nfs_get_allocated_file_size_cb,
 ) != 0) {
 return -ENOMEM;
 }
 
 nfs_set_events(client);
-while (!task.complete) {
-aio_poll(client->aio_context, true);
-}
+BDRV_POLL_WHILE(bs, !task.complete);
 
 return (task.ret < 0 ? task.ret : st.st_blocks * 512);
 }
-- 
2.7.4





[Qemu-block] [PATCH 13/20] replication: pass BlockDriverState to reopen_backing_file

2016-10-17 Thread Paolo Bonzini
This will be needed in the next patch to retrieve the AioContext.

Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Paolo Bonzini 
---
 block/replication.c | 21 -
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 5231a00..af47479 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -317,9 +317,10 @@ static void secondary_do_checkpoint(BDRVReplicationState 
*s, Error **errp)
 }
 }
 
-static void reopen_backing_file(BDRVReplicationState *s, bool writable,
+static void reopen_backing_file(BlockDriverState *bs, bool writable,
 Error **errp)
 {
+BDRVReplicationState *s = bs->opaque;
 BlockReopenQueue *reopen_queue = NULL;
 int orig_hidden_flags, orig_secondary_flags;
 int new_hidden_flags, new_secondary_flags;
@@ -359,8 +360,9 @@ static void reopen_backing_file(BDRVReplicationState *s, 
bool writable,
 }
 }
 
-static void backup_job_cleanup(BDRVReplicationState *s)
+static void backup_job_cleanup(BlockDriverState *bs)
 {
+BDRVReplicationState *s = bs->opaque;
 BlockDriverState *top_bs;
 
 top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
@@ -369,19 +371,20 @@ static void backup_job_cleanup(BDRVReplicationState *s)
 }
 bdrv_op_unblock_all(top_bs, s->blocker);
 error_free(s->blocker);
-reopen_backing_file(s, false, NULL);
+reopen_backing_file(bs, false, NULL);
 }
 
 static void backup_job_completed(void *opaque, int ret)
 {
-BDRVReplicationState *s = opaque;
+BlockDriverState *bs = opaque;
+BDRVReplicationState *s = bs->opaque;
 
 if (s->replication_state != BLOCK_REPLICATION_FAILOVER) {
 /* The backup job is cancelled unexpectedly */
 s->error = -EIO;
 }
 
-backup_job_cleanup(s);
+backup_job_cleanup(bs);
 }
 
 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
@@ -477,7 +480,7 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 }
 
 /* reopen the backing file in r/w mode */
-reopen_backing_file(s, true, _err);
+reopen_backing_file(bs, true, _err);
 if (local_err) {
 error_propagate(errp, local_err);
 aio_context_release(aio_context);
@@ -492,7 +495,7 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 if (!top_bs || !bdrv_is_root_node(top_bs) ||
 !check_top_bs(top_bs, bs)) {
 error_setg(errp, "No top_bs or it is invalid");
-reopen_backing_file(s, false, NULL);
+reopen_backing_file(bs, false, NULL);
 aio_context_release(aio_context);
 return;
 }
@@ -502,10 +505,10 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 backup_start("replication-backup", s->secondary_disk->bs,
  s->hidden_disk->bs, 0, MIRROR_SYNC_MODE_NONE, NULL, false,
  BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
- backup_job_completed, s, NULL, _err);
+ backup_job_completed, bs, NULL, _err);
 if (local_err) {
 error_propagate(errp, local_err);
-backup_job_cleanup(s);
+backup_job_cleanup(bs);
 aio_context_release(aio_context);
 return;
 }
-- 
2.7.4





[Qemu-block] [PATCH 04/20] block: add BDS field to count in-flight requests

2016-10-17 Thread Paolo Bonzini
Unlike tracked_requests, this field also counts throttled requests,
and remains non-zero if an AIO operation needs a BH to be "really"
completed.

With this change, it is no longer necessary to have a dummy
BdrvTrackedRequest for requests that are never serialising, and
it is no longer necessary to poll the AioContext once after
bdrv_requests_pending(bs) returns false.

Signed-off-by: Paolo Bonzini 
---
 block/block-backend.c | 23 +++---
 block/io.c| 81 ---
 include/block/block_int.h | 10 +++---
 3 files changed, 81 insertions(+), 33 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 1a724a8..234df1e 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -799,20 +799,25 @@ int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t 
offset,
BdrvRequestFlags flags)
 {
 int ret;
+BlockDriverState *bs = blk_bs(blk);
 
-trace_blk_co_preadv(blk, blk_bs(blk), offset, bytes, flags);
+trace_blk_co_preadv(blk, bs, offset, bytes, flags);
 
 ret = blk_check_byte_request(blk, offset, bytes);
 if (ret < 0) {
 return ret;
 }
 
+bdrv_inc_in_flight(bs);
+
 /* throttling disk I/O */
 if (blk->public.throttle_state) {
 throttle_group_co_io_limits_intercept(blk, bytes, false);
 }
 
-return bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
+ret = bdrv_co_preadv(blk->root, offset, bytes, qiov, flags);
+bdrv_dec_in_flight(bs);
+return ret;
 }
 
 int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset,
@@ -820,14 +825,17 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, 
int64_t offset,
 BdrvRequestFlags flags)
 {
 int ret;
+BlockDriverState *bs = blk_bs(blk);
 
-trace_blk_co_pwritev(blk, blk_bs(blk), offset, bytes, flags);
+trace_blk_co_pwritev(blk, bs, offset, bytes, flags);
 
 ret = blk_check_byte_request(blk, offset, bytes);
 if (ret < 0) {
 return ret;
 }
 
+bdrv_inc_in_flight(bs);
+
 /* throttling disk I/O */
 if (blk->public.throttle_state) {
 throttle_group_co_io_limits_intercept(blk, bytes, true);
@@ -837,7 +845,9 @@ int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t 
offset,
 flags |= BDRV_REQ_FUA;
 }
 
-return bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+ret = bdrv_co_pwritev(blk->root, offset, bytes, qiov, flags);
+bdrv_dec_in_flight(bs);
+return ret;
 }
 
 typedef struct BlkRwCo {
@@ -930,6 +940,8 @@ int blk_make_zero(BlockBackend *blk, BdrvRequestFlags flags)
 static void error_callback_bh(void *opaque)
 {
 struct BlockBackendAIOCB *acb = opaque;
+
+bdrv_dec_in_flight(acb->common.bs);
 acb->common.cb(acb->common.opaque, acb->ret);
 qemu_aio_unref(acb);
 }
@@ -940,6 +952,7 @@ BlockAIOCB *blk_abort_aio_request(BlockBackend *blk,
 {
 struct BlockBackendAIOCB *acb;
 
+bdrv_inc_in_flight(blk_bs(blk));
 acb = blk_aio_get(_backend_aiocb_info, blk, cb, opaque);
 acb->blk = blk;
 acb->ret = ret;
@@ -962,6 +975,7 @@ static const AIOCBInfo blk_aio_em_aiocb_info = {
 static void blk_aio_complete(BlkAioEmAIOCB *acb)
 {
 if (acb->has_returned) {
+bdrv_dec_in_flight(acb->common.bs);
 acb->common.cb(acb->common.opaque, acb->rwco.ret);
 qemu_aio_unref(acb);
 }
@@ -983,6 +997,7 @@ static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t 
offset, int bytes,
 BlkAioEmAIOCB *acb;
 Coroutine *co;
 
+bdrv_inc_in_flight(blk_bs(blk));
 acb = blk_aio_get(_aio_em_aiocb_info, blk, cb, opaque);
 acb->rwco = (BlkRwCo) {
 .blk= blk,
diff --git a/block/io.c b/block/io.c
index b136c89..8d46d8b 100644
--- a/block/io.c
+++ b/block/io.c
@@ -143,7 +143,7 @@ bool bdrv_requests_pending(BlockDriverState *bs)
 {
 BdrvChild *child;
 
-if (!QLIST_EMPTY(>tracked_requests)) {
+if (atomic_read(>in_flight)) {
 return true;
 }
 
@@ -176,12 +176,9 @@ typedef struct {
 
 static void bdrv_drain_poll(BlockDriverState *bs)
 {
-bool busy = true;
-
-while (busy) {
+while (bdrv_requests_pending(bs)) {
 /* Keep iterating */
-busy = bdrv_requests_pending(bs);
-busy |= aio_poll(bdrv_get_aio_context(bs), busy);
+aio_poll(bdrv_get_aio_context(bs), true);
 }
 }
 
@@ -189,8 +186,10 @@ static void bdrv_co_drain_bh_cb(void *opaque)
 {
 BdrvCoDrainData *data = opaque;
 Coroutine *co = data->co;
+BlockDriverState *bs = data->bs;
 
-bdrv_drain_poll(data->bs);
+bdrv_dec_in_flight(bs);
+bdrv_drain_poll(bs);
 data->done = true;
 qemu_coroutine_enter(co);
 }
@@ -209,6 +208,7 @@ static void coroutine_fn 
bdrv_co_yield_to_drain(BlockDriverState *bs)
 .bs = bs,
 .done = false,
 };
+bdrv_inc_in_flight(bs);
 aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
  

[Qemu-block] [PATCH v2 00/20] dataplane: remove RFifoLock

2016-10-17 Thread Paolo Bonzini
This patch reorganizes aio_poll callers to establish new rules for
dataplane locking.  The idea is that I/O operations on a dataplane
BDS (i.e. one where the AioContext is not the main one) do not call
aio_poll anymore.  Instead, they wait for the operation to end in the
other I/O thread, at which point the other I/O thread calls bdrv_wakeup
to wake up the main thread.

With this change, only one thread runs aio_poll for an AioContext.
While aio_context_acquire/release is still needed to protect the BDSes,
it need not interrupt the other thread's event loop anymore, and therefore
it does not need contention callbacks anymore.  Thus the patch can remove
RFifoLock.  This fixes possible hangs in bdrv_drain_all, reproducible (for
example) by unplugging a virtio-scsi-dataplane device while there is I/O
going on for a virtio-blk-dataplane on the same I/O thread.

Patch 1 is a bugfix that I already posted.

Patch 2 makes blockjobs independent of aio_poll, the reason for which
should be apparent from the explanation above.

Patch 3 is an independent mirror bugfix, that I wanted to submit separately
but happens to fix a hang in COLO replication.  Like patch 1 I believe
it's pre-existing and merely exposed by these patches.

Patches 4 to 10 introduce the infrastructure to wake up the main thread
while bdrv_drain or other synchronous operations are running.  Patches 11
to 16 do other changes to prepare for this.  Notably bdrv_drain_all
needs to be called without holding any AioContext lock, so bdrv_reopen
releases the lock temporarily (and callers of bdrv_reopen needs fixing).

Patch 17 then does the big change, after which there are just some
cleanups left to do.

Paolo

Fam Zheng (1):
  qed: Implement .bdrv_drain

Paolo Bonzini (19):
  replication: interrupt failover if the main device is closed
  blockjob: introduce .drain callback for jobs
  mirror: use bdrv_drained_begin/bdrv_drained_end
  block: add BDS field to count in-flight requests
  block: change drain to look only at one child at a time
  block: introduce BDRV_POLL_WHILE
  nfs: move nfs_set_events out of the while loops
  nfs: use BDRV_POLL_WHILE
  sheepdog: use BDRV_POLL_WHILE
  aio: introduce qemu_get_current_aio_context
  iothread: detach all block devices before stopping them
  replication: pass BlockDriverState to reopen_backing_file
  block: prepare bdrv_reopen_multiple to release AioContext
  qemu-io: acquire AioContext
  qemu-img: call aio_context_acquire/release around block job
  block: only call aio_poll on the current thread's AioContext
  iothread: release AioContext around aio_poll
  qemu-thread: introduce QemuRecMutex
  aio: convert from RFifoLock to QemuRecMutex

 async.c |  29 ++--
 block.c |   6 +-
 block/backup.c  |  17 +
 block/block-backend.c   |  30 +---
 block/commit.c  |   2 +-
 block/io.c  | 148 ++--
 block/mirror.c  |  70 +--
 block/nfs.c |  55 +--
 block/qed-table.c   |  16 ++---
 block/qed.c |  16 -
 block/replication.c |  27 +---
 block/sheepdog.c|  67 ++
 blockjob.c  |  37 +-
 docs/multiple-iothreads.txt |  40 ++-
 hw/scsi/virtio-scsi-dataplane.c |   4 +-
 include/block/aio.h |  24 +--
 include/block/block.h   |  29 +++-
 include/block/block_int.h   |  27 ++--
 include/block/blockjob.h|   7 ++
 include/qemu/rfifolock.h|  54 ---
 include/qemu/thread-posix.h |   6 ++
 include/qemu/thread-win32.h |  10 +++
 include/qemu/thread.h   |   3 +
 iothread.c  |  33 ++---
 qemu-img.c  |   6 ++
 qemu-io-cmds.c  |   6 +-
 stubs/Makefile.objs |   1 +
 stubs/iothread.c|   8 +++
 tests/.gitignore|   1 -
 tests/Makefile.include  |   2 -
 tests/test-aio.c|  22 +++---
 tests/test-rfifolock.c  |  91 
 util/Makefile.objs  |   1 -
 util/qemu-thread-posix.c|  14 
 util/qemu-thread-win32.c|  25 +++
 util/rfifolock.c|  78 -
 36 files changed, 526 insertions(+), 486 deletions(-)
 delete mode 100644 include/qemu/rfifolock.h
 create mode 100644 stubs/iothread.c
 delete mode 100644 tests/test-rfifolock.c
 delete mode 100644 util/rfifolock.c

-- 
2.7.4




[Qemu-block] [PATCH 02/20] blockjob: introduce .drain callback for jobs

2016-10-17 Thread Paolo Bonzini
This is required to decouple block jobs from running in an
AioContext.  With multiqueue block devices, a BlockDriverState
does not really belong to a single AioContext.

The solution is to first wait until all I/O operations are
complete; then loop in the main thread for the block job to
complete entirely.

Signed-off-by: Paolo Bonzini 
---
v1->v2: save s->target around ref/drain/unref sequence [Stefan]

 block/backup.c   | 17 +
 block/mirror.c   | 35 +++
 blockjob.c   | 37 -
 include/block/blockjob.h |  7 +++
 4 files changed, 71 insertions(+), 25 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 582bd0f..130cb3f 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -300,6 +300,21 @@ void backup_cow_request_end(CowRequest *req)
 cow_request_end(req);
 }
 
+static void backup_drain(BlockJob *job)
+{
+BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+
+/* Need to keep a reference in case blk_drain triggers execution
+ * of backup_complete...
+ */
+if (s->target) {
+BlockBackend *target = s->target;
+blk_ref(target);
+blk_drain(target);
+blk_unref(target);
+}
+}
+
 static const BlockJobDriver backup_job_driver = {
 .instance_size  = sizeof(BackupBlockJob),
 .job_type   = BLOCK_JOB_TYPE_BACKUP,
@@ -307,6 +322,7 @@ static const BlockJobDriver backup_job_driver = {
 .commit = backup_commit,
 .abort  = backup_abort,
 .attached_aio_context   = backup_attached_aio_context,
+.drain  = backup_drain,
 };
 
 static BlockErrorAction backup_error_action(BackupBlockJob *job,
@@ -331,6 +347,7 @@ static void backup_complete(BlockJob *job, void *opaque)
 BackupCompleteData *data = opaque;
 
 blk_unref(s->target);
+s->target = NULL;
 
 block_job_completed(job, data->ret);
 g_free(data);
diff --git a/block/mirror.c b/block/mirror.c
index f9d1fec..77d188a 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -469,7 +469,11 @@ static void mirror_free_init(MirrorBlockJob *s)
 }
 }
 
-static void mirror_drain(MirrorBlockJob *s)
+/* This is also used for the .pause callback. There is no matching
+ * mirror_resume() because mirror_run() will begin iterating again
+ * when the job is resumed.
+ */
+static void mirror_wait_for_all_io(MirrorBlockJob *s)
 {
 while (s->in_flight > 0) {
 mirror_wait_for_io(s);
@@ -528,6 +532,7 @@ static void mirror_exit(BlockJob *job, void *opaque)
 g_free(s->replaces);
 bdrv_op_unblock_all(target_bs, s->common.blocker);
 blk_unref(s->target);
+s->target = NULL;
 block_job_completed(>common, data->ret);
 g_free(data);
 bdrv_drained_end(src);
@@ -582,7 +587,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
 sector_num += nb_sectors;
 }
 
-mirror_drain(s);
+mirror_wait_for_all_io(s);
 }
 
 /* First part, loop on the sectors and initialize the dirty bitmap.  */
@@ -786,7 +791,7 @@ immediate_exit:
  * the target is a copy of the source.
  */
 assert(ret < 0 || (!s->synced && block_job_is_cancelled(>common)));
-mirror_drain(s);
+mirror_wait_for_all_io(s);
 }
 
 assert(s->in_flight == 0);
@@ -870,14 +875,11 @@ static void mirror_complete(BlockJob *job, Error **errp)
 block_job_enter(>common);
 }
 
-/* There is no matching mirror_resume() because mirror_run() will begin
- * iterating again when the job is resumed.
- */
-static void coroutine_fn mirror_pause(BlockJob *job)
+static void mirror_pause(BlockJob *job)
 {
 MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
 
-mirror_drain(s);
+mirror_wait_for_all_io(s);
 }
 
 static void mirror_attached_aio_context(BlockJob *job, AioContext *new_context)
@@ -887,6 +889,21 @@ static void mirror_attached_aio_context(BlockJob *job, 
AioContext *new_context)
 blk_set_aio_context(s->target, new_context);
 }
 
+static void mirror_drain(BlockJob *job)
+{
+MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
+
+/* Need to keep a reference in case blk_drain triggers execution
+ * of mirror_complete...
+ */
+if (s->target) {
+BlockBackend *target = s->target;
+blk_ref(target);
+blk_drain(target);
+blk_unref(target);
+}
+}
+
 static const BlockJobDriver mirror_job_driver = {
 .instance_size  = sizeof(MirrorBlockJob),
 .job_type   = BLOCK_JOB_TYPE_MIRROR,
@@ -894,6 +911,7 @@ static const BlockJobDriver mirror_job_driver = {
 .complete   = mirror_complete,
 .pause  = mirror_pause,
 .attached_aio_context   = mirror_attached_aio_context,
+.drain  = mirror_drain,
 };
 
 static const BlockJobDriver 

[Qemu-block] [PATCH 03/20] mirror: use bdrv_drained_begin/bdrv_drained_end

2016-10-17 Thread Paolo Bonzini
Ensure that there are no changes between the last check to
bdrv_get_dirty_count and the switch to the target.

There is already a bdrv_drained_end call, we only need to ensure
that bdrv_drained_begin is not called twice.

Signed-off-by: Paolo Bonzini 
---
 block/mirror.c | 35 +++
 1 file changed, 23 insertions(+), 12 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 77d188a..45f9248 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -622,6 +622,7 @@ static void coroutine_fn mirror_run(void *opaque)
 MirrorExitData *data;
 BlockDriverState *bs = blk_bs(s->common.blk);
 BlockDriverState *target_bs = blk_bs(s->target);
+bool need_drain = true;
 int64_t length;
 BlockDriverInfo bdi;
 char backing_filename[2]; /* we only need 2 characters because we are only
@@ -756,11 +757,26 @@ static void coroutine_fn mirror_run(void *opaque)
  * source has dirty data to copy!
  *
  * Note that I/O can be submitted by the guest while
- * mirror_populate runs.
+ * mirror_populate runs, so pause it now.  Before deciding
+ * whether to switch to target check one last time if I/O has
+ * come in the meanwhile, and if not flush the data to disk.
  */
 trace_mirror_before_drain(s, cnt);
-bdrv_co_drain(bs);
+
+bdrv_drained_begin(bs);
 cnt = bdrv_get_dirty_count(s->dirty_bitmap);
+if (cnt > 0) {
+bdrv_drained_end(bs);
+continue;
+}
+
+/* The two disks are in sync.  Exit and report successful
+ * completion.
+ */
+assert(QLIST_EMPTY(>tracked_requests));
+s->common.cancelled = false;
+need_drain = false;
+break;
 }
 
 ret = 0;
@@ -773,13 +789,6 @@ static void coroutine_fn mirror_run(void *opaque)
 } else if (!should_complete) {
 delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0);
 block_job_sleep_ns(>common, QEMU_CLOCK_REALTIME, delay_ns);
-} else if (cnt == 0) {
-/* The two disks are in sync.  Exit and report successful
- * completion.
- */
-assert(QLIST_EMPTY(>tracked_requests));
-s->common.cancelled = false;
-break;
 }
 s->last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
 }
@@ -791,6 +800,7 @@ immediate_exit:
  * the target is a copy of the source.
  */
 assert(ret < 0 || (!s->synced && block_job_is_cancelled(>common)));
+assert(need_drain);
 mirror_wait_for_all_io(s);
 }
 
@@ -802,9 +812,10 @@ immediate_exit:
 
 data = g_malloc(sizeof(*data));
 data->ret = ret;
-/* Before we switch to target in mirror_exit, make sure data doesn't
- * change. */
-bdrv_drained_begin(bs);
+
+if (need_drain) {
+bdrv_drained_begin(bs);
+}
 block_job_defer_to_main_loop(>common, mirror_exit, data);
 }
 
-- 
2.7.4





[Qemu-block] [PATCH 06/20] qed: Implement .bdrv_drain

2016-10-17 Thread Paolo Bonzini
From: Fam Zheng 

The "need_check_timer" is used to clear the "NEED_CHECK" flag in the
image header after a grace period once metadata update has finished. To
comply with the bdrv_drain semantics, we should make sure it remains
deleted once .bdrv_drain is called.

The change to qed_need_check_timer_cb is needed because bdrv_qed_drain
is called after s->bs has been drained, and should not operate on it;
instead it should operate on the BdrvChild-ren exclusively.  Doing so
is easy because QED does not have a bdrv_co_flush_to_os callback, hence
all that is needed to flush it is to ensure writes have reached the disk.

Based on commit df9a681dc9a (which however included some unrelated
hunks, possibly due to a merge failure or an overlooked squash).
The patch was reverted because at the time bdrv_qed_drain could call
qed_plug_allocating_write_reqs while an allocating write was queued.
This however is not possible anymore after the previous patch;
.bdrv_drain is only called after all writes have completed at the
QED level, and its purpose is to trigger metadata writes in bs->file.

Signed-off-by: Fam Zheng 
Signed-off-by: Paolo Bonzini 
---
 block/qed.c | 16 +++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/block/qed.c b/block/qed.c
index 3ee879b..1a7ef0a 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -336,7 +336,7 @@ static void qed_need_check_timer_cb(void *opaque)
 qed_plug_allocating_write_reqs(s);
 
 /* Ensure writes are on disk before clearing flag */
-bdrv_aio_flush(s->bs, qed_clear_need_check, s);
+bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s);
 }
 
 static void qed_start_need_check_timer(BDRVQEDState *s)
@@ -378,6 +378,19 @@ static void bdrv_qed_attach_aio_context(BlockDriverState 
*bs,
 }
 }
 
+static void bdrv_qed_drain(BlockDriverState *bs)
+{
+BDRVQEDState *s = bs->opaque;
+
+/* Fire the timer immediately in order to start doing I/O as soon as the
+ * header is flushed.
+ */
+if (s->need_check_timer && timer_pending(s->need_check_timer)) {
+qed_cancel_need_check_timer(s);
+qed_need_check_timer_cb(s);
+}
+}
+
 static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
  Error **errp)
 {
@@ -1668,6 +1681,7 @@ static BlockDriver bdrv_qed = {
 .bdrv_check   = bdrv_qed_check,
 .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
 .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
+.bdrv_drain   = bdrv_qed_drain,
 };
 
 static void bdrv_qed_init(void)
-- 
2.7.4





[Qemu-block] [PATCH 01/20] replication: interrupt failover if the main device is closed

2016-10-17 Thread Paolo Bonzini
Without this change, there is a race condition in tests/test-replication.
Depending on how fast the failover job (active commit) runs, there is a
chance of two bad things happening:

1) replication_done can be called after the secondary has been closed
and hence when the BDRVReplicationState is not valid anymore.

2) two copies of the active disk are present during the
/replication/secondary/stop test (that test runs immediately after
/replication/secondary/start, which tests failover).  This causes the
corruption detector to fire.

Reviewed-by: Wen Congyang 
Reviewed-by: Changlong Xie 
Signed-off-by: Paolo Bonzini 
---
(already applied to Stefan's tree)

 block/replication.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/block/replication.c b/block/replication.c
index 3bd1cf1..5231a00 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -133,6 +133,9 @@ static void replication_close(BlockDriverState *bs)
 if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
 replication_stop(s->rs, false, NULL);
 }
+if (s->replication_state == BLOCK_REPLICATION_FAILOVER) {
+block_job_cancel_sync(s->active_disk->bs->job);
+}
 
 if (s->mode == REPLICATION_MODE_SECONDARY) {
 g_free(s->top_id);
-- 
2.7.4





Re: [Qemu-block] [v2 2/5] block/ssh: Add InetSocketAddress and accept it

2016-10-17 Thread Kevin Wolf
Am 17.10.2016 um 14:33 hat Ashijeet Acharya geschrieben:
> On Sun, Oct 16, 2016 at 4:00 AM, Max Reitz  wrote:
> > On 15.10.2016 11:04, Ashijeet Acharya wrote:
> >> Add InetSocketAddress compatibility to SSH driver.
> >>
> >> Add a new option "server" to the SSH block driver which then accepts
> >> a InetSocketAddress.
> >>
> >> "host" and "port" are supported as legacy options and are mapped to
> >> their InetSocketAddress representation.
> >>
> >> Signed-off-by: Ashijeet Acharya 
> >> ---
> >>  block/ssh.c | 83 
> >> ++---
> >>  1 file changed, 74 insertions(+), 9 deletions(-)
> >>
> >>
> >>  /* Open the socket and connect. */
> >>  s->sock = inet_connect(s->hostport, errp);
> >> @@ -634,7 +698,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict 
> >> *options,
> >>  }
> >>
> >>  /* Check the remote host's key against known_hosts. */
> >> -ret = check_host_key(s, host, port, host_key_check, errp);
> >> +ret = check_host_key(s, s->inet->host, port, host_key_check,
> >
> > But then you're still using the port here... And I can't come up with a
> > way (not even a bad one) to get the numeric port. Maybe interpret the
> > addrinfo in inet_connect_saddr()? But getting that information out would
> > be ugly, if even possible...
> >
> > So maybe the best is to keep it this way and put a FIXME above the
> > atoi() call. :-/
> 
> Kevin, I believe (after talking with Max) that regarding the atoi()
> issue, I can't use any string to integer function since it won't
> succeed for cases like port = 'ssh' and putting a FIXME over it seems
> to be the only option. But Max did warn me, though, to get everybody's
> opinion before I do so. So I am awaiting your response on this one.
> Much better will be if you have a workaround solution in mind!! :-)

The integer port is only needed for libssh2_knownhost_checkp(). One
option could be to consider passing -1 instead:

port is the port number used by the host (or a negative number to
check the generic host). If the port number is given, libssh2 will
check the key for the specific host + port number combination in
addition to the plain host name only check.

In 99% of the cases, this shouldn't make any difference.

Alternatively it could be possible to use getservbyname() to get the
port number from the name, but maybe that's a bit too much for a feature
that most people don't even know of.

I'm also not completely opposed to simply requiring a numeric argument
for SSH. There is no real use to support service names here other than
being consistent with other places in qemu.

Kevin



Re: [Qemu-block] [v2 0/5] Allow blockdev-add for SSH

2016-10-17 Thread Ashijeet Acharya
On Mon, Oct 17, 2016 at 4:59 PM, Kevin Wolf  wrote:
> Am 15.10.2016 um 11:04 hat Ashijeet Acharya geschrieben:
>> Previously posted series patches:
>> v1: http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg02137.html
>
> One general remark: The subject line should still include the PATCH
> keyword, so it should be like "[PATCH v2 0/5] Allow blockdev-add for
> SSH".
>
> This is also what you automatically get with 'git format-patch -v2 ...'
>
Noted, won't happen again.

Ashijeet
> Kevin



Re: [Qemu-block] [v2 2/5] block/ssh: Add InetSocketAddress and accept it

2016-10-17 Thread Ashijeet Acharya
On Sun, Oct 16, 2016 at 4:00 AM, Max Reitz  wrote:
> On 15.10.2016 11:04, Ashijeet Acharya wrote:
>> Add InetSocketAddress compatibility to SSH driver.
>>
>> Add a new option "server" to the SSH block driver which then accepts
>> a InetSocketAddress.
>>
>> "host" and "port" are supported as legacy options and are mapped to
>> their InetSocketAddress representation.
>>
>> Signed-off-by: Ashijeet Acharya 
>> ---
>>  block/ssh.c | 83 
>> ++---
>>  1 file changed, 74 insertions(+), 9 deletions(-)
>>
>>
>>  /* Open the socket and connect. */
>>  s->sock = inet_connect(s->hostport, errp);
>> @@ -634,7 +698,8 @@ static int connect_to_ssh(BDRVSSHState *s, QDict 
>> *options,
>>  }
>>
>>  /* Check the remote host's key against known_hosts. */
>> -ret = check_host_key(s, host, port, host_key_check, errp);
>> +ret = check_host_key(s, s->inet->host, port, host_key_check,
>
> But then you're still using the port here... And I can't come up with a
> way (not even a bad one) to get the numeric port. Maybe interpret the
> addrinfo in inet_connect_saddr()? But getting that information out would
> be ugly, if even possible...
>
> So maybe the best is to keep it this way and put a FIXME above the
> atoi() call. :-/

Kevin, I believe (after talking with Max) that regarding the atoi()
issue, I can't use any string to integer function since it won't
succeed for cases like port = 'ssh' and putting a FIXME over it seems
to be the only option. But Max did warn me, though, to get everybody's
opinion before I do so. So I am awaiting your response on this one.
Much better will be if you have a workaround solution in mind!! :-)

Ashijeet
>
> Max



Re: [Qemu-block] [v2 0/5] Allow blockdev-add for SSH

2016-10-17 Thread Kevin Wolf
Am 15.10.2016 um 11:04 hat Ashijeet Acharya geschrieben:
> Previously posted series patches:
> v1: http://lists.nongnu.org/archive/html/qemu-devel/2016-10/msg02137.html

One general remark: The subject line should still include the PATCH
keyword, so it should be like "[PATCH v2 0/5] Allow blockdev-add for
SSH".

This is also what you automatically get with 'git format-patch -v2 ...'

Kevin



Re: [Qemu-block] [v2 2/5] block/ssh: Add InetSocketAddress and accept it

2016-10-17 Thread Kevin Wolf
Am 16.10.2016 um 00:30 hat Max Reitz geschrieben:
> > +static InetSocketAddress *ssh_config(BDRVSSHState *s, QDict *options,
> > + Error **errp)
> > +{
> > +InetSocketAddress *inet = NULL;
> > +QDict *addr = NULL;
> > +QObject *crumpled_addr = NULL;
> > +Visitor *iv = NULL;
> > +Error *local_error = NULL;
> > +
> > +qdict_extract_subqdict(options, , "server.");
> > +if (!qdict_size(addr)) {
> > +error_setg(errp, "SSH server address missing");
> > +goto out;
> > +}
> > +
> > +crumpled_addr = qdict_crumple(addr, true, errp);
> > +if (!crumpled_addr) {
> > +goto out;
> > +}
> > +
> > +iv = qobject_input_visitor_new_autocast(crumpled_addr, true, 1, true);
> 
> In contrast to what Kevin said in v1, I think you do not want to use
> autocast here.
> 
> Or, to be more specific, it's difficult. The thing is that the autocast
> documentation says: "Any scalar values in the @obj input data structure
> should always be represented as strings".
> 
> So if you do use the autocast version, command line works great because
> from there everything comes as a string. But blockdev-add no longer
> works because from there everything comes with the correct type (and you
> cannot give it the wrong type).
> [...]
> In contrast, if you do not use the autocast version, blockdev-add will
> work just fine, but you can no longer specify non-string values from the
> command line.

Ah, right, I missed that. :-/

> I don't think this is your problem, though. There should be a way for
> the command line options to be converted to the correct types while we
> continue to use strict type-checking for blockdev-add.
> 
> Therefore, I think you'll have to sacrifice one or the other here. All
> of the non-string options are optional, so it won't be too bad in any case.

If we have to sacrifice one, then yes, blockdev-add is the one that must
work. The new -blockdev command line option will then automatically
work, too, so at least there will be a way to create such nodes.

The usual way to get around the type conflicts is going through a
QemuOpts. So maybe qemu_opts_from_dict() with a QemuOptionsList that
accepts anythign, and then qobject_input_visitor_new_opts() could be a
workaround to keep -drive working at the same time. It's kind of ugly,
though.

Kevin


pgpaxQWH_xpeT.pgp
Description: PGP signature


Re: [Qemu-block] bug introduced by "block: Move throttling fields from BDS to BB"

2016-10-17 Thread Kevin Wolf
Am 17.10.2016 um 10:49 hat Alberto Garcia geschrieben:
> On Fri 14 Oct 2016 04:11:46 PM CEST, Paolo Bonzini wrote:
> > Here is next_throttle_token:
> >
> > -ThrottleGroup *tg = container_of(blk_bs(blk)->throttle_state,
> > - ThrottleGroup, ts);
> > +BlockBackendPublic *blkp = blk_get_public(blk);
> > +ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, 
> > ts);
> >  BlockBackend *token, *start;
> >  
> >  start = token = tg->tokens[is_write];
> >  
> >  /* get next bs round in round robin style */
> >  token = throttle_group_next_blk(token);
> > -while (token != start && !blk_bs(token)->pending_reqs[is_write]) {
> > +while (token != start && !blkp->pending_reqs[is_write]) {
> >  token = throttle_group_next_blk(token);
> >  }
> >
> >
> > blkp isn't updated every time token is updated.
> 
> You're right, I'll write a patch. I'd also try to check why this was not
> detected by any iotest.
> 
> Thanks!

Thanks a lot, Berto! (Both for fixing my bug and thinking of test cases)

Kevin



Re: [Qemu-block] [PATCH v5 1/1] block: improve error handling in raw_open

2016-10-17 Thread Kevin Wolf
Am 14.10.2016 um 17:59 hat Stefan Hajnoczi geschrieben:
> On Tue, Oct 11, 2016 at 04:12:35PM +0200, Halil Pasic wrote:
> > Make raw_open for POSIX more consistent in handling errors by setting
> > the error object also when qemu_open fails. The error object was set
> > generally set in case of errors, but I guess this case was overlooked.
> > Do the same for win32.
> > 
> > Signed-off-by: Halil Pasic 
> > Reviewed-by: Sascha Silbe 
> > Tested-by: Marc Hartmayer  (POSIX only)
> > 
> > ---
> > 
> > Stumbled upon this (POSIX) while testing VMs with too many SCSI disks in
> > respect to my nofile limit. When open hits the nofile limit while trying
> > to hotplug yet another SCSI disk via libvirt we end up with no adequate
> > error message (one stating too many files). Sadly this patch in not
> > sufficient to fix this problem because drive_new (/qemu/blockdev.c)
> > handles errors using error_report_err which is documented as not to be
> > used in QMP context.
> > 
> > The win32 part was not tested, and the sole reason I touched it is
> > to not introduce unnecessary divergence.
> > 
> > v4 -> v5:
> > * fix qemu-iotests by adding the filename to the message
> 
> This patch doesn't modify any iotests golden master files.  Does this
> mean the iotests output is unchanged?
> 
> > v3 -> v4:
> > * rebased on current master
> > v2 -> v3:
> > * first save errno then error_setg_errno
> > v1 -> v2:
> > * fixed win32 by the correct error_setg_*
> > * use the original errno consequently
> > ---
> >  block/raw-posix.c | 1 +
> >  block/raw-win32.c | 1 +
> >  2 files changed, 2 insertions(+)
> 
> Reviewed-by: Stefan Hajnoczi 

Thanks, applied to the block branch.

Kevin


pgpKjFr_3EYB1.pgp
Description: PGP signature


Re: [Qemu-block] [Qemu-devel] [PATCH v2 10/11] blockjob: refactor backup_start as backup_job_create

2016-10-17 Thread Fam Zheng
On Tue, 10/11 11:35, Kevin Wolf wrote:
> > >By the way, why did we allow to add a 'bitmap' option for DriveBackup
> > >without adding it to BlockdevBackup at the same time?
> > 
> > I don't remember. I'm not sure anyone ever audited it to convince
> > themselves it was a useful or safe thing to do. I believe at the
> > time I was pushing for bitmaps in DriveBackup, Fam was still
> > authoring the BlockdevBackup interface.
> 
> Hm, maybe that's why. I checked the commit dates of both (and there
> BlockdevBackup was earlier), but I didn't check the development history.
> 
> Should we add it now or is it a bad idea?

Yes, we should add it. I'll send a separate patch. Thanks for catching that.

Fam



Re: [Qemu-block] [Qemu-devel] [PATCH 00/18] dataplane: remove RFifoLock (including almost all previously sent patches)

2016-10-17 Thread Christian Borntraeger
On 10/13/2016 07:34 PM, Paolo Bonzini wrote:
> This patch reorganizes aio_poll callers to establish new rules for
> dataplane locking.  The idea is that I/O operations on a dataplane
> BDS (i.e. one where the AioContext is not the main one) do not call
> aio_poll anymore.  Instead, they wait for the operation to end in the
> other I/O thread, at which point the other I/O thread calls bdrv_wakeup
> to wake up the main thread.
> 
> With this change, only one thread runs aio_poll for an AioContext.
> While aio_context_acquire/release is still needed to protect the BDSes,
> it need not interrupt the other thread's event loop anymore, and therefore
> it does not need contention callbacks anymore.  Thus the patch can remove
> RFifoLock.  This fixes possible hangs in bdrv_drain_all, reproducible (for
> example) by unplugging a virtio-scsi-dataplane device while there is I/O
> going on for a virtio-blk-dataplane on the same I/O thread.


Have you seen improvements or deteriorations in performance with single disks
or multiple disks? Is there a branch? 




Re: [Qemu-block] [Qemu-devel] [PATCH v2 3/4] fdc: Move qdev properties to FloppyDrive

2016-10-17 Thread Kevin Wolf
Am 15.10.2016 um 00:32 hat John Snow geschrieben:
> On 09/30/2016 03:39 PM, Kevin Wolf wrote:
> >This makes the FloppyDrive qdev object actually useful: Now that it has
> >all properties that don't belong to the controller, you can actually
> >use '-device floppy' and get a working result.
> >
> >Command line semantics is consistent with CD-ROM drives: By default you
> >get a single empty floppy drive. You can override it with -drive and
> >using the same index, but if you use -drive to add a floppy to a
> >different index, you get both of them. However, as soon as you use any
> >'-device floppy', even to a different slot, the default drive is
> >disabled.
> >
> >Using '-device floppy' without specifying the unit will choose the first
> >free slot on the controller.
> >
> >Signed-off-by: Kevin Wolf 
> >---
> > hw/block/fdc.c | 112 
> > ++---
> > vl.c   |   1 +
> > 2 files changed, 85 insertions(+), 28 deletions(-)
> >
> >diff --git a/hw/block/fdc.c b/hw/block/fdc.c
> >index 5aa8e52..00c0ec6 100644
> >--- a/hw/block/fdc.c
> >+++ b/hw/block/fdc.c
> >@@ -35,6 +35,7 @@
> > #include "qemu/timer.h"
> > #include "hw/isa/isa.h"
> > #include "hw/sysbus.h"
> >+#include "hw/block/block.h"
> > #include "sysemu/block-backend.h"
> > #include "sysemu/blockdev.h"
> > #include "sysemu/sysemu.h"
> >@@ -487,12 +488,18 @@ static const BlockDevOps fd_block_ops = {
> >  OBJECT_CHECK(FloppyDrive, (obj), TYPE_FLOPPY_DRIVE)
> >
> > typedef struct FloppyDrive {
> >-DeviceState qdev;
> >-uint32_tunit;
> >+DeviceState qdev;
> >+uint32_tunit;
> >+BlockConf   conf;
> >+FloppyDriveType type;
> > } FloppyDrive;
> >
> > static Property floppy_drive_properties[] = {
> > DEFINE_PROP_UINT32("unit", FloppyDrive, unit, -1),
> >+DEFINE_BLOCK_PROPERTIES(FloppyDrive, conf),
> >+DEFINE_PROP_DEFAULT("drive-type", FloppyDrive, type,
> >+FLOPPY_DRIVE_TYPE_AUTO, qdev_prop_fdc_drive_type,
> >+FloppyDriveType),
> > DEFINE_PROP_END_OF_LIST(),
> > };
> >
> >@@ -501,6 +508,7 @@ static int floppy_drive_init(DeviceState *qdev)
> > FloppyDrive *dev = FLOPPY_DRIVE(qdev);
> > FloppyBus *bus = DO_UPCAST(FloppyBus, bus, dev->qdev.parent_bus);
> > FDrive *drive;
> >+int ret;
> >
> > if (dev->unit == -1) {
> > for (dev->unit = 0; dev->unit < MAX_FD; dev->unit++) {
> >@@ -517,29 +525,57 @@ static int floppy_drive_init(DeviceState *qdev)
> > return -1;
> > }
> >
> >-/* TODO Check whether unit is in use */
> >-
> > drive = get_drv(bus->fdc, dev->unit);
> >-
> > if (drive->blk) {
> >-if (blk_get_on_error(drive->blk, 0) != BLOCKDEV_ON_ERROR_ENOSPC) {
> >-error_report("fdc doesn't support drive option werror");
> >-return -1;
> >-}
> >-if (blk_get_on_error(drive->blk, 1) != BLOCKDEV_ON_ERROR_REPORT) {
> >-error_report("fdc doesn't support drive option rerror");
> >-return -1;
> >-}
> >-} else {
> >+error_report("Floppy unit %d is in use", dev->unit);
> >+return -1;
> >+}
> >+
> >+if (!dev->conf.blk) {
> > /* Anonymous BlockBackend for an empty drive */
> >-drive->blk = blk_new();
> >+dev->conf.blk = blk_new();
> >+ret = blk_attach_dev(dev->conf.blk, dev);
> 
> Missing a 'q' here:   ^

Yes. It has the same value, but after my last pull request we need a
DeviceState* here indeed rather than a void*.

> >@@ -2782,8 +2838,8 @@ static const VMStateDescription vmstate_sysbus_fdc ={
> > };
> >
> > static Property sysbus_fdc_properties[] = {
> >-DEFINE_PROP_DRIVE("driveA", FDCtrlSysBus, state.drives[0].blk),
> >-DEFINE_PROP_DRIVE("driveB", FDCtrlSysBus, state.drives[1].blk),
> >+DEFINE_PROP_DRIVE("driveA", FDCtrlSysBus, state.qdev_for_drives[0].blk),
> >+DEFINE_PROP_DRIVE("driveB", FDCtrlSysBus, state.qdev_for_drives[1].blk),
> > DEFINE_PROP_DEFAULT("fdtypeA", FDCtrlSysBus, state.drives[0].drive,
> > FLOPPY_DRIVE_TYPE_AUTO, qdev_prop_fdc_drive_type,
> > FloppyDriveType),
> 
> ^ Does sysbus' type property not need updating ...?

Doing half of the properties here felt like a good transitional step
from fully converting the PC device to completely ignoring Sun.

Well, I guess, I should fix that...

Kevin



Re: [Qemu-block] bug introduced by "block: Move throttling fields from BDS to BB"

2016-10-17 Thread Alberto Garcia
On Fri 14 Oct 2016 04:11:46 PM CEST, Paolo Bonzini wrote:
> Here is next_throttle_token:
>
> -ThrottleGroup *tg = container_of(blk_bs(blk)->throttle_state,
> - ThrottleGroup, ts);
> +BlockBackendPublic *blkp = blk_get_public(blk);
> +ThrottleGroup *tg = container_of(blkp->throttle_state, ThrottleGroup, 
> ts);
>  BlockBackend *token, *start;
>  
>  start = token = tg->tokens[is_write];
>  
>  /* get next bs round in round robin style */
>  token = throttle_group_next_blk(token);
> -while (token != start && !blk_bs(token)->pending_reqs[is_write]) {
> +while (token != start && !blkp->pending_reqs[is_write]) {
>  token = throttle_group_next_blk(token);
>  }
>
>
> blkp isn't updated every time token is updated.

You're right, I'll write a patch. I'd also try to check why this was not
detected by any iotest.

Thanks!

Berto



Re: [Qemu-block] [PATCH v4 09/12] iotests.py: Add qemu_nbd function

2016-10-17 Thread Kevin Wolf
Am 15.10.2016 um 19:17 hat Max Reitz geschrieben:
> On 13.10.2016 15:11, Kevin Wolf wrote:
> > Am 28.09.2016 um 22:55 hat Max Reitz geschrieben:
> >> Signed-off-by: Max Reitz 
> >> ---
> >>  tests/qemu-iotests/iotests.py | 8 
> >>  1 file changed, 8 insertions(+)
> >>
> >> diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
> >> index 3329bc1..5a2678f 100644
> >> --- a/tests/qemu-iotests/iotests.py
> >> +++ b/tests/qemu-iotests/iotests.py
> >> @@ -39,6 +39,10 @@ qemu_io_args = [os.environ.get('QEMU_IO_PROG', 
> >> 'qemu-io')]
> >>  if os.environ.get('QEMU_IO_OPTIONS'):
> >>  qemu_io_args += os.environ['QEMU_IO_OPTIONS'].strip().split(' ')
> >>  
> >> +qemu_nbd_args = [os.environ.get('QEMU_NBD_PROG', 'qemu-nbd')]
> >> +if os.environ.get('QEMU_NBD_OPTIONS'):
> >> +qemu_nbd_args += os.environ['QEMU_NBD_OPTIONS'].strip().split(' ')
> >> +
> >>  qemu_prog = os.environ.get('QEMU_PROG', 'qemu')
> >>  qemu_opts = os.environ.get('QEMU_OPTIONS', '').strip().split(' ')
> >>  
> >> @@ -87,6 +91,10 @@ def qemu_io(*args):
> >>  sys.stderr.write('qemu-io received signal %i: %s\n' % (-exitcode, 
> >> ' '.join(args)))
> >>  return subp.communicate()[0]
> >>  
> >> +def qemu_nbd(*args):
> >> +'''Run qemu-nbd in daemon mode and return the parent's exit code'''
> >> +return subprocess.call(qemu_nbd_args + ['--fork'] + list(args))
> > 
> > Wouldn't it be better to always use -t, track the PID and shut it down
> > explicitly when the test exits?
> 
> Probably. It's a lot more complicated, though. I'll see what I can do
> but I'm not sure if I can do a lot before 2.8.

In that case, I'd prefer to have this series in 2.8 and improve the test
case later, so don't let this stop you from sending the next version.

Kevin


pgpzbUcWw3Ebw.pgp
Description: PGP signature


Re: [Qemu-block] [PATCH 15/18] block: only call aio_poll on the current thread's AioContext

2016-10-17 Thread Paolo Bonzini


On 16/10/2016 18:40, Stefan Hajnoczi wrote:
> >  void bdrv_wakeup(BlockDriverState *bs)
> >  {
> > +if (bs->wakeup) {
> > +aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
> > +}
> >  }
> 
> Why use a dummy BH instead of aio_notify()?

Originally I used aio_bh_schedule_oneshot() because aio_notify() is not
enough for aio_poll() to return true.  It's also true that I am not
using anymore the result of aio_poll, though.

Since this is not a fast path and it's not very much stressed by
qemu-iotests, I think it's better if we can move towards making
aio_notify() more or less an internal detail.  If you prefer
aio_notify(), however, I can look into that as well.

Thanks,

Paolo

>> >  void bdrv_dec_in_flight(BlockDriverState *bs)
>> > diff --git a/include/block/block.h b/include/block/block.h
>> > index ba4318b..72d5d8e 100644
>> > --- a/include/block/block.h
>> > +++ b/include/block/block.h
>> > @@ -343,9 +343,27 @@ void bdrv_drain_all(void);
>> >  #define bdrv_poll_while(bs, cond) ({   \
>> >  bool waited_ = false;  \
>> >  BlockDriverState *bs_ = (bs);  \
>> > -while ((cond)) {   \
>> > -aio_poll(bdrv_get_aio_context(bs_), true); \
>> > -waited_ = true;\
>> > +AioContext *ctx_ = bdrv_get_aio_context(bs_);  \
>> > +if (aio_context_in_iothread(ctx_)) {   \
>> > +while ((cond)) {   \
>> > +aio_poll(ctx_, true);  \
>> > +waited_ = true;\
>> > +}  \
>> > +} else {   \
>> > +assert(qemu_get_current_aio_context() ==   \
>> > +   qemu_get_aio_context());\
> The assumption is that IOThread #1 will never call bdrv_poll_while() on
> IOThread #2's AioContext.  I believe that is true today.  Is this what
> you had in mind?
> 
> Please add a comment since it's not completely obvious from the assert
> expression.
> 



Re: [Qemu-block] [PATCH 07/18] block: introduce bdrv_poll_while and bdrv_wakeup

2016-10-17 Thread Paolo Bonzini


On 16/10/2016 12:25, Stefan Hajnoczi wrote:
> On Thu, Oct 13, 2016 at 07:34:11PM +0200, Paolo Bonzini wrote:
>> @@ -485,9 +474,14 @@ void bdrv_inc_in_flight(BlockDriverState *bs)
>>  atomic_inc(>in_flight);
>>  }
>>  
>> +void bdrv_wakeup(BlockDriverState *bs)
>> +{
>> +}
> 
> Please write a doc comment explaining the semantics of this new API.
> 
> Since it's a nop here it may be better to introduce bdrv_wakeup() in
> another patch.

Okay, will do.

Paolo



Re: [Qemu-block] [PATCH 02/18] blockjob: introduce .drain callback for jobs

2016-10-17 Thread Paolo Bonzini


On 16/10/2016 12:02, Stefan Hajnoczi wrote:
> On Thu, Oct 13, 2016 at 07:34:06PM +0200, Paolo Bonzini wrote:
>> +static void backup_drain(BlockJob *job)
>> +{
>> +BackupBlockJob *s = container_of(job, BackupBlockJob, common);
>> +
>> +/* Need to keep a reference in case blk_drain triggers execution
>> + * of backup_complete...
>> + */
>> +if (s->target) {
>> +blk_ref(s->target);
>> +blk_drain(s->target);
>> +blk_unref(s->target);
>> +}
> [...]
>> @@ -331,6 +346,7 @@ static void backup_complete(BlockJob *job, void *opaque)
>>  BackupCompleteData *data = opaque;
>>  
>>  blk_unref(s->target);
>> +s->target = NULL;
> 
> Will blk_unref(s->target) segfault since backup_complete() has set it to
> NULL?  I expected backup_drain() to stash the pointer in a local
> variable to avoid using s->target.

Yes, indeed.

Paolo