Re: [Qemu-block] [Qemu-devel] [PATCH v3 0/3] file-posix: Simplifications on image locking

2018-08-17 Thread no-reply
Hi,

This series failed docker-quick@centos7 build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

Type: series
Message-id: 20180817054758.14515-1-f...@redhat.com
Subject: [Qemu-devel] [PATCH v3 0/3] file-posix: Simplifications on image 
locking

=== TEST SCRIPT BEGIN ===
#!/bin/bash
time make docker-test-quick@centos7 SHOW_ENV=1 J=8
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
4276529934 tests: Add unit tests for image locking
a32bcf1c2b file-posix: Drop s->lock_fd
6c9342ffd1 file-posix: Skip effectiveless OFD lock operations

=== OUTPUT BEGIN ===
  BUILD   centos7
make[1]: Entering directory '/var/tmp/patchew-tester-tmp-l_n0wk17/src'
  GEN 
/var/tmp/patchew-tester-tmp-l_n0wk17/src/docker-src.2018-08-17-20.32.22.25661/qemu.tar
Cloning into 
'/var/tmp/patchew-tester-tmp-l_n0wk17/src/docker-src.2018-08-17-20.32.22.25661/qemu.tar.vroot'...
done.
Checking out files:  45% (2884/6330)   
Checking out files:  46% (2912/6330)   
Checking out files:  47% (2976/6330)   
Checking out files:  48% (3039/6330)   
Checking out files:  49% (3102/6330)   
Checking out files:  50% (3165/6330)   
Checking out files:  51% (3229/6330)   
Checking out files:  52% (3292/6330)   
Checking out files:  53% (3355/6330)   
Checking out files:  54% (3419/6330)   
Checking out files:  55% (3482/6330)   
Checking out files:  56% (3545/6330)   
Checking out files:  57% (3609/6330)   
Checking out files:  58% (3672/6330)   
Checking out files:  59% (3735/6330)   
Checking out files:  60% (3798/6330)   
Checking out files:  61% (3862/6330)   
Checking out files:  62% (3925/6330)   
Checking out files:  63% (3988/6330)   
Checking out files:  64% (4052/6330)   
Checking out files:  65% (4115/6330)   
Checking out files:  66% (4178/6330)   
Checking out files:  67% (4242/6330)   
Checking out files:  68% (4305/6330)   
Checking out files:  69% (4368/6330)   
Checking out files:  70% (4431/6330)   
Checking out files:  71% (4495/6330)   
Checking out files:  72% (4558/6330)   
Checking out files:  73% (4621/6330)   
Checking out files:  74% (4685/6330)   
Checking out files:  75% (4748/6330)   
Checking out files:  75% (4801/6330)   
Checking out files:  76% (4811/6330)   
Checking out files:  77% (4875/6330)   
Checking out files:  78% (4938/6330)   
Checking out files:  79% (5001/6330)   
Checking out files:  80% (5064/6330)   
Checking out files:  81% (5128/6330)   
Checking out files:  82% (5191/6330)   
Checking out files:  83% (5254/6330)   
Checking out files:  84% (5318/6330)   
Checking out files:  85% (5381/6330)   
Checking out files:  86% (5444/6330)   
Checking out files:  87% (5508/6330)   
Checking out files:  88% (5571/6330)   
Checking out files:  89% (5634/6330)   
Checking out files:  90% (5697/6330)   
Checking out files:  91% (5761/6330)   
Checking out files:  92% (5824/6330)   
Checking out files:  93% (5887/6330)   
Checking out files:  94% (5951/6330)   
Checking out files:  95% (6014/6330)   
Checking out files:  96% (6077/6330)   
Checking out files:  97% (6141/6330)   
Checking out files:  98% (6204/6330)   
Checking out files:  99% (6267/6330)   
Checking out files: 100% (6330/6330)   
Checking out files: 100% (6330/6330), done.
Your branch is up-to-date with 'origin/test'.
Submodule 'dtc' (git://git.qemu-project.org/dtc.git) registered for path 'dtc'
Cloning into 
'/var/tmp/patchew-tester-tmp-l_n0wk17/src/docker-src.2018-08-17-20.32.22.25661/qemu.tar.vroot/dtc'...
Submodule path 'dtc': checked out 'e54388015af1fb4bf04d0bca99caba1074d9cc42'
Submodule 'ui/keycodemapdb' (git://git.qemu.org/keycodemapdb.git) registered 
for path 'ui/keycodemapdb'
Cloning into 
'/var/tmp/patchew-tester-tmp-l_n0wk17/src/docker-src.2018-08-17-20.32.22.25661/qemu.tar.vroot/ui/keycodemapdb'...
Submodule path 'ui/keycodemapdb': checked out 
'6b3d716e2b6472eb7189d3220552280ef3d832ce'
  COPYRUNNER
RUN test-quick in qemu:centos7 
Packages installed:
SDL-devel-1.2.15-14.el7.x86_64
bison-3.0.4-1.el7.x86_64
bzip2-1.0.6-13.el7.x86_64
bzip2-devel-1.0.6-13.el7.x86_64
ccache-3.3.4-1.el7.x86_64
csnappy-devel-0-6.20150729gitd7bc683.el7.x86_64
flex-2.5.37-3.el7.x86_64
gcc-4.8.5-28.el7_5.1.x86_64
gettext-0.19.8.1-2.el7.x86_64
git-1.8.3.1-14.el7_5.x86_64
glib2-devel-2.54.2-2.el7.x86_64
libaio-devel-0.3.109-13.el7.x86_64
libepoxy-devel-1.3.1-2.el7_5.x86_64
libfdt-devel-1.4.6-1.el7.x86_64
lzo-devel-2.06-8.el7.x86_64
make-3.82-23.el7.x86_64
mesa-libEGL-devel-17.2.3-8.20171019.el7.x86_64
mesa-libgbm-devel-17.2.3-8.20171019.el7.x86_64
nettle-devel-2.7.1-8.el7.x86_64
package g++ is not installed
package librdmacm-devel is not installed
pixman-devel-0.34.0-1.el7.x86_64
spice-glib-devel-0.34-3.el7_5.1.x86_64
spice-server-devel-0.14.0-2.el7_5.4.x86_64
tar-1.26-34.el7.x86_64
vte-devel-0.28.2-10.el7.x86_64
xen-devel-4.6.6-12.el7.x86_64
zlib-devel-1.2.7-17.el7.x86_64

Environment variables:
PACKAGES=bison   

Re: [Qemu-block] [RFC v2] new, node-graph-based fleecing and backup

2018-08-17 Thread Max Reitz
On 2018-08-14 19:01, Vladimir Sementsov-Ogievskiy wrote:
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
> 
> [v2 is just a resend. I forgot to add Den and me to cc, and I don't see the
> letter in my thunderbird at all. strange. sorry for that]
> 
> Hi all!
> 
> Here is an idea and kind of proof-of-concept of how to unify and improve
> push/pull backup schemes.
> 
> Let's start from fleecing, a way of importing a point-in-time snapshot not
> creating a real snapshot. Now we do it with help of backup(sync=none)..
> 
> Proposal:
> 
> For fleecing we need two nodes:
> 
> 1. fleecing hook. It's a filter which should be inserted on top of active
> disk. Its main purpose is handling guest writes by copy-on-write operation,
> i.e. it's a substitution for write-notifier in backup job.
> 
> 2. fleecing cache. It's a target node for COW operations by fleecing-hook.
> It also represents a point-in-time snapshot of active disk for the readers.

It's not really COW, it's copy-before-write, isn't it?  It's something
else entirely.  COW is about writing data to an overlay *instead* of
writing it to the backing file.  Ideally, you don't copy anything,
actually.  It's just a side effect that you need to copy things if your
cluster size doesn't happen to match exactly what you're overwriting.

CBW is about copying everything to the overlay, and then leaving it
alone, instead of writing the data to the backing file.

I'm not sure how important it is, I just wanted to make a note so we
don't misunderstand what's going on, somehow.


The fleecing hook sounds good to me, but I'm asking myself why we don't
just add that behavior to the backup filter node.  That is, re-implement
backup without before-write notifiers by making the filter node actually
do something (I think there was some reason, but I don't remember).

> The simplest realization of fleecing cache is a qcow2 temporary image, backed
> by active disk, i.e.:
> 
> +---+
> | Guest |
> +---+---+
> |
> v
> +---+---+  file +---+
> | Fleecing hook +-->+ Fleecing cache(qcow2) |
> +---+---+   +---+---+
> |   |
> backing |   |
> v   |
> +---+-+  backing|
> | Active disk +<+
> +-+
> 
> Hm. No, because of permissions I can't do so, I have to do like this:
> 
> +---+
> | Guest |
> +---+---+
> |
> v
> +---+---+  file +---+
> | Fleecing hook +-->+ Fleecing cache(qcow2) |
> +---+---+   +-+-+
> | |
> backing | | backing
> v v
> +---+-+   backing   +-+-+
> | Active disk +<+ hack children permissions |
> +-+ | filter node   |
> +---+
> 
> Ok, this works, it's an image fleecing scheme without any block jobs.

So this is the goal?  Hm.  How useful is that really?

I suppose technically you could allow blockdev-add'ing a backup filter
node (though only with sync=none) and that would give you the same.

> Problems with realization:
> 
> 1 What to do with hack-permissions-node? What is a true way to implement
> something like this? How to tune permissions to avoid this additional node?

Hm, how is that different from what we currently do?  Because the block
job takes care of it?

Well, the user would have to guarantee the permissions.  And they can
only do that by manually adding a filter node in the backing chain, I
suppose.

Or they just start a block job which guarantees the permissions work...
So maybe it's best to just stay with a block job as it is.

> 2 Inserting/removing the filter. Do we have working way or developments on
> it?

Berto has posted patches for an x-blockdev-reopen QMP command.

> 3. Interesting: we can't setup backing link to active disk before inserting
> fleecing-hook, otherwise, it will damage this link on insertion. This means,
> that we can't create fleecing cache node in advance with all backing to
> reference it when creating fleecing hook. And we can't prepare all the nodes
> in advance and then insert the filter.. We have to:
> 1. create all the nodes with all links in one big json, or

I think that should be possible with x-blockdev-reopen.

> 2. set backing links/create nodes automatically, as it is done in this RFC
>  (it's a bad way I think, not clear, not transparent)
> 
> 4. Is it a good idea to use "backing" and "file" links in such way?

I don't think so, because you're pretending it to be a COW relationship
when it isn't.  Using backing for what it is is kind of OK (because
that's what the mirror and backup filters do, too), but 

Re: [Qemu-block] [PATCH v2 1/3] qapi: add x-query-block-graph

2018-08-17 Thread Max Reitz
On 2018-08-17 22:32, Eric Blake wrote:
> On 08/17/2018 01:04 PM, Vladimir Sementsov-Ogievskiy wrote:
>> Add a new command, returning block nodes graph.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>> ---
>>   qapi/block-core.json  | 116
>> ++
> 
>> +##
>> +# @BlockGraphEdge:
>> +#
>> +# Block Graph edge description for x-query-block-graph.
>> +#
>> +# @parent: parent id
>> +#
>> +# @child: child id
>> +#
>> +# @name: name of the relation (examples are 'file' and 'backing')
> 
> Can this be made into a QAPI enum? (It would have ripple effects to
> existing code, but might be a worthwhile cleanup).
> 
>> +#
>> +# @perm: granted permissions for the parent operating on the child
>> +#
>> +# @shared-perm: permissions that can still be granted to other users
>> of the
>> +#   child while it is still attached to this parent
>> +#
>> +# Since: 3.1
>> +##
>> +{ 'struct': 'BlockGraphEdge',
>> +  'data': { 'parent': 'uint64', 'child': 'uint64',
>> +    'name': 'str', 'perm': [ 'BlockPermission' ],
>> +    'shared-perm': [ 'BlockPermission' ] } }
>> +
> 
>> +##
>> +# @x-query-block-graph:
>> +#
>> +# Get the block graph.
>> +#
>> +# Since: 3.1
>> +##
>> +{ 'command': 'x-query-block-graph', 'returns': 'BlockGraph' }
> 
> Why is this command given an x- prefix?  What would it take to promote
> it from experimental to fully supported?

This is just a very bad reason, but I'll give it anyway: We really want
such a command but still don't have one.  So I doubt this is exactly
what we want. :-)

A better reason is that we probably do not want a single command to
return the whole block graph.  Do we want the command to just return
info for a single node, including just the node names of the children?
Do we want the command to include child info on request (but start from
a user-specified root)?

Also, the interface is...  Er, weird?  Honestly, I don't quite see why
we would want it like this without x-.

Why use newly generated IDs instead of node names?  Why are those RAM
addresses?  That is just so fishy.

Because parents can be non-nodes?  Well, I suppose you better not
include them in the graph like this, then.

I don't see why the query command we want would include non-BDSs at all.

It may be useful for debugging, so, er, well, with an x-debug- prefix,
OK.  But the question then is whether it's useful enough to warrant
having a separate query command that isn't precisely the command we want
anyway.

The first question we probably have to ask is whether the query command
needs to output parent information.  If not, querying would probably
start at some named node and then you'd go down from there (either with
many queries or through a single one).

If so, well, then we can still output parent information, but I'd say
that then is purely informational and we don't need to "generate" new
IDs for them.  Just have either a node-name there or a user-readable
description (like it was in v1; although it has to be noted that such a
user-readable description is useless to a management layer), but these
new IDs are really not something I like.

> Overall, the command looks quite useful; and the fact that you produced
> some nice .dot graphs from it for visualization seems like it is worth
> considering this as a permanent API addition.  The question, then, is if
> the interface is right, or if it needs any tweaks (like using an enum
> instead of open-coded string for the relation between parent and child),
> as a reason for keeping the x- prefix.

You can use x-debug- even when you decide to keep a command.

I see no reason why a command should hastily not get an x- prefix just
because it may be useful enough.  If it really is and we really see the
interface is good (which I really don't think it is), then we can always
drop it later.

Max

>> +++ b/block.c
>> @@ -4003,6 +4003,86 @@ BlockDeviceInfoList
>> *bdrv_named_nodes_list(Error **errp)
>>   return list;
>>   }
>>   +#define QAPI_LIST_ADD(list, element) do { \
>> +    typeof(list) _tmp = g_new(typeof(*(list)), 1); \
>> +    _tmp->value = (element); \
>> +    _tmp->next = (list); \
>> +    list = _tmp; \
>> +} while (0)
> 
> Hmm - this seems like a frequently observed pattern - should it be
> something that we add independently and use throughout the tree as part
> of QAPI interactions in general?
> 
>> +
>> +BlockGraph *bdrv_get_block_graph(Error **errp)
>> +{
>> +    BlockGraph *graph = g_new0(BlockGraph, 1);
>> +    BlockDriverState *bs;
>> +    BdrvChild *child;
>> +    BlockGraphNode *node;
>> +    BlockGraphEdge *edge;
>> +    struct {
>> +    unsigned int flag;
>> +    BlockPermission num;
>> +    } permissions[] = {
>> +    { BLK_PERM_CONSISTENT_READ, BLOCK_PERMISSION_CONSISTENT_READ },
> 
> Would it be worth directly reusing a QAPI enum for all such permissions
> in the existing code base, instead of having to map between an internal
> and a QAPI enum?
> 
>> +    { 

Re: [Qemu-block] [Qemu-devel] [RFC v2] new, node-graph-based fleecing and backup

2018-08-17 Thread no-reply
Hi,

This series failed docker-quick@centos7 build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

Type: series
Message-id: 20180814170126.56461-1-vsement...@virtuozzo.com
Subject: [Qemu-devel] [RFC v2] new, node-graph-based fleecing and backup

=== TEST SCRIPT BEGIN ===
#!/bin/bash
time make docker-test-quick@centos7 SHOW_ENV=1 J=8
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
d307c74eae new, node-graph-based fleecing and backup

=== OUTPUT BEGIN ===
  BUILD   centos7
make[1]: Entering directory '/var/tmp/patchew-tester-tmp-yl0gza68/src'
  GEN 
/var/tmp/patchew-tester-tmp-yl0gza68/src/docker-src.2018-08-17-16.59.54.2505/qemu.tar
Cloning into 
'/var/tmp/patchew-tester-tmp-yl0gza68/src/docker-src.2018-08-17-16.59.54.2505/qemu.tar.vroot'...
done.
Checking out files:  49% (3155/6322)   
Checking out files:  50% (3161/6322)   
Checking out files:  51% (3225/6322)   
Checking out files:  52% (3288/6322)   
Checking out files:  53% (3351/6322)   
Checking out files:  54% (3414/6322)   
Checking out files:  55% (3478/6322)   
Checking out files:  56% (3541/6322)   
Checking out files:  57% (3604/6322)   
Checking out files:  58% (3667/6322)   
Checking out files:  59% (3730/6322)   
Checking out files:  60% (3794/6322)   
Checking out files:  61% (3857/6322)   
Checking out files:  62% (3920/6322)   
Checking out files:  63% (3983/6322)   
Checking out files:  64% (4047/6322)   
Checking out files:  65% (4110/6322)   
Checking out files:  66% (4173/6322)   
Checking out files:  67% (4236/6322)   
Checking out files:  68% (4299/6322)   
Checking out files:  69% (4363/6322)   
Checking out files:  70% (4426/6322)   
Checking out files:  71% (4489/6322)   
Checking out files:  72% (4552/6322)   
Checking out files:  73% (4616/6322)   
Checking out files:  74% (4679/6322)   
Checking out files:  75% (4742/6322)   
Checking out files:  76% (4805/6322)   
Checking out files:  77% (4868/6322)   
Checking out files:  78% (4932/6322)   
Checking out files:  79% (4995/6322)   
Checking out files:  80% (5058/6322)   
Checking out files:  81% (5121/6322)   
Checking out files:  82% (5185/6322)   
Checking out files:  83% (5248/6322)   
Checking out files:  84% (5311/6322)   
Checking out files:  85% (5374/6322)   
Checking out files:  86% (5437/6322)   
Checking out files:  87% (5501/6322)   
Checking out files:  88% (5564/6322)   
Checking out files:  89% (5627/6322)   
Checking out files:  90% (5690/6322)   
Checking out files:  91% (5754/6322)   
Checking out files:  92% (5817/6322)   
Checking out files:  93% (5880/6322)   
Checking out files:  94% (5943/6322)   
Checking out files:  95% (6006/6322)   
Checking out files:  96% (6070/6322)   
Checking out files:  97% (6133/6322)   
Checking out files:  98% (6196/6322)   
Checking out files:  99% (6259/6322)   
Checking out files: 100% (6322/6322)   
Checking out files: 100% (6322/6322), done.
Your branch is up-to-date with 'origin/test'.
Submodule 'dtc' (git://git.qemu-project.org/dtc.git) registered for path 'dtc'
Cloning into 
'/var/tmp/patchew-tester-tmp-yl0gza68/src/docker-src.2018-08-17-16.59.54.2505/qemu.tar.vroot/dtc'...
Submodule path 'dtc': checked out 'e54388015af1fb4bf04d0bca99caba1074d9cc42'
Submodule 'ui/keycodemapdb' (git://git.qemu.org/keycodemapdb.git) registered 
for path 'ui/keycodemapdb'
Cloning into 
'/var/tmp/patchew-tester-tmp-yl0gza68/src/docker-src.2018-08-17-16.59.54.2505/qemu.tar.vroot/ui/keycodemapdb'...
Submodule path 'ui/keycodemapdb': checked out 
'6b3d716e2b6472eb7189d3220552280ef3d832ce'
  COPYRUNNER
RUN test-quick in qemu:centos7 
Packages installed:
SDL-devel-1.2.15-14.el7.x86_64
bison-3.0.4-1.el7.x86_64
bzip2-devel-1.0.6-13.el7.x86_64
ccache-3.3.4-1.el7.x86_64
csnappy-devel-0-6.20150729gitd7bc683.el7.x86_64
flex-2.5.37-3.el7.x86_64
gcc-4.8.5-16.el7_4.2.x86_64
gettext-0.19.8.1-2.el7.x86_64
git-1.8.3.1-12.el7_4.x86_64
glib2-devel-2.50.3-3.el7.x86_64
libepoxy-devel-1.3.1-1.el7.x86_64
libfdt-devel-1.4.6-1.el7.x86_64
lzo-devel-2.06-8.el7.x86_64
make-3.82-23.el7.x86_64
mesa-libEGL-devel-17.0.1-6.20170307.el7.x86_64
mesa-libgbm-devel-17.0.1-6.20170307.el7.x86_64
package g++ is not installed
package librdmacm-devel is not installed
pixman-devel-0.34.0-1.el7.x86_64
spice-glib-devel-0.33-6.el7_4.1.x86_64
spice-server-devel-0.12.8-2.el7.1.x86_64
tar-1.26-32.el7.x86_64
vte-devel-0.28.2-10.el7.x86_64
xen-devel-4.6.6-10.el7.x86_64
zlib-devel-1.2.7-17.el7.x86_64

Environment variables:
PACKAGES=bison bzip2-devel ccache csnappy-devel flex g++
 gcc gettext git glib2-devel libepoxy-devel libfdt-devel
 librdmacm-devel lzo-devel make mesa-libEGL-devel 
mesa-libgbm-devel pixman-devel SDL-devel spice-glib-devel 
spice-server-devel tar vte-devel xen-devel zlib-devel
HOSTNAME=e9a9f6bac41c
MAKEFLAGS= -j8
J=8

Re: [Qemu-block] [PATCH] qemu-img.c: increase spacing between commands in documentation

2018-08-17 Thread Eric Blake

On 08/17/2018 02:28 PM, Programmingkid wrote:


  -o Used with a comma separated list of format specific options in 
a
 name=value format. Use "-o ?" for an overview of the options


Please spell that "-o help", not "-o ?".   Otherwise, the user has to quote the 
? to avoid it globbing into any single-byte file lying around in the current directory.


"-o ?" and "-o help" do not appear to work for this command. Maybe it should 
be removed.
This is what I tried:
qemu-img amend -o help
qemu-img amend -o ?


The set of options depends on the file format being amended.  So, you 
have to try:


qemu-img amend -o help -f qcow2

or supply an image name, as in:

qemu-img amend -o help myimage.qcow2

(of course, the latter relies on image probing, which I just said is 
potentially unsafe if you didn't use -f).  But the point is the option 
-o does work, just not in isolation.




  -t Specifies the cache mode that should be used with the
 destination file.


And what are those modes?  If you're going to be wordy, then give the user 
enough information to be useful.  Otherwise, being terse in --help is fine (and 
let the man page be wordy instead).


I don't know what the modes are. Anyone care to fill us in?


The source code is your friend. qemu-img.c has:

   case 'T':
cache = optarg;
...
ret = bdrv_parse_cache_mode(cache, &flags, &writethrough);

then you search for bdrv_parse_cache_mode(), in block.c:

if (!strcmp(mode, "off") || !strcmp(mode, "none")) {
*writethrough = false;
*flags |= BDRV_O_NOCACHE;
} else if (!strcmp(mode, "directsync")) {
*writethrough = true;
*flags |= BDRV_O_NOCACHE;
} else if (!strcmp(mode, "writeback")) {
*writethrough = false;
} else if (!strcmp(mode, "unsafe")) {
*writethrough = false;
*flags |= BDRV_O_NO_FLUSH;
} else if (!strcmp(mode, "writethrough")) {
*writethrough = true;

So six different aliases, for five different modes.  We can either 
improve --help output to document these directly, or add a '-t help' 
option (the way we have '-o help') to dynamically print the list.






Example:
 qemu-img amend -o compat=0.10 -f qcow2 image.qcow2


Where's an example with --image-opts and --object secret?


I prefer examples that I think a user would actually use. The --image-opts and 
-object options are not necessary to use this command.


Umm, they ARE necessary if you want to amend a LUKS-encrypted image, and 
that IS something I would actually use.  What's more, it's the complex 
examples (like a LUKS-encrypted image) where seeing something spelled 
out will save a LOT of hair-pulling from someone reading the docs (but, 
alongside it should ALSO be a short-and-simple example).





We're trying to move away from compat=0.10 (also spelled compat=v2), and 
instead start encouraging compat=v3.


So you want this: qemu-img amend -o compat=v3 -f qcow2 image.qcow2


Yes, that's one reasonable example, but should not be the only example.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [Qemu-devel] [RFC v2] new, node-graph-based fleecing and backup

2018-08-17 Thread no-reply
Hi,

This series failed docker-mingw@fedora build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

Type: series
Message-id: 20180814170126.56461-1-vsement...@virtuozzo.com
Subject: [Qemu-devel] [RFC v2] new, node-graph-based fleecing and backup

=== TEST SCRIPT BEGIN ===
#!/bin/bash
time make docker-test-mingw@fedora SHOW_ENV=1 J=8
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
Switched to a new branch 'test'
d307c74eae new, node-graph-based fleecing and backup

=== OUTPUT BEGIN ===
  BUILD   fedora
make[1]: Entering directory '/var/tmp/patchew-tester-tmp-askh7zyi/src'
  GEN 
/var/tmp/patchew-tester-tmp-askh7zyi/src/docker-src.2018-08-17-16.54.21.21368/qemu.tar
Cloning into 
'/var/tmp/patchew-tester-tmp-askh7zyi/src/docker-src.2018-08-17-16.54.21.21368/qemu.tar.vroot'...
done.
Your branch is up-to-date with 'origin/test'.
Submodule 'dtc' (git://git.qemu-project.org/dtc.git) registered for path 'dtc'
Cloning into 
'/var/tmp/patchew-tester-tmp-askh7zyi/src/docker-src.2018-08-17-16.54.21.21368/qemu.tar.vroot/dtc'...
Submodule path 'dtc': checked out 'e54388015af1fb4bf04d0bca99caba1074d9cc42'
Submodule 'ui/keycodemapdb' (git://git.qemu.org/keycodemapdb.git) registered 
for path 'ui/keycodemapdb'
Cloning into 
'/var/tmp/patchew-tester-tmp-askh7zyi/src/docker-src.2018-08-17-16.54.21.21368/qemu.tar.vroot/ui/keycodemapdb'...
Submodule path 'ui/keycodemapdb': checked out 
'6b3d716e2b6472eb7189d3220552280ef3d832ce'
  COPYRUNNER
RUN test-mingw in qemu:fedora 
Packages installed:
SDL2-devel-2.0.8-5.fc28.x86_64
bc-1.07.1-5.fc28.x86_64
bison-3.0.4-9.fc28.x86_64
bluez-libs-devel-5.49-3.fc28.x86_64
brlapi-devel-0.6.7-12.fc28.x86_64
bzip2-1.0.6-26.fc28.x86_64
bzip2-devel-1.0.6-26.fc28.x86_64
ccache-3.4.2-2.fc28.x86_64
clang-6.0.0-5.fc28.x86_64
device-mapper-multipath-devel-0.7.4-2.git07e7bd5.fc28.x86_64
findutils-4.6.0-19.fc28.x86_64
flex-2.6.1-7.fc28.x86_64
gcc-8.1.1-1.fc28.x86_64
gcc-c++-8.1.1-1.fc28.x86_64
gettext-0.19.8.1-14.fc28.x86_64
git-2.17.1-2.fc28.x86_64
glib2-devel-2.56.1-3.fc28.x86_64
glusterfs-api-devel-4.0.2-1.fc28.x86_64
gnutls-devel-3.6.2-1.fc28.x86_64
gtk3-devel-3.22.30-1.fc28.x86_64
hostname-3.20-3.fc28.x86_64
libaio-devel-0.3.110-11.fc28.x86_64
libasan-8.1.1-1.fc28.x86_64
libattr-devel-2.4.47-23.fc28.x86_64
libcap-devel-2.25-9.fc28.x86_64
libcap-ng-devel-0.7.9-1.fc28.x86_64
libcurl-devel-7.59.0-3.fc28.x86_64
libfdt-devel-1.4.6-4.fc28.x86_64
libpng-devel-1.6.34-3.fc28.x86_64
librbd-devel-12.2.5-1.fc28.x86_64
libssh2-devel-1.8.0-7.fc28.x86_64
libubsan-8.1.1-1.fc28.x86_64
libusbx-devel-1.0.21-6.fc28.x86_64
libxml2-devel-2.9.7-4.fc28.x86_64
llvm-6.0.0-11.fc28.x86_64
lzo-devel-2.08-12.fc28.x86_64
make-4.2.1-6.fc28.x86_64
mingw32-SDL2-2.0.5-3.fc27.noarch
mingw32-bzip2-1.0.6-9.fc27.noarch
mingw32-curl-7.57.0-1.fc28.noarch
mingw32-glib2-2.54.1-1.fc28.noarch
mingw32-gmp-6.1.2-2.fc27.noarch
mingw32-gnutls-3.5.13-2.fc27.noarch
mingw32-gtk3-3.22.16-1.fc27.noarch
mingw32-libjpeg-turbo-1.5.1-3.fc27.noarch
mingw32-libpng-1.6.29-2.fc27.noarch
mingw32-libssh2-1.8.0-3.fc27.noarch
mingw32-libtasn1-4.13-1.fc28.noarch
mingw32-nettle-3.3-3.fc27.noarch
mingw32-pixman-0.34.0-3.fc27.noarch
mingw32-pkg-config-0.28-9.fc27.x86_64
mingw64-SDL2-2.0.5-3.fc27.noarch
mingw64-bzip2-1.0.6-9.fc27.noarch
mingw64-curl-7.57.0-1.fc28.noarch
mingw64-glib2-2.54.1-1.fc28.noarch
mingw64-gmp-6.1.2-2.fc27.noarch
mingw64-gnutls-3.5.13-2.fc27.noarch
mingw64-gtk3-3.22.16-1.fc27.noarch
mingw64-libjpeg-turbo-1.5.1-3.fc27.noarch
mingw64-libpng-1.6.29-2.fc27.noarch
mingw64-libssh2-1.8.0-3.fc27.noarch
mingw64-libtasn1-4.13-1.fc28.noarch
mingw64-nettle-3.3-3.fc27.noarch
mingw64-pixman-0.34.0-3.fc27.noarch
mingw64-pkg-config-0.28-9.fc27.x86_64
ncurses-devel-6.1-5.20180224.fc28.x86_64
nettle-devel-3.4-2.fc28.x86_64
nss-devel-3.36.1-1.1.fc28.x86_64
numactl-devel-2.0.11-8.fc28.x86_64
package PyYAML is not installed
package libjpeg-devel is not installed
perl-5.26.2-411.fc28.x86_64
pixman-devel-0.34.0-8.fc28.x86_64
python3-3.6.5-1.fc28.x86_64
snappy-devel-1.1.7-5.fc28.x86_64
sparse-0.5.2-1.fc28.x86_64
spice-server-devel-0.14.0-4.fc28.x86_64
systemtap-sdt-devel-3.2-11.fc28.x86_64
tar-1.30-3.fc28.x86_64
usbredir-devel-0.7.1-7.fc28.x86_64
virglrenderer-devel-0.6.0-4.20170210git76b3da97b.fc28.x86_64
vte3-devel-0.36.5-6.fc28.x86_64
which-2.21-8.fc28.x86_64
xen-devel-4.10.1-3.fc28.x86_64
zlib-devel-1.2.11-8.fc28.x86_64

Environment variables:
TARGET_LIST=
PACKAGES=ccache gettext git tar PyYAML sparse flex bison python3 bzip2 hostname 
gcc gcc-c++ llvm clang make perl which bc findutils glib2-devel 
libaio-devel pixman-devel zlib-devel libfdt-devel libasan libubsan 
bluez-libs-devel brlapi-devel bzip2-devel device-mapper-multipath-devel 
glusterfs-api-devel gnutls-devel gtk3-devel libattr-devel libcap-devel 
libcap-ng-devel libcurl-devel libjpeg-devel libpng-devel librbd-devel 
libssh2-devel libusbx-devel 

Re: [Qemu-block] [PATCH v2 1/3] qapi: add x-query-block-graph

2018-08-17 Thread Eric Blake

On 08/17/2018 01:04 PM, Vladimir Sementsov-Ogievskiy wrote:

Add a new command, returning block nodes graph.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  qapi/block-core.json  | 116 ++



+##
+# @BlockGraphEdge:
+#
+# Block Graph edge description for x-query-block-graph.
+#
+# @parent: parent id
+#
+# @child: child id
+#
+# @name: name of the relation (examples are 'file' and 'backing')


Can this be made into a QAPI enum? (It would have ripple effects to 
existing code, but might be a worthwhile cleanup).



+#
+# @perm: granted permissions for the parent operating on the child
+#
+# @shared-perm: permissions that can still be granted to other users of the
+#   child while it is still attached to this parent
+#
+# Since: 3.1
+##
+{ 'struct': 'BlockGraphEdge',
+  'data': { 'parent': 'uint64', 'child': 'uint64',
+'name': 'str', 'perm': [ 'BlockPermission' ],
+'shared-perm': [ 'BlockPermission' ] } }
+



+##
+# @x-query-block-graph:
+#
+# Get the block graph.
+#
+# Since: 3.1
+##
+{ 'command': 'x-query-block-graph', 'returns': 'BlockGraph' }


Why is this command given an x- prefix?  What would it take to promote 
it from experimental to fully supported?


Overall, the command looks quite useful; and the fact that you produced 
some nice .dot graphs from it for visualization seems like it is worth 
considering this as a permanent API addition.  The question, then, is if 
the interface is right, or if it needs any tweaks (like using an enum 
instead of open-coded string for the relation between parent and child), 
as a reason for keeping the x- prefix.



+++ b/block.c
@@ -4003,6 +4003,86 @@ BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp)
  return list;
  }
  
+#define QAPI_LIST_ADD(list, element) do { \

+typeof(list) _tmp = g_new(typeof(*(list)), 1); \
+_tmp->value = (element); \
+_tmp->next = (list); \
+list = _tmp; \
+} while (0)


Hmm - this seems like a frequently observed pattern - should it be 
something that we add independently and use throughout the tree as part 
of QAPI interactions in general?



+
+BlockGraph *bdrv_get_block_graph(Error **errp)
+{
+BlockGraph *graph = g_new0(BlockGraph, 1);
+BlockDriverState *bs;
+BdrvChild *child;
+BlockGraphNode *node;
+BlockGraphEdge *edge;
+struct {
+unsigned int flag;
+BlockPermission num;
+} permissions[] = {
+{ BLK_PERM_CONSISTENT_READ, BLOCK_PERMISSION_CONSISTENT_READ },


Would it be worth directly reusing a QAPI enum for all such permissions 
in the existing code base, instead of having to map between an internal 
and a QAPI enum?



+{ BLK_PERM_WRITE,   BLOCK_PERMISSION_WRITE },
+{ BLK_PERM_WRITE_UNCHANGED, BLOCK_PERMISSION_WRITE_UNCHANGED },
+{ BLK_PERM_RESIZE,  BLOCK_PERMISSION_RESIZE },
+{ BLK_PERM_GRAPH_MOD,   BLOCK_PERMISSION_GRAPH_MOD },
+{ 0, 0 }
+}, *p;
+
+QTAILQ_FOREACH(bs, &graph_bdrv_states, node_list) {
+QLIST_FOREACH(child, &bs->parents, next_parent) {
+if (!child->role->parent_is_bds) {
+BlockGraphNodeList *el;
+bool add = true;


Does your command need/want to expose internal nodes? (I argue that it 
probably should show internal nodes, even when normal 'query-block' 
doesn't, just because knowing about the impact of internal nodes can be 
important when tracking down permissions issues).


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [Qemu-devel] [RFC PATCH] rbd: Don't convert keypairs to JSON and back

2018-08-17 Thread Max Reitz
On 2018-08-16 08:40, Markus Armbruster wrote:
> Max Reitz  writes:
> 
>> On 2018-08-15 10:12, Markus Armbruster wrote:
>>> Max Reitz  writes:
>>
>> [...]
>>
 To me personally the issue is that if you can specify a plain filename,
 bdrv_refresh_filename() should give you that plain filename back.  So
 rbd's implementation of that is lacking.  Well, it just doesn't exist.
>>>
>>> I'm not even sure I understand what you're talking about.
>>
>> We have this bdrv_refresh_filename() thing which can do this:
>>
>> $ qemu-img info \
>> "json:{'driver':'raw',
>>'file':{'driver':'nbd','host':'localhost'}}"
>> image: nbd://localhost:10809
>> [...]
>>
>> So it can reconstruct a plain filename even if you specify it as options
>> instead of just using a plain filename.
>>
>>
>> Now here's my fault: I thought it might be necessary for a driver to
>> implement that function (which rbd doesn't) so that you'd get a nice
>> filename back (instead of just json:{} garbled things).   But you don't.
>>  For protocol drivers, you'll just get the initial filename back.  (So
>> my comment was just wrong.)
>>
>> So what I was thinking about was some case where you specified a normal
>> plain filename and qemu would give you back json:{}.  (If rbd
>> implemented bdrv_refresh_filename(), that wouldn't happen, because it
>> would reconstruct a nice normal filename.)  It turns out, I don't think
>> that can happen so easily.  You'll just get your filename back.
>>
>> Because here's what I'm thinking: If someone uses an option that is
>> undocumented and starts with =, well, too bad.  If someone uses a normal
>> filename, but gets back a json:{} filename...  Then they are free to use
>> that anywhere, and their use of "=" is legitimized.
>>
>>
>> Now that issue kind of reappears when you open an RBD volume, and then
>> e.g. take a blockdev-snapshot.  Then your overlay has an overridden
>> backing file (one that may be different from what its image header says)
>> and its filename may well become a json:{} one (to arrange for the
>> overridden backing file options).  Of course, if you opened the RBD
>> volume with a filename with some of the options warranting
>> =keyvalue-pairs, then your json:{} filename will contain those options
>> under =keyvalue-pairs.
>>
>> So...  I'm not quite sure what I want to say?  I think there are edge
>> cases where the user may not have put any weird option into qemu, but
>> they do get a json:{} filename with =keyvalue-pairs out of it.  And I
>> think users are free to use json:{} filenames qemu spews at them, and we
>> can't blame them for it.
> 
> Makes sense.
> 
> More reason to deprecate key-value pairs in pseudo-filenames.
> 
> The only alternative would be to also provide them in QAPI
> BlockdevOptionsRbd.  I find that as distasteful as ever, but if the
> block maintainers decide we need it, I'll hold my nose.
> 
> If so, and we are comfortable changing the output the way this patch does
> (technically altering ABI anyway), we might as well go all the way and
> filter it out completely.  That would be preferable to cleaning up the 
> json
> output of the internal key/value pairs, IMO.

 Well, this filtering at least is done by my "Fix some filename
 generation issues" series.
>>>
>>> Likewise.
>>
>> The series overhauls quite a bit of the bdrv_refresh_filename()
>> infrastructure.  That function is also responsible for generating
>> json:{} filenames.
>>
>> One thing it introduces is a BlockDriver field where a driver can
>> specify which of the runtime options are actually important.  The rest
>> is omitted from the generated json:{} filename.
>>
>> I may have taken the liberty not to include =keyvalue-pairs in RBD's
>> "strong runtime options" list.
> 
> I see.
> 
> Permit me to digress a bit.
> 
> I understand one application for generating a json: filename is for
> putting it into an image file header (say as a COW's backing image).

Yes.

(And it's not completely pointless, as there are options you may want to
specify, but cannot do so in a plain filename.  Like host-key-check for
https.)

(And technically you need a string filename to point to when doing
block-commit (Kevin sent patches to remedy this, though), so that could
be called an application as well.)

> Having image file headers point to other images is not as simple as it
> may look at first glance.  The basic case of image format plus plain
> filename (not containing '/') is straightforward enough.  But if I make
> the filename absolute (with a leading '/'), the image becomes less easy
> to move to another machine.

That assumes that we correctly implement relative backing file names.
Believe me, we don't.

For example, say you did this:

$ qemu-img create -f qcow2 foo/bot.qcow2 1M
$ qemu-img create -f qcow2 -b bot.qcow2 foo/mid.qcow2
$ qemu-img create -f qcow2 -b mid.qcow2 foo/top.qcow2

Now try committing top to mid.

You're right, it's impossible right now.

(Not via 

Re: [Qemu-block] [Qemu-devel] [PATCH for-3.1 v10 00/31] block: Fix some filename generation issues

2018-08-17 Thread Max Reitz
On 2018-08-16 08:02, Markus Armbruster wrote:
> Max Reitz  writes:
> 
>> On 2018-08-15 05:43, no-re...@patchew.org wrote:
>>> Hi,
>>>
>>> This series seems to have some coding style problems. See output below for
>>> more information:
>>
>> [...]
>>
>>> === OUTPUT BEGIN ===
>>> Checking PATCH 1/31: block: Use bdrv_refresh_filename() to pull...
>>> Checking PATCH 2/31: block: Use children list in bdrv_refresh_filename...
>>> Checking PATCH 3/31: block: Skip implicit nodes for filename info...
>>> Checking PATCH 4/31: block: Add BDS.auto_backing_file...
>>> Checking PATCH 5/31: block: Respect backing bs in bdrv_refresh_filename...
>>> ERROR: return is not a function, parentheses are not required
>>> #46: FILE: block.c:5192:
>>> +return (bs->auto_backing_file[0] != '\0');
>>>
>>> total: 1 errors, 0 warnings, 136 lines checked
>>
>> Sure, but I'd hate you personally if you omitted them.
> 
> @@
> expression E;
> @@
> -return (E);
> +return E;
> 
> You're welcome!

Well, I really don't like not putting parenthesis when returning
something that is not an identifier or a function call.

In fact, before I drop the parentheses, I'd drop the "!= '\0'", because
that is optional, too.  (And then I could drop the parentheses anyway.)

Or write it with an explicit if-else (return true - return false).

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH 0/7] qcow2: async handling of fragmented io

2018-08-17 Thread Denis V. Lunev
On 08/17/2018 10:34 PM, Max Reitz wrote:
> On 2018-08-16 15:58, Vladimir Sementsov-Ogievskiy wrote:
>> 16.08.2018 03:51, Max Reitz wrote:
>>> On 2018-08-07 19:43, Vladimir Sementsov-Ogievskiy wrote:
 Hi all!

 Here is an asynchronous scheme for handling fragmented qcow2
 reads and writes. Both qcow2 read and write functions loop through
 sequential portions of data. The series aims to parallelize these
 loop iterations.

 It improves performance for fragmented qcow2 images, I've tested it
 as follows:

 I have four 4G qcow2 images (with default 64k block size) on my ssd disk:
 t-seq.qcow2 - sequentially written qcow2 image
 t-reverse.qcow2 - filled by writing 64k portions from end to the start
 t-rand.qcow2 - filled by writing 64k portions (aligned) in random order
 t-part-rand.qcow2 - filled by shuffling order of 64k writes in 1m clusters
 (see source code of image generation in the end for details)

 and the test (sequential io by 1mb chunks):

 test write:
 for t in /ssd/t-*; \
 do sync; echo 1 > /proc/sys/vm/drop_caches; echo ===  $t  ===; \
 ./qemu-img bench -c 4096 -d 1 -f qcow2 -n -s 1m -t none -w $t; \
 done

 test read (same, just drop -w parameter):
 for t in /ssd/t-*; \
 do sync; echo 1 > /proc/sys/vm/drop_caches; echo ===  $t  ===; \
 ./qemu-img bench -c 4096 -d 1 -f qcow2 -n -s 1m -t none $t; \
 done

 short info about parameters:
   -w - do writes (otherwise do reads)
   -c - count of blocks
   -s - block size
   -t none - disable cache
   -n - native aio
   -d 1 - don't use parallel requests provided by qemu-img bench itself
>>> Hm, actually, why not?  And how does a guest behave?
>>>
>>> If parallel requests on an SSD perform better, wouldn't a guest issue
>>> parallel requests to the virtual device and thus to qcow2 anyway?
>> Guest knows nothing about qcow2 fragmentation, so this kind of
>> "asynchronization" could be done only at qcow2 level.
> Hm, yes.  I'm sorry, but without having looked closer at the series
> (which is why I'm sorry in advance), I would suspect that the
> performance improvement comes from us being able to send parallel
> requests to an SSD.
>
> So if you send large requests to an SSD, you may either send them in
> parallel or sequentially, it doesn't matter.  But for small requests,
> it's better to send them in parallel so the SSD always has requests in
> its queue.
>
> I would think this is where the performance improvement comes from.  But
> I would also think that a guest OS knows this and it would also send
> many requests in parallel so the virtual block device never runs out of
> requests.
>
>> However, if guest do async io, send a lot of parallel requests, it
>> behave like qemu-img without -d 1 option, and in this case,
>> parallel loop iterations in qcow2 doesn't have such great sense.
>> However, I think that async parallel requests are better in
>> general than sequential, because if device have some unused opportunity
>> of parallelization, it will be utilized.
> I agree that it probably doesn't make things worse performance-wise, but
> it's always added complexity (see the diffstat), which is why I'm just
> routinely asking how useful it is in practice. :-)
>
> Anyway, I suspect there are indeed cases where a guest doesn't send many
> requests in parallel but it makes sense for the qcow2 driver to
> parallelize it.  That would be mainly when the guest reads seemingly
> sequential data that is then fragmented in the qcow2 file.  So basically
> what your benchmark is testing. :-)
>
> Then, the guest could assume that there is no sense in parallelizing it
> because the latency from the device is large enough, whereas in qemu
> itself we always run dry and wait for different parts of the single
> large request to finish.  So, yes, in that case, parallelization that's
> internal to qcow2 would make sense.
>
> Now another question is, does this negatively impact devices where
> seeking is slow, i.e. HDDs?  Unfortunately I'm not home right now, so I
> don't have access to an HDD to test myself...
There are different situations and different load pattern, f.e. there
are situations when the guest executes sequential read
in a single thread. This looks obvious and dummy, but this is for sure
is possible in the real life. Also there is an observation, that Windows
guest prefers long requests. There is not unusual to observe 4Mb
requests in a pipeline.

Thus for such a load in a scattered file the performance difference should
be very big, even on SSD as without this SSD will starve without requests.

Here we are speaking in terms of latency, which definitely will be bigger
in sequential case.

Den

>> We've already
>> use this approach in mirror and qemu-img convert.
> Indeed, but here you could always argue that this is just what guests
> do, so we should, too.

Re: [Qemu-block] [PATCH 0/7] qcow2: async handling of fragmented io

2018-08-17 Thread Max Reitz
On 2018-08-16 15:58, Vladimir Sementsov-Ogievskiy wrote:
> 16.08.2018 03:51, Max Reitz wrote:
>> On 2018-08-07 19:43, Vladimir Sementsov-Ogievskiy wrote:
>>> Hi all!
>>>
>>> Here is an asynchronous scheme for handling fragmented qcow2
>>> reads and writes. Both qcow2 read and write functions loop through
>>> sequential portions of data. The series aims to parallelize these
>>> loop iterations.
>>>
>>> It improves performance for fragmented qcow2 images, I've tested it
>>> as follows:
>>>
>>> I have four 4G qcow2 images (with default 64k block size) on my ssd disk:
>>> t-seq.qcow2 - sequentially written qcow2 image
>>> t-reverse.qcow2 - filled by writing 64k portions from end to the start
>>> t-rand.qcow2 - filled by writing 64k portions (aligned) in random order
>>> t-part-rand.qcow2 - filled by shuffling order of 64k writes in 1m clusters
>>> (see source code of image generation in the end for details)
>>>
>>> and the test (sequential io by 1mb chunks):
>>>
>>> test write:
>>> for t in /ssd/t-*; \
>>> do sync; echo 1 > /proc/sys/vm/drop_caches; echo ===  $t  ===; \
>>> ./qemu-img bench -c 4096 -d 1 -f qcow2 -n -s 1m -t none -w $t; \
>>> done
>>>
>>> test read (same, just drop -w parameter):
>>> for t in /ssd/t-*; \
>>> do sync; echo 1 > /proc/sys/vm/drop_caches; echo ===  $t  ===; \
>>> ./qemu-img bench -c 4096 -d 1 -f qcow2 -n -s 1m -t none $t; \
>>> done
>>>
>>> short info about parameters:
>>>   -w - do writes (otherwise do reads)
>>>   -c - count of blocks
>>>   -s - block size
>>>   -t none - disable cache
>>>   -n - native aio
>>>   -d 1 - don't use parallel requests provided by qemu-img bench itself
>> Hm, actually, why not?  And how does a guest behave?
>>
>> If parallel requests on an SSD perform better, wouldn't a guest issue
>> parallel requests to the virtual device and thus to qcow2 anyway?
> 
> Guest knows nothing about qcow2 fragmentation, so this kind of
> "asynchronization" could be done only at qcow2 level.

Hm, yes.  I'm sorry, but without having looked closer at the series
(which is why I'm sorry in advance), I would suspect that the
performance improvement comes from us being able to send parallel
requests to an SSD.

So if you send large requests to an SSD, you may either send them in
parallel or sequentially, it doesn't matter.  But for small requests,
it's better to send them in parallel so the SSD always has requests in
its queue.

I would think this is where the performance improvement comes from.  But
I would also think that a guest OS knows this and it would also send
many requests in parallel so the virtual block device never runs out of
requests.

> However, if guest do async io, send a lot of parallel requests, it
> behave like qemu-img without -d 1 option, and in this case,
> parallel loop iterations in qcow2 doesn't have such great sense.
> However, I think that async parallel requests are better in
> general than sequential, because if device have some unused opportunity
> of parallelization, it will be utilized.

I agree that it probably doesn't make things worse performance-wise, but
it's always added complexity (see the diffstat), which is why I'm just
routinely asking how useful it is in practice. :-)

Anyway, I suspect there are indeed cases where a guest doesn't send many
requests in parallel but it makes sense for the qcow2 driver to
parallelize it.  That would be mainly when the guest reads seemingly
sequential data that is then fragmented in the qcow2 file.  So basically
what your benchmark is testing. :-)

Then, the guest could assume that there is no sense in parallelizing it
because the latency from the device is large enough, whereas in qemu
itself we always run dry and wait for different parts of the single
large request to finish.  So, yes, in that case, parallelization that's
internal to qcow2 would make sense.

Now another question is, does this negatively impact devices where
seeking is slow, i.e. HDDs?  Unfortunately I'm not home right now, so I
don't have access to an HDD to test myself...

> We've already
> use this approach in mirror and qemu-img convert.

Indeed, but here you could always argue that this is just what guests
do, so we should, too.

> In Virtuozzo we have
> backup, improved by parallelization of requests
> loop too. I think, it would be good to have some general code for such
> things in future.
Well, those are different things, I'd think.  Parallelization in
mirror/backup/convert is useful not just because of qcow2 issues, but
also because you have a volume to read from and a volume to write to, so
that's where parallelization gives you some pipelining.  And it gives
you buffers for latency spikes, I guess.

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH] qemu-img.c: increase spacing between commands in documentation

2018-08-17 Thread Programmingkid


> On Aug 17, 2018, at 9:44 AM, Eric Blake  wrote:
> 
> On 08/16/2018 08:27 PM, Programmingkid wrote:
> 
>> I am by no means an expert at qemu-img. But I did try my best to create what 
>> I think should be the new output for qemu-img  --help. This is just 
>> the text I plan on using in a future patch. It is easier to read right now 
>> than it will be in patch form, so please let me know if there are any errors 
>> in this documentation. Thank you.
>> 
> 
> Just reviewing the first for now, to give you a feel for what to consider.
> 
>> usage: qemu-img amend [--object objectdef] [--image-opts] [-p] [-q] [-f fmt]
>> [-t cache] -o options filename
>> Command parameters:
>>  -f 'fmt' is the disk image format. It is guessed automatically
>> in most cases.
> 
> Bad advice. We WANT users to use -f (if you don't, and the automatic guessing 
> sees something that is not raw, but your image SHOULD have been -f raw, then 
> you have a security hole: the guest can write anything into a raw image to 
> make the host access files it shouldn't based on interpreting the raw file as 
> something else).  I'd drop the last sentence, and use just the first.

Ok. 

> 
>> --image-optsTreat filename as a set of image options, instead of a plain
>> filename.
>>  -o Used with a comma separated list of format specific options 
>> in a
>> name=value format. Use "-o ?" for an overview of the options
> 
> Please spell that "-o help", not "-o ?".   Otherwise, the user has to quote 
> the ? to avoid it globbing into any single-byte file lying around in the 
> current directory.

"-o ?" and "-o help" does not appear to work for this command. Maybe it should 
be removed.
This is what I tried:
qemu-img amend -o help
qemu-img amend -o ?

This is what I see in both instances: qemu-img: Expecting one image file name

> 
>> supported by the used format
>> --object'objectdef' is a QEMU user creatable object definition. See 
>> the
>> qemu(1) manual page for a description of the object 
>> properties.
>> The most common object type is a 'secret', which is used to
>> supply passwords and/or encryption keys.
>>  -p Display progress bar. If the -p option is not used for a 
>> command
>> that supports it, the progress is reported when the process
>> receives a "SIGUSR1" signal.
>>  -q Quiet mode - do not print any output (except errors). 
>> There's no
>> progress bar in case both -q and -p options are used.
> 
> Not your fault, but I don't like it when interfaces take mutually exclusive 
> operations but the last one does not win.  Either '-p -q' should behave like 
> '-q', and '-q -p' behave like '-p' (because we accept the mutual exclusion 
> but last one wins), or both forms should error (because they are 
> incompatible).  But having both forms silently behave like '-q' is evil.  So, 
> as long as you're willing to patch up interfaces, that's a project to 
> consider.
> 
>>  -t Specifies the cache mode that should be used with the
>> destination file.
> 
> And what are those modes?  If you're going to be wordy, then give the user 
> enough information to be useful.  Otherwise, being terse in --help is fine 
> (and let the man page be wordy instead).

I don't know what the modes are. Anyone care to fill us in? 


>> Example:
>> qemu-img amend -o compat=0.10 -f qcow2 image.qcow2
> 
> Where's an example with --image-opts and --object secret?

I prefer examples that I think a user would actually use. The --image-opts and 
-object options are not necessary to use this command.

> We're trying to move away from compat=0.10 (also spelled compat=v2), and 
> instead start encouraging compat=v3.

So you want this: qemu-img amend -o compat=v3 -f qcow2 image.qcow2

Thank you for reviewing my documentation.


Re: [Qemu-block] [PATCH 2/2] qemu-img: Add dd seek= option

2018-08-17 Thread Max Reitz
On 2018-08-16 09:15, Kevin Wolf wrote:
> Am 16.08.2018 um 04:49 hat Max Reitz geschrieben:
>> On 2018-08-16 04:39, Eric Blake wrote:
>>> If convert were more powerful, I'd be fine dropping 'qemu-img dd' after
>>> a proper deprecation period.
>>
>> Technically it has those features already, with the raw block driver's
>> offset and size parameters.
> 
> In a way, yes. It requires you to precreate the target image, though,
> because --target-image-opts requires -n.
> 
> We'll probably want to fix something in this area anyway because in the
> common case, you already need those options to even precreate the image,
> and qemu-img create doesn't support open options for the protocol layer
> either (unlike blockdev-create).
> 
 ((That gave me a good idea.  Actually, it's probably not such a good
 idea, but I guess I'll do it in my spare time anyway.  A qemu-img fuse
 might be nice which represents an image as a raw image at some mount
 point.  Benefits over qemu-nbd: (1) You don't need root, (2) you don't
 need to type modprobe nbd.))
>>>
>>> So the kernel->userspace translation would be happening via the FUSE
>>> interface instead of the NBD interface.  Data still bounces around just
>>> as much, but it might be a fun project.  Does fuse behave well when
>>> serving exactly one file at the mountpoint, rather than the more typical
>>> file system rooted in a directory?  NBD at least has the benefit of
>>> claiming to be a block device all along, rather than complicating the
>>> user interface with POSIX file system rules (which you'll be bending,
>>> because you are serving exactly one file instead of a system).
> 
> To make it more real, you could expose a snapshots/ subdirectory. Or
> maybe better not.
> 
>> Well, but I can just pretend my FUSE file is a block device, no?
>>
>> Also, I just discovered something really interesting: FUSE allows you to
>> specify a single file as a mountpoint.
>>
>> And you know what?  You can open the original file before you replace it
>> by the FUSE "filesystem".
>>
>> So my fun interface is going to looks like this:
>>
>> $ qemu-img fuse foo.qcow2
>>
>> And then your foo.qcow2 is a raw image until the next "fusermount -u
>> foo.qcow2"!  Isn't that fun?
> 
> Yes, sounds fun. Until you realise that I'd actually like to merge
> something like this when it exists. ;-)

Well, for reference:

https://git.xanclic.moe/XanClic/qemu/commits/branch/qemu-img-fuse

(The main issue now is that you don't see FUSE error messages,
because...  Well.  FUSE likes to fork the daemon process itself, but for
some reason that doesn't really work with qemu.  I don't really want to
investigate that, instead I rather like to declare it broken and launch
FUSE in foreground mode, and then do the forking ourselves (like
qemu-nbd does).  But then we lose all error output from FUSE, because we
have to close stderr before we get to the main loop...

(I suppose I can just look into what fuse_main() does exactly and
replicate that.  The doc says using fuse_main() is just lazy anyway. O:-P))

> For a more serious implementation, maybe it should even be a separate
> qemu-fuse rather than a qemu-img subcommand.

True.  It doesn't really fit well into qemu-img.

(To me, qemu-img is about exposing block layer features without the user
having to use qemu proper.  Exporting an image over FUSE really is not a
block layer feature.)

Max



signature.asc
Description: OpenPGP digital signature


[Qemu-block] [PATCH 0/7] jobs: remove job_defer_to_main_loop

2018-08-17 Thread John Snow
First, it's redundant to have each job manage this itself.
Second, doing so allows us to remove a tricky case where the completion
code is called under an aio_context lock, which then calls the
finalization code which is itself executed under a second aio_context
lock.

Removing this recursive lock acquisition is necessary for converting
mirror to only modify its graph post-finalization, but it's also just
safer and will bite us less in the future.

This series introduces a .job_exit callback, but after jobs are
fully transitioned to using the .commit/.abort callbacks, this new
completion callback will be removed.

John Snow (7):
  jobs: change start callback to run callback
  jobs: canonize Error object
  jobs: add exit shim
  block/commit: utilize job_exit shim
  block/mirror: utilize job_exit shim
  jobs: utilize job_exit shim
  jobs: remove job_defer_to_main_loop

 block/backup.c| 23 +++-
 block/commit.c| 27 ++-
 block/create.c| 19 +
 block/mirror.c| 35 +++-
 block/stream.c| 29 
 include/qemu/job.h| 36 +
 job-qmp.c |  5 ++--
 job.c | 68 +++
 tests/test-bdrv-drain.c   | 13 +++--
 tests/test-blockjob-txn.c | 25 +++--
 tests/test-blockjob.c | 17 ++--
 11 files changed, 102 insertions(+), 195 deletions(-)

-- 
2.14.4




Re: [Qemu-block] [Qemu-devel] [PATCH 4/7] block/commit: utilize job_exit shim

2018-08-17 Thread John Snow



On 08/17/2018 03:04 PM, John Snow wrote:
> Change the manual deferment to commit_complete into the implicit
> callback to job_exit, renaming commit_complete to commit_exit.
> 
> This conversion does change the timing of when job_completed is
> called to after the bdrv_replace_node and bdrv_unref calls, which
> could have implications for bjob->blk which will now be put down
> after this cleanup.
> 
> Kevin highlights that we did not take any permissions for that backend
> at job creation time, so it is safe to reorder these operations.
> 
> Signed-off-by: John Snow 
> ---
>  block/commit.c | 20 
>  1 file changed, 4 insertions(+), 16 deletions(-)
> 
> diff --git a/block/commit.c b/block/commit.c
> index 4a17bb73ec..93c3b6a39e 100644
> --- a/block/commit.c
> +++ b/block/commit.c
> @@ -68,19 +68,13 @@ static int coroutine_fn commit_populate(BlockBackend *bs, 
> BlockBackend *base,
>  return 0;
>  }
>  
> -typedef struct {
> -int ret;
> -} CommitCompleteData;
> -
> -static void commit_complete(Job *job, void *opaque)
> +static void commit_exit(Job *job)
>  {
>  CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
>  BlockJob *bjob = &s->common;
> -CommitCompleteData *data = opaque;
>  BlockDriverState *top = blk_bs(s->top);
>  BlockDriverState *base = blk_bs(s->base);
>  BlockDriverState *commit_top_bs = s->commit_top_bs;
> -int ret = data->ret;
>  bool remove_commit_top_bs = false;
>  
>  /* Make sure commit_top_bs and top stay around until bdrv_replace_node() 
> */
> @@ -93,8 +87,8 @@ static void commit_complete(Job *job, void *opaque)
>  
>  if (!job_is_cancelled(job) && ret == 0) {

forgot to actually squash the change in here that replaces `ret` with
`job->ret`.

>  /* success */
> -ret = bdrv_drop_intermediate(s->commit_top_bs, base,
> - s->backing_file_str);
> +job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
> +  s->backing_file_str);
>  } else {
>  /* XXX Can (or should) we somehow keep 'consistent read' blocked even
>   * after the failed/cancelled commit job is gone? If we already wrote
> @@ -117,9 +111,6 @@ static void commit_complete(Job *job, void *opaque)
>   * bdrv_set_backing_hd() to fail. */
>  block_job_remove_all_bdrv(bjob);
>  
> -job_completed(job, ret);
> -g_free(data);
> -
>  /* If bdrv_drop_intermediate() didn't already do that, remove the commit
>   * filter driver from the backing chain. Do this as the final step so 
> that
>   * the 'consistent read' permission can be granted.  */
> @@ -137,7 +128,6 @@ static void commit_complete(Job *job, void *opaque)
>  static int coroutine_fn commit_run(Job *job, Error **errp)
>  {
>  CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
> -CommitCompleteData *data;
>  int64_t offset;
>  uint64_t delay_ns = 0;
>  int ret = 0;
> @@ -210,9 +200,6 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
>  out:
>  qemu_vfree(buf);
>  
> -data = g_malloc(sizeof(*data));
> -data->ret = ret;
> -job_defer_to_main_loop(&s->common.job, commit_complete, data);
>  return ret;
>  }
>  
> @@ -224,6 +211,7 @@ static const BlockJobDriver commit_job_driver = {
>  .user_resume   = block_job_user_resume,
>  .drain = block_job_drain,
>  .run   = commit_run,
> +.exit  = commit_exit,
>  },
>  };
>  
> 

-- 
—js



Re: [Qemu-block] [PATCH v2 2/3] scripts: add render_block_graph function for QEMUMachine

2018-08-17 Thread Vladimir Sementsov-Ogievskiy

17.08.2018 21:25, Eduardo Habkost wrote:

On Fri, Aug 17, 2018 at 09:04:39PM +0300, Vladimir Sementsov-Ogievskiy wrote:

Render block nodes graph with help of graphviz. This new function is
for debugging, so there is no sense to put it into qemu.py as a method
of QEMUMachine. Let's instead put it separately.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  scripts/render_block_graph.py | 78 +++
  1 file changed, 78 insertions(+)
  create mode 100644 scripts/render_block_graph.py

diff --git a/scripts/render_block_graph.py b/scripts/render_block_graph.py
new file mode 100644
index 00..7048a0bac8
--- /dev/null
+++ b/scripts/render_block_graph.py
@@ -0,0 +1,78 @@
+# Render Qemu Block Graph

[...]

What about making the script work from the command-line?

Signed-off-by: Eduardo Habkost 
---
diff --git a/scripts/render_block_graph.py b/scripts/render_block_graph.py
old mode 100644
new mode 100755
index 7048a0bac8..e29fe0fc41
--- a/scripts/render_block_graph.py
+++ b/scripts/render_block_graph.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
  # Render Qemu Block Graph
  #
  # Copyright (c) 2017 Parallels International GmbH
@@ -16,8 +17,9 @@
  # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  #
  
-import os

+import os, sys
  from graphviz import Digraph
+from qmp.qmp import QEMUMonitorProtocol
  
  def perm(arr):

  s = 'w' if 'write' in arr else '_'
@@ -27,16 +29,16 @@ def perm(arr):
  s += 's' if 'resize' in arr else '_'
  return s
  
-def render_block_graph(vm, filename, pointers=False, format='png'):

+def render_block_graph(qmp, filename, pointers=False, format='png'):
  '''
  Render graph in text (dot) representation into "@filename" and
  representation in @format into "@filename.@format"
  '''
  
-nodes = vm.command('query-named-block-nodes')

+nodes = qmp.command('query-named-block-nodes')
  nodes_info = {n['node-name']: n for n in nodes}
  
-block_graph = vm.command('x-query-block-graph')

+block_graph = qmp.command('x-query-block-graph')
  
  graph = Digraph(comment='Block Nodes Graph')

  graph.format = format
@@ -76,3 +78,9 @@ def render_block_graph(vm, filename, pointers=False, 
format='png'):
  graph.edge(str(e['parent']), str(e['child']), label=label)
  
  graph.render(filename)

+
+if __name__ == '__main__':
+#TODO: use argparse for command-line arguments
+qmp = QEMUMonitorProtocol(sys.argv[1])
+qmp.connect()
+render_block_graph(qmp, sys.argv[2])




Cool, thanks.

so, how to use it then?

for python iotest a proper parameter would be vm._qmp, yes?

is there a way to dump running libvirt vm graph?

I've started libvirt guest, qemu process has cmdline parameters
-chardev 
socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-3-tmp/monitor.sock,server,nowait

-mon chardev=charmonitor,id=monitor,mode=control

command
virsh qemu-monitor-command tmp '{"execute": "x-query-block-graph"}'
works fine,

but script hangs on connection:
# python ./scripts/render_block_graph.py 
/var/lib/libvirt/qemu/domain-3-tmp/monitor.sock

^CTraceback (most recent call last):
  File "./scripts/render_block_graph.py", line 85, in <module>
    qmp.connect()
  File "/work/src/qemu/up-new-fleecing/scripts/qmp/qmp.py", line 140, 
in connect

    self.__sock.connect(self.__address)
  File "/usr/lib64/python2.7/socket.py", line 224, in meth
    return getattr(self._sock,name)(*args)
KeyboardInterrupt

./scripts/qmp/qmp-shell /var/lib/libvirt/qemu/domain-3-tmp/monitor.sock
outputs nothing...


--
Best regards,
Vladimir




[Qemu-block] [PATCH 6/7] jobs: utilize job_exit shim

2018-08-17 Thread John Snow
Utilize the job_exit shim by not calling job_defer_to_main_loop, and
where applicable, converting the deferred callback into the job_exit
callback.

This converts backup, stream, create, and the unit tests all at once.
None of these jobs undergo and order of operations changes, so it should
be a mechanical change.

Signed-off-by: John Snow 
---
 block/backup.c| 16 
 block/create.c| 14 +++---
 block/stream.c| 22 +++---
 tests/test-bdrv-drain.c   |  6 --
 tests/test-blockjob-txn.c | 11 ++-
 tests/test-blockjob.c | 10 --
 6 files changed, 16 insertions(+), 63 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 1e965d54e5..a67b7fa96b 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -380,18 +380,6 @@ static BlockErrorAction backup_error_action(BackupBlockJob 
*job,
 }
 }
 
-typedef struct {
-int ret;
-} BackupCompleteData;
-
-static void backup_complete(Job *job, void *opaque)
-{
-BackupCompleteData *data = opaque;
-
-job_completed(job, data->ret);
-g_free(data);
-}
-
 static bool coroutine_fn yield_and_check(BackupBlockJob *job)
 {
 uint64_t delay_ns;
@@ -483,7 +471,6 @@ static void 
backup_incremental_init_copy_bitmap(BackupBlockJob *job)
 static int coroutine_fn backup_run(Job *opaque_job, Error **errp)
 {
 BackupBlockJob *job = container_of(opaque_job, BackupBlockJob, common.job);
-BackupCompleteData *data;
 BlockDriverState *bs = blk_bs(job->common.blk);
 int64_t offset, nb_clusters;
 int ret = 0;
@@ -584,9 +571,6 @@ static int coroutine_fn backup_run(Job *opaque_job, Error 
**errp)
 qemu_co_rwlock_unlock(&job->flush_rwlock);
 hbitmap_free(job->copy_bitmap);
 
-data = g_malloc(sizeof(*data));
-data->ret = ret;
-job_defer_to_main_loop(&job->common.job, backup_complete, data);
 return ret;
 }
 
diff --git a/block/create.c b/block/create.c
index 26a385c6c7..95341219ef 100644
--- a/block/create.c
+++ b/block/create.c
@@ -34,28 +34,20 @@ typedef struct BlockdevCreateJob {
 Job common;
 BlockDriver *drv;
 BlockdevCreateOptions *opts;
-int ret;
 } BlockdevCreateJob;
 
-static void blockdev_create_complete(Job *job, void *opaque)
-{
-BlockdevCreateJob *s = container_of(job, BlockdevCreateJob, common);
-
-job_completed(job, s->ret);
-}
-
 static int coroutine_fn blockdev_create_run(Job *job, Error **errp)
 {
 BlockdevCreateJob *s = container_of(job, BlockdevCreateJob, common);
+int ret;
 
 job_progress_set_remaining(&s->common, 1);
-s->ret = s->drv->bdrv_co_create(s->opts, errp);
+ret = s->drv->bdrv_co_create(s->opts, errp);
 job_progress_update(&s->common, 1);
 
 qapi_free_BlockdevCreateOptions(s->opts);
-job_defer_to_main_loop(&s->common, blockdev_create_complete, NULL);
 
-return s->ret;
+return ret;
 }
 
 static const JobDriver blockdev_create_job_driver = {
diff --git a/block/stream.c b/block/stream.c
index 26a775386b..67e1e72e23 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -54,20 +54,16 @@ static int coroutine_fn stream_populate(BlockBackend *blk,
 return blk_co_preadv(blk, offset, qiov.size, &qiov, BDRV_REQ_COPY_ON_READ);
 }
 
-typedef struct {
-int ret;
-} StreamCompleteData;
-
-static void stream_complete(Job *job, void *opaque)
+static void stream_exit(Job *job)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockJob *bjob = &s->common;
-StreamCompleteData *data = opaque;
 BlockDriverState *bs = blk_bs(bjob->blk);
 BlockDriverState *base = s->base;
 Error *local_err = NULL;
+int ret = job->ret;
 
-if (!job_is_cancelled(job) && bs->backing && data->ret == 0) {
+if (!job_is_cancelled(job) && bs->backing && ret == 0) {
 const char *base_id = NULL, *base_fmt = NULL;
 if (base) {
 base_id = s->backing_file_str;
@@ -75,11 +71,11 @@ static void stream_complete(Job *job, void *opaque)
 base_fmt = base->drv->format_name;
 }
 }
-data->ret = bdrv_change_backing_file(bs, base_id, base_fmt);
+ret = bdrv_change_backing_file(bs, base_id, base_fmt);
 bdrv_set_backing_hd(bs, base, &local_err);
 if (local_err) {
 error_report_err(local_err);
-data->ret = -EPERM;
+ret = -EPERM;
 goto out;
 }
 }
@@ -93,14 +89,12 @@ out:
 }
 
 g_free(s->backing_file_str);
-job_completed(job, data->ret);
-g_free(data);
+job->ret = ret;
 }
 
 static int coroutine_fn stream_run(Job *job, Error **errp)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
-StreamCompleteData *data;
 BlockBackend *blk = s->common.blk;
 BlockDriverState *bs = blk_bs(blk);
 BlockDriverState *base = s->base;
@@ -203,9 +197,6 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 
 out:
 /* Modify backing chain and close BDSes in main loop */
-data = g_malloc(sizeof(*data));

[Qemu-block] [PATCH 3/7] jobs: add exit shim

2018-08-17 Thread John Snow
All jobs do the same thing when they leave their running loop:
- Store the return code in a structure
- wait to receive this structure in the main thread
- signal job completion via job_completed

Few jobs do anything beyond exactly this. Consolidate this exit
logic for a net reduction in SLOC.

More seriously, when we utilize job_defer_to_main_loop_bh to call
a function that calls job_completed, job_finalize_single will run
in a context where it has recursively taken the aio_context lock,
which can cause hangs if it puts down a reference that causes a flush.

You can observe this in practice by looking at mirror_exit's careful
placement of job_completed and bdrv_unref calls.

If we centralize job exiting, we can signal job completion from outside
of the aio_context, which should allow for job cleanup code to run with
only one lock, which makes cleanup callbacks less tricky to write.

Signed-off-by: John Snow 
---
 include/qemu/job.h |  7 +++
 job.c  | 19 +++
 2 files changed, 26 insertions(+)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index 5c92c53ef0..6f5b13751a 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -204,6 +204,13 @@ struct JobDriver {
  */
 void (*drain)(Job *job);
 
+/**
+ * If the callback is not NULL, exit will be invoked from the main thread
+ * when the job's coroutine has finished, but before transactional
+ * convergence; before @prepare or @abort.
+ */
+void (*exit)(Job *job);
+
 /**
  * If the callback is not NULL, prepare will be invoked when all the jobs
  * belonging to the same transaction complete; or upon this job's 
completion
diff --git a/job.c b/job.c
index c9de1af556..fae8e6047c 100644
--- a/job.c
+++ b/job.c
@@ -535,6 +535,19 @@ void job_drain(Job *job)
 }
 }
 
+static void job_exit(void *opaque)
+{
+Job *job = (Job *)opaque;
+AioContext *aio_context = job->aio_context;
+
+if (job->driver->exit) {
+aio_context_acquire(aio_context);
+job->driver->exit(job);
+aio_context_release(aio_context);
+}
+job_completed(job, job->ret);
+}
+
 /**
  * All jobs must allow a pause point before entering their job proper. This
  * ensures that jobs can be paused prior to being started, then resumed later.
@@ -546,6 +559,12 @@ static void coroutine_fn job_co_entry(void *opaque)
 assert(job && job->driver && job->driver->run);
 job_pause_point(job);
 job->ret = job->driver->run(job, &job->err);
+if (!job->deferred_to_main_loop) {
+job->deferred_to_main_loop = true;
+aio_bh_schedule_oneshot(qemu_get_aio_context(),
+job_exit,
+job);
+}
 }
 
 
-- 
2.14.4




[Qemu-block] [PATCH 4/7] block/commit: utilize job_exit shim

2018-08-17 Thread John Snow
Change the manual deferment to commit_complete into the implicit
callback to job_exit, renaming commit_complete to commit_exit.

This conversion does change the timing of when job_completed is
called to after the bdrv_replace_node and bdrv_unref calls, which
could have implications for bjob->blk which will now be put down
after this cleanup.

Kevin highlights that we did not take any permissions for that backend
at job creation time, so it is safe to reorder these operations.

Signed-off-by: John Snow 
---
 block/commit.c | 20 
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/block/commit.c b/block/commit.c
index 4a17bb73ec..93c3b6a39e 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -68,19 +68,13 @@ static int coroutine_fn commit_populate(BlockBackend *bs, 
BlockBackend *base,
 return 0;
 }
 
-typedef struct {
-int ret;
-} CommitCompleteData;
-
-static void commit_complete(Job *job, void *opaque)
+static void commit_exit(Job *job)
 {
 CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
 BlockJob *bjob = &s->common;
-CommitCompleteData *data = opaque;
 BlockDriverState *top = blk_bs(s->top);
 BlockDriverState *base = blk_bs(s->base);
 BlockDriverState *commit_top_bs = s->commit_top_bs;
-int ret = data->ret;
 bool remove_commit_top_bs = false;
 
 /* Make sure commit_top_bs and top stay around until bdrv_replace_node() */
@@ -93,8 +87,8 @@ static void commit_complete(Job *job, void *opaque)
 
 if (!job_is_cancelled(job) && ret == 0) {
 /* success */
-ret = bdrv_drop_intermediate(s->commit_top_bs, base,
- s->backing_file_str);
+job->ret = bdrv_drop_intermediate(s->commit_top_bs, base,
+  s->backing_file_str);
 } else {
 /* XXX Can (or should) we somehow keep 'consistent read' blocked even
  * after the failed/cancelled commit job is gone? If we already wrote
@@ -117,9 +111,6 @@ static void commit_complete(Job *job, void *opaque)
  * bdrv_set_backing_hd() to fail. */
 block_job_remove_all_bdrv(bjob);
 
-job_completed(job, ret);
-g_free(data);
-
 /* If bdrv_drop_intermediate() didn't already do that, remove the commit
  * filter driver from the backing chain. Do this as the final step so that
  * the 'consistent read' permission can be granted.  */
@@ -137,7 +128,6 @@ static void commit_complete(Job *job, void *opaque)
 static int coroutine_fn commit_run(Job *job, Error **errp)
 {
 CommitBlockJob *s = container_of(job, CommitBlockJob, common.job);
-CommitCompleteData *data;
 int64_t offset;
 uint64_t delay_ns = 0;
 int ret = 0;
@@ -210,9 +200,6 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
 out:
 qemu_vfree(buf);
 
-data = g_malloc(sizeof(*data));
-data->ret = ret;
-job_defer_to_main_loop(&s->common.job, commit_complete, data);
 return ret;
 }
 
@@ -224,6 +211,7 @@ static const BlockJobDriver commit_job_driver = {
 .user_resume   = block_job_user_resume,
 .drain = block_job_drain,
 .run   = commit_run,
+.exit  = commit_exit,
 },
 };
 
-- 
2.14.4




[Qemu-block] [PATCH 7/7] jobs: remove job_defer_to_main_loop

2018-08-17 Thread John Snow
Now that the job infrastructure is handling the job_completed call for
all implemented jobs, we can remove the interface that allowed jobs to
schedule their own completion.

Signed-off-by: John Snow 
---
 include/qemu/job.h | 17 -
 job.c  | 40 ++--
 2 files changed, 2 insertions(+), 55 deletions(-)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index 6f5b13751a..8349702e3e 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -558,23 +558,6 @@ void job_finalize(Job *job, Error **errp);
  */
 void job_dismiss(Job **job, Error **errp);
 
-typedef void JobDeferToMainLoopFn(Job *job, void *opaque);
-
-/**
- * @job: The job
- * @fn: The function to run in the main loop
- * @opaque: The opaque value that is passed to @fn
- *
- * This function must be called by the main job coroutine just before it
- * returns.  @fn is executed in the main loop with the job AioContext acquired.
- *
- * Block jobs must call bdrv_unref(), bdrv_close(), and anything that uses
- * bdrv_drain_all() in the main loop.
- *
- * The @job AioContext is held while @fn executes.
- */
-void job_defer_to_main_loop(Job *job, JobDeferToMainLoopFn *fn, void *opaque);
-
 /**
  * Synchronously finishes the given @job. If @finish is given, it is called to
  * trigger completion or cancellation of the job.
diff --git a/job.c b/job.c
index fae8e6047c..dbb225e35d 100644
--- a/job.c
+++ b/job.c
@@ -559,12 +559,8 @@ static void coroutine_fn job_co_entry(void *opaque)
 assert(job && job->driver && job->driver->run);
 job_pause_point(job);
 job->ret = job->driver->run(job, &job->err);
-if (!job->deferred_to_main_loop) {
-job->deferred_to_main_loop = true;
-aio_bh_schedule_oneshot(qemu_get_aio_context(),
-job_exit,
-job);
-}
+job->deferred_to_main_loop = true;
+aio_bh_schedule_oneshot(qemu_get_aio_context(), job_exit, job);
 }
 
 
@@ -968,38 +964,6 @@ void job_complete(Job *job, Error **errp)
 job->driver->complete(job, errp);
 }
 
-
-typedef struct {
-Job *job;
-JobDeferToMainLoopFn *fn;
-void *opaque;
-} JobDeferToMainLoopData;
-
-static void job_defer_to_main_loop_bh(void *opaque)
-{
-JobDeferToMainLoopData *data = opaque;
-Job *job = data->job;
-AioContext *aio_context = job->aio_context;
-
-aio_context_acquire(aio_context);
-data->fn(data->job, data->opaque);
-aio_context_release(aio_context);
-
-g_free(data);
-}
-
-void job_defer_to_main_loop(Job *job, JobDeferToMainLoopFn *fn, void *opaque)
-{
-JobDeferToMainLoopData *data = g_malloc(sizeof(*data));
-data->job = job;
-data->fn = fn;
-data->opaque = opaque;
-job->deferred_to_main_loop = true;
-
-aio_bh_schedule_oneshot(qemu_get_aio_context(),
-job_defer_to_main_loop_bh, data);
-}
-
 int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error 
**errp)
 {
 Error *local_err = NULL;
-- 
2.14.4




Re: [Qemu-block] [PATCH v2 2/3] scripts: add render_block_graph function for QEMUMachine

2018-08-17 Thread Eduardo Habkost
On Fri, Aug 17, 2018 at 09:59:41PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> 17.08.2018 21:25, Eduardo Habkost wrote:
> > On Fri, Aug 17, 2018 at 09:04:39PM +0300, Vladimir Sementsov-Ogievskiy 
> > wrote:
> > > Render block nodes graph with help of graphviz. This new function is
> > > for debugging, so there is no sense to put it into qemu.py as a method
> > > of QEMUMachine. Let's instead put it separately.
> > > 
> > > Signed-off-by: Vladimir Sementsov-Ogievskiy 
> > > ---
> > >   scripts/render_block_graph.py | 78 
> > > +++
> > >   1 file changed, 78 insertions(+)
> > >   create mode 100644 scripts/render_block_graph.py
> > > 
> > > diff --git a/scripts/render_block_graph.py b/scripts/render_block_graph.py
> > > new file mode 100644
> > > index 00..7048a0bac8
> > > --- /dev/null
> > > +++ b/scripts/render_block_graph.py
> > > @@ -0,0 +1,78 @@
> > > +# Render Qemu Block Graph
> > [...]
> > 
> > What about making the script work from the command-line?
> > 
> > Signed-off-by: Eduardo Habkost 
> > ---
> > diff --git a/scripts/render_block_graph.py b/scripts/render_block_graph.py
> > old mode 100644
> > new mode 100755
> > index 7048a0bac8..e29fe0fc41
> > --- a/scripts/render_block_graph.py
> > +++ b/scripts/render_block_graph.py
> > @@ -1,3 +1,4 @@
> > +#!/usr/bin/env python
> >   # Render Qemu Block Graph
> >   #
> >   # Copyright (c) 2017 Parallels International GmbH
> > @@ -16,8 +17,9 @@
> > >   # along with this program.  If not, see <http://www.gnu.org/licenses/>.
> >   #
> > -import os
> > +import os, sys
> >   from graphviz import Digraph
> > +from qmp.qmp import QEMUMonitorProtocol
> >   def perm(arr):
> >   s = 'w' if 'write' in arr else '_'
> > @@ -27,16 +29,16 @@ def perm(arr):
> >   s += 's' if 'resize' in arr else '_'
> >   return s
> > -def render_block_graph(vm, filename, pointers=False, format='png'):
> > +def render_block_graph(qmp, filename, pointers=False, format='png'):
> >   '''
> >   Render graph in text (dot) representation into "@filename" and
> >   representation in @format into "@filename.@format"
> >   '''
> > -nodes = vm.command('query-named-block-nodes')
> > +nodes = qmp.command('query-named-block-nodes')
> >   nodes_info = {n['node-name']: n for n in nodes}
> > -block_graph = vm.command('x-query-block-graph')
> > +block_graph = qmp.command('x-query-block-graph')
> >   graph = Digraph(comment='Block Nodes Graph')
> >   graph.format = format
> > @@ -76,3 +78,9 @@ def render_block_graph(vm, filename, pointers=False, 
> > format='png'):
> >   graph.edge(str(e['parent']), str(e['child']), label=label)
> >   graph.render(filename)
> > +
> > +if __name__ == '__main__':
> > +#TODO: use argparse for command-line arguments
> > +qmp = QEMUMonitorProtocol(sys.argv[1])
> > +qmp.connect()
> > +render_block_graph(qmp, sys.argv[2])
> > 
> > 
> 
> Cool, thanks.
> 
> so, how to use it then?
> 
> for python iotest a proper parameter would be vm._qmp, yes?

Yes, assuming you just want to do it locally just for debugging.

One thing we could do later: refactor QEMUMachine and
QEMUMonitorProtocol so their method names are the same and both
'vm' and 'vm._qmp' work here.

> 
> is there a way to dump running libvirt vm graph?
> 
> I've started libvirt guest, qemu process has cmdline parameters
> -chardev 
> socket,id=charmonitor,path=/var/lib/libvirt/qemu/domain-3-tmp/monitor.sock,server,nowait
> -mon chardev=charmonitor,id=monitor,mode=control
> 
> command
> virsh qemu-monitor-command tmp '{"execute": "x-query-block-graph"}'
> works fine,
> 
> but script hangs on connection:
> # python ./scripts/render_block_graph.py
> /var/lib/libvirt/qemu/domain-3-tmp/monitor.sock
> ^CTraceback (most recent call last):
>   File "./scripts/render_block_graph.py", line 85, in <module>
>     qmp.connect()
>   File "/work/src/qemu/up-new-fleecing/scripts/qmp/qmp.py", line 140, in
> connect
>     self.__sock.connect(self.__address)
>   File "/usr/lib64/python2.7/socket.py", line 224, in meth
>     return getattr(self._sock,name)(*args)
> KeyboardInterrupt
> 
> ./scripts/qmp/qmp-shell /var/lib/libvirt/qemu/domain-3-tmp/monitor.sock
> outputs nothing...

I don't know why it doesn't work with a running instance started
by libvirt, but it works if I run QEMU manually using "-qmp
unix:/tmp/qmp,server".

-- 
Eduardo



Re: [Qemu-block] [PATCH v2 2/3] scripts: add render_block_graph function for QEMUMachine

2018-08-17 Thread Eduardo Habkost
On Fri, Aug 17, 2018 at 09:04:39PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> Render block nodes graph with help of graphviz. This new function is
> for debugging, so there is no sense to put it into qemu.py as a method
> of QEMUMachine. Let's instead put it separately.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  scripts/render_block_graph.py | 78 
> +++
>  1 file changed, 78 insertions(+)
>  create mode 100644 scripts/render_block_graph.py
> 
> diff --git a/scripts/render_block_graph.py b/scripts/render_block_graph.py
> new file mode 100644
> index 00..7048a0bac8
> --- /dev/null
> +++ b/scripts/render_block_graph.py
> @@ -0,0 +1,78 @@
> +# Render Qemu Block Graph
[...]

What about making the script work from the command-line?

Signed-off-by: Eduardo Habkost 
---
diff --git a/scripts/render_block_graph.py b/scripts/render_block_graph.py
old mode 100644
new mode 100755
index 7048a0bac8..e29fe0fc41
--- a/scripts/render_block_graph.py
+++ b/scripts/render_block_graph.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 # Render Qemu Block Graph
 #
 # Copyright (c) 2017 Parallels International GmbH
@@ -16,8 +17,9 @@
>  # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 #
 
-import os
+import os, sys
 from graphviz import Digraph
+from qmp.qmp import QEMUMonitorProtocol
 
 def perm(arr):
 s = 'w' if 'write' in arr else '_'
@@ -27,16 +29,16 @@ def perm(arr):
 s += 's' if 'resize' in arr else '_'
 return s
 
-def render_block_graph(vm, filename, pointers=False, format='png'):
+def render_block_graph(qmp, filename, pointers=False, format='png'):
 '''
 Render graph in text (dot) representation into "@filename" and
 representation in @format into "@filename.@format"
 '''
 
-nodes = vm.command('query-named-block-nodes')
+nodes = qmp.command('query-named-block-nodes')
 nodes_info = {n['node-name']: n for n in nodes}
 
-block_graph = vm.command('x-query-block-graph')
+block_graph = qmp.command('x-query-block-graph')
 
 graph = Digraph(comment='Block Nodes Graph')
 graph.format = format
@@ -76,3 +78,9 @@ def render_block_graph(vm, filename, pointers=False, 
format='png'):
 graph.edge(str(e['parent']), str(e['child']), label=label)
 
 graph.render(filename)
+
+if __name__ == '__main__':
+#TODO: use argparse for command-line arguments
+qmp = QEMUMonitorProtocol(sys.argv[1])
+qmp.connect()
+render_block_graph(qmp, sys.argv[2])


-- 
Eduardo



Re: [Qemu-block] [RFC v2] new, node-graph-based fleecing and backup

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
attached generated block graph (with help of "[PATCH v2 0/3] block nodes 
graph visualization")


--
Best regards,
Vladimir



[Qemu-block] [PATCH v2 2/3] scripts: add render_block_graph function for QEMUMachine

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
Render block nodes graph with help of graphviz. This new function is
for debugging, so there is no sense to put it into qemu.py as a method
of QEMUMachine. Let's instead put it separately.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 scripts/render_block_graph.py | 78 +++
 1 file changed, 78 insertions(+)
 create mode 100644 scripts/render_block_graph.py

diff --git a/scripts/render_block_graph.py b/scripts/render_block_graph.py
new file mode 100644
index 00..7048a0bac8
--- /dev/null
+++ b/scripts/render_block_graph.py
@@ -0,0 +1,78 @@
+# Render Qemu Block Graph
+#
+# Copyright (c) 2017 Parallels International GmbH
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import os
+from graphviz import Digraph
+
+def perm(arr):
+s = 'w' if 'write' in arr else '_'
+s += 'r' if 'consistent-read' in arr else '_'
+s += 'u' if 'write-unchanged' in arr else '_'
+s += 'g' if 'graph-mod' in arr else '_'
+s += 's' if 'resize' in arr else '_'
+return s
+
+def render_block_graph(vm, filename, pointers=False, format='png'):
+'''
+Render graph in text (dot) representation into "@filename" and
+representation in @format into "@filename.@format"
+'''
+
+nodes = vm.command('query-named-block-nodes')
+nodes_info = {n['node-name']: n for n in nodes}
+
+block_graph = vm.command('x-query-block-graph')
+
+graph = Digraph(comment='Block Nodes Graph')
+graph.format = format
+graph.node('permission symbols:\l'
+   '  w - Write\l'
+   '  r - consistent-Read\l'
+   '  u - write - Unchanged\l'
+   '  g - Graph-mod\l'
+   '  s - reSize\l'
+   'edge label scheme:\l'
+   '  <child name>\l'
+   '  <permissions>\l'
+   '  <shared permissions>\l', shape='none')
+
+
+for n in block_graph['nodes']:
+if n['type'] == 'block':
+info = nodes_info[n['node-name']]
+label = n['node-name'] + ' [' + info['drv'] + ']'
+shape = 'ellipse'
+else:
+assert n['type'] == 'other'
+label = n['description']
+shape = 'box'
+
+if pointers:
+label += '\n0x%x' % n['id']
+
+if n['type'] == 'block' and info['drv'] == 'file':
+label += '\n' + os.path.basename(info['file'])
+
+graph.node(str(n['id']), label, shape=shape)
+
+for e in block_graph['edges']:
+label = '%s\l%s\l%s\l' % (e['name'], perm(e['perm']),
+  perm(e['shared-perm']))
+graph.edge(str(e['parent']), str(e['child']), label=label)
+
+graph.render(filename)
-- 
2.11.1




[Qemu-block] [PATCH v2 1/3] qapi: add x-query-block-graph

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
Add a new command, returning block nodes graph.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json  | 116 ++
 include/block/block.h |   1 +
 block.c   |  80 ++
 blockdev.c|   5 +++
 4 files changed, 202 insertions(+)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 4c7a37afdc..32398c9912 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1630,6 +1630,122 @@
 { 'command': 'query-named-block-nodes', 'returns': [ 'BlockDeviceInfo' ] }
 
 ##
+# @BlockGraphNodeType:
+#
+# @block: Actual block nodes, which may be queried by query-named-block-nodes.
+# @other: Not a block node. Examples: guest block device, block job.
+#
+# Since: 3.1
+##
+{ 'enum': 'BlockGraphNodeType',
+  'data': [ 'block', 'other' ] }
+
+##
+# @BlockGraphNodeTypeBlock:
+#
+# Since: 3.1
+##
+{ 'struct': 'BlockGraphNodeTypeBlock',
+  'data': { 'node-name': 'str' } }
+
+##
+# @BlockGraphNodeTypeOther:
+#
+# Since: 3.1
+##
+  { 'struct': 'BlockGraphNodeTypeOther',
+'data': { 'description': 'str' } }
+
+##
+# @BlockGraphNode:
+#
+# @id: unique node identifier, equal to node RAM-pointer
+# @type: node type
+#
+# Since: 3.1
+##
+{ 'union': 'BlockGraphNode',
+  'base': { 'id': 'uint64', 'type': 'BlockGraphNodeType' },
+  'discriminator': 'type',
+  'data' : { 'block': 'BlockGraphNodeTypeBlock',
+ 'other': 'BlockGraphNodeTypeOther' } }
+
+##
+# @BlockPermission:
+#
+# Enum of base block permissions.
+#
+# @consistent-read: A user that has the "permission" of consistent reads is
+#   guaranteed that their view of the contents of the block
+#   device is complete and self-consistent, representing the
+#   contents of a disk at a specific point.
+#   For most block devices (including their backing files) this
+#   is true, but the property cannot be maintained in a few
+#   situations like for intermediate nodes of a commit block
+#   job.
+#
+# @write: This permission is required to change the visible disk contents.
+#
+# @write-unchanged: This permission (which is weaker than BLK_PERM_WRITE) is
+#   both enough and required for writes to the block node when
+#   the caller promises that the visible disk content doesn't
+#   change.
+#   As the BLK_PERM_WRITE permission is strictly stronger,
+#   either is sufficient to perform an unchanging write.
+#
+# @resize: This permission is required to change the size of a block node.
+#
+# @graph-mod: This permission is required to change the node that this
+# BdrvChild points to.
+#
+# Since: 3.1
+##
+  { 'enum': 'BlockPermission',
+'data': [ 'consistent-read', 'write', 'write-unchanged', 'resize',
+  'graph-mod' ] }
+##
+# @BlockGraphEdge:
+#
+# Block Graph edge description for x-query-block-graph.
+#
+# @parent: parent id
+#
+# @child: child id
+#
+# @name: name of the relation (examples are 'file' and 'backing')
+#
+# @perm: granted permissions for the parent operating on the child
+#
+# @shared-perm: permissions that can still be granted to other users of the
+#   child while it is still attached this parent
+#
+# Since: 3.1
+##
+{ 'struct': 'BlockGraphEdge',
+  'data': { 'parent': 'uint64', 'child': 'uint64',
+'name': 'str', 'perm': [ 'BlockPermission' ],
+'shared-perm': [ 'BlockPermission' ] } }
+
+##
+# @BlockGraph:
+#
+# Block Graph - list of nodes and list of edges.
+#
+# Since: 3.1
+##
+{ 'struct': 'BlockGraph',
+  'data': { 'nodes': ['BlockGraphNode'], 'edges': ['BlockGraphEdge'] } }
+
+##
+# @x-query-block-graph:
+#
+# Get the block graph.
+#
+# Since: 3.1
+##
+{ 'command': 'x-query-block-graph', 'returns': 'BlockGraph' }
+
+##
 # @drive-mirror:
 #
 # Start mirroring a block device's writes to a new destination. target
diff --git a/include/block/block.h b/include/block/block.h
index 4e0871aaf9..6f2ccad040 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -448,6 +448,7 @@ void bdrv_eject(BlockDriverState *bs, bool eject_flag);
 const char *bdrv_get_format_name(BlockDriverState *bs);
 BlockDriverState *bdrv_find_node(const char *node_name);
 BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp);
+BlockGraph *bdrv_get_block_graph(Error **errp);
 BlockDriverState *bdrv_lookup_bs(const char *device,
  const char *node_name,
  Error **errp);
diff --git a/block.c b/block.c
index 6161dbe3eb..766354e79c 100644
--- a/block.c
+++ b/block.c
@@ -4003,6 +4003,86 @@ BlockDeviceInfoList *bdrv_named_nodes_list(Error **errp)
 return list;
 }
 
+#define QAPI_LIST_ADD(list, element) do { \
+typeof(list) _tmp = g_new(typeof(*(list)), 1); \
+_tmp->value = (element); \
+_tmp->next = (list); \
+list = _tmp; \

[Qemu-block] [PATCH v2 3/3] not-for-commit: example of new command usage for debugging

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/222 | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tests/qemu-iotests/222 b/tests/qemu-iotests/222
index 0ead56d574..c2e647fa83 100644
--- a/tests/qemu-iotests/222
+++ b/tests/qemu-iotests/222
@@ -137,6 +137,9 @@ with iotests.FilePath('base.img') as base_img_path, \
 log('--- Cleanup ---')
 log('')
 
+from render_block_graph import render_block_graph
+render_block_graph(vm, '/tmp/out.dot')
+render_block_graph(vm, '/tmp/out.pointers.dot', pointers=True)
 log(vm.qmp('block-job-cancel', device=src_node))
 log(vm.event_wait('BLOCK_JOB_CANCELLED'),
 filters=[iotests.filter_qmp_event])
-- 
2.11.1




Re: [Qemu-block] [PATCH 0/7] discard blockstats

2018-08-17 Thread Paolo Bonzini
On 20/11/2017 17:50, Anton Nefedov wrote:
> qmp query-blockstats provides stats info for write/read/flush ops.
> 
> Patches 1-5 implement the similar for discard (unmap) command for scsi
> and ide disks.
> Discard stat "unmap_ops / unmap_bytes" is supposed to account the ops that
> have completed without an error.
> 
> However, discard operation is advisory. Specifically,
>  - common block layer ignores ENOTSUP error code.
>That might be returned if the block driver does not support discard,
>or discard has been configured to be ignored.
>  - format drivers such as qcow2 may ignore discard if they were configured
>to ignore that, or if the corresponding area is already marked unused
>(unallocated / zero clusters).
> 
> And what is actually useful is the number of bytes actually discarded
> down on the host filesystem.
> To achieve that, driver-specific statistics has been added to blockstats
> (patch 7).
> With patch 6, file-posix driver accounts discard operations on its level too.
> 
> query-blockstat result:
> 
> (note the difference between blockdevice unmap and file discard stats. qcow2
> sends a few ops down to the file as the clusters are actually unallocated
> on qcow2 level)
> 
> {"return": [
> {"device": "drive-scsi0-0-0-0"
>   "parent": {
> "stats": {
>>  "unmap_operations": 0
>>  "unmap_merged": 0
>   "flush_total_time_ns": 0
>   "wr_highest_offset": 8111718400
>   [..]
>   "invalid_wr_operations": 0
>   "invalid_rd_operations": 0}
> "node-name": "#block047"
>>"driver_stats": {
>>  "type": "file"
>>  "data": {
>>"discard_bytes_ok": 1572864
>>"discard_nb_failed": 0
>>"discard_nb_ok": 5}}}
>   "stats": {
>>   "unmap_operations": 472
>>   "unmap_merged": 0
> "flush_total_time_ns": 44530540
> "wr_highest_offset": 7106662400
> "wr_total_time_ns": 45518856
> "failed_wr_operations": 0
> "failed_rd_operations": 0
> "wr_merged": 0
> "wr_bytes": 889856
> "timed_stats": []
>>"failed_unmap_operations": 0
> "failed_flush_operations": 0
> "account_invalid": true
> "rd_total_time_ns": 3306264098
>>"invalid_unmap_operations": 0
> "flush_operations": 18
> "wr_operations": 120
>>"unmap_bytes": 12312014848
> "rd_merged": 0
> "rd_bytes": 137103360
>>"unmap_total_time_ns": 22664692
> "invalid_flush_operations": 0
> "account_failed": true
> "idle_time_ns": 437316567
> "rd_operations": 5636
> "invalid_wr_operations": 0
> "invalid_rd_operations": 0}
>   "node-name": "#block128"}
> 
>   {"device": "drive-ide0-0-0"
>   [..]
> 
> Anton Nefedov (7):
>   qapi: add unmap to BlockDeviceStats
>   ide: account UNMAP (TRIM) operations
>   scsi: store unmap offset and nb_sectors in request struct
>   scsi: move unmap error checking to the complete callback
>   scsi: account unmap operations
>   file-posix: account discard operations
>   qapi: query-blockstat: add driver specific file-posix stats
> 
>  qapi/block-core.json   | 57 
> ++
>  include/block/accounting.h |  1 +
>  include/block/block.h  |  1 +
>  include/block/block_int.h  |  1 +
>  block.c|  9 
>  block/file-posix.c | 42 --
>  block/qapi.c   | 11 +
>  hw/ide/core.c  | 12 ++
>  hw/scsi/scsi-disk.c| 29 ++-
>  9 files changed, 150 insertions(+), 13 deletions(-)
> 

Hey, looks like this series has fallen through the cracks.  Anton, are
you going to send an updated version?

Thanks,

Paolo



[Qemu-block] [PATCH 2/2] qobject: modify qobject_ref() to assert on NULL

2018-08-17 Thread Marc-André Lureau
While it may be convenient to accept NULL value in
qobject_unref() (for similar reasons as free() accepts NULL), it is
not such a good idea for qobject_ref(): now assert() on NULL.

Some code relied on that behaviour, but it's best to be explicit that
NULL is accepted.  We have to rely on testing, and manual inspection
of qobject_ref() usage:

* block.c:
 - bdrv_refresh_filename(): guarded
 - append_open_options(): it depends if qdict values could be NULL,
   handled for extra safety, might be unnecessary

* block/blkdebug.c:
 - blkdebug_refresh_filename(): depends if qdict values could be NULL,
   full_open_options could be NULL apparently, handled

* block/blkverify.c: guarded

* block/{null,nvme}.c: guarded, previous qdict_del() (actually
  qdict_find()) guarantee non-NULL)

* block/quorum.c: full_open_options could be NULL, handled for extra
  safety, might be unnecessary

* monitor: events have associated qdict, but may not have 'data' dict
  entry. Command 'id' is already guarded. A queued response is
  non-NULL.

* qapi/qmp-dispatch.c: if "arguments" exists, it can't be NULL during
  json parsing

* qapi/qobject-input-visitor.c: guarded by assert in visit_type_any()

* qapi/qobject-output-visitor.c: guarded by assert() in visit_type_any()
  qobject_output_complete(): guarded by pre-condition assert()

* qmp.c: guarded, error out before if NULL

* qobject/q{list,dict}.c: can accept NULL values apparently, what's
  the reason? how are you supposed to handle that? no test?

  Some code, such as qdict_flatten_qdict(), assume the value is always
  non-NULL for example.

- tests/*: considered to be covered by make check, not critical

- util/keyval.c: guarded, if (!elt[i]) before

Signed-off-by: Marc-André Lureau 
Reviewed-by: Eric Blake 
---
 include/qapi/qmp/qobject.h | 7 ---
 block.c| 6 --
 block/blkdebug.c   | 3 ++-
 block/quorum.c | 3 ++-
 monitor.c  | 2 +-
 5 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/include/qapi/qmp/qobject.h b/include/qapi/qmp/qobject.h
index fcfd549220..2fe5b42579 100644
--- a/include/qapi/qmp/qobject.h
+++ b/include/qapi/qmp/qobject.h
@@ -74,9 +74,8 @@ static inline void qobject_init(QObject *obj, QType type)
 
 static inline void qobject_ref_impl(QObject *obj)
 {
-if (obj) {
-obj->base.refcnt++;
-}
+assert(obj);
+obj->base.refcnt++;
 }
 
 /**
@@ -103,6 +102,7 @@ static inline void qobject_unref_impl(QObject *obj)
 
 /**
  * qobject_ref(): Increment QObject's reference count
+ * @obj: a #QObject or child type instance (must not be NULL)
  *
  * Returns: the same @obj. The type of @obj will be propagated to the
  * return type.
@@ -116,6 +116,7 @@ static inline void qobject_unref_impl(QObject *obj)
 /**
  * qobject_unref(): Decrement QObject's reference count, deallocate
  * when it reaches zero
+ * @obj: a #QObject or child type instance (can be NULL)
  */
 #define qobject_unref(obj) qobject_unref_impl(QOBJECT(obj))
 
diff --git a/block.c b/block.c
index 6161dbe3eb..f1e35c3c1e 100644
--- a/block.c
+++ b/block.c
@@ -5154,6 +5154,8 @@ static bool append_open_options(QDict *d, 
BlockDriverState *bs)
 for (entry = qdict_first(bs->options); entry;
  entry = qdict_next(bs->options, entry))
 {
+QObject *val;
+
 /* Exclude node-name references to children */
 QLIST_FOREACH(child, >children, next) {
 if (!strcmp(entry->key, child->name)) {
@@ -5174,8 +5176,8 @@ static bool append_open_options(QDict *d, 
BlockDriverState *bs)
 continue;
 }
 
-qdict_put_obj(d, qdict_entry_key(entry),
-  qobject_ref(qdict_entry_value(entry)));
+val = qdict_entry_value(entry);
+qdict_put_obj(d, qdict_entry_key(entry), val ? qobject_ref(val) : 
NULL);
 found_any = true;
 }
 
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 0759452925..062263f7e1 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -846,7 +846,8 @@ static void blkdebug_refresh_filename(BlockDriverState *bs, 
QDict *options)
 opts = qdict_new();
 qdict_put_str(opts, "driver", "blkdebug");
 
-qdict_put(opts, "image", qobject_ref(bs->file->bs->full_open_options));
+qdict_put(opts, "image", bs->file->bs->full_open_options ?
+  qobject_ref(bs->file->bs->full_open_options) : NULL);
 
 for (e = qdict_first(options); e; e = qdict_next(options, e)) {
 if (strcmp(qdict_entry_key(e), "x-image")) {
diff --git a/block/quorum.c b/block/quorum.c
index 9152da8c58..96cd094ede 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -1089,7 +1089,8 @@ static void quorum_refresh_filename(BlockDriverState *bs, 
QDict *options)
 children = qlist_new();
 for (i = 0; i < s->num_children; i++) {
 qlist_append(children,
- qobject_ref(s->children[i]->bs->full_open_options));
+ s->children[i]->bs->full_open_options ?
+ 

[Qemu-block] [PATCH 1/2] qdict: add qdict_steal()

2018-08-17 Thread Marc-André Lureau
Add a new function qdict_steal(), that deletes a key from a dict, and
returns the associated value, if any. Simplify related code.

Signed-off-by: Marc-André Lureau 
---
 include/qapi/qmp/qdict.h |  1 +
 monitor.c|  3 +--
 qobject/block-qdict.c|  7 ++-
 qobject/qdict.c  | 22 ++
 4 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/include/qapi/qmp/qdict.h b/include/qapi/qmp/qdict.h
index 7f3ec10a10..84644bfba6 100644
--- a/include/qapi/qmp/qdict.h
+++ b/include/qapi/qmp/qdict.h
@@ -37,6 +37,7 @@ QObject *qdict_entry_value(const QDictEntry *entry);
 size_t qdict_size(const QDict *qdict);
 void qdict_put_obj(QDict *qdict, const char *key, QObject *value);
 void qdict_del(QDict *qdict, const char *key);
+QObject *qdict_steal(QDict *qdict, const char *key);
 int qdict_haskey(const QDict *qdict, const char *key);
 QObject *qdict_get(const QDict *qdict, const char *key);
 bool qdict_is_equal(const QObject *x, const QObject *y);
diff --git a/monitor.c b/monitor.c
index a1999e396c..eab2dc7b7b 100644
--- a/monitor.c
+++ b/monitor.c
@@ -4262,8 +4262,7 @@ static void handle_qmp_command(JSONMessageParser *parser, 
GQueue *tokens)
 
 qdict = qobject_to(QDict, req);
 if (qdict) {
-id = qobject_ref(qdict_get(qdict, "id"));
-qdict_del(qdict, "id");
+id = qdict_steal(qdict, "id");
 } /* else will fail qmp_dispatch() */
 
 if (req && trace_event_get_state_backends(TRACE_HANDLE_QMP_COMMAND)) {
diff --git a/qobject/block-qdict.c b/qobject/block-qdict.c
index 42054cc274..a07664ee6a 100644
--- a/qobject/block-qdict.c
+++ b/qobject/block-qdict.c
@@ -692,8 +692,6 @@ void qdict_join(QDict *dest, QDict *src, bool overwrite)
  */
 bool qdict_rename_keys(QDict *qdict, const QDictRenames *renames, Error **errp)
 {
-QObject *qobj;
-
 while (renames->from) {
 if (qdict_haskey(qdict, renames->from)) {
 if (qdict_haskey(qdict, renames->to)) {
@@ -702,9 +700,8 @@ bool qdict_rename_keys(QDict *qdict, const QDictRenames 
*renames, Error **errp)
 return false;
 }
 
-qobj = qdict_get(qdict, renames->from);
-qdict_put_obj(qdict, renames->to, qobject_ref(qobj));
-qdict_del(qdict, renames->from);
+qdict_put_obj(qdict, renames->to,
+  qdict_steal(qdict, renames->from));
 }
 
 renames++;
diff --git a/qobject/qdict.c b/qobject/qdict.c
index 3d8c2f7bbc..322f23f094 100644
--- a/qobject/qdict.c
+++ b/qobject/qdict.c
@@ -406,6 +406,28 @@ void qdict_del(QDict *qdict, const char *key)
 }
 }
 
+/**
+ * qdict_steal(): Steal a 'key:value' pair from the dictionary
+ *
+ * Return the associated entry value and remove it, or NULL.
+ */
+QObject *qdict_steal(QDict *qdict, const char *key)
+{
+QDictEntry *entry;
+QObject *val = NULL;
+
+entry = qdict_find(qdict, key, tdb_hash(key) % QDICT_BUCKET_MAX);
+if (entry) {
+QLIST_REMOVE(entry, next);
+val = qobject_ref(entry->value);
+qentry_destroy(entry);
+qdict->size--;
+}
+
+return val;
+}
+
+
 /**
  * qdict_is_equal(): Test whether the two QDicts are equal
  *
-- 
2.18.0.547.g1d89318c48




[Qemu-block] [RFC PATCH 4/5] block: Drop AioContext lock in bdrv_drain_poll_top_level()

2018-08-17 Thread Kevin Wolf
Similar to AIO_WAIT_WHILE(), bdrv_drain_poll_top_level() needs to
release the AioContext lock of the node to be drained before calling
aio_poll(). Otherwise, callbacks called by aio_poll() would possibly
take the lock a second time and run into a deadlock with a nested
AIO_WAIT_WHILE() call.

Signed-off-by: Kevin Wolf 
---
 block/io.c | 25 -
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/block/io.c b/block/io.c
index 7100344c7b..832d2536bf 100644
--- a/block/io.c
+++ b/block/io.c
@@ -268,9 +268,32 @@ bool bdrv_drain_poll(BlockDriverState *bs, bool recursive,
 static bool bdrv_drain_poll_top_level(BlockDriverState *bs, bool recursive,
   BdrvChild *ignore_parent)
 {
+AioContext *ctx = bdrv_get_aio_context(bs);
+
+/*
+ * We cannot easily release the lock unconditionally here because many
+ * callers of drain function (like qemu initialisation, tools, etc.) don't
+ * even hold the main context lock.
+ *
+ * This means that we fix potential deadlocks for the case where we are in
+ * the main context and polling a BDS in a different AioContext, but
+ * draining a BDS in the main context from a different I/O thread would
+ * still have this problem. Fortunately, this isn't supposed to happen
+ * anyway.
+ */
+if (ctx != qemu_get_aio_context()) {
+aio_context_release(ctx);
+} else {
+assert(qemu_get_current_aio_context() == qemu_get_aio_context());
+}
+
 /* Execute pending BHs first and check everything else only after the BHs
  * have executed. */
-while (aio_poll(bs->aio_context, false));
+while (aio_poll(ctx, false));
+
+if (ctx != qemu_get_aio_context()) {
+aio_context_acquire(ctx);
+}
 
 return bdrv_drain_poll(bs, recursive, ignore_parent, false);
 }
-- 
2.13.6




[Qemu-block] [RFC PATCH 2/5] tests: Acquire AioContext around job_finish_sync()

2018-08-17 Thread Kevin Wolf
All callers in QEMU proper hold the AioContext lock when calling
job_finish_sync(). The tests should do the same.

Signed-off-by: Kevin Wolf 
---
 include/qemu/job.h  | 6 ++
 tests/test-bdrv-drain.c | 6 ++
 tests/test-blockjob.c   | 6 ++
 3 files changed, 18 insertions(+)

diff --git a/include/qemu/job.h b/include/qemu/job.h
index 0dae5b8481..8ac48dbd28 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -520,6 +520,8 @@ void job_user_cancel(Job *job, bool force, Error **errp);
  *
  * Returns the return value from the job if the job actually completed
  * during the call, or -ECANCELED if it was canceled.
+ *
+ * Callers must hold the AioContext lock of job->aio_context.
  */
 int job_cancel_sync(Job *job);
 
@@ -537,6 +539,8 @@ void job_cancel_sync_all(void);
  * function).
  *
  * Returns the return value from the job.
+ *
+ * Callers must hold the AioContext lock of job->aio_context.
  */
 int job_complete_sync(Job *job, Error **errp);
 
@@ -579,6 +583,8 @@ void job_defer_to_main_loop(Job *job, JobDeferToMainLoopFn 
*fn, void *opaque);
  *
  * Returns 0 if the job is successfully completed, -ECANCELED if the job was
  * cancelled before completing, and -errno in other error cases.
+ *
+ * Callers must hold the AioContext lock of job->aio_context.
  */
 int job_finish_sync(Job *job, void (*finish)(Job *, Error **errp), Error 
**errp);
 
diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index 17bb8508ae..30294038ef 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -795,6 +795,7 @@ static void test_blockjob_common(enum drain_type drain_type)
 BlockBackend *blk_src, *blk_target;
 BlockDriverState *src, *target;
 BlockJob *job;
+AioContext *ctx;
 int ret;
 
 src = bdrv_new_open_driver(_test, "source", BDRV_O_RDWR,
@@ -807,6 +808,9 @@ static void test_blockjob_common(enum drain_type drain_type)
 blk_target = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
 blk_insert_bs(blk_target, target, _abort);
 
+ctx = qemu_get_aio_context();
+aio_context_acquire(ctx);
+
 job = block_job_create("job0", _job_driver, NULL, src, 0, 
BLK_PERM_ALL,
0, 0, NULL, NULL, _abort);
 block_job_add_bdrv(job, "target", target, 0, BLK_PERM_ALL, _abort);
@@ -853,6 +857,8 @@ static void test_blockjob_common(enum drain_type drain_type)
 ret = job_complete_sync(>job, _abort);
 g_assert_cmpint(ret, ==, 0);
 
+aio_context_release(ctx);
+
 blk_unref(blk_src);
 blk_unref(blk_target);
 bdrv_unref(src);
diff --git a/tests/test-blockjob.c b/tests/test-blockjob.c
index cb42f06e61..8c2babbe35 100644
--- a/tests/test-blockjob.c
+++ b/tests/test-blockjob.c
@@ -230,6 +230,10 @@ static void cancel_common(CancelJob *s)
 BlockJob *job = >common;
 BlockBackend *blk = s->blk;
 JobStatus sts = job->job.status;
+AioContext *ctx;
+
+ctx = job->job.aio_context;
+aio_context_acquire(ctx);
 
 job_cancel_sync(>job);
 if (sts != JOB_STATUS_CREATED && sts != JOB_STATUS_CONCLUDED) {
@@ -239,6 +243,8 @@ static void cancel_common(CancelJob *s)
 assert(job->job.status == JOB_STATUS_NULL);
 job_unref(>job);
 destroy_blk(blk);
+
+aio_context_release(ctx);
 }
 
 static void test_cancel_created(void)
-- 
2.13.6




[Qemu-block] [RFC PATCH 5/5] [WIP] Lock AioContext in bdrv_co_drain_bh_cb()

2018-08-17 Thread Kevin Wolf
Not sure if this is correct, but at least it makes qemu-iotests 127 pass
again.

Signed-off-by: Kevin Wolf 
---
 block/io.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/block/io.c b/block/io.c
index 832d2536bf..d3dde4d7fd 100644
--- a/block/io.c
+++ b/block/io.c
@@ -309,6 +309,10 @@ static void bdrv_co_drain_bh_cb(void *opaque)
 BdrvCoDrainData *data = opaque;
 Coroutine *co = data->co;
 BlockDriverState *bs = data->bs;
+AioContext *ctx;
+
+ctx = bdrv_get_aio_context(bs);
+aio_context_acquire(ctx);
 
 if (bs) {
 bdrv_dec_in_flight(bs);
@@ -324,6 +328,8 @@ static void bdrv_co_drain_bh_cb(void *opaque)
 bdrv_drain_all_begin();
 }
 
+aio_context_release(ctx);
+
 data->done = true;
 aio_co_wake(co);
 }
-- 
2.13.6




[Qemu-block] [RFC PATCH 0/5] Fix some jobs/drain/aio_poll related hangs

2018-08-17 Thread Kevin Wolf
I'm running out of time and will be offline for the next two weeks, so
I'm just sending out what I have right now. This is probably not ready
to be merged yet, but if need be, someone else can pick it up. Otherwise
I'll do that myself when I return.

This is related to the following bug reports:
https://bugzilla.redhat.com/show_bug.cgi?id=1614623
https://bugzilla.redhat.com/show_bug.cgi?id=1601212

Essentially it tries to fix a few deadlocks into which we run when
running two commit jobs (probably just one example for problematic
scenarios) on nodes that are in a non-mainloop AioContext (i.e. with
dataplane).

Obviously, besides verifying that the code changes are actually correct
as they are, we also want to add some test cases for this. I suppose
test-bdrv-drain should test for the individual bugs, and a qemu-iotests
case could try the higher level scenario of multiple commit jobs with
dataplane.

Kevin Wolf (5):
  blockjob: Wake up BDS when job becomes idle
  tests: Acquire AioContext around job_finish_sync()
  job: Drop AioContext lock around aio_poll()
  block: Drop AioContext lock in bdrv_drain_poll_top_level()
  [WIP] Lock AioContext in bdrv_co_drain_bh_cb()

 include/block/blockjob.h | 13 +
 include/qemu/job.h   |  9 +
 block/io.c   | 31 ++-
 blockjob.c   | 18 ++
 job.c| 10 ++
 tests/test-bdrv-drain.c  |  6 ++
 tests/test-blockjob.c|  6 ++
 7 files changed, 92 insertions(+), 1 deletion(-)

-- 
2.13.6




[Qemu-block] [RFC PATCH 1/5] blockjob: Wake up BDS when job becomes idle

2018-08-17 Thread Kevin Wolf
In the context of draining a BDS, the .drained_poll callback of block
jobs is called. If this returns true (i.e. there is still some activity
pending), the drain operation may call aio_poll() with blocking=true to
wait for completion.

As soon as the pending activity is completed and the job finally arrives
in a quiescent state (i.e. its coroutine either yields with busy=false
or terminates), the block job must notify the aio_poll() loop to wake
up, otherwise we get a deadlock if both are running in different
threads.

Signed-off-by: Kevin Wolf 
---
 include/block/blockjob.h | 13 +
 include/qemu/job.h   |  3 +++
 blockjob.c   | 18 ++
 job.c|  7 +++
 4 files changed, 41 insertions(+)

diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 32c00b7dc0..2290bbb824 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -70,6 +70,9 @@ typedef struct BlockJob {
 /** Called when the job transitions to READY */
 Notifier ready_notifier;
 
+/** Called when the job coroutine yields or terminates */
+Notifier idle_notifier;
+
 /** BlockDriverStates that are involved in this block job */
 GSList *nodes;
 } BlockJob;
@@ -119,6 +122,16 @@ int block_job_add_bdrv(BlockJob *job, const char *name, 
BlockDriverState *bs,
 void block_job_remove_all_bdrv(BlockJob *job);
 
 /**
+ * block_job_wakeup_all_bdrv:
+ * @job: The block job
+ *
+ * Calls bdrv_wakeup() for all BlockDriverStates that have been added to the
+ * job. This function is to be called whenever child_job_drained_poll() would
+ * go from true to false to notify waiting drain requests.
+ */
+void block_job_wakeup_all_bdrv(BlockJob *job);
+
+/**
  * block_job_set_speed:
  * @job: The job to set the speed for.
  * @speed: The new value
diff --git a/include/qemu/job.h b/include/qemu/job.h
index 18c9223e31..0dae5b8481 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -148,6 +148,9 @@ typedef struct Job {
 /** Notifiers called when the job transitions to READY */
 NotifierList on_ready;
 
+/** Notifiers called when the job coroutine yields or terminates */
+NotifierList on_idle;
+
 /** Element of the list of jobs */
 QLIST_ENTRY(Job) job_list;
 
diff --git a/blockjob.c b/blockjob.c
index be5903aa96..8d27e8e1ea 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -221,6 +221,22 @@ int block_job_add_bdrv(BlockJob *job, const char *name, 
BlockDriverState *bs,
 return 0;
 }
 
+void block_job_wakeup_all_bdrv(BlockJob *job)
+{
+GSList *l;
+
+for (l = job->nodes; l; l = l->next) {
+BdrvChild *c = l->data;
+bdrv_wakeup(c->bs);
+}
+}
+
+static void block_job_on_idle(Notifier *n, void *opaque)
+{
+BlockJob *job = opaque;
+block_job_wakeup_all_bdrv(job);
+}
+
 bool block_job_is_internal(BlockJob *job)
 {
 return (job->job.id == NULL);
@@ -419,6 +435,7 @@ void *block_job_create(const char *job_id, const 
BlockJobDriver *driver,
 job->finalize_completed_notifier.notify = block_job_event_completed;
 job->pending_notifier.notify = block_job_event_pending;
 job->ready_notifier.notify = block_job_event_ready;
+job->idle_notifier.notify = block_job_on_idle;
 
 notifier_list_add(>job.on_finalize_cancelled,
   >finalize_cancelled_notifier);
@@ -426,6 +443,7 @@ void *block_job_create(const char *job_id, const 
BlockJobDriver *driver,
   >finalize_completed_notifier);
 notifier_list_add(>job.on_pending, >pending_notifier);
 notifier_list_add(>job.on_ready, >ready_notifier);
+notifier_list_add(>job.on_idle, >idle_notifier);
 
 error_setg(>blocker, "block device is in use by block job: %s",
job_type_str(>job));
diff --git a/job.c b/job.c
index fa671b431a..a746bfe70b 100644
--- a/job.c
+++ b/job.c
@@ -410,6 +410,11 @@ static void job_event_ready(Job *job)
 notifier_list_notify(>on_ready, job);
 }
 
+static void job_event_idle(Job *job)
+{
+notifier_list_notify(>on_idle, job);
+}
+
 void job_enter_cond(Job *job, bool(*fn)(Job *job))
 {
 if (!job_started(job)) {
@@ -455,6 +460,7 @@ static void coroutine_fn job_do_yield(Job *job, uint64_t ns)
 timer_mod(>sleep_timer, ns);
 }
 job->busy = false;
+job_event_idle(job);
 job_unlock();
 qemu_coroutine_yield();
 
@@ -547,6 +553,7 @@ static void coroutine_fn job_co_entry(void *opaque)
 assert(job && job->driver && job->driver->start);
 job_pause_point(job);
 job->driver->start(job);
+job_event_idle(job);
 }
 
 
-- 
2.13.6




[Qemu-block] [RFC PATCH 3/5] job: Drop AioContext lock around aio_poll()

2018-08-17 Thread Kevin Wolf
Similar to AIO_WAIT_WHILE(), job_finish_sync() needs to release the
AioContext lock of the job before calling aio_poll(). Otherwise,
callbacks called by aio_poll() would possibly take the lock a second
time and run into a deadlock with a nested AIO_WAIT_WHILE() call.

Signed-off-by: Kevin Wolf 
---
 job.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/job.c b/job.c
index a746bfe70b..6acf55bceb 100644
--- a/job.c
+++ b/job.c
@@ -1016,7 +1016,10 @@ int job_finish_sync(Job *job, void (*finish)(Job *, 
Error **errp), Error **errp)
 job_drain(job);
 }
 while (!job_is_completed(job)) {
+AioContext *aio_context = job->aio_context;
+aio_context_release(aio_context);
 aio_poll(qemu_get_aio_context(), true);
+aio_context_acquire(aio_context);
 }
 ret = (job_is_cancelled(job) && job->ret == 0) ? -ECANCELED : job->ret;
 job_unref(job);
-- 
2.13.6




Re: [Qemu-block] [Qemu-devel] [PATCH 2/4] nbd/server: Advertise actual minimum block size

2018-08-17 Thread Eric Blake

On 08/17/2018 10:04 AM, Vladimir Sementsov-Ogievskiy wrote:
Can you create a dirty bitmap with a granularity smaller than 
request_alignment?  I know you can configure dirty bitmap granularity 
independently from cluster size (in both directions: either smaller or 
larger than cluster size), but that it has a least a minimum lower 
bounds of 512.  You're probably right that we also want it to have a 
minimum lower bound of the request_alignment (if you're using a device 
with 4k minimum I/O, request_alignment would be 4k, and having a dirty 
bitmap any smaller than that granularity is wasted space).


On the other hand, I also think we're safe for this patch: even if you 
waste the space by creating a bitmap with too-small granularity, the 
actions that write bits into the bitmap will still be aligned to 
request_alignment,


Not quite right: nbd_export_bitmap searches through the backing chain 
for the bitmap, and it may correspond to the bds with smaller 
request_alignment.


Hmm. Interesting setup:

base (512-byte alignment, bitmap granularity) <- active (4k alignment)

so you're stating that exporting 'active' but with the 'bitmap' 
inherited from the backing chain from base means that the bitmap may 
have transitions in between the alignment advertised by active.  Then it 
sounds like nbd/server.c:bitmap_to_extents() should be aware of that 
fact, and intentionally round up (anywhere that it finds a mid-alignment 
transition, treat the entire aligned region as dirty. It never hurts to 
mark more of the image dirty than necessary, but does hurt to expose a 
mid-alignment transition to an NBD client not expecting one).


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [Qemu-devel] [PATCH 4/4] nbd/client: Deal with unaligned size from server

2018-08-17 Thread Eric Blake

On 08/17/2018 08:57 AM, Vladimir Sementsov-Ogievskiy wrote:

02.08.2018 17:48, Eric Blake wrote:

When a server advertises an unaligned size but no block sizes,
the code was rounding up to a sector-aligned size (a known
limitation of bdrv_getlength()), then assuming a request_alignment
of 512 (the recommendation of the NBD spec for maximum portability).
However, this means that qemu will actually attempt to access the
padding bytes of the trailing partial sector.

An easy demonstration, using nbdkit as the server:
$ nbdkit -fv random size=1023
$ qemu-io -r -f raw -c 'r -v 0 1023' nbd://localhost:10809
read failed: Invalid argument

because the client rounded the request up to 1024 bytes, which
nbdkit then rejected as beyond the advertised size of 1023.

Note that qemu as the server refuses to send an unaligned size, as
it has already rounded the unaligned image up to sector size, and
then happily resizes the image on access (at least when serving a
POSIX file over NBD).

Reported-by: Richard W.M. Jones 
Signed-off-by: Eric Blake 


Reviewed-by: Vladimir Sementsov-Ogievskiy 



This patch is not a full solution. It fixes things so that a client 
accessing the first half of the final sector no longer rounds things up 
and chokes the server, but does not prevent a client from attempting to 
access the second half of the final sector (where that access still 
reaches the server).  I probably need yet another patch, similar to 
Rich's 'nbdkit --filter truncate', where reads of the trailing hole 
created by qemu's rounding are padded to NUL without asking the server, 
and where writes are ignored if all zero or cause ENOSPACE if nonzero.


Or, a much bigger patch series to make qemu quit rounding size up :) 
(I'd like to get there someday, but it's faster to kick out the quick 
patch for just NBD than to audit the entire block stack)


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [Qemu-devel] [PATCH 2/4] nbd/server: Advertise actual minimum block size

2018-08-17 Thread Vladimir Sementsov-Ogievskiy

02.08.2018 17:48, Eric Blake wrote:

Both NBD_CMD_BLOCK_STATUS and structured NBD_CMD_READ will split
their reply according to bdrv_block_status() boundaries. If the
block device has a request_alignment smaller than 512, but we
advertise a block alignment of 512 to the client, then this can
result in the server reply violating client expectations by
reporting a smaller region of the export than what the client
is permitted to address.  Thus, it is imperative that we
advertise the actual minimum block limit, rather than blindly
rounding it up to 512 (bdrv_block_status() cannot return status
aligned any smaller than request_alignment).

Signed-off-by: Eric Blake 
---
  nbd/server.c | 10 ++
  1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/nbd/server.c b/nbd/server.c
index ea5fe0eb336..cd3c41f895b 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -608,10 +608,12 @@ static int nbd_negotiate_handle_info(NBDClient *client, 
uint16_t myflags,
  /* Send NBD_INFO_BLOCK_SIZE always, but tweak the minimum size
   * according to whether the client requested it, and according to
   * whether this is OPT_INFO or OPT_GO. */
-/* minimum - 1 for back-compat, or 512 if client is new enough.
- * TODO: consult blk_bs(blk)->bl.request_alignment? */
-sizes[0] =
-(client->opt == NBD_OPT_INFO || blocksize) ? BDRV_SECTOR_SIZE : 1;
+/* minimum - 1 for back-compat, or actual if client will obey it. */
+if (client->opt == NBD_OPT_INFO || blocksize) {
+sizes[0] = blk_get_request_alignment(exp->blk);
+} else {
+sizes[0] = 1;
+}
  /* preferred - Hard-code to 4096 for now.
   * TODO: is blk_bs(blk)->bl.opt_transfer appropriate? */
  sizes[1] = 4096;


Here, what about dirty bitmap export through BLOCK_STATUS? Could its 
granularity be less than request_alignment? Shouldn't we handle it somehow?


--
Best regards,
Vladimir




Re: [Qemu-block] [PATCH 3/4] scripts/qemu: add render_block_graph method for QEMUMachine

2018-08-17 Thread Eduardo Habkost
On Fri, Aug 17, 2018 at 01:08:42PM +0300, Vladimir Sementsov-Ogievskiy wrote:
> 17.08.2018 04:54, Eduardo Habkost wrote:
> > On Thu, Aug 16, 2018 at 08:20:26PM +0300, Vladimir Sementsov-Ogievskiy 
> > wrote:
> > > Render block nodes graph with help of graphviz
> > > 
> > > Signed-off-by: Vladimir Sementsov-Ogievskiy 
> > Thanks for your patch.  Comments below:
> > 
> > > ---
> > >   scripts/qemu.py | 53 
> > > +
> > >   1 file changed, 53 insertions(+)
> > > 
> > > diff --git a/scripts/qemu.py b/scripts/qemu.py
> > > index f099ce7278..cff562c713 100644
> > > --- a/scripts/qemu.py
> > > +++ b/scripts/qemu.py
> > > @@ -460,3 +460,56 @@ class QEMUMachine(object):
> > >socket.SOCK_STREAM)
> > >   self._console_socket.connect(self._console_address)
> > >   return self._console_socket
> > > +
> > > +def render_block_graph(self, filename):
> > > +'''
> > > +Render graph in text (dot) representation into "filename" and 
> > > graphical
> > > +representation into pdf file "filename".pdf
> > > +'''
> > > +
> > > +try:
> > > +from graphviz import Digraph
> > I'm not convinced that this belongs to qemu.py, if most code
> > using the qemu.py module will never use it.  Do you see any
> > problem in moving this code to a scripts/render_block_graph.py
> > script?
> 
> Not a problem, I can do it.. Just one thought:
> Isn't it better to keep all function doing something with one vm as methods,
> not separate functions?

I don't think so.  We don't need to create a new QEMUMachine
method every time we write a function that takes a QEMUMachine as
argument in our scripts.

There are cases when we're forced to create a method: when the
new code is tightly coupled to the QEMUMachine code and need
access to its private attributes.

There are cases where we probably want to place the code in
qemu.py (as a method): when we expect many users of qemu.py to
call it.

But in other cases, I don't see any problem with another
script/module having a regular function like:

  def my_special_purpose_function(vm, ...):
 ...


-- 
Eduardo



Re: [Qemu-block] [PATCH] qemu-img.c: increase spacing between commands in documentation

2018-08-17 Thread Eric Blake

On 08/16/2018 08:27 PM, Programmingkid wrote:


I am by no means an expert at qemu-img. But I did try my best to create what I think 
should be the new output for qemu-img  --help. This is just the text I 
plan on using in a future patch. It is easier to read right now than it will be in 
patch form, so please let me know if there are any errors in this documentation. 
Thank you.






Just reviewing the first for now, to give you a feel for what to consider.


usage: qemu-img amend [--object objectdef] [--image-opts] [-p] [-q] [-f fmt]
[-t cache] -o options filename

Command parameters:
  -f 'fmt' is the disk image format. It is guessed automatically
 in most cases.


Bad advice. We WANT users to use -f (if you don't, and the automatic 
guessing sees something that is not raw, but your image SHOULD have been 
-f raw, then you have a security hole: the guest can write anything into 
a raw image to make the host access files it shouldn't based on 
interpreting the raw file as something else).  I'd drop the last 
sentence, and use just the first.




--image-optsTreat filename as a set of image options, instead of a plain
 filename.

  -o Used with a comma separated list of format specific options in 
a
 name=value format. Use "-o ?" for an overview of the options


Please spell that "-o help", not "-o ?".   Otherwise, the user has to 
quote the ? to avoid it globbing into any single-byte file lying around 
in the current directory.



 supported by the used format

--object'objectdef' is a QEMU user creatable object definition. See the
 qemu(1) manual page for a description of the object properties.
 The most common object type is a 'secret', which is used to
 supply passwords and/or encryption keys.

  -p Display progress bar. If the -p option is not used for a 
command
 that supports it, the progress is reported when the process
 receives a "SIGUSR1" signal.

  -q Quiet mode - do not print any output (except errors). There's 
no
 progress bar in case both -q and -p options are used.


Not your fault, but I don't like it when interfaces take mutually 
exclusive operations but the last one does not win.  Either '-p -q' 
should behave like '-q', and '-q -p' behave like '-p' (because we accept 
the mutual exclusion but last one wins), or both forms should error 
(because they are incompatible).  But having both forms silently behave 
like '-q' is evil.  So, as long as you're willing to patch up 
interfaces, that's a project to consider.




  -t Specifies the cache mode that should be used with the
 destination file.


And what are those modes?  If you're going to be wordy, then give the 
user enough information to be useful.  Otherwise, being terse in --help 
is fine (and let the man page be wordy instead).


Example:
 qemu-img amend -o compat=0.10 -f qcow2 image.qcow2


Where's an example with --image-opts and --object secret?

We're trying to move away from compat=0.10 (also spelled compat=v2), and 
instead start encouraging compat=v3.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [Qemu-devel] [PATCH 1/4] block: Add bdrv_get_request_alignment()

2018-08-17 Thread Vladimir Sementsov-Ogievskiy

02.08.2018 17:48, Eric Blake wrote:

The next patch needs access to a device's minimum permitted
alignment, since NBD wants to advertise this to clients. Add
an accessor function, borrowing from blk_get_max_transfer()
for accessing a backend's block limits.

Signed-off-by: Eric Blake 


Reviewed-by: Vladimir Sementsov-Ogievskiy 



--
Best regards,
Vladimir




[Qemu-block] [PATCH v2 1/7] block/qcow2-refcount: fix check_oflag_copied

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
Increase corruptions_fixed only after successful fix.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2-refcount.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 3c539f02e5..615847eb09 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1816,7 +1816,7 @@ static int check_oflag_copied(BlockDriverState *bs, 
BdrvCheckResult *res,
 for (i = 0; i < s->l1_size; i++) {
 uint64_t l1_entry = s->l1_table[i];
 uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
-bool l2_dirty = false;
+int l2_fixed_entries = 0;
 
 if (!l2_offset) {
 continue;
@@ -1878,8 +1878,7 @@ static int check_oflag_copied(BlockDriverState *bs, 
BdrvCheckResult *res,
 l2_table[j] = cpu_to_be64(refcount == 1
 ? l2_entry |  QCOW_OFLAG_COPIED
 : l2_entry & ~QCOW_OFLAG_COPIED);
-l2_dirty = true;
-res->corruptions_fixed++;
+l2_fixed_entries++;
 } else {
 res->corruptions++;
 }
@@ -1887,7 +1886,7 @@ static int check_oflag_copied(BlockDriverState *bs, 
BdrvCheckResult *res,
 }
 }
 
-if (l2_dirty) {
+if (l2_fixed_entries > 0) {
 ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2,
 l2_offset, s->cluster_size);
 if (ret < 0) {
@@ -1905,6 +1904,7 @@ static int check_oflag_copied(BlockDriverState *bs, 
BdrvCheckResult *res,
 res->check_errors++;
 goto fail;
 }
+res->corruptions_fixed += l2_fixed_entries;
 }
 }
 
-- 
2.11.1




[Qemu-block] [PATCH v2 7/7] block/qcow2-refcount: fix out-of-file L2 entries to be read-as-zero

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
Rewrite corrupted L2 table entry, which reference space out of
underlying file.

Make this L2 table entry read-as-all-zeros without any allocation.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2-refcount.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 3c004e5bfe..3de3768a3c 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1720,8 +1720,30 @@ static int check_refcounts_l2(BlockDriverState *bs, 
BdrvCheckResult *res,
 /* Mark cluster as used */
 csize = (((l2_entry >> s->csize_shift) & s->csize_mask) + 1) *
 BDRV_SECTOR_SIZE;
+if (csize > s->cluster_size) {
+ret = fix_l2_entry_to_zero(
+bs, res, fix, l2_offset, i, active,
+"compressed cluster larger than cluster: size 0x%"
+PRIx64, csize);
+if (ret < 0) {
+goto fail;
+}
+continue;
+}
+
 coffset = l2_entry & s->cluster_offset_mask &
   ~(BDRV_SECTOR_SIZE - 1);
+if (coffset >= bdrv_getlength(bs->file->bs)) {
+ret = fix_l2_entry_to_zero(
+bs, res, fix, l2_offset, i, active,
+"compressed cluster out of file: offset 0x%" PRIx64,
+coffset);
+if (ret < 0) {
+goto fail;
+}
+continue;
+}
+
 ret = qcow2_inc_refcounts_imrt(bs, res,
refcount_table, refcount_table_size,
coffset, csize);
@@ -1748,6 +1770,16 @@ static int check_refcounts_l2(BlockDriverState *bs, 
BdrvCheckResult *res,
 {
 uint64_t offset = l2_entry & L2E_OFFSET_MASK;
 
+if (offset >= bdrv_getlength(bs->file->bs)) {
+ret = fix_l2_entry_to_zero(
+bs, res, fix, l2_offset, i, active,
+"cluster out of file: offset 0x%" PRIx64, offset);
+if (ret < 0) {
+goto fail;
+}
+continue;
+}
+
 if (flags & CHECK_FRAG_INFO) {
 res->bfi.allocated_clusters++;
 if (next_contiguous_offset &&
-- 
2.11.1




[Qemu-block] [PATCH v2 3/7] block/qcow2-refcount: check_refcounts_l2: refactor compressed case

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
Separate offset and size of compressed cluster.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2-refcount.c | 15 ++-
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 566c19fbfa..0ea01e3ee2 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1570,7 +1570,7 @@ static int check_refcounts_l2(BlockDriverState *bs, 
BdrvCheckResult *res,
 BDRVQcow2State *s = bs->opaque;
 uint64_t *l2_table, l2_entry;
 uint64_t next_contiguous_offset = 0;
-int i, l2_size, nb_csectors, ret;
+int i, l2_size, ret;
 
 /* Read L2 table from disk */
 l2_size = s->l2_size * sizeof(uint64_t);
@@ -1589,6 +1589,9 @@ static int check_refcounts_l2(BlockDriverState *bs, 
BdrvCheckResult *res,
 
 switch (qcow2_get_cluster_type(l2_entry)) {
 case QCOW2_CLUSTER_COMPRESSED:
+{
+int64_t csize, coffset;
+
 /* Compressed clusters don't have QCOW_OFLAG_COPIED */
 if (l2_entry & QCOW_OFLAG_COPIED) {
 fprintf(stderr, "ERROR: coffset=0x%" PRIx64 ": "
@@ -1599,12 +1602,13 @@ static int check_refcounts_l2(BlockDriverState *bs, 
BdrvCheckResult *res,
 }
 
 /* Mark cluster as used */
-nb_csectors = ((l2_entry >> s->csize_shift) &
-   s->csize_mask) + 1;
-l2_entry &= s->cluster_offset_mask;
+csize = (((l2_entry >> s->csize_shift) & s->csize_mask) + 1) *
+BDRV_SECTOR_SIZE;
+coffset = l2_entry & s->cluster_offset_mask &
+  ~(BDRV_SECTOR_SIZE - 1);
 ret = qcow2_inc_refcounts_imrt(bs, res,
refcount_table, refcount_table_size,
-   l2_entry & ~511, nb_csectors * 512);
+   coffset, csize);
 if (ret < 0) {
 goto fail;
 }
@@ -1621,6 +1625,7 @@ static int check_refcounts_l2(BlockDriverState *bs, 
BdrvCheckResult *res,
 res->bfi.fragmented_clusters++;
 }
 break;
+}
 
 case QCOW2_CLUSTER_ZERO_ALLOC:
 case QCOW2_CLUSTER_NORMAL:
-- 
2.11.1




[Qemu-block] [PATCH v2 2/7] block/qcow2-refcount: avoid eating RAM

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
qcow2_inc_refcounts_imrt() (through realloc_refcount_array()) can eat
an unpredictable amount of memory on corrupted table entries, which are
referencing regions far beyond the end of file.

Prevent this, by skipping such regions from further processing.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2-refcount.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 615847eb09..566c19fbfa 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1499,12 +1499,26 @@ int qcow2_inc_refcounts_imrt(BlockDriverState *bs, 
BdrvCheckResult *res,
 {
 BDRVQcow2State *s = bs->opaque;
 uint64_t start, last, cluster_offset, k, refcount;
+int64_t file_len;
 int ret;
 
 if (size <= 0) {
 return 0;
 }
 
+file_len = bdrv_getlength(bs->file->bs);
+if (file_len < 0) {
+return file_len;
+}
+
+if (offset + size - file_len > s->cluster_size) {
+fprintf(stderr, "ERROR: counting reference for region exceeding the "
+"end of the file by more than one cluster: offset 0x%" PRIx64
+" size 0x%" PRIx64 "\n", offset, size);
+res->corruptions++;
+return 0;
+}
+
 start = start_of_cluster(s, offset);
 last = start_of_cluster(s, offset + size - 1);
 for(cluster_offset = start; cluster_offset <= last;
-- 
2.11.1




[Qemu-block] [PATCH v2 6/7] block/qcow2-refcount: fix out-of-file L1 entries to be zero

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
Zero out corrupted L1 table entry, which reference L2 table out of
underlying file.
Zero L1 table entry means that "the L2 table and all clusters described
by this L2 table are unallocated."

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2-refcount.c | 44 
 1 file changed, 44 insertions(+)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index d9c8cd666b..3c004e5bfe 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1647,6 +1647,29 @@ static int fix_l2_entry_to_zero(BlockDriverState *bs, 
BdrvCheckResult *res,
 return ret;
 }
 
+/* Zero out L1 entry
+ *
+ * Returns: -errno if overlap check failed
+ *  0 if write failed
+ *  1 on success
+ */
+static int fix_l1_entry_to_zero(BlockDriverState *bs, BdrvCheckResult *res,
+BdrvCheckMode fix, int64_t l1_offset,
+int l1_index, bool active,
+const char *fmt, ...)
+{
+int ret;
+int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
+va_list args;
+
+va_start(args, fmt);
+ret = fix_table_entry(bs, res, fix, "L1", l1_offset, l1_index, 0, ign,
+  fmt, args);
+va_end(args);
+
+return ret;
+}
+
 /*
  * Increases the refcount in the given refcount table for the all clusters
  * referenced in the L2 table. While doing so, performs some checks on L2
@@ -1808,6 +1831,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
 {
 BDRVQcow2State *s = bs->opaque;
 uint64_t *l1_table = NULL, l2_offset, l1_size2;
+int64_t file_len;
 int i, ret;
 
 l1_size2 = l1_size * sizeof(uint64_t);
@@ -1837,12 +1861,32 @@ static int check_refcounts_l1(BlockDriverState *bs,
be64_to_cpus(&l1_table[i]);
 }
 
+file_len = bdrv_getlength(bs->file->bs);
+if (file_len < 0) {
+ret = file_len;
+goto fail;
+}
+
 /* Do the actual checks */
 for(i = 0; i < l1_size; i++) {
 l2_offset = l1_table[i];
 if (l2_offset) {
 /* Mark L2 table as used */
 l2_offset &= L1E_OFFSET_MASK;
+if (l2_offset >= file_len) {
+ret = fix_l1_entry_to_zero(
+bs, res, fix, l1_table_offset, i, active,
+"l2 table offset out of file: offset 0x%" PRIx64,
+l2_offset);
+if (ret < 0) {
+/* Something is seriously wrong, so abort checking
+ * this L1 table */
+goto fail;
+}
+
+continue;
+}
+
 ret = qcow2_inc_refcounts_imrt(bs, res,
refcount_table, refcount_table_size,
l2_offset, s->cluster_size);
-- 
2.11.1




[Qemu-block] [PATCH v2 4/7] block/qcow2-refcount: check_refcounts_l2: reduce ignored overlaps

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
Reduce number of structures ignored in overlap check: when checking
active table ignore active tables, when checking inactive table ignore
inactive ones.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2-refcount.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 0ea01e3ee2..3b057b635d 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1565,7 +1565,7 @@ enum {
 static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
   void **refcount_table,
   int64_t *refcount_table_size, int64_t l2_offset,
-  int flags, BdrvCheckMode fix)
+  int flags, BdrvCheckMode fix, bool active)
 {
 BDRVQcow2State *s = bs->opaque;
 uint64_t *l2_table, l2_entry;
@@ -1654,11 +1654,12 @@ static int check_refcounts_l2(BlockDriverState *bs, 
BdrvCheckResult *res,
 if (fix & BDRV_FIX_ERRORS) {
 uint64_t l2e_offset =
 l2_offset + (uint64_t)i * sizeof(uint64_t);
+int ign = active ? QCOW2_OL_ACTIVE_L2 :
+   QCOW2_OL_INACTIVE_L2;
 
 l2_entry = QCOW_OFLAG_ZERO;
 l2_table[i] = cpu_to_be64(l2_entry);
-ret = qcow2_pre_write_overlap_check(bs,
-QCOW2_OL_ACTIVE_L2 | QCOW2_OL_INACTIVE_L2,
+ret = qcow2_pre_write_overlap_check(bs, ign,
 l2e_offset, sizeof(uint64_t));
 if (ret < 0) {
 fprintf(stderr, "ERROR: Overlap check failed\n");
@@ -1732,7 +1733,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
   void **refcount_table,
   int64_t *refcount_table_size,
   int64_t l1_table_offset, int l1_size,
-  int flags, BdrvCheckMode fix)
+  int flags, BdrvCheckMode fix, bool active)
 {
 BDRVQcow2State *s = bs->opaque;
 uint64_t *l1_table = NULL, l2_offset, l1_size2;
@@ -1788,7 +1789,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
 /* Process and check L2 entries */
 ret = check_refcounts_l2(bs, res, refcount_table,
  refcount_table_size, l2_offset, flags,
- fix);
+ fix, active);
 if (ret < 0) {
 goto fail;
 }
@@ -2074,7 +2075,7 @@ static int calculate_refcounts(BlockDriverState *bs, 
BdrvCheckResult *res,
 /* current L1 table */
 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
  s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO,
- fix);
+ fix, true);
 if (ret < 0) {
 return ret;
 }
@@ -2097,7 +2098,8 @@ static int calculate_refcounts(BlockDriverState *bs, 
BdrvCheckResult *res,
 continue;
 }
 ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters,
- sn->l1_table_offset, sn->l1_size, 0, fix);
+ sn->l1_table_offset, sn->l1_size, 0, fix,
+ false);
 if (ret < 0) {
 return ret;
 }
-- 
2.11.1




[Qemu-block] [PATCH v2 5/7] block/qcow2-refcount: check_refcounts_l2: split fix_l2_entry_to_zero

2018-08-17 Thread Vladimir Sementsov-Ogievskiy
Split entry repairing to separate function, to be reused later.

Note: entry in in-memory l2 table (local variable in
check_refcounts_l2) is not updated after this patch.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2-refcount.c | 147 -
 1 file changed, 109 insertions(+), 38 deletions(-)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 3b057b635d..d9c8cd666b 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1554,6 +1554,99 @@ enum {
 CHECK_FRAG_INFO = 0x2,  /* update BlockFragInfo counters */
 };
 
+/* Update entry in L1 or L2 table
+ *
+ * Returns: -errno if overlap check failed
+ *  0 if write failed
+ *  1 on success
+ */
+static int write_table_entry(BlockDriverState *bs, const char *table_name,
+ uint64_t table_offset, int entry_index,
+ uint64_t new_val, int ign)
+{
+int ret;
+uint64_t entry_offset =
+table_offset + (uint64_t)entry_index * sizeof(new_val);
+
+cpu_to_be64s(&new_val);
+ret = qcow2_pre_write_overlap_check(bs, ign, entry_offset, 
sizeof(new_val));
+if (ret < 0) {
+fprintf(stderr,
+"ERROR: Can't write %s table entry: overlap check failed: 
%s\n",
+table_name, strerror(-ret));
+return ret;
+}
+
+ret = bdrv_pwrite_sync(bs->file, entry_offset, &new_val, sizeof(new_val));
+if (ret < 0) {
+fprintf(stderr, "ERROR: Failed to overwrite %s table entry: %s\n",
+table_name, strerror(-ret));
+return 0;
+}
+
+return 1;
+}
+
+/* Try to fix (if allowed) entry in L1 or L2 table. Update @res 
correspondingly.
+ *
+ * Returns: -errno if overlap check failed
+ *  0 if entry was not updated for other reason
+ *(fixing disabled or write failed)
+ *  1 on success
+ */
+static int fix_table_entry(BlockDriverState *bs, BdrvCheckResult *res,
+   BdrvCheckMode fix, const char *table_name,
+   uint64_t table_offset, int entry_index,
+   uint64_t new_val, int ign,
+   const char *fmt, va_list args)
+{
+int ret;
+
+fprintf(stderr, fix & BDRV_FIX_ERRORS ? "Repairing: " : "ERROR: ");
+vfprintf(stderr, fmt, args);
+fprintf(stderr, "\n");
+
+if (!(fix & BDRV_FIX_ERRORS)) {
+res->corruptions++;
+return 0;
+}
+
+ret = write_table_entry(bs, table_name, table_offset, entry_index, new_val,
+ign);
+
+if (ret == 1) {
+res->corruptions_fixed++;
+} else {
+res->check_errors++;
+}
+
+return ret;
+}
+
+/* Make L2 entry to be QCOW2_CLUSTER_ZERO_PLAIN
+ *
+ * Returns: -errno if overlap check failed
+ *  0 if write failed
+ *  1 on success
+ */
+static int fix_l2_entry_to_zero(BlockDriverState *bs, BdrvCheckResult *res,
+BdrvCheckMode fix, int64_t l2_offset,
+int l2_index, bool active,
+const char *fmt, ...)
+{
+int ret;
+int ign = active ? QCOW2_OL_ACTIVE_L2 : QCOW2_OL_INACTIVE_L2;
+uint64_t l2_entry = QCOW_OFLAG_ZERO;
+va_list args;
+
+va_start(args, fmt);
+ret = fix_table_entry(bs, res, fix, "L2", l2_offset, l2_index, l2_entry,
+  ign, fmt, args);
+va_end(args);
+
+return ret;
+}
+
 /*
  * Increases the refcount in the given refcount table for the all clusters
  * referenced in the L2 table. While doing so, performs some checks on L2
@@ -1646,46 +1739,24 @@ static int check_refcounts_l2(BlockDriverState *bs, 
BdrvCheckResult *res,
 if (qcow2_get_cluster_type(l2_entry) ==
 QCOW2_CLUSTER_ZERO_ALLOC)
 {
-fprintf(stderr, "%s offset=%" PRIx64 ": Preallocated zero "
-"cluster is not properly aligned; L2 entry "
-"corrupted.\n",
-fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR",
+ret = fix_l2_entry_to_zero(
+bs, res, fix, l2_offset, i, active,
+"offset=%" PRIx64 ": Preallocated zero cluster is "
+"not properly aligned; L2 entry corrupted.",
 offset);
-if (fix & BDRV_FIX_ERRORS) {
-uint64_t l2e_offset =
-l2_offset + (uint64_t)i * sizeof(uint64_t);
-int ign = active ? QCOW2_OL_ACTIVE_L2 :
-   QCOW2_OL_INACTIVE_L2;
-
-l2_entry = QCOW_OFLAG_ZERO;
-l2_table[i] = cpu_to_be64(l2_entry);
-ret = qcow2_pre_write_overlap_check(bs, ign,
-