Re: [Qemu-devel] [PATCH 2/3] COLO-compare: Optimize colo_packet_compare_common

2017-02-24 Thread Hailiang Zhang

On 2017/2/25 11:32, Zhang Chen wrote:

Add offset args for colo_packet_compare_common, optimize
colo_packet_compare_icmp() and colo_packet_compare_udp()
just compare the IP payload.

Signed-off-by: Zhang Chen 
---
  net/colo-compare.c | 28 +---
  1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index e75f0ae..9853232 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -180,7 +180,7 @@ static int packet_enqueue(CompareState *s, int mode)
   * return:0  means packet same
   *> 0 || < 0 means packet different
   */
-static int colo_packet_compare_common(Packet *ppkt, Packet *spkt)
+static int colo_packet_compare_common(Packet *ppkt, Packet *spkt, int offset)
  {
  trace_colo_compare_ip_info(ppkt->size, inet_ntoa(ppkt->ip->ip_src),
 inet_ntoa(ppkt->ip->ip_dst), spkt->size,
@@ -188,7 +188,8 @@ static int colo_packet_compare_common(Packet *ppkt, Packet 
*spkt)
 inet_ntoa(spkt->ip->ip_dst));

  if (ppkt->size == spkt->size) {


This check is useless, because we have done such checks in every caller.


-return memcmp(ppkt->data, spkt->data, spkt->size);
+return memcmp(ppkt->data + offset, spkt->data + offset,
+  spkt->size - offset);
  } else {
  trace_colo_compare_main("Net packet size are not the same");
  return -1;
@@ -237,8 +238,7 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
  spkt->ip->ip_sum = ppkt->ip->ip_sum;
  }

-res = memcmp(ppkt->data + ETH_HLEN, spkt->data + ETH_HLEN,
-(spkt->size - ETH_HLEN));
+res = colo_packet_compare_common(ppkt, spkt, ETH_HLEN);

  if (res != 0 && trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
  trace_colo_compare_pkt_info_src(inet_ntoa(ppkt->ip->ip_src),
@@ -277,7 +277,14 @@ static int colo_packet_compare_udp(Packet *spkt, Packet 
*ppkt)
  return -1;
  }

-ret = colo_packet_compare_common(ppkt, spkt);
+/*
+ * Because of ppkt and spkt are both in the same connection,
+ * The ppkt's src ip, dst ip, src port, dst port, ip_proto all are
+ * same with spkt. In addition, IP header's Identification is a random
+ * field, we can handle it in IP fragmentation function later.
+ * So we just compare the ip payload here.
+ */
+ret = colo_packet_compare_common(ppkt, spkt, network_length + ETH_HLEN);

  if (ret) {
  trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
@@ -304,7 +311,14 @@ static int colo_packet_compare_icmp(Packet *spkt, Packet 
*ppkt)
  return -1;
  }

-if (colo_packet_compare_common(ppkt, spkt)) {
+/*
+ * Because of ppkt and spkt are both in the same connection,
+ * The ppkt's src ip, dst ip, src port, dst port, ip_proto all are
+ * same with spkt. In addition, IP header's Identification is a random
+ * field, we can handle it in IP fragmentation function later.
+ * So we just compare the ip payload here.
+ */
+if (colo_packet_compare_common(ppkt, spkt, network_length + ETH_HLEN)) {
  trace_colo_compare_icmp_miscompare("primary pkt size",
 ppkt->size);
  qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
@@ -330,7 +344,7 @@ static int colo_packet_compare_other(Packet *spkt, Packet 
*ppkt)
 inet_ntoa(ppkt->ip->ip_dst), spkt->size,
 inet_ntoa(spkt->ip->ip_src),
 inet_ntoa(spkt->ip->ip_dst));
-return colo_packet_compare_common(ppkt, spkt);
+return colo_packet_compare_common(ppkt, spkt, 0);
  }

  static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)






Re: [Qemu-devel] [PATCH 2/3] COLO-compare: Optimize colo_packet_compare_common

2017-02-24 Thread Hailiang Zhang

On 2017/2/25 11:32, Zhang Chen wrote:

Add offset args for colo_packet_compare_common, optimize
colo_packet_compare_icmp() and colo_packet_compare_udp()
just compare the IP payload.

Signed-off-by: Zhang Chen 
---
  net/colo-compare.c | 28 +---
  1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index e75f0ae..9853232 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -180,7 +180,7 @@ static int packet_enqueue(CompareState *s, int mode)
   * return:0  means packet same
   *> 0 || < 0 means packet different
   */
-static int colo_packet_compare_common(Packet *ppkt, Packet *spkt)
+static int colo_packet_compare_common(Packet *ppkt, Packet *spkt, int offset)
  {
  trace_colo_compare_ip_info(ppkt->size, inet_ntoa(ppkt->ip->ip_src),
 inet_ntoa(ppkt->ip->ip_dst), spkt->size,
@@ -188,7 +188,8 @@ static int colo_packet_compare_common(Packet *ppkt, Packet 
*spkt)
 inet_ntoa(spkt->ip->ip_dst));

  if (ppkt->size == spkt->size) {
-return memcmp(ppkt->data, spkt->data, spkt->size);
+return memcmp(ppkt->data + offset, spkt->data + offset,
+  spkt->size - offset);
  } else {
  trace_colo_compare_main("Net packet size are not the same");
  return -1;
@@ -237,8 +238,7 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
  spkt->ip->ip_sum = ppkt->ip->ip_sum;
  }

-res = memcmp(ppkt->data + ETH_HLEN, spkt->data + ETH_HLEN,
-(spkt->size - ETH_HLEN));
+res = colo_packet_compare_common(ppkt, spkt, ETH_HLEN);



For tcp packets check, why not ignore the ip headers, just like udp packets 
check ?
Besides, here, can we compare the checksum stored in headers of tcp and udp
before call colo_packet_compare_common(), which i think will improve the 
comparing
performance.

Thanks.
Hailiang


  if (res != 0 && trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
  trace_colo_compare_pkt_info_src(inet_ntoa(ppkt->ip->ip_src),
@@ -277,7 +277,14 @@ static int colo_packet_compare_udp(Packet *spkt, Packet 
*ppkt)
  return -1;
  }

-ret = colo_packet_compare_common(ppkt, spkt);
+/*
+ * Because of ppkt and spkt are both in the same connection,
+ * The ppkt's src ip, dst ip, src port, dst port, ip_proto all are
+ * same with spkt. In addition, IP header's Identification is a random
+ * field, we can handle it in IP fragmentation function later.
+ * So we just compare the ip payload here.
+ */
+ret = colo_packet_compare_common(ppkt, spkt, network_length + ETH_HLEN);

  if (ret) {
  trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
@@ -304,7 +311,14 @@ static int colo_packet_compare_icmp(Packet *spkt, Packet 
*ppkt)
  return -1;
  }

-if (colo_packet_compare_common(ppkt, spkt)) {
+/*
+ * Because of ppkt and spkt are both in the same connection,
+ * The ppkt's src ip, dst ip, src port, dst port, ip_proto all are
+ * same with spkt. In addition, IP header's Identification is a random
+ * field, we can handle it in IP fragmentation function later.
+ * So we just compare the ip payload here.
+ */
+if (colo_packet_compare_common(ppkt, spkt, network_length + ETH_HLEN)) {
  trace_colo_compare_icmp_miscompare("primary pkt size",
 ppkt->size);
  qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
@@ -330,7 +344,7 @@ static int colo_packet_compare_other(Packet *spkt, Packet 
*ppkt)
 inet_ntoa(ppkt->ip->ip_dst), spkt->size,
 inet_ntoa(spkt->ip->ip_src),
 inet_ntoa(spkt->ip->ip_dst));
-return colo_packet_compare_common(ppkt, spkt);
+return colo_packet_compare_common(ppkt, spkt, 0);
  }

  static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)






Re: [Qemu-devel] [PATCH 1/3] COLO-compare: Add minimum packet size check and some fix

2017-02-24 Thread Hailiang Zhang

Hi,

On 2017/2/25 11:32, Zhang Chen wrote:

Add packet minimum size check in colo_packet_compare_udp()
and colo_packet_compare_udp() like colo_packet_compare_icmp(),
rename function colo_packet_compare() to colo_packet_compare_common()
that we will reuse it later.

Signed-off-by: Zhang Chen 
---
  net/colo-compare.c | 30 ++
  1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 300f017..e75f0ae 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -180,7 +180,7 @@ static int packet_enqueue(CompareState *s, int mode)
   * return:0  means packet same
   *> 0 || < 0 means packet different
   */
-static int colo_packet_compare(Packet *ppkt, Packet *spkt)
+static int colo_packet_compare_common(Packet *ppkt, Packet *spkt)
  {
  trace_colo_compare_ip_info(ppkt->size, inet_ntoa(ppkt->ip->ip_src),
 inet_ntoa(ppkt->ip->ip_dst), spkt->size,
@@ -190,6 +190,7 @@ static int colo_packet_compare(Packet *ppkt, Packet *spkt)
  if (ppkt->size == spkt->size) {
  return memcmp(ppkt->data, spkt->data, spkt->size);
  } else {
+trace_colo_compare_main("Net packet size are not the same");
  return -1;
  }
  }
@@ -202,9 +203,10 @@ static int colo_packet_compare(Packet *ppkt, Packet *spkt)
  static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
  {
  struct tcphdr *ptcp, *stcp;
-int res;
+int res, network_length;

  trace_colo_compare_main("compare tcp");
+
  if (ppkt->size != spkt->size) {
  if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
  trace_colo_compare_main("pkt size not same");
@@ -212,6 +214,12 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
  return -1;
  }

+network_length = ppkt->ip->ip_hl * 4;
+if (ppkt->size < network_length + ETH_HLEN) {


I think the check here is useless, because you have such check in
parse_packet_early() which is been called before these helpers.
And what check you need to add is, to check if the packet's size

= packet's length been record in ip header.


Like:
+++ b/net/colo.c
@@ -78,6 +78,12 @@ int parse_packet_early(Packet *pkt)
 trace_colo_proxy_main("pkt->size < network_header + network_length");
 return 1;
 }
+
+if (pkt->size < ETH_HLEN + ntohs(pkt->ip->ip_len)) {
+fprintf(stderr, "pkt->size %d < pkt expect total len %ld\n", pkt->size,
+pkt_MAChdr_len + ntohs(pkt->ip->ip_len));
+return -1;
+}



+trace_colo_compare_main("tcp packet size error");
+return -1;
+}
+
  ptcp = (struct tcphdr *)ppkt->transport_header;
  stcp = (struct tcphdr *)spkt->transport_header;

@@ -260,10 +268,16 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
   */
  static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
  {
-int ret;
+int ret, network_length;

  trace_colo_compare_main("compare udp");
-ret = colo_packet_compare(ppkt, spkt);
+network_length = ppkt->ip->ip_hl * 4;
+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("udp packet size error");
+return -1;
+}
+
+ret = colo_packet_compare_common(ppkt, spkt);

  if (ret) {
  trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
@@ -285,12 +299,12 @@ static int colo_packet_compare_icmp(Packet *spkt, Packet 
*ppkt)

  trace_colo_compare_main("compare icmp");
  network_length = ppkt->ip->ip_hl * 4;
-if (ppkt->size != spkt->size ||
-ppkt->size < network_length + ETH_HLEN) {
+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("icmp packet size error");
  return -1;
  }

-if (colo_packet_compare(ppkt, spkt)) {
+if (colo_packet_compare_common(ppkt, spkt)) {
  trace_colo_compare_icmp_miscompare("primary pkt size",
 ppkt->size);
  qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
@@ -316,7 +330,7 @@ static int colo_packet_compare_other(Packet *spkt, Packet 
*ppkt)
 inet_ntoa(ppkt->ip->ip_dst), spkt->size,
 inet_ntoa(spkt->ip->ip_src),
 inet_ntoa(spkt->ip->ip_dst));
-return colo_packet_compare(ppkt, spkt);
+return colo_packet_compare_common(ppkt, spkt);
  }

  static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)






Re: [Qemu-devel] [PATCH 08/21] qmp: Improve QMP dispatch error messages

2017-02-24 Thread Markus Armbruster
Eric Blake  writes:

> On 02/23/2017 03:45 PM, Markus Armbruster wrote:
>> Signed-off-by: Markus Armbruster 
>> ---
>>  qapi/qmp-dispatch.c | 14 --
>>  1 file changed, 8 insertions(+), 6 deletions(-)
>> 
>
>> @@ -41,15 +41,17 @@ static QDict *qmp_dispatch_check_obj(const QObject 
>> *request, Error **errp)
>>  
>>  if (!strcmp(arg_name, "execute")) {
>>  if (qobject_type(arg_obj) != QTYPE_QSTRING) {
>> -error_setg(errp, "QMP input object member '%s' expects 
>> '%s'",
>> -   "execute", "string");
>> +error_setg(errp,
>> +   "QMP input object member '%s' must be %s",
>> +   "execute", "a string");
>
> Any reason we can't inline this to:
>
> "QMP input object member 'execute' must be a string"
>
> ? It's not like we're translating the message.

Saves a few bytes.  Residual damage from my 8 bit days, I guess.

Inlined would probably be better for translating.

>>  return NULL;
>>  }
>>  has_exec_key = true;
>>  } else if (!strcmp(arg_name, "arguments")) {
>>  if (qobject_type(arg_obj) != QTYPE_QDICT) {
>> -error_setg(errp, "QMP input object member '%s' expects 
>> '%s'",
>> -   "arguments", "object");
>> +error_setg(errp,
>> +   "QMP input object member '%s' must be %s",
>> +   "arguments", "an object");
>
> and again
>
> Then again, if you use my idea of a QAPI-generated visitor of each input
> wire object, you'd get whatever error message qobject-input normally
> gives, which may render these changes irrelevant.
>
> At any rate, the new wordings are nicer, whether or not you inline
> constant text.
>
> Reviewed-by: Eric Blake 

Thanks!



Re: [Qemu-devel] [PATCH 06/21] qmp: Drop duplicated QMP command object checks

2017-02-24 Thread Markus Armbruster
Eric Blake  writes:

> On 02/23/2017 03:45 PM, Markus Armbruster wrote:
>> qmp_check_input_obj() duplicates qmp_dispatch_check_obj(), except the
>> latter screws up an error message.  handle_qmp_command() runs first
>> the former, then the latter via qmp_dispatch(), masking the screwup.
>> 
>> qemu-ga also masks the screwup, because it also duplicates checks,
>> just differently.
>
> Cleaning up the duplication is a win in my book.
>
>> 
>> qmp_check_input_obj() exists because handle_qmp_command() needs to
>> examine the command before dispatching it.  The previous commit got
>> rid of this need, except for a tracepoint, and a bit of "id" code that
>> relies on qdict not being null.
>> 
>> Fix up the error message in qmp_dispatch_check_obj(), drop
>> qmp_check_input_obj() and the tracepoint.  Protect the "id" code with
>> a conditional.
>> 
>> Signed-off-by: Markus Armbruster 
>> ---
>>  monitor.c   | 74 
>> +
>>  qapi/qmp-dispatch.c |  3 +--
>>  trace-events|  1 -
>>  3 files changed, 7 insertions(+), 71 deletions(-)
>> 
>> diff --git a/monitor.c b/monitor.c
>> index dcf2de7..d83888d 100644
>> --- a/monitor.c
>> +++ b/monitor.c
>> @@ -3697,67 +3697,10 @@ static int monitor_can_read(void *opaque)
>>  return (mon->suspend_cnt == 0) ? 1 : 0;
>>  }
>>  
>> -/*
>> - * Input object checking rules
>> - *
>> - * 1. Input object must be a dict
>> - * 2. The "execute" key must exist
>> - * 3. The "execute" key must be a string
>> - * 4. If the "arguments" key exists, it must be a dict
>> - * 5. If the "id" key exists, it can be anything (ie. json-value)
>> - * 6. Any argument not listed above is considered invalid
>
> You know, with just a little tweak, we could add something to the .json
> file, and let generated code do all the checks of even the top-level
> entities:
>
> { 'struct': 'WireCommand',
>   'data': { 'execute': 'str', '*arguments': 'dict', '*id': 'any' } }
>
> if we had a way to force 'dict' as a subset of 'any' but where it must
> be a QDict.  (If we had a 'dict' recognized by the QAPI code generators,
> we could probably also fix 'object-add' to use '*props':'dict' instead
> of its current use of 'any')
>
> But that sounds like a bit much to ask for in the 2.9 timeframe, so it
> doesn't hold up this patch.

The thought came to me, too, but I banished it due to time pressure
before it could take presentable form.

Valid point on object-add.  If we find more uses for "generic object",
we can explore a new built-in QAPI type.

> We lose a trace, but I don't think that's fatal.

If we need one, it should be added to qmp_dispatch(), where it's also
useful for qemu-ga.

> Reviewed-by: Eric Blake 

Thanks!



Re: [Qemu-devel] [PATCH 05/21] qmp: Clean up how we enforce capability negotiation

2017-02-24 Thread Markus Armbruster
Eric Blake  writes:

> On 02/23/2017 03:45 PM, Markus Armbruster wrote:
>> To enforce capability negotiation before normal operation,
>> handle_qmp_command() inspects every command before it's handed off to
>> qmp_dispatch().  This is a bit of a layering violation, and results in
>> duplicated code.
>> 
>> Before capability negotiation (!cur_mon->in_command_mode), we fail
>> commands other than "qmp_capabilities".  This is what enforces
>> capability negotiation.
>> 
>> Afterwards, we fail command "qmp_capabilities".
>> 
>> Clean this up as follows.
>> 
>> The obvious place to fail a command is the command itself, so move the
>> "afterwards" check to qmp_qmp_capabilities().
>> 
>> We do the "before" check in every other command, but that would be
>> bothersome.  Instead, start without all the other commands, by
>> registering only qmp_qmp_capabilities().  Register the others in
>> qmp_qmp_capabilities().
>> 
>> Additionally, replace the generic human-readable error message for
>> CommandNotFound by one that reminds the user to run qmp_capabilities.
>> Without that, we'd regress commit 2d5a834.
>> 
>> Signed-off-by: Markus Armbruster 
>> ---
>>  monitor.c | 70 
>> ---
>>  1 file changed, 40 insertions(+), 30 deletions(-)
>> 
>
>> @@ -3781,12 +3782,21 @@ static void handle_qmp_command(JSONMessageParser 
>> *parser, GQueue *tokens)
>>  cmd_name = qdict_get_str(qdict, "execute");
>>  trace_handle_qmp_command(mon, cmd_name);
>>  
>> -if (invalid_qmp_mode(mon, cmd_name, )) {
>> -goto err_out;
>> -}
>> -
>>  rsp = qmp_dispatch(req);
>>  
>> +qdict = qdict_get_qdict(qobject_to_qdict(rsp), "error");
>> +if (qdict) {
>> +if (!mon->qmp.in_command_mode
>> +&& !strcmp(qdict_get_try_str(qdict, "class"),
>
> Ouch - this can call strcmp(NULL, ...) if the error is ill-formed.  Is
> that a problem?  Can we assert that "class" will always exist as a string?

It's safe, because it comes from qmp_build_error_object().  But I'll
switch to g_strcmp0() to make it more obviously safe.

>> +   
>> QapiErrorClass_lookup[ERROR_CLASS_COMMAND_NOT_FOUND])) {
>> +/* Provide a more useful error message */
>> +qdict_del(qdict, "desc");
>> +qdict_put(qdict, "desc",
>> +  qstring_from_str("Expecting capabilities negotiation"
>> +   " with 'qmp_capabilities'"));
>> +}
>> +}
>> +
>
> Otherwise, I like it.

Thanks!



Re: [Qemu-devel] [PATCH RFC v3 2/5] keyval: New keyval_parse()

2017-02-24 Thread Markus Armbruster
Eric Blake  writes:

> On 02/24/2017 03:06 PM, Eric Blake wrote:
>
>> On 02/24/2017 02:18 PM, Markus Armbruster wrote:
 I have a preference for option 1 in the long run, but as it seems to be
 upwards compatible from option 2 for -blockdev in 2.9, I'm leaning
 towards option 2 for this release.
>>> 
>>> Let me rename the options:
>>> 
>>> * "no sugar -blockdev": both "FOO" and "noFOO" are rejected.
>>> 
>>> * "positive sugar blockdev": "FOO" is desugared to "FOO=on", "noFOO" to
>>>   "noFOO=on".
>>> 
>>> Which one do you prefer for 2.9?
>>
>> My current leanings:
>>
>> For 2.9: keyval_parse() should have no sugar.  Both "FOO" and "noFOO"
>> are rejected; an '=' must be present except for an implicit key -
>> although if -blockdev is the only client of keyval_parse(), then it's a
>> tossup whether we want to use it with an implicit key.

I agree on not doing implicit values for 2.9.  We have some thinking to
do on "noFOO", and having keyval_parse() support just "FOO" now could
backfire, as explained upthread.

Regarding implicit keys: I think we can just as well have them.  We're
not going to get rid of existing implicit keys.  Since keyval_parse()
has the code for supporting them already, we can just as well keep it.

>> Also for 2.9: turn on the deprecation warning for QemuOpts negative
>> sugar, but leave positive sugar unchanged (it seems like positive sugar
>> is probably more in use than negative sugar, other than chardev nowait).
>
> Capturing the gist of an IRC conversation:
>
> Right now, it appears that '-chardev nowait' is in heavy use ('git grep
> nowait' shows that qemu itself recommends it in documentation, nodelay
> being another one).  Right now, QemuOpts says that '-chardev
> wait,nowait' results in 'wait=off', and '-chardev nowait,wait' results
> in 'wait=on' (that is, last-one-wins semantics) - but in reality, there
> are probably few (if any) clients that rely on this particular semantics.
>
> If we get rid of 'noFOO' magic, we can still teach the chardev QemuOpts
> to recognize optional keys for BOTH 'wait' and 'nowait'; a user that
> specifies neither gets the default behavior as always; a user that
> specifies only 'wait' or 'nowait' gets the requested behavior (although
> now chardev has to check both key spellings rather than one), and a user
> that specifies 'wait,nowait' or 'nowait,wait' causes an error, not at
> QemuOpts parse time, but in the chardev initialization code.  That is,
> for any boolean option where 'noFOO' magic is currently handled by
> QemuOpts, we can still mostly keep back-compat by adding a new 'noFOO'
> key to the object, and teaching that object to check that 'FOO' and
> 'noFOO' are not both selected at once.

Another possibility is to start with deprecating "noFOO" only when we
don't have description for "FOO".

Hmm, could just as well also deprecate it when we do have one, but it
isn't QEMU_OPT_BOOL.

> But whether this change to QemuOpts is the right thing to do is not 2.9
> material, so let's take a breather and revisit it for 2.10 (or whenever
> we try to fix QemuOpts to layer on top of keyval_parse()).

Makes sense.

>> That way, we still have the option of enabling positive sugar blockdev
>> in a later version, if we decide its useful (and after dealing with any
>> QemuOpts fallout), but are not locked into having to use it.

Exactly.



Re: [Qemu-devel] [PATCH v2 3/5] s390x/ipl: Load network boot image

2017-02-24 Thread Thomas Huth
On 23.02.2017 13:20, Cornelia Huck wrote:
> From: Farhan Ali 
> 
> Load the network boot image into guest RAM when the boot
> device selected is a network device. Use some of the reserved
> space in IplBlockCcw to store the start address of the netboot
> image.
> 
> A user could also use 'chreipl'(diag 308/5) to change the boot device.
> So every time we update the IPLB, we need to verify if the selected
> boot device is a network device so we can appropriately load the
> network boot image.
> 
> Signed-off-by: Farhan Ali 
> Signed-off-by: Cornelia Huck 
> ---
>  hw/s390x/ipl.c | 89 
> ++
>  hw/s390x/ipl.h |  4 ++-
>  2 files changed, 92 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
> index fd656718a7..195cac559f 100644
> --- a/hw/s390x/ipl.c
> +++ b/hw/s390x/ipl.c
> @@ -20,6 +20,7 @@
>  #include "hw/s390x/virtio-ccw.h"
>  #include "hw/s390x/css.h"
>  #include "ipl.h"
> +#include "qemu/error-report.h"
>  
>  #define KERN_IMAGE_START0x01UL
>  #define KERN_PARM_AREA  0x010480UL
> @@ -227,6 +228,12 @@ static bool s390_gen_initial_iplb(S390IPLState *ipl)
>  TYPE_VIRTIO_CCW_DEVICE);
>  SCSIDevice *sd = (SCSIDevice *) object_dynamic_cast(OBJECT(dev_st),
>  
> TYPE_SCSI_DEVICE);
> +VirtIONet *vn = (VirtIONet *) object_dynamic_cast(OBJECT(dev_st),
> +  TYPE_VIRTIO_NET);
> +
> +if (vn) {
> +ipl->netboot = true;
> +}
>  if (virtio_ccw_dev) {
>  CcwDevice *ccw_dev = CCW_DEVICE(virtio_ccw_dev);
>  
> @@ -259,12 +266,86 @@ static bool s390_gen_initial_iplb(S390IPLState *ipl)
>  return false;
>  }
>  
> +static int load_netboot_image(Error **errp)
> +{
> +S390IPLState *ipl = get_ipl_device();
> +char *netboot_filename;
> +MemoryRegion *sysmem =  get_system_memory();
> +MemoryRegion *mr = NULL;
> +void *ram_ptr = NULL;
> +int img_size = -1;
> +
> +mr = memory_region_find(sysmem, 0, 1).mr;
> +if (!mr) {
> +error_setg(errp, "Failed to find memory region at address 0");
> +return -1;
> +}
> +
> +ram_ptr = memory_region_get_ram_ptr(mr);
> +if (!ram_ptr) {
> +error_setg(errp, "No RAM found");
> +goto unref_mr;
> +}
> +
> +netboot_filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, ipl->netboot_fw);
> +if (netboot_filename == NULL) {
> +error_setg(errp, "Could not find network bootloader");
> +goto unref_mr;
> +}
> +
> +img_size = load_elf_ram(netboot_filename, NULL, NULL, >start_addr,
> +NULL, NULL, 1, EM_S390, 0, 0, NULL, false);
> +
> +if (img_size < 0) {
> +img_size = load_image_size(netboot_filename, ram_ptr, ram_size);
> +ipl->start_addr = KERN_IMAGE_START;
> +}
> +
> +if (img_size < 0) {
> +error_setg(errp, "Failed to load network bootloader");
> +}
> +
> +g_free(netboot_filename);
> +
> +unref_mr:
> +memory_region_unref(mr);
> +return img_size;
> +}
> +
> +static bool is_virtio_net_device(IplParameterBlock *iplb)
> +{
> +uint8_t cssid;
> +uint8_t ssid;
> +uint16_t devno;
> +uint16_t schid;
> +SubchDev *sch = NULL;
> +
> +if (iplb->pbt != S390_IPL_TYPE_CCW) {
> +return false;
> +}
> +
> +devno = be16_to_cpu(iplb->ccw.devno);
> +ssid = iplb->ccw.ssid & 3;
> +
> +for (schid = 0; schid < MAX_SCHID; schid++) {
> +for (cssid = 0; cssid < MAX_CSSID; cssid++) {
> +sch = css_find_subch(1, cssid, ssid, schid);
> +
> +if (sch && sch->devno == devno) {
> +return sch->id.cu_model == VIRTIO_ID_NET;
> +}
> +}
> +}
> +   return false;

The above line has only 3 instead of 4 spaces. I wonder why checkpatch
does not complain here...?

> +}
> +
>  void s390_ipl_update_diag308(IplParameterBlock *iplb)
>  {
>  S390IPLState *ipl = get_ipl_device();
>  
>  ipl->iplb = *iplb;
>  ipl->iplb_valid = true;
> +ipl->netboot = is_virtio_net_device(iplb);
>  }
>  
>  IplParameterBlock *s390_ipl_get_iplb(void)
> @@ -288,6 +369,7 @@ void s390_reipl_request(void)
>  void s390_ipl_prepare_cpu(S390CPU *cpu)
>  {
>  S390IPLState *ipl = get_ipl_device();
> +Error *err = NULL;
>  
>  cpu->env.psw.addr = ipl->start_addr;
>  cpu->env.psw.mask = IPL_PSW_MASK;
> @@ -298,6 +380,13 @@ void s390_ipl_prepare_cpu(S390CPU *cpu)
>  ipl->iplb_valid = s390_gen_initial_iplb(ipl);
>  }
>  }
> +if (ipl->netboot) {
> +if (load_netboot_image() < 0) {
> +error_report_err(err);
> +vm_stop(RUN_STATE_INTERNAL_ERROR);
> +}
> +ipl->iplb.ccw.netboot_start_addr = ipl->start_addr;


Re: [Qemu-devel] [PATCH v2 2/5] s390x/ipl: Extend S390IPLState to support network boot

2017-02-24 Thread Thomas Huth
On 23.02.2017 13:20, Cornelia Huck wrote:
> From: Farhan Ali 
> 
> Add new field to S390IPLState to store the name of the network boot
> loader.
> 
> Signed-off-by: Farhan Ali 
> Reviewed-by: Christian Borntraeger 
> Reviewed-by: Cornelia Huck 
> Signed-off-by: Cornelia Huck 
> ---
>  hw/s390x/ipl.c | 1 +
>  hw/s390x/ipl.h | 1 +
>  hw/s390x/s390-virtio-ccw.c | 3 ++-
>  hw/s390x/s390-virtio.c | 2 ++
>  hw/s390x/s390-virtio.h | 1 +
>  5 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
> index 2e2664f22e..fd656718a7 100644
> --- a/hw/s390x/ipl.c
> +++ b/hw/s390x/ipl.c
> @@ -209,6 +209,7 @@ static Property s390_ipl_properties[] = {
>  DEFINE_PROP_STRING("initrd", S390IPLState, initrd),
>  DEFINE_PROP_STRING("cmdline", S390IPLState, cmdline),
>  DEFINE_PROP_STRING("firmware", S390IPLState, firmware),
> +DEFINE_PROP_STRING("netboot_fw", S390IPLState, netboot_fw),
>  DEFINE_PROP_BOOL("enforce_bios", S390IPLState, enforce_bios, false),
>  DEFINE_PROP_BOOL("iplbext_migration", S390IPLState, iplbext_migration,
>   true),
> diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h
> index c89109585a..4ad9a7c05e 100644
> --- a/hw/s390x/ipl.h
> +++ b/hw/s390x/ipl.h
> @@ -106,6 +106,7 @@ struct S390IPLState {
>  char *initrd;
>  char *cmdline;
>  char *firmware;
> +char *netboot_fw;
>  uint8_t cssid;
>  uint8_t ssid;
>  uint16_t devno;
> diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
> index 4f0d62b2d8..40914fde6f 100644
> --- a/hw/s390x/s390-virtio-ccw.c
> +++ b/hw/s390x/s390-virtio-ccw.c
> @@ -116,7 +116,8 @@ static void ccw_init(MachineState *machine)
>  /* get a BUS */
>  css_bus = virtual_css_bus_init();
>  s390_init_ipl_dev(machine->kernel_filename, machine->kernel_cmdline,
> -  machine->initrd_filename, "s390-ccw.img", true);
> +  machine->initrd_filename, "s390-ccw.img",
> +  "s390-netboot.img", true);
>  s390_flic_init();
>  
>  dev = qdev_create(NULL, TYPE_S390_PCI_HOST_BRIDGE);
> diff --git a/hw/s390x/s390-virtio.c b/hw/s390x/s390-virtio.c
> index 9cfb09057e..afa4148e6b 100644
> --- a/hw/s390x/s390-virtio.c
> +++ b/hw/s390x/s390-virtio.c
> @@ -65,6 +65,7 @@ void s390_init_ipl_dev(const char *kernel_filename,
> const char *kernel_cmdline,
> const char *initrd_filename,
> const char *firmware,
> +   const char *netboot_fw,
> bool enforce_bios)
>  {
>  Object *new = object_new(TYPE_S390_IPL);
> @@ -78,6 +79,7 @@ void s390_init_ipl_dev(const char *kernel_filename,
>  }
>  qdev_prop_set_string(dev, "cmdline", kernel_cmdline);
>  qdev_prop_set_string(dev, "firmware", firmware);
> +qdev_prop_set_string(dev, "netboot_fw", netboot_fw);
>  qdev_prop_set_bit(dev, "enforce_bios", enforce_bios);
>  object_property_add_child(qdev_get_machine(), TYPE_S390_IPL,
>new, NULL);
> diff --git a/hw/s390x/s390-virtio.h b/hw/s390x/s390-virtio.h
> index f588b80a6e..f2377a3e0e 100644
> --- a/hw/s390x/s390-virtio.h
> +++ b/hw/s390x/s390-virtio.h
> @@ -24,6 +24,7 @@ void s390_init_ipl_dev(const char *kernel_filename,
> const char *kernel_cmdline,
> const char *initrd_filename,
> const char *firmware,
> +   const char *netboot_fw,
> bool enforce_bios);
>  void s390_create_virtio_net(BusState *bus, const char *name);
>  void s390_nmi(NMIState *n, int cpu_index, Error **errp);
> 

Reviewed-by: Thomas Huth 




Re: [Qemu-devel] [PATCH 00/22] target/openrisc updates

2017-02-24 Thread Stafford Horne
On Thu, Feb 23, 2017 at 10:54:43AM -0500, G 3 wrote:
> 
> On Feb 23, 2017, at 9:09 AM, Stafford Horne wrote:
> 
> > On Mon, Feb 13, 2017 at 10:33:09PM -0500, Programmingkid wrote:
> > > 
> > > On Feb 13, 2017, at 10:25 PM, Richard Henderson wrote:
> > > 
> > > > On 02/10/2017 11:39 AM, Stafford Horne wrote:
> > > > > On Thu, Feb 09, 2017 at 09:10:51AM -0500, G 3 wrote:
> > > > > > 
> > > > > > On Feb 8, 2017, at 11:52 PM, qemu-devel-requ...@nongnu.org wrote:
> > > > > > 
> > > > > > > Message: 6
> > > > > > > Date: Wed,  8 Feb 2017 20:51:32 -0800
> > > > > > > From: Richard Henderson 
> > > > > > > To: qemu-devel@nongnu.org
> > > > > > > Cc: sho...@gmail.com
> > > > > > > Subject: [Qemu-devel] [PATCH 00/22] target/openrisc updates
> > > > > > > Message-ID: <20170209045154.16868-1-...@twiddle.net>
> > > > > > > 
> > > > > > > The bulk of this patch set is 2-3 years old, and was mostly
> > > > > > > reviewed by Bastian Koppelmann.  But it languished because
> > > > > > > there were reports of it not booting kernel images, and I
> > > > > > > had problems putting together a set of tools that could even
> > > > > > > build a kernel.
> > > > > > > 
> > > > > > > The OpenRISC community has picked up activity recently,
> > > > > > > with Stafford Horne upstreaming some of the compiler tools.
> > > > > > > He has even done some testing for me of this patch set.
> > > > > > > 
> > > > > > > 
> > > > > > > r~
> > > > > > 
> > > > > > I see you are working on OpenRISC. Would you be able to help
> > > > > > me improve its wiki page?
> > > > > > 
> > > > > > http://wiki.qemu-project.org/Documentation/Platforms/OpenRISC
> > > > > > 
> > > > > > Right now there isn't much information on OpenRISC here. I'm
> > > > > > hoping to make it a lot useful to anyone who is
> > > > > > interesting in this
> > > > > > platform.
> > > > > > 
> > > > > > Would you know of any links to software that works in OpenRISC?
> > > > > > 
> > > > > > Pictures of the OpenRISC target running anything would
> > > > > > be great also.
> > > > > > 
> > > > > > What is your suggested command-line for using OpenRISC?
> > > > > > 
> > > > > > If you have any suggestions on how to improve the page place don't
> > > > > > hesitate to let me know.
> > > > > > 
> > > > > 
> > > > > Hi Richard, G 3,
> > > > > 
> > > > > I am not sure if Richard has the time for this.  But I kind
> > > > > of got him
> > > > > started back on the openrisc work, it would be my fault to
> > > > > burden him
> > > > > with even more work :).  Also, I was helping to do some
> > > > > tests so I have
> > > > > some example commands.
> > > > 
> > > > :-)
> > > > 
> > > > > I don't have a lot of time, but I think I can help to update
> > > > > the above.
> > > > > Probably Richard and I could work together?
> > > > > 
> > > > > I was recently working on updating the openrisc page
> > > > > 
> > > > >  http://openrisc.io
> > > > > 
> > > > > I think I could link between the two for toolchain compile
> > > > > guides and
> > > > > software guides (i.e. debugging and linux).
> > > > > 
> > > > > Richard what do you think?
> > > > 
> > > > Probably the most beneficial thing that we can do is create a
> > > > kernel+initrd that boots into a busybox root shell.  Similar to
> > > > the other test images that we have at
> > > > 
> > > >  http://wiki.qemu-project.org/Testing/System_Images
> > > > 
> > > > 
> > > > r~
> > > > 
> > > 
> > > We could do both the test image and a wiki page.
> > 
> > I posted a test image to the wiki
> > 
> >  http://wiki.qemu-project.org/Testing/System_Images
> > 
> > However, someone on the #qemu irc room mentioned its not ideal to post
> > any images on the wiki and existing images should be taken down due to
> > copyright issues.  I don't quite follow, should we be concerned?
> 
> If this is a Linux test image, it should be fine. Could we add a link to the
> test image to the wiki page?
> 
> > 
> > Also, I did start adding some commands to the wiki, any feedback
> > appreciated.
> > 
> >   http://wiki.qemu-project.org/Documentation/Platforms/OpenRISC
> > 
> > -Stafford
> 
> Thank you so much for all your work. The wiki page looks great. For the
> maintainer, is Jia Liu still maintaining it? I never received a reply from
> him.  Would you be interested in being a maintainer?

No worries, I just copied the commands from my history, some were sent
from Richard.  I hope they can help others in the future.

Last time I sent a patch Jia Liu also asked me to be maintainer,  I
added myself for now.  I am sure there are more qualified people like
Richard, but Ill try to help where I can.

> This line in the wiki doesn't sound right:
> "To get good traces you can also add the following, this will output trace
> info the a file z"
> 
> Maybe this:
> "To get good traces you can also add the following, this will output trace
> info to the file z"
> 
> If z is a file, I really suggest renaming it to something more descriptive.

Done.  If I do any more 

Re: [Qemu-devel] [PATCH v2 1/5] elf-loader: Allow late loading of elf

2017-02-24 Thread Thomas Huth
On 23.02.2017 13:20, Cornelia Huck wrote:
> From: Farhan Ali 
> 
> The current QEMU ROM infrastructure rejects late loading of ROMs.
> And ELFs are currently loaded as ROM, this prevents delayed loading
> of ELFs. So when loading ELF, allow the user to specify if ELF should
> be loaded as ROM or not.
> 
> If an ELF is not loaded as ROM, then they are not restored on a
> guest reboot/reset and so its upto the user to handle the reloading.
> 
> Signed-off-by: Farhan Ali 
> Reviewed-by: Christian Borntraeger 
> Cc: Peter Maydell 
> Signed-off-by: Cornelia Huck 
> ---
>  hw/core/loader.c | 17 +++--
>  include/hw/elf_ops.h | 13 +
>  include/hw/loader.h  | 13 -
>  3 files changed, 36 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/core/loader.c b/hw/core/loader.c
> index ee5abd6eb7..9d1af1f6f3 100644
> --- a/hw/core/loader.c
> +++ b/hw/core/loader.c
> @@ -435,6 +435,19 @@ int load_elf_as(const char *filename,
>  uint64_t *highaddr, int big_endian, int elf_machine,
>  int clear_lsb, int data_swab, AddressSpace *as)
>  {
> +return load_elf_ram(filename, translate_fn, translate_opaque,
> +pentry, lowaddr, highaddr, big_endian, elf_machine,
> +clear_lsb, data_swab, as, true);
> +}
> +
> +/* return < 0 if error, otherwise the number of bytes loaded in memory */
> +int load_elf_ram(const char *filename,
> + uint64_t (*translate_fn)(void *, uint64_t),
> + void *translate_opaque, uint64_t *pentry, uint64_t *lowaddr,
> + uint64_t *highaddr, int big_endian, int elf_machine,
> + int clear_lsb, int data_swab, AddressSpace *as,
> + bool load_rom)
> +{
>  int fd, data_order, target_data_order, must_swab, ret = ELF_LOAD_FAILED;
>  uint8_t e_ident[EI_NIDENT];



The patch looks basically fine to me, but I think it's a little bit
confusing to have a function called load_elf_ram() which can also be
used to load ROMs with a load_rom=1 parameter. If I read
"load_elf_ram()", I'd expect a function that can only read ELFs to RAM.
So what about adding the "load_rom" parameter to load_elf_as() instead
and then making load_elf_ram() a wrapper function to that one with
load_rom=0 ? AFAICS there is only one additional caller to load_elf_as
(in the generic-loader), so the additional effort here should be OK, I
think.



 Thomas




Re: [Qemu-devel] [PATCH] target-s390x: Implement lpp instruction

2017-02-24 Thread Thomas Huth
On 25.02.2017 00:51, Richard Henderson wrote:
> On 02/25/2017 12:50 AM, Miroslav Benes wrote:
>> Linux arch/s390/kernel/head(64).S uses lpp instruction if it is
>> available in facilities list provided by stfl/stfle instruction. This is
>> the case of newer z/System generations and their qemu definition.
>>
>> Signed-off-by: Miroslav Benes 
>> ---
> 
> I can't find LPP in my PoO 11th edition...

LPP is one of the instructions that is documented separately.
You can find its spec here, for example:

http://www-01.ibm.com/support/docview.wss?uid=isg26fcd1cc32246f4c8852574ce0044734a

 Thomas




Re: [Qemu-devel] [PATCH 10/10] ppc: Add SM501 device in config for ppc and ppcemb targets

2017-02-24 Thread Thomas Huth
On 24.02.2017 21:51, BALATON Zoltan wrote:
> On Fri, 24 Feb 2017, Peter Maydell wrote:
>> On 19 February 2017 at 16:35, BALATON Zoltan  wrote:
>>> This is not used by default on any emulated machine yet but it is
>>> still useful to have it compiled so it can be added from the command
>>> line for clients that can use it (e.g. MorphOS has no driver for any
>>> other emulated video cards but can output via SM501)
>>>
>>> Signed-off-by: BALATON Zoltan 
>>
>> If this is a generic PCI device then shouldn't it go into
>> default-configs/pci.mak ?
> 
> This is a multimedia chip usually found on sh4 systems and some ppc
> boards that's why I've only added it to these but I can move it to
> pci.mak if you think it's a better place for it.

If it is not really usable for other boards, then I think it should also
not go into pci.mak. No need to confuse the users of the other boards
with unusable PCI cards here.

 Thomas




Re: [Qemu-devel] [PATCH 1/3] COLO-compare: Add packet size check and some fix

2017-02-24 Thread Zhang Chen

Sorry, This patch has been renamed.

please ignore this patch.


Thanks

Zhang Chen


On 02/25/2017 11:32 AM, Zhang Chen wrote:

Add packet size check in colo_packet_compare_udp()
and colo_packet_compare_udp(), rename function colo_packet_compare()
to colo_packet_compare_common() that we will reuse it later.

Signed-off-by: Zhang Chen 
---
  net/colo-compare.c | 30 ++
  1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 300f017..e75f0ae 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -180,7 +180,7 @@ static int packet_enqueue(CompareState *s, int mode)
   * return:0  means packet same
   *> 0 || < 0 means packet different
   */
-static int colo_packet_compare(Packet *ppkt, Packet *spkt)
+static int colo_packet_compare_common(Packet *ppkt, Packet *spkt)
  {
  trace_colo_compare_ip_info(ppkt->size, inet_ntoa(ppkt->ip->ip_src),
 inet_ntoa(ppkt->ip->ip_dst), spkt->size,
@@ -190,6 +190,7 @@ static int colo_packet_compare(Packet *ppkt, Packet *spkt)
  if (ppkt->size == spkt->size) {
  return memcmp(ppkt->data, spkt->data, spkt->size);
  } else {
+trace_colo_compare_main("Net packet size are not the same");
  return -1;
  }
  }
@@ -202,9 +203,10 @@ static int colo_packet_compare(Packet *ppkt, Packet *spkt)
  static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
  {
  struct tcphdr *ptcp, *stcp;
-int res;
+int res, network_length;
  
  trace_colo_compare_main("compare tcp");

+
  if (ppkt->size != spkt->size) {
  if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
  trace_colo_compare_main("pkt size not same");
@@ -212,6 +214,12 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
  return -1;
  }
  
+network_length = ppkt->ip->ip_hl * 4;

+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("tcp packet size error");
+return -1;
+}
+
  ptcp = (struct tcphdr *)ppkt->transport_header;
  stcp = (struct tcphdr *)spkt->transport_header;
  
@@ -260,10 +268,16 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)

   */
  static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
  {
-int ret;
+int ret, network_length;
  
  trace_colo_compare_main("compare udp");

-ret = colo_packet_compare(ppkt, spkt);
+network_length = ppkt->ip->ip_hl * 4;
+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("udp packet size error");
+return -1;
+}
+
+ret = colo_packet_compare_common(ppkt, spkt);
  
  if (ret) {

  trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
@@ -285,12 +299,12 @@ static int colo_packet_compare_icmp(Packet *spkt, Packet 
*ppkt)
  
  trace_colo_compare_main("compare icmp");

  network_length = ppkt->ip->ip_hl * 4;
-if (ppkt->size != spkt->size ||
-ppkt->size < network_length + ETH_HLEN) {
+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("icmp packet size error");
  return -1;
  }
  
-if (colo_packet_compare(ppkt, spkt)) {

+if (colo_packet_compare_common(ppkt, spkt)) {
  trace_colo_compare_icmp_miscompare("primary pkt size",
 ppkt->size);
  qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
@@ -316,7 +330,7 @@ static int colo_packet_compare_other(Packet *spkt, Packet 
*ppkt)
 inet_ntoa(ppkt->ip->ip_dst), spkt->size,
 inet_ntoa(spkt->ip->ip_src),
 inet_ntoa(spkt->ip->ip_dst));
-return colo_packet_compare(ppkt, spkt);
+return colo_packet_compare_common(ppkt, spkt);
  }
  
  static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)


--
Thanks
Zhang Chen






[Qemu-devel] [PATCH 1/3] COLO-compare: Add minimum packet size check and some fix

2017-02-24 Thread Zhang Chen
Add packet minimum size check in colo_packet_compare_udp()
and colo_packet_compare_udp() like colo_packet_compare_icmp(),
rename function colo_packet_compare() to colo_packet_compare_common()
that we will reuse it later.

Signed-off-by: Zhang Chen 
---
 net/colo-compare.c | 30 ++
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 300f017..e75f0ae 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -180,7 +180,7 @@ static int packet_enqueue(CompareState *s, int mode)
  * return:0  means packet same
  *> 0 || < 0 means packet different
  */
-static int colo_packet_compare(Packet *ppkt, Packet *spkt)
+static int colo_packet_compare_common(Packet *ppkt, Packet *spkt)
 {
 trace_colo_compare_ip_info(ppkt->size, inet_ntoa(ppkt->ip->ip_src),
inet_ntoa(ppkt->ip->ip_dst), spkt->size,
@@ -190,6 +190,7 @@ static int colo_packet_compare(Packet *ppkt, Packet *spkt)
 if (ppkt->size == spkt->size) {
 return memcmp(ppkt->data, spkt->data, spkt->size);
 } else {
+trace_colo_compare_main("Net packet size are not the same");
 return -1;
 }
 }
@@ -202,9 +203,10 @@ static int colo_packet_compare(Packet *ppkt, Packet *spkt)
 static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
 {
 struct tcphdr *ptcp, *stcp;
-int res;
+int res, network_length;
 
 trace_colo_compare_main("compare tcp");
+
 if (ppkt->size != spkt->size) {
 if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
 trace_colo_compare_main("pkt size not same");
@@ -212,6 +214,12 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
 return -1;
 }
 
+network_length = ppkt->ip->ip_hl * 4;
+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("tcp packet size error");
+return -1;
+}
+
 ptcp = (struct tcphdr *)ppkt->transport_header;
 stcp = (struct tcphdr *)spkt->transport_header;
 
@@ -260,10 +268,16 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
  */
 static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
 {
-int ret;
+int ret, network_length;
 
 trace_colo_compare_main("compare udp");
-ret = colo_packet_compare(ppkt, spkt);
+network_length = ppkt->ip->ip_hl * 4;
+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("udp packet size error");
+return -1;
+}
+
+ret = colo_packet_compare_common(ppkt, spkt);
 
 if (ret) {
 trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
@@ -285,12 +299,12 @@ static int colo_packet_compare_icmp(Packet *spkt, Packet 
*ppkt)
 
 trace_colo_compare_main("compare icmp");
 network_length = ppkt->ip->ip_hl * 4;
-if (ppkt->size != spkt->size ||
-ppkt->size < network_length + ETH_HLEN) {
+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("icmp packet size error");
 return -1;
 }
 
-if (colo_packet_compare(ppkt, spkt)) {
+if (colo_packet_compare_common(ppkt, spkt)) {
 trace_colo_compare_icmp_miscompare("primary pkt size",
ppkt->size);
 qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
@@ -316,7 +330,7 @@ static int colo_packet_compare_other(Packet *spkt, Packet 
*ppkt)
inet_ntoa(ppkt->ip->ip_dst), spkt->size,
inet_ntoa(spkt->ip->ip_src),
inet_ntoa(spkt->ip->ip_dst));
-return colo_packet_compare(ppkt, spkt);
+return colo_packet_compare_common(ppkt, spkt);
 }
 
 static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)
-- 
2.7.4






[Qemu-devel] [PATCH 1/3] COLO-compare: Add packet size check and some fix

2017-02-24 Thread Zhang Chen
Add packet size check in colo_packet_compare_udp()
and colo_packet_compare_udp(), rename function colo_packet_compare()
to colo_packet_compare_common() that we will reuse it later.

Signed-off-by: Zhang Chen 
---
 net/colo-compare.c | 30 ++
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 300f017..e75f0ae 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -180,7 +180,7 @@ static int packet_enqueue(CompareState *s, int mode)
  * return:0  means packet same
  *> 0 || < 0 means packet different
  */
-static int colo_packet_compare(Packet *ppkt, Packet *spkt)
+static int colo_packet_compare_common(Packet *ppkt, Packet *spkt)
 {
 trace_colo_compare_ip_info(ppkt->size, inet_ntoa(ppkt->ip->ip_src),
inet_ntoa(ppkt->ip->ip_dst), spkt->size,
@@ -190,6 +190,7 @@ static int colo_packet_compare(Packet *ppkt, Packet *spkt)
 if (ppkt->size == spkt->size) {
 return memcmp(ppkt->data, spkt->data, spkt->size);
 } else {
+trace_colo_compare_main("Net packet size are not the same");
 return -1;
 }
 }
@@ -202,9 +203,10 @@ static int colo_packet_compare(Packet *ppkt, Packet *spkt)
 static int colo_packet_compare_tcp(Packet *spkt, Packet *ppkt)
 {
 struct tcphdr *ptcp, *stcp;
-int res;
+int res, network_length;
 
 trace_colo_compare_main("compare tcp");
+
 if (ppkt->size != spkt->size) {
 if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
 trace_colo_compare_main("pkt size not same");
@@ -212,6 +214,12 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
 return -1;
 }
 
+network_length = ppkt->ip->ip_hl * 4;
+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("tcp packet size error");
+return -1;
+}
+
 ptcp = (struct tcphdr *)ppkt->transport_header;
 stcp = (struct tcphdr *)spkt->transport_header;
 
@@ -260,10 +268,16 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
  */
 static int colo_packet_compare_udp(Packet *spkt, Packet *ppkt)
 {
-int ret;
+int ret, network_length;
 
 trace_colo_compare_main("compare udp");
-ret = colo_packet_compare(ppkt, spkt);
+network_length = ppkt->ip->ip_hl * 4;
+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("udp packet size error");
+return -1;
+}
+
+ret = colo_packet_compare_common(ppkt, spkt);
 
 if (ret) {
 trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
@@ -285,12 +299,12 @@ static int colo_packet_compare_icmp(Packet *spkt, Packet 
*ppkt)
 
 trace_colo_compare_main("compare icmp");
 network_length = ppkt->ip->ip_hl * 4;
-if (ppkt->size != spkt->size ||
-ppkt->size < network_length + ETH_HLEN) {
+if (ppkt->size < network_length + ETH_HLEN) {
+trace_colo_compare_main("icmp packet size error");
 return -1;
 }
 
-if (colo_packet_compare(ppkt, spkt)) {
+if (colo_packet_compare_common(ppkt, spkt)) {
 trace_colo_compare_icmp_miscompare("primary pkt size",
ppkt->size);
 qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
@@ -316,7 +330,7 @@ static int colo_packet_compare_other(Packet *spkt, Packet 
*ppkt)
inet_ntoa(ppkt->ip->ip_dst), spkt->size,
inet_ntoa(spkt->ip->ip_src),
inet_ntoa(spkt->ip->ip_dst));
-return colo_packet_compare(ppkt, spkt);
+return colo_packet_compare_common(ppkt, spkt);
 }
 
 static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)
-- 
2.7.4






[Qemu-devel] [PATCH 2/3] COLO-compare: Optimize colo_packet_compare_common

2017-02-24 Thread Zhang Chen
Add offset args for colo_packet_compare_common, optimize
colo_packet_compare_icmp() and colo_packet_compare_udp()
just compare the IP payload.

Signed-off-by: Zhang Chen 
---
 net/colo-compare.c | 28 +---
 1 file changed, 21 insertions(+), 7 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index e75f0ae..9853232 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -180,7 +180,7 @@ static int packet_enqueue(CompareState *s, int mode)
  * return:0  means packet same
  *> 0 || < 0 means packet different
  */
-static int colo_packet_compare_common(Packet *ppkt, Packet *spkt)
+static int colo_packet_compare_common(Packet *ppkt, Packet *spkt, int offset)
 {
 trace_colo_compare_ip_info(ppkt->size, inet_ntoa(ppkt->ip->ip_src),
inet_ntoa(ppkt->ip->ip_dst), spkt->size,
@@ -188,7 +188,8 @@ static int colo_packet_compare_common(Packet *ppkt, Packet 
*spkt)
inet_ntoa(spkt->ip->ip_dst));
 
 if (ppkt->size == spkt->size) {
-return memcmp(ppkt->data, spkt->data, spkt->size);
+return memcmp(ppkt->data + offset, spkt->data + offset,
+  spkt->size - offset);
 } else {
 trace_colo_compare_main("Net packet size are not the same");
 return -1;
@@ -237,8 +238,7 @@ static int colo_packet_compare_tcp(Packet *spkt, Packet 
*ppkt)
 spkt->ip->ip_sum = ppkt->ip->ip_sum;
 }
 
-res = memcmp(ppkt->data + ETH_HLEN, spkt->data + ETH_HLEN,
-(spkt->size - ETH_HLEN));
+res = colo_packet_compare_common(ppkt, spkt, ETH_HLEN);
 
 if (res != 0 && trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
 trace_colo_compare_pkt_info_src(inet_ntoa(ppkt->ip->ip_src),
@@ -277,7 +277,14 @@ static int colo_packet_compare_udp(Packet *spkt, Packet 
*ppkt)
 return -1;
 }
 
-ret = colo_packet_compare_common(ppkt, spkt);
+/*
+ * Because of ppkt and spkt are both in the same connection,
+ * The ppkt's src ip, dst ip, src port, dst port, ip_proto all are
+ * same with spkt. In addition, IP header's Identification is a random
+ * field, we can handle it in IP fragmentation function later.
+ * So we just compare the ip payload here.
+ */
+ret = colo_packet_compare_common(ppkt, spkt, network_length + ETH_HLEN);
 
 if (ret) {
 trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
@@ -304,7 +311,14 @@ static int colo_packet_compare_icmp(Packet *spkt, Packet 
*ppkt)
 return -1;
 }
 
-if (colo_packet_compare_common(ppkt, spkt)) {
+/*
+ * Because of ppkt and spkt are both in the same connection,
+ * The ppkt's src ip, dst ip, src port, dst port, ip_proto all are
+ * same with spkt. In addition, IP header's Identification is a random
+ * field, we can handle it in IP fragmentation function later.
+ * So we just compare the ip payload here.
+ */
+if (colo_packet_compare_common(ppkt, spkt, network_length + ETH_HLEN)) {
 trace_colo_compare_icmp_miscompare("primary pkt size",
ppkt->size);
 qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
@@ -330,7 +344,7 @@ static int colo_packet_compare_other(Packet *spkt, Packet 
*ppkt)
inet_ntoa(ppkt->ip->ip_dst), spkt->size,
inet_ntoa(spkt->ip->ip_src),
inet_ntoa(spkt->ip->ip_dst));
-return colo_packet_compare_common(ppkt, spkt);
+return colo_packet_compare_common(ppkt, spkt, 0);
 }
 
 static int colo_old_packet_check_one(Packet *pkt, int64_t *check_time)
-- 
2.7.4






[Qemu-devel] [PATCH 3/3] COLO-compare: Make qemu_hexdump() print data when trace-event ON

2017-02-24 Thread Zhang Chen
Signed-off-by: Zhang Chen 
---
 net/colo-compare.c | 18 --
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/net/colo-compare.c b/net/colo-compare.c
index 9853232..5ffc4a3 100644
--- a/net/colo-compare.c
+++ b/net/colo-compare.c
@@ -288,9 +288,13 @@ static int colo_packet_compare_udp(Packet *spkt, Packet 
*ppkt)
 
 if (ret) {
 trace_colo_compare_udp_miscompare("primary pkt size", ppkt->size);
-qemu_hexdump((char *)ppkt->data, stderr, "colo-compare", ppkt->size);
 trace_colo_compare_udp_miscompare("Secondary pkt size", spkt->size);
-qemu_hexdump((char *)spkt->data, stderr, "colo-compare", spkt->size);
+if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
+qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
+ ppkt->size);
+qemu_hexdump((char *)spkt->data, stderr, "colo-compare",
+ spkt->size);
+}
 }
 
 return ret;
@@ -321,12 +325,14 @@ static int colo_packet_compare_icmp(Packet *spkt, Packet 
*ppkt)
 if (colo_packet_compare_common(ppkt, spkt, network_length + ETH_HLEN)) {
 trace_colo_compare_icmp_miscompare("primary pkt size",
ppkt->size);
-qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
- ppkt->size);
 trace_colo_compare_icmp_miscompare("Secondary pkt size",
spkt->size);
-qemu_hexdump((char *)spkt->data, stderr, "colo-compare",
- spkt->size);
+if (trace_event_get_state(TRACE_COLO_COMPARE_MISCOMPARE)) {
+qemu_hexdump((char *)ppkt->data, stderr, "colo-compare",
+ ppkt->size);
+qemu_hexdump((char *)spkt->data, stderr, "colo-compare",
+ spkt->size);
+}
 return -1;
 } else {
 return 0;
-- 
2.7.4






[Qemu-devel] [PATCH 0/3] COLO-compare: Optimize the code and fix some bug

2017-02-24 Thread Zhang Chen
This series we will Optimize the code and fix some bug.
Patch1: Add packet minimum size check in compare tcp/udp like compare icmp.
Patch2: Optimize compare_common and increase compare performance.
Patch3: Fix debug info always print bug.

V1:
  - init patch.

Zhang Chen (3):
  COLO-compare: Add minimum packet size check and some fix
  COLO-compare: Optimize colo_packet_compare_common
  COLO-compare: Make qemu_hexdump() print data when trace-event ON

 net/colo-compare.c | 68 --
 1 file changed, 51 insertions(+), 17 deletions(-)

-- 
2.7.4






[Qemu-devel] [PATCH V10 2/2] Add a new qmp command to do checkpoint, query xen replication status

2017-02-24 Thread Zhang Chen
We can call this qmp command to do checkpoint outside of qemu.
Xen colo will need this function.

Signed-off-by: Zhang Chen 
Signed-off-by: Wen Congyang 
Reviewed-by: Eric Blake 

---
 migration/colo.c | 23 +++
 qapi-schema.json | 48 
 2 files changed, 71 insertions(+)

diff --git a/migration/colo.c b/migration/colo.c
index 0114899..2b241b4 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -130,6 +130,29 @@ void qmp_xen_set_replication(bool enable, bool primary,
 }
 }
 
+ReplicationStatus *qmp_query_xen_replication_status(Error **errp)
+{
+Error *err = NULL;
+ReplicationStatus *s = g_new0(ReplicationStatus, 1);
+
+replication_get_error_all();
+if (err) {
+s->error = true;
+s->has_desc = true;
+s->desc = g_strdup(error_get_pretty(err));
+} else {
+s->error = false;
+}
+
+error_free(err);
+return s;
+}
+
+void qmp_xen_colo_do_checkpoint(Error **errp)
+{
+replication_do_checkpoint_all(errp);
+}
+
 static void colo_send_message(QEMUFile *f, COLOMessage msg,
   Error **errp)
 {
diff --git a/qapi-schema.json b/qapi-schema.json
index 9445b93..5aa74e6 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -5931,6 +5931,54 @@
   'data': { 'enable': 'bool', 'primary': 'bool', '*failover' : 'bool' } }
 
 ##
+# @ReplicationStatus:
+#
+# The result format for 'query-xen-replication-status'.
+#
+# @error: true if an error happened, false if replication is normal.
+#
+# @desc: #optional the human readable error description string, when
+#@error is 'true'.
+#
+# Since: 2.9
+##
+{ 'struct': 'ReplicationStatus',
+  'data': { 'error': 'bool', '*desc': 'str' } }
+
+##
+# @query-xen-replication-status:
+#
+# Query replication status while the vm is running.
+#
+# Returns: A @ReplicationResult object showing the status.
+#
+# Example:
+#
+# -> { "execute": "query-xen-replication-status" }
+# <- { "return": { "error": false } }
+#
+# Since: 2.9
+##
+{ 'command': 'query-xen-replication-status',
+  'returns': 'ReplicationStatus' }
+
+##
+# @xen-colo-do-checkpoint:
+#
+# Xen uses this command to notify replication to trigger a checkpoint.
+#
+# Returns: nothing.
+#
+# Example:
+#
+# -> { "execute": "xen-colo-do-checkpoint" }
+# <- { "return": {} }
+#
+# Since: 2.9
+##
+{ 'command': 'xen-colo-do-checkpoint' }
+
+##
 # @GICCapability:
 #
 # The struct describes capability for a specific GIC (Generic
-- 
2.7.4






[Qemu-devel] [PATCH V10 1/2] Add a new qmp command to start/stop replication

2017-02-24 Thread Zhang Chen
We can call this qmp command to start/stop replication outside of qemu.
Like Xen colo need this function.

Signed-off-by: Zhang Chen 
Signed-off-by: Wen Congyang 
Reviewed-by: Eric Blake 
Reviewed-by: Stefano Stabellini 
Reviewed-by: zhanghailiang 

---
 migration/colo.c | 26 ++
 qapi-schema.json | 25 +
 2 files changed, 51 insertions(+)

diff --git a/migration/colo.c b/migration/colo.c
index 93c85c5..0114899 100644
--- a/migration/colo.c
+++ b/migration/colo.c
@@ -19,6 +19,8 @@
 #include "qemu/error-report.h"
 #include "qapi/error.h"
 #include "migration/failover.h"
+#include "replication.h"
+#include "qmp-commands.h"
 
 #define COLO_BUFFER_BASE_SIZE (4 * 1024 * 1024)
 
@@ -104,6 +106,30 @@ void colo_do_failover(MigrationState *s)
 }
 }
 
+void qmp_xen_set_replication(bool enable, bool primary,
+ bool has_failover, bool failover,
+ Error **errp)
+{
+ReplicationMode mode = primary ?
+   REPLICATION_MODE_PRIMARY :
+   REPLICATION_MODE_SECONDARY;
+
+if (has_failover && enable) {
+error_setg(errp, "Parameter 'failover' is only for"
+   " stopping replication");
+return;
+}
+
+if (enable) {
+replication_start_all(mode, errp);
+} else {
+if (!has_failover) {
+failover = NULL;
+}
+replication_stop_all(failover, failover ? NULL : errp);
+}
+}
+
 static void colo_send_message(QEMUFile *f, COLOMessage msg,
   Error **errp)
 {
diff --git a/qapi-schema.json b/qapi-schema.json
index cbdffdd..9445b93 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -5906,6 +5906,31 @@
 { 'command': 'xen-load-devices-state', 'data': {'filename': 'str'} }
 
 ##
+# @xen-set-replication:
+#
+# Enable or disable replication.
+#
+# @enable: true to enable, false to disable.
+#
+# @primary: true for primary or false for secondary.
+#
+# @failover: #optional true to do failover, false to stop. but cannot be
+#specified if 'enable' is true. default value is false.
+#
+# Returns: nothing.
+#
+# Example:
+#
+# -> { "execute": "xen-set-replication",
+#  "arguments": {"enable": true, "primary": false} }
+# <- { "return": {} }
+#
+# Since: 2.9
+##
+{ 'command': 'xen-set-replication',
+  'data': { 'enable': 'bool', 'primary': 'bool', '*failover' : 'bool' } }
+
+##
 # @GICCapability:
 #
 # The struct describes capability for a specific GIC (Generic
-- 
2.7.4






[Qemu-devel] [PATCH V10 0/2] Add new qmp commands to suppurt Xen COLO

2017-02-24 Thread Zhang Chen
Xen COLO depend on qemu COLO replication function.
So, We need new qmp commands for Xen to use qemu replication.

Corresponding libxl patches already in xen.git.
Commit ID:

ed37ef1f91c20f0ab162ce60f8c38400b917fa64
COLO: introduce new API to prepare/start/do/get_error/stop replication

a0ddc0b359375b2418213966dfbdbfab593ecc6f
tools/libxl: introduction of libxl__qmp_restore to load qemu state

V10:
   - Fix ReplicationStatus error comments.
   - Fix query-xen-replication-status example.

Zhang Chen (2):
  Add a new qmp command to start/stop replication
  Add a new qmp command to do checkpoint, query xen replication status

 migration/colo.c | 49 +
 qapi-schema.json | 73 
 2 files changed, 122 insertions(+)

-- 
2.7.4






Re: [Qemu-devel] [PATCH V9 0/2] Add new qmp commands to suppurt Xen COLO

2017-02-24 Thread Zhang Chen



On 02/25/2017 01:24 AM, Eric Blake wrote:

On 02/24/2017 01:22 AM, Zhang Chen wrote:

Xen COLO depend on qemu COLO replication function.
So, We need new qmp commands for Xen to use qemu replication.

Corresponding libxl patches already in xen.git.
Commit ID:

ed37ef1f91c20f0ab162ce60f8c38400b917fa64
COLO: introduce new API to prepare/start/do/get_error/stop replication

a0ddc0b359375b2418213966dfbdbfab593ecc6f
tools/libxl: introduction of libxl__qmp_restore to load qemu state


It's nice to call out what changed between v8 and v9 to focus reviewer's
attention on those aspects.


OK, I will add the note in next version.

Thanks
Zhang Chen




Zhang Chen (2):
   Add a new qmp command to start/stop replication
   Add a new qmp command to do checkpoint, query xen replication status

  migration/colo.c | 49 +
  qapi-schema.json | 74 
  2 files changed, 123 insertions(+)



--
Thanks
Zhang Chen






Re: [Qemu-devel] [PATCH V9 2/2] Add a new qmp command to do checkpoint, query xen replication status

2017-02-24 Thread Zhang Chen



On 02/25/2017 01:27 AM, Eric Blake wrote:

On 02/24/2017 01:22 AM, Zhang Chen wrote:

We can call this qmp command to do checkpoint outside of qemu.
Xen colo will need this function.

Signed-off-by: Zhang Chen 
Signed-off-by: Wen Congyang 
---
  migration/colo.c | 23 +++
  qapi-schema.json | 49 +
  2 files changed, 72 insertions(+)

+++ b/qapi-schema.json
@@ -5931,6 +5931,55 @@
'data': { 'enable': 'bool', 'primary': 'bool', '*failover' : 'bool' } }
  
  ##

+# @ReplicationStatus:
+#
+# The result format for 'query-xen-replication-status'.
+#
+# @error: true to error, false to normal.

Maybe:

true if an error happened, false if replication is normal

Maintainer could touch that up during application, so it doesn't
necessarily require a v10.


Maintainer may be very busy, I will do this fix in V10.




+##
+# @query-xen-replication-status:
+#
+# Query replication status while the vm is running.
+#
+# Returns: A @ReplicationResult object showing the status.
+#
+# Example:
+#
+# -> { "execute": "query-xen-replication-status" }
+# <- { "return": [ { "error": false },
+#  { "error": true } ] }

This example is bogus.  The command does NOT return an array.  It is
sufficient to use:

{ "return": { "error": false } }

but if you want, you could instead do:

{ "return": { "error": true, "desc": "..." } }

but if you do that, please be sure that desc matches an actual error
code that could occur, and not something made up.

If all you do is fix the example with the simpler "error":false (and
maybe the maintainer is okay doing that), then you can add:
Reviewed-by: Eric Blake 


I will fix the example in V10, and which maintainer can pick up this patch?

Thanks
Zhang Chen


--
Thanks
Zhang Chen






Re: [Qemu-devel] [PATCH v4 06/15] target/ppc: remove xer split-out flags(so, ov, ca)

2017-02-24 Thread Richard Henderson

On 02/24/2017 06:05 PM, Nikunj A Dadhania wrote:

Richard Henderson  writes:


On 02/24/2017 01:58 PM, David Gibson wrote:

On Fri, Feb 24, 2017 at 06:18:22AM +0530, Nikunj A Dadhania wrote:

Richard Henderson  writes:


On 02/24/2017 06:56 AM, Nikunj A Dadhania wrote:

Now get rid all the split out variables so, ca, ov. After this patch,
all the bits are stored in CPUPPCState::xer at appropriate places.

Signed-off-by: Nikunj A Dadhania 
---
 target/ppc/cpu.c|   8 +---
 target/ppc/cpu.h|  26 ++--
 target/ppc/int_helper.c |  12 +++---
 target/ppc/translate.c  | 106 +---
 4 files changed, 78 insertions(+), 74 deletions(-)


I do not think this is a good direction to take this.


Hmm, any particular reason?


Right, I suggested this, but based only a suspicion that the split
variables weren't worth the complexity.  I'm happy to be corrected by
someone with better knowledge of TCG, but it'd be nice to know why.


Normally we're interested in minimizing the size of the generated code,
delaying computation until we can show it being used.

Now, ppc is a bit different from other targets (which might compute overflow
for any addition insn) in that it only computes overflow when someone asks for
it.  Moreover, it's fairly rare for the addo/subo/nego instructions to
be used.



Therefore, I'm not 100% sure what the "best" solution is.


Agreed, with that logic, wont it be more efficient to move the OV/CA
updationg to respective callers, and when xer_read/write happens, its
just one tcg_ops.



However, I'd be surprised if the least amount of code places all of
the bits into their canonical location within XER.

Do note that when looking at this, the various methods by which the OV/SO bits
are copied to CR flags ought to be taken into account.


I lost you in the last two para, can you explain in detail?


Reading XER via MFSPR is not the only way to access the CA/OV/SO bits.  One may 
use the "dot" form of the instruction to copy SO to CR0[3].  One may use the 
MCRXRX instruction to copy 5 bits from XER to CR[BF].  One may use the add/sub 
extended instructions to access the CA bit.


Therefore it is not a forgone conclusion that a read of XER will *ever* occur, 
and therefore it is not necessarily most efficient to keep the CA/OV/SO bits in 
the canonical XER form.


I think it's especially important to keep CA separate in order to facilitate 
multi-word addition chains.


I suspect that it's most efficient to keep SO in a form that best simplifies 
"dot" instructions, e.g. since there is no un-dotted "andi" instruction. 
Naturally, the form in which you store SO is going to influence how you store OV.


The other thing that is desirable is to allow the TCG optimizer to delete 
computations that are dead.  That cannot be done if you're constantly folding 
results back into a single XER register.


Consider a sequence like

li  r0, 0
mtspr   xer, r0
addor3, r4, r5
addo.   r6, r7, r8

where we clear XER (and thus SO), perform two computations, and then read SO 
via the dot.  Obviously the two computations of OV are not dead, because they 
get ORed into SO.  However, the first computation of OV32 is dead, shadowed by 
the second, because there is no accumulating SO32 bit.



r~



Re: [Qemu-devel] kvm bug in __rmap_clear_dirty during live migration

2017-02-24 Thread Herongguang (Stephen)



On 2017/2/24 23:14, Paolo Bonzini wrote:



On 24/02/2017 16:10, Chris Friesen wrote:

On 02/23/2017 08:23 PM, Herongguang (Stephen) wrote:


On 2017/2/22 22:43, Paolo Bonzini wrote:



Hopefully Gaohuai and Rongguang can help with this too.

Paolo


Yes, we are looking into and testing this.

I think this can result in any memory corruption, if VM1 writes its
PML buffer into VM2’s VMCS (since sched_in/sched_out notifier of VM1
is not registered yet), then VM1 is destroyed (hence its PML buffer
is freed back to kernel), after that, VM2 starts migration, so CPU
logs VM2’s dirty GFNS into a freed memory, results in any memory
corruption.

As its severity, this commit
(http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=4e59516a12a6ef6dcb660cb3a3f70c64bd60cfec)

is eligible to back port to kernel stable.


Are we expecting that fix to resolve the original issue, or is it a
separate issue that needs fixing in stable?


It should be the original issue.

Paolo

.


Yes, I agree, though we are still testing.




Re: [Qemu-devel] [PATCH] target-s390x: Implement stfl and stfle

2017-02-24 Thread Richard Henderson

On 02/25/2017 12:44 AM, Michal Marek wrote:

+DEF_HELPER_1(stfl, void, env)


DEF_HELPER_FLAGS_1(stfl, TCG_CALL_NO_RWG, void, env)

since this touches no registers, and only writes to lomem which afaik cannot 
fault in kernel mode.



+DEF_HELPER_3(stfle, i64, env, i64, i64)


Unfortunately, we are writing to cc_op, so we do have a TCG register write, so 
I guess this is the best we can do here.



+static int do_stfle(CPUS390XState *env, uint64_t addr, int len)
+{
+S390CPU *cpu = s390_env_get_cpu(env);
+uint8_t data[64];


S390FeatBitmap or S390FeatInit?  Or even a sizeof?
Hard coding 64 certainly doesn't seem right.


+memset(data, 0, sizeof(data));
+res = s390_fill_feat_block(cpu->model->features, S390_FEAT_TYPE_STFL, 
data);
+cpu_physical_memory_write(addr, data, MIN(res, len));


No, not physical memory, you need to write to virtual memory, at least for 
STFLE.  Which, as you'll recall can be used from user-mode.



r~



Re: [Qemu-devel] [PATCH] target-s390x: Implement lpp instruction

2017-02-24 Thread Richard Henderson

On 02/25/2017 12:50 AM, Miroslav Benes wrote:

Linux arch/s390/kernel/head(64).S uses lpp instruction if it is
available in facilities list provided by stfl/stfle instruction. This is
the case of newer z/System generations and their qemu definition.

Signed-off-by: Miroslav Benes 
---


I can't find LPP in my PoO 11th edition...


+static ExitStatus op_lpp(DisasContext *s, DisasOps *o)
+{
+check_privileged(s);
+potential_page_fault(s);
+
+tcg_gen_st_i64(o->in2, cpu_env, offsetof(CPUS390XState, pp));


But you don't need the potential_page_fault, since this is not a store to guest 
memory.



r~



Re: [Qemu-devel] [PATCH v8 1/2] block/vxhs.c: Add support for a new block device type called "vxhs"

2017-02-24 Thread ashish mittal
Thanks!

I hope the following is in line with what you suggested -

We will error out in case either of username, secret-id, or password
are missing.

Good case, passing password via a file -
$ ./qemu-io --trace enable=vxhs* --object
secret,id=xvxhspasswd,file=/tmp/some/file/path  -c 'read 66000 128k'
'json:{"server.host": "127.0.0.1", "server.port": "", "vdisk-id":
"/test.raw", "driver": "vxhs", "user": "ashish",  "password-secret":
"xvxhspasswd"}'
1132@1487977829.151064:vxhs_open_vdiskid Opening vdisk-id /test.raw

1132@1487977829.151141:vxhs_get_creds User ashish, SecretID
xvxhspasswd, Password Str0ngP@ssw0rd   <===   NOTE WILL NOT PRINT
PASSWORD IN FINAL CODE 

1132@1487977829.151168:vxhs_open_hostinfo Adding host 127.0.0.1:
to BDRVVXHSState
1132@1487977829.173062:vxhs_get_vdisk_stat vDisk /test.raw stat ioctl
returned size 196616
read 131072/131072 bytes at offset 66000
128 KiB, 1 ops; 0.0012 sec (99.049 MiB/sec and 792.3930 ops/sec)
1132@1487977829.175141:vxhs_close Closing vdisk /test.raw


Bad case, missing user -
$ ./qemu-io --trace enable=vxhs* --object
secret,id=xvxhspasswd,data=/tmp/some/file/path  -c 'read 66000 128k'
'json:{"server.host": "127.0.0.1", "server.port": "", "vdisk-id":
"/test.raw", "driver": "vxhs"}'
1310@1487978547.771234:vxhs_open_vdiskid Opening vdisk-id /test.raw
can't open device json:{"server.host": "127.0.0.1", "server.port":
"", "vdisk-id": "/test.raw", "driver": "vxhs"}: please specify the
user for authenticating to target

diff --git a/block/vxhs.c b/block/vxhs.c
index 4f0633e..9b60ddf 100644
--- a/block/vxhs.c
+++ b/block/vxhs.c
@@ -17,12 +17,16 @@
 #include "qemu/uri.h"
 #include "qapi/error.h"
 #include "qemu/uuid.h"
+#include "crypto/secret.h"

 #define VXHS_OPT_FILENAME   "filename"
 #define VXHS_OPT_VDISK_ID   "vdisk-id"
 #define VXHS_OPT_SERVER "server"
 #define VXHS_OPT_HOST   "host"
 #define VXHS_OPT_PORT   "port"
+#define VXHS_OPT_USER   "user"
+#define VXHS_OPT_PASSWORD   "password"
+#define VXHS_OPT_SECRETID   "password-secret"
 #define VXHS_UUID_DEF "12345678-1234-1234-1234-123456789012"

 QemuUUID qemu_uuid __attribute__ ((weak));
@@ -136,6 +140,22 @@ static QemuOptsList runtime_opts = {
 .type = QEMU_OPT_STRING,
 .help = "UUID of the VxHS vdisk",
 },
+{
+.name = VXHS_OPT_USER,
+.type = QEMU_OPT_STRING,
+.help = "username for authentication to target",
+},
+{
+.name = VXHS_OPT_PASSWORD,
+.type = QEMU_OPT_STRING,
+.help = "password for authentication to target",
+},
+{
+.name = VXHS_OPT_SECRETID,
+.type = QEMU_OPT_STRING,
+.help = "ID of the secret providing password for"
+"authentication to target",
+},
 { /* end of list */ }
 },
 };
@@ -257,6 +277,9 @@ static int vxhs_open(BlockDriverState *bs, QDict *options,
 const char *server_host_opt;
 char *str = NULL;
 int ret = 0;
+const char *user = NULL;
+const char *secretid = NULL;
+const char *password = NULL;

 ret = vxhs_init_and_ref();
 if (ret < 0) {
@@ -320,6 +343,35 @@ static int vxhs_open(BlockDriverState *bs, QDict *options,
 goto out;
 }

+/* check if we got username and secretid via the options */
+user = qemu_opt_get(opts, VXHS_OPT_USER);
+if (!user) {
+error_setg(_err, "please specify the user for authenticating to "
+   "target");
+qdict_del(backing_options, str);
+ret = -EINVAL;
+goto out;
+}
+
+secretid = qemu_opt_get(opts, VXHS_OPT_SECRETID);
+if (!secretid) {
+error_setg(_err, "please specify the ID of the secret to be "
+   "used for authenticating to target");
+qdict_del(backing_options, str);
+ret = -EINVAL;
+goto out;
+}
+
+/* check if we got password via the --object argument */
+password = qcrypto_secret_lookup_as_utf8(secretid, _err);
+if (local_err != NULL) {
+trace_vxhs_get_creds(user, secretid, password);
+qdict_del(backing_options, str);
+ret = -EINVAL;
+goto out;
+}
+trace_vxhs_get_creds(user, secretid, password);
+
 s->vdisk_hostinfo.host = g_strdup(server_host_opt);

 s->vdisk_hostinfo.port = g_ascii_strtoll(qemu_opt_get(tcp_opts,

Regards,
Ashish


On Fri, Feb 24, 2017 at 1:19 AM, Daniel P. Berrange  wrote:
> On Thu, Feb 23, 2017 at 08:19:29PM -0800, ashish mittal wrote:
>> Hi,
>>
>> Just want to check if the following mechanism for accepting the secret
>> password looks OK?
>>
>> We have yet to internally discuss the semantics of how we plan to use
>> the user-ID/password for authentication. This diff is just to
>> understand how I am expected to accept the secret from the command
>> line.
>>
>> Example specifying 

Re: [Qemu-devel] [PATCH RFC v3 11/15] vfio: ccw: realize VFIO_DEVICE_G(S)ET_IRQ_INFO ioctls

2017-02-24 Thread Alex Williamson
On Fri, 17 Feb 2017 09:29:35 +0100
Dong Jia Shi  wrote:

> Realize VFIO_DEVICE_GET_IRQ_INFO ioctl to retrieve
> VFIO_CCW_IO_IRQ information.
> 
> Realize VFIO_DEVICE_SET_IRQS ioctl to set an eventfd fd for
> VFIO_CCW_IO_IRQ. Once a write operation to the ccw_io_region
> was performed, trigger a signal on this fd.
> 
> Signed-off-by: Dong Jia Shi 
> Reviewed-by: Pierre Morel 
> ---
>  drivers/s390/cio/vfio_ccw_ops.c | 123 
> +++-
>  drivers/s390/cio/vfio_ccw_private.h |   4 ++
>  include/uapi/linux/vfio.h   |  10 ++-
>  3 files changed, 134 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
> index 125818c..ebc38fb 100644
> --- a/drivers/s390/cio/vfio_ccw_ops.c
> +++ b/drivers/s390/cio/vfio_ccw_ops.c
> @@ -202,6 +202,9 @@ static ssize_t vfio_ccw_mdev_write(struct mdev_device 
> *mdev,
>   if (region->ret_code != 0)
>   return region->ret_code;
>  
> + if (private->io_trigger)
> + eventfd_signal(private->io_trigger, 1);
> +
>   return count;
>  }
>  
> @@ -209,7 +212,7 @@ static int vfio_ccw_mdev_get_device_info(struct 
> vfio_device_info *info)
>  {
>   info->flags = VFIO_DEVICE_FLAGS_CCW | VFIO_DEVICE_FLAGS_RESET;
>   info->num_regions = VFIO_CCW_NUM_REGIONS;
> - info->num_irqs = 0;
> + info->num_irqs = VFIO_CCW_NUM_IRQS;
>  
>   return 0;
>  }
> @@ -230,6 +233,83 @@ static int vfio_ccw_mdev_get_region_info(struct 
> vfio_region_info *info,
>   }
>  }
>  
> +int vfio_ccw_mdev_get_irq_info(struct vfio_irq_info *info)
> +{
> + if (info->index != VFIO_CCW_IO_IRQ_INDEX)
> + return -EINVAL;
> +
> + info->count = 1;
> + info->flags = VFIO_IRQ_INFO_EVENTFD | VFIO_IRQ_INFO_NORESIZE;

I tend to think that NORESIZE isn't relevant when count is one.  We
don't expose this flag on vfio-pci for INTx where there's only a single
interrupt.  I don't really see how it applies, it's really trying to
indicate if there are multiple sub-indexes within an index, if some are
enabled, we cannot enable others without first disabling them all.
Thus we can't resize the number of enabled sub-indexes.  There really
are no sub-indexes when count is one.  Thanks,

Alex

> +
> + return 0;
> +}
> +
> +static int vfio_ccw_mdev_set_irqs(struct mdev_device *mdev,
> +   uint32_t flags,
> +   void __user *data)
> +{
> + struct vfio_ccw_private *private;
> + struct eventfd_ctx **ctx;
> +
> + if (!(flags & VFIO_IRQ_SET_ACTION_TRIGGER))
> + return -EINVAL;
> +
> + private = dev_get_drvdata(mdev_parent_dev(mdev));
> + if (!private)
> + return -ENODEV;
> +
> + ctx = >io_trigger;
> +
> + switch (flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
> + case VFIO_IRQ_SET_DATA_NONE:
> + {
> + if (*ctx)
> + eventfd_signal(*ctx, 1);
> + return 0;
> + }
> + case VFIO_IRQ_SET_DATA_BOOL:
> + {
> + uint8_t trigger;
> +
> + if (get_user(trigger, (uint8_t __user *)data))
> + return -EFAULT;
> +
> + if (trigger && *ctx)
> + eventfd_signal(*ctx, 1);
> + return 0;
> + }
> + case VFIO_IRQ_SET_DATA_EVENTFD:
> + {
> + int32_t fd;
> +
> + if (get_user(fd, (int32_t __user *)data))
> + return -EFAULT;
> +
> + if (fd == -1) {
> + if (*ctx)
> + eventfd_ctx_put(*ctx);
> + *ctx = NULL;
> + } else if (fd >= 0) {
> + struct eventfd_ctx *efdctx;
> +
> + efdctx = eventfd_ctx_fdget(fd);
> + if (IS_ERR(efdctx))
> + return PTR_ERR(efdctx);
> +
> + if (*ctx)
> + eventfd_ctx_put(*ctx);
> +
> + *ctx = efdctx;
> + } else
> + return -EINVAL;
> +
> + return 0;
> + }
> + default:
> + return -EINVAL;
> + }
> +}
> +
>  static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device *mdev,
>  unsigned int cmd,
>  unsigned long arg)
> @@ -277,6 +357,47 @@ static ssize_t vfio_ccw_mdev_ioctl(struct mdev_device 
> *mdev,
>  
>   return copy_to_user((void __user *)arg, , minsz);
>   }
> + case VFIO_DEVICE_GET_IRQ_INFO:
> + {
> + struct vfio_irq_info info;
> +
> + minsz = offsetofend(struct vfio_irq_info, count);
> +
> + if (copy_from_user(, (void __user *)arg, minsz))
> + return -EFAULT;
> +
> + if (info.argsz < minsz || info.index >= VFIO_CCW_NUM_IRQS)
> +   

Re: [Qemu-devel] [PATCH RFC v3 07/15] vfio: ccw: introduce ccw_io_region

2017-02-24 Thread Alex Williamson
On Fri, 17 Feb 2017 09:29:31 +0100
Dong Jia Shi  wrote:

> To provide user-space a set of interfaces to:
> 1. pass in a ccw program to perform an I/O operation.
> 2. read back I/O results of the completed I/O operations.
> We introduce an MMIO region for the vfio-ccw device here.
> 
> This region is defined to content:
> 1. areas to store arguments that an ssch required.
> 2. areas to store the I/O results.
> 
> Using pwrite/pread to the device on this region, a user-space program
> could write/read data to/from the vfio-ccw device.
> 
> Signed-off-by: Dong Jia Shi 
> Reviewed-by: Pierre Morel 
> ---
>  arch/s390/include/uapi/asm/Kbuild   |  1 +
>  drivers/s390/cio/vfio_ccw_ops.c | 47 
> +
>  drivers/s390/cio/vfio_ccw_private.h |  4 
>  include/uapi/linux/vfio_ccw.h   | 24 +++
>  4 files changed, 76 insertions(+)
>  create mode 100644 include/uapi/linux/vfio_ccw.h
> 
> diff --git a/arch/s390/include/uapi/asm/Kbuild 
> b/arch/s390/include/uapi/asm/Kbuild
> index bf736e7..fdb9529 100644
> --- a/arch/s390/include/uapi/asm/Kbuild
> +++ b/arch/s390/include/uapi/asm/Kbuild
> @@ -52,3 +52,4 @@ header-y += unistd.h
>  header-y += virtio-ccw.h
>  header-y += vtoc.h
>  header-y += zcrypt.h
> +header-y += vfio_ccw.h
> diff --git a/drivers/s390/cio/vfio_ccw_ops.c b/drivers/s390/cio/vfio_ccw_ops.c
> index b8a2fed..6c06805 100644
> --- a/drivers/s390/cio/vfio_ccw_ops.c
> +++ b/drivers/s390/cio/vfio_ccw_ops.c
> @@ -127,6 +127,51 @@ void vfio_ccw_mdev_release(struct mdev_device *mdev)
>>nb);
>  }
>  
> +static ssize_t vfio_ccw_mdev_read(struct mdev_device *mdev,
> +   char __user *buf,
> +   size_t count,
> +   loff_t *ppos)
> +{
> + struct vfio_ccw_private *private;
> + struct ccw_io_region *region;
> +
> + if (*ppos + count > sizeof(*region))
> + return -EINVAL;

Couldn't this wrap-around and still satisfy this test?

> +
> + private = dev_get_drvdata(mdev_parent_dev(mdev));
> + if (!private)
> + return -ENODEV;
> +
> + region = >io_region;
> + if (copy_to_user(buf, (void *)region + *ppos, count))
> + return -EFAULT;
> +
> + return count;
> +}
> +
> +static ssize_t vfio_ccw_mdev_write(struct mdev_device *mdev,
> +const char __user *buf,
> +size_t count,
> +loff_t *ppos)
> +{
> + struct vfio_ccw_private *private;
> + struct ccw_io_region *region;
> +
> + if (*ppos + count > sizeof(*region))
> + return -EINVAL;

Same here

> +
> + private = dev_get_drvdata(mdev_parent_dev(mdev));
> + if (!private)
> + return -ENODEV;
> +
> + region = >io_region;
> + if (copy_from_user((void *)region + *ppos, buf, count))
> + return -EFAULT;
> + region->ret_code = 0;
> +
> + return count;
> +}
> +
>  static const struct mdev_parent_ops vfio_ccw_mdev_ops = {
>   .owner  = THIS_MODULE,
>   .supported_type_groups  = mdev_type_groups,
> @@ -134,6 +179,8 @@ static const struct mdev_parent_ops vfio_ccw_mdev_ops = {
>   .remove = vfio_ccw_mdev_remove,
>   .open   = vfio_ccw_mdev_open,
>   .release= vfio_ccw_mdev_release,
> + .read   = vfio_ccw_mdev_read,
> + .write  = vfio_ccw_mdev_write,
>  };
>  
>  int vfio_ccw_mdev_reg(struct subchannel *sch)
> diff --git a/drivers/s390/cio/vfio_ccw_private.h 
> b/drivers/s390/cio/vfio_ccw_private.h
> index 5afb3ba..359e96b 100644
> --- a/drivers/s390/cio/vfio_ccw_private.h
> +++ b/drivers/s390/cio/vfio_ccw_private.h
> @@ -10,6 +10,8 @@
>  #ifndef _VFIO_CCW_PRIVATE_H_
>  #define _VFIO_CCW_PRIVATE_H_
>  
> +#include 
> +
>  #include "css.h"
>  
>  /**
> @@ -19,6 +21,7 @@
>   * @avail: available for creating a mediated device
>   * @mdev: pointer to the mediated device
>   * @nb: notifier for vfio events
> + * @io_region: MMIO region to input/output I/O arguments/results
>   */
>  struct vfio_ccw_private {
>   struct subchannel   *sch;
> @@ -26,6 +29,7 @@ struct vfio_ccw_private {
>   atomic_tavail;
>   struct mdev_device  *mdev;
>   struct notifier_block   nb;
> + struct ccw_io_regionio_region;
>  } __aligned(8);
>  
>  extern int vfio_ccw_mdev_reg(struct subchannel *sch);
> diff --git a/include/uapi/linux/vfio_ccw.h b/include/uapi/linux/vfio_ccw.h
> new file mode 100644
> index 000..34a7f6f
> --- /dev/null
> +++ b/include/uapi/linux/vfio_ccw.h
> @@ -0,0 +1,24 @@
> +/*
> + * Interfaces for vfio-ccw
> + *
> + * Copyright IBM Corp. 2017
> + *
> + * Author(s): Dong Jia Shi 
> + */
> +
> +#ifndef _VFIO_CCW_H_
> +#define 

Re: [Qemu-devel] [PATCH 08/21] qmp: Improve QMP dispatch error messages

2017-02-24 Thread Eric Blake
On 02/23/2017 03:45 PM, Markus Armbruster wrote:
> Signed-off-by: Markus Armbruster 
> ---
>  qapi/qmp-dispatch.c | 14 --
>  1 file changed, 8 insertions(+), 6 deletions(-)
> 

> @@ -41,15 +41,17 @@ static QDict *qmp_dispatch_check_obj(const QObject 
> *request, Error **errp)
>  
>  if (!strcmp(arg_name, "execute")) {
>  if (qobject_type(arg_obj) != QTYPE_QSTRING) {
> -error_setg(errp, "QMP input object member '%s' expects '%s'",
> -   "execute", "string");
> +error_setg(errp,
> +   "QMP input object member '%s' must be %s",
> +   "execute", "a string");

Any reason we can't inline this to:

"QMP input object member 'execute' must be a string"

? It's not like we're translating the message.

>  return NULL;
>  }
>  has_exec_key = true;
>  } else if (!strcmp(arg_name, "arguments")) {
>  if (qobject_type(arg_obj) != QTYPE_QDICT) {
> -error_setg(errp, "QMP input object member '%s' expects '%s'",
> -   "arguments", "object");
> +error_setg(errp,
> +   "QMP input object member '%s' must be %s",
> +   "arguments", "an object");

and again

Then again, if you use my idea of a QAPI-generated visitor of each input
wire object, you'd get whatever error message qobject-input normally
gives, which may render these changes irrelevant.

At any rate, the new wordings are nicer, whether or not you inline
constant text.

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 07/21] qmp: Eliminate silly QERR_QMP_* macros

2017-02-24 Thread Eric Blake
On 02/23/2017 03:45 PM, Markus Armbruster wrote:
> The QERR_ macros are leftovers from the days of "rich" error objects.
> 
> QERR_QMP_BAD_INPUT_OBJECT, QERR_QMP_BAD_INPUT_OBJECT_MEMBER,
> QERR_QMP_EXTRA_MEMBER are used in just one place now, except for one
> use that has crept into qobject-input-visitor.c.
> 
> Drop these macros, to make the (bad) error messages more visible.
> 
> Signed-off-by: Markus Armbruster 
> ---

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 06/21] qmp: Drop duplicated QMP command object checks

2017-02-24 Thread Eric Blake
On 02/23/2017 03:45 PM, Markus Armbruster wrote:
> qmp_check_input_obj() duplicates qmp_dispatch_check_obj(), except the
> latter screws up an error message.  handle_qmp_command() runs first
> the former, then the latter via qmp_dispatch(), masking the screwup.
> 
> qemu-ga also masks the screwup, because it also duplicates checks,
> just differently.

Cleaning up the duplication is a win in my book.

> 
> qmp_check_input_obj() exists because handle_qmp_command() needs to
> examine the command before dispatching it.  The previous commit got
> rid of this need, except for a tracepoint, and a bit of "id" code that
> relies on qdict not being null.
> 
> Fix up the error message in qmp_dispatch_check_obj(), drop
> qmp_check_input_obj() and the tracepoint.  Protect the "id" code with
> a conditional.
> 
> Signed-off-by: Markus Armbruster 
> ---
>  monitor.c   | 74 
> +
>  qapi/qmp-dispatch.c |  3 +--
>  trace-events|  1 -
>  3 files changed, 7 insertions(+), 71 deletions(-)
> 
> diff --git a/monitor.c b/monitor.c
> index dcf2de7..d83888d 100644
> --- a/monitor.c
> +++ b/monitor.c
> @@ -3697,67 +3697,10 @@ static int monitor_can_read(void *opaque)
>  return (mon->suspend_cnt == 0) ? 1 : 0;
>  }
>  
> -/*
> - * Input object checking rules
> - *
> - * 1. Input object must be a dict
> - * 2. The "execute" key must exist
> - * 3. The "execute" key must be a string
> - * 4. If the "arguments" key exists, it must be a dict
> - * 5. If the "id" key exists, it can be anything (ie. json-value)
> - * 6. Any argument not listed above is considered invalid

You know, with just a little tweak, we could add something to the .json
file, and let generated code do all the checks of even the top-level
entities:

{ 'struct': 'WireCommand',
  'data': { 'execute': 'str', '*arguments': 'dict', '*id': 'any' } }

if we had a way to force 'dict' as a subset of 'any' but where it must
be a QDict.  (If we had a 'dict' recognized by the QAPI code generators,
we could probably also fix 'object-add' to use '*props':'dict' instead
of its current use of 'any')

But that sounds like a bit much to ask for in the 2.9 timeframe, so it
doesn't hold up this patch.

We lose a trace, but I don't think that's fatal.

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 04/29] 9pfs: introduce openat_nofollow() helper

2017-02-24 Thread Greg Kurz
On Fri, 24 Feb 2017 17:17:30 +
Stefan Hajnoczi  wrote:
[...]
> > > This function doesn't handle absolute paths?  It ignores leading '/' and
> > > therefore treats all paths as relative paths.
> > >   
> > 
> > Yes because any path coming from the client is supposed (*) to be relative 
> > to the
> > shared directory and openat(2) says:  
> 
> Please change the function name since this isn't openat with nofollow
> behavior, it's a subset of openat that only takes relative paths with
> nofollow behavior.

In the v2, this function is only called by local_open_nofollow() actually.
Maybe I should move the stripping of leading '/' characters there ?


pgpBhmV_FI8PX.pgp
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 11/29] 9pfs: local: lremovexattr: don't follow symlinks

2017-02-24 Thread Greg Kurz
On Mon, 20 Feb 2017 15:40:45 +0100
Greg Kurz  wrote:

> The local_lremovexattr() callback is vulnerable to symlink attacks because
> it calls lremovexattr() which follows symbolic links in all path elements but
> the rightmost one.
> 
> This patch converts local_lremovexattr() to rely on opendir_nofollow() and
> fremovexattrat_nofollow() instead.
> 
> This partly fixes CVE-2016-9602.
> 
> Signed-off-by: Greg Kurz 
> ---
>  hw/9pfs/9p-posix-acl.c  |   10 ++
>  hw/9pfs/9p-xattr-user.c |8 +---
>  hw/9pfs/9p-xattr.c  |8 +---
>  3 files changed, 4 insertions(+), 22 deletions(-)
> 
> diff --git a/hw/9pfs/9p-posix-acl.c b/hw/9pfs/9p-posix-acl.c
> index 0154e2a7605f..c20409104135 100644
> --- a/hw/9pfs/9p-posix-acl.c
> +++ b/hw/9pfs/9p-posix-acl.c
> @@ -58,10 +58,8 @@ static int mp_pacl_removexattr(FsContext *ctx,
> const char *path, const char *name)
>  {
>  int ret;
> -char *buffer;
>  
> -buffer = rpath(ctx, path);
> -ret  = lremovexattr(buffer, MAP_ACL_ACCESS);
> +ret = local_removexattr_nofollow(ctx, MAP_ACL_ACCESS, name);

While reworking on a new implementation fremovexattrat(), I realized the
above should be:

ret = local_removexattr_nofollow(ctx, path, MAP_ACL_ACCESS);

>  if (ret == -1 && errno == ENODATA) {
>  /*
>   * We don't get ENODATA error when trying to remove a
> @@ -71,7 +69,6 @@ static int mp_pacl_removexattr(FsContext *ctx,
>  errno = 0;
>  ret = 0;
>  }
> -g_free(buffer);
>  return ret;
>  }
>  
> @@ -111,10 +108,8 @@ static int mp_dacl_removexattr(FsContext *ctx,
> const char *path, const char *name)
>  {
>  int ret;
> -char *buffer;
>  
> -buffer = rpath(ctx, path);
> -ret  = lremovexattr(buffer, MAP_ACL_DEFAULT);
> +ret = local_removexattr_nofollow(ctx, MAP_ACL_DEFAULT, name);

Same error here.

>  if (ret == -1 && errno == ENODATA) {
>  /*
>   * We don't get ENODATA error when trying to remove a
> @@ -124,7 +119,6 @@ static int mp_dacl_removexattr(FsContext *ctx,
>  errno = 0;
>  ret = 0;
>  }
> -g_free(buffer);
>  return ret;
>  }
>  
> diff --git a/hw/9pfs/9p-xattr-user.c b/hw/9pfs/9p-xattr-user.c
> index 1840a5db66f3..2c90817b75a6 100644
> --- a/hw/9pfs/9p-xattr-user.c
> +++ b/hw/9pfs/9p-xattr-user.c
> @@ -81,9 +81,6 @@ static int mp_user_setxattr(FsContext *ctx, const char 
> *path, const char *name,
>  static int mp_user_removexattr(FsContext *ctx,
> const char *path, const char *name)
>  {
> -char *buffer;
> -int ret;
> -
>  if (strncmp(name, "user.virtfs.", 12) == 0) {
>  /*
>   * Don't allow fetch of user.virtfs namesapce
> @@ -92,10 +89,7 @@ static int mp_user_removexattr(FsContext *ctx,
>  errno = EACCES;
>  return -1;
>  }
> -buffer = rpath(ctx, path);
> -ret = lremovexattr(buffer, name);
> -g_free(buffer);
> -return ret;
> +return local_removexattr_nofollow(ctx, path, name);
>  }
>  
>  XattrOperations mapped_user_xattr = {
> diff --git a/hw/9pfs/9p-xattr.c b/hw/9pfs/9p-xattr.c
> index 72ef820f16d7..6fbfbca7e9a0 100644
> --- a/hw/9pfs/9p-xattr.c
> +++ b/hw/9pfs/9p-xattr.c
> @@ -333,13 +333,7 @@ int pt_setxattr(FsContext *ctx, const char *path, const 
> char *name, void *value,
>  
>  int pt_removexattr(FsContext *ctx, const char *path, const char *name)
>  {
> -char *buffer;
> -int ret;
> -
> -buffer = rpath(ctx, path);
> -ret = lremovexattr(path, name);
> -g_free(buffer);
> -return ret;
> +return local_removexattr_nofollow(ctx, path, name);
>  }
>  
>  ssize_t notsup_getxattr(FsContext *ctx, const char *path, const char *name,
> 



pgpojLAtX0mAD.pgp
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 05/21] qmp: Clean up how we enforce capability negotiation

2017-02-24 Thread Eric Blake
On 02/23/2017 03:45 PM, Markus Armbruster wrote:
> To enforce capability negotiation before normal operation,
> handle_qmp_command() inspects every command before it's handed off to
> qmp_dispatch().  This is a bit of a layering violation, and results in
> duplicated code.
> 
> Before capability negotiation (!cur_mon->in_command_mode), we fail
> commands other than "qmp_capabilities".  This is what enforces
> capability negotiation.
> 
> Afterwards, we fail command "qmp_capabilities".
> 
> Clean this up as follows.
> 
> The obvious place to fail a command is the command itself, so move the
> "afterwards" check to qmp_qmp_capabilities().
> 
> We do the "before" check in every other command, but that would be
> bothersome.  Instead, start without all the other commands, by
> registering only qmp_qmp_capabilities().  Register the others in
> qmp_qmp_capabilities().
> 
> Additionally, replace the generic human-readable error message for
> CommandNotFound by one that reminds the user to run qmp_capabilities.
> Without that, we'd regress commit 2d5a834.
> 
> Signed-off-by: Markus Armbruster 
> ---
>  monitor.c | 70 
> ---
>  1 file changed, 40 insertions(+), 30 deletions(-)
> 

> @@ -3781,12 +3782,21 @@ static void handle_qmp_command(JSONMessageParser 
> *parser, GQueue *tokens)
>  cmd_name = qdict_get_str(qdict, "execute");
>  trace_handle_qmp_command(mon, cmd_name);
>  
> -if (invalid_qmp_mode(mon, cmd_name, )) {
> -goto err_out;
> -}
> -
>  rsp = qmp_dispatch(req);
>  
> +qdict = qdict_get_qdict(qobject_to_qdict(rsp), "error");
> +if (qdict) {
> +if (!mon->qmp.in_command_mode
> +&& !strcmp(qdict_get_try_str(qdict, "class"),

Ouch - this can call strcmp(NULL, ...) if the error is ill-formed.  Is
that a problem?  Can we assert that "class" will always exist as a string?

> +   
> QapiErrorClass_lookup[ERROR_CLASS_COMMAND_NOT_FOUND])) {
> +/* Provide a more useful error message */
> +qdict_del(qdict, "desc");
> +qdict_put(qdict, "desc",
> +  qstring_from_str("Expecting capabilities negotiation"
> +   " with 'qmp_capabilities'"));
> +}
> +}
> +

Otherwise, I like it.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 09/10] sm501: Add some more missing registers

2017-02-24 Thread BALATON Zoltan

On Fri, 24 Feb 2017, BALATON Zoltan wrote:

On Fri, 24 Feb 2017, Peter Maydell wrote:

On 19 February 2017 at 16:35, BALATON Zoltan  wrote:

Write only to allow clients to initialise these without failing

Signed-off-by: BALATON Zoltan 


What's the point in write-only register values?


U-boot writes this register during setting up the device and without this it 
would abort QEMU.



What does the real hardware do here?


This register contains bits to set up FIFO parameters and memory priorities 
which we are not emulating so these can be ignored here but the hardware 
would change parameters according the value written.


Sorry, this is for the arbitration_control register. The other registers 
added in this patch are for the 2D engine which is only partially emulated 
but writing the registers is OK as long as no operation using them is 
called. (That case is handled in sm501_2d_operation.) We need to allow 
writes as these are initialised during boot but not used afterwards. Later 
when implementing more of the 2D engine we may use the written values.



If the registers are writes-ignored, there's no need to store
the data written into the state struct; if the registers are
reads-as-written then implement them that way.


Still not sure what do you mean by read-as-written because I think that's 
exactly what is done here, value stored and read back as is, except for 
video_control where there are reserved bits that are always 0.




Re: [Qemu-devel] Qemu and Changed Block Tracking

2017-02-24 Thread Eric Blake
On 02/24/2017 03:31 PM, John Snow wrote:
>>
>> But the Backup Server could instead connect to the NAS directly avoiding
>> load on the frontent LAN
>> and the Qemu Node.
>>
> 
> In a live backup I don't see how you will be removing QEMU from the data
> transfer loop. QEMU is the only process that knows what the correct view
> of the image is, and needs to facilitate.
> 
> It's not safe to copy the blocks directly without QEMU's mediation.

Although we may already have enough tools in place to help achieve that:
create a temporary qcow2 wrapper around the primary image via external
snapshot, so that the primary image is now read-only in qemu; then use
whatever block-status mechanism (whether the NBD block status extension,
or directly reading from a persistent bitmap) to facilitate whatever
more efficient offline transfer of just the relevant portions of that
main file, then live block-commit to get qemu to start writing to the
file again.

In other words, any time your algorithm wants to cause an I/O freeze to
a particular file, the solution is to add a qcow2 external snapshot
followed by a live commit.

So tweaking the proposal a few mails ago:

fsfreeze (optional)
create qcow2 snapshot wrapper as a write lock (via QMP)
fsthaw - now with no risk of violating guest timing constraints
dirtymap = find all blocks that are dirty since last backup (via named
bitmap/NBD block status)
foreach block in dirtymap {
   copy to backup via external software
}
live commit image (via QMP)

The window where guest I/O is frozen is small (the freeze/snapshot
create/thaw steps can be done in less than a second), while the window
where you are extracting incremental backup data is longer (during that
time, guest I/O is happening into a wrapper qcow2 file).

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] Qemu and Changed Block Tracking

2017-02-24 Thread John Snow


On 02/23/2017 09:27 AM, Peter Lieven wrote:
> Am 22.02.2017 um 13:32 schrieb Eric Blake:
>> On 02/22/2017 02:45 AM, Peter Lieven wrote:
 A bit outdated now, but:
 http://wiki.qemu-project.org/Features/IncrementalBackup

 and also a summary I wrote not too far back (PDF):
 https://drive.google.com/file/d/0B3CFr1TuHydWalVJaEdPaE5PbFE

 and I'm sure the Virtuozzo developers could chime in on this subject,
 but basically we do have something similar in the works, as eblake
 says.
>>> Hi John, Hi Erik,
>> It's Eric, but you're not the first to make that typo :)
>>
>>> thanks for your feedback. Are you both the ones working primary on
>>> this topic?
>>> If there is anything to review or help needed, please let me know.
>>>
>>> My 2 cents:
>>> I thing I had in mind if there is no image fleecing available, but
>>> fetching the dirty bitmap
>>> from external would be a feauture to put a write lock on a block device.
>> The whole idea is to use a dirty bitmap coupled with image fleecing,
>> where the point-in-time of the image fleecing is done at a window where
>> the guest I/O is quiescent in order to get a stable fleecing point.  We
>> already support write locks (guest quiesence) using qga to do fsfreeze.
>> You want the time that guest I/O is frozen to be as small as possible
>> (in particular, the Windows implementation of quiescence will fail if
>> you hold things frozen for more than a couple of seconds).
>>
>> Right now, the qcow2 image format does not track write generations, and
>> I don't think we plan on adding that directly into qcow2.  However, you
>> can externally simulate write generations by keeping track of how many
>> image fleecing points you have created (each fleecing point is another
>> write generation).
>>
>>
>>> In this case something like this via QMP (and external software)
>>> should work:
>>> ---8<---
>>>   gen =  write generation of last backup (or 0 for full backup)
>>>   do {
>>>   nextgen = fetch current write generation (via QMP)
>>>   dirtymap = send all block whose write generation is greater
>>> than 'gen' (via QMP)
>> No, we are NOT going to send dirty information via QMP.  Rather, we are
>> going to send it via NBD's extension NBD_CMD_BLOCK_STATUS.  The idea is
>> that a client connects and asks which qemu blocks are dirty, then uses
>> that information to read only the dirty blocks.
> 
> I understand, that for the case of local storage connecting via NBD to
> Qemu to grep a snapshot
> might be a good idea, but consider that you have a NAS for your vServer
> images. May it be NFS,
> iSCSI, CEPH or whatever. In an enterprise scenario I would generally
> except to have a NAS rather
> than local storage.
> 
> When you are going to backup your vServer (full or incremental) you
> shuffle all the traffic through
> Qemu and your Node running the vServer. In this case you run all the
> traffic over the wire twice.
> 
> NAS -> Node -> Qemu - > Backup Server
> 
> But the Backup Server could instead connect to the NAS directly avoiding
> load on the frontent LAN
> and the Qemu Node.
> 

In a live backup I don't see how you will be removing QEMU from the data
transfer loop. QEMU is the only process that knows what the correct view
of the image is, and needs to facilitate.

It's not safe to copy the blocks directly without QEMU's mediation.

--js

> I would like to find a nice solution for this scenario. If not in the
> first step it would maybe be good to
> have this in mind when implementing a dirty block tracking.
> 
> Peter



Re: [Qemu-devel] [PATCH v2 1/1] qemu-char: socket backend: disconnect on write error

2017-02-24 Thread Denis V. Lunev
On 02/24/2017 04:31 PM, Paolo Bonzini wrote:
>
> On 24/02/2017 15:46, Marc-André Lureau wrote:
 +if (ret < 0 && errno != EAGAIN) {
 +if (tcp_chr_read_poll(chr) <= 0) {
 +tcp_chr_disconnect(chr);
 +return len;

>>> This change breaks a number of assumption in vhost-user code. Until now, a
>>> vhost-user function assumed that dev->vhost_ops would remain as long as the
>>> function is running, so it may call vhost_ops callbacks several time, which
>>> may eventually fail to do io, but no crash would occur. The disconnection
>>> would be handled later with the HUP handler. Now, vhost_ops may be cleared
>>> during a write (chr_disconnect -> CHR_EVENT_CLOSED in
>>> net_vhost_user_event). This can be randomly reproduced with
>>> vhost-user-test -p /x86_64/vhost-user/flags-mismatch/subprocess
> Would it work to call tcp_chr_disconnect from an idle source (and
> destroy the source on the next connect)?
>
> Paolo
I think no, but will think more on Monday. Unfortunately
today is official holiday in Russia :(

Right now the code loops forever. May be we should notify the
guest that the problem really happens.

Den



Re: [Qemu-devel] [PATCH RFC v3 2/5] keyval: New keyval_parse()

2017-02-24 Thread Eric Blake
On 02/24/2017 03:06 PM, Eric Blake wrote:

> 
> Also for 2.9: turn on the deprecation warning for QemuOpts negative
> sugar, but leave positive sugar unchanged (it seems like positive sugar
> is probably more in use than negative sugar, other than chardev nowait).

Capturing the gist of an IRC conversation:

Right now, it appears that '-chardev nowait' is in heavy use ('git grep
nowait' shows that qemu itself recommends it in documentation, nodelay
being another one).  Right now, QemuOpts says that '-chardev
wait,nowait' results in 'wait=off', and '-chardev nowait,wait' results
in 'wait=on' (that is, last-one-wins semantics) - but in reality, there
are probably few (if any) clients that rely on this particular semantics.

If we get rid of 'noFOO' magic, we can still teach the chardev QemuOpts
to recognize optional keys for BOTH 'wait' and 'nowait'; a user that
specifies neither gets the default behavior as always; a user that
specifies only 'wait' or 'nowait' gets the requested behavior (although
now chardev has to check both key spellings rather than one), and a user
that specifies 'wait,nowait' or 'nowait,wait' causes an error, not at
QemuOpts parse time, but in the chardev initialization code.  That is,
for any boolean option where 'noFOO' magic is currently handled by
QemuOpts, we can still mostly keep back-compat by adding a new 'noFOO'
key to the object, and teaching that object to check that 'FOO' and
'noFOO' are not both selected at once.

But whether this change to QemuOpts is the right thing to do is not 2.9
material, so let's take a breather and revisit it for 2.10 (or whenever
we try to fix QemuOpts to layer on top of keyval_parse()).


-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] RFC QCOW2 compression speed

2017-02-24 Thread Eric Blake
On 02/24/2017 01:54 PM, Peter Lieven wrote:
> 
> Changing the compression level via environment or a QCOW2 parameter is not a 
> big deal. However, it helps not that much regarding speed. What helps is 
> increasing the window size, but this is an incompatible change. And if we go 
> for an incompatible change I would think about using a better compression 
> algorithm than zlib deflate.

Indeed, lzma (think xz), or zstd, or $compression_of_the_day... may be
better, and it may be disk-image dependent which algorithm is best. It
sounds like we need to improve the qcow2 spec to add a header extension
that describes the current compression algorithm and parameters; if
omitted the image uses zlib with fixed window size, but if present, we
can put everything we need to be able to choose other compression
tunings and be able to decompress (still, it has to be an all-or-none).
We'd also need to set an incompatible_features flag if the compression
header extension is present with non-default values so that older qemu
don't go corrupting the compressed data.  And then you start having to
worry about modularity (do you compile qemu with support for all
compression libraries, or can you pick and choose which ones to compile
in at the risk of which images you can't open, or do you figure out how
to dlopen the right libraries as needed...)

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH RFC v3 2/5] keyval: New keyval_parse()

2017-02-24 Thread Eric Blake
On 02/24/2017 02:18 PM, Markus Armbruster wrote:
>> I have a preference for option 1 in the long run, but as it seems to be
>> upwards compatible from option 2 for -blockdev in 2.9, I'm leaning
>> towards option 2 for this release.
> 
> Let me rename the options:
> 
> * "no sugar -blockdev": both "FOO" and "noFOO" are rejected.
> 
> * "positive sugar blockdev": "FOO" is desugared to "FOO=on", "noFOO" to
>   "noFOO=on".
> 
> Which one do you prefer for 2.9?

My current leanings:

For 2.9: keyval_parse() should have no sugar.  Both "FOO" and "noFOO"
are rejected; an '=' must be present except for an implicit key -
although if -blockdev is the only client of keyval_parse(), then it's a
tossup whether we want to use it with an implicit key.

Also for 2.9: turn on the deprecation warning for QemuOpts negative
sugar, but leave positive sugar unchanged (it seems like positive sugar
is probably more in use than negative sugar, other than chardev nowait).

That way, we still have the option of enabling positive sugar blockdev
in a later version, if we decide its useful (and after dealing with any
QemuOpts fallout), but are not locked into having to use it.


-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 10/10] ppc: Add SM501 device in config for ppc and ppcemb targets

2017-02-24 Thread BALATON Zoltan

On Fri, 24 Feb 2017, Peter Maydell wrote:

On 19 February 2017 at 16:35, BALATON Zoltan  wrote:

This is not used by default on any emulated machine yet but it is
still useful to have it compiled so it can be added from the command
line for clients that can use it (e.g. MorphOS has no driver for any
other emulated video cards but can output via SM501)

Signed-off-by: BALATON Zoltan 


If this is a generic PCI device then shouldn't it go into
default-configs/pci.mak ?


This is a multimedia chip usually found on sh4 systems and some ppc boards 
that's why I've only added it to these but I can move it to pci.mak if you 
think it's a better place for it.


Thanks again for taking the time to review, if you could answer these 
questions I can try to prepare a v2.





Re: [Qemu-devel] [PATCH 09/10] sm501: Add some more missing registers

2017-02-24 Thread BALATON Zoltan

On Fri, 24 Feb 2017, Peter Maydell wrote:

On 19 February 2017 at 16:35, BALATON Zoltan  wrote:

Write only to allow clients to initialise these without failing

Signed-off-by: BALATON Zoltan 


What's the point in write-only register values?


U-boot writes this register during setting up the device and without this 
it would abort QEMU.



What does the real hardware do here?


This register contains bits to set up FIFO parameters and memory 
priorities which we are not emulating so these can be ignored here but 
the hardware would change parameters according the value written.



If the registers are writes-ignored, there's no need to store
the data written into the state struct; if the registers are
reads-as-written then implement them that way.


I'm not sure what you get on real hardware but it's documented to be R/W 
(except reserved bits that are masked which are always 0). Why is it not 
implemented as read-as-written or what do you mean by that?




Re: [Qemu-devel] [PATCH 08/10] sm501: Add support for panel layer

2017-02-24 Thread BALATON Zoltan

On Fri, 24 Feb 2017, Peter Maydell wrote:

On 19 February 2017 at 16:35, BALATON Zoltan  wrote:

Signed-off-by: BALATON Zoltan 
---
 hw/display/sm501.c | 73 +++---
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index 1bd0303..2e1c4b7 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -2,6 +2,7 @@
  * QEMU SM501 Device
  *
  * Copyright (c) 2008 Shin-ichiro KAWASAKI
+ * Copyright (c) 2016 BALATON Zoltan
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to 
deal
@@ -41,8 +42,11 @@
  *   - Minimum implementation for Linux console : mmio regs and CRT layer.
  *   - 2D graphics acceleration partially supported : only fill rectangle.
  *
- * TODO:
+ * Status: 2016/12/04
+ *   - Misc fixes: endianness, hardware cursor
  *   - Panel support
+ *
+ * TODO:
  *   - Touch panel support
  *   - USB support
  *   - UART support
@@ -1297,53 +1301,62 @@ static inline int get_depth_index(DisplaySurface 
*surface)
 }
 }

-static void sm501_draw_crt(SM501State *s)
+static void sm501_update_display(void *opaque)
 {
+SM501State *s = (SM501State *)opaque;
 DisplaySurface *surface = qemu_console_surface(s->con);
 int y, c_x, c_y;
-uint8_t *hwc_src, *src = s->local_mem;
-int width = get_width(s, 1);
-int height = get_height(s, 1);
-int src_bpp = get_bpp(s, 1);
+int crt = (s->dc_crt_control & SM501_DC_CRT_CONTROL_SEL) ? 1 : 0;
+int width = get_width(s, crt);
+int height = get_height(s, crt);
+int src_bpp = get_bpp(s, crt);
 int dst_bpp = surface_bytes_per_pixel(surface);
-uint32_t *palette = (uint32_t *)>dc_palette[SM501_DC_CRT_PALETTE -
-   SM501_DC_PANEL_PALETTE];
-uint8_t hwc_palette[3 * 3];
-int ds_depth_index = get_depth_index(surface);
+int dst_depth_index = get_depth_index(surface);


Please don't change variable names in the middle of a patch that's
adding new functionality, it makes the patch harder to review.


Where should I do it then? Again another patch?


 draw_line_func *draw_line = NULL;
 draw_hwc_line_func *draw_hwc_line = NULL;
 int full_update = 0;
 int y_start = -1;
 ram_addr_t page_min = ~0l;
 ram_addr_t page_max = 0l;
-ram_addr_t offset = 0;
+ram_addr_t offset;
+uint32_t *palette;
+uint8_t hwc_palette[3 * 3];
+uint8_t *hwc_src;
+
+if (!((crt ? s->dc_crt_control : s->dc_panel_control)
+  & SM501_DC_CRT_CONTROL_ENABLE)) {
+return;
+}
+
+palette = (uint32_t *)(crt ? >dc_palette[SM501_DC_CRT_PALETTE -
+SM501_DC_PANEL_PALETTE]
+   : >dc_palette[0]);

 /* choose draw_line function */
 switch (src_bpp) {
 case 1:
-draw_line = draw_line8_funcs[ds_depth_index];
+draw_line = draw_line8_funcs[dst_depth_index];
 break;
 case 2:
-draw_line = draw_line16_funcs[ds_depth_index];
+draw_line = draw_line16_funcs[dst_depth_index];
 break;
 case 4:
-draw_line = draw_line32_funcs[ds_depth_index];
+draw_line = draw_line32_funcs[dst_depth_index];
 break;
 default:
-printf("sm501 draw crt : invalid DC_CRT_CONTROL=%x.\n",
-   s->dc_crt_control);
+printf("sm501 update display : invalid control register value.\n");


This shouldn't be in the same patch as "add panel" either.


I think it's related because adding panel layer makes this function not 
only specific the the CRT layer, hence the change in the error message.




Re: [Qemu-devel] [PATCH 06/10] sm501: Fix device endianness

2017-02-24 Thread BALATON Zoltan

On Fri, 24 Feb 2017, Peter Maydell wrote:

On 19 February 2017 at 16:35, BALATON Zoltan  wrote:

Signed-off-by: BALATON Zoltan 
---
 hw/display/sm501.c  |  6 +++---
 hw/display/sm501_template.h | 31 ++-
 2 files changed, 21 insertions(+), 16 deletions(-)

diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index 9091bb5..3d32a3c 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -846,7 +846,7 @@ static const MemoryRegionOps sm501_system_config_ops = {
 .min_access_size = 4,
 .max_access_size = 4,
 },
-.endianness = DEVICE_NATIVE_ENDIAN,
+.endianness = DEVICE_LITTLE_ENDIAN,
 };

 static uint32_t sm501_palette_read(void *opaque, hwaddr addr)
@@ -1082,7 +1082,7 @@ static const MemoryRegionOps sm501_disp_ctrl_ops = {
 .min_access_size = 4,
 .max_access_size = 4,
 },
-.endianness = DEVICE_NATIVE_ENDIAN,
+.endianness = DEVICE_LITTLE_ENDIAN,
 };

 static uint64_t sm501_2d_engine_read(void *opaque, hwaddr addr,
@@ -1170,7 +1170,7 @@ static const MemoryRegionOps sm501_2d_engine_ops = {
 .min_access_size = 4,
 .max_access_size = 4,
 },
-.endianness = DEVICE_NATIVE_ENDIAN,
+.endianness = DEVICE_LITTLE_ENDIAN,
 };


Does this still all work for the sh4eb (big-endian) sysbus device case?


Not sure. Is there some image somewhere I can test with?


 /* draw line functions for all console modes */
diff --git a/hw/display/sm501_template.h b/hw/display/sm501_template.h
index 832ee61..5b516d6 100644
--- a/hw/display/sm501_template.h
+++ b/hw/display/sm501_template.h
@@ -64,10 +64,16 @@ static void glue(draw_line16_, PIXEL_NAME)(
 uint8_t r, g, b;

 do {
-rgb565 = lduw_p(s);
-r = ((rgb565 >> 11) & 0x1f) << 3;
-g = ((rgb565 >>  5) & 0x3f) << 2;
-b = ((rgb565 >>  0) & 0x1f) << 3;
+rgb565 = lduw_le_p(s);
+#if defined(TARGET_WORDS_BIGENDIAN)
+r = (rgb565 >> 8) & 0xf8;
+g = (rgb565 >> 3) & 0xfc;
+b = (rgb565 << 3) & 0xf8;
+#else
+b = (rgb565 >> 8) & 0xf8;
+g = (rgb565 >> 3) & 0xfc;
+r = (rgb565 << 3) & 0xf8;
+#endif
 *(PIXEL_TYPE *)d = glue(rgb_to_pixel, PIXEL_NAME)(r, g, b);
 s += 2;
 d += BPP;
@@ -80,15 +86,14 @@ static void glue(draw_line32_, PIXEL_NAME)(
 uint8_t r, g, b;

 do {
-ldub_p(s);
 #if defined(TARGET_WORDS_BIGENDIAN)
+r = s[0];
+g = s[1];
+b = s[2];
+#else
 r = s[1];
 g = s[2];
 b = s[3];
-#else
-b = s[0];
-g = s[1];
-r = s[2];
 #endif


This looks really suspicious. Previously we had
TARGET_WORDS_BIGENDIAN -> bytes are XRGB
otherwise bytes are BGRX

Now we have
TARGET_WORDS_BIGENDIAN -> bytes are RGBX
otherwise  -> bytes are XRGB

That doesn't seem very plausible at all.


I've tested it with sh4 and ppc guests running on x86_64 host and these 
work now while previous code resulted in mixed up colors. Maybe the host 
endianness could also be a factor and the previous code assumed big endian 
host or the previous code was already broken and only worked with the 
default 8 bit depth. I'm not completely sure I understand endian handling 
in QEMU to know if this is correct besides on what I've tested but at 
least little endian and big endian guests should work on little endian 
hosts now with my patch. I can't test on big endian host.


I've used this image: https://people.debian.org/~aurel32/qemu/sh4/
with video=800x600-16 kernel parameter where changing -16 to different bit 
depths reproduces the problem.



 *(PIXEL_TYPE *)d = glue(rgb_to_pixel, PIXEL_NAME)(r, g, b);
 s += 4;




@@ -103,7 +108,7 @@ static void glue(draw_hwc_line_, PIXEL_NAME)(SM501State *s, 
int crt,
  uint8_t *palette, int c_y, uint8_t *d, int width)
 {
 int x, i;
-uint8_t *pixval, bitset = 0;
+uint8_t *pixval, r, g, b, bitset = 0;

 /* get hardware cursor pattern */
 uint32_t cursor_addr = get_hwc_address(s, crt);
@@ -129,9 +134,9 @@ static void glue(draw_hwc_line_, PIXEL_NAME)(SM501State *s, 
int crt,
 /* write pixel */
 if (v) {
 v--;
-uint8_t r = palette[v * 3 + 0];
-uint8_t g = palette[v * 3 + 1];
-uint8_t b = palette[v * 3 + 2];
+r = palette[v * 3 + 0];
+g = palette[v * 3 + 1];
+b = palette[v * 3 + 2];
 *(PIXEL_TYPE *)d = glue(rgb_to_pixel, PIXEL_NAME)(r, g, b);
 }
 d += BPP;


This doesn't seem to be related to the rest of this patch?


Should I split it off? Making every simple change another patch may double 
the size of the series but if that's preferred I can make this a separate 
patch.




Re: [Qemu-devel] [PATCH] vhost-user: delay vhost_user_stop

2017-02-24 Thread Marc-André Lureau
Hi

On Sat, Feb 25, 2017 at 12:25 AM Marc-André Lureau <
marcandre.lur...@redhat.com> wrote:

> Since commit b0a335e351103bf92f3f9d0bd5759311be8156ac, a socket write
> may trigger a disconnect events, calling vhost_user_stop() and clearing
> all the vhost_dev strutures holding data that vhost.c functions expect
> to remain valid. Delay the cleanup to keep the vhost_dev structure
> valid during the vhost.c functions.
>
> Signed-off-by: Marc-André Lureau 
> ---
>  net/vhost-user.c | 49 +++--
>  1 file changed, 43 insertions(+), 6 deletions(-)
>
> diff --git a/net/vhost-user.c b/net/vhost-user.c
> index 77b8110f8c..00573c8ac8 100644
> --- a/net/vhost-user.c
> +++ b/net/vhost-user.c
> @@ -25,6 +25,7 @@ typedef struct VhostUserState {
>  guint watch;
>  uint64_t acked_features;
>  bool started;
> +QEMUBH *chr_closed_bh;
>  } VhostUserState;
>
>  VHostNetState *vhost_user_get_vhost_net(NetClientState *nc)
> @@ -190,9 +191,40 @@ static gboolean net_vhost_user_watch(GIOChannel
> *chan, GIOCondition cond,
>
>  qemu_chr_fe_disconnect(>chr);
>
> +s->watch = 0;
>  return FALSE;
>  }
>
> +static void chr_closed_bh(void *opaque)
> +{
> +const char *name = opaque;
> +NetClientState *ncs[MAX_QUEUE_NUM];
> +VhostUserState *s;
> +Error *err = NULL;
> +int queues;
> +
> +queues = qemu_find_net_clients_except(name, ncs,
> +  NET_CLIENT_DRIVER_NIC,
> +  MAX_QUEUE_NUM);
> +assert(queues < MAX_QUEUE_NUM);
> +
> +s = DO_UPCAST(VhostUserState, nc, ncs[0]);
> +
> +qmp_set_link(name, false, );
> +vhost_user_stop(queues, ncs);
> +if (s->watch) {
> +g_source_remove(s->watch);
> +}
> +s->watch = 0;
> +
> +qemu_bh_delete(s->chr_closed_bh);
> +s->chr_closed_bh = NULL;
> +
> +if (err) {
> +error_report_err(err);
> +}
> +}
> +
>  static void net_vhost_user_event(void *opaque, int event)
>  {
>  const char *name = opaque;
> @@ -212,20 +244,25 @@ static void net_vhost_user_event(void *opaque, int
> event)
>  trace_vhost_user_event(chr->label, event);
>  switch (event) {
>  case CHR_EVENT_OPENED:
> -s->watch = qemu_chr_fe_add_watch(>chr, G_IO_HUP,
> - net_vhost_user_watch, s);
>  if (vhost_user_start(queues, ncs, >chr) < 0) {
>  qemu_chr_fe_disconnect(>chr);
>  return;
>  }
> +s->watch = qemu_chr_fe_add_watch(>chr, G_IO_HUP,
> + net_vhost_user_watch, s);
>  qmp_set_link(name, true, );
> +s->chr_closed_bh = qemu_bh_new(chr_closed_bh, opaque);
>  s->started = true;
>  break;
>  case CHR_EVENT_CLOSED:
> -qmp_set_link(name, false, );
> -vhost_user_stop(queues, ncs);
> -g_source_remove(s->watch);
> -s->watch = 0;
> +/* a close event may happen during a read/write, but vhost
> + * code assumes the vhost_dev remains setup, so delay the
> + * stop & clear to idle.
> + * FIXME: better handle failure in vhost code, remove bh
> + */
> +if (s->chr_closed_bh) {
> +qemu_bh_schedule(s->chr_closed_bh);
>

And this is also racy if it gets a CHR_EVENT_OPENED before the bh is run.
/me not sure we want to go there.


> +}
>  break;
>  }
>
> --
> 2.12.0.rc2.3.gc93709801
>
>
> --
Marc-André Lureau


Re: [Qemu-devel] [PATCH 04/10] sm501: Add emulation of chip connected via PCI

2017-02-24 Thread BALATON Zoltan

On Fri, 24 Feb 2017, Peter Maydell wrote:

On 19 February 2017 at 16:35, BALATON Zoltan  wrote:

Only the display controller part is created automatically on PCI

Signed-off-by: BALATON Zoltan 
---
 hw/display/sm501.c  | 58 +
 hw/display/sm501_template.h |  8 +++
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index b592022..e966896 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -31,6 +31,7 @@
 #include "ui/console.h"
 #include "hw/devices.h"
 #include "hw/sysbus.h"
+#include "hw/pci/pci.h"
 #include "qemu/range.h"
 #include "ui/pixel_ops.h"
 #include "exec/address-spaces.h"
@@ -460,7 +461,6 @@ typedef struct SM501State {
 QemuConsole *con;

 /* status & internal resources */
-hwaddr base;
 uint32_t local_mem_size_index;
 uint8_t *local_mem;
 MemoryRegion local_mem_region;
@@ -1397,12 +1397,11 @@ static const GraphicHwOps sm501_ops = {
 .gfx_update  = sm501_update_display,
 };

-static void sm501_init(SM501State *s, DeviceState *dev, uint32_t base,
+static void sm501_init(SM501State *s, DeviceState *dev,
uint32_t local_mem_bytes)
 {
 MemoryRegion *mr;

-s->base = base;
 s->local_mem_size_index = get_local_mem_size_index(local_mem_bytes);
 SM501_DPRINTF("sm501 local mem size=%x. index=%d\n", get_local_mem_size(s),
   s->local_mem_size_index);
@@ -1457,7 +1456,7 @@ static void sm501_realize_sysbus(DeviceState *dev, Error 
**errp)
 SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
 DeviceState *usb_dev;

-sm501_init(>state, dev, s->base, s->vram_size);
+sm501_init(>state, dev, s->vram_size);
 sysbus_init_mmio(sbd, >state.local_mem_region);
 sysbus_init_mmio(sbd, >state.mmio_region);

@@ -1505,9 +1504,60 @@ static const TypeInfo sm501_sysbus_info = {
 .class_init= sm501_sysbus_class_init,
 };

+#define TYPE_PCI_SM501 "sm501"
+#define PCI_SM501(obj) OBJECT_CHECK(SM501PCIState, (obj), TYPE_PCI_SM501)
+
+typedef struct {
+/*< private >*/
+PCIDevice parent_obj;
+/*< public >*/
+SM501State state;
+uint32_t vram_size;
+} SM501PCIState;
+
+static void sm501_realize_pci(PCIDevice *dev, Error **errp)
+{
+SM501PCIState *s = PCI_SM501(dev);
+
+sm501_init(>state, DEVICE(dev), s->vram_size);
+pci_register_bar(dev, 0, PCI_BASE_ADDRESS_SPACE_MEMORY,
+ >state.local_mem_region);
+pci_register_bar(dev, 1, PCI_BASE_ADDRESS_SPACE_MEMORY,
+ >state.mmio_region);
+}
+
+static Property sm501_pci_properties[] = {
+DEFINE_PROP_UINT32("vram-size", SM501PCIState, vram_size,
+   64 * 1024 * 1024),
+DEFINE_PROP_END_OF_LIST(),
+};
+
+static void sm501_pci_class_init(ObjectClass *klass, void *data)
+{
+DeviceClass *dc = DEVICE_CLASS(klass);
+PCIDeviceClass *k = PCI_DEVICE_CLASS(klass);
+
+k->realize = sm501_realize_pci;
+k->vendor_id = 0x126f;
+k->device_id = 0x0501;
+k->class_id = PCI_CLASS_DISPLAY_OTHER;
+set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
+dc->desc = "SM501 Display Controller";
+dc->props = sm501_pci_properties;
+dc->hotpluggable = false;


This needs a reset function and a vmsd as well.


+}
+
+static const TypeInfo sm501_pci_info = {
+.name  = TYPE_PCI_SM501,
+.parent= TYPE_PCI_DEVICE,
+.instance_size = sizeof(SM501PCIState),
+.class_init= sm501_pci_class_init,
+};
+
 static void sm501_register_types(void)
 {
 type_register_static(_sysbus_info);
+type_register_static(_pci_info);
 }

 type_init(sm501_register_types)
diff --git a/hw/display/sm501_template.h b/hw/display/sm501_template.h
index 16e500b..832ee61 100644
--- a/hw/display/sm501_template.h
+++ b/hw/display/sm501_template.h
@@ -103,13 +103,13 @@ static void glue(draw_hwc_line_, PIXEL_NAME)(SM501State 
*s, int crt,
  uint8_t *palette, int c_y, uint8_t *d, int width)
 {
 int x, i;
-uint8_t bitset = 0;
+uint8_t *pixval, bitset = 0;

 /* get hardware cursor pattern */
 uint32_t cursor_addr = get_hwc_address(s, crt);
 assert(0 <= c_y && c_y < SM501_HWC_HEIGHT);
 cursor_addr += SM501_HWC_WIDTH * c_y / 4;  /* 4 pixels per byte */
-cursor_addr += s->base;
+pixval = s->local_mem + cursor_addr;

 /* get cursor position */
 x = get_hwc_x(s, crt);
@@ -120,8 +120,8 @@ static void glue(draw_hwc_line_, PIXEL_NAME)(SM501State *s, 
int crt,

 /* get pixel value */
 if (i % 4 == 0) {
-bitset = ldub_phys(_space_memory, cursor_addr);
-cursor_addr++;
+bitset = ldub_p(pixval);
+pixval++;
 }
 v = bitset & 3;
 bitset >>= 2;


Dropping the requirement for a base addr is not really the same
change as adding the PCI device.


This is needed for PCI device because the base is not accessible 

Re: [Qemu-devel] [PATCH 03/10] sm501: QOMify

2017-02-24 Thread BALATON Zoltan

On Fri, 24 Feb 2017, Peter Maydell wrote:

On 19 February 2017 at 16:35, BALATON Zoltan  wrote:

Signed-off-by: BALATON Zoltan 
---
 hw/display/sm501.c   | 133 +++
 hw/sh4/r2d.c |  11 -
 include/hw/devices.h |   5 --
 3 files changed, 101 insertions(+), 48 deletions(-)

diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index 4eb085c..b592022 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -58,8 +58,8 @@
 #define SM501_DPRINTF(fmt, ...) do {} while (0)
 #endif

-
 #define MMIO_BASE_OFFSET 0x3e0
+#define MMIO_SIZE 0x20

 /* SM501 register definitions taken from "linux/include/linux/sm501-regs.h" */

@@ -464,6 +464,7 @@ typedef struct SM501State {
 uint32_t local_mem_size_index;
 uint8_t *local_mem;
 MemoryRegion local_mem_region;
+MemoryRegion mmio_region;
 uint32_t last_width;
 uint32_t last_height;

@@ -1396,20 +1397,14 @@ static const GraphicHwOps sm501_ops = {
 .gfx_update  = sm501_update_display,
 };

-void sm501_init(MemoryRegion *address_space_mem, uint32_t base,
-uint32_t local_mem_bytes, qemu_irq irq, Chardev *chr)
+static void sm501_init(SM501State *s, DeviceState *dev, uint32_t base,
+   uint32_t local_mem_bytes)
 {
-SM501State *s;
-DeviceState *dev;
-MemoryRegion *sm501_system_config = g_new(MemoryRegion, 1);
-MemoryRegion *sm501_disp_ctrl = g_new(MemoryRegion, 1);
-MemoryRegion *sm501_2d_engine = g_new(MemoryRegion, 1);
-
-/* allocate management data region */
-s = g_new0(SM501State, 1);
+MemoryRegion *mr;
+
 s->base = base;
 s->local_mem_size_index = get_local_mem_size_index(local_mem_bytes);
-SM501_DPRINTF("local mem size=%x. index=%d\n", get_local_mem_size(s),
+SM501_DPRINTF("sm501 local mem size=%x. index=%d\n", get_local_mem_size(s),
   s->local_mem_size_index);
 s->system_control = 0x0010; /* 2D engine FIFO empty */
 s->misc_control = SM501_MISC_IRQ_INVERT; /* assumes SH, active=low */
@@ -1417,46 +1412,102 @@ void sm501_init(MemoryRegion *address_space_mem, 
uint32_t base,
 s->dc_crt_control = 0x0001;

 /* allocate local memory */
-memory_region_init_ram(>local_mem_region, NULL, "sm501.local",
+memory_region_init_ram(>local_mem_region, OBJECT(dev), "sm501.local",
local_mem_bytes, _fatal);
 vmstate_register_ram_global(>local_mem_region);
 memory_region_set_log(>local_mem_region, true, DIRTY_MEMORY_VGA);
 s->local_mem = memory_region_get_ram_ptr(>local_mem_region);
-memory_region_add_subregion(address_space_mem, base, >local_mem_region);
-
-/* map mmio */
-memory_region_init_io(sm501_system_config, NULL, _system_config_ops,
-  s, "sm501-system-config", 0x6c);
-memory_region_add_subregion(address_space_mem, base + MMIO_BASE_OFFSET,
-sm501_system_config);
-memory_region_init_io(sm501_disp_ctrl, NULL, _disp_ctrl_ops, s,
+
+/* allocate mmio */
+memory_region_init(>mmio_region, OBJECT(dev), "sm501.mmio", MMIO_SIZE);
+mr = g_new(MemoryRegion, 1);


There's no need to dynamically allocate any of these memory regions;
just make them be MemoryRegion fields inside SM501State.


+memory_region_init_io(mr, OBJECT(dev), _system_config_ops, s,
+  "sm501-system-config", 0x6c);
+memory_region_add_subregion(>mmio_region, SM501_SYS_CONFIG, mr);
+mr = g_new(MemoryRegion, 1);
+memory_region_init_io(mr, OBJECT(dev), _disp_ctrl_ops, s,
   "sm501-disp-ctrl", 0x1000);
-memory_region_add_subregion(address_space_mem,
-base + MMIO_BASE_OFFSET + SM501_DC,
-sm501_disp_ctrl);
-memory_region_init_io(sm501_2d_engine, NULL, _2d_engine_ops, s,
+memory_region_add_subregion(>mmio_region, SM501_DC, mr);
+mr = g_new(MemoryRegion, 1);
+memory_region_init_io(mr, OBJECT(dev), _2d_engine_ops, s,
   "sm501-2d-engine", 0x54);
-memory_region_add_subregion(address_space_mem,
-base + MMIO_BASE_OFFSET + SM501_2D_ENGINE,
-sm501_2d_engine);
+memory_region_add_subregion(>mmio_region, SM501_2D_ENGINE, mr);
+
+/* create qemu graphic console */
+s->con = graphic_console_init(DEVICE(dev), 0, _ops, s);
+}
+
+#define TYPE_SYSBUS_SM501 "sysbus-sm501"
+#define SYSBUS_SM501(obj) \
+OBJECT_CHECK(SM501SysBusState, (obj), TYPE_SYSBUS_SM501)
+
+typedef struct {
+/*< private >*/
+SysBusDevice parent_obj;
+/*< public >*/
+SM501State state;
+uint32_t vram_size;
+uint32_t base;
+void *chr_state;
+} SM501SysBusState;
+
+static void sm501_realize_sysbus(DeviceState *dev, Error **errp)
+{
+SM501SysBusState *s = SYSBUS_SM501(dev);
+SysBusDevice *sbd = 

[Qemu-devel] [PATCH] vhost-user: delay vhost_user_stop

2017-02-24 Thread Marc-André Lureau
Since commit b0a335e351103bf92f3f9d0bd5759311be8156ac, a socket write
may trigger a disconnect events, calling vhost_user_stop() and clearing
all the vhost_dev strutures holding data that vhost.c functions expect
to remain valid. Delay the cleanup to keep the vhost_dev structure
valid during the vhost.c functions.

Signed-off-by: Marc-André Lureau 
---
 net/vhost-user.c | 49 +++--
 1 file changed, 43 insertions(+), 6 deletions(-)

diff --git a/net/vhost-user.c b/net/vhost-user.c
index 77b8110f8c..00573c8ac8 100644
--- a/net/vhost-user.c
+++ b/net/vhost-user.c
@@ -25,6 +25,7 @@ typedef struct VhostUserState {
 guint watch;
 uint64_t acked_features;
 bool started;
+QEMUBH *chr_closed_bh;
 } VhostUserState;
 
 VHostNetState *vhost_user_get_vhost_net(NetClientState *nc)
@@ -190,9 +191,40 @@ static gboolean net_vhost_user_watch(GIOChannel *chan, 
GIOCondition cond,
 
 qemu_chr_fe_disconnect(>chr);
 
+s->watch = 0;
 return FALSE;
 }
 
+static void chr_closed_bh(void *opaque)
+{
+const char *name = opaque;
+NetClientState *ncs[MAX_QUEUE_NUM];
+VhostUserState *s;
+Error *err = NULL;
+int queues;
+
+queues = qemu_find_net_clients_except(name, ncs,
+  NET_CLIENT_DRIVER_NIC,
+  MAX_QUEUE_NUM);
+assert(queues < MAX_QUEUE_NUM);
+
+s = DO_UPCAST(VhostUserState, nc, ncs[0]);
+
+qmp_set_link(name, false, );
+vhost_user_stop(queues, ncs);
+if (s->watch) {
+g_source_remove(s->watch);
+}
+s->watch = 0;
+
+qemu_bh_delete(s->chr_closed_bh);
+s->chr_closed_bh = NULL;
+
+if (err) {
+error_report_err(err);
+}
+}
+
 static void net_vhost_user_event(void *opaque, int event)
 {
 const char *name = opaque;
@@ -212,20 +244,25 @@ static void net_vhost_user_event(void *opaque, int event)
 trace_vhost_user_event(chr->label, event);
 switch (event) {
 case CHR_EVENT_OPENED:
-s->watch = qemu_chr_fe_add_watch(>chr, G_IO_HUP,
- net_vhost_user_watch, s);
 if (vhost_user_start(queues, ncs, >chr) < 0) {
 qemu_chr_fe_disconnect(>chr);
 return;
 }
+s->watch = qemu_chr_fe_add_watch(>chr, G_IO_HUP,
+ net_vhost_user_watch, s);
 qmp_set_link(name, true, );
+s->chr_closed_bh = qemu_bh_new(chr_closed_bh, opaque);
 s->started = true;
 break;
 case CHR_EVENT_CLOSED:
-qmp_set_link(name, false, );
-vhost_user_stop(queues, ncs);
-g_source_remove(s->watch);
-s->watch = 0;
+/* a close event may happen during a read/write, but vhost
+ * code assumes the vhost_dev remains setup, so delay the
+ * stop & clear to idle.
+ * FIXME: better handle failure in vhost code, remove bh
+ */
+if (s->chr_closed_bh) {
+qemu_bh_schedule(s->chr_closed_bh);
+}
 break;
 }
 
-- 
2.12.0.rc2.3.gc93709801




Re: [Qemu-devel] [PATCH 04/21] qmp: Dumb down how we run QMP command registration

2017-02-24 Thread Markus Armbruster
Eric Blake  writes:

> On 02/23/2017 03:45 PM, Markus Armbruster wrote:
>> The way we get QMP commands registered is high tech:
>> 
>> * qapi-commands.py generates qmp_init_marshal() that does the actual work
>> 
>> * it also generates the magic to register it as a MODULE_INIT_QAPI
>>   function, so it runs when someone calls
>>   module_call_init(MODULE_INIT_QAPI)
>> 
>> * main() calls module_call_init()
>> 
>> QEMU needs to register a few non-qapified commands.  Same high tech
>> works: monitor.c has its own qmp_init_marshal() along with the magic
>> to make it run in module_call_init(MODULE_INIT_QAPI).
>
> Eww. Two static functions with the same name, which really messes with
> gdb debugging.
>
>> 
>> QEMU also needs to unregister commands that are not wanted in this
>> build's configuration (commit 5032a16).  Simple enough:
>> qmp_unregister_commands_hack().  The difficulty is to make it run
>> after the generated qmp_init_marshal().  We can't simply run it in
>> monitor.c's qmp_init_marshal(), because the order in which the
>> registered functions run is indeterminate.  So qmp_init_marshal()
>> registers qmp_init_marshal() separately.  Since registering *appends*
>
> Another case of "A sets up A" when you meant "A sets up B", but this
> time, I'm fairly certain you mean qmp_init_marshal() registers
> qmp_unregister_commands_hack() separately.

Correct.  I'll fix it.

>> to the list of registered functions, this will make it run after all
>> the functions that have been registered already.
>> 
>> I suspect it takes a long and expensive computer science education to
>> not find this silly.
>
> ROFL
> (does that mean I didn't spend enough on my education? I'm sure you
> could arrange to sell me an online certificate if I wanted more... :)

Plenty of peddlers out there!

>> Dumb it down as follows:
>> 
>> * Drop MODULE_INIT_QAPI entirely
>> 
>> * Give the generated qmp_init_marshal() external linkage.
>> 
>> * Call it instead of module_call_init(MODULE_INIT_QAPI)
>> 
>> * Except in QEMU proper, call new monitor_init_qmp_commands() that in
>>   turn calls the generated qmp_init_marshal(), registers the
>>   additional commands and unregisters the unwanted ones.
>> 
>> Signed-off-by: Markus Armbruster 
>> ---
>
>> -static void qmp_init_marshal(void)
>> +void monitor_init_qmp_commands(void)
>>  {
>> +qmp_init_marshal();
>> +
>>  qmp_register_command("query-qmp-schema", qmp_query_qmp_schema,
>>   QCO_NO_OPTIONS);
>>  qmp_register_command("device_add", qmp_device_add,
>> @@ -1004,12 +1006,9 @@ static void qmp_init_marshal(void)
>>  qmp_register_command("netdev_add", qmp_netdev_add,
>>   QCO_NO_OPTIONS);
>>  
>> -/* call it after the rest of qapi_init() */
>> -register_module_init(qmp_unregister_commands_hack, MODULE_INIT_QAPI);
>> +qmp_unregister_commands_hack();
>
> We could inline this function now, but keeping the _hack() name around
> reminds us to consider conditional-compilation support in the generator
> at a later date, so I'm fine with the approach you took.

Exactly.

> Reviewed-by: Eric Blake 

Thanks!



Re: [Qemu-devel] [PATCH 02/10] sm501: Use defines instead of constants where available

2017-02-24 Thread BALATON Zoltan

On Fri, 24 Feb 2017, Peter Maydell wrote:

On 19 February 2017 at 16:35, BALATON Zoltan  wrote:

Signed-off-by: BALATON Zoltan 
---
 hw/display/sm501.c  | 8 
 hw/display/sm501_template.h | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/hw/display/sm501.c b/hw/display/sm501.c
index 4f40dee..4eb085c 100644
--- a/hw/display/sm501.c
+++ b/hw/display/sm501.c
@@ -555,7 +555,7 @@ static uint32_t get_local_mem_size_index(uint32_t size)
 static inline int is_hwc_enabled(SM501State *state, int crt)
 {
 uint32_t addr = crt ? state->dc_crt_hwc_addr : state->dc_panel_hwc_addr;
-return addr & 0x8000;
+return addr & SM501_HWC_EN;
 }

 /**
@@ -1411,9 +1411,9 @@ void sm501_init(MemoryRegion *address_space_mem, uint32_t 
base,
 s->local_mem_size_index = get_local_mem_size_index(local_mem_bytes);
 SM501_DPRINTF("local mem size=%x. index=%d\n", get_local_mem_size(s),
   s->local_mem_size_index);
-s->system_control = 0x0010;
-s->misc_control = 0x1000; /* assumes SH, active=low */
-s->dc_panel_control = 0x0001;
+s->system_control = 0x0010; /* 2D engine FIFO empty */
+s->misc_control = SM501_MISC_IRQ_INVERT; /* assumes SH, active=low */
+s->dc_panel_control = 0x0001; /* FIFO level 3 */


s->misc_control was set to 0x1000, but is now set to
SM501_MISC_IRQ_INVERT, which is (1 << 16), which is not the
same value... 0x1000 would be SM501_MISC_DAC_POWER.


Thanks for reviewing these patches.

The comment suggests that the intended bit was the IRQ invert bit so the 
constant used before may have been a bug. (In my understanding nothing 
really uses this bit so I could set it either way you prefer but I think 
this is the correct one.) What should I do?

1. Leave it as it is
2. Set it to the old constant contradicting the comment
3. Make it a separate patch?



Re: [Qemu-devel] [PATCH RFC v3 2/5] keyval: New keyval_parse()

2017-02-24 Thread Markus Armbruster
Eric Blake  writes:

> On 02/24/2017 01:06 PM, Markus Armbruster wrote:
>
>> 
>> The value of an implied key is not subject to key desugaring.
>
> And that's a good thing.
>
>> 
>> Without an implied key, the "node" would desugar to "de=off".
>> 
>>> How about:
>>>
>>> for 2.9: -blockdev has no magic at all (you HAVE to spell 'foo=off'
>>> rather than 'foo' for any boolean parameters that you want disabled in
>>> -blockdev);
>> 
>> You mean "rather than 'nofoo'".
>
> Yes.  Thanks for catching the intent.
>
>> 
>> "No magic at all" implies you also have to spell "foo=on" rather than
>> "foo".
>
> I'm a little more lenient for allowing 'foo' meaning 'foo=on' as a
> shortcut (although making it force a QBool rather than a string "on" may
> be nicer).

No objection.

> I'm also okay if -blockdev has no sugar at all in 2.9 (we
> can add "foo" => "foo=on" sugar later if it makes QemuOpts conversion
> easier in 2.10; that's much more palatable than adding "nofoo" =>
> "foo=off" sugar).
>
>> I believe we need to have both or none in 2.9.  If we have only the "on"
>> sugar, "no-flush" works, and we can't add the "off" sugar without
>> breaking it.
>
> I think the "off" sugar should die, if we can convince ourselves that no
> one is using it that can't catch up to modern usage.

Yes, but we should be careful not to create problems now in case we
can't.

>>> -blockdev); and in QemuOpts, we issue deprecation warnings but keep
>>> existing behavior any time someone uses 'noFOO' that we turn into
>>> 'FOO=no'.  Then, for 2.10, we can decide to remove the deprecation
>>> warnings if they pointed out real (ab)use of the sugar in the wild, or
>>> (hopefully) to kill the sugar entirely (as part of converting QemuOpts
>>> over to keyval_parse).
>>>
>>> In short, getting rid of the 'no' prefix magic, after suitable
>>> deprecation warnings, seems like a good plan to me; and having -blockdev
>>> be slightly tighter than the rest of command line options for 2.9 only
>>> is no real loss (no one uses -blockdev yet, so they can be written to
>>> avoid the use of 'no' magic to begin with).
>> 
>> Okay, I'll post QemuOpts patch to deprecate noKEY sugar.  Let's see
>> whether anyone screams.
>
> Summarizing to make sure we're on the same track:
>
> 2.9:
> QemuOpts:
>   "noFOO" => "FOO=off" sugar - deprecation warning
>   "FOO" => "FOO=on" sugar - silently stays the same

Yes, if we can get consensus.

> -blockdev option 1:
>   "FOO" => error, no '=' present (we haven't decided on sugaring this yet)
>   "noFOO" => error, no '=' present (no off sugar, but also avoids
> "noFOO" => "noFOO=on" sugar)

Yes.

> -blockdev option 2:
>   "FOO" => "FOO=on" - enabled boolean sugar remains compact, if it is
> not ambiguous, but anything we can do to limit it to just boolean values
> rather than the string "on" might be nice
>   "noFOO" => "noFOO=on" - different behavior than QemuOpts, but less magic

No, because that would create problems if we can't get rid of noFOO
sugar elsewhere.

> 2.10:
> QemuOpts will be implemented on keyval_parse(), behaving the same as
> -blockdev.

Not sure I can complete the job in 2.10, but the text works just as well
for a later version.

>If we took option 1 in 2.9, we have:
>   "noFOO" => "noFOO=on" (which probably triggers an error that "noFOO"
> is not an option when strict parsing is in effect) (the deprecation
> period is over, so we changed command-line behavior, but only for people
> that didn't pay attention to the warnings in 2.9)

Yes.

>   "FOO" => "FOO=on" - keep this one to minimize back-compat breakage

Yes.

> If we took option 2 in 2.9, we can still turn on "FOO" => "FOO=on" sugar
> to get back to option 1 in 2.10
>
> or we can stick to option 2 forevermore:
>   "FOO" => error, no '=' present. No longer possible to specify booleans
> without the more verbose "FOO=on"

Now I'm getting confused.  Isn't that "-blockdev option 1"?

> I have a preference for option 1 in the long run, but as it seems to be
> upwards compatible from option 2 for -blockdev in 2.9, I'm leaning
> towards option 2 for this release.

Let me rename the options:

* "no sugar -blockdev": both "FOO" and "noFOO" are rejected.

* "positive sugar blockdev": "FOO" is desugared to "FOO=on", "noFOO" to
  "noFOO=on".

Which one do you prefer for 2.9?



Re: [Qemu-devel] [PATCH] qemu-img: make convert async

2017-02-24 Thread Peter Lieven

> Am 24.02.2017 um 16:02 schrieb Kevin Wolf :
> 
> Am 21.02.2017 um 13:29 hat Peter Lieven geschrieben:
>> the convert process is currently completely implemented with sync operations.
>> That means it reads one buffer and then writes it. No parallelism and each 
>> sync
>> request takes as long as it takes until it is completed.
>> 
>> This can be a big performance hit when the convert process reads and writes
>> to devices which do not benefit from kernel readahead or pagecache.
>> In our environment we heavily have the following two use cases when using
>> qemu-img convert.
>> 
>> a) reading from NFS and writing to iSCSI for deploying templates
>> b) reading from iSCSI and writing to NFS for backups
>> 
>> In both processes we use libiscsi and libnfs so we have no kernel cache.
>> 
>> This patch changes the convert process to work with parallel running 
>> coroutines
>> which can significantly improve performance for network storage devices:
>> 
>> qemu-img (master)
>> nfs -> iscsi 22.8 secs
>> nfs -> ram   11.7 secs
>> ram -> iscsi 12.3 secs
>> 
>> qemu-img-async (8 coroutines, in-order write disabled)
>> nfs -> iscsi 11.0 secs
>> nfs -> ram   10.4 secs
>> ram -> iscsi  9.0 secs
>> 
>> This patches introduces 2 new cmdline parameters. The -m parameter to specify
>> the number of coroutines running in parallel (defaults to 8). And the -W 
>> paremeter to
>> allow qemu-img to write to the target out of order rather than sequential. 
>> This improves
>> performance as the writes do not have to wait for each other to complete.
>> 
>> Signed-off-by: Peter Lieven 
> 
>> @@ -1563,9 +1581,13 @@ static int convert_read(ImgConvertState *s, int64_t 
>> sector_num, int nb_sectors,
>> blk = s->src[s->src_cur];
>> bs_sectors = s->src_sectors[s->src_cur];
>> 
>> n = MIN(nb_sectors, bs_sectors - (sector_num - s->src_cur_offset));
> 
> convert_co_read() uses global state here (s->src, s->src_cur,
> s->src_cur_offset) while not running under a lock. Are you sure that
> this is correct when some coroutines are operating on the first source
> image, but others are already working on the second one?

You are right that could cause problems. I have to change that to use local 
variables.

> 
> The same variables are used in convert_iteration_sectors(), but I think
> there it's okay because we're just starting a new request and are
> holding s->lock.
> 
>> @@ -1651,12 +1685,122 @@ static int convert_write(ImgConvertState *s, 
>> int64_t sector_num, int nb_sectors,
>> return 0;
>> }
>> 
>> -static int convert_do_copy(ImgConvertState *s)
>> +static void coroutine_fn convert_co_do_copy(void *opaque)
>> {
>> +ImgConvertState *s = opaque;
>> uint8_t *buf = NULL;
>> -int64_t sector_num, allocated_done;
>> -int ret;
>> -int n;
>> +int ret, i;
>> +int index = -1;
>> +
>> +for (i = 0; i < s->num_coroutines; i++) {
>> +if (s->co[i] == qemu_coroutine_self()) {
>> +index = i;
>> +break;
>> +}
>> +}
>> +assert(index >= 0);
>> +
>> +s->running_coroutines++;
>> +buf = blk_blockalign(s->target, s->buf_sectors * BDRV_SECTOR_SIZE);
>> +
>> +while (1) {
>> +int n;
>> +int64_t sector_num;
>> +enum ImgConvertBlockStatus status;
>> +
>> +qemu_co_mutex_lock(>lock);
>> +if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) {
>> +qemu_co_mutex_unlock(>lock);
>> +goto out;
>> +}
>> +n = convert_iteration_sectors(s, s->sector_num);
>> +if (n < 0) {
>> +qemu_co_mutex_unlock(>lock);
>> +s->ret = n;
>> +goto out;
>> +}
>> +/* safe current sector and allocation status to local variables */
> 
> s/safe/save/
> 
>> +sector_num = s->sector_num;
>> +status = s->status;
>> +if (!s->min_sparse && s->status == BLK_ZERO) {
>> +n = MIN(n, s->buf_sectors);
>> +}
>> +/* increment global sector counter so that other coroutines can
>> + * already continue reading beyond this request */
>> +s->sector_num += n;
>> +qemu_co_mutex_unlock(>lock);
> 
> Here we unlock, so after this point we can only access globally valid
> values from s, but not request-specific ones like s->status or
> s->sector_num...
> 
>> +if (status == BLK_DATA || (!s->min_sparse && s->status == 
>> BLK_ZERO)) {
> 
> ...but s->status is used here...
> 
>> +s->allocated_done += n;
>> +qemu_progress_print(100.0 * s->allocated_done /
>> +s->allocated_sectors, 0);
>> +}
>> +
>> +if (status == BLK_DATA) {
>> +ret = convert_co_read(s, sector_num, n, buf);
>> +if (ret < 0) {
>> +error_report("error while reading sector %" PRId64
>> + ": %s", sector_num, strerror(-ret));
>> + 

Re: [Qemu-devel] make check: crash in vhost-user chardev cleanup

2017-02-24 Thread Dr. David Alan Gilbert
* Marc-André Lureau (marcandre.lur...@gmail.com) wrote:
> Hi
> 
> On Fri, Feb 24, 2017 at 11:54 PM Dr. David Alan Gilbert 
> wrote:
> 
> > Hi,
> >   I'm seeing a hang+crash doing a make check that
> > looks like it's coming from vhost-user's chardev cleanup,
> > so I'm suspecting your e0b283e7c.
> > The synptom is that the make check hangs in the qtest
> > part, and a dmesg shows that a qemu crashed.
> >
> 
> See that thread:
> https://lists.nongnu.org/archive/html/qemu-devel/2017-02/msg06015.html

Ah OK, as long as it's known about.  It's a reliable crash in my main build
world on my laptop.

Dave

> 
> > I got a backtrace that shows:
> >
> > (gdb) where
> > #0  0x560d3344f03e in qemu_chr_fe_read_all (be=be@entry=0x0,
> > buf=buf@entry=0x7ffea7a2f520 "\v", len=len@entry=12) at
> > /home/dgilbert/git/qemu/chardev/char.c:209
> > #1  0x560d33199d64 in vhost_user_read (msg=msg@entry=0x7ffea7a2f520,
> > dev=)
> > at /home/dgilbert/git/qemu/hw/virtio/vhost-user.c:125
> > #2  0x560d3319a11f in vhost_user_get_vring_base (dev=0x560d35a614a0,
> > ring=0x7ffea7a2f670)
> > at /home/dgilbert/git/qemu/hw/virtio/vhost-user.c:417
> > #3  0x560d33196abb in vhost_virtqueue_stop 
> > (dev=dev@entry=0x560d35a614a0,
> > vdev=vdev@entry=0x560d3729cc00, vq=0x560d35a61678, idx=1) at
> > /home/dgilbert/git/qemu/hw/virtio/vhost.c:1070
> > #4  0x560d33198cd4 in vhost_dev_stop (hdev=hdev@entry=0x560d35a614a0,
> > vdev=vdev@entry=0x560d3729cc00)
> > at /home/dgilbert/git/qemu/hw/virtio/vhost.c:1533
> > #5  0x560d3317bfd8 in vhost_net_stop_one (net=0x560d35a614a0,
> > dev=0x560d3729cc00)
> > at /home/dgilbert/git/qemu/hw/net/vhost_net.c:288
> > #6  0x560d3317c4aa in vhost_net_stop (dev=dev@entry=0x560d3729cc00,
> > ncs=, total_queues=total_queues@entry=1) at
> > /home/dgilbert/git/qemu/hw/net/vhost_net.c:367
> > #7  0x560d33179201 in virtio_net_vhost_status (status=,
> > n=0x560d3729cc00)
> > at /home/dgilbert/git/qemu/hw/net/virtio-net.c:176
> > #8  0x560d33179201 in virtio_net_set_status (vdev=0x560d3729cc00,
> > status=7 '\a')
> > at /home/dgilbert/git/qemu/hw/net/virtio-net.c:250
> > #9  0x560d3339ce99 in qemu_del_net_client (nc=0x560d35a61250) at
> > /home/dgilbert/git/qemu/net/net.c:390
> > #10 0x560d3339e30d in net_cleanup () at
> > /home/dgilbert/git/qemu/net/net.c:1454
> > #11 0x560d330ee940 in main (argc=, argv= > out>, envp=)
> > at /home/dgilbert/git/qemu/vl.c:4637
> > (gdb)
> >
> > #0  qemu_chr_fe_read_all (be=be@entry=0x0, buf=buf@entry=0x7ffea7a2f520
> > "\v", len=len@entry=12)
> > at /home/dgilbert/git/qemu/chardev/char.c:209
> > 209 Chardev *s = be->chr;
> >
> > Dave
> >
> > --
> > Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
> >
> > --
> Marc-André Lureau
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK



Re: [Qemu-devel] make check: crash in vhost-user chardev cleanup

2017-02-24 Thread Marc-André Lureau
Hi

On Fri, Feb 24, 2017 at 11:54 PM Dr. David Alan Gilbert 
wrote:

> Hi,
>   I'm seeing a hang+crash doing a make check that
> looks like it's coming from vhost-user's chardev cleanup,
> so I'm suspecting your e0b283e7c.
> The synptom is that the make check hangs in the qtest
> part, and a dmesg shows that a qemu crashed.
>

See that thread:
https://lists.nongnu.org/archive/html/qemu-devel/2017-02/msg06015.html


> I got a backtrace that shows:
>
> (gdb) where
> #0  0x560d3344f03e in qemu_chr_fe_read_all (be=be@entry=0x0,
> buf=buf@entry=0x7ffea7a2f520 "\v", len=len@entry=12) at
> /home/dgilbert/git/qemu/chardev/char.c:209
> #1  0x560d33199d64 in vhost_user_read (msg=msg@entry=0x7ffea7a2f520,
> dev=)
> at /home/dgilbert/git/qemu/hw/virtio/vhost-user.c:125
> #2  0x560d3319a11f in vhost_user_get_vring_base (dev=0x560d35a614a0,
> ring=0x7ffea7a2f670)
> at /home/dgilbert/git/qemu/hw/virtio/vhost-user.c:417
> #3  0x560d33196abb in vhost_virtqueue_stop (dev=dev@entry=0x560d35a614a0,
> vdev=vdev@entry=0x560d3729cc00, vq=0x560d35a61678, idx=1) at
> /home/dgilbert/git/qemu/hw/virtio/vhost.c:1070
> #4  0x560d33198cd4 in vhost_dev_stop (hdev=hdev@entry=0x560d35a614a0,
> vdev=vdev@entry=0x560d3729cc00)
> at /home/dgilbert/git/qemu/hw/virtio/vhost.c:1533
> #5  0x560d3317bfd8 in vhost_net_stop_one (net=0x560d35a614a0,
> dev=0x560d3729cc00)
> at /home/dgilbert/git/qemu/hw/net/vhost_net.c:288
> #6  0x560d3317c4aa in vhost_net_stop (dev=dev@entry=0x560d3729cc00,
> ncs=, total_queues=total_queues@entry=1) at
> /home/dgilbert/git/qemu/hw/net/vhost_net.c:367
> #7  0x560d33179201 in virtio_net_vhost_status (status=,
> n=0x560d3729cc00)
> at /home/dgilbert/git/qemu/hw/net/virtio-net.c:176
> #8  0x560d33179201 in virtio_net_set_status (vdev=0x560d3729cc00,
> status=7 '\a')
> at /home/dgilbert/git/qemu/hw/net/virtio-net.c:250
> #9  0x560d3339ce99 in qemu_del_net_client (nc=0x560d35a61250) at
> /home/dgilbert/git/qemu/net/net.c:390
> #10 0x560d3339e30d in net_cleanup () at
> /home/dgilbert/git/qemu/net/net.c:1454
> #11 0x560d330ee940 in main (argc=, argv= out>, envp=)
> at /home/dgilbert/git/qemu/vl.c:4637
> (gdb)
>
> #0  qemu_chr_fe_read_all (be=be@entry=0x0, buf=buf@entry=0x7ffea7a2f520
> "\v", len=len@entry=12)
> at /home/dgilbert/git/qemu/chardev/char.c:209
> 209 Chardev *s = be->chr;
>
> Dave
>
> --
> Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK
>
> --
Marc-André Lureau


Re: [Qemu-devel] RFC QCOW2 compression speed

2017-02-24 Thread Peter Lieven

> Am 24.02.2017 um 12:36 schrieb Laszlo Ersek :
> 
> On 02/24/17 10:33, Peter Lieven wrote:
>> Hi,
>> 
>> 
>> i have been experimenting with deflate compression paramters in the past and 
>> found that the values chosen
>> 
>> in QCOW2 are not that optimal. In fact creation of compressed QCOW2 is very 
>> slow.
> 
> I agree, I find it slow as well.
> 
>> And especially for backups
>> 
>> compression would be good, but its so slow thats almost infeasible to use.
>> 
>> 
>> The current settings to inflateinit2 are compression level 6 
>> (Z_DEFAULT_COMPRESSION) and windowBits 12.
>> 
>> Increasing windowBits increases speed and yields better compression. 
>> Lowering the compression level increases
>> 
>> speed, but results in worse compression.
>> 
>> 
>> The issue here is that changing windowBits would (in theory) require a 
>> change of the parameters to deflateInit2 in
>> 
>> decompression which means that this change would make QCOW2 images 
>> incompatible.
>> 
>> 
>> As for discussion here is my experiment of compression a Ubuntu1604 
>> installation in ram.
>> 
>> level 6, windowBits 12   -> 42 seconds, 470MB size
>> 
>> level 6, windowBits 15   -> 35 seconds, 448MB size
>> 
>> level 1, windowBits 12   -> 30 seconds, 500MB size
>> 
>> level 1, windowBits 15   -> 15 seconds, 483MB size
>> 
>> 
>> Strange enough decompression works with unchanged parameters for all files. 
>> But the documentation of zlib claims
>> 
>> it should produce an error if a windowBits 15 stream is deflated with a 
>> smaller windowBits parameter.
>> 
>> 
>> What are your thoughts in introducing a switch and/or feature to use 
>> alternate settings for inflate?
>> 
>> If we need to introduce an incompatible feature for the bigger windowBits 
>> might it be worth thinking of using
>> 
>> an alternate compression algorithm like LZ4?
> 
> If the compression level was exposed with a command line option (and/or
> an environment variable, a la $GZIP), I would welcome it. I have
> frequently missed this feature, but I always figured it wouldn't be
> important enough to bother block people with it.

Changing the compression level via environment or a QCOW2 parameter is not a 
big deal. However, it helps not that much regarding speed. What helps is 
increasing the window size, but this is an incompatible change. And if we go 
for an incompatible change I would think about using a better compression 
algorithm than zlib deflate.

Peter


[Qemu-devel] make check: crash in vhost-user chardev cleanup

2017-02-24 Thread Dr. David Alan Gilbert
Hi,
  I'm seeing a hang+crash doing a make check that
looks like it's coming from vhost-user's chardev cleanup,
so I'm suspecting your e0b283e7c.
The synptom is that the make check hangs in the qtest
part, and a dmesg shows that a qemu crashed.
I got a backtrace that shows:

(gdb) where
#0  0x560d3344f03e in qemu_chr_fe_read_all (be=be@entry=0x0, 
buf=buf@entry=0x7ffea7a2f520 "\v", len=len@entry=12) at 
/home/dgilbert/git/qemu/chardev/char.c:209
#1  0x560d33199d64 in vhost_user_read (msg=msg@entry=0x7ffea7a2f520, 
dev=)
at /home/dgilbert/git/qemu/hw/virtio/vhost-user.c:125
#2  0x560d3319a11f in vhost_user_get_vring_base (dev=0x560d35a614a0, 
ring=0x7ffea7a2f670)
at /home/dgilbert/git/qemu/hw/virtio/vhost-user.c:417
#3  0x560d33196abb in vhost_virtqueue_stop (dev=dev@entry=0x560d35a614a0, 
vdev=vdev@entry=0x560d3729cc00, vq=0x560d35a61678, idx=1) at 
/home/dgilbert/git/qemu/hw/virtio/vhost.c:1070
#4  0x560d33198cd4 in vhost_dev_stop (hdev=hdev@entry=0x560d35a614a0, 
vdev=vdev@entry=0x560d3729cc00)
at /home/dgilbert/git/qemu/hw/virtio/vhost.c:1533
#5  0x560d3317bfd8 in vhost_net_stop_one (net=0x560d35a614a0, 
dev=0x560d3729cc00)
at /home/dgilbert/git/qemu/hw/net/vhost_net.c:288
#6  0x560d3317c4aa in vhost_net_stop (dev=dev@entry=0x560d3729cc00, 
ncs=, total_queues=total_queues@entry=1) at 
/home/dgilbert/git/qemu/hw/net/vhost_net.c:367
#7  0x560d33179201 in virtio_net_vhost_status (status=, 
n=0x560d3729cc00)
at /home/dgilbert/git/qemu/hw/net/virtio-net.c:176
#8  0x560d33179201 in virtio_net_set_status (vdev=0x560d3729cc00, status=7 
'\a')
at /home/dgilbert/git/qemu/hw/net/virtio-net.c:250
#9  0x560d3339ce99 in qemu_del_net_client (nc=0x560d35a61250) at 
/home/dgilbert/git/qemu/net/net.c:390
#10 0x560d3339e30d in net_cleanup () at 
/home/dgilbert/git/qemu/net/net.c:1454
#11 0x560d330ee940 in main (argc=, argv=, 
envp=)
at /home/dgilbert/git/qemu/vl.c:4637
(gdb) 

#0  qemu_chr_fe_read_all (be=be@entry=0x0, buf=buf@entry=0x7ffea7a2f520 "\v", 
len=len@entry=12)
at /home/dgilbert/git/qemu/chardev/char.c:209
209 Chardev *s = be->chr;

Dave

--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK



Re: [Qemu-devel] [PATCH 00/21] qapi: QMP dispatch and input visitor work

2017-02-24 Thread no-reply
Hi,

This series seems to have some coding style problems. See output below for
more information:

Message-id: 1487886317-27400-1-git-send-email-arm...@redhat.com
Type: series
Subject: [Qemu-devel] [PATCH 00/21] qapi: QMP dispatch and input visitor work

=== TEST SCRIPT BEGIN ===
#!/bin/bash

BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0

# Useful git options
git config --local diff.renamelimit 0
git config --local diff.renames True

commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
echo "Checking PATCH $n/$total: $(git log -n 1 --format=%s $c)..."
if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
failed=1
echo
fi
n=$((n+1))
done

exit $failed
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag] 
patchew/1487886317-27400-1-git-send-email-arm...@redhat.com -> 
patchew/1487886317-27400-1-git-send-email-arm...@redhat.com
Switched to a new branch 'test'
700a2ac qapi: Improve qobject visitor documentation
175f23f qapi: Make input visitors detect unvisited list tails
f8ca0b1 tests: Cover partial input visit of list
756fdaa tests-qobject-input-strict: Merge into test-qobject-input-visitor
eb9410c qapi: Drop unused non-strict qobject input visitor
172f593 test-qobject-input-visitor: Use strict visitor
5685ece qom: Make object_property_set_qobject()'s input visitor strict
6ea05df qapi: Make string input and opts visitor require non-null input
9112d89 qapi: Drop string input visitor method optional()
dae8b6a qapi: Improve qobject input visitor error reporting
92c4f7b qapi: Make QObject input visitor set *list reliably
015ae02 qapi: Clean up after commit 3d344c2
48848af qapi: Improve a QObject input visitor error message
f531352 qmp: Improve QMP dispatch error messages
0fa4ae4 qmp: Eliminate silly QERR_QMP_* macros
458f762 qmp: Drop duplicated QMP command object checks
0c32703 qmp: Clean up how we enforce capability negotiation
e2a3724 qmp: Dumb down how we run QMP command registration
16e7bcf qmp-test: New, covering basic QMP protocol
173fd91 libqtest: Work around a "QMP wants a newline" bug
7d1c822 qga: Fix crash on non-dictionary QMP argument

=== OUTPUT BEGIN ===
Checking PATCH 1/21: qga: Fix crash on non-dictionary QMP argument...
Checking PATCH 2/21: libqtest: Work around a "QMP wants a newline" bug...
Checking PATCH 3/21: qmp-test: New, covering basic QMP protocol...
Checking PATCH 4/21: qmp: Dumb down how we run QMP command registration...
Checking PATCH 5/21: qmp: Clean up how we enforce capability negotiation...
Checking PATCH 6/21: qmp: Drop duplicated QMP command object checks...
Checking PATCH 7/21: qmp: Eliminate silly QERR_QMP_* macros...
Checking PATCH 8/21: qmp: Improve QMP dispatch error messages...
Checking PATCH 9/21: qapi: Improve a QObject input visitor error message...
Checking PATCH 10/21: qapi: Clean up after commit 3d344c2...
Checking PATCH 11/21: qapi: Make QObject input visitor set *list reliably...
Checking PATCH 12/21: qapi: Improve qobject input visitor error reporting...
Checking PATCH 13/21: qapi: Drop string input visitor method optional()...
Checking PATCH 14/21: qapi: Make string input and opts visitor require non-null 
input...
Checking PATCH 15/21: qom: Make object_property_set_qobject()'s input visitor 
strict...
Checking PATCH 16/21: test-qobject-input-visitor: Use strict visitor...
Checking PATCH 17/21: qapi: Drop unused non-strict qobject input visitor...
Checking PATCH 18/21: tests-qobject-input-strict: Merge into 
test-qobject-input-visitor...
ERROR: line over 90 characters
#484: FILE: tests/test-qobject-input-visitor.c:847:
+v = visitor_input_test_init(data, "{ 'integer': -42, 'boolean': true, 
'string': 'foo', 'extra': 42 }");

ERROR: line over 90 characters
#498: FILE: tests/test-qobject-input-visitor.c:861:
+v = visitor_input_test_init(data, "{ 'string0': 'string0', 'dict1': { 
'string1': 'string1', 'dict2': { 'userdef1': { 'integer': 42, 'string': 
'string', 'extra': [42, 23, {'foo':'bar'}] }, 'string2': 'string2'}}}");

ERROR: line over 90 characters
#512: FILE: tests/test-qobject-input-visitor.c:875:
+v = visitor_input_test_init(data, "[ { 'string': 'string0', 'integer': 42 
}, { 'string': 'string1', 'integer': 43 }, { 'string': 'string2', 'integer': 
44, 'extra': 'ggg' } ]");

ERROR: line over 90 characters
#585: FILE: tests/test-qobject-input-visitor.c:948:
+v = visitor_input_test_init(data, "{ 'string': 'c', 'integer': 41, 
'boolean': true }");

WARNING: line over 80 characters
#592: FILE: tests/test-qobject-input-visitor.c:955:
+static void test_visitor_in_fail_union_flat_no_discrim(TestInputVisitorData 
*data,

ERROR: line over 90 characters
#600: FILE: tests/test-qobject-input-visitor.c:963:
+v = visitor_input_test_init(data, "{ 'integer': 42, 'string': 'c', 
'string1': 'd', 'string2': 'e' }");

total: 5 errors, 1 warnings, 233 lines checked

Your patch 

Re: [Qemu-devel] [PATCH 23/29] 9pfs: local: chmod: don't follow symlinks

2017-02-24 Thread Greg Kurz
On Fri, 24 Feb 2017 17:22:19 +0100
Jann Horn  wrote:
> [...]
> And unfortunately, that flags argument is not actually present in the
> real syscall.
> See this glibc code:
> 
> int
> fchmodat (int fd, const char *file, mode_t mode, int flag)
> {
>   if (flag & ~AT_SYMLINK_NOFOLLOW)
> return INLINE_SYSCALL_ERROR_RETURN_VALUE (EINVAL);
> #ifndef __NR_lchmod /* Linux so far has no lchmod syscall.  */
>   if (flag & AT_SYMLINK_NOFOLLOW)
> return INLINE_SYSCALL_ERROR_RETURN_VALUE (ENOTSUP);
> #endif
> 
>   return INLINE_SYSCALL (fchmodat, 3, fd, file, mode);
> }
> 
> and this kernel code:
> 
> SYSCALL_DEFINE3(fchmodat, int, dfd, const char __user *, filename,
> umode_t, mode)
> {
>   [...]
> }
> 
> So to fix this, you'll probably have to add a new syscall fchmodat2()
> to the kernel,
> wire it up for all the architectures and get the various libc
> implementations to adopt
> that. That's going to be quite tedious. :(

Yeah, Eric and I had a discussion about that on irc. I'll start to
work on the kernel part, at least. Indeed, adoption by the various
libc is likely to take some time... When the syscalls are available
in the kernel, maybe it is possible to implement something in the
9pfs code with the syscall() function.

In the meantime, we'll have to live with a degraded version of
fchmodat() based on openat()+fchmod(). This will fail if the
file isn't accessible but it is better than allowing the guest
to chmod() any file on the host.


pgpqEQlPFq4c6.pgp
Description: OpenPGP digital signature


Re: [Qemu-devel] [PULL 00/19] Block layer patches

2017-02-24 Thread no-reply
Hi,

This series seems to have some coding style problems. See output below for
more information:

Message-id: 1487960230-18054-1-git-send-email-kw...@redhat.com
Type: series
Subject: [Qemu-devel] [PULL 00/19] Block layer patches

=== TEST SCRIPT BEGIN ===
#!/bin/bash

BASE=base
n=1
total=$(git log --oneline $BASE.. | wc -l)
failed=0

# Useful git options
git config --local diff.renamelimit 0
git config --local diff.renames True

commits="$(git log --format=%H --reverse $BASE..)"
for c in $commits; do
echo "Checking PATCH $n/$total: $(git log -n 1 --format=%s $c)..."
if ! git show $c --format=email | ./scripts/checkpatch.pl --mailback -; then
failed=1
echo
fi
n=$((n+1))
done

exit $failed
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 - [tag update]  
patchew/1487850673-26455-1-git-send-email-vijay.kil...@gmail.com -> 
patchew/1487850673-26455-1-git-send-email-vijay.kil...@gmail.com
 * [new tag] patchew/1487960230-18054-1-git-send-email-kw...@redhat.com 
-> patchew/1487960230-18054-1-git-send-email-kw...@redhat.com
Switched to a new branch 'test'
88f046d tests: Use opened block node for block job tests
3b26fc9 vvfat: Use opened node as backing file
b7dea37 block: Add bdrv_new_open_driver()
06e0d10 block: Factor out bdrv_open_driver()
a5063d5 block: Use BlockBackend for image probing
e71e684 block: Factor out bdrv_open_child_bs()
0a67f78 block: Attach bs->file only during .bdrv_open()
9532e71 block: Pass BdrvChild to bdrv_truncate()
19d5c41 mirror: Resize active commit base in mirror_run()
ad44d55 qcow2: Use BB for resizing in qcow2_amend_options()
df54cc4 blockdev: Use BlockBackend to resize in qmp_block_resize()
c437206 iotests: Fix another race in 030
494b0ae qemu-img: Improve documentation for PREALLOC_MODE_FALLOC
39ee155 qemu-img: Truncate before full preallocation
72360e3 qemu-img: Add tests for raw image preallocation
f9bf4f9 qemu-img: Do not truncate before preallocation
3515c08 qemu-iotests: redirect nbd server stdout to /dev/null
dc728ce qemu-iotests: add ability to exclude certain protocols from tests
abc2f9d qemu-iotests: Test 137 only supports 'file' protocol

=== OUTPUT BEGIN ===
Checking PATCH 1/19: qemu-iotests: Test 137 only supports 'file' protocol...
Checking PATCH 2/19: qemu-iotests: add ability to exclude certain protocols 
from tests...
Checking PATCH 3/19: qemu-iotests: redirect nbd server stdout to /dev/null...
Checking PATCH 4/19: qemu-img: Do not truncate before preallocation...
Checking PATCH 5/19: qemu-img: Add tests for raw image preallocation...
Checking PATCH 6/19: qemu-img: Truncate before full preallocation...
Checking PATCH 7/19: qemu-img: Improve documentation for PREALLOC_MODE_FALLOC...
Checking PATCH 8/19: iotests: Fix another race in 030...
Checking PATCH 9/19: blockdev: Use BlockBackend to resize in 
qmp_block_resize()...
Checking PATCH 10/19: qcow2: Use BB for resizing in qcow2_amend_options()...
Checking PATCH 11/19: mirror: Resize active commit base in mirror_run()...
Checking PATCH 12/19: block: Pass BdrvChild to bdrv_truncate()...
Checking PATCH 13/19: block: Attach bs->file only during .bdrv_open()...
Checking PATCH 14/19: block: Factor out bdrv_open_child_bs()...
Checking PATCH 15/19: block: Use BlockBackend for image probing...
Checking PATCH 16/19: block: Factor out bdrv_open_driver()...
Checking PATCH 17/19: block: Add bdrv_new_open_driver()...
Checking PATCH 18/19: vvfat: Use opened node as backing file...
ERROR: "(foo*)" should be "(foo *)"
#33: FILE: block/vvfat.c:2971:
+.instance_size  = sizeof(void*),

ERROR: "(foo**)" should be "(foo **)"
#44: FILE: block/vvfat.c:3042:
+*(void**) backing->opaque = s;

total: 2 errors, 0 warnings, 25 lines checked

Your patch has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

Checking PATCH 19/19: tests: Use opened block node for block job tests...
=== OUTPUT END ===

Test command exited with code: 1


---
Email generated automatically by Patchew [http://patchew.org/].
Please send your feedback to patchew-de...@freelists.org

Re: [Qemu-devel] [PATCH 04/21] qmp: Dumb down how we run QMP command registration

2017-02-24 Thread Eric Blake
On 02/23/2017 03:45 PM, Markus Armbruster wrote:
> The way we get QMP commands registered is high tech:
> 
> * qapi-commands.py generates qmp_init_marshal() that does the actual work
> 
> * it also generates the magic to register it as a MODULE_INIT_QAPI
>   function, so it runs when someone calls
>   module_call_init(MODULE_INIT_QAPI)
> 
> * main() calls module_call_init()
> 
> QEMU needs to register a few non-qapified commands.  Same high tech
> works: monitor.c has its own qmp_init_marshal() along with the magic
> to make it run in module_call_init(MODULE_INIT_QAPI).

Eww. Two static functions with the same name, which really messes with
gdb debugging.

> 
> QEMU also needs to unregister commands that are not wanted in this
> build's configuration (commit 5032a16).  Simple enough:
> qmp_unregister_commands_hack().  The difficulty is to make it run
> after the generated qmp_init_marshal().  We can't simply run it in
> monitor.c's qmp_init_marshal(), because the order in which the
> registered functions run is indeterminate.  So qmp_init_marshal()
> registers qmp_init_marshal() separately.  Since registering *appends*

Another case of "A sets up A" when you meant "A sets up B", but this
time, I'm fairly certain you mean qmp_init_marshal() registers
qmp_unregister_commands_hack() separately.

> to the list of registered functions, this will make it run after all
> the functions that have been registered already.
> 
> I suspect it takes a long and expensive computer science education to
> not find this silly.

ROFL
(does that mean I didn't spend enough on my education? I'm sure you
could arrange to sell me an online certificate if I wanted more... :)

> 
> Dumb it down as follows:
> 
> * Drop MODULE_INIT_QAPI entirely
> 
> * Give the generated qmp_init_marshal() external linkage.
> 
> * Call it instead of module_call_init(MODULE_INIT_QAPI)
> 
> * Except in QEMU proper, call new monitor_init_qmp_commands() that in
>   turn calls the generated qmp_init_marshal(), registers the
>   additional commands and unregisters the unwanted ones.
> 
> Signed-off-by: Markus Armbruster 
> ---

> -static void qmp_init_marshal(void)
> +void monitor_init_qmp_commands(void)
>  {
> +qmp_init_marshal();
> +
>  qmp_register_command("query-qmp-schema", qmp_query_qmp_schema,
>   QCO_NO_OPTIONS);
>  qmp_register_command("device_add", qmp_device_add,
> @@ -1004,12 +1006,9 @@ static void qmp_init_marshal(void)
>  qmp_register_command("netdev_add", qmp_netdev_add,
>   QCO_NO_OPTIONS);
>  
> -/* call it after the rest of qapi_init() */
> -register_module_init(qmp_unregister_commands_hack, MODULE_INIT_QAPI);
> +qmp_unregister_commands_hack();

We could inline this function now, but keeping the _hack() name around
reminds us to consider conditional-compilation support in the generator
at a later date, so I'm fine with the approach you took.

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-devel] [PATCH 1/3] qom-qobject: introduce object_property_{g, s}et_ptr

2017-02-24 Thread Markus Armbruster
Paolo Bonzini  writes:

[...]
> I'm leaving these three patches out of the pull request (and 2.9) while
> the discussion goes on.

Appreciated!  Please remind me to come back to this discussion once the
freeze madness has died down.



Re: [Qemu-devel] [PATCH] os: don't corrupt pre-existing memory-backend data with prealloc

2017-02-24 Thread Eric Blake
On 02/24/2017 11:27 AM, Daniel P. Berrange wrote:
> When using a memory-backend object with prealloc turned on, QEMU
> will memset() the first byte in every memory page to zero. While
> this might have been acceptable for memory backends associated
> with RAM, this corrupts application data for NVDIMMs.
> 
> Instead of setting every page to zero, read the current byte
> value and then just write that same value back, so we are not
> corrupting the original data. Directly write the value instead
> of memset()ing it, since there's no benefit to memset for a
> single byte write.
> 
> Signed-off-by: Daniel P. Berrange 
> ---
> 

>  /* MAP_POPULATE silently ignores failures */
>  for (i = 0; i < numpages; i++) {
> -memset(area + (hpagesize * i), 0, 1);
> +/*
> + * Read & write back the same value, so we don't
> + * corrupt existinng user/app data that might be

s/existinng/existing/

> + * stored.
> + *
> + * 'volatile' to stop compiler optimizing this away
> + * to a no-op
> + *
> + * TODO: get a better solution from kernel so we
> + * don't need to write at all so we don't cause
> + * wear on the storage backing the region...
> + */
> +volatile char val = *(area + (hpagesize * i));
> +*(area + (hpagesize * i)) = val;
>  }
>  }
>  
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-devel] [PATCH v3 07/16] postcopy: Plumb pagesize down into place helpers

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Now we deal with normal size pages and huge pages we need
to tell the place handlers the size we're dealing with
and make sure the temporary page is large enough.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Reviewed-by: Laurent Vivier 
---
 include/migration/postcopy-ram.h |  6 +++--
 migration/postcopy-ram.c | 47 
 migration/ram.c  | 15 +++--
 3 files changed, 40 insertions(+), 28 deletions(-)

diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h
index 43bbbca..8e036b9 100644
--- a/include/migration/postcopy-ram.h
+++ b/include/migration/postcopy-ram.h
@@ -74,13 +74,15 @@ void postcopy_discard_send_finish(MigrationState *ms,
  *to use other postcopy_ routines to allocate.
  * returns 0 on success
  */
-int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from);
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
+size_t pagesize);
 
 /*
  * Place a zero page at (host) atomically
  * returns 0 on success
  */
-int postcopy_place_page_zero(MigrationIncomingState *mis, void *host);
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
+ size_t pagesize);
 
 /*
  * Allocate a page of memory that can be mapped at a later point in time
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 1e3d22f..a8b7fed 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -321,7 +321,7 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState 
*mis)
 migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
 
 if (mis->postcopy_tmp_page) {
-munmap(mis->postcopy_tmp_page, getpagesize());
+munmap(mis->postcopy_tmp_page, mis->largest_page_size);
 mis->postcopy_tmp_page = NULL;
 }
 trace_postcopy_ram_incoming_cleanup_exit();
@@ -543,13 +543,14 @@ int postcopy_ram_enable_notify(MigrationIncomingState 
*mis)
  * Place a host page (from) at (host) atomically
  * returns 0 on success
  */
-int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from,
+size_t pagesize)
 {
 struct uffdio_copy copy_struct;
 
 copy_struct.dst = (uint64_t)(uintptr_t)host;
 copy_struct.src = (uint64_t)(uintptr_t)from;
-copy_struct.len = getpagesize();
+copy_struct.len = pagesize;
 copy_struct.mode = 0;
 
 /* copy also acks to the kernel waking the stalled thread up
@@ -559,8 +560,8 @@ int postcopy_place_page(MigrationIncomingState *mis, void 
*host, void *from)
  */
 if (ioctl(mis->userfault_fd, UFFDIO_COPY, _struct)) {
 int e = errno;
-error_report("%s: %s copy host: %p from: %p",
- __func__, strerror(e), host, from);
+error_report("%s: %s copy host: %p from: %p (size: %zd)",
+ __func__, strerror(e), host, from, pagesize);
 
 return -e;
 }
@@ -573,23 +574,29 @@ int postcopy_place_page(MigrationIncomingState *mis, void 
*host, void *from)
  * Place a zero page at (host) atomically
  * returns 0 on success
  */
-int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host,
+ size_t pagesize)
 {
-struct uffdio_zeropage zero_struct;
+trace_postcopy_place_page_zero(host);
 
-zero_struct.range.start = (uint64_t)(uintptr_t)host;
-zero_struct.range.len = getpagesize();
-zero_struct.mode = 0;
+if (pagesize == getpagesize()) {
+struct uffdio_zeropage zero_struct;
+zero_struct.range.start = (uint64_t)(uintptr_t)host;
+zero_struct.range.len = getpagesize();
+zero_struct.mode = 0;
 
-if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, _struct)) {
-int e = errno;
-error_report("%s: %s zero host: %p",
- __func__, strerror(e), host);
+if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, _struct)) {
+int e = errno;
+error_report("%s: %s zero host: %p",
+ __func__, strerror(e), host);
 
-return -e;
+return -e;
+}
+} else {
+/* TODO: The kernel can't use UFFDIO_ZEROPAGE for hugepages */
+assert(0);
 }
 
-trace_postcopy_place_page_zero(host);
 return 0;
 }
 
@@ -604,7 +611,7 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, 
void *host)
 void *postcopy_get_tmp_page(MigrationIncomingState *mis)
 {
 if (!mis->postcopy_tmp_page) {
-mis->postcopy_tmp_page = mmap(NULL, getpagesize(),
+mis->postcopy_tmp_page = mmap(NULL, mis->largest_page_size,
  PROT_READ 

[Qemu-devel] [PATCH v3 08/16] postcopy: Use temporary for placing zero huge pages

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

The kernel can't do UFFDIO_ZEROPAGE for huge pages, so we have
to allocate a temporary (always zero) page and use UFFDIO_COPYPAGE
on it.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Reviewed-by: Laurent Vivier 
---
 include/migration/migration.h |  1 +
 migration/postcopy-ram.c  | 23 +--
 2 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index e8e2a7f..cbfe79c 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -108,6 +108,7 @@ struct MigrationIncomingState {
 QEMUFile *to_src_file;
 QemuMutex rp_mutex;/* We send replies from multiple threads */
 void *postcopy_tmp_page;
+void *postcopy_tmp_zero_page;
 
 QEMUBH *bh;
 
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index a8b7fed..4c736d2 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -324,6 +324,10 @@ int postcopy_ram_incoming_cleanup(MigrationIncomingState 
*mis)
 munmap(mis->postcopy_tmp_page, mis->largest_page_size);
 mis->postcopy_tmp_page = NULL;
 }
+if (mis->postcopy_tmp_zero_page) {
+munmap(mis->postcopy_tmp_zero_page, mis->largest_page_size);
+mis->postcopy_tmp_zero_page = NULL;
+}
 trace_postcopy_ram_incoming_cleanup_exit();
 return 0;
 }
@@ -593,8 +597,23 @@ int postcopy_place_page_zero(MigrationIncomingState *mis, 
void *host,
 return -e;
 }
 } else {
-/* TODO: The kernel can't use UFFDIO_ZEROPAGE for hugepages */
-assert(0);
+/* The kernel can't use UFFDIO_ZEROPAGE for hugepages */
+if (!mis->postcopy_tmp_zero_page) {
+mis->postcopy_tmp_zero_page = mmap(NULL, mis->largest_page_size,
+   PROT_READ | PROT_WRITE,
+   MAP_PRIVATE | MAP_ANONYMOUS,
+   -1, 0);
+if (mis->postcopy_tmp_zero_page == MAP_FAILED) {
+int e = errno;
+mis->postcopy_tmp_zero_page = NULL;
+error_report("%s: %s mapping large zero page",
+ __func__, strerror(e));
+return -e;
+}
+memset(mis->postcopy_tmp_zero_page, '\0', mis->largest_page_size);
+}
+return postcopy_place_page(mis, host, mis->postcopy_tmp_zero_page,
+   pagesize);
 }
 
 return 0;
-- 
2.9.3




[Qemu-devel] [PATCH v3 01/16] postcopy: Transmit ram size summary word

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Replace the host page-size in the 'advise' command by a pagesize
summary bitmap; if the VM is just using normal RAM then
this will be exactly the same as before, however if they're using
huge pages they'll be different, and thus:
   a) Migration from/to old qemu's that don't understand huge pages
  will fail early.
   b) Migrations with different size RAMBlocks will also fail early.

This catches it very early; earlier than the detailed per-block
check in the next patch.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Laurent Vivier 
Reviewed-by: Juan Quintela 
---
 include/migration/migration.h |  1 +
 migration/ram.c   | 17 +
 migration/savevm.c| 32 +---
 3 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index 1735d66..aafa68f 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -375,6 +375,7 @@ void global_state_store_running(void);
 void flush_page_queue(MigrationState *ms);
 int ram_save_queue_pages(MigrationState *ms, const char *rbname,
  ram_addr_t start, ram_addr_t len);
+uint64_t ram_pagesize_summary(void);
 
 PostcopyState postcopy_state_get(void);
 /* Set the state and return the old state */
diff --git a/migration/ram.c b/migration/ram.c
index f289fcd..1f6397a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -600,6 +600,23 @@ static void migration_bitmap_sync_init(void)
 iterations_prev = 0;
 }
 
+/* Returns a summary bitmap of the page sizes of all RAMBlocks;
+ * for VMs with just normal pages this is equivalent to the
+ * host page size.  If it's got some huge pages then it's the OR
+ * of all the different page sizes.
+ */
+uint64_t ram_pagesize_summary(void)
+{
+RAMBlock *block;
+uint64_t summary = 0;
+
+QLIST_FOREACH_RCU(block, _list.blocks, next) {
+summary |= block->page_size;
+}
+
+return summary;
+}
+
 static void migration_bitmap_sync(void)
 {
 RAMBlock *block;
diff --git a/migration/savevm.c b/migration/savevm.c
index 5ecd264..1c19fe5 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -869,7 +869,7 @@ int qemu_savevm_send_packaged(QEMUFile *f, const uint8_t 
*buf, size_t len)
 void qemu_savevm_send_postcopy_advise(QEMUFile *f)
 {
 uint64_t tmp[2];
-tmp[0] = cpu_to_be64(getpagesize());
+tmp[0] = cpu_to_be64(ram_pagesize_summary());
 tmp[1] = cpu_to_be64(1ul << qemu_target_page_bits());
 
 trace_qemu_savevm_send_postcopy_advise();
@@ -1346,7 +1346,7 @@ static int qemu_loadvm_state_main(QEMUFile *f, 
MigrationIncomingState *mis);
 static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis)
 {
 PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
-uint64_t remote_hps, remote_tps;
+uint64_t remote_pagesize_summary, local_pagesize_summary, remote_tps;
 
 trace_loadvm_postcopy_handle_advise();
 if (ps != POSTCOPY_INCOMING_NONE) {
@@ -1359,17 +1359,27 @@ static int 
loadvm_postcopy_handle_advise(MigrationIncomingState *mis)
 return -1;
 }
 
-remote_hps = qemu_get_be64(mis->from_src_file);
-if (remote_hps != getpagesize())  {
+remote_pagesize_summary = qemu_get_be64(mis->from_src_file);
+local_pagesize_summary = ram_pagesize_summary();
+
+if (remote_pagesize_summary != local_pagesize_summary)  {
 /*
- * Some combinations of mismatch are probably possible but it gets
- * a bit more complicated.  In particular we need to place whole
- * host pages on the dest at once, and we need to ensure that we
- * handle dirtying to make sure we never end up sending part of
- * a hostpage on it's own.
+ * This detects two potential causes of mismatch:
+ *   a) A mismatch in host page sizes
+ *  Some combinations of mismatch are probably possible but it gets
+ *  a bit more complicated.  In particular we need to place whole
+ *  host pages on the dest at once, and we need to ensure that we
+ *  handle dirtying to make sure we never end up sending part of
+ *  a hostpage on it's own.
+ *   b) The use of different huge page sizes on source/destination
+ *  a more fine grain test is performed during RAM block migration
+ *  but this test here causes a nice early clear failure, and
+ *  also fails when passed to an older qemu that doesn't
+ *  do huge pages.
  */
-error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
- (int)remote_hps, getpagesize());
+error_report("Postcopy needs matching RAM page sizes (s=%" PRIx64
+ " d=%" PRIx64 ")",
+ 

Re: [Qemu-devel] [PATCH 1/5] elf-loader: Allow late loading of elf

2017-02-24 Thread Thomas Huth
On 24.02.2017 15:21, Farhan Ali wrote:
> 
> 
> On 02/24/2017 05:44 AM, Thomas Huth wrote:
>> On 21.02.2017 11:23, Christian Borntraeger wrote:
>>> On 02/20/2017 04:33 PM, Thomas Huth wrote:
 On 20.02.2017 15:19, Cornelia Huck wrote:
> From: Farhan Ali 
>
> The current QEMU ROM infrastructure rejects late loading of ROMs.
> And ELFs are currently loaded as ROM, this prevents delayed loading
> of ELFs. So when loading ELF, allow the user to specify if ELF should
> be loaded as ROM or not.
>
> If an ELF is not loaded as ROM, then they are not restored on a
> guest reboot/reset and so its upto the user to handle the reloading.

 Could you maybe also explain here why you need such a delayed ELF
 loading? Why can't you load the s390-netboot.img at the same time as
 s390-ccw.img?
>>>
>>> Please read the cover letter for some details how to build such a
>>> netrom.
>>
>> Sure, understood, but I still did not see an explanation why this can't
>> be loaded as "ROM", too / why it needs to be loaded "delayed"? Does the
>> image data need to be writable in memory? Or is the information not
>> available yet at that point in time, whether the user wants to do a
>> network boot or not? Don't get me wrong, I'm basically fine with this
>> patch, I'm just missing some explanation *why* you have to do it this
>> way.
>>
>>> Long term we certainly want to have a look at implementing something in
>>> the ccw bios, but this (tcp stack, virtio net, etc) will take some (a
>>> lot?)
>>> more time.
>>
>> Just an idea: There's a complete TFTP-boot-over-virtio-net stack in
>> SLOF, written by IBM ... maybe you could use that for s390x, too?
>>
>>> This patch (1) has another advantage.
>>> Right now we load the ccw bios all the time, even for -kernel xxx boot,
>>> to allow the guest user to use chreipl to reboot from a disk. With this
>>> in place we can rework our ccw bios loader to load the ccw bios lazily
>>> as well.
>>
>> But this "RAM"-loader here has still the problem that the images are not
>> restored during reboot/reset, right? ... so before you're going to
>> implement that as well, let me ask another question: Did you already
>> have a look at the generic loader device? I think that might already
>> provide what you're trying to implement here, so your problem could
>> maybe be solved with some few lines of creating an instance of that
>> device instead (see
>> https://lists.gnu.org/archive/html/qemu-devel/2017-01/msg05366.html for
>> an example how to use that device).
>>
>>  Thomas
>>
> 
> Hi Thomas,
> 
> I did think of the generic loader device, but doesn't the generic loader
> device also load ELF's as ROMs? This would again bring us back to the
> problem of ROMs cannot be loaded after initializations of the ROM. So if
> a user wants to switch boot device from disk to network device we still
> wouldn't be able to load the ROM?

OK, never mind, I just had another look at the generic loader code, and
it seems like the ELF loading is also only done there during the
realize() function, not during the reset() function. Sorry, I somehow
got that wrong when I looked at that code for the first time... :-/

 Thomas




[Qemu-devel] [PATCH v3 02/16] postcopy: Transmit and compare individual page sizes

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

When using postcopy with hugepages, we require the source
and destination page sizes for any RAMBlock to match; note
that different RAMBlocks in the same VM can have different
page sizes.

Transmit them as part of the RAM information header and
fail if there's a difference.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Laurent Vivier 
Reviewed-by: Juan Quintela 
---
 migration/ram.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/migration/ram.c b/migration/ram.c
index 1f6397a..fbd987a 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2027,6 +2027,9 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
 qemu_put_byte(f, strlen(block->idstr));
 qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr));
 qemu_put_be64(f, block->used_length);
+if (migrate_postcopy_ram() && block->page_size != qemu_host_page_size) 
{
+qemu_put_be64(f, block->page_size);
+}
 }
 
 rcu_read_unlock();
@@ -2528,6 +2531,8 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
  * be atomic
  */
 bool postcopy_running = postcopy_state_get() >= 
POSTCOPY_INCOMING_LISTENING;
+/* ADVISE is earlier, it shows the source has the postcopy capability on */
+bool postcopy_advised = postcopy_state_get() >= POSTCOPY_INCOMING_ADVISE;
 
 seq_iter++;
 
@@ -2592,6 +2597,18 @@ static int ram_load(QEMUFile *f, void *opaque, int 
version_id)
 error_report_err(local_err);
 }
 }
+/* For postcopy we need to check hugepage sizes match */
+if (postcopy_advised &&
+block->page_size != qemu_host_page_size) {
+uint64_t remote_page_size = qemu_get_be64(f);
+if (remote_page_size != block->page_size) {
+error_report("Mismatched RAM page size %s "
+ "(local) %zd != %" PRId64,
+ id, block->page_size,
+ remote_page_size);
+ret = -EINVAL;
+}
+}
 ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
   block->idstr);
 } else {
-- 
2.9.3




Re: [Qemu-devel] [PATCH RFC v3 2/5] keyval: New keyval_parse()

2017-02-24 Thread Eric Blake
On 02/24/2017 01:06 PM, Markus Armbruster wrote:

> 
> The value of an implied key is not subject to key desugaring.

And that's a good thing.

> 
> Without an implied key, the "node" would desugar to "de=off".
> 
>> How about:
>>
>> for 2.9: -blockdev has no magic at all (you HAVE to spell 'foo=off'
>> rather than 'foo' for any boolean parameters that you want disabled in
>> -blockdev);
> 
> You mean "rather than 'nofoo'".

Yes.  Thanks for catching the intent.

> 
> "No magic at all" implies you also have to spell "foo=on" rather than
> "foo".

I'm a little more lenient for allowing 'foo' meaning 'foo=on' as a
shortcut (although making it force a QBool rather than a string "on" may
be nicer).  I'm also okay if -blockdev has no sugar at all in 2.9 (we
can add "foo" => "foo=on" sugar later if it makes QemuOpts conversion
easier in 2.10; that's much more palatable than adding "nofoo" =>
"foo=off" sugar).

> 
> I believe we need to have both or none in 2.9.  If we have only the "on"
> sugar, "no-flush" works, and we can't add the "off" sugar without
> breaking it.

I think the "off" sugar should die, if we can convince ourselves that no
one is using it that can't catch up to modern usage.

> 
>> -blockdev); and in QemuOpts, we issue deprecation warnings but keep
>> existing behavior any time someone uses 'noFOO' that we turn into
>> 'FOO=no'.  Then, for 2.10, we can decide to remove the deprecation
>> warnings if they pointed out real (ab)use of the sugar in the wild, or
>> (hopefully) to kill the sugar entirely (as part of converting QemuOpts
>> over to keyval_parse).
>>
>> In short, getting rid of the 'no' prefix magic, after suitable
>> deprecation warnings, seems like a good plan to me; and having -blockdev
>> be slightly tighter than the rest of command line options for 2.9 only
>> is no real loss (no one uses -blockdev yet, so they can be written to
>> avoid the use of 'no' magic to begin with).
> 
> Okay, I'll post QemuOpts patch to deprecate noKEY sugar.  Let's see
> whether anyone screams.

Summarizing to make sure we're on the same track:

2.9:
QemuOpts:
  "noFOO" => "FOO=off" sugar - deprecation warning
  "FOO" => "FOO=on" sugar - silently stays the same

-blockdev option 1:
  "FOO" => error, no '=' present (we haven't decided on sugaring this yet)
  "noFOO" => error, no '=' present (no off sugar, but also avoids
"noFOO" => "noFOO=on" sugar)

-blockdev option 2:
  "FOO" => "FOO=on" - enabled boolean sugar remains compact, if it is
not ambiguous, but anything we can do to limit it to just boolean values
rather than the string "on" might be nice
  "noFOO" => "noFOO=on" - different behavior than QemuOpts, but less magic

2.10:
QemuOpts will be implemented on keyval_parse(), behaving the same as
-blockdev. If we took option 1 in 2.9, we have:
  "noFOO" => "noFOO=on" (which probably triggers an error that "noFOO"
is not an option when strict parsing is in effect) (the deprecation
period is over, so we changed command-line behavior, but only for people
that didn't pay attention to the warnings in 2.9)
  "FOO" => "FOO=on" - keep this one to minimize back-compat breakage

If we took option 2 in 2.9, we can still turn on "FOO" => "FOO=on" sugar
to get back to option 1 in 2.10

or we can stick to option 2 forevermore:
  "FOO" => error, no '=' present. No longer possible to specify booleans
without the more verbose "FOO=on"

I have a preference for option 1 in the long run, but as it seems to be
upwards compatible from option 2 for -blockdev in 2.9, I'm leaning
towards option 2 for this release.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-devel] [PULL 19/19] tests: Use opened block node for block job tests

2017-02-24 Thread Kevin Wolf
blk_insert_bs() and block job related functions will soon require an
opened block node (permission calculations will involve the block
driver), so let our tests be consistent with the real users in this
respect.

Signed-off-by: Kevin Wolf 
Reviewed-by: Max Reitz 
---
 tests/test-blockjob-txn.c | 6 +-
 tests/test-blockjob.c | 6 +-
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/tests/test-blockjob-txn.c b/tests/test-blockjob-txn.c
index b132e39..f6dfd08 100644
--- a/tests/test-blockjob-txn.c
+++ b/tests/test-blockjob-txn.c
@@ -96,7 +96,10 @@ static BlockJob *test_block_job_start(unsigned int 
iterations,
 char job_id[24];
 
 data = g_new0(TestBlockJobCBData, 1);
-bs = bdrv_new();
+
+bs = bdrv_open("null-co://", NULL, NULL, 0, _abort);
+g_assert_nonnull(bs);
+
 snprintf(job_id, sizeof(job_id), "job%u", counter++);
 s = block_job_create(job_id, _block_job_driver, bs, 0,
  BLOCK_JOB_DEFAULT, test_block_job_cb,
@@ -242,6 +245,7 @@ static void test_pair_jobs_fail_cancel_race(void)
 int main(int argc, char **argv)
 {
 qemu_init_main_loop(_abort);
+bdrv_init();
 
 g_test_init(, , NULL);
 g_test_add_func("/single/success", test_single_job_success);
diff --git a/tests/test-blockjob.c b/tests/test-blockjob.c
index 60b78a3..068c9e4 100644
--- a/tests/test-blockjob.c
+++ b/tests/test-blockjob.c
@@ -54,7 +54,10 @@ static BlockJob *do_test_id(BlockBackend *blk, const char 
*id,
 static BlockBackend *create_blk(const char *name)
 {
 BlockBackend *blk = blk_new();
-BlockDriverState *bs = bdrv_new();
+BlockDriverState *bs;
+
+bs = bdrv_open("null-co://", NULL, NULL, 0, _abort);
+g_assert_nonnull(bs);
 
 blk_insert_bs(blk, bs);
 bdrv_unref(bs);
@@ -140,6 +143,7 @@ static void test_job_ids(void)
 int main(int argc, char **argv)
 {
 qemu_init_main_loop(_abort);
+bdrv_init();
 
 g_test_init(, , NULL);
 g_test_add_func("/blockjob/ids", test_job_ids);
-- 
1.8.3.1




[Qemu-devel] [PULL 16/19] block: Factor out bdrv_open_driver()

2017-02-24 Thread Kevin Wolf
This is a function that doesn't do any option parsing, but just does
some basic BlockDriverState setup and calls the .bdrv_open() function of
the block driver.

Signed-off-by: Kevin Wolf 
Reviewed-by: Max Reitz 
---
 block.c | 112 +---
 1 file changed, 65 insertions(+), 47 deletions(-)

diff --git a/block.c b/block.c
index 5f1e8aa..f96d26b 100644
--- a/block.c
+++ b/block.c
@@ -925,6 +925,67 @@ out:
 g_free(gen_node_name);
 }
 
+static int bdrv_open_driver(BlockDriverState *bs, BlockDriver *drv,
+const char *node_name, QDict *options,
+int open_flags, Error **errp)
+{
+Error *local_err = NULL;
+int ret;
+
+bdrv_assign_node_name(bs, node_name, _err);
+if (local_err) {
+error_propagate(errp, local_err);
+return -EINVAL;
+}
+
+bs->drv = drv;
+bs->opaque = g_malloc0(drv->instance_size);
+
+if (drv->bdrv_file_open) {
+assert(!drv->bdrv_needs_filename || bs->filename[0]);
+ret = drv->bdrv_file_open(bs, options, open_flags, _err);
+} else {
+ret = drv->bdrv_open(bs, options, open_flags, _err);
+}
+
+if (ret < 0) {
+if (local_err) {
+error_propagate(errp, local_err);
+} else if (bs->filename[0]) {
+error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
+} else {
+error_setg_errno(errp, -ret, "Could not open image");
+}
+goto free_and_fail;
+}
+
+ret = refresh_total_sectors(bs, bs->total_sectors);
+if (ret < 0) {
+error_setg_errno(errp, -ret, "Could not refresh total sector count");
+goto free_and_fail;
+}
+
+bdrv_refresh_limits(bs, _err);
+if (local_err) {
+error_propagate(errp, local_err);
+ret = -EINVAL;
+goto free_and_fail;
+}
+
+assert(bdrv_opt_mem_align(bs) != 0);
+assert(bdrv_min_mem_align(bs) != 0);
+assert(is_power_of_2(bs->bl.request_alignment));
+
+return 0;
+
+free_and_fail:
+/* FIXME Close bs first if already opened*/
+g_free(bs->opaque);
+bs->opaque = NULL;
+bs->drv = NULL;
+return ret;
+}
+
 QemuOptsList bdrv_runtime_opts = {
 .name = "bdrv_common",
 .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
@@ -1019,14 +1080,6 @@ static int bdrv_open_common(BlockDriverState *bs, 
BlockBackend *file,
 trace_bdrv_open_common(bs, filename ?: "", bs->open_flags,
drv->format_name);
 
-node_name = qemu_opt_get(opts, "node-name");
-bdrv_assign_node_name(bs, node_name, _err);
-if (local_err) {
-error_propagate(errp, local_err);
-ret = -EINVAL;
-goto fail_opts;
-}
-
 bs->read_only = !(bs->open_flags & BDRV_O_RDWR);
 
 if (use_bdrv_whitelist && !bdrv_is_whitelisted(drv, bs->read_only)) {
@@ -1092,54 +1145,19 @@ static int bdrv_open_common(BlockDriverState *bs, 
BlockBackend *file,
 }
 pstrcpy(bs->exact_filename, sizeof(bs->exact_filename), bs->filename);
 
-bs->drv = drv;
-bs->opaque = g_malloc0(drv->instance_size);
-
 /* Open the image, either directly or using a protocol */
 open_flags = bdrv_open_flags(bs, bs->open_flags);
-if (drv->bdrv_file_open) {
-assert(file == NULL);
-assert(!drv->bdrv_needs_filename || filename != NULL);
-ret = drv->bdrv_file_open(bs, options, open_flags, _err);
-} else {
-ret = drv->bdrv_open(bs, options, open_flags, _err);
-}
-
-if (ret < 0) {
-if (local_err) {
-error_propagate(errp, local_err);
-} else if (bs->filename[0]) {
-error_setg_errno(errp, -ret, "Could not open '%s'", bs->filename);
-} else {
-error_setg_errno(errp, -ret, "Could not open image");
-}
-goto free_and_fail;
-}
+node_name = qemu_opt_get(opts, "node-name");
 
-ret = refresh_total_sectors(bs, bs->total_sectors);
+assert(!drv->bdrv_file_open || file == NULL);
+ret = bdrv_open_driver(bs, drv, node_name, options, open_flags, errp);
 if (ret < 0) {
-error_setg_errno(errp, -ret, "Could not refresh total sector count");
-goto free_and_fail;
-}
-
-bdrv_refresh_limits(bs, _err);
-if (local_err) {
-error_propagate(errp, local_err);
-ret = -EINVAL;
-goto free_and_fail;
+goto fail_opts;
 }
 
-assert(bdrv_opt_mem_align(bs) != 0);
-assert(bdrv_min_mem_align(bs) != 0);
-assert(is_power_of_2(bs->bl.request_alignment));
-
 qemu_opts_del(opts);
 return 0;
 
-free_and_fail:
-g_free(bs->opaque);
-bs->opaque = NULL;
-bs->drv = NULL;
 fail_opts:
 qemu_opts_del(opts);
 return ret;
-- 
1.8.3.1




[Qemu-devel] [PULL 17/19] block: Add bdrv_new_open_driver()

2017-02-24 Thread Kevin Wolf
This function allows to create more or less normal BlockDriverStates
even for BlockDrivers that aren't globally registered (e.g. helper
filters for block jobs).

Signed-off-by: Kevin Wolf 
Reviewed-by: Max Reitz 
---
 block.c   | 30 +-
 include/block/block.h |  2 ++
 2 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index f96d26b..1368cf5 100644
--- a/block.c
+++ b/block.c
@@ -939,13 +939,16 @@ static int bdrv_open_driver(BlockDriverState *bs, 
BlockDriver *drv,
 }
 
 bs->drv = drv;
+bs->read_only = !(bs->open_flags & BDRV_O_RDWR);
 bs->opaque = g_malloc0(drv->instance_size);
 
 if (drv->bdrv_file_open) {
 assert(!drv->bdrv_needs_filename || bs->filename[0]);
 ret = drv->bdrv_file_open(bs, options, open_flags, _err);
-} else {
+} else if (drv->bdrv_open) {
 ret = drv->bdrv_open(bs, options, open_flags, _err);
+} else {
+ret = 0;
 }
 
 if (ret < 0) {
@@ -986,6 +989,31 @@ free_and_fail:
 return ret;
 }
 
+BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
+   int flags, Error **errp)
+{
+BlockDriverState *bs;
+int ret;
+
+bs = bdrv_new();
+bs->open_flags = flags;
+bs->explicit_options = qdict_new();
+bs->options = qdict_new();
+bs->opaque = NULL;
+
+update_options_from_flags(bs->options, flags);
+
+ret = bdrv_open_driver(bs, drv, node_name, bs->options, flags, errp);
+if (ret < 0) {
+QDECREF(bs->explicit_options);
+QDECREF(bs->options);
+bdrv_unref(bs);
+return NULL;
+}
+
+return bs;
+}
+
 QemuOptsList bdrv_runtime_opts = {
 .name = "bdrv_common",
 .head = QTAILQ_HEAD_INITIALIZER(bdrv_runtime_opts.head),
diff --git a/include/block/block.h b/include/block/block.h
index a4cd06f..bde5ebd 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -215,6 +215,8 @@ int bdrv_open_backing_file(BlockDriverState *bs, QDict 
*parent_options,
const char *bdref_key, Error **errp);
 BlockDriverState *bdrv_open(const char *filename, const char *reference,
 QDict *options, int flags, Error **errp);
+BlockDriverState *bdrv_new_open_driver(BlockDriver *drv, const char *node_name,
+   int flags, Error **errp);
 BlockReopenQueue *bdrv_reopen_queue(BlockReopenQueue *bs_queue,
 BlockDriverState *bs,
 QDict *options, int flags);
-- 
1.8.3.1




Re: [Qemu-devel] Non-flat command line option argument syntax

2017-02-24 Thread Markus Armbruster
Eric Blake  writes:

> On 02/24/2017 10:39 AM, Daniel P. Berrange wrote:
>> On Fri, Feb 24, 2017 at 05:04:34PM +0100, Markus Armbruster wrote:
>>> Markus Armbruster  writes:
>>>
>>> [...]
 === Dotted keys ===

 One sufficiently powerful syntax extension already exists: the dotted
 key convention.  It's syntactically unambiguous only when none of the
 KEYs involved contains '.'  To adopt it across the board, we'd have to
 outlaw '.' in KEYs.  QAPI outlaws '.' already,
>>>
>>> *Except* in __RFQDN_ prefixes.
>>>
>
>>> Possible solutions:
>>>
>>> (a) Outlaw domain names with '_'.  If KEY starts with "__", everything
>>> up to the third '_' is an __RFQDN_ prefix.
>
> While there ARE valid DNS names with _ (such as
> _http._sctp.www.example.com mentioned on
> https://en.wikipedia.org/wiki/Hostname), we are at least told by RFC 952
> that _ is not valid in hostnames.  Note that '-' is permitted, and that
> transliterates to '_', so from the flattened name used in C code it's
> harder to tell what is meant, but from the QMP side, I think we are
> guaranteed that no RFQDN will confuse us with any extra _.

DNS registrars may fail to enforce the hostname rules.  But we can
specify that only __RFQDN_ prefixes derived from conforming hostnames
are allowed.  If someone breaks dotted keys by insisting on deriving his
from a non-conforming hostname, they get to keep the pieces.

We better clarify qapi-code-gen.txt, though.

>>>
>>> (b) Outlaw '_' in the name part that follows __RFQDN_.  If KEY starts
>>> with "__", everything up to the last '_' is an __RFQDN_ prefix.
>
> Reasonable enough, since we ask new interfaces to use '-' rather than
> '_'. But does break existing vendors (a quick grep of RHEL 7.3
> downstream finds at least:
>
> qapi-schema.json:{ 'command': '__com.redhat_qxl_screendump', 'data': {
> 'id' : 'str',
>
> So we'd have to get downstream buy-in to this plan, and downstreams may
> have to hack around our new restrictions for a while.
>
>>>
>>> (c) Your bright idea.
>> 
>> Define a new downstream vendor naming convention. IMHO it is reasonable
>> to argue that the downstream vendor extensions are outside the scope of
>> backwards compatibility guarantees we normally apply for our CLI args.
>> 
>> Thus, simply say that vendors must replace all '.' with _ in their
>> namespace prefix. eg They should use '__com_example_medium.rare=on'
>> which would mean a property '__com_example_medium' which is a struct
>> containing a property rare with value on

Certainly simpler, and arguably what we should've done back when we
created the convention for QMP.  But changing it now would be a pain.

> Same argument as (b) - we'd have to get downstream buy-in (and in this
> case, it affects even more cases: ALL downstream extensions have to be
> changed, rather than just the ones using _ after the __RFQDN_ portion).
> But has the slightly nice appeal of avoiding the '.' vs. '_'
> transliteration confusion that catches us only on downstream extensions
> (it's hard to special-case '.' as being permitted in QAPI, but only for
> downstream, because we can easily forget to test it).
>
> If we are happy with the change forced on downstream vendors, I'm okay
> with (c); if not, I think (a) is safe.

Looks like (a) is the least painful.  I'll give it a try.



[Qemu-devel] [PULL 15/19] block: Use BlockBackend for image probing

2017-02-24 Thread Kevin Wolf
This fixes the use of a parent-less BdrvChild in bdrv_open_inherit() by
converting it into a BlockBackend. Which is exactly what it should be,
image probing is an external, standalone user of a node. The requests
can't be considered to originate from the format driver node because
that one isn't even opened yet.

Signed-off-by: Kevin Wolf 
Reviewed-by: Max Reitz 
---
 block.c | 36 +++-
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/block.c b/block.c
index 6987400..5f1e8aa 100644
--- a/block.c
+++ b/block.c
@@ -588,21 +588,20 @@ BlockDriver *bdrv_probe_all(const uint8_t *buf, int 
buf_size,
 return drv;
 }
 
-static int find_image_format(BdrvChild *file, const char *filename,
+static int find_image_format(BlockBackend *file, const char *filename,
  BlockDriver **pdrv, Error **errp)
 {
-BlockDriverState *bs = file->bs;
 BlockDriver *drv;
 uint8_t buf[BLOCK_PROBE_BUF_SIZE];
 int ret = 0;
 
 /* Return the raw BlockDriver * to scsi-generic devices or empty drives */
-if (bdrv_is_sg(bs) || !bdrv_is_inserted(bs) || bdrv_getlength(bs) == 0) {
+if (blk_is_sg(file) || !blk_is_inserted(file) || blk_getlength(file) == 0) 
{
 *pdrv = _raw;
 return ret;
 }
 
-ret = bdrv_pread(file, 0, buf, sizeof(buf));
+ret = blk_pread(file, 0, buf, sizeof(buf));
 if (ret < 0) {
 error_setg_errno(errp, -ret, "Could not read image for determining its 
"
  "format");
@@ -974,7 +973,7 @@ QemuOptsList bdrv_runtime_opts = {
  *
  * Removes all processed options from *options.
  */
-static int bdrv_open_common(BlockDriverState *bs, BdrvChild *file,
+static int bdrv_open_common(BlockDriverState *bs, BlockBackend *file,
 QDict *options, Error **errp)
 {
 int ret, open_flags;
@@ -1005,7 +1004,7 @@ static int bdrv_open_common(BlockDriverState *bs, 
BdrvChild *file,
 assert(drv != NULL);
 
 if (file != NULL) {
-filename = file->bs->filename;
+filename = blk_bs(file)->filename;
 } else {
 filename = qdict_get_try_str(options, "filename");
 }
@@ -1707,7 +1706,7 @@ static BlockDriverState *bdrv_open_inherit(const char 
*filename,
Error **errp)
 {
 int ret;
-BdrvChild *file = NULL;
+BlockBackend *file = NULL;
 BlockDriverState *bs;
 BlockDriver *drv = NULL;
 const char *drvname;
@@ -1805,19 +1804,24 @@ static BlockDriverState *bdrv_open_inherit(const char 
*filename,
 qdict_del(options, "backing");
 }
 
-/* Open image file without format layer. This BdrvChild is only used for
+/* Open image file without format layer. This BlockBackend is only used for
  * probing, the block drivers will do their own bdrv_open_child() for the
  * same BDS, which is why we put the node name back into options. */
 if ((flags & BDRV_O_PROTOCOL) == 0) {
-/* FIXME Shouldn't attach a child to a node that isn't opened yet. */
-file = bdrv_open_child(filename, options, "file", bs,
-   _file, true, _err);
+BlockDriverState *file_bs;
+
+file_bs = bdrv_open_child_bs(filename, options, "file", bs,
+ _file, true, _err);
 if (local_err) {
 goto fail;
 }
-if (file != NULL) {
+if (file_bs != NULL) {
+file = blk_new();
+blk_insert_bs(file, file_bs);
+bdrv_unref(file_bs);
+
 qdict_put(options, "file",
-  qstring_from_str(bdrv_get_node_name(file->bs)));
+  qstring_from_str(bdrv_get_node_name(file_bs)));
 }
 }
 
@@ -1859,7 +1863,7 @@ static BlockDriverState *bdrv_open_inherit(const char 
*filename,
 }
 
 if (file) {
-bdrv_unref_child(bs, file);
+blk_unref(file);
 file = NULL;
 }
 
@@ -1921,9 +1925,7 @@ static BlockDriverState *bdrv_open_inherit(const char 
*filename,
 return bs;
 
 fail:
-if (file != NULL) {
-bdrv_unref_child(bs, file);
-}
+blk_unref(file);
 if (bs->file != NULL) {
 bdrv_unref_child(bs, bs->file);
 }
-- 
1.8.3.1




[Qemu-devel] [PULL 14/19] block: Factor out bdrv_open_child_bs()

2017-02-24 Thread Kevin Wolf
This is the part of bdrv_open_child() that opens a BDS with option
inheritance, but doesn't attach it as a child to the parent yet.

Signed-off-by: Kevin Wolf 
Reviewed-by: Max Reitz 
---
 block.c | 61 +
 1 file changed, 37 insertions(+), 24 deletions(-)

diff --git a/block.c b/block.c
index 40c4dee..6987400 100644
--- a/block.c
+++ b/block.c
@@ -1546,28 +1546,12 @@ free_exit:
 return ret;
 }
 
-/*
- * Opens a disk image whose options are given as BlockdevRef in another block
- * device's options.
- *
- * If allow_none is true, no image will be opened if filename is false and no
- * BlockdevRef is given. NULL will be returned, but errp remains unset.
- *
- * bdrev_key specifies the key for the image's BlockdevRef in the options 
QDict.
- * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
- * itself, all options starting with "${bdref_key}." are considered part of the
- * BlockdevRef.
- *
- * The BlockdevRef will be removed from the options QDict.
- */
-BdrvChild *bdrv_open_child(const char *filename,
-   QDict *options, const char *bdref_key,
-   BlockDriverState* parent,
-   const BdrvChildRole *child_role,
-   bool allow_none, Error **errp)
+static BlockDriverState *
+bdrv_open_child_bs(const char *filename, QDict *options, const char *bdref_key,
+   BlockDriverState *parent, const BdrvChildRole *child_role,
+   bool allow_none, Error **errp)
 {
-BdrvChild *c = NULL;
-BlockDriverState *bs;
+BlockDriverState *bs = NULL;
 QDict *image_options;
 char *bdref_key_dot;
 const char *reference;
@@ -1594,11 +1578,40 @@ BdrvChild *bdrv_open_child(const char *filename,
 goto done;
 }
 
-c = bdrv_attach_child(parent, bs, bdref_key, child_role);
-
 done:
 qdict_del(options, bdref_key);
-return c;
+return bs;
+}
+
+/*
+ * Opens a disk image whose options are given as BlockdevRef in another block
+ * device's options.
+ *
+ * If allow_none is true, no image will be opened if filename is false and no
+ * BlockdevRef is given. NULL will be returned, but errp remains unset.
+ *
+ * bdrev_key specifies the key for the image's BlockdevRef in the options 
QDict.
+ * That QDict has to be flattened; therefore, if the BlockdevRef is a QDict
+ * itself, all options starting with "${bdref_key}." are considered part of the
+ * BlockdevRef.
+ *
+ * The BlockdevRef will be removed from the options QDict.
+ */
+BdrvChild *bdrv_open_child(const char *filename,
+   QDict *options, const char *bdref_key,
+   BlockDriverState *parent,
+   const BdrvChildRole *child_role,
+   bool allow_none, Error **errp)
+{
+BlockDriverState *bs;
+
+bs = bdrv_open_child_bs(filename, options, bdref_key, parent, child_role,
+allow_none, errp);
+if (bs == NULL) {
+return NULL;
+}
+
+return bdrv_attach_child(parent, bs, bdref_key, child_role);
 }
 
 static BlockDriverState *bdrv_append_temp_snapshot(BlockDriverState *bs,
-- 
1.8.3.1




[Qemu-devel] [PULL 12/19] block: Pass BdrvChild to bdrv_truncate()

2017-02-24 Thread Kevin Wolf
Signed-off-by: Kevin Wolf 
Reviewed-by: Max Reitz 
---
 block.c| 3 ++-
 block/blkdebug.c   | 2 +-
 block/block-backend.c  | 2 +-
 block/crypto.c | 2 +-
 block/parallels.c  | 8 
 block/qcow.c   | 4 ++--
 block/qcow2-refcount.c | 2 +-
 block/qcow2.c  | 4 ++--
 block/raw-format.c | 2 +-
 block/vhdx-log.c   | 2 +-
 block/vhdx.c   | 2 +-
 include/block/block.h  | 2 +-
 12 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/block.c b/block.c
index 743c349..d951b5d 100644
--- a/block.c
+++ b/block.c
@@ -2626,8 +2626,9 @@ exit:
 /**
  * Truncate file to 'offset' bytes (needed only for file protocols)
  */
-int bdrv_truncate(BlockDriverState *bs, int64_t offset)
+int bdrv_truncate(BdrvChild *child, int64_t offset)
 {
+BlockDriverState *bs = child->bs;
 BlockDriver *drv = bs->drv;
 int ret;
 if (!drv)
diff --git a/block/blkdebug.c b/block/blkdebug.c
index d8eee1b..6117ce5 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -663,7 +663,7 @@ static int64_t blkdebug_getlength(BlockDriverState *bs)
 
 static int blkdebug_truncate(BlockDriverState *bs, int64_t offset)
 {
-return bdrv_truncate(bs->file->bs, offset);
+return bdrv_truncate(bs->file, offset);
 }
 
 static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options)
diff --git a/block/block-backend.c b/block/block-backend.c
index 819f272..492e71e 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1605,7 +1605,7 @@ int blk_truncate(BlockBackend *blk, int64_t offset)
 return -ENOMEDIUM;
 }
 
-return bdrv_truncate(blk_bs(blk), offset);
+return bdrv_truncate(blk->root, offset);
 }
 
 static void blk_pdiscard_entry(void *opaque)
diff --git a/block/crypto.c b/block/crypto.c
index 7aa7eb5..e05e4dd 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -383,7 +383,7 @@ static int block_crypto_truncate(BlockDriverState *bs, 
int64_t offset)
 
 offset += payload_offset;
 
-return bdrv_truncate(bs->file->bs, offset);
+return bdrv_truncate(bs->file, offset);
 }
 
 static void block_crypto_close(BlockDriverState *bs)
diff --git a/block/parallels.c b/block/parallels.c
index 2ccefa7..ac94dfb 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -215,7 +215,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, 
int64_t sector_num,
  s->data_end << BDRV_SECTOR_BITS,
  space << BDRV_SECTOR_BITS, 0);
 } else {
-ret = bdrv_truncate(bs->file->bs,
+ret = bdrv_truncate(bs->file,
 (s->data_end + space) << BDRV_SECTOR_BITS);
 }
 if (ret < 0) {
@@ -449,7 +449,7 @@ static int parallels_check(BlockDriverState *bs, 
BdrvCheckResult *res,
 size - res->image_end_offset);
 res->leaks += count;
 if (fix & BDRV_FIX_LEAKS) {
-ret = bdrv_truncate(bs->file->bs, res->image_end_offset);
+ret = bdrv_truncate(bs->file, res->image_end_offset);
 if (ret < 0) {
 res->check_errors++;
 return ret;
@@ -681,7 +681,7 @@ static int parallels_open(BlockDriverState *bs, QDict 
*options, int flags,
 goto fail_options;
 }
 if (!bdrv_has_zero_init(bs->file->bs) ||
-bdrv_truncate(bs->file->bs, bdrv_getlength(bs->file->bs)) != 0) {
+bdrv_truncate(bs->file, bdrv_getlength(bs->file->bs)) != 0) {
 s->prealloc_mode = PRL_PREALLOC_MODE_FALLOCATE;
 }
 
@@ -724,7 +724,7 @@ static void parallels_close(BlockDriverState *bs)
 }
 
 if (bs->open_flags & BDRV_O_RDWR) {
-bdrv_truncate(bs->file->bs, s->data_end << BDRV_SECTOR_BITS);
+bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS);
 }
 
 g_free(s->bat_dirty_bmap);
diff --git a/block/qcow.c b/block/qcow.c
index fb738fc..4534515 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -467,7 +467,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs,
 /* round to cluster size */
 cluster_offset = (cluster_offset + s->cluster_size - 1) &
 ~(s->cluster_size - 1);
-bdrv_truncate(bs->file->bs, cluster_offset + s->cluster_size);
+bdrv_truncate(bs->file, cluster_offset + s->cluster_size);
 /* if encrypted, we must initialize the cluster
content which won't be written */
 if (bs->encrypted &&
@@ -909,7 +909,7 @@ static int qcow_make_empty(BlockDriverState *bs)
 if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table,
 l1_length) < 0)
 return -1;
-ret = bdrv_truncate(bs->file->bs, s->l1_table_offset + l1_length);
+ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length);
 if (ret < 0)
 return ret;
 
diff --git a/block/qcow2-refcount.c 

[Qemu-devel] [PULL 11/19] mirror: Resize active commit base in mirror_run()

2017-02-24 Thread Kevin Wolf
This is more consistent with the commit block job, and it moves the code
to a place where we already have the necessary BlockBackends to resize
the base image when bdrv_truncate() is changed to require a BdrvChild.

Signed-off-by: Kevin Wolf 
Reviewed-by: Max Reitz 
---
 block/mirror.c | 50 ++
 1 file changed, 22 insertions(+), 28 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index ca8547b..3d50857 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -662,7 +662,28 @@ static void coroutine_fn mirror_run(void *opaque)
 if (s->bdev_length < 0) {
 ret = s->bdev_length;
 goto immediate_exit;
-} else if (s->bdev_length == 0) {
+}
+
+/* Active commit must resize the base image if its size differs from the
+ * active layer. */
+if (s->base == blk_bs(s->target)) {
+int64_t base_length;
+
+base_length = blk_getlength(s->target);
+if (base_length < 0) {
+ret = base_length;
+goto immediate_exit;
+}
+
+if (s->bdev_length > base_length) {
+ret = blk_truncate(s->target, s->bdev_length);
+if (ret < 0) {
+goto immediate_exit;
+}
+}
+}
+
+if (s->bdev_length == 0) {
 /* Report BLOCK_JOB_READY and wait for complete. */
 block_job_event_ready(>common);
 s->synced = true;
@@ -1063,9 +1084,7 @@ void commit_active_start(const char *job_id, 
BlockDriverState *bs,
  BlockCompletionFunc *cb, void *opaque, Error **errp,
  bool auto_complete)
 {
-int64_t length, base_length;
 int orig_base_flags;
-int ret;
 Error *local_err = NULL;
 
 orig_base_flags = bdrv_get_flags(base);
@@ -1074,31 +1093,6 @@ void commit_active_start(const char *job_id, 
BlockDriverState *bs,
 return;
 }
 
-length = bdrv_getlength(bs);
-if (length < 0) {
-error_setg_errno(errp, -length,
- "Unable to determine length of %s", bs->filename);
-goto error_restore_flags;
-}
-
-base_length = bdrv_getlength(base);
-if (base_length < 0) {
-error_setg_errno(errp, -base_length,
- "Unable to determine length of %s", base->filename);
-goto error_restore_flags;
-}
-
-if (length > base_length) {
-ret = bdrv_truncate(base, length);
-if (ret < 0) {
-error_setg_errno(errp, -ret,
-"Top image %s is larger than base image %s, and "
- "resize of base image failed",
- bs->filename, base->filename);
-goto error_restore_flags;
-}
-}
-
 mirror_start_job(job_id, bs, creation_flags, base, NULL, speed, 0, 0,
  MIRROR_LEAVE_BACKING_CHAIN,
  on_error, on_error, true, cb, opaque, _err,
-- 
1.8.3.1




Re: [Qemu-devel] [PATCH RFC v3 2/5] keyval: New keyval_parse()

2017-02-24 Thread Markus Armbruster
Eric Blake  writes:

> On 02/24/2017 01:58 AM, Markus Armbruster wrote:
>
>>> I hate the 'no$key' sugar, and would love to get rid of it.  It makes
>>> things ambiguous (thus confusing) for precious little gain: 'novocaine'
>>> can mean 'novocaine=on' or 'vocaine=off'.  QemuOpts picks the latter,
>>> even when a QemuOpt named 'novocaine' has been declared.
>>>
>>> keyval_parse() is meant to behave like QemuOpts, so I'm reproducing this
>>> bit of bad sugar, reluctantly.
>> 
>> Implied value alternatives / variations:
>> 
>> (1) Don't implement the "no" sugar
>> 
>> I think we'd have to deprecate it in QemuOpts now, to give users
>> some time to adapt before keyval_parse() replaces QemuOpts.
>> 
>
>> 
>> If we're happy with bad sugar like "can't abbreviate novocaine=on to
>> novocaine" (confusing), "filename without a value gets you the file
>> 'on'", we reimplement the bad sugar and move on.
>> 
>> If not, we have the choice between less magic or more magic.
>> 
>> Less magic: (1) and/or (2).
>> 
>> More magic: (3) or something like it.
>> 
>> If we don't want to decide right now, we can do
>> 
>> (4) Don't implement any implied value sugar for now
>> 
>> You have to spell out =on and =off.  Big fat comment to remind us
>> about the QemuOpts incompatibility.
>> 
>> I think this would be acceptable for -blockdev in 2.9.
>> 
>> Opinions?
>
> Do any existing blockdev sub-parameters begin with 'no'?  Ouch:
> BlockdevOptions has '*cache':'BlockdevCacheOptions' which in turn has
> '*no-flush':'bool'.

I hate negative flags.

>  Even worse, I think you could abuse 'nono-flush' to
> enable flushing - my head hurts.

"no-flush" gets desugared to "-flush=off", which doesn't work.

"nono-flush" gets desugared to "no-flush=off", which works.

> And of course, there's the obvious BlockDevOptions '*node-name':'str'.
> No one in their right mind would expect 'node-name' without parameters
> to result in 'de-name=no'.

This is something I'd be even willing to break for existing options.

> Also, I found that at least libvirt has existing command line usage
> -numa node,nodeid=0, where qemu_numa_opts uses .implied_opt_name="type",
> .desc = {{0}} to be visited by OptsVisitor.  Because of the implied opt,
> we get the desired "type=node" rather than the accidental "de=no", but
> the magic of implied keys can certainly make it weird to think about.

Correct.

The value of an implied key is not subject to key desugaring.

Without an implied key, the "node" would desugar to "de=off".

> How about:
>
> for 2.9: -blockdev has no magic at all (you HAVE to spell 'foo=off'
> rather than 'foo' for any boolean parameters that you want disabled in
> -blockdev);

You mean "rather than 'nofoo'".

"No magic at all" implies you also have to spell "foo=on" rather than
"foo".

I believe we need to have both or none in 2.9.  If we have only the "on"
sugar, "no-flush" works, and we can't add the "off" sugar without
breaking it.

> -blockdev); and in QemuOpts, we issue deprecation warnings but keep
> existing behavior any time someone uses 'noFOO' that we turn into
> 'FOO=no'.  Then, for 2.10, we can decide to remove the deprecation
> warnings if they pointed out real (ab)use of the sugar in the wild, or
> (hopefully) to kill the sugar entirely (as part of converting QemuOpts
> over to keyval_parse).
>
> In short, getting rid of the 'no' prefix magic, after suitable
> deprecation warnings, seems like a good plan to me; and having -blockdev
> be slightly tighter than the rest of command line options for 2.9 only
> is no real loss (no one uses -blockdev yet, so they can be written to
> avoid the use of 'no' magic to begin with).

Okay, I'll post QemuOpts patch to deprecate noKEY sugar.  Let's see
whether anyone screams.



Re: [Qemu-devel] [PULL 0/3] VFIO updates 2017-02-23

2017-02-24 Thread Peter Maydell
On 23 February 2017 at 19:32, Alex Williamson
 wrote:
> The following changes since commit 796b288f7be875045670f963ce1b3c8e96ac:
>
>   Merge remote-tracking branch 'remotes/cody/tags/block-pull-request' into 
> staging (2017-02-21 15:48:22 +)
>
> are available in the git repository at:
>
>
>   git://github.com/awilliam/qemu-vfio.git tags/vfio-updates-20170223.0
>
> for you to fetch changes up to c2b2e158cc7b1cb431bd6039824ec13c3184a775:
>
>   vfio/pci-quirks.c: Disable stolen memory for igd VFIO (2017-02-22 13:19:59 
> -0700)
>
> 
> VFIO updates 2017-02-23
>
>  - Report qdev_unplug errors (Alex Williamson)
>  - Fix ecap ID 0 handling, improve comment (Alex Williamson)
>  - Disable IGD stolen memory in UPT mode too (Xiong Zhang)
>
> 

Applied, thanks.

-- PMM



[Qemu-devel] [PULL 07/19] qemu-img: Improve documentation for PREALLOC_MODE_FALLOC

2017-02-24 Thread Kevin Wolf
From: Nir Soffer 

Now that we are truncating the file in both PREALLOC_MODE_FULL and
PREALLOC_MODE_OFF, not truncating in PREALLOC_MODE_FALLOC looks odd.
Add a comment explaining why we do not truncate in this case.

Signed-off-by: Nir Soffer 
Signed-off-by: Kevin Wolf 
---
 block/file-posix.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index d24e34b..4de1abd 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1594,9 +1594,14 @@ static int raw_create(const char *filename, QemuOpts 
*opts, Error **errp)
 switch (prealloc) {
 #ifdef CONFIG_POSIX_FALLOCATE
 case PREALLOC_MODE_FALLOC:
-/* posix_fallocate() doesn't set errno. */
+/*
+ * Truncating before posix_fallocate() makes it about twice slower on
+ * file systems that do not support fallocate(), trying to check if a
+ * block is allocated before allocating it, so don't do that here.
+ */
 result = -posix_fallocate(fd, 0, total_size);
 if (result != 0) {
+/* posix_fallocate() doesn't set errno. */
 error_setg_errno(errp, -result,
  "Could not preallocate data for the new file");
 }
-- 
1.8.3.1




[Qemu-devel] [PATCH v3 15/16] postcopy: Add doc about hugepages and postcopy

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Reviewed-by: Laurent Vivier 
---
 docs/migration.txt | 13 +
 1 file changed, 13 insertions(+)

diff --git a/docs/migration.txt b/docs/migration.txt
index 6503c17..b462ead 100644
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -482,3 +482,16 @@ request for a page that has already been sent is ignored.  
Duplicate requests
 such as this can happen as a page is sent at about the same time the
 destination accesses it.
 
+=== Postcopy with hugepages ===
+
+Postcopy now works with hugetlbfs backed memory:
+  a) The linux kernel on the destination must support userfault on hugepages.
+  b) The huge-page configuration on the source and destination VMs must be
+ identical; i.e. RAMBlocks on both sides must use the same page size.
+  c) Note that -mem-path /dev/hugepages  will fall back to allocating normal
+ RAM if it doesn't have enough hugepages, triggering (b) to fail.
+ Using -mem-prealloc enforces the allocation using hugepages.
+  d) Care should be taken with the size of hugepage used; postcopy with 2MB
+ hugepages works well, however 1GB hugepages are likely to be problematic
+ since it takes ~1 second to transfer a 1GB hugepage across a 10Gbps link,
+ and until the full page is transferred the destination thread is blocked.
-- 
2.9.3




[Qemu-devel] [PULL 06/19] qemu-img: Truncate before full preallocation

2017-02-24 Thread Kevin Wolf
From: Nir Soffer 

In a previous commit (qemu-img: Do not truncate before preallocation) we
moved truncate to the PREALLOC_MODE_OFF branch to avoid slowdown in
posix_fallocate().

However this change is not optimal when using PREALLOC_MODE_FULL, since
knowing the final size from the beginning could allow the file system
driver to do less allocations and possibly avoid fragmentation of the
file.

Now we truncate also before doing full preallocation.

Signed-off-by: Nir Soffer 
Signed-off-by: Kevin Wolf 
---
 block/file-posix.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index 442f080..d24e34b 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1604,6 +1604,17 @@ static int raw_create(const char *filename, QemuOpts 
*opts, Error **errp)
 #endif
 case PREALLOC_MODE_FULL:
 {
+/*
+ * Knowing the final size from the beginning could allow the file
+ * system driver to do less allocations and possibly avoid
+ * fragmentation of the file.
+ */
+if (ftruncate(fd, total_size) != 0) {
+result = -errno;
+error_setg_errno(errp, -result, "Could not resize file");
+goto out_close;
+}
+
 int64_t num = 0, left = total_size;
 buf = g_malloc0(65536);
 
@@ -1642,6 +1653,7 @@ static int raw_create(const char *filename, QemuOpts 
*opts, Error **errp)
 break;
 }
 
+out_close:
 if (qemu_close(fd) != 0 && result == 0) {
 result = -errno;
 error_setg_errno(errp, -result, "Could not close the new file");
-- 
1.8.3.1




[Qemu-devel] [PATCH v3 13/16] postcopy: Update userfaultfd.h header

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Just the userfaultfd.h update from Paolo's header
update run;

* Drop this patch after Paolo's update goes in *

Signed-off-by: Dr. David Alan Gilbert 
---
 linux-headers/linux/userfaultfd.h | 67 +--
 1 file changed, 58 insertions(+), 9 deletions(-)

diff --git a/linux-headers/linux/userfaultfd.h 
b/linux-headers/linux/userfaultfd.h
index 19e8453..2ed5dc3 100644
--- a/linux-headers/linux/userfaultfd.h
+++ b/linux-headers/linux/userfaultfd.h
@@ -11,13 +11,18 @@
 
 #include 
 
-#define UFFD_API ((__u64)0xAA)
 /*
- * After implementing the respective features it will become:
- * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
- *   UFFD_FEATURE_EVENT_FORK)
+ * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and
+ * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR.  In
+ * userfaultfd.h we assumed the kernel was reading (instead _IOC_READ
+ * means the userland is reading).
  */
-#define UFFD_API_FEATURES (0)
+#define UFFD_API ((__u64)0xAA)
+#define UFFD_API_FEATURES (UFFD_FEATURE_EVENT_FORK |   \
+  UFFD_FEATURE_EVENT_REMAP |   \
+  UFFD_FEATURE_EVENT_MADVDONTNEED |\
+  UFFD_FEATURE_MISSING_HUGETLBFS | \
+  UFFD_FEATURE_MISSING_SHMEM)
 #define UFFD_API_IOCTLS\
((__u64)1 << _UFFDIO_REGISTER | \
 (__u64)1 << _UFFDIO_UNREGISTER |   \
@@ -26,6 +31,9 @@
((__u64)1 << _UFFDIO_WAKE | \
 (__u64)1 << _UFFDIO_COPY | \
 (__u64)1 << _UFFDIO_ZEROPAGE)
+#define UFFD_API_RANGE_IOCTLS_BASIC\
+   ((__u64)1 << _UFFDIO_WAKE | \
+(__u64)1 << _UFFDIO_COPY)
 
 /*
  * Valid ioctl command number range with this API is from 0x00 to
@@ -72,6 +80,21 @@ struct uffd_msg {
} pagefault;
 
struct {
+   __u32   ufd;
+   } fork;
+
+   struct {
+   __u64   from;
+   __u64   to;
+   __u64   len;
+   } remap;
+
+   struct {
+   __u64   start;
+   __u64   end;
+   } madv_dn;
+
+   struct {
/* unused reserved fields */
__u64   reserved1;
__u64   reserved2;
@@ -84,9 +107,9 @@ struct uffd_msg {
  * Start at 0x12 and not at 0 to be more strict against bugs.
  */
 #define UFFD_EVENT_PAGEFAULT   0x12
-#if 0 /* not available yet */
 #define UFFD_EVENT_FORK0x13
-#endif
+#define UFFD_EVENT_REMAP   0x14
+#define UFFD_EVENT_MADVDONTNEED0x15
 
 /* flags for UFFD_EVENT_PAGEFAULT */
 #define UFFD_PAGEFAULT_FLAG_WRITE  (1<<0)  /* If this was a write fault */
@@ -104,11 +127,37 @@ struct uffdio_api {
 * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
 * are to be considered implicitly always enabled in all kernels as
 * long as the uffdio_api.api requested matches UFFD_API.
+*
+* UFFD_FEATURE_MISSING_HUGETLBFS means an UFFDIO_REGISTER
+* with UFFDIO_REGISTER_MODE_MISSING mode will succeed on
+* hugetlbfs virtual memory ranges. Adding or not adding
+* UFFD_FEATURE_MISSING_HUGETLBFS to uffdio_api.features has
+* no real functional effect after UFFDIO_API returns, but
+* it's only useful for an initial feature set probe at
+* UFFDIO_API time. There are two ways to use it:
+*
+* 1) by adding UFFD_FEATURE_MISSING_HUGETLBFS to the
+*uffdio_api.features before calling UFFDIO_API, an error
+*will be returned by UFFDIO_API on a kernel without
+*hugetlbfs missing support
+*
+* 2) the UFFD_FEATURE_MISSING_HUGETLBFS can not be added in
+*uffdio_api.features and instead it will be set by the
+*kernel in the uffdio_api.features if the kernel supports
+*it, so userland can later check if the feature flag is
+*present in uffdio_api.features after UFFDIO_API
+*succeeded.
+*
+* UFFD_FEATURE_MISSING_SHMEM works the same as
+* UFFD_FEATURE_MISSING_HUGETLBFS, but it applies to shmem
+* (i.e. tmpfs and other shmem based APIs).
 */
-#if 0 /* not available yet */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
 #define UFFD_FEATURE_EVENT_FORK(1<<1)
-#endif
+#define UFFD_FEATURE_EVENT_REMAP   (1<<2)
+#define UFFD_FEATURE_EVENT_MADVDONTNEED(1<<3)
+#define UFFD_FEATURE_MISSING_HUGETLBFS (1<<4)
+#define UFFD_FEATURE_MISSING_SHMEM (1<<5)
__u64 features;
 
__u64 ioctls;
-- 
2.9.3




[Qemu-devel] [PULL 04/19] qemu-img: Do not truncate before preallocation

2017-02-24 Thread Kevin Wolf
From: Nir Soffer 

When using file system that does not support fallocate() (e.g. NFS <
4.2), truncating the file only when preallocation=OFF speeds up creating
raw file.

Here is example run, tested on Fedora 24 machine, creating raw file on
NFS version 3 server.

$ time ./qemu-img-master create -f raw -o preallocation=falloc mnt/test 1g
Formatting 'mnt/test', fmt=raw size=1073741824 preallocation=falloc

real0m21.185s
user0m0.022s
sys 0m0.574s

$ time ./qemu-img-fix create -f raw -o preallocation=falloc mnt/test 1g
Formatting 'mnt/test', fmt=raw size=1073741824 preallocation=falloc

real0m11.601s
user0m0.016s
sys 0m0.525s

$ time dd if=/dev/zero of=mnt/test bs=1M count=1024 oflag=direct
1024+0 records in
1024+0 records out
1073741824 bytes (1.1 GB, 1.0 GiB) copied, 15.6627 s, 68.6 MB/s

real0m16.104s
user0m0.009s
sys 0m0.220s

Running with strace we can see that without this change we do one
pread() and one pwrite() for each block. With this change, we do only
one pwrite() per block.

$ strace ./qemu-img-master create -f raw -o preallocation=falloc mnt/test 8192
...
pread64(9, "\0", 1, 4095)   = 1
pwrite64(9, "\0", 1, 4095)  = 1
pread64(9, "\0", 1, 8191)   = 1
pwrite64(9, "\0", 1, 8191)  = 1

$ strace ./qemu-img-fix create -f raw -o preallocation=falloc mnt/test 8192
...
pwrite64(9, "\0", 1, 4095)  = 1
pwrite64(9, "\0", 1, 8191)  = 1

This happens because posix_fallocate is checking if each block is
allocated before writing a byte to the block, and when truncating the
file before preallocation, all blocks are unallocated.

Signed-off-by: Nir Soffer 
Signed-off-by: Kevin Wolf 
---
 block/file-posix.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 2134e0e..442f080 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1591,12 +1591,6 @@ static int raw_create(const char *filename, QemuOpts 
*opts, Error **errp)
 #endif
 }
 
-if (ftruncate(fd, total_size) != 0) {
-result = -errno;
-error_setg_errno(errp, -result, "Could not resize file");
-goto out_close;
-}
-
 switch (prealloc) {
 #ifdef CONFIG_POSIX_FALLOCATE
 case PREALLOC_MODE_FALLOC:
@@ -1636,6 +1630,10 @@ static int raw_create(const char *filename, QemuOpts 
*opts, Error **errp)
 break;
 }
 case PREALLOC_MODE_OFF:
+if (ftruncate(fd, total_size) != 0) {
+result = -errno;
+error_setg_errno(errp, -result, "Could not resize file");
+}
 break;
 default:
 result = -EINVAL;
@@ -1644,7 +1642,6 @@ static int raw_create(const char *filename, QemuOpts 
*opts, Error **errp)
 break;
 }
 
-out_close:
 if (qemu_close(fd) != 0 && result == 0) {
 result = -errno;
 error_setg_errno(errp, -result, "Could not close the new file");
-- 
1.8.3.1




[Qemu-devel] [PATCH v3 16/16] postcopy: Add extra check for COPY function

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

As an extra sanity check, make sure the region we're registering
can perform UFFDIO_COPY;  the COPY will fail later but this
gives a cleaner failure.

Signed-off-by: Dr. David Alan Gilbert 
---
 migration/postcopy-ram.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 102fb61..effbeb6 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -378,6 +378,10 @@ static int ram_block_enable_notify(const char *block_name, 
void *host_addr,
 error_report("%s userfault register: %s", __func__, strerror(errno));
 return -1;
 }
+if (!(reg_struct.ioctls & ((__u64)1 << _UFFDIO_COPY))) {
+error_report("%s userfault: Region doesn't support COPY", __func__);
+return -1;
+}
 
 return 0;
 }
-- 
2.9.3




Re: [Qemu-devel] [PATCH v14 00/24] MTTCG Base enabling patches with ARM enablement

2017-02-24 Thread Alex Bennée

G 3  writes:

> Hi I was wondering if your MTTCG patches have been tested with a
> PowerPC guest yet.

You would have to talk to PPC guys about the current status of MTTCG for
the PowerPC architecture. You can force it to run with -accel
tcg,thread=multi but it will likely behave weirdly if the atomic/barrier
updates haven't been made to the front end.

> Also do you have a repo someone could clone to test
> out all your patches?

Sure:

  https://github.com/stsquad/qemu/tree/mttcg/base-patches-v14

Hopefully it will get merged in the next couple of days ;-)

--
Alex Bennée



[Qemu-devel] [PATCH v3 12/16] postcopy: Allow hugepages

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Allow huge pages in postcopy.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Reviewed-by: Laurent Vivier 
---
 migration/postcopy-ram.c | 25 +
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 03cbd6e..6b30b43 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -85,24 +85,6 @@ static bool ufd_version_check(int ufd)
 }
 
 /*
- * Check for things that postcopy won't support; returns 0 if the block
- * is fine.
- */
-static int check_range(const char *block_name, void *host_addr,
-  ram_addr_t offset, ram_addr_t length, void *opaque)
-{
-RAMBlock *rb = qemu_ram_block_by_name(block_name);
-
-if (qemu_ram_pagesize(rb) > getpagesize()) {
-error_report("Postcopy doesn't support large page sizes yet (%s)",
- block_name);
-return -E2BIG;
-}
-
-return 0;
-}
-
-/*
  * Note: This has the side effect of munlock'ing all of RAM, that's
  * normally fine since if the postcopy succeeds it gets turned back on at the
  * end.
@@ -122,12 +104,6 @@ bool postcopy_ram_supported_by_host(void)
 goto out;
 }
 
-/* Check for anything about the RAMBlocks we don't support */
-if (qemu_ram_foreach_block(check_range, NULL)) {
-/* check_range will have printed its own error */
-goto out;
-}
-
 ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
 if (ufd == -1) {
 error_report("%s: userfaultfd not available: %s", __func__,
@@ -139,6 +115,7 @@ bool postcopy_ram_supported_by_host(void)
 if (!ufd_version_check(ufd)) {
 goto out;
 }
+/* TODO: Only allow huge pages if the kernel supports it */
 
 /*
  * userfault and mlock don't go together; we'll put it back later if
-- 
2.9.3




Re: [Qemu-devel] Hang in aio/multi/mutex/mcs

2017-02-24 Thread Alex Bennée

Paolo Bonzini  writes:

> On 23/02/2017 20:48, Alex Bennée wrote:
>> Hope that helps the debugging ;-)
>
> Worst case we can just remove the test (it's only there for performance
> comparison, not a bug in actual QEMU code), but this seems to help here:
>
> diff --git a/tests/test-aio-multithread.c b/tests/test-aio-multithread.c
> index f11e990..8b0b40e 100644
> --- a/tests/test-aio-multithread.c
> +++ b/tests/test-aio-multithread.c
> @@ -309,7 +309,7 @@ static void mcs_mutex_lock(void)
>  static void mcs_mutex_unlock(void)
>  {
>  int next;
> -if (nodes[id].next == -1) {
> +if (atomic_read([id].next) == -1) {
>  if (atomic_read(_head) == id &&
>  atomic_cmpxchg(_head, id, -1) == id) {
>  /* Last item in the list, exit.  */
> @@ -323,7 +323,7 @@ static void mcs_mutex_unlock(void)
>  }
>
>  /* Wake up the next in line.  */
> -next = nodes[id].next;
> +next = atomic_read([id].next);
>  nodes[next].locked = 0;
>  qemu_futex_wake([next].locked, 1);
>  }

Well it certainly looks like it improves the results on Travis.

Tested-by: Alex Bennée 
Reviewed-by: Alex Bennée 

--
Alex Bennée



[Qemu-devel] [PATCH v3 10/16] postcopy: Mask fault addresses to huge page boundary

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Currently the fault address received by userfault is rounded to
the host page boundary and a host page is requested from the source.
Use the current RAMBlock page size instead of the general host page
size so that for RAMBlocks backed by huge pages we request the whole
huge page.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Reviewed-by: Laurent Vivier 
---
 migration/postcopy-ram.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 4c736d2..03cbd6e 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -403,7 +403,6 @@ static void *postcopy_ram_fault_thread(void *opaque)
 MigrationIncomingState *mis = opaque;
 struct uffd_msg msg;
 int ret;
-size_t hostpagesize = getpagesize();
 RAMBlock *rb = NULL;
 RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
 
@@ -470,7 +469,7 @@ static void *postcopy_ram_fault_thread(void *opaque)
 break;
 }
 
-rb_offset &= ~(hostpagesize - 1);
+rb_offset &= ~(qemu_ram_pagesize(rb) - 1);
 trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
 qemu_ram_get_idstr(rb),
 rb_offset);
@@ -482,11 +481,11 @@ static void *postcopy_ram_fault_thread(void *opaque)
 if (rb != last_rb) {
 last_rb = rb;
 migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
- rb_offset, hostpagesize);
+ rb_offset, qemu_ram_pagesize(rb));
 } else {
 /* Save some space */
 migrate_send_rp_req_pages(mis, NULL,
- rb_offset, hostpagesize);
+ rb_offset, qemu_ram_pagesize(rb));
 }
 }
 trace_postcopy_ram_fault_thread_exit();
-- 
2.9.3




[Qemu-devel] [PATCH v3 14/16] postcopy: Check for userfault+hugepage feature

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

We need extra Linux kernel support (~4.11) to support userfaults
on hugetlbfs; check for them.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Reviewed-by: Laurent Vivier 
---
 migration/postcopy-ram.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
index 6b30b43..102fb61 100644
--- a/migration/postcopy-ram.c
+++ b/migration/postcopy-ram.c
@@ -81,6 +81,17 @@ static bool ufd_version_check(int ufd)
 return false;
 }
 
+if (getpagesize() != ram_pagesize_summary()) {
+bool have_hp = false;
+/* We've got a huge page */
+#ifdef UFFD_FEATURE_MISSING_HUGETLBFS
+have_hp = api_struct.features & UFFD_FEATURE_MISSING_HUGETLBFS;
+#endif
+if (!have_hp) {
+error_report("Userfault on this host does not support huge pages");
+return false;
+}
+}
 return true;
 }
 
@@ -115,7 +126,6 @@ bool postcopy_ram_supported_by_host(void)
 if (!ufd_version_check(ufd)) {
 goto out;
 }
-/* TODO: Only allow huge pages if the kernel supports it */
 
 /*
  * userfault and mlock don't go together; we'll put it back later if
-- 
2.9.3




Re: [Qemu-devel] [PATCH risu 0/9] risu: refactor and reduce CPU-specific code

2017-02-24 Thread Peter Maydell
On 24 February 2017 at 18:15, Laurent Vivier  wrote:
> Le 24/02/2017 à 18:35, Peter Maydell a écrit :
>> My motivation for all this, incidentally, is that I wanted to have
>> a go at resurrecting the x86 backend as a test case for how we
>> should support variable-length instruction sets.
>
> I think m68k is needing that too.

Yep; but I don't have an m68k to hand, whereas I do have
x86-64 ;-)

thanks
-- PMM



Re: [Qemu-devel] [PATCH v9 2/5] hw/intc/arm_gicv3_kvm: Add ICC_SRE_EL1 register to vmstate

2017-02-24 Thread Auger Eric
Hi Peter,

On 24/02/2017 18:57, Peter Maydell wrote:
> On 24 February 2017 at 17:53, Auger Eric  wrote:
>> Hi,
>>
>> On 23/02/2017 12:51, vijay.kil...@gmail.com wrote:
>>> From: Vijaya Kumar K 
>>>
>>> To Save and Restore ICC_SRE_EL1 register introduce vmstate
>>> subsection and load only if non-zero.
>> != 7
>>
>>> Also initialize icc_sre_el1 with to 0x7 in pre_load
>>> function.
>>>
>>> Signed-off-by: Vijaya Kumar K 
>>> ---
>>>  hw/intc/arm_gicv3_common.c | 36 
>>> 
>>>  include/hw/intc/arm_gicv3_common.h |  1 +
>>>  2 files changed, 37 insertions(+)
>>>
>>> diff --git a/hw/intc/arm_gicv3_common.c b/hw/intc/arm_gicv3_common.c
>>> index 16b9b0f..5b0e456 100644
>>> --- a/hw/intc/arm_gicv3_common.c
>>> +++ b/hw/intc/arm_gicv3_common.c
>>> @@ -70,6 +70,38 @@ static const VMStateDescription vmstate_gicv3_cpu_virt = 
>>> {
>>>  }
>>>  };
>>>
>>> +static int icc_sre_el1_reg_pre_load(void *opaque)
>>> +{
>>> +GICv3CPUState *cs = opaque;
>>> +
>>> +   /*
>>> +* If the sre_el1 subsection is not transferred this
>>> +* means SRE_EL1 is 0x7 (which might not be the same as
>>> +* our reset value).
>>> +*/
>>> +cs->icc_sre_el1 = 0x7;
>>> +return 0;
>>> +}
>> As Peter asked before I don't really get why we need the pre_load
>> function here.
> 
> No, it's correct. As the comment says, the reset value
> of icc_sre_el1 might not be 7 (it is right now, but it's
> less of a trap for future changes to not assume it).
> So the way migration works is that we use pre-load on the destination
> end to set the value to 0x7, then if the source end transfers the data
> we get the transferred value, otherwise we end up with the default
> value as set in the pre-load function.

OK that clarifies.

Thanks!

Eric
> 
> thanks
> -- PMM
> 



[Qemu-devel] [PATCH v3 09/16] postcopy: Load huge pages in one go

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

The existing postcopy RAM load loop already ensures that it
glues together whole host-pages from the target page size chunks sent
over the wire.  Modify the definition of host page that it uses
to be the RAM block page size and thus be huge pages where appropriate.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Reviewed-by: Laurent Vivier 
---
 migration/ram.c | 13 -
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/migration/ram.c b/migration/ram.c
index ff86664..9f28da2 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -2390,7 +2390,7 @@ static int ram_load_postcopy(QEMUFile *f)
 {
 int flags = 0, ret = 0;
 bool place_needed = false;
-bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
+bool matching_page_sizes = false;
 MigrationIncomingState *mis = migration_incoming_get_current();
 /* Temporary page that is later 'placed' */
 void *postcopy_host_page = postcopy_get_tmp_page(mis);
@@ -2420,8 +2420,11 @@ static int ram_load_postcopy(QEMUFile *f)
 ret = -EINVAL;
 break;
 }
+matching_page_sizes = block->page_size == TARGET_PAGE_SIZE;
 /*
- * Postcopy requires that we place whole host pages atomically.
+ * Postcopy requires that we place whole host pages atomically;
+ * these may be huge pages for RAMBlocks that are backed by
+ * hugetlbfs.
  * To make it atomic, the data is read into a temporary page
  * that's moved into place later.
  * The migration protocol uses,  possibly smaller, target-pages
@@ -2429,9 +2432,9 @@ static int ram_load_postcopy(QEMUFile *f)
  * of a host page in order.
  */
 page_buffer = postcopy_host_page +
-  ((uintptr_t)host & ~qemu_host_page_mask);
+  ((uintptr_t)host & (block->page_size - 1));
 /* If all TP are zero then we can optimise the place */
-if (!((uintptr_t)host & ~qemu_host_page_mask)) {
+if (!((uintptr_t)host & (block->page_size - 1))) {
 all_zero = true;
 } else {
 /* not the 1st TP within the HP */
@@ -2449,7 +2452,7 @@ static int ram_load_postcopy(QEMUFile *f)
  * page
  */
 place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
- ~qemu_host_page_mask) == 0;
+ (block->page_size - 1)) == 0;
 place_source = postcopy_host_page;
 }
 last_host = host;
-- 
2.9.3




[Qemu-devel] [PATCH v3 11/16] postcopy: Send whole huge pages

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

The RAM save code uses ram_save_host_page to send whole
host pages at a time;  change this to use the host page size associated
with the RAM Block which may be a huge page.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Reviewed-by: Laurent Vivier 
---
 migration/ram.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/migration/ram.c b/migration/ram.c
index 9f28da2..719425b 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -1302,6 +1302,8 @@ static int ram_save_target_page(MigrationState *ms, 
QEMUFile *f,
  * offset to point into the middle of a host page
  * in which case the remainder of the hostpage is sent.
  * Only dirty target pages are sent.
+ * Note that the host page size may be a huge page for this
+ * block.
  *
  * Returns: Number of pages written.
  *
@@ -1320,6 +1322,8 @@ static int ram_save_host_page(MigrationState *ms, 
QEMUFile *f,
   ram_addr_t dirty_ram_abs)
 {
 int tmppages, pages = 0;
+size_t pagesize = qemu_ram_pagesize(pss->block);
+
 do {
 tmppages = ram_save_target_page(ms, f, pss, last_stage,
 bytes_transferred, dirty_ram_abs);
@@ -1330,7 +1334,7 @@ static int ram_save_host_page(MigrationState *ms, 
QEMUFile *f,
 pages += tmppages;
 pss->offset += TARGET_PAGE_SIZE;
 dirty_ram_abs += TARGET_PAGE_SIZE;
-} while (pss->offset & (qemu_host_page_size - 1));
+} while (pss->offset & (pagesize - 1));
 
 /* The offset we leave with is the last one we looked at */
 pss->offset -= TARGET_PAGE_SIZE;
-- 
2.9.3




[Qemu-devel] [PATCH v3 06/16] postcopy: Record largest page size

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Record the largest page size in use; we'll need it soon for allocating
temporary buffers.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Reviewed-by: Laurent Vivier 
---
 exec.c| 13 +
 include/exec/cpu-common.h |  1 +
 include/migration/migration.h |  1 +
 migration/migration.c |  1 +
 4 files changed, 16 insertions(+)

diff --git a/exec.c b/exec.c
index ec3a11f..099794a 100644
--- a/exec.c
+++ b/exec.c
@@ -1524,6 +1524,19 @@ size_t qemu_ram_pagesize(RAMBlock *rb)
 return rb->page_size;
 }
 
+/* Returns the largest size of page in use */
+size_t qemu_ram_pagesize_largest(void)
+{
+RAMBlock *block;
+size_t largest = 0;
+
+QLIST_FOREACH_RCU(block, _list.blocks, next) {
+largest = MAX(largest, qemu_ram_pagesize(block));
+}
+
+return largest;
+}
+
 static int memory_try_enable_merging(void *addr, size_t len)
 {
 if (!machine_mem_merge(current_machine)) {
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 1350c2e..8c305aa 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -64,6 +64,7 @@ void qemu_ram_set_idstr(RAMBlock *block, const char *name, 
DeviceState *dev);
 void qemu_ram_unset_idstr(RAMBlock *block);
 const char *qemu_ram_get_idstr(RAMBlock *rb);
 size_t qemu_ram_pagesize(RAMBlock *block);
+size_t qemu_ram_pagesize_largest(void);
 
 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
 int len, int is_write);
diff --git a/include/migration/migration.h b/include/migration/migration.h
index aafa68f..e8e2a7f 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -92,6 +92,7 @@ struct MigrationIncomingState {
  */
 QemuEvent main_thread_load_event;
 
+size_t largest_page_size;
 bool   have_fault_thread;
 QemuThread fault_thread;
 QemuSemaphore  fault_thread_sem;
diff --git a/migration/migration.c b/migration/migration.c
index c6ae69d..869697e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -383,6 +383,7 @@ static void process_incoming_migration_co(void *opaque)
 int ret;
 
 mis->from_src_file = f;
+mis->largest_page_size = qemu_ram_pagesize_largest();
 postcopy_state_set(POSTCOPY_INCOMING_NONE);
 migrate_set_state(>state, MIGRATION_STATUS_NONE,
   MIGRATION_STATUS_ACTIVE);
-- 
2.9.3




[Qemu-devel] [PULL 18/19] vvfat: Use opened node as backing file

2017-02-24 Thread Kevin Wolf
We should not try to assign a not yet opened node as the backing file,
because as soon as the permission system is added it will fail.  The
just added bdrv_new_open_driver() function is the right tool to open a
file with an internal driver, use it.

In case anyone wonders whether that magic fake backing file to trigger a
special action on 'commit' actually works today: No, not for me. One
reason is that we've been adding a raw format driver on top for several
years now and raw doesn't support commit. Other reasons include that the
backing file isn't writable and the driver doesn't support reopen, and
it's also size 0 and the driver doesn't support bdrv_truncate. All of
these are easily fixable, but then 'commit' ended up in an infinite loop
deep in the vvfat code for me, so I thought I'd best leave it alone. I'm
not really sure what it was supposed to do anyway.

Signed-off-by: Kevin Wolf 
Reviewed-by: Max Reitz 
---
 block/vvfat.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/block/vvfat.c b/block/vvfat.c
index c6bf67e..7f230be 100644
--- a/block/vvfat.c
+++ b/block/vvfat.c
@@ -2968,6 +2968,7 @@ static void write_target_close(BlockDriverState *bs) {
 
 static BlockDriver vvfat_write_target = {
 .format_name= "vvfat_write_target",
+.instance_size  = sizeof(void*),
 .bdrv_co_pwritev= write_target_commit,
 .bdrv_close = write_target_close,
 };
@@ -3036,14 +3037,13 @@ static int enable_write_target(BlockDriverState *bs, 
Error **errp)
 unlink(s->qcow_filename);
 #endif
 
-backing = bdrv_new();
+backing = bdrv_new_open_driver(_write_target, NULL, 
BDRV_O_ALLOW_RDWR,
+   _abort);
+*(void**) backing->opaque = s;
+
 bdrv_set_backing_hd(s->bs, backing);
 bdrv_unref(backing);
 
-s->bs->backing->bs->drv = _write_target;
-s->bs->backing->bs->opaque = g_new(void *, 1);
-*(void**)s->bs->backing->bs->opaque = s;
-
 return 0;
 
 err:
-- 
1.8.3.1




[Qemu-devel] [PATCH v3 05/16] postcopy: enhance ram_block_discard_range for hugepages

2017-02-24 Thread Dr. David Alan Gilbert (git)
From: "Dr. David Alan Gilbert" 

Unfortunately madvise DONTNEED doesn't work on hugepagetlb
so use fallocate(FALLOC_FL_PUNCH_HOLE)
qemu_fd_getpagesize only sets the page based off a file
if the file is from hugetlbfs.

Signed-off-by: Dr. David Alan Gilbert 
Reviewed-by: Juan Quintela 
Reviewed-by: Laurent Vivier 
---
 exec.c | 24 
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/exec.c b/exec.c
index ddbc314..ec3a11f 100644
--- a/exec.c
+++ b/exec.c
@@ -46,6 +46,11 @@
 #include "sysemu/xen-mapcache.h"
 #include "trace-root.h"
 
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+#include 
+#include 
+#endif
+
 #endif
 #include "exec/cpu-all.h"
 #include "qemu/rcu_queue.h"
@@ -3320,12 +3325,23 @@ int ram_block_discard_range(RAMBlock *rb, uint64_t 
start, size_t length)
 
 errno = ENOTSUP; /* If we are missing MADVISE etc */
 
+if (rb->page_size == qemu_host_page_size) {
 #if defined(CONFIG_MADVISE)
-/* Note: We need the madvise MADV_DONTNEED behaviour of definitely
- * freeing the page.
- */
-ret = madvise(host_startaddr, length, MADV_DONTNEED);
+/* Note: We need the madvise MADV_DONTNEED behaviour of definitely
+ * freeing the page.
+ */
+ret = madvise(host_startaddr, length, MADV_DONTNEED);
 #endif
+} else {
+/* Huge page case  - unfortunately it can't do DONTNEED, but
+ * it can do the equivalent by FALLOC_FL_PUNCH_HOLE in the
+ * huge page file.
+ */
+#ifdef CONFIG_FALLOCATE_PUNCH_HOLE
+ret = fallocate(rb->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+start, length);
+#endif
+}
 if (ret) {
 ret = -errno;
 error_report("ram_block_discard_range: Failed to discard range "
-- 
2.9.3




  1   2   3   4   5   >