Re: [PATCH v3] hw/block/nvme: align with existing style

2021-04-15 Thread Klaus Jensen

On Apr 16 09:22, Gollu Appalanaidu wrote:

Use lower case hexadecimal format for the constants and in
comments use the same format as used in Spec. ("h")

Signed-off-by: Gollu Appalanaidu 
---
-v3: Add Suggestions (Philippe)
Describe the NVMe subsystem style in nvme.c header

-v2: Address review comments (Klaus)
use lower case hexa format for the code and in comments
use the same format as used in Spec. ("h")

hw/block/nvme-ns.c   |  2 +-
hw/block/nvme.c  | 46 +---
include/block/nvme.h | 10 +-
3 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 7bb618f182..a0895614d9 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -303,7 +303,7 @@ static void nvme_ns_init_zoned(NvmeNamespace *ns)

id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));

-/* MAR/MOR are zeroes-based, 0x means no limit */
+/* MAR/MOR are zeroes-based, Fh means no limit */
id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
id_ns_z->zoc = 0;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 624a1431d0..cbe7f33daa 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -134,6 +134,12 @@
 * Setting this property to true enables Read Across Zone Boundaries.
 */

+/**
+ * While QEMU coding style prefers lowercase hexadecimal in constants,
+ * the NVMe subsystem use the format from the NVMe specifications in
+ * the comments: no '0x' prefix, uppercase, 'h' hexadecimal suffix
+ */
+
#include "qemu/osdep.h"
#include "qemu/units.h"
#include "qemu/error-report.h"
@@ -3607,18 +3613,18 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)

/*
 * In the base NVM command set, Flush may apply to all namespaces
- * (indicated by NSID being set to 0x). But if that feature is used
+ * (indicated by NSID being set to h). But if that feature is used
 * along with TP 4056 (Namespace Types), it may be pretty screwed up.
 *
- * If NSID is indeed set to 0x, we simply cannot associate the
+ * If NSID is indeed set to h, we simply cannot associate the
 * opcode with a specific command since we cannot determine a unique I/O
 * command set. Opcode 0x0 could have any other meaning than something
 * equivalent to flushing and say it DOES have completely different
- * semantics in some other command set - does an NSID of 0x then
+ * semantics in some other command set - does an NSID of h then
 * mean "for all namespaces, apply whatever command set specific command
 * that uses the 0x0 opcode?" Or does it mean "for all namespaces, apply
 * whatever command that uses the 0x0 opcode if, and only if, it allows
- * NSID to be 0x"?
+ * NSID to be h"?
 *
 * Anyway (and luckily), for now, we do not care about this since the
 * device only supports namespace types that includes the NVM Flush command
@@ -3934,7 +3940,7 @@ static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
NVME_CHANGED_NSID_SIZE) {
/*
 * If more than 1024 namespaces, the first entry in the log page should
- * be set to 0x and the others to 0 as spec.
+ * be set to h and the others to 0 as spec.
 */
if (i == ARRAY_SIZE(nslist)) {
memset(nslist, 0x0, sizeof(nslist));
@@ -4332,7 +4338,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req,
trace_pci_nvme_identify_nslist(min_nsid);

/*
- * Both 0x (NVME_NSID_BROADCAST) and 0xfffe are invalid values
+ * Both h (NVME_NSID_BROADCAST) and Eh are invalid values
 * since the Active Namespace ID List should return namespaces with ids
 * *higher* than the NSID specified in the command. This is also specified
 * in the spec (NVM Express v1.3d, Section 5.15.4).
@@ -4379,7 +4385,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req,
trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);

/*
- * Same as in nvme_identify_nslist(), 0x/0xfffe are invalid.
+ * Same as in nvme_identify_nslist(), h/Eh are invalid.
 */
if (min_nsid >= NVME_NSID_BROADCAST - 1) {
return NVME_INVALID_NSID | NVME_DNR;
@@ -4595,7 +4601,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest 
*req)
/*
 * The Reservation Notification Mask and Reservation Persistence
 * features require a status code of Invalid Field in Command when
- * NSID is 0x. Since the device does not support those
+ * NSID is h. Since the device does not support those
 * features we can always return Invalid Namespace or Format as we
 * should do for all other features.
 

[PATCH] block/file-posix: Fix problem with fallocate(PUNCH_HOLE) on GPFS

2021-04-15 Thread Thomas Huth
A customer reported that running

 qemu-img convert -t none -O qcow2 -f qcow2 input.qcow2 output.qcow2

fails for them with the following error message when the images are
stored on a GPFS file system:

 qemu-img: error while writing sector 0: Invalid argument

After analyzing the strace output, it seems like the problem is in
handle_aiocb_write_zeroes(): The call to fallocate(FALLOC_FL_PUNCH_HOLE)
returns EINVAL, which can apparently happen if the file system has
a different idea of the granularity of the operation. It's arguably
a bug in GPFS, since the PUNCH_HOLE mode should not result in EINVAL
according to the man-page of fallocate(), but the file system is out
there in production and so we have to deal with it. In commit 294682cc3a
("block: workaround for unaligned byte range in fallocate()") we also
already applied the a work-around for the same problem to the earlier
fallocate(FALLOC_FL_ZERO_RANGE) call, so do it now similar with the
PUNCH_HOLE call.

Signed-off-by: Thomas Huth 
---
 block/file-posix.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index 20e14f8e96..7a40428d52 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1675,6 +1675,13 @@ static int handle_aiocb_write_zeroes(void *opaque)
 }
 s->has_fallocate = false;
 } else if (ret != -ENOTSUP) {
+if (ret == -EINVAL) {
+/*
+ * File systems like GPFS do not like unaligned byte ranges,
+ * treat it like unsupported (so caller falls back to pwrite)
+ */
+return -ENOTSUP;
+}
 return ret;
 } else {
 s->has_discard = false;
-- 
2.27.0




[PATCH v3] hw/block/nvme: align with existing style

2021-04-15 Thread Gollu Appalanaidu
Use lower case hexadecimal format for the constants and in
comments use the same format as used in Spec. ("h")

Signed-off-by: Gollu Appalanaidu 
---
-v3: Add Suggestions (Philippe)
 Describe the NVMe subsystem style in nvme.c header

-v2: Address review comments (Klaus)
 use lower case hexa format for the code and in comments
 use the same format as used in Spec. ("h")

 hw/block/nvme-ns.c   |  2 +-
 hw/block/nvme.c  | 46 +---
 include/block/nvme.h | 10 +-
 3 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 7bb618f182..a0895614d9 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -303,7 +303,7 @@ static void nvme_ns_init_zoned(NvmeNamespace *ns)
 
 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
 
-/* MAR/MOR are zeroes-based, 0x means no limit */
+/* MAR/MOR are zeroes-based, Fh means no limit */
 id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
 id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
 id_ns_z->zoc = 0;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 624a1431d0..cbe7f33daa 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -134,6 +134,12 @@
  * Setting this property to true enables Read Across Zone Boundaries.
  */
 
+/**
+ * While QEMU coding style prefers lowercase hexadecimal in constants,
+ * the NVMe subsystem use the format from the NVMe specifications in
+ * the comments: no '0x' prefix, uppercase, 'h' hexadecimal suffix
+ */
+
 #include "qemu/osdep.h"
 #include "qemu/units.h"
 #include "qemu/error-report.h"
@@ -3607,18 +3613,18 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 
 /*
  * In the base NVM command set, Flush may apply to all namespaces
- * (indicated by NSID being set to 0x). But if that feature is used
+ * (indicated by NSID being set to h). But if that feature is used
  * along with TP 4056 (Namespace Types), it may be pretty screwed up.
  *
- * If NSID is indeed set to 0x, we simply cannot associate the
+ * If NSID is indeed set to h, we simply cannot associate the
  * opcode with a specific command since we cannot determine a unique I/O
  * command set. Opcode 0x0 could have any other meaning than something
  * equivalent to flushing and say it DOES have completely different
- * semantics in some other command set - does an NSID of 0x then
+ * semantics in some other command set - does an NSID of h then
  * mean "for all namespaces, apply whatever command set specific command
  * that uses the 0x0 opcode?" Or does it mean "for all namespaces, apply
  * whatever command that uses the 0x0 opcode if, and only if, it allows
- * NSID to be 0x"?
+ * NSID to be h"?
  *
  * Anyway (and luckily), for now, we do not care about this since the
  * device only supports namespace types that includes the NVM Flush command
@@ -3934,7 +3940,7 @@ static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 NVME_CHANGED_NSID_SIZE) {
 /*
  * If more than 1024 namespaces, the first entry in the log page should
- * be set to 0x and the others to 0 as spec.
+ * be set to h and the others to 0 as spec.
  */
 if (i == ARRAY_SIZE(nslist)) {
 memset(nslist, 0x0, sizeof(nslist));
@@ -4332,7 +4338,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req,
 trace_pci_nvme_identify_nslist(min_nsid);
 
 /*
- * Both 0x (NVME_NSID_BROADCAST) and 0xfffe are invalid values
+ * Both h (NVME_NSID_BROADCAST) and Eh are invalid values
  * since the Active Namespace ID List should return namespaces with ids
  * *higher* than the NSID specified in the command. This is also specified
  * in the spec (NVM Express v1.3d, Section 5.15.4).
@@ -4379,7 +4385,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req,
 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
 
 /*
- * Same as in nvme_identify_nslist(), 0x/0xfffe are invalid.
+ * Same as in nvme_identify_nslist(), h/Eh are invalid.
  */
 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
 return NVME_INVALID_NSID | NVME_DNR;
@@ -4595,7 +4601,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest 
*req)
 /*
  * The Reservation Notification Mask and Reservation Persistence
  * features require a status code of Invalid Field in Command when
- * NSID is 0x. Since the device does not support those
+ * NSID is h. Since the device does not support those
  * features we can always return Invalid Namespace or Format as we
  * should do for all 

Re: [PATCH v2] hw/block/nvme: align with existing style

2021-04-15 Thread Gollu Appalanaidu

On Thu, Apr 15, 2021 at 07:18:50PM +0200, Klaus Jensen wrote:

On Apr 15 15:13, Philippe Mathieu-Daudé wrote:

On 4/15/21 2:00 PM, Gollu Appalanaidu wrote:

Make uniform hexadecimal numbers format.

Signed-off-by: Gollu Appalanaidu 
---
-v2: Address review comments (Klaus)
use lower case hexa format for the code and in comments
use the same format as used in Spec. ("h")


^ This comment is relevant to the commit message.

Also it would be nice if the subsystem could describe somewhere
what is its style. Not sure where... The file header is probably
the simplest place.

Something like:

"While QEMU coding style prefers lowercase hexadecimal in constants,
the NVMe subsystem use the format from the NVMe specifications in
the comments: no '0x' prefix, uppercase, 'h' hexadecimal suffix."



+1 for that suggestion.



Sure, will add it and send v3.


hw/block/nvme-ns.c   |  2 +-
hw/block/nvme.c  | 40 
include/block/nvme.h | 10 +-
3 files changed, 26 insertions(+), 26 deletions(-)








Re: [PATCH 0/5] block, migration: improve debugging of migration bdrv_flush failure

2021-04-15 Thread Connor Kuehl

On 4/15/21 8:58 AM, Daniel P. Berrangé wrote:

I spent a while debugging a tricky migration failure today which was
ultimately caused by fdatasync() getting EACCESS. The existing probes
were not sufficient to diagnose this, so I had to resort to GDB. This
improves probes and block error reporting to make future diagnosis
possible without GDB.

Daniel P. Berrangé (5):
   migration: add trace point when vm_stop_force_state fails
   softmmu: add trace point when bdrv_flush_all fails
   block: preserve errno from fdatasync failures
   block: add trace point when fdatasync fails
   block: remove duplicate trace.h include

  block/file-posix.c | 10 +-
  block/trace-events |  1 +
  migration/migration.c  |  1 +
  migration/trace-events |  1 +
  softmmu/cpus.c |  7 ++-
  softmmu/trace-events   |  3 +++
  6 files changed, 17 insertions(+), 6 deletions(-)



For the series:

Reviewed-by: Connor Kuehl 




about mirror cancel

2021-04-15 Thread Vladimir Sementsov-Ogievskiy

Hi all!

Recently I've implemented fast-cancelling of mirror job: do 
bdrv_cancel_in_flight() in mirror_cancel().

Now I'm in doubt: is it a correct thing? I heard, that mirror-cancel is a kind 
of valid mirror completion..

Looking at documentation:

# Note that if you issue 'block-job-cancel' after 'drive-mirror' has indicated
# (via the event BLOCK_JOB_READY) that the source and destination are
# synchronized, then the event triggered by this command changes to
# BLOCK_JOB_COMPLETED, to indicate that the mirroring has ended and the
# destination now has a point-in-time copy tied to the time of the cancellation.

So, in other words, do we guarantee something to the user, if it calls 
mirror-cancel in ready state? Does this abuse exist in libvirt?



Note, that if cancelling all in-flight requests on target is wrong on mirror 
cancel, we still don't have real bug, as the only implementation of 
.bdrv_cancel_in_flight is stopping reconnect waiting in nbd driver. So, we'll 
cancel requests only if connection is already lost anyway.

But that probably means, that correct name of the handler would be 
.bdrv_cancel_in_fligth_requests_that_will_most_probably_fail_anyway()..

And it also means, that abuse of mirror-cancel as valid completion is a bad 
idea, as we can't distinguish the cases when user calls job-cancel to have a 
kind of point-in-time copy, or just to cancel job (and being not interested in 
the final state of target).

So, probably we need an option boolean argument for blockjob-cancel, like 
"hard", that will cancel in-flight writes on target node..

--
Best regards,
Vladimir



Re: [PATCH v2] hw/block/nvme: align with existing style

2021-04-15 Thread Klaus Jensen

On Apr 15 15:13, Philippe Mathieu-Daudé wrote:

On 4/15/21 2:00 PM, Gollu Appalanaidu wrote:

Make uniform hexadecimal numbers format.

Signed-off-by: Gollu Appalanaidu 
---
-v2: Address review comments (Klaus)
use lower case hexa format for the code and in comments
use the same format as used in Spec. ("h")


^ This comment is relevant to the commit message.

Also it would be nice if the subsystem could describe somewhere
what is its style. Not sure where... The file header is probably
the simplest place.

Something like:

"While QEMU coding style prefers lowercase hexadecimal in constants,
the NVMe subsystem use the format from the NVMe specifications in
the comments: no '0x' prefix, uppercase, 'h' hexadecimal suffix."



+1 for that suggestion.


 hw/block/nvme-ns.c   |  2 +-
 hw/block/nvme.c  | 40 
 include/block/nvme.h | 10 +-
 3 files changed, 26 insertions(+), 26 deletions(-)





signature.asc
Description: PGP signature


Re: [PATCH 4/5] block: add trace point when fdatasync fails

2021-04-15 Thread Dr. David Alan Gilbert
* Daniel P. Berrangé (berra...@redhat.com) wrote:
> A flush failure is a critical failure scenario for some operations.
> For example, it will prevent migration from completing, as it will
> make vm_stop() report an error. Thus it is important to have a
> trace point present for debugging.
> 
> Signed-off-by: Daniel P. Berrangé 

I'd have had to admit to not having thought that would fail; the fact
we're debugging something where it does, suggests it's a good idea!


Reviewed-by: Dr. David Alan Gilbert 

> ---
>  block/file-posix.c | 2 ++
>  block/trace-events | 1 +
>  2 files changed, 3 insertions(+)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 99cf452f84..6aafeda44f 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -1362,6 +1362,8 @@ static int handle_aiocb_flush(void *opaque)
>  
>  ret = qemu_fdatasync(aiocb->aio_fildes);
>  if (ret == -1) {
> +trace_file_flush_fdatasync_failed(errno);
> +
>  /* There is no clear definition of the semantics of a failing 
> fsync(),
>   * so we may have to assume the worst. The sad truth is that this
>   * assumption is correct for Linux. Some pages are now probably 
> marked
> diff --git a/block/trace-events b/block/trace-events
> index 1a12d634e2..c8a943e992 100644
> --- a/block/trace-events
> +++ b/block/trace-events
> @@ -206,6 +206,7 @@ file_copy_file_range(void *bs, int src, int64_t src_off, 
> int dst, int64_t dst_of
>  file_FindEjectableOpticalMedia(const char *media) "Matching using %s"
>  file_setup_cdrom(const char *partition) "Using %s as optical disc"
>  file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
> +file_flush_fdatasync_failed(int err) "errno %d"
>  
>  # sheepdog.c
>  sheepdog_reconnect_to_sdog(void) "Wait for connection to be established"
> -- 
> 2.30.2
> 
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH 5/5] block: remove duplicate trace.h include

2021-04-15 Thread Dr. David Alan Gilbert
* Daniel P. Berrangé (berra...@redhat.com) wrote:
> Signed-off-by: Daniel P. Berrangé 

Reviewed-by: Dr. David Alan Gilbert 

> ---
>  block/file-posix.c | 2 --
>  1 file changed, 2 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 6aafeda44f..2538e43299 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -106,8 +106,6 @@
>  #include 
>  #endif
>  
> -#include "trace.h"
> -
>  /* OS X does not have O_DSYNC */
>  #ifndef O_DSYNC
>  #ifdef O_SYNC
> -- 
> 2.30.2
> 
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH 1/5] migration: add trace point when vm_stop_force_state fails

2021-04-15 Thread Dr. David Alan Gilbert
* Daniel P. Berrangé (berra...@redhat.com) wrote:
> This is a critical failure scenario for migration that is hard to
> diagnose from existing probes. Most likely it is caused by an error
> from bdrv_flush(), but we're not logging the errno anywhere, hence
> this new probe.
> 
> Signed-off-by: Daniel P. Berrangé 

Reviewed-by: Dr. David Alan Gilbert 

> ---
>  migration/migration.c  | 1 +
>  migration/trace-events | 1 +
>  2 files changed, 2 insertions(+)
> 
> diff --git a/migration/migration.c b/migration/migration.c
> index 8ca034136b..bee0dcd501 100644
> --- a/migration/migration.c
> +++ b/migration/migration.c
> @@ -3121,6 +3121,7 @@ static void migration_completion(MigrationState *s)
>  if (!ret) {
>  bool inactivate = !migrate_colo_enabled();
>  ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
> +trace_migration_completion_vm_stop(ret);
>  if (ret >= 0) {
>  ret = migration_maybe_pause(s, _active_state,
>  MIGRATION_STATUS_DEVICE);
> diff --git a/migration/trace-events b/migration/trace-events
> index 668c562fed..8ec28432eb 100644
> --- a/migration/trace-events
> +++ b/migration/trace-events
> @@ -149,6 +149,7 @@ migrate_pending(uint64_t size, uint64_t max, uint64_t 
> pre, uint64_t compat, uint
>  migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
>  migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 
> 0x%"PRIi64
>  migration_completion_file_err(void) ""
> +migration_completion_vm_stop(int ret) "ret %d"
>  migration_completion_postcopy_end(void) ""
>  migration_completion_postcopy_end_after_complete(void) ""
>  migration_rate_limit_pre(int ms) "%d ms"
> -- 
> 2.30.2
> 
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




Re: [PATCH 2/5] softmmu: add trace point when bdrv_flush_all fails

2021-04-15 Thread Dr. David Alan Gilbert
* Daniel P. Berrangé (berra...@redhat.com) wrote:
> The VM stop process has to flush outstanding I/O and this is a critical
> failure scenario that is hard to diagnose. Add a probe point that
> records the flush return code.
> 
> Signed-off-by: Daniel P. Berrangé 

Reviewed-by: Dr. David Alan Gilbert 

> ---
>  softmmu/cpus.c   | 7 ++-
>  softmmu/trace-events | 3 +++
>  2 files changed, 9 insertions(+), 1 deletion(-)
> 
> diff --git a/softmmu/cpus.c b/softmmu/cpus.c
> index a7ee431187..c3caaeb26e 100644
> --- a/softmmu/cpus.c
> +++ b/softmmu/cpus.c
> @@ -44,6 +44,7 @@
>  #include "sysemu/whpx.h"
>  #include "hw/boards.h"
>  #include "hw/hw.h"
> +#include "trace.h"
>  
>  #ifdef CONFIG_LINUX
>  
> @@ -266,6 +267,7 @@ static int do_vm_stop(RunState state, bool send_stop)
>  
>  bdrv_drain_all();
>  ret = bdrv_flush_all();
> +trace_vm_stop_flush_all(ret);
>  
>  return ret;
>  }
> @@ -704,12 +706,15 @@ int vm_stop_force_state(RunState state)
>  if (runstate_is_running()) {
>  return vm_stop(state);
>  } else {
> +int ret;
>  runstate_set(state);
>  
>  bdrv_drain_all();
>  /* Make sure to return an error if the flush in a previous vm_stop()
>   * failed. */
> -return bdrv_flush_all();
> +ret = bdrv_flush_all();
> +trace_vm_stop_flush_all(ret);
> +return ret;
>  }
>  }
>  
> diff --git a/softmmu/trace-events b/softmmu/trace-events
> index b80ca042e1..f11b427544 100644
> --- a/softmmu/trace-events
> +++ b/softmmu/trace-events
> @@ -19,6 +19,9 @@ flatview_new(void *view, void *root) "%p (root %p)"
>  flatview_destroy(void *view, void *root) "%p (root %p)"
>  flatview_destroy_rcu(void *view, void *root) "%p (root %p)"
>  
> +# softmmu.c
> +vm_stop_flush_all(int ret) "ret %d"
> +
>  # vl.c
>  vm_state_notify(int running, int reason, const char *reason_str) "running %d 
> reason %d (%s)"
>  load_file(const char *name, const char *path) "name %s location %s"
> -- 
> 2.30.2
> 
-- 
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK




[RFC PATCH 2/2] qemu-img convert: Fix sparseness detection

2021-04-15 Thread Kevin Wolf
In order to avoid RMW cycles, is_allocated_sectors() treats zeroed areas
like non-zero data if the end of the checked area isn't aligned. This
can improve the efficiency of the conversion and was introduced in
commit 8dcd3c9b91a.

However, it comes with a correctness problem: qemu-img convert is
supposed to sparsify areas that contain only zeros, which it doesn't do
any more. It turns out that this even happens when not only the
unaligned area is zeroed, but also the blocks before and after it. In
the bug report, conversion of a fragmented 10G image containing only
zeros resulted in an image consuming 2.82 GiB even though the expected
size is only 4 KiB.

As a tradeoff between both, let's ignore zeroed sectors only after
non-zero data to fix the alignment, but if we're only looking at zeros,
keep them as such, even if it may mean additional RMW cycles.

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1882917
Signed-off-by: Kevin Wolf 
---
 qemu-img.c | 18 --
 tests/qemu-iotests/122.out | 12 
 2 files changed, 8 insertions(+), 22 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index a5993682aa..ca4eba2dd1 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1168,20 +1168,10 @@ static int is_allocated_sectors(const uint8_t *buf, int 
n, int *pnum,
 }
 
 tail = (sector_num + i) & (alignment - 1);
-if (tail) {
-if (is_zero && i <= tail) {
-/* treat unallocated areas which only consist
- * of a small tail as allocated. */
-is_zero = false;
-}
-if (!is_zero) {
-/* align up end offset of allocated areas. */
-i += alignment - tail;
-i = MIN(i, n);
-} else {
-/* align down end offset of zero areas. */
-i -= tail;
-}
+if (tail && !is_zero) {
+/* align up end offset of allocated areas. */
+i += alignment - tail;
+i = MIN(i, n);
 }
 *pnum = i;
 return !is_zero;
diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out
index dcc44a2304..fe0ea34164 100644
--- a/tests/qemu-iotests/122.out
+++ b/tests/qemu-iotests/122.out
@@ -199,11 +199,9 @@ convert -S 4k
 [{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
 { "start": 4096, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 8192, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
-{ "start": 12288, "length": 4096, "depth": 0, "zero": true, "data": false},
-{ "start": 16384, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
-{ "start": 20480, "length": 46080, "depth": 0, "zero": true, "data": false},
-{ "start": 66560, "length": 1024, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
-{ "start": 67584, "length": 67041280, "depth": 0, "zero": true, "data": false}]
+{ "start": 12288, "length": 5120, "depth": 0, "zero": true, "data": false},
+{ "start": 17408, "length": 3072, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 20480, "length": 67088384, "depth": 0, "zero": true, "data": false}]
 
 convert -c -S 4k
 [{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true},
@@ -215,9 +213,7 @@ convert -c -S 4k
 
 convert -S 8k
 [{ "start": 0, "length": 24576, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
-{ "start": 24576, "length": 41984, "depth": 0, "zero": true, "data": false},
-{ "start": 66560, "length": 1024, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
-{ "start": 67584, "length": 67041280, "depth": 0, "zero": true, "data": false}]
+{ "start": 24576, "length": 67084288, "depth": 0, "zero": true, "data": false}]
 
 convert -c -S 8k
 [{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true},
-- 
2.30.2




[RFC PATCH 1/2] iotests: Test qemu-img convert of zeroed data cluster

2021-04-15 Thread Kevin Wolf
This demonstrates what happens when the block status changes in
sub-min_sparse granularity, but all of the parts are zeroed out. The
alignment logic in is_allocated_sectors() prevents that the target image
remains fully sparse as expected, but turns it into a data cluster of
explicit zeros.

Signed-off-by: Kevin Wolf 
---
 tests/qemu-iotests/122 |  1 +
 tests/qemu-iotests/122.out | 10 --
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/122 b/tests/qemu-iotests/122
index 5d550ed13e..7a213a4df9 100755
--- a/tests/qemu-iotests/122
+++ b/tests/qemu-iotests/122
@@ -251,6 +251,7 @@ $QEMU_IO -c "write -P 0 0 64k" "$TEST_IMG" 2>&1 | 
_filter_qemu_io | _filter_test
 $QEMU_IO -c "write 0 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
 $QEMU_IO -c "write 8k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
 $QEMU_IO -c "write 17k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | _filter_testdir
+$QEMU_IO -c "write -P 0 65k 1k" "$TEST_IMG" 2>&1 | _filter_qemu_io | 
_filter_testdir
 
 for min_sparse in 4k 8k; do
 echo
diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out
index 3a3e121d57..dcc44a2304 100644
--- a/tests/qemu-iotests/122.out
+++ b/tests/qemu-iotests/122.out
@@ -192,6 +192,8 @@ wrote 1024/1024 bytes at offset 8192
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 wrote 1024/1024 bytes at offset 17408
 1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+wrote 1024/1024 bytes at offset 66560
+1 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 
 convert -S 4k
 [{ "start": 0, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
@@ -199,7 +201,9 @@ convert -S 4k
 { "start": 8192, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
 { "start": 12288, "length": 4096, "depth": 0, "zero": true, "data": false},
 { "start": 16384, "length": 4096, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
-{ "start": 20480, "length": 67088384, "depth": 0, "zero": true, "data": false}]
+{ "start": 20480, "length": 46080, "depth": 0, "zero": true, "data": false},
+{ "start": 66560, "length": 1024, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 67584, "length": 67041280, "depth": 0, "zero": true, "data": false}]
 
 convert -c -S 4k
 [{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true},
@@ -211,7 +215,9 @@ convert -c -S 4k
 
 convert -S 8k
 [{ "start": 0, "length": 24576, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
-{ "start": 24576, "length": 67084288, "depth": 0, "zero": true, "data": false}]
+{ "start": 24576, "length": 41984, "depth": 0, "zero": true, "data": false},
+{ "start": 66560, "length": 1024, "depth": 0, "zero": false, "data": true, 
"offset": OFFSET},
+{ "start": 67584, "length": 67041280, "depth": 0, "zero": true, "data": false}]
 
 convert -c -S 8k
 [{ "start": 0, "length": 1024, "depth": 0, "zero": false, "data": true},
-- 
2.30.2




[RFC PATCH 0/2] qemu-img convert: Fix sparseness detection

2021-04-15 Thread Kevin Wolf
Peter, three years ago you changed 'qemu-img convert' to sacrifice some
sparsification in order to get aligned requests on the target image. At
the time, I thought the impact would be small, but it turns out that
this can end up wasting gigabytes of storagee (like converting a fully
zeroed 10 GB image taking 2.8 GB instead of a few kilobytes).

https://bugzilla.redhat.com/show_bug.cgi?id=1882917

I'm not entirely sure how to attack this best since this is a tradeoff,
but maybe the approach in this series is still good enough for the case
that you wanted to fix back then?

Of course, it would be possible to have a more complete fix like looking
forward a few blocks more before writing data, but that would probably
not be entirely trivial because you would have to merge blocks with ZERO
block status with DATA blocks that contain only zeros. I'm not sure if
it's worth this complication of the code.

Kevin Wolf (2):
  iotests: Test qemu-img convert of zeroed data cluster
  qemu-img convert: Fix sparseness detection

 qemu-img.c | 18 --
 tests/qemu-iotests/122 |  1 +
 tests/qemu-iotests/122.out |  6 --
 3 files changed, 9 insertions(+), 16 deletions(-)

-- 
2.30.2




[PATCH 5/5] block: remove duplicate trace.h include

2021-04-15 Thread Daniel P . Berrangé
Signed-off-by: Daniel P. Berrangé 
---
 block/file-posix.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 6aafeda44f..2538e43299 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -106,8 +106,6 @@
 #include 
 #endif
 
-#include "trace.h"
-
 /* OS X does not have O_DSYNC */
 #ifndef O_DSYNC
 #ifdef O_SYNC
-- 
2.30.2




[PATCH 4/5] block: add trace point when fdatasync fails

2021-04-15 Thread Daniel P . Berrangé
A flush failure is a critical failure scenario for some operations.
For example, it will prevent migration from completing, as it will
make vm_stop() report an error. Thus it is important to have a
trace point present for debugging.

Signed-off-by: Daniel P. Berrangé 
---
 block/file-posix.c | 2 ++
 block/trace-events | 1 +
 2 files changed, 3 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index 99cf452f84..6aafeda44f 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1362,6 +1362,8 @@ static int handle_aiocb_flush(void *opaque)
 
 ret = qemu_fdatasync(aiocb->aio_fildes);
 if (ret == -1) {
+trace_file_flush_fdatasync_failed(errno);
+
 /* There is no clear definition of the semantics of a failing fsync(),
  * so we may have to assume the worst. The sad truth is that this
  * assumption is correct for Linux. Some pages are now probably marked
diff --git a/block/trace-events b/block/trace-events
index 1a12d634e2..c8a943e992 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -206,6 +206,7 @@ file_copy_file_range(void *bs, int src, int64_t src_off, 
int dst, int64_t dst_of
 file_FindEjectableOpticalMedia(const char *media) "Matching using %s"
 file_setup_cdrom(const char *partition) "Using %s as optical disc"
 file_hdev_is_sg(int type, int version) "SG device found: type=%d, version=%d"
+file_flush_fdatasync_failed(int err) "errno %d"
 
 # sheepdog.c
 sheepdog_reconnect_to_sdog(void) "Wait for connection to be established"
-- 
2.30.2




[PATCH 3/5] block: preserve errno from fdatasync failures

2021-04-15 Thread Daniel P . Berrangé
When fdatasync() fails on a file backend we set a flag that
short-circuits any future attempts to call fdatasync(). The
first failure returns the true errno, but the later short-
circuited calls return a generic EIO. The latter is unhelpful
because fdatasync() can return a variety of errnos, including
EACCESS.

Signed-off-by: Daniel P. Berrangé 
---
 block/file-posix.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 20e14f8e96..99cf452f84 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -160,7 +160,7 @@ typedef struct BDRVRawState {
 bool discard_zeroes:1;
 bool use_linux_aio:1;
 bool use_linux_io_uring:1;
-bool page_cache_inconsistent:1;
+int page_cache_inconsistent; /* errno from fdatasync failure */
 bool has_fallocate;
 bool needs_alignment;
 bool drop_cache;
@@ -1357,7 +1357,7 @@ static int handle_aiocb_flush(void *opaque)
 int ret;
 
 if (s->page_cache_inconsistent) {
-return -EIO;
+return -s->page_cache_inconsistent;
 }
 
 ret = qemu_fdatasync(aiocb->aio_fildes);
@@ -1376,7 +1376,7 @@ static int handle_aiocb_flush(void *opaque)
  * Obviously, this doesn't affect O_DIRECT, which bypasses the page
  * cache. */
 if ((s->open_flags & O_DIRECT) == 0) {
-s->page_cache_inconsistent = true;
+s->page_cache_inconsistent = errno;
 }
 return -errno;
 }
-- 
2.30.2




[PATCH 2/5] softmmu: add trace point when bdrv_flush_all fails

2021-04-15 Thread Daniel P . Berrangé
The VM stop process has to flush outstanding I/O and this is a critical
failure scenario that is hard to diagnose. Add a probe point that
records the flush return code.

Signed-off-by: Daniel P. Berrangé 
---
 softmmu/cpus.c   | 7 ++-
 softmmu/trace-events | 3 +++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/softmmu/cpus.c b/softmmu/cpus.c
index a7ee431187..c3caaeb26e 100644
--- a/softmmu/cpus.c
+++ b/softmmu/cpus.c
@@ -44,6 +44,7 @@
 #include "sysemu/whpx.h"
 #include "hw/boards.h"
 #include "hw/hw.h"
+#include "trace.h"
 
 #ifdef CONFIG_LINUX
 
@@ -266,6 +267,7 @@ static int do_vm_stop(RunState state, bool send_stop)
 
 bdrv_drain_all();
 ret = bdrv_flush_all();
+trace_vm_stop_flush_all(ret);
 
 return ret;
 }
@@ -704,12 +706,15 @@ int vm_stop_force_state(RunState state)
 if (runstate_is_running()) {
 return vm_stop(state);
 } else {
+int ret;
 runstate_set(state);
 
 bdrv_drain_all();
 /* Make sure to return an error if the flush in a previous vm_stop()
  * failed. */
-return bdrv_flush_all();
+ret = bdrv_flush_all();
+trace_vm_stop_flush_all(ret);
+return ret;
 }
 }
 
diff --git a/softmmu/trace-events b/softmmu/trace-events
index b80ca042e1..f11b427544 100644
--- a/softmmu/trace-events
+++ b/softmmu/trace-events
@@ -19,6 +19,9 @@ flatview_new(void *view, void *root) "%p (root %p)"
 flatview_destroy(void *view, void *root) "%p (root %p)"
 flatview_destroy_rcu(void *view, void *root) "%p (root %p)"
 
+# softmmu.c
+vm_stop_flush_all(int ret) "ret %d"
+
 # vl.c
 vm_state_notify(int running, int reason, const char *reason_str) "running %d 
reason %d (%s)"
 load_file(const char *name, const char *path) "name %s location %s"
-- 
2.30.2




[PATCH 1/5] migration: add trace point when vm_stop_force_state fails

2021-04-15 Thread Daniel P . Berrangé
This is a critical failure scenario for migration that is hard to
diagnose from existing probes. Most likely it is caused by an error
from bdrv_flush(), but we're not logging the errno anywhere, hence
this new probe.

Signed-off-by: Daniel P. Berrangé 
---
 migration/migration.c  | 1 +
 migration/trace-events | 1 +
 2 files changed, 2 insertions(+)

diff --git a/migration/migration.c b/migration/migration.c
index 8ca034136b..bee0dcd501 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -3121,6 +3121,7 @@ static void migration_completion(MigrationState *s)
 if (!ret) {
 bool inactivate = !migrate_colo_enabled();
 ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+trace_migration_completion_vm_stop(ret);
 if (ret >= 0) {
 ret = migration_maybe_pause(s, _active_state,
 MIGRATION_STATUS_DEVICE);
diff --git a/migration/trace-events b/migration/trace-events
index 668c562fed..8ec28432eb 100644
--- a/migration/trace-events
+++ b/migration/trace-events
@@ -149,6 +149,7 @@ migrate_pending(uint64_t size, uint64_t max, uint64_t pre, 
uint64_t compat, uint
 migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
 migrate_send_rp_recv_bitmap(char *name, int64_t size) "block '%s' size 
0x%"PRIi64
 migration_completion_file_err(void) ""
+migration_completion_vm_stop(int ret) "ret %d"
 migration_completion_postcopy_end(void) ""
 migration_completion_postcopy_end_after_complete(void) ""
 migration_rate_limit_pre(int ms) "%d ms"
-- 
2.30.2




[PATCH 0/5] block, migration: improve debugging of migration bdrv_flush failure

2021-04-15 Thread Daniel P . Berrangé
I spent a while debugging a tricky migration failure today which was
ultimately caused by fdatasync() getting EACCESS. The existing probes
were not sufficient to diagnose this, so I had to resort to GDB. This
improves probes and block error reporting to make future diagnosis
possible without GDB.

Daniel P. Berrangé (5):
  migration: add trace point when vm_stop_force_state fails
  softmmu: add trace point when bdrv_flush_all fails
  block: preserve errno from fdatasync failures
  block: add trace point when fdatasync fails
  block: remove duplicate trace.h include

 block/file-posix.c | 10 +-
 block/trace-events |  1 +
 migration/migration.c  |  1 +
 migration/trace-events |  1 +
 softmmu/cpus.c |  7 ++-
 softmmu/trace-events   |  3 +++
 6 files changed, 17 insertions(+), 6 deletions(-)

-- 
2.30.2





Re: [PATCH v2] hw/block/nvme: align with existing style

2021-04-15 Thread Philippe Mathieu-Daudé
On 4/15/21 2:00 PM, Gollu Appalanaidu wrote:
> Make uniform hexadecimal numbers format.
> 
> Signed-off-by: Gollu Appalanaidu 
> ---
> -v2: Address review comments (Klaus)
> use lower case hexa format for the code and in comments 
> use the same format as used in Spec. ("h")

^ This comment is relevant to the commit message.

Also it would be nice if the subsystem could describe somewhere
what is its style. Not sure where... The file header is probably
the simplest place.

Something like:

"While QEMU coding style prefers lowercase hexadecimal in constants,
the NVMe subsystem use the format from the NVMe specifications in
the comments: no '0x' prefix, uppercase, 'h' hexadecimal suffix."

>  hw/block/nvme-ns.c   |  2 +-
>  hw/block/nvme.c  | 40 
>  include/block/nvme.h | 10 +-
>  3 files changed, 26 insertions(+), 26 deletions(-)




[PATCH v2] hw/block/nvme: align with existing style

2021-04-15 Thread Gollu Appalanaidu
Make uniform hexadecimal numbers format.

Signed-off-by: Gollu Appalanaidu 
---
-v2: Address review comments (Klaus)
use lower case hexa format for the code and in comments 
use the same format as used in Spec. ("h")


 hw/block/nvme-ns.c   |  2 +-
 hw/block/nvme.c  | 40 
 include/block/nvme.h | 10 +-
 3 files changed, 26 insertions(+), 26 deletions(-)

diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
index 7bb618f182..a0895614d9 100644
--- a/hw/block/nvme-ns.c
+++ b/hw/block/nvme-ns.c
@@ -303,7 +303,7 @@ static void nvme_ns_init_zoned(NvmeNamespace *ns)
 
 id_ns_z = g_malloc0(sizeof(NvmeIdNsZoned));
 
-/* MAR/MOR are zeroes-based, 0x means no limit */
+/* MAR/MOR are zeroes-based, Fh means no limit */
 id_ns_z->mar = cpu_to_le32(ns->params.max_active_zones - 1);
 id_ns_z->mor = cpu_to_le32(ns->params.max_open_zones - 1);
 id_ns_z->zoc = 0;
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 624a1431d0..f50e25c501 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -3607,18 +3607,18 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)
 
 /*
  * In the base NVM command set, Flush may apply to all namespaces
- * (indicated by NSID being set to 0x). But if that feature is used
+ * (indicated by NSID being set to h). But if that feature is used
  * along with TP 4056 (Namespace Types), it may be pretty screwed up.
  *
- * If NSID is indeed set to 0x, we simply cannot associate the
+ * If NSID is indeed set to h, we simply cannot associate the
  * opcode with a specific command since we cannot determine a unique I/O
  * command set. Opcode 0x0 could have any other meaning than something
  * equivalent to flushing and say it DOES have completely different
- * semantics in some other command set - does an NSID of 0x then
+ * semantics in some other command set - does an NSID of h then
  * mean "for all namespaces, apply whatever command set specific command
  * that uses the 0x0 opcode?" Or does it mean "for all namespaces, apply
  * whatever command that uses the 0x0 opcode if, and only if, it allows
- * NSID to be 0x"?
+ * NSID to be h"?
  *
  * Anyway (and luckily), for now, we do not care about this since the
  * device only supports namespace types that includes the NVM Flush command
@@ -3934,7 +3934,7 @@ static uint16_t nvme_changed_nslist(NvmeCtrl *n, uint8_t 
rae, uint32_t buf_len,
 NVME_CHANGED_NSID_SIZE) {
 /*
  * If more than 1024 namespaces, the first entry in the log page should
- * be set to 0x and the others to 0 as spec.
+ * be set to h and the others to 0 as spec.
  */
 if (i == ARRAY_SIZE(nslist)) {
 memset(nslist, 0x0, sizeof(nslist));
@@ -4332,7 +4332,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
NvmeRequest *req,
 trace_pci_nvme_identify_nslist(min_nsid);
 
 /*
- * Both 0x (NVME_NSID_BROADCAST) and 0xfffe are invalid values
+ * Both h (NVME_NSID_BROADCAST) and Eh are invalid values
  * since the Active Namespace ID List should return namespaces with ids
  * *higher* than the NSID specified in the command. This is also specified
  * in the spec (NVM Express v1.3d, Section 5.15.4).
@@ -4379,7 +4379,7 @@ static uint16_t nvme_identify_nslist_csi(NvmeCtrl *n, 
NvmeRequest *req,
 trace_pci_nvme_identify_nslist_csi(min_nsid, c->csi);
 
 /*
- * Same as in nvme_identify_nslist(), 0x/0xfffe are invalid.
+ * Same as in nvme_identify_nslist(), h/Eh are invalid.
  */
 if (min_nsid >= NVME_NSID_BROADCAST - 1) {
 return NVME_INVALID_NSID | NVME_DNR;
@@ -4595,7 +4595,7 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeRequest 
*req)
 /*
  * The Reservation Notification Mask and Reservation Persistence
  * features require a status code of Invalid Field in Command when
- * NSID is 0x. Since the device does not support those
+ * NSID is h. Since the device does not support those
  * features we can always return Invalid Namespace or Format as we
  * should do for all other features.
  */
@@ -4854,8 +4854,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
 return NVME_INVALID_FIELD | NVME_DNR;
 }
 
-trace_pci_nvme_setfeat_numq((dw11 & 0x) + 1,
-((dw11 >> 16) & 0x) + 1,
+trace_pci_nvme_setfeat_numq((dw11 & 0x) + 1,
+((dw11 >> 16) & 0x) + 1,
 n->params.max_ioqpairs,
 n->params.max_ioqpairs);
 

Re: [PATCH 0/3] file-posix: fix refresh_limits for SCSI devices

2021-04-15 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/20210415124307.428203-1-pbonz...@redhat.com/



Hi,

This series seems to have some coding style problems. See output below for
more information:

Type: series
Message-id: 20210415124307.428203-1-pbonz...@redhat.com
Subject: [PATCH 0/3] file-posix: fix refresh_limits for SCSI devices

=== TEST SCRIPT BEGIN ===
#!/bin/bash
git rev-parse base > /dev/null || exit 0
git config --local diff.renamelimit 0
git config --local diff.renames True
git config --local diff.algorithm histogram
./scripts/checkpatch.pl --mailback base..
=== TEST SCRIPT END ===

Updating 3c8cf5a9c21ff8782164d1def7f44bd888713384
From https://github.com/patchew-project/qemu
 * [new tag] patchew/20210415124307.428203-1-pbonz...@redhat.com -> 
patchew/20210415124307.428203-1-pbonz...@redhat.com
Switched to a new branch 'test'
37feb25 file-posix: fix max_iov for /dev/sg devices
eec0521 file-posix: try BLKSECTGET on block devices too, do not round to power 
of 2
c44bc38 scsi-generic: pass max_segments via max_iov field in BlockLimits

=== OUTPUT BEGIN ===
1/3 Checking commit c44bc386ba20 (scsi-generic: pass max_segments via max_iov 
field in BlockLimits)
WARNING: line over 80 characters
#51: FILE: hw/scsi/scsi-generic.c:186:
+max_transfer = MIN_NON_ZERO(max_transfer, max_iov * 
qemu_real_host_page_size)

total: 0 errors, 1 warnings, 23 lines checked

Patch 1/3 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.
2/3 Checking commit eec052142154 (file-posix: try BLKSECTGET on block devices 
too, do not round to power of 2)
3/3 Checking commit 37feb259bbc4 (file-posix: fix max_iov for /dev/sg devices)
ERROR: Missing Signed-off-by: line(s)

total: 1 errors, 0 warnings, 17 lines checked

Patch 3/3 has style problems, please review.  If any of these errors
are false positives report them to the maintainer, see
CHECKPATCH in MAINTAINERS.

=== OUTPUT END ===

Test command exited with code: 1


The full log is available at
http://patchew.org/logs/20210415124307.428203-1-pbonz...@redhat.com/testing.checkpatch/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

[PATCH 1/3] scsi-generic: pass max_segments via max_iov field in BlockLimits

2021-04-15 Thread Paolo Bonzini
I/O to a disk via read/write is not limited by the number of segments allowed
by the host adapter; the kernel can split requests if needed, and the limit
imposed by the host adapter can be very low (256k or so) to avoid that SG_IO
returns EINVAL if memory is heavily fragmented.

Since this value is only interesting for SG_IO-based I/O, do not include
it in the max_transfer and only take it into account when patching the
block limits VPD page in the scsi-generic device.

Signed-off-by: Paolo Bonzini 
---
 block/file-posix.c | 3 +--
 hw/scsi/scsi-generic.c | 6 --
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 20e14f8e96..9e316a2a97 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1252,8 +1252,7 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 
 ret = sg_get_max_segments(s->fd);
 if (ret > 0) {
-bs->bl.max_transfer = MIN(bs->bl.max_transfer,
-  ret * qemu_real_host_page_size);
+bs->bl.max_iov = ret;
 }
 }
 
diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index 98c30c5d5c..82e1e2ee79 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -179,10 +179,12 @@ static void scsi_handle_inquiry_reply(SCSIGenericReq *r, 
SCSIDevice *s)
 (r->req.cmd.buf[1] & 0x01)) {
 page = r->req.cmd.buf[2];
 if (page == 0xb0) {
-uint32_t max_transfer =
-blk_get_max_transfer(s->conf.blk) / s->blocksize;
+uint32_t max_transfer = blk_get_max_transfer(s->conf.blk);
+uint32_t max_iov = blk_get_max_iov(s->conf.blk);
 
 assert(max_transfer);
+max_transfer = MIN_NON_ZERO(max_transfer, max_iov * 
qemu_real_host_page_size)
+/ s->blocksize;
 stl_be_p(>buf[8], max_transfer);
 /* Also take care of the opt xfer len. */
 stl_be_p(>buf[12],
-- 
2.30.1





[PATCH 0/3] file-posix: fix refresh_limits for SCSI devices

2021-04-15 Thread Paolo Bonzini
refresh_limits is not doing anything for block devices, and is retrieving
the maximum number of s/g list entries incorrectly for character devices.

Patches 2-3 fix these problems, while patch 1 is a small improvement to
avoid making the BlockLimits unnecessarily restrictive when SG_IO is not
in use.

Paolo

Paolo Bonzini (3):
  scsi-generic: pass max_segments via max_iov field in BlockLimits
  file-posix: try BLKSECTGET on block devices too, do not round to power
of 2
  file-posix: fix max_iov for /dev/sg devices

 block/file-posix.c | 37 +++--
 hw/scsi/scsi-generic.c |  6 --
 2 files changed, 27 insertions(+), 16 deletions(-)

-- 
2.30.1




[PATCH 2/3] file-posix: try BLKSECTGET on block devices too, do not round to power of 2

2021-04-15 Thread Paolo Bonzini
bs->sg is only true for character devices, but block devices can also
be used with scsi-block and scsi-generic.  Unfortunately BLKSECTGET
returns bytes for /dev/sgN devices and sectors for block devices, so
account for that in the code.

The maximum transfer also need not be a power of 2 (for example I have
seen disks with 1280 KiB maximum transfer) so there's no need to pass
the result through pow2floor.

Signed-off-by: Paolo Bonzini 
---
 block/file-posix.c | 27 +--
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 9e316a2a97..e37a6bb8de 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1173,13 +1173,13 @@ static void raw_reopen_abort(BDRVReopenState *state)
 s->reopen_state = NULL;
 }
 
-static int sg_get_max_transfer_length(int fd)
+static int sg_get_max_transfer_length(int fd, struct stat *st)
 {
 #ifdef BLKSECTGET
 int max_bytes = 0;
 
 if (ioctl(fd, BLKSECTGET, _bytes) == 0) {
-return max_bytes;
+return S_ISBLK(st->st_mode) ? max_bytes * 512 : max_bytes;
 } else {
 return -errno;
 }
@@ -1188,7 +1188,7 @@ static int sg_get_max_transfer_length(int fd)
 #endif
 }
 
-static int sg_get_max_segments(int fd)
+static int sg_get_max_segments(int fd, struct stat *st)
 {
 #ifdef CONFIG_LINUX
 char buf[32];
@@ -1197,15 +1197,9 @@ static int sg_get_max_segments(int fd)
 int ret;
 int sysfd = -1;
 long max_segments;
-struct stat st;
-
-if (fstat(fd, )) {
-ret = -errno;
-goto out;
-}
 
 sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
-major(st.st_rdev), minor(st.st_rdev));
+major(st->st_rdev), minor(st->st_rdev));
 sysfd = open(sysfspath, O_RDONLY);
 if (sysfd == -1) {
 ret = -errno;
@@ -1242,15 +1236,20 @@ out:
 static void raw_refresh_limits(BlockDriverState *bs, Error **errp)
 {
 BDRVRawState *s = bs->opaque;
+struct stat st;
+
+if (fstat(s->fd, )) {
+return;
+}
 
-if (bs->sg) {
-int ret = sg_get_max_transfer_length(s->fd);
+if (bs->sg || S_ISBLK(st.st_mode)) {
+int ret = sg_get_max_transfer_length(s->fd, );
 
 if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
-bs->bl.max_transfer = pow2floor(ret);
+bs->bl.max_transfer = ret;
 }
 
-ret = sg_get_max_segments(s->fd);
+ret = sg_get_max_segments(s->fd, );
 if (ret > 0) {
 bs->bl.max_iov = ret;
 }
-- 
2.30.1





[PATCH 3/3] file-posix: fix max_iov for /dev/sg devices

2021-04-15 Thread Paolo Bonzini
Even though it was only called for devices that have bs->sg set (which
must be character devices),
sg_get_max_segments looked at /sys/dev/block which only works for
block devices.

On Linux the sg driver has its own way to provide the maximum number of
iovecs in a scatter/gather list.
---
 block/file-posix.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index e37a6bb8de..b348b37ccb 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1198,6 +1198,17 @@ static int sg_get_max_segments(int fd, struct stat *st)
 int sysfd = -1;
 long max_segments;
 
+if (S_ISCHR(st->st_mode)) {
+if (ioctl(fd, SG_GET_SG_TABLESIZE, ) == 0) {
+return ret;
+}
+return -EIO;
+}
+
+if (!S_ISBLK(st->st_mode)) {
+return -ENOTSUP;
+}
+
 sysfspath = g_strdup_printf("/sys/dev/block/%u:%u/queue/max_segments",
 major(st->st_rdev), minor(st->st_rdev));
 sysfd = open(sysfspath, O_RDONLY);
-- 
2.30.1




[PATCH 3/4] hw/block/fdc: Extract ISA floppy controllers to fdc-isa.c

2021-04-15 Thread Philippe Mathieu-Daudé
Some machines use floppy controllers via the SysBus interface,
and don't need to pull in all the ISA code.
Extract the ISA specific code to a new unit: fdc-isa.c, and
add a new Kconfig symbol: "FDC_ISA".

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/block/fdc-isa.c   | 313 +++
 hw/block/fdc.c   | 257 ---
 MAINTAINERS  |   1 +
 hw/block/Kconfig |   4 +
 hw/block/meson.build |   1 +
 hw/i386/Kconfig  |   2 +-
 hw/isa/Kconfig   |   6 +-
 hw/sparc64/Kconfig   |   2 +-
 8 files changed, 324 insertions(+), 262 deletions(-)
 create mode 100644 hw/block/fdc-isa.c

diff --git a/hw/block/fdc-isa.c b/hw/block/fdc-isa.c
new file mode 100644
index 000..97f3f9e5c0a
--- /dev/null
+++ b/hw/block/fdc-isa.c
@@ -0,0 +1,313 @@
+/*
+ * QEMU Floppy disk emulator (Intel 82078)
+ *
+ * Copyright (c) 2003, 2007 Jocelyn Mayer
+ * Copyright (c) 2008 Hervé Poussineau
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+/*
+ * The controller is used in Sun4m systems in a slightly different
+ * way. There are changes in DOR register and DMA is not available.
+ */
+
+#include "qemu/osdep.h"
+#include "hw/block/fdc.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qemu/timer.h"
+#include "hw/acpi/aml-build.h"
+#include "hw/irq.h"
+#include "hw/isa/isa.h"
+#include "hw/qdev-properties.h"
+#include "hw/qdev-properties-system.h"
+#include "migration/vmstate.h"
+#include "hw/block/block.h"
+#include "sysemu/block-backend.h"
+#include "sysemu/blockdev.h"
+#include "sysemu/sysemu.h"
+#include "qemu/log.h"
+#include "qemu/main-loop.h"
+#include "qemu/module.h"
+#include "trace.h"
+#include "qom/object.h"
+#include "fdc-internal.h"
+
+OBJECT_DECLARE_SIMPLE_TYPE(FDCtrlISABus, ISA_FDC)
+
+struct FDCtrlISABus {
+ISADevice parent_obj;
+
+uint32_t iobase;
+uint32_t irq;
+uint32_t dma;
+struct FDCtrl state;
+int32_t bootindexA;
+int32_t bootindexB;
+};
+
+static void fdctrl_external_reset_isa(DeviceState *d)
+{
+FDCtrlISABus *isa = ISA_FDC(d);
+FDCtrl *s = >state;
+
+fdctrl_reset(s, 0);
+}
+
+void isa_fdc_init_drives(ISADevice *fdc, DriveInfo **fds)
+{
+fdctrl_init_drives(_FDC(fdc)->state.bus, fds);
+}
+
+static const MemoryRegionPortio fdc_portio_list[] = {
+{ 1, 5, 1, .read = fdctrl_read, .write = fdctrl_write },
+{ 7, 1, 1, .read = fdctrl_read, .write = fdctrl_write },
+PORTIO_END_OF_LIST(),
+};
+
+static void isabus_fdc_realize(DeviceState *dev, Error **errp)
+{
+ISADevice *isadev = ISA_DEVICE(dev);
+FDCtrlISABus *isa = ISA_FDC(dev);
+FDCtrl *fdctrl = >state;
+Error *err = NULL;
+
+isa_register_portio_list(isadev, >portio_list,
+ isa->iobase, fdc_portio_list, fdctrl,
+ "fdc");
+
+isa_init_irq(isadev, >irq, isa->irq);
+fdctrl->dma_chann = isa->dma;
+if (fdctrl->dma_chann != -1) {
+fdctrl->dma = isa_get_dma(isa_bus_from_device(isadev), isa->dma);
+if (!fdctrl->dma) {
+error_setg(errp, "ISA controller does not support DMA");
+return;
+}
+}
+
+qdev_set_legacy_instance_id(dev, isa->iobase, 2);
+
+fdctrl_realize_common(dev, fdctrl, );
+if (err != NULL) {
+error_propagate(errp, err);
+return;
+}
+}
+
+FloppyDriveType isa_fdc_get_drive_type(ISADevice *fdc, int i)
+{
+FDCtrlISABus *isa = ISA_FDC(fdc);
+
+return isa->state.drives[i].drive;
+}
+
+static void isa_fdc_get_drive_max_chs(FloppyDriveType type, uint8_t *maxc,
+  uint8_t *maxh, uint8_t *maxs)
+{
+const FDFormat *fdf;
+
+*maxc = *maxh = *maxs = 0;
+for (fdf = fd_formats; fdf->drive != FLOPPY_DRIVE_TYPE_NONE; fdf++) {
+if (fdf->drive != type) {
+continue;
+}
+if (*maxc < fdf->max_track) {
+*maxc = fdf->max_track;
+

[PATCH 2/4] hw/block/fdc: Declare shared prototypes in fdc-internal.h

2021-04-15 Thread Philippe Mathieu-Daudé
We want to extract ISA/SysBus code from the generic fdc.c file.
First, declare the prototypes we will access from the new units
into a new local header: "fdc-internal.h".

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/block/fdc-internal.h | 155 
 hw/block/fdc.c  | 126 +++-
 MAINTAINERS |   1 +
 3 files changed, 164 insertions(+), 118 deletions(-)
 create mode 100644 hw/block/fdc-internal.h

diff --git a/hw/block/fdc-internal.h b/hw/block/fdc-internal.h
new file mode 100644
index 000..ddd41461ff3
--- /dev/null
+++ b/hw/block/fdc-internal.h
@@ -0,0 +1,155 @@
+/*
+ * QEMU Floppy disk emulator (Intel 82078)
+ *
+ * Copyright (c) 2003, 2007 Jocelyn Mayer
+ * Copyright (c) 2008 Hervé Poussineau
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef HW_BLOCK_FDC_INTERNAL_H
+#define HW_BLOCK_FDC_INTERNAL_H
+
+#include "exec/memory.h"
+#include "exec/ioport.h"
+#include "hw/block/block.h"
+#include "qapi/qapi-types-block.h"
+
+typedef struct FDCtrl FDCtrl;
+
+/* Floppy bus emulation */
+
+typedef struct FloppyBus {
+BusState bus;
+FDCtrl *fdc;
+} FloppyBus;
+
+/* Floppy disk drive emulation */
+
+typedef enum FDriveRate {
+FDRIVE_RATE_500K = 0x00,  /* 500 Kbps */
+FDRIVE_RATE_300K = 0x01,  /* 300 Kbps */
+FDRIVE_RATE_250K = 0x02,  /* 250 Kbps */
+FDRIVE_RATE_1M   = 0x03,  /*   1 Mbps */
+} FDriveRate;
+
+typedef enum FDriveSize {
+FDRIVE_SIZE_UNKNOWN,
+FDRIVE_SIZE_350,
+FDRIVE_SIZE_525,
+} FDriveSize;
+
+typedef struct FDFormat {
+FloppyDriveType drive;
+uint8_t last_sect;
+uint8_t max_track;
+uint8_t max_head;
+FDriveRate rate;
+} FDFormat;
+
+typedef enum FDiskFlags {
+FDISK_DBL_SIDES  = 0x01,
+} FDiskFlags;
+
+typedef struct FDrive {
+FDCtrl *fdctrl;
+BlockBackend *blk;
+BlockConf *conf;
+/* Drive status */
+FloppyDriveType drive;/* CMOS drive type*/
+uint8_t perpendicular;/* 2.88 MB access mode*/
+/* Position */
+uint8_t head;
+uint8_t track;
+uint8_t sect;
+/* Media */
+FloppyDriveType disk; /* Current disk type  */
+FDiskFlags flags;
+uint8_t last_sect;/* Nb sector per track*/
+uint8_t max_track;/* Nb of tracks   */
+uint16_t bps; /* Bytes per sector   */
+uint8_t ro;   /* Is read-only   */
+uint8_t media_changed;/* Is media changed   */
+uint8_t media_rate;   /* Data rate of medium*/
+
+bool media_validated; /* Have we validated the media? */
+} FDrive;
+
+struct FDCtrl {
+MemoryRegion iomem;
+qemu_irq irq;
+/* Controller state */
+QEMUTimer *result_timer;
+int dma_chann;
+uint8_t phase;
+IsaDma *dma;
+/* Controller's identification */
+uint8_t version;
+/* HW */
+uint8_t sra;
+uint8_t srb;
+uint8_t dor;
+uint8_t dor_vmstate; /* only used as temp during vmstate */
+uint8_t tdr;
+uint8_t dsr;
+uint8_t msr;
+uint8_t cur_drv;
+uint8_t status0;
+uint8_t status1;
+uint8_t status2;
+/* Command FIFO */
+uint8_t *fifo;
+int32_t fifo_size;
+uint32_t data_pos;
+uint32_t data_len;
+uint8_t data_state;
+uint8_t data_dir;
+uint8_t eot; /* last wanted sector */
+/* States kept only to be returned back */
+/* precompensation */
+uint8_t precomp_trk;
+uint8_t config;
+uint8_t lock;
+/* Power down config (also with status regB access mode */
+uint8_t pwrd;
+/* Floppy drives */
+FloppyBus bus;
+uint8_t num_floppies;
+FDrive drives[MAX_FD];
+struct {
+FloppyDriveType type;
+} qdev_for_drives[MAX_FD];
+int reset_sensei;
+FloppyDriveType fallback; /* type=auto failure fallback */
+/* Timers state */
+uint8_t timer0;
+uint8_t timer1;
+PortioList 

[PATCH 4/4] hw/block/fdc: Extract SysBus floppy controllers to fdc-sysbus.c

2021-04-15 Thread Philippe Mathieu-Daudé
Some machines use floppy controllers via the SysBus interface,
and don't need to pull in all the SysBus code.
Extract the SysBus specific code to a new unit: fdc-sysbus.c,
and add a new Kconfig symbol: "FDC_SYSBUS".

Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/block/fdc-sysbus.c | 252 ++
 hw/block/fdc.c| 220 
 MAINTAINERS   |   1 +
 hw/block/Kconfig  |   4 +
 hw/block/meson.build  |   1 +
 hw/block/trace-events |   2 +
 hw/mips/Kconfig   |   2 +-
 hw/sparc/Kconfig  |   2 +-
 8 files changed, 262 insertions(+), 222 deletions(-)
 create mode 100644 hw/block/fdc-sysbus.c

diff --git a/hw/block/fdc-sysbus.c b/hw/block/fdc-sysbus.c
new file mode 100644
index 000..71755fd6ae4
--- /dev/null
+++ b/hw/block/fdc-sysbus.c
@@ -0,0 +1,252 @@
+/*
+ * QEMU Floppy disk emulator (Intel 82078)
+ *
+ * Copyright (c) 2003, 2007 Jocelyn Mayer
+ * Copyright (c) 2008 Hervé Poussineau
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to 
deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qom/object.h"
+#include "hw/sysbus.h"
+#include "hw/block/fdc.h"
+#include "migration/vmstate.h"
+#include "fdc-internal.h"
+#include "trace.h"
+
+#define TYPE_SYSBUS_FDC "base-sysbus-fdc"
+typedef struct FDCtrlSysBusClass FDCtrlSysBusClass;
+typedef struct FDCtrlSysBus FDCtrlSysBus;
+DECLARE_OBJ_CHECKERS(FDCtrlSysBus, FDCtrlSysBusClass,
+ SYSBUS_FDC, TYPE_SYSBUS_FDC)
+
+struct FDCtrlSysBusClass {
+/*< private >*/
+SysBusDeviceClass parent_class;
+/*< public >*/
+
+bool use_strict_io;
+};
+
+struct FDCtrlSysBus {
+/*< private >*/
+SysBusDevice parent_obj;
+/*< public >*/
+
+struct FDCtrl state;
+};
+
+static uint64_t fdctrl_read_mem(void *opaque, hwaddr reg, unsigned ize)
+{
+return fdctrl_read(opaque, (uint32_t)reg);
+}
+
+static void fdctrl_write_mem(void *opaque, hwaddr reg,
+ uint64_t value, unsigned size)
+{
+fdctrl_write(opaque, (uint32_t)reg, value);
+}
+
+static const MemoryRegionOps fdctrl_mem_ops = {
+.read = fdctrl_read_mem,
+.write = fdctrl_write_mem,
+.endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static const MemoryRegionOps fdctrl_mem_strict_ops = {
+.read = fdctrl_read_mem,
+.write = fdctrl_write_mem,
+.endianness = DEVICE_NATIVE_ENDIAN,
+.valid = {
+.min_access_size = 1,
+.max_access_size = 1,
+},
+};
+
+static void fdctrl_external_reset_sysbus(DeviceState *d)
+{
+FDCtrlSysBus *sys = SYSBUS_FDC(d);
+FDCtrl *s = >state;
+
+fdctrl_reset(s, 0);
+}
+
+static void fdctrl_handle_tc(void *opaque, int irq, int level)
+{
+trace_fdctrl_tc_pulse(level);
+}
+
+void fdctrl_init_sysbus(qemu_irq irq, int dma_chann,
+hwaddr mmio_base, DriveInfo **fds)
+{
+FDCtrl *fdctrl;
+DeviceState *dev;
+SysBusDevice *sbd;
+FDCtrlSysBus *sys;
+
+dev = qdev_new("sysbus-fdc");
+sys = SYSBUS_FDC(dev);
+fdctrl = >state;
+fdctrl->dma_chann = dma_chann; /* FIXME */
+sbd = SYS_BUS_DEVICE(dev);
+sysbus_realize_and_unref(sbd, _fatal);
+sysbus_connect_irq(sbd, 0, irq);
+sysbus_mmio_map(sbd, 0, mmio_base);
+
+fdctrl_init_drives(>state.bus, fds);
+}
+
+void sun4m_fdctrl_init(qemu_irq irq, hwaddr io_base,
+   DriveInfo **fds, qemu_irq *fdc_tc)
+{
+DeviceState *dev;
+FDCtrlSysBus *sys;
+
+dev = qdev_new("sun-fdtwo");
+sysbus_realize_and_unref(SYS_BUS_DEVICE(dev), _fatal);
+sys = SYSBUS_FDC(dev);
+sysbus_connect_irq(SYS_BUS_DEVICE(sys), 0, irq);
+sysbus_mmio_map(SYS_BUS_DEVICE(sys), 0, io_base);
+*fdc_tc = qdev_get_gpio_in(dev, 0);
+
+fdctrl_init_drives(>state.bus, fds);
+}
+
+static void sysbus_fdc_common_initfn(Object *obj)
+{
+DeviceState *dev = DEVICE(obj);
+FDCtrlSysBusClass *sbdc = SYSBUS_FDC_GET_CLASS(obj);
+SysBusDevice *sbd = 

[PATCH 0/4] hw/block/fdc: Allow Kconfig-selecting ISA bus/SysBus floppy controllers

2021-04-15 Thread Philippe Mathieu-Daudé
Hi,

The floppy disc controllers pulls in irrelevant devices (sysbus in
an ISA-only machine, ISA bus + isa devices on a sysbus-only machine).

This series clean that by extracting each device in its own file,
adding the corresponding Kconfig symbols: FDC_ISA and FDC_SYSBUS.

Regards,

Phil.

Philippe Mathieu-Daudé (4):
  hw/block/fdc: Replace disabled fprintf() by trace event
  hw/block/fdc: Declare shared prototypes in fdc-internal.h
  hw/block/fdc: Extract ISA floppy controllers to fdc-isa.c
  hw/block/fdc: Extract SysBus floppy controllers to fdc-sysbus.c

 hw/block/fdc-internal.h | 155 ++
 hw/block/fdc-isa.c  | 313 +
 hw/block/fdc-sysbus.c   | 252 +
 hw/block/fdc.c  | 608 +---
 MAINTAINERS |   3 +
 hw/block/Kconfig|   8 +
 hw/block/meson.build|   2 +
 hw/block/trace-events   |   3 +
 hw/i386/Kconfig |   2 +-
 hw/isa/Kconfig  |   6 +-
 hw/mips/Kconfig |   2 +-
 hw/sparc/Kconfig|   2 +-
 hw/sparc64/Kconfig  |   2 +-
 13 files changed, 751 insertions(+), 607 deletions(-)
 create mode 100644 hw/block/fdc-internal.h
 create mode 100644 hw/block/fdc-isa.c
 create mode 100644 hw/block/fdc-sysbus.c

-- 
2.26.3





[PATCH 1/4] hw/block/fdc: Replace disabled fprintf() by trace event

2021-04-15 Thread Philippe Mathieu-Daudé
Signed-off-by: Philippe Mathieu-Daudé 
---
 hw/block/fdc.c| 7 +--
 hw/block/trace-events | 1 +
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/hw/block/fdc.c b/hw/block/fdc.c
index a825c2acbae..1d3a0473678 100644
--- a/hw/block/fdc.c
+++ b/hw/block/fdc.c
@@ -1242,12 +1242,7 @@ static void fdctrl_external_reset_isa(DeviceState *d)
 
 static void fdctrl_handle_tc(void *opaque, int irq, int level)
 {
-//FDCtrl *s = opaque;
-
-if (level) {
-// XXX
-FLOPPY_DPRINTF("TC pulsed\n");
-}
+trace_fdctrl_tc_pulse(level);
 }
 
 /* Change IRQ state */
diff --git a/hw/block/trace-events b/hw/block/trace-events
index fa12e3a67a7..306989c193c 100644
--- a/hw/block/trace-events
+++ b/hw/block/trace-events
@@ -3,6 +3,7 @@
 # fdc.c
 fdc_ioport_read(uint8_t reg, uint8_t value) "read reg 0x%02x val 0x%02x"
 fdc_ioport_write(uint8_t reg, uint8_t value) "write reg 0x%02x val 0x%02x"
+fdctrl_tc_pulse(int level) "TC pulse: %u"
 
 # pflash_cfi01.c
 # pflash_cfi02.c
-- 
2.26.3




Re: [PATCH 1/2] util/async: add a human-readable name to BHs for debugging

2021-04-15 Thread Philippe Mathieu-Daudé
On 4/14/21 10:02 PM, Stefan Hajnoczi wrote:
> It can be difficult to debug issues with BHs in production environments.
> Although BHs can usually be identified by looking up their ->cb()
> function pointer, this requires debug information for the program. It is
> also not possible to print human-readable diagnostics about BHs because
> they have no identifier.
> 
> This patch adds a name to each BH. The name is not unique per instance
> but differentiates between cb() functions, which is usually enough. It's
> done by changing aio_bh_new() and friends to macros that stringify cb.
> 
> The next patch will use the name field when reporting leaked BHs.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  include/block/aio.h| 31 ---
>  include/qemu/main-loop.h   |  4 +++-
>  tests/unit/ptimer-test-stubs.c |  2 +-
>  util/async.c   |  9 +++--
>  util/main-loop.c   |  4 ++--
>  5 files changed, 41 insertions(+), 9 deletions(-)

Reviewed-by: Philippe Mathieu-Daudé 




Re: [PATCH 0/2] util/async: print leaked BH name when AioContext finalizes

2021-04-15 Thread Fam Zheng
On 2021-04-14 21:02, Stefan Hajnoczi wrote:
> Eric Ernst and I debugged a BH leak and it was more involved than it should 
> be.
> The problem is that BHs don't have a human-readable identifier, so low-level
> debugging techniques and inferences about the code are required to figure out
> which BH was leaked in production environments without easy debug access.
> 
> The leak ended up already being fixed upstream but let's improve diagnostics
> for leaked BHs so that this becomes quick and easy in the future.
> 
> Stefan Hajnoczi (2):
>   util/async: add a human-readable name to BHs for debugging
>   util/async: print leaked BH name when AioContext finalizes
> 
>  include/block/aio.h| 31 ---
>  include/qemu/main-loop.h   |  4 +++-
>  tests/unit/ptimer-test-stubs.c |  2 +-
>  util/async.c   | 25 +
>  util/main-loop.c   |  4 ++--
>  5 files changed, 55 insertions(+), 11 deletions(-)
> 
> -- 
> 2.30.2
> 

Reviewed-by: Fam Zheng 



Re: [PATCH 2/2] hw/block/nvme: align reserved fields declarations

2021-04-15 Thread Klaus Jensen

On Mar 17 15:00, Gollu Appalanaidu wrote:

Align the Reserved fields declaration in NvmeBar

Signed-off-by: Gollu Appalanaidu 
---
include/block/nvme.h | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/block/nvme.h b/include/block/nvme.h
index fc65cfcb01..e5bd00bb85 100644
--- a/include/block/nvme.h
+++ b/include/block/nvme.h
@@ -7,7 +7,7 @@ typedef struct QEMU_PACKED NvmeBar {
uint32_tintms;
uint32_tintmc;
uint32_tcc;
-uint32_trsvd1;
+uint8_t rsvd24[4];
uint32_tcsts;
uint32_tnssrc;
uint32_taqa;
--
2.17.1



Thanks. Applied to nvme-next with a small commit message fixup!


signature.asc
Description: PGP signature


Re: [PATCH 1/2] hw/block/nvme: align with existing style

2021-04-15 Thread Klaus Jensen

On Mar 17 15:00, Gollu Appalanaidu wrote:

Make uniform hexadecimal numbers format.

Signed-off-by: Gollu Appalanaidu 
---
hw/block/nvme.c  | 30 +++---
include/block/nvme.h | 10 +-
2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index d439e44db8..21e85374bf 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -2728,18 +2728,18 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest 
*req)

/*
 * In the base NVM command set, Flush may apply to all namespaces
- * (indicated by NSID being set to 0x). But if that feature is used
+ * (indicated by NSID being set to 0x). But if that feature is used
 * along with TP 4056 (Namespace Types), it may be pretty screwed up.
 *
- * If NSID is indeed set to 0x, we simply cannot associate the
+ * If NSID is indeed set to 0x, we simply cannot associate the
 * opcode with a specific command since we cannot determine a unique I/O
 * command set. Opcode 0x0 could have any other meaning than something
 * equivalent to flushing and say it DOES have completely different
- * semantics in some other command set - does an NSID of 0x then
+ * semantics in some other command set - does an NSID of 0x then
 * mean "for all namespaces, apply whatever command set specific command
 * that uses the 0x0 opcode?" Or does it mean "for all namespaces, apply
 * whatever command that uses the 0x0 opcode if, and only if, it allows
- * NSID to be 0x"?
+ * NSID to be 0x"?
 *
 * Anyway (and luckily), for now, we do not care about this since the
 * device only supports namespace types that includes the NVM Flush command
@@ -3948,8 +3948,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeRequest 
*req)
return NVME_INVALID_FIELD | NVME_DNR;
}

-trace_pci_nvme_setfeat_numq((dw11 & 0x) + 1,
-((dw11 >> 16) & 0x) + 1,
+trace_pci_nvme_setfeat_numq((dw11 & 0x) + 1,
+((dw11 >> 16) & 0x) + 1,
n->params.max_ioqpairs,
n->params.max_ioqpairs);
req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
@@ -4436,7 +4436,7 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
}
break;
case 0x20:  /* NSSR */
-if (data == 0x4E564D65) {
+if (data == 0x4e564d65) {
trace_pci_nvme_ub_mmiowr_ssreset_unsupported();
} else {
/* The spec says that writes of other values have no effect */
@@ -4506,11 +4506,11 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
n->bar.cmbmsc = (n->bar.cmbmsc & 0x) | (data << 32);
return;

-case 0xE00: /* PMRCAP */
+case 0xe00: /* PMRCAP */
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrcap_readonly,
   "invalid write to PMRCAP register, ignored");
return;
-case 0xE04: /* PMRCTL */
+case 0xe04: /* PMRCTL */
n->bar.pmrctl = data;
if (NVME_PMRCTL_EN(data)) {
memory_region_set_enabled(>pmr.dev->mr, true);
@@ -4521,19 +4521,19 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
n->pmr.cmse = false;
}
return;
-case 0xE08: /* PMRSTS */
+case 0xe08: /* PMRSTS */
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrsts_readonly,
   "invalid write to PMRSTS register, ignored");
return;
-case 0xE0C: /* PMREBS */
+case 0xe0C: /* PMREBS */
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrebs_readonly,
   "invalid write to PMREBS register, ignored");
return;
-case 0xE10: /* PMRSWTP */
+case 0xe10: /* PMRSWTP */
NVME_GUEST_ERR(pci_nvme_ub_mmiowr_pmrswtp_readonly,
   "invalid write to PMRSWTP register, ignored");
return;
-case 0xE14: /* PMRMSCL */
+case 0xe14: /* PMRMSCL */
if (!NVME_CAP_PMRS(n->bar.cap)) {
return;
}
@@ -4553,7 +4553,7 @@ static void nvme_write_bar(NvmeCtrl *n, hwaddr offset, 
uint64_t data,
}

return;
-case 0xE18: /* PMRMSCU */
+case 0xe18: /* PMRMSCU */
if (!NVME_CAP_PMRS(n->bar.cap)) {
return;
}
@@ -4595,7 +4595,7 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr addr, 
unsigned size)
 * from PMRSTS should ensure prior writes
 * made it to persistent media
 */
-if (addr == 0xE08 &&
+if (addr == 0xe08 &&
(NVME_PMRCAP_PMRWBM(n->bar.pmrcap) & 0x02)) {
memory_region_msync(>pmr.dev->mr, 0, n->pmr.dev->size);
}
diff --git a/include/block/nvme.h b/include/block/nvme.h
index 372d0f2799..fc65cfcb01 100644
--- a/include/block/nvme.h
+++ 

Re: [PATCH] hw/block/nvme: remove redundant invalid_lba_range trace

2021-04-15 Thread Klaus Jensen

On Apr 14 12:34, Gollu Appalanaidu wrote:

Currently pci_nvme_err_invalid_lba_range tace being called indvidually
at each function, add this in nvme_check_bounds and remove redundant
usage of it.

Signed-off-by: Gollu Appalanaidu 
---
hw/block/nvme.c | 9 +
1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 6842b01ab5..c67d3315a1 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1424,6 +1424,7 @@ static inline uint16_t nvme_check_bounds(NvmeNamespace 
*ns, uint64_t slba,
uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);

if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
+trace_pci_nvme_err_invalid_lba_range(slba, nlb, nsze);
return NVME_LBA_RANGE | NVME_DNR;
}

@@ -2266,7 +2267,6 @@ static void nvme_copy_in_complete(NvmeRequest *req)

status = nvme_check_bounds(ns, sdlba, ctx->nlb);
if (status) {
-trace_pci_nvme_err_invalid_lba_range(sdlba, ctx->nlb, ns->id_ns.nsze);
goto invalid;
}

@@ -2528,8 +2528,6 @@ static uint16_t nvme_dsm(NvmeCtrl *n, NvmeRequest *req)
uint32_t nlb = le32_to_cpu(range[i].nlb);

if (nvme_check_bounds(ns, slba, nlb)) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb,
- ns->id_ns.nsze);
continue;
}

@@ -2602,7 +2600,6 @@ static uint16_t nvme_verify(NvmeCtrl *n, NvmeRequest *req)

status = nvme_check_bounds(ns, slba, nlb);
if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
return status;
}

@@ -2687,7 +2684,6 @@ static uint16_t nvme_copy(NvmeCtrl *n, NvmeRequest *req)

status = nvme_check_bounds(ns, slba, _nlb);
if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, _nlb, ns->id_ns.nsze);
goto out;
}

@@ -2816,7 +2812,6 @@ static uint16_t nvme_compare(NvmeCtrl *n, NvmeRequest 
*req)

status = nvme_check_bounds(ns, slba, nlb);
if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
return status;
}

@@ -2935,7 +2930,6 @@ static uint16_t nvme_read(NvmeCtrl *n, NvmeRequest *req)

status = nvme_check_bounds(ns, slba, nlb);
if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
goto invalid;
}

@@ -3015,7 +3009,6 @@ static uint16_t nvme_do_write(NvmeCtrl *n, NvmeRequest 
*req, bool append,

status = nvme_check_bounds(ns, slba, nlb);
if (status) {
-trace_pci_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
goto invalid;
}



Thanks. Applied to nvme-next!


signature.asc
Description: PGP signature