Re: [PATCH] nvme: Print 'cqid' for nvme_del_cq

2020-03-25 Thread Stefano Garzarella
On Tue, Mar 24, 2020 at 11:06:46PM +0900, Minwoo Im wrote:
> The given argument for this trace should be cqid, not sqid.
> 
> Signed-off-by: Minwoo Im 
> ---
>  hw/block/trace-events | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Reviewed-by: Stefano Garzarella 

> 
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index f78939fa9da1..bf6d11b58b85 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -37,7 +37,7 @@ nvme_rw(const char *verb, uint32_t blk_count, uint64_t 
> byte_count, uint64_t lba)
>  nvme_create_sq(uint64_t addr, uint16_t sqid, uint16_t cqid, uint16_t qsize, 
> uint16_t qflags) "create submission queue, addr=0x%"PRIx64", sqid=%"PRIu16", 
> cqid=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16""
>  nvme_create_cq(uint64_t addr, uint16_t cqid, uint16_t vector, uint16_t size, 
> uint16_t qflags, int ien) "create completion queue, addr=0x%"PRIx64", 
> cqid=%"PRIu16", vector=%"PRIu16", qsize=%"PRIu16", qflags=%"PRIu16", ien=%d"
>  nvme_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
> -nvme_del_cq(uint16_t cqid) "deleted completion queue, sqid=%"PRIu16""
> +nvme_del_cq(uint16_t cqid) "deleted completion queue, cqid=%"PRIu16""
>  nvme_identify_ctrl(void) "identify controller"
>  nvme_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
>  nvme_identify_nslist(uint16_t ns) "identify namespace list, nsid=%"PRIu16""
> -- 
> 2.17.1
> 
> 




[PATCH v2 2/2] iotests: rework test finding

2020-03-25 Thread Vladimir Sementsov-Ogievskiy
Add python script with new logic of searching for tests:

Old behavior:
 - tests are named [0-9][0-9][0-9]
 - tests must be registered in group file (even if test doesn't belong
   to any group, like 142)

New behavior:
 - group file is dropped
 - tests are searched by file name instead of the group file, so it's
   no longer needed to "register the test" — just create it with a
   name matching test-*. Old names like [0-9][0-9][0-9] are supported
   too, but not recommended for new tests
 - groups are parsed from '# group: ' line inside test files
 - optional file group.local may be used to define some additional
   groups for downstreams
 - the 'disabled' group is used to temporarily disable tests. So instead
   of commenting tests out in the old 'group' file, you can now add them
   to the disabled group with the help of the 'group.local' file

Benefits:
 - no rebase conflicts in group file on patch porting from branch to
   branch
 - no conflicts in upstream, when different series want to occupy same
   test number
 - meaningful names for test files
   For example, with numeric names, when someone wants to add a test
   about block-stream, they will most probably just create a new
   test. But if a test-block-stream test already existed, they would
   first look at it and maybe just add a test case to it.
   And anyway meaningful names are better and test-* notation is
   already used in tests directory.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 docs/devel/testing.rst   |  51 +-
 tests/qemu-iotests/check |  20 +--
 tests/qemu-iotests/find_tests.py |  72 
 tests/qemu-iotests/group | 298 ---
 4 files changed, 132 insertions(+), 309 deletions(-)
 create mode 100755 tests/qemu-iotests/find_tests.py
 delete mode 100644 tests/qemu-iotests/group

diff --git a/docs/devel/testing.rst b/docs/devel/testing.rst
index 770a987ea4..5532f1abe4 100644
--- a/docs/devel/testing.rst
+++ b/docs/devel/testing.rst
@@ -153,7 +153,7 @@ check-block
 ---
 
 ``make check-block`` runs a subset of the block layer iotests (the tests that
-are in the "auto" group in ``tests/qemu-iotests/group``).
+are in the "auto" group).
 See the "QEMU iotests" section below for more information.
 
 GCC gcov support
@@ -267,6 +267,55 @@ another application on the host may have locked the file, 
possibly leading to a
 test failure.  If using such devices are explicitly desired, consider adding
 ``locking=off`` option to disable image locking.
 
+Test case groups
+
+
+A test may belong to some groups; you may define them in a comment inside
+the test file. By convention, test groups are listed on the second line of
+the test file, after the "#!/..." line, like this:
+
+.. code::
+
+  #!/usr/bin/env python3
+  # group: auto quick
+  #
+  ...
+
+An additional way of defining groups is to create a
+tests/qemu-iotests/group.local file. This should be used only for downstream
+(this file should never appear upstream). This file may be used for defining
+some downstream test groups or for temporarily disabling tests, like this:
+
+.. code::
+
+  # groups for some company downstream process
+  #
+  # ci - tests to run on build
+  # down - our downstream tests, not for upstream
+  #
+  # Format of each line is:
+
+  013 ci
+  210 disabled
+  215 disabled
+  test-our-ugly-workaround down ci
+
+The following groups are defined:
+
+- quick : Tests in this group should finish within a few seconds.
+
+- img : Tests in this group can be used to exercise the qemu-img tool.
+
+- auto : Tests in this group are used during "make check" and should be
+  runnable in any case. That means they should run with every QEMU binary
+  (also non-x86), with every QEMU configuration (i.e. must not fail if
+  an optional feature is not compiled in - but reporting a "skip" is ok),
+  work at least with the qcow2 file format, work with all kind of host
+  filesystems and users (e.g. "nobody" or "root") and must not take too
+  much memory and disk space (since CI pipelines tend to fail otherwise).
+
+- disabled : Tests in this group are disabled and ignored by check.
+
 .. _docker-ref:
 
 Docker based tests
diff --git a/tests/qemu-iotests/check b/tests/qemu-iotests/check
index f7a2d3d6c3..09b2ced2f0 100755
--- a/tests/qemu-iotests/check
+++ b/tests/qemu-iotests/check
@@ -168,9 +168,7 @@ do
 if $group
 then
 # arg after -g
-group_list=$(sed -n <"$source_iotests/group" -e 's/$/ /' -e 
"/^[0-9][0-9][0-9].* $r /"'{
-s/ .*//p
-}')
+group_list=$(./find_tests.py "$r")
 if [ -z "$group_list" ]
 then
 echo "Group \"$r\" is empty or not defined?"
@@ -193,10 +191,8 @@ s/ .*//p
 then
 # arg after -x
 # Populate $tmp.list with all tests
-awk '/^[0-9]{3,}/ {print $1}' "${source_iotests}/group" > $tmp.list 
2>/dev/null
-group_list=$(sed -n <"$source_iotests/group" -e 's/$/ /' -e 
"/^[0-9][0-9][0-9].* $r /"'{
-s/ .*//p
-}')
+./find_tests.py > $

Re: [PATCH v5 08/26] nvme: refactor device realization

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:43 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 11:27, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > This patch splits up nvme_realize into multiple individual functions,
> > > each initializing a different subset of the device.
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c | 175 +++-
> > >  hw/block/nvme.h |  21 ++
> > >  2 files changed, 133 insertions(+), 63 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index e1810260d40b..81514eaef63a 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -44,6 +44,7 @@
> > >  #include "nvme.h"
> > >  
> > >  #define NVME_SPEC_VER 0x00010201
> > > +#define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
> > >  
> > >  #define NVME_GUEST_ERR(trace, fmt, ...) \
> > >  do { \
> > > @@ -1325,67 +1326,106 @@ static const MemoryRegionOps nvme_cmb_ops = {
> > >  },
> > >  };
> > >  
> > > -static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> > > +static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
> > >  {
> > > -NvmeCtrl *n = NVME(pci_dev);
> > > -NvmeIdCtrl *id = &n->id_ctrl;
> > > -
> > > -int i;
> > > -int64_t bs_size;
> > > -uint8_t *pci_conf;
> > > -
> > > -if (!n->params.num_queues) {
> > > -error_setg(errp, "num_queues can't be zero");
> > > -return;
> > > -}
> > > +NvmeParams *params = &n->params;
> > >  
> > >  if (!n->conf.blk) {
> > > -error_setg(errp, "drive property not set");
> > > -return;
> > > +error_setg(errp, "nvme: block backend not configured");
> > > +return 1;
> > 
> > As a matter of taste, negative values indicate error, and 0 is the success 
> > value.
> > In Linux kernel this is even an official rule.
> > >  }
> 
> Fixed.
> 
> > >  
> > > -bs_size = blk_getlength(n->conf.blk);
> > > -if (bs_size < 0) {
> > > -error_setg(errp, "could not get backing file size");
> > > -return;
> > > +if (!params->serial) {
> > > +error_setg(errp, "nvme: serial not configured");
> > > +return 1;
> > >  }
> > >  
> > > -if (!n->params.serial) {
> > > -error_setg(errp, "serial property not set");
> > > -return;
> > > +if ((params->num_queues < 1 || params->num_queues > NVME_MAX_QS)) {
> > > +error_setg(errp, "nvme: invalid queue configuration");
> > 
> > Maybe something like "nvme: invalid queue count specified, should be 
> > between 1 and ..."?
> > > +return 1;
> > >  }
> 
> Fixed.
Thanks
> 
> > > +
> > > +return 0;
> > > +}
> > > +
> > > +static int nvme_init_blk(NvmeCtrl *n, Error **errp)
> > > +{
> > >  blkconf_blocksizes(&n->conf);
> > >  if (!blkconf_apply_backend_options(&n->conf, 
> > > blk_is_read_only(n->conf.blk),
> > > -   false, errp)) {
> > > -return;
> > > +false, errp)) {
> > > +return 1;
> > >  }
> > >  
> > > -pci_conf = pci_dev->config;
> > > -pci_conf[PCI_INTERRUPT_PIN] = 1;
> > > -pci_config_set_prog_interface(pci_dev->config, 0x2);
> > > -pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
> > > -pcie_endpoint_cap_init(pci_dev, 0x80);
> > > +return 0;
> > > +}
> > >  
> > > +static void nvme_init_state(NvmeCtrl *n)
> > > +{
> > >  n->num_namespaces = 1;
> > >  n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
> > 
> > Isn't that wrong?
> > First 4K of mmio (0x1000) is the registers, and that is followed by the 
> > doorbells,
> > and each doorbell takes 8 bytes (assuming regular doorbell stride).
> > so n->params.num_queues + 1 should be total number of queues, thus the 
> > 0x1004 should be 0x1000 IMHO.
> > I might miss some rounding magic here though.
> > 
> 
> Yeah. I think you are right. It all becomes slightly more fishy due to
> the num_queues device parameter being 1's based and accounts for the
> admin queue pair.
> 
> But in get/set features, the value has to be 0's based and only account
> for the I/O queues, so we need to subtract 2 from the value. It's
> confusing all around.
Yeah, I can't agree more on that. The zero-based values have bitten
me a few times while I developed nvme-mdev as well.

> 
> Since the admin queue pair isn't really optional I think it would be
> better that we introduces a new max_ioqpairs parameter that is 1's
> based, counts number of pairs and obviously only accounts for the io
> queues.
> 
> I guess we need to keep the num_queues parameter around for
> compatibility.
> 
> The doorbells are only 4 bytes btw, but the calculation still looks
I don't understand that. Each doorbell is indeed 4 bytes, but they come
in pairs so each doorbell pair is 8 bytes.

BTW, the spec has a so-called doorbell stride, which allows one to
artificially increase
each doorbell by a power of two. This was intended for softw

[PATCH v2 0/2] Rework iotests finding

2020-03-25 Thread Vladimir Sementsov-Ogievskiy
Hi all!

When sending iotests upstream or porting patches from one branch
to another, we very often have to resolve conflicts in the group file,
as many absolutely independent features intersect in this file.
These conflicts are simple, but imagine how much time we all have
already spent on resolving them? Let's finally get rid of group file.

Next, another thing I don't like about iotests is the race for choosing
a test number: you have to search through your mailbox before choosing
a test number for a new test.

So, I propose to get rid of group file and search for tests another way
[look at patch 02]. Additionally I propose to move to human-readable
names for test files, with notation test-* .

v1 was one patch "[PATCH] iotests: drop group file"

Vladimir Sementsov-Ogievskiy (2):
  iotests: define group in each iotests
  iotests: rework test finding

 docs/devel/testing.rst   |  51 +-
 tests/qemu-iotests/001   |   1 +
 tests/qemu-iotests/002   |   1 +
 tests/qemu-iotests/003   |   1 +
 tests/qemu-iotests/004   |   1 +
 tests/qemu-iotests/005   |   1 +
 tests/qemu-iotests/007   |   1 +
 tests/qemu-iotests/008   |   1 +
 tests/qemu-iotests/009   |   1 +
 tests/qemu-iotests/010   |   1 +
 tests/qemu-iotests/011   |   1 +
 tests/qemu-iotests/012   |   1 +
 tests/qemu-iotests/013   |   1 +
 tests/qemu-iotests/014   |   1 +
 tests/qemu-iotests/015   |   1 +
 tests/qemu-iotests/017   |   1 +
 tests/qemu-iotests/018   |   1 +
 tests/qemu-iotests/019   |   1 +
 tests/qemu-iotests/020   |   1 +
 tests/qemu-iotests/021   |   1 +
 tests/qemu-iotests/022   |   1 +
 tests/qemu-iotests/023   |   1 +
 tests/qemu-iotests/024   |   1 +
 tests/qemu-iotests/025   |   1 +
 tests/qemu-iotests/026   |   1 +
 tests/qemu-iotests/027   |   1 +
 tests/qemu-iotests/028   |   1 +
 tests/qemu-iotests/029   |   1 +
 tests/qemu-iotests/030   |   1 +
 tests/qemu-iotests/031   |   1 +
 tests/qemu-iotests/032   |   1 +
 tests/qemu-iotests/033   |   1 +
 tests/qemu-iotests/034   |   1 +
 tests/qemu-iotests/035   |   1 +
 tests/qemu-iotests/036   |   1 +
 tests/qemu-iotests/037   |   1 +
 tests/qemu-iotests/038   |   1 +
 tests/qemu-iotests/039   |   1 +
 tests/qemu-iotests/040   |   1 +
 tests/qemu-iotests/041   |   1 +
 tests/qemu-iotests/042   |   1 +
 tests/qemu-iotests/043   |   1 +
 tests/qemu-iotests/044   |   1 +
 tests/qemu-iotests/045   |   1 +
 tests/qemu-iotests/046   |   1 +
 tests/qemu-iotests/047   |   1 +
 tests/qemu-iotests/048   |   1 +
 tests/qemu-iotests/049   |   1 +
 tests/qemu-iotests/050   |   1 +
 tests/qemu-iotests/051   |   1 +
 tests/qemu-iotests/052   |   1 +
 tests/qemu-iotests/053   |   1 +
 tests/qemu-iotests/054   |   1 +
 tests/qemu-iotests/055   |   1 +
 tests/qemu-iotests/056   |   1 +
 tests/qemu-iotests/057   |   1 +
 tests/qemu-iotests/058   |   1 +
 tests/qemu-iotests/059   |   1 +
 tests/qemu-iotests/060   |   1 +
 tests/qemu-iotests/061   |   1 +
 tests/qemu-iotests/062   |   1 +
 tests/qemu-iotests/063   |   1 +
 tests/qemu-iotests/064   |   1 +
 tests/qemu-iotests/065   |   1 +
 tests/qemu-iotests/066   |   1 +
 tests/qemu-iotests/067   |   1 +
 tests/qemu-iotests/068   |   1 +
 tests/qemu-iotests/069   |   1 +
 tests/qemu-iotests/070   |   1 +
 tests/qemu-iotests/071   |   1 +
 tests/qemu-iotests/072   |   1 +
 tests/qemu-iotests/073   |   1 +
 tests/qemu-iotests/074   |   1 +
 tests/qemu-iotests/075   |   1 +
 tests/qemu-iotests/076   |   1 +
 tests/qemu-iotests/077   |   1 +
 tests/qemu-iotests/078   |   1 +
 tests/qemu-iotests/079   |   1 +
 tests/qemu-iotests/080   |   1 +
 tests/qemu-iotests/081   |   1 +
 tests/qemu-iotests/082   |   1 +
 tests/qemu-iotests/083   |   1 +
 tests/qemu-iotests/084   |   1 +
 tests/qemu-iotests/085   |   1 +
 tests/qemu-iotests/086   |   1 +
 tests/qemu-iotests/087   |   1 +
 tests/qemu-iotests/088   |   1 +
 tests/qemu-iotests/089   |   1 +
 tests/qemu-iotests/090   |   1 +
 tests/qemu-iotests/091   |   1 +
 tests/qemu-iotests/092   |   1 +
 tests/qemu-iotests/093   |   1 +
 tests/qemu-iotests/094   |   1 +
 tests/qemu-iotests/095   |   1 +
 tests/qemu-iotests/096   |   1 +
 tests/qemu-iotests/097   |   1 +
 tests/qemu-iotests/098   |   1 +
 tests/qemu-iotests/099   |   1 +
 tests/qemu-iotests/10

Re: [PATCH v5 16/26] nvme: refactor prp mapping

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:51 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 13:44, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > Refactor nvme_map_prp and allow PRPs to be located in the CMB. The logic
> > > ensures that if some of the PRP is in the CMB, all of it must be located
> > > there, as per the specification.
> > 
> > To be honest this looks like not refactoring but a bugfix
> > (old code was just assuming that if first prp entry is in cmb, the rest 
> > also is)
> 
> I split it up into a separate bugfix patch.
> 
> > > 
> > > Also combine nvme_dma_{read,write}_prp into a single nvme_dma_prp that
> > > takes an additional DMADirection parameter.
> > 
> > To be honest 'nvme_dma_prp' was not a clear function name to me at first 
> > glance.
> > Could you rename this to nvme_dma_prp_rw or so? (Although even that is 
> > somewhat unclear
> > to convey the meaning of read/write the data to/from the guest memory areas 
> > defined by the prp list.
> > Also could you split this change into a new patch?
> > 
> 
> Splitting into new patch.
> 
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > Signed-off-by: Klaus Jensen 
> > 
> > Now you even use your both addresses :-)
> > 
> > > ---
> > >  hw/block/nvme.c   | 245 +++---
> > >  hw/block/nvme.h   |   2 +-
> > >  hw/block/trace-events |   1 +
> > >  include/block/nvme.h  |   1 +
> > >  4 files changed, 160 insertions(+), 89 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 4acfc85b56a2..334265efb21e 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -58,6 +58,11 @@
> > >  
> > >  static void nvme_process_sq(void *opaque);
> > >  
> > > +static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
> > > +{
> > > +return &n->cmbuf[addr - n->ctrl_mem.addr];
> > > +}
> > 
> > To my taste I would put this together with the patch that
> > added nvme_addr_is_cmb. I know that some people are against
> > this citing the fact that you should use the code you add
> > in the same patch. Your call.
> > 
> > Regardless of this I also prefer to put refactoring patches first in the 
> > series.
Thanks!
> > 
> > > +
> > >  static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> > >  {
> > >  hwaddr low = n->ctrl_mem.addr;
> > > @@ -152,138 +157,187 @@ static void nvme_irq_deassert(NvmeCtrl *n, 
> > > NvmeCQueue *cq)
> > >  }
> > >  }
> > >  
> > > -static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, 
> > > uint64_t prp1,
> > > - uint64_t prp2, uint32_t len, NvmeCtrl *n)
> > > +static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector 
> > > *iov,
> > > +uint64_t prp1, uint64_t prp2, uint32_t len, NvmeRequest *req)
> > 
> > Split line alignment (it was correct before).
> > Also while at the refactoring, it would be great to add some documentation
> > to this and few more functions, since its not clear immediately what this 
> > does.
> > 
> > 
> > >  {
> > >  hwaddr trans_len = n->page_size - (prp1 % n->page_size);
> > >  trans_len = MIN(len, trans_len);
> > >  int num_prps = (len >> n->page_bits) + 1;
> > > +uint16_t status = NVME_SUCCESS;
> > > +bool is_cmb = false;
> > > +bool prp_list_in_cmb = false;
> > > +
> > > +trace_nvme_dev_map_prp(nvme_cid(req), req->cmd.opcode, trans_len, 
> > > len,
> > > +prp1, prp2, num_prps);
> > >  
> > >  if (unlikely(!prp1)) {
> > >  trace_nvme_dev_err_invalid_prp();
> > >  return NVME_INVALID_FIELD | NVME_DNR;
> > > -} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
> > > -   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) 
> > > {
> > > -qsg->nsg = 0;
> > > +}
> > > +
> > > +if (nvme_addr_is_cmb(n, prp1)) {
> > > +is_cmb = true;
> > > +
> > >  qemu_iovec_init(iov, num_prps);
> > > -qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], 
> > > trans_len);
> > > +
> > > +/*
> > > + * PRPs do not cross page boundaries, so if the start address 
> > > (here,
> > > + * prp1) is within the CMB, it cannot cross outside the 
> > > controller
> > > + * memory buffer range. This is ensured by
> > > + *
> > > + *   len = n->page_size - (addr % n->page_size)
> > > + *
> > > + * Thus, we can directly add to the iovec without risking an out 
> > > of
> > > + * bounds access. This also holds for the remaining 
> > > qemu_iovec_add
> > > + * calls.
> > > + */
> > > +qemu_iovec_add(iov, nvme_addr_to_cmb(n, prp1), trans_len);
> > >  } else {
> > >  pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
> > >  qemu_sglist_add(qsg, prp1, trans_len);
> > >  }
> > > +
> > >  len -= trans_len;
> > >  if (len) {
> > >  if (unlikely(!prp2)) {
> > >  trace_nvme_dev_err_invalid

Re: [PATCH v5 09/26] nvme: add temperature threshold feature

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:44 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 11:31, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > It might seem weird to implement this feature for an emulated device,
> > > but it is mandatory to support and the feature is useful for testing
> > > asynchronous event request support, which will be added in a later
> > > patch.
> > 
> > Absolutely but as the old saying is, rules are rules.
> > At least, to the defense of the spec, making this mandatory
> > forced the vendors to actually report some statistics about
> > the device in neutral format as opposed to yet another
> > vendor proprietary thing (I am talking about SMART log page).
> > 
> > > 
> > > Signed-off-by: Klaus Jensen 
> > 
> > I noticed that you sign off some patches with your @samsung.com email,
> > and some with @cnexlabs.com
> > Is there a reason for that?
> 
> Yeah. Some of this code was made while I was at CNEX Labs. I've since
> moved to Samsung. But credit where credit's due.
I suspected something like that, but I just wanted to be sure that this is 
intentional,
and it looks all right to me now.

> 
> > 
> > 
> > > ---
> > >  hw/block/nvme.c  | 50 
> > >  hw/block/nvme.h  |  2 ++
> > >  include/block/nvme.h |  7 ++-
> > >  3 files changed, 58 insertions(+), 1 deletion(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 81514eaef63a..f72348344832 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -45,6 +45,9 @@
> > >  
> > >  #define NVME_SPEC_VER 0x00010201
> > >  #define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
> > > +#define NVME_TEMPERATURE 0x143
> > > +#define NVME_TEMPERATURE_WARNING 0x157
> > > +#define NVME_TEMPERATURE_CRITICAL 0x175
> > >  
> > >  #define NVME_GUEST_ERR(trace, fmt, ...) \
> > >  do { \
> > > @@ -798,9 +801,31 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl 
> > > *n, NvmeCmd *cmd)
> > >  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest 
> > > *req)
> > >  {
> > >  uint32_t dw10 = le32_to_cpu(cmd->cdw10);
> > > +uint32_t dw11 = le32_to_cpu(cmd->cdw11);
> > >  uint32_t result;
> > >  
> > >  switch (dw10) {
> > > +case NVME_TEMPERATURE_THRESHOLD:
> > > +result = 0;
> > > +
> > > +/*
> > > + * The controller only implements the Composite Temperature 
> > > sensor, so
> > > + * return 0 for all other sensors.
> > > + */
> > > +if (NVME_TEMP_TMPSEL(dw11)) {
> > > +break;
> > > +}
> > > +
> > > +switch (NVME_TEMP_THSEL(dw11)) {
> > > +case 0x0:
> > > +result = cpu_to_le16(n->features.temp_thresh_hi);
> > > +break;
> > > +case 0x1:
> > > +result = cpu_to_le16(n->features.temp_thresh_low);
> > > +break;
> > > +}
> > > +
> > > +break;
> > >  case NVME_VOLATILE_WRITE_CACHE:
> > >  result = blk_enable_write_cache(n->conf.blk);
> > >  trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
> > > @@ -845,6 +870,23 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
> > > NvmeCmd *cmd, NvmeRequest *req)
> > >  uint32_t dw11 = le32_to_cpu(cmd->cdw11);
> > >  
> > >  switch (dw10) {
> > > +case NVME_TEMPERATURE_THRESHOLD:
> > > +if (NVME_TEMP_TMPSEL(dw11)) {
> > > +break;
> > > +}
> > > +
> > > +switch (NVME_TEMP_THSEL(dw11)) {
> > > +case 0x0:
> > > +n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
> > > +break;
> > > +case 0x1:
> > > +n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
> > > +break;
> > > +default:
> > > +return NVME_INVALID_FIELD | NVME_DNR;
> > > +}
> > > +
> > > +break;
> > >  case NVME_VOLATILE_WRITE_CACHE:
> > >  blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
> > >  break;
> > > @@ -1366,6 +1408,9 @@ static void nvme_init_state(NvmeCtrl *n)
> > >  n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
> > >  n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
> > >  n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
> > > +
> > > +n->temperature = NVME_TEMPERATURE;
> > 
> > This appears not to be used in the patch.
> > I think you should move that to the next patch that
> > adds the get log page support.
> > 
> 
> Fixed.
Thanks
> 
> > > +n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
> > >  }
> > >  
> > >  static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
> > > @@ -1447,6 +1492,11 @@ static void nvme_init_ctrl(NvmeCtrl *n)
> > >  id->acl = 3;
> > >  id->frmw = 7 << 1;
> > >  id->lpa = 1 << 0;
> > > +
> > > +/* recommended default value (~70 C) */
> > > +id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
> > > +id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
> > > 

[PATCH v2 1/2] iotests: define group in each iotests

2020-03-25 Thread Vladimir Sementsov-Ogievskiy
We are going to drop the group file. Define groups in the tests as a
preparatory step.

The patch is generated by

cd tests/qemu-iotests

grep '^[0-9]\{3\} ' group | while read line; do
file=$(awk '{print $1}' <<< "$line");
groups=$(sed -e 's/^... //' <<< "$line");
awk "NR==2{print \"# group: $groups\"}1" $file > tmp;
cat tmp > $file;
done

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 tests/qemu-iotests/001 | 1 +
 tests/qemu-iotests/002 | 1 +
 tests/qemu-iotests/003 | 1 +
 tests/qemu-iotests/004 | 1 +
 tests/qemu-iotests/005 | 1 +
 tests/qemu-iotests/007 | 1 +
 tests/qemu-iotests/008 | 1 +
 tests/qemu-iotests/009 | 1 +
 tests/qemu-iotests/010 | 1 +
 tests/qemu-iotests/011 | 1 +
 tests/qemu-iotests/012 | 1 +
 tests/qemu-iotests/013 | 1 +
 tests/qemu-iotests/014 | 1 +
 tests/qemu-iotests/015 | 1 +
 tests/qemu-iotests/017 | 1 +
 tests/qemu-iotests/018 | 1 +
 tests/qemu-iotests/019 | 1 +
 tests/qemu-iotests/020 | 1 +
 tests/qemu-iotests/021 | 1 +
 tests/qemu-iotests/022 | 1 +
 tests/qemu-iotests/023 | 1 +
 tests/qemu-iotests/024 | 1 +
 tests/qemu-iotests/025 | 1 +
 tests/qemu-iotests/026 | 1 +
 tests/qemu-iotests/027 | 1 +
 tests/qemu-iotests/028 | 1 +
 tests/qemu-iotests/029 | 1 +
 tests/qemu-iotests/030 | 1 +
 tests/qemu-iotests/031 | 1 +
 tests/qemu-iotests/032 | 1 +
 tests/qemu-iotests/033 | 1 +
 tests/qemu-iotests/034 | 1 +
 tests/qemu-iotests/035 | 1 +
 tests/qemu-iotests/036 | 1 +
 tests/qemu-iotests/037 | 1 +
 tests/qemu-iotests/038 | 1 +
 tests/qemu-iotests/039 | 1 +
 tests/qemu-iotests/040 | 1 +
 tests/qemu-iotests/041 | 1 +
 tests/qemu-iotests/042 | 1 +
 tests/qemu-iotests/043 | 1 +
 tests/qemu-iotests/044 | 1 +
 tests/qemu-iotests/045 | 1 +
 tests/qemu-iotests/046 | 1 +
 tests/qemu-iotests/047 | 1 +
 tests/qemu-iotests/048 | 1 +
 tests/qemu-iotests/049 | 1 +
 tests/qemu-iotests/050 | 1 +
 tests/qemu-iotests/051 | 1 +
 tests/qemu-iotests/052 | 1 +
 tests/qemu-iotests/053 | 1 +
 tests/qemu-iotests/054 | 1 +
 tests/qemu-iotests/055 | 1 +
 tests/qemu-iotests/056 | 1 +
 tests/qemu-iotests/057 | 1 +
 tests/qemu-iotests/058 | 1 +
 tests/qemu-iotests/059 | 1 +
 tests/qemu-iotests/060 | 1 +
 tests/qemu-iotests/061 | 1 +
 tests/qemu-iotests/062 | 1 +
 tests/qemu-iotests/063 | 1 +
 tests/qemu-iotests/064 | 1 +
 tests/qemu-iotests/065 | 1 +
 tests/qemu-iotests/066 | 1 +
 tests/qemu-iotests/067 | 1 +
 tests/qemu-iotests/068 | 1 +
 tests/qemu-iotests/069 | 1 +
 tests/qemu-iotests/070 | 1 +
 tests/qemu-iotests/071 | 1 +
 tests/qemu-iotests/072 | 1 +
 tests/qemu-iotests/073 | 1 +
 tests/qemu-iotests/074 | 1 +
 tests/qemu-iotests/075 | 1 +
 tests/qemu-iotests/076 | 1 +
 tests/qemu-iotests/077 | 1 +
 tests/qemu-iotests/078 | 1 +
 tests/qemu-iotests/079 | 1 +
 tests/qemu-iotests/080 | 1 +
 tests/qemu-iotests/081 | 1 +
 tests/qemu-iotests/082 | 1 +
 tests/qemu-iotests/083 | 1 +
 tests/qemu-iotests/084 | 1 +
 tests/qemu-iotests/085 | 1 +
 tests/qemu-iotests/086 | 1 +
 tests/qemu-iotests/087 | 1 +
 tests/qemu-iotests/088 | 1 +
 tests/qemu-iotests/089 | 1 +
 tests/qemu-iotests/090 | 1 +
 tests/qemu-iotests/091 | 1 +
 tests/qemu-iotests/092 | 1 +
 tests/qemu-iotests/093 | 1 +
 tests/qemu-iotests/094 | 1 +
 tests/qemu-iotests/095 | 1 +
 tests/qemu-iotests/096 | 1 +
 tests/qemu-iotests/097 | 1 +
 tests/qemu-iotests/098 | 1 +
 tests/qemu-iotests/099 | 1 +
 tests/qemu-iotests/101 | 1 +
 tests/qemu-iotests/102 | 1 +
 tests/qemu-iotests/103 | 1 +
 tests/qemu-iotests/104 | 1 +
 tests/qemu-iotests/105 | 1 +
 tests/qemu-iotests/106 | 1 +
 tests/qemu-iotests/107 | 1 +
 tests/qemu-iotests/108 | 1 +
 tests/qemu-iotests/109 | 1 +
 tests/qemu-iotests/110 | 1 +
 tests/qemu-iotests/111 | 1 +
 tests/qemu-iotests/112 | 1 +
 tests/qemu-iotests/113 | 1 +
 tests/qemu-iotests/114 | 1 +
 tests/qemu-iotests/115 | 1 +
 tests/qemu-iotests/116 | 1 +
 tests/qemu-iotests/117 | 1 +
 tests/qemu-iotests/118 | 1 +
 tests/qemu-iotests/119 | 1 +
 tests/qemu-iotests/120 | 1 +
 tests/qemu-iotests/121 | 1 +
 tests/qemu-iotests/122 | 1 +
 tests/qemu-iotests/123 | 1 +
 tests/qemu-iotests/124 | 1 +
 tests/qemu-iotests/125 | 1 +
 tests/qemu-iotests/126 | 1 +
 tests/qemu-iotests/127 | 1 +
 tests/qemu-iotests/128 | 1 +
 tests/qemu-iotests/129 | 1 +
 tests/qemu-iotests/130 | 1 +
 tests/qemu-iotests/131 | 1 +
 tests/qemu-iotests/132 | 1 +
 tests/qemu-iotests/133 | 1 +
 tests/qemu-iotests/134 | 1 +
 tests/qemu-iotests/135 | 1 +
 tests/qemu-iotests/136 | 1 +
 tests/qemu-iotests/137 | 1 +
 tests/qemu-iotests/138 | 1 +
 tests/qemu-iotests/139 | 1 +
 tests/qemu-iotests/140 | 1 +
 tests/qemu-iotests/141 | 1 +
 tests/qemu-iotests/143 | 1 +
 tests/qemu-iotests/144 | 1 +
 tests/qemu-iotests/145 | 1 +
 tests/qemu-iotests/146 | 1 +
 tests/qemu-iotests/147 | 1 +
 tests/qemu-iotests/148 | 1 +
 tests/qemu-iotests/149 | 1 +
 tests/qemu-iotests/150 | 1 +
 tests/qemu-iotests/151 | 1 +
 tests/qemu-iotests/152 | 1 +
 tests/qemu-iotests/153 | 1 +
 tests/qemu-iotests/154 | 1 +
 tests/qemu-iotests/155 | 1 +
 tests/qemu-iotests/156 | 1 +
 tests/qemu-

Re: [PATCH v5 17/26] nvme: allow multiple aios per command

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:53 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 13:48, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > This refactors how the device issues asynchronous block backend
> > > requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> > > associated with the command. This allows multiple aios to be issued for
> > > a command. Only when all requests have been completed will the device
> > > post a completion queue entry.
> > > 
> > > Because the device is currently guaranteed to only issue a single aio
> > > request per command, the benefit is not immediately obvious. But this
> > > functionality is required to support metadata, the dataset management
> > > command and other features.
> > 
> > I don't know what the strategy will be chosen for supporting metadata
> > (qemu doesn't have any notion of metadata in the block layer), but for 
> > dataset management
> > you are right. Dataset management command can contain a table of areas to 
> > discard
> > (although in reality I have seen no driver putting there more that one 
> > entry).
> > 
> 
> The strategy is different depending on how the metadata is transferred
> between host and device. For the "separate buffer" case, metadata is
> transferred using a separate memory pointer in the nvme command (MPTR).
> In this case the metadata is kept separately on a new blockdev attached
> to the namespace.
Looks reasonable.
> 


> In the other case, metadata is transferred as part of an extended lba
> (say 512 + 8 bytes) and kept inline on the main namespace blockdev. This
> is challenging for QEMU as it breaks interoperability of the image with
> other devices. But that is a discussion for fresh RFC ;)

Yes, this one is quite problematic. IMHO even the kernel opted not to
support this kind of metadata (I know that since I played with one of Intel's 
enterprise
SSDs when I developed nvme-mdev, and sadly this is the only kind of metadata it 
supports).
I guess if we have to support this format (for the sake of making our nvme 
virtual device
as feature complete as possible for driver development), I would emulate this 
with a
separate drive as well.

> 
> Note that the support for multiple AIOs is also used for DULBE support
This is a typo? I don't recall something like that from the spec.

> down the line when I get around to posting those patches. So this is
> preparatory for a lot of features that requires persistant state across
> device power off.
All right. Thanks again for your work. I wish I had all these features
when I developed nvme-mdev, it would make my life much easier.

> 
> > 
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c   | 449 +-
> > >  hw/block/nvme.h   | 134 +++--
> > >  hw/block/trace-events |   8 +
> > >  3 files changed, 480 insertions(+), 111 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 334265efb21e..e97da35c4ca1 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -19,7 +19,8 @@
> > >   *  -drive file=,if=none,id=
> > >   *  -device nvme,drive=,serial=,id=, 
> > > \
> > >   *  cmb_size_mb=, \
> > > - *  num_queues=
> > > + *  num_queues=, \
> > > + *  mdts=
> > 
> > Could you split mdts checks into a separate patch? This is not related to 
> > the series.
> 
> Absolutely. Done.
Perfect, thanks!
> 
> > 
> > >   *
> > >   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
> > >   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> > > @@ -57,6 +58,7 @@
> > >  } while (0)
> > >  
> > >  static void nvme_process_sq(void *opaque);
> > > +static void nvme_aio_cb(void *opaque, int ret);
> > >  
> > >  static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
> > >  {
> > > @@ -341,6 +343,107 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t 
> > > *ptr, uint32_t len,
> > >  return status;
> > >  }
> > >  
> > > +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > > +{
> > > +NvmeNamespace *ns = req->ns;
> > > +
> > > +uint32_t len = req->nlb << nvme_ns_lbads(ns);
> > > +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> > > +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> > > +
> > > +return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
> > > +}
> > 
> > Same here, this is another nice refactoring and it should be in separate 
> > patch.
> 
> Done.
> 
> > 
> > > +
> > > +static void nvme_aio_destroy(NvmeAIO *aio)
> > > +{
> > > +g_free(aio);
> > > +}
> > > +
> > > +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
> > > +NvmeAIOOp opc)
> > > +{
> > > +aio->opc = opc;
> > > +
> > > +trace_nvme_dev_req_register_aio(nvme_cid(req), aio, 
> > > blk_name(aio->blk),
> > > +aio->offset, aio->len, nvme_aio_opc_str(a

Re: [PATCH v5 10/26] nvme: add support for the get log page command

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:45 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 11:35, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > Add support for the Get Log Page command and basic implementations of
> > > the mandatory Error Information, SMART / Health Information and Firmware
> > > Slot Information log pages.
> > > 
> > > In violation of the specification, the SMART / Health Information log
> > > page does not persist information over the lifetime of the controller
> > > because the device has no place to store such persistent state.
> > 
> > Yea, not the end of the world.
> > > 
> > > Note that the LPA field in the Identify Controller data structure
> > > intentionally has bit 0 cleared because there is no namespace specific
> > > information in the SMART / Health information log page.
> > 
> > Makes sense.
> > > 
> > > Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
> > > Section 5.10 ("Get Log Page command").
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c   | 122 +-
> > >  hw/block/nvme.h   |  10 
> > >  hw/block/trace-events |   2 +
> > >  include/block/nvme.h  |   2 +-
> > >  4 files changed, 134 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index f72348344832..468c36918042 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -569,6 +569,123 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd 
> > > *cmd)
> > >  return NVME_SUCCESS;
> > >  }
> > >  
> > > +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t 
> > > buf_len,
> > > +uint64_t off, NvmeRequest *req)
> > > +{
> > > +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> > > +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> > > +uint32_t nsid = le32_to_cpu(cmd->nsid);
> > > +
> > > +uint32_t trans_len;
> > > +time_t current_ms;
> > > +uint64_t units_read = 0, units_written = 0, read_commands = 0,
> > > +write_commands = 0;
> > > +NvmeSmartLog smart;
> > > +BlockAcctStats *s;
> > > +
> > > +if (nsid && nsid != 0x) {
> > > +return NVME_INVALID_FIELD | NVME_DNR;
> > > +}
> > > +
> > > +s = blk_get_stats(n->conf.blk);
> > > +
> > > +units_read = s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
> > > +units_written = s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
> > > +read_commands = s->nr_ops[BLOCK_ACCT_READ];
> > > +write_commands = s->nr_ops[BLOCK_ACCT_WRITE];
> > > +
> > > +if (off > sizeof(smart)) {
> > > +return NVME_INVALID_FIELD | NVME_DNR;
> > > +}
> > > +
> > > +trans_len = MIN(sizeof(smart) - off, buf_len);
> > > +
> > > +memset(&smart, 0x0, sizeof(smart));
> > > +
> > > +smart.data_units_read[0] = cpu_to_le64(units_read / 1000);
> > > +smart.data_units_written[0] = cpu_to_le64(units_written / 1000);
> > > +smart.host_read_commands[0] = cpu_to_le64(read_commands);
> > > +smart.host_write_commands[0] = cpu_to_le64(write_commands);
> > > +
> > > +smart.temperature[0] = n->temperature & 0xff;
> > > +smart.temperature[1] = (n->temperature >> 8) & 0xff;
> > > +
> > > +if ((n->temperature > n->features.temp_thresh_hi) ||
> > > +(n->temperature < n->features.temp_thresh_low)) {
> > > +smart.critical_warning |= NVME_SMART_TEMPERATURE;
> > > +}
> > > +
> > > +current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> > > +smart.power_on_hours[0] = cpu_to_le64(
> > > +(((current_ms - n->starttime_ms) / 1000) / 60) / 60);
> > > +
> > > +return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, 
> > > prp1,
> > > +prp2);
> > > +}
> > 
> > Looks OK.
> > > +
> > > +static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t 
> > > buf_len,
> > > +uint64_t off, NvmeRequest *req)
> > > +{
> > > +uint32_t trans_len;
> > > +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> > > +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> > > +NvmeFwSlotInfoLog fw_log;
> > > +
> > > +if (off > sizeof(fw_log)) {
> > > +return NVME_INVALID_FIELD | NVME_DNR;
> > > +}
> > > +
> > > +memset(&fw_log, 0, sizeof(NvmeFwSlotInfoLog));
> > > +
> > > +trans_len = MIN(sizeof(fw_log) - off, buf_len);
> > > +
> > > +return nvme_dma_read_prp(n, (uint8_t *) &fw_log + off, trans_len, 
> > > prp1,
> > > +prp2);
> > > +}
> > 
> > Looks OK
> > > +
> > > +static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > > +{
> > > +uint32_t dw10 = le32_to_cpu(cmd->cdw10);
> > > +uint32_t dw11 = le32_to_cpu(cmd->cdw11);
> > > +uint32_t dw12 = le32_to_cpu(cmd->cdw12);
> > > +uint32_t dw13 = le32_to_cpu(cmd->cdw13);
> > > +uint8_t  lid = dw10 & 0xff;
> > > +uint8_t  rae = (dw10 >> 15) & 0x1;
> > > +uint32_t numdl, numdu;
> > > +uint64_t off, lpol, lpou;
> > > +size_t   len;
> > > +
> >

Re: [PATCH v5 15/26] nvme: bump supported specification to 1.3

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:50 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 12:35, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > Add new fields to the Identify Controller and Identify Namespace data
> > > structures accoding to NVM Express 1.3d.
> > > 
> > > NVM Express 1.3d requires the following additional features:
> > >   - addition of the Namespace Identification Descriptor List (CNS 03h)
> > > for the Identify command
> > >   - support for returning Command Sequence Error if a Set Features
> > > command is submitted for the Number of Queues feature after any I/O
> > > queues have been created.
> > >   - The addition of the Log Specific Field (LSP) in the Get Log Page
> > > command.
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c   | 57 ---
> > >  hw/block/nvme.h   |  1 +
> > >  hw/block/trace-events |  3 ++-
> > >  include/block/nvme.h  | 20 ++-
> > >  4 files changed, 71 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 900732bb2f38..4acfc85b56a2 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -9,7 +9,7 @@
> > >   */
> > >  
> > >  /**
> > > - * Reference Specification: NVM Express 1.2.1
> > > + * Reference Specification: NVM Express 1.3d
> > >   *
> > >   *   https://nvmexpress.org/resources/specifications/
> > >   */
> > > @@ -43,7 +43,7 @@
> > >  #include "trace.h"
> > >  #include "nvme.h"
> > >  
> > > -#define NVME_SPEC_VER 0x00010201
> > > +#define NVME_SPEC_VER 0x00010300
> > >  #define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
> > >  #define NVME_TEMPERATURE 0x143
> > >  #define NVME_TEMPERATURE_WARNING 0x157
> > > @@ -735,6 +735,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd 
> > > *cmd, NvmeRequest *req)
> > >  uint32_t dw12 = le32_to_cpu(cmd->cdw12);
> > >  uint32_t dw13 = le32_to_cpu(cmd->cdw13);
> > >  uint8_t  lid = dw10 & 0xff;
> > > +uint8_t  lsp = (dw10 >> 8) & 0xf;
> > >  uint8_t  rae = (dw10 >> 15) & 0x1;
> > >  uint32_t numdl, numdu;
> > >  uint64_t off, lpol, lpou;
> > > @@ -752,7 +753,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd 
> > > *cmd, NvmeRequest *req)
> > >  return NVME_INVALID_FIELD | NVME_DNR;
> > >  }
> > >  
> > > -trace_nvme_dev_get_log(nvme_cid(req), lid, rae, len, off);
> > > +trace_nvme_dev_get_log(nvme_cid(req), lid, lsp, rae, len, off);
> > >  
> > >  switch (lid) {
> > >  case NVME_LOG_ERROR_INFO:
> > > @@ -863,6 +864,8 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd 
> > > *cmd)
> > >  cq = g_malloc0(sizeof(*cq));
> > >  nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
> > >  NVME_CQ_FLAGS_IEN(qflags));
> > 
> > Code alignment on that '('
> > > +
> > > +n->qs_created = true;
> > 
> > Should be done also at nvme_create_sq
> 
> No, because you can't create a SQ without a matching CQ:
True, I missed that.

> 
> if (unlikely(!cqid || nvme_check_cqid(n, cqid))) {
> trace_nvme_dev_err_invalid_create_sq_cqid(cqid);
> return NVME_INVALID_CQID | NVME_DNR;
> }
> 
> 
> So if there is a matching cq, then qs_created = true.
> 
> > >  return NVME_SUCCESS;
> > >  }
> > >  
> > > @@ -924,6 +927,47 @@ static uint16_t nvme_identify_ns_list(NvmeCtrl *n, 
> > > NvmeIdentify *c)
> > >  return ret;
> > >  }
> > >  
> > > +static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeCmd *c)
> > > +{
> > > +static const int len = 4096;
> > 
> > The spec caps the Identify payload size to 4K,
> > thus this should go to nvme.h
> 
> Done.
> 
> > > +
> > > +struct ns_descr {
> > > +uint8_t nidt;
> > > +uint8_t nidl;
> > > +uint8_t rsvd2[2];
> > > +uint8_t nid[16];
> > > +};
> > 
> > This is also part of the spec, thus should
> > move to nvme.h
> > 
> 
> Done - and cleaned up.
Perfect, thanks!
> 
> > > +
> > > +uint32_t nsid = le32_to_cpu(c->nsid);
> > > +uint64_t prp1 = le64_to_cpu(c->prp1);
> > > +uint64_t prp2 = le64_to_cpu(c->prp2);
> > > +
> > > +struct ns_descr *list;
> > > +uint16_t ret;
> > > +
> > > +trace_nvme_dev_identify_ns_descr_list(nsid);
> > > +
> > > +if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
> > > +trace_nvme_dev_err_invalid_ns(nsid, n->num_namespaces);
> > > +return NVME_INVALID_NSID | NVME_DNR;
> > > +}
> > > +
> > > +/*
> > > + * Because the NGUID and EUI64 fields are 0 in the Identify 
> > > Namespace data
> > > + * structure, a Namespace UUID (nidt = 0x3) must be reported in the
> > > + * Namespace Identification Descriptor. Add a very basic Namespace 
> > > UUID
> > > + * here.
> > 
> > Some per namespace uuid qemu property will be very nice to have to have a 
> > uuid that
> > is at least somewhat unique.
> > Linux kernel I think might complain if it detects namespaces with duplicate 
> > uuids.
> 

Re: [PATCH v5 12/26] nvme: add missing mandatory features

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:47 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 12:27, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > Add support for returning a resonable response to Get/Set Features of
> > > mandatory features.
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c   | 57 ---
> > >  hw/block/trace-events |  2 ++
> > >  include/block/nvme.h  |  3 ++-
> > >  3 files changed, 58 insertions(+), 4 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index a186d95df020..3267ee2de47a 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -1008,7 +1008,15 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, 
> > > NvmeCmd *cmd, NvmeRequest *req)
> > >  uint32_t dw11 = le32_to_cpu(cmd->cdw11);
> > >  uint32_t result;
> > >  
> > > +trace_nvme_dev_getfeat(nvme_cid(req), dw10);
> > > +
> > >  switch (dw10) {
> > > +case NVME_ARBITRATION:
> > > +result = cpu_to_le32(n->features.arbitration);
> > > +break;
> > > +case NVME_POWER_MANAGEMENT:
> > > +result = cpu_to_le32(n->features.power_mgmt);
> > > +break;
> > >  case NVME_TEMPERATURE_THRESHOLD:
> > >  result = 0;
> > >  
> > > @@ -1029,6 +1037,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, 
> > > NvmeCmd *cmd, NvmeRequest *req)
> > >  break;
> > >  }
> > >  
> > > +break;
> > > +case NVME_ERROR_RECOVERY:
> > > +result = cpu_to_le32(n->features.err_rec);
> > >  break;
> > >  case NVME_VOLATILE_WRITE_CACHE:
> > >  result = blk_enable_write_cache(n->conf.blk);
> > 
> > This is existing code but still like to point out that endianess conversion 
> > is missing.
> 
> Fixed.
> 
> > Also we need to think if we need to do some flush if the write cache is 
> > disabled.
> > I don't know yet that area well enough.
> > 
> 
> Looking at the block layer code it just sets a flag when disabling, but
> subsequent requests will have BDRV_REQ_FUA set. So to make sure that
> stuff in the cache is flushed, let's do a flush.
Good to know!

> 
> > > @@ -1041,6 +1052,19 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, 
> > > NvmeCmd *cmd, NvmeRequest *req)
> > >  break;
> > >  case NVME_TIMESTAMP:
> > >  return nvme_get_feature_timestamp(n, cmd);
> > > +case NVME_INTERRUPT_COALESCING:
> > > +result = cpu_to_le32(n->features.int_coalescing);
> > > +break;
> > > +case NVME_INTERRUPT_VECTOR_CONF:
> > > +if ((dw11 & 0x) > n->params.num_queues) {
> > 
> > Looks like it should be >= since interrupt vector is not zero based.
> 
> Fixed in other patch.
> 
> > > +return NVME_INVALID_FIELD | NVME_DNR;
> > > +}
> > > +
> > > +result = cpu_to_le32(n->features.int_vector_config[dw11 & 
> > > 0x]);
> > > +break;
> > > +case NVME_WRITE_ATOMICITY:
> > > +result = cpu_to_le32(n->features.write_atomicity);
> > > +break;
> > >  case NVME_ASYNCHRONOUS_EVENT_CONF:
> > >  result = cpu_to_le32(n->features.async_config);
> > >  break;
> > > @@ -1076,6 +1100,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
> > > NvmeCmd *cmd, NvmeRequest *req)
> > >  uint32_t dw10 = le32_to_cpu(cmd->cdw10);
> > >  uint32_t dw11 = le32_to_cpu(cmd->cdw11);
> > >  
> > > +trace_nvme_dev_setfeat(nvme_cid(req), dw10, dw11);
> > > +
> > >  switch (dw10) {
> > >  case NVME_TEMPERATURE_THRESHOLD:
> > >  if (NVME_TEMP_TMPSEL(dw11)) {
> > > @@ -1116,6 +1142,13 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
> > > NvmeCmd *cmd, NvmeRequest *req)
> > >  case NVME_ASYNCHRONOUS_EVENT_CONF:
> > >  n->features.async_config = dw11;
> > >  break;
> > > +case NVME_ARBITRATION:
> > > +case NVME_POWER_MANAGEMENT:
> > > +case NVME_ERROR_RECOVERY:
> > > +case NVME_INTERRUPT_COALESCING:
> > > +case NVME_INTERRUPT_VECTOR_CONF:
> > > +case NVME_WRITE_ATOMICITY:
> > > +return NVME_FEAT_NOT_CHANGABLE | NVME_DNR;
> > >  default:
> > >  trace_nvme_dev_err_invalid_setfeat(dw10);
> > >  return NVME_INVALID_FIELD | NVME_DNR;
> > > @@ -1689,6 +1722,21 @@ static void nvme_init_state(NvmeCtrl *n)
> > >  n->temperature = NVME_TEMPERATURE;
> > >  n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
> > >  n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> > > +
> > > +/*
> > > + * There is no limit on the number of commands that the controller 
> > > may
> > > + * launch at one time from a particular Submission Queue.
> > > + */
> > > +n->features.arbitration = 0x7;
> > 
> > A nice #define in nvme.h stating that 0x7 means no burst limit would be 
> > nice.
> > 
> 
> Done.
> 
> > > +
> > > +n->features.int_vector_config = g_malloc0_n(n->params.num_queues,
> > > +sizeof(*n->features.int_vector_co

Re: [PATCH v5 21/26] nvme: add support for scatter gather lists

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:54 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 14:07, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> > > For now, support the Data Block, Segment and Last Segment descriptor
> > > types.
> > > 
> > > See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > Acked-by: Fam Zheng 
> > > ---
> > >  block/nvme.c  |  18 +-
> > >  hw/block/nvme.c   | 375 +++---
> > >  hw/block/trace-events |   4 +
> > >  include/block/nvme.h  |  62 ++-
> > >  4 files changed, 389 insertions(+), 70 deletions(-)
> > > 
> > > diff --git a/block/nvme.c b/block/nvme.c
> > > index d41c4bda6e39..521f521054d5 100644
> > > --- a/block/nvme.c
> > > +++ b/block/nvme.c
> > > @@ -446,7 +446,7 @@ static void nvme_identify(BlockDriverState *bs, int 
> > > namespace, Error **errp)
> > >  error_setg(errp, "Cannot map buffer for DMA");
> > >  goto out;
> > >  }
> > > -cmd.prp1 = cpu_to_le64(iova);
> > > +cmd.dptr.prp.prp1 = cpu_to_le64(iova);
> > >  
> > >  if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
> > >  error_setg(errp, "Failed to identify controller");
> > > @@ -545,7 +545,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, 
> > > Error **errp)
> > >  }
> > >  cmd = (NvmeCmd) {
> > >  .opcode = NVME_ADM_CMD_CREATE_CQ,
> > > -.prp1 = cpu_to_le64(q->cq.iova),
> > > +.dptr.prp.prp1 = cpu_to_le64(q->cq.iova),
> > >  .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0x)),
> > >  .cdw11 = cpu_to_le32(0x3),
> > >  };
> > > @@ -556,7 +556,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, 
> > > Error **errp)
> > >  }
> > >  cmd = (NvmeCmd) {
> > >  .opcode = NVME_ADM_CMD_CREATE_SQ,
> > > -.prp1 = cpu_to_le64(q->sq.iova),
> > > +.dptr.prp.prp1 = cpu_to_le64(q->sq.iova),
> > >  .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0x)),
> > >  .cdw11 = cpu_to_le32(0x1 | (n << 16)),
> > >  };
> > > @@ -906,16 +906,16 @@ try_map:
> > >  case 0:
> > >  abort();
> > >  case 1:
> > > -cmd->prp1 = pagelist[0];
> > > -cmd->prp2 = 0;
> > > +cmd->dptr.prp.prp1 = pagelist[0];
> > > +cmd->dptr.prp.prp2 = 0;
> > >  break;
> > >  case 2:
> > > -cmd->prp1 = pagelist[0];
> > > -cmd->prp2 = pagelist[1];
> > > +cmd->dptr.prp.prp1 = pagelist[0];
> > > +cmd->dptr.prp.prp2 = pagelist[1];
> > >  break;
> > >  default:
> > > -cmd->prp1 = pagelist[0];
> > > -cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
> > > +cmd->dptr.prp.prp1 = pagelist[0];
> > > +cmd->dptr.prp.prp2 = cpu_to_le64(req->prp_list_iova + 
> > > sizeof(uint64_t));
> > >  break;
> > >  }
> > >  trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 204ae1d33234..a91c60fdc111 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -75,8 +75,10 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, 
> > > hwaddr addr)
> > >  
> > >  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> > >  {
> > > -if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
> > > -memcpy(buf, (void *) &n->cmbuf[addr - n->ctrl_mem.addr], size);
> > > +hwaddr hi = addr + size;
> > 
> > Are you sure you don't want to check for overflow here?
> > Its theoretical issue since addr has to be almost full 64 bit
> > but still for those things I check this very defensively.
> > 
> 
> The use of nvme_addr_read in map_prp simply cannot overflow due to how
> the size is calculated, but for SGLs it's different. But the overflow is
> checked in map_sgl because we have to return a special error code in
> that case.
> 
> On the other hand there may be other callers of nvme_addr_read in the
> future that does not check this, so I'll re-add it.
Yep. For security things, as many checks as possible are always good.
Thanks!

> 
> > > +
> > > +if (n->cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, 
> > > hi)) {
> > 
> > Here you fix the bug I mentioned in patch 6. I suggest you to move the fix 
> > there.
> 
> Done.
> 
> > > +memcpy(buf, nvme_addr_to_cmb(n, addr), size);
> > >  return 0;
> > >  }
> > >  
> > > @@ -159,6 +161,48 @@ static void nvme_irq_deassert(NvmeCtrl *n, 
> > > NvmeCQueue *cq)
> > >  }
> > >  }
> > >  
> > > +static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr 
> > > addr,
> > > +size_t len)
> > > +{
> > > +if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len)) {
> > > +return NVME_DATA_TRANSFER_ERROR;
> > > +}
> > > +
> > > +qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
> > > +
> > > +return NVME_SUCCESS;
> > > +}
> > > +
> > > +st

Re: [PATCH v5 20/26] nvme: handle dma errors

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:53 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 13:52, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> > > Handling DMA errors gracefully is required for the device to pass the
> > > block/011 test ("disable PCI device while doing I/O") in the blktests
> > > suite.
> > > 
> > > With this patch the device passes the test by retrying "critical"
> > > transfers (posting of completion entries and processing of submission
> > > queue entries).
> > > 
> > > If DMA errors occur at any other point in the execution of the command
> > > (say, while mapping the PRPs), the command is aborted with a Data
> > > Transfer Error status code.
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c   | 42 +-
> > >  hw/block/trace-events |  2 ++
> > >  include/block/nvme.h  |  2 +-
> > >  3 files changed, 36 insertions(+), 10 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index f8c81b9e2202..204ae1d33234 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -73,14 +73,14 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, 
> > > hwaddr addr)
> > >  return addr >= low && addr < hi;
> > >  }
> > >  
> > > -static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> > > +static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> > >  {
> > >  if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
> > >  memcpy(buf, (void *) &n->cmbuf[addr - n->ctrl_mem.addr], size);
> > > -return;
> > > +return 0;
> > >  }
> > >  
> > > -pci_dma_read(&n->parent_obj, addr, buf, size);
> > > +return pci_dma_read(&n->parent_obj, addr, buf, size);
> > >  }
> > >  
> > >  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
> > > @@ -168,6 +168,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> > > *qsg, QEMUIOVector *iov,
> > >  uint16_t status = NVME_SUCCESS;
> > >  bool is_cmb = false;
> > >  bool prp_list_in_cmb = false;
> > > +int ret;
> > >  
> > >  trace_nvme_dev_map_prp(nvme_cid(req), req->cmd.opcode, trans_len, 
> > > len,
> > >  prp1, prp2, num_prps);
> > > @@ -218,7 +219,12 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> > > *qsg, QEMUIOVector *iov,
> > >  
> > >  nents = (len + n->page_size - 1) >> n->page_bits;
> > >  prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
> > > -nvme_addr_read(n, prp2, (void *) prp_list, prp_trans);
> > > +ret = nvme_addr_read(n, prp2, (void *) prp_list, prp_trans);
> > > +if (ret) {
> > > +trace_nvme_dev_err_addr_read(prp2);
> > > +status = NVME_DATA_TRANSFER_ERROR;
> > > +goto unmap;
> > > +}
> > >  while (len != 0) {
> > >  uint64_t prp_ent = le64_to_cpu(prp_list[i]);
> > >  
> > > @@ -237,7 +243,13 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> > > *qsg, QEMUIOVector *iov,
> > >  i = 0;
> > >  nents = (len + n->page_size - 1) >> n->page_bits;
> > >  prp_trans = MIN(n->max_prp_ents, nents) * 
> > > sizeof(uint64_t);
> > > -nvme_addr_read(n, prp_ent, (void *) prp_list, 
> > > prp_trans);
> > > +ret = nvme_addr_read(n, prp_ent, (void *) prp_list,
> > > +prp_trans);
> > > +if (ret) {
> > > +trace_nvme_dev_err_addr_read(prp_ent);
> > > +status = NVME_DATA_TRANSFER_ERROR;
> > > +goto unmap;
> > > +}
> > >  prp_ent = le64_to_cpu(prp_list[i]);
> > >  }
> > >  
> > > @@ -443,6 +455,7 @@ static void nvme_post_cqes(void *opaque)
> > >  NvmeCQueue *cq = opaque;
> > >  NvmeCtrl *n = cq->ctrl;
> > >  NvmeRequest *req, *next;
> > > +int ret;
> > >  
> > >  QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
> > >  NvmeSQueue *sq;
> > > @@ -452,15 +465,21 @@ static void nvme_post_cqes(void *opaque)
> > >  break;
> > >  }
> > >  
> > > -QTAILQ_REMOVE(&cq->req_list, req, entry);
> > >  sq = req->sq;
> > >  req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
> > >  req->cqe.sq_id = cpu_to_le16(sq->sqid);
> > >  req->cqe.sq_head = cpu_to_le16(sq->head);
> > >  addr = cq->dma_addr + cq->tail * n->cqe_size;
> > > -nvme_inc_cq_tail(cq);
> > > -pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
> > > +ret = pci_dma_write(&n->parent_obj, addr, (void *)&req->cqe,
> > >  sizeof(req->cqe));
> > > +if (ret) {
> > > +trace_nvme_dev_err_addr_write(addr);
> > > +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
> > > +   

Re: [PATCH v5 22/26] nvme: support multiple namespaces

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:55 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 14:34, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> > > This adds support for multiple namespaces by introducing a new 'nvme-ns'
> > > device model. The nvme device creates a bus named from the device name
> > > ('id'). The nvme-ns devices then connect to this and registers
> > > themselves with the nvme device.
> > > 
> > > This changes how an nvme device is created. Example with two namespaces:
> > > 
> > >   -drive file=nvme0n1.img,if=none,id=disk1
> > >   -drive file=nvme0n2.img,if=none,id=disk2
> > >   -device nvme,serial=deadbeef,id=nvme0
> > >   -device nvme-ns,drive=disk1,bus=nvme0,nsid=1
> > >   -device nvme-ns,drive=disk2,bus=nvme0,nsid=2
> > > 
> > > The drive property is kept on the nvme device to keep the change
> > > backward compatible, but the property is now optional. Specifying a
> > > drive for the nvme device will always create the namespace with nsid 1.
> > 
> > Very reasonable way to do it. 
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/Makefile.objs |   2 +-
> > >  hw/block/nvme-ns.c | 158 +++
> > >  hw/block/nvme-ns.h |  60 +++
> > >  hw/block/nvme.c| 235 +
> > >  hw/block/nvme.h|  47 -
> > >  hw/block/trace-events  |   6 +-
> > >  6 files changed, 389 insertions(+), 119 deletions(-)
> > >  create mode 100644 hw/block/nvme-ns.c
> > >  create mode 100644 hw/block/nvme-ns.h
> > > 
> > > diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
> > > index 28c2495a00dc..45f463462f1e 100644
> > > --- a/hw/block/Makefile.objs
> > > +++ b/hw/block/Makefile.objs
> > > @@ -7,7 +7,7 @@ common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
> > >  common-obj-$(CONFIG_XEN) += xen-block.o
> > >  common-obj-$(CONFIG_ECC) += ecc.o
> > >  common-obj-$(CONFIG_ONENAND) += onenand.o
> > > -common-obj-$(CONFIG_NVME_PCI) += nvme.o
> > > +common-obj-$(CONFIG_NVME_PCI) += nvme.o nvme-ns.o
> > >  common-obj-$(CONFIG_SWIM) += swim.o
> > >  
> > >  obj-$(CONFIG_SH4) += tc58128.o
> > > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > > new file mode 100644
> > > index ..0e5be44486f4
> > > --- /dev/null
> > > +++ b/hw/block/nvme-ns.c
> > > @@ -0,0 +1,158 @@
> > > +#include "qemu/osdep.h"
> > > +#include "qemu/units.h"
> > > +#include "qemu/cutils.h"
> > > +#include "qemu/log.h"
> > > +#include "hw/block/block.h"
> > > +#include "hw/pci/msix.h"
> > 
> > Do you need this include?
> 
> No, I needed hw/pci/pci.h instead :)
I think it compiled without that include,
but including pci.h for a PCI device is the right thing
anyway.

> 
> > > +#include "sysemu/sysemu.h"
> > > +#include "sysemu/block-backend.h"
> > > +#include "qapi/error.h"
> > > +
> > > +#include "hw/qdev-properties.h"
> > > +#include "hw/qdev-core.h"
> > > +
> > > +#include "nvme.h"
> > > +#include "nvme-ns.h"
> > > +
> > > +static int nvme_ns_init(NvmeNamespace *ns)
> > > +{
> > > +NvmeIdNs *id_ns = &ns->id_ns;
> > > +
> > > +id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
> > > +id_ns->nuse = id_ns->ncap = id_ns->nsze =
> > > +cpu_to_le64(nvme_ns_nlbas(ns));
> > 
> > Nitpick: To be honest I don't really like that chain assignment, 
> > especially since it forces to wrap the line, but that is just my
> > personal taste.
> 
> Fixed, and also added a comment as to why they are the same.
> 
> > > +
> > > +return 0;
> > > +}
> > > +
> > > +static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, NvmeIdCtrl 
> > > *id,
> > > +Error **errp)
> > > +{
> > > +uint64_t perm, shared_perm;
> > > +
> > > +Error *local_err = NULL;
> > > +int ret;
> > > +
> > > +perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
> > > +shared_perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
> > > +BLK_PERM_GRAPH_MOD;
> > > +
> > > +ret = blk_set_perm(ns->blk, perm, shared_perm, &local_err);
> > > +if (ret) {
> > > +error_propagate_prepend(errp, local_err, "blk_set_perm: ");
> > > +return ret;
> > > +}
> > 
> > You should consider using blkconf_apply_backend_options.
> > Take a look at for example virtio_blk_device_realize.
> > That will give you support for read only block devices as well.
> 
> So, yeah. There is a reason for this. And I will add that as a comment,
> but I will write it here for posterity.
> 
> The problem is when the nvme-ns device starts getting more than just a
> single drive attached (I have patches ready that will add a "metadata"
> and a "state" drive). The blkconf_ functions work on a BlockConf that
> embeds a BlockBackend, so you can't have one BlockConf with multiple
> BlockBackend's. That is why I'm kinda copying the "good parts" of
> the blkconf_apply_backend_options code here.
All right. But I guess that eventually this code will need a review
from someone that knows the block 

Re: [PATCH v5 14/26] nvme: make sure ncqr and nsqr is valid

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:48 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 12:30, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > 0xffff is not an allowed value for NCQR and NSQR in Set Features on
> > > Number of Queues.
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c | 4 
> > >  1 file changed, 4 insertions(+)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index 30c5b3e7a67d..900732bb2f38 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -1133,6 +1133,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, 
> > > NvmeCmd *cmd, NvmeRequest *req)
> > >  blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
> > >  break;
> > >  case NVME_NUMBER_OF_QUEUES:
> > > +if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 
> > > 0xffff) {
> > > +return NVME_INVALID_FIELD | NVME_DNR;
> > > +}
> > 
> > Very minor nitpick: since this spec requirement is not obvious, a 
> > quote/reference to the spec
> > would be nice to have here. 
> > 
> 
> Added.
Thanks!
> 
> > > +
> > >  trace_nvme_dev_setfeat_numq((dw11 & 0xffff) + 1,
> > >  ((dw11 >> 16) & 0xffff) + 1, n->params.num_queues - 1,
> > >  n->params.num_queues - 1);
> > 
> > Reviewed-by: Maxim Levitsky 
> > 
> > Best regards,
> > Maxim Levitsky
> > 
> 
> 

Best regards,
Maxim Levitsky







Re: [PATCH v5 10/26] nvme: add support for the get log page command

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 00:45 -0700, Klaus Birkelund Jensen wrote:
> On Feb 12 11:35, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > Add support for the Get Log Page command and basic implementations of
> > > the mandatory Error Information, SMART / Health Information and Firmware
> > > Slot Information log pages.
> > > 
> > > In violation of the specification, the SMART / Health Information log
> > > page does not persist information over the lifetime of the controller
> > > because the device has no place to store such persistent state.
> > 
> > Yea, not the end of the world.
> > > 
> > > Note that the LPA field in the Identify Controller data structure
> > > intentionally has bit 0 cleared because there is no namespace specific
> > > information in the SMART / Health information log page.
> > 
> > Makes sense.
> > > 
> > > Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
> > > Section 5.10 ("Get Log Page command").
> > > 
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme.c   | 122 +-
> > >  hw/block/nvme.h   |  10 
> > >  hw/block/trace-events |   2 +
> > >  include/block/nvme.h  |   2 +-
> > >  4 files changed, 134 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> > > index f72348344832..468c36918042 100644
> > > --- a/hw/block/nvme.c
> > > +++ b/hw/block/nvme.c
> > > @@ -569,6 +569,123 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd 
> > > *cmd)
> > >  return NVME_SUCCESS;
> > >  }
> > >  
> > > +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t 
> > > buf_len,
> > > +uint64_t off, NvmeRequest *req)
> > > +{
> > > +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> > > +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> > > +uint32_t nsid = le32_to_cpu(cmd->nsid);
> > > +
> > > +uint32_t trans_len;
> > > +time_t current_ms;
> > > +uint64_t units_read = 0, units_written = 0, read_commands = 0,
> > > +write_commands = 0;
> > > +NvmeSmartLog smart;
> > > +BlockAcctStats *s;
> > > +
> > > +if (nsid && nsid != 0xffffffff) {
> > > +return NVME_INVALID_FIELD | NVME_DNR;
> > > +}
> > > +
> > > +s = blk_get_stats(n->conf.blk);
> > > +
> > > +units_read = s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
> > > +units_written = s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
> > > +read_commands = s->nr_ops[BLOCK_ACCT_READ];
> > > +write_commands = s->nr_ops[BLOCK_ACCT_WRITE];
> > > +
> > > +if (off > sizeof(smart)) {
> > > +return NVME_INVALID_FIELD | NVME_DNR;
> > > +}
> > > +
> > > +trans_len = MIN(sizeof(smart) - off, buf_len);
> > > +
> > > +memset(&smart, 0x0, sizeof(smart));
> > > +
> > > +smart.data_units_read[0] = cpu_to_le64(units_read / 1000);
> > > +smart.data_units_written[0] = cpu_to_le64(units_written / 1000);
> > > +smart.host_read_commands[0] = cpu_to_le64(read_commands);
> > > +smart.host_write_commands[0] = cpu_to_le64(write_commands);
> > > +
> > > +smart.temperature[0] = n->temperature & 0xff;
> > > +smart.temperature[1] = (n->temperature >> 8) & 0xff;
> > > +
> > > +if ((n->temperature > n->features.temp_thresh_hi) ||
> > > +(n->temperature < n->features.temp_thresh_low)) {
> > > +smart.critical_warning |= NVME_SMART_TEMPERATURE;
> > > +}
> > > +
> > > +current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> > > +smart.power_on_hours[0] = cpu_to_le64(
> > > +(((current_ms - n->starttime_ms) / 1000) / 60) / 60);
> > > +
> > > +return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, 
> > > prp1,
> > > +prp2);
> > > +}
> > 
> > Looks OK.
> > > +
> > > +static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t 
> > > buf_len,
> > > +uint64_t off, NvmeRequest *req)
> > > +{
> > > +uint32_t trans_len;
> > > +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> > > +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> > > +NvmeFwSlotInfoLog fw_log;
> > > +
> > > +if (off > sizeof(fw_log)) {
> > > +return NVME_INVALID_FIELD | NVME_DNR;
> > > +}
> > > +
> > > +memset(&fw_log, 0, sizeof(NvmeFwSlotInfoLog));
> > > +
> > > +trans_len = MIN(sizeof(fw_log) - off, buf_len);
> > > +
> > > +return nvme_dma_read_prp(n, (uint8_t *) &fw_log + off, trans_len, 
> > > prp1,
> > > +prp2);
> > > +}
> > 
> > Looks OK
> > > +
> > > +static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> > > +{
> > > +uint32_t dw10 = le32_to_cpu(cmd->cdw10);
> > > +uint32_t dw11 = le32_to_cpu(cmd->cdw11);
> > > +uint32_t dw12 = le32_to_cpu(cmd->cdw12);
> > > +uint32_t dw13 = le32_to_cpu(cmd->cdw13);
> > > +uint8_t  lid = dw10 & 0xff;
> > > +uint8_t  rae = (dw10 >> 15) & 0x1;
> > > +uint32_t numdl, numdu;
> > > +uint64_t off, lpol, lpou;
> > > +size_t   len;
> > > +
> >

Re: [PATCH v6 00/42] nvme: support NVMe v1.3d, SGLs and multiple namespaces

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Hi,
> 
> So this patchset kinda blew up in size (wrt. number of patches) after
> Maxim's comments (26 -> 42), but Maxim's comments about splitting up a
> bunch of the patches made a lot of sense.
I don't think this is bad. 
You might actually have found the ultimate question of life, the universe and 
everything.
;-)

Best regards,
Maxim Levitsky

> 
> v6 primarily splits up the big nasty patches into more digestible parts.
> Specifically the 'nvme: refactor prp mapping' and 'nvme: allow multiple
> aios per command' patches has been split up according to Maxim's
> comments. Most additions to the shared include/block/nvme.h has also
> been consolidated into a single patch (also according to Maxim's
> comments). A lot of the patches still carries a 'Reviewed-By', but
> git-backport-diff reports some changes due to changes/additions in some
> of the early patches.
> 
> The only real "addition" is a new "max_ioqpairs" parameter for the
> device. This is to fix some confusion about the current "num_queues"
> parameter. See "nvme: add max_ioqpairs device parameter".
> 
> Maxim, I responded to your comments in the original thread and I believe
> that all your comments has been adressed.
> 
> Also, I *did* change the line indentation style - I hope I caught 'em
> all :)
> 
> 
> Klaus Jensen (42):
>   nvme: rename trace events to nvme_dev
>   nvme: remove superfluous breaks
>   nvme: move device parameters to separate struct
>   nvme: bump spec data structures to v1.3
>   nvme: use constant for identify data size
>   nvme: add identify cns values in header
>   nvme: refactor nvme_addr_read
>   nvme: add support for the abort command
>   nvme: add max_ioqpairs device parameter
>   nvme: refactor device realization
>   nvme: add temperature threshold feature
>   nvme: add support for the get log page command
>   nvme: add support for the asynchronous event request command
>   nvme: add missing mandatory features
>   nvme: additional tracing
>   nvme: make sure ncqr and nsqr is valid
>   nvme: add log specific field to trace events
>   nvme: support identify namespace descriptor list
>   nvme: enforce valid queue creation sequence
>   nvme: provide the mandatory subnqn field
>   nvme: bump supported version to v1.3
>   nvme: memset preallocated requests structures
>   nvme: add mapping helpers
>   nvme: remove redundant has_sg member
>   nvme: refactor dma read/write
>   nvme: pass request along for tracing
>   nvme: add request mapping helper
>   nvme: verify validity of prp lists in the cmb
>   nvme: refactor request bounds checking
>   nvme: add check for mdts
>   nvme: add check for prinfo
>   nvme: allow multiple aios per command
>   nvme: use preallocated qsg/iov in nvme_dma_prp
>   pci: pass along the return value of dma_memory_rw
>   nvme: handle dma errors
>   nvme: add support for scatter gather lists
>   nvme: refactor identify active namespace id list
>   nvme: support multiple namespaces
>   pci: allocate pci id for nvme
>   nvme: change controller pci id
>   nvme: remove redundant NvmeCmd pointer parameter
>   nvme: make lba data size configurable
> 
>  MAINTAINERS|1 +
>  block/nvme.c   |   18 +-
>  docs/specs/nvme.txt|   25 +
>  docs/specs/pci-ids.txt |1 +
>  hw/block/Makefile.objs |2 +-
>  hw/block/nvme-ns.c |  162 
>  hw/block/nvme-ns.h |   62 ++
>  hw/block/nvme.c| 2041 
>  hw/block/nvme.h|  205 +++-
>  hw/block/trace-events  |  206 ++--
>  hw/core/machine.c  |1 +
>  include/block/nvme.h   |  178 +++-
>  include/hw/pci/pci.h   |4 +-
>  13 files changed, 2347 insertions(+), 559 deletions(-)
>  create mode 100644 docs/specs/nvme.txt
>  create mode 100644 hw/block/nvme-ns.c
>  create mode 100644 hw/block/nvme-ns.h
> 









Re: [PATCH v6 04/42] nvme: bump spec data structures to v1.3

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Add missing fields in the Identify Controller and Identify Namespace
> data structures to bring them in line with NVMe v1.3.
> 
> This also adds data structures and defines for SGL support which
> requires a couple of trivial changes to the nvme block driver as well.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Fam Zheng 
> ---
>  block/nvme.c |  18 ++---
>  hw/block/nvme.c  |  12 ++--
>  include/block/nvme.h | 153 ++-
>  3 files changed, 151 insertions(+), 32 deletions(-)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index d41c4bda6e39..99b9bb3dac96 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -446,7 +446,7 @@ static void nvme_identify(BlockDriverState *bs, int 
> namespace, Error **errp)
>  error_setg(errp, "Cannot map buffer for DMA");
>  goto out;
>  }
> -cmd.prp1 = cpu_to_le64(iova);
> +cmd.dptr.prp1 = cpu_to_le64(iova);
>  
>  if (nvme_cmd_sync(bs, s->queues[0], &cmd)) {
>  error_setg(errp, "Failed to identify controller");
> @@ -545,7 +545,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
> **errp)
>  }
>  cmd = (NvmeCmd) {
>  .opcode = NVME_ADM_CMD_CREATE_CQ,
> -.prp1 = cpu_to_le64(q->cq.iova),
> +.dptr.prp1 = cpu_to_le64(q->cq.iova),
>  .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
>  .cdw11 = cpu_to_le32(0x3),
>  };
> @@ -556,7 +556,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
> **errp)
>  }
>  cmd = (NvmeCmd) {
>  .opcode = NVME_ADM_CMD_CREATE_SQ,
> -.prp1 = cpu_to_le64(q->sq.iova),
> +.dptr.prp1 = cpu_to_le64(q->sq.iova),
>  .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0xFFFF)),
>  .cdw11 = cpu_to_le32(0x1 | (n << 16)),
>  };
> @@ -906,16 +906,16 @@ try_map:
>  case 0:
>  abort();
>  case 1:
> -cmd->prp1 = pagelist[0];
> -cmd->prp2 = 0;
> +cmd->dptr.prp1 = pagelist[0];
> +cmd->dptr.prp2 = 0;
>  break;
>  case 2:
> -cmd->prp1 = pagelist[0];
> -cmd->prp2 = pagelist[1];
> +cmd->dptr.prp1 = pagelist[0];
> +cmd->dptr.prp2 = pagelist[1];
>  break;
>  default:
> -cmd->prp1 = pagelist[0];
> -cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
> +cmd->dptr.prp1 = pagelist[0];
> +cmd->dptr.prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
>  break;
>  }
>  trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index b532818b4b76..40cb176dea3c 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -372,8 +372,8 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
>  uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
>  uint64_t slba = le64_to_cpu(rw->slba);
> -uint64_t prp1 = le64_to_cpu(rw->prp1);
> -uint64_t prp2 = le64_to_cpu(rw->prp2);
> +uint64_t prp1 = le64_to_cpu(rw->dptr.prp1);
> +uint64_t prp2 = le64_to_cpu(rw->dptr.prp2);
>  
>  uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
>  uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
> @@ -763,8 +763,8 @@ static inline uint64_t nvme_get_timestamp(const NvmeCtrl 
> *n)
>  
>  static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, NvmeCmd *cmd)
>  {
> -uint64_t prp1 = le64_to_cpu(cmd->prp1);
> -uint64_t prp2 = le64_to_cpu(cmd->prp2);
> +uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
>  
>  uint64_t timestamp = nvme_get_timestamp(n);
>  
> @@ -802,8 +802,8 @@ static uint16_t nvme_set_feature_timestamp(NvmeCtrl *n, 
> NvmeCmd *cmd)
>  {
>  uint16_t ret;
>  uint64_t timestamp;
> -uint64_t prp1 = le64_to_cpu(cmd->prp1);
> -uint64_t prp2 = le64_to_cpu(cmd->prp2);
> +uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
>  
>  ret = nvme_dma_write_prp(n, (uint8_t *)&timestamp,
>  sizeof(timestamp), prp1, prp2);
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 8fb941c6537c..a083c1b3a613 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -205,15 +205,53 @@ enum NvmeCmbszMask {
>  #define NVME_CMBSZ_GETSIZE(cmbsz) \
>  (NVME_CMBSZ_SZ(cmbsz) * (1 << (12 + 4 * NVME_CMBSZ_SZU(cmbsz
>  
> +enum NvmeSglDescriptorType {
> +NVME_SGL_DESCR_TYPE_DATA_BLOCK  = 0x0,
> +NVME_SGL_DESCR_TYPE_BIT_BUCKET  = 0x1,
> +NVME_SGL_DESCR_TYPE_SEGMENT = 0x2,
> +NVME_SGL_DESCR_TYPE_LAST_SEGMENT= 0x3,
> +NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK= 0x4,
> +
> +NVME_SGL_DESCR_TYPE_VENDOR_SPECIFIC = 0xf,
> +};
OK

> +
> +enum NvmeSglDescriptorSubtype {
> +NVME_

Re: [PATCH v6 03/42] nvme: move device parameters to separate struct

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Move device configuration parameters to separate struct to make it
> explicit what is configurable and what is set internally.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> Reviewed-by: Maxim Levitsky 
> ---
>  hw/block/nvme.c | 44 ++--
>  hw/block/nvme.h | 16 +---
>  2 files changed, 35 insertions(+), 25 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 9740948b354a..b532818b4b76 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -64,12 +64,12 @@ static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void 
> *buf, int size)
>  
>  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
>  {
> -return sqid < n->num_queues && n->sq[sqid] != NULL ? 0 : -1;
> +return sqid < n->params.num_queues && n->sq[sqid] != NULL ? 0 : -1;
>  }
>  
>  static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
>  {
> -return cqid < n->num_queues && n->cq[cqid] != NULL ? 0 : -1;
> +return cqid < n->params.num_queues && n->cq[cqid] != NULL ? 0 : -1;
>  }
>  
>  static void nvme_inc_cq_tail(NvmeCQueue *cq)
> @@ -631,7 +631,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
>  trace_nvme_dev_err_invalid_create_cq_addr(prp1);
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
> -if (unlikely(vector > n->num_queues)) {
> +if (unlikely(vector > n->params.num_queues)) {
>  trace_nvme_dev_err_invalid_create_cq_vector(vector);
>  return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
>  }
> @@ -783,7 +783,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
>  break;
>  case NVME_NUMBER_OF_QUEUES:
> -result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 
> 16));
> +result = cpu_to_le32((n->params.num_queues - 2) |
> + ((n->params.num_queues - 2) << 16));
>  trace_nvme_dev_getfeat_numq(result);
>  break;
>  case NVME_TIMESTAMP:
> @@ -827,9 +828,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  case NVME_NUMBER_OF_QUEUES:
>  trace_nvme_dev_setfeat_numq((dw11 & 0xffff) + 1,
>  ((dw11 >> 16) & 0xffff) + 1,
> -n->num_queues - 1, n->num_queues - 1);
> -req->cqe.result =
> -cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
> +n->params.num_queues - 1,
> +n->params.num_queues - 1);
> +req->cqe.result = cpu_to_le32((n->params.num_queues - 2) |
> +  ((n->params.num_queues - 2) << 16));
>  break;
>  case NVME_TIMESTAMP:
>  return nvme_set_feature_timestamp(n, cmd);
> @@ -900,12 +902,12 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
>  
>  blk_drain(n->conf.blk);
>  
> -for (i = 0; i < n->num_queues; i++) {
> +for (i = 0; i < n->params.num_queues; i++) {
>  if (n->sq[i] != NULL) {
>  nvme_free_sq(n->sq[i], n);
>  }
>  }
> -for (i = 0; i < n->num_queues; i++) {
> +for (i = 0; i < n->params.num_queues; i++) {
>  if (n->cq[i] != NULL) {
>  nvme_free_cq(n->cq[i], n);
>  }
> @@ -1308,7 +1310,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  int64_t bs_size;
>  uint8_t *pci_conf;
>  
> -if (!n->num_queues) {
> +if (!n->params.num_queues) {
>  error_setg(errp, "num_queues can't be zero");
>  return;
>  }
> @@ -1324,7 +1326,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  return;
>  }
>  
> -if (!n->serial) {
> +if (!n->params.serial) {
>  error_setg(errp, "serial property not set");
>  return;
>  }
> @@ -1341,25 +1343,25 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  pcie_endpoint_cap_init(pci_dev, 0x80);
>  
>  n->num_namespaces = 1;
> -n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
> +n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
>  n->ns_size = bs_size / (uint64_t)n->num_namespaces;
>  
>  n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
> -n->sq = g_new0(NvmeSQueue *, n->num_queues);
> -n->cq = g_new0(NvmeCQueue *, n->num_queues);
> +n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
> +n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
>  
>  memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
>"nvme", n->reg_size);
>  pci_register_bar(pci_dev, 0,
>  PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
>  &n->iomem);
> -msix_init_exclusive_bar(pci_dev, n->num_que

Re: [PATCH v6 01/42] nvme: rename trace events to nvme_dev

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Change the prefix of all nvme device related trace events to 'nvme_dev'
> to not clash with trace events from the nvme block driver.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> Reviewed-by: Maxim Levitsky 
> ---
>  hw/block/nvme.c   | 188 +-
>  hw/block/trace-events | 172 +++---
>  2 files changed, 180 insertions(+), 180 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index d28335cbf377..3e4b18956ed2 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -112,16 +112,16 @@ static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
>  {
>  if (cq->irq_enabled) {
>  if (msix_enabled(&(n->parent_obj))) {
> -trace_nvme_irq_msix(cq->vector);
> +trace_nvme_dev_irq_msix(cq->vector);
>  msix_notify(&(n->parent_obj), cq->vector);
>  } else {
> -trace_nvme_irq_pin();
> +trace_nvme_dev_irq_pin();
>  assert(cq->cqid < 64);
>  n->irq_status |= 1 << cq->cqid;
>  nvme_irq_check(n);
>  }
>  } else {
> -trace_nvme_irq_masked();
> +trace_nvme_dev_irq_masked();
>  }
>  }
>  
> @@ -146,7 +146,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  int num_prps = (len >> n->page_bits) + 1;
>  
>  if (unlikely(!prp1)) {
> -trace_nvme_err_invalid_prp();
> +trace_nvme_dev_err_invalid_prp();
>  return NVME_INVALID_FIELD | NVME_DNR;
>  } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
> prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
> @@ -160,7 +160,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  len -= trans_len;
>  if (len) {
>  if (unlikely(!prp2)) {
> -trace_nvme_err_invalid_prp2_missing();
> +trace_nvme_dev_err_invalid_prp2_missing();
>  goto unmap;
>  }
>  if (len > n->page_size) {
> @@ -176,7 +176,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  
>  if (i == n->max_prp_ents - 1 && len > n->page_size) {
>  if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
> -trace_nvme_err_invalid_prplist_ent(prp_ent);
> +trace_nvme_dev_err_invalid_prplist_ent(prp_ent);
>  goto unmap;
>  }
>  
> @@ -189,7 +189,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  }
>  
>  if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
> -trace_nvme_err_invalid_prplist_ent(prp_ent);
> +trace_nvme_dev_err_invalid_prplist_ent(prp_ent);
>  goto unmap;
>  }
>  
> @@ -204,7 +204,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  }
>  } else {
>  if (unlikely(prp2 & (n->page_size - 1))) {
> -trace_nvme_err_invalid_prp2_align(prp2);
> +trace_nvme_dev_err_invalid_prp2_align(prp2);
>  goto unmap;
>  }
>  if (qsg->nsg) {
> @@ -252,20 +252,20 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t 
> *ptr, uint32_t len,
>  QEMUIOVector iov;
>  uint16_t status = NVME_SUCCESS;
>  
> -trace_nvme_dma_read(prp1, prp2);
> +trace_nvme_dev_dma_read(prp1, prp2);
>  
>  if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
>  if (qsg.nsg > 0) {
>  if (unlikely(dma_buf_read(ptr, len, &qsg))) {
> -trace_nvme_err_invalid_dma();
> +trace_nvme_dev_err_invalid_dma();
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  }
>  qemu_sglist_destroy(&qsg);
>  } else {
>  if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) {
> -trace_nvme_err_invalid_dma();
> +trace_nvme_dev_err_invalid_dma();
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  }
>  qemu_iovec_destroy(&iov);
> @@ -354,7 +354,7 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, 
> NvmeNamespace *ns, NvmeCmd *cmd,
>  uint32_t count = nlb << data_shift;
>  
>  if (unlikely(slba + nlb > ns->id_ns.nsze)) {
> -trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
> +trace_nvme_dev_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
>  return NVME_LBA_RANGE | NVME_DNR;
>  }
>  
> @@ -382,11 +382,11 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
>  enum BlockAcctType acct = is_write ? BLOCK_A

Re: [PATCH v6 05/42] nvme: use constant for identify data size

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 40cb176dea3c..f716f690a594 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -679,7 +679,7 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
> NvmeIdentify *c)
>  
>  static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
>  {
> -static const int data_len = 4 * KiB;
> +static const int data_len = NVME_IDENTIFY_DATA_SIZE;
>  uint32_t min_nsid = le32_to_cpu(c->nsid);
>  uint64_t prp1 = le64_to_cpu(c->prp1);
>  uint64_t prp2 = le64_to_cpu(c->prp2);

I'll probably squash this with some other refactoring patch,
but I absolutely don't mind leaving this as is.
Fine grained patches never cause any harm.

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v6 07/42] nvme: refactor nvme_addr_read

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Pull the controller memory buffer check to its own function. The check
> will be used on its own in later patches.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c | 16 
>  1 file changed, 12 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index b38d7e548a60..08a83d449de3 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -52,14 +52,22 @@
>  
>  static void nvme_process_sq(void *opaque);
>  
> +static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> +{
> +hwaddr low = n->ctrl_mem.addr;
> +hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
> +
> +return addr >= low && addr < hi;
> +}
> +
>  static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -if (n->cmbsz && addr >= n->ctrl_mem.addr &&
> -addr < (n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size))) {
> +if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
>  memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
> -} else {
> -pci_dma_read(&n->parent_obj, addr, buf, size);
> +return;
>  }
> +
> +pci_dma_read(&n->parent_obj, addr, buf, size);
>  }
>  
>  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)

Note that this patch still contains a bug that it removes the check against the 
accessed
size, which you fix in later patch.
I prefer to not add a bug in first place
However if you have a reason for this, I won't mind.

Best regards,
Maxim Levitsky








Re: [PATCH v6 09/42] nvme: add max_ioqpairs device parameter

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> The num_queues device paramater has a slightly confusing meaning because
> it accounts for the admin queue pair which is not really optional.
> Secondly, it is really a maximum value of queues allowed.
> 
> Add a new max_ioqpairs parameter that only accounts for I/O queue pairs,
> but keep num_queues for compatibility.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 45 ++---
>  hw/block/nvme.h |  4 +++-
>  2 files changed, 29 insertions(+), 20 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 7cf7cf55143e..7dfd8a1a392d 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -19,7 +19,7 @@
>   *  -drive file=,if=none,id=
>   *  -device nvme,drive=,serial=,id=, \
>   *  cmb_size_mb=, \
> - *  num_queues=
> + *  max_ioqpairs=
>   *
>   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> @@ -27,6 +27,7 @@
>  
>  #include "qemu/osdep.h"
>  #include "qemu/units.h"
> +#include "qemu/error-report.h"
>  #include "hw/block/block.h"
>  #include "hw/pci/msix.h"
>  #include "hw/pci/pci.h"
> @@ -72,12 +73,12 @@ static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void 
> *buf, int size)
>  
>  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
>  {
> -return sqid < n->params.num_queues && n->sq[sqid] != NULL ? 0 : -1;
> +return sqid < n->params.max_ioqpairs + 1 && n->sq[sqid] != NULL ? 0 : -1;
>  }
>  
>  static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
>  {
> -return cqid < n->params.num_queues && n->cq[cqid] != NULL ? 0 : -1;
> +return cqid < n->params.max_ioqpairs + 1 && n->cq[cqid] != NULL ? 0 : -1;
>  }
>  
>  static void nvme_inc_cq_tail(NvmeCQueue *cq)
> @@ -639,7 +640,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
>  trace_nvme_dev_err_invalid_create_cq_addr(prp1);
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
> -if (unlikely(vector > n->params.num_queues)) {
> +if (unlikely(vector > n->params.max_ioqpairs + 1)) {
>  trace_nvme_dev_err_invalid_create_cq_vector(vector);
>  return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
>  }
> @@ -803,8 +804,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
>  break;
>  case NVME_NUMBER_OF_QUEUES:
> -result = cpu_to_le32((n->params.num_queues - 2) |
> - ((n->params.num_queues - 2) << 16));
> +result = cpu_to_le32((n->params.max_ioqpairs - 1) |
> + ((n->params.max_ioqpairs - 1) << 16));
>  trace_nvme_dev_getfeat_numq(result);
>  break;
>  case NVME_TIMESTAMP:
> @@ -848,10 +849,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  case NVME_NUMBER_OF_QUEUES:
>  trace_nvme_dev_setfeat_numq((dw11 & 0xffff) + 1,
>  ((dw11 >> 16) & 0xffff) + 1,
> -n->params.num_queues - 1,
> -n->params.num_queues - 1);
> -req->cqe.result = cpu_to_le32((n->params.num_queues - 2) |
> -  ((n->params.num_queues - 2) << 16));
> +n->params.max_ioqpairs,
> +n->params.max_ioqpairs);
> +req->cqe.result = cpu_to_le32((n->params.max_ioqpairs - 1) |
> +  ((n->params.max_ioqpairs - 1) << 16));
>  break;
>  case NVME_TIMESTAMP:
>  return nvme_set_feature_timestamp(n, cmd);
> @@ -924,12 +925,12 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
>  
>  blk_drain(n->conf.blk);
>  
> -for (i = 0; i < n->params.num_queues; i++) {
> +for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
>  if (n->sq[i] != NULL) {
>  nvme_free_sq(n->sq[i], n);
>  }
>  }
> -for (i = 0; i < n->params.num_queues; i++) {
> +for (i = 0; i < n->params.max_ioqpairs + 1; i++) {
>  if (n->cq[i] != NULL) {
>  nvme_free_cq(n->cq[i], n);
>  }
> @@ -1332,9 +1333,15 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  int64_t bs_size;
>  uint8_t *pci_conf;
>  
> -if (!n->params.num_queues) {
> -error_setg(errp, "num_queues can't be zero");
> -return;
> +if (n->params.num_queues) {
> +warn_report("nvme: num_queues is deprecated; please use max_ioqpairs 
> "
> +"instead");
> +
> +n->params.max_ioqpairs = n->params.num_queues - 1;
> +}
> +
> +if (!n->params.max_ioqpairs) {
> +error_setg(errp, "max_ioqpairs can't be less than 1");
>  }
This is not even a nitpick, but just and

Re: [PATCH v6 08/42] nvme: add support for the abort command

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
> Section 5.1 ("Abort command").
> 
> The Abort command is a best effort command; for now, the device always
> fails to abort the given command.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c | 27 +++
>  1 file changed, 27 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 08a83d449de3..7cf7cf55143e 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -729,6 +729,18 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
>  }
>  }
>  
> +static uint16_t nvme_abort(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> > +uint16_t sqid = le32_to_cpu(cmd->cdw10) & 0xffff;
> +
> +req->cqe.result = 1;
> +if (nvme_check_sqid(n, sqid)) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +return NVME_SUCCESS;
> +}
> +
>  static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
>  {
>  trace_nvme_dev_setfeat_timestamp(ts);
> @@ -863,6 +875,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  return nvme_create_cq(n, cmd);
>  case NVME_ADM_CMD_IDENTIFY:
>  return nvme_identify(n, cmd);
> +case NVME_ADM_CMD_ABORT:
> +return nvme_abort(n, cmd, req);
>  case NVME_ADM_CMD_SET_FEATURES:
>  return nvme_set_feature(n, cmd, req);
>  case NVME_ADM_CMD_GET_FEATURES:
> @@ -1375,6 +1389,19 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  id->ieee[1] = 0x02;
>  id->ieee[2] = 0xb3;
>  id->oacs = cpu_to_le16(0);
> +
> +/*
> + * Because the controller always completes the Abort command immediately,
> + * there can never be more than one concurrently executing Abort command,
> + * so this value is never used for anything. Note that there can easily 
> be
> + * many Abort commands in the queues, but they are not considered
> + * "executing" until processed by nvme_abort.
> + *
> + * The specification recommends a value of 3 for Abort Command Limit 
> (four
> + * concurrently outstanding Abort commands), so lets use that though it 
> is
> + * inconsequential.
> + */
> +id->acl = 3;
>  id->frmw = 7 << 1;
>  id->lpa = 1 << 0;
>  id->sqes = (0x6 << 4) | 0x6;

You forgot to move my reviewed-by from the previous version
I see that you also fixed the white space problem, thanks!
So, 

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky







Re: [PATCH v6 06/42] nvme: add identify cns values in header

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index f716f690a594..b38d7e548a60 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -709,11 +709,11 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
>  NvmeIdentify *c = (NvmeIdentify *)cmd;
>  
>  switch (le32_to_cpu(c->cns)) {
> -case 0x00:
> +case NVME_ID_CNS_NS:
>  return nvme_identify_ns(n, c);
> -case 0x01:
> +case NVME_ID_CNS_CTRL:
>  return nvme_identify_ctrl(n, c);
> -case 0x02:
> +case NVME_ID_CNS_NS_ACTIVE_LIST:
>  return nvme_identify_nslist(n, c);
>  default:
>  trace_nvme_dev_err_invalid_identify_cns(le32_to_cpu(c->cns));

This is a very good candidate to be squished with the patch 5 IMHO,
but you can leave this as is as well. I don't mind.

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky








Re: [PATCH v6 12/42] nvme: add support for the get log page command

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Add support for the Get Log Page command and basic implementations of
> the mandatory Error Information, SMART / Health Information and Firmware
> Slot Information log pages.
> 
> In violation of the specification, the SMART / Health Information log
> page does not persist information over the lifetime of the controller
> because the device has no place to store such persistent state.
> 
> Note that the LPA field in the Identify Controller data structure
> intentionally has bit 0 cleared because there is no namespace specific
> information in the SMART / Health information log page.
> 
> Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
> Section 5.10 ("Get Log Page command").
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c   | 138 +-
>  hw/block/nvme.h   |  10 +++
>  hw/block/trace-events |   2 +
>  3 files changed, 149 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 64c42101df5c..83ff3fbfb463 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -569,6 +569,138 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd 
> *cmd)
>  return NVME_SUCCESS;
>  }
>  
> +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> +uint64_t off, NvmeRequest *req)
> +{
> +uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> +uint32_t nsid = le32_to_cpu(cmd->nsid);
> +
> +uint32_t trans_len;
> +time_t current_ms;
> +uint64_t units_read = 0, units_written = 0;
> +uint64_t read_commands = 0, write_commands = 0;
> +NvmeSmartLog smart;
> +BlockAcctStats *s;
> +
> +if (nsid && nsid != 0xffffffff) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +s = blk_get_stats(n->conf.blk);
> +
> +units_read = s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
> +units_written = s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
> +read_commands = s->nr_ops[BLOCK_ACCT_READ];
> +write_commands = s->nr_ops[BLOCK_ACCT_WRITE];
> +
> +if (off > sizeof(smart)) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +trans_len = MIN(sizeof(smart) - off, buf_len);
> +
> +memset(&smart, 0x0, sizeof(smart));
> +
> +smart.data_units_read[0] = cpu_to_le64(units_read / 1000);
> +smart.data_units_written[0] = cpu_to_le64(units_written / 1000);
> +smart.host_read_commands[0] = cpu_to_le64(read_commands);
> +smart.host_write_commands[0] = cpu_to_le64(write_commands);
> +
> +smart.temperature[0] = n->temperature & 0xff;
> +smart.temperature[1] = (n->temperature >> 8) & 0xff;
> +
> +if ((n->temperature > n->features.temp_thresh_hi) ||
> +(n->temperature < n->features.temp_thresh_low)) {
> +smart.critical_warning |= NVME_SMART_TEMPERATURE;
> +}
> +
> +current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> +smart.power_on_hours[0] =
> +cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
OH, I didn't notice that you didn't have the endian conversion in V5, it is 
needed here
of course.

> +
> +return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
> + prp2);
> +}
> +
> +static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> + uint64_t off, NvmeRequest *req)
> +{
> +uint32_t trans_len;
> +uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> +NvmeFwSlotInfoLog fw_log;
> +
> +if (off > sizeof(fw_log)) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +memset(&fw_log, 0, sizeof(NvmeFwSlotInfoLog));
> +
> +trans_len = MIN(sizeof(fw_log) - off, buf_len);
> +
> +return nvme_dma_read_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1,
> + prp2);
> +}
> +
> +static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> +uint64_t off, NvmeRequest *req)
> +{
> +uint32_t trans_len;
> +uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> +uint8_t errlog[64];
I'll would replace this with sizeof(NvmeErrorLogEntry)
(and add NvmeErrorLogEntry to the nvme.h), just for the sake of consistency,
and in case we end up reporting some errors to the log in the future.


> +
> +if (off > sizeof(errlog)) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +memset(errlog, 0x0, sizeof(errlog));
> +
> +trans_len = MIN(sizeof(errlog) - off, buf_len);
> +
> +return nvme_dma_read_prp(n, errlog, trans_len, prp1, prp2);
> +}
Besides this, looks good now.

> +
> +static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> 

Re: [PATCH v6 14/42] nvme: add missing mandatory features

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Add support for returning a reasonable response to Get/Set Features of
> mandatory features.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c   | 60 ++-
>  hw/block/trace-events |  2 ++
>  include/block/nvme.h  |  6 -
>  3 files changed, 66 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index ff8975cd6667..eb9c722df968 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1025,7 +1025,15 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  uint32_t dw11 = le32_to_cpu(cmd->cdw11);
>  uint32_t result;
>  
> +trace_nvme_dev_getfeat(nvme_cid(req), dw10);
> +
>  switch (dw10) {
> +case NVME_ARBITRATION:
> +result = cpu_to_le32(n->features.arbitration);
> +break;
> +case NVME_POWER_MANAGEMENT:
> +result = cpu_to_le32(n->features.power_mgmt);
> +break;
>  case NVME_TEMPERATURE_THRESHOLD:
>  result = 0;
>  
> @@ -1046,9 +1054,12 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  break;
>  }
>  
> +break;
> +case NVME_ERROR_RECOVERY:
> +result = cpu_to_le32(n->features.err_rec);
>  break;
>  case NVME_VOLATILE_WRITE_CACHE:
> -result = blk_enable_write_cache(n->conf.blk);
> +result = cpu_to_le32(blk_enable_write_cache(n->conf.blk));
>  trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
>  break;
>  case NVME_NUMBER_OF_QUEUES:
> @@ -1058,6 +1069,19 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  break;
>  case NVME_TIMESTAMP:
>  return nvme_get_feature_timestamp(n, cmd);
> +case NVME_INTERRUPT_COALESCING:
> +result = cpu_to_le32(n->features.int_coalescing);
> +break;
> +case NVME_INTERRUPT_VECTOR_CONF:
> +if ((dw11 & 0xffff) > n->params.max_ioqpairs + 1) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
I still think that this should be >= since the interrupt vector is not zero 
based.
So if we have for example 3 IO queues, then we have 4 queues in total
which translates to irq numbers 0..3.

BTW the user of the device doesn't have to have 1:1 mapping between qid and msi 
interrupt index,
in fact when MSI is not used, all the queues will map to the same vector, which 
will be interrupt 0
from point of view of the device IMHO.
So it kind of makes sense IMHO to have num_irqs or something, even if it 
technically equals to number of queues.


> +
> +result = cpu_to_le32(n->features.int_vector_config[dw11 & 0xffff]);
> +break;
> +case NVME_WRITE_ATOMICITY:
> +result = cpu_to_le32(n->features.write_atomicity);
> +break;
>  case NVME_ASYNCHRONOUS_EVENT_CONF:
>  result = cpu_to_le32(n->features.async_config);
>  break;
> @@ -1093,6 +1117,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  uint32_t dw10 = le32_to_cpu(cmd->cdw10);
>  uint32_t dw11 = le32_to_cpu(cmd->cdw11);
>  
> +trace_nvme_dev_setfeat(nvme_cid(req), dw10, dw11);
> +
>  switch (dw10) {
>  case NVME_TEMPERATURE_THRESHOLD:
>  if (NVME_TEMP_TMPSEL(dw11)) {
> @@ -1120,6 +1146,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  
>  break;
>  case NVME_VOLATILE_WRITE_CACHE:
> +if (blk_enable_write_cache(n->conf.blk)) {
> +blk_flush(n->conf.blk);
> +}

(not your fault) but the blk_enable_write_cache function name is highly 
misleading,
since it doesn't enable anything but just gets the flag if the write cache is 
enabled.
It really should be called blk_get_enable_write_cache.

> +
>  blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
>  break;
>  case NVME_NUMBER_OF_QUEUES:
> @@ -1135,6 +1165,13 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  case NVME_ASYNCHRONOUS_EVENT_CONF:
>  n->features.async_config = dw11;
>  break;
> +case NVME_ARBITRATION:
> +case NVME_POWER_MANAGEMENT:
> +case NVME_ERROR_RECOVERY:
> +case NVME_INTERRUPT_COALESCING:
> +case NVME_INTERRUPT_VECTOR_CONF:
> +case NVME_WRITE_ATOMICITY:
> +return NVME_FEAT_NOT_CHANGABLE | NVME_DNR;
>  default:
>  trace_nvme_dev_err_invalid_setfeat(dw10);
>  return NVME_INVALID_FIELD | NVME_DNR;
> @@ -1716,6 +1753,25 @@ static void nvme_init_state(NvmeCtrl *n)
>  n->temperature = NVME_TEMPERATURE;
>  n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
>  n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> +
> +/*
> + * There is no limit on the number of commands that the controller may
> + * launch at one time 

Re: [PATCH v6 17/42] nvme: add log specific field to trace events

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> The LSP field is not used directly now, but include it in the trace.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 3 ++-
>  hw/block/trace-events | 2 +-
>  2 files changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index e56142c4ea99..16de3ca1c5d5 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -760,6 +760,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  uint32_t dw12 = le32_to_cpu(cmd->cdw12);
>  uint32_t dw13 = le32_to_cpu(cmd->cdw13);
>  uint8_t  lid = dw10 & 0xff;
> +uint8_t  lsp = (dw10 >> 8) & 0xf;
>  uint8_t  rae = (dw10 >> 15) & 0x1;
>  uint32_t numdl, numdu;
>  uint64_t off, lpol, lpou;
> @@ -777,7 +778,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
>  
> -trace_nvme_dev_get_log(nvme_cid(req), lid, rae, len, off);
> +trace_nvme_dev_get_log(nvme_cid(req), lid, lsp, rae, len, off);
>  
>  switch (lid) {
>  case NVME_LOG_ERROR_INFO:
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index dde1d22bc39a..13e2c71664f6 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -48,7 +48,7 @@ nvme_dev_getfeat_numq(int result) "get feature number of 
> queues, result=%d"
>  nvme_dev_setfeat_numq(int reqcq, int reqsq, int gotcq, int gotsq) "requested 
> cq_count=%d sq_count=%d, responding with cq_count=%d sq_count=%d"
>  nvme_dev_setfeat_timestamp(uint64_t ts) "set feature timestamp = 0x%"PRIx64""
>  nvme_dev_getfeat_timestamp(uint64_t ts) "get feature timestamp = 0x%"PRIx64""
> -nvme_dev_get_log(uint16_t cid, uint8_t lid, uint8_t rae, uint32_t len, 
> uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" rae 0x%"PRIx8" len %"PRIu32" off 
> %"PRIu64""
> +nvme_dev_get_log(uint16_t cid, uint8_t lid, uint8_t lsp, uint8_t rae, 
> uint32_t len, uint64_t off) "cid %"PRIu16" lid 0x%"PRIx8" lsp 0x%"PRIx8" rae 
> 0x%"PRIx8" len %"PRIu32" off %"PRIu64""
>  nvme_dev_process_aers(int queued) "queued %d"
>  nvme_dev_aer(uint16_t cid) "cid %"PRIu16""
>  nvme_dev_aer_aerl_exceeded(void) "aerl exceeded"
Perfect!
Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v6 13/42] nvme: add support for the asynchronous event request command

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
> Section 5.2 ("Asynchronous Event Request command").
> 
> Mostly imported from Keith's qemu-nvme tree. Modified with a max number
> of queued events (controllable with the aer_max_queued device
> parameter). The spec states that the controller *should* retain
> events, so we do best effort here.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> Reviewed-by: Maxim Levitsky 
> ---
>  hw/block/nvme.c   | 178 --
>  hw/block/nvme.h   |  14 +++-
>  hw/block/trace-events |   9 +++
>  include/block/nvme.h  |   8 +-
>  4 files changed, 199 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 83ff3fbfb463..ff8975cd6667 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -325,6 +325,85 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, 
> NvmeRequest *req)
>  timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>  }
>  
> +static void nvme_process_aers(void *opaque)
> +{
> +NvmeCtrl *n = opaque;
> +NvmeAsyncEvent *event, *next;
> +
> +trace_nvme_dev_process_aers(n->aer_queued);
> +
> +QTAILQ_FOREACH_SAFE(event, &n->aer_queue, entry, next) {
> +NvmeRequest *req;
> +NvmeAerResult *result;
> +
> +/* can't post cqe if there is nothing to complete */
> +if (!n->outstanding_aers) {
> +trace_nvme_dev_no_outstanding_aers();
> +break;
> +}
> +
> +/* ignore if masked (cqe posted, but event not cleared) */
> +if (n->aer_mask & (1 << event->result.event_type)) {
> +trace_nvme_dev_aer_masked(event->result.event_type, n->aer_mask);
> +continue;
> +}
> +
> +QTAILQ_REMOVE(&n->aer_queue, event, entry);
> +n->aer_queued--;
> +
> +n->aer_mask |= 1 << event->result.event_type;
> +n->outstanding_aers--;
> +
> +req = n->aer_reqs[n->outstanding_aers];
> +
> +result = (NvmeAerResult *) &req->cqe.result;
> +result->event_type = event->result.event_type;
> +result->event_info = event->result.event_info;
> +result->log_page = event->result.log_page;
> +g_free(event);
> +
> +req->status = NVME_SUCCESS;
> +
> +trace_nvme_dev_aer_post_cqe(result->event_type, result->event_info,
> +result->log_page);
> +
> +nvme_enqueue_req_completion(&n->admin_cq, req);
> +}
> +}
> +
> +static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
> +   uint8_t event_info, uint8_t log_page)
> +{
> +NvmeAsyncEvent *event;
> +
> +trace_nvme_dev_enqueue_event(event_type, event_info, log_page);
> +
> +if (n->aer_queued == n->params.aer_max_queued) {
> +trace_nvme_dev_enqueue_event_noqueue(n->aer_queued);
> +return;
> +}
> +
> +event = g_new(NvmeAsyncEvent, 1);
> +event->result = (NvmeAerResult) {
> +.event_type = event_type,
> +.event_info = event_info,
> +.log_page   = log_page,
> +};
> +
> +QTAILQ_INSERT_TAIL(&n->aer_queue, event, entry);
> +n->aer_queued++;
> +
> +nvme_process_aers(n);
> +}
> +
> +static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
> +{
> +n->aer_mask &= ~(1 << event_type);
> +if (!QTAILQ_EMPTY(&n->aer_queue)) {
> +nvme_process_aers(n);
> +}
> +}
> +
>  static void nvme_rw_cb(void *opaque, int ret)
>  {
>  NvmeRequest *req = opaque;
> @@ -569,8 +648,9 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
>  return NVME_SUCCESS;
>  }
>  
> -static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> -uint64_t off, NvmeRequest *req)
> +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> +uint32_t buf_len, uint64_t off,
> +NvmeRequest *req)
>  {
>  uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
>  uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> @@ -619,6 +699,10 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd 
> *cmd, uint32_t buf_len,
>  smart.power_on_hours[0] =
>  cpu_to_le64((((current_ms - n->starttime_ms) / 1000) / 60) / 60);
>  
> +if (!rae) {
> +nvme_clear_events(n, NVME_AER_TYPE_SMART);
> +}
> +
>  return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
>   prp2);
>  }
> @@ -643,14 +727,19 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd 
> *cmd, uint32_t buf_len,
>   prp2);
>  }
>  
> -static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> -uint64_t off, NvmeRequest *req)
> +static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *

Re: [PATCH v6 10/42] nvme: refactor device realization

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> This patch splits up nvme_realize into multiple individual functions,
> each initializing a different subset of the device.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c | 178 ++--
>  hw/block/nvme.h |  23 ++-
>  2 files changed, 134 insertions(+), 67 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 7dfd8a1a392d..665485045066 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -44,6 +44,8 @@
>  #include "trace.h"
>  #include "nvme.h"
>  
> +#define NVME_CMB_BIR 2
> +
>  #define NVME_GUEST_ERR(trace, fmt, ...) \
>  do { \
>  (trace_##trace)(__VA_ARGS__); \
> @@ -63,7 +65,7 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr 
> addr)
>  
>  static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
> +if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
>  memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
>  return;
>  }
> @@ -157,7 +159,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  if (unlikely(!prp1)) {
>  trace_nvme_dev_err_invalid_prp();
>  return NVME_INVALID_FIELD | NVME_DNR;
> -} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
> +} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
> prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
>  qsg->nsg = 0;
>  qemu_iovec_init(iov, num_prps);
> @@ -1324,14 +1326,9 @@ static const MemoryRegionOps nvme_cmb_ops = {
>  },
>  };
>  
> -static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> +static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
>  {
> -NvmeCtrl *n = NVME(pci_dev);
> -NvmeIdCtrl *id = &n->id_ctrl;
> -
> -int i;
> -int64_t bs_size;
> -uint8_t *pci_conf;
> +NvmeParams *params = &n->params;
>  
>  if (n->params.num_queues) {
>  warn_report("nvme: num_queues is deprecated; please use max_ioqpairs 
> "
> @@ -1340,57 +1337,100 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  n->params.max_ioqpairs = n->params.num_queues - 1;
>  }
>  
> -if (!n->params.max_ioqpairs) {
> -error_setg(errp, "max_ioqpairs can't be less than 1");
> +if (params->max_ioqpairs < 1 ||
> +params->max_ioqpairs > PCI_MSIX_FLAGS_QSIZE) {
> +error_setg(errp, "nvme: max_ioqpairs must be ");
Looks like the error message is not complete now.
> +return -1;
>  }
>  
>  if (!n->conf.blk) {
> -error_setg(errp, "drive property not set");
> -return;
> +error_setg(errp, "nvme: block backend not configured");
> +return -1;
>  }
>  
> -bs_size = blk_getlength(n->conf.blk);
> -if (bs_size < 0) {
> -error_setg(errp, "could not get backing file size");
> -return;
> +if (!params->serial) {
> +error_setg(errp, "nvme: serial not configured");
> +return -1;
>  }
>  
> -if (!n->params.serial) {
> -error_setg(errp, "serial property not set");
> -return;
> -}
> +return 0;
> +}
> +
> +static int nvme_init_blk(NvmeCtrl *n, Error **errp)
> +{
>  blkconf_blocksizes(&n->conf);
>  if (!blkconf_apply_backend_options(&n->conf, 
> blk_is_read_only(n->conf.blk),
> false, errp)) {
> -return;
> +return -1;
>  }
>  
> -pci_conf = pci_dev->config;
> -pci_conf[PCI_INTERRUPT_PIN] = 1;
> -pci_config_set_prog_interface(pci_dev->config, 0x2);
> -pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
> -pcie_endpoint_cap_init(pci_dev, 0x80);
> +return 0;
> +}
>  
> +static void nvme_init_state(NvmeCtrl *n)
> +{
>  n->num_namespaces = 1;
>  n->reg_size = pow2ceil(0x1008 + 2 * (n->params.max_ioqpairs) * 4);
> -n->ns_size = bs_size / (uint64_t)n->num_namespaces;
> -
>  n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
>  n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
>  n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
> +}
>  
> -memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
> -  "nvme", n->reg_size);
> -pci_register_bar(pci_dev, 0,
> -PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
> -&n->iomem);
> +static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
> +{
> +NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);
> +NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
> +
> +NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
> +NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
> +NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
> +NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
> +NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
> +NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2);

Re: [PATCH v6 15/42] nvme: additional tracing

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Add additional trace calls for nvme_enqueue_req_completion, mmio and
> doorbell writes.
> 
> Also, streamline nvme_identify_ns and nvme_identify_ns_list. They do not
> need to repeat the command, it is already in the trace name.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c   | 10 ++
>  hw/block/trace-events |  9 +++--
>  2 files changed, 17 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index eb9c722df968..85c7c86b35f0 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -320,6 +320,8 @@ static void nvme_post_cqes(void *opaque)
>  static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
>  {
>  assert(cq->cqid == req->sq->cqid);
> +trace_nvme_dev_enqueue_req_completion(nvme_cid(req), cq->cqid,
> +  req->status);
>  QTAILQ_REMOVE(&req->sq->out_req_list, req, entry);
>  QTAILQ_INSERT_TAIL(&cq->req_list, req, entry);
>  timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> @@ -1527,6 +1529,8 @@ static uint64_t nvme_mmio_read(void *opaque, hwaddr 
> addr, unsigned size)
>  uint8_t *ptr = (uint8_t *)&n->bar;
>  uint64_t val = 0;
>  
> +trace_nvme_dev_mmio_read(addr);
> +
>  if (unlikely(addr & (sizeof(uint32_t) - 1))) {
>  NVME_GUEST_ERR(nvme_dev_ub_mmiord_misaligned32,
> "MMIO read not 32-bit aligned,"
> @@ -1601,6 +1605,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, 
> int val)
>  return;
>  }
>  
> +trace_nvme_dev_mmio_doorbell_cq(cq->cqid, new_head);
> +
>  start_sqs = nvme_cq_full(cq) ? 1 : 0;
>  cq->head = new_head;
>  if (start_sqs) {
> @@ -1653,6 +1659,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, 
> int val)
>  return;
>  }
>  
> +trace_nvme_dev_mmio_doorbell_sq(sq->sqid, new_tail);
> +
>  sq->tail = new_tail;
>  timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>  }
> @@ -1661,6 +1669,8 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, 
> int val)
>  static void nvme_mmio_write(void *opaque, hwaddr addr, uint64_t data,
>  unsigned size)
>  {
> +trace_nvme_dev_mmio_write(addr, data);
> +
>  NvmeCtrl *n = (NvmeCtrl *)opaque;
>  if (addr < sizeof(n->bar)) {
>  nvme_write_bar(n, addr, data, size);
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 4cf39961989d..dde1d22bc39a 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -39,8 +39,8 @@ nvme_dev_create_cq(uint64_t addr, uint16_t cqid, uint16_t 
> vector, uint16_t size,
>  nvme_dev_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
>  nvme_dev_del_cq(uint16_t cqid) "deleted completion queue, sqid=%"PRIu16""
>  nvme_dev_identify_ctrl(void) "identify controller"
> -nvme_dev_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
> -nvme_dev_identify_nslist(uint16_t ns) "identify namespace list, 
> nsid=%"PRIu16""
> +nvme_dev_identify_ns(uint32_t ns) "nsid %"PRIu32""
> +nvme_dev_identify_nslist(uint32_t ns) "nsid %"PRIu32""
>  nvme_dev_getfeat(uint16_t cid, uint32_t fid) "cid %"PRIu16" fid 0x%"PRIx32""
>  nvme_dev_setfeat(uint16_t cid, uint32_t fid, uint32_t val) "cid %"PRIu16" 
> fid 0x%"PRIx32" val 0x%"PRIx32""
>  nvme_dev_getfeat_vwcache(const char* result) "get feature volatile write 
> cache, result=%s"
> @@ -54,10 +54,13 @@ nvme_dev_aer(uint16_t cid) "cid %"PRIu16""
>  nvme_dev_aer_aerl_exceeded(void) "aerl exceeded"
>  nvme_dev_aer_masked(uint8_t type, uint8_t mask) "type 0x%"PRIx8" mask 
> 0x%"PRIx8""
>  nvme_dev_aer_post_cqe(uint8_t typ, uint8_t info, uint8_t log_page) "type 
> 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
> +nvme_dev_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t 
> status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16""
>  nvme_dev_enqueue_event(uint8_t typ, uint8_t info, uint8_t log_page) "type 
> 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
>  nvme_dev_enqueue_event_noqueue(int queued) "queued %d"
>  nvme_dev_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""
>  nvme_dev_no_outstanding_aers(void) "ignoring event; no outstanding AERs"
> +nvme_dev_mmio_read(uint64_t addr) "addr 0x%"PRIx64""
> +nvme_dev_mmio_write(uint64_t addr, uint64_t data) "addr 0x%"PRIx64" data 
> 0x%"PRIx64""
>  nvme_dev_mmio_intm_set(uint64_t data, uint64_t new_mask) "wrote MMIO, 
> interrupt mask set, data=0x%"PRIx64", new_mask=0x%"PRIx64""
>  nvme_dev_mmio_intm_clr(uint64_t data, uint64_t new_mask) "wrote MMIO, 
> interrupt mask clr, data=0x%"PRIx64", new_mask=0x%"PRIx64""
>  nvme_dev_mmio_cfg(uint64_t data) "wrote MMIO, config controller 
> config=0x%"PRIx64""
> @@ -70,6 +73,8 @@ nvme_dev_mmio_start_success(void) "setting controller 
> enable bit succeeded"
>  nvme_dev_mmio_stopped(void) "cleared cont

Re: [PATCH v6 11/42] nvme: add temperature threshold feature

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:28 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> It might seem weird to implement this feature for an emulated device,
> but it is mandatory to support and the feature is useful for testing
> asynchronous event request support, which will be added in a later
> patch.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c  | 48 
>  hw/block/nvme.h  |  2 ++
>  include/block/nvme.h |  8 +++-
>  3 files changed, 57 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 665485045066..64c42101df5c 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -45,6 +45,9 @@
>  #include "nvme.h"
>  
>  #define NVME_CMB_BIR 2
> +#define NVME_TEMPERATURE 0x143
> +#define NVME_TEMPERATURE_WARNING 0x157
> +#define NVME_TEMPERATURE_CRITICAL 0x175
>  
>  #define NVME_GUEST_ERR(trace, fmt, ...) \
>  do { \
> @@ -798,9 +801,31 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, 
> NvmeCmd *cmd)
>  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>  {
>  uint32_t dw10 = le32_to_cpu(cmd->cdw10);
> +uint32_t dw11 = le32_to_cpu(cmd->cdw11);
>  uint32_t result;
>  
>  switch (dw10) {
> +case NVME_TEMPERATURE_THRESHOLD:
> +result = 0;
> +
> +/*
> + * The controller only implements the Composite Temperature sensor, 
> so
> + * return 0 for all other sensors.
> + */
> +if (NVME_TEMP_TMPSEL(dw11)) {
> +break;
> +}
> +
> +switch (NVME_TEMP_THSEL(dw11)) {
> +case 0x0:
> +result = cpu_to_le16(n->features.temp_thresh_hi);
> +break;
> +case 0x1:
> +result = cpu_to_le16(n->features.temp_thresh_low);
> +break;
> +}
> +
> +break;
>  case NVME_VOLATILE_WRITE_CACHE:
>  result = blk_enable_write_cache(n->conf.blk);
>  trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
> @@ -845,6 +870,23 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  uint32_t dw11 = le32_to_cpu(cmd->cdw11);
>  
>  switch (dw10) {
> +case NVME_TEMPERATURE_THRESHOLD:
> +if (NVME_TEMP_TMPSEL(dw11)) {
> +break;
> +}
> +
> +switch (NVME_TEMP_THSEL(dw11)) {
> +case 0x0:
> +n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
> +break;
> +case 0x1:
> +n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
> +break;
> +default:
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +break;
>  case NVME_VOLATILE_WRITE_CACHE:
>  blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
>  break;
> @@ -1374,6 +1416,7 @@ static void nvme_init_state(NvmeCtrl *n)
>  n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
>  n->sq = g_new0(NvmeSQueue *, n->params.max_ioqpairs + 1);
>  n->cq = g_new0(NvmeCQueue *, n->params.max_ioqpairs + 1);
> +n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
>  }
>  
>  static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
> @@ -1451,6 +1494,11 @@ static void nvme_init_ctrl(NvmeCtrl *n)
>  id->acl = 3;
>  id->frmw = 7 << 1;
>  id->lpa = 1 << 0;
> +
> +/* recommended default value (~70 C) */
> +id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
> +id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
> +
>  id->sqes = (0x6 << 4) | 0x6;
>  id->cqes = (0x4 << 4) | 0x4;
>  id->nn = cpu_to_le32(n->num_namespaces);
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index b7c465560eea..8cda5f02c622 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -108,6 +108,7 @@ typedef struct NvmeCtrl {
>  uint64_tirq_status;
>  uint64_thost_timestamp; /* Timestamp sent by the 
> host */
>  uint64_ttimestamp_set_qemu_clock_ms;/* QEMU clock time */
> +uint16_ttemperature;
You forgot to move this too.

>  
>  NvmeNamespace   *namespaces;
>  NvmeSQueue  **sq;
> @@ -115,6 +116,7 @@ typedef struct NvmeCtrl {
>  NvmeSQueue  admin_sq;
>  NvmeCQueue  admin_cq;
>  NvmeIdCtrl  id_ctrl;
> +NvmeFeatureVal  features;
>  } NvmeCtrl;
>  
>  static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index a083c1b3a613..91fc4738a3e0 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -688,7 +688,13 @@ enum NvmeIdCtrlOncs {
>  typedef struct NvmeFeatureVal {
>  uint32_tarbitration;
>  uint32_tpower_mgmt;
> -uint32_ttemp_thresh;
> +union {
> +struct {
> +uint16_t temp_thresh_hi;
> +uint16_t temp_thresh_low;
> +};
> +uint32_t temp_thresh;
> +};
>  uint32_terr_rec;
>  uin

Re: [PATCH v6 18/42] nvme: support identify namespace descriptor list

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Since we are not providing the NGUID or EUI64 fields, we must support
> the Namespace UUID. We do not have any way of storing a persistent
> unique identifier, so conjure up a UUID that is just the namespace id.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 38 ++
>  hw/block/trace-events |  1 +
>  2 files changed, 39 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 16de3ca1c5d5..007f8817f101 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -942,6 +942,42 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
> NvmeIdentify *c)
>  return ret;
>  }
>  
> +static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeIdentify *c)
> +{
> +uint32_t nsid = le32_to_cpu(c->nsid);
> +uint64_t prp1 = le64_to_cpu(c->prp1);
> +uint64_t prp2 = le64_to_cpu(c->prp2);
> +
> +void *list;
> +uint16_t ret;
> +NvmeIdNsDescr *ns_descr;
> +
> +trace_nvme_dev_identify_ns_descr_list(nsid);
> +
> +if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
> +trace_nvme_dev_err_invalid_ns(nsid, n->num_namespaces);
> +return NVME_INVALID_NSID | NVME_DNR;
> +}
> +
> +list = g_malloc0(NVME_IDENTIFY_DATA_SIZE);
> +ns_descr = list;
> +
> +/*
> + * Because the NGUID and EUI64 fields are 0 in the Identify Namespace 
> data
> + * structure, a Namespace UUID (nidt = 0x3) must be reported in the
> + * Namespace Identification Descriptor. Add a very basic Namespace UUID
> + * here.
> + */
> +ns_descr->nidt = NVME_NIDT_UUID;
> +ns_descr->nidl = NVME_NIDT_UUID_LEN;
> +stl_be_p(ns_descr + sizeof(*ns_descr), nsid);
> +
> +ret = nvme_dma_read_prp(n, (uint8_t *) list, NVME_IDENTIFY_DATA_SIZE, 
> prp1,
> +prp2);
> +g_free(list);
> +return ret;
> +}
> +
>  static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
>  {
>  NvmeIdentify *c = (NvmeIdentify *)cmd;
> @@ -953,6 +989,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
>  return nvme_identify_ctrl(n, c);
>  case NVME_ID_CNS_NS_ACTIVE_LIST:
>  return nvme_identify_nslist(n, c);
> +case NVME_ID_CNS_NS_DESCR_LIST:
> +return nvme_identify_ns_descr_list(n, c);
>  default:
>  trace_nvme_dev_err_invalid_identify_cns(le32_to_cpu(c->cns));
>  return NVME_INVALID_FIELD | NVME_DNR;
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 13e2c71664f6..4cde0844ef64 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -41,6 +41,7 @@ nvme_dev_del_cq(uint16_t cqid) "deleted completion queue, 
> sqid=%"PRIu16""
>  nvme_dev_identify_ctrl(void) "identify controller"
>  nvme_dev_identify_ns(uint32_t ns) "nsid %"PRIu32""
>  nvme_dev_identify_nslist(uint32_t ns) "nsid %"PRIu32""
> +nvme_dev_identify_ns_descr_list(uint32_t ns) "nsid %"PRIu32""
>  nvme_dev_getfeat(uint16_t cid, uint32_t fid) "cid %"PRIu16" fid 0x%"PRIx32""
>  nvme_dev_setfeat(uint16_t cid, uint32_t fid, uint32_t val) "cid %"PRIu16" 
> fid 0x%"PRIx32" val 0x%"PRIx32""
>  nvme_dev_getfeat_vwcache(const char* result) "get feature volatile write 
> cache, result=%s"

I think that we should add the namespace UUID as a device parameter,
but it's OK to do this in a follow-up patch.


Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v6 25/42] nvme: refactor dma read/write

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Refactor the nvme_dma_{read,write}_prp functions into a common function
> taking a DMADirection parameter.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 89 -
>  1 file changed, 43 insertions(+), 46 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index e40c080c3b48..809d00443369 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -299,55 +299,50 @@ unmap:
>  return status;
>  }
>  
> -static uint16_t nvme_dma_write_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> -   uint64_t prp1, uint64_t prp2)
> +static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> + uint64_t prp1, uint64_t prp2, DMADirection dir)
>  {
>  QEMUSGList qsg;
>  QEMUIOVector iov;
>  uint16_t status = NVME_SUCCESS;
>  
> -if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
> -return NVME_INVALID_FIELD | NVME_DNR;
> +status = nvme_map_prp(&qsg, &iov, prp1, prp2, len, n);
> +if (status) {
> +return status;
>  }
> -if (qsg.nsg > 0) {
> -if (dma_buf_write(ptr, len, &qsg)) {
> -status = NVME_INVALID_FIELD | NVME_DNR;
> -}
> -qemu_sglist_destroy(&qsg);
> -} else {
> -if (qemu_iovec_to_buf(&iov, 0, ptr, len) != len) {
> -status = NVME_INVALID_FIELD | NVME_DNR;
> -}
> -qemu_iovec_destroy(&iov);
> -}
> -return status;
> -}
>  
> -static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> -uint64_t prp1, uint64_t prp2)
> -{
> -QEMUSGList qsg;
> -QEMUIOVector iov;
> -uint16_t status = NVME_SUCCESS;
> +if (qsg.nsg > 0) {
> +uint64_t residual;
>  
> -trace_nvme_dev_dma_read(prp1, prp2);
> +if (dir == DMA_DIRECTION_TO_DEVICE) {
> +residual = dma_buf_write(ptr, len, &qsg);
> +} else {
> +residual = dma_buf_read(ptr, len, &qsg);
> +}
>  
> -if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
> -return NVME_INVALID_FIELD | NVME_DNR;
> -}
> -if (qsg.nsg > 0) {
> -if (unlikely(dma_buf_read(ptr, len, &qsg))) {
> +if (unlikely(residual)) {
>  trace_nvme_dev_err_invalid_dma();
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  }
> +
>  qemu_sglist_destroy(&qsg);
>  } else {
> -if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) {
> +size_t bytes;
> +
> +if (dir == DMA_DIRECTION_TO_DEVICE) {
> +bytes = qemu_iovec_to_buf(&iov, 0, ptr, len);
> +} else {
> +bytes = qemu_iovec_from_buf(&iov, 0, ptr, len);
> +}
> +
> +if (unlikely(bytes != len)) {
>  trace_nvme_dev_err_invalid_dma();
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  }
> +
>  qemu_iovec_destroy(&iov);
>  }
> +
>  return status;
>  }
>  
> @@ -775,8 +770,8 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd 
> *cmd, uint8_t rae,
>  nvme_clear_events(n, NVME_AER_TYPE_SMART);
>  }
>  
> -return nvme_dma_read_prp(n, (uint8_t *) &smart + off, trans_len, prp1,
> - prp2);
> +return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2,
> +DMA_DIRECTION_FROM_DEVICE);
>  }
>  
>  static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> @@ -795,8 +790,8 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd 
> *cmd, uint32_t buf_len,
>  
>  trans_len = MIN(sizeof(fw_log) - off, buf_len);
>  
> -return nvme_dma_read_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1,
> - prp2);
> +return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2,
> +DMA_DIRECTION_FROM_DEVICE);
>  }
>  
>  static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> @@ -820,7 +815,8 @@ static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd 
> *cmd, uint8_t rae,
>  
>  trans_len = MIN(sizeof(errlog) - off, buf_len);
>  
> -return nvme_dma_read_prp(n, errlog, trans_len, prp1, prp2);
> +return nvme_dma_prp(n, errlog, trans_len, prp1, prp2,
> +DMA_DIRECTION_FROM_DEVICE);
>  }
>  
>  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> @@ -963,8 +959,8 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, 
> NvmeIdentify *c)
>  
>  trace_nvme_dev_identify_ctrl();
>  
> -return nvme_dma_read_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl),
> -prp1, prp2);
> +return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1,
> +prp2, DMA_DIRECTION_FROM_DEVICE);
>  }
>  
>  static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
> @@ -983,8 +979,8 @@ static uint16_t nvme_

Re: [PATCH v6 16/42] nvme: make sure ncqr and nsqr is valid

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> 0xffff is not an allowed value for NCQR and NSQR in Set Features on
> Number of Queues.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> Reviewed-by: Maxim Levitsky 
> ---
>  hw/block/nvme.c | 8 
>  1 file changed, 8 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 85c7c86b35f0..e56142c4ea99 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1155,6 +1155,14 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
>  break;
>  case NVME_NUMBER_OF_QUEUES:
> +/*
> + * NVMe v1.3, Section 5.21.1.7: 0xffff is not an allowed value for 
> NCQR
> + * and NSQR.
> + */
> +if ((dw11 & 0xffff) == 0xffff || ((dw11 >> 16) & 0xffff) == 0xffff) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
>  trace_nvme_dev_setfeat_numq((dw11 & 0xffff) + 1,
>  ((dw11 >> 16) & 0xffff) + 1,
>  n->params.max_ioqpairs,

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v6 22/42] nvme: memset preallocated requests structures

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> This is preparatory to subsequent patches that change how QSGs/IOVs are
> handled. It is important that the qsg and iov members of the NvmeRequest
> are initially zeroed.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 26c4b6e69f72..08267e847671 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -597,7 +597,7 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, 
> uint64_t dma_addr,
>  sq->size = size;
>  sq->cqid = cqid;
>  sq->head = sq->tail = 0;
> -sq->io_req = g_new(NvmeRequest, sq->size);
> +sq->io_req = g_new0(NvmeRequest, sq->size);
>  
>  QTAILQ_INIT(&sq->req_list);
>  QTAILQ_INIT(&sq->out_req_list);

Reviewed-by: Maxim Levitsky 
Best regards,
Maxim Levitsky






Re: [PATCH v6 19/42] nvme: enforce valid queue creation sequence

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Support returning Command Sequence Error if Set Features on Number of
> Queues is called after queues have been created.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 7 +++
>  hw/block/nvme.h | 1 +
>  2 files changed, 8 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 007f8817f101..b40d27cddc46 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -881,6 +881,8 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
>  cq = g_malloc0(sizeof(*cq));
>  nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
>  NVME_CQ_FLAGS_IEN(qflags));
> +
> +n->qs_created = true;
Very minor nitpick: it might be worth mentioning in a comment
why this is only needed in CQ creation, as you explained to me.


>  return NVME_SUCCESS; 
>  }
>  
> @@ -1194,6 +1196,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
>  break;
>  case NVME_NUMBER_OF_QUEUES:
> +if (n->qs_created) {
> +return NVME_CMD_SEQ_ERROR | NVME_DNR;
> +}
> +
>  /*
>   * NVMe v1.3, Section 5.21.1.7: 0xffff is not an allowed value for 
> NCQR
>   * and NSQR.
> @@ -1332,6 +1338,7 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
>  
>  n->aer_queued = 0;
>  n->outstanding_aers = 0;
> +n->qs_created = false;
>  
>  blk_flush(n->conf.blk);
>  n->bar.cc = 0;
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index b709a8bb8d40..b4d1738a3d0a 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -99,6 +99,7 @@ typedef struct NvmeCtrl {
>  BlockConfconf;
>  NvmeParams   params;
>  
> +boolqs_created;
>  uint32_tpage_size;
>  uint16_tpage_bits;
>  uint16_tmax_prp_ents;

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky







Re: [PATCH v6 20/42] nvme: provide the mandatory subnqn field

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index b40d27cddc46..74061d08fd2e 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1925,6 +1925,9 @@ static void nvme_init_ctrl(NvmeCtrl *n)
>  id->nn = cpu_to_le32(n->num_namespaces);
>  id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
>  
> +pstrcpy((char *) id->subnqn, sizeof(id->subnqn), 
> "nqn.2019-08.org.qemu:");
> +pstrcat((char *) id->subnqn, sizeof(id->subnqn), n->params.serial);
> +
>  id->psd[0].mp = cpu_to_le16(0x9c4);
>  id->psd[0].enlat = cpu_to_le32(0x10);
>  id->psd[0].exlat = cpu_to_le32(0x4);
Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v6 21/42] nvme: bump supported version to v1.3

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 74061d08fd2e..26c4b6e69f72 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -44,6 +44,7 @@
>  #include "trace.h"
>  #include "nvme.h"
>  
> +#define NVME_SPEC_VER 0x00010300
>  #define NVME_CMB_BIR 2
>  #define NVME_TEMPERATURE 0x143
>  #define NVME_TEMPERATURE_WARNING 0x157
> @@ -1898,6 +1899,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
>  id->ieee[0] = 0x00;
>  id->ieee[1] = 0x02;
>  id->ieee[2] = 0xb3;
> +id->ver = cpu_to_le32(NVME_SPEC_VER);
>  id->oacs = cpu_to_le16(0);
>  
>  /*
> @@ -1942,7 +1944,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
>  NVME_CAP_SET_CSS(n->bar.cap, 1);
>  NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
>  
> -n->bar.vs = 0x00010200;
> +n->bar.vs = NVME_SPEC_VER;
>  n->bar.intmc = n->bar.intms = 0;
>  }
>  
Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v6 23/42] nvme: add mapping helpers

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Add nvme_map_addr, nvme_map_addr_cmb and nvme_addr_to_cmb helpers and
> use them in nvme_map_prp.
> 
> This fixes a bug where in the case of a CMB transfer, the device would
> map to the buffer with a wrong length.
> 
> Fixes: b2b2b67a00574 ("nvme: Add support for Read Data and Write Data in 
> CMBs.")
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 97 +++
>  hw/block/trace-events |  1 +
>  2 files changed, 81 insertions(+), 17 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 08267e847671..187c816eb6ad 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -59,6 +59,11 @@
>  
>  static void nvme_process_sq(void *opaque);
>  
> +static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
> +{
> +return &n->cmbuf[addr - n->ctrl_mem.addr];
> +}
> +
>  static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>  {
>  hwaddr low = n->ctrl_mem.addr;
> @@ -70,7 +75,7 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr 
> addr)
>  static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
>  if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> -memcpy(buf, (void *)&n->cmbuf[addr - n->ctrl_mem.addr], size);
> +memcpy(buf, nvme_addr_to_cmb(n, addr), size);
>  return;
>  }
>  
> @@ -153,29 +158,79 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue 
> *cq)
>  }
>  }
>  
> +static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr 
> addr,
> +  size_t len)
> +{
> +if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len)) {
> +return NVME_DATA_TRAS_ERROR;
> +}

I just noticed that, in theory (not that it really matters),
addr + len refers to the byte which is already
not part of the transfer.


> +
> +qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
Also interesting is that we can add a 0-sized iovec.


> +
> +return NVME_SUCCESS;
> +}
> +
> +static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector 
> *iov,
> +  hwaddr addr, size_t len)
> +{
> +if (nvme_addr_is_cmb(n, addr)) {
> +if (qsg && qsg->sg) {
> +return NVME_INVALID_USE_OF_CMB | NVME_DNR;
> +}
> +
> +assert(iov);
> +
> +if (!iov->iov) {
> +qemu_iovec_init(iov, 1);
> +}
> +
> +return nvme_map_addr_cmb(n, iov, addr, len);
> +}
> +
> +if (iov && iov->iov) {
> +return NVME_INVALID_USE_OF_CMB | NVME_DNR;
> +}
> +
> +assert(qsg);
> +
> +if (!qsg->sg) {
> +pci_dma_sglist_init(qsg, &n->parent_obj, 1);
> +}
> +
> +qemu_sglist_add(qsg, addr, len);
> +
> +return NVME_SUCCESS;
> +}
Looks very good.

> +
>  static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t 
> prp1,
>   uint64_t prp2, uint32_t len, NvmeCtrl *n)
>  {
>  hwaddr trans_len = n->page_size - (prp1 % n->page_size);
>  trans_len = MIN(len, trans_len);
>  int num_prps = (len >> n->page_bits) + 1;
> +uint16_t status;
>  
>  if (unlikely(!prp1)) {
>  trace_nvme_dev_err_invalid_prp();
>  return NVME_INVALID_FIELD | NVME_DNR;
> -} else if (n->bar.cmbsz && prp1 >= n->ctrl_mem.addr &&
> -   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
> -qsg->nsg = 0;
> +}
> +
> +if (nvme_addr_is_cmb(n, prp1)) {
>  qemu_iovec_init(iov, num_prps);
> -qemu_iovec_add(iov, (void *)&n->cmbuf[prp1 - n->ctrl_mem.addr], 
> trans_len);
>  } else {
>  pci_dma_sglist_init(qsg, &n->parent_obj, num_prps);
> -qemu_sglist_add(qsg, prp1, trans_len);
>  }
> +
> +status = nvme_map_addr(n, qsg, iov, prp1, trans_len);
> +if (status) {
> +goto unmap;
> +}
> +
>  len -= trans_len;
>  if (len) {
>  if (unlikely(!prp2)) {
>  trace_nvme_dev_err_invalid_prp2_missing();
> +status = NVME_INVALID_FIELD | NVME_DNR;
>  goto unmap;
>  }
>  if (len > n->page_size) {
> @@ -192,6 +247,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  if (i == n->max_prp_ents - 1 && len > n->page_size) {
>  if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
>  trace_nvme_dev_err_invalid_prplist_ent(prp_ent);
> +status = NVME_INVALID_FIELD | NVME_DNR;
>  goto unmap;
>  }
>  
> @@ -205,14 +261,14 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  
>  if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
>  trace_nvme_dev_err_invalid_prplist_ent(prp_ent);
> +status = NVME_INVALI

Re: [PATCH v6 26/42] nvme: pass request along for tracing

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 67 +--
>  hw/block/trace-events |  2 +-
>  2 files changed, 40 insertions(+), 29 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 809d00443369..3e9c2ed434c2 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -202,14 +202,18 @@ static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  return NVME_SUCCESS;
>  }
>  
> -static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t 
> prp1,
> - uint64_t prp2, uint32_t len, NvmeCtrl *n)
> +static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
> + uint64_t prp1, uint64_t prp2, uint32_t len,
> + NvmeRequest *req)
>  {
>  hwaddr trans_len = n->page_size - (prp1 % n->page_size);
>  trans_len = MIN(len, trans_len);
>  int num_prps = (len >> n->page_bits) + 1;
>  uint16_t status;
>  
> +trace_nvme_dev_map_prp(nvme_cid(req), trans_len, len, prp1, prp2,
> +   num_prps);
> +
>  if (unlikely(!prp1)) {
>  trace_nvme_dev_err_invalid_prp();
>  return NVME_INVALID_FIELD | NVME_DNR;
> @@ -300,13 +304,14 @@ unmap:
>  }
>  
>  static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> - uint64_t prp1, uint64_t prp2, DMADirection dir)
> + uint64_t prp1, uint64_t prp2, DMADirection dir,
> + NvmeRequest *req)
>  {
>  QEMUSGList qsg;
>  QEMUIOVector iov;
>  uint16_t status = NVME_SUCCESS;
>  
> -status = nvme_map_prp(&qsg, &iov, prp1, prp2, len, n);
> +status = nvme_map_prp(n, &qsg, &iov, prp1, prp2, len, req);
>  if (status) {
>  return status;
>  }
> @@ -547,7 +552,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  return NVME_LBA_RANGE | NVME_DNR;
>  }
>  
> -if (nvme_map_prp(&req->qsg, &req->iov, prp1, prp2, data_size, n)) {
> +if (nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, data_size, req)) {
>  block_acct_invalid(blk_get_stats(n->conf.blk), acct);
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
> @@ -771,7 +776,7 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd 
> *cmd, uint8_t rae,
>  }
>  
>  return nvme_dma_prp(n, (uint8_t *) &smart + off, trans_len, prp1, prp2,
> -DMA_DIRECTION_FROM_DEVICE);
> +DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> @@ -791,7 +796,7 @@ static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd 
> *cmd, uint32_t buf_len,
>  trans_len = MIN(sizeof(fw_log) - off, buf_len);
>  
>  return nvme_dma_prp(n, (uint8_t *) &fw_log + off, trans_len, prp1, prp2,
> -DMA_DIRECTION_FROM_DEVICE);
> +DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> @@ -816,7 +821,7 @@ static uint16_t nvme_error_info(NvmeCtrl *n, NvmeCmd 
> *cmd, uint8_t rae,
>  trans_len = MIN(sizeof(errlog) - off, buf_len);
>  
>  return nvme_dma_prp(n, errlog, trans_len, prp1, prp2,
> -DMA_DIRECTION_FROM_DEVICE);
> +DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
>  static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> @@ -952,7 +957,8 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
>  return NVME_SUCCESS;
>  }
>  
> -static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c)
> +static uint16_t nvme_identify_ctrl(NvmeCtrl *n, NvmeIdentify *c,
> +   NvmeRequest *req)
>  {
>  uint64_t prp1 = le64_to_cpu(c->prp1);
>  uint64_t prp2 = le64_to_cpu(c->prp2);
> @@ -960,10 +966,11 @@ static uint16_t nvme_identify_ctrl(NvmeCtrl *n, 
> NvmeIdentify *c)
>  trace_nvme_dev_identify_ctrl();
>  
>  return nvme_dma_prp(n, (uint8_t *)&n->id_ctrl, sizeof(n->id_ctrl), prp1,
> -prp2, DMA_DIRECTION_FROM_DEVICE);
> +prp2, DMA_DIRECTION_FROM_DEVICE, req);
>  }
>  
> -static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c)
> +static uint16_t nvme_identify_ns(NvmeCtrl *n, NvmeIdentify *c,
> + NvmeRequest *req)
>  {
>  NvmeNamespace *ns;
>  uint32_t nsid = le32_to_cpu(c->nsid);
> @@ -980,10 +987,11 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
> NvmeIdentify *c)
>  ns = &n->namespaces[nsid - 1];
>  
>  return nvme_dma_prp(n, (uint8_t *)&ns->id_ns, sizeof(ns->id_ns), prp1,
> -prp2, DMA_DIRECTION_FROM_DEVICE);
> +prp2, DMA_DIRECTION_FROM_DEVICE, req);

Re: [PATCH v6 27/42] nvme: add request mapping helper

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Introduce the nvme_map helper to remove some noise in the main nvme_rw
> function.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 13 ++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 3e9c2ed434c2..850087aac967 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -351,6 +351,15 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, 
> uint32_t len,
>  return status;
>  }
>  
> +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, QEMUSGList *qsg,
> + QEMUIOVector *iov, size_t len, NvmeRequest *req)
> +{
> +uint64_t prp1 = le64_to_cpu(cmd->dptr.prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->dptr.prp2);
> +
> +return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
> +}
> +
>  static void nvme_post_cqes(void *opaque)
>  {
>  NvmeCQueue *cq = opaque;
> @@ -534,8 +543,6 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
>  uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
>  uint64_t slba = le64_to_cpu(rw->slba);
> -uint64_t prp1 = le64_to_cpu(rw->dptr.prp1);
> -uint64_t prp2 = le64_to_cpu(rw->dptr.prp2);
>  
>  uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
>  uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
> @@ -552,7 +559,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  return NVME_LBA_RANGE | NVME_DNR;
>  }
>  
> -if (nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, data_size, req)) {
> +if (nvme_map(n, cmd, &req->qsg, &req->iov, data_size, req)) {
>  block_acct_invalid(blk_get_stats(n->conf.blk), acct);
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v6 28/42] nvme: verify validity of prp lists in the cmb

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Before this patch the device already supported this, but it did not
> check for the validity of it nor announced the support in the LISTS
> field.
> 
> If some of the PRPs in a PRP list are in the CMB, then ALL entries must
> be there. This patch makes sure that is verified as well as properly
> announcing support for PRP lists in the CMB.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 14 +-
>  1 file changed, 13 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 850087aac967..eecfad694bf8 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -210,6 +210,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  trans_len = MIN(len, trans_len);
>  int num_prps = (len >> n->page_bits) + 1;
>  uint16_t status;
> +bool prp_list_in_cmb = false;
>  
>  trace_nvme_dev_map_prp(nvme_cid(req), trans_len, len, prp1, prp2,
> num_prps);
> @@ -237,11 +238,16 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  goto unmap;
>  }
> +
>  if (len > n->page_size) {
>  uint64_t prp_list[n->max_prp_ents];
>  uint32_t nents, prp_trans;
>  int i = 0;
>  
> +if (nvme_addr_is_cmb(n, prp2)) {
> +prp_list_in_cmb = true;
> +}
> +
>  nents = (len + n->page_size - 1) >> n->page_bits;
>  prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
>  nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
> @@ -255,6 +261,11 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  goto unmap;
>  }
>  
> +if (prp_list_in_cmb != nvme_addr_is_cmb(n, prp_ent)) {
> +status = NVME_INVALID_USE_OF_CMB | NVME_DNR;
> +goto unmap;
> +}
> +
>  i = 0;
>  nents = (len + n->page_size - 1) >> n->page_bits;
>  prp_trans = MIN(n->max_prp_ents, nents) * 
> sizeof(uint64_t);
> @@ -274,6 +285,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  if (status) {
>  goto unmap;
>  }
> +
>  len -= trans_len;
>  i++;
>  }
> @@ -1931,7 +1943,7 @@ static void nvme_init_cmb(NvmeCtrl *n, PCIDevice 
> *pci_dev)
>  
>  NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
>  NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
> -NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
> +NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 1);
>  NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
>  NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
>  NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2);

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v6 30/42] nvme: add check for mdts

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Add 'mdts' device parameter to control the Maximum Data Transfer Size of
> the controller and check that it is respected.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 29 -
>  hw/block/nvme.h   |  4 +++-
>  hw/block/trace-events |  1 +
>  3 files changed, 32 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index ba520c76bae5..7d5340c272c6 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -19,7 +19,8 @@
>   *  -drive file=,if=none,id=
>   *  -device nvme,drive=,serial=,id=, \
>   *  cmb_size_mb=, \
> - *  max_ioqpairs=
> + *  max_ioqpairs=, \
> + *  mdts=
>   *
>   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> @@ -491,6 +492,19 @@ static void nvme_clear_events(NvmeCtrl *n, uint8_t 
> event_type)
>  }
>  }
>  
> +static inline uint16_t nvme_check_mdts(NvmeCtrl *n, size_t len,
> +   NvmeRequest *req)
> +{
> +uint8_t mdts = n->params.mdts;
> +
> +if (mdts && len > n->page_size << mdts) {
> +trace_nvme_dev_err_mdts(nvme_cid(req), n->page_size << mdts, len);
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +return NVME_SUCCESS;
> +}
> +
>  static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
>   uint64_t slba, uint32_t nlb,
>   NvmeRequest *req)
> @@ -581,6 +595,12 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  
>  trace_nvme_dev_rw(is_write ? "write" : "read", nlb, data_size, slba);
>  
> +status = nvme_check_mdts(n, data_size, req);
> +if (status) {
> +block_acct_invalid(blk_get_stats(n->conf.blk), acct);
> +return status;
> +}
> +
>  status = nvme_check_bounds(n, ns, slba, nlb, req);
>  if (status) {
>  block_acct_invalid(blk_get_stats(n->conf.blk), acct);
> @@ -871,6 +891,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  uint32_t numdl, numdu;
>  uint64_t off, lpol, lpou;
>  size_t   len;
> +uint16_t status;
>  
>  numdl = (dw10 >> 16);
>  numdu = (dw11 & 0xffff);
> @@ -886,6 +907,11 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  
>  trace_nvme_dev_get_log(nvme_cid(req), lid, lsp, rae, len, off);
>  
> +status = nvme_check_mdts(n, len, req);
> +if (status) {
> +return status;
> +}
> +
>  switch (lid) {
>  case NVME_LOG_ERROR_INFO:
>  return nvme_error_info(n, cmd, rae, len, off, req);
> @@ -2011,6 +2037,7 @@ static void nvme_init_ctrl(NvmeCtrl *n)
>  id->ieee[0] = 0x00;
>  id->ieee[1] = 0x02;
>  id->ieee[2] = 0xb3;
> +id->mdts = params->mdts;
>  id->ver = cpu_to_le32(NVME_SPEC_VER);
>  id->oacs = cpu_to_le16(0);
>  
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index 442b17bf1701..b05c2153aebf 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -9,7 +9,8 @@
>  DEFINE_PROP_UINT32("num_queues", _state, _props.num_queues, 0), \
>  DEFINE_PROP_UINT32("max_ioqpairs", _state, _props.max_ioqpairs, 64), \
>  DEFINE_PROP_UINT8("aerl", _state, _props.aerl, 3), \
> -DEFINE_PROP_UINT32("aer_max_queued", _state, _props.aer_max_queued, 64)
> +DEFINE_PROP_UINT32("aer_max_queued", _state, _props.aer_max_queued, 64), 
> \
> +DEFINE_PROP_UINT8("mdts", _state, _props.mdts, 7)
>  
>  typedef struct NvmeParams {
>  char *serial;
> @@ -18,6 +19,7 @@ typedef struct NvmeParams {
>  uint32_t cmb_size_mb;
>  uint8_t  aerl;
>  uint32_t aer_max_queued;
> +uint8_t  mdts;
>  } NvmeParams;
>  
>  typedef struct NvmeAsyncEvent {
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index e31e652fa04e..2df6aa38df1b 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -79,6 +79,7 @@ nvme_dev_mmio_doorbell_cq(uint16_t cqid, uint16_t new_head) 
> "cqid %"PRIu16" new_
>  nvme_dev_mmio_doorbell_sq(uint16_t sqid, uint16_t new_tail) "cqid %"PRIu16" 
> new_tail %"PRIu16""
>  
>  # nvme traces for error conditions
> +nvme_dev_err_mdts(uint16_t cid, size_t mdts, size_t len) "cid %"PRIu16" mdts 
> %"PRIu64" len %"PRIu64""
>  nvme_dev_err_invalid_dma(void) "PRP/SGL is too small for transfer size"
>  nvme_dev_err_invalid_prplist_ent(uint64_t prplist) "PRP list entry is null 
> or not page aligned: 0x%"PRIx64""
>  nvme_dev_err_invalid_prp2_align(uint64_t prp2) "PRP2 is not page aligned: 
> 0x%"PRIx64""


Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v6 33/42] nvme: use preallocated qsg/iov in nvme_dma_prp

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Since clean up of the request qsg/iov has been moved to the common
> nvme_enqueue_req_completion function, there is no need to use a stack
> allocated qsg/iov in nvme_dma_prp.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> Reviewed-by: Maxim Levitsky 
> ---
>  hw/block/nvme.c | 18 ++
>  1 file changed, 6 insertions(+), 12 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 817384e3b1a9..15ca2417af04 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -321,45 +321,39 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, 
> uint32_t len,
>   uint64_t prp1, uint64_t prp2, DMADirection dir,
>   NvmeRequest *req)
>  {
> -QEMUSGList qsg;
> -QEMUIOVector iov;
>  uint16_t status = NVME_SUCCESS;
>  
> -status = nvme_map_prp(n, &qsg, &iov, prp1, prp2, len, req);
> +status = nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
>  if (status) {
>  return status;
>  }
>  
> -if (qsg.nsg > 0) {
> +if (req->qsg.nsg > 0) {
>  uint64_t residual;
>  
>  if (dir == DMA_DIRECTION_TO_DEVICE) {
> -residual = dma_buf_write(ptr, len, &qsg);
> +residual = dma_buf_write(ptr, len, &req->qsg);
>  } else {
> -residual = dma_buf_read(ptr, len, &qsg);
> +residual = dma_buf_read(ptr, len, &req->qsg);
>  }
>  
>  if (unlikely(residual)) {
>  trace_nvme_dev_err_invalid_dma();
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  }
> -
> -qemu_sglist_destroy(&qsg);
>  } else {
>  size_t bytes;
>  
>  if (dir == DMA_DIRECTION_TO_DEVICE) {
> -bytes = qemu_iovec_to_buf(&iov, 0, ptr, len);
> +bytes = qemu_iovec_to_buf(&req->iov, 0, ptr, len);
>  } else {
> -bytes = qemu_iovec_from_buf(&iov, 0, ptr, len);
> +bytes = qemu_iovec_from_buf(&req->iov, 0, ptr, len);
>  }
>  
>  if (unlikely(bytes != len)) {
>  trace_nvme_dev_err_invalid_dma();
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  }
> -
> -qemu_iovec_destroy(&iov);
>  }
>  
>  return status;
Only minor changes from the previous version,
so 
Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky








Re: [PATCH v6 24/42] nvme: remove redundant has_sg member

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Remove the has_sg member from NvmeRequest since it's redundant.

To be honest this patch also replaces the dma_acct_start with block_acct_start
which looks right to me, and IMHO its OK to have both in the same patch,
but that should be mentioned in the commit message

With this fixed,
Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky

> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 18 --
>  hw/block/nvme.h |  1 -
>  2 files changed, 12 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 187c816eb6ad..e40c080c3b48 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -484,16 +484,20 @@ static void nvme_rw_cb(void *opaque, int ret)
>  block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
>  req->status = NVME_INTERNAL_DEV_ERROR;
>  }
> -if (req->has_sg) {
> +
> +if (req->qsg.nalloc) {
>  qemu_sglist_destroy(&req->qsg);
>  }
> +if (req->iov.nalloc) {
> +qemu_iovec_destroy(&req->iov);
> +}
> +
>  nvme_enqueue_req_completion(cq, req);
>  }
>  
>  static uint16_t nvme_flush(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
>  NvmeRequest *req)
>  {
> -req->has_sg = false;
>  block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
>   BLOCK_ACCT_FLUSH);
>  req->aiocb = blk_aio_flush(n->conf.blk, nvme_rw_cb, req);
> @@ -517,7 +521,6 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, 
> NvmeNamespace *ns, NvmeCmd *cmd,
>  return NVME_LBA_RANGE | NVME_DNR;
>  }
>  
> -req->has_sg = false;
>  block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
>   BLOCK_ACCT_WRITE);
>  req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
> @@ -554,16 +557,19 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
>  
> -dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
>  if (req->qsg.nsg > 0) {
> -req->has_sg = true;
> +block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 
> req->qsg.size,
> + acct);
> +
>  req->aiocb = is_write ?
>  dma_blk_write(n->conf.blk, &req->qsg, data_offset, 
> BDRV_SECTOR_SIZE,
>nvme_rw_cb, req) :
>  dma_blk_read(n->conf.blk, &req->qsg, data_offset, 
> BDRV_SECTOR_SIZE,
>   nvme_rw_cb, req);
>  } else {
> -req->has_sg = false;
> +block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 
> req->iov.size,
> + acct);
> +
>  req->aiocb = is_write ?
>  blk_aio_pwritev(n->conf.blk, data_offset, &req->iov, 0, 
> nvme_rw_cb,
>  req) :
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index b4d1738a3d0a..442b17bf1701 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -29,7 +29,6 @@ typedef struct NvmeRequest {
>  struct NvmeSQueue   *sq;
>  BlockAIOCB  *aiocb;
>  uint16_tstatus;
> -boolhas_sg;
>  NvmeCqe cqe;
>  BlockAcctCookie acct;
>  QEMUSGList  qsg;







Re: [PATCH v6 31/42] nvme: add check for prinfo

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Check the validity of the PRINFO field.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 50 ---
>  hw/block/trace-events |  1 +
>  include/block/nvme.h  |  1 +
>  3 files changed, 44 insertions(+), 8 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 7d5340c272c6..0d2b5b45b0c5 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -505,6 +505,17 @@ static inline uint16_t nvme_check_mdts(NvmeCtrl *n, 
> size_t len,
>  return NVME_SUCCESS;
>  }
>  
> +static inline uint16_t nvme_check_prinfo(NvmeCtrl *n, NvmeNamespace *ns,
> + uint16_t ctrl, NvmeRequest *req)
> +{
> +if ((ctrl & NVME_RW_PRINFO_PRACT) && !(ns->id_ns.dps & DPS_TYPE_MASK)) {
> +trace_nvme_dev_err_prinfo(nvme_cid(req), ctrl);
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}

I refreshed my (still very limited) knowledge on the metadata
and the protection info, and this is what I found:

I think that this is very far from complete, because we also have:

1. PRCHECK. According to the spec it is independent of PRACT
   And when some of it is set, 
   together with enabled protection (the DPS field in namespace),
   Then the 8 bytes of the protection info is checked (optionally using the
   the EILBRT and ELBAT/ELBATM fields in the command and CRC of the data for 
the guard field)

   So this field should also be checked to be zero when protection is disabled
   (I don't see an explicit requirement for that in the spec, but neither I see
   such requirement for PRACT)

2. The protection values to be written / checked ((E)ILBRT/(E)LBATM/(E)LBAT)
   Same here, but also these should not be set when PRCHECK is not set for 
reads,
   plus some are protection type specific.


The spec does mention the 'Invalid Protection Information' error code which
refers to invalid values in the PRINFO field.
So this error code I think should be returned instead of the 'Invalid field'

Another thing to optionaly check is that the metadata pointer for separate 
metadata.
 Is zero as long as we don't support metadata
(again I don't see an explicit requirement for this in the spec, but it 
mentions:

"This field is valid only if the command has metadata that is not interleaved 
with
the logical block data, as specified in the Format NVM command"

)


> +
> +return NVME_SUCCESS;
> +}
> +
>  static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
>   uint64_t slba, uint32_t nlb,
>   NvmeRequest *req)
> @@ -564,11 +575,22 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, 
> NvmeNamespace *ns, NvmeCmd *cmd,
>  uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
>  uint64_t offset = slba << data_shift;
>  uint32_t count = nlb << data_shift;
> +uint16_t ctrl = le16_to_cpu(rw->control);
>  uint16_t status;
>  
> +status = nvme_check_prinfo(n, ns, ctrl, req);
> +if (status) {
> +goto invalid;
> +}
> +
> +if (ctrl & NVME_RW_PRINFO_PRCHK_MASK) {
> +status = NVME_INVALID_PROT_INFO | NVME_DNR;
> +goto invalid;
> +}
> +
>  status = nvme_check_bounds(n, ns, slba, nlb, req);
>  if (status) {
> -return status;
> +goto invalid;
>  }
>  
>  block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
> @@ -576,6 +598,10 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, 
> NvmeNamespace *ns, NvmeCmd *cmd,
>  req->aiocb = blk_aio_pwrite_zeroes(n->conf.blk, offset, count,
>  BDRV_REQ_MAY_UNMAP, nvme_rw_cb, req);
>  return NVME_NO_COMPLETE;
> +
> +invalid:
> +block_acct_invalid(blk_get_stats(n->conf.blk), BLOCK_ACCT_WRITE);
> +return status;
>  }
>  
>  static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
> @@ -584,6 +610,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  NvmeRwCmd *rw = (NvmeRwCmd *)cmd;
>  uint32_t nlb  = le32_to_cpu(rw->nlb) + 1;
>  uint64_t slba = le64_to_cpu(rw->slba);
> +uint16_t ctrl = le16_to_cpu(rw->control);
>  
>  uint8_t lba_index  = NVME_ID_NS_FLBAS_INDEX(ns->id_ns.flbas);
>  uint8_t data_shift = ns->id_ns.lbaf[lba_index].ds;
> @@ -597,19 +624,22 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  
>  status = nvme_check_mdts(n, data_size, req);
>  if (status) {
> -block_acct_invalid(blk_get_stats(n->conf.blk), acct);
> -return status;
> +goto invalid;
> +}
> +
> +status = nvme_check_prinfo(n, ns, ctrl, req);
> +if (status) {
> +goto invalid;
>  }
>  
>  status = nvme_check_bounds(n, ns, slba, nlb, req);
>  if (status) {
> -block_acct_invalid(blk_get_stats(n->conf.blk), acct);
> -return status;
> +goto invalid;
>  

Re: [PATCH v6 36/42] nvme: add support for scatter gather lists

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> For now, support the Data Block, Segment and Last Segment descriptor
> types.
> 
> See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c   | 310 +++---
>  hw/block/trace-events |   4 +
>  2 files changed, 262 insertions(+), 52 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 49d323566393..b89b96990f52 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -76,7 +76,12 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr 
> addr)
>  
>  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
> +hwaddr hi = addr + size;
> +if (hi < addr) {
> +return 1;
> +}
> +
> +if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, 
> hi)) {

I would suggest to split this into a separate patch as well, since this 
contains not just one but 2 bugfixes
for this function and they are not related to sg lists.
Or at least move this to 'nvme: refactor nvme_addr_read' and rename this patch
to something like 'nvme: fix and refactor nvme_addr_read'


>  memcpy(buf, nvme_addr_to_cmb(n, addr), size);
>  return 0;
>  }
> @@ -328,13 +333,242 @@ unmap:
>  return status;
>  }
>  
> -static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> - uint64_t prp1, uint64_t prp2, DMADirection dir,
> +static uint16_t nvme_map_sgl_data(NvmeCtrl *n, QEMUSGList *qsg,
> +  QEMUIOVector *iov,
> +  NvmeSglDescriptor *segment, uint64_t nsgld,
> +  size_t *len, NvmeRequest *req)
> +{
> +dma_addr_t addr, trans_len;
> +uint32_t blk_len;
> +uint16_t status;
> +
> +for (int i = 0; i < nsgld; i++) {
> +uint8_t type = NVME_SGL_TYPE(segment[i].type);
> +
> +if (type != NVME_SGL_DESCR_TYPE_DATA_BLOCK) {
> +switch (type) {
> +case NVME_SGL_DESCR_TYPE_BIT_BUCKET:
> +case NVME_SGL_DESCR_TYPE_KEYED_DATA_BLOCK:
> +return NVME_SGL_DESCR_TYPE_INVALID | NVME_DNR;
> +default:
To be honest I don't like that 'default'
I would explicitly state which segment types remain 
(I think segment list and last segment list, and various reserved types)
In fact for the reserved types you probably also want to return 
NVME_SGL_DESCR_TYPE_INVALID)

Also this function as well really begs to have a description prior to it,
something like 'map a sg list section, assuming that it only contains SGL data 
descriptions,
caller has to ensure this'.


> +return NVME_INVALID_NUM_SGL_DESCRS | NVME_DNR;
> +}
> +}
> +
> +if (*len == 0) {
> +uint16_t sgls = le16_to_cpu(n->id_ctrl.sgls);
Nitpick: I would add a small comment here as well describiing
what this does (We reach this point if the sg list covers more than what
was specified in the command, and the NVME_CTRL_SGLS_EXCESS_LENGTH controller
capability indicates that we support just throwing the extra data away)

> +if (sgls & NVME_CTRL_SGLS_EXCESS_LENGTH) {
> +break;
> +}
> +
> +trace_nvme_dev_err_invalid_sgl_excess_length(nvme_cid(req));
> +return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> +}
> +
> +addr = le64_to_cpu(segment[i].addr);
> +blk_len = le32_to_cpu(segment[i].len);
> +
> +if (!blk_len) {
> +continue;
> +}
> +
> +if (UINT64_MAX - addr < blk_len) {
> +return NVME_DATA_SGL_LEN_INVALID | NVME_DNR;
> +}
Good!
> +
> +trans_len = MIN(*len, blk_len);
> +
> +status = nvme_map_addr(n, qsg, iov, addr, trans_len);
> +if (status) {
> +return status;
> +}
> +
> +*len -= trans_len;
> +}
> +
> +return NVME_SUCCESS;
> +}
> +
> +static uint16_t nvme_map_sgl(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
> + NvmeSglDescriptor sgl, size_t len,
>   NvmeRequest *req)
> +{
> +/*
> + * Read the segment in chunks of 256 descriptors (one 4k page) to avoid
> + * dynamically allocating a potentially large SGL. The spec allows the 
> SGL
> + * to be larger than the command transfer size, so it is not bounded by
> + * MDTS.
> + */
Now this is a very good comment!

However I don't fully understand the note about the SGL. I assume that you mean
that the data that the SGL covers still should be less than MDTS, but the actual 
SGL chain,
if assembled really in inefficient way (like 1 byte per each data descriptor) 
might be larger.


> +const int SEG_CHUNK_SIZE = 256;
> +
> +NvmeSglDescriptor segment[SEG_CHUNK_SIZE], *

Re: [PATCH v6 37/42] nvme: refactor identify active namespace id list

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Prepare to support inactive namespaces.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 8 
>  1 file changed, 4 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index b89b96990f52..bf9fb500842a 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1505,16 +1505,16 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
> NvmeIdentify *c,
>  uint32_t min_nsid = le32_to_cpu(c->nsid);
>  uint32_t *list;
>  uint16_t ret;
> -int i, j = 0;
> +int j = 0;
>  
>  trace_nvme_dev_identify_nslist(min_nsid);
>  
>  list = g_malloc0(data_len);
> -for (i = 0; i < n->num_namespaces; i++) {
> -if (i < min_nsid) {
> +for (int i = 1; i <= n->num_namespaces; i++) {
> +if (i <= min_nsid) {
>  continue;
>  }
> -list[j++] = cpu_to_le32(i + 1);
> +list[j++] = cpu_to_le32(i);
>  if (j == data_len / sizeof(uint32_t)) {
>  break;
>  }


Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky







Re: [PATCH v6 42/42] nvme: make lba data size configurable

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme-ns.c | 7 ++-
>  hw/block/nvme-ns.h | 4 +++-
>  hw/block/nvme.c| 1 +
>  3 files changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> index 6d975104171d..d7e5c81c5f16 100644
> --- a/hw/block/nvme-ns.c
> +++ b/hw/block/nvme-ns.c
> @@ -18,7 +18,7 @@ static int nvme_ns_init(NvmeNamespace *ns)
>  {
>  NvmeIdNs *id_ns = &ns->id_ns;
>  
> -id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
> +id_ns->lbaf[0].ds = ns->params.lbads;
>  id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
>  
>  /* no thin provisioning */
> @@ -78,6 +78,11 @@ static int nvme_ns_check_constraints(NvmeNamespace *ns, 
> Error **errp)
>  return 1;
>  }
>  
> +if (ns->params.lbads < 9 || ns->params.lbads > 12) {
> +error_setg(errp, "unsupported lbads (supported: 9-12)");
> +return 1;
> +}
> +
>  return 0;
>  }
>  
> diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> index 3c3651d485d0..43b78f8b8d9c 100644
> --- a/hw/block/nvme-ns.h
> +++ b/hw/block/nvme-ns.h
> @@ -7,10 +7,12 @@
>  
>  #define DEFINE_NVME_NS_PROPERTIES(_state, _props) \
>  DEFINE_PROP_DRIVE("drive", _state, blk), \
> -DEFINE_PROP_UINT32("nsid", _state, _props.nsid, 0)
> +DEFINE_PROP_UINT32("nsid", _state, _props.nsid, 0), \
> +DEFINE_PROP_UINT8("lbads", _state, _props.lbads, BDRV_SECTOR_BITS)
>  
>  typedef struct NvmeNamespaceParams {
>  uint32_t nsid;
> +uint8_t  lbads;
>  } NvmeNamespaceParams;
>  
>  typedef struct NvmeNamespace {
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 4f1504fc00fe..61a9da970d41 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -2624,6 +2624,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  if (n->namespace.blk) {
>  ns = &n->namespace;
>  ns->params.nsid = 1;
> +ns->params.lbads = BDRV_SECTOR_BITS;
>  
>  if (nvme_ns_setup(n, ns, errp)) {
>  return;

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v6 29/42] nvme: refactor request bounds checking

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 28 ++--
>  1 file changed, 22 insertions(+), 6 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index eecfad694bf8..ba520c76bae5 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -491,6 +491,20 @@ static void nvme_clear_events(NvmeCtrl *n, uint8_t 
> event_type)
>  }
>  }
>  
> +static inline uint16_t nvme_check_bounds(NvmeCtrl *n, NvmeNamespace *ns,
> + uint64_t slba, uint32_t nlb,
> + NvmeRequest *req)
> +{
> +uint64_t nsze = le64_to_cpu(ns->id_ns.nsze);
> +
> +if (unlikely(UINT64_MAX - slba < nlb || slba + nlb > nsze)) {
> +trace_nvme_dev_err_invalid_lba_range(slba, nlb, nsze);
> +return NVME_LBA_RANGE | NVME_DNR;
> +}
> +
> +return NVME_SUCCESS;
> +}
Looks good.

> +
>  static void nvme_rw_cb(void *opaque, int ret)
>  {
>  NvmeRequest *req = opaque;
> @@ -536,10 +550,11 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, 
> NvmeNamespace *ns, NvmeCmd *cmd,
>  uint32_t nlb  = le16_to_cpu(rw->nlb) + 1;
>  uint64_t offset = slba << data_shift;
>  uint32_t count = nlb << data_shift;
> +uint16_t status;
>  
> -if (unlikely(slba + nlb > ns->id_ns.nsze)) {
> -trace_nvme_dev_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
> -return NVME_LBA_RANGE | NVME_DNR;
> +status = nvme_check_bounds(n, ns, slba, nlb, req);
> +if (status) {
> +return status;
>  }
>  
>  block_acct_start(blk_get_stats(n->conf.blk), &req->acct, 0,
> @@ -562,13 +577,14 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  uint64_t data_offset = slba << data_shift;
>  int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
>  enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
> +uint16_t status;
>  
>  trace_nvme_dev_rw(is_write ? "write" : "read", nlb, data_size, slba);
>  
> -if (unlikely((slba + nlb) > ns->id_ns.nsze)) {
> +status = nvme_check_bounds(n, ns, slba, nlb, req);
> +if (status) {
>  block_acct_invalid(blk_get_stats(n->conf.blk), acct);
> -trace_nvme_dev_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
> -return NVME_LBA_RANGE | NVME_DNR;
> +return status;
>  }
>  
>  if (nvme_map(n, cmd, &req->qsg, &req->iov, data_size, req)) {
Looks good as well, once we get support for discard, it will
use this as well, but for now indeed only write zeros and read/write
need bounds checking on the IO path.

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky








Re: [PATCH v6 32/42] nvme: allow multiple aios per command

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> This refactors how the device issues asynchronous block backend
> requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> associated with the command. This allows multiple aios to be issued for
> a command. Only when all requests have been completed will the device
> post a completion queue entry.
> 
> Because the device is currently guaranteed to only issue a single aio
> request per command, the benefit is not immediately obvious. But this
> functionality is required to support metadata, the dataset management
> command and other features.
> 
> Signed-off-by: Klaus Jensen 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c   | 377 +++---
>  hw/block/nvme.h   | 129 +--
>  hw/block/trace-events |   6 +
>  3 files changed, 407 insertions(+), 105 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 0d2b5b45b0c5..817384e3b1a9 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -59,6 +59,7 @@
>  } while (0)
>  
>  static void nvme_process_sq(void *opaque);
> +static void nvme_aio_cb(void *opaque, int ret);
>  
>  static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
>  {
> @@ -373,6 +374,99 @@ static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, 
> QEMUSGList *qsg,
>  return nvme_map_prp(n, qsg, iov, prp1, prp2, len, req);
>  }
>  
> +static void nvme_aio_destroy(NvmeAIO *aio)
> +{
> +g_free(aio);
> +}
> +
> +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
I guess I'll call this nvme_req_add_aio,
or nvme_add_aio_to_reg.
Thoughts?
Also you can leave this as is, but add a comment on top explaining this

> + NvmeAIOOp opc)
> +{
> +aio->opc = opc;
> +
> +trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> +aio->offset, aio->len,
> +nvme_aio_opc_str(aio), req);
> +
> +if (req) {
> +QTAILQ_INSERT_TAIL(&req->aio_tailq, aio, tailq_entry);
> +}
> +}
> +
> +static void nvme_submit_aio(NvmeAIO *aio)
OK, this name makes sense
Also please add a comment on top.
> +{
> +BlockBackend *blk = aio->blk;
> +BlockAcctCookie *acct = &aio->acct;
> +BlockAcctStats *stats = blk_get_stats(blk);
> +
> +bool is_write;
> +
> +switch (aio->opc) {
> +case NVME_AIO_OPC_NONE:
> +break;
> +
> +case NVME_AIO_OPC_FLUSH:
> +block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
> +aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
> +break;
> +
> +case NVME_AIO_OPC_WRITE_ZEROES:
> +block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
> +aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
> +   BDRV_REQ_MAY_UNMAP, nvme_aio_cb,
> +   aio);
> +break;
> +
> +case NVME_AIO_OPC_READ:
> +case NVME_AIO_OPC_WRITE:
> +is_write = (aio->opc == NVME_AIO_OPC_WRITE);
> +
> +block_acct_start(stats, acct, aio->len,
> + is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
> +
> +if (aio->qsg) {
> +if (is_write) {
> +aio->aiocb = dma_blk_write(blk, aio->qsg, aio->offset,
> +   BDRV_SECTOR_SIZE, nvme_aio_cb, 
> aio);
> +} else {
> +aio->aiocb = dma_blk_read(blk, aio->qsg, aio->offset,
> +  BDRV_SECTOR_SIZE, nvme_aio_cb, 
> aio);
> +}
> +} else {
> +if (is_write) {
> +aio->aiocb = blk_aio_pwritev(blk, aio->offset, aio->iov, 0,
> + nvme_aio_cb, aio);
> +} else {
> +aio->aiocb = blk_aio_preadv(blk, aio->offset, aio->iov, 0,
> +nvme_aio_cb, aio);
> +}
> +}
Looks much better that way that a early return!

> +
> +break;
> +}
> +}
> +
> +static void nvme_rw_aio(BlockBackend *blk, uint64_t offset, NvmeRequest *req)
> +{
> +NvmeAIO *aio;
> +size_t len = req->qsg.nsg > 0 ? req->qsg.size : req->iov.size;
> +
> +aio = g_new0(NvmeAIO, 1);
> +
> +*aio = (NvmeAIO) {
> +.blk = blk,
> +.offset = offset,
> +.len = len,
> +.req = req,
> +.qsg = req->qsg.sg ? &req->qsg : NULL,
> +.iov = req->iov.iov ? &req->iov : NULL,
OK, this is the fix for the bug I mentioned in V5, looks good.

> +};
> +
> +nvme_req_register_aio(req, aio, nvme_req_is_write(req) ?
> +  NVME_AIO_OPC_WRITE : NVME_AIO_OPC_READ);
> +nvme_submit_aio(aio);
> +}
> +
>  static void nvme_post_cqes(void *opaque)
>  {
>  NvmeCQueue *cq = opaque;
> @@ -396,6 +490,7 @@ s

Re: [PATCH v6 35/42] nvme: handle dma errors

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> Handling DMA errors gracefully is required for the device to pass the
> block/011 test ("disable PCI device while doing I/O") in the blktests
> suite.
> 
> With this patch the device passes the test by retrying "critical"
> transfers (posting of completion entries and processing of submission
> queue entries).
> 
> If DMA errors occur at any other point in the execution of the command
> (say, while mapping the PRPs), the command is aborted with a Data
> Transfer Error status code.
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Keith Busch 
> ---
>  hw/block/nvme.c   | 45 ---
>  hw/block/trace-events |  2 ++
>  include/block/nvme.h  |  2 +-
>  3 files changed, 37 insertions(+), 12 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 15ca2417af04..49d323566393 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -74,14 +74,14 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr 
> addr)
>  return addr >= low && addr < hi;
>  }
>  
> -static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> +static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
>  if (n->bar.cmbsz && nvme_addr_is_cmb(n, addr)) {
>  memcpy(buf, nvme_addr_to_cmb(n, addr), size);
> -return;
> +return 0;
>  }
>  
> -pci_dma_read(&n->parent_obj, addr, buf, size);
> +return pci_dma_read(&n->parent_obj, addr, buf, size);
>  }
>  
>  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
> @@ -164,7 +164,7 @@ static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, 
> QEMUIOVector *iov, hwaddr addr,
>size_t len)
>  {
>  if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len)) {
> -return NVME_DATA_TRAS_ERROR;
> +return NVME_DATA_TRANSFER_ERROR;

Minor nitpick: this is also a non functional refactoring.
I don't think that each piece of a refactoring should be in a separate patch,
so I usually group all the non functional (aka cosmetic) refactoring in one 
patch, usually the first in the series.
But I try not to leave such refactoring in the functional patches.

However, since there is not that much cases like that left, I don't mind 
leaving this particular case as is.

>  }
>  
>  qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
> @@ -213,6 +213,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  int num_prps = (len >> n->page_bits) + 1;
>  uint16_t status;
>  bool prp_list_in_cmb = false;
> +int ret;
>  
>  trace_nvme_dev_map_prp(nvme_cid(req), trans_len, len, prp1, prp2,
> num_prps);
> @@ -252,7 +253,12 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  
>  nents = (len + n->page_size - 1) >> n->page_bits;
>  prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
> -nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
> +ret = nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
> +if (ret) {
> +trace_nvme_dev_err_addr_read(prp2);
> +status = NVME_DATA_TRANSFER_ERROR;
> +goto unmap;
> +}
>  while (len != 0) {
>  uint64_t prp_ent = le64_to_cpu(prp_list[i]);
>  
> @@ -271,8 +277,13 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  i = 0;
>  nents = (len + n->page_size - 1) >> n->page_bits;
>  prp_trans = MIN(n->max_prp_ents, nents) * 
> sizeof(uint64_t);
> -nvme_addr_read(n, prp_ent, (void *)prp_list,
> -prp_trans);
> +ret = nvme_addr_read(n, prp_ent, (void *)prp_list,
> + prp_trans);
> +if (ret) {
> +trace_nvme_dev_err_addr_read(prp_ent);
> +status = NVME_DATA_TRANSFER_ERROR;
> +goto unmap;
> +}
>  prp_ent = le64_to_cpu(prp_list[i]);
>  }
>  
> @@ -466,6 +477,7 @@ static void nvme_post_cqes(void *opaque)
>  NvmeCQueue *cq = opaque;
>  NvmeCtrl *n = cq->ctrl;
>  NvmeRequest *req, *next;
> +int ret;
>  
>  QTAILQ_FOREACH_SAFE(req, &cq->req_list, entry, next) {
>  NvmeSQueue *sq;
> @@ -475,15 +487,21 @@ static void nvme_post_cqes(void *opaque)
>  break;
>  }
>  
> -QTAILQ_REMOVE(&cq->req_list, req, entry);
>  sq = req->sq;
>  req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
>  req->cqe.sq_id = cpu_to_le16(sq->sqid);
>  req->cqe.sq_head = cpu_to_le16(sq->head);
>  addr = cq->dma_addr + cq->tail * n->cqe_size;
> +ret 

Re: [PATCH v6 38/42] nvme: support multiple namespaces

2020-03-25 Thread Maxim Levitsky
On Mon, 2020-03-16 at 07:29 -0700, Klaus Jensen wrote:
> From: Klaus Jensen 
> 
> This adds support for multiple namespaces by introducing a new 'nvme-ns'
> device model. The nvme device creates a bus named from the device name
> ('id'). The nvme-ns devices then connect to this and registers
> themselves with the nvme device.
> 
> This changes how an nvme device is created. Example with two namespaces:
> 
>   -drive file=nvme0n1.img,if=none,id=disk1
>   -drive file=nvme0n2.img,if=none,id=disk2
>   -device nvme,serial=deadbeef,id=nvme0
>   -device nvme-ns,drive=disk1,bus=nvme0,nsid=1
>   -device nvme-ns,drive=disk2,bus=nvme0,nsid=2
> 
> The drive property is kept on the nvme device to keep the change
> backward compatible, but the property is now optional. Specifying a
> drive for the nvme device will always create the namespace with nsid 1.
> 
> Signed-off-by: Klaus Jensen 
> Signed-off-by: Klaus Jensen 
> Reviewed-by: Keith Busch 
> ---
>  hw/block/Makefile.objs |   2 +-
>  hw/block/nvme-ns.c | 157 +++
>  hw/block/nvme-ns.h |  60 +++
>  hw/block/nvme.c| 233 ++---
>  hw/block/nvme.h|  47 -
>  hw/block/trace-events  |   4 +-
>  6 files changed, 389 insertions(+), 114 deletions(-)
>  create mode 100644 hw/block/nvme-ns.c
>  create mode 100644 hw/block/nvme-ns.h
> 
> diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
> index 4b4a2b338dc4..d9141d6a4b9b 100644
> --- a/hw/block/Makefile.objs
> +++ b/hw/block/Makefile.objs
> @@ -7,7 +7,7 @@ common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
>  common-obj-$(CONFIG_XEN) += xen-block.o
>  common-obj-$(CONFIG_ECC) += ecc.o
>  common-obj-$(CONFIG_ONENAND) += onenand.o
> -common-obj-$(CONFIG_NVME_PCI) += nvme.o
> +common-obj-$(CONFIG_NVME_PCI) += nvme.o nvme-ns.o
>  common-obj-$(CONFIG_SWIM) += swim.o
>  
>  common-obj-$(CONFIG_SH4) += tc58128.o
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> new file mode 100644
> index ..6d975104171d
> --- /dev/null
> +++ b/hw/block/nvme-ns.c
> @@ -0,0 +1,157 @@
> +#include "qemu/osdep.h"
> +#include "qemu/units.h"
> +#include "qemu/cutils.h"
> +#include "qemu/log.h"
> +#include "hw/block/block.h"
> +#include "hw/pci/pci.h"
> +#include "sysemu/sysemu.h"
> +#include "sysemu/block-backend.h"
> +#include "qapi/error.h"
> +
> +#include "hw/qdev-properties.h"
> +#include "hw/qdev-core.h"
> +
> +#include "nvme.h"
> +#include "nvme-ns.h"
> +
> +static int nvme_ns_init(NvmeNamespace *ns)
> +{
> +NvmeIdNs *id_ns = &ns->id_ns;
> +
> +id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
> +id_ns->nsze = cpu_to_le64(nvme_ns_nlbas(ns));
> +
> +/* no thin provisioning */
> +id_ns->ncap = id_ns->nsze;
> +id_ns->nuse = id_ns->ncap;
> +
> +return 0;
> +}
Looks great!

> +
> +static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, NvmeIdCtrl *id,
> +Error **errp)
> +{
> +uint64_t perm, shared_perm;
> +
> +Error *local_err = NULL;
> +int ret;
> +
> +perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
> +shared_perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
> +BLK_PERM_GRAPH_MOD;
> +
> +ret = blk_set_perm(ns->blk, perm, shared_perm, &local_err);
> +if (ret) {
> +error_propagate_prepend(errp, local_err,
> +"could not set block permissions: ");
> +return ret;
> +}
> +
> +ns->size = blk_getlength(ns->blk);
> +if (ns->size < 0) {
> +error_setg_errno(errp, -ns->size, "could not get blockdev size");
> +return -1;
> +}
> +
> +switch (n->conf.wce) {
> +case ON_OFF_AUTO_ON:
> +n->features.volatile_wc = 1;
> +break;
> +case ON_OFF_AUTO_OFF:
> +n->features.volatile_wc = 0;
> +case ON_OFF_AUTO_AUTO:
> +n->features.volatile_wc = blk_enable_write_cache(ns->blk);
> +break;
> +default:
> +abort();
> +}
> +
> +blk_set_enable_write_cache(ns->blk, n->features.volatile_wc);
> +
> +return 0;
> +}

This needs review from someone that knows the block layer better than I do.
I still think that maybe you can somehow use the blkconf_apply_backend_options
(or even extend it to suit you somehow). I'll leave this to the block layer 
folks.



> +
> +static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
> +{
> +if (!ns->blk) {
> +error_setg(errp, "block backend not configured");
> +return 1;
> +}
> +
> +return 0;
> +}
> +
> +int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> +{
> +if (nvme_ns_check_constraints(ns, errp)) {
> +return -1;
> +}
> +
> +if (nvme_ns_init_blk(n, ns, &n->id_ctrl, errp)) {
> +return -1;
> +}
> +
> +nvme_ns_init(ns);
> +if (nvme_register_namespace(n, ns, errp)) {
> +return -1;
> +}
> +
> +return 0;
> +}
> +
> +static void nvme_ns_realize(DeviceState *dev, Error **errp)

Re: [PATCH 2/6] block/mirror: fix use after free of local_err

2020-03-25 Thread Max Reitz
On 24.03.20 16:36, Vladimir Sementsov-Ogievskiy wrote:
> local_err is used again in mirror_exit_common() after
> bdrv_set_backing_hd(), so we must zero it. Otherwise try to set
> non-NULL local_err will crash.

OK, but wouldn’t it be better hygiene to set it to NULL every time it is
freed?  (There is a second instance of error_report_err() in this
function.  I’m a bit worried we might introduce another local_err use
after that one at some point in the future, and forget to run the cocci
script then.)

Are the cocci scripts run regularly by someone?  E.g. as part of a pull
to master?

Max

> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/mirror.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/block/mirror.c b/block/mirror.c
> index 447051dbc6..6203e5946e 100644
> --- a/block/mirror.c
> +++ b/block/mirror.c
> @@ -678,6 +678,7 @@ static int mirror_exit_common(Job *job)
>  bdrv_set_backing_hd(target_bs, backing, &local_err);
>  if (local_err) {
>  error_report_err(local_err);
> +local_err = NULL;
>  ret = -EPERM;
>  }
>  }
> 




signature.asc
Description: OpenPGP digital signature


Re: block stream and bitmaps

2020-03-25 Thread Kevin Wolf
Am 24.03.2020 um 20:19 hat John Snow geschrieben:
> 
> 
> On 3/24/20 6:18 AM, Kevin Wolf wrote:
> > Am 23.03.2020 um 19:06 hat John Snow geschrieben:
> >> Hi Kevin,
> >>
> >> I'm hoping to get some preliminary ideas from you (capped at five
> >> minutes' effort?) on this subject. My ideas here are a bit shaky, I only
> >> have really rough notions here.
> >>
> >> We want to use bitmaps with 'drive' semantics; i.e. tracking changes to
> >> the visible guest data. What we have are bitmaps with node semantics,
> >> tracking changes to the data at a particular level in the graph.
> >>
> >> For commit, this isn't a big deal: we can disable bitmaps in the backing
> >> file while we commit and then re-enable it on completion. We usually
> >> have a separate bitmap enabled on the root node that is recording writes
> >> during this time, and can be moved later.
> >>
> >> For streaming, this is more challenging: new writes will dirty the
> >> bitmap, but so will writes from the stream job itself.
> >>
> >> Semantically, we want to ignore writes from the stream while recording
> >> them from the guest -- differentiating based on source.
> > 
> > No, based on source is actually not what you want. What you really want
> > is that BDRV_REQ_WRITE_UNCHANGED doesn't mark any blocks dirty.
> > 
> 
> This is why I sent the mail, I figured you'd know the better incision
> point, and I was right!
> 
> > We discussed this specific case of streaming at FOSDEM (with Paolo and
> > probably Nir). Paolo was even convinced that unchanged writes already
> > behave like this, but we agreed that dirtying blocks for them would be a
> > bug. After checking that the code is indeed buggy, I was planning to
> > send a patch, but never got around to actually do that. Sorry about
> > that.
> 
> Glad to hear it has been given consideration, though!

Yes, if we hadn't talked about it, I probably wouldn't have had this
suggestion for you.

Anyway, someone still needs to write that patch. Should I try to find
time for it or are you (or someone else working on bitmaps) going to do
that?

> >> Bitmaps aren't really geared to do that right now. With the changes to
> >> Bdrv Roles that Max was engineering, do you think it's possible to add
> >> some kind of write source discrimination to bitmaps, or is that too messy?
> > 
> > I don't think it would work because copy-on-read requests come from the
> > same parent node as writes (no matter whether the legacy code in
> > block/io.c or a copy-on-read filter node is used).
> > 
> 
> Oh, understood. Rule that approach out, then.
> 
> >> For both commit and stream, it might be nice to say: "This bitmap is
> >> enabled, but ignores writes from [all? specific types? specific
> >> instances?] jobs.
> > 
> > Commit is a bit trickier, because it's not WRITE_UNCHANGED. The result
> > is only unchanged for the top layer, but not for the backing file you're
> > committing to. Not sure if we can represent this condition somehow.
> > 
> 
> Nothing comes to mind apart from a semantic that applies to a graph
> subsection instead of an individual node.
> 
> i.e. UNCHANGED as applied to [A --> B].
> 
> Not saying that's reasonable to develop... or necessarily even possible
> to enforce, just nothing else comes to mind.

And then the result for that request would be that it doesn't dirty any
bitmaps at A, but it does dirty the bitmaps at B?

I mean, many things can be developed, but being sure that they actually
implement the right thing is harder... So as long as we have the
workaround you mentioned above, maybe let's just ignore this for now.

> >> Or, I wonder if what we truly want is some kind of bitmap "forwarder"
> >> object on block-backend objects that represent the semantic drive view,
> >> and only writes through that *backend* get forwarded to the bitmaps
> >> attached to whatever node the bitmap is actually associated with.
> >>
> >> (That might wind up causing weird problems too, though... since those
> >> objects are no longer intended to be user-addressable, managing that
> >> configuration might get intensely strange.)
> > 
> > Hm... Drive-based does suggest that it's managed at the BlockBackend
> > level. So having a bitmap that isn't added as a dirty bitmap to the BDS,
> > but only to the BB does make sense to me. The BB would be addressed
> > with the qdev ID of the device, as usual (which underlines that it's
> > really per device).
> > 
> 
> That's the rough idea, though if it's needed or not is unclear. We might
> be able to get by with node semantics if we jazz them up enough...?

Ah. I just took it as a given that you need or want BlockBackend-based
bitmaps because that's how you started the email. Maybe I should look at
the bigger picture before suggesting things.

So what was the real motivation behind it?

> Working around all the edge bases of a drive-semantic bitmap seem
> difficult to reason about.
> 
> In general, it should likely be made persistent against the root-most
> node to which writ

Re: [PATCH 2/6] block/mirror: fix use after free of local_err

2020-03-25 Thread Max Reitz
On 25.03.20 12:11, Max Reitz wrote:
> On 24.03.20 16:36, Vladimir Sementsov-Ogievskiy wrote:
>> local_err is used again in mirror_exit_common() after
>> bdrv_set_backing_hd(), so we must zero it. Otherwise try to set
>> non-NULL local_err will crash.
> 
> OK, but wouldn’t it be better hygiene to set it to NULL every time it is
> freed?  (There is a second instance of error_report_err() in this
> function.  I’m a bit worried we might introduce another local_err use
> after that one at some point in the future, and forget to run the cocci
> script then.)
> 
> Are the cocci scripts run regularly by someone?  E.g. as part of a pull
> to master?

Doesn’t look like it.  I’m currently running everything, and there’s a
lot of results so far.

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 2/6] block/mirror: fix use after free of local_err

2020-03-25 Thread Vladimir Sementsov-Ogievskiy

25.03.2020 14:11, Max Reitz wrote:

On 24.03.20 16:36, Vladimir Sementsov-Ogievskiy wrote:

local_err is used again in mirror_exit_common() after
bdrv_set_backing_hd(), so we must zero it. Otherwise try to set
non-NULL local_err will crash.


OK, but wouldn’t it be better hygiene to set it to NULL every time it is
freed?  (There is a second instance of error_report_err() in this
function.  I’m a bit worried we might introduce another local_err use
after that one at some point in the future, and forget to run the cocci
script then.)


Yes, it's better. But if we now decide to fix all such things, it would be
a huge series. Maybe too huge for 5.0..

So I decided to fix only real obvious problems now.

Hmm huge or not?

Ok, let's try with less restrictions:

--- a/scripts/coccinelle/error-use-after-free.cocci
+++ b/scripts/coccinelle/error-use-after-free.cocci
@@ -28,7 +28,7 @@ expression err;

  fn(...)
  {
- <...
+ ... when any
 (
  error_free(err);
 +err = NULL;
@@ -46,7 +46,5 @@ expression err;
 +err = NULL;
 )
  ... when != err = NULL
- when != exit(...)
- fn2(..., err, ...)
- ...>
+ when any
  }


on block/ directory:

spatch --sp-file scripts/coccinelle/error-use-after-free.cocci --macro-file 
scripts/cocci-macro-file.h --in-place --no-show-diff --use-gitgrep block
git diff --stat
 scripts/coccinelle/error-use-after-free.cocci |  6 ++
 block/block-backend.c |  1 +
 block/commit.c|  4 
 block/crypto.c|  1 +
 block/file-posix.c|  5 +
 block/mirror.c|  2 ++
 block/monitor/block-hmp-cmds.c|  4 
 block/parallels.c |  3 +++
 block/qapi-sysemu.c   |  2 ++
 block/qapi.c  |  1 +
 block/qcow.c  |  2 ++
 block/qcow2-cluster.c |  1 +
 block/qcow2-refcount.c|  1 +
 block/qcow2-snapshot.c|  3 +++
 block/qcow2.c |  4 
 block/replication.c   |  1 +
 block/sheepdog.c  | 13 +
 block/stream.c|  1 +
 block/vdi.c   |  2 ++
 block/vhdx.c  |  2 ++
 block/vmdk.c  |  2 ++
 block/vpc.c   |  2 ++
 block/vvfat.c |  2 ++
 23 files changed, 61 insertions(+), 4 deletions(-)


If you want, I'll send series.



Are the cocci scripts run regularly by someone?  E.g. as part of a pull
to master?


I'm afraid not. I have a plan in mind to make a Python checkcode, which will
work in tandem with checkpatch somehow, and will work on the workdir instead of
a patch. It will significantly simplify adding different code checks, including
running coccinelle scripts.



Max


Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  block/mirror.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/block/mirror.c b/block/mirror.c
index 447051dbc6..6203e5946e 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -678,6 +678,7 @@ static int mirror_exit_common(Job *job)
  bdrv_set_backing_hd(target_bs, backing, &local_err);
  if (local_err) {
  error_report_err(local_err);
+local_err = NULL;
  ret = -EPERM;
  }
  }







--
Best regards,
Vladimir



Re: [PATCH v2 0/2] Rework iotests finding

2020-03-25 Thread no-reply
Patchew URL: 
https://patchew.org/QEMU/20200325102131.23270-1-vsement...@virtuozzo.com/



Hi,

This series failed the docker-quick@centos7 build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
make docker-image-centos7 V=1 NETWORK=1
time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1
=== TEST SCRIPT END ===

  CC  s_eq128.o
./check: line 171: ./find_tests.py: No such file or directory
Group "auto" is empty or not defined?
make: *** [check-tests/check-block.sh] Error 1
make: *** Waiting for unfinished jobs
  CC  s_le128.o
  CC  s_lt128.o
---
raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=ea762c961bed46aea47cd60fda18c0de', '-u', 
'1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', 
'-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 
'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', 
'/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-nf_7yydp/src/docker-src.2020-03-25-07.53.52.20951:/var/tmp/qemu:z,ro',
 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=ea762c961bed46aea47cd60fda18c0de
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-nf_7yydp/src'
make: *** [docker-run-test-quick@centos7] Error 2

real2m47.178s
user0m8.387s


The full log is available at
http://patchew.org/logs/20200325102131.23270-1-vsement...@virtuozzo.com/testing.docker-quick@centos7/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com

Re: [PATCH 2/6] block/mirror: fix use after free of local_err

2020-03-25 Thread Max Reitz
On 25.03.20 12:47, Vladimir Sementsov-Ogievskiy wrote:
> 25.03.2020 14:11, Max Reitz wrote:
>> On 24.03.20 16:36, Vladimir Sementsov-Ogievskiy wrote:
>>> local_err is used again in mirror_exit_common() after
>>> bdrv_set_backing_hd(), so we must zero it. Otherwise try to set
>>> non-NULL local_err will crash.
>>
>> OK, but wouldn’t it be better hygiene to set it to NULL every time it is
>> freed?  (There is a second instance of error_report_err() in this
>> function.  I’m a bit worried we might introduce another local_err use
>> after that one at some point in the future, and forget to run the cocci
>> script then.)
> 
> Yes, it's better. But if we now decide to fix all such things, it would be
> huge series. May be too huge for 5.0..
> 
> So I decided to fix only real obvious problems now.

Reasonable, yes.

> Hmm huge or not?
> 
> Ok, let's try with less restrictions:
> 
> --- a/scripts/coccinelle/error-use-after-free.cocci
> +++ b/scripts/coccinelle/error-use-after-free.cocci
> @@ -28,7 +28,7 @@ expression err;
> 
>   fn(...)
>   {
> - <...
> + ... when any
>  (
>   error_free(err);
>  +    err = NULL;
> @@ -46,7 +46,5 @@ expression err;
>  +    err = NULL;
>  )
>   ... when != err = NULL
> - when != exit(...)
> - fn2(..., err, ...)
> - ...>
> + when any
>   }
> 
> 
> on block/ directory:
> 
> spatch --sp-file scripts/coccinelle/error-use-after-free.cocci
> --macro-file scripts/cocci-macro-file.h --in-place --no-show-diff
> --use-gitgrep block
> git diff --stat
>  scripts/coccinelle/error-use-after-free.cocci |  6 ++
>  block/block-backend.c |  1 +
>  block/commit.c    |  4 
>  block/crypto.c    |  1 +
>  block/file-posix.c    |  5 +
>  block/mirror.c    |  2 ++
>  block/monitor/block-hmp-cmds.c    |  4 
>  block/parallels.c |  3 +++
>  block/qapi-sysemu.c   |  2 ++
>  block/qapi.c  |  1 +
>  block/qcow.c  |  2 ++
>  block/qcow2-cluster.c |  1 +
>  block/qcow2-refcount.c    |  1 +
>  block/qcow2-snapshot.c    |  3 +++
>  block/qcow2.c |  4 
>  block/replication.c   |  1 +
>  block/sheepdog.c  | 13 +
>  block/stream.c    |  1 +
>  block/vdi.c   |  2 ++
>  block/vhdx.c  |  2 ++
>  block/vmdk.c  |  2 ++
>  block/vpc.c   |  2 ++
>  block/vvfat.c |  2 ++
>  23 files changed, 61 insertions(+), 4 deletions(-)
> 
> 
> If you want, I'll send series.
> 
>>
>> Are the cocci scripts run regularly by someone?  E.g. as part of a pull
>> to master?
> 
> I'm afraid not. I have a plan in my mind, to make python checkcode,
> which will
> work in pair with checkpatch somehow, and will work on workdir instead of
> patch. It will simplify significantly adding different code checks,
> including
> starting coccinelle scripts.
Hm.  I think we need to prepare for noone running the cocci scripts
(i.e., we should use the above variant with less restrictions so that
future patches are less likely to reintroduce the problem), or we need
to ensure the cocci scripts are run regularly.

We can of course also do both.  In this case I think it makes sense to
do a less-restricted version, because I think it can never hurt to set
pointers to NULL after freeing them.  (We could do an exception for when
the error-freeing is immediately followed by a goto out, but I think
that would make it too complicated.)

I’d like to start running the cocci scripts myself before every pull
request, but unfortunately there are still a number of diffs in the
block area.  I think I’ll send a series to fix those and then I can run
the scripts regularly to prevent regressions.  So I’ll leave it up to
you whether you think a less-restricted version would make sense.

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 2/6] block/mirror: fix use after free of local_err

2020-03-25 Thread Max Reitz
On 24.03.20 16:36, Vladimir Sementsov-Ogievskiy wrote:
> local_err is used again in mirror_exit_common() after
> bdrv_set_backing_hd(), so we must zero it. Otherwise try to set
> non-NULL local_err will crash.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/mirror.c | 1 +
>  1 file changed, 1 insertion(+)

Considering Dave has taken patches 4 and 5, I think it makes sense for
me to take this one now; so, thanks for the patch and the reviews,
applied to my block branch:

https://git.xanclic.moe/XanClic/qemu/commits/branch/block

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v2 0/2] Rework iotests finding

2020-03-25 Thread Vladimir Sementsov-Ogievskiy

25.03.2020 14:56, no-re...@patchew.org wrote:

Patchew URL: 
https://patchew.org/QEMU/20200325102131.23270-1-vsement...@virtuozzo.com/



Hi,

This series failed the docker-quick@centos7 build test. Please find the testing 
commands and
their output below. If you have Docker installed, you can probably reproduce it
locally.

=== TEST SCRIPT BEGIN ===
#!/bin/bash
make docker-image-centos7 V=1 NETWORK=1
time make docker-test-quick@centos7 SHOW_ENV=1 J=14 NETWORK=1
=== TEST SCRIPT END ===

   CC  s_eq128.o
./check: line 171: ./find_tests.py: No such file or directory



Hmm, but my patches add find_tests.py in the same directory as the check script.
What's wrong with it? Who can help?

For me, "make check" works and "cd tests/qemu-iotests; ./check -qcow2" works 
too.


Group "auto" is empty or not defined?
make: *** [check-tests/check-block.sh] Error 1
make: *** Waiting for unfinished jobs
   CC  s_le128.o
   CC  s_lt128.o
---
 raise CalledProcessError(retcode, cmd)
subprocess.CalledProcessError: Command '['sudo', '-n', 'docker', 'run', 
'--label', 'com.qemu.instance.uuid=ea762c961bed46aea47cd60fda18c0de', '-u', 
'1003', '--security-opt', 'seccomp=unconfined', '--rm', '-e', 'TARGET_LIST=', 
'-e', 'EXTRA_CONFIGURE_OPTS=', '-e', 'V=', '-e', 'J=14', '-e', 'DEBUG=', '-e', 
'SHOW_ENV=1', '-e', 'CCACHE_DIR=/var/tmp/ccache', '-v', 
'/home/patchew2/.cache/qemu-docker-ccache:/var/tmp/ccache:z', '-v', 
'/var/tmp/patchew-tester-tmp-nf_7yydp/src/docker-src.2020-03-25-07.53.52.20951:/var/tmp/qemu:z,ro',
 'qemu:centos7', '/var/tmp/qemu/run', 'test-quick']' returned non-zero exit 
status 2.
filter=--filter=label=com.qemu.instance.uuid=ea762c961bed46aea47cd60fda18c0de
make[1]: *** [docker-run] Error 1
make[1]: Leaving directory `/var/tmp/patchew-tester-tmp-nf_7yydp/src'
make: *** [docker-run-test-quick@centos7] Error 2

real2m47.178s
user0m8.387s


The full log is available at
http://patchew.org/logs/20200325102131.23270-1-vsement...@virtuozzo.com/testing.docker-quick@centos7/?type=message.
---
Email generated automatically by Patchew [https://patchew.org/].
Please send your feedback to patchew-de...@redhat.com




--
Best regards,
Vladimir



Re: [PATCH v2 3/4] qcow2: Avoid feature name extension on small cluster size

2020-03-25 Thread Max Reitz
On 24.03.20 18:42, Eric Blake wrote:
> As the feature name table can be quite large (over 9k if all 64 bits
> of all three feature fields have names; a mere 8 features leaves only
> 8 bytes for a backing file name in a 512-byte cluster), it is unwise
> to emit this optional header in images with small cluster sizes.
> 
> Update iotest 036 to skip running on small cluster sizes; meanwhile,
> note that iotest 061 never passed on alternative cluster sizes
> (however, I limited this patch to tests with output affected by adding
> feature names, rather than auditing for other tests that are not
> robust to alternative cluster sizes).

That’s a bit more brave than necessary, considering I don’t think anyone
has ever run the iotests with the cluster_size option.  (I certainly
don’t, and I don’t plan to, because I don’t think it’s that important to
test that.)  There are certainly many other iotests that fail with a
non-default cluster size.

Not that it’s wrong care about it.  On the opposite, I’m happy you do. :)

> Signed-off-by: Eric Blake 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> Reviewed-by: Alberto Garcia 
> ---
>  block/qcow2.c  | 11 +--
>  tests/qemu-iotests/036 |  6 --
>  tests/qemu-iotests/061 |  6 --
>  3 files changed, 17 insertions(+), 6 deletions(-)
> 
> diff --git a/block/qcow2.c b/block/qcow2.c
> index 67b0c214fba4..9475ace57163 100644
> --- a/block/qcow2.c
> +++ b/block/qcow2.c
> @@ -2823,8 +2823,15 @@ int qcow2_update_header(BlockDriverState *bs)
>  buflen -= ret;
>  }
> 
> -/* Feature table */
> -if (s->qcow_version >= 3) {
> +/*
> + * Feature table.  A mere 8 feature names occupies 392 bytes, and
> + * when coupled with the v3 minimum header of 104 bytes plus the
> + * 8-byte end-of-extension marker, that would leave only 8 bytes
> + * for a backing file name in an image with 512-byte clusters.
> + * Thus, we choose to omit this header for cluster sizes 4k and
> + * smaller.

Can’t we do this decision more dynamically?  Like, always omit it when
cluster_size - sizeof(features) < 2k/3k/...?

Max

> + */
> +if (s->qcow_version >= 3 && s->cluster_size > 4096) {
>  static const Qcow2Feature features[] = {
>  {
>  .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,



signature.asc
Description: OpenPGP digital signature


Re: [PATCH 2/6] block/mirror: fix use after free of local_err

2020-03-25 Thread Eric Blake

On 3/25/20 6:11 AM, Max Reitz wrote:

On 24.03.20 16:36, Vladimir Sementsov-Ogievskiy wrote:

local_err is used again in mirror_exit_common() after
bdrv_set_backing_hd(), so we must zero it. Otherwise try to set
non-NULL local_err will crash.


OK, but wouldn’t it be better hygiene to set it to NULL every time it is
freed?


If we change the signature to error_report_err(&local_err), where 
error_report_err both reports the error (if any) AND sets local_err to 
NULL, then we fix the problem for all callers.  It's a global 
search-and-replace job (Coccinelle is great for that) to update all 
callers to the new signature.



 (There is a second instance of error_report_err() in this
function.  I’m a bit worried we might introduce another local_err use
after that one at some point in the future, and forget to run the cocci
script then.)

Are the cocci scripts run regularly by someone?  E.g. as part of a pull
to master?


I'm not aware of any automated procedure for it at the moment; rather, 
it is still ad hoc as someone notices something needs to be re-run.  But 
there was another thread about someone considering automating Cocci 
scripts as part of the Euler robot...


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2 0/2] Rework iotests finding

2020-03-25 Thread Eric Blake

On 3/25/20 5:21 AM, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

When sending iotests upstream or porting patches from one branch
to another, we very often have to resolve conflicts in the group file, as
many absolutely independent features intersect in this file.
These conflicts are simple, but imagine how much time we have all
already spent on resolving them. Let's finally get rid of the group file.

Next, another thing I don't like about iotests is the race for test number
choosing: you have to search through your mailbox before choosing a test
number for a new test.

So, I propose to get rid of the group file and search for tests in another
way [see patch 02]. Additionally, I propose to move to human-readable
names for test files, with the notation test-*.


I suggest swapping the name: It's easier to write a glob for *-test vs. 
*-test.out than it is to write for test-* but not test-*.out.  (You 
don't want to execute the output files as a test).  That is, I suggest 
that ./check consider all 3-digit files and all files ending in -test as 
tests.




v1 was one patch "[PATCH] iotests: drop group file"

Vladimir Sementsov-Ogievskiy (2):
   iotests: define group in each iotests
   iotests: rework test finding


Do you plan on an additional patch (or set of patches) to rename some or 
all of the existing 3-digit tests?


Overall, having sensibly named tests makes sense to me.

If we are going to rename files, I'd also suggest that we rename the 
directory: tests/qemu-iotests/ feels redundant, compared to tests/iotests.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2 3/4] qcow2: Avoid feature name extension on small cluster size

2020-03-25 Thread Eric Blake

On 3/25/20 7:42 AM, Max Reitz wrote:

On 24.03.20 18:42, Eric Blake wrote:

As the feature name table can be quite large (over 9k if all 64 bits
of all three feature fields have names; a mere 8 features leaves only
8 bytes for a backing file name in a 512-byte cluster), it is unwise
to emit this optional header in images with small cluster sizes.

Update iotest 036 to skip running on small cluster sizes; meanwhile,
note that iotest 061 never passed on alternative cluster sizes
(however, I limited this patch to tests with output affected by adding
feature names, rather than auditing for other tests that are not
robust to alternative cluster sizes).





-/* Feature table */
-if (s->qcow_version >= 3) {
+/*
+ * Feature table.  A mere 8 feature names occupies 392 bytes, and
+ * when coupled with the v3 minimum header of 104 bytes plus the
+ * 8-byte end-of-extension marker, that would leave only 8 bytes
+ * for a backing file name in an image with 512-byte clusters.
+ * Thus, we choose to omit this header for cluster sizes 4k and
+ * smaller.


Can’t we do this decision more dynamically?  Like, always omit it when
cluster_size - sizeof(features) < 2k/3k/...?

Max


+ */
+if (s->qcow_version >= 3 && s->cluster_size > 4096) {


Yes.  But when you consider that sizeof(features) is a compile-time 
constant, it isn't really dynamic for a given qemu release, but rather a 
different way to spell things; about the only thing it would buy us is 
that our cutoff window for what cluster size no longer gets the header 
may automatically shift from 2k to 4k to 8k as future qemu versions add 
more qcow2 features.  If we want to write it like that, which size limit 
do you propose?  Or asked differently, how much space should we reserve 
for other extension headers + backing file name?


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v2 0/2] Rework iotests finding

2020-03-25 Thread Vladimir Sementsov-Ogievskiy

25.03.2020 16:08, Eric Blake wrote:

On 3/25/20 5:21 AM, Vladimir Sementsov-Ogievskiy wrote:

Hi all!

When sending iotests to upstream or do patch porting from one branch
to another we very often have to resolve conflicts in group file, as
many absolutely independent features are intersecting by this file.
These conflicts are simple, but imagine how much time we all have
already spent on resolving them? Let's finally get rid of group file.

Next, another thing I don't like about iotests is the race for test number
choosing: you have to search through your mailbox before choosing a test
number for a new test.

So, I propose to get rid of group file and search for tests another way
[look at patch 02]. Additionally I propose to move to human-readable
names for test files, with notation test-* .


I suggest swapping the name: It's easier to write a glob for *-test vs. 
*-test.out than it is to write for test-* but not test-*.out.  (You don't want 
to execute the output files as a test).  That is, I suggest that ./check 
consider all 3-digit files and all files ending in -test as tests.


I like the idea. It also helps when typing a test name in the terminal:
"stre<Tab>" -> stream-test instead of "test-stre<Tab>" -> test-stream.





v1 was one patch "[PATCH] iotests: drop group file"

Vladimir Sementsov-Ogievskiy (2):
   iotests: define group in each iotests
   iotests: rework test finding


Do you plan on an additional patch (or set of patches) to rename some or all of 
the existing 3-digit tests?


I think I'll rename the tests that I've written.



Overall, having sensibly named tests makes sense to me.

If we are going to rename files, I'd also suggest that we rename the directory: 
tests/qemu-iotests/ feels redundant, compared to tests/iotests.



Agree.


--
Best regards,
Vladimir



Re: [PATCH] nvme: Print 'cqid' for nvme_del_cq

2020-03-25 Thread Kevin Wolf
Am 24.03.2020 um 15:06 hat Minwoo Im geschrieben:
> The given argument for this trace should be cqid, not sqid.
> 
> Signed-off-by: Minwoo Im 

Thanks, applied to the block branch.

Kevin




[PATCH v2 1/6] block/block-copy: rename in-flight requests to tasks

2020-03-25 Thread Vladimir Sementsov-Ogievskiy
We are going to use aio-task-pool API and extend in-flight request
structure to be a successor of AioTask, so rename things appropriately.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/block-copy.c | 99 +++---
 1 file changed, 49 insertions(+), 50 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index 05227e18bf..61d1d26991 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -24,12 +24,12 @@
 #define BLOCK_COPY_MAX_BUFFER (1 * MiB)
 #define BLOCK_COPY_MAX_MEM (128 * MiB)
 
-typedef struct BlockCopyInFlightReq {
+typedef struct BlockCopyTask {
 int64_t offset;
 int64_t bytes;
-QLIST_ENTRY(BlockCopyInFlightReq) list;
-CoQueue wait_queue; /* coroutines blocked on this request */
-} BlockCopyInFlightReq;
+QLIST_ENTRY(BlockCopyTask) list;
+CoQueue wait_queue; /* coroutines blocked on this task */
+} BlockCopyTask;
 
 typedef struct BlockCopyState {
 /*
@@ -45,7 +45,7 @@ typedef struct BlockCopyState {
 bool use_copy_range;
 int64_t copy_size;
 uint64_t len;
-QLIST_HEAD(, BlockCopyInFlightReq) inflight_reqs;
+QLIST_HEAD(, BlockCopyTask) tasks;
 
 BdrvRequestFlags write_flags;
 
@@ -73,15 +73,14 @@ typedef struct BlockCopyState {
 SharedResource *mem;
 } BlockCopyState;
 
-static BlockCopyInFlightReq *find_conflicting_inflight_req(BlockCopyState *s,
-   int64_t offset,
-   int64_t bytes)
+static BlockCopyTask *find_conflicting_task(BlockCopyState *s,
+int64_t offset, int64_t bytes)
 {
-BlockCopyInFlightReq *req;
+BlockCopyTask *t;
 
-QLIST_FOREACH(req, &s->inflight_reqs, list) {
-if (offset + bytes > req->offset && offset < req->offset + req->bytes) 
{
-return req;
+QLIST_FOREACH(t, &s->tasks, list) {
+if (offset + bytes > t->offset && offset < t->offset + t->bytes) {
+return t;
 }
 }
 
@@ -89,73 +88,73 @@ static BlockCopyInFlightReq 
*find_conflicting_inflight_req(BlockCopyState *s,
 }
 
 /*
- * If there are no intersecting requests return false. Otherwise, wait for the
- * first found intersecting request to finish and return true.
+ * If there are no intersecting tasks return false. Otherwise, wait for the
+ * first found intersecting tasks to finish and return true.
  */
 static bool coroutine_fn block_copy_wait_one(BlockCopyState *s, int64_t offset,
  int64_t bytes)
 {
-BlockCopyInFlightReq *req = find_conflicting_inflight_req(s, offset, 
bytes);
+BlockCopyTask *task = find_conflicting_task(s, offset, bytes);
 
-if (!req) {
+if (!task) {
 return false;
 }
 
-qemu_co_queue_wait(&req->wait_queue, NULL);
+qemu_co_queue_wait(&task->wait_queue, NULL);
 
 return true;
 }
 
 /* Called only on full-dirty region */
-static void block_copy_inflight_req_begin(BlockCopyState *s,
-  BlockCopyInFlightReq *req,
-  int64_t offset, int64_t bytes)
+static void block_copy_task_begin(BlockCopyState *s, BlockCopyTask *task,
+  int64_t offset, int64_t bytes)
 {
-assert(!find_conflicting_inflight_req(s, offset, bytes));
+assert(!find_conflicting_task(s, offset, bytes));
 
 bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
 s->in_flight_bytes += bytes;
 
-req->offset = offset;
-req->bytes = bytes;
-qemu_co_queue_init(&req->wait_queue);
-QLIST_INSERT_HEAD(&s->inflight_reqs, req, list);
+task->offset = offset;
+task->bytes = bytes;
+qemu_co_queue_init(&task->wait_queue);
+QLIST_INSERT_HEAD(&s->tasks, task, list);
 }
 
 /*
- * block_copy_inflight_req_shrink
+ * block_copy_task_shrink
  *
- * Drop the tail of the request to be handled later. Set dirty bits back and
- * wake up all requests waiting for us (may be some of them are not 
intersecting
- * with shrunk request)
+ * Drop the tail of the task to be handled later. Set dirty bits back and
+ * wake up all tasks waiting for us (may be some of them are not intersecting
+ * with shrunk task)
  */
-static void coroutine_fn block_copy_inflight_req_shrink(BlockCopyState *s,
-BlockCopyInFlightReq *req, int64_t new_bytes)
+static void coroutine_fn block_copy_task_shrink(BlockCopyState *s,
+BlockCopyTask *task,
+int64_t new_bytes)
 {
-if (new_bytes == req->bytes) {
+if (new_bytes == task->bytes) {
 return;
 }
 
-assert(new_bytes > 0 && new_bytes < req->bytes);
+assert(new_bytes > 0 && new_bytes < task->bytes);
 
-s->in_flight_bytes -= req->bytes - new_bytes;
+s->in_flight_bytes -= task->bytes - new_bytes;
 bdrv_set_dirty_bitmap(s->copy_bitmap,
-

[PATCH v2 0/6] block-copy: use aio-task-pool

2020-03-25 Thread Vladimir Sementsov-Ogievskiy
Hi all!

This is the next step of improving block-copy: use aio task pool.

The async copying loop has better performance than the linear one, as shown
in the original series (which was
"[RFC 00/24] backup performance: block_status + async"; hence this is
called v2)

Vladimir Sementsov-Ogievskiy (6):
  block/block-copy: rename in-flight requests to tasks
  block/block-copy: alloc task on each iteration
  block/block-copy: add state pointer to BlockCopyTask
  block/block-copy: move task size initial calculation to _task_create
  block/block-copy: move block_copy_task_create down
  block/block-copy: use aio-task-pool API

 block/block-copy.c | 250 ++---
 1 file changed, 168 insertions(+), 82 deletions(-)

-- 
2.21.0




[PATCH v2 3/6] block/block-copy: add state pointer to BlockCopyTask

2020-03-25 Thread Vladimir Sementsov-Ogievskiy
We are going to use aio-task-pool API, so we'll need state pointer in
BlockCopyTask anyway. Add it now and use where possible.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/block-copy.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index 0d9ba0..63d8468b27 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -25,6 +25,7 @@
 #define BLOCK_COPY_MAX_MEM (128 * MiB)
 
 typedef struct BlockCopyTask {
+BlockCopyState *s;
 int64_t offset;
 int64_t bytes;
 QLIST_ENTRY(BlockCopyTask) list;
@@ -116,8 +117,11 @@ static BlockCopyTask 
*block_copy_task_create(BlockCopyState *s,
 bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
 s->in_flight_bytes += bytes;
 
-task->offset = offset;
-task->bytes = bytes;
+*task = (BlockCopyTask) {
+.s = s,
+.offset = offset,
+.bytes = bytes,
+};
 qemu_co_queue_init(&task->wait_queue);
 QLIST_INSERT_HEAD(&s->tasks, task, list);
 
@@ -131,8 +135,7 @@ static BlockCopyTask *block_copy_task_create(BlockCopyState 
*s,
  * wake up all tasks waiting for us (may be some of them are not intersecting
  * with shrunk task)
  */
-static void coroutine_fn block_copy_task_shrink(BlockCopyState *s,
-BlockCopyTask *task,
+static void coroutine_fn block_copy_task_shrink(BlockCopyTask *task,
 int64_t new_bytes)
 {
 if (new_bytes == task->bytes) {
@@ -141,21 +144,20 @@ static void coroutine_fn 
block_copy_task_shrink(BlockCopyState *s,
 
 assert(new_bytes > 0 && new_bytes < task->bytes);
 
-s->in_flight_bytes -= task->bytes - new_bytes;
-bdrv_set_dirty_bitmap(s->copy_bitmap,
+task->s->in_flight_bytes -= task->bytes - new_bytes;
+bdrv_set_dirty_bitmap(task->s->copy_bitmap,
   task->offset + new_bytes, task->bytes - new_bytes);
-s->in_flight_bytes -= task->bytes - new_bytes;
+task->s->in_flight_bytes -= task->bytes - new_bytes;
 
 task->bytes = new_bytes;
 qemu_co_queue_restart_all(&task->wait_queue);
 }
 
-static void coroutine_fn block_copy_task_end(BlockCopyState *s,
- BlockCopyTask *task, int ret)
+static void coroutine_fn block_copy_task_end(BlockCopyTask *task, int ret)
 {
-s->in_flight_bytes -= task->bytes;
+task->s->in_flight_bytes -= task->bytes;
 if (ret < 0) {
-bdrv_set_dirty_bitmap(s->copy_bitmap, task->offset, task->bytes);
+bdrv_set_dirty_bitmap(task->s->copy_bitmap, task->offset, task->bytes);
 }
 QLIST_REMOVE(task, list);
 qemu_co_queue_restart_all(&task->wait_queue);
@@ -503,9 +505,9 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
 assert(ret >= 0); /* never fail */
 cur_bytes = MIN(cur_bytes, status_bytes);
-block_copy_task_shrink(s, task, cur_bytes);
+block_copy_task_shrink(task, cur_bytes);
 if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
-block_copy_task_end(s, task, 0);
+block_copy_task_end(task, 0);
 progress_set_remaining(s->progress,
bdrv_get_dirty_count(s->copy_bitmap) +
s->in_flight_bytes);
@@ -521,7 +523,7 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
  error_is_read);
 co_put_to_shres(s->mem, cur_bytes);
-block_copy_task_end(s, task, ret);
+block_copy_task_end(task, ret);
 if (ret < 0) {
 return ret;
 }
-- 
2.21.0




[PATCH v2 4/6] block/block-copy: move task size initial calculation to _task_create

2020-03-25 Thread Vladimir Sementsov-Ogievskiy
The comment "Called only on full-dirty region" without a corresponding
assertion is a very unsafe thing. Adding the assertion would mean calling
bdrv_dirty_bitmap_next_zero twice. Instead, let's move the
bdrv_dirty_bitmap_next_zero call to block_copy_task_create. This also
allows us to drop the cur_bytes variable, which partly duplicates task->bytes.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/block-copy.c | 47 --
 1 file changed, 25 insertions(+), 22 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index 63d8468b27..dd406eb4bb 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -106,12 +106,23 @@ static bool coroutine_fn 
block_copy_wait_one(BlockCopyState *s, int64_t offset,
 return true;
 }
 
-/* Called only on full-dirty region */
 static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
  int64_t offset, int64_t bytes)
 {
+int64_t next_zero;
 BlockCopyTask *task = g_new(BlockCopyTask, 1);
 
+assert(bdrv_dirty_bitmap_get(s->copy_bitmap, offset));
+
+bytes = MIN(bytes, s->copy_size);
+next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset, bytes);
+if (next_zero >= 0) {
+assert(next_zero > offset); /* offset is dirty */
+assert(next_zero < offset + bytes); /* no need to do MIN() */
+bytes = next_zero - offset;
+}
+
+/* region is dirty, so no existent tasks possible in it */
 assert(!find_conflicting_task(s, offset, bytes));
 
 bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
@@ -480,7 +491,7 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 
 while (bytes) {
 g_autofree BlockCopyTask *task = NULL;
-int64_t next_zero, cur_bytes, status_bytes;
+int64_t status_bytes;
 
 if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
 trace_block_copy_skip(s, offset);
@@ -491,21 +502,13 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 
 found_dirty = true;
 
-cur_bytes = MIN(bytes, s->copy_size);
+task = block_copy_task_create(s, offset, bytes);
 
-next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset,
-cur_bytes);
-if (next_zero >= 0) {
-assert(next_zero > offset); /* offset is dirty */
-assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
-cur_bytes = next_zero - offset;
-}
-task = block_copy_task_create(s, offset, cur_bytes);
-
-ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
+ret = block_copy_block_status(s, offset, task->bytes, &status_bytes);
 assert(ret >= 0); /* never fail */
-cur_bytes = MIN(cur_bytes, status_bytes);
-block_copy_task_shrink(task, cur_bytes);
+if (status_bytes < task->bytes) {
+block_copy_task_shrink(task, status_bytes);
+}
 if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
 block_copy_task_end(task, 0);
 progress_set_remaining(s->progress,
@@ -519,19 +522,19 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 
 trace_block_copy_process(s, offset);
 
-co_get_from_shres(s->mem, cur_bytes);
-ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
+co_get_from_shres(s->mem, task->bytes);
+ret = block_copy_do_copy(s, offset, task->bytes, ret & BDRV_BLOCK_ZERO,
  error_is_read);
-co_put_to_shres(s->mem, cur_bytes);
+co_put_to_shres(s->mem, task->bytes);
 block_copy_task_end(task, ret);
 if (ret < 0) {
 return ret;
 }
 
-progress_work_done(s->progress, cur_bytes);
-s->progress_bytes_callback(cur_bytes, s->progress_opaque);
-offset += cur_bytes;
-bytes -= cur_bytes;
+progress_work_done(s->progress, task->bytes);
+s->progress_bytes_callback(task->bytes, s->progress_opaque);
+offset += task->bytes;
+bytes -= task->bytes;
 }
 
 return found_dirty;
-- 
2.21.0




[PATCH v2 2/6] block/block-copy: alloc task on each iteration

2020-03-25 Thread Vladimir Sementsov-Ogievskiy
We are going to use the aio-task-pool API, so tasks will be handled in
parallel. We therefore need a separately allocated task on each iteration.
Introduce this logic now.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/block-copy.c | 18 +++---
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index 61d1d26991..0d9ba0 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -106,9 +106,11 @@ static bool coroutine_fn 
block_copy_wait_one(BlockCopyState *s, int64_t offset,
 }
 
 /* Called only on full-dirty region */
-static void block_copy_task_begin(BlockCopyState *s, BlockCopyTask *task,
-  int64_t offset, int64_t bytes)
+static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
+ int64_t offset, int64_t bytes)
 {
+BlockCopyTask *task = g_new(BlockCopyTask, 1);
+
 assert(!find_conflicting_task(s, offset, bytes));
 
 bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
@@ -118,6 +120,8 @@ static void block_copy_task_begin(BlockCopyState *s, 
BlockCopyTask *task,
 task->bytes = bytes;
 qemu_co_queue_init(&task->wait_queue);
 QLIST_INSERT_HEAD(&s->tasks, task, list);
+
+return task;
 }
 
 /*
@@ -473,7 +477,7 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
 
 while (bytes) {
-BlockCopyTask task;
+g_autofree BlockCopyTask *task = NULL;
 int64_t next_zero, cur_bytes, status_bytes;
 
 if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
@@ -494,14 +498,14 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 assert(next_zero < offset + cur_bytes); /* no need to do MIN() */
 cur_bytes = next_zero - offset;
 }
-block_copy_task_begin(s, &task, offset, cur_bytes);
+task = block_copy_task_create(s, offset, cur_bytes);
 
 ret = block_copy_block_status(s, offset, cur_bytes, &status_bytes);
 assert(ret >= 0); /* never fail */
 cur_bytes = MIN(cur_bytes, status_bytes);
-block_copy_task_shrink(s, &task, cur_bytes);
+block_copy_task_shrink(s, task, cur_bytes);
 if (s->skip_unallocated && !(ret & BDRV_BLOCK_ALLOCATED)) {
-block_copy_task_end(s, &task, 0);
+block_copy_task_end(s, task, 0);
 progress_set_remaining(s->progress,
bdrv_get_dirty_count(s->copy_bitmap) +
s->in_flight_bytes);
@@ -517,7 +521,7 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 ret = block_copy_do_copy(s, offset, cur_bytes, ret & BDRV_BLOCK_ZERO,
  error_is_read);
 co_put_to_shres(s->mem, cur_bytes);
-block_copy_task_end(s, &task, ret);
+block_copy_task_end(s, task, ret);
 if (ret < 0) {
 return ret;
 }
-- 
2.21.0




[PATCH v2 5/6] block/block-copy: move block_copy_task_create down

2020-03-25 Thread Vladimir Sementsov-Ogievskiy
Simple movement without any change. It's needed for the following
patch, as this function will need to use some stuff which is currently
defined below it.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/block-copy.c | 66 +++---
 1 file changed, 33 insertions(+), 33 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index dd406eb4bb..910947cb43 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -106,39 +106,6 @@ static bool coroutine_fn 
block_copy_wait_one(BlockCopyState *s, int64_t offset,
 return true;
 }
 
-static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
- int64_t offset, int64_t bytes)
-{
-int64_t next_zero;
-BlockCopyTask *task = g_new(BlockCopyTask, 1);
-
-assert(bdrv_dirty_bitmap_get(s->copy_bitmap, offset));
-
-bytes = MIN(bytes, s->copy_size);
-next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset, bytes);
-if (next_zero >= 0) {
-assert(next_zero > offset); /* offset is dirty */
-assert(next_zero < offset + bytes); /* no need to do MIN() */
-bytes = next_zero - offset;
-}
-
-/* region is dirty, so no existent tasks possible in it */
-assert(!find_conflicting_task(s, offset, bytes));
-
-bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
-s->in_flight_bytes += bytes;
-
-*task = (BlockCopyTask) {
-.s = s,
-.offset = offset,
-.bytes = bytes,
-};
-qemu_co_queue_init(&task->wait_queue);
-QLIST_INSERT_HEAD(&s->tasks, task, list);
-
-return task;
-}
-
 /*
  * block_copy_task_shrink
  *
@@ -361,6 +328,39 @@ out:
 return ret;
 }
 
+static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
+ int64_t offset, int64_t bytes)
+{
+int64_t next_zero;
+BlockCopyTask *task = g_new(BlockCopyTask, 1);
+
+assert(bdrv_dirty_bitmap_get(s->copy_bitmap, offset));
+
+bytes = MIN(bytes, s->copy_size);
+next_zero = bdrv_dirty_bitmap_next_zero(s->copy_bitmap, offset, bytes);
+if (next_zero >= 0) {
+assert(next_zero > offset); /* offset is dirty */
+assert(next_zero < offset + bytes); /* no need to do MIN() */
+bytes = next_zero - offset;
+}
+
+/* region is dirty, so no existent tasks possible in it */
+assert(!find_conflicting_task(s, offset, bytes));
+
+bdrv_reset_dirty_bitmap(s->copy_bitmap, offset, bytes);
+s->in_flight_bytes += bytes;
+
+*task = (BlockCopyTask) {
+.s = s,
+.offset = offset,
+.bytes = bytes,
+};
+qemu_co_queue_init(&task->wait_queue);
+QLIST_INSERT_HEAD(&s->tasks, task, list);
+
+return task;
+}
+
 static int block_copy_block_status(BlockCopyState *s, int64_t offset,
int64_t bytes, int64_t *pnum)
 {
-- 
2.21.0




[PATCH v2 6/6] block/block-copy: use aio-task-pool API

2020-03-25 Thread Vladimir Sementsov-Ogievskiy
Run block_copy iterations in parallel in aio tasks.

Changes:
  - BlockCopyTask becomes aio task structure. Add zeroes field to pass
it to block_copy_do_copy
  - add call state - it's a state of one call of block_copy(), shared
between parallel tasks. For now used only to keep information about
first error: is it read or not.
  - convert block_copy_dirty_clusters to aio-task loop.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/block-copy.c | 104 +++--
 1 file changed, 91 insertions(+), 13 deletions(-)

diff --git a/block/block-copy.c b/block/block-copy.c
index 910947cb43..9994598eb7 100644
--- a/block/block-copy.c
+++ b/block/block-copy.c
@@ -19,15 +19,27 @@
 #include "block/block-copy.h"
 #include "sysemu/block-backend.h"
 #include "qemu/units.h"
+#include "qemu/coroutine.h"
+#include "block/aio_task.h"
 
 #define BLOCK_COPY_MAX_COPY_RANGE (16 * MiB)
 #define BLOCK_COPY_MAX_BUFFER (1 * MiB)
 #define BLOCK_COPY_MAX_MEM (128 * MiB)
+#define BLOCK_COPY_MAX_WORKERS 64
+
+typedef struct BlockCopyCallState {
+bool failed;
+bool error_is_read;
+} BlockCopyCallState;
 
 typedef struct BlockCopyTask {
+AioTask task;
+
 BlockCopyState *s;
+BlockCopyCallState *call_state;
 int64_t offset;
 int64_t bytes;
+bool zeroes;
 QLIST_ENTRY(BlockCopyTask) list;
 CoQueue wait_queue; /* coroutines blocked on this task */
 } BlockCopyTask;
@@ -225,6 +237,30 @@ void block_copy_set_progress_meter(BlockCopyState *s, 
ProgressMeter *pm)
 s->progress = pm;
 }
 
+/* Takes ownership on @task */
+static coroutine_fn int block_copy_task_run(AioTaskPool *pool,
+BlockCopyTask *task)
+{
+if (!pool) {
+int ret = task->task.func(&task->task);
+
+g_free(task);
+return ret;
+}
+
+aio_task_pool_wait_slot(pool);
+if (aio_task_pool_status(pool) < 0) {
+co_put_to_shres(task->s->mem, task->bytes);
+block_copy_task_end(task, -EAGAIN);
+g_free(task);
+return aio_task_pool_status(pool);
+}
+
+aio_task_pool_start_task(pool, &task->task);
+
+return 0;
+}
+
 /*
  * block_copy_do_copy
  *
@@ -328,8 +364,32 @@ out:
 return ret;
 }
 
+static coroutine_fn int block_copy_task_entry(AioTask *task)
+{
+BlockCopyTask *t = container_of(task, BlockCopyTask, task);
+bool error_is_read;
+int ret;
+
+ret = block_copy_do_copy(t->s, t->offset, t->bytes, t->zeroes,
+ &error_is_read);
+if (ret < 0 && !t->call_state->failed) {
+t->call_state->failed = true;
+t->call_state->error_is_read = error_is_read;
+} else {
+progress_work_done(t->s->progress, t->bytes);
+t->s->progress_bytes_callback(t->bytes, t->s->progress_opaque);
+}
+co_put_to_shres(t->s->mem, t->bytes);
+block_copy_task_end(t, ret);
+
+return ret;
+}
+
+/* Called only on full-dirty region */
 static BlockCopyTask *block_copy_task_create(BlockCopyState *s,
- int64_t offset, int64_t bytes)
+ BlockCopyCallState *call_state,
+ int64_t offset,
+ int64_t bytes)
 {
 int64_t next_zero;
 BlockCopyTask *task = g_new(BlockCopyTask, 1);
@@ -351,7 +411,9 @@ static BlockCopyTask *block_copy_task_create(BlockCopyState 
*s,
 s->in_flight_bytes += bytes;
 
 *task = (BlockCopyTask) {
+.task.func = block_copy_task_entry,
 .s = s,
+.call_state = call_state,
 .offset = offset,
 .bytes = bytes,
 };
@@ -478,6 +540,8 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 {
 int ret = 0;
 bool found_dirty = false;
+AioTaskPool *aio = NULL;
+BlockCopyCallState call_state = {false, false};
 
 /*
  * block_copy() user is responsible for keeping source and target in same
@@ -489,8 +553,8 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 assert(QEMU_IS_ALIGNED(offset, s->cluster_size));
 assert(QEMU_IS_ALIGNED(bytes, s->cluster_size));
 
-while (bytes) {
-g_autofree BlockCopyTask *task = NULL;
+while (bytes && aio_task_pool_status(aio) == 0) {
+BlockCopyTask *task;
 int64_t status_bytes;
 
 if (!bdrv_dirty_bitmap_get(s->copy_bitmap, offset)) {
@@ -502,7 +566,7 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 
 found_dirty = true;
 
-task = block_copy_task_create(s, offset, bytes);
+task = block_copy_task_create(s, &call_state, offset, bytes);
 
 ret = block_copy_block_status(s, offset, task->bytes, &status_bytes);
 assert(ret >= 0); /* never fail */
@@ -511,6 +575,7 @@ static int coroutine_fn 
block_copy_dirty_clusters(BlockCopyState *s,
 }
 if (s->skip_unallocated && !(ret & BDRV_BLOCK

Re: [PATCH v2 3/4] qcow2: Avoid feature name extension on small cluster size

2020-03-25 Thread Max Reitz
On 25.03.20 14:18, Eric Blake wrote:
> On 3/25/20 7:42 AM, Max Reitz wrote:
>> On 24.03.20 18:42, Eric Blake wrote:
>>> As the feature name table can be quite large (over 9k if all 64 bits
>>> of all three feature fields have names; a mere 8 features leaves only
>>> 8 bytes for a backing file name in a 512-byte cluster), it is unwise
>>> to emit this optional header in images with small cluster sizes.
>>>
>>> Update iotest 036 to skip running on small cluster sizes; meanwhile,
>>> note that iotest 061 never passed on alternative cluster sizes
>>> (however, I limited this patch to tests with output affected by adding
>>> feature names, rather than auditing for other tests that are not
>>> robust to alternative cluster sizes).
>>
> 
>>> -    /* Feature table */
>>> -    if (s->qcow_version >= 3) {
>>> +    /*
>>> + * Feature table.  A mere 8 feature names occupies 392 bytes, and
>>> + * when coupled with the v3 minimum header of 104 bytes plus the
>>> + * 8-byte end-of-extension marker, that would leave only 8 bytes
>>> + * for a backing file name in an image with 512-byte clusters.
>>> + * Thus, we choose to omit this header for cluster sizes 4k and
>>> + * smaller.
>>
>> Can’t we do this decision more dynamically?  Like, always omit it when
>> cluster_size - sizeof(features) < 2k/3k/...?
>>
>> Max
>>
>>> + */
>>> +    if (s->qcow_version >= 3 && s->cluster_size > 4096) {
> 
> Yes.  But when you consider that sizeof(features) is a compile-time
> constant, it isn't really dynamic for a given qemu release, but rather a
> different way to spell things; about the only thing it would buy us is
> that our cutoff window for what cluster size no longer gets the header
> may automatically shift from 2k to 4k to 8k as future qemu versions add
> more qcow2 features.

Yes.

> If we want to write it like that, which size limit
> do you propose?  Or asked differently, how much space should we reserve
> for other extension headers + backing file name?

Well, that was the “2k/3k/...” list. :)

The backing file name is limited to 1k, so I suppose that plus 1k for a
potential external data filename, plus 1k for the rest, so 3k in total?

Now that I look into the spec, I see that it actually doesn’t say that
the backing filename has to be part of the header cluster.  But, well.

It also only says that the image header must be part of the first
cluster, which in my opinion doesn’t necessarily include its extensions.
 So header extensions (and the backing filename) could actually be in
consecutive clusters.  But that of course would be much more difficult
to implement.

Max



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v2 3/4] qcow2: Avoid feature name extension on small cluster size

2020-03-25 Thread Eric Blake

On 3/25/20 8:52 AM, Max Reitz wrote:


If we want to write it like that, which size limit
do you propose?  Or asked differently, how much space should we reserve
for other extension headers + backing file name?


Well, that was the “2k/3k/...” list. :)

The backing file name is limited to 1k, so I suppose that plus 1k for a
potential external data filename, plus 1k for the rest, so 3k in total?

Now that I look into the spec, I see that it actually doesn’t say that
the backing filename has to be part of the header cluster.  But, well.


qemu enforces that the header is one cluster.  But you're right, that 
does not appear to directly be a limitation mandated by the spec, and we 
could relax qemu to allow the header to be several consecutive clusters. 
 The tricky part, however, is that the backing file name is NOT 
described by a header extension, but rather is just whatever bytes occur 
after the final header extension.  There's no clear indication anywhere 
on when to stop reading those bytes, other than by an implicit limit 
such as insisting those bytes fall within the first cluster.


Had we been smarter when designing v3, we would have made the backing 
file name a header extension (in fact, it would have been possible to 
design the additional fields of v3 to look like an unknown header 
extension when parsed by a v2 binary) - but hindsight is 20/20.




It also only says that the image header must be part of the first
cluster, which in my opinion doesn’t necessarily include its extensions.
  So header extensions (and the backing filename) could actually be in
consecutive clusters.  But that of course would be much more difficult
to implement.


We'd still want a sane limit even for small-cluster images (maybe "no 
more than 2M of header information, regardless of cluster size"); or 
maybe even introduce a NEW header field and/or extension to make it 
obvious how many clusters are being used for the purpose of the metadata 
header in this particular image (with sane fallbacks for when that 
extension is not present).  But you're right - it comes with complexity. 
 This patch as written is safe for 5.0-rc1, but this discussion about 
teaching qemu to permit headers larger than 1 cluster is squarely in the 
5.1 category, if at all.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH-for-5.0 v2 4/4] sheepdog: Consistently set bdrv_has_zero_init_truncate

2020-03-25 Thread Philippe Mathieu-Daudé

On 3/24/20 6:42 PM, Eric Blake wrote:

block_int.h claims that .bdrv_has_zero_init must return 0 if
.bdrv_has_zero_init_truncate does likewise; but this is violated if
only the former callback is provided if .bdrv_co_truncate also exists.
When adding the latter callback, it was mistakenly added to only one
of the three possible sheepdog instantiations.

Fixes: 1dcaf527
Signed-off-by: Eric Blake 
---
  block/sheepdog.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index cfa84338a2d6..522c16a93676 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -3269,6 +3269,7 @@ static BlockDriver bdrv_sheepdog_tcp = {
  .bdrv_co_create   = sd_co_create,
  .bdrv_co_create_opts  = sd_co_create_opts,
  .bdrv_has_zero_init   = bdrv_has_zero_init_1,
+.bdrv_has_zero_init_truncate  = bdrv_has_zero_init_1,
  .bdrv_getlength   = sd_getlength,
  .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
  .bdrv_co_truncate = sd_co_truncate,
@@ -3307,6 +3308,7 @@ static BlockDriver bdrv_sheepdog_unix = {
  .bdrv_co_create   = sd_co_create,
  .bdrv_co_create_opts  = sd_co_create_opts,
  .bdrv_has_zero_init   = bdrv_has_zero_init_1,
+.bdrv_has_zero_init_truncate  = bdrv_has_zero_init_1,
  .bdrv_getlength   = sd_getlength,
  .bdrv_get_allocated_file_size = sd_get_allocated_file_size,
  .bdrv_co_truncate = sd_co_truncate,



Reviewed-by: Philippe Mathieu-Daudé 




[PATCH] backup: don't acquire aio_context in backup_clean

2020-03-25 Thread Stefan Reiter
backup_clean is only ever called as a handler via job_exit, which
already acquires the job's context. The job's context is guaranteed to
be the same as the one used by backup_top via backup_job_create.

Since the previous logic effectively acquired the lock twice, this
broke cleanup of backups for disks using IO threads, since the BDRV_POLL_WHILE
in bdrv_backup_top_drop -> bdrv_do_drained_begin would only release the lock
once, thus deadlocking with the IO thread.

Signed-off-by: Stefan Reiter 
---

This is a fix for the issue discussed in this part of the thread:
https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg07639.html
...not the original problem (core dump) posted by Dietmar.

I've still seen it occasionally hang during a backup abort. I'm trying to figure
out why that happens; the stack trace indicates a similar problem with the main
thread hanging at bdrv_do_drained_begin, though I have no clue why as of yet.

 block/backup.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 7430ca5883..a7a7dcaf4c 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -126,11 +126,7 @@ static void backup_abort(Job *job)
 static void backup_clean(Job *job)
 {
 BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
-AioContext *aio_context = bdrv_get_aio_context(s->backup_top);
-
-aio_context_acquire(aio_context);
 bdrv_backup_top_drop(s->backup_top);
-aio_context_release(aio_context);
 }
 
 void backup_do_checkpoint(BlockJob *job, Error **errp)
-- 
2.25.2





test-aio failure with liburing

2020-03-25 Thread Cole Robinson
Using qemu.git master with liburing-devel installed. 100% reproducible
test failure for me

$ uname -r
5.6.0-0.rc5.git0.2.fc32.x86_64
$ rpm -q liburing
liburing-0.5-1.fc32.x86_64

$ ./tests/test-aio
# random seed: R02S70cd9b447cc815ed3194d31e97371542
1..28
# Start of aio tests
ok 1 /aio/acquire
ok 2 /aio/external-client
# Start of bh tests
ok 3 /aio/bh/schedule
ok 4 /aio/bh/schedule10
ok 5 /aio/bh/cancel
ok 6 /aio/bh/delete
ok 7 /aio/bh/flush
# Start of callback-delete tests
ok 8 /aio/bh/callback-delete/one
ok 9 /aio/bh/callback-delete/many
# End of callback-delete tests
# End of bh tests
# Start of event tests
ok 10 /aio/event/add-remove
ok 11 /aio/event/wait
**
ERROR:tests/test-aio.c:374:test_flush_event_notifier: assertion failed:
(aio_poll(ctx, false))
Bail out! ERROR:tests/test-aio.c:374:test_flush_event_notifier:
assertion failed: (aio_poll(ctx, false))
Aborted (core dumped)


Configuring with --disable-linux-io-uring makes the failure go away

I saw this patch about another uring test-aio failure, but that patch is
already in git master, and the failure is different:
https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg06299.html


Thanks,
Cole




[PATCH 1/2] Revert "mirror: Don't let an operation wait for itself"

2020-03-25 Thread Kevin Wolf
This reverts commit 7e6c4ff792734e196c8ca82564c56b5e7c6288ca.

The fix was incomplete as it only protected against requests waiting for
themselves, but not against requests waiting for each other. We need a
different solution.

Signed-off-by: Kevin Wolf 
---
 block/mirror.c | 21 +
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 447051dbc6..393131b135 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -283,14 +283,11 @@ static int mirror_cow_align(MirrorBlockJob *s, int64_t 
*offset,
 }
 
 static inline void coroutine_fn
-mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp *self, bool active)
+mirror_wait_for_any_operation(MirrorBlockJob *s, bool active)
 {
 MirrorOp *op;
 
 QTAILQ_FOREACH(op, &s->ops_in_flight, next) {
-if (self == op) {
-continue;
-}
 /* Do not wait on pseudo ops, because it may in turn wait on
  * some other operation to start, which may in fact be the
  * caller of this function.  Since there is only one pseudo op
@@ -305,10 +302,10 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, MirrorOp 
*self, bool active)
 }
 
 static inline void coroutine_fn
-mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s, MirrorOp *self)
+mirror_wait_for_free_in_flight_slot(MirrorBlockJob *s)
 {
 /* Only non-active operations use up in-flight slots */
-mirror_wait_for_any_operation(s, self, false);
+mirror_wait_for_any_operation(s, false);
 }
 
 /* Perform a mirror copy operation.
@@ -351,7 +348,7 @@ static void coroutine_fn mirror_co_read(void *opaque)
 
 while (s->buf_free_count < nb_chunks) {
 trace_mirror_yield_in_flight(s, op->offset, s->in_flight);
-mirror_wait_for_free_in_flight_slot(s, op);
+mirror_wait_for_free_in_flight_slot(s);
 }
 
 /* Now make a QEMUIOVector taking enough granularity-sized chunks
@@ -558,7 +555,7 @@ static uint64_t coroutine_fn 
mirror_iteration(MirrorBlockJob *s)
 
 while (s->in_flight >= MAX_IN_FLIGHT) {
 trace_mirror_yield_in_flight(s, offset, s->in_flight);
-mirror_wait_for_free_in_flight_slot(s, pseudo_op);
+mirror_wait_for_free_in_flight_slot(s);
 }
 
 if (s->ret < 0) {
@@ -612,7 +609,7 @@ static void mirror_free_init(MirrorBlockJob *s)
 static void coroutine_fn mirror_wait_for_all_io(MirrorBlockJob *s)
 {
 while (s->in_flight > 0) {
-mirror_wait_for_free_in_flight_slot(s, NULL);
+mirror_wait_for_free_in_flight_slot(s);
 }
 }
 
@@ -809,7 +806,7 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
 if (s->in_flight >= MAX_IN_FLIGHT) {
 trace_mirror_yield(s, UINT64_MAX, s->buf_free_count,
s->in_flight);
-mirror_wait_for_free_in_flight_slot(s, NULL);
+mirror_wait_for_free_in_flight_slot(s);
 continue;
 }
 
@@ -962,7 +959,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
 /* Do not start passive operations while there are active
  * writes in progress */
 while (s->in_active_write_counter) {
-mirror_wait_for_any_operation(s, NULL, true);
+mirror_wait_for_any_operation(s, true);
 }
 
 if (s->ret < 0) {
@@ -988,7 +985,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
 if (s->in_flight >= MAX_IN_FLIGHT || s->buf_free_count == 0 ||
 (cnt == 0 && s->in_flight > 0)) {
 trace_mirror_yield(s, cnt, s->buf_free_count, s->in_flight);
-mirror_wait_for_free_in_flight_slot(s, NULL);
+mirror_wait_for_free_in_flight_slot(s);
 continue;
 } else if (cnt != 0) {
 delay_ns = mirror_iteration(s);
-- 
2.20.1




[PATCH 2/2] mirror: Wait only for in-flight operations

2020-03-25 Thread Kevin Wolf
mirror_wait_for_free_in_flight_slot() just picks a random operation to
wait for. However, a MirrorOp is already in s->ops_in_flight when
mirror_co_read() waits for free slots, so if not enough slots are
immediately available, an operation can end up waiting for itself, or
two or more operations can wait for each other to complete, which
results in a hang.

Fix this by adding a flag to MirrorOp that tells us if the request is
already in flight (and therefore occupies slots that it will later
free), and picking only such operations for waiting.

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692
Signed-off-by: Kevin Wolf 
---
 block/mirror.c | 8 +++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/mirror.c b/block/mirror.c
index 393131b135..7fef52ded2 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -102,6 +102,7 @@ struct MirrorOp {
 
 bool is_pseudo_op;
 bool is_active_write;
+bool is_in_flight;
 CoQueue waiting_requests;
 Coroutine *co;
 
@@ -293,7 +294,9 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool 
active)
  * caller of this function.  Since there is only one pseudo op
  * at any given time, we will always find some real operation
  * to wait on. */
-if (!op->is_pseudo_op && op->is_active_write == active) {
+if (!op->is_pseudo_op && op->is_in_flight &&
+op->is_active_write == active)
+{
 qemu_co_queue_wait(&op->waiting_requests, NULL);
 return;
 }
@@ -367,6 +370,7 @@ static void coroutine_fn mirror_co_read(void *opaque)
 /* Copy the dirty cluster.  */
 s->in_flight++;
 s->bytes_in_flight += op->bytes;
+op->is_in_flight = true;
 trace_mirror_one_iteration(s, op->offset, op->bytes);
 
 ret = bdrv_co_preadv(s->mirror_top_bs->backing, op->offset, op->bytes,
@@ -382,6 +386,7 @@ static void coroutine_fn mirror_co_zero(void *opaque)
 op->s->in_flight++;
 op->s->bytes_in_flight += op->bytes;
 *op->bytes_handled = op->bytes;
+op->is_in_flight = true;
 
 ret = blk_co_pwrite_zeroes(op->s->target, op->offset, op->bytes,
op->s->unmap ? BDRV_REQ_MAY_UNMAP : 0);
@@ -396,6 +401,7 @@ static void coroutine_fn mirror_co_discard(void *opaque)
 op->s->in_flight++;
 op->s->bytes_in_flight += op->bytes;
 *op->bytes_handled = op->bytes;
+op->is_in_flight = true;
 
 ret = blk_co_pdiscard(op->s->target, op->offset, op->bytes);
 mirror_write_complete(op, ret);
-- 
2.20.1




[PATCH 0/2] mirror: Fix hang (operation waiting for itself/circular dependency)

2020-03-25 Thread Kevin Wolf
The recent fix didn't actually fix the whole problem. Not only can an
operation end up waiting for itself; two or more operations can also
end up in a circular dependency, waiting for each other to complete.

This reverts the first fix and implements another approach.

Kevin Wolf (2):
  Revert "mirror: Don't let an operation wait for itself"
  mirror: Wait only for in-flight operations

 block/mirror.c | 29 -
 1 file changed, 16 insertions(+), 13 deletions(-)

-- 
2.20.1




Re: [PATCH 1/2] Revert "mirror: Don't let an operation wait for itself"

2020-03-25 Thread Eric Blake

On 3/25/20 12:23 PM, Kevin Wolf wrote:

This reverts commit 7e6c4ff792734e196c8ca82564c56b5e7c6288ca.

The fix was incomplete as it only protected against requests waiting for
themselves, but not against requests waiting for each other. We need a
different solution.

Signed-off-by: Kevin Wolf 
---
  block/mirror.c | 21 +
  1 file changed, 9 insertions(+), 12 deletions(-)


Reviewed-by: Eric Blake 


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 2/2] mirror: Wait only for in-flight operations

2020-03-25 Thread Eric Blake

On 3/25/20 12:23 PM, Kevin Wolf wrote:

mirror_wait_for_free_in_flight_slot() just picks a random operation to
wait for. However, a MirrorOp is already in s->ops_in_flight when
mirror_co_read() waits for free slots, so if not enough slots are
immediately available, an operation can end up waiting for itself, or
two or more operations can wait for each other to complete, which
results in a hang.

Fix this by adding a flag to MirrorOp that tells us if the request is
already in flight (and therefore occupies slots that it will later
free), and picking only such operations for waiting.

Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1794692
Signed-off-by: Kevin Wolf 
---
  block/mirror.c | 8 +++-
  1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/block/mirror.c b/block/mirror.c
index 393131b135..7fef52ded2 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -102,6 +102,7 @@ struct MirrorOp {
  
  bool is_pseudo_op;

  bool is_active_write;
+bool is_in_flight;
  CoQueue waiting_requests;
  Coroutine *co;
  
@@ -293,7 +294,9 @@ mirror_wait_for_any_operation(MirrorBlockJob *s, bool active)

   * caller of this function.  Since there is only one pseudo op
   * at any given time, we will always find some real operation
   * to wait on. */
-if (!op->is_pseudo_op && op->is_active_write == active) {
+if (!op->is_pseudo_op && op->is_in_flight &&
+op->is_active_write == active)
+{
  qemu_co_queue_wait(&op->waiting_requests, NULL);


Looks like a one-way transition - op->is_in_flight always starts as 
false, and only ever gets set to true (once the op is finished, op is no 
longer needed).  And being more selective on what you wait for here does 
look like it should work in more cases than what patch 1 reverted.


Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH 0/2] mirror: Fix hang (operation waiting for itself/circular dependency)

2020-03-25 Thread Kevin Wolf
Am 25.03.2020 um 18:23 hat Kevin Wolf geschrieben:
> The recent fix didn't actually fix the whole problem. Operations can't
> only wait for themselves, but we can also end up with circular
> dependencies like two operations waiting for each other to complete.
> 
> This reverts the first fix and implements another approach.

Hm, somehow this seems to break iotests 151. I don't actually understand
the backtrace, because that's during job initialisation, so my changes
shouldn't have had any effect yet:

(gdb) bt
#0  0x7fba6d85057f in raise () at /lib64/libc.so.6
#1  0x7fba6d83a895 in abort () at /lib64/libc.so.6
#2  0x5624d94d109a in bitmap_new (nbits=) at 
/home/kwolf/source/qemu/include/qemu/bitmap.h:103
#3  0x5624d94d109a in mirror_run (job=0x5624dc8d5560, errp=) 
at block/mirror.c:922
#4  0x5624d988053f in job_co_entry (opaque=0x5624dc8d5560) at job.c:878
#5  0x5624d998d3bb in coroutine_trampoline (i0=, 
i1=) at util/coroutine-ucontext.c:115
#6  0x7fba6d866250 in __start_context () at /lib64/libc.so.6
#7  0x7fffa2d48130 in  ()
#8  0x in  ()

Something to check tomorrow.

Kevin




[PATCH 1/2] block: pass BlockDriver reference to the .bdrv_co_create

2020-03-25 Thread Maxim Levitsky
This will allow to reuse a single generic .bdrv_co_create
implementation for several drivers.
No functional changes.

Signed-off-by: Maxim Levitsky 
---
 block.c   | 3 ++-
 block/crypto.c| 3 ++-
 block/file-posix.c| 4 +++-
 block/file-win32.c| 4 +++-
 block/gluster.c   | 3 ++-
 block/nfs.c   | 4 +++-
 block/parallels.c | 3 ++-
 block/qcow.c  | 3 ++-
 block/qcow2.c | 4 +++-
 block/qed.c   | 3 ++-
 block/raw-format.c| 4 +++-
 block/rbd.c   | 3 ++-
 block/sheepdog.c  | 4 +++-
 block/ssh.c   | 4 +++-
 block/vdi.c   | 4 +++-
 block/vhdx.c  | 3 ++-
 block/vmdk.c  | 4 +++-
 block/vpc.c   | 6 --
 include/block/block_int.h | 3 ++-
 19 files changed, 49 insertions(+), 20 deletions(-)

diff --git a/block.c b/block.c
index cccae5add9..ff23e20443 100644
--- a/block.c
+++ b/block.c
@@ -483,7 +483,8 @@ static void coroutine_fn bdrv_create_co_entry(void *opaque)
 CreateCo *cco = opaque;
 assert(cco->drv);
 
-ret = cco->drv->bdrv_co_create_opts(cco->filename, cco->opts, &local_err);
+ret = cco->drv->bdrv_co_create_opts(cco->drv,
+cco->filename, cco->opts, &local_err);
 error_propagate(&cco->err, local_err);
 cco->ret = ret;
 }
diff --git a/block/crypto.c b/block/crypto.c
index 4425ebeb47..d577f89659 100644
--- a/block/crypto.c
+++ b/block/crypto.c
@@ -601,7 +601,8 @@ fail:
 return ret;
 }
 
-static int coroutine_fn block_crypto_co_create_opts_luks(const char *filename,
+static int coroutine_fn block_crypto_co_create_opts_luks(BlockDriver *drv,
+ const char *filename,
  QemuOpts *opts,
  Error **errp)
 {
diff --git a/block/file-posix.c b/block/file-posix.c
index 9bc3838b2a..65bc980bc4 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -2405,7 +2405,9 @@ out:
 return result;
 }
 
-static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts 
*opts,
+static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
+   const char *filename,
+   QemuOpts *opts,
Error **errp)
 {
 BlockdevCreateOptions options;
diff --git a/block/file-win32.c b/block/file-win32.c
index 77e8ff7b68..15859839a1 100644
--- a/block/file-win32.c
+++ b/block/file-win32.c
@@ -588,7 +588,9 @@ static int raw_co_create(BlockdevCreateOptions *options, 
Error **errp)
 return 0;
 }
 
-static int coroutine_fn raw_co_create_opts(const char *filename, QemuOpts 
*opts,
+static int coroutine_fn raw_co_create_opts(BlockDriver *drv,
+   const char *filename,
+   QemuOpts *opts,
Error **errp)
 {
 BlockdevCreateOptions options;
diff --git a/block/gluster.c b/block/gluster.c
index 4fa4a77a47..0aa1f2cda4 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -1130,7 +1130,8 @@ out:
 return ret;
 }
 
-static int coroutine_fn qemu_gluster_co_create_opts(const char *filename,
+static int coroutine_fn qemu_gluster_co_create_opts(BlockDriver *drv,
+const char *filename,
 QemuOpts *opts,
 Error **errp)
 {
diff --git a/block/nfs.c b/block/nfs.c
index 9a6311e270..cc2413d5ab 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -662,7 +662,9 @@ out:
 return ret;
 }
 
-static int coroutine_fn nfs_file_co_create_opts(const char *url, QemuOpts 
*opts,
+static int coroutine_fn nfs_file_co_create_opts(BlockDriver *drv,
+const char *url,
+QemuOpts *opts,
 Error **errp)
 {
 BlockdevCreateOptions *create_options;
diff --git a/block/parallels.c b/block/parallels.c
index 7a01997659..6d4ed77f16 100644
--- a/block/parallels.c
+++ b/block/parallels.c
@@ -609,7 +609,8 @@ exit:
 goto out;
 }
 
-static int coroutine_fn parallels_co_create_opts(const char *filename,
+static int coroutine_fn parallels_co_create_opts(BlockDriver *drv,
+ const char *filename,
  QemuOpts *opts,
  Error **errp)
 {
diff --git a/block/qcow.c b/block/qcow.c
index fce8989868..8973e4e565 100644
--- a/block/qcow.c
+++ b/block/qcow.c
@@ -934,7 +934,8 @@ exit:
 return ret;
 }
 
-static int coroutine_fn qcow_co_create_opts(const char *filename,
+static int coroutine_fn qcow_co

[PATCH 0/2] Fix the generic image creation code

2020-03-25 Thread Maxim Levitsky
The recent patches from Max Reitz allowed some block drivers to not
provide the .bdrv_co_create_opts and still allow qemu-img to
create/format images as long as the image already exists
(that is the case with various block storage drivers like nbd/iscsi/nvme, etc)

However, it was found out that some places in the code depend on
.bdrv_co_create_opts/.create_opts being != NULL to decide whether to
allow image creation.

To avoid adding fallback code to all these places, just make the generic
fallback code be used by the drivers that need it, so that for an outside
user there is no difference whether the fallback was used or not.

Best regards,
Maxim Levitsky

Maxim Levitsky (2):
  block: pass BlockDriver reference to the .bdrv_co_create
  block: trickle down the fallback image creation function use to the
block drivers

 block.c   | 38 ++
 block/crypto.c|  3 ++-
 block/file-posix.c| 11 +--
 block/file-win32.c|  4 +++-
 block/gluster.c   |  3 ++-
 block/iscsi.c | 16 
 block/nbd.c   |  6 ++
 block/nfs.c   |  4 +++-
 block/nvme.c  |  3 +++
 block/parallels.c |  3 ++-
 block/qcow.c  |  3 ++-
 block/qcow2.c |  4 +++-
 block/qed.c   |  3 ++-
 block/raw-format.c|  4 +++-
 block/rbd.c   |  3 ++-
 block/sheepdog.c  |  4 +++-
 block/ssh.c   |  4 +++-
 block/vdi.c   |  4 +++-
 block/vhdx.c  |  3 ++-
 block/vmdk.c  |  4 +++-
 block/vpc.c   |  6 --
 include/block/block.h |  7 +++
 include/block/block_int.h |  3 ++-
 23 files changed, 95 insertions(+), 48 deletions(-)

-- 
2.17.2




[PATCH 2/2] block: trickle down the fallback image creation function use to the block drivers

2020-03-25 Thread Maxim Levitsky
Instead of checking .bdrv_co_create_opts to see if we need the fallback,
just implement .bdrv_co_create_opts in the drivers that need it.

This way we don't break various places that need to know if the underlying
protocol/format really supports image creation, and this way we still
allow some drivers to not support image creation.

Fixes: fd17146cd93d1704cd96d7c2757b325fc7aac6fd
Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1816007

Note that technically this patch reverts the image creation fallback
for the vxhs driver, since I don't have a means to test it,
and IMHO it is better to leave it unsupported, as it was prior to the
generic image creation patches.

Also drop iscsi_create_opts, which was accidentally left behind

Signed-off-by: Maxim Levitsky 
---
 block.c   | 35 ---
 block/file-posix.c|  7 ++-
 block/iscsi.c | 16 
 block/nbd.c   |  6 ++
 block/nvme.c  |  3 +++
 include/block/block.h |  7 +++
 6 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/block.c b/block.c
index ff23e20443..72fdf56af7 100644
--- a/block.c
+++ b/block.c
@@ -598,8 +598,15 @@ static int 
create_file_fallback_zero_first_sector(BlockBackend *blk,
 return 0;
 }
 
-static int bdrv_create_file_fallback(const char *filename, BlockDriver *drv,
- QemuOpts *opts, Error **errp)
+/**
+ * Simple implementation of bdrv_co_create_opts for protocol drivers
+ * which only support creation via opening a file
+ * (usually existing raw storage device)
+ */
+int coroutine_fn bdrv_co_create_opts_simple(BlockDriver *drv,
+   const char *filename,
+   QemuOpts *opts,
+   Error **errp)
 {
 BlockBackend *blk;
 QDict *options;
@@ -663,11 +670,7 @@ int bdrv_create_file(const char *filename, QemuOpts *opts, 
Error **errp)
 return -ENOENT;
 }
 
-if (drv->bdrv_co_create_opts) {
-return bdrv_create(drv, filename, opts, errp);
-} else {
-return bdrv_create_file_fallback(filename, drv, opts, errp);
-}
+return bdrv_create(drv, filename, opts, errp);
 }
 
 int coroutine_fn bdrv_co_delete_file(BlockDriverState *bs, Error **errp)
@@ -1592,9 +1595,9 @@ QemuOptsList bdrv_runtime_opts = {
 },
 };
 
-static QemuOptsList fallback_create_opts = {
-.name = "fallback-create-opts",
-.head = QTAILQ_HEAD_INITIALIZER(fallback_create_opts.head),
+QemuOptsList bdrv_create_opts_simple = {
+.name = "simple-create-opts",
+.head = QTAILQ_HEAD_INITIALIZER(bdrv_create_opts_simple.head),
 .desc = {
 {
 .name = BLOCK_OPT_SIZE,
@@ -5963,13 +5966,15 @@ void bdrv_img_create(const char *filename, const char 
*fmt,
 return;
 }
 
+if (!proto_drv->create_opts) {
+error_setg(errp, "Protocol driver '%s' does not support image 
creation",
+   proto_drv->format_name);
+return;
+}
+
 /* Create parameter list */
 create_opts = qemu_opts_append(create_opts, drv->create_opts);
-if (proto_drv->create_opts) {
-create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
-} else {
-create_opts = qemu_opts_append(create_opts, &fallback_create_opts);
-}
+create_opts = qemu_opts_append(create_opts, proto_drv->create_opts);
 
 opts = qemu_opts_create(create_opts, NULL, 0, &error_abort);
 
diff --git a/block/file-posix.c b/block/file-posix.c
index 65bc980bc4..7e19bbff5f 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -3513,6 +3513,8 @@ static BlockDriver bdrv_host_device = {
 .bdrv_reopen_prepare = raw_reopen_prepare,
 .bdrv_reopen_commit  = raw_reopen_commit,
 .bdrv_reopen_abort   = raw_reopen_abort,
+.bdrv_co_create_opts = bdrv_co_create_opts_simple,
+.create_opts = &bdrv_create_opts_simple,
 .mutable_opts= mutable_opts,
 .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
 .bdrv_co_pwrite_zeroes = hdev_co_pwrite_zeroes,
@@ -3639,10 +3641,11 @@ static BlockDriver bdrv_host_cdrom = {
 .bdrv_reopen_prepare = raw_reopen_prepare,
 .bdrv_reopen_commit  = raw_reopen_commit,
 .bdrv_reopen_abort   = raw_reopen_abort,
+.bdrv_co_create_opts = bdrv_co_create_opts_simple,
+.create_opts = &bdrv_create_opts_simple,
 .mutable_opts= mutable_opts,
 .bdrv_co_invalidate_cache = raw_co_invalidate_cache,
 
-
 .bdrv_co_preadv = raw_co_preadv,
 .bdrv_co_pwritev= raw_co_pwritev,
 .bdrv_co_flush_to_disk  = raw_co_flush_to_disk,
@@ -3771,6 +3774,8 @@ static BlockDriver bdrv_host_cdrom = {
 .bdrv_reopen_prepare = raw_reopen_prepare,
 .bdrv_reopen_commit  = raw_reopen_commit,
 .bdrv_reopen_abort   = raw_reopen_abort,
+.bdrv_co_create_opts = bdrv_co_create_opts_simple,
+.create_opts = &bdrv_create_opts_simp

Re: [PATCH] backup: don't acquire aio_context in backup_clean

2020-03-25 Thread Vladimir Sementsov-Ogievskiy

25.03.2020 18:50, Stefan Reiter wrote:

backup_clean is only ever called as a handler via job_exit, which


Hmm.. I'm afraid it's not quite correct.

job_clean

  job_finalize_single

 job_completed_txn_abort (lock aio context)

 job_do_finalize


Hmm. job_do_finalize calls job_completed_txn_abort, which cares to lock aio 
context..
And on the same time, it directaly calls job_txn_apply(job->txn, 
job_finalize_single)
without locking. Is it a bug?

And, even if job_do_finalize called always with locked context, where is 
guarantee that all
context of all jobs in txn are locked?

Still, let's look through its callers.

  job_finalize

   qmp_block_job_finalize (lock aio context)
   qmp_job_finalize (lock aio context)
   test_cancel_concluded (doesn't lock, but it's a test)

  job_completed_txn_success

   job_completed

job_exit (lock aio context)

job_cancel

 blockdev_mark_auto_del (lock aio context)

 job_user_cancel

 qmp_block_job_cancel (locks context)
 qmp_job_cancel  (locks context)

 job_cancel_err

  job_cancel_sync (return job_finish_sync(job, 
&job_cancel_err, NULL);, job_finish_sync just calls callback)

   replication_close (it's .bdrv_close.. Hmm, I 
don't see context locking, where is it ?)

   replication_stop (locks context)

   drive_backup_abort (locks context)

   blockdev_backup_abort (locks context)

   job_cancel_sync_all (locks context)

   cancel_common (locks context)

 test_* (I don't care)


already acquires the job's context. The job's context is guaranteed to
be the same as the one used by backup_top via backup_job_create.

Since the previous logic effectively acquired the lock twice, this
broke cleanup of backups for disks using IO threads, since the BDRV_POLL_WHILE
in bdrv_backup_top_drop -> bdrv_do_drained_begin would only release the lock
once, thus deadlocking with the IO thread.

Signed-off-by: Stefan Reiter 


Just note that this code was recently touched by commit 0abf2581717a19, so add
Sergio (its author) to CC.


---

This is a fix for the issue discussed in this part of the thread:
https://lists.gnu.org/archive/html/qemu-devel/2020-03/msg07639.html
...not the original problem (core dump) posted by Dietmar.

I've still seen it occasionally hang during a backup abort. I'm trying to figure
out why that happens, stack trace indicates a similar problem with the main
thread hanging at bdrv_do_drained_begin, though I have no clue why as of yet.

  block/backup.c | 4 
  1 file changed, 4 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 7430ca5883..a7a7dcaf4c 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -126,11 +126,7 @@ static void backup_abort(Job *job)
  static void backup_clean(Job *job)
  {
  BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
-AioContext *aio_context = bdrv_get_aio_context(s->backup_top);
-
-aio_context_acquire(aio_context);
  bdrv_backup_top_drop(s->backup_top);
-aio_context_release(aio_context);
  }
  
  void backup_do_checkpoint(BlockJob *job, Error **errp)





--
Best regards,
Vladimir



[PATCH v2] block: make BlockConf.*_size properties 32-bit

2020-03-25 Thread Roman Kagan
Devices (virtio-blk, scsi, etc.) and the block layer are happy to use
32-bit for logical_block_size, physical_block_size, and min_io_size.
However, the properties in BlockConf are defined as uint16_t limiting
the values to 32768.

This appears unnecessarily tight, and we've seen bigger block sizes come
in handy at times.

Make them 32 bit instead and lift the limitation up to 2 MiB which
appears to be good enough for everybody.

As the values can now be fairly big and awkward to type, make the
property setter accept common size suffixes (k, m).

Signed-off-by: Roman Kagan 
---
v1 -> v2:
- cap the property at 2 MiB [Eric]
- accept size suffixes

 include/hw/block/block.h |  8 
 include/hw/qdev-properties.h |  2 +-
 hw/core/qdev-properties.c| 31 +--
 3 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/include/hw/block/block.h b/include/hw/block/block.h
index d7246f3862..9dd6bba56a 100644
--- a/include/hw/block/block.h
+++ b/include/hw/block/block.h
@@ -18,9 +18,9 @@
 
 typedef struct BlockConf {
 BlockBackend *blk;
-uint16_t physical_block_size;
-uint16_t logical_block_size;
-uint16_t min_io_size;
+uint32_t physical_block_size;
+uint32_t logical_block_size;
+uint32_t min_io_size;
 uint32_t opt_io_size;
 int32_t bootindex;
 uint32_t discard_granularity;
@@ -51,7 +51,7 @@ static inline unsigned int get_physical_block_exp(BlockConf 
*conf)
   _conf.logical_block_size),\
 DEFINE_PROP_BLOCKSIZE("physical_block_size", _state,\
   _conf.physical_block_size),   \
-DEFINE_PROP_UINT16("min_io_size", _state, _conf.min_io_size, 0),\
+DEFINE_PROP_UINT32("min_io_size", _state, _conf.min_io_size, 0),\
 DEFINE_PROP_UINT32("opt_io_size", _state, _conf.opt_io_size, 0),\
 DEFINE_PROP_UINT32("discard_granularity", _state,   \
_conf.discard_granularity, -1),  \
diff --git a/include/hw/qdev-properties.h b/include/hw/qdev-properties.h
index f161604fb6..f9e0f8c041 100644
--- a/include/hw/qdev-properties.h
+++ b/include/hw/qdev-properties.h
@@ -197,7 +197,7 @@ extern const PropertyInfo qdev_prop_pcie_link_width;
 #define DEFINE_PROP_BIOS_CHS_TRANS(_n, _s, _f, _d) \
 DEFINE_PROP_SIGNED(_n, _s, _f, _d, qdev_prop_bios_chs_trans, int)
 #define DEFINE_PROP_BLOCKSIZE(_n, _s, _f) \
-DEFINE_PROP_UNSIGNED(_n, _s, _f, 0, qdev_prop_blocksize, uint16_t)
+DEFINE_PROP_UNSIGNED(_n, _s, _f, 0, qdev_prop_blocksize, uint32_t)
 #define DEFINE_PROP_PCI_HOST_DEVADDR(_n, _s, _f) \
 DEFINE_PROP(_n, _s, _f, qdev_prop_pci_host_devaddr, PCIHostDeviceAddress)
 #define DEFINE_PROP_OFF_AUTO_PCIBAR(_n, _s, _f, _d) \
diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c
index 2047114fca..a571be7607 100644
--- a/hw/core/qdev-properties.c
+++ b/hw/core/qdev-properties.c
@@ -14,6 +14,7 @@
 #include "qapi/visitor.h"
 #include "chardev/char.h"
 #include "qemu/uuid.h"
+#include "qemu/units.h"
 
 void qdev_prop_set_after_realize(DeviceState *dev, const char *name,
   Error **errp)
@@ -729,30 +730,39 @@ const PropertyInfo qdev_prop_pci_devfn = {
 
 /* --- blocksize --- */
 
+/* lower limit is sector size */
+#define MIN_BLOCK_SIZE  512
+#define MIN_BLOCK_SIZE_STR  "512 B"
+/* upper limit is arbitrary, 2 MiB looks sufficient */
+#define MAX_BLOCK_SIZE  (2 * MiB)
+#define MAX_BLOCK_SIZE_STR  "2 MiB"
+
 static void set_blocksize(Object *obj, Visitor *v, const char *name,
   void *opaque, Error **errp)
 {
 DeviceState *dev = DEVICE(obj);
 Property *prop = opaque;
-uint16_t value, *ptr = qdev_get_prop_ptr(dev, prop);
+uint32_t *ptr = qdev_get_prop_ptr(dev, prop);
+uint64_t value;
 Error *local_err = NULL;
-const int64_t min = 512;
-const int64_t max = 32768;
 
 if (dev->realized) {
 qdev_prop_set_after_realize(dev, name, errp);
 return;
 }
 
-visit_type_uint16(v, name, &value, &local_err);
+visit_type_size(v, name, &value, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
 /* value of 0 means "unset" */
-if (value && (value < min || value > max)) {
-error_setg(errp, QERR_PROPERTY_VALUE_OUT_OF_RANGE,
-   dev->id ? : "", name, (int64_t)value, min, max);
+if (value && (value < MIN_BLOCK_SIZE || value > MAX_BLOCK_SIZE)) {
+error_setg(errp,
+   "Property %s.%s doesn't take value %" PRIu64
+   " (minimum: " MIN_BLOCK_SIZE_STR
+   ", maximum: " MAX_BLOCK_SIZE_STR ")",
+   dev->id ? : "", name, value);
 return;
 }
 
@@ -768,9 +778,10 @@ static void set_blocksize(Object *obj, Visitor *v, const 
char *name,
 }
 
 const PropertyInfo qdev_prop_blocksize = {
-.name  = "uint16"