Re: Question about (and problem with) pflash data access

2020-02-12 Thread Alexey Kardashevskiy



On 13/02/2020 10:50, Philippe Mathieu-Daudé wrote:
> Cc'ing Paolo and Alexey.
> 
> On 2/13/20 12:09 AM, Guenter Roeck wrote:
>> On Wed, Feb 12, 2020 at 10:39:30PM +0100, Philippe Mathieu-Daudé wrote:
>>> Cc'ing Jean-Christophe and Peter.
>>>
>>> On 2/12/20 7:46 PM, Guenter Roeck wrote:
 Hi,

 I have been playing with pflash recently. For the most part it works,
 but I do have an odd problem when trying to instantiate pflash on sx1.

 My data file looks as follows.

 0000000 0001 0000 0000 0000 0000 0000 0000 0000
 0000020 0000 0000 0000 0000 0000 0000 0000 0000
 *
 0002000 0002 0000 0000 0000 0000 0000 0000 0000
 0002020 0000 0000 0000 0000 0000 0000 0000 0000
 *
 0004000 0003 0000 0000 0000 0000 0000 0000 0000
 0004020 0000 0000 0000 0000 0000 0000 0000 0000
 ...

 In the sx1 machine, this becomes:

 0000000 6001 0000 0000 0000 0000 0000 0000 0000
 0000020 0000 0000 0000 0000 0000 0000 0000 0000
 *
 0002000 6002 0000 0000 0000 0000 0000 0000 0000
 0002020 0000 0000 0000 0000 0000 0000 0000 0000
 *
 0004000 6003 0000 0000 0000 0000 0000 0000 0000
 0004020 0000 0000 0000 0000 0000 0000 0000 0000
 *
 ...

 pflash is instantiated with "-drive
 file=flash.32M.test,format=raw,if=pflash".

 I don't have much success with pflash tracing - data accesses don't
 show up there.

 I did find a number of problems with the sx1 emulation, but I have
 no clue
 what is going on with pflash. As far as I can see pflash works fine on
 other machines. Can someone give me a hint what to look out for ?
>>>
>>> This is specific to the SX1, introduced in commit 997641a84ff:
>>>
>>>   64 static uint64_t static_read(void *opaque, hwaddr offset,
>>>   65 unsigned size)
>>>   66 {
>>>   67 uint32_t *val = (uint32_t *) opaque;
>>>   68 uint32_t mask = (4 / size) - 1;
>>>   69
>>>   70 return *val >> ((offset & mask) << 3);
>>>   71 }
>>>
>>> Only guessing, this looks like some hw parity, and I imagine you need to
>>> write the parity bits in your flash.32M file before starting QEMU,
>>> then it
>>> would appear "normal" within the guest.
>>>
>> I thought this might be related, but that is not the case. I added log
>> messages, and even ran the code in gdb. static_read() and static_write()
>> are not executed.
>>
>> Also,
>>
>>  memory_region_init_io(&cs[0], NULL, &static_ops, &cs0val,
>>                        "sx1.cs0", OMAP_CS0_SIZE - flash_size);
>>                                   ^^^^^^^^^^^^^^^^^^^^^^^^^^
>>  memory_region_add_subregion(address_space,
>>                              OMAP_CS0_BASE + flash_size, &cs[0]);
>>                              ^^^^^^^^^^^^^^^^^^^^^^^^^^
>>
>> suggests that the code is only executed for memory accesses _after_
>> the actual flash. The memory tree is:
>>
>> memory-region: system
>>   00000000-ffffffff (prio 0, i/o): system
>>     00000000-01ffffff (prio 0, romd): omap_sx1.flash0-1
>>     00000000-01ffffff (prio 0, rom): omap_sx1.flash0-0
> 
> Eh two memory regions with same size and same priority... Is this legal?


I'd say yes if used with memory_region_set_enabled() to make sure only
one is enabled. Having both enabled is weird and we should print a
warning. Thanks,
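
(For illustration, a minimal sketch of that pattern with made-up region names;
flash_romd and flash_rom are assumed to be two already-initialized views of the
same flash, mapped at the same base address and priority, and
memory_region_set_enabled() keeps exactly one of them active:)

    MemoryRegion *sysmem = get_system_memory();

    /* two overlapping views of the same flash, same base, same priority */
    memory_region_add_subregion_overlap(sysmem, OMAP_CS0_BASE, flash_romd, 0);
    memory_region_add_subregion_overlap(sysmem, OMAP_CS0_BASE, flash_rom, 0);

    /* keep exactly one of them enabled; flip when the flash changes mode */
    memory_region_set_enabled(flash_romd, true);
    memory_region_set_enabled(flash_rom, false);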



> 
> (qemu) info mtree -f -d
> FlatView #0
>  AS "memory", root: system
>  AS "cpu-memory-0", root: system
>  Root memory region: system
>   00000000-01ffffff (prio 0, romd): omap_sx1.flash0-1
>   02000000-03ffffff (prio 0, i/o): sx1.cs0
>   04000000-07ffffff (prio 0, i/o): sx1.cs1
>   08000000-0bffffff (prio 0, i/o): sx1.cs3
>   10000000-11ffffff (prio 0, ram): omap1.dram
>   20000000-2002ffff (prio 0, ram): omap1.sram
>   ...
>   Dispatch
>     Physical sections
>   #0 @00000000..ffffffff (noname) [unassigned]
>   #1 @00000000..01ffffff omap_sx1.flash0-1 [not dirty]
>   #2 @02000000..03ffffff sx1.cs0 [ROM]
>   #3 @04000000..07ffffff sx1.cs1 [watch]
>   #4 @08000000..0bffffff sx1.cs3
>   #5 @10000000..11ffffff omap1.dram
>   #6 @20000000..2002ffff omap1.sram
>   ...
>     Nodes (9 bits per level, 6 levels) ptr=[3] skip=4
>   [0]
>   0   skip=3  ptr=[3]
>   1..511  skip=1  ptr=NIL
>   [1]
>   0   skip=2  ptr=[3]
>   1..511  skip=1  ptr=NIL
>   [2]
>   0   skip=1  ptr=[3]
>   1..511  skip=1  ptr=NIL
>   [3]
>   0   skip=1  ptr=[4]
>   1   skip=1  ptr=[5]
>   2   skip=2  ptr=[7]
>   3..13   skip=1  ptr=NIL
>  14   skip=2  ptr=[9]
>  15   skip=2  ptr=[11]
>  16..511  skip=1  ptr=NIL
> 

[PATCH 2/2] vhost-use-blk: convert to new virtio_delete_queue

2020-02-12 Thread pannengyuan
From: Pan Nengyuan 

Use the new virtio_delete_queue() function for cleanup.

Signed-off-by: Pan Nengyuan 
---
 hw/block/vhost-user-blk.c  | 11 +++
 include/hw/virtio/vhost-user-blk.h |  1 +
 2 files changed, 8 insertions(+), 4 deletions(-)
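
(For reference, the two APIs involved; prototypes paraphrased from
include/hw/virtio/virtio.h:)

    /* old interface: delete the n-th queue of the device */
    void virtio_del_queue(VirtIODevice *vdev, int n);

    /* new interface: delete a queue via the pointer returned by
     * virtio_add_queue(), which is why the VirtQueue pointers are
     * now kept in s->virtqs */
    void virtio_delete_queue(VirtQueue *vq);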

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index 2eba8b9db0..ed6a5cc03b 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -420,9 +420,10 @@ static void vhost_user_blk_device_realize(DeviceState 
*dev, Error **errp)
 virtio_init(vdev, "virtio-blk", VIRTIO_ID_BLOCK,
 sizeof(struct virtio_blk_config));
 
+s->virtqs = g_new0(VirtQueue *, s->num_queues);
 for (i = 0; i < s->num_queues; i++) {
-virtio_add_queue(vdev, s->queue_size,
- vhost_user_blk_handle_output);
+s->virtqs[i] = virtio_add_queue(vdev, s->queue_size,
+vhost_user_blk_handle_output);
 }
 
 s->inflight = g_new0(struct vhost_inflight, 1);
@@ -461,8 +462,9 @@ virtio_err:
 g_free(s->vqs);
 g_free(s->inflight);
 for (i = 0; i < s->num_queues; i++) {
-virtio_del_queue(vdev, i);
+virtio_delete_queue(s->virtqs[i]);
 }
+g_free(s->virtqs);
 virtio_cleanup(vdev);
 vhost_user_cleanup(&s->vhost_user);
 }
@@ -482,8 +484,9 @@ static void vhost_user_blk_device_unrealize(DeviceState 
*dev, Error **errp)
 g_free(s->inflight);
 
 for (i = 0; i < s->num_queues; i++) {
-virtio_del_queue(vdev, i);
+virtio_delete_queue(s->virtqs[i]);
 }
+g_free(s->virtqs);
 virtio_cleanup(vdev);
 vhost_user_cleanup(&s->vhost_user);
 }
diff --git a/include/hw/virtio/vhost-user-blk.h 
b/include/hw/virtio/vhost-user-blk.h
index 108bfadeeb..f68911f6f0 100644
--- a/include/hw/virtio/vhost-user-blk.h
+++ b/include/hw/virtio/vhost-user-blk.h
@@ -37,6 +37,7 @@ typedef struct VHostUserBlk {
 struct vhost_inflight *inflight;
 VhostUserState vhost_user;
 struct vhost_virtqueue *vqs;
+VirtQueue **virtqs;
 guint watch;
 bool connected;
 } VHostUserBlk;
-- 
2.21.0.windows.1





[PATCH 1/2] vhost-user-blk: delete virtioqueues in unrealize to fix memleaks

2020-02-12 Thread pannengyuan
From: Pan Nengyuan 

The virtio queues were not deleted in unrealize, nor on the error path in
realize. This patch fixes these memleaks; the leak stack is as follows:

Direct leak of 114688 byte(s) in 16 object(s) allocated from:
#0 0x7f24024fdbf0 in calloc (/lib64/libasan.so.3+0xcabf0)
#1 0x7f2401642015 in g_malloc0 (/lib64/libglib-2.0.so.0+0x50015)
#2 0x55ad175a6447 in virtio_add_queue /mnt/sdb/qemu/hw/virtio/virtio.c:2327
#3 0x55ad17570cf9 in vhost_user_blk_device_realize 
/mnt/sdb/qemu/hw/block/vhost-user-blk.c:419
#4 0x55ad175a3707 in virtio_device_realize 
/mnt/sdb/qemu/hw/virtio/virtio.c:3509
#5 0x55ad176ad0d1 in device_set_realized /mnt/sdb/qemu/hw/core/qdev.c:876
#6 0x55ad1781ff9d in property_set_bool /mnt/sdb/qemu/qom/object.c:2080
#7 0x55ad178245ae in object_property_set_qobject 
/mnt/sdb/qemu/qom/qom-qobject.c:26
#8 0x55ad17821eb4 in object_property_set_bool 
/mnt/sdb/qemu/qom/object.c:1338
#9 0x55ad177aeed7 in virtio_pci_realize 
/mnt/sdb/qemu/hw/virtio/virtio-pci.c:1801

Reported-by: Euler Robot 
Signed-off-by: Pan Nengyuan 
---
 hw/block/vhost-user-blk.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/hw/block/vhost-user-blk.c b/hw/block/vhost-user-blk.c
index d8c459c575..2eba8b9db0 100644
--- a/hw/block/vhost-user-blk.c
+++ b/hw/block/vhost-user-blk.c
@@ -460,6 +460,9 @@ reconnect:
 virtio_err:
 g_free(s->vqs);
 g_free(s->inflight);
+for (i = 0; i < s->num_queues; i++) {
+virtio_del_queue(vdev, i);
+}
 virtio_cleanup(vdev);
 vhost_user_cleanup(&s->vhost_user);
 }
@@ -468,6 +471,7 @@ static void vhost_user_blk_device_unrealize(DeviceState 
*dev, Error **errp)
 {
 VirtIODevice *vdev = VIRTIO_DEVICE(dev);
 VHostUserBlk *s = VHOST_USER_BLK(dev);
+int i;
 
 virtio_set_status(vdev, 0);
 qemu_chr_fe_set_handlers(&s->chardev,  NULL, NULL, NULL,
@@ -476,6 +480,10 @@ static void vhost_user_blk_device_unrealize(DeviceState 
*dev, Error **errp)
 vhost_dev_free_inflight(s->inflight);
 g_free(s->vqs);
 g_free(s->inflight);
+
+for (i = 0; i < s->num_queues; i++) {
+virtio_del_queue(vdev, i);
+}
 virtio_cleanup(vdev);
 vhost_user_cleanup(&s->vhost_user);
 }
-- 
2.21.0.windows.1





[PATCH 0/2] delete virtio queues in vhost-user-blk-unrealize

2020-02-12 Thread pannengyuan
From: Pan Nengyuan 

This series fixes memleaks when detaching a vhost-user-blk device.
1. Use the old virtio_del_queue to fix the memleaks; this is easier for stable
   branches to merge, as discussed in
   https://lists.nongnu.org/archive/html/qemu-devel/2020-01/msg02903.html

2. Convert virtio_del_queue to the new API (virtio_delete_queue).

Pan Nengyuan (2):
  vhost-user-blk: delete virtioqueues in unrealize to fix memleaks
  vhost-use-blk: convert to new virtio_delete_queue

 hw/block/vhost-user-blk.c  | 15 +--
 include/hw/virtio/vhost-user-blk.h |  1 +
 2 files changed, 14 insertions(+), 2 deletions(-)

-- 
2.21.0.windows.1





Re: Question about (and problem with) pflash data access

2020-02-12 Thread Philippe Mathieu-Daudé

Cc'ing Paolo and Alexey.

On 2/13/20 12:09 AM, Guenter Roeck wrote:

On Wed, Feb 12, 2020 at 10:39:30PM +0100, Philippe Mathieu-Daudé wrote:

Cc'ing Jean-Christophe and Peter.

On 2/12/20 7:46 PM, Guenter Roeck wrote:

Hi,

I have been playing with pflash recently. For the most part it works,
but I do have an odd problem when trying to instantiate pflash on sx1.

My data file looks as follows.

0000000 0001 0000 0000 0000 0000 0000 0000 0000
0000020 0000 0000 0000 0000 0000 0000 0000 0000
*
0002000 0002 0000 0000 0000 0000 0000 0000 0000
0002020 0000 0000 0000 0000 0000 0000 0000 0000
*
0004000 0003 0000 0000 0000 0000 0000 0000 0000
0004020 0000 0000 0000 0000 0000 0000 0000 0000
...

In the sx1 machine, this becomes:

0000000 6001 0000 0000 0000 0000 0000 0000 0000
0000020 0000 0000 0000 0000 0000 0000 0000 0000
*
0002000 6002 0000 0000 0000 0000 0000 0000 0000
0002020 0000 0000 0000 0000 0000 0000 0000 0000
*
0004000 6003 0000 0000 0000 0000 0000 0000 0000
0004020 0000 0000 0000 0000 0000 0000 0000 0000
*
...

pflash is instantiated with "-drive file=flash.32M.test,format=raw,if=pflash".

I don't have much success with pflash tracing - data accesses don't
show up there.

I did find a number of problems with the sx1 emulation, but I have no clue
what is going on with pflash. As far as I can see pflash works fine on
other machines. Can someone give me a hint what to look out for ?


This is specific to the SX1, introduced in commit 997641a84ff:

  64 static uint64_t static_read(void *opaque, hwaddr offset,
  65 unsigned size)
  66 {
  67 uint32_t *val = (uint32_t *) opaque;
  68 uint32_t mask = (4 / size) - 1;
  69
  70 return *val >> ((offset & mask) << 3);
  71 }

Only guessing, this looks like some hw parity, and I imagine you need to
write the parity bits in your flash.32M file before starting QEMU, then it
would appear "normal" within the guest.


I thought this might be related, but that is not the case. I added log
messages, and even ran the code in gdb. static_read() and static_write()
are not executed.

Also,

 memory_region_init_io(&cs[0], NULL, &static_ops, &cs0val,
                       "sx1.cs0", OMAP_CS0_SIZE - flash_size);
                                  ^^^^^^^^^^^^^^^^^^^^^^^^^^
 memory_region_add_subregion(address_space,
                             OMAP_CS0_BASE + flash_size, &cs[0]);
                             ^^^^^^^^^^^^^^^^^^^^^^^^^^

suggests that the code is only executed for memory accesses _after_
the actual flash. The memory tree is:

memory-region: system
  00000000-ffffffff (prio 0, i/o): system
    00000000-01ffffff (prio 0, romd): omap_sx1.flash0-1
    00000000-01ffffff (prio 0, rom): omap_sx1.flash0-0


Eh two memory regions with same size and same priority... Is this legal?

(qemu) info mtree -f -d
FlatView #0
 AS "memory", root: system
 AS "cpu-memory-0", root: system
 Root memory region: system
  00000000-01ffffff (prio 0, romd): omap_sx1.flash0-1
  02000000-03ffffff (prio 0, i/o): sx1.cs0
  04000000-07ffffff (prio 0, i/o): sx1.cs1
  08000000-0bffffff (prio 0, i/o): sx1.cs3
  10000000-11ffffff (prio 0, ram): omap1.dram
  20000000-2002ffff (prio 0, ram): omap1.sram
  ...
  Dispatch
Physical sections
  #0 @00000000..ffffffff (noname) [unassigned]
  #1 @00000000..01ffffff omap_sx1.flash0-1 [not dirty]
  #2 @02000000..03ffffff sx1.cs0 [ROM]
  #3 @04000000..07ffffff sx1.cs1 [watch]
  #4 @08000000..0bffffff sx1.cs3
  #5 @10000000..11ffffff omap1.dram
  #6 @20000000..2002ffff omap1.sram
  ...
Nodes (9 bits per level, 6 levels) ptr=[3] skip=4
  [0]
  0   skip=3  ptr=[3]
  1..511  skip=1  ptr=NIL
  [1]
  0   skip=2  ptr=[3]
  1..511  skip=1  ptr=NIL
  [2]
  0   skip=1  ptr=[3]
  1..511  skip=1  ptr=NIL
  [3]
  0   skip=1  ptr=[4]
  1   skip=1  ptr=[5]
  2   skip=2  ptr=[7]
  3..13   skip=1  ptr=NIL
 14   skip=2  ptr=[9]
 15   skip=2  ptr=[11]
 16..511  skip=1  ptr=NIL
  [4]
  0..63   skip=0  ptr=#1
 64..127  skip=0  ptr=#2
128..255  skip=0  ptr=#3
256..383  skip=0  ptr=#4
384..511  skip=1  ptr=NIL

So the romd wins.


 02000000-03ffffff (prio 0, i/o): sx1.cs0

I thought that the dual memory assignment (omap_sx1.flash0-1 and
omap_sx1.flash0-0) might play a role, but removing that didn't make
a difference either (not that I have any idea what it is supposed
to be used for).

Thanks,
Guenter






Re: Question about (and problem with) pflash data access

2020-02-12 Thread Guenter Roeck
On Wed, Feb 12, 2020 at 10:39:30PM +0100, Philippe Mathieu-Daudé wrote:
> Cc'ing Jean-Christophe and Peter.
> 
> On 2/12/20 7:46 PM, Guenter Roeck wrote:
> > Hi,
> > 
> > I have been playing with pflash recently. For the most part it works,
> > but I do have an odd problem when trying to instantiate pflash on sx1.
> > 
> > My data file looks as follows.
> > 
> > 0000000 0001 0000 0000 0000 0000 0000 0000 0000
> > 0000020 0000 0000 0000 0000 0000 0000 0000 0000
> > *
> > 0002000 0002 0000 0000 0000 0000 0000 0000 0000
> > 0002020 0000 0000 0000 0000 0000 0000 0000 0000
> > *
> > 0004000 0003 0000 0000 0000 0000 0000 0000 0000
> > 0004020 0000 0000 0000 0000 0000 0000 0000 0000
> > ...
> > 
> > In the sx1 machine, this becomes:
> > 
> > 0000000 6001 0000 0000 0000 0000 0000 0000 0000
> > 0000020 0000 0000 0000 0000 0000 0000 0000 0000
> > *
> > 0002000 6002 0000 0000 0000 0000 0000 0000 0000
> > 0002020 0000 0000 0000 0000 0000 0000 0000 0000
> > *
> > 0004000 6003 0000 0000 0000 0000 0000 0000 0000
> > 0004020 0000 0000 0000 0000 0000 0000 0000 0000
> > *
> > ...
> > 
> > pflash is instantiated with "-drive 
> > file=flash.32M.test,format=raw,if=pflash".
> > 
> > I don't have much success with pflash tracing - data accesses don't
> > show up there.
> > 
> > I did find a number of problems with the sx1 emulation, but I have no clue
> > what is going on with pflash. As far as I can see pflash works fine on
> > other machines. Can someone give me a hint what to look out for ?
> 
> This is specific to the SX1, introduced in commit 997641a84ff:
> 
>  64 static uint64_t static_read(void *opaque, hwaddr offset,
>  65 unsigned size)
>  66 {
>  67 uint32_t *val = (uint32_t *) opaque;
>  68 uint32_t mask = (4 / size) - 1;
>  69
>  70 return *val >> ((offset & mask) << 3);
>  71 }
> 
> Only guessing, this looks like some hw parity, and I imagine you need to
> write the parity bits in your flash.32M file before starting QEMU, then it
> would appear "normal" within the guest.
> 
I thought this might be related, but that is not the case. I added log
messages, and even ran the code in gdb. static_read() and static_write()
are not executed.

Also,

memory_region_init_io(&cs[0], NULL, &static_ops, &cs0val,
                      "sx1.cs0", OMAP_CS0_SIZE - flash_size);
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^
memory_region_add_subregion(address_space,
                            OMAP_CS0_BASE + flash_size, &cs[0]);
                            ^^^^^^^^^^^^^^^^^^^^^^^^^^

suggests that the code is only executed for memory accesses _after_
the actual flash. The memory tree is:

memory-region: system
  00000000-ffffffff (prio 0, i/o): system
    00000000-01ffffff (prio 0, romd): omap_sx1.flash0-1
    00000000-01ffffff (prio 0, rom): omap_sx1.flash0-0
    02000000-03ffffff (prio 0, i/o): sx1.cs0

I thought that the dual memory assignment (omap_sx1.flash0-1 and
omap_sx1.flash0-0) might play a role, but removing that didn't make
a difference either (not that I have any idea what it is supposed
to be used for).

Thanks,
Guenter



Re: [PATCH] block: make BlockConf.*_size properties 32-bit

2020-02-12 Thread Eric Blake

On 2/11/20 5:54 AM, Roman Kagan wrote:

Devices (virtio-blk, scsi, etc.) and the block layer are happy to use
32-bit for logical_block_size, physical_block_size, and min_io_size.
However, the properties in BlockConf are defined as uint16_t limiting
the values to 32768.

This appears unnecessarily tight, and we've seen bigger block sizes come in
handy at times.


What larger sizes?  I could see 64k or maybe even 1M block sizes,...



Make them 32 bit instead and lift the limitation.

Signed-off-by: Roman Kagan 
---
  hw/core/qdev-properties.c| 21 -
  include/hw/block/block.h |  8 
  include/hw/qdev-properties.h |  2 +-
  3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/hw/core/qdev-properties.c b/hw/core/qdev-properties.c
index 7f93bfeb88..5f84e4a3b8 100644
--- a/hw/core/qdev-properties.c
+++ b/hw/core/qdev-properties.c
@@ -716,30 +716,32 @@ const PropertyInfo qdev_prop_pci_devfn = {
  
  /* --- blocksize --- */
  
+#define MIN_BLOCK_SIZE 512

+#define MAX_BLOCK_SIZE 2147483648


...but 2G block sizes are going to have tremendous performance problems.

I'm not necessarily opposed to the widening to a 32-bit type, but think 
you need more justification or a smaller number for the max block size, 
particularly since qcow2 refuses to use cluster sizes larger than 2M and 
it makes no sense to allow a block size larger than a cluster size.
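
(A minimal sketch of the sort of additional validation being suggested here;
the 2M cap and wording are illustrative only:)

    if (value < MIN_BLOCK_SIZE || value > 2 * 1024 * 1024 ||
        (value & (value - 1))) {
        error_setg(errp, "block size must be a power of 2 between %u and 2M",
                   MIN_BLOCK_SIZE);
        return;
    }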


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: Question about (and problem with) pflash data access

2020-02-12 Thread Philippe Mathieu-Daudé

Cc'ing Jean-Christophe and Peter.

On 2/12/20 7:46 PM, Guenter Roeck wrote:

Hi,

I have been playing with pflash recently. For the most part it works,
but I do have an odd problem when trying to instantiate pflash on sx1.

My data file looks as follows.

0000000 0001 0000 0000 0000 0000 0000 0000 0000
0000020 0000 0000 0000 0000 0000 0000 0000 0000
*
0002000 0002 0000 0000 0000 0000 0000 0000 0000
0002020 0000 0000 0000 0000 0000 0000 0000 0000
*
0004000 0003 0000 0000 0000 0000 0000 0000 0000
0004020 0000 0000 0000 0000 0000 0000 0000 0000
...

In the sx1 machine, this becomes:

0000000 6001 0000 0000 0000 0000 0000 0000 0000
0000020 0000 0000 0000 0000 0000 0000 0000 0000
*
0002000 6002 0000 0000 0000 0000 0000 0000 0000
0002020 0000 0000 0000 0000 0000 0000 0000 0000
*
0004000 6003 0000 0000 0000 0000 0000 0000 0000
0004020 0000 0000 0000 0000 0000 0000 0000 0000
*
...

pflash is instantiated with "-drive file=flash.32M.test,format=raw,if=pflash".

I don't have much success with pflash tracing - data accesses don't
show up there.

I did find a number of problems with the sx1 emulation, but I have no clue
what is going on with pflash. As far as I can see pflash works fine on
other machines. Can someone give me a hint what to look out for ?


This is specific to the SX1, introduced in commit 997641a84ff:

 64 static uint64_t static_read(void *opaque, hwaddr offset,
 65 unsigned size)
 66 {
 67 uint32_t *val = (uint32_t *) opaque;
 68 uint32_t mask = (4 / size) - 1;
 69
 70 return *val >> ((offset & mask) << 3);
 71 }

Only guessing, this looks like some hw parity, and I imagine you need to 
write the parity bits in your flash.32M file before starting QEMU, then 
it would appear "normal" within the guest.





Question about (and problem with) pflash data access

2020-02-12 Thread Guenter Roeck
Hi,

I have been playing with pflash recently. For the most part it works,
but I do have an odd problem when trying to instantiate pflash on sx1.

My data file looks as follows.

0000000 0001 0000 0000 0000 0000 0000 0000 0000
0000020 0000 0000 0000 0000 0000 0000 0000 0000
*
0002000 0002 0000 0000 0000 0000 0000 0000 0000
0002020 0000 0000 0000 0000 0000 0000 0000 0000
*
0004000 0003 0000 0000 0000 0000 0000 0000 0000
0004020 0000 0000 0000 0000 0000 0000 0000 0000
...
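
(For reference, a small program along these lines produces a file with that
layout; this is a guess at how the test image was generated, not the actual
tool used:)

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        FILE *f = fopen("flash.32M.test", "wb");
        uint8_t sector[1024];
        unsigned i;

        if (!f) {
            return 1;
        }
        /* 16-bit little-endian marker at the start of each 1 KiB chunk */
        for (i = 1; i <= (32 * 1024 * 1024) / 1024; i++) {
            memset(sector, 0, sizeof(sector));
            sector[0] = i & 0xff;
            sector[1] = (i >> 8) & 0xff;
            fwrite(sector, sizeof(sector), 1, f);
        }
        fclose(f);
        return 0;
    }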

In the sx1 machine, this becomes:

0000000 6001 0000 0000 0000 0000 0000 0000 0000
0000020 0000 0000 0000 0000 0000 0000 0000 0000
*
0002000 6002 0000 0000 0000 0000 0000 0000 0000
0002020 0000 0000 0000 0000 0000 0000 0000 0000
*
0004000 6003 0000 0000 0000 0000 0000 0000 0000
0004020 0000 0000 0000 0000 0000 0000 0000 0000
*
...

pflash is instantiated with "-drive file=flash.32M.test,format=raw,if=pflash".

I don't have much success with pflash tracing - data accesses don't
show up there.

I did find a number of problems with the sx1 emulation, but I have no clue
what is going on with pflash. As far as I can see pflash works fine on
other machines. Can someone give me a hint what to look out for ?

Thanks,
Guenter



Re: [PATCH v2] virtio: increase virtuqueue size for virtio-scsi and virtio-blk

2020-02-12 Thread Stefan Hajnoczi
On Tue, Feb 11, 2020 at 05:14:14PM +0300, Denis Plotnikov wrote:
> The goal is to reduce the amount of requests issued by a guest on
> 1M reads/writes. This raises the performance up to 4% on that kind of
> disk access pattern.
> 
> The maximum chunk size to be used for guest disk access is
> limited by the seg_max parameter, which represents the max number of
> pieces in the scatter-gather list in one guest disk request.
> 
> Since seg_max depends on the virtqueue size, increasing the virtqueue
> size increases seg_max, which, in turn, increases the maximum size
> of data to be read/written from a guest disk.
> 
> More details in the original problem statement:
> https://lists.gnu.org/archive/html/qemu-devel/2017-12/msg03721.html
> 
> Suggested-by: Denis V. Lunev 
> Signed-off-by: Denis Plotnikov 
> ---
>  hw/block/virtio-blk.c | 4 ++--
>  hw/core/machine.c | 2 ++
>  hw/scsi/virtio-scsi.c | 4 ++--
>  3 files changed, 6 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
> index 09f46ed85f..6df3a7a6df 100644
> --- a/hw/block/virtio-blk.c
> +++ b/hw/block/virtio-blk.c
> @@ -914,7 +914,7 @@ static void virtio_blk_update_config(VirtIODevice *vdev, 
> uint8_t *config)
>  memset(&blkcfg, 0, sizeof(blkcfg));
>  virtio_stq_p(vdev, &blkcfg.capacity, capacity);
>  virtio_stl_p(vdev, &blkcfg.seg_max,
> - s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 128 - 2);
> + s->conf.seg_max_adjust ? s->conf.queue_size - 2 : 256 - 2);

This value must not change on older machine types.  So does this patch
need to turn seg-max-adjust *on* in hw_compat_4_2 so that old machine
types get 126 instead of 254?

>  virtio_stw_p(vdev, &blkcfg.geometry.cylinders, conf->cyls);
>  virtio_stl_p(vdev, &blkcfg.blk_size, blk_size);
>  virtio_stw_p(vdev, &blkcfg.min_io_size, conf->min_io_size / blk_size);
> @@ -1272,7 +1272,7 @@ static Property virtio_blk_properties[] = {
>  DEFINE_PROP_BIT("request-merging", VirtIOBlock, conf.request_merging, 0,
>  true),
>  DEFINE_PROP_UINT16("num-queues", VirtIOBlock, conf.num_queues, 1),
> -DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 128),
> +DEFINE_PROP_UINT16("queue-size", VirtIOBlock, conf.queue_size, 256),
>  DEFINE_PROP_BOOL("seg-max-adjust", VirtIOBlock, conf.seg_max_adjust, 
> true),
>  DEFINE_PROP_LINK("iothread", VirtIOBlock, conf.iothread, TYPE_IOTHREAD,
>   IOThread *),
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index 2501b540ec..3427d6cf4c 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -28,6 +28,8 @@
>  #include "hw/mem/nvdimm.h"
>  
>  GlobalProperty hw_compat_4_2[] = {
> +{ "virtio-blk-device", "queue-size", "128"},
> +{ "virtio-scsi-device", "virtqueue_size", "128"},
>  { "virtio-blk-device", "x-enable-wce-if-config-wce", "off" },
>  { "virtio-blk-device", "seg-max-adjust", "off"},
>  { "virtio-scsi-device", "seg_max_adjust", "off"},
> diff --git a/hw/scsi/virtio-scsi.c b/hw/scsi/virtio-scsi.c
> index 3b61563609..b38f50a429 100644
> --- a/hw/scsi/virtio-scsi.c
> +++ b/hw/scsi/virtio-scsi.c
> @@ -660,7 +660,7 @@ static void virtio_scsi_get_config(VirtIODevice *vdev,
>  
>  virtio_stl_p(vdev, &scsiconf->num_queues, s->conf.num_queues);
>  virtio_stl_p(vdev, &scsiconf->seg_max,
> - s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 128 - 2);
> + s->conf.seg_max_adjust ? s->conf.virtqueue_size - 2 : 256 - 2);
>  virtio_stl_p(vdev, &scsiconf->max_sectors, s->conf.max_sectors);
>  virtio_stl_p(vdev, &scsiconf->cmd_per_lun, s->conf.cmd_per_lun);
>  virtio_stl_p(vdev, &scsiconf->event_info_size, sizeof(VirtIOSCSIEvent));
> @@ -965,7 +965,7 @@ static void virtio_scsi_device_unrealize(DeviceState 
> *dev, Error **errp)
>  static Property virtio_scsi_properties[] = {
>  DEFINE_PROP_UINT32("num_queues", VirtIOSCSI, parent_obj.conf.num_queues, 
> 1),
>  DEFINE_PROP_UINT32("virtqueue_size", VirtIOSCSI,
> - parent_obj.conf.virtqueue_size, 
> 128),
> + parent_obj.conf.virtqueue_size, 
> 256),
>  DEFINE_PROP_BOOL("seg_max_adjust", VirtIOSCSI,
>parent_obj.conf.seg_max_adjust, true),
>  DEFINE_PROP_UINT32("max_sectors", VirtIOSCSI, 
> parent_obj.conf.max_sectors,
> -- 
> 2.17.0
> 
> 


signature.asc
Description: PGP signature


Re: [PATCH] nbd-client: Support leading / in NBD URI

2020-02-12 Thread Maxim Levitsky
On Wed, 2020-02-12 at 14:33 +0100, Ján Tomko wrote:
> On Tue, Feb 11, 2020 at 08:31:01PM -0600, Eric Blake wrote:
> > The NBD URI specification [1] states that only one leading slash at
> > the beginning of the URI path component is stripped, not all such
> > slashes.  This becomes important to a patch I just proposed to nbdkit
> > [2], which would allow the exportname to select a file embedded within
> > an ext2 image: ext2fs demands an absolute pathname beginning with '/',
> > and because qemu was inadvertently stripping it, my nbdkit patch had
> > to work around the behavior.
> > 
> > [1] https://github.com/NetworkBlockDevice/nbd/blob/master/doc/uri.md
> > [2] https://www.redhat.com/archives/libguestfs/2020-February/msg00109.html
> > 
> > Note that the qemu bug only affects handling of URIs such as
> > nbd://host:port//abs/path (where '/abs/path' should be the export
> > name); it is still possible to use --image-opts and pass the desired
> > export name with a leading slash directly through JSON even without
> > this patch.
> > 
> > Signed-off-by: Eric Blake 
> > ---
> > block/nbd.c | 6 --
> > 1 file changed, 4 insertions(+), 2 deletions(-)
> > 
> 
> Reviewed-by: Ján Tomko 
> 
> Jano
Note that I had a bug open for this.
https://bugzilla.redhat.com/show_bug.cgi?id=1728545

To be honest, I expected this to be a feature,
and was afraid that fixing it would break existing users that might rely on it.

Best regards,
Maxim Levitsky




Re: [PATCH] nbd-client: Support leading / in NBD URI

2020-02-12 Thread Richard W.M. Jones
On Tue, Feb 11, 2020 at 08:31:01PM -0600, Eric Blake wrote:
> The NBD URI specification [1] states that only one leading slash at
> the beginning of the URI path component is stripped, not all such
> slashes.  This becomes important to a patch I just proposed to nbdkit
> [2], which would allow the exportname to select a file embedded within
> an ext2 image: ext2fs demands an absolute pathname beginning with '/',
> and because qemu was inadvertently stripping it, my nbdkit patch had
> to work around the behavior.
> 
> [1] https://github.com/NetworkBlockDevice/nbd/blob/master/doc/uri.md
> [2] https://www.redhat.com/archives/libguestfs/2020-February/msg00109.html
> 
> Note that the qemu bug only affects handling of URIs such as
> nbd://host:port//abs/path (where '/abs/path' should be the export
> name); it is still possible to use --image-opts and pass the desired
> export name with a leading slash directly through JSON even without
> this patch.
> 
> Signed-off-by: Eric Blake 
> ---
>  block/nbd.c | 6 --
>  1 file changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/block/nbd.c b/block/nbd.c
> index d085554f21ea..82f9b7ef50a5 100644
> --- a/block/nbd.c
> +++ b/block/nbd.c
> @@ -1516,8 +1516,10 @@ static int nbd_parse_uri(const char *filename, QDict 
> *options)
>  goto out;
>  }
> 
> -p = uri->path ? uri->path : "/";
> -p += strspn(p, "/");
> +p = uri->path ? uri->path : "";
> +if (p[0] == '/') {
> +p++;
> +}
>  if (p[0]) {
>  qdict_put_str(options, "export", p);
>  }
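
(For illustration, the effect of the hunk above on a hypothetical URI:

    nbd://example.com:10809//abs/path
      before: p += strspn(p, "/")   ->  export name "abs/path"
      after:  only one '/' stripped ->  export name "/abs/path")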

Looks reasonable, ACK.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-top is 'top' for virtual machines.  Tiny program with many
powerful monitoring features, net stats, disk stats, logging, etc.
http://people.redhat.com/~rjones/virt-top




Re: [PATCH] nbd-client: Support leading / in NBD URI

2020-02-12 Thread Ján Tomko

On Tue, Feb 11, 2020 at 08:31:01PM -0600, Eric Blake wrote:

The NBD URI specification [1] states that only one leading slash at
the beginning of the URI path component is stripped, not all such
slashes.  This becomes important to a patch I just proposed to nbdkit
[2], which would allow the exportname to select a file embedded within
an ext2 image: ext2fs demands an absolute pathname beginning with '/',
and because qemu was inadvertently stripping it, my nbdkit patch had
to work around the behavior.

[1] https://github.com/NetworkBlockDevice/nbd/blob/master/doc/uri.md
[2] https://www.redhat.com/archives/libguestfs/2020-February/msg00109.html

Note that the qemu bug only affects handling of URIs such as
nbd://host:port//abs/path (where '/abs/path' should be the export
name); it is still possible to use --image-opts and pass the desired
export name with a leading slash directly through JSON even without
this patch.

Signed-off-by: Eric Blake 
---
block/nbd.c | 6 --
1 file changed, 4 insertions(+), 2 deletions(-)



Reviewed-by: Ján Tomko 

Jano


signature.asc
Description: PGP signature


Re: [PATCH v5 01/26] nvme: rename trace events to nvme_dev

2020-02-12 Thread Maxim Levitsky
On Wed, 2020-02-12 at 14:08 +0100, Klaus Birkelund Jensen wrote:
> On Feb 12 11:08, Maxim Levitsky wrote:
> > On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > > Change the prefix of all nvme device related trace events to 'nvme_dev'
> > > to not clash with trace events from the nvme block driver.
> > > 
> 
> Hi Maxim,
> 
> Thank you very much for your thorough reviews! Utterly appreciated!

Thanks to you for the patch series!

> 
> I'll start going through your suggested changes. There is a bit of work
> to do on splitting patches into refactoring and bugfixes, but I can
> definitely see the reason for this, so I'll get to work.
> 
> You mention the alignment with split lines a lot. I actually thought I
> was following CODING_STYLE.rst (which allows a single 4 space indent for
> functions, but not statements such as if/else and while/for). But since
> hw/block/nvme.c is originally written in the style of aligning with the
> opening parenthesis I'm in the wrong here, so I will of course amend
> it. Should have done that from the beginning, it's just my personal
> taste shining through ;)

To be honest this is my personal taste as well, but after *many* review
complaints about this I consider aligning on the opening parenthesis
to be kind of an official style.

If others are OK with this though, I am personally 100% fine with leaving the
split lines as is.


Best regards,
Maxim Levitsky




Re: [PATCH v5 01/26] nvme: rename trace events to nvme_dev

2020-02-12 Thread Klaus Birkelund Jensen
On Feb 12 11:08, Maxim Levitsky wrote:
> On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> > Change the prefix of all nvme device related trace events to 'nvme_dev'
> > to not clash with trace events from the nvme block driver.
> > 

Hi Maxim,

Thank you very much for your thorough reviews! Utterly appreciated!

I'll start going through your suggested changes. There is a bit of work
to do on splitting patches into refactoring and bugfixes, but I can
definitely see the reason for this, so I'll get to work.

You mention the alignment with split lines a lot. I actually thought I
was following CODING_STYLE.rst (which allows a single 4 space indent for
functions, but not statements such as if/else and while/for). But since
hw/block/nvme.c is originally written in the style of aligning with the
opening parenthesis I'm in the wrong here, so I will of course amend
it. Should have done that from the beginning, it's just my personal
taste shining through ;)


Thanks again,
Klaus



Re: Cross-project NBD extension proposal: NBD_INFO_INIT_STATE

2020-02-12 Thread Eric Blake

On 2/12/20 6:36 AM, Richard W.M. Jones wrote:


Okay, in v2, I will start with just two bits, NBD_INIT_SPARSE
(entire image is sparse, nothing is allocated) and NBD_INIT_ZERO
(entire image reads as zero), and save any future bits for later
additions.  Do we think that 16 bits is sufficient for the amount of
initial information likely to be exposed?


So as I understand the proposal, the 16 bit limit comes about because
we want a round 4 byte reply, 16 bits are used by NBD_INFO_INIT_STATE
and that leaves 16 bits feature bits.  Therefore the only way to go
from there is to have 32 feature bits but an awkward unaligned 6 byte
structure, or 48 feature bits (8 byte structure).


In general, the NBD protocol has NOT focused on alignment issues (for 
good or for bad).  For example, NBD_INFO_BLOCK_SIZE is 18 bytes; all 
NBD_CMD_* 32-bit requests are 28 bytes except for NBD_CMD_WRITE which 
can send unaligned payload with no further padding, and so forth.




I guess given those constraints we can stick with 16 feature bits, and
if we ever needed more then we'd have to introduce NBD_INFO_INIT_STATE2.

The only thing I can think of which might be useful is a "fully
preallocated" bit which might be used as an indication that writes are
fast and are unlikely to fail with ENOSPC.


and which would be mutually-exclusive with NBD_INFO_SPARSE (except for 
an image of size 0).  That bit would ALSO be an indication that the user 
may not want to punch holes into the image, but preserve the 
fully-allocated state (and thus avoid NBD_CMD_TRIM as well as passing 
NBD_CMD_FLAG_NO_HOLE to any WRITE_ZEROES request).





Are we in agreement that
my addition of an NBD_INFO_ response to NBD_OPT_GO is the best way
to expose initial state bits?


Seems reasonable to me.

Rich.



--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v5 26/26] nvme: make lba data size configurable

2020-02-12 Thread Maxim Levitsky
On Thu, 2020-02-06 at 08:24 +0100, Klaus Birkelund Jensen wrote:
> On Feb  5 01:43, Keith Busch wrote:
> > On Tue, Feb 04, 2020 at 10:52:08AM +0100, Klaus Jensen wrote:
> > > Signed-off-by: Klaus Jensen 
> > > ---
> > >  hw/block/nvme-ns.c | 2 +-
> > >  hw/block/nvme-ns.h | 4 +++-
> > >  hw/block/nvme.c| 1 +
> > >  3 files changed, 5 insertions(+), 2 deletions(-)
> > > 
> > > diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> > > index 0e5be44486f4..981d7101b8f2 100644
> > > --- a/hw/block/nvme-ns.c
> > > +++ b/hw/block/nvme-ns.c
> > > @@ -18,7 +18,7 @@ static int nvme_ns_init(NvmeNamespace *ns)
> > >  {
> > >  NvmeIdNs *id_ns = &ns->id_ns;
> > >  
> > > -id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
> > > +id_ns->lbaf[0].ds = ns->params.lbads;
> > >  id_ns->nuse = id_ns->ncap = id_ns->nsze =
> > >  cpu_to_le64(nvme_ns_nlbas(ns));
> > >  
> > > diff --git a/hw/block/nvme-ns.h b/hw/block/nvme-ns.h
> > > index b564bac25f6d..f1fe4db78b41 100644
> > > --- a/hw/block/nvme-ns.h
> > > +++ b/hw/block/nvme-ns.h
> > > @@ -7,10 +7,12 @@
> > >  
> > >  #define DEFINE_NVME_NS_PROPERTIES(_state, _props) \
> > >  DEFINE_PROP_DRIVE("drive", _state, blk), \
> > > -DEFINE_PROP_UINT32("nsid", _state, _props.nsid, 0)
> > > +DEFINE_PROP_UINT32("nsid", _state, _props.nsid, 0), \
> > > +DEFINE_PROP_UINT8("lbads", _state, _props.lbads, BDRV_SECTOR_BITS)
> > 
> > I think we need to validate the parameter is between 9 and 12 before
> > trusting it can be used safely.
> > 
> > Alternatively, add supported formats to the lbaf array and let the host
> > decide on a live system with the 'format' command.
> 
> The device does not yet support Format NVM, but we have a patch ready
> for that to be submitted with a new series when this is merged.
> 
> For now, while it does not support Format, I will change this patch such
> that it defaults to 9 (BDRV_SECTOR_BITS) and only accepts 12 as an
> alternative (while always keeping the number of formats available to 1).
Looks like a good idea.
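
(A minimal sketch of what such a check could look like in the namespace setup
path; the message and exact location are made up:)

    if (ns->params.lbads != 9 && ns->params.lbads != 12) {
        error_setg(errp, "unsupported lbads %d (supported values: 9 and 12)",
                   ns->params.lbads);
        return 1;
    }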

Best regards,
Maxim Levitsky




Re: [PATCH v5 24/26] nvme: change controller pci id

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> There are two reasons for changing this:
> 
>   1. The nvme device currently uses an internal Intel device id.
> 
>   2. Since commits "nvme: fix write zeroes offset and count" and "nvme:
>  support multiple namespaces" the controller device no longer has
>  the quirks that the Linux kernel think it has.
> 
>  As the quirks are applied based on pci vendor and device id, change
>  them to get rid of the quirks.
> 
> To keep backward compatibility, add a new 'x-use-intel-id' parameter to
> the nvme device to force use of the Intel vendor and device id. This is
> off by default but add a compat property to set this for machines 4.2
> and older.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 13 +
>  hw/block/nvme.h   |  4 +++-
>  hw/core/machine.c |  1 +
>  3 files changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 3a377bc56734..bdef53a590b0 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -2467,8 +2467,15 @@ static void nvme_init_pci(NvmeCtrl *n, PCIDevice 
> *pci_dev)
>  
>  pci_conf[PCI_INTERRUPT_PIN] = 1;
>  pci_config_set_prog_interface(pci_conf, 0x2);
> -pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
> -pci_config_set_device_id(pci_conf, 0x5845);
> +
> +if (n->params.use_intel_id) {
> +pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_INTEL);
> +pci_config_set_device_id(pci_conf, 0x5846);
> +} else {
> +pci_config_set_vendor_id(pci_conf, PCI_VENDOR_ID_REDHAT);
> +pci_config_set_device_id(pci_conf, PCI_DEVICE_ID_REDHAT_NVME);
> +}
> +
>  pci_config_set_class(pci_conf, PCI_CLASS_STORAGE_EXPRESS);
>  pcie_endpoint_cap_init(pci_dev, 0x80);
>  
> @@ -2638,8 +2645,6 @@ static void nvme_class_init(ObjectClass *oc, void *data)
>  pc->realize = nvme_realize;
>  pc->exit = nvme_exit;
>  pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
> -pc->vendor_id = PCI_VENDOR_ID_INTEL;
> -pc->device_id = 0x5845;
>  pc->revision = 2;
>  
>  set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index c3cef0f024da..6b584f53ed64 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -12,7 +12,8 @@
>  DEFINE_PROP_UINT32("num_queues", _state, _props.num_queues, 64), \
>  DEFINE_PROP_UINT8("aerl", _state, _props.aerl, 3), \
>  DEFINE_PROP_UINT32("aer_max_queued", _state, _props.aer_max_queued, 64), 
> \
> -DEFINE_PROP_UINT8("mdts", _state, _props.mdts, 7)
> +DEFINE_PROP_UINT8("mdts", _state, _props.mdts, 7), \
> +DEFINE_PROP_BOOL("x-use-intel-id", _state, _props.use_intel_id, false)
>  
>  typedef struct NvmeParams {
>  char *serial;
> @@ -21,6 +22,7 @@ typedef struct NvmeParams {
>  uint8_t  aerl;
>  uint32_t aer_max_queued;
>  uint8_t  mdts;
> +bool use_intel_id;
>  } NvmeParams;
>  
>  typedef struct NvmeAsyncEvent {
> diff --git a/hw/core/machine.c b/hw/core/machine.c
> index 3e288bfceb7f..984412d98c9d 100644
> --- a/hw/core/machine.c
> +++ b/hw/core/machine.c
> @@ -34,6 +34,7 @@ GlobalProperty hw_compat_4_2[] = {
>  { "vhost-blk-device", "seg_max_adjust", "off"},
>  { "usb-host", "suppress-remote-wake", "off" },
>  { "usb-redir", "suppress-remote-wake", "off" },
> +{ "nvme", "x-use-intel-id", "on"},
>  };
>  const size_t hw_compat_4_2_len = G_N_ELEMENTS(hw_compat_4_2);
>  

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky




Re: [PATCH v5 25/26] nvme: remove redundant NvmeCmd pointer parameter

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> The command struct is available in the NvmeRequest that we generally
> pass around anyway.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 198 
>  1 file changed, 98 insertions(+), 100 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index bdef53a590b0..5fe2e2fe1fa9 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -566,16 +566,18 @@ unmap:
>  }
>  
>  static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
> -NvmeCmd *cmd, DMADirection dir, NvmeRequest *req)
> +DMADirection dir, NvmeRequest *req)
>  {
>  uint16_t status = NVME_SUCCESS;
>  size_t bytes;
> +uint64_t prp1, prp2;
>  
> -switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
> +switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) {
>  case PSDT_PRP:
> -status = nvme_map_prp(n, &req->qsg, &req->iov,
> -le64_to_cpu(cmd->dptr.prp.prp1), le64_to_cpu(cmd->dptr.prp.prp2),
> -len, req);
> +prp1 = le64_to_cpu(req->cmd.dptr.prp.prp1);
> +prp2 = le64_to_cpu(req->cmd.dptr.prp.prp2);
> +
> +status = nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
>  if (status) {
>  return status;
>  }
> @@ -589,7 +591,7 @@ static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, 
> uint32_t len,
>  return NVME_INVALID_FIELD;
>  }
>  
> -status = nvme_map_sgl(n, &req->qsg, &req->iov, cmd->dptr.sgl, len,
> +status = nvme_map_sgl(n, &req->qsg, &req->iov, req->cmd.dptr.sgl, len,
> req);
>  if (status) {
>  return status;
> @@ -632,20 +634,21 @@ static uint16_t nvme_dma(NvmeCtrl *n, uint8_t *ptr, 
> uint32_t len,
>  return status;
>  }
>  
> -static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +static uint16_t nvme_map(NvmeCtrl *n, NvmeRequest *req)
>  {
>  uint32_t len = req->nlb << nvme_ns_lbads(req->ns);
>  uint64_t prp1, prp2;
>  
> -switch (NVME_CMD_FLAGS_PSDT(cmd->flags)) {
> +switch (NVME_CMD_FLAGS_PSDT(req->cmd.flags)) {
>  case PSDT_PRP:
> -prp1 = le64_to_cpu(cmd->dptr.prp.prp1);
> -prp2 = le64_to_cpu(cmd->dptr.prp.prp2);
> +prp1 = le64_to_cpu(req->cmd.dptr.prp.prp1);
> +prp2 = le64_to_cpu(req->cmd.dptr.prp.prp2);
>  
>  return nvme_map_prp(n, &req->qsg, &req->iov, prp1, prp2, len, req);
>  case PSDT_SGL_MPTR_CONTIGUOUS:
>  case PSDT_SGL_MPTR_SGL:
> -return nvme_map_sgl(n, &req->qsg, &req->iov, cmd->dptr.sgl, len, req);
> +return nvme_map_sgl(n, &req->qsg, &req->iov, req->cmd.dptr.sgl, len,
> +req);
>  default:
>  return NVME_INVALID_FIELD;
>  }
> @@ -1024,7 +1027,7 @@ static void nvme_aio_cb(void *opaque, int ret)
>  nvme_aio_destroy(aio);
>  }
>  
> -static uint16_t nvme_flush(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +static uint16_t nvme_flush(NvmeCtrl *n, NvmeRequest *req)
>  {
>  NvmeNamespace *ns = req->ns;
>  NvmeAIO *aio = g_new0(NvmeAIO, 1);
> @@ -1040,12 +1043,12 @@ static uint16_t nvme_flush(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  return NVME_NO_COMPLETE;
>  }
>  
> -static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeRequest *req)
>  {
>  NvmeAIO *aio;
>  
>  NvmeNamespace *ns = req->ns;
> -NvmeRwCmd *rw = (NvmeRwCmd *) cmd;
> +NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
>  
>  int64_t offset;
>  size_t count;
> @@ -1081,9 +1084,9 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  return NVME_NO_COMPLETE;
>  }
>  
> -static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +static uint16_t nvme_rw(NvmeCtrl *n, NvmeRequest *req)
>  {
> -NvmeRwCmd *rw = (NvmeRwCmd *) cmd;
> +NvmeRwCmd *rw = (NvmeRwCmd *) &req->cmd;
>  NvmeNamespace *ns = req->ns;
>  int status;
>  
> @@ -1103,7 +1106,7 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  return status;
>  }
>  
> -status = nvme_map(n, cmd, req);
> +status = nvme_map(n, req);
>  if (status) {
>  block_acct_invalid(blk_get_stats(ns->blk), acct);
>  return status;
> @@ -1115,12 +1118,12 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  return NVME_NO_COMPLETE;
>  }
>  
> -static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeRequest *req)
>  {
> -uint32_t nsid = le32_to_cpu(cmd->nsid);
> +uint32_t nsid = le32_to_cpu(req->cmd.nsid);
>  
>  trace_nvme_dev_io_cmd(nvme_cid(req), nsid, le16_to_cpu(req->sq->sqid),
> -cmd->opcode);
> +req->cmd.opcode);
>  
>  req->ns = nvme_ns(n, nsid);
>  
> @@ -1128,16 +1131,16 @@ static uint16_t nvme_io_cmd(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  return nvme_nsid_err(n, nsid);
>  }

Re: Cross-project NBD extension proposal: NBD_INFO_INIT_STATE

2020-02-12 Thread Richard W.M. Jones


On Wed, Feb 12, 2020 at 06:09:11AM -0600, Eric Blake wrote:
> On 2/12/20 1:27 AM, Wouter Verhelst wrote:
> >Hi,
> >
> >On Mon, Feb 10, 2020 at 10:52:55PM +, Richard W.M. Jones wrote:
> >>But anyway ... could a flag indicating that the whole image is sparse
> >>be useful, either as well as NBD_INIT_SPARSE or instead of it?  You
> >>could use it to avoid an initial disk trim, which is something that
> >>mke2fs does:
> >
> >Yeah, I think that could definitely be useful. I honestly can't see a
> >use for NBD_INIT_SPARSE as defined in this proposal; and I don't think
> >it's generally useful to have a feature if we can't think of a use case
> >for it (that creates added complexity for no benefit).
> >
> >If we can find a reasonable use case for NBD_INIT_SPARSE as defined in
> >this proposal, then just add a third bit (NBD_INIT_ALL_SPARSE or
> >something) that says "the whole image is sparse". Otherwise, I think we
> >should redefine NBD_INIT_SPARSE to say that.
> 
> Okay, in v2, I will start with just two bits, NBD_INIT_SPARSE
> (entire image is sparse, nothing is allocated) and NBD_INIT_ZERO
> (entire image reads as zero), and save any future bits for later
> additions.  Do we think that 16 bits is sufficient for the amount of
> initial information likely to be exposed?

So as I understand the proposal, the 16 bit limit comes about because
we want a round 4 byte reply, 16 bits are used by NBD_INFO_INIT_STATE
and that leaves 16 bits feature bits.  Therefore the only way to go
from there is to have 32 feature bits but an awkward unaligned 6 byte
structure, or 48 feature bits (8 byte structure).

I guess given those constraints we can stick with 16 feature bits, and
if we ever needed more then we'd have to introduce NBD_INFO_INIT_STATE2.
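
(A sketch of the 4-byte NBD_REP_INFO payload under discussion; the bit
assignments are illustrative, not part of the proposal text:)

    struct nbd_info_init_state {
        uint16_t type;    /* NBD_INFO_INIT_STATE */
        uint16_t flags;   /* e.g. bit 0: NBD_INIT_SPARSE,
                                  bit 1: NBD_INIT_ZERO,
                                  remaining 14 bits reserved */
    };                    /* 4 bytes total, hence 16 feature bits */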

The only thing I can think of which might be useful is a "fully
preallocated" bit which might be used as an indication that writes are
fast and are unlikely to fail with ENOSPC.

> Are we in agreement that
> my addition of an NBD_INFO_ response to NBD_OPT_GO is the best way
> to expose initial state bits?

Seems reasonable to me.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
Fedora Windows cross-compiler. Compile Windows programs, test, and
build Windows installers. Over 100 libraries supported.
http://fedoraproject.org/wiki/MinGW




Re: [PATCH v5 23/26] pci: allocate pci id for nvme

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> The emulated nvme device (hw/block/nvme.c) is currently using an
> internal Intel device id.
> 
> Prepare to change that by allocating a device id under the 1b36 (Red
> Hat, Inc.) vendor id.

> 
> Signed-off-by: Klaus Jensen 
> ---
>  MAINTAINERS|  1 +
>  docs/specs/nvme.txt| 10 ++
>  docs/specs/pci-ids.txt |  1 +
>  include/hw/pci/pci.h   |  1 +
>  4 files changed, 13 insertions(+)
>  create mode 100644 docs/specs/nvme.txt
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 1f0bc72f2189..14a018e9c0ae 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -1645,6 +1645,7 @@ L: qemu-block@nongnu.org
>  S: Supported
>  F: hw/block/nvme*
>  F: tests/qtest/nvme-test.c
> +F: docs/specs/nvme.txt
>  
>  megasas
>  M: Hannes Reinecke 
> diff --git a/docs/specs/nvme.txt b/docs/specs/nvme.txt
> new file mode 100644
> index ..6ec7ddbc7ee0
> --- /dev/null
> +++ b/docs/specs/nvme.txt
> @@ -0,0 +1,10 @@
> +NVM Express Controller
> +==
> +
> +The nvme device (-device nvme) emulates an NVM Express Controller.
> +
> +
> +Reference Specifications
> +
> +
> +  https://nvmexpress.org/resources/specifications/

Nitpick: maybe mention the nvme version here, plus some TODOs that are left?

> diff --git a/docs/specs/pci-ids.txt b/docs/specs/pci-ids.txt
> index 4d53e5c7d9d5..abbdbca6be38 100644
> --- a/docs/specs/pci-ids.txt
> +++ b/docs/specs/pci-ids.txt
> @@ -63,6 +63,7 @@ PCI devices (other than virtio):
>  1b36:000b  PCIe Expander Bridge (-device pxb-pcie)
>  1b36:000d  PCI xhci usb host adapter
>  1b36:000f  mdpy (mdev sample device), linux/samples/vfio-mdev/mdpy.c
> +1b36:0010  PCIe NVMe device (-device nvme)
>  
>  All these devices are documented in docs/specs.
>  
> diff --git a/include/hw/pci/pci.h b/include/hw/pci/pci.h
> index b5013b834b20..9a20c309d0f2 100644
> --- a/include/hw/pci/pci.h
> +++ b/include/hw/pci/pci.h
> @@ -103,6 +103,7 @@ extern bool pci_available;
>  #define PCI_DEVICE_ID_REDHAT_XHCI0x000d
>  #define PCI_DEVICE_ID_REDHAT_PCIE_BRIDGE 0x000e
>  #define PCI_DEVICE_ID_REDHAT_MDPY0x000f
> +#define PCI_DEVICE_ID_REDHAT_NVME0x0010
>  #define PCI_DEVICE_ID_REDHAT_QXL 0x0100
>  
>  #define FMT_PCIBUS  PRIx64

Other than the actual ID assignment which is not something
I can approve/allocate:

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky






Re: [PATCH v5 22/26] nvme: support multiple namespaces

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> This adds support for multiple namespaces by introducing a new 'nvme-ns'
> device model. The nvme device creates a bus named from the device name
> ('id'). The nvme-ns devices then connect to this and registers
> themselves with the nvme device.
> 
> This changes how an nvme device is created. Example with two namespaces:
> 
>   -drive file=nvme0n1.img,if=none,id=disk1
>   -drive file=nvme0n2.img,if=none,id=disk2
>   -device nvme,serial=deadbeef,id=nvme0
>   -device nvme-ns,drive=disk1,bus=nvme0,nsid=1
>   -device nvme-ns,drive=disk2,bus=nvme0,nsid=2
> 
> The drive property is kept on the nvme device to keep the change
> backward compatible, but the property is now optional. Specifying a
> drive for the nvme device will always create the namespace with nsid 1.
Very reasonable way to do it. 
> 
> Signed-off-by: Klaus Jensen 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/Makefile.objs |   2 +-
>  hw/block/nvme-ns.c | 158 +++
>  hw/block/nvme-ns.h |  60 +++
>  hw/block/nvme.c| 235 +
>  hw/block/nvme.h|  47 -
>  hw/block/trace-events  |   6 +-
>  6 files changed, 389 insertions(+), 119 deletions(-)
>  create mode 100644 hw/block/nvme-ns.c
>  create mode 100644 hw/block/nvme-ns.h
> 
> diff --git a/hw/block/Makefile.objs b/hw/block/Makefile.objs
> index 28c2495a00dc..45f463462f1e 100644
> --- a/hw/block/Makefile.objs
> +++ b/hw/block/Makefile.objs
> @@ -7,7 +7,7 @@ common-obj-$(CONFIG_PFLASH_CFI02) += pflash_cfi02.o
>  common-obj-$(CONFIG_XEN) += xen-block.o
>  common-obj-$(CONFIG_ECC) += ecc.o
>  common-obj-$(CONFIG_ONENAND) += onenand.o
> -common-obj-$(CONFIG_NVME_PCI) += nvme.o
> +common-obj-$(CONFIG_NVME_PCI) += nvme.o nvme-ns.o
>  common-obj-$(CONFIG_SWIM) += swim.o
>  
>  obj-$(CONFIG_SH4) += tc58128.o
> diff --git a/hw/block/nvme-ns.c b/hw/block/nvme-ns.c
> new file mode 100644
> index ..0e5be44486f4
> --- /dev/null
> +++ b/hw/block/nvme-ns.c
> @@ -0,0 +1,158 @@
> +#include "qemu/osdep.h"
> +#include "qemu/units.h"
> +#include "qemu/cutils.h"
> +#include "qemu/log.h"
> +#include "hw/block/block.h"
> +#include "hw/pci/msix.h"
Do you need this include?
> +#include "sysemu/sysemu.h"
> +#include "sysemu/block-backend.h"
> +#include "qapi/error.h"
> +
> +#include "hw/qdev-properties.h"
> +#include "hw/qdev-core.h"
> +
> +#include "nvme.h"
> +#include "nvme-ns.h"
> +
> +static int nvme_ns_init(NvmeNamespace *ns)
> +{
> +NvmeIdNs *id_ns = &ns->id_ns;
> +
> +id_ns->lbaf[0].ds = BDRV_SECTOR_BITS;
> +id_ns->nuse = id_ns->ncap = id_ns->nsze =
> +cpu_to_le64(nvme_ns_nlbas(ns));
Nitpick: to be honest I don't really like that chained assignment,
especially since it forces the line to wrap, but that is just my
personal taste.
> +
> +return 0;
> +}
> +
> +static int nvme_ns_init_blk(NvmeCtrl *n, NvmeNamespace *ns, NvmeIdCtrl *id,
> +Error **errp)
> +{
> +uint64_t perm, shared_perm;
> +
> +Error *local_err = NULL;
> +int ret;
> +
> +perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
> +shared_perm = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED |
> +BLK_PERM_GRAPH_MOD;
> +
> +ret = blk_set_perm(ns->blk, perm, shared_perm, &local_err);
> +if (ret) {
> +error_propagate_prepend(errp, local_err, "blk_set_perm: ");
> +return ret;
> +}

You should consider using blkconf_apply_backend_options.
Take a look at for example virtio_blk_device_realize.
That will give you support for read only block devices as well.

I have personally only grazed the area of block permissions once,
so I would prefer someone from the block layer to review this as well.
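
(Roughly what that looks like in virtio_blk_device_realize(); a sketch from
memory, assuming the namespace carried a BlockConf, not the exact call site:)

    if (!blkconf_apply_backend_options(&conf->conf,
                                       blk_is_read_only(conf->conf.blk),
                                       true, errp)) {
        return;
    }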

> +
> +ns->size = blk_getlength(ns->blk);
> +if (ns->size < 0) {
> +error_setg_errno(errp, -ns->size, "blk_getlength");
> +return 1;
> +}
> +
> +switch (n->conf.wce) {
> +case ON_OFF_AUTO_ON:
> +n->features.volatile_wc = 1;
> +break;
> +case ON_OFF_AUTO_OFF:
> +n->features.volatile_wc = 0;
> +case ON_OFF_AUTO_AUTO:
> +n->features.volatile_wc = blk_enable_write_cache(ns->blk);
> +break;
> +default:
> +abort();
> +}
> +
> +blk_set_enable_write_cache(ns->blk, n->features.volatile_wc);
> +
> +return 0;

Nitpick: I also just noticed that you call the controller 'n'; I hadn't paid
attention to this before. I think something like 'ctrl' or 'ctl' would be more
readable.

> +}
> +
> +static int nvme_ns_check_constraints(NvmeNamespace *ns, Error **errp)
> +{
> +if (!ns->blk) {
> +error_setg(errp, "block backend not configured");
> +return 1;
> +}
> +
> +return 0;
> +}
> +
> +int nvme_ns_setup(NvmeCtrl *n, NvmeNamespace *ns, Error **errp)
> +{
> +Error *local_err = NULL;
> +
> +if (nvme_ns_check_constraints(ns, &local_err)) {
> +error_propagate_prepend(errp, local_err,
> +"nvme_ns_check_constraints: ");
> 

Re: [PATCH] nbd: Fix regression with multiple meta contexts

2020-02-12 Thread Laurent Vivier
On 12/02/2020 at 13:10, Eric Blake wrote:
> On 2/12/20 3:24 AM, Laurent Vivier wrote:
>> On 06/02/2020 at 18:38, Eric Blake wrote:
>>> Detected by a hang in the libnbd testsuite.  If a client requests
>>> multiple meta contexts (both base:allocation and qemu:dirty-bitmap:x)
>>> at the same time, our attempt to silence a false-positive warning
>>> about a potential uninitialized variable introduced botched logic: we
>>> were short-circuiting the second context, and never sending the
>>> NBD_REPLY_FLAG_DONE.  Combining two 'if' into one 'if/else' in
>>> bdf200a55 was wrong (I'm a bit embarrassed that such a change was my
>>> initial suggestion after the v1 patch, then I did not review the v2
>>> patch that actually got committed). Revert that, and instead silence
>>> the false positive warning by replacing 'return ret' with 'return 0'
>>> (the value it always has at that point in the code, even though it
>>> eluded the deduction abilities of the robot that reported the false
>>> positive).
>>>
>>> Fixes: bdf200a5535
>>> Signed-off-by: Eric Blake 
>>> ---
>>>
>>> It's never fun when a regression is caused by a patch taken through
>>> qemu-trivial, proving that the patch was not trivial after all.
>>
>> Do you want this one be merged using the trivial branch?
> 
> Up to you; I'm also fine taking it through my NBD tree as I have a few
> other NBD patches landing soon.
> 

For the moment, I have only one patch in my queue so I think you can
take it.

Thanks,
Laurent



Re: [PATCH] nbd: Fix regression with multiple meta contexts

2020-02-12 Thread Eric Blake

On 2/12/20 3:24 AM, Laurent Vivier wrote:

On 06/02/2020 at 18:38, Eric Blake wrote:

Detected by a hang in the libnbd testsuite.  If a client requests
multiple meta contexts (both base:allocation and qemu:dirty-bitmap:x)
at the same time, our attempt to silence a false-positive warning
about a potential uninitialized variable introduced botched logic: we
were short-circuiting the second context, and never sending the
NBD_REPLY_FLAG_DONE.  Combining two 'if' into one 'if/else' in
bdf200a55 was wrong (I'm a bit embarrassed that such a change was my
initial suggestion after the v1 patch, then I did not review the v2
patch that actually got committed). Revert that, and instead silence
the false positive warning by replacing 'return ret' with 'return 0'
(the value it always has at that point in the code, even though it
eluded the deduction abilities of the robot that reported the false
positive).

Fixes: bdf200a5535
Signed-off-by: Eric Blake 
---

It's never fun when a regression is caused by a patch taken through
qemu-trivial, proving that the patch was not trivial after all.


Do you want this one be merged using the trivial branch?


Up to you; I'm also fine taking it through my NBD tree as I have a few 
other NBD patches landing soon.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: Cross-project NBD extension proposal: NBD_INFO_INIT_STATE

2020-02-12 Thread Eric Blake

On 2/12/20 1:27 AM, Wouter Verhelst wrote:

Hi,

On Mon, Feb 10, 2020 at 10:52:55PM +, Richard W.M. Jones wrote:

But anyway ... could a flag indicating that the whole image is sparse
be useful, either as well as NBD_INIT_SPARSE or instead of it?  You
could use it to avoid an initial disk trim, which is something that
mke2fs does:


Yeah, I think that could definitely be useful. I honestly can't see a
use for NBD_INIT_SPARSE as defined in this proposal; and I don't think
it's generally useful to have a feature if we can't think of a use case
for it (that creates added complexity for no benefit).

If we can find a reasonable use case for NBD_INIT_SPARSE as defined in
this proposal, then just add a third bit (NBD_INIT_ALL_SPARSE or
something) that says "the whole image is sparse". Otherwise, I think we
should redefine NBD_INIT_SPARSE to say that.


Okay, in v2, I will start with just two bits, NBD_INIT_SPARSE (entire 
image is sparse, nothing is allocated) and NBD_INIT_ZERO (entire image 
reads as zero), and save any future bits for later additions.  Do we 
think that 16 bits is sufficient for the amount of initial information 
likely to be exposed?  Are we in agreement that my addition of an 
NBD_INFO_ response to NBD_OPT_GO is the best way to expose initial state 
bits?


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




Re: [PATCH v5 21/26] nvme: add support for scatter gather lists

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> For now, support the Data Block, Segment and Last Segment descriptor
> types.
> 
> See NVM Express 1.3d, Section 4.4 ("Scatter Gather List (SGL)").
> 
> Signed-off-by: Klaus Jensen 
> Acked-by: Fam Zheng 
> ---
>  block/nvme.c  |  18 +-
>  hw/block/nvme.c   | 375 +++---
>  hw/block/trace-events |   4 +
>  include/block/nvme.h  |  62 ++-
>  4 files changed, 389 insertions(+), 70 deletions(-)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index d41c4bda6e39..521f521054d5 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -446,7 +446,7 @@ static void nvme_identify(BlockDriverState *bs, int 
> namespace, Error **errp)
>  error_setg(errp, "Cannot map buffer for DMA");
>  goto out;
>  }
> -cmd.prp1 = cpu_to_le64(iova);
> +cmd.dptr.prp.prp1 = cpu_to_le64(iova);
>  
>  if (nvme_cmd_sync(bs, s->queues[0], )) {
>  error_setg(errp, "Failed to identify controller");
> @@ -545,7 +545,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
> **errp)
>  }
>  cmd = (NvmeCmd) {
>  .opcode = NVME_ADM_CMD_CREATE_CQ,
> -.prp1 = cpu_to_le64(q->cq.iova),
> +.dptr.prp.prp1 = cpu_to_le64(q->cq.iova),
>  .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0x)),
>  .cdw11 = cpu_to_le32(0x3),
>  };
> @@ -556,7 +556,7 @@ static bool nvme_add_io_queue(BlockDriverState *bs, Error 
> **errp)
>  }
>  cmd = (NvmeCmd) {
>  .opcode = NVME_ADM_CMD_CREATE_SQ,
> -.prp1 = cpu_to_le64(q->sq.iova),
> +.dptr.prp.prp1 = cpu_to_le64(q->sq.iova),
>  .cdw10 = cpu_to_le32(((queue_size - 1) << 16) | (n & 0x)),
>  .cdw11 = cpu_to_le32(0x1 | (n << 16)),
>  };
> @@ -906,16 +906,16 @@ try_map:
>  case 0:
>  abort();
>  case 1:
> -cmd->prp1 = pagelist[0];
> -cmd->prp2 = 0;
> +cmd->dptr.prp.prp1 = pagelist[0];
> +cmd->dptr.prp.prp2 = 0;
>  break;
>  case 2:
> -cmd->prp1 = pagelist[0];
> -cmd->prp2 = pagelist[1];
> +cmd->dptr.prp.prp1 = pagelist[0];
> +cmd->dptr.prp.prp2 = pagelist[1];
>  break;
>  default:
> -cmd->prp1 = pagelist[0];
> -cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
> +cmd->dptr.prp.prp1 = pagelist[0];
> +cmd->dptr.prp.prp2 = cpu_to_le64(req->prp_list_iova + 
> sizeof(uint64_t));
>  break;
>  }
>  trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 204ae1d33234..a91c60fdc111 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -75,8 +75,10 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr 
> addr)
>  
>  static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
> -memcpy(buf, (void *) >cmbuf[addr - n->ctrl_mem.addr], size);
> +hwaddr hi = addr + size;
Are you sure you don't want to check for overflow here?
It's a theoretical issue, since addr would have to be almost the full 64 bits,
but for things like this I check very defensively.
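Something along these lines is what I have in mind (just a sketch):

    hwaddr hi;

    /* reject a range that would wrap around the end of the address space */
    if (unlikely(addr + size < addr)) {
        return -1;
    }

    hi = addr + size;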

> +
> +if (n->cmbsz && nvme_addr_is_cmb(n, addr) && nvme_addr_is_cmb(n, hi)) {
Here you fix the bug I mentioned in patch 6. I suggest you move the fix
there.
> +memcpy(buf, nvme_addr_to_cmb(n, addr), size);
>  return 0;
>  }
>  
> @@ -159,6 +161,48 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue 
> *cq)
>  }
>  }
>  
> +static uint16_t nvme_map_addr_cmb(NvmeCtrl *n, QEMUIOVector *iov, hwaddr 
> addr,
> +size_t len)
> +{
> +if (!nvme_addr_is_cmb(n, addr) || !nvme_addr_is_cmb(n, addr + len)) {
> +return NVME_DATA_TRANSFER_ERROR;
> +}
> +
> +qemu_iovec_add(iov, nvme_addr_to_cmb(n, addr), len);
> +
> +return NVME_SUCCESS;
> +}
> +
> +static uint16_t nvme_map_addr(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector 
> *iov,
> +hwaddr addr, size_t len)
> +{
> +bool addr_is_cmb = nvme_addr_is_cmb(n, addr);
> +
> +if (addr_is_cmb) {
> +if (qsg->sg) {
> +return NVME_INVALID_USE_OF_CMB | NVME_DNR;
> +}
> +
> +if (!iov->iov) {
> +qemu_iovec_init(iov, 1);
> +}
> +
> +return nvme_map_addr_cmb(n, iov, addr, len);
> +}
> +
> +if (iov->iov) {
> +return NVME_INVALID_USE_OF_CMB | NVME_DNR;
> +}
> +
> +if (!qsg->sg) {
> +pci_dma_sglist_init(qsg, >parent_obj, 1);
> +}
> +
> +qemu_sglist_add(qsg, addr, len);
> +
> +return NVME_SUCCESS;
> +}

Very good refactoring. I would also suggest moving this to a separate
patch. I always put refactoring first and then patches that add features.

> +
>  static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
>  uint64_t prp1, uint64_t prp2, uint32_t len, NvmeRequest *req)
>  {
> @@ -307,15 +351,240 @@ unmap:

Re: [PATCH v5 20/26] nvme: handle dma errors

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> Handling DMA errors gracefully is required for the device to pass the
> block/011 test ("disable PCI device while doing I/O") in the blktests
> suite.
> 
> With this patch the device passes the test by retrying "critical"
> transfers (posting of completion entries and processing of submission
> queue entries).
> 
> If DMA errors occur at any other point in the execution of the command
> (say, while mapping the PRPs), the command is aborted with a Data
> Transfer Error status code.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 42 +-
>  hw/block/trace-events |  2 ++
>  include/block/nvme.h  |  2 +-
>  3 files changed, 36 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index f8c81b9e2202..204ae1d33234 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -73,14 +73,14 @@ static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr 
> addr)
>  return addr >= low && addr < hi;
>  }
>  
> -static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
> +static int nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
>  if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
>  memcpy(buf, (void *) >cmbuf[addr - n->ctrl_mem.addr], size);
> -return;
> +return 0;
>  }
>  
> -pci_dma_read(>parent_obj, addr, buf, size);
> +return pci_dma_read(>parent_obj, addr, buf, size);
>  }
>  
>  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
> @@ -168,6 +168,7 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  uint16_t status = NVME_SUCCESS;
>  bool is_cmb = false;
>  bool prp_list_in_cmb = false;
> +int ret;
>  
>  trace_nvme_dev_map_prp(nvme_cid(req), req->cmd.opcode, trans_len, len,
>  prp1, prp2, num_prps);
> @@ -218,7 +219,12 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  
>  nents = (len + n->page_size - 1) >> n->page_bits;
>  prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
> -nvme_addr_read(n, prp2, (void *) prp_list, prp_trans);
> +ret = nvme_addr_read(n, prp2, (void *) prp_list, prp_trans);
> +if (ret) {
> +trace_nvme_dev_err_addr_read(prp2);
> +status = NVME_DATA_TRANSFER_ERROR;
> +goto unmap;
> +}
>  while (len != 0) {
>  uint64_t prp_ent = le64_to_cpu(prp_list[i]);
>  
> @@ -237,7 +243,13 @@ static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList 
> *qsg, QEMUIOVector *iov,
>  i = 0;
>  nents = (len + n->page_size - 1) >> n->page_bits;
>  prp_trans = MIN(n->max_prp_ents, nents) * 
> sizeof(uint64_t);
> -nvme_addr_read(n, prp_ent, (void *) prp_list, prp_trans);
> +ret = nvme_addr_read(n, prp_ent, (void *) prp_list,
> +prp_trans);
> +if (ret) {
> +trace_nvme_dev_err_addr_read(prp_ent);
> +status = NVME_DATA_TRANSFER_ERROR;
> +goto unmap;
> +}
>  prp_ent = le64_to_cpu(prp_list[i]);
>  }
>  
> @@ -443,6 +455,7 @@ static void nvme_post_cqes(void *opaque)
>  NvmeCQueue *cq = opaque;
>  NvmeCtrl *n = cq->ctrl;
>  NvmeRequest *req, *next;
> +int ret;
>  
>  QTAILQ_FOREACH_SAFE(req, >req_list, entry, next) {
>  NvmeSQueue *sq;
> @@ -452,15 +465,21 @@ static void nvme_post_cqes(void *opaque)
>  break;
>  }
>  
> -QTAILQ_REMOVE(>req_list, req, entry);
>  sq = req->sq;
>  req->cqe.status = cpu_to_le16((req->status << 1) | cq->phase);
>  req->cqe.sq_id = cpu_to_le16(sq->sqid);
>  req->cqe.sq_head = cpu_to_le16(sq->head);
>  addr = cq->dma_addr + cq->tail * n->cqe_size;
> -nvme_inc_cq_tail(cq);
> -pci_dma_write(>parent_obj, addr, (void *)>cqe,
> +ret = pci_dma_write(>parent_obj, addr, (void *)>cqe,
>  sizeof(req->cqe));
> +if (ret) {
> +trace_nvme_dev_err_addr_write(addr);
> +timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) +
> +100 * SCALE_MS);
> +break;
> +}
> +QTAILQ_REMOVE(>req_list, req, entry);
> +nvme_inc_cq_tail(cq);
>  nvme_req_clear(req);
>  QTAILQ_INSERT_TAIL(>req_list, req, entry);
>  }
> @@ -1588,7 +1607,12 @@ static void nvme_process_sq(void *opaque)
>  
>  while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(>req_list))) {
>  addr = sq->dma_addr + sq->head * n->sqe_size;
> -nvme_addr_read(n, addr, (void *), sizeof(cmd));
> +if (nvme_addr_read(n, addr, (void *), sizeof(cmd))) {
> +

Re: [PATCH v5 18/26] nvme: use preallocated qsg/iov in nvme_dma_prp

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:52 +0100, Klaus Jensen wrote:
> Since clean up of the request qsg/iov has been moved to the common
> nvme_enqueue_req_completion function, there is no need to use a
> stack allocated qsg/iov in nvme_dma_prp.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 18 ++
>  1 file changed, 6 insertions(+), 12 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index e97da35c4ca1..f8c81b9e2202 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -298,23 +298,21 @@ unmap:
>  static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
>  uint64_t prp1, uint64_t prp2, DMADirection dir, NvmeRequest *req)
>  {
> -QEMUSGList qsg;
> -QEMUIOVector iov;
>  uint16_t status = NVME_SUCCESS;
>  size_t bytes;
>  
> -status = nvme_map_prp(n, , , prp1, prp2, len, req);
> +status = nvme_map_prp(n, >qsg, >iov, prp1, prp2, len, req);
>  if (status) {
>  return status;
>  }
>  
> -if (qsg.nsg > 0) {
> +if (req->qsg.nsg > 0) {
>  uint64_t residual;
>  
>  if (dir == DMA_DIRECTION_TO_DEVICE) {
> -residual = dma_buf_write(ptr, len, );
> +residual = dma_buf_write(ptr, len, >qsg);
>  } else {
> -residual = dma_buf_read(ptr, len, );
> +residual = dma_buf_read(ptr, len, >qsg);
>  }
>  
>  if (unlikely(residual)) {
> @@ -322,15 +320,13 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, 
> uint32_t len,
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  }
>  
> -qemu_sglist_destroy();
> -
>  return status;
>  }
>  
>  if (dir == DMA_DIRECTION_TO_DEVICE) {
> -bytes = qemu_iovec_to_buf(, 0, ptr, len);
> +bytes = qemu_iovec_to_buf(>iov, 0, ptr, len);
>  } else {
> -bytes = qemu_iovec_from_buf(, 0, ptr, len);
> +bytes = qemu_iovec_from_buf(>iov, 0, ptr, len);
>  }
>  
>  if (unlikely(bytes != len)) {
> @@ -338,8 +334,6 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, 
> uint32_t len,
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  }
>  
> -qemu_iovec_destroy();
> -
>  return status;
>  }
>  


Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky




Re: [PATCH v5 17/26] nvme: allow multiple aios per command

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> This refactors how the device issues asynchronous block backend
> requests. The NvmeRequest now holds a queue of NvmeAIOs that are
> associated with the command. This allows multiple aios to be issued for
> a command. Only when all requests have been completed will the device
> post a completion queue entry.
> 
> Because the device is currently guaranteed to only issue a single aio
> request per command, the benefit is not immediately obvious. But this
> functionality is required to support metadata, the dataset management
> command and other features.

I don't know what strategy will be chosen for supporting metadata
(QEMU doesn't have any notion of metadata in the block layer), but for dataset
management you are right. The Dataset Management command can contain a table of
ranges to discard (although in reality I have seen no driver put more than one
entry there).
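
For reference, each entry in that table is a 16-byte range descriptor, roughly
like this (my reading of the spec; this struct is not taken from the QEMU tree):

    struct dsm_range {
        uint32_t cattr; /* context attributes */
        uint32_t nlb;   /* number of logical blocks to discard */
        uint64_t slba;  /* starting LBA */
    };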


> 
> Signed-off-by: Klaus Jensen 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 449 +-
>  hw/block/nvme.h   | 134 +++--
>  hw/block/trace-events |   8 +
>  3 files changed, 480 insertions(+), 111 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 334265efb21e..e97da35c4ca1 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -19,7 +19,8 @@
>   *  -drive file=,if=none,id=
>   *  -device nvme,drive=,serial=,id=, \
>   *  cmb_size_mb=, \
> - *  num_queues=
> + *  num_queues=, \
> + *  mdts=

Could you split mdts checks into a separate patch? This is not related to the 
series.

>   *
>   * Note cmb_size_mb denotes size of CMB in MB. CMB is assumed to be at
>   * offset 0 in BAR2 and supports only WDS, RDS and SQS for now.
> @@ -57,6 +58,7 @@
>  } while (0)
>  
>  static void nvme_process_sq(void *opaque);
> +static void nvme_aio_cb(void *opaque, int ret);
>  
>  static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
>  {
> @@ -341,6 +343,107 @@ static uint16_t nvme_dma_prp(NvmeCtrl *n, uint8_t *ptr, 
> uint32_t len,
>  return status;
>  }
>  
> +static uint16_t nvme_map(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> +NvmeNamespace *ns = req->ns;
> +
> +uint32_t len = req->nlb << nvme_ns_lbads(ns);
> +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> +
> +return nvme_map_prp(n, >qsg, >iov, prp1, prp2, len, req);
> +}

Same here, this is another nice refactoring and it should be in a separate patch.

> +
> +static void nvme_aio_destroy(NvmeAIO *aio)
> +{
> +g_free(aio);
> +}
> +
> +static inline void nvme_req_register_aio(NvmeRequest *req, NvmeAIO *aio,
> +NvmeAIOOp opc)
> +{
> +aio->opc = opc;
> +
> +trace_nvme_dev_req_register_aio(nvme_cid(req), aio, blk_name(aio->blk),
> +aio->offset, aio->len, nvme_aio_opc_str(aio), req);
> +
> +if (req) {
> +QTAILQ_INSERT_TAIL(>aio_tailq, aio, tailq_entry);
> +}
> +}
> +
> +static void nvme_aio(NvmeAIO *aio)
The function name is not clear to me. Maybe change it to something like
nvme_submit_aio.
> +{
> +BlockBackend *blk = aio->blk;
> +BlockAcctCookie *acct = >acct;
> +BlockAcctStats *stats = blk_get_stats(blk);
> +
> +bool is_write, dma;
> +
> +switch (aio->opc) {
> +case NVME_AIO_OPC_NONE:
> +break;
> +
> +case NVME_AIO_OPC_FLUSH:
> +block_acct_start(stats, acct, 0, BLOCK_ACCT_FLUSH);
> +aio->aiocb = blk_aio_flush(blk, nvme_aio_cb, aio);
> +break;
> +
> +case NVME_AIO_OPC_WRITE_ZEROES:
> +block_acct_start(stats, acct, aio->len, BLOCK_ACCT_WRITE);
> +aio->aiocb = blk_aio_pwrite_zeroes(blk, aio->offset, aio->len,
> +BDRV_REQ_MAY_UNMAP, nvme_aio_cb, aio);
> +break;
> +
> +case NVME_AIO_OPC_READ:
> +case NVME_AIO_OPC_WRITE:

> +dma = aio->qsg != NULL;

This doesn't work.
aio->qsg is never NULL, since nvme_rw_aio sets it to &req->qsg,
which is then stored in aio->qsg by nvme_aio_new.

That is yet another reason I really don't like these parallel QEMUSGList
and QEMUIOVector. However, I see that a few other QEMU devices do this,
so it is probably a necessary evil.

Maybe what we could do is dma_memory_map the SG list
and then deal only with a QEMUIOVector. Virtio does this
(virtqueue_pop/virtqueue_push).
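
Just to illustrate the idea, a rough sketch (signatures from memory, not
checked; unmap-on-failure and error handling omitted; iov is assumed to be
initialized by the caller):

    static bool nvme_qsg_to_iov(QEMUSGList *qsg, QEMUIOVector *iov,
                                DMADirection dir)
    {
        for (int i = 0; i < qsg->nsg; i++) {
            dma_addr_t len = qsg->sg[i].len;
            void *p = dma_memory_map(qsg->as, qsg->sg[i].base, &len, dir);

            if (!p || len != qsg->sg[i].len) {
                return false;
            }

            /* from here on, only the iovec is needed */
            qemu_iovec_add(iov, p, len);
        }

        return true;
    }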


> +is_write = (aio->opc == NVME_AIO_OPC_WRITE);
> +
> +block_acct_start(stats, acct, aio->len,
> +is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
> +
> +if (dma) {
> +aio->aiocb = is_write ?
> +dma_blk_write(blk, aio->qsg, aio->offset,
> +BDRV_SECTOR_SIZE, nvme_aio_cb, aio) :
> +dma_blk_read(blk, aio->qsg, aio->offset,
> +BDRV_SECTOR_SIZE, nvme_aio_cb, aio);
> +
Extra space
> +return;
> +}
> +
> +aio->aiocb = is_write ?
> +   

Re: [PATCH v5 16/26] nvme: refactor prp mapping

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Refactor nvme_map_prp and allow PRPs to be located in the CMB. The logic
> ensures that if some of the PRP is in the CMB, all of it must be located
> there, as per the specification.

To be honest this looks not like refactoring but like a bugfix
(the old code was just assuming that if the first PRP entry is in the CMB, the rest are too).
> 
> Also combine nvme_dma_{read,write}_prp into a single nvme_dma_prp that
> takes an additional DMADirection parameter.

To be honest, 'nvme_dma_prp' was not a clear function name to me at first glance.
Could you rename it to nvme_dma_prp_rw or so? (Although even that doesn't quite
convey that it reads/writes data to/from the guest memory areas described by
the PRP list.)
Also, could you split this change into a new patch?

> 
> Signed-off-by: Klaus Jensen 
> Signed-off-by: Klaus Jensen 
Now you even use both of your addresses :-)

> ---
>  hw/block/nvme.c   | 245 +++---
>  hw/block/nvme.h   |   2 +-
>  hw/block/trace-events |   1 +
>  include/block/nvme.h  |   1 +
>  4 files changed, 160 insertions(+), 89 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 4acfc85b56a2..334265efb21e 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -58,6 +58,11 @@
>  
>  static void nvme_process_sq(void *opaque);
>  
> +static inline void *nvme_addr_to_cmb(NvmeCtrl *n, hwaddr addr)
> +{
> +return >cmbuf[addr - n->ctrl_mem.addr];
> +}

To my taste I would put this together with the patch that
added nvme_addr_is_cmb. I know that some people are against
this, citing the rule that code you add should also be used
in the same patch. Your call.

Regardless, I also prefer to put refactoring patches first in the series.

> +
>  static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
>  {
>  hwaddr low = n->ctrl_mem.addr;
> @@ -152,138 +157,187 @@ static void nvme_irq_deassert(NvmeCtrl *n, NvmeCQueue 
> *cq)
>  }
>  }
>  
> -static uint16_t nvme_map_prp(QEMUSGList *qsg, QEMUIOVector *iov, uint64_t 
> prp1,
> - uint64_t prp2, uint32_t len, NvmeCtrl *n)
> +static uint16_t nvme_map_prp(NvmeCtrl *n, QEMUSGList *qsg, QEMUIOVector *iov,
> +uint64_t prp1, uint64_t prp2, uint32_t len, NvmeRequest *req)

Split line alignment (it was correct before).
Also, while at the refactoring, it would be great to add some documentation
to this and a few more functions, since it's not immediately clear what they do.


>  {
>  hwaddr trans_len = n->page_size - (prp1 % n->page_size);
>  trans_len = MIN(len, trans_len);
>  int num_prps = (len >> n->page_bits) + 1;
> +uint16_t status = NVME_SUCCESS;
> +bool is_cmb = false;
> +bool prp_list_in_cmb = false;
> +
> +trace_nvme_dev_map_prp(nvme_cid(req), req->cmd.opcode, trans_len, len,
> +prp1, prp2, num_prps);
>  
>  if (unlikely(!prp1)) {
>  trace_nvme_dev_err_invalid_prp();
>  return NVME_INVALID_FIELD | NVME_DNR;
> -} else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
> -   prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
> -qsg->nsg = 0;
> +}
> +
> +if (nvme_addr_is_cmb(n, prp1)) {
> +is_cmb = true;
> +
>  qemu_iovec_init(iov, num_prps);
> -qemu_iovec_add(iov, (void *)>cmbuf[prp1 - n->ctrl_mem.addr], 
> trans_len);
> +
> +/*
> + * PRPs do not cross page boundaries, so if the start address (here,
> + * prp1) is within the CMB, it cannot cross outside the controller
> + * memory buffer range. This is ensured by
> + *
> + *   len = n->page_size - (addr % n->page_size)
> + *
> + * Thus, we can directly add to the iovec without risking an out of
> + * bounds access. This also holds for the remaining qemu_iovec_add
> + * calls.
> + */
> +qemu_iovec_add(iov, nvme_addr_to_cmb(n, prp1), trans_len);
>  } else {
>  pci_dma_sglist_init(qsg, >parent_obj, num_prps);
>  qemu_sglist_add(qsg, prp1, trans_len);
>  }
> +
>  len -= trans_len;
>  if (len) {
>  if (unlikely(!prp2)) {
>  trace_nvme_dev_err_invalid_prp2_missing();
> +status = NVME_INVALID_FIELD | NVME_DNR;
>  goto unmap;
>  }
> +
>  if (len > n->page_size) {
>  uint64_t prp_list[n->max_prp_ents];
>  uint32_t nents, prp_trans;
>  int i = 0;
>  
> +if (nvme_addr_is_cmb(n, prp2)) {
> +prp_list_in_cmb = true;
> +}
> +
>  nents = (len + n->page_size - 1) >> n->page_bits;
>  prp_trans = MIN(n->max_prp_ents, nents) * sizeof(uint64_t);
> -nvme_addr_read(n, prp2, (void *)prp_list, prp_trans);
> +nvme_addr_read(n, prp2, (void *) prp_list, prp_trans);
>  while (len != 0) {
>  uint64_t prp_ent = 

Re: [PATCH v4 0/4] qmp: Optionally run handlers in coroutines

2020-02-12 Thread Kevin Wolf
Am 05.02.2020 um 15:00 hat Kevin Wolf geschrieben:
> Am 21.01.2020 um 19:11 hat Kevin Wolf geschrieben:
> > Some QMP command handlers can block the main loop for a relatively long
> > time, for example because they perform some I/O. This is quite nasty.
> > Allowing such handlers to run in a coroutine where they can yield (and
> > therefore release the BQL) while waiting for an event such as I/O
> > completion solves the problem.
> > 
> > This series adds the infrastructure to allow this and switches
> > block_resize to run in a coroutine as a first example.
> > 
> > This is an alternative solution to Marc-André's "monitor: add
> > asynchronous command type" series.
> 
> Ping?

Ping.

Kevin




Re: [PATCH v2 2/4] virtio-scsi: default num_queues to -smp N

2020-02-12 Thread Stefan Hajnoczi
On Tue, Feb 11, 2020 at 11:31:17AM -0500, Michael S. Tsirkin wrote:
> On Tue, Feb 11, 2020 at 04:20:41PM +, Stefan Hajnoczi wrote:
> > On Mon, Feb 03, 2020 at 12:39:49PM +0100, Sergio Lopez wrote:
> > > On Mon, Feb 03, 2020 at 10:57:44AM +, Daniel P. Berrangé wrote:
> > > > On Mon, Feb 03, 2020 at 11:25:29AM +0100, Sergio Lopez wrote:
> > > > > On Thu, Jan 30, 2020 at 10:52:35AM +, Stefan Hajnoczi wrote:
> > > > > > On Thu, Jan 30, 2020 at 01:29:16AM +0100, Paolo Bonzini wrote:
> > > > > > > On 29/01/20 16:44, Stefan Hajnoczi wrote:
> > > > > > > > On Mon, Jan 27, 2020 at 02:10:31PM +0100, Cornelia Huck wrote:
> > > > > > > >> On Fri, 24 Jan 2020 10:01:57 +
> > > > > > > >> Stefan Hajnoczi  wrote:
> > I will create a 32 vCPU guest with 100 virtio-blk devices and verify
> > that enabling multi-queue is successful.
> 
> and that it's helpful for performance?

It may be a little while before the next revision of this patch series.
Testing reveals scalability problems when creating so many virtqueues
:).

I've measured boot time, memory consumption, and random read IOPS.  They
are all significantly worse (32 vCPUs, 24 GB RAM, 101 virtio-blk
devices, 32 queues/device).

Time to see what's going on and whether some general scalability
improvements are possible here before we enable multi-queue by default.

Stefan




Re: [PATCH v5 15/26] nvme: bump supported specification to 1.3

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Add new fields to the Identify Controller and Identify Namespace data
> structures according to NVM Express 1.3d.
> 
> NVM Express 1.3d requires the following additional features:
>   - addition of the Namespace Identification Descriptor List (CNS 03h)
> for the Identify command
>   - support for returning Command Sequence Error if a Set Features
> command is submitted for the Number of Queues feature after any I/O
> queues have been created.
>   - The addition of the Log Specific Field (LSP) in the Get Log Page
> command.

> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 57 ---
>  hw/block/nvme.h   |  1 +
>  hw/block/trace-events |  3 ++-
>  include/block/nvme.h  | 20 ++-
>  4 files changed, 71 insertions(+), 10 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 900732bb2f38..4acfc85b56a2 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -9,7 +9,7 @@
>   */
>  
>  /**
> - * Reference Specification: NVM Express 1.2.1
> + * Reference Specification: NVM Express 1.3d
>   *
>   *   https://nvmexpress.org/resources/specifications/
>   */
> @@ -43,7 +43,7 @@
>  #include "trace.h"
>  #include "nvme.h"
>  
> -#define NVME_SPEC_VER 0x00010201
> +#define NVME_SPEC_VER 0x00010300
>  #define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
>  #define NVME_TEMPERATURE 0x143
>  #define NVME_TEMPERATURE_WARNING 0x157
> @@ -735,6 +735,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  uint32_t dw12 = le32_to_cpu(cmd->cdw12);
>  uint32_t dw13 = le32_to_cpu(cmd->cdw13);
>  uint8_t  lid = dw10 & 0xff;
> +uint8_t  lsp = (dw10 >> 8) & 0xf;
>  uint8_t  rae = (dw10 >> 15) & 0x1;
>  uint32_t numdl, numdu;
>  uint64_t off, lpol, lpou;
> @@ -752,7 +753,7 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
>  
> -trace_nvme_dev_get_log(nvme_cid(req), lid, rae, len, off);
> +trace_nvme_dev_get_log(nvme_cid(req), lid, lsp, rae, len, off);
>  
>  switch (lid) {
>  case NVME_LOG_ERROR_INFO:
> @@ -863,6 +864,8 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
>  cq = g_malloc0(sizeof(*cq));
>  nvme_init_cq(cq, n, prp1, cqid, vector, qsize + 1,
>  NVME_CQ_FLAGS_IEN(qflags));
Code alignment on that '('
> +
> +n->qs_created = true;
This should also be done in nvme_create_sq.
>  return NVME_SUCCESS;
>  }
>  
> @@ -924,6 +927,47 @@ static uint16_t nvme_identify_ns_list(NvmeCtrl *n, 
> NvmeIdentify *c)
>  return ret;
>  }
>  
> +static uint16_t nvme_identify_ns_descr_list(NvmeCtrl *n, NvmeCmd *c)
> +{
> +static const int len = 4096;
The spec caps the Identify payload size at 4K,
so this should go to nvme.h.
> +
> +struct ns_descr {
> +uint8_t nidt;
> +uint8_t nidl;
> +uint8_t rsvd2[2];
> +uint8_t nid[16];
> +};
This struct is also defined by the spec, so it should
move to nvme.h as well.
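Something like this in nvme.h (names are only a suggestion):

    #define NVME_IDENTIFY_DATA_SIZE 4096

    typedef struct NvmeIdNsDescr {
        uint8_t nidt;
        uint8_t nidl;
        uint8_t rsvd2[2];
        uint8_t nid[16];
    } NvmeIdNsDescr;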

> +
> +uint32_t nsid = le32_to_cpu(c->nsid);
> +uint64_t prp1 = le64_to_cpu(c->prp1);
> +uint64_t prp2 = le64_to_cpu(c->prp2);
> +
> +struct ns_descr *list;
> +uint16_t ret;
> +
> +trace_nvme_dev_identify_ns_descr_list(nsid);
> +
> +if (unlikely(nsid == 0 || nsid > n->num_namespaces)) {
> +trace_nvme_dev_err_invalid_ns(nsid, n->num_namespaces);
> +return NVME_INVALID_NSID | NVME_DNR;
> +}
> +
> +/*
> + * Because the NGUID and EUI64 fields are 0 in the Identify Namespace 
> data
> + * structure, a Namespace UUID (nidt = 0x3) must be reported in the
> + * Namespace Identification Descriptor. Add a very basic Namespace UUID
> + * here.
A per-namespace uuid QEMU property would be very nice to have, so that each
namespace gets a uuid that is at least somewhat unique.
I think the Linux kernel might complain if it detects namespaces with duplicate
uuids.

> + */
> +list = g_malloc0(len);
> +list->nidt = 0x3;
> +list->nidl = 0x10;
Those should also be #defined in nvme.h
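E.g. (again, names are only a suggestion):

    #define NVME_NIDT_UUID 0x3
    #define NVME_NIDL_UUID 16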
> +*(uint32_t *) >nid[12] = cpu_to_be32(nsid);
> +
> +ret = nvme_dma_read_prp(n, (uint8_t *) list, len, prp1, prp2);
> +g_free(list);
> +return ret;
> +}
> +
>  static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
>  {
>  NvmeIdentify *c = (NvmeIdentify *)cmd;
> @@ -935,6 +979,8 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
>  return nvme_identify_ctrl(n, c);
>  case 0x02:
>  return nvme_identify_ns_list(n, c);
> +case 0x03:
The CNS values should be defined in nvme.h.
> +return nvme_identify_ns_descr_list(n, cmd);
>  default:
>  trace_nvme_dev_err_invalid_identify_cns(le32_to_cpu(c->cns));
>  return NVME_INVALID_FIELD | NVME_DNR;
> @@ -1133,6 +1179,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  

Re: [PATCH v5 14/26] nvme: make sure ncqr and nsqr is valid

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> 0x is not an allowed value for NCQR and NSQR in Set Features on
> Number of Queues.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 4 
>  1 file changed, 4 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 30c5b3e7a67d..900732bb2f38 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1133,6 +1133,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
>  break;
>  case NVME_NUMBER_OF_QUEUES:
> +if ((dw11 & 0x) == 0x || ((dw11 >> 16) & 0x) == 0x) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
Very minor nitpick: since this spec requirement is not obvious, a 
quote/reference to the spec
would be nice to have here. 
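E.g. something like this above the check (I don't have the exact section/figure
reference at hand):

    /*
     * NVM Express 1.3d, Set Features, Number of Queues: the value
     * 0xffff (65535) is not allowed for either NCQR or NSQR.
     */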

> +
>  trace_nvme_dev_setfeat_numq((dw11 & 0x) + 1,
>  ((dw11 >> 16) & 0x) + 1, n->params.num_queues - 1,
>  n->params.num_queues - 1);

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky




Re: [PATCH v5 13/26] nvme: additional tracing

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Add a trace call for nvme_enqueue_req_completion.
> 
> Also, streamline nvme_identify_ns and nvme_identify_ns_list. They do not
> need to repeat the command, it is already in the trace name.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 8 +---
>  hw/block/trace-events | 5 +++--
>  2 files changed, 8 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 3267ee2de47a..30c5b3e7a67d 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -320,6 +320,8 @@ static void nvme_post_cqes(void *opaque)
>  static void nvme_enqueue_req_completion(NvmeCQueue *cq, NvmeRequest *req)
>  {
>  assert(cq->cqid == req->sq->cqid);
> +trace_nvme_dev_enqueue_req_completion(nvme_cid(req), cq->cqid,
> +req->status);
Split line alignment on that '('

>  QTAILQ_REMOVE(>sq->out_req_list, req, entry);
>  QTAILQ_INSERT_TAIL(>req_list, req, entry);
>  timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
> @@ -895,7 +897,7 @@ static uint16_t nvme_identify_ns(NvmeCtrl *n, 
> NvmeIdentify *c)
>  prp1, prp2);
>  }
>  
> -static uint16_t nvme_identify_nslist(NvmeCtrl *n, NvmeIdentify *c)
> +static uint16_t nvme_identify_ns_list(NvmeCtrl *n, NvmeIdentify *c)
>  {
>  static const int data_len = 4 * KiB;
>  uint32_t min_nsid = le32_to_cpu(c->nsid);
> @@ -905,7 +907,7 @@ static uint16_t nvme_identify_nslist(NvmeCtrl *n, 
> NvmeIdentify *c)
>  uint16_t ret;
>  int i, j = 0;
>  
> -trace_nvme_dev_identify_nslist(min_nsid);
> +trace_nvme_dev_identify_ns_list(min_nsid);
>  
>  list = g_malloc0(data_len);
>  for (i = 0; i < n->num_namespaces; i++) {
> @@ -932,7 +934,7 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
>  case 0x01:
>  return nvme_identify_ctrl(n, c);
>  case 0x02:
> -return nvme_identify_nslist(n, c);
> +return nvme_identify_ns_list(n, c);
>  default:
>  trace_nvme_dev_err_invalid_identify_cns(le32_to_cpu(c->cns));
>  return NVME_INVALID_FIELD | NVME_DNR;
> diff --git a/hw/block/trace-events b/hw/block/trace-events
> index 4cf39961989d..f982ec1a3221 100644
> --- a/hw/block/trace-events
> +++ b/hw/block/trace-events
> @@ -39,8 +39,8 @@ nvme_dev_create_cq(uint64_t addr, uint16_t cqid, uint16_t 
> vector, uint16_t size,
>  nvme_dev_del_sq(uint16_t qid) "deleting submission queue sqid=%"PRIu16""
>  nvme_dev_del_cq(uint16_t cqid) "deleted completion queue, sqid=%"PRIu16""
>  nvme_dev_identify_ctrl(void) "identify controller"
> -nvme_dev_identify_ns(uint16_t ns) "identify namespace, nsid=%"PRIu16""
> -nvme_dev_identify_nslist(uint16_t ns) "identify namespace list, 
> nsid=%"PRIu16""
> +nvme_dev_identify_ns(uint32_t ns) "nsid %"PRIu32""
> +nvme_dev_identify_ns_list(uint32_t ns) "nsid %"PRIu32""
>  nvme_dev_getfeat(uint16_t cid, uint32_t fid) "cid %"PRIu16" fid 0x%"PRIx32""
>  nvme_dev_setfeat(uint16_t cid, uint32_t fid, uint32_t val) "cid %"PRIu16" 
> fid 0x%"PRIx32" val 0x%"PRIx32""
>  nvme_dev_getfeat_vwcache(const char* result) "get feature volatile write 
> cache, result=%s"
> @@ -54,6 +54,7 @@ nvme_dev_aer(uint16_t cid) "cid %"PRIu16""
>  nvme_dev_aer_aerl_exceeded(void) "aerl exceeded"
>  nvme_dev_aer_masked(uint8_t type, uint8_t mask) "type 0x%"PRIx8" mask 
> 0x%"PRIx8""
>  nvme_dev_aer_post_cqe(uint8_t typ, uint8_t info, uint8_t log_page) "type 
> 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
> +nvme_dev_enqueue_req_completion(uint16_t cid, uint16_t cqid, uint16_t 
> status) "cid %"PRIu16" cqid %"PRIu16" status 0x%"PRIx16""
>  nvme_dev_enqueue_event(uint8_t typ, uint8_t info, uint8_t log_page) "type 
> 0x%"PRIx8" info 0x%"PRIx8" lid 0x%"PRIx8""
>  nvme_dev_enqueue_event_noqueue(int queued) "queued %d"
>  nvme_dev_enqueue_event_masked(uint8_t typ) "type 0x%"PRIx8""

With alignment fixed:

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky




Re: [PATCH v5 12/26] nvme: add missing mandatory features

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Add support for returning a reasonable response to Get/Set Features of
> mandatory features.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 57 ---
>  hw/block/trace-events |  2 ++
>  include/block/nvme.h  |  3 ++-
>  3 files changed, 58 insertions(+), 4 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index a186d95df020..3267ee2de47a 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -1008,7 +1008,15 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  uint32_t dw11 = le32_to_cpu(cmd->cdw11);
>  uint32_t result;
>  
> +trace_nvme_dev_getfeat(nvme_cid(req), dw10);
> +
>  switch (dw10) {
> +case NVME_ARBITRATION:
> +result = cpu_to_le32(n->features.arbitration);
> +break;
> +case NVME_POWER_MANAGEMENT:
> +result = cpu_to_le32(n->features.power_mgmt);
> +break;
>  case NVME_TEMPERATURE_THRESHOLD:
>  result = 0;
>  
> @@ -1029,6 +1037,9 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  break;
>  }
>  
> +break;
> +case NVME_ERROR_RECOVERY:
> +result = cpu_to_le32(n->features.err_rec);
>  break;
>  case NVME_VOLATILE_WRITE_CACHE:
>  result = blk_enable_write_cache(n->conf.blk);

This is existing code, but I'd still like to point out that the endianness
conversion is missing.
Also, we need to think about whether we should flush when the write cache is
disabled. I don't know that area well enough yet.

> @@ -1041,6 +1052,19 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  break;
>  case NVME_TIMESTAMP:
>  return nvme_get_feature_timestamp(n, cmd);
> +case NVME_INTERRUPT_COALESCING:
> +result = cpu_to_le32(n->features.int_coalescing);
> +break;
> +case NVME_INTERRUPT_VECTOR_CONF:
> +if ((dw11 & 0x) > n->params.num_queues) {
Looks like it should be >= since the interrupt vector is not zero-based.
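I.e., something like:

    if ((dw11 & 0xffff) >= n->params.num_queues) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }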
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +result = cpu_to_le32(n->features.int_vector_config[dw11 & 0x]);
> +break;
> +case NVME_WRITE_ATOMICITY:
> +result = cpu_to_le32(n->features.write_atomicity);
> +break;
>  case NVME_ASYNCHRONOUS_EVENT_CONF:
>  result = cpu_to_le32(n->features.async_config);
>  break;
> @@ -1076,6 +1100,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  uint32_t dw10 = le32_to_cpu(cmd->cdw10);
>  uint32_t dw11 = le32_to_cpu(cmd->cdw11);
>  
> +trace_nvme_dev_setfeat(nvme_cid(req), dw10, dw11);
> +
>  switch (dw10) {
>  case NVME_TEMPERATURE_THRESHOLD:
>  if (NVME_TEMP_TMPSEL(dw11)) {
> @@ -1116,6 +1142,13 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  case NVME_ASYNCHRONOUS_EVENT_CONF:
>  n->features.async_config = dw11;
>  break;
> +case NVME_ARBITRATION:
> +case NVME_POWER_MANAGEMENT:
> +case NVME_ERROR_RECOVERY:
> +case NVME_INTERRUPT_COALESCING:
> +case NVME_INTERRUPT_VECTOR_CONF:
> +case NVME_WRITE_ATOMICITY:
> +return NVME_FEAT_NOT_CHANGABLE | NVME_DNR;
>  default:
>  trace_nvme_dev_err_invalid_setfeat(dw10);
>  return NVME_INVALID_FIELD | NVME_DNR;
> @@ -1689,6 +1722,21 @@ static void nvme_init_state(NvmeCtrl *n)
>  n->temperature = NVME_TEMPERATURE;
>  n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
>  n->starttime_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> +
> +/*
> + * There is no limit on the number of commands that the controller may
> + * launch at one time from a particular Submission Queue.
> + */
> +n->features.arbitration = 0x7;
A #define in nvme.h stating that 0x7 means no burst limit would be nice.
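E.g. (the name is just a suggestion):

    #define NVME_ARB_AB_NO_LIMIT 0x7

    n->features.arbitration = NVME_ARB_AB_NO_LIMIT;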

> +
> +n->features.int_vector_config = g_malloc0_n(n->params.num_queues,
> +sizeof(*n->features.int_vector_config));
> +
> +/* disable coalescing (not supported) */
> +for (int i = 0; i < n->params.num_queues; i++) {
> +n->features.int_vector_config[i] = i | (1 << 16);
Same here
> +}
> +
>  n->aer_reqs = g_new0(NvmeRequest *, n->params.aerl + 1);
>  }
>  
> @@ -1782,15 +1830,17 @@ static void nvme_init_ctrl(NvmeCtrl *n)
>  id->nn = cpu_to_le32(n->num_namespaces);
>  id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
>  
> +
> +if (blk_enable_write_cache(n->conf.blk)) {
> +id->vwc = 1;
> +}
> +
>  strcpy((char *) id->subnqn, "nqn.2019-08.org.qemu:");
>  pstrcat((char *) id->subnqn, sizeof(id->subnqn), n->params.serial);
>  
>  id->psd[0].mp = cpu_to_le16(0x9c4);
>  id->psd[0].enlat = cpu_to_le32(0x10);
>  id->psd[0].exlat = cpu_to_le32(0x4);
> -if 

Re: [PATCH v5 11/26] nvme: add support for the asynchronous event request command

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
> Section 5.2 ("Asynchronous Event Request command").
> 
> Mostly imported from Keith's qemu-nvme tree. Modified with a max number
> of queued events (controllable with the aer_max_queued device
> parameter). The spec states that the controller *should* retain
> events, so we do best effort here.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 167 +-
>  hw/block/nvme.h   |  14 +++-
>  hw/block/trace-events |   9 +++
>  include/block/nvme.h  |   8 +-
>  4 files changed, 191 insertions(+), 7 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 468c36918042..a186d95df020 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -325,6 +325,85 @@ static void nvme_enqueue_req_completion(NvmeCQueue *cq, 
> NvmeRequest *req)
>  timer_mod(cq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
>  }
>  
> +static void nvme_process_aers(void *opaque)
> +{
> +NvmeCtrl *n = opaque;
> +NvmeAsyncEvent *event, *next;
> +
> +trace_nvme_dev_process_aers(n->aer_queued);
> +
> +QTAILQ_FOREACH_SAFE(event, >aer_queue, entry, next) {
> +NvmeRequest *req;
> +NvmeAerResult *result;
> +
> +/* can't post cqe if there is nothing to complete */
> +if (!n->outstanding_aers) {
> +trace_nvme_dev_no_outstanding_aers();
> +break;
> +}
> +
> +/* ignore if masked (cqe posted, but event not cleared) */
> +if (n->aer_mask & (1 << event->result.event_type)) {
> +trace_nvme_dev_aer_masked(event->result.event_type, n->aer_mask);
> +continue;
> +}
> +
> +QTAILQ_REMOVE(>aer_queue, event, entry);
> +n->aer_queued--;
> +
> +n->aer_mask |= 1 << event->result.event_type;
> +n->outstanding_aers--;
> +
> +req = n->aer_reqs[n->outstanding_aers];
> +
> +result = (NvmeAerResult *) >cqe.result;
> +result->event_type = event->result.event_type;
> +result->event_info = event->result.event_info;
> +result->log_page = event->result.log_page;
> +g_free(event);
> +
> +req->status = NVME_SUCCESS;
> +
> +trace_nvme_dev_aer_post_cqe(result->event_type, result->event_info,
> +result->log_page);
> +
> +nvme_enqueue_req_completion(>admin_cq, req);
> +}
> +}
> +
> +static void nvme_enqueue_event(NvmeCtrl *n, uint8_t event_type,
> +uint8_t event_info, uint8_t log_page)
> +{
> +NvmeAsyncEvent *event;
> +
> +trace_nvme_dev_enqueue_event(event_type, event_info, log_page);
> +
> +if (n->aer_queued == n->params.aer_max_queued) {
> +trace_nvme_dev_enqueue_event_noqueue(n->aer_queued);
> +return;
> +}
> +
> +event = g_new(NvmeAsyncEvent, 1);
> +event->result = (NvmeAerResult) {
> +.event_type = event_type,
> +.event_info = event_info,
> +.log_page   = log_page,
> +};
> +
> +QTAILQ_INSERT_TAIL(>aer_queue, event, entry);
> +n->aer_queued++;
> +
> +nvme_process_aers(n);
> +}
> +
> +static void nvme_clear_events(NvmeCtrl *n, uint8_t event_type)
> +{
> +n->aer_mask &= ~(1 << event_type);
> +if (!QTAILQ_EMPTY(>aer_queue)) {
> +nvme_process_aers(n);
> +}
> +}
> +
>  static void nvme_rw_cb(void *opaque, int ret)
>  {
>  NvmeRequest *req = opaque;
> @@ -569,8 +648,8 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd *cmd)
>  return NVME_SUCCESS;
>  }
>  
> -static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> -uint64_t off, NvmeRequest *req)
> +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint8_t rae,
> +uint32_t buf_len, uint64_t off, NvmeRequest *req)
>  {
>  uint64_t prp1 = le64_to_cpu(cmd->prp1);
>  uint64_t prp2 = le64_to_cpu(cmd->prp2);
> @@ -619,6 +698,10 @@ static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd 
> *cmd, uint32_t buf_len,
>  smart.power_on_hours[0] = cpu_to_le64(
>  (((current_ms - n->starttime_ms) / 1000) / 60) / 60);
>  
> +if (!rae) {
> +nvme_clear_events(n, NVME_AER_TYPE_SMART);
> +}
> +
>  return nvme_dma_read_prp(n, (uint8_t *)  + off, trans_len, prp1,
>  prp2);
>  }
> @@ -671,13 +754,17 @@ static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  
>  switch (lid) {
>  case NVME_LOG_ERROR_INFO:
> +if (!rae) {
> +nvme_clear_events(n, NVME_AER_TYPE_ERROR);
> +}
> +
>  if (off) {
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
>  
>  return NVME_SUCCESS;
>  case NVME_LOG_SMART_INFO:
> -return nvme_smart_info(n, cmd, len, off, req);
> +return nvme_smart_info(n, cmd, rae, len, off, req);
>  case NVME_LOG_FW_SLOT_INFO:
>  return nvme_fw_log_info(n, cmd, len, off, req);
>  

Re: [PATCH v5 10/26] nvme: add support for the get log page command

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Add support for the Get Log Page command and basic implementations of
> the mandatory Error Information, SMART / Health Information and Firmware
> Slot Information log pages.
> 
> In violation of the specification, the SMART / Health Information log
> page does not persist information over the lifetime of the controller
> because the device has no place to store such persistent state.
Yea, not the end of the world.
> 
> Note that the LPA field in the Identify Controller data structure
> intentionally has bit 0 cleared because there is no namespace specific
> information in the SMART / Health information log page.
Makes sense.
> 
> Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
> Section 5.10 ("Get Log Page command").
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 122 +-
>  hw/block/nvme.h   |  10 
>  hw/block/trace-events |   2 +
>  include/block/nvme.h  |   2 +-
>  4 files changed, 134 insertions(+), 2 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index f72348344832..468c36918042 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -569,6 +569,123 @@ static uint16_t nvme_create_sq(NvmeCtrl *n, NvmeCmd 
> *cmd)
>  return NVME_SUCCESS;
>  }
>  
> +static uint16_t nvme_smart_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> +uint64_t off, NvmeRequest *req)
> +{
> +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> +uint32_t nsid = le32_to_cpu(cmd->nsid);
> +
> +uint32_t trans_len;
> +time_t current_ms;
> +uint64_t units_read = 0, units_written = 0, read_commands = 0,
> +write_commands = 0;
> +NvmeSmartLog smart;
> +BlockAcctStats *s;
> +
> +if (nsid && nsid != 0x) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +s = blk_get_stats(n->conf.blk);
> +
> +units_read = s->nr_bytes[BLOCK_ACCT_READ] >> BDRV_SECTOR_BITS;
> +units_written = s->nr_bytes[BLOCK_ACCT_WRITE] >> BDRV_SECTOR_BITS;
> +read_commands = s->nr_ops[BLOCK_ACCT_READ];
> +write_commands = s->nr_ops[BLOCK_ACCT_WRITE];
> +
> +if (off > sizeof(smart)) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +trans_len = MIN(sizeof(smart) - off, buf_len);
> +
> +memset(, 0x0, sizeof(smart));
> +
> +smart.data_units_read[0] = cpu_to_le64(units_read / 1000);
> +smart.data_units_written[0] = cpu_to_le64(units_written / 1000);
> +smart.host_read_commands[0] = cpu_to_le64(read_commands);
> +smart.host_write_commands[0] = cpu_to_le64(write_commands);
> +
> +smart.temperature[0] = n->temperature & 0xff;
> +smart.temperature[1] = (n->temperature >> 8) & 0xff;
> +
> +if ((n->temperature > n->features.temp_thresh_hi) ||
> +(n->temperature < n->features.temp_thresh_low)) {
> +smart.critical_warning |= NVME_SMART_TEMPERATURE;
> +}
> +
> +current_ms = qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL);
> +smart.power_on_hours[0] = cpu_to_le64(
> +(((current_ms - n->starttime_ms) / 1000) / 60) / 60);
> +
> +return nvme_dma_read_prp(n, (uint8_t *)  + off, trans_len, prp1,
> +prp2);
> +}
Looks OK.
> +
> +static uint16_t nvme_fw_log_info(NvmeCtrl *n, NvmeCmd *cmd, uint32_t buf_len,
> +uint64_t off, NvmeRequest *req)
> +{
> +uint32_t trans_len;
> +uint64_t prp1 = le64_to_cpu(cmd->prp1);
> +uint64_t prp2 = le64_to_cpu(cmd->prp2);
> +NvmeFwSlotInfoLog fw_log;
> +
> +if (off > sizeof(fw_log)) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +memset(_log, 0, sizeof(NvmeFwSlotInfoLog));
> +
> +trans_len = MIN(sizeof(fw_log) - off, buf_len);
> +
> +return nvme_dma_read_prp(n, (uint8_t *) _log + off, trans_len, prp1,
> +prp2);
> +}
Looks OK
> +
> +static uint16_t nvme_get_log(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> +uint32_t dw10 = le32_to_cpu(cmd->cdw10);
> +uint32_t dw11 = le32_to_cpu(cmd->cdw11);
> +uint32_t dw12 = le32_to_cpu(cmd->cdw12);
> +uint32_t dw13 = le32_to_cpu(cmd->cdw13);
> +uint8_t  lid = dw10 & 0xff;
> +uint8_t  rae = (dw10 >> 15) & 0x1;
> +uint32_t numdl, numdu;
> +uint64_t off, lpol, lpou;
> +size_t   len;
> +
> +numdl = (dw10 >> 16);
> +numdu = (dw11 & 0x);
> +lpol = dw12;
> +lpou = dw13;
> +
> +len = (((numdu << 16) | numdl) + 1) << 2;
> +off = (lpou << 32ULL) | lpol;
> +
> +if (off & 0x3) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}

Good. 
Note that there are plenty of other places in the driver that don't honor
such tiny formal bits of the spec, like for instance checking for the reserved
bits in commands.
> +
> +trace_nvme_dev_get_log(nvme_cid(req), lid, rae, len, off);
> +
> +switch (lid) {
> +case NVME_LOG_ERROR_INFO:
> +if (off) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +

Re: [PATCH v5 09/26] nvme: add temperature threshold feature

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> It might seem weird to implement this feature for an emulated device,
> but it is mandatory to support and the feature is useful for testing
> asynchronous event request support, which will be added in a later
> patch.

Absolutely, but as the old saying goes, rules are rules.
At least, in defense of the spec, making this mandatory
forced vendors to actually report some statistics about
the device in a neutral format, as opposed to yet another
vendor-proprietary thing (I am talking about the SMART log page).

> 
> Signed-off-by: Klaus Jensen 

I noticed that you sign off some patches with your @samsung.com email
and some with @cnexlabs.com. Is there a reason for that?


> ---
>  hw/block/nvme.c  | 50 
>  hw/block/nvme.h  |  2 ++
>  include/block/nvme.h |  7 ++-
>  3 files changed, 58 insertions(+), 1 deletion(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 81514eaef63a..f72348344832 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -45,6 +45,9 @@
>  
>  #define NVME_SPEC_VER 0x00010201
>  #define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
> +#define NVME_TEMPERATURE 0x143
> +#define NVME_TEMPERATURE_WARNING 0x157
> +#define NVME_TEMPERATURE_CRITICAL 0x175
>  
>  #define NVME_GUEST_ERR(trace, fmt, ...) \
>  do { \
> @@ -798,9 +801,31 @@ static uint16_t nvme_get_feature_timestamp(NvmeCtrl *n, 
> NvmeCmd *cmd)
>  static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
>  {
>  uint32_t dw10 = le32_to_cpu(cmd->cdw10);
> +uint32_t dw11 = le32_to_cpu(cmd->cdw11);
>  uint32_t result;
>  
>  switch (dw10) {
> +case NVME_TEMPERATURE_THRESHOLD:
> +result = 0;
> +
> +/*
> + * The controller only implements the Composite Temperature sensor, 
> so
> + * return 0 for all other sensors.
> + */
> +if (NVME_TEMP_TMPSEL(dw11)) {
> +break;
> +}
> +
> +switch (NVME_TEMP_THSEL(dw11)) {
> +case 0x0:
> +result = cpu_to_le16(n->features.temp_thresh_hi);
> +break;
> +case 0x1:
> +result = cpu_to_le16(n->features.temp_thresh_low);
> +break;
> +}
> +
> +break;
>  case NVME_VOLATILE_WRITE_CACHE:
>  result = blk_enable_write_cache(n->conf.blk);
>  trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
> @@ -845,6 +870,23 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  uint32_t dw11 = le32_to_cpu(cmd->cdw11);
>  
>  switch (dw10) {
> +case NVME_TEMPERATURE_THRESHOLD:
> +if (NVME_TEMP_TMPSEL(dw11)) {
> +break;
> +}
> +
> +switch (NVME_TEMP_THSEL(dw11)) {
> +case 0x0:
> +n->features.temp_thresh_hi = NVME_TEMP_TMPTH(dw11);
> +break;
> +case 0x1:
> +n->features.temp_thresh_low = NVME_TEMP_TMPTH(dw11);
> +break;
> +default:
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +break;
>  case NVME_VOLATILE_WRITE_CACHE:
>  blk_set_enable_write_cache(n->conf.blk, dw11 & 1);
>  break;
> @@ -1366,6 +1408,9 @@ static void nvme_init_state(NvmeCtrl *n)
>  n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
>  n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
>  n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
> +
> +n->temperature = NVME_TEMPERATURE;

This appears not to be used in the patch.
I think you should move that to the next patch that
adds the get log page support.

> +n->features.temp_thresh_hi = NVME_TEMPERATURE_WARNING;
>  }
>  
>  static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
> @@ -1447,6 +1492,11 @@ static void nvme_init_ctrl(NvmeCtrl *n)
>  id->acl = 3;
>  id->frmw = 7 << 1;
>  id->lpa = 1 << 0;
> +
> +/* recommended default value (~70 C) */
> +id->wctemp = cpu_to_le16(NVME_TEMPERATURE_WARNING);
> +id->cctemp = cpu_to_le16(NVME_TEMPERATURE_CRITICAL);
> +
>  id->sqes = (0x6 << 4) | 0x6;
>  id->cqes = (0x4 << 4) | 0x4;
>  id->nn = cpu_to_le32(n->num_namespaces);
> diff --git a/hw/block/nvme.h b/hw/block/nvme.h
> index a867bdfabafd..1518f32557a3 100644
> --- a/hw/block/nvme.h
> +++ b/hw/block/nvme.h
> @@ -108,6 +108,7 @@ typedef struct NvmeCtrl {
>  uint64_tirq_status;
>  uint64_thost_timestamp; /* Timestamp sent by the 
> host */
>  uint64_ttimestamp_set_qemu_clock_ms;/* QEMU clock time */
> +uint16_ttemperature;
>  
>  NvmeNamespace   *namespaces;
>  NvmeSQueue  **sq;
> @@ -115,6 +116,7 @@ typedef struct NvmeCtrl {
>  NvmeSQueue  admin_sq;
>  NvmeCQueue  admin_cq;
>  NvmeIdCtrl  id_ctrl;
> +NvmeFeatureVal  features;
>  } NvmeCtrl;
>  
>  static inline uint64_t nvme_ns_nlbas(NvmeCtrl *n, NvmeNamespace *ns)
> diff 

Re: [PATCH v5 08/26] nvme: refactor device realization

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> This patch splits up nvme_realize into multiple individual functions,
> each initializing a different subset of the device.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 175 +++-
>  hw/block/nvme.h |  21 ++
>  2 files changed, 133 insertions(+), 63 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index e1810260d40b..81514eaef63a 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -44,6 +44,7 @@
>  #include "nvme.h"
>  
>  #define NVME_SPEC_VER 0x00010201
> +#define NVME_MAX_QS PCI_MSIX_FLAGS_QSIZE
>  
>  #define NVME_GUEST_ERR(trace, fmt, ...) \
>  do { \
> @@ -1325,67 +1326,106 @@ static const MemoryRegionOps nvme_cmb_ops = {
>  },
>  };
>  
> -static void nvme_realize(PCIDevice *pci_dev, Error **errp)
> +static int nvme_check_constraints(NvmeCtrl *n, Error **errp)
>  {
> -NvmeCtrl *n = NVME(pci_dev);
> -NvmeIdCtrl *id = >id_ctrl;
> -
> -int i;
> -int64_t bs_size;
> -uint8_t *pci_conf;
> -
> -if (!n->params.num_queues) {
> -error_setg(errp, "num_queues can't be zero");
> -return;
> -}
> +NvmeParams *params = >params;
>  
>  if (!n->conf.blk) {
> -error_setg(errp, "drive property not set");
> -return;
> +error_setg(errp, "nvme: block backend not configured");
> +return 1;
As a matter of taste, negative values indicate errors and 0 is the success
value. In the Linux kernel this is even an official rule.
>  }
>  
> -bs_size = blk_getlength(n->conf.blk);
> -if (bs_size < 0) {
> -error_setg(errp, "could not get backing file size");
> -return;
> +if (!params->serial) {
> +error_setg(errp, "nvme: serial not configured");
> +return 1;
>  }
>  
> -if (!n->params.serial) {
> -error_setg(errp, "serial property not set");
> -return;
> +if ((params->num_queues < 1 || params->num_queues > NVME_MAX_QS)) {
> +error_setg(errp, "nvme: invalid queue configuration");
Maybe something like "nvme: invalid queue count specified, should be between 1 
and ..."?
> +return 1;
>  }
> +
> +return 0;
> +}
> +
> +static int nvme_init_blk(NvmeCtrl *n, Error **errp)
> +{
>  blkconf_blocksizes(>conf);
>  if (!blkconf_apply_backend_options(>conf, 
> blk_is_read_only(n->conf.blk),
> -   false, errp)) {
> -return;
> +false, errp)) {
> +return 1;
>  }
>  
> -pci_conf = pci_dev->config;
> -pci_conf[PCI_INTERRUPT_PIN] = 1;
> -pci_config_set_prog_interface(pci_dev->config, 0x2);
> -pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
> -pcie_endpoint_cap_init(pci_dev, 0x80);
> +return 0;
> +}
>  
> +static void nvme_init_state(NvmeCtrl *n)
> +{
>  n->num_namespaces = 1;
>  n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);

Isn't that wrong?
The first 4K of MMIO (0x1000) is the registers, followed by the doorbells,
and each queue's pair of doorbells takes 8 bytes (assuming the regular
doorbell stride), so n->params.num_queues + 1 should be the total number of
queues, and thus the 0x1004 should be 0x1000, IMHO.
I might be missing some rounding magic here though.
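
In other words, I would expect something like this (assuming a doorbell stride
of 0, i.e. one 4-byte SQ tail doorbell and one 4-byte CQ head doorbell per
queue, starting right after the 4 KiB register page):

    n->reg_size = pow2ceil(0x1000 + 2 * (n->params.num_queues + 1) * 4);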

> -n->ns_size = bs_size / (uint64_t)n->num_namespaces;
> -
>  n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
>  n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
>  n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
> +}
>  
> -memory_region_init_io(>iomem, OBJECT(n), _mmio_ops, n,
> -  "nvme", n->reg_size);
> +static void nvme_init_cmb(NvmeCtrl *n, PCIDevice *pci_dev)
> +{
> +NVME_CMBLOC_SET_BIR(n->bar.cmbloc, 2);
It would be nice to have a #define for the CMB BAR number.
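E.g. (the name is just a suggestion):

    #define NVME_CMB_BIR 2

    NVME_CMBLOC_SET_BIR(n->bar.cmbloc, NVME_CMB_BIR);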
> +NVME_CMBLOC_SET_OFST(n->bar.cmbloc, 0);
> +
> +NVME_CMBSZ_SET_SQS(n->bar.cmbsz, 1);
> +NVME_CMBSZ_SET_CQS(n->bar.cmbsz, 0);
> +NVME_CMBSZ_SET_LISTS(n->bar.cmbsz, 0);
> +NVME_CMBSZ_SET_RDS(n->bar.cmbsz, 1);
> +NVME_CMBSZ_SET_WDS(n->bar.cmbsz, 1);
> +NVME_CMBSZ_SET_SZU(n->bar.cmbsz, 2);
> +NVME_CMBSZ_SET_SZ(n->bar.cmbsz, n->params.cmb_size_mb);
> +
> +n->cmbloc = n->bar.cmbloc;
> +n->cmbsz = n->bar.cmbsz;
> +
> +n->cmbuf = g_malloc0(NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
> +memory_region_init_io(>ctrl_mem, OBJECT(n), _cmb_ops, n,
> +"nvme-cmb", NVME_CMBSZ_GETSIZE(n->bar.cmbsz));
> +pci_register_bar(pci_dev, NVME_CMBLOC_BIR(n->bar.cmbloc),
Same here, although since you read it back from the controller register here,
maybe leave it as is. For this kind of thing, though, I prefer to have a
#define and use it everywhere.

> +PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64 |
> +PCI_BASE_ADDRESS_MEM_PREFETCH, >ctrl_mem);
> +}
> +
> +static void nvme_init_pci(NvmeCtrl *n, PCIDevice *pci_dev)
> +{
> +uint8_t *pci_conf = 

Re: [PATCH v5 06/26] nvme: refactor nvme_addr_read

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Pull the controller memory buffer check to its own function. The check
> will be used on its own in later patches.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 18 +-
>  1 file changed, 13 insertions(+), 5 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index 9abf74da20f2..ba5089df9ece 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -54,14 +54,22 @@
>  
>  static void nvme_process_sq(void *opaque);
>  
> +static inline bool nvme_addr_is_cmb(NvmeCtrl *n, hwaddr addr)
> +{
> +hwaddr low = n->ctrl_mem.addr;
> +hwaddr hi  = n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size);
> +
> +return addr >= low && addr < hi;
> +}
> +
>  static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void *buf, int size)
>  {
> -if (n->cmbsz && addr >= n->ctrl_mem.addr &&
> -addr < (n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size))) {
> -memcpy(buf, (void *)>cmbuf[addr - n->ctrl_mem.addr], size);
> -} else {
> -pci_dma_read(>parent_obj, addr, buf, size);
> +if (n->cmbsz && nvme_addr_is_cmb(n, addr)) {
> +memcpy(buf, (void *) >cmbuf[addr - n->ctrl_mem.addr], size);
Nitpick:
I am not an expert on QEMU coding style, but I suspect there is an extra
space after that (void *).

Also note that in a later patch you fix a serious bug in this function: it
doesn't check that the whole range is in the CMB, only that the start of the
area is. I would move that fix here, or even to a separate patch.

> +return;
>  }
> +
> +pci_dma_read(>parent_obj, addr, buf, size);
>  }
>  
>  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)

Best regards,
Maxim Levitsky





Re: [PATCH v5 07/26] nvme: add support for the abort command

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Required for compliance with NVMe revision 1.2.1. See NVM Express 1.2.1,
> Section 5.1 ("Abort command").
> 
> The Abort command is a best effort command; for now, the device always
> fails to abort the given command.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 28 
>  1 file changed, 28 insertions(+)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index ba5089df9ece..e1810260d40b 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -731,6 +731,18 @@ static uint16_t nvme_identify(NvmeCtrl *n, NvmeCmd *cmd)
>  }
>  }
>  
> +static uint16_t nvme_abort(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
> +{
> +uint16_t sqid = le32_to_cpu(cmd->cdw10) & 0xffff;
> +
> +req->cqe.result = 1;
> +if (nvme_check_sqid(n, sqid)) {
> +return NVME_INVALID_FIELD | NVME_DNR;
> +}
> +
> +return NVME_SUCCESS;
> +}

Looks 100% up to spec.

In my nvme-mdev it looks like I implemented this wrongly, failing the command
with NVME_SC_ABORT_MISSING (which is defined in the kernel sources but appears
to be a reserved error code in the spec). Not that it matters much.

Also, unrelated to this but something I would like to point out (it applies
not only to this command but to all admin and I/O commands): the device should
check the various reserved fields in the command descriptor, which it
currently doesn't.

This is what I do:
https://gitlab.com/maximlevitsky/linux/blob/mdev-work-5.2/drivers/nvme/mdev/adm.c#L783
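A rough sketch of the kind of check meant here, using the NvmeCmd fields from
include/block/nvme.h (purely illustrative, not something from this series):

    /* e.g. for Abort, the NSID, MPTR and PRP fields are all reserved */
    if (cmd->nsid || cmd->mptr || cmd->prp1 || cmd->prp2) {
        return NVME_INVALID_FIELD | NVME_DNR;
    }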

> +
>  static inline void nvme_set_timestamp(NvmeCtrl *n, uint64_t ts)
>  {
>  trace_nvme_dev_setfeat_timestamp(ts);
> @@ -848,6 +860,7 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  trace_nvme_dev_err_invalid_setfeat(dw10);
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
> +
Nitpick: Unrelated whitespace change.
>  return NVME_SUCCESS;
>  }
>  
> @@ -864,6 +877,8 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, 
> NvmeRequest *req)
>  return nvme_create_cq(n, cmd);
>  case NVME_ADM_CMD_IDENTIFY:
>  return nvme_identify(n, cmd);
> +case NVME_ADM_CMD_ABORT:
> +return nvme_abort(n, cmd, req);
>  case NVME_ADM_CMD_SET_FEATURES:
>  return nvme_set_feature(n, cmd, req);
>  case NVME_ADM_CMD_GET_FEATURES:
> @@ -1377,6 +1392,19 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  id->ieee[2] = 0xb3;
>  id->ver = cpu_to_le32(NVME_SPEC_VER);
>  id->oacs = cpu_to_le16(0);
> +
> +/*
> + * Because the controller always completes the Abort command immediately,
> + * there can never be more than one concurrently executing Abort command,
> + * so this value is never used for anything. Note that there can easily 
> be
> + * many Abort commands in the queues, but they are not considered
> + * "executing" until processed by nvme_abort.
> + *
> + * The specification recommends a value of 3 for Abort Command Limit 
> (four
> + * concurrently outstanding Abort commands), so lets use that though it 
> is
> + * inconsequential.
> + */
> +id->acl = 3;
Yep.
>  id->frmw = 7 << 1;
>  id->lpa = 1 << 0;
>  id->sqes = (0x6 << 4) | 0x6;


Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky




Re: [PATCH] nbd: Fix regression with multiple meta contexts

2020-02-12 Thread Laurent Vivier
On 06/02/2020 at 18:38, Eric Blake wrote:
> Detected by a hang in the libnbd testsuite.  If a client requests
> multiple meta contexts (both base:allocation and qemu:dirty-bitmap:x)
> at the same time, our attempt to silence a false-positive warning
> about a potential uninitialized variable introduced botched logic: we
> were short-circuiting the second context, and never sending the
> NBD_REPLY_FLAG_DONE.  Combining two 'if' into one 'if/else' in
> bdf200a55 was wrong (I'm a bit embarrassed that such a change was my
> initial suggestion after the v1 patch, then I did not review the v2
> patch that actually got committed). Revert that, and instead silence
> the false positive warning by replacing 'return ret' with 'return 0'
> (the value it always has at that point in the code, even though it
> eluded the deduction abilities of the robot that reported the false
> positive).
> 
> Fixes: bdf200a5535
> Signed-off-by: Eric Blake 
> ---
> 
> It's never fun when a regression is caused by a patch taken through
> qemu-trivial, proving that the patch was not trivial after all.

Do you want this one to be merged through the trivial branch?

Thanks,
Laurent




Re: [PATCH v5 05/26] nvme: populate the mandatory subnqn and ver fields

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Required for compliance with NVMe revision 1.2.1 or later. See NVM
> Express 1.2.1, Section 5.11 ("Identify command"), Figure 90 and Section
> 7.9 ("NVMe Qualified Names").
> 
> This also bumps the supported version to 1.2.1.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 13 ++---
>  1 file changed, 10 insertions(+), 3 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index f05ebcce3f53..9abf74da20f2 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -9,9 +9,9 @@
>   */
>  
>  /**
> - * Reference Specs: http://www.nvmexpress.org, 1.2, 1.1, 1.0e
> + * Reference Specification: NVM Express 1.2.1
>   *
> - *  http://www.nvmexpress.org/resources/
> + *   https://nvmexpress.org/resources/specifications/
To be honest that redirects to https://nvmexpress.org/specifications/
Not a problem though.
>   */
>  
>  /**
> @@ -43,6 +43,8 @@
>  #include "trace.h"
>  #include "nvme.h"
>  
> +#define NVME_SPEC_VER 0x00010201
> +
>  #define NVME_GUEST_ERR(trace, fmt, ...) \
>  do { \
>  (trace_##trace)(__VA_ARGS__); \
> @@ -1365,6 +1367,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  id->ieee[0] = 0x00;
>  id->ieee[1] = 0x02;
>  id->ieee[2] = 0xb3;
> +id->ver = cpu_to_le32(NVME_SPEC_VER);
This is indeed a 1.2 addition.
>  id->oacs = cpu_to_le16(0);
>  id->frmw = 7 << 1;
>  id->lpa = 1 << 0;
> @@ -1372,6 +1375,10 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  id->cqes = (0x4 << 4) | 0x4;
>  id->nn = cpu_to_le32(n->num_namespaces);
>  id->oncs = cpu_to_le16(NVME_ONCS_WRITE_ZEROS | NVME_ONCS_TIMESTAMP);
> +
> +strcpy((char *) id->subnqn, "nqn.2019-08.org.qemu:");
> +pstrcat((char *) id->subnqn, sizeof(id->subnqn), n->params.serial);
Looks OK, this is the first format according to the spec.
> +
>  id->psd[0].mp = cpu_to_le16(0x9c4);
>  id->psd[0].enlat = cpu_to_le32(0x10);
>  id->psd[0].exlat = cpu_to_le32(0x4);
> @@ -1386,7 +1393,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  NVME_CAP_SET_CSS(n->bar.cap, 1);
>  NVME_CAP_SET_MPSMAX(n->bar.cap, 4);
>  
> -n->bar.vs = 0x00010200;
> +n->bar.vs = NVME_SPEC_VER;
>  n->bar.intmc = n->bar.intms = 0;
>  
>  if (n->params.cmb_size_mb) {

To be really pedantic, the 1.2 spec also requires at least:
  * wctemp and cctemp to be nonzero in Identify Controller (yes, reporting a
temperature for a virtual controller is silly)
  * NVME_ADM_CMD_GET_LOG_PAGE, with some mandatory log pages
  * NVME_ADM_CMD_SET_FEATURES/NVME_ADM_CMD_GET_FEATURES - the device currently
doesn't implement some mandatory features.

And there are probably more; this is what I can recall from my nvme-mdev.

However, I see that you implemented these in the following patches, so I
suggest you first add the patches that implement all those features, and only
then bump the NVMe version. Most of the features I mentioned were mandatory
even in version 1.0 of the spec, so the current implementation is not even
compliant with 1.0 IMHO.
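For reference, the VS register and id->ver pack the version as
(MJR << 16) | (MNR << 8) | TER, so 0x00010201 is 1.2.1 and the old 0x00010200
was 1.2.0. A tiny helper along these lines (the macro name is only a
suggestion, it is not part of the patch) would make that explicit:

    #define NVME_VER(mjr, mnr, ter) (((mjr) << 16) | ((mnr) << 8) | (ter))

    #define NVME_SPEC_VER NVME_VER(1, 2, 1)    /* 0x00010201 */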

Best regards,
Maxim Levitsky




Re: [PATCH v5 04/26] nvme: add missing fields in the identify data structures

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Not used by the device model but added for completeness. See NVM Express
> 1.2.1, Section 5.11 ("Identify command"), Figure 90 and Figure 93.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  include/block/nvme.h | 48 
>  1 file changed, 40 insertions(+), 8 deletions(-)
> 
> diff --git a/include/block/nvme.h b/include/block/nvme.h
> index 8fb941c6537c..d2f65e8fe496 100644
> --- a/include/block/nvme.h
> +++ b/include/block/nvme.h
> @@ -543,7 +543,13 @@ typedef struct NvmeIdCtrl {
>  uint8_t ieee[3];
>  uint8_t cmic;
>  uint8_t mdts;
> -uint8_t rsvd255[178];
> +uint16_t cntlid;
> +uint32_t ver;
> +uint32_t rtd3r;
> +uint32_t rtd3e;
> +uint32_t oaes;
> +uint32_t ctratt;
> +uint8_t rsvd100[156];
>  uint16_t oacs;
>  uint8_t acl;
>  uint8_t aerl;
> @@ -551,10 +557,22 @@ typedef struct NvmeIdCtrl {
>  uint8_t lpa;
>  uint8_t elpe;
>  uint8_t npss;
> -uint8_t rsvd511[248];
> +uint8_t avscc;
> +uint8_t apsta;
> +uint16_t wctemp;
> +uint16_t cctemp;
> +uint16_t mtfa;
> +uint32_t hmpre;
> +uint32_t hmmin;
> +uint8_t tnvmcap[16];
> +uint8_t unvmcap[16];
> +uint32_t rpmbs;
> +uint8_t rsvd316[4];
> +uint16_t kas;
> +uint8_t rsvd322[190];
>  uint8_t sqes;
>  uint8_t cqes;
> -uint16_t rsvd515;
> +uint16_t maxcmd;
>  uint32_t nn;
>  uint16_t oncs;
>  uint16_t fuses;
> @@ -562,8 +580,14 @@ typedef struct NvmeIdCtrl {
>  uint8_t vwc;
>  uint16_t awun;
>  uint16_t awupf;
> -uint8_t rsvd703[174];
> -uint8_t rsvd2047[1344];
> +uint8_t nvscc;
> +uint8_t rsvd531;
> +uint16_t acwu;
> +uint8_t rsvd534[2];
> +uint32_t sgls;
> +uint8_t rsvd540[228];
> +uint8_t subnqn[256];
> +uint8_t rsvd1024[1024];
>  NvmePSD psd[32];
>  uint8_t vs[1024];
>  } NvmeIdCtrl;
> @@ -653,13 +677,21 @@ typedef struct NvmeIdNs {
>  uint8_t mc;
>  uint8_t dpc;
>  uint8_t dps;
> -
>  uint8_t nmic;
>  uint8_t rescap;
>  uint8_t fpi;
>  uint8_t dlfeat;
> -
> -uint8_t res34[94];
> +uint8_t rsvd33;
This is wrong: nawun comes right after dlfeat.
> +uint16_t nawun;
> +uint16_t nawupf;
And here the error cancels out, since there should be a 'nacwu' field here.
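For reference, my reading of the Identify Namespace layout for bytes 34..47
(this is from the spec, not from this patch; dlfeat at byte 33 is a 1.3 field)
is:

    uint16_t nawun;     /* bytes 35:34 */
    uint16_t nawupf;    /* bytes 37:36 */
    uint16_t nacwu;     /* bytes 39:38 */
    uint16_t nabsn;     /* bytes 41:40 */
    uint16_t nabo;      /* bytes 43:42 */
    uint16_t nabspf;    /* bytes 45:44 */
    uint8_t  rsvd46[2]; /* bytes 47:46 */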
> +uint16_t nabsn;
> +uint16_t nabo;
> +uint16_t nabspf;
> +uint8_t rsvd46[2];
> +uint8_t nvmcap[16];
> +uint8_t rsvd64[40];
> +uint8_t nguid[16];
> +uint64_t eui64;
>  NvmeLBAF lbaf[16];
>  uint8_t res192[192];
Not related to the patch, but maybe rename this to rsvd192 for the sake of 
consistency?
>  uint8_t vs[3712];


I reviewed this patch by cross-referencing the NVMe structures as defined in
the kernel and in the spec.

I would prefer to merge this patch with all the other spec updates you do in
the following patches, bringing nvme.h up to date with 1.3d; that would make
it easier to review and remove some noise from the other patches.

Best regards,
Maxim Levitsky




Re: [PATCH v5 03/26] nvme: move device parameters to separate struct

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Move device configuration parameters to separate struct to make it
> explicit what is configurable and what is set internally.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 44 ++--
>  hw/block/nvme.h | 16 +---
>  2 files changed, 35 insertions(+), 25 deletions(-)
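For context, the params struct this patch introduces presumably looks roughly
like this (reconstructed from the usage below, not quoted from the patch):

    typedef struct NvmeParams {
        char     *serial;
        uint32_t num_queues;
        uint32_t cmb_size_mb;
    } NvmeParams;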
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index c9ad6aaa5f95..f05ebcce3f53 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -64,12 +64,12 @@ static void nvme_addr_read(NvmeCtrl *n, hwaddr addr, void 
> *buf, int size)
>  
>  static int nvme_check_sqid(NvmeCtrl *n, uint16_t sqid)
>  {
> -return sqid < n->num_queues && n->sq[sqid] != NULL ? 0 : -1;
> +return sqid < n->params.num_queues && n->sq[sqid] != NULL ? 0 : -1;
>  }
>  
>  static int nvme_check_cqid(NvmeCtrl *n, uint16_t cqid)
>  {
> -return cqid < n->num_queues && n->cq[cqid] != NULL ? 0 : -1;
> +return cqid < n->params.num_queues && n->cq[cqid] != NULL ? 0 : -1;
>  }
>  
>  static void nvme_inc_cq_tail(NvmeCQueue *cq)
> @@ -631,7 +631,7 @@ static uint16_t nvme_create_cq(NvmeCtrl *n, NvmeCmd *cmd)
>  trace_nvme_dev_err_invalid_create_cq_addr(prp1);
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
> -if (unlikely(vector > n->num_queues)) {
> +if (unlikely(vector > n->params.num_queues)) {
>  trace_nvme_dev_err_invalid_create_cq_vector(vector);
>  return NVME_INVALID_IRQ_VECTOR | NVME_DNR;
>  }
> @@ -783,7 +783,8 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  trace_nvme_dev_getfeat_vwcache(result ? "enabled" : "disabled");
>  break;
>  case NVME_NUMBER_OF_QUEUES:
> -result = cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 
> 16));
> +result = cpu_to_le32((n->params.num_queues - 2) |
> +((n->params.num_queues - 2) << 16));
Line wrapping issue.
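Presumably meaning the continuation line should be aligned, e.g. something
like (illustrative only):

    result = cpu_to_le32((n->params.num_queues - 2) |
                         ((n->params.num_queues - 2) << 16));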

>  trace_nvme_dev_getfeat_numq(result);
>  break;
>  case NVME_TIMESTAMP:
> @@ -826,9 +827,10 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  break;
>  case NVME_NUMBER_OF_QUEUES:
> -trace_nvme_dev_setfeat_numq((dw11 & 0xffff) + 1,
> -((dw11 >> 16) & 0xffff) + 1, n->num_queues - 1, n->num_queues - 
> 1);
> -req->cqe.result =
> -cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
> +((dw11 >> 16) & 0xffff) + 1, n->params.num_queues - 1,
> +n->params.num_queues - 1);
> +req->cqe.result = cpu_to_le32((n->params.num_queues - 2) |
> +((n->params.num_queues - 2) << 16));
Here as well, and there are probably more.
>  break;
>  case NVME_TIMESTAMP:
>  return nvme_set_feature_timestamp(n, cmd);
> @@ -899,12 +901,12 @@ static void nvme_clear_ctrl(NvmeCtrl *n)
>  
>  blk_drain(n->conf.blk);
>  
> -for (i = 0; i < n->num_queues; i++) {
> +for (i = 0; i < n->params.num_queues; i++) {
>  if (n->sq[i] != NULL) {
>  nvme_free_sq(n->sq[i], n);
>  }
>  }
> -for (i = 0; i < n->num_queues; i++) {
> +for (i = 0; i < n->params.num_queues; i++) {
>  if (n->cq[i] != NULL) {
>  nvme_free_cq(n->cq[i], n);
>  }
> @@ -1307,7 +1309,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  int64_t bs_size;
>  uint8_t *pci_conf;
>  
> -if (!n->num_queues) {
> +if (!n->params.num_queues) {
>  error_setg(errp, "num_queues can't be zero");
>  return;
>  }
> @@ -1323,7 +1325,7 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  return;
>  }
>  
> -if (!n->serial) {
> +if (!n->params.serial) {
>  error_setg(errp, "serial property not set");
>  return;
>  }
> @@ -1340,25 +1342,25 @@ static void nvme_realize(PCIDevice *pci_dev, Error 
> **errp)
>  pcie_endpoint_cap_init(pci_dev, 0x80);
>  
>  n->num_namespaces = 1;
> -n->reg_size = pow2ceil(0x1004 + 2 * (n->num_queues + 1) * 4);
> +n->reg_size = pow2ceil(0x1004 + 2 * (n->params.num_queues + 1) * 4);
>  n->ns_size = bs_size / (uint64_t)n->num_namespaces;
>  
>  n->namespaces = g_new0(NvmeNamespace, n->num_namespaces);
> -n->sq = g_new0(NvmeSQueue *, n->num_queues);
> -n->cq = g_new0(NvmeCQueue *, n->num_queues);
> +n->sq = g_new0(NvmeSQueue *, n->params.num_queues);
> +n->cq = g_new0(NvmeCQueue *, n->params.num_queues);
>  
>  memory_region_init_io(&n->iomem, OBJECT(n), &nvme_mmio_ops, n,
>  "nvme", n->reg_size);
>  pci_register_bar(pci_dev, 0,
>  PCI_BASE_ADDRESS_SPACE_MEMORY | PCI_BASE_ADDRESS_MEM_TYPE_64,
>  &n->iomem);
> -msix_init_exclusive_bar(pci_dev, n->num_queues, 4, NULL);
> +msix_init_exclusive_bar(pci_dev, n->params.num_queues, 4, NULL);
>  
>  id->vid = 

Re: [PATCH v5 02/26] nvme: remove superfluous breaks

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> These break statements were left over when commit 3036a626e9ef ("nvme:
> add Get/Set Feature Timestamp support") was merged.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c | 4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index dd548d9b6605..c9ad6aaa5f95 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -788,7 +788,6 @@ static uint16_t nvme_get_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  break;
>  case NVME_TIMESTAMP:
>  return nvme_get_feature_timestamp(n, cmd);
> -break;
>  default:
>  trace_nvme_dev_err_invalid_getfeat(dw10);
>  return NVME_INVALID_FIELD | NVME_DNR;
> @@ -831,11 +830,8 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd 
> *cmd, NvmeRequest *req)
>  req->cqe.result =
>  cpu_to_le32((n->num_queues - 2) | ((n->num_queues - 2) << 16));
>  break;
> -
>  case NVME_TIMESTAMP:
>  return nvme_set_feature_timestamp(n, cmd);
> -break;
> -
>  default:
>  trace_nvme_dev_err_invalid_setfeat(dw10);
>  return NVME_INVALID_FIELD | NVME_DNR;

Reviewed-by: Maxim Levitsky 

Best regards,
Maxim Levitsky




Re: [PATCH v5 01/26] nvme: rename trace events to nvme_dev

2020-02-12 Thread Maxim Levitsky
On Tue, 2020-02-04 at 10:51 +0100, Klaus Jensen wrote:
> Change the prefix of all nvme device related trace events to 'nvme_dev'
> to not clash with trace events from the nvme block driver.
> 
> Signed-off-by: Klaus Jensen 
> ---
>  hw/block/nvme.c   | 185 +-
>  hw/block/trace-events | 172 +++
>  2 files changed, 178 insertions(+), 179 deletions(-)
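For reference, the corresponding hw/block/trace-events hunk presumably looks
like this (illustrative; the exact format strings may differ):

    -nvme_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
    +nvme_dev_irq_msix(uint32_t vector) "raising MSI-X IRQ vector %u"
    -nvme_irq_pin(void) "pulsing IRQ pin"
    +nvme_dev_irq_pin(void) "pulsing IRQ pin"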
> 
> diff --git a/hw/block/nvme.c b/hw/block/nvme.c
> index d28335cbf377..dd548d9b6605 100644
> --- a/hw/block/nvme.c
> +++ b/hw/block/nvme.c
> @@ -112,16 +112,16 @@ static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
>  {
>  if (cq->irq_enabled) {
>  if (msix_enabled(&(n->parent_obj))) {
> -trace_nvme_irq_msix(cq->vector);
> +trace_nvme_dev_irq_msix(cq->vector);
>  msix_notify(&(n->parent_obj), cq->vector);
>  } else {
> -trace_nvme_irq_pin();
> +trace_nvme_dev_irq_pin();
>  assert(cq->cqid < 64);
>  n->irq_status |= 1 << cq->cqid;
>  nvme_irq_check(n);
>  }
>  } else {
> -trace_nvme_irq_masked();
> +trace_nvme_dev_irq_masked();
>  }
>  }
>  
> @@ -146,7 +146,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  int num_prps = (len >> n->page_bits) + 1;
>  
>  if (unlikely(!prp1)) {
> -trace_nvme_err_invalid_prp();
> +trace_nvme_dev_err_invalid_prp();
>  return NVME_INVALID_FIELD | NVME_DNR;
>  } else if (n->cmbsz && prp1 >= n->ctrl_mem.addr &&
> prp1 < n->ctrl_mem.addr + int128_get64(n->ctrl_mem.size)) {
> @@ -160,7 +160,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  len -= trans_len;
>  if (len) {
>  if (unlikely(!prp2)) {
> -trace_nvme_err_invalid_prp2_missing();
> +trace_nvme_dev_err_invalid_prp2_missing();
>  goto unmap;
>  }
>  if (len > n->page_size) {
> @@ -176,7 +176,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  
>  if (i == n->max_prp_ents - 1 && len > n->page_size) {
>  if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
> -trace_nvme_err_invalid_prplist_ent(prp_ent);
> +trace_nvme_dev_err_invalid_prplist_ent(prp_ent);
>  goto unmap;
>  }
>  
> @@ -189,7 +189,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  }
>  
>  if (unlikely(!prp_ent || prp_ent & (n->page_size - 1))) {
> -trace_nvme_err_invalid_prplist_ent(prp_ent);
> +trace_nvme_dev_err_invalid_prplist_ent(prp_ent);
>  goto unmap;
>  }
>  
> @@ -204,7 +204,7 @@ static uint16_t nvme_map_prp(QEMUSGList *qsg, 
> QEMUIOVector *iov, uint64_t prp1,
>  }
>  } else {
>  if (unlikely(prp2 & (n->page_size - 1))) {
> -trace_nvme_err_invalid_prp2_align(prp2);
> +trace_nvme_dev_err_invalid_prp2_align(prp2);
>  goto unmap;
>  }
>  if (qsg->nsg) {
> @@ -252,20 +252,20 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t 
> *ptr, uint32_t len,
>  QEMUIOVector iov;
>  uint16_t status = NVME_SUCCESS;
>  
> -trace_nvme_dma_read(prp1, prp2);
> +trace_nvme_dev_dma_read(prp1, prp2);
>  
>  if (nvme_map_prp(&qsg, &iov, prp1, prp2, len, n)) {
>  return NVME_INVALID_FIELD | NVME_DNR;
>  }
>  if (qsg.nsg > 0) {
>  if (unlikely(dma_buf_read(ptr, len, &qsg))) {
> -trace_nvme_err_invalid_dma();
> +trace_nvme_dev_err_invalid_dma();
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  }
>  qemu_sglist_destroy(&qsg);
>  } else {
>  if (unlikely(qemu_iovec_from_buf(&iov, 0, ptr, len) != len)) {
> -trace_nvme_err_invalid_dma();
> +trace_nvme_dev_err_invalid_dma();
>  status = NVME_INVALID_FIELD | NVME_DNR;
>  }
>  qemu_iovec_destroy(&iov);
> @@ -354,7 +354,7 @@ static uint16_t nvme_write_zeros(NvmeCtrl *n, 
> NvmeNamespace *ns, NvmeCmd *cmd,
>  uint32_t count = nlb << data_shift;
>  
>  if (unlikely(slba + nlb > ns->id_ns.nsze)) {
> -trace_nvme_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
> +trace_nvme_dev_err_invalid_lba_range(slba, nlb, ns->id_ns.nsze);
>  return NVME_LBA_RANGE | NVME_DNR;
>  }
>  
> @@ -382,11 +382,11 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, 
> NvmeCmd *cmd,
>  int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
>  enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
>  
> -trace_nvme_rw(is_write ? "write" : "read", nlb, data_size,