Re: [PATCH v2] virtio: vdpa: new SolidNET DPU driver.

2022-12-05 Thread Alvaro Karsz
Thanks
I'll send a new version.
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2] vdpasim: support doorbell mapping

2022-12-05 Thread Jason Wang
On Mon, Dec 5, 2022 at 9:52 PM Longpeng(Mike)  wrote:
>
> From: Longpeng 
>
> Support doorbell mapping for vdpasim devices, then we can test the notify
> passthrough feature even if there's no real hardware on hand.
>
> Allocates a dummy page which is used to emulate the notify page of the device,
> all VQs share the same notify register  that initiated to 0x. A  periodic
> work will check whether there're requests need to process ( the value of the
> notify register is 0x or not ).
>
> This cap is disabled as default, users can enable it as follow:
>   modprobe vdpa_sim notify_passthrough=true
>
> Signed-off-by: Longpeng 
> ---
> Changes v1->v2:
>   - support both kick mode and passthrough mode. [Jason]
>   - poll the notify register first. [Jason, Michael]
> ---
>  drivers/vdpa/vdpa_sim/vdpa_sim.c | 77 
>  drivers/vdpa/vdpa_sim/vdpa_sim.h |  3 ++
>  2 files changed, 80 insertions(+)
>
> diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c 
> b/drivers/vdpa/vdpa_sim/vdpa_sim.c
> index 7438a89ce939..07096a04dabb 100644
> --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c
> +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -36,9 +37,16 @@ module_param(max_iotlb_entries, int, 0444);
>  MODULE_PARM_DESC(max_iotlb_entries,
>  "Maximum number of iotlb entries for each address space. 0 
> means unlimited. (default: 2048)");
>
> +static bool notify_passthrough;
> +module_param(notify_passthrough, bool, 0444);
> +MODULE_PARM_DESC(notify_passthrough,
> +"Enable vq notify(doorbell) area mapping. (default: false)");

I'm not sure if it's worth doing this, I think we can afford the cost
of periodic work (especially considering it's a simulator).

> +
>  #define VDPASIM_QUEUE_ALIGN PAGE_SIZE
>  #define VDPASIM_QUEUE_MAX 256
>  #define VDPASIM_VENDOR_ID 0
> +#define VDPASIM_VRING_POLL_PERIOD 100 /* ms */
> +#define VDPASIM_NOTIFY_DEFVAL 0x
>
>  static struct vdpasim *vdpa_to_sim(struct vdpa_device *vdpa)
>  {
> @@ -246,6 +254,28 @@ static const struct dma_map_ops vdpasim_dma_ops = {
>  static const struct vdpa_config_ops vdpasim_config_ops;
>  static const struct vdpa_config_ops vdpasim_batch_config_ops;
>
> +static void vdpasim_notify_work(struct work_struct *work)
> +{
> +   struct vdpasim *vdpasim;
> +   u16 *val;
> +
> +   vdpasim = container_of(work, struct vdpasim, notify_work.work);
> +
> +   if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK))
> +   goto out;
> +
> +   if (!vdpasim->running)
> +   goto out;
> +
> +   val = (u16 *)vdpasim->notify;
> +   if (xchg(val, VDPASIM_NOTIFY_DEFVAL) != VDPASIM_NOTIFY_DEFVAL)
> +   schedule_work(&vdpasim->work);
> +
> +out:
> +   schedule_delayed_work(&vdpasim->notify_work,
> + msecs_to_jiffies(VDPASIM_VRING_POLL_PERIOD));
> +}
> +
>  struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr,
>const struct vdpa_dev_set_config *config)
>  {
> @@ -287,6 +317,18 @@ struct vdpasim *vdpasim_create(struct vdpasim_dev_attr 
> *dev_attr,
> set_dma_ops(dev, &vdpasim_dma_ops);
> vdpasim->vdpa.mdev = dev_attr->mgmt_dev;
>
> +   if (notify_passthrough) {
> +   INIT_DELAYED_WORK(&vdpasim->notify_work, vdpasim_notify_work);
> +
> +   vdpasim->notify = __get_free_page(GFP_KERNEL | __GFP_ZERO);
> +   if (!vdpasim->notify)
> +   goto err_iommu;
> +#ifdef CONFIG_X86
> +   set_memory_uc(vdpasim->notify, 1);
> +#endif

I had the same question with version 1, any reason for having this
part uncacheable? It's a hint that we have bugs somewhere. Are we
missing ACCESS/ORDER_PLATFORM or other features?

> +   *(u16 *)vdpasim->notify = VDPASIM_NOTIFY_DEFVAL;

WRITE_ONCE()?

> +   }
> +
> vdpasim->config = kzalloc(dev_attr->config_size, GFP_KERNEL);
> if (!vdpasim->config)
> goto err_iommu;
> @@ -495,6 +537,18 @@ static u8 vdpasim_get_status(struct vdpa_device *vdpa)
> return status;
>  }
>
> +static void vdpasim_set_notify_work(struct vdpasim *vdpasim, bool start)
> +{
> +   if (!notify_passthrough)
> +   return;

Only two callers for this function: one is start another is stop. If
we decide to get rid of notify_passthrough, I'd rather opencode the
schedule/cancel_delayed().

Thanks

> +
> +   if (start)
> +   schedule_delayed_work(&vdpasim->notify_work,
> + 
> msecs_to_jiffies(VDPASIM_VRING_POLL_PERIOD));
> +   else
> +   cancel_delayed_work_sync(&vdpasim->notify_work);
> +}
> +
>  static void vdpasim_set_status(struct vdpa_device *vdpa, u8 status)
>  {
> struct vdpasim *vdpasim = vdpa_to_sim(vdpa);
> @@ -502,12 +556,14 @@ static void vdpasim_set_status(struct vdpa_device 
> *

Re: [PATCH v2] vp_vdpa: harden the logic of set status

2022-12-05 Thread Jason Wang
On Tue, Dec 6, 2022 at 10:13 AM Longpeng(Mike)  wrote:
>
> From: Longpeng 
>
> 1. We should not set status to 0 when invoking vp_vdpa_set_status(),
>trigger a warning in that case.
>
> 2. The driver MUST wait for a read of device_status to return 0 before
>reinitializing the device. But we also don't want to keep us in an
>infinite loop forever, so wait for 5s if we try to reset the device.
>
> Signed-off-by: Longpeng 
> ---
> Changes v1->v2:
>   - use WARN_ON instead of BUG_ON. [Stefano]
>   - use "warning + failed" instead of "infinite loop". [Jason, Stefano]
>   - use usleep_range instead of msleep (checkpatch). [Longpeng]
>
> ---
>  drivers/vdpa/virtio_pci/vp_vdpa.c | 22 ++
>  1 file changed, 22 insertions(+)
>
> diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c 
> b/drivers/vdpa/virtio_pci/vp_vdpa.c
> index 13701c2a1963..a2d3b13ac646 100644
> --- a/drivers/vdpa/virtio_pci/vp_vdpa.c
> +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c
> @@ -214,6 +214,9 @@ static void vp_vdpa_set_status(struct vdpa_device *vdpa, 
> u8 status)
> struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa);
> u8 s = vp_vdpa_get_status(vdpa);
>
> +   /* We should never be setting status to 0. */
> +   WARN_ON(status == 0);
> +
> if (status & VIRTIO_CONFIG_S_DRIVER_OK &&
> !(s & VIRTIO_CONFIG_S_DRIVER_OK)) {
> vp_vdpa_request_irq(vp_vdpa);
> @@ -222,14 +225,33 @@ static void vp_vdpa_set_status(struct vdpa_device 
> *vdpa, u8 status)
> vp_modern_set_status(mdev, status);
>  }
>
> +#define VP_VDPA_RESET_TMOUT_MS 5000 /* 5s */
> +
>  static int vp_vdpa_reset(struct vdpa_device *vdpa, bool clear)
>  {
> struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa);
> struct virtio_pci_modern_device *mdev = vp_vdpa_to_mdev(vp_vdpa);
> u8 s = vp_vdpa_get_status(vdpa);
> +   unsigned long timeout;
>
> vp_modern_set_status(mdev, 0);
>
> +   /*
> +* As the virtio v1.1 spec (4.1.4.3.2) says: After writing 0 to
> +* device_status, the driver MUST wait for a read of device_status
> +* to return 0 before reinitializing the device.
> +* To avoid keep us here forever, we only wait for 5 seconds.

s/keep/keeping/

> +*/
> +   timeout = jiffies + msecs_to_jiffies(VP_VDPA_RESET_TMOUT_MS);
> +   while (vp_modern_get_status(mdev)) {
> +   usleep_range(1000, 1500);
> +   if (time_after(jiffies, timeout)) {
> +   dev_err(&mdev->pci_dev->dev,
> +   "vp_vdpa: fail to reset device\n");
> +   return -ETIMEDOUT;
> +   }

Any chance to use readx_poll_timeout() here?

Thanks

> +   }
> +
> if (s & VIRTIO_CONFIG_S_DRIVER_OK)
> vp_vdpa_free_irq(vp_vdpa);
>
> --
> 2.23.0
>

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH V2] vdpa: allow provisioning device features

2022-12-05 Thread Jason Wang
On Tue, Dec 6, 2022 at 9:43 AM Si-Wei Liu  wrote:
>
>
>
> On 12/4/2022 10:46 PM, Jason Wang wrote:
> > On Thu, Dec 1, 2022 at 8:53 AM Si-Wei Liu  wrote:
> >> Sorry for getting back late due to the snag of the holidays.
> > No worries :)
> >
> >> On 11/23/2022 11:13 PM, Jason Wang wrote:
> >>> On Thu, Nov 24, 2022 at 6:53 AM Si-Wei Liu  wrote:
> 
>  On 11/22/2022 7:35 PM, Jason Wang wrote:
> > On Wed, Nov 23, 2022 at 6:29 AM Si-Wei Liu  
> > wrote:
> >> On 11/16/2022 7:33 PM, Jason Wang wrote:
> >>> This patch allows device features to be provisioned via vdpa. This
> >>> will be useful for preserving migration compatibility between source
> >>> and destination:
> >>>
> >>> # vdpa dev add name dev1 mgmtdev pci/:02:00.0 device_features 
> >>> 0x30002
> >> Miss the actual "vdpa dev config show" command below
> > Right, let me fix that.
> >
> >>> # dev1: mac 52:54:00:12:34:56 link up link_announce false mtu 65535
> >>>   negotiated_features CTRL_VQ VERSION_1 ACCESS_PLATFORM
> >>>
> >>> Signed-off-by: Jason Wang 
> >>> ---
> >>> Changes since v1:
> >>> - Use uint64_t instead of __u64 for device_features
> >>> - Fix typos and tweak the manpage
> >>> - Add device_features to the help text
> >>> ---
> >>>  man/man8/vdpa-dev.8| 15 +++
> >>>  vdpa/include/uapi/linux/vdpa.h |  1 +
> >>>  vdpa/vdpa.c| 32 
> >>> +---
> >>>  3 files changed, 45 insertions(+), 3 deletions(-)
> >>>
> >>> diff --git a/man/man8/vdpa-dev.8 b/man/man8/vdpa-dev.8
> >>> index 9faf3838..43e5bf48 100644
> >>> --- a/man/man8/vdpa-dev.8
> >>> +++ b/man/man8/vdpa-dev.8
> >>> @@ -31,6 +31,7 @@ vdpa-dev \- vdpa device configuration
> >>>  .I NAME
> >>>  .B mgmtdev
> >>>  .I MGMTDEV
> >>> +.RI "[ device_features " DEVICE_FEATURES " ]"
> >>>  .RI "[ mac " MACADDR " ]"
> >>>  .RI "[ mtu " MTU " ]"
> >>>  .RI "[ max_vqp " MAX_VQ_PAIRS " ]"
> >>> @@ -74,6 +75,15 @@ Name of the new vdpa device to add.
> >>>  Name of the management device to use for device addition.
> >>>
> >>>  .PP
> >>> +.BI device_features " DEVICE_FEATURES"
> >>> +Specifies the virtio device features bit-mask that is provisioned 
> >>> for the new vdpa device.
> >>> +
> >>> +The bits can be found under include/uapi/linux/virtio*h.
> >>> +
> >>> +see macros such as VIRTIO_F_ and VIRTIO_XXX(e.g NET)_F_ for specific 
> >>> bit values.
> >>> +
> >>> +This is optional.
> >> Document the behavior when this attribute is missing? For e.g. inherit
> >> device features from parent device.
> > This is the current behaviour but unless we've found a way to mandate
> > it, I'd like to not mention it. Maybe add a description to say the
> > user needs to check the features after the add if features are not
> > specified.
>  Well, I think at least for live migration the mgmt software should get
>  to some consistent result between all vdpa parent drivers regarding
>  feature inheritance.
> >>> It would be hard. Especially for the device:
> >>>
> >>> 1) ask device_features from the device, in this case, new features
> >>> could be advertised after e.g a firmware update
> >> The consistency I meant is to always inherit all device features from
> >> the parent device for whatever it is capable of,
> > This looks fragile. How about the features that are mutually
> > exclusive? E.g FEATURE_X and FEATURE_Y that are both supported by the
> > mgmt?
> Hmmm, in theory, yes, it's a bit cumbersome. Is this for future proof,
> since so far as I see the virtio spec doesn't seem to define features
> that are mutually exclusive, and the way how driver should respond to
> mutually exclusive features in feature negotiation is completely undefined?

My understanding is that if a driver accepts two mutually exclusive
features it should be a bug.

But anyhow it's an example that it is not easy to have forward
compatibility if we mandating to inherit all features from the
management device.

>
> >
> >> since that was the only
> >> reasonable behavior pre-dated the device_features attribute, even though
> >> there's no mandatory check by the vdpa core. This way it's
> >> self-descriptive and consistent for the mgmt software to infer, as users
> >> can check into dev_features at the parent mgmtdev level to know what
> >> features will be ended up with after 'vdpa dev add'. I thought even
> >> though inheritance is not mandated as part of uAPI, it should at least
> >> be mentioned as a recommended guide line (for drivers in particular),
> >> especially this is the only reasonable behavior with nowhere to check
> >> what features are ended up after add (i.e. for now we can only set but
> >> not possible to read the exact device_features at vd

Re: [PATCH v2] virtio: vdpa: new SolidNET DPU driver.

2022-12-05 Thread Jason Wang
On Tue, Dec 6, 2022 at 12:55 AM Alvaro Karsz  wrote:
>
> > We must send a  SNET_MSG_DESTROY message to the DPU before removing the 
> > device.
> > This remove() makes sure that if somehow remove was called
> > before/without reset, the DPU will receive this message.
>
> I meant that the message must be sent before the PCI device is removed.
> The message is sent from the vDPA reset cb, and the reset in the PCI
> remove cb makes sure that PCI remove wasn't called without calling
> vDPA reset.
> I can remove it if you think that this is not required.

The reset is usually done during probe:

E.g register_virtio_device() or vhost_vdpa_open().

If it's not required, let's remove but if it is a must for your
hardware, we need keep it.

Thanks

>
>
> Alvaro
>

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH V2] vdpa: allow provisioning device features

2022-12-05 Thread Si-Wei Liu




On 12/4/2022 10:46 PM, Jason Wang wrote:

On Thu, Dec 1, 2022 at 8:53 AM Si-Wei Liu  wrote:

Sorry for getting back late due to the snag of the holidays.

No worries :)


On 11/23/2022 11:13 PM, Jason Wang wrote:

On Thu, Nov 24, 2022 at 6:53 AM Si-Wei Liu  wrote:


On 11/22/2022 7:35 PM, Jason Wang wrote:

On Wed, Nov 23, 2022 at 6:29 AM Si-Wei Liu  wrote:

On 11/16/2022 7:33 PM, Jason Wang wrote:

This patch allows device features to be provisioned via vdpa. This
will be useful for preserving migration compatibility between source
and destination:

# vdpa dev add name dev1 mgmtdev pci/:02:00.0 device_features 0x30002

Miss the actual "vdpa dev config show" command below

Right, let me fix that.


# dev1: mac 52:54:00:12:34:56 link up link_announce false mtu 65535
  negotiated_features CTRL_VQ VERSION_1 ACCESS_PLATFORM

Signed-off-by: Jason Wang 
---
Changes since v1:
- Use uint64_t instead of __u64 for device_features
- Fix typos and tweak the manpage
- Add device_features to the help text
---
 man/man8/vdpa-dev.8| 15 +++
 vdpa/include/uapi/linux/vdpa.h |  1 +
 vdpa/vdpa.c| 32 +---
 3 files changed, 45 insertions(+), 3 deletions(-)

diff --git a/man/man8/vdpa-dev.8 b/man/man8/vdpa-dev.8
index 9faf3838..43e5bf48 100644
--- a/man/man8/vdpa-dev.8
+++ b/man/man8/vdpa-dev.8
@@ -31,6 +31,7 @@ vdpa-dev \- vdpa device configuration
 .I NAME
 .B mgmtdev
 .I MGMTDEV
+.RI "[ device_features " DEVICE_FEATURES " ]"
 .RI "[ mac " MACADDR " ]"
 .RI "[ mtu " MTU " ]"
 .RI "[ max_vqp " MAX_VQ_PAIRS " ]"
@@ -74,6 +75,15 @@ Name of the new vdpa device to add.
 Name of the management device to use for device addition.

 .PP
+.BI device_features " DEVICE_FEATURES"
+Specifies the virtio device features bit-mask that is provisioned for the new 
vdpa device.
+
+The bits can be found under include/uapi/linux/virtio*h.
+
+see macros such as VIRTIO_F_ and VIRTIO_XXX(e.g NET)_F_ for specific bit 
values.
+
+This is optional.

Document the behavior when this attribute is missing? For e.g. inherit
device features from parent device.

This is the current behaviour but unless we've found a way to mandate
it, I'd like to not mention it. Maybe add a description to say the
user needs to check the features after the add if features are not
specified.

Well, I think at least for live migration the mgmt software should get
to some consistent result between all vdpa parent drivers regarding
feature inheritance.

It would be hard. Especially for the device:

1) ask device_features from the device, in this case, new features
could be advertised after e.g a firmware update

The consistency I meant is to always inherit all device features from
the parent device for whatever it is capable of,

This looks fragile. How about the features that are mutually
exclusive? E.g FEATURE_X and FEATURE_Y that are both supported by the
mgmt?
Hmmm, in theory, yes, it's a bit cumbersome. Is this for future proof, 
since so far as I see the virtio spec doesn't seem to define features 
that are mutually exclusive, and the way how driver should respond to 
mutually exclusive features in feature negotiation is completely undefined?





since that was the only
reasonable behavior pre-dated the device_features attribute, even though
there's no mandatory check by the vdpa core. This way it's
self-descriptive and consistent for the mgmt software to infer, as users
can check into dev_features at the parent mgmtdev level to know what
features will be ended up with after 'vdpa dev add'. I thought even
though inheritance is not mandated as part of uAPI, it should at least
be mentioned as a recommended guide line (for drivers in particular),
especially this is the only reasonable behavior with nowhere to check
what features are ended up after add (i.e. for now we can only set but
not possible to read the exact device_features at vdpa dev level, as yet).

I fully agree, but what I want to say is. Consider:

1) We've already had feature provisioning
2) It would be hard or even impossible to mandate the semantic
(consistency) of the features inheritance.

I'm fine with the doc, but the mgmt layer should not depend on this
and they should use feature provisioning instead.
OK, if it's for future proof to not mandate feature inheritance I think 
I see the point.





2) or have hierarchy architecture where several layers were placed
between vDPA and the real hardware

Not sure what it means but I don't get why extra layers are needed. Do
you mean extra layer to validate resulting features during add? Why vdpa
core is not the right place to do that?

Just want to go wild because we can't expect how many layers are below vDPA.

vDPA core is the right place but the validating should be done during
feature provisioning since it's much more easier than trying to
mandating code defined behaviour like inheritance.

OK, thanks for the clarifications.

Re: [PATCH v3] net: vmw_vsock: vmci: Check memcpy_from_msg()

2022-12-05 Thread Vishnu Dasa via Virtualization


> On Dec 5, 2022, at 3:52 AM, Artem Chernyshev  
> wrote:
> 
> vmci_transport_dgram_enqueue() does not check the return value
> of memcpy_from_msg(). Return with an error if the memcpy fails.

I think we can add some more information in the description.  Sorry, I
should've said this earlier.

vmci_transport_dgram_enqueue() does not check the return value
of memcpy_from_msg().  If memcpy_from_msg() fails, it is possible that
uninitialized memory contents are sent unintentionally instead of user's
message in the datagram to the destination.  Return with an error if
memcpy_from_msg() fails.

> 
> Found by Linux Verification Center (linuxtesting.org) with SVACE.
> 
> Fixes: 0f7db23a07af ("vmci_transport: switch ->enqeue_dgram, ->enqueue_stream 
> and ->dequeue_stream to msghdr")
> Signed-off-by: Artem Chernyshev 

Thanks, Artem!  This version looks good to me modulo my suggestion
about the description above.

Reviewed-by: Vishnu Dasa 

Regards,
Vishnu

> ---
> V1->V2 Fix memory leaking and updates for description
> V2->V3 Return the value of memcpy_from_msg()
> 
> net/vmw_vsock/vmci_transport.c | 6 +-
> 1 file changed, 5 insertions(+), 1 deletion(-)
> 
> diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
> index 842c94286d31..36eb16a40745 100644
> --- a/net/vmw_vsock/vmci_transport.c
> +++ b/net/vmw_vsock/vmci_transport.c
> @@ -1711,7 +1711,11 @@ static int vmci_transport_dgram_enqueue(
>   if (!dg)
>   return -ENOMEM;
> 
> - memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len);
> + err = memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len);
> + if (err) {
> + kfree(dg);
> + return err;
> + }
> 
>   dg->dst = vmci_make_handle(remote_addr->svm_cid,
>  remote_addr->svm_port);
> -- 
> 2.30.3
> 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v3] virtio_blk: add VIRTIO_BLK_F_LIFETIME feature support

2022-12-05 Thread Jens Axboe
On 12/5/22 1:29?PM, Michael S. Tsirkin wrote:
> On Mon, Dec 05, 2022 at 11:53:51AM -0700, Jens Axboe wrote:
>> On 12/5/22 11:36?AM, Alvaro Karsz wrote:
>>> Hi,
>>>
 Is this based on some spec? Because it looks pretty odd to me. There
 can be a pretty wide range of two/three/etc level cells with wildly
 different ranges of durability. And there's really not a lot of slc
 for generic devices these days, if any.
>>>
>>> Yes, this is based on the virtio spec
>>> https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html
>>> section  5.2.6
>>
>> And where did this come from?
> 
> 
> Here's the commit log from the spec:
>   In many embedded systems, virtio-blk implementations are
>   backed by eMMC or UFS storage devices, which are subject to
>   predictable and measurable wear over time due to repeated write
>   cycles.
> 
>   For such systems, it can be important to be able to track
>   accurately the amount of wear imposed on the storage over
>   time and surface it to applications. In a native deployments
>   this is generally handled by the physical block device driver
>   but no such provision is made in virtio-blk to expose these
>   metrics for devices where it makes sense to do so.
> 
>   This patch adds support to virtio-blk for lifetime and wear
>   metrics to be exposed to the guest when a deployment of
>   virtio-blk is done over compatible eMMC or UFS storage.
> 
>   Signed-off-by: Enrico Granata 
> 
> Cc Enrico Granata as well.

Alvaro sent me this one privately too. To be honest, I don't like this
patch very much, but that's mostly because the spec for this caters to a
narrow use case of embedded flash. It's not a generic storage thing,
it's just for mmc/ufs and the embedded space. That's a missed
opportunity. And either it'll become mostly unused, or it'll actually be
used and then extended at some point.

While I'm not a fan at all, if you guys are willing to take it in
virtio-blk, then I won't stand in your way. I would rename it though to
more specifically detail what it deals with.

-- 
Jens Axboe
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v3] virtio_blk: add VIRTIO_BLK_F_LIFETIME feature support

2022-12-05 Thread Michael S. Tsirkin
On Mon, Dec 05, 2022 at 11:53:51AM -0700, Jens Axboe wrote:
> On 12/5/22 11:36 AM, Alvaro Karsz wrote:
> > Hi,
> > 
> >> Is this based on some spec? Because it looks pretty odd to me. There
> >> can be a pretty wide range of two/three/etc level cells with wildly
> >> different ranges of durability. And there's really not a lot of slc
> >> for generic devices these days, if any.
> > 
> > Yes, this is based on the virtio spec
> > https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html
> > section  5.2.6
> 
> And where did this come from?


Here's the commit log from the spec:
In many embedded systems, virtio-blk implementations are
backed by eMMC or UFS storage devices, which are subject to
predictable and measurable wear over time due to repeated write
cycles.

For such systems, it can be important to be able to track
accurately the amount of wear imposed on the storage over
time and surface it to applications. In a native deployments
this is generally handled by the physical block device driver
but no such provision is made in virtio-blk to expose these
metrics for devices where it makes sense to do so.

This patch adds support to virtio-blk for lifetime and wear
metrics to be exposed to the guest when a deployment of
virtio-blk is done over compatible eMMC or UFS storage.

Signed-off-by: Enrico Granata 

Cc Enrico Granata as well.


> -- 
> Jens Axboe
> 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH v3] virtio_blk: add VIRTIO_BLK_F_LIFETIME feature support

2022-12-05 Thread Jens Axboe
On 12/5/22 11:36 AM, Alvaro Karsz wrote:
> Hi,
> 
>> Is this based on some spec? Because it looks pretty odd to me. There
>> can be a pretty wide range of two/three/etc level cells with wildly
>> different ranges of durability. And there's really not a lot of slc
>> for generic devices these days, if any.
> 
> Yes, this is based on the virtio spec
> https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html
> section  5.2.6

And where did this come from?

-- 
Jens Axboe


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH v3] virtio_blk: add VIRTIO_BLK_F_LIFETIME feature support

2022-12-05 Thread Alvaro Karsz
Hi,

> Is this based on some spec? Because it looks pretty odd to me. There
> can be a pretty wide range of two/three/etc level cells with wildly
> different ranges of durability. And there's really not a lot of slc
> for generic devices these days, if any.

Yes, this is based on the virtio spec
https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html
section  5.2.6
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v3] virtio_blk: add VIRTIO_BLK_F_LIFETIME feature support

2022-12-05 Thread Jens Axboe
On 12/5/22 9:20 AM, Alvaro Karsz wrote:
> Implement the VIRTIO_BLK_F_LIFETIME feature for VirtIO block devices.
> 
> This commit introduces a new ioctl command, VBLK_LIFETIME.
> 
> VBLK_LIFETIME ioctl asks for the block device to provide lifetime
> information by sending a VIRTIO_BLK_T_GET_LIFETIME command to the device.

s/VBLK_LIFETIME/VBLK_GET_LIFETIME

for the above.

-- 
Jens Axboe


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH v3] virtio_blk: add VIRTIO_BLK_F_LIFETIME feature support

2022-12-05 Thread Jens Axboe
On 12/5/22 9:20 AM, Alvaro Karsz wrote:
> Implement the VIRTIO_BLK_F_LIFETIME feature for VirtIO block devices.
> 
> This commit introduces a new ioctl command, VBLK_LIFETIME.
> 
> VBLK_LIFETIME ioctl asks for the block device to provide lifetime
> information by sending a VIRTIO_BLK_T_GET_LIFETIME command to the device.
> 
> lifetime information fields:
> 
> - pre_eol_info: specifies the percentage of reserved blocks that are
>   consumed.
>   optional values following virtio spec:
>   *) 0 - undefined.
>   *) 1 - normal, < 80% of reserved blocks are consumed.
>   *) 2 - warning, 80% of reserved blocks are consumed.
>   *) 3 - urgent, 90% of reserved blocks are consumed.
> 
> - device_lifetime_est_typ_a: this field refers to wear of SLC cells and
>is provided in increments of 10used, and so
>on, thru to 11 meaning estimated lifetime
>exceeded. All values above 11 are reserved.
> 
> - device_lifetime_est_typ_b: this field refers to wear of MLC cells and is
>provided with the same semantics as
>device_lifetime_est_typ_a.
> 
> The data received from the device will be sent as is to the user.
> No data check/decode is done by virtblk.

Is this based on some spec? Because it looks pretty odd to me. There
can be a pretty wide range of two/three/etc level cells with wildly
different ranges of durability. And there's really not a lot of slc
for generic devices these days, if any.

-- 
Jens Axboe


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization

Re: [PATCH v2] virtio: vdpa: new SolidNET DPU driver.

2022-12-05 Thread Alvaro Karsz
> We must send a  SNET_MSG_DESTROY message to the DPU before removing the 
> device.
> This remove() makes sure that if somehow remove was called
> before/without reset, the DPU will receive this message.

I meant that the message must be sent before the PCI device is removed.
The message is sent from the vDPA reset cb, and the reset in the PCI
remove cb makes sure that PCI remove wasn't called without calling
vDPA reset.
I can remove it if you think that this is not required.


Alvaro
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


[PATCH v3] virtio_blk: add VIRTIO_BLK_F_LIFETIME feature support

2022-12-05 Thread Alvaro Karsz
Implement the VIRTIO_BLK_F_LIFETIME feature for VirtIO block devices.

This commit introduces a new ioctl command, VBLK_LIFETIME.

VBLK_LIFETIME ioctl asks for the block device to provide lifetime
information by sending a VIRTIO_BLK_T_GET_LIFETIME command to the device.

lifetime information fields:

- pre_eol_info: specifies the percentage of reserved blocks that are
consumed.
optional values following virtio spec:
*) 0 - undefined.
*) 1 - normal, < 80% of reserved blocks are consumed.
*) 2 - warning, 80% of reserved blocks are consumed.
*) 3 - urgent, 90% of reserved blocks are consumed.

- device_lifetime_est_typ_a: this field refers to wear of SLC cells and
 is provided in increments of 10used, and so
 on, thru to 11 meaning estimated lifetime
 exceeded. All values above 11 are reserved.

- device_lifetime_est_typ_b: this field refers to wear of MLC cells and is
 provided with the same semantics as
 device_lifetime_est_typ_a.

The data received from the device will be sent as is to the user.
No data check/decode is done by virtblk.

Signed-off-by: Alvaro Karsz 
--
v2:
- Remove (void *) casting.
- Fix comments format in virtio_blk.h.
- Set ioprio value for legacy devices for REQ_OP_DRV_IN
  operations.

v3:
- Initialize struct virtio_blk_lifetime to 0 instead of memset.
- Rename ioctl from VBLK_LIFETIME to VBLK_GET_LIFETIME.
- Return EOPNOTSUPP insted of ENOTTY if ioctl is not supported.
- Check if process is CAP_SYS_ADMIN capable in lifetime ioctl.
- Check if vdev is not NULL before accessing it in lifetime ioctl.
--
---
 drivers/block/virtio_blk.c  | 106 ++--
 include/uapi/linux/virtio_blk.h |  32 ++
 2 files changed, 133 insertions(+), 5 deletions(-)

diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 19da5defd73..392d57fd6b7 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -101,6 +101,18 @@ static inline blk_status_t virtblk_result(struct 
virtblk_req *vbr)
}
 }
 
+static inline int virtblk_ioctl_result(struct virtblk_req *vbr)
+{
+   switch (vbr->status) {
+   case VIRTIO_BLK_S_OK:
+   return 0;
+   case VIRTIO_BLK_S_UNSUPP:
+   return -EOPNOTSUPP;
+   default:
+   return -EIO;
+   }
+}
+
 static inline struct virtio_blk_vq *get_virtio_blk_vq(struct blk_mq_hw_ctx 
*hctx)
 {
struct virtio_blk *vblk = hctx->queue->queuedata;
@@ -218,6 +230,7 @@ static blk_status_t virtblk_setup_cmd(struct virtio_device 
*vdev,
u32 type;
 
vbr->out_hdr.sector = 0;
+   vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
 
switch (req_op(req)) {
case REQ_OP_READ:
@@ -244,15 +257,14 @@ static blk_status_t virtblk_setup_cmd(struct 
virtio_device *vdev,
type = VIRTIO_BLK_T_SECURE_ERASE;
break;
case REQ_OP_DRV_IN:
-   type = VIRTIO_BLK_T_GET_ID;
-   break;
+   /* type is set in virtblk_get_id/virtblk_ioctl_lifetime */
+   return 0;
default:
WARN_ON_ONCE(1);
return BLK_STS_IOERR;
}
 
vbr->out_hdr.type = cpu_to_virtio32(vdev, type);
-   vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
 
if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES ||
type == VIRTIO_BLK_T_SECURE_ERASE) {
@@ -459,12 +471,16 @@ static int virtblk_get_id(struct gendisk *disk, char 
*id_str)
struct virtio_blk *vblk = disk->private_data;
struct request_queue *q = vblk->disk->queue;
struct request *req;
+   struct virtblk_req *vbr;
int err;
 
req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
if (IS_ERR(req))
return PTR_ERR(req);
 
+   vbr = blk_mq_rq_to_pdu(req);
+   vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID);
+
err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
if (err)
goto out;
@@ -508,6 +524,85 @@ static int virtblk_getgeo(struct block_device *bd, struct 
hd_geometry *geo)
return ret;
 }
 
+/* Get lifetime information from device */
+static int virtblk_ioctl_lifetime(struct virtio_blk *vblk, unsigned long arg)
+{
+   struct request_queue *q = vblk->disk->queue;
+   struct request *req = NULL;
+   struct virtblk_req *vbr;
+   struct virtio_blk_lifetime lifetime = {};
+   int ret;
+
+   if (!capable(CAP_SYS_ADMIN))
+   return -EPERM;
+
+   /* The virtio_blk_lifetime struct fields follow virtio spec.
+* There is no check/decode on values rece

Re: [RFC PATCH v3 4/4] test/vsock: vsock_perf utility

2022-12-05 Thread Stefano Garzarella

On Sun, Dec 04, 2022 at 07:24:33PM +, Arseniy Krasnov wrote:

This adds utility to check vsock rx/tx performance.

Usage as sender:
./vsock_perf --port  --mb  --port  --so_rcvlowat 

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/Makefile |   1 +
tools/testing/vsock/README   |  34 +++
tools/testing/vsock/vsock_perf.c | 449 +++
3 files changed, 484 insertions(+)
create mode 100644 tools/testing/vsock/vsock_perf.c

diff --git a/tools/testing/vsock/Makefile b/tools/testing/vsock/Makefile
index f8293c6910c9..d36fdd59fe2e 100644
--- a/tools/testing/vsock/Makefile
+++ b/tools/testing/vsock/Makefile
@@ -3,6 +3,7 @@ all: test
test: vsock_test vsock_diag_test
vsock_test: vsock_test.o timeout.o control.o util.o
vsock_diag_test: vsock_diag_test.o timeout.o control.o util.o
+vsock_perf: vsock_perf.o


I think we can add "vsock_perf" to "all" target.



CFLAGS += -g -O2 -Werror -Wall -I. -I../../include -I../../../usr/include 
-Wno-pointer-sign -fno-strict-overflow -fno-strict-aliasing -fno-common -MMD 
-U_FORTIFY_SOURCE -D_GNU_SOURCE
.PHONY: all test clean
diff --git a/tools/testing/vsock/README b/tools/testing/vsock/README
index 4d5045e7d2c3..64c5757b7ecc 100644
--- a/tools/testing/vsock/README
+++ b/tools/testing/vsock/README
@@ -35,3 +35,37 @@ Invoke test binaries in both directions as follows:
   --control-port=$GUEST_IP \
   --control-port=1234 \
   --peer-cid=3
+
+vsock_perf utility
+---
+'vsock_perf' is a simple tool to measure vsock performance. It works in
+sender/receiver modes: sender waits for connection at specified port,
+and after accepting it, starts data transmission to the receiver. After data
+processing is done, it prints several metrics(see below).
+
+Usage:
+# run as sender
+# listen port 1234, tx buffer size is 1MB, send of 1G data
+./vsock_perf --sender --port 1234 --buf-size 1MB --mb 1G
+
+Output:
+tx performance: A Gb/s
+
+Output explanation:
+A is calculated as "number of bytes to send" / "time in tx loop"
+
+# run as receiver
+# connect to CID 2, port 1234, rx buffer size is 1MB, peer buf is 1G, 
SO_RCVLOWAT is 65536
+./vsock_perf --cid 2 --port 1234 --buf-size 1MB --vsk-size 1G -so_rcvlowat 
65536
+
+Output:
+rx performance: A Gb/s
+total in 'read()': B sec
+POLLIN wakeups: C
+average in 'read()': D ns
+
+Output explanation:
+A is calculated as "number of received bytes" / "time in rx loop".
+B is time, spent in 'read()' system call(excluding 'poll()')
+C is number of 'poll()' wake ups with POLLIN bit set.
+D is B / C, e.g. average amount of time, spent in single 'read()'.
diff --git a/tools/testing/vsock/vsock_perf.c b/tools/testing/vsock/vsock_perf.c
new file mode 100644
index ..69e3b24868d7
--- /dev/null
+++ b/tools/testing/vsock/vsock_perf.c
@@ -0,0 +1,449 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * vsock_perf - benchmark utility for vsock.
+ *
+ * Copyright (C) 2022 SberDevices.
+ *
+ * Author: Arseniy Krasnov 
+ */
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define DEFAULT_BUF_SIZE_BYTES (128 * 1024)
+#define DEFAULT_TO_SEND_BYTES  (64 * 1024)
+#define DEFAULT_VSOCK_BUF_BYTES (256 * 1024)
+#define DEFAULT_RCVLOWAT_BYTES 1
+#define DEFAULT_PORT   1234
+#define DEFAULT_CID2
+
+#define BYTES_PER_GB   (1024 * 1024 * 1024ULL)
+#define NSEC_PER_SEC   (10ULL)
+
+static unsigned int port = DEFAULT_PORT;
+static unsigned long buf_size_bytes = DEFAULT_BUF_SIZE_BYTES;
+static unsigned long vsock_buf_bytes = DEFAULT_VSOCK_BUF_BYTES;
+
+static inline time_t current_nsec(void)
+{
+   struct timespec ts;
+
+   if (clock_gettime(CLOCK_REALTIME, &ts)) {
+   perror("clock_gettime");
+   exit(EXIT_FAILURE);
+   }
+
+   return (ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec;
+}
+
+/* From lib/cmdline.c. */
+static unsigned long memparse(const char *ptr)
+{
+   char *endptr;
+
+   unsigned long long ret = strtoull(ptr, &endptr, 0);
+
+   switch (*endptr) {
+   case 'E':
+   case 'e':
+   ret <<= 10;
+   case 'P':
+   case 'p':
+   ret <<= 10;
+   case 'T':
+   case 't':
+   ret <<= 10;
+   case 'G':
+   case 'g':
+   ret <<= 10;
+   case 'M':
+   case 'm':
+   ret <<= 10;
+   case 'K':
+   case 'k':
+   ret <<= 10;
+   endptr++;
+   default:
+   break;
+   }
+
+   return ret;
+}
+
+static void vsock_increase_buf_size(int fd)
+{
+   if (setsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_MAX_SIZE,
+  &vsock_buf_bytes, sizeof(vsock_buf_bytes))) {
+   perror("setsockopt");


As for the previous patch, I would add the opt that is failing to make 
it easier to analyze in case of failure.



+   

Re: [RFC PATCH v3 2/4] test/vsock: rework message bounds test

2022-12-05 Thread Stefano Garzarella

On Sun, Dec 04, 2022 at 07:20:52PM +, Arseniy Krasnov wrote:

This updates message bound test making it more complex. Instead of
sending 1 bytes messages with one MSG_EOR bit, it sends messages of
random length(one half of messages are smaller than page size, second
half are bigger) with random number of MSG_EOR bits set. Receiver
also don't know total number of messages.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/control.c|  28 +++
tools/testing/vsock/control.h|   2 +
tools/testing/vsock/util.c   |  13 
tools/testing/vsock/util.h   |   1 +
tools/testing/vsock/vsock_test.c | 124 +++
5 files changed, 155 insertions(+), 13 deletions(-)

diff --git a/tools/testing/vsock/control.c b/tools/testing/vsock/control.c
index 4874872fc5a3..d2deb4b15b94 100644
--- a/tools/testing/vsock/control.c
+++ b/tools/testing/vsock/control.c
@@ -141,6 +141,34 @@ void control_writeln(const char *str)
timeout_end();
}

+void control_writeulong(unsigned long value)
+{
+   char str[32];
+
+   if (snprintf(str, sizeof(str), "%lu", value) >= sizeof(str)) {
+   perror("snprintf");
+   exit(EXIT_FAILURE);
+   }
+
+   control_writeln(str);
+}
+
+unsigned long control_readulong(void)
+{
+   unsigned long value;
+   char *str;
+
+   str = control_readln();
+
+   if (!str)
+   exit(EXIT_FAILURE);
+
+   value = strtoul(str, NULL, 10);
+   free(str);
+
+   return value;
+}
+
/* Return the next line from the control socket (without the trailing newline).
 *
 * The program terminates if a timeout occurs.
diff --git a/tools/testing/vsock/control.h b/tools/testing/vsock/control.h
index 51814b4f9ac1..c1f77fdb2c7a 100644
--- a/tools/testing/vsock/control.h
+++ b/tools/testing/vsock/control.h
@@ -9,7 +9,9 @@ void control_init(const char *control_host, const char 
*control_port,
void control_cleanup(void);
void control_writeln(const char *str);
char *control_readln(void);
+unsigned long control_readulong(void);
void control_expectln(const char *str);
bool control_cmpln(char *line, const char *str, bool fail);
+void control_writeulong(unsigned long value);

#endif /* CONTROL_H */
diff --git a/tools/testing/vsock/util.c b/tools/testing/vsock/util.c
index 2acbb7703c6a..01b636d3039a 100644
--- a/tools/testing/vsock/util.c
+++ b/tools/testing/vsock/util.c
@@ -395,3 +395,16 @@ void skip_test(struct test_case *test_cases, size_t 
test_cases_len,

test_cases[test_id].skip = true;
}
+
+unsigned long hash_djb2(const void *data, size_t len)
+{
+   unsigned long hash = 5381;
+   int i = 0;
+
+   while (i < len) {
+   hash = ((hash << 5) + hash) + ((unsigned char *)data)[i];
+   i++;
+   }
+
+   return hash;
+}
diff --git a/tools/testing/vsock/util.h b/tools/testing/vsock/util.h
index a3375ad2fb7f..fb99208a95ea 100644
--- a/tools/testing/vsock/util.h
+++ b/tools/testing/vsock/util.h
@@ -49,4 +49,5 @@ void run_tests(const struct test_case *test_cases,
void list_tests(const struct test_case *test_cases);
void skip_test(struct test_case *test_cases, size_t test_cases_len,
   const char *test_id_str);
+unsigned long hash_djb2(const void *data, size_t len);
#endif /* UTIL_H */
diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index bb6d691cb30d..a5904ee39e91 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -284,10 +284,14 @@ static void test_stream_msg_peek_server(const struct 
test_opts *opts)
close(fd);
}

-#define MESSAGES_CNT 7
-#define MSG_EOR_IDX (MESSAGES_CNT / 2)
+#define SOCK_BUF_SIZE (2 * 1024 * 1024)
+#define MAX_MSG_SIZE (32 * 1024)
+
static void test_seqpacket_msg_bounds_client(const struct test_opts *opts)
{
+   unsigned long curr_hash;
+   int page_size;
+   int msg_count;
int fd;

fd = vsock_seqpacket_connect(opts->peer_cid, 1234);
@@ -296,18 +300,79 @@ static void test_seqpacket_msg_bounds_client(const struct 
test_opts *opts)
exit(EXIT_FAILURE);
}

-   /* Send several messages, one with MSG_EOR flag */
-   for (int i = 0; i < MESSAGES_CNT; i++)
-   send_byte(fd, 1, (i == MSG_EOR_IDX) ? MSG_EOR : 0);
+   /* Wait, until receiver sets buffer size. */
+   control_expectln("SRVREADY");
+
+   curr_hash = 0;
+   page_size = getpagesize();
+   msg_count = SOCK_BUF_SIZE / MAX_MSG_SIZE;
+
+   for (int i = 0; i < msg_count; i++) {
+   ssize_t send_size;
+   size_t buf_size;
+   int flags;
+   void *buf;
+
+   /* Use "small" buffers and "big" buffers. */
+   if (i & 1)
+   buf_size = page_size +
+   (rand() % (MAX_MSG_SIZE - page_size));
+   else
+   buf_size = 1 + (rand() % page_size);
+
+   buf = malloc

Re: [RFC PATCH v3 1/4] vsock: return errors other than -ENOMEM to socket

2022-12-05 Thread Stefano Garzarella

On Sun, Dec 04, 2022 at 07:19:20PM +, Arseniy Krasnov wrote:

From: Bobby Eshleman 

This removes behaviour, where error code returned from any
transport was always switched to ENOMEM.


I would add here the example you described in the cover letter with 
EMSGSIZE, so the problem is better explained.




Signed-off-by: Bobby Eshleman 
Signed-off-by: Arseniy Krasnov 
---
net/vmw_vsock/af_vsock.c | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 884eca7f6743..61ddab664c33 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1862,8 +1862,9 @@ static int vsock_connectible_sendmsg(struct socket *sock, 
struct msghdr *msg,
written = transport->stream_enqueue(vsk,
msg, len - total_written);
}
+
if (written < 0) {
-   err = -ENOMEM;
+   err = written;
goto out_err;
}

--
2.25.1


___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v3] net: vmw_vsock: vmci: Check memcpy_from_msg()

2022-12-05 Thread Stefano Garzarella

On Mon, Dec 05, 2022 at 02:52:00PM +0300, Artem Chernyshev wrote:

vmci_transport_dgram_enqueue() does not check the return value
of memcpy_from_msg(). Return with an error if the memcpy fails.

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Fixes: 0f7db23a07af ("vmci_transport: switch ->enqeue_dgram, ->enqueue_stream and 
->dequeue_stream to msghdr")
Signed-off-by: Artem Chernyshev 
---
V1->V2 Fix memory leaking and updates for description
V2->V3 Return the value of memcpy_from_msg()

net/vmw_vsock/vmci_transport.c | 6 +-
1 file changed, 5 insertions(+), 1 deletion(-)


Reviewed-by: Stefano Garzarella 

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v5] virtio/vsock: replace virtio_vsock_pkt with sk_buff

2022-12-05 Thread Stefano Garzarella

On Fri, Dec 02, 2022 at 09:35:18AM -0800, Bobby Eshleman wrote:

This commit changes virtio/vsock to use sk_buff instead of
virtio_vsock_pkt. Beyond better conforming to other net code, using
sk_buff allows vsock to use sk_buff-dependent features in the future
(such as sockmap) and improves throughput.

This patch introduces the following performance changes:

Tool/Config: uperf w/ 64 threads, SOCK_STREAM
Test Runs: 5, mean of results
Before: commit 95ec6bce2a0b ("Merge branch 'net-ipa-more-endpoints'")

Test: 64KB, g2h
Before: 21.63 Gb/s
After: 25.59 Gb/s (+18%)

Test: 16B, g2h
Before: 11.86 Mb/s
After: 17.41 Mb/s (+46%)

Test: 64KB, h2g
Before: 2.15 Gb/s
After: 3.6 Gb/s (+67%)

Test: 16B, h2g
Before: 14.38 Mb/s
After: 18.43 Mb/s (+28%)

Signed-off-by: Bobby Eshleman 
---
Changes in v5:
- last_skb instead of skb: last_hdr->len = cpu_to_le32(last_skb->len)


With this issue fixed, I confirm that all the tests passed:

Reviewed-by: Stefano Garzarella 


As pointed out in v4, this is net-next material, so you should use the 
net-next tag and base the patch on the net-next tree:

https://www.kernel.org/doc/html/v6.0/process/maintainer-netdev.html#netdev-faq

I locally applied the patch on net-next and everything is fine, so maybe 
the maintainers can apply it, otherwise you should resend it with the 
right tag.
Ah, in that case I suggest you send it before the next merge window 
opens (I guess next week), because net-next closes and you'll have to 
wait for the next cycle.


Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2] net: vmw_vsock: vmci: Check memcpy_from_msg()

2022-12-05 Thread Stefano Garzarella

On Sat, Dec 03, 2022 at 11:33:12AM +0300, Artem Chernyshev wrote:

vmci_transport_dgram_enqueue() does not check the return value
of memcpy_from_msg(). Return with an error if the memcpy fails.

Found by Linux Verification Center (linuxtesting.org) with SVACE.

Fixes: 0f7db23a07af ("vmci_transport: switch ->enqeue_dgram, ->enqueue_stream and 
->dequeue_stream to msghdr")
Signed-off-by: Artem Chernyshev 
---
V1->V2 Fix memory leaking and updates for description

net/vmw_vsock/vmci_transport.c | 5 -
1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 842c94286d31..c94c3deaa09d 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1711,7 +1711,10 @@ static int vmci_transport_dgram_enqueue(
if (!dg)
return -ENOMEM;

-   memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len);
+   if (memcpy_from_msg(VMCI_DG_PAYLOAD(dg), msg, len)) {
+   kfree(dg);
+   return -EFAULT;


Since memcpy_from_msg() is a wrapper of copy_from_iter_full() that 
simply returns -EFAULT in case of an error, perhaps it would be better 
here to return the value of memcpy_from_msg() instead of wiring the 
error.


However in the end the behavior is the same, so even if you don't want 
to change it I'll leave my R-b:


Reviewed-by: Stefano Garzarella 

Thanks,
Stefano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC PATCH v2 5/6] test/vsock: add big message test

2022-12-05 Thread Stefano Garzarella

On Thu, Dec 01, 2022 at 11:44:39AM +, Arseniy Krasnov wrote:

On 01.12.2022 12:45, Stefano Garzarella wrote:

On Fri, Nov 25, 2022 at 05:13:06PM +, Arseniy Krasnov wrote:

This adds test for sending message, bigger than peer's buffer size.
For SOCK_SEQPACKET socket it must fail, as this type of socket has
message size limit.

Signed-off-by: Arseniy Krasnov 
---
tools/testing/vsock/vsock_test.c | 69 
1 file changed, 69 insertions(+)

diff --git a/tools/testing/vsock/vsock_test.c b/tools/testing/vsock/vsock_test.c
index 12ef0cca6f93..a8e43424fb32 100644
--- a/tools/testing/vsock/vsock_test.c
+++ b/tools/testing/vsock/vsock_test.c
@@ -569,6 +569,70 @@ static void test_seqpacket_timeout_server(const struct 
test_opts *opts)
close(fd);
}

+static void test_seqpacket_bigmsg_client(const struct test_opts *opts)
+{
+    unsigned long sock_buf_size;
+    ssize_t send_size;
+    socklen_t len;
+    void *data;
+    int fd;
+
+    len = sizeof(sock_buf_size);
+
+    fd = vsock_seqpacket_connect(opts->peer_cid, 1234);
+    if (fd < 0) {
+    perror("connect");
+    exit(EXIT_FAILURE);
+    }
+
+    if (getsockopt(fd, AF_VSOCK, SO_VM_SOCKETS_BUFFER_SIZE,
+   &sock_buf_size, &len)) {
+    perror("getsockopt");
+    exit(EXIT_FAILURE);
+    }
+
+    sock_buf_size++;
+
+    data = malloc(sock_buf_size);
+    if (!data) {
+    perror("malloc");
+    exit(EXIT_FAILURE);
+    }
+
+    send_size = send(fd, data, sock_buf_size, 0);
+    if (send_size != -1) {
+    fprintf(stderr, "expected 'send(2)' failure, got %zi\n",
+    send_size);
+    exit(EXIT_FAILURE);
+    }
+
+    if (errno != EMSGSIZE) {
+    fprintf(stderr, "expected EMSGSIZE in 'errno', got %i\n",
+    errno);
+    exit(EXIT_FAILURE);
+    }


We should make sure that this is true for all transports, but since now only 
virtio-vsock supports it, we should be okay.

Hm, in general: I've tested this test suite for vmci may be several months ago, 
and found, that some tests
didn't work. I'm thinking about reworking this test suite a little bit: each 
transport must have own set of
tests for features that it supports. I had feeling, that all these tests are 
run only with virtio transport :)
Because for example SEQPACKET mode is suported only for virtio.


Yep, when we developed it, we added the "--skip" param for that.
Ideally there should be no difference, but I remember VMCI had a 
different behavior and we couldn't change it for backward compatibility, 
so we added "--skip".


Thanks,
Steano

___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2] virtio: vdpa: new SolidNET DPU driver.

2022-12-05 Thread Alvaro Karsz
Hi Jason,
Thanks for your comment.

> I wonder if vq_num and max_size_host_cfg is not about to be changed by
> the upper layer, any reason not fail the psnet_read_cfg()?

You're right, I can catch this error earlier in psnet_read_cfg.

> So during reset, the irqs were freed, but I don't see how they are
> allocated here.

The IRQs are allocated in probe and are freed in reset.
This assumes that the device won't be revived after reset.
I  will change it and request the IRQs in snet_set_status.

> Any reason for this reset in the remove()?

We must send a  SNET_MSG_DESTROY message to the DPU before removing the device.
This remove() makes sure that if somehow remove was called
before/without reset, the DPU will receive this message.


Alvaro
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [PATCH v2] virtio: vdpa: new SolidNET DPU driver.

2022-12-05 Thread Jason Wang


在 2022/11/29 22:33, Alvaro Karsz 写道:

This commit includes:
  1) The driver to manage the controlplane over vDPA bus.
  2) A HW monitor device to read health values from the DPU.

Signed-off-by: Alvaro Karsz 
--
v2:
- Auto detect the BAR used for communication.
- When reading the DPU config, wait for 5secs before giving up
  on the device.
- Return EOPNOTSUPP error code in vDPA set_vq_state callback if
  the vq state is not the same as the initial one.
- Implement a vDPA reset callback.
- Wait for an ACK when sending a message to the DPU.
- Add endianness comments on 64bit read/write functions.
- Remove the get_iova_range and free vDPA callbacks.
- Usage of managed device functions to ioremap a region.
- Call pci_set_drvdata and pci_set_master before
  vdpa_register_device.
- Create DMA isolation between the vDPA devices by using the
  chip SR-IOV feature.
  Every vDPA device gets a PCIe VF with its own DMA device.



Hi Alvaro:

Looks good to me overall, just few comments. See below.



---
  MAINTAINERS|4 +
  drivers/vdpa/Kconfig   |   11 +
  drivers/vdpa/Makefile  |1 +
  drivers/vdpa/solidrun/Makefile |3 +
  drivers/vdpa/solidrun/snet_hwmon.c |  191 +
  drivers/vdpa/solidrun/snet_main.c  | 1086 
  drivers/vdpa/solidrun/snet_vdpa.h  |  192 +
  7 files changed, 1488 insertions(+)
  create mode 100644 drivers/vdpa/solidrun/Makefile
  create mode 100644 drivers/vdpa/solidrun/snet_hwmon.c
  create mode 100644 drivers/vdpa/solidrun/snet_main.c
  create mode 100644 drivers/vdpa/solidrun/snet_vdpa.h

diff --git a/MAINTAINERS b/MAINTAINERS
index 046ff06ff97..ae425b3bed4 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -21725,6 +21725,10 @@ IFCVF VIRTIO DATA PATH ACCELERATOR
  R:Zhu Lingshan 
  F:drivers/vdpa/ifcvf/
  
+SNET DPU VIRTIO DATA PATH ACCELERATOR

+R: Alvaro Karsz 
+F: drivers/vdpa/solidrun/
+
  VIRTIO BALLOON
  M:"Michael S. Tsirkin" 
  M:David Hildenbrand 
diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig
index 50f45d03761..cb1ff0f9e27 100644
--- a/drivers/vdpa/Kconfig
+++ b/drivers/vdpa/Kconfig
@@ -86,4 +86,15 @@ config ALIBABA_ENI_VDPA
  VDPA driver for Alibaba ENI (Elastic Network Interface) which is 
built upon
  virtio 0.9.5 specification.
  
+ config SNET_VDPA

+   tristate "SolidRun's vDPA driver for SolidNET"
+   select HWMON
+   depends on PCI_MSI && PCI_IOV
+   help
+ vDPA driver for SolidNET DPU.
+ With this driver, the VirtIO dataplane can be
+ offloaded to a SolidNET DPU.
+ This driver includes a HW monitor device that
+ reads health values from the DPU.
+
  endif # VDPA
diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile
index 15665563a7f..59396ff2a31 100644
--- a/drivers/vdpa/Makefile
+++ b/drivers/vdpa/Makefile
@@ -6,3 +6,4 @@ obj-$(CONFIG_IFCVF)+= ifcvf/
  obj-$(CONFIG_MLX5_VDPA) += mlx5/
  obj-$(CONFIG_VP_VDPA)+= virtio_pci/
  obj-$(CONFIG_ALIBABA_ENI_VDPA) += alibaba/
+obj-$(CONFIG_SNET_VDPA) += solidrun/
diff --git a/drivers/vdpa/solidrun/Makefile b/drivers/vdpa/solidrun/Makefile
new file mode 100644
index 000..0acabd040fd
--- /dev/null
+++ b/drivers/vdpa/solidrun/Makefile
@@ -0,0 +1,3 @@
+# SPDX-License-Identifier: GPL-2.0
+obj-$(CONFIG_SNET_VDPA) += snet_vdpa.o
+snet_vdpa-$(CONFIG_SNET_VDPA) += snet_main.o snet_hwmon.o
diff --git a/drivers/vdpa/solidrun/snet_hwmon.c 
b/drivers/vdpa/solidrun/snet_hwmon.c
new file mode 100644
index 000..ec6d09f09c7
--- /dev/null
+++ b/drivers/vdpa/solidrun/snet_hwmon.c
@@ -0,0 +1,191 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * SolidRun DPU driver for control plane
+ *
+ * Copyright (C) 2022 SolidRun
+ *
+ * Author: Alvaro Karsz 
+ *
+ */
+
+#include 
+#include 
+
+#include "snet_vdpa.h"
+
+/* Monitor offsets */
+#define SNET_MON_TMP0_IN_OFF  0x00
+#define SNET_MON_TMP0_MAX_OFF 0x08
+#define SNET_MON_TMP0_CRIT_OFF0x10
+#define SNET_MON_TMP1_IN_OFF  0x18
+#define SNET_MON_TMP1_CRIT_OFF0x20
+#define SNET_MON_CURR_IN_OFF  0x28
+#define SNET_MON_CURR_MAX_OFF 0x30
+#define SNET_MON_CURR_CRIT_OFF0x38
+#define SNET_MON_PWR_IN_OFF   0x40
+#define SNET_MON_VOLT_IN_OFF  0x48
+#define SNET_MON_VOLT_CRIT_OFF0x50
+#define SNET_MON_VOLT_LCRIT_OFF   0x58
+
+static void snet_hwmon_read_reg(struct psnet *psnet, u32 reg, long *out)
+{
+   *out = (long)psnet_read64(psnet, psnet->cfg.hwmon_off + reg);
+}
+
+static umode_t snet_howmon_is_visible(const void *data,
+ enum hwmon_sensor_types type,
+ u32 attr, int channel)
+{
+   /* Files are read only */
+   return 0444;
+}
+
+static int snet_howmon_read(struct device *dev, enum hwmon_sensor_types type,
+   u32 attr,