Re: [PATCH v5 2/5] ACPI: Move IOMMU setup code out of IORT

2021-06-18 Thread Robin Murphy

On 2021-06-18 16:20, Jean-Philippe Brucker wrote:

Extract the code that sets up the IOMMU infrastructure from IORT, since
it can be reused by VIOT. Move it one level up into a new
acpi_iommu_configure_id() function, which calls the IORT parsing
function which in turn calls the acpi_iommu_fwspec_init() helper.
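
For reference, acpi_iommu_fwspec_init() is essentially arm_smmu_iort_xlate()
renamed and hoisted out of IORT (the removal is visible in the iort.c hunks
below; the scan.c side of this diff is truncated in this archive). A sketch of
the helper, reconstructed under that assumption:

    int acpi_iommu_fwspec_init(struct device *dev, u32 id,
                               struct fwnode_handle *fwnode,
                               const struct iommu_ops *ops)
    {
            /* Associate the IOMMU instance (fwnode) and its ops with @dev */
            int ret = iommu_fwspec_init(dev, fwnode, ops);

            /* Record the device's input ID (e.g. stream ID) in the fwspec */
            if (!ret)
                    ret = iommu_fwspec_add_ids(dev, &id, 1);

            return ret;
    }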


Reviewed-by: Robin Murphy 


Signed-off-by: Jean-Philippe Brucker 
---
  include/acpi/acpi_bus.h   |  3 ++
  include/linux/acpi_iort.h |  8 ++---
  drivers/acpi/arm64/iort.c | 74 +--
  drivers/acpi/scan.c   | 73 +-
  4 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 3a82faac5767..41f092a269f6 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -588,6 +588,9 @@ struct acpi_pci_root {
  
  bool acpi_dma_supported(struct acpi_device *adev);

  enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
+int acpi_iommu_fwspec_init(struct device *dev, u32 id,
+  struct fwnode_handle *fwnode,
+  const struct iommu_ops *ops);
  int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
   u64 *size);
  int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index f7f054833afd..f1f0842a2cb2 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -35,8 +35,7 @@ void acpi_configure_pmsi_domain(struct device *dev);
  int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
  /* IOMMU interface */
  int iort_dma_get_ranges(struct device *dev, u64 *size);
-const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
-   const u32 *id_in);
+int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
  int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
  phys_addr_t acpi_iort_dma_get_max_cpu_address(void);
  #else
@@ -50,9 +49,8 @@ static inline void acpi_configure_pmsi_domain(struct device *dev) { }
  /* IOMMU interface */
  static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
  { return -ENODEV; }
-static inline const struct iommu_ops *iort_iommu_configure_id(
- struct device *dev, const u32 *id_in)
-{ return NULL; }
+static inline int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
+{ return -ENODEV; }
  static inline
  int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
  { return 0; }
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index a940be1cf2af..487d1095030d 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -806,23 +806,6 @@ static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
return NULL;
  }
  
-static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev)
-{
-   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-
-   return (fwspec && fwspec->ops) ? fwspec->ops : NULL;
-}
-
-static inline int iort_add_device_replay(struct device *dev)
-{
-   int err = 0;
-
-   if (dev->bus && !device_iommu_mapped(dev))
-   err = iommu_probe_device(dev);
-
-   return err;
-}
-
  /**
   * iort_iommu_msi_get_resv_regions - Reserved region driver helper
   * @dev: Device from iommu_get_resv_regions()
@@ -900,18 +883,6 @@ static inline bool iort_iommu_driver_enabled(u8 type)
}
  }
  
-static int arm_smmu_iort_xlate(struct device *dev, u32 streamid,
-  struct fwnode_handle *fwnode,
-  const struct iommu_ops *ops)
-{
-   int ret = iommu_fwspec_init(dev, fwnode, ops);
-
-   if (!ret)
-   ret = iommu_fwspec_add_ids(dev, &streamid, 1);
-
-   return ret;
-}
-
  static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node)
  {
struct acpi_iort_root_complex *pci_rc;
@@ -946,7 +917,7 @@ static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node,
return iort_iommu_driver_enabled(node->type) ?
   -EPROBE_DEFER : -ENODEV;
  
-	return arm_smmu_iort_xlate(dev, streamid, iort_fwnode, ops);
+   return acpi_iommu_fwspec_init(dev, streamid, iort_fwnode, ops);
  }
  
  struct iort_pci_alias_info {

@@ -1020,24 +991,13 @@ static int iort_nc_iommu_map_id(struct device *dev,
   * @dev: device to configure
   * @id_in: optional input id const value pointer
   *
- * Returns: iommu_ops pointer on configuration success
- *  NULL on configuration failure
+ * Returns: 0 on success, <0 on failure
   */
-const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
-   const u32 *id_in)
+int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
  {
struct acpi_iort_node 

Re: [PATCH v5 1/5] ACPI: arm64: Move DMA setup operations out of IORT

2021-06-18 Thread Robin Murphy

On 2021-06-18 16:20, Jean-Philippe Brucker wrote:

Extract generic DMA setup code out of IORT, so it can be reused by VIOT.
Keep it in drivers/acpi/arm64 for now, since it could break x86
platforms that haven't run this code so far, if they have invalid
tables.


Reviewed-by: Robin Murphy 


Reviewed-by: Eric Auger 
Signed-off-by: Jean-Philippe Brucker 
---
  drivers/acpi/arm64/Makefile |  1 +
  include/linux/acpi.h|  3 +++
  include/linux/acpi_iort.h   |  6 ++---
  drivers/acpi/arm64/dma.c| 50 ++
  drivers/acpi/arm64/iort.c   | 54 ++---
  drivers/acpi/scan.c |  2 +-
  6 files changed, 66 insertions(+), 50 deletions(-)
  create mode 100644 drivers/acpi/arm64/dma.c

diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile
index 6ff50f4ed947..66acbe77f46e 100644
--- a/drivers/acpi/arm64/Makefile
+++ b/drivers/acpi/arm64/Makefile
@@ -1,3 +1,4 @@
  # SPDX-License-Identifier: GPL-2.0-only
  obj-$(CONFIG_ACPI_IORT)   += iort.o
  obj-$(CONFIG_ACPI_GTDT)   += gtdt.o
+obj-y  += dma.o
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c60745f657e9..7aaa9559cc19 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -259,9 +259,12 @@ void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa);
  
  #ifdef CONFIG_ARM64

  void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa);
+void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size);
  #else
  static inline void
  acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
+static inline void
+acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { }
  #endif
  
  int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma);

diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index 1a12baa58e40..f7f054833afd 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -34,7 +34,7 @@ struct irq_domain *iort_get_device_domain(struct device *dev, u32 id,
  void acpi_configure_pmsi_domain(struct device *dev);
  int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
  /* IOMMU interface */
-void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size);
+int iort_dma_get_ranges(struct device *dev, u64 *size);
  const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
const u32 *id_in);
  int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
@@ -48,8 +48,8 @@ static inline struct irq_domain *iort_get_device_domain(
  { return NULL; }
  static inline void acpi_configure_pmsi_domain(struct device *dev) { }
  /* IOMMU interface */
-static inline void iort_dma_setup(struct device *dev, u64 *dma_addr,
- u64 *size) { }
+static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
+{ return -ENODEV; }
  static inline const struct iommu_ops *iort_iommu_configure_id(
  struct device *dev, const u32 *id_in)
  { return NULL; }
diff --git a/drivers/acpi/arm64/dma.c b/drivers/acpi/arm64/dma.c
new file mode 100644
index ..f16739ad3cc0
--- /dev/null
+++ b/drivers/acpi/arm64/dma.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/acpi.h>
+#include <linux/acpi_iort.h>
+#include <linux/device.h>
+#include <linux/dma-direct.h>
+
+void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size)
+{
+   int ret;
+   u64 end, mask;
+   u64 dmaaddr = 0, size = 0, offset = 0;
+
+   /*
+* If @dev is expected to be DMA-capable then the bus code that created
+* it should have initialised its dma_mask pointer by this point. For
+* now, we'll continue the legacy behaviour of coercing it to the
+* coherent mask if not, but we'll no longer do so quietly.
+*/
+   if (!dev->dma_mask) {
+   dev_warn(dev, "DMA mask not set\n");
+   dev->dma_mask = &dev->coherent_dma_mask;
+   }
+
+   if (dev->coherent_dma_mask)
+   size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1);
+   else
+   size = 1ULL << 32;
+
+   ret = acpi_dma_get_range(dev, &dmaaddr, &offset, &size);
+   if (ret == -ENODEV)
+   ret = iort_dma_get_ranges(dev, &size);
+   if (!ret) {
+   /*
+* Limit coherent and dma mask based on size retrieved from
+* firmware.
+*/
+   end = dmaaddr + size - 1;
+   mask = DMA_BIT_MASK(ilog2(end) + 1);
+   dev->bus_dma_limit = end;
+   dev->coherent_dma_mask = min(dev->coherent_dma_mask, mask);
+   *dev->dma_mask = min(*dev->dma_mask, mask);
+   }
+
+   *dma_addr = dmaaddr;
+   *dma_size = size;
+
+   ret = dma_direct_set_offset(dev, dmaaddr + offset, dmaaddr, size);
+
+   dev_dbg(dev, "dma_offset(%#08llx)%s\n", offset, ret ? " failed!" : "");
+}
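
As a worked example of the masking above (hypothetical firmware values;
SZ_4G is from <linux/sizes.h>): a reported range of dmaaddr = 0, size = SZ_4G
gives

    u64 end  = 0 + SZ_4G - 1;                /* 0xffffffff */
    u64 mask = DMA_BIT_MASK(ilog2(end) + 1); /* ilog2(end) == 31 -> 32-bit mask */

so bus_dma_limit becomes 0xffffffff and both DMA masks are capped at 32 bits,
even if the device initially claimed a wider mask.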

Re: [PATCH v5 4/5] iommu/dma: Pass address limit rather than size to iommu_setup_dma_ops()

2021-06-18 Thread Robin Murphy

On 2021-06-18 16:20, Jean-Philippe Brucker wrote:

Passing a 64-bit address width to iommu_setup_dma_ops() is valid on
virtual platforms, but isn't currently possible. The overflow check in
iommu_dma_init_domain() prevents this even when @dma_base isn't 0. Pass
a limit address instead of a size, so callers don't have to fake a size
to work around the check.

The base and limit parameters are being phased out, because:
* they are redundant for x86 callers. dma-iommu already reserves the
   first page, and the upper limit is already in domain->geometry.
* they can now be obtained from dev->dma_range_map on Arm.
But removing them on Arm isn't completely straightforward so is left for
future work. As an intermediate step, simplify the x86 callers by
passing dummy limits.


Reviewed-by: Robin Murphy 


Signed-off-by: Jean-Philippe Brucker 
---
  include/linux/dma-iommu.h   |  4 ++--
  arch/arm64/mm/dma-mapping.c |  2 +-
  drivers/iommu/amd/iommu.c   |  2 +-
  drivers/iommu/dma-iommu.c   | 12 ++--
  drivers/iommu/intel/iommu.c |  5 +
  5 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 6e75a2d689b4..758ca4694257 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -19,7 +19,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
  void iommu_put_dma_cookie(struct iommu_domain *domain);
  
  /* Setup call for arch DMA mapping code */

-void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size);
+void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit);
  
  /* The DMA API isn't _quite_ the whole story, though... */

  /*
@@ -50,7 +50,7 @@ struct msi_msg;
  struct device;
  
  static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base,

-   u64 size)
+  u64 dma_limit)
  {
  }
  
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c

index 4bf1dd3eb041..6719f9efea09 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -50,7 +50,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
  
  	dev->dma_coherent = coherent;

if (iommu)
-   iommu_setup_dma_ops(dev, dma_base, size);
+   iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1);
  
  #ifdef CONFIG_XEN

if (xen_swiotlb_detect())
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 3ac42bbdefc6..216323fb27ef 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1713,7 +1713,7 @@ static void amd_iommu_probe_finalize(struct device *dev)
	/* Domains are initialized for this device - have a look what we ended up with */
domain = iommu_get_domain_for_dev(dev);
if (domain->type == IOMMU_DOMAIN_DMA)
-   iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0);
+   iommu_setup_dma_ops(dev, 0, U64_MAX);
else
set_dma_ops(dev, NULL);
  }
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7bcdd1205535..c62e19bed302 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -319,16 +319,16 @@ static bool dev_is_untrusted(struct device *dev)
   * iommu_dma_init_domain - Initialise a DMA mapping domain
   * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
   * @base: IOVA at which the mappable address space starts
- * @size: Size of IOVA space
+ * @limit: Last address of the IOVA space
   * @dev: Device the domain is being initialised for
   *
- * @base and @size should be exact multiples of IOMMU page granularity to
+ * @base and @limit + 1 should be exact multiples of IOMMU page granularity to
   * avoid rounding surprises. If necessary, we reserve the page at address 0
   * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
   * any change which could make prior IOVAs invalid will fail.
   */
  static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
-   u64 size, struct device *dev)
+dma_addr_t limit, struct device *dev)
  {
struct iommu_dma_cookie *cookie = domain->iova_cookie;
unsigned long order, base_pfn;
@@ -346,7 +346,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
/* Check the domain allows at least some access to the device... */
if (domain->geometry.force_aperture) {
if (base > domain->geometry.aperture_end ||
-   base + size <= domain->geometry.aperture_start) {
+   limit < domain->geometry.aperture_start) {
pr_warn("specified DMA range outside IOMMU 
capability\n");
return -EFAULT;
}
@@ -1308,7 +1308,7 @@ static const struct dma_map_ops iommu_dma_ops = {
   * The IOMMU core code allocates the default DMA domain, 

Re: [PATCH v11 11/18] virtio/vsock: dequeue callback for SOCK_SEQPACKET

2021-06-18 Thread Stefano Garzarella

On Fri, Jun 18, 2021 at 07:08:30PM +0300, Arseny Krasnov wrote:


On 18.06.2021 18:55, Stefano Garzarella wrote:

On Fri, Jun 18, 2021 at 06:04:37PM +0300, Arseny Krasnov wrote:

On 18.06.2021 16:44, Stefano Garzarella wrote:

Hi Arseny,
the series looks great, I have just a question below about
seqpacket_dequeue.

I also sent a couple of simple fixes; it would be great if you could review
them:
https://lore.kernel.org/netdev/20210618133526.300347-1-sgarz...@redhat.com/


On Fri, Jun 11, 2021 at 02:12:38PM +0300, Arseny Krasnov wrote:

Callback fetches RW packets from the socket's rx queue until a whole record
is copied (if the user's buffer is full, the user is not woken up). This is
done to avoid stalling the sender: if we wake up the user and it leaves the
syscall, nobody will send a credit update for the rest of the record, and
the sender will wait for the next read syscall on the receiver's side. So if
the user buffer is full, we just send a credit update and drop the data.

Signed-off-by: Arseny Krasnov 
---
v10 -> v11:
1) 'msg_count' field added to count the current number of EORs.
2) 'msg_ready' argument removed from the callback.
3) If 'memcpy_to_msg()' fails during the copy loop, there will be
   no further attempts to copy data; the rest of the record will be freed.

include/linux/virtio_vsock.h|  5 ++
net/vmw_vsock/virtio_transport_common.c | 84 +
2 files changed, 89 insertions(+)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index dc636b727179..1d9a302cb91d 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -36,6 +36,7 @@ struct virtio_vsock_sock {
u32 rx_bytes;
u32 buf_alloc;
struct list_head rx_queue;
+   u32 msg_count;
};

struct virtio_vsock_pkt {
@@ -80,6 +81,10 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
   struct msghdr *msg,
   size_t len, int flags);

+ssize_t
+virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
+  struct msghdr *msg,
+  int flags);
s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index ad0d34d41444..1e1df19ec164 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -393,6 +393,78 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
return err;
}

+static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+struct msghdr *msg,
+int flags)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   struct virtio_vsock_pkt *pkt;
+   int dequeued_len = 0;
+   size_t user_buf_len = msg_data_left(msg);
+   bool copy_failed = false;
+   bool msg_ready = false;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   if (vvs->msg_count == 0) {
+   spin_unlock_bh(&vvs->rx_lock);
+   return 0;
+   }
+
+   while (!msg_ready) {
+   pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list);
+
+   if (!copy_failed) {
+   size_t pkt_len;
+   size_t bytes_to_copy;
+
+   pkt_len = (size_t)le32_to_cpu(pkt->hdr.len);
+   bytes_to_copy = min(user_buf_len, pkt_len);
+
+   if (bytes_to_copy) {
+   int err;
+
+   /* sk_lock is held by caller so no one else can dequeue.
+* Unlock rx_lock since memcpy_to_msg() may sleep.
+*/
+   spin_unlock_bh(&vvs->rx_lock);
+
+   err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy);
+   if (err) {
+   /* Copy of message failed, set flag to skip
+* copy path for rest of fragments. Rest of
+* fragments will be freed without copy.
+*/
+   copy_failed = true;
+   dequeued_len = err;

If we fail to copy the message we will discard the entire packet.
Is it acceptable from the user's point of view, or should we leave the
packet in the queue so the user can retry, maybe with a different
buffer?

Then we can remove the packets only once we have successfully copied all
the fragments.

I'm not sure it makes sense; maybe it's better to also check other
implementations :-)

Thanks,
Stefano

Understood, I'll check it on the weekend; anyway I think it is
not critical for the implementation.

Yep, I agree.



I have another question: maybe it is useful to research an
approach 

Re: [PATCH v5 3/5] ACPI: Add driver for the VIOT table

2021-06-18 Thread Rafael J. Wysocki
On Fri, Jun 18, 2021 at 5:33 PM Jean-Philippe Brucker
 wrote:
>
> The ACPI Virtual I/O Translation Table describes the topology of
> para-virtual platforms, similarly to the vendor tables DMAR, IVRS and IORT.
> For now it describes the relation between virtio-iommu and the endpoints
> it manages.
>
> Three steps are needed to configure DMA of endpoints:
>
> (1) acpi_viot_init(): parse the VIOT table, find or create the fwnode
> associated to each vIOMMU device. This needs to happen after
> acpi_scan_init(), because it relies on the struct device and their
> fwnode to be available.
>
> (2) When probing the vIOMMU device, the driver registers its IOMMU ops
> within the IOMMU subsystem. This step doesn't require any
> intervention from the VIOT driver.
>
> (3) viot_iommu_configure(): before binding the endpoint to a driver,
> find the associated IOMMU ops. Register them, along with the
> endpoint ID, into the device's iommu_fwspec.
>
> If step (3) happens before step (2), it is deferred until the IOMMU is
> initialized, then retried.
>
> Tested-by: Eric Auger 
> Reviewed-by: Eric Auger 
> Signed-off-by: Jean-Philippe Brucker 

From the general ACPI perspective

Acked-by: Rafael J. Wysocki 

and I'm assuming that it will be routed through a different tree.

> ---
>  drivers/acpi/Kconfig  |   3 +
>  drivers/iommu/Kconfig |   1 +
>  drivers/acpi/Makefile |   2 +
>  include/linux/acpi_viot.h |  19 ++
>  drivers/acpi/bus.c|   2 +
>  drivers/acpi/scan.c   |   3 +
>  drivers/acpi/viot.c   | 366 ++
>  MAINTAINERS   |   8 +
>  8 files changed, 404 insertions(+)
>  create mode 100644 include/linux/acpi_viot.h
>  create mode 100644 drivers/acpi/viot.c
>
> diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
> index eedec61e3476..3758c6940ed7 100644
> --- a/drivers/acpi/Kconfig
> +++ b/drivers/acpi/Kconfig
> @@ -526,6 +526,9 @@ endif
>
>  source "drivers/acpi/pmic/Kconfig"
>
> +config ACPI_VIOT
> +   bool
> +
>  endif  # ACPI
>
>  config X86_PM_TIMER
> diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
> index 1f111b399bca..aff8a4830dd1 100644
> --- a/drivers/iommu/Kconfig
> +++ b/drivers/iommu/Kconfig
> @@ -403,6 +403,7 @@ config VIRTIO_IOMMU
> depends on ARM64
> select IOMMU_API
> select INTERVAL_TREE
> +   select ACPI_VIOT if ACPI
> help
>   Para-virtualised IOMMU driver with virtio.
>
> diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
> index 700b41adf2db..a6e644c48987 100644
> --- a/drivers/acpi/Makefile
> +++ b/drivers/acpi/Makefile
> @@ -118,3 +118,5 @@ video-objs  += acpi_video.o video_detect.o
>  obj-y  += dptf/
>
>  obj-$(CONFIG_ARM64)+= arm64/
> +
> +obj-$(CONFIG_ACPI_VIOT)+= viot.o
> diff --git a/include/linux/acpi_viot.h b/include/linux/acpi_viot.h
> new file mode 100644
> index ..1eb8ee5b0e5f
> --- /dev/null
> +++ b/include/linux/acpi_viot.h
> @@ -0,0 +1,19 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +
> +#ifndef __ACPI_VIOT_H__
> +#define __ACPI_VIOT_H__
> +
> +#include <linux/acpi.h>
> +
> +#ifdef CONFIG_ACPI_VIOT
> +void __init acpi_viot_init(void);
> +int viot_iommu_configure(struct device *dev);
> +#else
> +static inline void acpi_viot_init(void) {}
> +static inline int viot_iommu_configure(struct device *dev)
> +{
> +   return -ENODEV;
> +}
> +#endif
> +
> +#endif /* __ACPI_VIOT_H__ */
> diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
> index a4bd673934c0..d6f4e2f06fdb 100644
> --- a/drivers/acpi/bus.c
> +++ b/drivers/acpi/bus.c
> @@ -27,6 +27,7 @@
>  #include 
>  #endif
>  #include 
> +#include <linux/acpi_viot.h>
>  #include 
>  #include 
>  #include 
> @@ -1334,6 +1335,7 @@ static int __init acpi_init(void)
> acpi_wakeup_device_init();
> acpi_debugger_init();
> acpi_setup_sb_notify_handler();
> +   acpi_viot_init();
> return 0;
>  }
>
> diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
> index 2a2e690040e9..3e2bb04ab528 100644
> --- a/drivers/acpi/scan.c
> +++ b/drivers/acpi/scan.c
> @@ -9,6 +9,7 @@
>  #include 
>  #include 
>  #include 
> +#include <linux/acpi_viot.h>
>  #include 
>  #include 
>  #include 
> @@ -1556,6 +1557,8 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
> return ops;
>
> err = iort_iommu_configure_id(dev, id_in);
> +   if (err && err != -EPROBE_DEFER)
> +   err = viot_iommu_configure(dev);
>
> /*
>  * If we have reason to believe the IOMMU driver missed the initial
> diff --git a/drivers/acpi/viot.c b/drivers/acpi/viot.c
> new file mode 100644
> index ..d2256326c73a
> --- /dev/null
> +++ b/drivers/acpi/viot.c
> @@ -0,0 +1,366 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Virtual I/O topology
> + *
> + * The Virtual I/O Translation Table (VIOT) describes the topology of
> + * para-virtual IOMMUs and the endpoints they 

Re: [PATCH v11 11/18] virtio/vsock: dequeue callback for SOCK_SEQPACKET

2021-06-18 Thread Stefano Garzarella

On Fri, Jun 18, 2021 at 06:04:37PM +0300, Arseny Krasnov wrote:


On 18.06.2021 16:44, Stefano Garzarella wrote:

Hi Arseny,
the series looks great, I have just a question below about
seqpacket_dequeue.

I also sent a couple of simple fixes; it would be great if you could review
them:
https://lore.kernel.org/netdev/20210618133526.300347-1-sgarz...@redhat.com/


On Fri, Jun 11, 2021 at 02:12:38PM +0300, Arseny Krasnov wrote:

Callback fetches RW packets from the socket's rx queue until a whole record
is copied (if the user's buffer is full, the user is not woken up). This is
done to avoid stalling the sender: if we wake up the user and it leaves the
syscall, nobody will send a credit update for the rest of the record, and
the sender will wait for the next read syscall on the receiver's side. So if
the user buffer is full, we just send a credit update and drop the data.

Signed-off-by: Arseny Krasnov 
---
v10 -> v11:
1) 'msg_count' field added to count the current number of EORs.
2) 'msg_ready' argument removed from the callback.
3) If 'memcpy_to_msg()' fails during the copy loop, there will be
   no further attempts to copy data; the rest of the record will be freed.

include/linux/virtio_vsock.h|  5 ++
net/vmw_vsock/virtio_transport_common.c | 84 +
2 files changed, 89 insertions(+)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index dc636b727179..1d9a302cb91d 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -36,6 +36,7 @@ struct virtio_vsock_sock {
u32 rx_bytes;
u32 buf_alloc;
struct list_head rx_queue;
+   u32 msg_count;
};

struct virtio_vsock_pkt {
@@ -80,6 +81,10 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
   struct msghdr *msg,
   size_t len, int flags);

+ssize_t
+virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
+  struct msghdr *msg,
+  int flags);
s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);

diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
index ad0d34d41444..1e1df19ec164 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -393,6 +393,78 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
return err;
}

+static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+struct msghdr *msg,
+int flags)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   struct virtio_vsock_pkt *pkt;
+   int dequeued_len = 0;
+   size_t user_buf_len = msg_data_left(msg);
+   bool copy_failed = false;
+   bool msg_ready = false;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   if (vvs->msg_count == 0) {
+   spin_unlock_bh(&vvs->rx_lock);
+   return 0;
+   }
+
+   while (!msg_ready) {
+   pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list);
+
+   if (!copy_failed) {
+   size_t pkt_len;
+   size_t bytes_to_copy;
+
+   pkt_len = (size_t)le32_to_cpu(pkt->hdr.len);
+   bytes_to_copy = min(user_buf_len, pkt_len);
+
+   if (bytes_to_copy) {
+   int err;
+
+   /* sk_lock is held by caller so no one else can dequeue.
+* Unlock rx_lock since memcpy_to_msg() may sleep.
+*/
+   spin_unlock_bh(&vvs->rx_lock);
+
+   err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy);
+   if (err) {
+   /* Copy of message failed, set flag to skip
+* copy path for rest of fragments. Rest of
+* fragments will be freed without copy.
+*/
+   copy_failed = true;
+   dequeued_len = err;

If we fail to copy the message we will discard the entire packet.
Is it acceptable from the user's point of view, or should we leave the
packet in the queue so the user can retry, maybe with a different
buffer?

Then we can remove the packets only once we have successfully copied all
the fragments.

I'm not sure it makes sense; maybe it's better to also check other
implementations :-)

Thanks,
Stefano


Understood, I'll check it on the weekend; anyway I think it is
not critical for the implementation.


Yep, I agree.




I have another question: maybe it is useful to research an
approach where packets are not queued until the whole message
is received, but copied to the user's buffer, thus freeing memory.
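
For context, the dequeue listings above are cut off mid-function. The loop
ends a record when a packet carries the end-of-record flag; a sketch of that
tail, using names from this series (VIRTIO_VSOCK_SEQ_EOR is added in patch
10/18; the exact code may differ):

    /* Tail of the while (!msg_ready) loop, with vvs->rx_lock held: */
    if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
            vvs->msg_count--;
            msg_ready = true;
    }

    /* Every consumed fragment is freed and its space credited back to
     * the sender, which is what prevents the sender from stalling. */
    virtio_transport_dec_rx_pkt(vvs, pkt);
    list_del(&pkt->list);
    virtio_transport_free_pkt(pkt);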

[PATCH v5 4/5] iommu/dma: Pass address limit rather than size to iommu_setup_dma_ops()

2021-06-18 Thread Jean-Philippe Brucker
Passing a 64-bit address width to iommu_setup_dma_ops() is valid on
virtual platforms, but isn't currently possible. The overflow check in
iommu_dma_init_domain() prevents this even when @dma_base isn't 0. Pass
a limit address instead of a size, so callers don't have to fake a size
to work around the check.

The base and limit parameters are being phased out, because:
* they are redundant for x86 callers. dma-iommu already reserves the
  first page, and the upper limit is already in domain->geometry.
* they can now be obtained from dev->dma_range_map on Arm.
But removing them on Arm isn't completely straightforward so is left for
future work. As an intermediate step, simplify the x86 callers by
passing dummy limits.
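
To illustrate the overflow that motivates this (hypothetical values): a full
64-bit aperture cannot be expressed as a size, but it can as a limit:

    u64 base = 0;
    /* As a size, the aperture would be 2^64, which is not representable
     * in a u64 (it wraps to 0), so the old check
     * "base + size <= aperture_start" rejects a valid setup. */
    u64 limit = U64_MAX;    /* base + size - 1: representable, no overflow */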

Signed-off-by: Jean-Philippe Brucker 
---
 include/linux/dma-iommu.h   |  4 ++--
 arch/arm64/mm/dma-mapping.c |  2 +-
 drivers/iommu/amd/iommu.c   |  2 +-
 drivers/iommu/dma-iommu.c   | 12 ++--
 drivers/iommu/intel/iommu.c |  5 +
 5 files changed, 11 insertions(+), 14 deletions(-)

diff --git a/include/linux/dma-iommu.h b/include/linux/dma-iommu.h
index 6e75a2d689b4..758ca4694257 100644
--- a/include/linux/dma-iommu.h
+++ b/include/linux/dma-iommu.h
@@ -19,7 +19,7 @@ int iommu_get_msi_cookie(struct iommu_domain *domain, dma_addr_t base);
 void iommu_put_dma_cookie(struct iommu_domain *domain);
 
 /* Setup call for arch DMA mapping code */
-void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size);
+void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit);
 
 /* The DMA API isn't _quite_ the whole story, though... */
 /*
@@ -50,7 +50,7 @@ struct msi_msg;
 struct device;
 
 static inline void iommu_setup_dma_ops(struct device *dev, u64 dma_base,
-   u64 size)
+  u64 dma_limit)
 {
 }
 
diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
index 4bf1dd3eb041..6719f9efea09 100644
--- a/arch/arm64/mm/dma-mapping.c
+++ b/arch/arm64/mm/dma-mapping.c
@@ -50,7 +50,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
 
dev->dma_coherent = coherent;
if (iommu)
-   iommu_setup_dma_ops(dev, dma_base, size);
+   iommu_setup_dma_ops(dev, dma_base, dma_base + size - 1);
 
 #ifdef CONFIG_XEN
if (xen_swiotlb_detect())
diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c
index 3ac42bbdefc6..216323fb27ef 100644
--- a/drivers/iommu/amd/iommu.c
+++ b/drivers/iommu/amd/iommu.c
@@ -1713,7 +1713,7 @@ static void amd_iommu_probe_finalize(struct device *dev)
	/* Domains are initialized for this device - have a look what we ended up with */
domain = iommu_get_domain_for_dev(dev);
if (domain->type == IOMMU_DOMAIN_DMA)
-   iommu_setup_dma_ops(dev, IOVA_START_PFN << PAGE_SHIFT, 0);
+   iommu_setup_dma_ops(dev, 0, U64_MAX);
else
set_dma_ops(dev, NULL);
 }
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 7bcdd1205535..c62e19bed302 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -319,16 +319,16 @@ static bool dev_is_untrusted(struct device *dev)
  * iommu_dma_init_domain - Initialise a DMA mapping domain
  * @domain: IOMMU domain previously prepared by iommu_get_dma_cookie()
  * @base: IOVA at which the mappable address space starts
- * @size: Size of IOVA space
+ * @limit: Last address of the IOVA space
  * @dev: Device the domain is being initialised for
  *
- * @base and @size should be exact multiples of IOMMU page granularity to
+ * @base and @limit + 1 should be exact multiples of IOMMU page granularity to
  * avoid rounding surprises. If necessary, we reserve the page at address 0
  * to ensure it is an invalid IOVA. It is safe to reinitialise a domain, but
  * any change which could make prior IOVAs invalid will fail.
  */
 static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
-   u64 size, struct device *dev)
+dma_addr_t limit, struct device *dev)
 {
struct iommu_dma_cookie *cookie = domain->iova_cookie;
unsigned long order, base_pfn;
@@ -346,7 +346,7 @@ static int iommu_dma_init_domain(struct iommu_domain *domain, dma_addr_t base,
/* Check the domain allows at least some access to the device... */
if (domain->geometry.force_aperture) {
if (base > domain->geometry.aperture_end ||
-   base + size <= domain->geometry.aperture_start) {
+   limit < domain->geometry.aperture_start) {
pr_warn("specified DMA range outside IOMMU 
capability\n");
return -EFAULT;
}
@@ -1308,7 +1308,7 @@ static const struct dma_map_ops iommu_dma_ops = {
  * The IOMMU core code allocates the default DMA domain, which the underlying
  * IOMMU driver needs to support via the dma-iommu layer.
  */
-void iommu_setup_dma_ops(struct 

[PATCH v5 3/5] ACPI: Add driver for the VIOT table

2021-06-18 Thread Jean-Philippe Brucker
The ACPI Virtual I/O Translation Table describes the topology of
para-virtual platforms, similarly to the vendor tables DMAR, IVRS and IORT.
For now it describes the relation between virtio-iommu and the endpoints
it manages.

Three steps are needed to configure DMA of endpoints:

(1) acpi_viot_init(): parse the VIOT table, find or create the fwnode
associated to each vIOMMU device. This needs to happen after
acpi_scan_init(), because it relies on the struct device and their
fwnode to be available.

(2) When probing the vIOMMU device, the driver registers its IOMMU ops
within the IOMMU subsystem. This step doesn't require any
intervention from the VIOT driver.

(3) viot_iommu_configure(): before binding the endpoint to a driver,
find the associated IOMMU ops. Register them, along with the
endpoint ID, into the device's iommu_fwspec.

If step (3) happens before step (2), it is deferred until the IOMMU is
initialized, then retried.
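
A sketch of the step (3) lookup and its defer-and-retry behaviour, simplified
from the driver below and using the acpi_iommu_fwspec_init() helper from
patch 2/5 (details may differ from the full implementation):

    static int viot_dev_iommu_init(struct device *dev,
                                   struct viot_iommu *viommu, u32 epid)
    {
            const struct iommu_ops *ops;

            if (!viommu)
                    return -ENODEV;

            /* We're not translating ourself */
            if (viommu->fwnode == dev->fwnode)
                    return -EINVAL;

            ops = iommu_ops_from_fwnode(viommu->fwnode);
            if (!ops)
                    /* Step (2) hasn't happened yet: defer until the
                     * virtio-iommu driver registers its ops. */
                    return IS_ENABLED(CONFIG_VIRTIO_IOMMU) ?
                            -EPROBE_DEFER : -ENODEV;

            return acpi_iommu_fwspec_init(dev, epid, viommu->fwnode, ops);
    }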

Tested-by: Eric Auger 
Reviewed-by: Eric Auger 
Signed-off-by: Jean-Philippe Brucker 
---
 drivers/acpi/Kconfig  |   3 +
 drivers/iommu/Kconfig |   1 +
 drivers/acpi/Makefile |   2 +
 include/linux/acpi_viot.h |  19 ++
 drivers/acpi/bus.c|   2 +
 drivers/acpi/scan.c   |   3 +
 drivers/acpi/viot.c   | 366 ++
 MAINTAINERS   |   8 +
 8 files changed, 404 insertions(+)
 create mode 100644 include/linux/acpi_viot.h
 create mode 100644 drivers/acpi/viot.c

diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index eedec61e3476..3758c6940ed7 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -526,6 +526,9 @@ endif
 
 source "drivers/acpi/pmic/Kconfig"
 
+config ACPI_VIOT
+   bool
+
 endif  # ACPI
 
 config X86_PM_TIMER
diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index 1f111b399bca..aff8a4830dd1 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -403,6 +403,7 @@ config VIRTIO_IOMMU
depends on ARM64
select IOMMU_API
select INTERVAL_TREE
+   select ACPI_VIOT if ACPI
help
  Para-virtualised IOMMU driver with virtio.
 
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 700b41adf2db..a6e644c48987 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -118,3 +118,5 @@ video-objs  += acpi_video.o video_detect.o
 obj-y  += dptf/
 
 obj-$(CONFIG_ARM64)+= arm64/
+
+obj-$(CONFIG_ACPI_VIOT)+= viot.o
diff --git a/include/linux/acpi_viot.h b/include/linux/acpi_viot.h
new file mode 100644
index ..1eb8ee5b0e5f
--- /dev/null
+++ b/include/linux/acpi_viot.h
@@ -0,0 +1,19 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+
+#ifndef __ACPI_VIOT_H__
+#define __ACPI_VIOT_H__
+
+#include <linux/acpi.h>
+
+#ifdef CONFIG_ACPI_VIOT
+void __init acpi_viot_init(void);
+int viot_iommu_configure(struct device *dev);
+#else
+static inline void acpi_viot_init(void) {}
+static inline int viot_iommu_configure(struct device *dev)
+{
+   return -ENODEV;
+}
+#endif
+
+#endif /* __ACPI_VIOT_H__ */
diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
index a4bd673934c0..d6f4e2f06fdb 100644
--- a/drivers/acpi/bus.c
+++ b/drivers/acpi/bus.c
@@ -27,6 +27,7 @@
 #include 
 #endif
 #include 
+#include <linux/acpi_viot.h>
 #include 
 #include 
 #include 
@@ -1334,6 +1335,7 @@ static int __init acpi_init(void)
acpi_wakeup_device_init();
acpi_debugger_init();
acpi_setup_sb_notify_handler();
+   acpi_viot_init();
return 0;
 }
 
diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c
index 2a2e690040e9..3e2bb04ab528 100644
--- a/drivers/acpi/scan.c
+++ b/drivers/acpi/scan.c
@@ -9,6 +9,7 @@
 #include 
 #include 
 #include 
+#include <linux/acpi_viot.h>
 #include 
 #include 
 #include 
@@ -1556,6 +1557,8 @@ static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
return ops;
 
err = iort_iommu_configure_id(dev, id_in);
+   if (err && err != -EPROBE_DEFER)
+   err = viot_iommu_configure(dev);
 
/*
 * If we have reason to believe the IOMMU driver missed the initial
diff --git a/drivers/acpi/viot.c b/drivers/acpi/viot.c
new file mode 100644
index ..d2256326c73a
--- /dev/null
+++ b/drivers/acpi/viot.c
@@ -0,0 +1,366 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Virtual I/O topology
+ *
+ * The Virtual I/O Translation Table (VIOT) describes the topology of
+ * para-virtual IOMMUs and the endpoints they manage. The OS uses it to
+ * initialize devices in the right order, preventing endpoints from issuing DMA
+ * before their IOMMU is ready.
+ *
+ * When binding a driver to a device, before calling the device driver's probe()
+ * method, the driver infrastructure calls dma_configure(). At that point the
+ * VIOT driver looks for an IOMMU associated to the device in the VIOT table.
+ * If an IOMMU exists and has been initialized, the VIOT driver initializes the
+ * device's IOMMU 

[PATCH v5 5/5] iommu/virtio: Enable x86 support

2021-06-18 Thread Jean-Philippe Brucker
With the VIOT support in place, x86 platforms can now use the
virtio-iommu.

Because the other x86 IOMMU drivers aren't yet ready to use the
acpi_dma_setup() path, x86 doesn't implement arch_setup_dma_ops() at the
moment. Similarly to VT-d and AMD IOMMU, clear the DMA ops and call
iommu_setup_dma_ops() from probe_finalize().

Acked-by: Joerg Roedel 
Acked-by: Michael S. Tsirkin 
Tested-by: Eric Auger 
Reviewed-by: Eric Auger 
Signed-off-by: Jean-Philippe Brucker 
---
 drivers/iommu/Kconfig|  3 ++-
 drivers/iommu/dma-iommu.c|  1 +
 drivers/iommu/virtio-iommu.c | 11 +++
 3 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/Kconfig b/drivers/iommu/Kconfig
index aff8a4830dd1..07b7c25cbed8 100644
--- a/drivers/iommu/Kconfig
+++ b/drivers/iommu/Kconfig
@@ -400,8 +400,9 @@ config HYPERV_IOMMU
 config VIRTIO_IOMMU
tristate "Virtio IOMMU driver"
depends on VIRTIO
-   depends on ARM64
+   depends on (ARM64 || X86)
select IOMMU_API
+   select IOMMU_DMA
select INTERVAL_TREE
select ACPI_VIOT if ACPI
help
diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c62e19bed302..9dbbc95c8189 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1330,6 +1330,7 @@ void iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 dma_limit)
 pr_warn("Failed to set up IOMMU for device %s; retaining platform DMA ops\n",
 dev_name(dev));
 }
+EXPORT_SYMBOL_GPL(iommu_setup_dma_ops);
 
 static struct iommu_dma_msi_page *iommu_dma_get_msi_page(struct device *dev,
phys_addr_t msi_addr, struct iommu_domain *domain)
diff --git a/drivers/iommu/virtio-iommu.c b/drivers/iommu/virtio-iommu.c
index c6e5ee4d9cef..fe581f0c9b3a 100644
--- a/drivers/iommu/virtio-iommu.c
+++ b/drivers/iommu/virtio-iommu.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include <linux/dma-iommu.h>
 #include 
 #include 
 #include 
@@ -904,6 +905,15 @@ static struct iommu_device *viommu_probe_device(struct device *dev)
return ERR_PTR(ret);
 }
 
+static void viommu_probe_finalize(struct device *dev)
+{
+#ifndef CONFIG_ARCH_HAS_SETUP_DMA_OPS
+   /* First clear the DMA ops in case we're switching from a DMA domain */
+   set_dma_ops(dev, NULL);
+   iommu_setup_dma_ops(dev, 0, U64_MAX);
+#endif
+}
+
 static void viommu_release_device(struct device *dev)
 {
struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
@@ -940,6 +950,7 @@ static struct iommu_ops viommu_ops = {
.iova_to_phys   = viommu_iova_to_phys,
.iotlb_sync = viommu_iotlb_sync,
.probe_device   = viommu_probe_device,
+   .probe_finalize = viommu_probe_finalize,
.release_device = viommu_release_device,
.device_group   = viommu_device_group,
.get_resv_regions   = viommu_get_resv_regions,
-- 
2.32.0



[PATCH v5 2/5] ACPI: Move IOMMU setup code out of IORT

2021-06-18 Thread Jean-Philippe Brucker
Extract the code that sets up the IOMMU infrastructure from IORT, since
it can be reused by VIOT. Move it one level up into a new
acpi_iommu_configure_id() function, which calls the IORT parsing
function which in turn calls the acpi_iommu_fwspec_init() helper.

Signed-off-by: Jean-Philippe Brucker 
---
 include/acpi/acpi_bus.h   |  3 ++
 include/linux/acpi_iort.h |  8 ++---
 drivers/acpi/arm64/iort.c | 74 +--
 drivers/acpi/scan.c   | 73 +-
 4 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h
index 3a82faac5767..41f092a269f6 100644
--- a/include/acpi/acpi_bus.h
+++ b/include/acpi/acpi_bus.h
@@ -588,6 +588,9 @@ struct acpi_pci_root {
 
 bool acpi_dma_supported(struct acpi_device *adev);
 enum dev_dma_attr acpi_get_dma_attr(struct acpi_device *adev);
+int acpi_iommu_fwspec_init(struct device *dev, u32 id,
+  struct fwnode_handle *fwnode,
+  const struct iommu_ops *ops);
 int acpi_dma_get_range(struct device *dev, u64 *dma_addr, u64 *offset,
   u64 *size);
 int acpi_dma_configure_id(struct device *dev, enum dev_dma_attr attr,
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index f7f054833afd..f1f0842a2cb2 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -35,8 +35,7 @@ void acpi_configure_pmsi_domain(struct device *dev);
 int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
 int iort_dma_get_ranges(struct device *dev, u64 *size);
-const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
-   const u32 *id_in);
+int iort_iommu_configure_id(struct device *dev, const u32 *id_in);
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
 phys_addr_t acpi_iort_dma_get_max_cpu_address(void);
 #else
@@ -50,9 +49,8 @@ static inline void acpi_configure_pmsi_domain(struct device *dev) { }
 /* IOMMU interface */
 static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
 { return -ENODEV; }
-static inline const struct iommu_ops *iort_iommu_configure_id(
- struct device *dev, const u32 *id_in)
-{ return NULL; }
+static inline int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
+{ return -ENODEV; }
 static inline
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head)
 { return 0; }
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index a940be1cf2af..487d1095030d 100644
--- a/drivers/acpi/arm64/iort.c
+++ b/drivers/acpi/arm64/iort.c
@@ -806,23 +806,6 @@ static struct acpi_iort_node *iort_get_msi_resv_iommu(struct device *dev)
return NULL;
 }
 
-static inline const struct iommu_ops *iort_fwspec_iommu_ops(struct device *dev)
-{
-   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
-
-   return (fwspec && fwspec->ops) ? fwspec->ops : NULL;
-}
-
-static inline int iort_add_device_replay(struct device *dev)
-{
-   int err = 0;
-
-   if (dev->bus && !device_iommu_mapped(dev))
-   err = iommu_probe_device(dev);
-
-   return err;
-}
-
 /**
  * iort_iommu_msi_get_resv_regions - Reserved region driver helper
  * @dev: Device from iommu_get_resv_regions()
@@ -900,18 +883,6 @@ static inline bool iort_iommu_driver_enabled(u8 type)
}
 }
 
-static int arm_smmu_iort_xlate(struct device *dev, u32 streamid,
-  struct fwnode_handle *fwnode,
-  const struct iommu_ops *ops)
-{
-   int ret = iommu_fwspec_init(dev, fwnode, ops);
-
-   if (!ret)
-   ret = iommu_fwspec_add_ids(dev, &streamid, 1);
-
-   return ret;
-}
-
 static bool iort_pci_rc_supports_ats(struct acpi_iort_node *node)
 {
struct acpi_iort_root_complex *pci_rc;
@@ -946,7 +917,7 @@ static int iort_iommu_xlate(struct device *dev, struct acpi_iort_node *node,
return iort_iommu_driver_enabled(node->type) ?
   -EPROBE_DEFER : -ENODEV;
 
-   return arm_smmu_iort_xlate(dev, streamid, iort_fwnode, ops);
+   return acpi_iommu_fwspec_init(dev, streamid, iort_fwnode, ops);
 }
 
 struct iort_pci_alias_info {
@@ -1020,24 +991,13 @@ static int iort_nc_iommu_map_id(struct device *dev,
  * @dev: device to configure
  * @id_in: optional input id const value pointer
  *
- * Returns: iommu_ops pointer on configuration success
- *  NULL on configuration failure
+ * Returns: 0 on success, <0 on failure
  */
-const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
-   const u32 *id_in)
+int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
 {
struct acpi_iort_node *node;
-   const struct iommu_ops *ops;
int err = -ENODEV;
 
-   /*
-* If we already translated the 

[PATCH v5 1/5] ACPI: arm64: Move DMA setup operations out of IORT

2021-06-18 Thread Jean-Philippe Brucker
Extract generic DMA setup code out of IORT, so it can be reused by VIOT.
Keep it in drivers/acpi/arm64 for now, since it could break x86
platforms that haven't run this code so far, if they have invalid
tables.

Reviewed-by: Eric Auger 
Signed-off-by: Jean-Philippe Brucker 
---
 drivers/acpi/arm64/Makefile |  1 +
 include/linux/acpi.h|  3 +++
 include/linux/acpi_iort.h   |  6 ++---
 drivers/acpi/arm64/dma.c| 50 ++
 drivers/acpi/arm64/iort.c   | 54 ++---
 drivers/acpi/scan.c |  2 +-
 6 files changed, 66 insertions(+), 50 deletions(-)
 create mode 100644 drivers/acpi/arm64/dma.c

diff --git a/drivers/acpi/arm64/Makefile b/drivers/acpi/arm64/Makefile
index 6ff50f4ed947..66acbe77f46e 100644
--- a/drivers/acpi/arm64/Makefile
+++ b/drivers/acpi/arm64/Makefile
@@ -1,3 +1,4 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-$(CONFIG_ACPI_IORT)+= iort.o
 obj-$(CONFIG_ACPI_GTDT)+= gtdt.o
+obj-y  += dma.o
diff --git a/include/linux/acpi.h b/include/linux/acpi.h
index c60745f657e9..7aaa9559cc19 100644
--- a/include/linux/acpi.h
+++ b/include/linux/acpi.h
@@ -259,9 +259,12 @@ void acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa);
 
 #ifdef CONFIG_ARM64
 void acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa);
+void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size);
 #else
 static inline void
 acpi_numa_gicc_affinity_init(struct acpi_srat_gicc_affinity *pa) { }
+static inline void
+acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size) { }
 #endif
 
 int acpi_numa_memory_affinity_init (struct acpi_srat_mem_affinity *ma);
diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h
index 1a12baa58e40..f7f054833afd 100644
--- a/include/linux/acpi_iort.h
+++ b/include/linux/acpi_iort.h
@@ -34,7 +34,7 @@ struct irq_domain *iort_get_device_domain(struct device *dev, u32 id,
 void acpi_configure_pmsi_domain(struct device *dev);
 int iort_pmsi_get_dev_id(struct device *dev, u32 *dev_id);
 /* IOMMU interface */
-void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size);
+int iort_dma_get_ranges(struct device *dev, u64 *size);
 const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
const u32 *id_in);
 int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head);
@@ -48,8 +48,8 @@ static inline struct irq_domain *iort_get_device_domain(
 { return NULL; }
 static inline void acpi_configure_pmsi_domain(struct device *dev) { }
 /* IOMMU interface */
-static inline void iort_dma_setup(struct device *dev, u64 *dma_addr,
- u64 *size) { }
+static inline int iort_dma_get_ranges(struct device *dev, u64 *size)
+{ return -ENODEV; }
 static inline const struct iommu_ops *iort_iommu_configure_id(
  struct device *dev, const u32 *id_in)
 { return NULL; }
diff --git a/drivers/acpi/arm64/dma.c b/drivers/acpi/arm64/dma.c
new file mode 100644
index ..f16739ad3cc0
--- /dev/null
+++ b/drivers/acpi/arm64/dma.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/acpi.h>
+#include <linux/acpi_iort.h>
+#include <linux/device.h>
+#include <linux/dma-direct.h>
+
+void acpi_arch_dma_setup(struct device *dev, u64 *dma_addr, u64 *dma_size)
+{
+   int ret;
+   u64 end, mask;
+   u64 dmaaddr = 0, size = 0, offset = 0;
+
+   /*
+* If @dev is expected to be DMA-capable then the bus code that created
+* it should have initialised its dma_mask pointer by this point. For
+* now, we'll continue the legacy behaviour of coercing it to the
+* coherent mask if not, but we'll no longer do so quietly.
+*/
+   if (!dev->dma_mask) {
+   dev_warn(dev, "DMA mask not set\n");
+   dev->dma_mask = &dev->coherent_dma_mask;
+   }
+
+   if (dev->coherent_dma_mask)
+   size = max(dev->coherent_dma_mask, dev->coherent_dma_mask + 1);
+   else
+   size = 1ULL << 32;
+
+   ret = acpi_dma_get_range(dev, &dmaaddr, &offset, &size);
+   if (ret == -ENODEV)
+   ret = iort_dma_get_ranges(dev, &size);
+   if (!ret) {
+   /*
+* Limit coherent and dma mask based on size retrieved from
+* firmware.
+*/
+   end = dmaaddr + size - 1;
+   mask = DMA_BIT_MASK(ilog2(end) + 1);
+   dev->bus_dma_limit = end;
+   dev->coherent_dma_mask = min(dev->coherent_dma_mask, mask);
+   *dev->dma_mask = min(*dev->dma_mask, mask);
+   }
+
+   *dma_addr = dmaaddr;
+   *dma_size = size;
+
+   ret = dma_direct_set_offset(dev, dmaaddr + offset, dmaaddr, size);
+
+   dev_dbg(dev, "dma_offset(%#08llx)%s\n", offset, ret ? " failed!" : "");
+}
diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c
index 3912a1f6058e..a940be1cf2af 100644
--- 
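
The drivers/acpi/scan.c hunk is truncated above; per the diffstat it is a
one-line change, swapping the IORT call in acpi_dma_configure_id() for the
new arch hook (sketch):

    -   iort_dma_setup(dev, &dma_addr, &size);
    +   acpi_arch_dma_setup(dev, &dma_addr, &size);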

[PATCH v5 0/5] Add support for ACPI VIOT

2021-06-18 Thread Jean-Philippe Brucker
Add a driver for the ACPI VIOT table, which provides topology
information for para-virtual IOMMUs. Enable virtio-iommu on
non-devicetree platforms, including x86.

Since v4 [1]:
* Fixes (comments, wrong argument, unused variable)
* Removed patch 5 that wrongly moved set_dma_ops(dev, NULL) into dma-iommu.
  The simplification of limit parameters for x86 callers is now in patch 4.
* Release ACPI table after parsing
* Added review and tested tags, thanks for all the feedback!

You can find a QEMU implementation at [2], with extra support for
testing all VIOT nodes including MMIO-based endpoints and IOMMU.
This series is at [3].

[1] 
https://lore.kernel.org/linux-iommu/20210610075130.67517-1-jean-phili...@linaro.org/
[2] https://jpbrucker.net/git/qemu/log/?h=virtio-iommu/acpi
[3] https://jpbrucker.net/git/linux/log/?h=virtio-iommu/acpi

Jean-Philippe Brucker (5):
  ACPI: arm64: Move DMA setup operations out of IORT
  ACPI: Move IOMMU setup code out of IORT
  ACPI: Add driver for the VIOT table
  iommu/dma: Pass address limit rather than size to
iommu_setup_dma_ops()
  iommu/virtio: Enable x86 support

 drivers/acpi/Kconfig |   3 +
 drivers/iommu/Kconfig|   4 +-
 drivers/acpi/Makefile|   2 +
 drivers/acpi/arm64/Makefile  |   1 +
 include/acpi/acpi_bus.h  |   3 +
 include/linux/acpi.h |   3 +
 include/linux/acpi_iort.h|  14 +-
 include/linux/acpi_viot.h|  19 ++
 include/linux/dma-iommu.h|   4 +-
 arch/arm64/mm/dma-mapping.c  |   2 +-
 drivers/acpi/arm64/dma.c |  50 +
 drivers/acpi/arm64/iort.c| 128 ++--
 drivers/acpi/bus.c   |   2 +
 drivers/acpi/scan.c  |  78 +++-
 drivers/acpi/viot.c  | 366 +++
 drivers/iommu/amd/iommu.c|   2 +-
 drivers/iommu/dma-iommu.c|  13 +-
 drivers/iommu/intel/iommu.c  |   5 +-
 drivers/iommu/virtio-iommu.c |  11 ++
 MAINTAINERS  |   8 +
 20 files changed, 581 insertions(+), 137 deletions(-)
 create mode 100644 include/linux/acpi_viot.h
 create mode 100644 drivers/acpi/arm64/dma.c
 create mode 100644 drivers/acpi/viot.c

-- 
2.32.0



Re: [PATCH v6 2/2] virtio-vsock: SOCK_SEQPACKET description

2021-06-18 Thread Stefano Garzarella

On Mon, May 24, 2021 at 09:34:15PM +0300, Arseny Krasnov wrote:

This adds a description of the SOCK_SEQPACKET socket type
support for virtio-vsock.

Signed-off-by: Arseny Krasnov 
---
virtio-vsock.tex | 26 +-
1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/virtio-vsock.tex b/virtio-vsock.tex
index ad57f9d..9ef2b0e 100644
--- a/virtio-vsock.tex
+++ b/virtio-vsock.tex
@@ -16,7 +16,10 @@ \subsection{Virtqueues}\label{sec:Device Types / Socket Device / Virtqueues}

\subsection{Feature bits}\label{sec:Device Types / Socket Device / Feature bits}

-There are currently no feature bits defined for this device.
+\begin{description}
+\item VIRTIO_VSOCK_F_SEQPACKET (1) SOCK_SEQPACKET socket type is
+supported.
+\end{description}


The VIRTIO_VSOCK_F_STREAM is described in the datagram series by Jiang,
so this patch should be fine:

Reviewed-by: Stefano Garzarella 



Re: [PATCH v6 1/2] virtio-vsock: use C style defines for constants

2021-06-18 Thread Stefano Garzarella

On Mon, May 24, 2021 at 09:33:50PM +0300, Arseny Krasnov wrote:

This:
1) Replaces enums with C style "defines", because
  use of enums is not documented, while "defines"
  are widely used in the spec.
2) Adds defines for some constants.

Reviewed-by: Stefan Hajnoczi 
Signed-off-by: Arseny Krasnov 
---
virtio-vsock.tex | 54 +---
1 file changed, 28 insertions(+), 26 deletions(-)


Reviewed-by: Stefano Garzarella 



Re: [PATCH v11 11/18] virtio/vsock: dequeue callback for SOCK_SEQPACKET

2021-06-18 Thread Stefano Garzarella

On Fri, Jun 18, 2021 at 09:51:44AM -0400, Michael S. Tsirkin wrote:

On Fri, Jun 18, 2021 at 03:44:23PM +0200, Stefano Garzarella wrote:

Hi Arseny,
the series looks great, I have just a question below about
seqpacket_dequeue.

I also sent a couple of simple fixes; it would be great if you could review
them:
https://lore.kernel.org/netdev/20210618133526.300347-1-sgarz...@redhat.com/


So given this was picked into net next, what's the plan? Just make spec
follow code? We can wait and see, if there are issues with the spec just
remember to mask the feature before release.


Yep, the spec patches were already posted, but not merged yet: 
https://lists.oasis-open.org/archives/virtio-comment/202105/msg00017.html


The changes are quite small and they are aligned with the current 
implementation.


Anyway, I perfectly agree with you about waiting and masking it before 
the v5.14 release if there are any issues.


Thanks,
Stefano



Re: [PATCH v11 11/18] virtio/vsock: dequeue callback for SOCK_SEQPACKET

2021-06-18 Thread Michael S. Tsirkin
On Fri, Jun 18, 2021 at 03:44:23PM +0200, Stefano Garzarella wrote:
> Hi Arseny,
> the series looks great, I have just a question below about
> seqpacket_dequeue.
> 
> I also sent a couple of simple fixes; it would be great if you could review
> them:
> https://lore.kernel.org/netdev/20210618133526.300347-1-sgarz...@redhat.com/

So given this was picked into net next, what's the plan? Just make spec
follow code? We can wait and see, if there are issues with the spec just
remember to mask the feature before release.

-- 
MST



Re: [PATCH v11 00/18] virtio/vsock: introduce SOCK_SEQPACKET support

2021-06-18 Thread Michael S. Tsirkin
On Fri, Jun 11, 2021 at 09:00:13PM +, patchwork-bot+netdev...@kernel.org 
wrote:
> Hello:
> 
> This series was applied to netdev/net-next.git (refs/heads/master):
> 
> On Fri, 11 Jun 2021 14:07:40 +0300 you wrote:
> > This patchset implements support of SOCK_SEQPACKET for the virtio
> > transport.
> > As SOCK_SEQPACKET guarantees to preserve record boundaries, a new
> > bit was added to the 'flags' field: SEQ_EOR. This bit is set to 1
> > in the last RW packet of a message.
> > Since packets of one socket are reordered neither on the vsock nor
> > on the vhost transport layer, this bit allows the original message
> > to be restored on the receiver's side. If the user's buffer is
> > smaller than the message length, all data beyond the buffer size
> > is dropped.
> > The maximum length of a datagram is limited by the 'peer_buf_alloc'
> > value. The implementation also supports the 'MSG_TRUNC' flag.
> > Tests are also implemented.
> > 
> > [...]
> 
> Here is the summary with links:
>   - [v11,01/18] af_vsock: update functions for connectible socket
> https://git.kernel.org/netdev/net-next/c/a9e29e5511b9
>   - [v11,02/18] af_vsock: separate wait data loop
> https://git.kernel.org/netdev/net-next/c/b3f7fd54881b
>   - [v11,03/18] af_vsock: separate receive data loop
> https://git.kernel.org/netdev/net-next/c/19c1b90e1979
>   - [v11,04/18] af_vsock: implement SEQPACKET receive loop
> https://git.kernel.org/netdev/net-next/c/9942c192b256
>   - [v11,05/18] af_vsock: implement send logic for SEQPACKET
> https://git.kernel.org/netdev/net-next/c/fbe70c480796
>   - [v11,06/18] af_vsock: rest of SEQPACKET support
> https://git.kernel.org/netdev/net-next/c/0798e78b102b
>   - [v11,07/18] af_vsock: update comments for stream sockets
> https://git.kernel.org/netdev/net-next/c/8cb48554ad82
>   - [v11,08/18] virtio/vsock: set packet's type in 
> virtio_transport_send_pkt_info()
> https://git.kernel.org/netdev/net-next/c/b93f8877c1f2
>   - [v11,09/18] virtio/vsock: simplify credit update function API
> https://git.kernel.org/netdev/net-next/c/c10844c59799
>   - [v11,10/18] virtio/vsock: defines and constants for SEQPACKET
> https://git.kernel.org/netdev/net-next/c/f07b2a5b04d4
>   - [v11,11/18] virtio/vsock: dequeue callback for SOCK_SEQPACKET
> https://git.kernel.org/netdev/net-next/c/44931195a541
>   - [v11,12/18] virtio/vsock: add SEQPACKET receive logic
> https://git.kernel.org/netdev/net-next/c/e4b1ef152f53
>   - [v11,13/18] virtio/vsock: rest of SOCK_SEQPACKET support
> https://git.kernel.org/netdev/net-next/c/9ac841f5e9f2
>   - [v11,14/18] virtio/vsock: enable SEQPACKET for transport
> https://git.kernel.org/netdev/net-next/c/53efbba12cc7
>   - [v11,15/18] vhost/vsock: support SEQPACKET for transport
> https://git.kernel.org/netdev/net-next/c/ced7b713711f
>   - [v11,16/18] vsock/loopback: enable SEQPACKET for transport
> https://git.kernel.org/netdev/net-next/c/6e90a57795aa
>   - [v11,17/18] vsock_test: add SOCK_SEQPACKET tests
> https://git.kernel.org/netdev/net-next/c/41b792d7a86d
>   - [v11,18/18] virtio/vsock: update trace event for SEQPACKET
> https://git.kernel.org/netdev/net-next/c/184039eefeae

Hmm, so the virtio part was merged before the spec was ready.
What's the plan now?


> You are awesome, thank you!
> --
> Deet-doot-dot, I am a bot.
> https://korg.docs.kernel.org/patchwork/pwbot.html
> 



Re: [PATCH v11 11/18] virtio/vsock: dequeue callback for SOCK_SEQPACKET

2021-06-18 Thread Stefano Garzarella

Hi Arseny,
the series looks great, I have just a question below about 
seqpacket_dequeue.


I also sent a couple of simple fixes, it would be great if you could review 
them: 
https://lore.kernel.org/netdev/20210618133526.300347-1-sgarz...@redhat.com/



On Fri, Jun 11, 2021 at 02:12:38PM +0300, Arseny Krasnov wrote:

The callback fetches RW packets from the socket's rx queue until the whole
record is copied (if the user's buffer is full, the user is not woken up).
This is done to avoid stalling the sender: if we woke the user up and it
left the syscall, nobody would send a credit update for the rest of the
record, and the sender would wait for the receiver's next read syscall.
So if the user buffer is full, we just send a credit update and drop the
data.

Signed-off-by: Arseny Krasnov 
---
v10 -> v11:
1) 'msg_count' field added to count current number of EORs.
2) 'msg_ready' argument removed from callback.
3) If 'memcpy_to_msg()' failed during copy loop, there will be
   no next attempts to copy data, rest of record will be freed.

include/linux/virtio_vsock.h|  5 ++
net/vmw_vsock/virtio_transport_common.c | 84 +
2 files changed, 89 insertions(+)

diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index dc636b727179..1d9a302cb91d 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -36,6 +36,7 @@ struct virtio_vsock_sock {
u32 rx_bytes;
u32 buf_alloc;
struct list_head rx_queue;
+   u32 msg_count;
};

struct virtio_vsock_pkt {
@@ -80,6 +81,10 @@ virtio_transport_dgram_dequeue(struct vsock_sock *vsk,
   struct msghdr *msg,
   size_t len, int flags);

+ssize_t
+virtio_transport_seqpacket_dequeue(struct vsock_sock *vsk,
+  struct msghdr *msg,
+  int flags);
s64 virtio_transport_stream_has_data(struct vsock_sock *vsk);
s64 virtio_transport_stream_has_space(struct vsock_sock *vsk);

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index ad0d34d41444..1e1df19ec164 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -393,6 +393,78 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
return err;
}

+static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+struct msghdr *msg,
+int flags)
+{
+   struct virtio_vsock_sock *vvs = vsk->trans;
+   struct virtio_vsock_pkt *pkt;
+   int dequeued_len = 0;
+   size_t user_buf_len = msg_data_left(msg);
+   bool copy_failed = false;
+   bool msg_ready = false;
+
+   spin_lock_bh(&vvs->rx_lock);
+
+   if (vvs->msg_count == 0) {
+   spin_unlock_bh(&vvs->rx_lock);
+   return 0;
+   }
+
+   while (!msg_ready) {
+   pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list);
+
+   if (!copy_failed) {
+   size_t pkt_len;
+   size_t bytes_to_copy;
+
+   pkt_len = (size_t)le32_to_cpu(pkt->hdr.len);
+   bytes_to_copy = min(user_buf_len, pkt_len);
+
+   if (bytes_to_copy) {
+   int err;
+
+   /* sk_lock is held by caller so no one else can dequeue.
+    * Unlock rx_lock since memcpy_to_msg() may sleep.
+    */
+   spin_unlock_bh(&vvs->rx_lock);
+
+   err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy);
+   if (err) {
+   /* Copy of message failed, set flag to skip
+    * copy path for rest of fragments. Rest of
+    * fragments will be freed without copy.
+    */
+   copy_failed = true;
+   dequeued_len = err;


If we fail to copy the message we will discard the entire packet.
Is that acceptable from the user's point of view, or should we leave the 
packet in the queue so the user can retry, maybe with a different 
buffer?


Then we could remove the packets only once we have successfully copied 
all the fragments.


I'm not sure it makes sense, maybe it's better to also check other 
implementations :-)
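
For the record, a rough sketch of that alternative (illustrative only,
not part of the patch: locking around memcpy_to_msg() is elided, and
VIRTIO_VSOCK_SEQ_EOR is assumed to be the SEQ_EOR constant from this
series):

	/* First pass: copy the record without unlinking anything, so a
	 * failed copy leaves the whole record queued for a retry. */
	list_for_each_entry(pkt, &vvs->rx_queue, list) {
		size_t len = min_t(size_t, msg_data_left(msg),
				   le32_to_cpu(pkt->hdr.len));

		err = memcpy_to_msg(msg, pkt->buf, len);
		if (err)
			return err; /* record still intact in rx_queue */

		if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)
			break;
	}

	/* Second pass: unlink and free the copied fragments, then send
	 * the credit update. */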


Thanks,
Stefano


+   } else {
+   user_buf_len -= bytes_to_copy;
+   }
+
+   spin_lock_bh(&vvs->rx_lock);
+   }
+
+   if (dequeued_len >= 0)
+   dequeued_len += pkt_len;
+   }
+
+   if 

[PATCH net-next 3/3] vsock/virtio: remove redundant `copy_failed` variable

2021-06-18 Thread Stefano Garzarella
When memcpy_to_msg() fails in virtio_transport_seqpacket_do_dequeue(),
we already set `dequeued_len` with the negative error value returned
by memcpy_to_msg().

So we can check the `dequeued_len` value directly, instead of using a
dedicated flag variable to skip the copy path for the rest of the
fragments.

Signed-off-by: Stefano Garzarella 
---
 net/vmw_vsock/virtio_transport_common.c | 7 ++-
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport_common.c 
b/net/vmw_vsock/virtio_transport_common.c
index 23704a6bc437..f014ccfdd9c2 100644
--- a/net/vmw_vsock/virtio_transport_common.c
+++ b/net/vmw_vsock/virtio_transport_common.c
@@ -413,7 +413,6 @@ static int virtio_transport_seqpacket_do_dequeue(struct 
vsock_sock *vsk,
struct virtio_vsock_pkt *pkt;
int dequeued_len = 0;
size_t user_buf_len = msg_data_left(msg);
-   bool copy_failed = false;
bool msg_ready = false;
 
spin_lock_bh(&vvs->rx_lock);
@@ -426,7 +425,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct 
vsock_sock *vsk,
while (!msg_ready) {
pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list);
 
-   if (!copy_failed) {
+   if (dequeued_len >= 0) {
size_t pkt_len;
size_t bytes_to_copy;
 
@@ -443,11 +442,9 @@ static int virtio_transport_seqpacket_do_dequeue(struct 
vsock_sock *vsk,
 
err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy);
if (err) {
-   /* Copy of message failed, set flag to skip
-    * copy path for rest of fragments. Rest of
+   /* Copy of message failed. Rest of
 * fragments will be freed without copy.
 */
-   copy_failed = true;
dequeued_len = err;
} else {
user_buf_len -= bytes_to_copy;
-- 
2.31.1



[PATCH net-next 2/3] vsock: rename vsock_wait_data()

2021-06-18 Thread Stefano Garzarella
vsock_wait_data() is used only by STREAM and SEQPACKET sockets,
so let's rename it to vsock_connectible_wait_data(), using the same
nomenclature (connectible) used in other functions after the
introduction of SEQPACKET.

Signed-off-by: Stefano Garzarella 
---
 net/vmw_vsock/af_vsock.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index de8249483081..21ccf450e249 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1866,10 +1866,11 @@ static int vsock_connectible_sendmsg(struct socket 
*sock, struct msghdr *msg,
return err;
 }
 
-static int vsock_wait_data(struct sock *sk, struct wait_queue_entry *wait,
-  long timeout,
-  struct vsock_transport_recv_notify_data *recv_data,
-  size_t target)
+static int vsock_connectible_wait_data(struct sock *sk,
+  struct wait_queue_entry *wait,
+  long timeout,
+  struct vsock_transport_recv_notify_data *recv_data,
+  size_t target)
 {
const struct vsock_transport *transport;
struct vsock_sock *vsk;
@@ -1967,7 +1968,8 @@ static int __vsock_stream_recvmsg(struct sock *sk, struct 
msghdr *msg,
while (1) {
ssize_t read;
 
-   err = vsock_wait_data(sk, &wait, timeout, &recv_data, target);
+   err = vsock_connectible_wait_data(sk, &wait, timeout,
+ &recv_data, target);
if (err <= 0)
break;
 
@@ -2022,7 +2024,7 @@ static int __vsock_seqpacket_recvmsg(struct sock *sk, 
struct msghdr *msg,
 
timeout = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
 
-   err = vsock_wait_data(sk, &wait, timeout, NULL, 0);
+   err = vsock_connectible_wait_data(sk, &wait, timeout, NULL, 0);
if (err <= 0)
goto out;
 
-- 
2.31.1



[PATCH net-next 1/3] vsock: rename vsock_has_data()

2021-06-18 Thread Stefano Garzarella
vsock_has_data() is used only by STREAM and SEQPACKET sockets,
so let's rename it to vsock_connectible_has_data(), using the same
nomenclature (connectible) used in other functions after the
introduction of SEQPACKET.

Signed-off-by: Stefano Garzarella 
---
 net/vmw_vsock/af_vsock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 67954afef4e1..de8249483081 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -860,7 +860,7 @@ s64 vsock_stream_has_data(struct vsock_sock *vsk)
 }
 EXPORT_SYMBOL_GPL(vsock_stream_has_data);
 
-static s64 vsock_has_data(struct vsock_sock *vsk)
+static s64 vsock_connectible_has_data(struct vsock_sock *vsk)
 {
struct sock *sk = sk_vsock(vsk);
 
@@ -1880,7 +1880,7 @@ static int vsock_wait_data(struct sock *sk, struct 
wait_queue_entry *wait,
err = 0;
transport = vsk->transport;
 
-   while ((data = vsock_has_data(vsk)) == 0) {
+   while ((data = vsock_connectible_has_data(vsk)) == 0) {
prepare_to_wait(sk_sleep(sk), wait, TASK_INTERRUPTIBLE);
 
if (sk->sk_err != 0 ||
-- 
2.31.1



[PATCH net-next 0/3] vsock: small fixes for seqpacket support

2021-06-18 Thread Stefano Garzarella
This series contains a few patches to clean up the seqpacket code
recently merged in the net-next tree.

No functionality changes.

Signed-off-by: Stefano Garzarella 

Stefano Garzarella (3):
  vsock: rename vsock_has_data()
  vsock: rename vsock_wait_data()
  vsock/virtio: remove redundant `copy_failed` variable

 net/vmw_vsock/af_vsock.c| 18 ++
 net/vmw_vsock/virtio_transport_common.c |  7 ++-
 2 files changed, 12 insertions(+), 13 deletions(-)

-- 
2.31.1



Re: [PATCH] virtio_net: Enable MSI-X vector for ctrl queue

2021-06-18 Thread Michael S. Tsirkin
On Fri, Jun 18, 2021 at 04:26:25PM +0900, Keiichi Watanabe wrote:
> When we use a vhost-user backend on the host, the MSI-X vector should be
> set so that the vmm can get an irq FD and send it to the backend device
> process with the vhost-user protocol.
> Since whether the vector is set for a queue depends on whether the queue
> has a callback, this commit sets an empty callback for virtio-net's
> control queue.
> 
> Signed-off-by: Keiichi Watanabe 

I'm confused by this explanation. If the vmm wants to get
an interrupt it can do so - why change the guest driver?

> ---
>  drivers/net/virtio_net.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index 11f722460513..002e3695d4b3 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2696,6 +2696,11 @@ static void virtnet_del_vqs(struct virtnet_info *vi)
>   virtnet_free_queues(vi);
>  }
>  
> +static void virtnet_ctrlq_done(struct virtqueue *rvq)
> +{
> + /* Do nothing */
> +}
> +
>  /* How large should a single buffer be so a queue full of these can fit at
>   * least one full packet?
>   * Logic below assumes the mergeable buffer header is used.
> @@ -2748,7 +2753,7 @@ static int virtnet_find_vqs(struct virtnet_info *vi)
>  
>   /* Parameters for control virtqueue, if any */
>   if (vi->has_cvq) {
> - callbacks[total_vqs - 1] = NULL;
> + callbacks[total_vqs - 1] = virtnet_ctrlq_done;
>   names[total_vqs - 1] = "control";
>   }
>  
> -- 
> 2.32.0.288.g62a8d224e6-goog



[PATCH v7 0/2] x86/sev: Fixes for SEV-ES Guest Support

2021-06-18 Thread Joerg Roedel
From: Joerg Roedel 

Hi,

please find here the next iteration of my pending fixes for SEV-ES
guest support in Linux. This version addresses the comments I received
from Peter on the previous version, in particular:

- Removed IRQ disable/enable calls in the ap-hlt loop code
- Made __sev_get/put_ghcb() noinstr instead of __always_inline
  and put instrumentation_begin()/end() calls around panic()
  to fix an objtool warning.
- Split up #DB handling in the #VC handler into a from-user
  and from-kernel part as well.

The patches are again tested with a kernel with various debugging
options enabled, running as an SEV-ES guest.

The previous version of these patches can be found here:

https://lore.kernel.org/lkml/20210616184913.13064-1-j...@8bytes.org/

Please review.

Thanks,

Joerg

Joerg Roedel (2):
  x86/sev: Make sure IRQs are disabled while GHCB is active
  x86/sev: Split up runtime #VC handler for correct state tracking

 arch/x86/entry/entry_64.S   |   4 +-
 arch/x86/include/asm/idtentry.h |  29 ++---
 arch/x86/kernel/sev.c   | 182 ++--
 3 files changed, 113 insertions(+), 102 deletions(-)


base-commit: 07570cef5e5c3fcec40f82a9075abb4c1da63319
-- 
2.31.1



[PATCH v7 2/2] x86/sev: Split up runtime #VC handler for correct state tracking

2021-06-18 Thread Joerg Roedel
From: Joerg Roedel 

Split up the #VC handler code into a from-user and a from-kernel part.
This allows clean and correct state tracking, as the #VC handler needs
to enter NMI-state when raised from kernel mode and plain IRQ state when
raised from user-mode.

Fixes: 62441a1fb532 ("x86/sev-es: Correctly track IRQ states in runtime #VC 
handler")
Suggested-by: Peter Zijlstra 
Signed-off-by: Joerg Roedel 
---
 arch/x86/entry/entry_64.S   |   4 +-
 arch/x86/include/asm/idtentry.h |  29 +++
 arch/x86/kernel/sev.c   | 148 +---
 3 files changed, 91 insertions(+), 90 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index a16a5294d55f..1886aaf19914 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -506,7 +506,7 @@ SYM_CODE_START(\asmsym)
 
movq%rsp, %rdi  /* pt_regs pointer */
 
-   call\cfunc
+   callkernel_\cfunc
 
/*
 * No need to switch back to the IST stack. The current stack is either
@@ -517,7 +517,7 @@ SYM_CODE_START(\asmsym)
 
/* Switch to the regular task stack */
 .Lfrom_usermode_switch_stack_\@:
-   idtentry_body safe_stack_\cfunc, has_error_code=1
+   idtentry_body user_\cfunc, has_error_code=1
 
 _ASM_NOKPROBE(\asmsym)
 SYM_CODE_END(\asmsym)
diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 73d45b0dfff2..cd9f3e304944 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -312,8 +312,8 @@ static __always_inline void __##func(struct pt_regs *regs)
  */
 #define DECLARE_IDTENTRY_VC(vector, func)  \
DECLARE_IDTENTRY_RAW_ERRORCODE(vector, func);   \
-   __visible noinstr void ist_##func(struct pt_regs *regs, unsigned long 
error_code);  \
-   __visible noinstr void safe_stack_##func(struct pt_regs *regs, unsigned 
long error_code)
+   __visible noinstr void kernel_##func(struct pt_regs *regs, unsigned 
long error_code);   \
+   __visible noinstr void   user_##func(struct pt_regs *regs, unsigned 
long error_code)
 
 /**
  * DEFINE_IDTENTRY_IST - Emit code for IST entry points
@@ -355,33 +355,24 @@ static __always_inline void __##func(struct pt_regs *regs)
DEFINE_IDTENTRY_RAW_ERRORCODE(func)
 
 /**
- * DEFINE_IDTENTRY_VC_SAFE_STACK - Emit code for VMM communication handler
-  which runs on a safe stack.
+ * DEFINE_IDTENTRY_VC_KERNEL - Emit code for VMM communication handler
+  when raised from kernel mode
  * @func:  Function name of the entry point
  *
  * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
  */
-#define DEFINE_IDTENTRY_VC_SAFE_STACK(func)\
-   DEFINE_IDTENTRY_RAW_ERRORCODE(safe_stack_##func)
+#define DEFINE_IDTENTRY_VC_KERNEL(func)\
+   DEFINE_IDTENTRY_RAW_ERRORCODE(kernel_##func)
 
 /**
- * DEFINE_IDTENTRY_VC_IST - Emit code for VMM communication handler
-   which runs on the VC fall-back stack
+ * DEFINE_IDTENTRY_VC_USER - Emit code for VMM communication handler
+when raised from user mode
  * @func:  Function name of the entry point
  *
  * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
  */
-#define DEFINE_IDTENTRY_VC_IST(func)   \
-   DEFINE_IDTENTRY_RAW_ERRORCODE(ist_##func)
-
-/**
- * DEFINE_IDTENTRY_VC - Emit code for VMM communication handler
- * @func:  Function name of the entry point
- *
- * Maps to DEFINE_IDTENTRY_RAW_ERRORCODE
- */
-#define DEFINE_IDTENTRY_VC(func)   \
-   DEFINE_IDTENTRY_RAW_ERRORCODE(func)
+#define DEFINE_IDTENTRY_VC_USER(func)  \
+   DEFINE_IDTENTRY_RAW_ERRORCODE(user_##func)
 
 #else  /* CONFIG_X86_64 */
 
diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 9f32cbb773d9..87a4b00f028e 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -793,7 +793,7 @@ void __init sev_es_init_vc_handling(void)
sev_es_setup_play_dead();
 
/* Secondary CPUs use the runtime #VC handler */
-   initial_vc_handler = (unsigned long)safe_stack_exc_vmm_communication;
+   initial_vc_handler = (unsigned long)kernel_exc_vmm_communication;
 }
 
 static void __init vc_early_forward_exception(struct es_em_ctxt *ctxt)
@@ -1231,14 +1231,6 @@ static enum es_result vc_handle_trap_ac(struct ghcb 
*ghcb,
return ES_EXCEPTION;
 }
 
-static __always_inline void vc_handle_trap_db(struct pt_regs *regs)
-{
-   if (user_mode(regs))
-   noist_exc_debug(regs);
-   else
-   exc_debug(regs);
-}
-
 static enum es_result vc_handle_exitcode(struct es_em_ctxt *ctxt,
 struct ghcb *ghcb,
 unsigned long exit_code)
@@ -1334,41 +1326,13 @@ static __always_inline bool 

[PATCH v7 1/2] x86/sev: Make sure IRQs are disabled while GHCB is active

2021-06-18 Thread Joerg Roedel
From: Joerg Roedel 

The #VC handler only cares about IRQs being disabled while the GHCB is
active, as it must not be interrupted by something which could cause
another #VC while it holds the GHCB (NMI is the exception for which the
backup GHCB exists).

Make sure nothing interrupts the code path while the GHCB is active by
requiring callers to disable IRQs around __sev_get_ghcb() and
__sev_put_ghcb(), and by warning in these helpers if IRQs are enabled.
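
For illustration, the calling convention this implies, as used by e.g.
get_jump_table_addr() below (a sketch, not new code):

	struct ghcb_state state;
	unsigned long flags;
	struct ghcb *ghcb;

	local_irq_save(flags);

	ghcb = __sev_get_ghcb(&state);
	/* ... use the GHCB, e.g. vc_ghcb_invalidate(ghcb) ... */
	__sev_put_ghcb(&state);

	local_irq_restore(flags);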

Signed-off-by: Joerg Roedel 
---
 arch/x86/kernel/sev.c | 34 ++
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/x86/kernel/sev.c b/arch/x86/kernel/sev.c
index 8178db07a06a..9f32cbb773d9 100644
--- a/arch/x86/kernel/sev.c
+++ b/arch/x86/kernel/sev.c
@@ -12,7 +12,6 @@
 #include  /* For show_regs() */
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -192,11 +191,19 @@ void noinstr __sev_es_ist_exit(void)
this_cpu_write(cpu_tss_rw.x86_tss.ist[IST_INDEX_VC], *(unsigned long 
*)ist);
 }
 
-static __always_inline struct ghcb *sev_es_get_ghcb(struct ghcb_state *state)
+/*
+ * Nothing shall interrupt this code path while holding the per-CPU
+ * GHCB. The backup GHCB is only for NMIs interrupting this path.
+ *
+ * Callers must disable local interrupts around it.
+ */
+static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
 {
struct sev_es_runtime_data *data;
struct ghcb *ghcb;
 
+   WARN_ON(!irqs_disabled());
+
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
 
@@ -213,7 +220,9 @@ static __always_inline struct ghcb *sev_es_get_ghcb(struct 
ghcb_state *state)
data->ghcb_active = false;
data->backup_ghcb_active = false;
 
+   instrumentation_begin();
panic("Unable to handle #VC exception! GHCB and Backup 
GHCB are already in use");
+   instrumentation_end();
}
 
/* Mark backup_ghcb active before writing to it */
@@ -486,11 +495,13 @@ static enum es_result vc_slow_virt_to_phys(struct ghcb 
*ghcb, struct es_em_ctxt
 /* Include code shared with pre-decompression boot stage */
 #include "sev-shared.c"
 
-static __always_inline void sev_es_put_ghcb(struct ghcb_state *state)
+static noinstr void __sev_put_ghcb(struct ghcb_state *state)
 {
struct sev_es_runtime_data *data;
struct ghcb *ghcb;
 
+   WARN_ON(!irqs_disabled());
+
data = this_cpu_read(runtime_data);
ghcb = &data->ghcb_page;
 
@@ -514,7 +525,7 @@ void noinstr __sev_es_nmi_complete(void)
struct ghcb_state state;
struct ghcb *ghcb;
 
-   ghcb = sev_es_get_ghcb(&state);
+   ghcb = __sev_get_ghcb(&state);
 
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_NMI_COMPLETE);
@@ -524,7 +535,7 @@ void noinstr __sev_es_nmi_complete(void)
sev_es_wr_ghcb_msr(__pa_nodebug(ghcb));
VMGEXIT();
 
-   sev_es_put_ghcb(&state);
+   __sev_put_ghcb(&state);
 }
 
 static u64 get_jump_table_addr(void)
@@ -536,7 +547,7 @@ static u64 get_jump_table_addr(void)
 
local_irq_save(flags);
 
-   ghcb = sev_es_get_ghcb(&state);
+   ghcb = __sev_get_ghcb(&state);
 
vc_ghcb_invalidate(ghcb);
ghcb_set_sw_exit_code(ghcb, SVM_VMGEXIT_AP_JUMP_TABLE);
@@ -550,7 +561,7 @@ static u64 get_jump_table_addr(void)
ghcb_sw_exit_info_2_is_valid(ghcb))
ret = ghcb->save.sw_exit_info_2;
 
-   sev_es_put_ghcb(&state);
+   __sev_put_ghcb(&state);
 
local_irq_restore(flags);
 
@@ -675,7 +686,7 @@ static void sev_es_ap_hlt_loop(void)
struct ghcb_state state;
struct ghcb *ghcb;
 
-   ghcb = sev_es_get_ghcb(&state);
+   ghcb = __sev_get_ghcb(&state);
 
while (true) {
vc_ghcb_invalidate(ghcb);
@@ -692,7 +703,7 @@ static void sev_es_ap_hlt_loop(void)
break;
}
 
-   sev_es_put_ghcb(&state);
+   __sev_put_ghcb(&state);
 }
 
 /*
@@ -1351,7 +1362,6 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
}
 
irq_state = irqentry_nmi_enter(regs);
-   lockdep_assert_irqs_disabled();
instrumentation_begin();
 
/*
@@ -1360,7 +1370,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
 * keep the IRQs disabled to protect us against concurrent TLB flushes.
 */
 
-   ghcb = sev_es_get_ghcb(&state);
+   ghcb = __sev_get_ghcb(&state);
 
vc_ghcb_invalidate(ghcb);
result = vc_init_em_ctxt(&ctxt, regs, error_code);
@@ -1368,7 +1378,7 @@ DEFINE_IDTENTRY_VC_SAFE_STACK(exc_vmm_communication)
if (result == ES_OK)
result = vc_handle_exitcode(&ctxt, ghcb, error_code);
 
-   sev_es_put_ghcb(&state);
+   __sev_put_ghcb(&state);
 
/* Done - now check the result */
switch (result) {
-- 
2.31.1


Re: [PATCH v4 5/6] iommu/dma: Simplify calls to iommu_setup_dma_ops()

2021-06-18 Thread Robin Murphy

On 2021-06-18 11:50, Jean-Philippe Brucker wrote:

On Wed, Jun 16, 2021 at 06:02:39PM +0100, Robin Murphy wrote:

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index c62e19bed302..175f8eaeb5b3 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -1322,7 +1322,9 @@ void iommu_setup_dma_ops(struct device *dev, u64 
dma_base, u64 dma_limit)
if (domain->type == IOMMU_DOMAIN_DMA) {
if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
goto out_err;
-   dev->dma_ops = &iommu_dma_ops;
+   set_dma_ops(dev, &iommu_dma_ops);
+   } else {
+   set_dma_ops(dev, NULL);


I'm not keen on moving this here, since iommu-dma only knows that its own
ops are right for devices it *is* managing; it can't assume any particular
ops are appropriate for devices it isn't. The idea here is that
arch_setup_dma_ops() may have already set the appropriate ops for the
non-IOMMU case, so if the default domain type is passthrough then we leave
those in place.

For example, I do still plan to revisit my conversion of arch/arm someday,
at which point I'd have to undo this for that reason.


Makes sense, I'll remove this bit.


Simplifying the base and size arguments is of course fine, but TBH I'd say
rip the whole bloody lot out of the arch_setup_dma_ops() flow now. It's a
considerable faff passing them around for nothing but a tenuous sanity check
in iommu_dma_init_domain(), and now that dev->dma_range_map is a common
thing we should expect that to give us any relevant limitations if we even
still care.


So I started working on this but it gets too bulky for a preparatory
patch. Dropping the parameters from arch_setup_dma_ops() seems especially
complicated because arm32 does need the size parameter for IOMMU mappings
and that value falls back to the bus DMA mask or U32_MAX in the absence of
dma-ranges. I could try to dig into this for a separate series.

Even only dropping the parameters from iommu_setup_dma_ops() isn't
completely trivial (8 files changed, 55 insertions(+), 36 deletions(-)
because we still need the lower IOVA limit from dma_range_map), so I'd
rather send it separately and have it sit in -next for a while.


Oh, sure, I didn't mean to imply that the whole cleanup should be within 
the scope of this series, just that we can shave off as much as we *do* 
need to touch here (which TBH is pretty much what you're doing already), 
and mainly to start taking the attitude that these arguments are now 
superseded and increasingly vestigial.


I expected the cross-arch cleanup to be a bit fiddly, but I'd forgotten 
that arch/arm was still actively using these values, so maybe I can 
revisit this when I pick up my iommu-dma conversion again (I swear it's 
not dead, just resting!)


Cheers,
Robin.


Re: [PATCH v6 1/2] x86/sev: Make sure IRQs are disabled while GHCB is active

2021-06-18 Thread Peter Zijlstra
On Fri, Jun 18, 2021 at 10:17:54AM +0200, Joerg Roedel wrote:
> On Thu, Jun 17, 2021 at 05:38:46PM +0200, Peter Zijlstra wrote:
> > I'm getting (with all of v6.1 applied):
> > 
> > vmlinux.o: warning: objtool: __sev_es_nmi_complete()+0x1bf: call to panic() 
> > leaves .noinstr.text section
> > 
> > $ ./scripts/faddr2line defconfig-build/vmlinux __sev_es_nmi_complete+0x1bf
> > __sev_es_nmi_complete+0x1bf/0x1d0:
> > __sev_get_ghcb at arch/x86/kernel/sev.c:223
> > (inlined by) __sev_es_nmi_complete at arch/x86/kernel/sev.c:519
> 
> I see where this is coming from, but can't create the same warning. I
> did run 'objtool check -n vmlinux'. Is there more to do to get the full
> check?

You get those when you enable CONFIG_DEBUG_ENTRY=y (make sure to have
PARAVIRT=n, I so need to go fix that :/).
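
For reference, one way to flip those options (a sketch assuming the
in-tree scripts/config helper; adjust for your own config workflow):

  $ ./scripts/config --enable DEBUG_ENTRY --disable PARAVIRT
  $ make olddefconfig && make vmlinux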


Re: [PATCH v4 5/6] iommu/dma: Simplify calls to iommu_setup_dma_ops()

2021-06-18 Thread Jean-Philippe Brucker
On Wed, Jun 16, 2021 at 06:02:39PM +0100, Robin Murphy wrote:
> > diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
> > index c62e19bed302..175f8eaeb5b3 100644
> > --- a/drivers/iommu/dma-iommu.c
> > +++ b/drivers/iommu/dma-iommu.c
> > @@ -1322,7 +1322,9 @@ void iommu_setup_dma_ops(struct device *dev, u64 
> > dma_base, u64 dma_limit)
> > if (domain->type == IOMMU_DOMAIN_DMA) {
> > if (iommu_dma_init_domain(domain, dma_base, dma_limit, dev))
> > goto out_err;
> > -   dev->dma_ops = &iommu_dma_ops;
> > +   set_dma_ops(dev, &iommu_dma_ops);
> > +   } else {
> > +   set_dma_ops(dev, NULL);
> 
> I'm not keen on moving this here, since iommu-dma only knows that its own
> ops are right for devices it *is* managing; it can't assume any particular
> ops are appropriate for devices it isn't. The idea here is that
> arch_setup_dma_ops() may have already set the appropriate ops for the
> non-IOMMU case, so if the default domain type is passthrough then we leave
> those in place.
> 
> For example, I do still plan to revisit my conversion of arch/arm someday,
> at which point I'd have to undo this for that reason.

Makes sense, I'll remove this bit.

> Simplifying the base and size arguments is of course fine, but TBH I'd say
> rip the whole bloody lot out of the arch_setup_dma_ops() flow now. It's a
> considerable faff passing them around for nothing but a tenuous sanity check
> in iommu_dma_init_domain(), and now that dev->dma_range_map is a common
> thing we should expect that to give us any relevant limitations if we even
> still care.

So I started working on this but it gets too bulky for a preparatory
patch. Dropping the parameters from arch_setup_dma_ops() seems especially
complicated because arm32 does need the size parameter for IOMMU mappings
and that value falls back to the bus DMA mask or U32_MAX in the absence of
dma-ranges. I could try to dig into this for a separate series.

Even only dropping the parameters from iommu_setup_dma_ops() isn't
completely trivial (8 files changed, 55 insertions(+), 36 deletions(-)
because we still need the lower IOVA limit from dma_range_map), so I'd
rather send it separately and have it sit in -next for a while.

Thanks,
Jean

> 
> That said, those are all things which can be fixed up later if the series is
> otherwise ready to go and there's still a chance of landing it for 5.14. If
> you do have any other reason to respin, then I think the x86 probe_finalize
> functions simply want an unconditional set_dma_ops(dev, NULL) before the
> iommu_setup_dma_ops() call.
> 
> Cheers,
> Robin.
> 
> > }
> > return;
> > diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> > index 85f18342603c..8d866940692a 100644
> > --- a/drivers/iommu/intel/iommu.c
> > +++ b/drivers/iommu/intel/iommu.c
> > @@ -5165,15 +5165,7 @@ static void intel_iommu_release_device(struct device 
> > *dev)
> >   static void intel_iommu_probe_finalize(struct device *dev)
> >   {
> > -   dma_addr_t base = IOVA_START_PFN << VTD_PAGE_SHIFT;
> > -   struct iommu_domain *domain = iommu_get_domain_for_dev(dev);
> > -   struct dmar_domain *dmar_domain = to_dmar_domain(domain);
> > -
> > -   if (domain && domain->type == IOMMU_DOMAIN_DMA)
> > -   iommu_setup_dma_ops(dev, base,
> > -   __DOMAIN_MAX_ADDR(dmar_domain->gaw));
> > -   else
> > -   set_dma_ops(dev, NULL);
> > +   iommu_setup_dma_ops(dev, 0, U64_MAX);
> >   }
> >   static void intel_iommu_get_resv_regions(struct device *device,
> > 
___
Virtualization mailing list
Virtualization@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/virtualization


Re: [RFC v1 3/6] vhost/vsock: add support for vhost dgram.

2021-06-18 Thread Stefano Garzarella

We should use le16_to_cpu when accessing pkt->hdr fields.
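
For illustration, a minimal sketch of the conversions (field widths
assumed per struct virtio_vsock_hdr: 'type' is __le16, 'len' is __le32):

	if (le16_to_cpu(pkt->hdr.type) == VIRTIO_VSOCK_TYPE_DGRAM)
		is_dgram = true;

	pkt_len = le32_to_cpu(pkt->hdr.len);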

On Wed, Jun 09, 2021 at 11:24:55PM +, Jiang Wang wrote:

This patch supports dgram on the vhost side, including
tx and rx. The vhost side sends packets asynchronously.

Signed-off-by: Jiang Wang 
---
drivers/vhost/vsock.c | 199 +++---
1 file changed, 173 insertions(+), 26 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 81d064601093..d366463be6d4 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -28,7 +28,10 @@
 * small pkts.
 */
#define VHOST_VSOCK_PKT_WEIGHT 256
+#define VHOST_VSOCK_DGRM_MAX_PENDING_PKT 128

+/* Max wait time in busy poll in microseconds */
+#define VHOST_VSOCK_BUSY_POLL_TIMEOUT 20
enum {
VHOST_VSOCK_FEATURES = VHOST_FEATURES |
   (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
@@ -45,7 +48,7 @@ static DEFINE_READ_MOSTLY_HASHTABLE(vhost_vsock_hash, 8);

struct vhost_vsock {
struct vhost_dev dev;
-   struct vhost_virtqueue vqs[2];
+   struct vhost_virtqueue vqs[4];

/* Link to global vhost_vsock_hash, writes use vhost_vsock_mutex */
struct hlist_node hash;
@@ -54,6 +57,11 @@ struct vhost_vsock {
spinlock_t send_pkt_list_lock;
struct list_head send_pkt_list; /* host->guest pending packets */

+   spinlock_t dgram_send_pkt_list_lock;
+   struct list_head dgram_send_pkt_list;   /* host->guest pending packets 
*/
+   struct vhost_work dgram_send_pkt_work;
+   int dgram_used; /* pending packets to be sent */
+
atomic_t queued_replies;

u32 guest_cid;
@@ -90,10 +98,22 @@ static void
vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
struct vhost_virtqueue *vq)
{
-   struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+   struct vhost_virtqueue *tx_vq;
int pkts = 0, total_len = 0;
bool added = false;
bool restart_tx = false;
+   spinlock_t *lock;
+   struct list_head *send_pkt_list;
+
+   if (vq == &vsock->vqs[VSOCK_VQ_RX]) {
+   tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+   lock = &vsock->send_pkt_list_lock;
+   send_pkt_list = &vsock->send_pkt_list;
+   } else {
+   tx_vq = &vsock->vqs[VSOCK_VQ_DGRAM_TX];
+   lock = &vsock->dgram_send_pkt_list_lock;
+   send_pkt_list = &vsock->dgram_send_pkt_list;
+   }

mutex_lock(&vq->mutex);

@@ -113,36 +133,48 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
size_t nbytes;
size_t iov_len, payload_len;
int head;
+   bool is_dgram = false;

-   spin_lock_bh(&vsock->send_pkt_list_lock);
-   if (list_empty(&vsock->send_pkt_list)) {
-   spin_unlock_bh(&vsock->send_pkt_list_lock);
+   spin_lock_bh(lock);
+   if (list_empty(send_pkt_list)) {
+   spin_unlock_bh(lock);
vhost_enable_notify(>dev, vq);
break;
}

-   pkt = list_first_entry(&vsock->send_pkt_list,
+   pkt = list_first_entry(send_pkt_list,
   struct virtio_vsock_pkt, list);
list_del_init(&pkt->list);
-   spin_unlock_bh(&vsock->send_pkt_list_lock);
+   spin_unlock_bh(lock);
+
+   if (pkt->hdr.type == VIRTIO_VSOCK_TYPE_DGRAM)

^
le16_to_cpu(pkt->hdr.type)


+   is_dgram = true;

head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
 &out, &in, NULL, NULL);
if (head < 0) {
-   spin_lock_bh(&vsock->send_pkt_list_lock);
-   list_add(&pkt->list, &vsock->send_pkt_list);
-   spin_unlock_bh(&vsock->send_pkt_list_lock);
+   spin_lock_bh(lock);
+   list_add(>list, send_pkt_list);
+   spin_unlock_bh(lock);
break;
}

if (head == vq->num) {
-   spin_lock_bh(&vsock->send_pkt_list_lock);
-   list_add(&pkt->list, &vsock->send_pkt_list);
-   spin_unlock_bh(&vsock->send_pkt_list_lock);
+   if (is_dgram) {
+   virtio_transport_free_pkt(pkt);
+   vq_err(vq, "Dgram virtqueue is full!");
+   spin_lock_bh(lock);
+   vsock->dgram_used--;
+   spin_unlock_bh(lock);
+   break;
+   }
+   spin_lock_bh(lock);
+   list_add(&pkt->list, send_pkt_list);
+   spin_unlock_bh(lock);

/* We cannot finish yet if more buffers snuck in while
-* re-enabling notify.
-*/
+  


Re: [RFC v1 6/6] virtio/vsock: add sysfs for rx buf len for dgram

2021-06-18 Thread Stefano Garzarella

On Wed, Jun 09, 2021 at 11:24:58PM +, Jiang Wang wrote:

Make rx buf len configurable via sysfs

Signed-off-by: Jiang Wang 
---
net/vmw_vsock/virtio_transport.c | 37 +++--
1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index cf47aadb0c34..2e4dd9c48472 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -29,6 +29,14 @@ static struct virtio_vsock __rcu *the_virtio_vsock;
static struct virtio_vsock *the_virtio_vsock_dgram;
static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */

+static int rx_buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+static struct kobject *kobj_ref;
+static ssize_t  sysfs_show(struct kobject *kobj,
+   struct kobj_attribute *attr, char *buf);
+static ssize_t  sysfs_store(struct kobject *kobj,
+   struct kobj_attribute *attr, const char *buf, size_t count);
+static struct kobj_attribute rxbuf_attr = __ATTR(rx_buf_value, 0660, sysfs_show, sysfs_store);


Maybe better to use a 'dgram' prefix.


+
struct virtio_vsock {
struct virtio_device *vdev;
struct virtqueue **vqs;
@@ -360,7 +368,7 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk)

static void virtio_vsock_rx_fill(struct virtio_vsock *vsock, bool is_dgram)
{
-   int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+   int buf_len = rx_buf_len;
struct virtio_vsock_pkt *pkt;
struct scatterlist hdr, buf, *sgs[2];
struct virtqueue *vq;
@@ -1003,6 +1011,22 @@ static struct virtio_driver virtio_vsock_driver = {
.remove = virtio_vsock_remove,
};

+static ssize_t sysfs_show(struct kobject *kobj,
+   struct kobj_attribute *attr, char *buf)
+{
+   return sprintf(buf, "%d", rx_buf_len);
+}
+
+static ssize_t sysfs_store(struct kobject *kobj,
+   struct kobj_attribute *attr, const char *buf, size_t count)
+{
+   if (kstrtou32(buf, 0, &rx_buf_len) < 0)
+   return -EINVAL;
+   if (rx_buf_len < 1024)
+   rx_buf_len = 1024;
+   return count;
+}
+
static int __init virtio_vsock_init(void)
{
int ret;
@@ -1020,8 +1044,17 @@ static int __init virtio_vsock_init(void)
if (ret)
goto out_vci;

-   return 0;
+   kobj_ref = kobject_create_and_add("vsock", kernel_kobj);


So, IIUC, the path will be /sys/vsock/rx_buf_value?

I'm not sure if we need to add a `virtio` subdir (e.g.
/sys/vsock/virtio/dgram_rx_buf_size)
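
For illustration, a rough sketch of that layout (names are illustrative,
reusing the sysfs_show()/sysfs_store() handlers from this patch):

	static struct kobject *vsock_kobj, *virtio_kobj;
	static struct kobj_attribute dgram_rx_buf_size_attr =
		__ATTR(dgram_rx_buf_size, 0660, sysfs_show, sysfs_store);

	static int __init vsock_sysfs_init(void)
	{
		vsock_kobj = kobject_create_and_add("vsock", kernel_kobj);
		if (!vsock_kobj)
			return -ENOMEM;

		virtio_kobj = kobject_create_and_add("virtio", vsock_kobj);
		if (!virtio_kobj) {
			kobject_put(vsock_kobj);
			return -ENOMEM;
		}

		/* appears as /sys/vsock/virtio/dgram_rx_buf_size */
		return sysfs_create_file(virtio_kobj,
					 &dgram_rx_buf_size_attr.attr);
	}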

Thanks,
Stefano



+   /* Create sysfs file for rx_buf_value */
+   ret = sysfs_create_file(kobj_ref, &rxbuf_attr.attr);
+   if (ret)
+   goto out_sysfs;
+
+   return 0;
+out_sysfs:
+   kobject_put(kobj_ref);
+   sysfs_remove_file(kernel_kobj, &rxbuf_attr.attr);
out_vci:
vsock_core_unregister(&virtio_transport.transport);
out_wq:
--
2.11.0





Re: [RFC v1 5/6] vhost/vsock: add kconfig for vhost dgram support

2021-06-18 Thread Stefano Garzarella

On Wed, Jun 09, 2021 at 11:24:57PM +, Jiang Wang wrote:

Also change the number of vqs according to the config.

Signed-off-by: Jiang Wang 
---
drivers/vhost/Kconfig |  8 
drivers/vhost/vsock.c | 11 ---
2 files changed, 16 insertions(+), 3 deletions(-)


As we already discussed, I think we don't need this patch.

Thanks,
Stefano



Re: [RFC v1 2/6] virtio/vsock: add support for virtio datagram

2021-06-18 Thread Stefano Garzarella

On Wed, Jun 09, 2021 at 11:24:54PM +, Jiang Wang wrote:

This patch adds support for virtio dgram to the driver.
It implements the related functions for tx and rx, enqueue
and dequeue. Packets are sent synchronously to give the
sender an indication when the virtqueue is full.
virtio_transport_send_pkt_work() is refactored a little, but
with no functional changes.

Support for the host/device side is in another
patch.

Signed-off-by: Jiang Wang 
---
include/net/af_vsock.h |   1 +
.../trace/events/vsock_virtio_transport_common.h   |   5 +-
include/uapi/linux/virtio_vsock.h  |   1 +
net/vmw_vsock/af_vsock.c   |  12 +
net/vmw_vsock/virtio_transport.c   | 325 ++---
net/vmw_vsock/virtio_transport_common.c| 184 ++--
6 files changed, 466 insertions(+), 62 deletions(-)

diff --git a/include/net/af_vsock.h b/include/net/af_vsock.h
index b1c717286993..fcae7bca9609 100644
--- a/include/net/af_vsock.h
+++ b/include/net/af_vsock.h
@@ -200,6 +200,7 @@ void vsock_remove_sock(struct vsock_sock *vsk);
void vsock_for_each_connected_socket(void (*fn)(struct sock *sk));
int vsock_assign_transport(struct vsock_sock *vsk, struct vsock_sock *psk);
bool vsock_find_cid(unsigned int cid);
+int vsock_bind_stream(struct vsock_sock *vsk, struct sockaddr_vm *addr);

/ TAP /

diff --git a/include/trace/events/vsock_virtio_transport_common.h 
b/include/trace/events/vsock_virtio_transport_common.h
index 6782213778be..b1be25b327a1 100644
--- a/include/trace/events/vsock_virtio_transport_common.h
+++ b/include/trace/events/vsock_virtio_transport_common.h
@@ -9,9 +9,12 @@
#include 

TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_STREAM);
+TRACE_DEFINE_ENUM(VIRTIO_VSOCK_TYPE_DGRAM);

#define show_type(val) \
-   __print_symbolic(val, { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" })
+__print_symbolic(val, \
+   { VIRTIO_VSOCK_TYPE_STREAM, "STREAM" }, 
\
+   { VIRTIO_VSOCK_TYPE_DGRAM, "DGRAM" })

TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_INVALID);
TRACE_DEFINE_ENUM(VIRTIO_VSOCK_OP_REQUEST);
diff --git a/include/uapi/linux/virtio_vsock.h 
b/include/uapi/linux/virtio_vsock.h
index b56614dff1c9..5503585b26e8 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -68,6 +68,7 @@ struct virtio_vsock_hdr {

enum virtio_vsock_type {
VIRTIO_VSOCK_TYPE_STREAM = 1,
+   VIRTIO_VSOCK_TYPE_DGRAM = 3,
};

enum virtio_vsock_op {
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 92a72f0e0d94..c1f512291b94 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -659,6 +659,18 @@ static int __vsock_bind_stream(struct vsock_sock *vsk,
return 0;
}

+int vsock_bind_stream(struct vsock_sock *vsk,
+  struct sockaddr_vm *addr)
+{
+   int retval;
+
+   spin_lock_bh(&vsock_table_lock);
+   retval = __vsock_bind_stream(vsk, addr);
+   spin_unlock_bh(&vsock_table_lock);
+   return retval;
+}
+EXPORT_SYMBOL(vsock_bind_stream);
+
static int __vsock_bind_dgram(struct vsock_sock *vsk,
  struct sockaddr_vm *addr)
{
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 7dcb8db23305..cf47aadb0c34 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -20,21 +20,29 @@
#include 
#include 
#include 
+#include

  ^
  Space needed here

+#include

  ^
  Ditto

+#include 

static struct workqueue_struct *virtio_vsock_workqueue;
static struct virtio_vsock __rcu *the_virtio_vsock;
+static struct virtio_vsock *the_virtio_vsock_dgram;
static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects the_virtio_vsock */

struct virtio_vsock {
struct virtio_device *vdev;
struct virtqueue **vqs;
bool has_dgram;
+   refcount_t active;

/* Virtqueue processing is deferred to a workqueue */
struct work_struct tx_work;
struct work_struct rx_work;
struct work_struct event_work;

+   struct work_struct dgram_tx_work;
+   struct work_struct dgram_rx_work;
+
/* The following fields are protected by tx_lock.  vqs[VSOCK_VQ_TX]
 * must be accessed with tx_lock held.
 */
@@ -55,6 +63,22 @@ struct virtio_vsock {
int rx_buf_nr;
int rx_buf_max_nr;

+   /* The following fields are protected by dgram_tx_lock.
+    * vqs[VSOCK_VQ_DGRAM_TX] must be accessed with dgram_tx_lock held.
+    */
+   struct mutex dgram_tx_lock;
+   bool dgram_tx_run;
+
+   atomic_t dgram_queued_replies;
+
+   /* The following fields are protected by dgram_rx_lock.
+    * vqs[VSOCK_VQ_DGRAM_RX] must be accessed with dgram_rx_lock held.
+    */
+   struct mutex dgram_rx_lock;
+   bool dgram_rx_run;
+   int dgram_rx_buf_nr;
+   int dgram_rx_buf_max_nr;
+
 

Re: [RFC v1 1/6] virtio/vsock: add VIRTIO_VSOCK_F_DGRAM feature bit

2021-06-18 Thread Stefano Garzarella

On Wed, Jun 09, 2021 at 11:24:53PM +, Jiang Wang wrote:

When this feature is enabled, allocate 5 queues;
otherwise, allocate 3 queues to be compatible with
old QEMU versions.

Signed-off-by: Jiang Wang 
---
drivers/vhost/vsock.c |  3 +-
include/linux/virtio_vsock.h  |  9 +
include/uapi/linux/virtio_vsock.h |  3 ++
net/vmw_vsock/virtio_transport.c  | 73 +++
4 files changed, 80 insertions(+), 8 deletions(-)

diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index 5e78fb719602..81d064601093 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -31,7 +31,8 @@

enum {
VHOST_VSOCK_FEATURES = VHOST_FEATURES |
-  (1ULL << VIRTIO_F_ACCESS_PLATFORM)
+  (1ULL << VIRTIO_F_ACCESS_PLATFORM) |
+  (1ULL << VIRTIO_VSOCK_F_DGRAM)
};

enum {
diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
index dc636b727179..ba3189ed9345 100644
--- a/include/linux/virtio_vsock.h
+++ b/include/linux/virtio_vsock.h
@@ -18,6 +18,15 @@ enum {
VSOCK_VQ_MAX= 3,
};

+enum {
+   VSOCK_VQ_STREAM_RX = 0, /* for host to guest data */
+   VSOCK_VQ_STREAM_TX = 1, /* for guest to host data */
+   VSOCK_VQ_DGRAM_RX   = 2,
+   VSOCK_VQ_DGRAM_TX   = 3,
+   VSOCK_VQ_EX_EVENT   = 4,
+   VSOCK_VQ_EX_MAX = 5,
+};
+
/* Per-socket state (accessed via vsk->trans) */
struct virtio_vsock_sock {
struct vsock_sock *vsk;
diff --git a/include/uapi/linux/virtio_vsock.h 
b/include/uapi/linux/virtio_vsock.h
index 1d57ed3d84d2..b56614dff1c9 100644
--- a/include/uapi/linux/virtio_vsock.h
+++ b/include/uapi/linux/virtio_vsock.h
@@ -38,6 +38,9 @@
#include 
#include 

+/* The feature bitmap for virtio vsock */
+#define VIRTIO_VSOCK_F_DGRAM   0   /* Host support dgram vsock */
+
struct virtio_vsock_config {
__le64 guest_cid;
} __attribute__((packed));
diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
index 2700a63ab095..7dcb8db23305 100644
--- a/net/vmw_vsock/virtio_transport.c
+++ b/net/vmw_vsock/virtio_transport.c
@@ -27,7 +27,8 @@ static DEFINE_MUTEX(the_virtio_vsock_mutex); /* protects 
the_virtio_vsock */

struct virtio_vsock {
struct virtio_device *vdev;
-   struct virtqueue *vqs[VSOCK_VQ_MAX];
+   struct virtqueue **vqs;
+   bool has_dgram;

/* Virtqueue processing is deferred to a workqueue */
struct work_struct tx_work;
@@ -333,7 +334,10 @@ static int virtio_vsock_event_fill_one(struct virtio_vsock 
*vsock,
struct scatterlist sg;
struct virtqueue *vq;

-   vq = vsock->vqs[VSOCK_VQ_EVENT];
+   if (vsock->has_dgram)
+   vq = vsock->vqs[VSOCK_VQ_EX_EVENT];
+   else
+   vq = vsock->vqs[VSOCK_VQ_EVENT];

sg_init_one(&sg, event, sizeof(*event));

@@ -351,7 +355,10 @@ static void virtio_vsock_event_fill(struct virtio_vsock 
*vsock)
virtio_vsock_event_fill_one(vsock, event);
}

-   virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
+   if (vsock->has_dgram)
+   virtqueue_kick(vsock->vqs[VSOCK_VQ_EX_EVENT]);
+   else
+   virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
}

static void virtio_vsock_reset_sock(struct sock *sk)
@@ -391,7 +398,10 @@ static void virtio_transport_event_work(struct work_struct 
*work)
container_of(work, struct virtio_vsock, event_work);
struct virtqueue *vq;

-   vq = vsock->vqs[VSOCK_VQ_EVENT];
+   if (vsock->has_dgram)
+   vq = vsock->vqs[VSOCK_VQ_EX_EVENT];
+   else
+   vq = vsock->vqs[VSOCK_VQ_EVENT];

mutex_lock(&vsock->event_lock);

@@ -411,7 +421,10 @@ static void virtio_transport_event_work(struct work_struct 
*work)
}
} while (!virtqueue_enable_cb(vq));

-   virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
+   if (vsock->has_dgram)
+   virtqueue_kick(vsock->vqs[VSOCK_VQ_EX_EVENT]);
+   else
+   virtqueue_kick(vsock->vqs[VSOCK_VQ_EVENT]);
out:
mutex_unlock(&vsock->event_lock);
}
@@ -434,6 +447,10 @@ static void virtio_vsock_tx_done(struct virtqueue *vq)
queue_work(virtio_vsock_workqueue, &vsock->tx_work);
}

+static void virtio_vsock_dgram_tx_done(struct virtqueue *vq)
+{
+}
+
static void virtio_vsock_rx_done(struct virtqueue *vq)
{
struct virtio_vsock *vsock = vq->vdev->priv;
@@ -443,6 +460,10 @@ static void virtio_vsock_rx_done(struct virtqueue *vq)
queue_work(virtio_vsock_workqueue, &vsock->rx_work);
}

+static void virtio_vsock_dgram_rx_done(struct virtqueue *vq)
+{
+}
+
static struct virtio_transport virtio_transport = {
.transport = {
.module   = THIS_MODULE,
@@ -545,13 +566,29 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
virtio_vsock_tx_done,
virtio_vsock_event_done,
};
+

Re: [RFC v1 0/6] virtio/vsock: introduce SOCK_DGRAM support

2021-06-18 Thread Stefano Garzarella

On Wed, Jun 09, 2021 at 11:24:52PM +, Jiang Wang wrote:

This patchset implements support of SOCK_DGRAM for virtio
transport.

Datagram sockets are connectionless and unreliable. To avoid unfair contention
with stream and other sockets, add two more virtqueues and
a new feature bit to indicate if those two new queues exist or not.

Dgram does not use the existing credit update mechanism for
stream sockets. When sending from the guest/driver, packets are sent
synchronously, so the sender gets an error when the virtqueue is full.
When sending from the host/device, packets are sent asynchronously
because the descriptor memory belongs to the corresponding QEMU
process.

The virtio spec patch is here:
https://www.spinics.net/lists/linux-virtualization/msg50027.html

For those who prefer git repo, here is the link for the linux kernel:
https://github.com/Jiang1155/linux/tree/vsock-dgram-v1

qemu patch link:
https://github.com/Jiang1155/qemu/tree/vsock-dgram-v1


To do:
1. use skb when receiving packets
2. support multiple transport
3. support mergeable rx buffer


Jiang, I'll do a fast review, but I think it's better to rebase on 
net-next since SEQPACKET support is now merged.


Please also run ./scripts/checkpatch.pl, there are a lot of issues.

I'll leave some simple comments in the patches, but I prefer to do a 
deep review after the rebase and the dynamic handling of DGRAM.


Thanks,
Stefano


Re: [PATCH v4 4/6] iommu/dma: Pass address limit rather than size to iommu_setup_dma_ops()

2021-06-18 Thread Jean-Philippe Brucker
On Wed, Jun 16, 2021 at 05:28:59PM +0200, Eric Auger wrote:
> Hi Jean,
> 
> On 6/10/21 9:51 AM, Jean-Philippe Brucker wrote:
> > diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> > index 4bf1dd3eb041..7bd1d2199141 100644
> > --- a/arch/arm64/mm/dma-mapping.c
> > +++ b/arch/arm64/mm/dma-mapping.c
> > @@ -50,7 +50,7 @@ void arch_setup_dma_ops(struct device *dev, u64 dma_base, 
> > u64 size,
> >  
> > dev->dma_coherent = coherent;
> > if (iommu)
> > -   iommu_setup_dma_ops(dev, dma_base, size);
> > +   iommu_setup_dma_ops(dev, dma_base, size - dma_base - 1);
> I don't get size - dma_base - 1?

Because it's wrong, should be dma_base + size - 1. Thanks for catching it!
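
A worked example of the difference (standalone, with made-up window
values):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		uint64_t dma_base = 0x80000000ULL; /* example window base */
		uint64_t size = 1ULL << 30;        /* example size: 1 GiB */

		/* inclusive limit, what iommu_setup_dma_ops() now takes */
		uint64_t limit = dma_base + size - 1;   /* 0xbfffffff */

		/* the buggy expression from the hunk above underflows */
		uint64_t wrong = size - dma_base - 1;

		printf("limit=%#llx wrong=%#llx\n",
		       (unsigned long long)limit,
		       (unsigned long long)wrong);
		return 0;
	}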

Thanks,
Jean


Re: [PATCH v4 2/6] ACPI: Move IOMMU setup code out of IORT

2021-06-18 Thread Robin Murphy

On 2021-06-18 08:41, Jean-Philippe Brucker wrote:

Hi Eric,

On Wed, Jun 16, 2021 at 11:35:13AM +0200, Eric Auger wrote:

-const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
-   const u32 *id_in)
+int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
  {
struct acpi_iort_node *node;
-   const struct iommu_ops *ops;
+   const struct iommu_ops *ops = NULL;


Oops, I need to remove this (and add -Werror to my tests.)



+static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
+  const u32 *id_in)
+{
+   int err;
+   const struct iommu_ops *ops;
+
+   /*
+* If we already translated the fwspec there is nothing left to do,
+* return the iommu_ops.
+*/
+   ops = acpi_iommu_fwspec_ops(dev);
+   if (ops)
+   return ops;
+
+   err = iort_iommu_configure_id(dev, id_in);
+
+   /*
+* If we have reason to believe the IOMMU driver missed the initial
+* add_device callback for dev, replay it to get things in order.
+*/
+   if (!err && dev->bus && !device_iommu_mapped(dev))
+   err = iommu_probe_device(dev);

Previously we had:
     if (!err) {
         ops = iort_fwspec_iommu_ops(dev);
         err = iort_add_device_replay(dev);
     }

Please can you explain the transform? I see the
acpi_iommu_fwspec_ops() call below but it is not straightforward to me.


I figured that iort_add_device_replay() is only used once and is
sufficiently simple to be inlined manually (saving 10 lines). Then I
replaced the ops assignment with returns, which saves another line and may
be slightly clearer?  I guess it's mostly a matter of taste, the behavior
should be exactly the same.


Right, IIRC the multiple assignments to ops were more of a haphazard 
evolution inherited from the DT version, and looking at it now I think 
the multiple-return is indeed a bit nicer.


Similarly, it looks like the factoring out of iort_add_device_replay() 
was originally an attempt to encapsulate the IOMMU_API dependency, but 
things have moved around a lot since then, so that seems like a sensible 
simplification to make too.


Robin.




Also the comment mentions replay. Unsure if it is still OK.


The "replay" part is, but "add_device" isn't accurate because it has since
been replaced by probe_device. I'll refresh the comment.

Thanks,
Jean

Re: [PATCH V3 1/3] gpio: Add virtio-gpio driver

2021-06-18 Thread Linus Walleij
On Wed, Jun 16, 2021 at 5:52 PM Enrico Weigelt, metux IT consult wrote:
> On 16.06.21 05:30, Bjorn Andersson wrote:
>
> > Combined with the virtio-i2c effort this could provide an alternative by
> > simply tunneling the busses and GPIOs into Linux and use standard iio
> > drivers, for cases where this suits your product requirements better.
>
> So, you wanna use virtio as a logical interface between the two CPUs?
> Interesting idea. Usually folks use rpmsg for those things.

rpmsg uses shared memory as its transport mechanism, and virtio is what
provides it. rpmsg is conceptually a child of virtio: when the
subsystem was proposed by TI, Arnd noted that virtio had large
similarities in its shared-memory transport, and since the potential
for reusing buffer handling etc. was excellent, virtio was used as
a base for the rpmsg/remoteproc work.

Yours,
Linus Walleij


[PATCH] eventfd: Enlarge recursion limit to allow vhost to work

2021-06-18 Thread He Zhe
Commit b5e683d5cab8 ("eventfd: track eventfd_signal() recursion depth")
introduced a percpu counter that tracks the percpu recursion depth and
warns if it is greater than zero, to avoid potential deadlock and stack
overflow.

However, sometimes different eventfds may be used in parallel. Specifically,
when heavy network load goes through kvm and vhost, working together as
below, it triggers the following call trace.

-  100.00%
   - 66.51%
ret_from_fork
kthread
  - vhost_worker
 - 33.47% handle_tx_kick
  handle_tx
  handle_tx_copy
  vhost_tx_batch.isra.0
  vhost_add_used_and_signal_n
  eventfd_signal
 - 33.05% handle_rx_net
  handle_rx
  vhost_add_used_and_signal_n
  eventfd_signal
   - 33.49%
ioctl
entry_SYSCALL_64_after_hwframe
do_syscall_64
__x64_sys_ioctl
ksys_ioctl
do_vfs_ioctl
kvm_vcpu_ioctl
kvm_arch_vcpu_ioctl_run
vmx_handle_exit
handle_ept_misconfig
kvm_io_bus_write
__kvm_io_bus_write
eventfd_signal

001: WARNING: CPU: 1 PID: 1503 at fs/eventfd.c:73 eventfd_signal+0x85/0xa0
 snip 
001: Call Trace:
001:  vhost_signal+0x15e/0x1b0 [vhost]
001:  vhost_add_used_and_signal_n+0x2b/0x40 [vhost]
001:  handle_rx+0xb9/0x900 [vhost_net]
001:  handle_rx_net+0x15/0x20 [vhost_net]
001:  vhost_worker+0xbe/0x120 [vhost]
001:  kthread+0x106/0x140
001:  ? log_used.part.0+0x20/0x20 [vhost]
001:  ? kthread_park+0x90/0x90
001:  ret_from_fork+0x35/0x40
001: ---[ end trace 0003 ]---

This patch enlarges the limit to 1, which is the maximum recursion depth we
have found so far.

Credit for the modification of eventfd_signal_count goes to
Xie Yongji 

Signed-off-by: He Zhe 
---
 fs/eventfd.c| 3 ++-
 include/linux/eventfd.h | 5 -
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/eventfd.c b/fs/eventfd.c
index e265b6dd4f34..add6af91cacf 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -71,7 +71,8 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
 * it returns true, the eventfd_signal() call should be deferred to a
 * safe context.
 */
-   if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
+   if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count) >
+   EFD_WAKE_COUNT_MAX))
return 0;
 
	spin_lock_irqsave(&ctx->wqh.lock, flags);
diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
index fa0a524baed0..74be152ebe87 100644
--- a/include/linux/eventfd.h
+++ b/include/linux/eventfd.h
@@ -29,6 +29,9 @@
 #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
 #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
 
+/* This is the maximum recursion depth we have found so far */
+#define EFD_WAKE_COUNT_MAX 1
+
 struct eventfd_ctx;
 struct file;
 
@@ -47,7 +50,7 @@ DECLARE_PER_CPU(int, eventfd_wake_count);
 
 static inline bool eventfd_signal_count(void)
 {
-   return this_cpu_read(eventfd_wake_count);
+   return this_cpu_read(eventfd_wake_count) > EFD_WAKE_COUNT_MAX;
 }
 
 #else /* CONFIG_EVENTFD */
-- 
2.17.1
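
For context, here is a minimal sketch (not part of the patch) of how a
caller is meant to use eventfd_signal_count(): per the comment quoted
from eventfd_signal(), a true return means the signal should be deferred
to a safe context. The work item here is hypothetical:

	/* Hypothetical caller: defer signalling when nesting is too deep. */
	static void signal_or_defer(struct eventfd_ctx *ctx,
				    struct work_struct *deferred_work)
	{
		if (eventfd_signal_count())
			schedule_work(deferred_work);	/* signal later, from a safe context */
		else
			eventfd_signal(ctx, 1);
	}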



Re: [PATCH v8 03/10] eventfd: Increase the recursion depth of eventfd_signal()

2021-06-18 Thread He Zhe



On 6/18/21 11:29 AM, Yongji Xie wrote:
> On Thu, Jun 17, 2021 at 4:34 PM He Zhe  wrote:
>>
>>
>> On 6/15/21 10:13 PM, Xie Yongji wrote:
>>> Increase the recursion depth of eventfd_signal() to 1. This
>>> is the maximum recursion depth we have found so far, which
>>> can be triggered with the following call chain:
>>>
>>> kvm_io_bus_write[kvm]
>>>   --> ioeventfd_write   [kvm]
>>> --> eventfd_signal  [eventfd]
>>>   --> vhost_poll_wakeup [vhost]
>>> --> vduse_vdpa_kick_vq  [vduse]
>>>   --> eventfd_signal[eventfd]
>>>
>>> Signed-off-by: Xie Yongji 
>>> Acked-by: Jason Wang 
>> The fix had been posted one year ago.
>>
>> https://lore.kernel.org/lkml/20200410114720.24838-1-zhe...@windriver.com/
>>
> OK, so it seems to be a fix for the RT system if my understanding is
> correct? Any reason why it's not merged? I'm happy to rebase my series
> on your patch if you'd like to repost it.

It works for both the mainline and the RT kernel. The folks just reproduced
it in their RT environments.

This patch somehow hasn't received a maintainer's reply, so it's not merged yet.

And OK, I'll resend the patch.

>
> BTW, I also notice another thread for this issue:
>
> https://lore.kernel.org/linux-fsdevel/dm6pr11mb420291b550a10853403c7592ff...@dm6pr11mb4202.namprd11.prod.outlook.com/T/

This takes the same approach as my v1

https://lore.kernel.org/lkml/3b4aa4cb-0e76-89c2-c48a-cf24e1a36...@kernel.dk/

which was not what the maintainer wanted.


>
>>> ---
>>>  fs/eventfd.c| 2 +-
>>>  include/linux/eventfd.h | 5 -
>>>  2 files changed, 5 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/fs/eventfd.c b/fs/eventfd.c
>>> index e265b6dd4f34..cc7cd1dbedd3 100644
>>> --- a/fs/eventfd.c
>>> +++ b/fs/eventfd.c
>>> @@ -71,7 +71,7 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
>>>* it returns true, the eventfd_signal() call should be deferred to a
>>>* safe context.
>>>*/
>>> - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count)))
>>> + if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count) > EFD_WAKE_DEPTH))
>>>   return 0;
>>>
>>>   spin_lock_irqsave(&ctx->wqh.lock, flags);
>>> diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h
>>> index fa0a524baed0..886d99cd38ef 100644
>>> --- a/include/linux/eventfd.h
>>> +++ b/include/linux/eventfd.h
>>> @@ -29,6 +29,9 @@
>>>  #define EFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK)
>>>  #define EFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS | EFD_SEMAPHORE)
>>>
>>> +/* Maximum recursion depth */
>>> +#define EFD_WAKE_DEPTH 1
>>> +
>>>  struct eventfd_ctx;
>>>  struct file;
>>>
>>> @@ -47,7 +50,7 @@ DECLARE_PER_CPU(int, eventfd_wake_count);
>>>
>>>  static inline bool eventfd_signal_count(void)
>>>  {
>>> - return this_cpu_read(eventfd_wake_count);
>>> + return this_cpu_read(eventfd_wake_count) > EFD_WAKE_DEPTH;
>> A count is just a count. The decision of how deep is acceptable should be
>> made where eventfd_signal_count is called.
>>
> The return value of this function is boolean rather than integer.
> Please see the comments in eventfd_signal():
>
> "then it should check eventfd_signal_count() before calling this
> function. If it returns true, the eventfd_signal() call should be
> deferred to a safe context."

OK. Now that the maintainer has commented as such, we can use it accordingly,
though I still have the feeling that the function name and the type of the
return value don't match.


Thanks,
Zhe

>
> Thanks,
> Yongji



Re: [PATCH v6 1/2] x86/sev: Make sure IRQs are disabled while GHCB is active

2021-06-18 Thread Joerg Roedel
On Thu, Jun 17, 2021 at 05:00:48PM +0200, Peter Zijlstra wrote:
> I think this is broken, at this point RCU is quite dead on this CPU and
> local_irq_save/restore include tracing and all sorts.
> 
> Also, shouldn't IRQs already be disabled by the time we get here?

Yes, they are. I removed these calls, made __sev_get/put_ghcb() noinstr
instead of __always_inline, and put instrumentation_begin()/end() around
the panic() call in __sev_get_ghcb().
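
A rough sketch of the resulting shape (not the actual kernel code; the
condition and body are abbreviated and the variable names are hypothetical):

	static noinstr struct ghcb *__sev_get_ghcb(struct ghcb_state *state)
	{
		/* ... GHCB selection and bookkeeping elided ... */
		if (unlikely(ghcb_is_unrecoverable)) {	/* hypothetical condition */
			/* panic() is instrumentable; a noinstr function must
			 * bracket the call explicitly. */
			instrumentation_begin();
			panic("Unable to handle #VC exception!\n");
			instrumentation_end();
		}
		/* ... */
		return state->ghcb;	/* hypothetical */
	}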

Regards,

Joerg


Re: [PATCH v6 1/2] x86/sev: Make sure IRQs are disabled while GHCB is active

2021-06-18 Thread Joerg Roedel
On Thu, Jun 17, 2021 at 05:38:46PM +0200, Peter Zijlstra wrote:
> I'm getting (with all of v6.1 applied):
> 
> vmlinux.o: warning: objtool: __sev_es_nmi_complete()+0x1bf: call to panic() leaves .noinstr.text section
> 
> $ ./scripts/faddr2line defconfig-build/vmlinux __sev_es_nmi_complete+0x1bf
> __sev_es_nmi_complete+0x1bf/0x1d0:
> __sev_get_ghcb at arch/x86/kernel/sev.c:223
> (inlined by) __sev_es_nmi_complete at arch/x86/kernel/sev.c:519

I see where this is coming from, but I can't reproduce the same warning. I
did run 'objtool check -n vmlinux'. Is there more to do to get the full
check?

Regards,

Joerg


Re: [PATCH v4 3/6] ACPI: Add driver for the VIOT table

2021-06-18 Thread Jean-Philippe Brucker
On Thu, Jun 17, 2021 at 01:50:59PM +0200, Rafael J. Wysocki wrote:
> > diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c
> > index be7da23fad76..b835ca702ff0 100644
> > --- a/drivers/acpi/bus.c
> > +++ b/drivers/acpi/bus.c
> > @@ -27,6 +27,7 @@
> >  #include 
> >  #endif
> >  #include 
> > +#include 
> >  #include 
> >  #include 
> >  #include 
> > @@ -1339,6 +1340,7 @@ static int __init acpi_init(void)
> > pci_mmcfg_late_init();
> > acpi_iort_init();
> > acpi_scan_init();
> > +   acpi_viot_init();
> 
> Is there a specific reason why to call it right here?
> 
> In particular, does it need to be called after acpi_scan_init()?  And
> does it need to be called before the subsequent functions?  If so,
> then why?

It does need to be called after acpi_scan_init(), because it relies on
struct device and their fwnode to be initialized. In particular to find a
PCI device we call pci_get_domain_bus_and_slot(), which needs the PCI
topology made available by acpi_scan_init().
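
For reference, a sketch of that dependency (pci_get_domain_bus_and_slot()
is the helper named above; segment, bus and devfn are placeholders for
values decoded from the VIOT node):

	struct pci_dev *pdev;

	/* Requires the PCI topology enumerated by acpi_scan_init(). */
	pdev = pci_get_domain_bus_and_slot(segment, bus, devfn);
	if (pdev) {
		/* ... configure IOMMU translation for the endpoint ... */
		pci_dev_put(pdev);	/* drop the reference taken by the lookup */
	}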

It does not need to be called before the subsequent functions, however; I can
move it to the end.

> > +void __init acpi_viot_init(void)
> > +{
> > +   int i;
> > +   acpi_status status;
> > +   struct acpi_table_header *hdr;
> > +   struct acpi_viot_header *node;
> > +
> > +   status = acpi_get_table(ACPI_SIG_VIOT, 0, &hdr);
> > +   if (ACPI_FAILURE(status)) {
> > +   if (status != AE_NOT_FOUND) {
> > +   const char *msg = acpi_format_exception(status);
> > +
> > +   pr_err("Failed to get table, %s\n", msg);
> > +   }
> > +   return;
> > +   }
> > +
> > +   viot = (void *)hdr;
> > +
> > +   node = ACPI_ADD_PTR(struct acpi_viot_header, viot,
> > +   viot->node_offset);
> > +   for (i = 0; i < viot->node_count; i++) {
> > +   if (viot_parse_node(node))
> > +   return;
> > +
> > +   node = ACPI_ADD_PTR(struct acpi_viot_header, node,
> > +   node->length);
> > +   }
> 
> Do you still need the table after the above is complete?  If not,
> release the reference on it acquired above.

We don't need the table anymore; I'll drop the reference.
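
Concretely, a sketch of the cleanup (acpi_put_table() is the standard
counterpart of acpi_get_table()):

	/* After the parsing loop: release the table reference. */
	acpi_put_table(hdr);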

Thanks,
Jean



Re: [PATCH v4 3/6] ACPI: Add driver for the VIOT table

2021-06-18 Thread Jean-Philippe Brucker
On Wed, Jun 16, 2021 at 03:26:08PM +0200, Eric Auger wrote:
> > +   default:
> > +   pr_warn("Unsupported node %x\n", hdr->type);
> > +   ret = 0;
> > +   goto err_free;
> > +   }
> > +
> > +   /*
> > +* To be compatible with future versions of the table which may include
> > +* other node types, keep parsing.
> > +*/
> nit: doesn't this comment rather apply to the default clause in the
> switch?

Yes, the comment doesn't accurately explain the code below; I'll tweak it.

/*
 * A future version of the table may use the node for other purposes.
 * Keep parsing.
 */

> In case the PCI range node or the single MMIO endpoint node does
> not refer to any translation element, isn't it simply an error case?

It is permissible in my opinion. If a future version of the spec appends
new fields to the MMIO endpoint describing some PV property (I can't think
of a useful example), then the table can contain the vIOMMU topology as
usual plus one MMIO node that's only here to describe that property, and
doesn't have a translation element. If we encounter that, I think we should
keep parsing.

> > +   if (!ep->viommu) {
> > +   pr_warn("No IOMMU node found\n");
> > +   ret = 0;
> > +   goto err_free;
> > +   }

> Besides
> Reviewed-by: Eric Auger 

Thanks!
Jean


Re: [PATCH v4 2/6] ACPI: Move IOMMU setup code out of IORT

2021-06-18 Thread Jean-Philippe Brucker
Hi Eric,

On Wed, Jun 16, 2021 at 11:35:13AM +0200, Eric Auger wrote:
> > -const struct iommu_ops *iort_iommu_configure_id(struct device *dev,
> > -   const u32 *id_in)
> > +int iort_iommu_configure_id(struct device *dev, const u32 *id_in)
> >  {
> > struct acpi_iort_node *node;
> > -   const struct iommu_ops *ops;
> > +   const struct iommu_ops *ops = NULL;

Oops, I need to remove this (and add -Werror to my tests).


> > +static const struct iommu_ops *acpi_iommu_configure_id(struct device *dev,
> > +  const u32 *id_in)
> > +{
> > +   int err;
> > +   const struct iommu_ops *ops;
> > +
> > +   /*
> > +* If we already translated the fwspec there is nothing left to do,
> > +* return the iommu_ops.
> > +*/
> > +   ops = acpi_iommu_fwspec_ops(dev);
> > +   if (ops)
> > +   return ops;
> > +
> > +   err = iort_iommu_configure_id(dev, id_in);
> > +
> > +   /*
> > +* If we have reason to believe the IOMMU driver missed the initial
> > +* add_device callback for dev, replay it to get things in order.
> > +*/
> > +   if (!err && dev->bus && !device_iommu_mapped(dev))
> > +   err = iommu_probe_device(dev);
> Previously we had:
>     if (!err) {
>         ops = iort_fwspec_iommu_ops(dev);
>         err = iort_add_device_replay(dev);
>     }
> 
> Please can you explain the transform? I see the
> acpi_iommu_fwspec_ops call below, but it is not straightforward to me.

I figured that iort_add_device_replay() is only used once and is
sufficiently simple to be inlined manually (saving 10 lines). Then I
replaced the ops assignment with returns, which saves another line and may
be slightly clearer? I guess it's mostly a matter of taste; the behavior
should be exactly the same.

> Also the comment mentions replay. Unsure if it is still OK.

The "replay" part is, but "add_device" isn't accurate because it has since
been replaced by probe_device. I'll refresh the comment.

Thanks,
Jean

Re: [PATCH] vhost-vdpa: fix bug-"v->vqs" and "v" don't free

2021-06-18 Thread Jason Wang


On 2021/6/18 at 2:53 PM, Cai Huoqing wrote:

"v->vqs" and "v" don't free when "cdev_device_add" returns error

Signed-off-by: Cai Huoqing 
---
  drivers/vhost/vdpa.c | 2 ++
  1 file changed, 2 insertions(+)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index fb41db3da611..6e5d5df5ee70 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -1065,6 +1065,8 @@ static int vhost_vdpa_probe(struct vdpa_device *vdpa)
  
  err:

 put_device(&v->dev);
+   kfree(v->vqs);
+   kfree(v);



Isn't this the responsibility of vhost_vdpa_release_dev()?

Thanks



 return r;
  }
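
For reference, a sketch of the release path being referred to, assuming
vhost_vdpa_release_dev() is the struct device release callback and looks
roughly like this (an illustration, not the exact driver code):

	static void vhost_vdpa_release_dev(struct device *device)
	{
		struct vhost_vdpa *v =
		       container_of(device, struct vhost_vdpa, dev);

		/* put_device() in the error path ends up here, so also
		 * calling kfree() directly there would be a double free. */
		kfree(v->vqs);
		kfree(v);
	}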
  

