[PATCH 6/6] dma-mapping: remove dma_virt_ops

2020-11-04 Thread Christoph Hellwig
Now that the RDMA core deals with devices that only do DMA mapping in
lower layers properly, there is no user for dma_virt_ops and it can
be removed.

Signed-off-by: Christoph Hellwig 
---
 include/linux/dma-mapping.h |  2 --
 kernel/dma/Kconfig  |  5 ---
 kernel/dma/Makefile |  1 -
 kernel/dma/virt.c   | 61 -
 4 files changed, 69 deletions(-)
 delete mode 100644 kernel/dma/virt.c

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 956151052d4542..2aaed35b556df4 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -565,6 +565,4 @@ static inline int dma_mmap_wc(struct device *dev,
 int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
dma_addr_t dma_start, u64 size);
 
-extern const struct dma_map_ops dma_virt_ops;
-
 #endif /* _LINUX_DMA_MAPPING_H */
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index c99de4a2145889..fd2db2665fc691 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -75,11 +75,6 @@ config ARCH_HAS_DMA_PREP_COHERENT
 config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
 
-config DMA_VIRT_OPS
-   bool
-   depends on HAS_DMA
-   select DMA_OPS
-
 config SWIOTLB
bool
select NEED_DMA_MAP_STATE
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index dc755ab68aabf9..cd1d86358a7a62 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -5,7 +5,6 @@ obj-$(CONFIG_DMA_OPS)   += ops_helpers.o
 obj-$(CONFIG_DMA_OPS)  += dummy.o
 obj-$(CONFIG_DMA_CMA)  += contiguous.o
 obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
-obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
 obj-$(CONFIG_DMA_API_DEBUG)+= debug.o
 obj-$(CONFIG_SWIOTLB)  += swiotlb.o
 obj-$(CONFIG_DMA_COHERENT_POOL)+= pool.o
diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c
deleted file mode 100644
index 59d32317dd574a..00
--- a/kernel/dma/virt.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DMA operations that map to virtual addresses without flushing memory.
- */
-#include 
-#include 
-#include 
-#include 
-
-static void *dma_virt_alloc(struct device *dev, size_t size,
-   dma_addr_t *dma_handle, gfp_t gfp,
-   unsigned long attrs)
-{
-   void *ret;
-
-   ret = (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size));
-   if (ret)
-   *dma_handle = (uintptr_t)ret;
-   return ret;
-}
-
-static void dma_virt_free(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_addr,
- unsigned long attrs)
-{
-   free_pages((unsigned long)cpu_addr, get_order(size));
-}
-
-static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page,
-   unsigned long offset, size_t size,
-   enum dma_data_direction dir,
-   unsigned long attrs)
-{
-   return (uintptr_t)(page_address(page) + offset);
-}
-
-static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl,
-  int nents, enum dma_data_direction dir,
-  unsigned long attrs)
-{
-   int i;
-   struct scatterlist *sg;
-
-   for_each_sg(sgl, sg, nents, i) {
-   BUG_ON(!sg_page(sg));
-   sg_dma_address(sg) = (uintptr_t)sg_virt(sg);
-   sg_dma_len(sg) = sg->length;
-   }
-
-   return nents;
-}
-
-const struct dma_map_ops dma_virt_ops = {
-   .alloc  = dma_virt_alloc,
-   .free   = dma_virt_free,
-   .map_page   = dma_virt_map_page,
-   .map_sg = dma_virt_map_sg,
-   .alloc_pages= dma_common_alloc_pages,
-   .free_pages = dma_common_free_pages,
-};
-EXPORT_SYMBOL(dma_virt_ops);
-- 
2.28.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 5/6] PCI/P2PDMA: Cleanup __pci_p2pdma_map_sg a bit

2020-11-04 Thread Christoph Hellwig
Remove the pointless paddr variable that was only used once.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Logan Gunthorpe 
Acked-by: Bjorn Helgaas 
---
 drivers/pci/p2pdma.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index b07018af53876c..afd792cc272832 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -825,13 +825,10 @@ static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap 
*p2p_pgmap,
struct device *dev, struct scatterlist *sg, int nents)
 {
struct scatterlist *s;
-   phys_addr_t paddr;
int i;
 
for_each_sg(sg, s, nents, i) {
-   paddr = sg_phys(s);
-
-   s->dma_address = paddr - p2p_pgmap->bus_offset;
+   s->dma_address = sg_phys(s) - p2p_pgmap->bus_offset;
sg_dma_len(s) = s->length;
}
 
-- 
2.28.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 4/6] PCI/P2PDMA: Remove the DMA_VIRT_OPS hacks

2020-11-04 Thread Christoph Hellwig
Now that all users of dma_virt_ops are gone we can remove the workaround
for it in the PCI peer to peer code.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Logan Gunthorpe 
Acked-by: Bjorn Helgaas 
---
 drivers/pci/p2pdma.c | 20 
 1 file changed, 20 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index de1c331dbed43f..b07018af53876c 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -556,15 +556,6 @@ int pci_p2pdma_distance_many(struct pci_dev *provider, 
struct device **clients,
return -1;
 
for (i = 0; i < num_clients; i++) {
-#ifdef CONFIG_DMA_VIRT_OPS
-   if (clients[i]->dma_ops == _virt_ops) {
-   if (verbose)
-   dev_warn(clients[i],
-"cannot be used for peer-to-peer DMA 
because the driver makes use of dma_virt_ops\n");
-   return -1;
-   }
-#endif
-
pci_client = find_parent_pci_dev(clients[i]);
if (!pci_client) {
if (verbose)
@@ -837,17 +828,6 @@ static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap 
*p2p_pgmap,
phys_addr_t paddr;
int i;
 
-   /*
-* p2pdma mappings are not compatible with devices that use
-* dma_virt_ops. If the upper layers do the right thing
-* this should never happen because it will be prevented
-* by the check in pci_p2pdma_distance_many()
-*/
-#ifdef CONFIG_DMA_VIRT_OPS
-   if (WARN_ON_ONCE(dev->dma_ops == _virt_ops))
-   return 0;
-#endif
-
for_each_sg(sg, s, nents, i) {
paddr = sg_phys(s);
 
-- 
2.28.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 3/6] RDMA/core: remove use of dma_virt_ops

2020-11-04 Thread Christoph Hellwig
Use the ib_dma_* helpers to skip the DMA translation instead.  This
removes the last user if dma_virt_ops and keeps the weird layering
violation inside the RDMA core instead of burderning the DMA mapping
subsystems with it.  This also means the software RDMA drivers now
don't have to mess with DMA parameters that are not relevant to them
at all, and that in the future we can use PCI P2P transfers even for
software RDMA, as there is no first fake layer of DMA mapping that
the P2P DMA support.

Signed-off-by: Christoph Hellwig 
---
 drivers/infiniband/core/device.c  | 41 ++-
 drivers/infiniband/core/rw.c  |  2 +-
 drivers/infiniband/sw/rdmavt/Kconfig  |  1 -
 drivers/infiniband/sw/rdmavt/mr.c |  6 +--
 drivers/infiniband/sw/rdmavt/vt.c |  8 
 drivers/infiniband/sw/rxe/Kconfig |  1 -
 drivers/infiniband/sw/rxe/rxe_verbs.c |  7 
 drivers/infiniband/sw/rxe/rxe_verbs.h |  1 -
 drivers/infiniband/sw/siw/Kconfig |  1 -
 drivers/infiniband/sw/siw/siw.h   |  1 -
 drivers/infiniband/sw/siw/siw_main.c  |  7 
 include/rdma/ib_verbs.h   | 59 ++-
 12 files changed, 64 insertions(+), 71 deletions(-)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a3b1fc84cdcab9..528de41487521b 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1177,25 +1177,6 @@ static int assign_name(struct ib_device *device, const 
char *name)
return ret;
 }
 
-static void setup_dma_device(struct ib_device *device,
-struct device *dma_device)
-{
-   /*
-* If the caller does not provide a DMA capable device then the IB
-* device will be used. In this case the caller should fully setup the
-* ibdev for DMA. This usually means using dma_virt_ops.
-*/
-#ifdef CONFIG_DMA_VIRT_OPS
-   if (!dma_device) {
-   device->dev.dma_ops = _virt_ops;
-   dma_device = >dev;
-   }
-#endif
-   WARN_ON(!dma_device);
-   device->dma_device = dma_device;
-   WARN_ON(!device->dma_device->dma_parms);
-}
-
 /*
  * setup_device() allocates memory and sets up data that requires calling the
  * device ops, this is the only reason these actions are not done during
@@ -1341,7 +1322,14 @@ int ib_register_device(struct ib_device *device, const 
char *name,
if (ret)
return ret;
 
-   setup_dma_device(device, dma_device);
+   /*
+* If the caller does not provide a DMA capable device then the IB core
+* will set up ib_sge and scatterlist structures that stash the kernel
+* virtual address into the address field.
+*/
+   device->dma_device = dma_device;
+   WARN_ON(dma_device && !dma_device->dma_parms);
+
ret = setup_device(device);
if (ret)
return ret;
@@ -2675,6 +2663,19 @@ void ib_set_device_ops(struct ib_device *dev, const 
struct ib_device_ops *ops)
 }
 EXPORT_SYMBOL(ib_set_device_ops);
 
+int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int 
nents)
+{
+   struct scatterlist *s;
+   int i;
+
+   for_each_sg(sg, s, nents, i) {
+   sg_dma_address(s) = (uintptr_t)sg_virt(s);
+   sg_dma_len(s) = s->length;
+   }
+   return nents;
+}
+EXPORT_SYMBOL(ib_dma_virt_map_sg);
+
 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
[RDMA_NL_LS_OP_RESOLVE] = {
.doit = ib_nl_handle_resolve_resp,
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 13f43ab7220b05..ae5a629ecc6772 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -276,7 +276,7 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, 
struct ib_qp *qp,
 static void rdma_rw_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
 u32 sg_cnt, enum dma_data_direction dir)
 {
-   if (is_pci_p2pdma_page(sg_page(sg)))
+   if (dev->dma_device && is_pci_p2pdma_page(sg_page(sg)))
pci_p2pdma_unmap_sg(dev->dma_device, sg, sg_cnt, dir);
else
ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
diff --git a/drivers/infiniband/sw/rdmavt/Kconfig 
b/drivers/infiniband/sw/rdmavt/Kconfig
index c8e268082952b0..0df48b3a6b56c5 100644
--- a/drivers/infiniband/sw/rdmavt/Kconfig
+++ b/drivers/infiniband/sw/rdmavt/Kconfig
@@ -4,6 +4,5 @@ config INFINIBAND_RDMAVT
depends on INFINIBAND_VIRT_DMA
depends on X86_64
depends on PCI
-   select DMA_VIRT_OPS
help
This is a common software verbs provider for RDMA networks.
diff --git a/drivers/infiniband/sw/rdmavt/mr.c 
b/drivers/infiniband/sw/rdmavt/mr.c
index 8490fdb9c91e50..90fc234f489acd 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ b/drivers/infiniband/sw/rdmavt/mr.c
@@ -324,8 +324,6 @@ static void __rvt_free_mr(struct rvt_mr *mr)
  * @acc: access flags
 

[PATCH 2/6] RDMA/core: remove ib_dma_{alloc,free}_coherent

2020-11-04 Thread Christoph Hellwig
These two functions are entirely unused.

Signed-off-by: Christoph Hellwig 
---
 include/rdma/ib_verbs.h | 29 -
 1 file changed, 29 deletions(-)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9bf6c319a670e2..5f8fd7976034e0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4098,35 +4098,6 @@ static inline void ib_dma_sync_single_for_device(struct 
ib_device *dev,
dma_sync_single_for_device(dev->dma_device, addr, size, dir);
 }
 
-/**
- * ib_dma_alloc_coherent - Allocate memory and map it for DMA
- * @dev: The device for which the DMA address is requested
- * @size: The size of the region to allocate in bytes
- * @dma_handle: A pointer for returning the DMA address of the region
- * @flag: memory allocator flags
- */
-static inline void *ib_dma_alloc_coherent(struct ib_device *dev,
-  size_t size,
-  dma_addr_t *dma_handle,
-  gfp_t flag)
-{
-   return dma_alloc_coherent(dev->dma_device, size, dma_handle, flag);
-}
-
-/**
- * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent()
- * @dev: The device for which the DMA addresses were allocated
- * @size: The size of the region
- * @cpu_addr: the address returned by ib_dma_alloc_coherent()
- * @dma_handle: the DMA address returned by ib_dma_alloc_coherent()
- */
-static inline void ib_dma_free_coherent(struct ib_device *dev,
-   size_t size, void *cpu_addr,
-   dma_addr_t dma_handle)
-{
-   dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
-}
-
 /* ib_reg_user_mr - register a memory region for virtual addresses from kernel
  * space. This function should be called when 'current' is the owning MM.
  */
-- 
2.28.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 1/6] RMDA/sw: don't allow drivers using dma_virt_ops on highmem configs

2020-11-04 Thread Christoph Hellwig
dma_virt_ops requires that all pages have a kernel virtual address.
Introduce a INFINIBAND_VIRT_DMA Kconfig symbol that depends on !HIGHMEM
and a large enough dma_addr_t, and make all three driver depend on the
new symbol.

Signed-off-by: Christoph Hellwig 
---
 drivers/infiniband/Kconfig   | 6 ++
 drivers/infiniband/sw/rdmavt/Kconfig | 3 ++-
 drivers/infiniband/sw/rxe/Kconfig| 2 +-
 drivers/infiniband/sw/siw/Kconfig| 1 +
 4 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 32a51432ec4f73..81acaf5fb5be67 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -73,6 +73,12 @@ config INFINIBAND_ADDR_TRANS_CONFIGFS
  This allows the user to config the default GID type that the CM
  uses for each device, when initiaing new connections.
 
+config INFINIBAND_VIRT_DMA
+   bool
+   default y
+   depends on !HIGHMEM
+   depends on !64BIT || ARCH_DMA_ADDR_T_64BIT
+
 if INFINIBAND_USER_ACCESS || !INFINIBAND_USER_ACCESS
 source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
diff --git a/drivers/infiniband/sw/rdmavt/Kconfig 
b/drivers/infiniband/sw/rdmavt/Kconfig
index 9ef5f5ce1ff6b0..c8e268082952b0 100644
--- a/drivers/infiniband/sw/rdmavt/Kconfig
+++ b/drivers/infiniband/sw/rdmavt/Kconfig
@@ -1,7 +1,8 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_RDMAVT
tristate "RDMA verbs transport library"
-   depends on X86_64 && ARCH_DMA_ADDR_T_64BIT
+   depends on INFINIBAND_VIRT_DMA
+   depends on X86_64
depends on PCI
select DMA_VIRT_OPS
help
diff --git a/drivers/infiniband/sw/rxe/Kconfig 
b/drivers/infiniband/sw/rxe/Kconfig
index a0c6c7dfc1814f..8810bfa680495a 100644
--- a/drivers/infiniband/sw/rxe/Kconfig
+++ b/drivers/infiniband/sw/rxe/Kconfig
@@ -2,7 +2,7 @@
 config RDMA_RXE
tristate "Software RDMA over Ethernet (RoCE) driver"
depends on INET && PCI && INFINIBAND
-   depends on !64BIT || ARCH_DMA_ADDR_T_64BIT
+   depends on INFINIBAND_VIRT_DMA
select NET_UDP_TUNNEL
select CRYPTO_CRC32
select DMA_VIRT_OPS
diff --git a/drivers/infiniband/sw/siw/Kconfig 
b/drivers/infiniband/sw/siw/Kconfig
index b622fc62f2cd6d..3450ba5081df51 100644
--- a/drivers/infiniband/sw/siw/Kconfig
+++ b/drivers/infiniband/sw/siw/Kconfig
@@ -1,6 +1,7 @@
 config RDMA_SIW
tristate "Software RDMA over TCP/IP (iWARP) driver"
depends on INET && INFINIBAND && LIBCRC32C
+   depends on INFINIBAND_VIRT_DMA
select DMA_VIRT_OPS
help
This driver implements the iWARP RDMA transport over
-- 
2.28.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


remove dma_virt_ops v2

2020-11-04 Thread Christoph Hellwig
Hi Jason,

this series switches the RDMA core to opencode the special case of
devices bypassing the DMA mapping in the RDMA ULPs.  The virt ops
have caused a bit of trouble due to the P2P code node working with
them due to the fact that we'd do two dma mapping iterations for a
single I/O, but also are a bit of layering violation and lead to
more code than necessary.

Tested with nvme-rdma over rxe.

Changes since v1:
 - disable software RDMA drivers for highmem configs
 - update the PCI commit logs
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] iommu/vt-d: remove redundant variable no_platform_optin

2020-11-04 Thread Zhenzhong Duan
Hi Baolu,

On Thu, Nov 5, 2020 at 9:47 AM Lu Baolu  wrote:
>
> Hi Zhenzhong,
>
> On 11/4/20 4:19 PM, Zhenzhong Duan wrote:
> > no_platform_optin is redundant with dmar_disabled and it's only used in
> > platform_optin_force_iommu(), remove it and use dmar_disabled instead.
>
> It's actually not.
>
> If CONFIG_INTEL_IOMMU_DEFAULT_ON is not set, we will get "dmar_disable =
> 1" and "no_platform_optin = 0". In this case, we must force the iommu on
> and set dmar_disable = 0.
>
> The real use case: if a kernel built with [CONFIG_INTEL_IOMMU_DEFAULT_ON
> = n] running on a platform with thunderbolt ports, we must force IOMMU
> on so that the system could be protected from possible malicious
> peripherals.

Thanks for your explanation, clear now and sorry for the noise.

Regards
Zhenzhong
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH] iommu/vt-d: remove redundant variable no_platform_optin

2020-11-04 Thread Lu Baolu

Hi Zhenzhong,

On 11/4/20 4:19 PM, Zhenzhong Duan wrote:

no_platform_optin is redundant with dmar_disabled and it's only used in
platform_optin_force_iommu(), remove it and use dmar_disabled instead.


It's actually not.

If CONFIG_INTEL_IOMMU_DEFAULT_ON is not set, we will get "dmar_disable =
1" and "no_platform_optin = 0". In this case, we must force the iommu on
and set dmar_disable = 0.

The real use case: if a kernel built with [CONFIG_INTEL_IOMMU_DEFAULT_ON
= n] running on a platform with thunderbolt ports, we must force IOMMU
on so that the system could be protected from possible malicious
peripherals.

Best regards,
baolu




Meanwhile remove all the dead code in platform_optin_force_iommu().

Signed-off-by: Zhenzhong Duan 
---
  drivers/iommu/intel/iommu.c | 14 ++
  1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 8651f6d4dfa0..a011d1ed63ef 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -180,7 +180,6 @@ static int rwbf_quirk;
   */
  static int force_on = 0;
  int intel_iommu_tboot_noforce;
-static int no_platform_optin;
  
  #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
  
@@ -440,7 +439,6 @@ static int __init intel_iommu_setup(char *str)

pr_info("IOMMU enabled\n");
} else if (!strncmp(str, "off", 3)) {
dmar_disabled = 1;
-   no_platform_optin = 1;
pr_info("IOMMU disabled\n");
} else if (!strncmp(str, "igfx_off", 8)) {
dmar_map_gfx = 0;
@@ -4810,20 +4808,12 @@ static inline bool has_external_pci(void)
  
  static int __init platform_optin_force_iommu(void)

  {
-   if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
+   if (!dmar_platform_optin() || dmar_disabled || !has_external_pci())
return 0;
  
-	if (no_iommu || dmar_disabled)

+   if (no_iommu)
pr_info("Intel-IOMMU force enabled due to platform opt in\n");
  
-	/*

-* If Intel-IOMMU is disabled by default, we will apply identity
-* map for all devices except those marked as being untrusted.
-*/
-   if (dmar_disabled)
-   iommu_set_default_passthrough(false);
-
-   dmar_disabled = 0;
no_iommu = 0;
  
  	return 1;



___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v3] swiotlb: Adjust SWIOTBL bounce buffer size for SEV guests.

2020-11-04 Thread Ashish Kalra
Hello Konrad,

On Wed, Nov 04, 2020 at 05:14:52PM -0500, Konrad Rzeszutek Wilk wrote:
> On Wed, Nov 04, 2020 at 10:08:04PM +, Ashish Kalra wrote:
> > From: Ashish Kalra 
> > 
> > For SEV, all DMA to and from guest has to use shared
> > (un-encrypted) pages. SEV uses SWIOTLB to make this
> > happen without requiring changes to device drivers.
> > However, depending on workload being run, the default
> > 64MB of SWIOTLB might not be enough and SWIOTLB
> > may run out of buffers to use for DMA, resulting
> > in I/O errors and/or performance degradation for
> > high I/O workloads.
> > 
> > Increase the default size of SWIOTLB for SEV guests
> > using a minimum value of 128MB and a maximum value
> 
> 
> 
> 64MB for a 1GB VM is not enough?
> 
> > of 512MB, determining on amount of provisioned guest
> 
> I like the implementation on how this is done.. but
> the choices of memory and how much seems very much
> random. Could there be some math behind this?
>

Earlier the patch was based on using a % of guest memory, as below:

+#define SEV_ADJUST_SWIOTLB_SIZE_PERCENT5
+#define SEV_ADJUST_SWIOTLB_SIZE_MAX(1UL << 30)
...
...
+   if (sev_active() && !io_tlb_nslabs) {
+   unsigned long total_mem = get_num_physpages() << PAGE_SHIFT;
+
+   default_size = total_mem *
+   SEV_ADJUST_SWIOTLB_SIZE_PERCENT / 100;
+
+   default_size = ALIGN(default_size, 1 << IO_TLB_SHIFT);
+
+   default_size = clamp_val(default_size, IO_TLB_DEFAULT_SIZE,
+   SEV_ADJUST_SWIOTLB_SIZE_MAX);
+   }

But, then it is difficult to predict what % of guest memory to use ?

Then there are other factors to consider, such as vcpu_count or if there
is going to be high I/O workload, etc.

But that all makes it very complicated, what we basically want is a
range from 128M to 512M and that's why the current patch which picks up
this range from the amount of allocated guest memory keeps it simple. 

Thanks,
Ashish

> > memory.
> > 
> > Using late_initcall() interface to invoke
> > swiotlb_adjust() does not work as the size
> > adjustment needs to be done before mem_encrypt_init()
> > and reserve_crashkernel() which use the allocated
> > SWIOTLB buffer size, hence calling it explicitly
> > from setup_arch().
> > 
> > The SWIOTLB default size adjustment is added as an
> > architecture specific interface/callback to allow
> > architectures such as those supporting memory
> > encryption to adjust/expand SWIOTLB size for their
> > use.
> > 
> > Signed-off-by: Ashish Kalra 
> > ---
> >  arch/x86/kernel/setup.c   |  2 ++
> >  arch/x86/mm/mem_encrypt.c | 42 +++
> >  include/linux/swiotlb.h   |  1 +
> >  kernel/dma/swiotlb.c  | 27 +
> >  4 files changed, 72 insertions(+)
> > 
> > diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> > index 3511736fbc74..b073d58dd4a3 100644
> > --- a/arch/x86/kernel/setup.c
> > +++ b/arch/x86/kernel/setup.c
> > @@ -1166,6 +1166,8 @@ void __init setup_arch(char **cmdline_p)
> > if (boot_cpu_has(X86_FEATURE_GBPAGES))
> > hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
> >  
> > +   swiotlb_adjust();
> > +
> > /*
> >  * Reserve memory for crash kernel after SRAT is parsed so that it
> >  * won't consume hotpluggable memory.
> > diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> > index 3f248f0d0e07..e0deb157cddd 100644
> > --- a/arch/x86/mm/mem_encrypt.c
> > +++ b/arch/x86/mm/mem_encrypt.c
> > @@ -489,7 +489,49 @@ static void print_mem_encrypt_feature_info(void)
> > pr_cont("\n");
> >  }
> >  
> > +#define TOTAL_MEM_1G   0x4000UL
> > +#define TOTAL_MEM_4G   0x1UL
> > +
> > +#define SIZE_128M (128UL<<20)
> > +#define SIZE_256M (256UL<<20)
> > +#define SIZE_512M (512UL<<20)
> > +
> >  /* Architecture __weak replacement functions */
> > +unsigned long __init arch_swiotlb_adjust(unsigned long iotlb_default_size)
> > +{
> > +   unsigned long size = 0;
> > +
> > +   /*
> > +* For SEV, all DMA has to occur via shared/unencrypted pages.
> > +* SEV uses SWOTLB to make this happen without changing device
> > +* drivers. However, depending on the workload being run, the
> > +* default 64MB of SWIOTLB may not be enough & SWIOTLB may
> > +* run out of buffers for DMA, resulting in I/O errors and/or
> > +* performance degradation especially with high I/O workloads.
> > +* Increase the default size of SWIOTLB for SEV guests using
> > +* a minimum value of 128MB and a maximum value of 512MB,
> > +* depending on amount of provisioned guest memory.
> > +*/
> > +   if (sev_active()) {
> > +   phys_addr_t total_mem = memblock_phys_mem_size();
> > +
> > +   if (total_mem <= TOTAL_MEM_1G)
> > +   size = clamp(iotlb_default_size * 2, SIZE_128M,
> > +SIZE_128M);
> > +   else if (total_mem <= 

Re: [PATCH v3] swiotlb: Adjust SWIOTBL bounce buffer size for SEV guests.

2020-11-04 Thread Andy Shevchenko
On Thursday, November 5, 2020, Ashish Kalra  wrote:

> From: Ashish Kalra 
>
> For SEV, all DMA to and from guest has to use shared
> (un-encrypted) pages. SEV uses SWIOTLB to make this
> happen without requiring changes to device drivers.
> However, depending on workload being run, the default
> 64MB of SWIOTLB might not be enough and SWIOTLB
> may run out of buffers to use for DMA, resulting
> in I/O errors and/or performance degradation for
> high I/O workloads.
>
> Increase the default size of SWIOTLB for SEV guests
> using a minimum value of 128MB and a maximum value
> of 512MB, determining on amount of provisioned guest
> memory.
>
> Using late_initcall() interface to invoke
> swiotlb_adjust() does not work as the size
> adjustment needs to be done before mem_encrypt_init()
> and reserve_crashkernel() which use the allocated
> SWIOTLB buffer size, hence calling it explicitly
> from setup_arch().
>
> The SWIOTLB default size adjustment is added as an
> architecture specific interface/callback to allow
> architectures such as those supporting memory
> encryption to adjust/expand SWIOTLB size for their
> use.
>
> Signed-off-by: Ashish Kalra 
> ---
>  arch/x86/kernel/setup.c   |  2 ++
>  arch/x86/mm/mem_encrypt.c | 42 +++
>  include/linux/swiotlb.h   |  1 +
>  kernel/dma/swiotlb.c  | 27 +
>  4 files changed, 72 insertions(+)
>
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 3511736fbc74..b073d58dd4a3 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -1166,6 +1166,8 @@ void __init setup_arch(char **cmdline_p)
> if (boot_cpu_has(X86_FEATURE_GBPAGES))
> hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
>
> +   swiotlb_adjust();
> +
> /*
>  * Reserve memory for crash kernel after SRAT is parsed so that it
>  * won't consume hotpluggable memory.
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index 3f248f0d0e07..e0deb157cddd 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -489,7 +489,49 @@ static void print_mem_encrypt_feature_info(void)
> pr_cont("\n");
>  }
>
> +#define TOTAL_MEM_1G   0x4000UL
> +#define TOTAL_MEM_4G   0x1UL
> +
> +#define SIZE_128M (128UL<<20)
> +#define SIZE_256M (256UL<<20)
> +#define SIZE_512M (512UL<<20)



We have these constants defined in sizes.h.


> +
>  /* Architecture __weak replacement functions */
> +unsigned long __init arch_swiotlb_adjust(unsigned long iotlb_default_size)
> +{
> +   unsigned long size = 0;
> +
> +   /*
> +* For SEV, all DMA has to occur via shared/unencrypted pages.
> +* SEV uses SWOTLB to make this happen without changing device
> +* drivers. However, depending on the workload being run, the
> +* default 64MB of SWIOTLB may not be enough & SWIOTLB may
> +* run out of buffers for DMA, resulting in I/O errors and/or
> +* performance degradation especially with high I/O workloads.
> +* Increase the default size of SWIOTLB for SEV guests using
> +* a minimum value of 128MB and a maximum value of 512MB,
> +* depending on amount of provisioned guest memory.
> +*/
> +   if (sev_active()) {
> +   phys_addr_t total_mem = memblock_phys_mem_size();
> +
> +   if (total_mem <= TOTAL_MEM_1G)
> +   size = clamp(iotlb_default_size * 2, SIZE_128M,
> +SIZE_128M);
> +   else if (total_mem <= TOTAL_MEM_4G)
> +   size = clamp(iotlb_default_size * 4, SIZE_256M,
> +SIZE_256M);
> +   else
> +   size = clamp(iotlb_default_size * 8, SIZE_512M,
> +SIZE_512M);
> +
> +   pr_info("SEV adjusted max SWIOTLB size = %luMB",
> +   size >> 20);
> +   }
> +
> +   return size;
> +}
> +
>  void __init mem_encrypt_init(void)
>  {
> if (!sme_me_mask)
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index 046bb94bd4d6..01ae6d891327 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -33,6 +33,7 @@ extern void swiotlb_init(int verbose);
>  int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
>  extern unsigned long swiotlb_nr_tbl(void);
>  unsigned long swiotlb_size_or_default(void);
> +extern void __init swiotlb_adjust(void);
>  extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
>  extern void __init swiotlb_update_mem_attributes(void);
>
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index c19379fabd20..66a9e627bb51 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -163,6 +163,33 @@ unsigned long swiotlb_size_or_default(void)
> return size ? size : (IO_TLB_DEFAULT_SIZE);
>  }
>
> +unsigned 

Re: [PATCH v3] swiotlb: Adjust SWIOTBL bounce buffer size for SEV guests.

2020-11-04 Thread Konrad Rzeszutek Wilk
On Wed, Nov 04, 2020 at 10:08:04PM +, Ashish Kalra wrote:
> From: Ashish Kalra 
> 
> For SEV, all DMA to and from guest has to use shared
> (un-encrypted) pages. SEV uses SWIOTLB to make this
> happen without requiring changes to device drivers.
> However, depending on workload being run, the default
> 64MB of SWIOTLB might not be enough and SWIOTLB
> may run out of buffers to use for DMA, resulting
> in I/O errors and/or performance degradation for
> high I/O workloads.
> 
> Increase the default size of SWIOTLB for SEV guests
> using a minimum value of 128MB and a maximum value



64MB for a 1GB VM is not enough?

> of 512MB, determining on amount of provisioned guest

I like the implementation on how this is done.. but
the choices of memory and how much seems very much
random. Could there be some math behind this?

> memory.
> 
> Using late_initcall() interface to invoke
> swiotlb_adjust() does not work as the size
> adjustment needs to be done before mem_encrypt_init()
> and reserve_crashkernel() which use the allocated
> SWIOTLB buffer size, hence calling it explicitly
> from setup_arch().
> 
> The SWIOTLB default size adjustment is added as an
> architecture specific interface/callback to allow
> architectures such as those supporting memory
> encryption to adjust/expand SWIOTLB size for their
> use.
> 
> Signed-off-by: Ashish Kalra 
> ---
>  arch/x86/kernel/setup.c   |  2 ++
>  arch/x86/mm/mem_encrypt.c | 42 +++
>  include/linux/swiotlb.h   |  1 +
>  kernel/dma/swiotlb.c  | 27 +
>  4 files changed, 72 insertions(+)
> 
> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
> index 3511736fbc74..b073d58dd4a3 100644
> --- a/arch/x86/kernel/setup.c
> +++ b/arch/x86/kernel/setup.c
> @@ -1166,6 +1166,8 @@ void __init setup_arch(char **cmdline_p)
>   if (boot_cpu_has(X86_FEATURE_GBPAGES))
>   hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
>  
> + swiotlb_adjust();
> +
>   /*
>* Reserve memory for crash kernel after SRAT is parsed so that it
>* won't consume hotpluggable memory.
> diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
> index 3f248f0d0e07..e0deb157cddd 100644
> --- a/arch/x86/mm/mem_encrypt.c
> +++ b/arch/x86/mm/mem_encrypt.c
> @@ -489,7 +489,49 @@ static void print_mem_encrypt_feature_info(void)
>   pr_cont("\n");
>  }
>  
> +#define TOTAL_MEM_1G 0x4000UL
> +#define TOTAL_MEM_4G 0x1UL
> +
> +#define SIZE_128M (128UL<<20)
> +#define SIZE_256M (256UL<<20)
> +#define SIZE_512M (512UL<<20)
> +
>  /* Architecture __weak replacement functions */
> +unsigned long __init arch_swiotlb_adjust(unsigned long iotlb_default_size)
> +{
> + unsigned long size = 0;
> +
> + /*
> +  * For SEV, all DMA has to occur via shared/unencrypted pages.
> +  * SEV uses SWOTLB to make this happen without changing device
> +  * drivers. However, depending on the workload being run, the
> +  * default 64MB of SWIOTLB may not be enough & SWIOTLB may
> +  * run out of buffers for DMA, resulting in I/O errors and/or
> +  * performance degradation especially with high I/O workloads.
> +  * Increase the default size of SWIOTLB for SEV guests using
> +  * a minimum value of 128MB and a maximum value of 512MB,
> +  * depending on amount of provisioned guest memory.
> +  */
> + if (sev_active()) {
> + phys_addr_t total_mem = memblock_phys_mem_size();
> +
> + if (total_mem <= TOTAL_MEM_1G)
> + size = clamp(iotlb_default_size * 2, SIZE_128M,
> +  SIZE_128M);
> + else if (total_mem <= TOTAL_MEM_4G)
> + size = clamp(iotlb_default_size * 4, SIZE_256M,
> +  SIZE_256M);
> + else
> + size = clamp(iotlb_default_size * 8, SIZE_512M,
> +  SIZE_512M);
> +
> + pr_info("SEV adjusted max SWIOTLB size = %luMB",
> + size >> 20);
> + }
> +
> + return size;
> +}
> +
>  void __init mem_encrypt_init(void)
>  {
>   if (!sme_me_mask)
> diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> index 046bb94bd4d6..01ae6d891327 100644
> --- a/include/linux/swiotlb.h
> +++ b/include/linux/swiotlb.h
> @@ -33,6 +33,7 @@ extern void swiotlb_init(int verbose);
>  int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
>  extern unsigned long swiotlb_nr_tbl(void);
>  unsigned long swiotlb_size_or_default(void);
> +extern void __init swiotlb_adjust(void);
>  extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
>  extern void __init swiotlb_update_mem_attributes(void);
>  
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index c19379fabd20..66a9e627bb51 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -163,6 +163,33 @@ unsigned long 

[PATCH v3] swiotlb: Adjust SWIOTBL bounce buffer size for SEV guests.

2020-11-04 Thread Ashish Kalra
From: Ashish Kalra 

For SEV, all DMA to and from guest has to use shared
(un-encrypted) pages. SEV uses SWIOTLB to make this
happen without requiring changes to device drivers.
However, depending on workload being run, the default
64MB of SWIOTLB might not be enough and SWIOTLB
may run out of buffers to use for DMA, resulting
in I/O errors and/or performance degradation for
high I/O workloads.

Increase the default size of SWIOTLB for SEV guests
using a minimum value of 128MB and a maximum value
of 512MB, determining on amount of provisioned guest
memory.

Using late_initcall() interface to invoke
swiotlb_adjust() does not work as the size
adjustment needs to be done before mem_encrypt_init()
and reserve_crashkernel() which use the allocated
SWIOTLB buffer size, hence calling it explicitly
from setup_arch().

The SWIOTLB default size adjustment is added as an
architecture specific interface/callback to allow
architectures such as those supporting memory
encryption to adjust/expand SWIOTLB size for their
use.

Signed-off-by: Ashish Kalra 
---
 arch/x86/kernel/setup.c   |  2 ++
 arch/x86/mm/mem_encrypt.c | 42 +++
 include/linux/swiotlb.h   |  1 +
 kernel/dma/swiotlb.c  | 27 +
 4 files changed, 72 insertions(+)

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 3511736fbc74..b073d58dd4a3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1166,6 +1166,8 @@ void __init setup_arch(char **cmdline_p)
if (boot_cpu_has(X86_FEATURE_GBPAGES))
hugetlb_cma_reserve(PUD_SHIFT - PAGE_SHIFT);
 
+   swiotlb_adjust();
+
/*
 * Reserve memory for crash kernel after SRAT is parsed so that it
 * won't consume hotpluggable memory.
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 3f248f0d0e07..e0deb157cddd 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -489,7 +489,49 @@ static void print_mem_encrypt_feature_info(void)
pr_cont("\n");
 }
 
+#define TOTAL_MEM_1G   0x4000UL
+#define TOTAL_MEM_4G   0x1UL
+
+#define SIZE_128M (128UL<<20)
+#define SIZE_256M (256UL<<20)
+#define SIZE_512M (512UL<<20)
+
 /* Architecture __weak replacement functions */
+unsigned long __init arch_swiotlb_adjust(unsigned long iotlb_default_size)
+{
+   unsigned long size = 0;
+
+   /*
+* For SEV, all DMA has to occur via shared/unencrypted pages.
+* SEV uses SWOTLB to make this happen without changing device
+* drivers. However, depending on the workload being run, the
+* default 64MB of SWIOTLB may not be enough & SWIOTLB may
+* run out of buffers for DMA, resulting in I/O errors and/or
+* performance degradation especially with high I/O workloads.
+* Increase the default size of SWIOTLB for SEV guests using
+* a minimum value of 128MB and a maximum value of 512MB,
+* depending on amount of provisioned guest memory.
+*/
+   if (sev_active()) {
+   phys_addr_t total_mem = memblock_phys_mem_size();
+
+   if (total_mem <= TOTAL_MEM_1G)
+   size = clamp(iotlb_default_size * 2, SIZE_128M,
+SIZE_128M);
+   else if (total_mem <= TOTAL_MEM_4G)
+   size = clamp(iotlb_default_size * 4, SIZE_256M,
+SIZE_256M);
+   else
+   size = clamp(iotlb_default_size * 8, SIZE_512M,
+SIZE_512M);
+
+   pr_info("SEV adjusted max SWIOTLB size = %luMB",
+   size >> 20);
+   }
+
+   return size;
+}
+
 void __init mem_encrypt_init(void)
 {
if (!sme_me_mask)
diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
index 046bb94bd4d6..01ae6d891327 100644
--- a/include/linux/swiotlb.h
+++ b/include/linux/swiotlb.h
@@ -33,6 +33,7 @@ extern void swiotlb_init(int verbose);
 int swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose);
 extern unsigned long swiotlb_nr_tbl(void);
 unsigned long swiotlb_size_or_default(void);
+extern void __init swiotlb_adjust(void);
 extern int swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs);
 extern void __init swiotlb_update_mem_attributes(void);
 
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index c19379fabd20..66a9e627bb51 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -163,6 +163,33 @@ unsigned long swiotlb_size_or_default(void)
return size ? size : (IO_TLB_DEFAULT_SIZE);
 }
 
+unsigned long __init __weak arch_swiotlb_adjust(unsigned long size)
+{
+   return 0;
+}
+
+void __init swiotlb_adjust(void)
+{
+   unsigned long size;
+
+   /*
+* If swiotlb parameter has not been specified, give a chance to
+* architectures such as those supporting memory encryption to
+* adjust/expand SWIOTLB 

Re: [PATCH v3 0/6] Add virtio-iommu built-in topology

2020-11-04 Thread Al Stone
On 04 Nov 2020 10:33, Jean-Philippe Brucker wrote:
> Hi Al,
> 
> On Tue, Nov 03, 2020 at 01:09:04PM -0700, Al Stone wrote:
> > So, there are some questions about the VIOT definition and I just
> > don't know enough to be able to answer them.  One of the ASWG members
> > is trying to understand the semantics behind the subtables.
> 
> Thanks for the update. We dropped subtables a few versions ago, though, do
> you have the latest v8?
> https://jpbrucker.net/virtio-iommu/viot/viot-v8.pdf

Sorry, I confused some terminology: what are called the Node structures
are implemented as "subtables" in the ACPI reference implementation
(ACPICA).  But yes, I've proposed the v8 version.

> > Is there a particular set of people, or mailing lists, that I can
> > point to to get the questions answered?  Ideally it would be one
> > of the public lists where it has already been discussed, but an
> > individual would be fine, too.  No changes have been proposed, just
> > some questions asked.
> 
> For a public list, I suggest iommu@lists.linux-foundation.org if we should
> pick only one (otherwise add virtualizat...@lists.linux-foundation.org and
> virtio-...@lists.oasis-open.org). I'm happy to answer any question, and
> the folks on here are a good set to Cc:
> 
> eric.au...@redhat.com
> jean-phili...@linaro.org
> j...@8bytes.org
> kevin.t...@intel.com
> lorenzo.pieral...@arm.com
> m...@redhat.com
> sebastien.bo...@intel.com
> 
> Thanks,
> Jean
> 

Merci, Jean-Philippe :).  I'll point the individual at you and the
iommu mailing list, and the CCs.  Sadly, I did not write down the
full question, nor the person's name (from Microsoft, possibly?)
and now seem to have completely forgotten both (it's been a long
few months...).  If I can find something in the meeting minutes,
I'll pass that on.

Thanks again for everyone's patience.

-- 
ciao,
al
---
Al Stone
Software Engineer
Red Hat, Inc.
a...@redhat.com
---

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: (proposal) RE: [PATCH v7 00/16] vfio: expose virtual Shared Virtual Addressing to VMs

2020-11-04 Thread Jason Gunthorpe
On Tue, Nov 03, 2020 at 08:14:29PM +0100, j...@8bytes.org wrote:
> On Tue, Nov 03, 2020 at 01:48:51PM -0400, Jason Gunthorpe wrote:
> > I think the same PCI driver with a small flag to support the PF or
> > VF is not the same as two completely different drivers in different
> > subsystems
> 
> There are counter-examples: ixgbe vs. ixgbevf.
>
> Note that also a single driver can support both, an SVA device and an
> mdev device, sharing code for accessing parts of the device like queues
> and handling interrupts.

Needing a mdev device at all is the larger issue, mdev means the
kernel must carry a lot of emulation code depending on how the SVA
device is designed. Eg creating queues may require an emulated BAR.

Shifting that code to userspace and having a single clean 'SVA'
interface from the kernel for the device makes a lot more sense,
esepcially from a security perspective.

Forcing all vIOMMU stuff to only use VFIO permanently closes this as
an option.

Jason
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: Re: [PATCH 2/5] RDMA/core: remove use of dma_virt_ops

2020-11-04 Thread Jason Gunthorpe
On Wed, Nov 04, 2020 at 03:09:04PM +, Bernard Metzler wrote:

> lkey of zero to pass a physical buffer, only allowed for
> kernel applications? Very nice idea I think.

It already exists, it is called the local_dma_lkey, just set
IB_DEVICE_LOCAL_DMA_LKEY and provide the value you want to use
in device->local_dma_lkey

> btw.
> It would even get the vain blessing of the old IETF RDMA
> verbs draft ;)
> 
> https://tools.ietf.org/html/draft-hilland-rddp-verbs-00#page-90

IBTA standadized this, they just didn't require HW to use 0 as the
lkey.

Jason
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 2/5] RDMA/core: remove use of dma_virt_ops

2020-11-04 Thread Jason Gunthorpe
On Wed, Nov 04, 2020 at 05:31:35PM +0100, Christoph Hellwig wrote:
> On Wed, Nov 04, 2020 at 11:52:55AM -0400, Jason Gunthorpe wrote:
> > It could work, I think a resonable ULP API would be to have some
> > 
> >  rdma_fill_ib_sge_from_sgl()
> >  rdma_map_sge_single()
> >  etc etc
> > 
> > ie instead of wrappering the DMA API as-is we have a new API that
> > directly builds the ib_sge. It always fills the local_dma_lkey from
> > the pd, so it knows it is doing DMA from local kernel memory.
> 
> Yeah.
> 
> > Logically SW devices then have a local_dma_lkey MR that has an IOVA of
> > the CPU physical address space, not the DMA address space as HW
> > devices have. The ib_sge builders can know this detail and fill in
> > addr from either a cpu phyical or a dma map.
> 
> I don't think the builders are the right place to do it - it really
> should to be in the low-level drivers for a bunch of reasons:

At this point we have little choice, the ULP is responsible for
map/unmap because the ULP owns the CQ and (batch) unmap is triggered
by some CQE.

Reworking all drivers to somehow keep track of unmaps a CQEs triggers
feels so hard at this point as to be impossible. It is why the
rdma_rw_ctx basically exists.

So we have to keep the current arrangment, when the ib_sge is built
the dma map must be conditionally done.

>  1) this avoids doing the dma_map when no DMA is performed, e.g. for
> mlx5 when send data is in the extended WQE

At least in the kernel, the ULP has to be involved today in
IB_SEND_INLINE. Userspace does an auto-copy, but that is because it
doesn't have dma map/unmap. 

Without unmap tracking as above the caller must supply a specially
formed ib_sge when using IB_SEND_INLINE that specifies the CPU vaddr
so memcpy works.

But this all looks like dead code, no ULP sets IB_SEND_INLINE ??

>  2) to deal with the fact that dma mapping reduces the number of SGEs.
> When the system uses a modern IOMMU we'll always end up with a
> single IOVA range no matter how many pages were mapped originally.
> This means any MR process can actually be consolidated to use
> a single SGE with the local lkey.

Any place like rdma_rw_ctx_init() that decides dynamically between SGE
and MR becomes a mess. It would be manageable if rdma_rw_ctx_init()
was the only place doing this..

I haven't looked lately, but I wonder if it is feasible that all the
MR users would use this API?

Jason
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 3/5] PCI/p2p: remove the DMA_VIRT_OPS hacks

2020-11-04 Thread Logan Gunthorpe




On 2020-11-04 2:50 a.m., Christoph Hellwig wrote:
> Now that all users of dma_virt_ops are gone we can remove the workaround
> for it in the PCIe peer to peer code.
> 
> Signed-off-by: Christoph Hellwig 

The two P2PDMA patches look fine to me. Nice to get rid of that hack.

Reviewed-by: Logan Gunthorpe 

Thanks,

Logan
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 4/5] PCI/p2p: cleanup up __pci_p2pdma_map_sg a bit

2020-11-04 Thread Bjorn Helgaas
s|PCI/p2p: cleanup up __pci_p2pdma_map_sg|PCI/P2PDMA: Cleanup up 
__pci_p2pdma_map_sg|
to match history.

On Wed, Nov 04, 2020 at 10:50:51AM +0100, Christoph Hellwig wrote:
> Remove the pointless paddr variable that was only used once.
> 
> Signed-off-by: Christoph Hellwig 

Acked-by: Bjorn Helgaas 

> ---
>  drivers/pci/p2pdma.c | 5 +
>  1 file changed, 1 insertion(+), 4 deletions(-)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index b07018af53876c..afd792cc272832 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -825,13 +825,10 @@ static int __pci_p2pdma_map_sg(struct 
> pci_p2pdma_pagemap *p2p_pgmap,
>   struct device *dev, struct scatterlist *sg, int nents)
>  {
>   struct scatterlist *s;
> - phys_addr_t paddr;
>   int i;
>  
>   for_each_sg(sg, s, nents, i) {
> - paddr = sg_phys(s);
> -
> - s->dma_address = paddr - p2p_pgmap->bus_offset;
> + s->dma_address = sg_phys(s) - p2p_pgmap->bus_offset;
>   sg_dma_len(s) = s->length;
>   }
>  
> -- 
> 2.28.0
> 
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 3/5] PCI/p2p: remove the DMA_VIRT_OPS hacks

2020-11-04 Thread Bjorn Helgaas
s|PCI/p2p: remove|PCI/P2PDMA: Remove/
to match history.

On Wed, Nov 04, 2020 at 10:50:50AM +0100, Christoph Hellwig wrote:
> Now that all users of dma_virt_ops are gone we can remove the workaround
> for it in the PCIe peer to peer code.

s/PCIe/PCI/
We went to some trouble to make P2PDMA work on conventional PCI as
well as PCIe.

> Signed-off-by: Christoph Hellwig 

Acked-by: Bjorn Helgaas 

> ---
>  drivers/pci/p2pdma.c | 20 
>  1 file changed, 20 deletions(-)
> 
> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
> index de1c331dbed43f..b07018af53876c 100644
> --- a/drivers/pci/p2pdma.c
> +++ b/drivers/pci/p2pdma.c
> @@ -556,15 +556,6 @@ int pci_p2pdma_distance_many(struct pci_dev *provider, 
> struct device **clients,
>   return -1;
>  
>   for (i = 0; i < num_clients; i++) {
> -#ifdef CONFIG_DMA_VIRT_OPS
> - if (clients[i]->dma_ops == _virt_ops) {
> - if (verbose)
> - dev_warn(clients[i],
> -  "cannot be used for peer-to-peer DMA 
> because the driver makes use of dma_virt_ops\n");
> - return -1;
> - }
> -#endif
> -
>   pci_client = find_parent_pci_dev(clients[i]);
>   if (!pci_client) {
>   if (verbose)
> @@ -837,17 +828,6 @@ static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap 
> *p2p_pgmap,
>   phys_addr_t paddr;
>   int i;
>  
> - /*
> -  * p2pdma mappings are not compatible with devices that use
> -  * dma_virt_ops. If the upper layers do the right thing
> -  * this should never happen because it will be prevented
> -  * by the check in pci_p2pdma_distance_many()
> -  */
> -#ifdef CONFIG_DMA_VIRT_OPS
> - if (WARN_ON_ONCE(dev->dma_ops == _virt_ops))
> - return 0;
> -#endif
> -
>   for_each_sg(sg, s, nents, i) {
>   paddr = sg_phys(s);
>  
> -- 
> 2.28.0
> 
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 2/5] RDMA/core: remove use of dma_virt_ops

2020-11-04 Thread Christoph Hellwig
On Wed, Nov 04, 2020 at 11:52:55AM -0400, Jason Gunthorpe wrote:
> It could work, I think a resonable ULP API would be to have some
> 
>  rdma_fill_ib_sge_from_sgl()
>  rdma_map_sge_single()
>  etc etc
> 
> ie instead of wrappering the DMA API as-is we have a new API that
> directly builds the ib_sge. It always fills the local_dma_lkey from
> the pd, so it knows it is doing DMA from local kernel memory.

Yeah.

> Logically SW devices then have a local_dma_lkey MR that has an IOVA of
> the CPU physical address space, not the DMA address space as HW
> devices have. The ib_sge builders can know this detail and fill in
> addr from either a cpu phyical or a dma map.

I don't think the builders are the right place to do it - it really
should to be in the low-level drivers for a bunch of reasons:

 1) this avoids doing the dma_map when no DMA is performed, e.g. for
mlx5 when send data is in the extended WQE
 2) to deal with the fact that dma mapping reduces the number of SGEs.
When the system uses a modern IOMMU we'll always end up with a
single IOVA range no matter how many pages were mapped originally.
This means any MR process can actually be consolidated to use
a single SGE with the local lkey.

Note that 2 implies a somewhat more complicated API, where the ULP
attempts to create a MR, but the core/driver will tell it that it didn't
need a MR at all.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 2/5] RDMA/core: remove use of dma_virt_ops

2020-11-04 Thread Jason Gunthorpe
On Wed, Nov 04, 2020 at 03:01:08PM +0100, Christoph Hellwig wrote:

> > Sigh. I think the proper fix is to replace addr/length with a
> > scatterlist pointer in the struct ib_sge, then have SW drivers
> > directly use the page pointer properly.
> 
> The proper fix is to move the DMA mapping into the RDMA core, yes.
> And as you said it will be hard.  But I don't think scatterlists
> are the right interface.  IMHO we can keep re-use the existing
> struct ib_sge:
> 
> struct ib_ge {
>   u64 addr;
>   u32 length;
>   u32 lkey;
> };

Gah, right, this is all about local_dma_lkey..
 
> with the difference that if lkey is not a MR, addr is the physical
> address of the memory, not a dma_addr_t or virtual address.

It could work, I think a resonable ULP API would be to have some

 rdma_fill_ib_sge_from_sgl()
 rdma_map_sge_single()
 etc etc

ie instead of wrappering the DMA API as-is we have a new API that
directly builds the ib_sge. It always fills the local_dma_lkey from
the pd, so it knows it is doing DMA from local kernel memory.

Logically SW devices then have a local_dma_lkey MR that has an IOVA of
the CPU physical address space, not the DMA address space as HW
devices have. The ib_sge builders can know this detail and fill in
addr from either a cpu phyical or a dma map.

The SW device has to translate the addr/length in CPU space to
something else. It actually makes reasonable sense architecturally.

This is actually much less horrible than I thought..

Convert all ULPs to one of these new APIs, searching for
local_dma_lkey will find all places. This will replace a whole lot of
calls to ib DMA API wrapper functions. Searching for local_dma_lkey
will find all users. Drivers already work with sge.addr == CPU
address, so no driver change

Then to kill the dma_ops wrappers the remaining users should all be
connected to map_mr_sg. In this case we want a no-op dma map and fix
the three map_mr_sg's to use the page side of the sgl, not the DMA
side

Not as horrible as I imagined at first, actually..

Jason
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: Re: [PATCH 2/5] RDMA/core: remove use of dma_virt_ops

2020-11-04 Thread Bernard Metzler
-"Christoph Hellwig"  wrote: -

>To: "Jason Gunthorpe" 
>From: "Christoph Hellwig" 
>Date: 11/04/2020 03:02PM
>Cc: "Christoph Hellwig" , "Bjorn Helgaas"
>, "Logan Gunthorpe" ,
>linux-r...@vger.kernel.org, linux-...@vger.kernel.org,
>iommu@lists.linux-foundation.org
>Subject: [EXTERNAL] Re: [PATCH 2/5] RDMA/core: remove use of
>dma_virt_ops
>
>On Wed, Nov 04, 2020 at 09:42:41AM -0400, Jason Gunthorpe wrote:
>> On Wed, Nov 04, 2020 at 10:50:49AM +0100, Christoph Hellwig wrote:
>> 
>> > +int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist
>*sg, int nents)
>> > +{
>> > +  struct scatterlist *s;
>> > +  int i;
>> > +
>> > +  for_each_sg(sg, s, nents, i) {
>> > +  sg_dma_address(s) = (uintptr_t)sg_virt(s);
>> > +  sg_dma_len(s) = s->length;
>> 
>> Hmm.. There is nothing ensuring the page is mapped here for this
>> sg_virt(). Before maybe some of the kconfig stuff prevented highmem
>> systems indirectly, I wonder if we should add something more direct
>to
>> exclude highmem for these drivers?
>
>I had actually noticed this earlier as well and then completely
>forgot
>about it..
>
>rdmavt depends on X86_64, so it can't be used with highmem, but for
>rxe and siw there weren't any such dependencies so I think we were
>just
>lucky.  Let me send a fix to add explicit depencies and then respin
>this
>series on top of that..
>
>> Sigh. I think the proper fix is to replace addr/length with a
>> scatterlist pointer in the struct ib_sge, then have SW drivers
>> directly use the page pointer properly.
>
>The proper fix is to move the DMA mapping into the RDMA core, yes.
>And as you said it will be hard.  But I don't think scatterlists
>are the right interface.  IMHO we can keep re-use the existing
>struct ib_sge:
>
>struct ib_ge {
>   u64 addr;
>   u32 length;
>   u32 lkey;
>};
>
>with the difference that if lkey is not a MR, addr is the physical
>address of the memory, not a dma_addr_t or virtual address.
>

lkey of zero to pass a physical buffer, only allowed for
kernel applications? Very nice idea I think.

btw.
It would even get the vain blessing of the old IETF RDMA
verbs draft ;)

https://tools.ietf.org/html/draft-hilland-rddp-verbs-00#page-90


(section '7.2.1 STag of zero' - read lkey for STag)


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


RE: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled

2020-11-04 Thread Merger, Edgar [AUTOSOL/MAS/AUGS]
Joerg,

One remark: 
> However I found out that with Kernel 5.9.3 the amdgpu kernel module is not 
> loaded/installed
That is likely my fault because I was compiling that linux kernel on a faster 
machine (V1807B CPU against R1305G CPU (target)). I restarted that compile just 
now on the target machine to avoid any problems.

Best regards,
Edgar

-Original Message-
From: Merger, Edgar [AUTOSOL/MAS/AUGS] 
Sent: Mittwoch, 4. November 2020 15:19
To: jroe...@suse.de
Cc: iommu@lists.linux-foundation.org
Subject: RE: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled

> Yes, but it could be the same underlying reason.
There is no PCI setup issue that we are aware of.

> For a first try, use 5.9.3. If it reproduces there, please try booting with 
> "pci=noats" on the kernel command line.
Did compile the kernel 5.9.3 and started a reboot test to see if it is going to 
fail again. However I found out that with Kernel 5.9.3 the amdgpu kernel module 
is not loaded/installed. So this way I don´t see it makes sense for further 
investigation. I might did something wrong when compiling the linux kernel 
5.9.3. I did reuse my .config file that I used with 5.4.0-47 for configuration 
of the kernel 5.9.3. However I do not know why it did not install amdgpu.

> Please also send me the output of 'lspci -vvv' and 'lspci -t' of the machine 
> where this happens.
For comparison I attached the logs when using 5.4.0-47 and 5.9.3. 

Best regards,
Edgar

-Original Message-
From: jroe...@suse.de 
Sent: Mittwoch, 4. November 2020 11:15
To: Merger, Edgar [AUTOSOL/MAS/AUGS] 
Cc: iommu@lists.linux-foundation.org
Subject: Re: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled

On Wed, Nov 04, 2020 at 09:21:35AM +, Merger, Edgar [AUTOSOL/MAS/AUGS] 
wrote:
> AMD-Vi: Completion-Wait loop timed out is at [65499.964105] but amdgpu-error 
> is at [   52.772273], hence much earlier.

Yes, but it could be the same underlying reason.

> Have not tried to use an upstream kernel yet. Which one would you recommend?

For a first try, use 5.9.3. If it reproduces there, please try booting with 
"pci=noats" on the kernel command line.

Please also send me the output of 'lspci -vvv' and 'lspci -t' of the machine 
where this happens.

Regards,

Joerg


> 
> As far as inconsistencies in the PCI-setup is concerned, the only thing that 
> I know of right now is that we haven´t entered a PCI subsystem vendor and 
> device ID yet. It is still "Advanced Micro Devices". We will change that soon 
> to "General Electric" or "Emerson".
> 
> Best regards,
> Edgar
> 
> -Original Message-
> From: jroe...@suse.de 
> Sent: Mittwoch, 4. November 2020 09:53
> To: Merger, Edgar [AUTOSOL/MAS/AUGS] 
> Cc: iommu@lists.linux-foundation.org
> Subject: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled
> 
> Hi Edgar,
> 
> On Fri, Oct 30, 2020 at 02:26:23PM +, Merger, Edgar [AUTOSOL/MAS/AUGS] 
> wrote:
> > With one board we have a boot-problem that is reproducible at every ~50 
> > boot.
> > The system is accessible via ssh and works fine except for the 
> > Graphics. The graphics is off. We don´t see a screen. Please see 
> > attached “dmesg.log”. From [52.772273] onwards the kernel reports 
> > drm/amdgpu errors. It even tries to reset the GPU but that fails too.
> > I tried to reset amdgpu also by command “sudo cat 
> > /sys/kernel/debug/dri/N/amdgpu_gpu_recover”. That did not help either.
> 
> Can you reproduce the problem with an upstream kernel too?
> 
> These messages in dmesg indicate some problem in the platform setup:
> 
>   AMD-Vi: Completion-Wait loop timed out
> 
> Might there be some inconsistencies in the PCI setup between the bridges and 
> the endpoints or something?
> 
> Regards,
> 
>   Joerg
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

RE: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled

2020-11-04 Thread Merger, Edgar [AUTOSOL/MAS/AUGS]
> Yes, but it could be the same underlying reason.
There is no PCI setup issue that we are aware of.

> For a first try, use 5.9.3. If it reproduces there, please try booting with 
> "pci=noats" on the kernel command line.
Did compile the kernel 5.9.3 and started a reboot test to see if it is going to 
fail again. However I found out that with Kernel 5.9.3 the amdgpu kernel module 
is not loaded/installed. So this way I don´t see it makes sense for further 
investigation. I might did something wrong when compiling the linux kernel 
5.9.3. I did reuse my .config file that I used with 5.4.0-47 for configuration 
of the kernel 5.9.3. However I do not know why it did not install amdgpu.

> Please also send me the output of 'lspci -vvv' and 'lspci -t' of the machine 
> where this happens.
For comparison I attached the logs when using 5.4.0-47 and 5.9.3. 

Best regards,
Edgar

-Original Message-
From: jroe...@suse.de  
Sent: Mittwoch, 4. November 2020 11:15
To: Merger, Edgar [AUTOSOL/MAS/AUGS] 
Cc: iommu@lists.linux-foundation.org
Subject: Re: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled

On Wed, Nov 04, 2020 at 09:21:35AM +, Merger, Edgar [AUTOSOL/MAS/AUGS] 
wrote:
> AMD-Vi: Completion-Wait loop timed out is at [65499.964105] but amdgpu-error 
> is at [   52.772273], hence much earlier.

Yes, but it could be the same underlying reason.

> Have not tried to use an upstream kernel yet. Which one would you recommend?

For a first try, use 5.9.3. If it reproduces there, please try booting with 
"pci=noats" on the kernel command line.

Please also send me the output of 'lspci -vvv' and 'lspci -t' of the machine 
where this happens.

Regards,

Joerg


> 
> As far as inconsistencies in the PCI-setup is concerned, the only thing that 
> I know of right now is that we haven´t entered a PCI subsystem vendor and 
> device ID yet. It is still "Advanced Micro Devices". We will change that soon 
> to "General Electric" or "Emerson".
> 
> Best regards,
> Edgar
> 
> -Original Message-
> From: jroe...@suse.de 
> Sent: Mittwoch, 4. November 2020 09:53
> To: Merger, Edgar [AUTOSOL/MAS/AUGS] 
> Cc: iommu@lists.linux-foundation.org
> Subject: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled
> 
> Hi Edgar,
> 
> On Fri, Oct 30, 2020 at 02:26:23PM +, Merger, Edgar [AUTOSOL/MAS/AUGS] 
> wrote:
> > With one board we have a boot-problem that is reproducible at every ~50 
> > boot.
> > The system is accessible via ssh and works fine except for the 
> > Graphics. The graphics is off. We don´t see a screen. Please see 
> > attached “dmesg.log”. From [52.772273] onwards the kernel reports 
> > drm/amdgpu errors. It even tries to reset the GPU but that fails too.
> > I tried to reset amdgpu also by command “sudo cat 
> > /sys/kernel/debug/dri/N/amdgpu_gpu_recover”. That did not help either.
> 
> Can you reproduce the problem with an upstream kernel too?
> 
> These messages in dmesg indicate some problem in the platform setup:
> 
>   AMD-Vi: Completion-Wait loop timed out
> 
> Might there be some inconsistencies in the PCI setup between the bridges and 
> the endpoints or something?
> 
> Regards,
> 
>   Joerg


Linux-logs.tar.gz
Description: Linux-logs.tar.gz
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH for-5.10] swiotlb: remove the tbl_dma_addr argument to swiotlb_tbl_map_single

2020-11-04 Thread Konrad Rzeszutek Wilk
On Tue, Nov 03, 2020 at 10:46:43AM +0100, Christoph Hellwig wrote:
> ping?

Hopefully this goes through. I am in the process of testing it but ran
into testing issues that I believe are unrelated.


> 
> On Fri, Oct 23, 2020 at 08:33:09AM +0200, Christoph Hellwig wrote:
> > The tbl_dma_addr argument is used to check the DMA boundary for the
> > allocations, and thus needs to be a dma_addr_t.  swiotlb-xen instead
> > passed a physical address, which could lead to incorrect results for
> > strange offsets.  Fix this by removing the parameter entirely and hard
> > code the DMA address for io_tlb_start instead.
> > 
> > Fixes: 91ffe4ad534a ("swiotlb-xen: introduce phys_to_dma/dma_to_phys 
> > translations")
> > Signed-off-by: Christoph Hellwig 
> > Reviewed-by: Stefano Stabellini 
> > ---
> >  drivers/iommu/intel/iommu.c |  5 ++---
> >  drivers/xen/swiotlb-xen.c   |  3 +--
> >  include/linux/swiotlb.h | 10 +++---
> >  kernel/dma/swiotlb.c| 16 ++--
> >  4 files changed, 12 insertions(+), 22 deletions(-)
> > 
> > diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
> > index 8651f6d4dfa032..6b560e6f193096 100644
> > --- a/drivers/iommu/intel/iommu.c
> > +++ b/drivers/iommu/intel/iommu.c
> > @@ -3815,9 +3815,8 @@ bounce_map_single(struct device *dev, phys_addr_t 
> > paddr, size_t size,
> >  * page aligned, we don't need to use a bounce page.
> >  */
> > if (!IS_ALIGNED(paddr | size, VTD_PAGE_SIZE)) {
> > -   tlb_addr = swiotlb_tbl_map_single(dev,
> > -   phys_to_dma_unencrypted(dev, io_tlb_start),
> > -   paddr, size, aligned_size, dir, attrs);
> > +   tlb_addr = swiotlb_tbl_map_single(dev, paddr, size,
> > +   aligned_size, dir, attrs);
> > if (tlb_addr == DMA_MAPPING_ERROR) {
> > goto swiotlb_error;
> > } else {
> > diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
> > index 71ce1b7a23d1cc..2b385c1b4a99cb 100644
> > --- a/drivers/xen/swiotlb-xen.c
> > +++ b/drivers/xen/swiotlb-xen.c
> > @@ -395,8 +395,7 @@ static dma_addr_t xen_swiotlb_map_page(struct device 
> > *dev, struct page *page,
> >  */
> > trace_swiotlb_bounced(dev, dev_addr, size, swiotlb_force);
> >  
> > -   map = swiotlb_tbl_map_single(dev, virt_to_phys(xen_io_tlb_start),
> > -phys, size, size, dir, attrs);
> > +   map = swiotlb_tbl_map_single(dev, phys, size, size, dir, attrs);
> > if (map == (phys_addr_t)DMA_MAPPING_ERROR)
> > return DMA_MAPPING_ERROR;
> >  
> > diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h
> > index 513913ff748626..3bb72266a75a1d 100644
> > --- a/include/linux/swiotlb.h
> > +++ b/include/linux/swiotlb.h
> > @@ -45,13 +45,9 @@ enum dma_sync_target {
> > SYNC_FOR_DEVICE = 1,
> >  };
> >  
> > -extern phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
> > - dma_addr_t tbl_dma_addr,
> > - phys_addr_t phys,
> > - size_t mapping_size,
> > - size_t alloc_size,
> > - enum dma_data_direction dir,
> > - unsigned long attrs);
> > +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t phys,
> > +   size_t mapping_size, size_t alloc_size,
> > +   enum dma_data_direction dir, unsigned long attrs);
> >  
> >  extern void swiotlb_tbl_unmap_single(struct device *hwdev,
> >  phys_addr_t tlb_addr,
> > diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> > index b4eea0abc3f002..92e2f54f24c01b 100644
> > --- a/kernel/dma/swiotlb.c
> > +++ b/kernel/dma/swiotlb.c
> > @@ -441,14 +441,11 @@ static void swiotlb_bounce(phys_addr_t orig_addr, 
> > phys_addr_t tlb_addr,
> > }
> >  }
> >  
> > -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev,
> > -  dma_addr_t tbl_dma_addr,
> > -  phys_addr_t orig_addr,
> > -  size_t mapping_size,
> > -  size_t alloc_size,
> > -  enum dma_data_direction dir,
> > -  unsigned long attrs)
> > +phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t 
> > orig_addr,
> > +   size_t mapping_size, size_t alloc_size,
> > +   enum dma_data_direction dir, unsigned long attrs)
> >  {
> > +   dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start);
> > unsigned long flags;
> > phys_addr_t tlb_addr;
> > unsigned int nslots, stride, index, wrap;
> > @@ -667,9 +664,8 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t 
> > paddr, size_t size,
> > trace_swiotlb_bounced(dev, phys_to_dma(dev, paddr), size,
> >   

Re: [PATCH 2/5] RDMA/core: remove use of dma_virt_ops

2020-11-04 Thread Christoph Hellwig
On Wed, Nov 04, 2020 at 09:42:41AM -0400, Jason Gunthorpe wrote:
> On Wed, Nov 04, 2020 at 10:50:49AM +0100, Christoph Hellwig wrote:
> 
> > +int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int 
> > nents)
> > +{
> > +   struct scatterlist *s;
> > +   int i;
> > +
> > +   for_each_sg(sg, s, nents, i) {
> > +   sg_dma_address(s) = (uintptr_t)sg_virt(s);
> > +   sg_dma_len(s) = s->length;
> 
> Hmm.. There is nothing ensuring the page is mapped here for this
> sg_virt(). Before maybe some of the kconfig stuff prevented highmem
> systems indirectly, I wonder if we should add something more direct to
> exclude highmem for these drivers?

I had actually noticed this earlier as well and then completely forgot
about it..

rdmavt depends on X86_64, so it can't be used with highmem, but for
rxe and siw there weren't any such dependencies so I think we were just
lucky.  Let me send a fix to add explicit depencies and then respin this
series on top of that..

> Sigh. I think the proper fix is to replace addr/length with a
> scatterlist pointer in the struct ib_sge, then have SW drivers
> directly use the page pointer properly.

The proper fix is to move the DMA mapping into the RDMA core, yes.
And as you said it will be hard.  But I don't think scatterlists
are the right interface.  IMHO we can keep re-use the existing
struct ib_sge:

struct ib_ge {
u64 addr;
u32 length;
u32 lkey;
};

with the difference that if lkey is not a MR, addr is the physical
address of the memory, not a dma_addr_t or virtual address.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH 2/5] RDMA/core: remove use of dma_virt_ops

2020-11-04 Thread Jason Gunthorpe
On Wed, Nov 04, 2020 at 10:50:49AM +0100, Christoph Hellwig wrote:

> +int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int 
> nents)
> +{
> + struct scatterlist *s;
> + int i;
> +
> + for_each_sg(sg, s, nents, i) {
> + sg_dma_address(s) = (uintptr_t)sg_virt(s);
> + sg_dma_len(s) = s->length;

Hmm.. There is nothing ensuring the page is mapped here for this
sg_virt(). Before maybe some of the kconfig stuff prevented highmem
systems indirectly, I wonder if we should add something more direct to
exclude highmem for these drivers?

Sigh. I think the proper fix is to replace addr/length with a
scatterlist pointer in the struct ib_sge, then have SW drivers
directly use the page pointer properly.

Then just delete this stuff, all drivers need is a noop dmaops

Looks a lot hard though, so we should probably go ahead with this.

Jason
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: use of dma_direct_set_offset in (allwinner) drivers

2020-11-04 Thread Maxime Ripard
On Wed, Nov 04, 2020 at 10:15:49AM +, Robin Murphy wrote:
> On 2020-11-04 08:14, Maxime Ripard wrote:
> > Hi Christoph,
> > 
> > On Tue, Nov 03, 2020 at 10:55:38AM +0100, Christoph Hellwig wrote:
> > > Linux 5.10-rc1 switched from having a single dma offset in struct device
> > > to a set of DMA ranges, and introduced a new helper to set them,
> > > dma_direct_set_offset.
> > > 
> > > This in fact surfaced that a bunch of drivers that violate our layering
> > > and set the offset from drivers, which meant we had to reluctantly
> > > export the symbol to set up the DMA range.
> > > 
> > > The drivers are:
> > > 
> > > drivers/gpu/drm/sun4i/sun4i_backend.c
> > > 
> > >This just use dma_direct_set_offset as a fallback.  Is there any good
> > >reason to not just kill off the fallback?
> > > 
> > > drivers/media/platform/sunxi/sun4i-csi/sun4i_csi.c
> > > 
> > >Same as above.
> > 
> > So, the history of this is:
> > 
> >- We initially introduced the support for those two controllers
> >  assuming that there was a direct mapping between the physical and
> >  DMA addresses. It turns out it didn't and the DMA accesses were
> >  going through a secondary, dedicated, bus that didn't have the same
> >  mapping of the RAM than the CPU.
> > 
> >  4690803b09c6 ("drm/sun4i: backend: Offset layer buffer address by DRAM 
> > starting address")
> > 
> >- This dedicated bus is undocumented and barely used in the vendor
> >  kernel so this was overlooked, and it's fairly hard to get infos on
> >  it for all the SoCs we support. We added the DT support for it
> >  though on some SoCs we had enough infos to do so:
> > 
> >  c43a4469402f ("dt-bindings: interconnect: Add a dma interconnect name")
> >  22f88e311399 ("ARM: dts: sun5i: Add the MBUS controller")
> > 
> >  This explains the check on the interconnect property
> > 
> >- However, due to the stable DT rule, we still need to operate without
> >  regressions on older DTs that wouldn't have that property (and for
> >  SoCs we haven't figured out). Hence the fallback.
> 
> How about having something in the platform code that keys off the top-level
> SoC compatible and uses a bus notifier to create offsets for the relevant
> devices if an MBUS description is missing? At least that way the workaround
> could be confined to a single dedicated place and look somewhat similar to
> other special cases like sta2x11, rather than being duplicated all over the
> place.

I'll give it a try, thanks for the suggestion :)

Maxime


signature.asc
Description: PGP signature
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: Some questions about arm_lpae_install_table

2020-11-04 Thread Robin Murphy

On 2020-11-04 07:17, Kunkun Jiang wrote:

Hi Will and Robin,

Sorry for the late reply.

On 2020/11/3 18:21, Robin Murphy wrote:

On 2020-11-03 09:11, Will Deacon wrote:

On Tue, Nov 03, 2020 at 11:00:27AM +0800, Kunkun Jiang wrote:

Recently, I have read and learned the code related to io-pgtable-arm.c.
There
are two question on arm_lpae_install_table.

1、the first


static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
  arm_lpae_iopte *ptep,
  arm_lpae_iopte curr,
  struct io_pgtable_cfg 
*cfg)

{
 arm_lpae_iopte old, new;

 new = __pa(table) | ARM_LPAE_PTE_TYPE_TABLE;
 if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
 new |= ARM_LPAE_PTE_NSTABLE;

    /*
  * Ensure the table itself is visible before its PTE can be.
  * Whilst we could get away with cmpxchg64_release below, 
this

  * doesn't have any ordering semantics when !CONFIG_SMP.
  */
 dma_wmb();

 old = cmpxchg64_relaxed(ptep, curr, new);

 if (cfg->coherent_walk || (old & ARM_LPAE_PTE_SW_SYNC))
 return old;

 /* Even if it's not ours, there's no point waiting; just 
kick it

*/
 __arm_lpae_sync_pte(ptep, cfg);
 if (old == curr)
 WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC);

 return old;
}


If another thread changes the ptep between cmpxchg64_relaxed and
WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC), the operation
WRITE_ONCE will overwrite the change.


Can you please provide an example of a code path where this happens? The
idea is that CPUs can race on the cmpxchg(), but there will only be one
winner.


The only way a table entry can suddenly disappear is in a race that 
involves mapping or unmapping a whole block/table-sized region, while 
simultaneously mapping a page *within* that region. Yes, if someone 
uses the API in a nonsensical and completely invalid way that cannot 
have a predictable outcome, they get an unpredictable outcome. Whoop 
de do...



Yes, as Robin mentioned, there will be an unpredictable outcome in extreme
cases. Now, we have two APIs mapping and unmapping to alter a block/page
descriptor. I worry about that it may be  increasingly difficult to add 
API in

the future.


I still don't understand your concern. If two threads try to 
simultaneously create a block mapping and a page mapping *for the same 
virtual address*, the result is that one of those mappings will end up 
in place, depending on who managed to write to the PTE last. This code 
is self-consistent - it doesn't crash, nor end up with any mapping that 
*wasn't* requested - it just doesn't go far out of its way to 
accommodate obviously invalid usage. By comparison, say instead those 
two threads simultaneously call kfree() on the same pointer; the outcome 
of that is considerably worse.


What kind of additional new API are you imagining where such 
deliberately racy behaviour would be anything other than broken nonsense?



2、the second


for (i = 0; i < tablesz / sizeof(pte); i++, blk_paddr += split_sz) {
 /* Unmap! */
 if (i == unmap_idx)
continue;

 __arm_lpae_init_pte(data, blk_paddr, pte, lvl,
[i]);
}

pte = arm_lpae_install_table(tablep, ptep, blk_pte, cfg);


When altering a translation table descriptor include split a block into
constituent granules, the Armv8-A and SMMUv3 architectures require
a break-before-make procedure. But in the function 
arm_lpae_split_blk_unmap,
it changes a block descriptor to an equivalent span of page 
translations

directly. Is it appropriate to do so?


Break-before-make doesn't really work for the SMMU because faults are
generally fatal.

Are you seeing problems in practice with this code?


TBH I do still wonder if we shouldn't just get rid of split_blk_unmap 
and all its complexity. Other drivers treat an unmap of a page from a 
block entry as simply unmapping the whole block, and that's the 
behaviour VFIO seems to expect. My only worry is that it's been around 
long enough that there might be some horrible out-of-tree code that 
*is* relying on it, despite the fact that it's impossible to implement 
in a way that's definitely 100% safe :/


Robin.
.

I have not encountered a problem. I mean SMMU has supported BBML feature
in SMMUv3.2. Maybe we can use this feature to update translation table 
without

break-before-make.


We could. But then what do we do for all the existing hardware without 
BBML, or on x86 or other drivers which don't split blocks anyway?


Robin.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: use of dma_direct_set_offset in (allwinner) drivers

2020-11-04 Thread Christoph Hellwig
On Wed, Nov 04, 2020 at 10:15:49AM +, Robin Murphy wrote:
> How about having something in the platform code that keys off the top-level 
> SoC compatible and uses a bus notifier to create offsets for the relevant 
> devices if an MBUS description is missing? At least that way the workaround 
> could be confined to a single dedicated place and look somewhat similar to 
> other special cases like sta2x11, rather than being duplicated all over the 
> place.

Yes, that would be the right way to handle the issue.
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: use of dma_direct_set_offset in (allwinner) drivers

2020-11-04 Thread Robin Murphy

On 2020-11-04 08:14, Maxime Ripard wrote:

Hi Christoph,

On Tue, Nov 03, 2020 at 10:55:38AM +0100, Christoph Hellwig wrote:

Linux 5.10-rc1 switched from having a single dma offset in struct device
to a set of DMA ranges, and introduced a new helper to set them,
dma_direct_set_offset.

This in fact surfaced that a bunch of drivers that violate our layering
and set the offset from drivers, which meant we had to reluctantly
export the symbol to set up the DMA range.

The drivers are:

drivers/gpu/drm/sun4i/sun4i_backend.c

   This just use dma_direct_set_offset as a fallback.  Is there any good
   reason to not just kill off the fallback?

drivers/media/platform/sunxi/sun4i-csi/sun4i_csi.c

   Same as above.


So, the history of this is:

   - We initially introduced the support for those two controllers
 assuming that there was a direct mapping between the physical and
 DMA addresses. It turns out it didn't and the DMA accesses were
 going through a secondary, dedicated, bus that didn't have the same
 mapping of the RAM than the CPU.

 4690803b09c6 ("drm/sun4i: backend: Offset layer buffer address by DRAM starting 
address")

   - This dedicated bus is undocumented and barely used in the vendor
 kernel so this was overlooked, and it's fairly hard to get infos on
 it for all the SoCs we support. We added the DT support for it
 though on some SoCs we had enough infos to do so:

 c43a4469402f ("dt-bindings: interconnect: Add a dma interconnect name")
 22f88e311399 ("ARM: dts: sun5i: Add the MBUS controller")

 This explains the check on the interconnect property

   - However, due to the stable DT rule, we still need to operate without
 regressions on older DTs that wouldn't have that property (and for
 SoCs we haven't figured out). Hence the fallback.


How about having something in the platform code that keys off the 
top-level SoC compatible and uses a bus notifier to create offsets for 
the relevant devices if an MBUS description is missing? At least that 
way the workaround could be confined to a single dedicated place and 
look somewhat similar to other special cases like sta2x11, rather than 
being duplicated all over the place.


Robin.


drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c

   This driver unconditionally sets the offset.  Why can't we do this
   in the device tree?

drivers/staging/media/sunxi/cedrus/cedrus_hw.c

   Same as above.



We should make those two match the previous ones, but we'll have the
same issue here eventually. Most likely they were never ran on an SoC
for which we have the MBUS figured out.

Maxime


___
linux-arm-kernel mailing list
linux-arm-ker...@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled

2020-11-04 Thread jroe...@suse.de
On Wed, Nov 04, 2020 at 09:21:35AM +, Merger, Edgar [AUTOSOL/MAS/AUGS] 
wrote:
> AMD-Vi: Completion-Wait loop timed out is at [65499.964105] but amdgpu-error 
> is at [   52.772273], hence much earlier.

Yes, but it could be the same underlying reason.

> Have not tried to use an upstream kernel yet. Which one would you recommend?

For a first try, use 5.9.3. If it reproduces there, please try booting
with "pci=noats" on the kernel command line.

Please also send me the output of 'lspci -vvv' and 'lspci -t' of the
machine where this happens.

Regards,

Joerg


> 
> As far as inconsistencies in the PCI-setup is concerned, the only thing that 
> I know of right now is that we haven´t entered a PCI subsystem vendor and 
> device ID yet. It is still "Advanced Micro Devices". We will change that soon 
> to "General Electric" or "Emerson".
> 
> Best regards,
> Edgar
> 
> -Original Message-
> From: jroe...@suse.de  
> Sent: Mittwoch, 4. November 2020 09:53
> To: Merger, Edgar [AUTOSOL/MAS/AUGS] 
> Cc: iommu@lists.linux-foundation.org
> Subject: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled
> 
> Hi Edgar,
> 
> On Fri, Oct 30, 2020 at 02:26:23PM +, Merger, Edgar [AUTOSOL/MAS/AUGS] 
> wrote:
> > With one board we have a boot-problem that is reproducible at every ~50 
> > boot.
> > The system is accessible via ssh and works fine except for the 
> > Graphics. The graphics is off. We don´t see a screen. Please see 
> > attached “dmesg.log”. From [52.772273] onwards the kernel reports 
> > drm/amdgpu errors. It even tries to reset the GPU but that fails too. 
> > I tried to reset amdgpu also by command “sudo cat 
> > /sys/kernel/debug/dri/N/amdgpu_gpu_recover”. That did not help either.
> 
> Can you reproduce the problem with an upstream kernel too?
> 
> These messages in dmesg indicate some problem in the platform setup:
> 
>   AMD-Vi: Completion-Wait loop timed out
> 
> Might there be some inconsistencies in the PCI setup between the bridges and 
> the endpoints or something?
> 
> Regards,
> 
>   Joerg
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

[PATCH 5/5] dma-mapping: remove dma_virt_ops

2020-11-04 Thread Christoph Hellwig
Now that the RDMA core deals with devices that only do DMA mapping in
lower layers properly, there is no user for dma_virt_ops and it can
be removed.

Signed-off-by: Christoph Hellwig 
---
 include/linux/dma-mapping.h |  2 --
 kernel/dma/Kconfig  |  5 ---
 kernel/dma/Makefile |  1 -
 kernel/dma/virt.c   | 61 -
 4 files changed, 69 deletions(-)
 delete mode 100644 kernel/dma/virt.c

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 956151052d4542..2aaed35b556df4 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -565,6 +565,4 @@ static inline int dma_mmap_wc(struct device *dev,
 int dma_direct_set_offset(struct device *dev, phys_addr_t cpu_start,
dma_addr_t dma_start, u64 size);
 
-extern const struct dma_map_ops dma_virt_ops;
-
 #endif /* _LINUX_DMA_MAPPING_H */
diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index c99de4a2145889..fd2db2665fc691 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -75,11 +75,6 @@ config ARCH_HAS_DMA_PREP_COHERENT
 config ARCH_HAS_FORCE_DMA_UNENCRYPTED
bool
 
-config DMA_VIRT_OPS
-   bool
-   depends on HAS_DMA
-   select DMA_OPS
-
 config SWIOTLB
bool
select NEED_DMA_MAP_STATE
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index dc755ab68aabf9..cd1d86358a7a62 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -5,7 +5,6 @@ obj-$(CONFIG_DMA_OPS)   += ops_helpers.o
 obj-$(CONFIG_DMA_OPS)  += dummy.o
 obj-$(CONFIG_DMA_CMA)  += contiguous.o
 obj-$(CONFIG_DMA_DECLARE_COHERENT) += coherent.o
-obj-$(CONFIG_DMA_VIRT_OPS) += virt.o
 obj-$(CONFIG_DMA_API_DEBUG)+= debug.o
 obj-$(CONFIG_SWIOTLB)  += swiotlb.o
 obj-$(CONFIG_DMA_COHERENT_POOL)+= pool.o
diff --git a/kernel/dma/virt.c b/kernel/dma/virt.c
deleted file mode 100644
index 59d32317dd574a..00
--- a/kernel/dma/virt.c
+++ /dev/null
@@ -1,61 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * DMA operations that map to virtual addresses without flushing memory.
- */
-#include 
-#include 
-#include 
-#include 
-
-static void *dma_virt_alloc(struct device *dev, size_t size,
-   dma_addr_t *dma_handle, gfp_t gfp,
-   unsigned long attrs)
-{
-   void *ret;
-
-   ret = (void *)__get_free_pages(gfp | __GFP_ZERO, get_order(size));
-   if (ret)
-   *dma_handle = (uintptr_t)ret;
-   return ret;
-}
-
-static void dma_virt_free(struct device *dev, size_t size,
- void *cpu_addr, dma_addr_t dma_addr,
- unsigned long attrs)
-{
-   free_pages((unsigned long)cpu_addr, get_order(size));
-}
-
-static dma_addr_t dma_virt_map_page(struct device *dev, struct page *page,
-   unsigned long offset, size_t size,
-   enum dma_data_direction dir,
-   unsigned long attrs)
-{
-   return (uintptr_t)(page_address(page) + offset);
-}
-
-static int dma_virt_map_sg(struct device *dev, struct scatterlist *sgl,
-  int nents, enum dma_data_direction dir,
-  unsigned long attrs)
-{
-   int i;
-   struct scatterlist *sg;
-
-   for_each_sg(sgl, sg, nents, i) {
-   BUG_ON(!sg_page(sg));
-   sg_dma_address(sg) = (uintptr_t)sg_virt(sg);
-   sg_dma_len(sg) = sg->length;
-   }
-
-   return nents;
-}
-
-const struct dma_map_ops dma_virt_ops = {
-   .alloc  = dma_virt_alloc,
-   .free   = dma_virt_free,
-   .map_page   = dma_virt_map_page,
-   .map_sg = dma_virt_map_sg,
-   .alloc_pages= dma_common_alloc_pages,
-   .free_pages = dma_common_free_pages,
-};
-EXPORT_SYMBOL(dma_virt_ops);
-- 
2.28.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 4/5] PCI/p2p: cleanup up __pci_p2pdma_map_sg a bit

2020-11-04 Thread Christoph Hellwig
Remove the pointless paddr variable that was only used once.

Signed-off-by: Christoph Hellwig 
---
 drivers/pci/p2pdma.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index b07018af53876c..afd792cc272832 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -825,13 +825,10 @@ static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap 
*p2p_pgmap,
struct device *dev, struct scatterlist *sg, int nents)
 {
struct scatterlist *s;
-   phys_addr_t paddr;
int i;
 
for_each_sg(sg, s, nents, i) {
-   paddr = sg_phys(s);
-
-   s->dma_address = paddr - p2p_pgmap->bus_offset;
+   s->dma_address = sg_phys(s) - p2p_pgmap->bus_offset;
sg_dma_len(s) = s->length;
}
 
-- 
2.28.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 3/5] PCI/p2p: remove the DMA_VIRT_OPS hacks

2020-11-04 Thread Christoph Hellwig
Now that all users of dma_virt_ops are gone we can remove the workaround
for it in the PCIe peer to peer code.

Signed-off-by: Christoph Hellwig 
---
 drivers/pci/p2pdma.c | 20 
 1 file changed, 20 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index de1c331dbed43f..b07018af53876c 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -556,15 +556,6 @@ int pci_p2pdma_distance_many(struct pci_dev *provider, 
struct device **clients,
return -1;
 
for (i = 0; i < num_clients; i++) {
-#ifdef CONFIG_DMA_VIRT_OPS
-   if (clients[i]->dma_ops == _virt_ops) {
-   if (verbose)
-   dev_warn(clients[i],
-"cannot be used for peer-to-peer DMA 
because the driver makes use of dma_virt_ops\n");
-   return -1;
-   }
-#endif
-
pci_client = find_parent_pci_dev(clients[i]);
if (!pci_client) {
if (verbose)
@@ -837,17 +828,6 @@ static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap 
*p2p_pgmap,
phys_addr_t paddr;
int i;
 
-   /*
-* p2pdma mappings are not compatible with devices that use
-* dma_virt_ops. If the upper layers do the right thing
-* this should never happen because it will be prevented
-* by the check in pci_p2pdma_distance_many()
-*/
-#ifdef CONFIG_DMA_VIRT_OPS
-   if (WARN_ON_ONCE(dev->dma_ops == _virt_ops))
-   return 0;
-#endif
-
for_each_sg(sg, s, nents, i) {
paddr = sg_phys(s);
 
-- 
2.28.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH 2/5] RDMA/core: remove use of dma_virt_ops

2020-11-04 Thread Christoph Hellwig
Use the ib_dma_* helpers to skip the DMA translation instead.  This
removes the last user if dma_virt_ops and keeps the weird layering
violation inside the RDMA core instead of burderning the DMA mapping
subsystems with it.  This also means the software RDMA drivers now
don't have to mess with DMA parameters that are not relevant to them
at all, and that in the future we can use PCI P2P transfers even for
software RDMA, as there is no first fake layer of DMA mapping that
the P2P DMA support.

Signed-off-by: Christoph Hellwig 
---
 drivers/infiniband/core/device.c  | 41 ++-
 drivers/infiniband/core/rw.c  |  2 +-
 drivers/infiniband/sw/rdmavt/Kconfig  |  3 +-
 drivers/infiniband/sw/rdmavt/mr.c |  6 +--
 drivers/infiniband/sw/rdmavt/vt.c |  8 
 drivers/infiniband/sw/rxe/Kconfig |  2 -
 drivers/infiniband/sw/rxe/rxe_verbs.c |  7 
 drivers/infiniband/sw/rxe/rxe_verbs.h |  1 -
 drivers/infiniband/sw/siw/Kconfig |  1 -
 drivers/infiniband/sw/siw/siw.h   |  1 -
 drivers/infiniband/sw/siw/siw_main.c  |  7 
 include/rdma/ib_verbs.h   | 59 ++-
 12 files changed, 65 insertions(+), 73 deletions(-)

diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c
index a3b1fc84cdcab9..528de41487521b 100644
--- a/drivers/infiniband/core/device.c
+++ b/drivers/infiniband/core/device.c
@@ -1177,25 +1177,6 @@ static int assign_name(struct ib_device *device, const 
char *name)
return ret;
 }
 
-static void setup_dma_device(struct ib_device *device,
-struct device *dma_device)
-{
-   /*
-* If the caller does not provide a DMA capable device then the IB
-* device will be used. In this case the caller should fully setup the
-* ibdev for DMA. This usually means using dma_virt_ops.
-*/
-#ifdef CONFIG_DMA_VIRT_OPS
-   if (!dma_device) {
-   device->dev.dma_ops = _virt_ops;
-   dma_device = >dev;
-   }
-#endif
-   WARN_ON(!dma_device);
-   device->dma_device = dma_device;
-   WARN_ON(!device->dma_device->dma_parms);
-}
-
 /*
  * setup_device() allocates memory and sets up data that requires calling the
  * device ops, this is the only reason these actions are not done during
@@ -1341,7 +1322,14 @@ int ib_register_device(struct ib_device *device, const 
char *name,
if (ret)
return ret;
 
-   setup_dma_device(device, dma_device);
+   /*
+* If the caller does not provide a DMA capable device then the IB core
+* will set up ib_sge and scatterlist structures that stash the kernel
+* virtual address into the address field.
+*/
+   device->dma_device = dma_device;
+   WARN_ON(dma_device && !dma_device->dma_parms);
+
ret = setup_device(device);
if (ret)
return ret;
@@ -2675,6 +2663,19 @@ void ib_set_device_ops(struct ib_device *dev, const 
struct ib_device_ops *ops)
 }
 EXPORT_SYMBOL(ib_set_device_ops);
 
+int ib_dma_virt_map_sg(struct ib_device *dev, struct scatterlist *sg, int 
nents)
+{
+   struct scatterlist *s;
+   int i;
+
+   for_each_sg(sg, s, nents, i) {
+   sg_dma_address(s) = (uintptr_t)sg_virt(s);
+   sg_dma_len(s) = s->length;
+   }
+   return nents;
+}
+EXPORT_SYMBOL(ib_dma_virt_map_sg);
+
 static const struct rdma_nl_cbs ibnl_ls_cb_table[RDMA_NL_LS_NUM_OPS] = {
[RDMA_NL_LS_OP_RESOLVE] = {
.doit = ib_nl_handle_resolve_resp,
diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 13f43ab7220b05..ae5a629ecc6772 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -276,7 +276,7 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, 
struct ib_qp *qp,
 static void rdma_rw_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
 u32 sg_cnt, enum dma_data_direction dir)
 {
-   if (is_pci_p2pdma_page(sg_page(sg)))
+   if (dev->dma_device && is_pci_p2pdma_page(sg_page(sg)))
pci_p2pdma_unmap_sg(dev->dma_device, sg, sg_cnt, dir);
else
ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
diff --git a/drivers/infiniband/sw/rdmavt/Kconfig 
b/drivers/infiniband/sw/rdmavt/Kconfig
index 9ef5f5ce1ff6b0..2de31692885cf0 100644
--- a/drivers/infiniband/sw/rdmavt/Kconfig
+++ b/drivers/infiniband/sw/rdmavt/Kconfig
@@ -1,8 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 config INFINIBAND_RDMAVT
tristate "RDMA verbs transport library"
-   depends on X86_64 && ARCH_DMA_ADDR_T_64BIT
+   depends on X86_64
depends on PCI
-   select DMA_VIRT_OPS
help
This is a common software verbs provider for RDMA networks.
diff --git a/drivers/infiniband/sw/rdmavt/mr.c 
b/drivers/infiniband/sw/rdmavt/mr.c
index 8490fdb9c91e50..90fc234f489acd 100644
--- a/drivers/infiniband/sw/rdmavt/mr.c
+++ 

[PATCH 1/5] RDMA/core: remove ib_dma_{alloc,free}_coherent

2020-11-04 Thread Christoph Hellwig
These two functions are entirely unused.

Signed-off-by: Christoph Hellwig 
---
 include/rdma/ib_verbs.h | 29 -
 1 file changed, 29 deletions(-)

diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index 9bf6c319a670e2..5f8fd7976034e0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4098,35 +4098,6 @@ static inline void ib_dma_sync_single_for_device(struct 
ib_device *dev,
dma_sync_single_for_device(dev->dma_device, addr, size, dir);
 }
 
-/**
- * ib_dma_alloc_coherent - Allocate memory and map it for DMA
- * @dev: The device for which the DMA address is requested
- * @size: The size of the region to allocate in bytes
- * @dma_handle: A pointer for returning the DMA address of the region
- * @flag: memory allocator flags
- */
-static inline void *ib_dma_alloc_coherent(struct ib_device *dev,
-  size_t size,
-  dma_addr_t *dma_handle,
-  gfp_t flag)
-{
-   return dma_alloc_coherent(dev->dma_device, size, dma_handle, flag);
-}
-
-/**
- * ib_dma_free_coherent - Free memory allocated by ib_dma_alloc_coherent()
- * @dev: The device for which the DMA addresses were allocated
- * @size: The size of the region
- * @cpu_addr: the address returned by ib_dma_alloc_coherent()
- * @dma_handle: the DMA address returned by ib_dma_alloc_coherent()
- */
-static inline void ib_dma_free_coherent(struct ib_device *dev,
-   size_t size, void *cpu_addr,
-   dma_addr_t dma_handle)
-{
-   dma_free_coherent(dev->dma_device, size, cpu_addr, dma_handle);
-}
-
 /* ib_reg_user_mr - register a memory region for virtual addresses from kernel
  * space. This function should be called when 'current' is the owning MM.
  */
-- 
2.28.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


remove dma_virt_ops

2020-11-04 Thread Christoph Hellwig
Hi Jason,

this series switches the RDMA core to opencode the special case of
devices bypassing the DMA mapping in the RDMA ULPs.  The virt ops
have caused a bit of trouble due to the P2P code node working with
them due to the fact that we'd do two dma mapping iterations for a
single I/O, but also are a bit of layering violation and lead to
more code than necessary.

Tested with nvme-rdma over rxe.

Diffstat:
 b/drivers/infiniband/core/device.c  |   41 +++---
 b/drivers/infiniband/core/rw.c  |2 
 b/drivers/infiniband/sw/rdmavt/Kconfig  |3 -
 b/drivers/infiniband/sw/rdmavt/mr.c |6 --
 b/drivers/infiniband/sw/rdmavt/vt.c |8 --
 b/drivers/infiniband/sw/rxe/Kconfig |2 
 b/drivers/infiniband/sw/rxe/rxe_verbs.c |7 --
 b/drivers/infiniband/sw/rxe/rxe_verbs.h |1 
 b/drivers/infiniband/sw/siw/Kconfig |1 
 b/drivers/infiniband/sw/siw/siw.h   |1 
 b/drivers/infiniband/sw/siw/siw_main.c  |7 --
 b/drivers/pci/p2pdma.c  |   25 -
 b/include/linux/dma-mapping.h   |2 
 b/include/rdma/ib_verbs.h   |   88 ++--
 b/kernel/dma/Kconfig|5 -
 b/kernel/dma/Makefile   |1 
 kernel/dma/virt.c   |   61 --
 17 files changed, 66 insertions(+), 195 deletions(-)
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v3 0/6] Add virtio-iommu built-in topology

2020-11-04 Thread Jean-Philippe Brucker
Hi Al,

On Tue, Nov 03, 2020 at 01:09:04PM -0700, Al Stone wrote:
> So, there are some questions about the VIOT definition and I just
> don't know enough to be able to answer them.  One of the ASWG members
> is trying to understand the semantics behind the subtables.

Thanks for the update. We dropped subtables a few versions ago, though, do
you have the latest v8?
https://jpbrucker.net/virtio-iommu/viot/viot-v8.pdf

> Is there a particular set of people, or mailing lists, that I can
> point to to get the questions answered?  Ideally it would be one
> of the public lists where it has already been discussed, but an
> individual would be fine, too.  No changes have been proposed, just
> some questions asked.

For a public list, I suggest iommu@lists.linux-foundation.org if we should
pick only one (otherwise add virtualizat...@lists.linux-foundation.org and
virtio-...@lists.oasis-open.org). I'm happy to answer any question, and
the folks on here are a good set to Cc:

eric.au...@redhat.com
jean-phili...@linaro.org
j...@8bytes.org
kevin.t...@intel.com
lorenzo.pieral...@arm.com
m...@redhat.com
sebastien.bo...@intel.com

Thanks,
Jean
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


RE: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled

2020-11-04 Thread Merger, Edgar [AUTOSOL/MAS/AUGS]
Hi Jörg,

AMD-Vi: Completion-Wait loop timed out is at [65499.964105] but amdgpu-error is 
at [   52.772273], hence much earlier.

Have not tried to use an upstream kernel yet. Which one would you recommend?

As far as inconsistencies in the PCI-setup is concerned, the only thing that I 
know of right now is that we haven´t entered a PCI subsystem vendor and device 
ID yet. It is still "Advanced Micro Devices". We will change that soon to 
"General Electric" or "Emerson".

Best regards,
Edgar

-Original Message-
From: jroe...@suse.de  
Sent: Mittwoch, 4. November 2020 09:53
To: Merger, Edgar [AUTOSOL/MAS/AUGS] 
Cc: iommu@lists.linux-foundation.org
Subject: [EXTERNAL] Re: amdgpu error whenever IOMMU is enabled

Hi Edgar,

On Fri, Oct 30, 2020 at 02:26:23PM +, Merger, Edgar [AUTOSOL/MAS/AUGS] 
wrote:
> With one board we have a boot-problem that is reproducible at every ~50 boot.
> The system is accessible via ssh and works fine except for the 
> Graphics. The graphics is off. We don´t see a screen. Please see 
> attached “dmesg.log”. From [52.772273] onwards the kernel reports 
> drm/amdgpu errors. It even tries to reset the GPU but that fails too. 
> I tried to reset amdgpu also by command “sudo cat 
> /sys/kernel/debug/dri/N/amdgpu_gpu_recover”. That did not help either.

Can you reproduce the problem with an upstream kernel too?

These messages in dmesg indicate some problem in the platform setup:

AMD-Vi: Completion-Wait loop timed out

Might there be some inconsistencies in the PCI setup between the bridges and 
the endpoints or something?

Regards,

Joerg
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: amdgpu error whenever IOMMU is enabled

2020-11-04 Thread jroe...@suse.de
Hi Edgar,

On Fri, Oct 30, 2020 at 02:26:23PM +, Merger, Edgar [AUTOSOL/MAS/AUGS] 
wrote:
> With one board we have a boot-problem that is reproducible at every ~50 boot.
> The system is accessible via ssh and works fine except for the Graphics. The
> graphics is off. We don´t see a screen. Please see attached “dmesg.log”. From
> [52.772273] onwards the kernel reports drm/amdgpu errors. It even tries to
> reset the GPU but that fails too. I tried to reset amdgpu also by command 
> “sudo
> cat /sys/kernel/debug/dri/N/amdgpu_gpu_recover”. That did not help either.

Can you reproduce the problem with an upstream kernel too?

These messages in dmesg indicate some problem in the platform setup:

AMD-Vi: Completion-Wait loop timed out

Might there be some inconsistencies in the PCI setup between the bridges
and the endpoints or something?

Regards,

Joerg
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH v2 3/4] iommu/iova: Flush CPU rcache for when a depot fills

2020-11-04 Thread John Garry

Hi Robin,


- then cpu3, cpu4, and so on.
- We can do this for all CPUs in the system, so total CPU rcache grows 
from zero -> #CPUs * 128 * 2. Yet no DMA mapping leaks.


I get that. That's the initial warm-up phase I alluded to below. In an 
even simpler example, allocating on CPU A and freeing on CPU B will 
indeed move IOVAs from the tree into magazines without reuse, but only 
up to a point. Eventually, CPU B's cache fills up and pushes a magazine 
into the depot, and at *that* point things reach a steady state, since 
the next allocation on CPU A will then pull that magazine from the depot 
and proceed to allocate from there. If allocs and frees stay perfectly 
balanced, the working set is then 3 magazines. Yes, the depot can fill 
up if the number of IOVAs that CPU B frees at once before CPU A 
reallocates them is comparable to the total depot capacity, but it can't 
reasonably *stay* full unless CPU A stops allocating altogether.


Something similar can happen in normal use, where the scheduler 
relocates processes all over the CPUs in the system as time goes by, 
which causes the total rcache size to continue to grow. And in 
addition to this, the global depot continues to grow very slowly as 
well. But when it does fill (the global depot, that is), and we start 
to free magazines to make space – as is current policy - that's very 
slow and causes the performance drop.


Sure, but how does it then consistently *remain* in that state? And 
*why* does the depot slowly and steadily grow in the first place if 
alloc and free are ultimately balanced? 


So some key info I missed sharing was that we only see this issue for 
non-strict mode. For strict mode, the rcache occupancy stays quite 
compact, and does not grow like we see for non-strict mode.


I have some (very large) kernel logs in which all the CPU and depot 
rcache occupancy levels are periodically dumped, and where you can get 
an idea of the trend.


I'm on vacation today, so I can share them tomorrow.

I can get the depot swinging 
between full and empty if it's simply too small to bounce magazines 
between a large number of "CPU A"s and "CPU B"s, but again, that's 
surely going to show as repeated performance swings between bad at each 
end and good in the middle, not a steady degradation.


Yeah, so I see the depot max size (32) is adequate in size, such that 
this does not happen.





Now indeed that could happen over the short term if IOVAs are allocated
and freed again in giant batches larger than the total global cache
capacity, but that would show a cyclic behaviour - when activity starts,
everything is first allocated straight from the tree, then when it ends
the caches would get overwhelmed by the large burst of freeing and start
having to release things back to the tree, but eventually that would
stop once everything *is* freed, then when activity begins again the
next round of allocating would inherently clear out all the caches
before going anywhere near the tree. 


But there is no clearing. A CPU will keep the IOVA cached 
indefinitely, even when there is no active DMA mapping present at all.


Sure, the percpu caches can buffer IOVAs for an indefinite amount of 
time depending on how many CPUs are active, but the depot usage is still 
absolutely representative of the total working set for whichever CPUs 
*are* active. In this whole discussion I'm basically just considering 
the percpu caches as pipeline stages for serialising IOVAs into and out 
of magazines. It's the motion of magazines that's the interesting part.


If the depot keeps continually filling up, *some* CPUs are freeing 
enough IOVAs to push out full magazines, and those IOVAs have to come 
from somewhere, so *some* CPUs are allocating, and those CPUs can't 
allocate forever without taking magazines back out of the depot (that's 
the "clearing out" I meant). Something about a steady degradation that 
never shows any sign of recovery (even periodically) just doesn't seem 
to add up.


Anyway, by now I think it would be most interesting to get rid of this 
bottleneck completely rather than dance around bodging it, and see what 
happens if we simply let the depot grow to fit the maximum working set, 
so I did this:


https://gitlab.arm.com/linux-arm/linux-rm/-/commits/iommu/iova

Only compile-tested, so probably full of trivial bugs, but I'm curious 
to see if the slight extra overhead to depot management is noticeable in 
normal cases.


So allowing the depot size to grow unbounded should solve that immediate 
issue.


However, I'd like to see a move in the opposite direction, that is to 
trim the rcaches at some intervals. Indeed, with an appreciable 
frequency, IOVA rcache allocation requests may not be satisfied due to 
size limit - we see this for our same storage scenario, where some IOVA 
requests are > 200K in size, and must use the tree. So allowing the 
rcaches to grow further just makes handling these requests slower.


Thanks,
John

[PATCH] iommu/vt-d: remove redundant variable no_platform_optin

2020-11-04 Thread Zhenzhong Duan
no_platform_optin is redundant with dmar_disabled and it's only used in
platform_optin_force_iommu(), remove it and use dmar_disabled instead.

Meanwhile remove all the dead code in platform_optin_force_iommu().

Signed-off-by: Zhenzhong Duan 
---
 drivers/iommu/intel/iommu.c | 14 ++
 1 file changed, 2 insertions(+), 12 deletions(-)

diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 8651f6d4dfa0..a011d1ed63ef 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -180,7 +180,6 @@ static int rwbf_quirk;
  */
 static int force_on = 0;
 int intel_iommu_tboot_noforce;
-static int no_platform_optin;
 
 #define ROOT_ENTRY_NR (VTD_PAGE_SIZE/sizeof(struct root_entry))
 
@@ -440,7 +439,6 @@ static int __init intel_iommu_setup(char *str)
pr_info("IOMMU enabled\n");
} else if (!strncmp(str, "off", 3)) {
dmar_disabled = 1;
-   no_platform_optin = 1;
pr_info("IOMMU disabled\n");
} else if (!strncmp(str, "igfx_off", 8)) {
dmar_map_gfx = 0;
@@ -4810,20 +4808,12 @@ static inline bool has_external_pci(void)
 
 static int __init platform_optin_force_iommu(void)
 {
-   if (!dmar_platform_optin() || no_platform_optin || !has_external_pci())
+   if (!dmar_platform_optin() || dmar_disabled || !has_external_pci())
return 0;
 
-   if (no_iommu || dmar_disabled)
+   if (no_iommu)
pr_info("Intel-IOMMU force enabled due to platform opt in\n");
 
-   /*
-* If Intel-IOMMU is disabled by default, we will apply identity
-* map for all devices except those marked as being untrusted.
-*/
-   if (dmar_disabled)
-   iommu_set_default_passthrough(false);
-
-   dmar_disabled = 0;
no_iommu = 0;
 
return 1;
-- 
2.25.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: use of dma_direct_set_offset in (allwinner) drivers

2020-11-04 Thread Maxime Ripard
Hi Christoph,

On Tue, Nov 03, 2020 at 10:55:38AM +0100, Christoph Hellwig wrote:
> Linux 5.10-rc1 switched from having a single dma offset in struct device
> to a set of DMA ranges, and introduced a new helper to set them,
> dma_direct_set_offset.
> 
> This in fact surfaced that a bunch of drivers that violate our layering
> and set the offset from drivers, which meant we had to reluctantly
> export the symbol to set up the DMA range.
> 
> The drivers are:
> 
> drivers/gpu/drm/sun4i/sun4i_backend.c
> 
>   This just use dma_direct_set_offset as a fallback.  Is there any good
>   reason to not just kill off the fallback?
> 
> drivers/media/platform/sunxi/sun4i-csi/sun4i_csi.c
> 
>   Same as above.

So, the history of this is:

  - We initially introduced the support for those two controllers
assuming that there was a direct mapping between the physical and
DMA addresses. It turns out it didn't and the DMA accesses were
going through a secondary, dedicated, bus that didn't have the same
mapping of the RAM than the CPU.

4690803b09c6 ("drm/sun4i: backend: Offset layer buffer address by DRAM 
starting address")

  - This dedicated bus is undocumented and barely used in the vendor
kernel so this was overlooked, and it's fairly hard to get infos on
it for all the SoCs we support. We added the DT support for it
though on some SoCs we had enough infos to do so:

c43a4469402f ("dt-bindings: interconnect: Add a dma interconnect name")
22f88e311399 ("ARM: dts: sun5i: Add the MBUS controller")

This explains the check on the interconnect property

  - However, due to the stable DT rule, we still need to operate without
regressions on older DTs that wouldn't have that property (and for
SoCs we haven't figured out). Hence the fallback.

> drivers/media/platform/sunxi/sun6i-csi/sun6i_csi.c
> 
>   This driver unconditionally sets the offset.  Why can't we do this
>   in the device tree?
> 
> drivers/staging/media/sunxi/cedrus/cedrus_hw.c
> 
>   Same as above.
>

We should make those two match the previous ones, but we'll have the
same issue here eventually. Most likely they were never ran on an SoC
for which we have the MBUS figured out.

Maxime


signature.asc
Description: PGP signature
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu