[PATCH 05/16] dma-mapping: Introduce dma_map_sg_p2pdma()

2021-04-08 Thread Logan Gunthorpe
dma_map_sg() either returns a positive number indicating the number
of entries mapped or zero indicating that resources were not available
to create the mapping. When zero is returned, it is always safe to retry
the mapping later once resources have been freed.

Once P2PDMA pages are mixed into the SGL there may be pages that can
never be successfully mapped with a given device because that device may
not actually be able to access those pages. Thus, multiple error
conditions will need to be distinguished to determine whether a retry
is safe.

Introduce dma_map_sg_p2pdma[_attrs]() with a different calling
convention from dma_map_sg(). The function will return a positive
integer on success or a negative errno on failure.

ENOMEM will be used to indicate a resource failure and EREMOTEIO to
indicate that a P2PDMA page is not mappable.

The __DMA_ATTR_PCI_P2PDMA attribute is introduced to inform the lower
level implementations that P2PDMA pages are allowed and to warn if a
caller introduces them into the regular dma_map_sg() interface.
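
For illustration only, a caller of the new interface distinguishes the
two failure modes roughly like this (sketch; example_map() is a
hypothetical driver function):

	static int example_map(struct device *dev, struct scatterlist *sgl,
			       int nents)
	{
		int rc;

		rc = dma_map_sg_p2pdma(dev, sgl, nents, DMA_TO_DEVICE);
		if (rc == -ENOMEM)
			return rc;	/* resources exhausted; safe to retry later */
		if (rc < 0)
			return rc;	/* e.g. -EREMOTEIO: P2PDMA page not mappable */

		return rc;	/* > 0: entries mapped, as with dma_map_sg() */
	}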

Signed-off-by: Logan Gunthorpe 
---
 include/linux/dma-mapping.h | 15 +++
 kernel/dma/mapping.c| 52 -
 2 files changed, 61 insertions(+), 6 deletions(-)

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2a984cb4d1e0..50b8f586cf59 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -60,6 +60,12 @@
  * at least read-only at lesser-privileged levels).
  */
 #define DMA_ATTR_PRIVILEGED(1UL << 9)
+/*
+ * __DMA_ATTR_PCI_P2PDMA: This should not be used directly, use
+ * dma_map_sg_p2pdma() instead. Used internally to indicate that the
+ * caller is using the dma_map_sg_p2pdma() interface.
+ */
+#define __DMA_ATTR_PCI_P2PDMA  (1UL << 10)
 
 /*
  * A dma_addr_t can hold any valid DMA or bus address for the platform.  It can
@@ -107,6 +113,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t 
addr, size_t size,
enum dma_data_direction dir, unsigned long attrs);
 int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, unsigned long attrs);
+int dma_map_sg_p2pdma_attrs(struct device *dev, struct scatterlist *sg,
+   int nents, enum dma_data_direction dir, unsigned long attrs);
 void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
  int nents, enum dma_data_direction dir,
  unsigned long attrs);
@@ -160,6 +168,12 @@ static inline int dma_map_sg_attrs(struct device *dev, 
struct scatterlist *sg,
 {
return 0;
 }
+static inline int dma_map_sg_p2pdma_attrs(struct device *dev,
+   struct scatterlist *sg, int nents, enum dma_data_direction dir,
+   unsigned long attrs)
+{
+   return 0;
+}
 static inline void dma_unmap_sg_attrs(struct device *dev,
struct scatterlist *sg, int nents, enum dma_data_direction dir,
unsigned long attrs)
@@ -392,6 +406,7 @@ static inline void dma_sync_sgtable_for_device(struct 
device *dev,
 #define dma_map_single(d, a, s, r) dma_map_single_attrs(d, a, s, r, 0)
 #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0)
 #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0)
+#define dma_map_sg_p2pdma(d, s, n, r) dma_map_sg_p2pdma_attrs(d, s, n, r, 0)
 #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0)
 #define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0)
 #define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0)
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b6a633679933..923089c4267b 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -177,12 +177,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t 
addr, size_t size,
 }
 EXPORT_SYMBOL(dma_unmap_page_attrs);
 
-/*
- * dma_maps_sg_attrs returns 0 on error and > 0 on success.
- * It should never return a value < 0.
- */
-int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
-   enum dma_data_direction dir, unsigned long attrs)
+static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
+   int nents, enum dma_data_direction dir, unsigned long attrs)
 {
const struct dma_map_ops *ops = get_dma_ops(dev);
int ents;
@@ -197,6 +193,20 @@ int dma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg, int nents,
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
+
+   return ents;
+}
+
+/*
+ * dma_maps_sg_attrs returns 0 on error and > 0 on success.
+ * It should never return a value < 0.
+ */
+int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
+   enum dma_data_direction dir, unsigned long

[PATCH 06/16] lib/scatterlist: Add flag for indicating P2PDMA segments in an SGL

2021-04-08 Thread Logan Gunthorpe
Make use of the third free LSB in scatterlist's page_link on 64bit systems.

The extra bit will be used by dma_[un]map_sg_p2pdma() to determine when a
given SGL segment's dma_address points to a PCI bus address.
dma_unmap_sg_p2pdma() will need to perform different cleanup when a
segment is marked as P2PDMA.

Using this bit requires adding an additional dependency on CONFIG_64BIT to
CONFIG_PCI_P2PDMA. This should be acceptable as the majority of P2PDMA
use cases are restricted to newer root complexes and roughly require the
extra address space for memory BARs used in the transactions.
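
For illustration, an unmap implementation consumes the new bit roughly
as follows (sketch; the real users arrive in the dma-direct and
dma-iommu patches later in this series):

	for_each_sg(sgl, sg, nents, i) {
		if (sg_is_pci_p2pdma(sg)) {
			/* PCI bus address: no mapping resources to release */
			sg_unmark_pci_p2pdma(sg);
			continue;
		}
		dma_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir);
	}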

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/Kconfig |  2 +-
 include/linux/scatterlist.h | 49 ++---
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 0c473d75e625..90b4bddb3300 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -163,7 +163,7 @@ config PCI_PASID
 
 config PCI_P2PDMA
bool "PCI peer-to-peer transfer support"
-   depends on ZONE_DEVICE
+   depends on ZONE_DEVICE && 64BIT
select GENERIC_ALLOCATOR
help
	  Enables drivers to do PCI peer-to-peer transactions to and from
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 6f70572b2938..5525d3ebf36f 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -58,6 +58,21 @@ struct sg_table {
 #define SG_CHAIN   0x01UL
 #define SG_END 0x02UL
 
+/*
+ * bit 2 is the third free bit in the page_link on 64bit systems which
+ * is used by dma_unmap_sg() to determine if the dma_address is a PCI
+ * bus address when doing P2PDMA.
+ * Note: CONFIG_PCI_P2PDMA depends on CONFIG_64BIT because of this.
+ */
+
+#ifdef CONFIG_PCI_P2PDMA
+#define SG_PCI_P2PDMA  0x04UL
+#else
+#define SG_PCI_P2PDMA  0x00UL
+#endif
+
+#define SG_PAGE_LINK_MASK (SG_CHAIN | SG_END | SG_PCI_P2PDMA)
+
 /*
  * We overload the LSB of the page pointer to indicate whether it's
  * a valid sg entry, or whether it points to the start of a new scatterlist.
@@ -65,8 +80,9 @@ struct sg_table {
  */
 #define sg_is_chain(sg)((sg)->page_link & SG_CHAIN)
 #define sg_is_last(sg) ((sg)->page_link & SG_END)
+#define sg_is_pci_p2pdma(sg)   ((sg)->page_link & SG_PCI_P2PDMA)
 #define sg_chain_ptr(sg)   \
-   ((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END)))
+   ((struct scatterlist *) ((sg)->page_link & ~SG_PAGE_LINK_MASK))
 
 /**
  * sg_assign_page - Assign a given page to an SG entry
@@ -80,13 +96,13 @@ struct sg_table {
  **/
 static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
 {
-   unsigned long page_link = sg->page_link & (SG_CHAIN | SG_END);
+   unsigned long page_link = sg->page_link & SG_PAGE_LINK_MASK;
 
/*
 * In order for the low bit stealing approach to work, pages
 * must be aligned at a 32-bit boundary as a minimum.
 */
-   BUG_ON((unsigned long) page & (SG_CHAIN | SG_END));
+   BUG_ON((unsigned long) page & SG_PAGE_LINK_MASK);
 #ifdef CONFIG_DEBUG_SG
BUG_ON(sg_is_chain(sg));
 #endif
@@ -120,7 +136,7 @@ static inline struct page *sg_page(struct scatterlist *sg)
 #ifdef CONFIG_DEBUG_SG
BUG_ON(sg_is_chain(sg));
 #endif
-   return (struct page *)((sg)->page_link & ~(SG_CHAIN | SG_END));
+   return (struct page *)((sg)->page_link & ~SG_PAGE_LINK_MASK);
 }
 
 /**
@@ -222,6 +238,31 @@ static inline void sg_unmark_end(struct scatterlist *sg)
sg->page_link &= ~SG_END;
 }
 
+/**
+ * sg_mark_pci_p2pdma - Mark the scatterlist entry for PCI p2pdma
+ * @sg: SG entry
+ *
+ * Description:
+ *   Marks the passed in sg entry to indicate that the dma_address is
+ *   a PCI bus address.
+ **/
+static inline void sg_mark_pci_p2pdma(struct scatterlist *sg)
+{
+   sg->page_link |= SG_PCI_P2PDMA;
+}
+
+/**
+ * sg_unmark_pci_p2pdma - Unmark the scatterlist entry for PCI p2pdma
+ * @sg: SG entry
+ *
+ * Description:
+ *   Clears the PCI P2PDMA mark
+ **/
+static inline void sg_unmark_pci_p2pdma(struct scatterlist *sg)
+{
+   sg->page_link &= ~SG_PCI_P2PDMA;
+}
+
 /**
  * sg_phys - Return physical address of an sg entry
  * @sg: SG entry
-- 
2.20.1



[PATCH 07/16] PCI/P2PDMA: Make pci_p2pdma_map_type() non-static

2021-04-08 Thread Logan Gunthorpe
pci_p2pdma_map_type() will be needed by the dma-iommu map_sg
implementation because it will need to determine the mapping type
before actually creating the IOMMU mapping.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c   | 34 +++---
 include/linux/pci-p2pdma.h | 15 +++
 2 files changed, 38 insertions(+), 11 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index bcb1a6d6119d..38c93f57a941 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -20,13 +20,6 @@
 #include 
 #include 
 
-enum pci_p2pdma_map_type {
-   PCI_P2PDMA_MAP_UNKNOWN = 0,
-   PCI_P2PDMA_MAP_NOT_SUPPORTED,
-   PCI_P2PDMA_MAP_BUS_ADDR,
-   PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
-};
-
 struct pci_p2pdma {
struct gen_pool *pool;
bool p2pmem_published;
@@ -822,13 +815,30 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool 
publish)
 }
 EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
 
-static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
-   struct device *dev)
+/**
+ * pci_p2pdma_map_type - return the type of mapping that should be used for
+ * a given device and pgmap
+ * @pgmap: the pagemap of a page to determine the mapping type for
+ * @dev: device that is mapping the page
+ * @dma_attrs: the attributes passed to the dma_map operation --
+ * this is so they can be checked to ensure P2PDMA pages were not
+ * introduced into an incorrect interface (like dma_map_sg).
+ *
+ * Returns one of:
+ * PCI_P2PDMA_MAP_NOT_SUPPORTED - The mapping should not be done
+ * PCI_P2PDMA_MAP_BUS_ADDR - The mapping should use the PCI bus address
+ * PCI_P2PDMA_MAP_THRU_HOST_BRIDGE - The mapping should be done directly
+ */
+enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
+   struct device *dev, unsigned long dma_attrs)
 {
struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider;
enum pci_p2pdma_map_type ret;
struct pci_dev *client;
 
+   WARN_ONCE(!(dma_attrs & __DMA_ATTR_PCI_P2PDMA),
+ "PCI P2PDMA pages were mapped with dma_map_sg!");
+
if (!provider->p2pdma)
return PCI_P2PDMA_MAP_NOT_SUPPORTED;
 
@@ -879,7 +889,8 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg,
struct pci_p2pdma_pagemap *p2p_pgmap =
to_p2p_pgmap(sg_page(sg)->pgmap);
 
-   switch (pci_p2pdma_map_type(sg_page(sg)->pgmap, dev)) {
+   switch (pci_p2pdma_map_type(sg_page(sg)->pgmap, dev,
+   __DMA_ATTR_PCI_P2PDMA)) {
case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
return dma_map_sg_attrs(dev, sg, nents, dir, attrs);
case PCI_P2PDMA_MAP_BUS_ADDR:
@@ -904,7 +915,8 @@ void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct 
scatterlist *sg,
 {
enum pci_p2pdma_map_type map_type;
 
-   map_type = pci_p2pdma_map_type(sg_page(sg)->pgmap, dev);
+   map_type = pci_p2pdma_map_type(sg_page(sg)->pgmap, dev,
+  __DMA_ATTR_PCI_P2PDMA);
 
if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
dma_unmap_sg_attrs(dev, sg, nents, dir, attrs);
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 8318a97c9c61..a06072ac3a52 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -16,6 +16,13 @@
 struct block_device;
 struct scatterlist;
 
+enum pci_p2pdma_map_type {
+   PCI_P2PDMA_MAP_UNKNOWN = 0,
+   PCI_P2PDMA_MAP_NOT_SUPPORTED,
+   PCI_P2PDMA_MAP_BUS_ADDR,
+   PCI_P2PDMA_MAP_THRU_HOST_BRIDGE,
+};
+
 #ifdef CONFIG_PCI_P2PDMA
 int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
u64 offset);
@@ -30,6 +37,8 @@ struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev *pdev,
 unsigned int *nents, u32 length);
 void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl);
 void pci_p2pmem_publish(struct pci_dev *pdev, bool publish);
+enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
+   struct device *dev, unsigned long dma_attrs);
 int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs);
 void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
@@ -83,6 +92,12 @@ static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev,
 static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
 {
 }
+static inline enum pci_p2pdma_map_type pci_p2pdma_map_type(
+   struct dev_pagemap *pgmap, struct device *dev,
+   unsigned long dma_attrs)
+{
+   return PCI_P2PDMA_MAP_NOT_SUPPORTED;
+}
 static inline int pci_p2pdma_map_sg_attrs(struct device *dev,

[PATCH 02/16] PCI/P2PDMA: Avoid pci_get_slot() which sleeps

2021-04-08 Thread Logan Gunthorpe
In order to use upstream_bridge_distance_warn() from a dma_map function,
it must not sleep. However, pci_get_slot() takes the pci_bus_sem so it
might sleep.

In order to avoid this, try to get the host bridge's device from
bus->self, and if that is not set, just get the first element in the
device list. It should be impossible for the host bridge's device to
go away while references are held on child devices, so the first element
should not be able to change and, thus, this should be safe.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c | 14 --
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index bd89437faf06..473a08940fbc 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -311,16 +311,26 @@ static const struct pci_p2pdma_whitelist_entry {
 static bool __host_bridge_whitelist(struct pci_host_bridge *host,
bool same_host_bridge)
 {
-   struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0));
const struct pci_p2pdma_whitelist_entry *entry;
+   struct pci_dev *root = host->bus->self;
unsigned short vendor, device;
 
+   /*
+* This makes the assumption that the first device on the bus is the
+* bridge itself and it has the devfn of 00.0. This assumption should
+* hold for the devices in the white list above, and if there are cases
+* where this isn't true they will have to be dealt with when such a
+* case is added to the whitelist.
+*/
if (!root)
+		root = list_first_entry_or_null(&host->bus->devices,
+   struct pci_dev, bus_list);
+
+   if (!root || root->devfn)
return false;
 
vendor = root->vendor;
device = root->device;
-   pci_dev_put(root);
 
for (entry = pci_p2pdma_whitelist; entry->vendor; entry++) {
if (vendor != entry->vendor || device != entry->device)
-- 
2.20.1



[PATCH 08/16] PCI/P2PDMA: Introduce helpers for dma_map_sg implementations

2021-04-08 Thread Logan Gunthorpe
Add pci_p2pdma_map_segment() as a helper for simple dma_map_sg()
implementations. It takes a scatterlist segment that must point to a
pci_p2pdma struct page and will map it if the mapping requires a bus
address.

The return value indicates whether the mapping required a bus address
or whether the caller still needs to map the segment normally. If the
segment should not be mapped, -EREMOTEIO is returned.

This helper uses a state structure to track the changes to the
pgmap across calls and avoid needing to look up the mapping type in the
xarray for every page.

Also add pci_p2pdma_map_bus_segment() which is useful for IOMMU
dma_map_sg() implementations where the sg segment containing the page
differs from the sg segment containing the DMA address.
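
For illustration, a simple map_sg() implementation would use the helper
roughly like this (sketch; dma-direct gains the real version in a later
patch of this series):

	struct pci_p2pdma_map_state p2pdma_state = {};
	struct scatterlist *sg;
	int i, ret;

	for_each_sg(sgl, sg, nents, i) {
		if (is_pci_p2pdma_page(sg_page(sg))) {
			ret = pci_p2pdma_map_segment(&p2pdma_state, dev, sg,
						     attrs);
			if (ret < 0)
				return ret;	/* must not be mapped at all */
			if (ret)
				continue;	/* mapped to a PCI bus address */
		}
		/* ret == 0: fall through and map the segment normally */
	}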

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c   | 65 ++
 include/linux/pci-p2pdma.h | 21 
 2 files changed, 86 insertions(+)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 38c93f57a941..44ad7664e875 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -923,6 +923,71 @@ void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct 
scatterlist *sg,
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
 
+/**
+ * pci_p2pdma_map_segment - map an sg segment determining the mapping type
+ * @state: State structure that should be declared on the stack outside of
+ * the for_each_sg() loop and initialized to zero.
+ * @dev: DMA device that's doing the mapping operation
+ * @sg: scatterlist segment to map
+ * @attrs: dma mapping attributes
+ *
+ * This is a helper to be used by non-iommu dma_map_sg() implementations where
+ * the sg segment is the same for the page_link and the dma_address.
+ *
+ * Attempt to map a single segment in an SGL with the PCI bus address.
+ * The segment must point to a PCI P2PDMA page and thus must be
+ * wrapped in an is_pci_p2pdma_page(sg_page(sg)) check.
+ *
+ * Returns 1 if the segment was mapped, 0 if the segment should be mapped
+ * directly (or through the IOMMU) and -EREMOTEIO if the segment should not
+ * be mapped at all.
+ */
+int pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state,
+  struct device *dev, struct scatterlist *sg,
+  unsigned long dma_attrs)
+{
+   if (state->pgmap != sg_page(sg)->pgmap) {
+   state->pgmap = sg_page(sg)->pgmap;
+   state->map = pci_p2pdma_map_type(state->pgmap, dev, dma_attrs);
+   state->bus_off = to_p2p_pgmap(state->pgmap)->bus_offset;
+   }
+
+   switch (state->map) {
+   case PCI_P2PDMA_MAP_BUS_ADDR:
+   sg->dma_address = sg_phys(sg) + state->bus_off;
+   sg_dma_len(sg) = sg->length;
+   sg_mark_pci_p2pdma(sg);
+   return 1;
+   case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+   return 0;
+   default:
+   return -EREMOTEIO;
+   }
+}
+
+/**
+ * pci_p2pdma_map_bus_segment - map an sg segment predetermined to
+ * be mapped with PCI_P2PDMA_MAP_BUS_ADDR
+ * @pg_sg: scatterlist segment with the page to map
+ * @dma_sg: scatterlist segment to assign a dma address to
+ *
+ * This is a helper for iommu dma_map_sg() implementations when the
+ * segment for the dma address differs from the segment containing the
+ * source page.
+ *
+ * pci_p2pdma_map_type() must have already been called on the pg_sg and
+ * returned PCI_P2PDMA_MAP_BUS_ADDR.
+ */
+void pci_p2pdma_map_bus_segment(struct scatterlist *pg_sg,
+   struct scatterlist *dma_sg)
+{
+   struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(sg_page(pg_sg)->pgmap);
+
+   dma_sg->dma_address = sg_phys(pg_sg) + pgmap->bus_offset;
+   sg_dma_len(dma_sg) = pg_sg->length;
+   sg_mark_pci_p2pdma(dma_sg);
+}
+
 /**
  * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store
  * to enable p2pdma
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index a06072ac3a52..49e7679403cf 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -13,6 +13,12 @@
 
 #include 
 
+struct pci_p2pdma_map_state {
+   struct dev_pagemap *pgmap;
+   int map;
+   u64 bus_off;
+};
+
 struct block_device;
 struct scatterlist;
 
@@ -43,6 +49,11 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs);
 void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs);
+int pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state,
+   struct device *dev, struct scatterlist *sg,
+   unsigned long dma_attrs);
+void pci_p2pdma_map_bus_segment(struct scatterlist *pg_sg,
+   struct scatterlist *dma_sg);
 int pci_p2pdma_enab

[PATCH 13/16] nvme-pci: Convert to using dma_map_sg_p2pdma for p2pdma pages

2021-04-08 Thread Logan Gunthorpe
Convert to using dma_map_sg_p2pdma() for PCI p2pdma pages.

This should be equivalent but allows for heterogeneous scatterlists
with both P2PDMA and regular pages. However, P2PDMA support will be
slightly more restricted (only dma-direct and dma-iommu are currently
supported).

Signed-off-by: Logan Gunthorpe 
---
 drivers/nvme/host/pci.c | 28 
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 14f092973792..a1ed07ff38b7 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -577,17 +577,6 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct 
request *req)
 
 }
 
-static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
-{
-   struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
-   if (is_pci_p2pdma_page(sg_page(iod->sg)))
-   pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
-   rq_dma_dir(req));
-   else
-   dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
-}
-
 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 {
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -600,7 +589,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct 
request *req)
 
WARN_ON_ONCE(!iod->nents);
 
-   nvme_unmap_sg(dev, req);
+   dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
if (iod->npages == 0)
dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
  iod->first_dma);
@@ -868,14 +857,13 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, 
struct request *req,
if (!iod->nents)
goto out_free_sg;
 
-   if (is_pci_p2pdma_page(sg_page(iod->sg)))
-   nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
-   iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
-   else
-   nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
-rq_dma_dir(req), DMA_ATTR_NO_WARN);
-   if (!nr_mapped)
+   nr_mapped = dma_map_sg_p2pdma_attrs(dev->dev, iod->sg, iod->nents,
+rq_dma_dir(req), DMA_ATTR_NO_WARN);
+   if (nr_mapped < 0) {
+   if (nr_mapped != -ENOMEM)
+   ret = BLK_STS_TARGET;
goto out_free_sg;
+   }
 
iod->use_sgl = nvme_pci_use_sgls(dev, req);
if (iod->use_sgl)
@@ -887,7 +875,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, 
struct request *req,
return BLK_STS_OK;
 
 out_unmap_sg:
-   nvme_unmap_sg(dev, req);
+   dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
 out_free_sg:
mempool_free(iod->sg, dev->iod_mempool);
return ret;
-- 
2.20.1



[PATCH 09/16] dma-direct: Support PCI P2PDMA pages in dma-direct map_sg

2021-04-08 Thread Logan Gunthorpe
Add PCI P2PDMA support for dma_direct_map_sg() so that it can map
PCI P2PDMA pages directly without a hack in the callers. This allows
for heterogeneous SGLs that contain both P2PDMA and regular pages.

SGL segments that contain PCI bus addresses are marked with
sg_mark_pci_p2pdma() and are ignored when unmapped.

Signed-off-by: Logan Gunthorpe 
---
 kernel/dma/direct.c | 25 ++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 002268262c9a..108dfb4ecbd5 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "direct.h"
 
 /*
@@ -387,19 +388,37 @@ void dma_direct_unmap_sg(struct device *dev, struct 
scatterlist *sgl,
struct scatterlist *sg;
int i;
 
-   for_each_sg(sgl, sg, nents, i)
+   for_each_sg(sgl, sg, nents, i) {
+   if (sg_is_pci_p2pdma(sg)) {
+   sg_unmark_pci_p2pdma(sg);
+   continue;
+   }
+
dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
 attrs);
+   }
 }
 #endif
 
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
 {
-   int i;
+   struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg;
+   int i, ret = 0;
 
for_each_sg(sgl, sg, nents, i) {
+   if (is_pci_p2pdma_page(sg_page(sg))) {
+			ret = pci_p2pdma_map_segment(&p2pdma_state, dev, sg,
+attrs);
+   if (ret < 0) {
+   goto out_unmap;
+   } else if (ret) {
+   ret = 0;
+   continue;
+   }
+   }
+
sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
sg->offset, sg->length, dir, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR)
@@ -411,7 +430,7 @@ int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl, int nents,
 
 out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
-   return 0;
+   return ret;
 }
 
 dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
-- 
2.20.1



[PATCH 11/16] iommu/dma: Support PCI P2PDMA pages in dma-iommu map_sg

2021-04-08 Thread Logan Gunthorpe
When a PCI P2PDMA page is seen, set the IOVA length of the segment
to zero so that it is not mapped into the IOVA. Then, in finalise_sg(),
apply the appropriate bus address to the segment. The IOVA is not
created if the scatterlist only consists of P2PDMA pages.

Similar to dma-direct, the sg_mark_pci_p2pdma() flag is used to
indicate bus address segments. On unmap, P2PDMA segments are skipped
over when determining the start and end IOVA addresses.

With this change, the flags variable in the dma_map_ops is
set to DMA_F_PCI_P2PDMA_SUPPORTED to indicate support for
P2PDMA pages.
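
For illustration, the unmap-side skip described above looks roughly
like this (sketch, not the literal hunk):

	dma_addr_t end, start = DMA_MAPPING_ERROR;

	for_each_sg(sg, tmp, nents, i) {
		if (sg_is_pci_p2pdma(tmp)) {
			sg_unmark_pci_p2pdma(tmp);	/* bus address: skip */
			continue;
		}
		if (sg_dma_len(tmp) == 0)
			break;
		if (start == DMA_MAPPING_ERROR)
			start = sg_dma_address(tmp);
		end = sg_dma_address(tmp) + sg_dma_len(tmp);
	}
	if (start != DMA_MAPPING_ERROR)
		__iommu_dma_unmap(dev, start, end - start);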

Signed-off-by: Logan Gunthorpe 
---
 drivers/iommu/dma-iommu.c | 66 ++-
 1 file changed, 58 insertions(+), 8 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index af765c813cc8..ef49635f9819 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -864,6 +865,16 @@ static int __finalise_sg(struct device *dev, struct 
scatterlist *sg, int nents,
sg_dma_address(s) = DMA_MAPPING_ERROR;
sg_dma_len(s) = 0;
 
+   if (is_pci_p2pdma_page(sg_page(s)) && !s_iova_len) {
+   if (i > 0)
+   cur = sg_next(cur);
+
+   pci_p2pdma_map_bus_segment(s, cur);
+   count++;
+   cur_len = 0;
+   continue;
+   }
+
/*
 * Now fill in the real DMA data. If...
 * - there is a valid output segment to append to
@@ -961,10 +972,12 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
 	struct iova_domain *iovad = &cookie->iovad;
struct scatterlist *s, *prev = NULL;
int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs);
+   struct dev_pagemap *pgmap = NULL;
+   enum pci_p2pdma_map_type map_type;
dma_addr_t iova;
size_t iova_len = 0;
unsigned long mask = dma_get_seg_boundary(dev);
-   int i;
+   int i, ret = 0;
 
 	if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
iommu_deferred_attach(dev, domain))
@@ -993,6 +1006,31 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
s_length = iova_align(iovad, s_length + s_iova_off);
s->length = s_length;
 
+   if (is_pci_p2pdma_page(sg_page(s))) {
+   if (sg_page(s)->pgmap != pgmap) {
+   pgmap = sg_page(s)->pgmap;
+   map_type = pci_p2pdma_map_type(pgmap, dev,
+  attrs);
+   }
+
+   switch (map_type) {
+   case PCI_P2PDMA_MAP_BUS_ADDR:
+   /*
+* A zero length will be ignored by
+* iommu_map_sg() and then can be detected
+* in __finalise_sg() to actually map the
+* bus address.
+*/
+   s->length = 0;
+   continue;
+   case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+   break;
+   default:
+   ret = -EREMOTEIO;
+   goto out_restore_sg;
+   }
+   }
+
/*
 * Due to the alignment of our single IOVA allocation, we can
 * depend on these assumptions about the segment boundary mask:
@@ -1015,6 +1053,9 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
prev = s;
}
 
+   if (!iova_len)
+   return __finalise_sg(dev, sg, nents, 0);
+
iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
if (!iova)
goto out_restore_sg;
@@ -1032,13 +1073,13 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
iommu_dma_free_iova(cookie, iova, iova_len, NULL);
 out_restore_sg:
__invalidate_sg(sg, nents);
-   return 0;
+   return ret;
 }
 
 static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs)
 {
-   dma_addr_t start, end;
+   dma_addr_t end, start = DMA_MAPPING_ERROR;
struct scatterlist *tmp;
int i;
 
@@ -1054,14 +1095,22 @@ static void iommu_dma_unmap_sg(struct device *dev, 
struct scatterlist *sg,
 * The scatterlist segments are mapped into a single
 * contiguous IOVA allocation, so this is incredibly easy.
 */

[PATCH 14/16] nvme-rdma: Ensure dma support when using p2pdma

2021-04-08 Thread Logan Gunthorpe
Ensure the dma operations support p2pdma before using the RDMA
device for P2PDMA. This allows switching the RDMA driver from
pci_p2pdma_map_sg() to dma_map_sg_p2pdma().

Signed-off-by: Logan Gunthorpe 
---
 drivers/nvme/target/rdma.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 6c1f3ab7649c..3ec7e77e5416 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -414,7 +414,8 @@ static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device 
*ndev,
if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
goto out_free_rsp;
 
-   if (!ib_uses_virt_dma(ndev->device))
+   if (!ib_uses_virt_dma(ndev->device) &&
+	    dma_pci_p2pdma_supported(&ndev->device->dev))
 		r->req.p2p_client = &ndev->device->dev;
r->send_sge.length = sizeof(*r->req.cqe);
r->send_sge.lkey = ndev->pd->local_dma_lkey;
-- 
2.20.1



[PATCH 12/16] nvme-pci: Check DMA ops when indicating support for PCI P2PDMA

2021-04-08 Thread Logan Gunthorpe
Introduce a supports_pci_p2pdma() operation in nvme_ctrl_ops to
replace the fixed NVME_F_PCI_P2PDMA flag such that the dma_map_ops
flags can be checked for PCI P2PDMA support.

Signed-off-by: Logan Gunthorpe 
---
 drivers/nvme/host/core.c |  3 ++-
 drivers/nvme/host/nvme.h |  2 +-
 drivers/nvme/host/pci.c  | 11 +--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0896e21642be..223419454516 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3907,7 +3907,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, 
unsigned nsid,
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
 
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
-   if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
+   if (ctrl->ops->supports_pci_p2pdma &&
+   ctrl->ops->supports_pci_p2pdma(ctrl))
blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
 
ns->queue->queuedata = ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 07b34175c6ce..9c04df982d2c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -473,7 +473,6 @@ struct nvme_ctrl_ops {
unsigned int flags;
 #define NVME_F_FABRICS (1 << 0)
 #define NVME_F_METADATA_SUPPORTED  (1 << 1)
-#define NVME_F_PCI_P2PDMA  (1 << 2)
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
@@ -481,6 +480,7 @@ struct nvme_ctrl_ops {
void (*submit_async_event)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
+   bool (*supports_pci_p2pdma)(struct nvme_ctrl *ctrl);
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7249ae74f71f..14f092973792 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2759,17 +2759,24 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, 
char *buf, int size)
return snprintf(buf, size, "%s\n", dev_name(>dev));
 }
 
+static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl)
+{
+   struct nvme_dev *dev = to_nvme_dev(ctrl);
+
+   return dma_pci_p2pdma_supported(dev->dev);
+}
+
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name   = "pcie",
.module = THIS_MODULE,
-   .flags  = NVME_F_METADATA_SUPPORTED |
- NVME_F_PCI_P2PDMA,
+   .flags  = NVME_F_METADATA_SUPPORTED,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32= nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
.free_ctrl  = nvme_pci_free_ctrl,
.submit_async_event = nvme_pci_submit_async_event,
.get_address= nvme_pci_get_address,
+   .supports_pci_p2pdma= nvme_pci_supports_pci_p2pdma,
 };
 
 static int nvme_dev_map(struct nvme_dev *dev)
-- 
2.20.1



[PATCH 10/16] dma-mapping: Add flags to dma_map_ops to indicate PCI P2PDMA support

2021-04-08 Thread Logan Gunthorpe
Add a flags member to the dma_map_ops structure with one flag to
indicate support for PCI P2PDMA.

Also, add a helper to check if a device supports PCI P2PDMA.
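
For illustration, an ops provider opts in by setting the new flag and
callers gate P2PDMA usage on the helper (sketch; example_dma_ops is
hypothetical, dma-iommu sets the flag in a later patch of this series):

	static const struct dma_map_ops example_dma_ops = {
		.flags = DMA_F_PCI_P2PDMA_SUPPORTED,
		/* .map_sg/.unmap_sg must handle P2PDMA segments */
	};

	if (!dma_pci_p2pdma_supported(dev))
		return -EREMOTEIO;	/* hypothetical caller policy */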

Signed-off-by: Logan Gunthorpe 
---
 include/linux/dma-map-ops.h |  3 +++
 include/linux/dma-mapping.h |  5 +
 kernel/dma/mapping.c| 18 ++
 3 files changed, 26 insertions(+)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 51872e736e7b..481892822104 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -12,6 +12,9 @@
 struct cma;
 
 struct dma_map_ops {
+   unsigned int flags;
+#define DMA_F_PCI_P2PDMA_SUPPORTED (1 << 0)
+
void *(*alloc)(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp,
unsigned long attrs);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 50b8f586cf59..c31980ecca62 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -146,6 +146,7 @@ int dma_mmap_attrs(struct device *dev, struct 
vm_area_struct *vma,
unsigned long attrs);
 bool dma_can_mmap(struct device *dev);
 int dma_supported(struct device *dev, u64 mask);
+bool dma_pci_p2pdma_supported(struct device *dev);
 int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
@@ -247,6 +248,10 @@ static inline int dma_supported(struct device *dev, u64 
mask)
 {
return 0;
 }
+static inline bool dma_pci_p2pdma_supported(struct device *dev)
+{
+   return 0;
+}
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
return -EIO;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 923089c4267b..ce44a0fcc4e5 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -573,6 +573,24 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
+bool dma_pci_p2pdma_supported(struct device *dev)
+{
+   const struct dma_map_ops *ops = get_dma_ops(dev);
+
+   /* if ops is not set, dma direct will be used which supports P2PDMA */
+   if (!ops)
+   return true;
+
+   /*
+* Note: dma_ops_bypass is not checked here because P2PDMA should
+* not be used with dma mapping ops that do not have support even
+* if the specific device is bypassing them.
+*/
+
+   return ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
+}
+EXPORT_SYMBOL_GPL(dma_pci_p2pdma_supported);
+
 #ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
 void arch_dma_set_mask(struct device *dev, u64 mask);
 #else
-- 
2.20.1



[PATCH 16/16] PCI/P2PDMA: Remove pci_p2pdma_[un]map_sg()

2021-04-08 Thread Logan Gunthorpe
This interface is superseded by the new dma_map_sg_p2pdma() interface
which supports heterogeneous scatterlists. There are no longer
any users, so remove it.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c   | 67 --
 include/linux/pci-p2pdma.h | 27 ---
 2 files changed, 94 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 44ad7664e875..2f2adcccfa11 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -856,73 +856,6 @@ enum pci_p2pdma_map_type pci_p2pdma_map_type(struct 
dev_pagemap *pgmap,
 GFP_ATOMIC);
 }
 
-static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap,
-   struct device *dev, struct scatterlist *sg, int nents)
-{
-   struct scatterlist *s;
-   int i;
-
-   for_each_sg(sg, s, nents, i) {
-   s->dma_address = sg_phys(s) - p2p_pgmap->bus_offset;
-   sg_dma_len(s) = s->length;
-   }
-
-   return nents;
-}
-
-/**
- * pci_p2pdma_map_sg_attrs - map a PCI peer-to-peer scatterlist for DMA
- * @dev: device doing the DMA request
- * @sg: scatter list to map
- * @nents: elements in the scatterlist
- * @dir: DMA direction
- * @attrs: DMA attributes passed to dma_map_sg() (if called)
- *
- * Scatterlists mapped with this function should be unmapped using
- * pci_p2pdma_unmap_sg_attrs().
- *
- * Returns the number of SG entries mapped or 0 on error.
- */
-int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
-   int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-   struct pci_p2pdma_pagemap *p2p_pgmap =
-   to_p2p_pgmap(sg_page(sg)->pgmap);
-
-   switch (pci_p2pdma_map_type(sg_page(sg)->pgmap, dev,
-   __DMA_ATTR_PCI_P2PDMA)) {
-   case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
-   return dma_map_sg_attrs(dev, sg, nents, dir, attrs);
-   case PCI_P2PDMA_MAP_BUS_ADDR:
-   return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents);
-   default:
-   return 0;
-   }
-}
-EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs);
-
-/**
- * pci_p2pdma_unmap_sg_attrs - unmap a PCI peer-to-peer scatterlist that was
- * mapped with pci_p2pdma_map_sg()
- * @dev: device doing the DMA request
- * @sg: scatter list to map
- * @nents: number of elements returned by pci_p2pdma_map_sg()
- * @dir: DMA direction
- * @attrs: DMA attributes passed to dma_unmap_sg() (if called)
- */
-void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
-   int nents, enum dma_data_direction dir, unsigned long attrs)
-{
-   enum pci_p2pdma_map_type map_type;
-
-   map_type = pci_p2pdma_map_type(sg_page(sg)->pgmap, dev,
-  __DMA_ATTR_PCI_P2PDMA);
-
-   if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
-   dma_unmap_sg_attrs(dev, sg, nents, dir, attrs);
-}
-EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
-
 /**
  * pci_p2pdma_map_segment - map an sg segment determining the mapping type
  * @state: State structure that should be declared on the stack outside of
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 49e7679403cf..2ec9c75fa097 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -45,10 +45,6 @@ void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct 
scatterlist *sgl);
 void pci_p2pmem_publish(struct pci_dev *pdev, bool publish);
 enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
struct device *dev, unsigned long dma_attrs);
-int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
-   int nents, enum dma_data_direction dir, unsigned long attrs);
-void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
-   int nents, enum dma_data_direction dir, unsigned long attrs);
 int pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state,
struct device *dev, struct scatterlist *sg,
unsigned long dma_attrs);
@@ -109,17 +105,6 @@ static inline enum pci_p2pdma_map_type pci_p2pdma_map_type(
 {
return PCI_P2PDMA_MAP_NOT_SUPPORTED;
 }
-static inline int pci_p2pdma_map_sg_attrs(struct device *dev,
-   struct scatterlist *sg, int nents, enum dma_data_direction dir,
-   unsigned long attrs)
-{
-   return 0;
-}
-static inline void pci_p2pdma_unmap_sg_attrs(struct device *dev,
-   struct scatterlist *sg, int nents, enum dma_data_direction dir,
-   unsigned long attrs)
-{
-}
 static inline int pci_p2pdma_map_segment(struct pci_p2pdma_map_state *state,
struct device *dev, struct scatterlist *sg,
unsigned long dma_attrs)
@@ -155,16 +140,4 @@ static inline struct pci_dev *pci_p2pmem_find(struct 
device *client)
 	return pci_p2pmem_find_many(&client, 1);
 }
 

[PATCH 00/16] Add new DMA mapping operation for P2PDMA

2021-04-08 Thread Logan Gunthorpe
Hi,

This patchset continues my work to add P2PDMA support to the common
dma map operations. This allows for creating SGLs that have both P2PDMA
and regular pages which is a necessary step to allowing P2PDMA pages in
userspace.

The earlier RFC[1] generated a lot of great feedback and I heard no
show-stopping objections. Thus, I've incorporated all the feedback and have
decided to post this as a proper patch series with hopes of eventually
getting it in mainline.

I'm happy to do a few more passes if anyone has any further feedback
or better ideas.

This series is based on v5.12-rc6 and a git branch can be found here:

  https://github.com/sbates130272/linux-p2pmem/  p2pdma_map_ops_v1

Thanks,

Logan

[1] 
https://lore.kernel.org/linux-block/20210311233142.7900-1-log...@deltatee.com/


Changes since the RFC:
 * Added comment and fixed up the pci_get_slot patch. (per Bjorn)
 * Fixed glaring sg_phys() double offset bug. (per Robin)
 * Created a new map operation (dma_map_sg_p2pdma()) with a new calling
   convention instead of modifying the calling convention of
   dma_map_sg(). (per Robin)
 * Integrated the two similar pci_p2pdma_dma_map_type() and
   pci_p2pdma_map_type() functions into one (per Ira)
 * Reworked some of the logic in the map_sg() implementations into
   helpers in the p2pdma code. (per Christoph)
 * Dropped a bunch of unnecessary symbol exports (per Christoph)
 * Expanded the code in dma_pci_p2pdma_supported() for clarity. (per
   Ira and Christoph)
 * Finished off using the new dma_map_sg_p2pdma() call in rdma_rw
   and removed the old pci_p2pdma_[un]map_sg(). (per Jason)

--

Logan Gunthorpe (16):
  PCI/P2PDMA: Pass gfp_mask flags to upstream_bridge_distance_warn()
  PCI/P2PDMA: Avoid pci_get_slot() which sleeps
  PCI/P2PDMA: Attempt to set map_type if it has not been set
  PCI/P2PDMA: Refactor pci_p2pdma_map_type() to take pgmap and device
  dma-mapping: Introduce dma_map_sg_p2pdma()
  lib/scatterlist: Add flag for indicating P2PDMA segments in an SGL
  PCI/P2PDMA: Make pci_p2pdma_map_type() non-static
  PCI/P2PDMA: Introduce helpers for dma_map_sg implementations
  dma-direct: Support PCI P2PDMA pages in dma-direct map_sg
  dma-mapping: Add flags to dma_map_ops to indicate PCI P2PDMA support
  iommu/dma: Support PCI P2PDMA pages in dma-iommu map_sg
  nvme-pci: Check DMA ops when indicating support for PCI P2PDMA
  nvme-pci: Convert to using dma_map_sg_p2pdma for p2pdma pages
  nvme-rdma: Ensure dma support when using p2pdma
  RDMA/rw: use dma_map_sg_p2pdma()
  PCI/P2PDMA: Remove pci_p2pdma_[un]map_sg()

 drivers/infiniband/core/rw.c |  50 +++---
 drivers/iommu/dma-iommu.c|  66 ++--
 drivers/nvme/host/core.c |   3 +-
 drivers/nvme/host/nvme.h |   2 +-
 drivers/nvme/host/pci.c  |  39 
 drivers/nvme/target/rdma.c   |   3 +-
 drivers/pci/Kconfig  |   2 +-
 drivers/pci/p2pdma.c | 188 +++
 include/linux/dma-map-ops.h  |   3 +
 include/linux/dma-mapping.h  |  20 
 include/linux/pci-p2pdma.h   |  53 ++
 include/linux/scatterlist.h  |  49 -
 include/rdma/ib_verbs.h  |  32 ++
 kernel/dma/direct.c  |  25 -
 kernel/dma/mapping.c |  70 +++--
 15 files changed, 416 insertions(+), 189 deletions(-)


base-commit: e49d033bddf5b565044e2abe4241353959bc9120
--
2.20.1


[PATCH 15/16] RDMA/rw: use dma_map_sg_p2pdma()

2021-04-08 Thread Logan Gunthorpe
Drop the use of pci_p2pdma_map_sg() in favour of dma_map_sg_p2pdma().

The new interface allows mapping scatterlists that mix both regular
and P2PDMA pages and will verify that the dma device can communicate
with the device the pages are on.

Signed-off-by: Logan Gunthorpe 
---
 drivers/infiniband/core/rw.c | 50 ++--
 include/rdma/ib_verbs.h  | 32 +++
 2 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index 31156e22d3e7..0c6213d9b044 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -273,26 +273,6 @@ static int rdma_rw_init_single_wr(struct rdma_rw_ctx *ctx, 
struct ib_qp *qp,
return 1;
 }
 
-static void rdma_rw_unmap_sg(struct ib_device *dev, struct scatterlist *sg,
-u32 sg_cnt, enum dma_data_direction dir)
-{
-   if (is_pci_p2pdma_page(sg_page(sg)))
-   pci_p2pdma_unmap_sg(dev->dma_device, sg, sg_cnt, dir);
-   else
-   ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
-}
-
-static int rdma_rw_map_sg(struct ib_device *dev, struct scatterlist *sg,
- u32 sg_cnt, enum dma_data_direction dir)
-{
-   if (is_pci_p2pdma_page(sg_page(sg))) {
-   if (WARN_ON_ONCE(ib_uses_virt_dma(dev)))
-   return 0;
-   return pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir);
-   }
-   return ib_dma_map_sg(dev, sg, sg_cnt, dir);
-}
-
 /**
  * rdma_rw_ctx_init - initialize a RDMA READ/WRITE context
  * @ctx:   context to initialize
@@ -315,9 +295,9 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp 
*qp, u8 port_num,
struct ib_device *dev = qp->pd->device;
int ret;
 
-   ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir);
-   if (!ret)
-   return -ENOMEM;
+   ret = ib_dma_map_sg_p2pdma(dev, sg, sg_cnt, dir);
+   if (ret < 0)
+   return ret;
sg_cnt = ret;
 
/*
@@ -354,7 +334,7 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp 
*qp, u8 port_num,
return ret;
 
 out_unmap_sg:
-   rdma_rw_unmap_sg(dev, sg, sg_cnt, dir);
+   ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
return ret;
 }
 EXPORT_SYMBOL(rdma_rw_ctx_init);
@@ -394,17 +374,15 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, 
struct ib_qp *qp,
return -EINVAL;
}
 
-   ret = rdma_rw_map_sg(dev, sg, sg_cnt, dir);
-   if (!ret)
-   return -ENOMEM;
+   ret = ib_dma_map_sg_p2pdma(dev, sg, sg_cnt, dir);
+   if (ret < 0)
+   return ret;
sg_cnt = ret;
 
if (prot_sg_cnt) {
-   ret = rdma_rw_map_sg(dev, prot_sg, prot_sg_cnt, dir);
-   if (!ret) {
-   ret = -ENOMEM;
+   ret = ib_dma_map_sg_p2pdma(dev, prot_sg, prot_sg_cnt, dir);
+   if (ret < 0)
goto out_unmap_sg;
-   }
prot_sg_cnt = ret;
}
 
@@ -469,9 +447,9 @@ int rdma_rw_ctx_signature_init(struct rdma_rw_ctx *ctx, 
struct ib_qp *qp,
kfree(ctx->reg);
 out_unmap_prot_sg:
if (prot_sg_cnt)
-   rdma_rw_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
+   ib_dma_unmap_sg(dev, prot_sg, prot_sg_cnt, dir);
 out_unmap_sg:
-   rdma_rw_unmap_sg(dev, sg, sg_cnt, dir);
+   ib_dma_unmap_sg(dev, sg, sg_cnt, dir);
return ret;
 }
 EXPORT_SYMBOL(rdma_rw_ctx_signature_init);
@@ -603,7 +581,7 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct 
ib_qp *qp, u8 port_num,
break;
}
 
-   rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+   ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy);
 
@@ -631,8 +609,8 @@ void rdma_rw_ctx_destroy_signature(struct rdma_rw_ctx *ctx, 
struct ib_qp *qp,
kfree(ctx->reg);
 
if (prot_sg_cnt)
-   rdma_rw_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
-   rdma_rw_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+   ib_dma_unmap_sg(qp->pd->device, prot_sg, prot_sg_cnt, dir);
+   ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy_signature);
 
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index ca28fca5736b..a541ed1702f5 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -4028,6 +4028,17 @@ static inline int ib_dma_map_sg_attrs(struct ib_device 
*dev,
dma_attrs);
 }
 
+static inline int ib_dma_map_sg_p2pdma_attrs(struct ib_device *dev,
+struct scatterlist *sg, int nents,
+enum dma_data_direction direction,
+unsigned long dma_attr

[PATCH 01/16] PCI/P2PDMA: Pass gfp_mask flags to upstream_bridge_distance_warn()

2021-04-08 Thread Logan Gunthorpe
In order to call upstream_bridge_distance_warn() from a dma_map function,
it must not sleep. The only reason it does sleep is to allocate the seqbuf
to print which devices are within the ACS path.

Switch the kmalloc call to use a passed in gfp_mask and don't print that
message if the buffer fails to be allocated.

Signed-off-by: Logan Gunthorpe 
Acked-by: Bjorn Helgaas 
---
 drivers/pci/p2pdma.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 196382630363..bd89437faf06 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -267,7 +267,7 @@ static int pci_bridge_has_acs_redir(struct pci_dev *pdev)
 
 static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev)
 {
-   if (!buf)
+   if (!buf || !buf->buffer)
return;
 
seq_buf_printf(buf, "%s;", pci_name(pdev));
@@ -495,25 +495,26 @@ upstream_bridge_distance(struct pci_dev *provider, struct 
pci_dev *client,
 
 static enum pci_p2pdma_map_type
 upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client,
- int *dist)
+ int *dist, gfp_t gfp_mask)
 {
struct seq_buf acs_list;
bool acs_redirects;
int ret;
 
-	seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
-	if (!acs_list.buffer)
-		return -ENOMEM;
+	seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, gfp_mask), PAGE_SIZE);
 
 	ret = upstream_bridge_distance(provider, client, dist, &acs_redirects,
 				       &acs_list);
if (acs_redirects) {
pci_warn(client, "ACS redirect is set between the client and 
provider (%s)\n",
 pci_name(provider));
-   /* Drop final semicolon */
-   acs_list.buffer[acs_list.len-1] = 0;
-   pci_warn(client, "to disable ACS redirect for this path, add 
the kernel parameter: pci=disable_acs_redir=%s\n",
-acs_list.buffer);
+
+   if (acs_list.buffer) {
+   /* Drop final semicolon */
+   acs_list.buffer[acs_list.len - 1] = 0;
+   pci_warn(client, "to disable ACS redirect for this 
path, add the kernel parameter: pci=disable_acs_redir=%s\n",
+acs_list.buffer);
+   }
}
 
if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED) {
@@ -566,7 +567,7 @@ int pci_p2pdma_distance_many(struct pci_dev *provider, 
struct device **clients,
 
if (verbose)
ret = upstream_bridge_distance_warn(provider,
-					pci_client, &dist);
+					pci_client, &dist, GFP_KERNEL);
else
ret = upstream_bridge_distance(provider, pci_client,
 						       &dist, NULL, NULL);
-- 
2.20.1



[PATCH 03/16] PCI/P2PDMA: Attempt to set map_type if it has not been set

2021-04-08 Thread Logan Gunthorpe
Attempt to find the mapping type for P2PDMA pages on the first
DMA map attempt if it has not been done ahead of time.

Previously, the mapping type was expected to be calculated ahead of
time, but if pages are to come from userspace then there's no
way to ensure the path was checked ahead of time.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 473a08940fbc..2574a062a255 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -825,11 +825,18 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
 static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct pci_dev *provider,
struct pci_dev *client)
 {
+   enum pci_p2pdma_map_type ret;
+
if (!provider->p2pdma)
return PCI_P2PDMA_MAP_NOT_SUPPORTED;
 
-	return xa_to_value(xa_load(&provider->p2pdma->map_types,
-				   map_types_idx(client)));
+	ret = xa_to_value(xa_load(&provider->p2pdma->map_types,
+				  map_types_idx(client)));
+   if (ret != PCI_P2PDMA_MAP_UNKNOWN)
+   return ret;
+
+   return upstream_bridge_distance_warn(provider, client, NULL,
+GFP_ATOMIC);
 }
 
 static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap,
@@ -877,7 +884,6 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg,
case PCI_P2PDMA_MAP_BUS_ADDR:
return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents);
default:
-   WARN_ON_ONCE(1);
return 0;
}
 }
-- 
2.20.1



[PATCH 04/16] PCI/P2PDMA: Refactor pci_p2pdma_map_type() to take pgmap and device

2021-04-08 Thread Logan Gunthorpe
All callers of pci_p2pdma_map_type() have a struct dev_pagemap and a
struct device (of the client doing the DMA transfer). Thus move the
conversion to struct pci_devs for the provider and client into this
function.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c | 29 +++--
 1 file changed, 11 insertions(+), 18 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 2574a062a255..bcb1a6d6119d 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -822,14 +822,21 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool 
publish)
 }
 EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
 
-static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct pci_dev *provider,
-   struct pci_dev *client)
+static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct dev_pagemap *pgmap,
+   struct device *dev)
 {
+   struct pci_dev *provider = to_p2p_pgmap(pgmap)->provider;
enum pci_p2pdma_map_type ret;
+   struct pci_dev *client;
 
if (!provider->p2pdma)
return PCI_P2PDMA_MAP_NOT_SUPPORTED;
 
+   if (!dev_is_pci(dev))
+   return PCI_P2PDMA_MAP_NOT_SUPPORTED;
+
+   client = to_pci_dev(dev);
+
 	ret = xa_to_value(xa_load(&provider->p2pdma->map_types,
  map_types_idx(client)));
if (ret != PCI_P2PDMA_MAP_UNKNOWN)
@@ -871,14 +878,8 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg,
 {
struct pci_p2pdma_pagemap *p2p_pgmap =
to_p2p_pgmap(sg_page(sg)->pgmap);
-   struct pci_dev *client;
-
-   if (WARN_ON_ONCE(!dev_is_pci(dev)))
-   return 0;
 
-   client = to_pci_dev(dev);
-
-   switch (pci_p2pdma_map_type(p2p_pgmap->provider, client)) {
+   switch (pci_p2pdma_map_type(sg_page(sg)->pgmap, dev)) {
case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
return dma_map_sg_attrs(dev, sg, nents, dir, attrs);
case PCI_P2PDMA_MAP_BUS_ADDR:
@@ -901,17 +902,9 @@ EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg_attrs);
 void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs)
 {
-   struct pci_p2pdma_pagemap *p2p_pgmap =
-   to_p2p_pgmap(sg_page(sg)->pgmap);
enum pci_p2pdma_map_type map_type;
-   struct pci_dev *client;
-
-   if (WARN_ON_ONCE(!dev_is_pci(dev)))
-   return;
-
-   client = to_pci_dev(dev);
 
-   map_type = pci_p2pdma_map_type(p2p_pgmap->provider, client);
+   map_type = pci_p2pdma_map_type(sg_page(sg)->pgmap, dev);
 
if (map_type == PCI_P2PDMA_MAP_THRU_HOST_BRIDGE)
dma_unmap_sg_attrs(dev, sg, nents, dir, attrs);
-- 
2.20.1



Re: [PATCH] dmaengine: plx_dma: add a missing put_device() on error path

2021-03-23 Thread Logan Gunthorpe



On 2021-03-23 7:19 a.m., Dan Carpenter wrote:
> Add a missing put_device(&pdev->dev) if the call to
> dma_async_device_register(dma); fails.
> 
> Fixes: 905ca51e63be ("dmaengine: plx-dma: Introduce PLX DMA engine PCI driver 
> skeleton")
> Signed-off-by: Dan Carpenter 

Good catch. Thanks Dan!

Reviewed-by: Logan Gunthorpe 

Logan


Re: [RFC PATCH v2 09/11] block: Add BLK_STS_P2PDMA

2021-03-16 Thread Logan Gunthorpe



On 2021-03-16 2:00 a.m., Christoph Hellwig wrote:
> On Thu, Mar 11, 2021 at 04:31:39PM -0700, Logan Gunthorpe wrote:
>> Create a specific error code for when P2PDMA pages are passed to a block
>> devices that cannot map them (due to no IOMMU support or ACS protections).
>>
>> This makes request errors in these cases more informative of as to what
>> caused the error.
> 
> I really don't think we should bother with a specific error code here,
> we don't add a new status for every single possible logic error in the
> caller.

I originally had BLK_STS_IOERR but those errors suggested to people that
the hardware had failed on the request when in fact it was a user error.
I'll try BLK_STS_TARGET unless there's any objection or someone thinks
another error code would make more sense.

Logan


Re: [RFC PATCH v2 06/11] dma-direct: Support PCI P2PDMA pages in dma-direct map_sg

2021-03-16 Thread Logan Gunthorpe



On 2021-03-16 1:58 a.m., Christoph Hellwig wrote:
> On Fri, Mar 12, 2021 at 11:27:46AM -0700, Logan Gunthorpe wrote:
>> So then we reject the patches that make that change. Seems like an odd
>> argument to say that we can't do something that won't cause problems
>> because someone might use it as an example and do something that will
>> cause problems. Reject the change that causes the problem.
> 
> No, the problem is a mess of calling conventions.  A calling convention
> returning 0 for error, positive values for success is fine.  One returning
> a negative errno for error and positive values for success is fine a well.
> One returning 0 for the usual errors and negativ errnos for an unusual
> corner case is just a complete mess.

Fair enough. I can try implementing a dma_map_sg_p2p() roughly as Robin
suggested that has a more reasonable calling convention.

Most of your other feedback seems easy enough so I'll address it in a
future series.

Thanks,

Logan


Re: [RFC PATCH v2 07/11] dma-mapping: Add flags to dma_map_ops to indicate PCI P2PDMA support

2021-03-15 Thread Logan Gunthorpe



On 2021-03-12 7:36 p.m., Ira Weiny wrote:
> On Thu, Mar 11, 2021 at 04:31:37PM -0700, Logan Gunthorpe wrote:
>  
>> +int dma_pci_p2pdma_supported(struct device *dev)
>^^^
>   bool?

Sure.

> 
>> +{
>> +const struct dma_map_ops *ops = get_dma_ops(dev);
>> +
>> +return !ops || ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
> 
> Is this logic correct?  I would have expected.
> 
>   return (ops && ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED);


If ops is NULL then the operations in kernel/dma/direct.c are used and
support is added to those in patch 6. So it is correct as written.

Logan


Re: [RFC PATCH v2 04/11] PCI/P2PDMA: Introduce pci_p2pdma_should_map_bus() and pci_p2pdma_bus_offset()

2021-03-15 Thread Logan Gunthorpe



On 2021-03-12 7:32 p.m., Ira Weiny wrote:
> On Thu, Mar 11, 2021 at 04:31:34PM -0700, Logan Gunthorpe wrote:
>> Introduce pci_p2pdma_should_map_bus() which is meant to be called by
> ^
>   pci_p2pdma_dma_map_type() ???
> 
> FWIW I find this name confusing with pci_p2pdma_map_type() and looking at the
> implementation I'm not clear why pci_p2pdma_dma_map_type() needs to exist?

Yeah, there are subtle differences in prototype. But yes, they can
probably be combined. Will do for future postings.

>> + * pci_p2pdma_dma_map_type - determine if a DMA mapping should use the
>> + *  bus address, be mapped normally or fail
>> + * @dev: device doing the DMA request
>> + * @pgmap: dev_pagemap structure for the mapping
>> + *
>> + * Returns:
>> + *1 - if the page should be mapped with a bus address,
>> + *0 - if the page should be mapped normally through an IOMMU mapping or
>> + *physical address; or
>> + *   -1 - if the device should not map the pages and an error should be
>> + *returned
>> + */
>> +int pci_p2pdma_dma_map_type(struct device *dev, struct dev_pagemap *pgmap)
>> +{
>> +struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(pgmap);
>> +struct pci_dev *client;
>> +
>> +if (!dev_is_pci(dev))
>> +return -1;
>> +
>> +client = to_pci_dev(dev);
>> +
>> +switch (pci_p2pdma_map_type(p2p_pgmap->provider, client)) {
>> +case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
>> +return 0;
>> +case PCI_P2PDMA_MAP_BUS_ADDR:
>> +return 1;
>> +default:
>> +return -1;
>> +}
>> +}
>> +EXPORT_SYMBOL_GPL(pci_p2pdma_dma_map_type);
> 
> I guess the main point here is to export this to the DMA layer?

Yes, that's correct.

Logan


Re: [RFC PATCH v2 04/11] PCI/P2PDMA: Introduce pci_p2pdma_should_map_bus() and pci_p2pdma_bus_offset()

2021-03-15 Thread Logan Gunthorpe



On 2021-03-12 6:38 p.m., Ira Weiny wrote:
> On Thu, Mar 11, 2021 at 04:31:34PM -0700, Logan Gunthorpe wrote:
>> Introduce pci_p2pdma_should_map_bus() which is meant to be called by
>> DMA map functions to determine how to map a given p2pdma page.
>>
>> pci_p2pdma_bus_offset() is also added to allow callers to get the bus
>> offset if they need to map the bus address.
>>
>> Signed-off-by: Logan Gunthorpe 
>> ---
>>  drivers/pci/p2pdma.c   | 50 ++
>>  include/linux/pci-p2pdma.h | 11 +
>>  2 files changed, 61 insertions(+)
>>
>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>> index 7f6836732bce..66d16b7eb668 100644
>> --- a/drivers/pci/p2pdma.c
>> +++ b/drivers/pci/p2pdma.c
>> @@ -912,6 +912,56 @@ void pci_p2pdma_unmap_sg_attrs(struct device *dev, 
>> struct scatterlist *sg,
>>  }
>>  EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
>>  
>> +/**
>> + * pci_p2pdma_bus_offset - returns the bus offset for a given page
>> + * @page: page to get the offset for
>> + *
>> + * Must be passed a PCI p2pdma page.
>> + */
>> +u64 pci_p2pdma_bus_offset(struct page *page)
>> +{
>> +struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page->pgmap);
>> +
>> +WARN_ON(!is_pci_p2pdma_page(page));
> 
> Shouldn't this check be before the to_p2p_pgmap() call?  

The to_p2p_pgmap() call is just doing pointer arithmetic, so strictly
speaking it doesn't need to be before. We just can't access p2p_pgmap
until it has been checked.

> And I've been told not
> to introduce WARN_ON's.  Should this be?
> 
>   if (!is_pci_p2pdma_page(page))
>   return -1;

In this case the WARN_ON is just to guard against misuse of the
function. It should never happen unless a developer changes the code in
a way that is incorrect. So I think that's the correct use of WARN_ON.
Though I might change it to WARN and return, that seems safer.
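
Roughly something like this (a sketch only; the exact return value for
the misuse case is still up for debate):

u64 pci_p2pdma_bus_offset(struct page *page)
{
	struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page->pgmap);

	/* Guard against misuse: only P2PDMA pages carry a bus_offset */
	if (WARN_ON(!is_pci_p2pdma_page(page)))
		return 0;

	return p2p_pgmap->bus_offset;
}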

Logan


Re: [RFC PATCH v2 02/11] PCI/P2PDMA: Avoid pci_get_slot() which sleeps

2021-03-12 Thread Logan Gunthorpe



On 2021-03-12 1:57 p.m., Bjorn Helgaas wrote:
> On Thu, Mar 11, 2021 at 04:31:32PM -0700, Logan Gunthorpe wrote:
>> In order to use upstream_bridge_distance_warn() from a dma_map function,
>> it must not sleep. However, pci_get_slot() takes the pci_bus_sem so it
>> might sleep.
>>
>> In order to avoid this, try to get the host bridge's device from
>> bus->self, and if that is not set just get the first element in the
>> list. It should be impossible for the host bridge's device to go away
>> while references are held on child devices, so the first element
>> should not change and this should be safe.
>>
>> Signed-off-by: Logan Gunthorpe 
>> ---
>>  drivers/pci/p2pdma.c | 6 +-
>>  1 file changed, 5 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>> index bd89437faf06..2135fe69bb07 100644
>> --- a/drivers/pci/p2pdma.c
>> +++ b/drivers/pci/p2pdma.c
>> @@ -311,11 +311,15 @@ static const struct pci_p2pdma_whitelist_entry {
>>  static bool __host_bridge_whitelist(struct pci_host_bridge *host,
>>  bool same_host_bridge)
>>  {
>> -struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0));
>>  const struct pci_p2pdma_whitelist_entry *entry;
>> +struct pci_dev *root = host->bus->self;
>>  unsigned short vendor, device;
>>  
>>  if (!root)
>> +root = list_first_entry_or_null(&host->bus->devices,
>> +struct pci_dev, bus_list);
> 
> Replacing one ugliness (assuming there is a pci_dev for the host
> bridge, and that it is at 00.0) with another (still assuming a pci_dev
> and that it is host->bus->self or the first entry).  I can't suggest
> anything better, but maybe a little comment in the code would help
> future readers.

Yeah, I struggled to find a solution here; this was the best I could
come up with. I'd love it if someone had a better idea. I can add a
comment for future iterations.
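
Something along these lines, perhaps (comment wording to be refined):

	/*
	 * The host bridge does not always have a pci_dev of its own
	 * (host->bus->self may be NULL), so fall back to the first
	 * device on the root bus. The host bridge can't go away while
	 * references are held on its child devices, so that first
	 * element won't change under us.
	 */
	struct pci_dev *root = host->bus->self;

	if (!root)
		root = list_first_entry_or_null(&host->bus->devices,
						struct pci_dev, bus_list);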

> I wish we had a real way to discover this property without the
> whitelist, at least for future devices.  Was there ever any interest
> in a _DSM or similar interface for this?

I'd also like to get rid of the whitelist, but I have no idea how or who
would have to lead a fight to get the hardware to self-describe in a way
that we could use.

> I *am* very glad to remove a pci_get_slot() usage.
> 
>> +
>> +if (!root || root->devfn)
>>  return false;
>>  
>>  vendor = root->vendor;
> 
> Don't you need to also remove the "pci_dev_put(root)" a few lines
> below?

Yes, right. Good catch!

Logan



Re: [RFC PATCH v2 01/11] PCI/P2PDMA: Pass gfp_mask flags to upstream_bridge_distance_warn()

2021-03-12 Thread Logan Gunthorpe




On 2021-03-12 1:39 p.m., Bjorn Helgaas wrote:

> On Thu, Mar 11, 2021 at 04:31:31PM -0700, Logan Gunthorpe wrote:
>> In order to call this function from a dma_map function, it must not sleep.
>> The only reason it does sleep so to allocate the seqbuf to print
>> which devices are within the ACS path.
> 
> s/this function/upstream_bridge_distance_warn()/ ?
> s/so to/is to/
> 
> Maybe the subject could say something about the purpose, e.g., allow
> calling from atomic context or something?  "Pass gfp_mask flags" sort
> of restates what we can read from the patch, but without the
> motivation of why this is useful.
> 
>> Switch the kmalloc call to use a passed in gfp_mask  and don't print that
>> message if the buffer fails to be allocated.
>>
>> Signed-off-by: Logan Gunthorpe 
> 
> Acked-by: Bjorn Helgaas 


Thanks! I'll apply these changes for any future postings.

Logan


Re: [RFC PATCH v2 08/11] iommu/dma: Support PCI P2PDMA pages in dma-iommu map_sg

2021-03-12 Thread Logan Gunthorpe




On 2021-03-12 12:47 p.m., Robin Murphy wrote:

>>>>    {
>>>>    struct scatterlist *s, *cur = sg;
>>>>    unsigned long seg_mask = dma_get_seg_boundary(dev);
>>>> @@ -864,6 +865,20 @@ static int __finalise_sg(struct device *dev,
>>>> struct scatterlist *sg, int nents,
>>>>    sg_dma_address(s) = DMA_MAPPING_ERROR;
>>>>    sg_dma_len(s) = 0;
>>>> +    if (is_pci_p2pdma_page(sg_page(s)) && !s_iova_len) {
>>>> +    if (i > 0)
>>>> +    cur = sg_next(cur);
>>>> +
>>>> +    sg_dma_address(cur) = sg_phys(s) + s->offset -
>>> 
>>> Are you sure about that? ;)
>> 
>> Do you see a bug? I don't follow you...
> 
> sg_phys() already accounts for the offset, so you're adding it twice.

Ah, oops. Nice catch. I missed that.

>>>> +    pci_p2pdma_bus_offset(sg_page(s));
>>> 
>>> Can the bus offset make P2P addresses overlap with regions of mem space
>>> that we might use for regular IOVA allocation? That would be very bad...
>> 
>> No. IOMMU drivers already disallow all PCI addresses from being used as
>> IOVA addresses. See, for example, dmar_init_reserved_ranges(). It would
>> be a huge problem for a whole lot of other reasons if it didn't.
> 
> I know we reserve the outbound windows (largely *because* some host
> bridges will consider those addresses as attempts at unsupported P2P and
> prevent them working), I just wanted to confirm that this bus offset is
> always something small that stays within the relevant window, rather
> than something that might make a BAR appear in a completely different
> place for P2P purposes. If so, that's good.

Yes, well if an IOVA overlaps with any PCI bus address there's going to
be some difficult brokenness because when the IOVA is used it might be
directed to a PCI device and not the IOMMU. I fixed a bug like that
once.

>>> I'm not really thrilled about the idea of passing zero-length segments
>>> to iommu_map_sg(). Yes, it happens to trick the concatenation logic in
>>> the current implementation into doing what you want, but it feels
>>> fragile.
>> 
>> We're not passing zero length segments to iommu_map_sg() (or any
>> function). This loop is just scanning to calculate the length of the
>> required IOVA. __finalise_sg() (which is intimately tied to this loop)
>> then needs a way to determine which segments were P2P segments. The
>> existing code already overwrites s->length with an aligned length and
>> stores the original length in sg_dma_len. So we're not relying on
>> tricking any logic here.
> 
> Yes, we temporarily shuffle in page-aligned quantities to satisfy the
> needs of the iommu_map_sg() call, before unpacking things again in
> __finalise_sg(). It's some disgusting trickery that I'm particularly
> proud of. My point is that if you have a mix of both p2p and normal
> segments - which seems to be a case you want to support - then the
> length of 0 that you set to flag p2p segments here will be seen by
> iommu_map_sg() (as it walks the list to map the other segments) before
> you then use it as a key to override the DMA address in the final step.
> It's not a concern if you have a p2p-only list and short-circuit
> straight to that step (in which case all the shuffling was wasted effort
> anyway), but since it's not entirely clear what a segment with zero
> length would mean in general, it seems like a good idea to avoid passing
> the list across a public boundary in that state, if possible.

Ok, well, I mean the iommu_map_sg() does the right thing as is without
changing it and IMO sg->length set to zero does make sense. Supporting
mixed P2P and normal segments is really the whole point of this series
(the current kernel supports homogeneous SGLs with a specialized path --
see pci_p2pdma_unmap_sg_attrs()). But do you have an alternate solution
for sg->length?


Logan


Re: [RFC PATCH v2 06/11] dma-direct: Support PCI P2PDMA pages in dma-direct map_sg

2021-03-12 Thread Logan Gunthorpe



On 2021-03-12 11:11 a.m., Robin Murphy wrote:
> On 2021-03-12 16:24, Logan Gunthorpe wrote:
>>
>>
>> On 2021-03-12 8:52 a.m., Robin Murphy wrote:
>>>> +
>>>>    sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
>>>>    sg->offset, sg->length, dir, attrs);
>>>>    if (sg->dma_address == DMA_MAPPING_ERROR)
>>>> @@ -411,7 +440,7 @@ int dma_direct_map_sg(struct device *dev, struct
>>>> scatterlist *sgl, int nents,
>>>>      out_unmap:
>>>>    dma_direct_unmap_sg(dev, sgl, i, dir, attrs |
>>>> DMA_ATTR_SKIP_CPU_SYNC);
>>>> -    return 0;
>>>> +    return ret;
>>>>    }
>>>>      dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t
>>>> paddr,
>>>> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
>>>> index b6a633679933..adc1a83950be 100644
>>>> --- a/kernel/dma/mapping.c
>>>> +++ b/kernel/dma/mapping.c
>>>> @@ -178,8 +178,15 @@ void dma_unmap_page_attrs(struct device *dev,
>>>> dma_addr_t addr, size_t size,
>>>>    EXPORT_SYMBOL(dma_unmap_page_attrs);
>>>>      /*
>>>> - * dma_maps_sg_attrs returns 0 on error and > 0 on success.
>>>> - * It should never return a value < 0.
>>>> + * dma_maps_sg_attrs returns 0 on any resource error and > 0 on
>>>> success.
>>>> + *
>>>> + * If 0 is returned, the mapping can be retried and will succeed once
>>>> + * sufficient resources are available.
>>>
>>> That's not a guarantee we can uphold. Retrying forever in the vain hope
>>> that a device might evolve some extra address bits, or a bounce buffer
>>> might magically grow big enough for a gigantic mapping, isn't
>>> necessarily the best idea.
>>
>> Perhaps this is just poorly worded. Returning 0 is the normal case and
>> nothing has changed there. The block layer, for example, will retry if
>> zero is returned as this only happens if it failed to allocate resources
>> for the mapping. The reason we have to return -1 is to tell the block
>> layer not to retry these requests as they will never succeed in the
>> future.
>>
>>>> + *
>>>> + * If there are P2PDMA pages in the scatterlist then this function may
>>>> + * return -EREMOTEIO to indicate that the pages are not mappable by
>>>> the
>>>> + * device. In this case, an error should be returned for the IO as it
>>>> + * will never be successfully retried.
>>>>     */
>>>>    int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int
>>>> nents,
>>>>    enum dma_data_direction dir, unsigned long attrs)
>>>> @@ -197,7 +204,7 @@ int dma_map_sg_attrs(struct device *dev, struct
>>>> scatterlist *sg, int nents,
>>>>    ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
>>>>    else
>>>>    ents = ops->map_sg(dev, sg, nents, dir, attrs);
>>>> -    BUG_ON(ents < 0);
>>>> +
>>>
>>> This scares me - I hesitate to imagine the amount of driver/subsystem
>>> code out there that will see nonzero and merrily set off iterating a
>>> negative number of segments, if we open the floodgates of allowing
>>> implementations to return error codes here.
>>
>> Yes, but it will never happen on existing drivers/subsystems. The only
>> way it can return a negative number is if the driver passes in P2PDMA
>> pages which can't happen without changes in the driver. We are careful
>> about where P2PDMA pages can get into so we don't have to worry about
>> all the existing driver code out there.
> 
> Sure, that's how things stand immediately after this patch. But then
> someone comes along with the perfectly reasonable argument for returning
> more expressive error information for regular mapping failures as well
> (because sometimes those can be terminal too, as above), we start to get
> divergent behaviour across architectures and random bits of old code
> subtly breaking down the line. *That* is what makes me wary of making a
> fundamental change to a long-standing "nonzero means success" interface...

So then we reject the patches that make that change. Seems like an odd
argument to say that we can't do something that won't cause problems
because someone might use it as an example and do something that will
cause problems. Reject the change that causes the problem.

Logan


Re: [RFC PATCH v2 00/11] Add support to dma_map_sg for P2PDMA

2021-03-12 Thread Logan Gunthorpe



On 2021-03-12 10:46 a.m., Robin Murphy wrote:
> On 2021-03-12 16:18, Logan Gunthorpe wrote:
>>
>>
>> On 2021-03-12 8:51 a.m., Robin Murphy wrote:
>>> On 2021-03-11 23:31, Logan Gunthorpe wrote:
>>>> Hi,
>>>>
>>>> This is a rework of the first half of my RFC for doing P2PDMA in
>>>> userspace
>>>> with O_DIRECT[1].
>>>>
>>>> The largest issue with that series was the gross way of flagging P2PDMA
>>>> SGL segments. This RFC proposes a different approach, (suggested by
>>>> Dan Williams[2]) which uses the third bit in the page_link field of the
>>>> SGL.
>>>>
>>>> This approach is a lot less hacky but comes at the cost of adding a
>>>> CONFIG_64BIT dependency to CONFIG_PCI_P2PDMA and using up the last
>>>> scarce bit in the page_link. For our purposes, a 64BIT restriction is
>>>> acceptable but it's not clear if this is ok for all usecases hoping
>>>> to make use of P2PDMA.
>>>>
>>>> Matthew Wilcox has already suggested (off-list) that this is the wrong
>>>> approach, preferring a new dma mapping operation and an SGL
>>>> replacement. I
>>>> don't disagree that something along those lines would be a better long
>>>> term solution, but it involves overcoming a lot of challenges to get
>>>> there. Creating a new mapping operation still means adding support to
>>>> more
>>>> than 25 dma_map_ops implementations (many of which are on obscure
>>>> architectures) or creating a redundant path to fallback with
>>>> dma_map_sg()
>>>> for every driver that uses the new operation. This RFC is an approach
>>>> that doesn't require overcoming these blocks.
>>>
>>> I don't really follow that argument - you're only adding support to two
>>> implementations with the awkward flag, so why would using a dedicated
>>> operation instead be any different? Whatever callers need to do if
>>> dma_pci_p2pdma_supported() says no, they could equally do if
>>> dma_map_p2p_sg() (or whatever) returns -ENXIO, no?
>>
>> The thing is if the dma_map_sg doesn't support P2PDMA then P2PDMA
>> transactions cannot be done, but regular transactions can still go
>> through as they always did.
>>
>> But replacing dma_map_sg() with dma_map_new() affects all operations,
>> P2PDMA or otherwise. If dma_map_new() isn't supported it can't simply
>> not support P2PDMA; it has to maintain a fallback path to dma_map_sg().
> 
> But AFAICS the equivalent fallback path still has to exist either way.
> My impression so far is that callers would end up looking something like
> this:
> 
> if (dma_pci_p2pdma_supported()) {
>     if (dma_map_sg(...) < 0)
>     //do non-p2p fallback due to p2p failure
> } else {
>     //do non-p2p fallback due to lack of support
> }
> 
> at which point, simply:
> 
> if (dma_map_sg_p2p(...) < 0)
>     //do non-p2p fallback either way
> 
> seems entirely reasonable. What am I missing?

No, that's not how it works. The dma_pci_p2pdma_supported() flag gates
whether P2PDMA pages will be used at a much higher level. Currently with
NVMe, if P2PDMA is supported, it sets the QUEUE_FLAG_PCI_P2PDMA on the
block queue. This is then tested with blk_queue_pci_p2pdma() before any
P2PDMA transaction with the block device is started.
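
Condensed from the existing NVMe code (not verbatim), the gating looks
like this:

	/* at namespace creation, latch the capability once: */
	if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
		blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);

	/* submitters test the queue flag before using P2P memory: */
	if (!blk_queue_pci_p2pdma(ns->queue))
		return -EINVAL;	/* no P2PDMA pages for this queue */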

In the following patches that could add userspace support,
blk_queue_pci_p2pdma() is used to add a flag to GUP which allows P2PDMA
pages into the driver via O_DIRECT.

There is no fallback path on the dma_map_sg() operation if p2pdma is not
supported. dma_map_sg() is always used. The code simply prevents pages
from getting there in the first place.

A new DMA operation would have to be:

if (dma_has_new_operation()) {
 // create input for dma_map_new_op()
 // map with dma_map_new_op()
 // parse output of dma_map_new_op()
} else {
 // create SGL
 dma_map_sg()
 // convert SGL to hardware map
}

And this if statement has nothing to do with whether p2pdma is supported.

> 
> Let's not pretend that overloading an existing API means we can start
> feeding P2P pages into any old subsystem/driver without further changes
> - there already *are* at least some that retry ad infinitum if DMA
> mapping fails (the USB layer springs to mind...) and thus wouldn't
> handle the PCI_P2PDMA_MAP_NOT_SUPPORTED case acceptably.

Yes, nobody is pretending that at all. We are being very careful with
P2PDMA pages and we don't want them to get into any driver that isn't
explicitly written to expect them. With the code in the current kernel
this

Re: [RFC PATCH v2 08/11] iommu/dma: Support PCI P2PDMA pages in dma-iommu map_sg

2021-03-12 Thread Logan Gunthorpe



On 2021-03-12 8:52 a.m., Robin Murphy wrote:
> On 2021-03-11 23:31, Logan Gunthorpe wrote:
>> When a PCI P2PDMA page is seen, set the IOVA length of the segment
>> to zero so that it is not mapped into the IOVA. Then, in finalise_sg(),
>> apply the appropriate bus address to the segment. The IOVA is not
>> created if the scatterlist only consists of P2PDMA pages.
> 
> This misled me at first, but I see the implementation does actually
> appear to accomodate the case of working ACS where P2P *would* still
> need to be mapped at the IOMMU.

Yes, that's correct.
>>   static int __finalise_sg(struct device *dev, struct scatterlist *sg,
>> int nents,
>> -    dma_addr_t dma_addr)
>> +    dma_addr_t dma_addr, unsigned long attrs)
>>   {
>>   struct scatterlist *s, *cur = sg;
>>   unsigned long seg_mask = dma_get_seg_boundary(dev);
>> @@ -864,6 +865,20 @@ static int __finalise_sg(struct device *dev,
>> struct scatterlist *sg, int nents,
>>   sg_dma_address(s) = DMA_MAPPING_ERROR;
>>   sg_dma_len(s) = 0;
>>   +    if (is_pci_p2pdma_page(sg_page(s)) && !s_iova_len) {
>> +    if (i > 0)
>> +    cur = sg_next(cur);
>> +
>> +    sg_dma_address(cur) = sg_phys(s) + s->offset -
> 
> Are you sure about that? ;)

Do you see a bug? I don't follow you...

>> +    pci_p2pdma_bus_offset(sg_page(s));
> 
> Can the bus offset make P2P addresses overlap with regions of mem space
> that we might use for regular IOVA allocation? That would be very bad...

No. IOMMU drivers already disallow all PCI addresses from being used as
IOVA addresses. See, for example, dmar_init_reserved_ranges(). It would
be a huge problem for a whole lot of other reasons if it didn't.


>> @@ -960,11 +975,12 @@ static int iommu_dma_map_sg(struct device *dev,
>> struct scatterlist *sg,
>>   struct iommu_dma_cookie *cookie = domain->iova_cookie;
>>   struct iova_domain *iovad = &cookie->iovad;
>>   struct scatterlist *s, *prev = NULL;
>> +    struct dev_pagemap *pgmap = NULL;
>>   int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs);
>>   dma_addr_t iova;
>>   size_t iova_len = 0;
>>   unsigned long mask = dma_get_seg_boundary(dev);
>> -    int i;
>> +    int i, map = -1, ret = 0;
>>     if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
>>   iommu_deferred_attach(dev, domain))
>> @@ -993,6 +1009,23 @@ static int iommu_dma_map_sg(struct device *dev,
>> struct scatterlist *sg,
>>   s_length = iova_align(iovad, s_length + s_iova_off);
>>   s->length = s_length;
>>   +    if (is_pci_p2pdma_page(sg_page(s))) {
>> +    if (sg_page(s)->pgmap != pgmap) {
>> +    pgmap = sg_page(s)->pgmap;
>> +    map = pci_p2pdma_dma_map_type(dev, pgmap);
>> +    }
>> +
>> +    if (map < 0) {
> 
> It rather feels like it should be the job of whoever creates the list in
> the first place not to put unusable pages in it, especially since the
> p2pdma_map_type looks to be a fairly coarse-grained and static thing.
> The DMA API isn't responsible for validating normal memory pages, so
> what makes P2P special?

Yes, that would be ideal, but there are some difficulties there. For the
driver to check the pages, it would need to loop through the entire SG
one more time on every transaction, regardless of whether there are
P2PDMA pages, or not. So that will have a performance impact even when
the feature isn't being used. I don't think that'll be acceptable for
many drivers.
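
That is, every driver would need an extra pass along these lines on
every submission (a sketch; p2pdma_page_mappable() is a hypothetical
helper for whatever the check would be):

	for_each_sg(sgl, sg, nents, i) {
		/* walks the whole SGL even when no P2P pages are present */
		if (is_pci_p2pdma_page(sg_page(sg)) &&
		    !p2pdma_page_mappable(dev, sg_page(sg)))
			return -EREMOTEIO;
	}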

The other possibility is for GUP to do it when it gets the pages from
userspace. But GUP doesn't have all the information to do this at the
moment. We'd have to pass the struct device that will eventually map the
pages through all the nested functions in the GUP to do that test at
that time. This might not be a bad option (that I half looked into), but
I'm not sure how acceptable it would be to the GUP developers.

But even if we do verify the pages ahead of time, we still need the same
infrastructure in dma_map_sg(); it could only now be a BUG if the driver
sent invalid pages instead of an error return.

>> +    ret = -EREMOTEIO;
>> +    goto out_restore_sg;
>> +    }
>> +
>> +    if (map) {
>> +    s->length = 0;
> 
> I'm not really thrilled about the idea of passing zero-length segments
> to iommu_map_sg(). Yes, it happens to trick the concatenation logic in
> the current implementation into doing what you want, but it feels fragile.

We're not passing

Re: [RFC PATCH v2 06/11] dma-direct: Support PCI P2PDMA pages in dma-direct map_sg

2021-03-12 Thread Logan Gunthorpe



On 2021-03-12 8:52 a.m., Robin Murphy wrote:
>> +
>>   sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
>>   sg->offset, sg->length, dir, attrs);
>>   if (sg->dma_address == DMA_MAPPING_ERROR)
>> @@ -411,7 +440,7 @@ int dma_direct_map_sg(struct device *dev, struct
>> scatterlist *sgl, int nents,
>>     out_unmap:
>>   dma_direct_unmap_sg(dev, sgl, i, dir, attrs |
>> DMA_ATTR_SKIP_CPU_SYNC);
>> -    return 0;
>> +    return ret;
>>   }
>>     dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t
>> paddr,
>> diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
>> index b6a633679933..adc1a83950be 100644
>> --- a/kernel/dma/mapping.c
>> +++ b/kernel/dma/mapping.c
>> @@ -178,8 +178,15 @@ void dma_unmap_page_attrs(struct device *dev,
>> dma_addr_t addr, size_t size,
>>   EXPORT_SYMBOL(dma_unmap_page_attrs);
>>     /*
>> - * dma_maps_sg_attrs returns 0 on error and > 0 on success.
>> - * It should never return a value < 0.
>> + * dma_maps_sg_attrs returns 0 on any resource error and > 0 on success.
>> + *
>> + * If 0 is returned, the mapping can be retried and will succeed once
>> + * sufficient resources are available.
> 
> That's not a guarantee we can uphold. Retrying forever in the vain hope
> that a device might evolve some extra address bits, or a bounce buffer
> might magically grow big enough for a gigantic mapping, isn't
> necessarily the best idea.

Perhaps this is just poorly worded. Returning 0 is the normal case and
nothing has changed there. The block layer, for example, will retry if
zero is returned as this only happens if it failed to allocate resources
for the mapping. The reason we have to return -1 is to tell the block
layer not to retry these requests as they will never succeed in the future.
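
So caller-side handling becomes roughly (a sketch of the intended
convention, using the BLK_STS_P2PDMA code added later in this series):

	int rc = dma_map_sg_attrs(dev, sg, nents, dir, attrs);

	if (rc == 0)
		return BLK_STS_RESOURCE;	/* resource shortage: retry later */
	if (rc < 0)
		return BLK_STS_P2PDMA;		/* permanent: fail the IO */
	/* rc > 0: number of mapped entries, carry on */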

>> + *
>> + * If there are P2PDMA pages in the scatterlist then this function may
>> + * return -EREMOTEIO to indicate that the pages are not mappable by the
>> + * device. In this case, an error should be returned for the IO as it
>> + * will never be successfully retried.
>>    */
>>   int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int
>> nents,
>>   enum dma_data_direction dir, unsigned long attrs)
>> @@ -197,7 +204,7 @@ int dma_map_sg_attrs(struct device *dev, struct
>> scatterlist *sg, int nents,
>>   ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
>>   else
>>   ents = ops->map_sg(dev, sg, nents, dir, attrs);
>> -    BUG_ON(ents < 0);
>> +
> 
> This scares me - I hesitate to imagine the amount of driver/subsystem
> code out there that will see nonzero and merrily set off iterating a
> negative number of segments, if we open the floodgates of allowing
> implementations to return error codes here.

Yes, but it will never happen on existing drivers/subsystems. The only
way it can return a negative number is if the driver passes in P2PDMA
pages which can't happen without changes in the driver. We are careful
about where P2PDMA pages can get into so we don't have to worry about
all the existing driver code out there.

Logan


Re: [RFC PATCH v2 00/11] Add support to dma_map_sg for P2PDMA

2021-03-12 Thread Logan Gunthorpe



On 2021-03-12 8:51 a.m., Robin Murphy wrote:
> On 2021-03-11 23:31, Logan Gunthorpe wrote:
>> Hi,
>>
>> This is a rework of the first half of my RFC for doing P2PDMA in
>> userspace
>> with O_DIRECT[1].
>>
>> The largest issue with that series was the gross way of flagging P2PDMA
>> SGL segments. This RFC proposes a different approach, (suggested by
>> Dan Williams[2]) which uses the third bit in the page_link field of the
>> SGL.
>>
>> This approach is a lot less hacky but comes at the cost of adding a
>> CONFIG_64BIT dependency to CONFIG_PCI_P2PDMA and using up the last
>> scarce bit in the page_link. For our purposes, a 64BIT restriction is
>> acceptable but it's not clear if this is ok for all usecases hoping
>> to make use of P2PDMA.
>>
>> Matthew Wilcox has already suggested (off-list) that this is the wrong
>> approach, preferring a new dma mapping operation and an SGL
>> replacement. I
>> don't disagree that something along those lines would be a better long
>> term solution, but it involves overcoming a lot of challenges to get
>> there. Creating a new mapping operation still means adding support to
>> more
>> than 25 dma_map_ops implementations (many of which are on obscure
>> architectures) or creating a redundant path to fallback with dma_map_sg()
>> for every driver that uses the new operation. This RFC is an approach
>> that doesn't require overcoming these blocks.
> 
> I don't really follow that argument - you're only adding support to two
> implementations with the awkward flag, so why would using a dedicated
> operation instead be any different? Whatever callers need to do if
> dma_pci_p2pdma_supported() says no, they could equally do if
> dma_map_p2p_sg() (or whatever) returns -ENXIO, no?

The thing is if the dma_map_sg doesn't support P2PDMA then P2PDMA
transactions cannot be done, but regular transactions can still go
through as they always did.

But replacing dma_map_sg() with dma_map_new() affects all operations,
P2PDMA or otherwise. If dma_map_new() isn't supported it can't simply
not support P2PDMA; it has to maintain a fallback path to dma_map_sg().
Given that the inputs and outputs for dma_map_new() will be completely
different data structures, this will be quite a lot of similar paths
required in the driver. (i.e. mapping a bvec to the input struct and the
output struct to hardware requirements.) If a bug crops up in the old
dma_map_sg(), developers might not notice it for some time, seeing as it
won't be used on the most popular architectures.

Logan


Re: [RFC PATCH v2 11/11] nvme-pci: Convert to using dma_map_sg for p2pdma pages

2021-03-11 Thread Logan Gunthorpe



On 2021-03-11 4:59 p.m., Jason Gunthorpe wrote:
> On Thu, Mar 11, 2021 at 04:31:41PM -0700, Logan Gunthorpe wrote:
>> Convert to using dma_[un]map_sg() for PCI p2pdma pages.
>>
>> This should be equivalent, though support will be somewhat less
>> (only dma-direct and dma-iommu are currently supported).
>>
>> Signed-off-by: Logan Gunthorpe 
>>  drivers/nvme/host/pci.c | 27 +++
>>  1 file changed, 7 insertions(+), 20 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index 7d40c6a9e58e..89ca5acf7a62 100644
>> +++ b/drivers/nvme/host/pci.c
>> @@ -577,17 +577,6 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct 
>> request *req)
>>  
>>  }
>>  
>> -static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
>> -{
>> -struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>> -
>> -if (is_pci_p2pdma_page(sg_page(iod->sg)))
>> -pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
>> -rq_dma_dir(req));
>> -else
>> -dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
>> -}
> 
> Can the two other places with this code pattern be changed too?

Yes, if this goes forward, I imagine completely dropping
pci_p2pdma_unmap_sg().

Logan


[RFC PATCH v2 03/11] PCI/P2PDMA: Attempt to set map_type if it has not been set

2021-03-11 Thread Logan Gunthorpe
Attempt to find the mapping type for P2PDMA pages on the first
DMA map attempt if it has not been done ahead of time.

Previously, the mapping type was expected to be calculated ahead of
time, but if pages are to come from userspace then there's no
way to ensure the path was checked ahead of time.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 2135fe69bb07..7f6836732bce 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -819,11 +819,18 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
 static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct pci_dev *provider,
struct pci_dev *client)
 {
+   enum pci_p2pdma_map_type ret;
+
if (!provider->p2pdma)
return PCI_P2PDMA_MAP_NOT_SUPPORTED;
 
-   return xa_to_value(xa_load(&provider->p2pdma->map_types,
-  map_types_idx(client)));
+   ret = xa_to_value(xa_load(&provider->p2pdma->map_types,
+ map_types_idx(client)));
+   if (ret != PCI_P2PDMA_MAP_UNKNOWN)
+   return ret;
+
+   return upstream_bridge_distance_warn(provider, client, NULL,
+GFP_ATOMIC);
 }
 
 static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap,
@@ -871,7 +878,6 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg,
case PCI_P2PDMA_MAP_BUS_ADDR:
return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents);
default:
-   WARN_ON_ONCE(1);
return 0;
}
 }
-- 
2.20.1



[RFC PATCH v2 11/11] nvme-pci: Convert to using dma_map_sg for p2pdma pages

2021-03-11 Thread Logan Gunthorpe
Convert to using dma_[un]map_sg() for PCI p2pdma pages.

This should be equivalent, though support will be somewhat less
(only dma-direct and dma-iommu are currently supported).

Signed-off-by: Logan Gunthorpe 
---
 drivers/nvme/host/pci.c | 27 +++
 1 file changed, 7 insertions(+), 20 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 7d40c6a9e58e..89ca5acf7a62 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -577,17 +577,6 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct 
request *req)
 
 }
 
-static void nvme_unmap_sg(struct nvme_dev *dev, struct request *req)
-{
-   struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
-   if (is_pci_p2pdma_page(sg_page(iod->sg)))
-   pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
-   rq_dma_dir(req));
-   else
-   dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
-}
-
 static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
 {
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -600,7 +589,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct 
request *req)
 
WARN_ON_ONCE(!iod->nents);
 
-   nvme_unmap_sg(dev, req);
+   dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
if (iod->npages == 0)
dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
  iod->first_dma);
@@ -868,13 +857,11 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, 
struct request *req,
if (!iod->nents)
goto out_free_sg;
 
-   if (is_pci_p2pdma_page(sg_page(iod->sg)))
-   nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
-   iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
-   else
-   nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
-rq_dma_dir(req), DMA_ATTR_NO_WARN);
-   if (!nr_mapped)
+   nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
+rq_dma_dir(req), DMA_ATTR_NO_WARN);
+   if (nr_mapped < 0)
+   ret = BLK_STS_P2PDMA;
+   if (nr_mapped <= 0)
goto out_free_sg;
 
iod->use_sgl = nvme_pci_use_sgls(dev, req);
@@ -887,7 +874,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, 
struct request *req,
return BLK_STS_OK;
 
 out_unmap_sg:
-   nvme_unmap_sg(dev, req);
+   dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
 out_free_sg:
mempool_free(iod->sg, dev->iod_mempool);
return ret;
-- 
2.20.1



[RFC PATCH v2 01/11] PCI/P2PDMA: Pass gfp_mask flags to upstream_bridge_distance_warn()

2021-03-11 Thread Logan Gunthorpe
In order to call this function from a dma_map function, it must not sleep.
The only reason it does sleep so to allocate the seqbuf to print
which devices are within the ACS path.

Switch the kmalloc call to use a passed in gfp_mask  and don't print that
message if the buffer fails to be allocated.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 196382630363..bd89437faf06 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -267,7 +267,7 @@ static int pci_bridge_has_acs_redir(struct pci_dev *pdev)
 
 static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev)
 {
-   if (!buf)
+   if (!buf || !buf->buffer)
return;
 
seq_buf_printf(buf, "%s;", pci_name(pdev));
@@ -495,25 +495,26 @@ upstream_bridge_distance(struct pci_dev *provider, struct 
pci_dev *client,
 
 static enum pci_p2pdma_map_type
 upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client,
- int *dist)
+ int *dist, gfp_t gfp_mask)
 {
struct seq_buf acs_list;
bool acs_redirects;
int ret;
 
-   seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
-   if (!acs_list.buffer)
-   return -ENOMEM;
+   seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, gfp_mask), PAGE_SIZE);

	ret = upstream_bridge_distance(provider, client, dist, &acs_redirects,
   &acs_list);
if (acs_redirects) {
pci_warn(client, "ACS redirect is set between the client and 
provider (%s)\n",
 pci_name(provider));
-   /* Drop final semicolon */
-   acs_list.buffer[acs_list.len-1] = 0;
-   pci_warn(client, "to disable ACS redirect for this path, add 
the kernel parameter: pci=disable_acs_redir=%s\n",
-acs_list.buffer);
+
+   if (acs_list.buffer) {
+   /* Drop final semicolon */
+   acs_list.buffer[acs_list.len - 1] = 0;
+   pci_warn(client, "to disable ACS redirect for this 
path, add the kernel parameter: pci=disable_acs_redir=%s\n",
+acs_list.buffer);
+   }
}
 
if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED) {
@@ -566,7 +567,7 @@ int pci_p2pdma_distance_many(struct pci_dev *provider, 
struct device **clients,
 
if (verbose)
ret = upstream_bridge_distance_warn(provider,
-   pci_client, &distance);
+   pci_client, &distance, GFP_KERNEL);
else
ret = upstream_bridge_distance(provider, pci_client,
   &distance, NULL, NULL);
-- 
2.20.1



[RFC PATCH v2 06/11] dma-direct: Support PCI P2PDMA pages in dma-direct map_sg

2021-03-11 Thread Logan Gunthorpe
Add PCI P2PDMA support for dma_direct_map_sg() so that it can map
PCI P2PDMA pages directly without a hack in the callers. This allows
for heterogeneous SGLs that contain both P2PDMA and regular pages.

SGL segments that contain PCI bus addresses are marked with
sg_mark_pci_p2pdma() and are ignored when unmapped.

Signed-off-by: Logan Gunthorpe 
---
 kernel/dma/direct.c  | 35 ---
 kernel/dma/mapping.c | 13 ++---
 2 files changed, 42 insertions(+), 6 deletions(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 002268262c9a..f326d32062dd 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "direct.h"
 
 /*
@@ -387,19 +388,47 @@ void dma_direct_unmap_sg(struct device *dev, struct 
scatterlist *sgl,
struct scatterlist *sg;
int i;
 
-   for_each_sg(sgl, sg, nents, i)
+   for_each_sg(sgl, sg, nents, i) {
+   if (sg_is_pci_p2pdma(sg))
+   continue;
+
dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
 attrs);
+   }
 }
 #endif
 
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
 {
-   int i;
+   struct dev_pagemap *pgmap = NULL;
+   int i, map = -1, ret = 0;
struct scatterlist *sg;
+   u64 bus_off;
 
for_each_sg(sgl, sg, nents, i) {
+   if (is_pci_p2pdma_page(sg_page(sg))) {
+   if (sg_page(sg)->pgmap != pgmap) {
+   pgmap = sg_page(sg)->pgmap;
+   map = pci_p2pdma_dma_map_type(dev, pgmap);
+   bus_off = pci_p2pdma_bus_offset(sg_page(sg));
+   }
+
+   if (map < 0) {
+   sg->dma_address = DMA_MAPPING_ERROR;
+   ret = -EREMOTEIO;
+   goto out_unmap;
+   }
+
+   if (map) {
+   sg->dma_address = sg_phys(sg) + sg->offset -
+   bus_off;
+   sg_dma_len(sg) = sg->length;
+   sg_mark_pci_p2pdma(sg);
+   continue;
+   }
+   }
+
sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
sg->offset, sg->length, dir, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR)
@@ -411,7 +440,7 @@ int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl, int nents,
 
 out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
-   return 0;
+   return ret;
 }
 
 dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b6a633679933..adc1a83950be 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -178,8 +178,15 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t 
addr, size_t size,
 EXPORT_SYMBOL(dma_unmap_page_attrs);
 
 /*
- * dma_maps_sg_attrs returns 0 on error and > 0 on success.
- * It should never return a value < 0.
+ * dma_maps_sg_attrs returns 0 on any resource error and > 0 on success.
+ *
+ * If 0 is returned, the mapping can be retried and will succeed once
+ * sufficient resources are available.
+ *
+ * If there are P2PDMA pages in the scatterlist then this function may
+ * return -EREMOTEIO to indicate that the pages are not mappable by the
+ * device. In this case, an error should be returned for the IO as it
+ * will never be successfully retried.
  */
 int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, unsigned long attrs)
@@ -197,7 +204,7 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist 
*sg, int nents,
ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
-   BUG_ON(ents < 0);
+
debug_dma_map_sg(dev, sg, nents, ents, dir);
 
return ents;
-- 
2.20.1



[RFC PATCH v2 00/11] Add support to dma_map_sg for P2PDMA

2021-03-11 Thread Logan Gunthorpe
Hi,

This is a rework of the first half of my RFC for doing P2PDMA in userspace
with O_DIRECT[1].

The largest issue with that series was the gross way of flagging P2PDMA
SGL segments. This RFC proposes a different approach, (suggested by
Dan Williams[2]) which uses the third bit in the page_link field of the
SGL.

This approach is a lot less hacky but comes at the cost of adding a
CONFIG_64BIT dependency to CONFIG_PCI_P2PDMA and using up the last
scarce bit in the page_link. For our purposes, a 64BIT restriction is
acceptable but it's not clear if this is ok for all usecases hoping
to make use of P2PDMA.

Matthew Wilcox has already suggested (off-list) that this is the wrong
approach, preferring a new dma mapping operation and an SGL replacement. I
don't disagree that something along those lines would be a better long
term solution, but it involves overcoming a lot of challenges to get
there. Creating a new mapping operation still means adding support to more
than 25 dma_map_ops implementations (many of which are on obscure
architectures) or creating a redundant path to fallback with dma_map_sg()
for every driver that uses the new operation. This RFC is an approach
that doesn't require overcoming these blocks.

Any alternative ideas or feedback is welcome.

These patches are based on v5.12-rc2 and a git branch is available here:

  https://github.com/sbates130272/linux-p2pmem/  p2pdma_dma_map_ops_rfc

A branch with the patches from the previous RFC that add userspace
O_DIRECT support is available at the same URL with the name
"p2pdma_dma_map_ops_rfc+user" (however, none of the issues with those
extra patches from the feedback of the last posting have been fixed).

Thanks,

Logan

[1] 
https://lore.kernel.org/linux-block/20201106170036.18713-1-log...@deltatee.com/
[2] 
https://lore.kernel.org/linux-block/capcyv4ifgcrdotut8qr7pmfhmecghqgvre9g0rorgczcgve...@mail.gmail.com/

--

Logan Gunthorpe (11):
  PCI/P2PDMA: Pass gfp_mask flags to upstream_bridge_distance_warn()
  PCI/P2PDMA: Avoid pci_get_slot() which sleeps
  PCI/P2PDMA: Attempt to set map_type if it has not been set
  PCI/P2PDMA: Introduce pci_p2pdma_should_map_bus() and
pci_p2pdma_bus_offset()
  lib/scatterlist: Add flag for indicating P2PDMA segments in an SGL
  dma-direct: Support PCI P2PDMA pages in dma-direct map_sg
  dma-mapping: Add flags to dma_map_ops to indicate PCI P2PDMA support
  iommu/dma: Support PCI P2PDMA pages in dma-iommu map_sg
  block: Add BLK_STS_P2PDMA
  nvme-pci: Check DMA ops when indicating support for PCI P2PDMA
  nvme-pci: Convert to using dma_map_sg for p2pdma pages

 block/blk-core.c|  2 +
 drivers/iommu/dma-iommu.c   | 63 +-
 drivers/nvme/host/core.c|  3 +-
 drivers/nvme/host/nvme.h|  2 +-
 drivers/nvme/host/pci.c | 38 +++-
 drivers/pci/Kconfig |  2 +-
 drivers/pci/p2pdma.c| 89 +++--
 include/linux/blk_types.h   |  7 +++
 include/linux/dma-map-ops.h |  3 ++
 include/linux/dma-mapping.h |  5 +++
 include/linux/pci-p2pdma.h  | 11 +
 include/linux/scatterlist.h | 49 ++--
 kernel/dma/direct.c | 35 +--
 kernel/dma/mapping.c| 21 +++--
 14 files changed, 271 insertions(+), 59 deletions(-)


base-commit: a38fd8748464831584a19438cbb3082b5a2dab15
--
2.20.1


[RFC PATCH v2 09/11] block: Add BLK_STS_P2PDMA

2021-03-11 Thread Logan Gunthorpe
Create a specific error code for when P2PDMA pages are passed to a block
devices that cannot map them (due to no IOMMU support or ACS protections).

This makes request errors in these cases more informative as to what
caused the error.

Signed-off-by: Logan Gunthorpe 
---
 block/blk-core.c  | 2 ++
 include/linux/blk_types.h | 7 +++
 2 files changed, 9 insertions(+)

diff --git a/block/blk-core.c b/block/blk-core.c
index fc60ff208497..2cc75b56ac43 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -192,6 +192,8 @@ static const struct {
[BLK_STS_ZONE_OPEN_RESOURCE]= { -ETOOMANYREFS, "open zones 
exceeded" },
[BLK_STS_ZONE_ACTIVE_RESOURCE]  = { -EOVERFLOW, "active zones exceeded" 
},
 
+   [BLK_STS_P2PDMA] = { -EREMOTEIO, "P2PDMA to invalid device" },
+
/* everything else not covered above: */
[BLK_STS_IOERR] = { -EIO,   "I/O" },
 };
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index db026b6ec15a..728a0898cb34 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -142,6 +142,13 @@ typedef u8 __bitwise blk_status_t;
  */
 #define BLK_STS_ZONE_ACTIVE_RESOURCE   ((__force blk_status_t)16)
 
+/*
+ * BLK_STS_P2PDMA is returned from the driver if P2PDMA memory fails to be
+ * mapped to the target device. This is a permanent error and the request
+ * should not be retried.
+ */
+#define BLK_STS_P2PDMA ((__force blk_status_t)17)
+
 /**
  * blk_path_error - returns true if error may be path related
  * @error: status the request was completed with
-- 
2.20.1



[RFC PATCH v2 05/11] lib/scatterlist: Add flag for indicating P2PDMA segments in an SGL

2021-03-11 Thread Logan Gunthorpe
Make use of the third free LSB in scatterlist's page_link on 64bit systems.

The extra bit will be used by dma_[un]map_sg() to determine when a
given SGL segment's dma_address points to a PCI bus address.
dma_unmap_sg() will need to perform different cleanup when this is the
case.

Using this bit requires adding an additional dependency on CONFIG_64BIT to
CONFIG_PCI_P2PDMA. This should be acceptable as the majority of P2PDMA
use cases are restricted to newer root complexes and roughly require the
extra address space for memory BARs used in the transactions.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/Kconfig |  2 +-
 include/linux/scatterlist.h | 49 ++---
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 0c473d75e625..90b4bddb3300 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -163,7 +163,7 @@ config PCI_PASID
 
 config PCI_P2PDMA
bool "PCI peer-to-peer transfer support"
-   depends on ZONE_DEVICE
+   depends on ZONE_DEVICE && 64BIT
select GENERIC_ALLOCATOR
help
  Enableѕ drivers to do PCI peer-to-peer transactions to and from
diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 6f70572b2938..5525d3ebf36f 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -58,6 +58,21 @@ struct sg_table {
 #define SG_CHAIN   0x01UL
 #define SG_END 0x02UL
 
+/*
+ * bit 2 is the third free bit in the page_link on 64bit systems which
+ * is used by dma_unmap_sg() to determine if the dma_address is a PCI
+ * bus address when doing P2PDMA.
+ * Note: CONFIG_PCI_P2PDMA depends on CONFIG_64BIT because of this.
+ */
+
+#ifdef CONFIG_PCI_P2PDMA
+#define SG_PCI_P2PDMA  0x04UL
+#else
+#define SG_PCI_P2PDMA  0x00UL
+#endif
+
+#define SG_PAGE_LINK_MASK (SG_CHAIN | SG_END | SG_PCI_P2PDMA)
+
 /*
  * We overload the LSB of the page pointer to indicate whether it's
  * a valid sg entry, or whether it points to the start of a new scatterlist.
@@ -65,8 +80,9 @@ struct sg_table {
  */
 #define sg_is_chain(sg)((sg)->page_link & SG_CHAIN)
 #define sg_is_last(sg) ((sg)->page_link & SG_END)
+#define sg_is_pci_p2pdma(sg)   ((sg)->page_link & SG_PCI_P2PDMA)
 #define sg_chain_ptr(sg)   \
-   ((struct scatterlist *) ((sg)->page_link & ~(SG_CHAIN | SG_END)))
+   ((struct scatterlist *) ((sg)->page_link & ~SG_PAGE_LINK_MASK))
 
 /**
  * sg_assign_page - Assign a given page to an SG entry
@@ -80,13 +96,13 @@ struct sg_table {
  **/
 static inline void sg_assign_page(struct scatterlist *sg, struct page *page)
 {
-   unsigned long page_link = sg->page_link & (SG_CHAIN | SG_END);
+   unsigned long page_link = sg->page_link & SG_PAGE_LINK_MASK;
 
/*
 * In order for the low bit stealing approach to work, pages
 * must be aligned at a 32-bit boundary as a minimum.
 */
-   BUG_ON((unsigned long) page & (SG_CHAIN | SG_END));
+   BUG_ON((unsigned long) page & SG_PAGE_LINK_MASK);
 #ifdef CONFIG_DEBUG_SG
BUG_ON(sg_is_chain(sg));
 #endif
@@ -120,7 +136,7 @@ static inline struct page *sg_page(struct scatterlist *sg)
 #ifdef CONFIG_DEBUG_SG
BUG_ON(sg_is_chain(sg));
 #endif
-   return (struct page *)((sg)->page_link & ~(SG_CHAIN | SG_END));
+   return (struct page *)((sg)->page_link & ~SG_PAGE_LINK_MASK);
 }
 
 /**
@@ -222,6 +238,31 @@ static inline void sg_unmark_end(struct scatterlist *sg)
sg->page_link &= ~SG_END;
 }
 
+/**
+ * sg_mark_pci_p2pdma - Mark the scatterlist entry for PCI p2pdma
+ * @sg: SG entry
+ *
+ * Description:
+ *   Marks the passed in sg entry to indicate that the dma_address is
+ *   a PCI bus address.
+ **/
+static inline void sg_mark_pci_p2pdma(struct scatterlist *sg)
+{
+   sg->page_link |= SG_PCI_P2PDMA;
+}
+
+/**
+ * sg_unmark_pci_p2pdma - Unmark the scatterlist entry for PCI p2pdma
+ * @sg: SG entry
+ *
+ * Description:
+ *   Clears the PCI P2PDMA mark
+ **/
+static inline void sg_unmark_pci_p2pdma(struct scatterlist *sg)
+{
+   sg->page_link &= ~SG_PCI_P2PDMA;
+}
+
 /**
  * sg_phys - Return physical address of an sg entry
  * @sg: SG entry
-- 
2.20.1



[RFC PATCH v2 04/11] PCI/P2PDMA: Introduce pci_p2pdma_should_map_bus() and pci_p2pdma_bus_offset()

2021-03-11 Thread Logan Gunthorpe
Introduce pci_p2pdma_should_map_bus() which is meant to be called by
DMA map functions to determine how to map a given p2pdma page.

pci_p2pdma_bus_offset() is also added to allow callers to get the bus
offset if they need to map the bus address.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c   | 50 ++
 include/linux/pci-p2pdma.h | 11 +
 2 files changed, 61 insertions(+)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 7f6836732bce..66d16b7eb668 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -912,6 +912,56 @@ void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct 
scatterlist *sg,
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
 
+/**
+ * pci_p2pdma_bus_offset - returns the bus offset for a given page
+ * @page: page to get the offset for
+ *
+ * Must be passed a PCI p2pdma page.
+ */
+u64 pci_p2pdma_bus_offset(struct page *page)
+{
+   struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page->pgmap);
+
+   WARN_ON(!is_pci_p2pdma_page(page));
+
+   return p2p_pgmap->bus_offset;
+}
+EXPORT_SYMBOL_GPL(pci_p2pdma_bus_offset);
+
+/**
+ * pci_p2pdma_dma_map_type - determine if a DMA mapping should use the
+ * bus address, be mapped normally or fail
+ * @dev: device doing the DMA request
+ * @pgmap: dev_pagemap structure for the mapping
+ *
+ * Returns:
+ *1 - if the page should be mapped with a bus address,
+ *0 - if the page should be mapped normally through an IOMMU mapping or
+ *physical address; or
+ *   -1 - if the device should not map the pages and an error should be
+ *returned
+ */
+int pci_p2pdma_dma_map_type(struct device *dev, struct dev_pagemap *pgmap)
+{
+   struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(pgmap);
+   struct pci_dev *client;
+
+   if (!dev_is_pci(dev))
+   return -1;
+
+   client = to_pci_dev(dev);
+
+   switch (pci_p2pdma_map_type(p2p_pgmap->provider, client)) {
+   case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+   return 0;
+   case PCI_P2PDMA_MAP_BUS_ADDR:
+   return 1;
+   default:
+   return -1;
+   }
+}
+EXPORT_SYMBOL_GPL(pci_p2pdma_dma_map_type);
+
 /**
  * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store
  * to enable p2pdma
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 8318a97c9c61..3d55ad19f54c 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -34,6 +34,8 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs);
 void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs);
+u64 pci_p2pdma_bus_offset(struct page *page);
+int pci_p2pdma_dma_map_type(struct device *dev, struct dev_pagemap *pgmap);
 int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
bool *use_p2pdma);
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
@@ -83,6 +85,15 @@ static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev,
 static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
 {
 }
+static inline u64 pci_p2pdma_bus_offset(struct page *page)
+{
+   return -1;
+}
+static inline int pci_p2pdma_dma_map_type(struct device *dev,
+ struct dev_pagemap *pgmap)
+{
+   return -1;
+}
 static inline int pci_p2pdma_map_sg_attrs(struct device *dev,
struct scatterlist *sg, int nents, enum dma_data_direction dir,
unsigned long attrs)
-- 
2.20.1



[RFC PATCH v2 02/11] PCI/P2PDMA: Avoid pci_get_slot() which sleeps

2021-03-11 Thread Logan Gunthorpe
In order to use upstream_bridge_distance_warn() from a dma_map function,
it must not sleep. However, pci_get_slot() takes the pci_bus_sem so it
might sleep.

In order to avoid this, try to get the host bridge's device from
bus->self, and if that is not set just get the first element in the
list. It should be impossible for the host bridge's device to go away
while references are held on child devices, so the first element
should not change and this should be safe.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index bd89437faf06..2135fe69bb07 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -311,11 +311,15 @@ static const struct pci_p2pdma_whitelist_entry {
 static bool __host_bridge_whitelist(struct pci_host_bridge *host,
bool same_host_bridge)
 {
-   struct pci_dev *root = pci_get_slot(host->bus, PCI_DEVFN(0, 0));
const struct pci_p2pdma_whitelist_entry *entry;
+   struct pci_dev *root = host->bus->self;
unsigned short vendor, device;
 
if (!root)
+   root = list_first_entry_or_null(&host->bus->devices,
+   struct pci_dev, bus_list);
+
+   if (!root || root->devfn)
return false;
 
vendor = root->vendor;
-- 
2.20.1



[RFC PATCH v2 07/11] dma-mapping: Add flags to dma_map_ops to indicate PCI P2PDMA support

2021-03-11 Thread Logan Gunthorpe
Add a flags member to the dma_map_ops structure with one flag to
indicate support for PCI P2PDMA.

Also, add a helper to check if a device supports PCI P2PDMA.

Signed-off-by: Logan Gunthorpe 
---
 include/linux/dma-map-ops.h | 3 +++
 include/linux/dma-mapping.h | 5 +
 kernel/dma/mapping.c| 8 
 3 files changed, 16 insertions(+)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 51872e736e7b..481892822104 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -12,6 +12,9 @@
 struct cma;
 
 struct dma_map_ops {
+   unsigned int flags;
+#define DMA_F_PCI_P2PDMA_SUPPORTED (1 << 0)
+
void *(*alloc)(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp,
unsigned long attrs);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 2a984cb4d1e0..a6bced004de8 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -138,6 +138,7 @@ int dma_mmap_attrs(struct device *dev, struct 
vm_area_struct *vma,
unsigned long attrs);
 bool dma_can_mmap(struct device *dev);
 int dma_supported(struct device *dev, u64 mask);
+int dma_pci_p2pdma_supported(struct device *dev);
 int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
@@ -233,6 +234,10 @@ static inline int dma_supported(struct device *dev, u64 
mask)
 {
return 0;
 }
+static inline int dma_pci_p2pdma_supported(struct device *dev)
+{
+   return 0;
+}
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
return -EIO;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index adc1a83950be..cc1b97deb74d 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -540,6 +540,14 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
+int dma_pci_p2pdma_supported(struct device *dev)
+{
+   const struct dma_map_ops *ops = get_dma_ops(dev);
+
+   return !ops || ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
+}
+EXPORT_SYMBOL(dma_pci_p2pdma_supported);
+
 #ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
 void arch_dma_set_mask(struct device *dev, u64 mask);
 #else
-- 
2.20.1



[RFC PATCH v2 10/11] nvme-pci: Check DMA ops when indicating support for PCI P2PDMA

2021-03-11 Thread Logan Gunthorpe
Introduce a supports_pci_p2pdma() operation in nvme_ctrl_ops to
replace the fixed NVME_F_PCI_P2PDMA flag such that the dma_map_ops
flags can be checked for PCI P2PDMA support.

Signed-off-by: Logan Gunthorpe 
---
 drivers/nvme/host/core.c |  3 ++-
 drivers/nvme/host/nvme.h |  2 +-
 drivers/nvme/host/pci.c  | 11 +--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e68a8c4ac5a6..501b4a979776 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3928,7 +3928,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, 
unsigned nsid,
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
 
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
-   if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
+   if (ctrl->ops->supports_pci_p2pdma &&
+   ctrl->ops->supports_pci_p2pdma(ctrl))
blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
 
ns->queue->queuedata = ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 07b34175c6ce..9c04df982d2c 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -473,7 +473,6 @@ struct nvme_ctrl_ops {
unsigned int flags;
 #define NVME_F_FABRICS (1 << 0)
 #define NVME_F_METADATA_SUPPORTED  (1 << 1)
-#define NVME_F_PCI_P2PDMA  (1 << 2)
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
@@ -481,6 +480,7 @@ struct nvme_ctrl_ops {
void (*submit_async_event)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
+   bool (*supports_pci_p2pdma)(struct nvme_ctrl *ctrl);
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 17ab3320d28b..7d40c6a9e58e 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2759,17 +2759,24 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, 
char *buf, int size)
	return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
 }
 
+static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl)
+{
+   struct nvme_dev *dev = to_nvme_dev(ctrl);
+
+   return dma_pci_p2pdma_supported(dev->dev);
+}
+
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name   = "pcie",
.module = THIS_MODULE,
-   .flags  = NVME_F_METADATA_SUPPORTED |
- NVME_F_PCI_P2PDMA,
+   .flags  = NVME_F_METADATA_SUPPORTED,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32= nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
.free_ctrl  = nvme_pci_free_ctrl,
.submit_async_event = nvme_pci_submit_async_event,
.get_address= nvme_pci_get_address,
+   .supports_pci_p2pdma= nvme_pci_supports_pci_p2pdma,
 };
 
 static int nvme_dev_map(struct nvme_dev *dev)
-- 
2.20.1



[RFC PATCH v2 08/11] iommu/dma: Support PCI P2PDMA pages in dma-iommu map_sg

2021-03-11 Thread Logan Gunthorpe
When a PCI P2PDMA page is seen, set the IOVA length of the segment
to zero so that it is not mapped into the IOVA. Then, in finalise_sg(),
apply the appropriate bus address to the segment. The IOVA is not
created if the scatterlist only consists of P2PDMA pages.

Similar to dma-direct, bus address segments are marked with
sg_mark_pci_p2pdma(). On unmap, P2PDMA segments are skipped
over when determining the start and end IOVA addresses.

With this change, the flags variable in the dma_map_ops is
set to DMA_F_PCI_P2PDMA_SUPPORTED to indicate support for
P2PDMA pages.
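
For reference, one plausible form of the marking helper (a sketch only;
the exact definition lives outside this patch, and using the third low
bit of page_link, next to SG_CHAIN and SG_END, is an assumption):

	#define SG_PCI_P2PDMA	0x4UL

	static inline void sg_mark_pci_p2pdma(struct scatterlist *sg)
	{
		sg->page_link |= SG_PCI_P2PDMA;
	}

	/* assumed check helper; name chosen for illustration */
	static inline bool sg_is_pci_p2pdma(struct scatterlist *sg)
	{
		return sg->page_link & SG_PCI_P2PDMA;
	}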

Signed-off-by: Logan Gunthorpe 
---
 drivers/iommu/dma-iommu.c | 63 ---
 1 file changed, 53 insertions(+), 10 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index af765c813cc8..c0821e9051a9 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -846,7 +847,7 @@ static void iommu_dma_unmap_page(struct device *dev, 
dma_addr_t dma_handle,
  * segment's start address to avoid concatenating across one.
  */
 static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents,
-   dma_addr_t dma_addr)
+   dma_addr_t dma_addr, unsigned long attrs)
 {
struct scatterlist *s, *cur = sg;
unsigned long seg_mask = dma_get_seg_boundary(dev);
@@ -864,6 +865,20 @@ static int __finalise_sg(struct device *dev, struct 
scatterlist *sg, int nents,
sg_dma_address(s) = DMA_MAPPING_ERROR;
sg_dma_len(s) = 0;
 
+   if (is_pci_p2pdma_page(sg_page(s)) && !s_iova_len) {
+   if (i > 0)
+   cur = sg_next(cur);
+
+   sg_dma_address(cur) = sg_phys(s) + s->offset -
+   pci_p2pdma_bus_offset(sg_page(s));
+   sg_dma_len(cur) = s->length;
+   sg_mark_pci_p2pdma(cur);
+
+   count++;
+   cur_len = 0;
+   continue;
+   }
+
/*
 * Now fill in the real DMA data. If...
 * - there is a valid output segment to append to
@@ -960,11 +975,12 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
struct scatterlist *s, *prev = NULL;
+   struct dev_pagemap *pgmap = NULL;
int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs);
dma_addr_t iova;
size_t iova_len = 0;
unsigned long mask = dma_get_seg_boundary(dev);
-   int i;
+   int i, map = -1, ret = 0;
 
if (static_branch_unlikely(_deferred_attach_enabled) &&
iommu_deferred_attach(dev, domain))
@@ -993,6 +1009,23 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
s_length = iova_align(iovad, s_length + s_iova_off);
s->length = s_length;
 
+   if (is_pci_p2pdma_page(sg_page(s))) {
+   if (sg_page(s)->pgmap != pgmap) {
+   pgmap = sg_page(s)->pgmap;
+   map = pci_p2pdma_dma_map_type(dev, pgmap);
+   }
+
+   if (map < 0) {
+   ret = -EREMOTEIO;
+   goto out_restore_sg;
+   }
+
+   if (map) {
+   s->length = 0;
+   continue;
+   }
+   }
+
/*
 * Due to the alignment of our single IOVA allocation, we can
 * depend on these assumptions about the segment boundary mask:
@@ -1015,6 +1048,9 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
prev = s;
}
 
+   if (!iova_len)
+   return __finalise_sg(dev, sg, nents, 0, attrs);
+
iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
if (!iova)
goto out_restore_sg;
@@ -1026,19 +1062,19 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
if (iommu_map_sg_atomic(domain, iova, sg, nents, prot) < iova_len)
goto out_free_iova;
 
-   return __finalise_sg(dev, sg, nents, iova);
+   return __finalise_sg(dev, sg, nents, iova, attrs);
 
 out_free_iova:
iommu_dma_free_iova(cookie, iova, iova_len, NULL);
 out_restore_sg:
__invalidate_sg(sg, nents);
-   return 0;
+   return ret;
 }
 
 static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
		int nents, enum dma_data_direction dir, unsigned long attrs)

Re: [PATCH 1/3] cdev: Finish the cdev api with queued mode support

2021-01-20 Thread Logan Gunthorpe




On 2021-01-20 12:38 p.m., Dan Williams wrote:
> ...common reference count handling scenarios were addressed, but the
> shutdown-synchronization problem was only mentioned as something driver
> developers need to be aware in the following note:
> 
> NOTE: This guarantees that associated sysfs callbacks are not running
> or runnable, however any cdevs already open will remain and their fops
> will still be callable even after this function returns.
> 
> Remove that responsibility from driver developers with the concept of a
> 'queued' mode for cdevs.

I find the queued name confusing. What's being queued?

> +static const struct file_operations cdev_queued_fops = {
> + .owner = THIS_MODULE,
> + .open = cdev_queued_open,
> + .unlocked_ioctl = cdev_queued_ioctl,
> + .compat_ioctl = compat_ptr_ioctl,
> + .llseek = noop_llseek,
> +};

Why do we only protect these fops? I'd find it a bit confusing to have
ioctl protected from use after del, but not write/read/etc.

Logan


Re: [patch 17/30] NTB/msi: Use irq_has_action()

2020-12-10 Thread Logan Gunthorpe



On 2020-12-10 12:25 p.m., Thomas Gleixner wrote:
> Use the proper core function.
> 
> Signed-off-by: Thomas Gleixner 
> Cc: Jon Mason 
> Cc: Dave Jiang 
> Cc: Allen Hubbe 
> Cc: linux-...@googlegroups.com

Looks good to me.

Reviewed-by: Logan Gunthorpe 

> ---
>  drivers/ntb/msi.c |4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> --- a/drivers/ntb/msi.c
> +++ b/drivers/ntb/msi.c
> @@ -282,15 +282,13 @@ int ntbm_msi_request_threaded_irq(struct
> struct ntb_msi_desc *msi_desc)
>  {
>   struct msi_desc *entry;
> - struct irq_desc *desc;
>   int ret;
>  
>   if (!ntb->msi)
>   return -EINVAL;
>  
>   for_each_pci_msi_entry(entry, ntb->pdev) {
> - desc = irq_to_desc(entry->irq);
> - if (desc->action)
> + if (irq_has_action(entry->irq))
>   continue;
>  
>   ret = devm_request_threaded_irq(>dev, entry->irq, handler,
> 


Re: [RFC PATCH 04/15] lib/scatterlist: Add flag for indicating P2PDMA segments in an SGL

2020-12-10 Thread Logan Gunthorpe



On 2020-12-09 9:04 p.m., Dan Williams wrote:
> On Wed, Dec 9, 2020 at 6:07 PM Logan Gunthorpe  wrote:
>>
>>
>>
>> On 2020-12-09 6:22 p.m., Dan Williams wrote:
>>> On Mon, Nov 9, 2020 at 8:47 AM Logan Gunthorpe  wrote:
>>>>
>>>>
>>>>
>>>> On 2020-11-09 2:12 a.m., Christoph Hellwig wrote:
>>>>> On Fri, Nov 06, 2020 at 10:00:25AM -0700, Logan Gunthorpe wrote:
>>>>>> We make use of the top bit of the dma_length to indicate a P2PDMA
>>>>>> segment.
>>>>>
>>>>> I don't think "we" can.  There is nothing limiting the size of a SGL
>>>>> segment.
>>>>
>>>> Yes, I expected this would be the unacceptable part. Any alternative ideas?
>>>
>>> Why is the SG_P2PDMA_FLAG needed as compared to checking the SGL
>>> segment-pages for is_pci_p2pdma_page()?
>>
>> Because the DMA and page segments in the SGL aren't necessarily aligned...
>>
>> The IOMMU implementations can coalesce multiple pages into fewer DMA
>> address ranges, so the page pointed to by sg->page_link may not be the
>> one that corresponds to the address in sg->dma_address for a given segment.
>>
>> If that makes sense -- it's not the easiest thing to explain.
> 
> It does...
> 
> Did someone already grab, or did you already consider the 3rd
> available bit in page_link? AFAICS only SG_CHAIN and SG_END are
> reserved. However, if you have a CONFIG_64BIT dependency for
> user-directed p2pdma that would seem to allow SG_P2PDMA_FLAG to be
> (0x4) in page_link.

Hmm, I half considered that, but I had come to the conclusion that, given
the misalignment, I shouldn't be using the page side of the SGL.
However, reconsidering now, that might actually be a reasonable option.

However, the CONFIG_64BIT dependency would have to be on all P2PDMA,
because we'd need to replace pci_p2pdma_map_sg() in all cases. I'm not
sure if this would be a restriction people care about.

Thanks,

Logan



Re: [RFC PATCH 04/15] lib/scatterlist: Add flag for indicating P2PDMA segments in an SGL

2020-12-09 Thread Logan Gunthorpe



On 2020-12-09 6:22 p.m., Dan Williams wrote:
> On Mon, Nov 9, 2020 at 8:47 AM Logan Gunthorpe  wrote:
>>
>>
>>
>> On 2020-11-09 2:12 a.m., Christoph Hellwig wrote:
>>> On Fri, Nov 06, 2020 at 10:00:25AM -0700, Logan Gunthorpe wrote:
>>>> We make use of the top bit of the dma_length to indicate a P2PDMA
>>>> segment.
>>>
>>> I don't think "we" can.  There is nothing limiting the size of a SGL
>>> segment.
>>
>> Yes, I expected this would be the unacceptable part. Any alternative ideas?
> 
> Why is the SG_P2PDMA_FLAG needed as compared to checking the SGL
> segment-pages for is_pci_p2pdma_page()?

Because the DMA and page segments in the SGL aren't necessarily aligned...

The IOMMU implementations can coalesce multiple pages into fewer DMA
address ranges, so the page pointed to by sg->page_link may not be the
one that corresponds to the address in sg->dma_address for a given segment.

If that makes sense -- it's not the easiest thing to explain.
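
A concrete sketch of the mismatch (illustrative only):

	/* Four pages in the SGL; the IOMMU may coalesce them into one
	 * contiguous IOVA range when mapping. */
	int count = dma_map_sg(dev, sgl, 4, DMA_TO_DEVICE);

	/* count may come back as 1: sg_dma_address()/sg_dma_len() of the
	 * first entry then span all four pages, while sg_page() of that
	 * same entry names only the first page. So checking
	 * is_pci_p2pdma_page(sg_page(sg)) cannot classify a DMA segment. */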

Logan




Re: [PATCH v3 3/6] mm: support THP migration to device private memory

2020-12-02 Thread Logan Gunthorpe



On 2020-12-02 3:14 a.m., Christoph Hellwig wrote:
>> MEMORY_DEVICE_PCI_P2PDMA:
>> Struct pages are created in pci_p2pdma_add_resource() and represent device
>> memory accessible by PCIe bar address space. Memory is allocated with
>> pci_alloc_p2pmem() based on a byte length but the gen_pool_alloc_owner()
>> call will allocate memory in a minimum of PAGE_SIZE units.
>> Reference counting is +1 per *allocation* on the pgmap->ref reference count.
>> Note that this is not +1 per page which is what put_page() expects. So
>> currently, a get_page()/put_page() works OK because the page reference count
>> only goes 1->2 and 2->1. If it went to zero, the pgmap->ref reference count
>> would be incorrect if the allocation size was greater than one page.
>>
>> I see pci_alloc_p2pmem() is called by nvme_alloc_sq_cmds() and
>> pci_p2pmem_alloc_sgl() to create a command queue and a struct scatterlist *.
>> Looks like sg_page(sg) returns the ZONE_DEVICE struct page of the 
>> scatterlist.
>> There are a huge number of places sg_page() is called so it is hard to tell
>> whether or not get_page()/put_page() is ever called on 
>> MEMORY_DEVICE_PCI_P2PDMA
>> pages.
> 
> Nothing should call get_page/put_page on them, as they are not treated
> as refcountable memory.  More importantly nothing is allowed to keep
> a reference longer than the time of the I/O.

Yes, right now this is safe, as Christoph notes there are no places
where these should be got/put.

But eventually we'll need to change how pci_alloc_p2pmem() works to take
references on the actual pages and allow freeing individual pages,
similar to what you suggest. This is one of the issues Jason pointed out
in my last RFC to try to pass these pages through GUP.
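
To spell out the accounting hazard (a sketch, assuming the semantics
described above):

	/* One pgmap->ref is taken per *allocation*, not per page. */
	void *buf = pci_alloc_p2pmem(pdev, 4 * PAGE_SIZE);
	struct page *pg = virt_to_page(buf);

	get_page(pg);	/* page refcount 1 -> 2: harmless */
	put_page(pg);	/* page refcount 2 -> 1: harmless */
	/* But if the refcount ever dropped to zero, only one pgmap->ref
	 * would be released for what was a four-page allocation. */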

Logan


Re: [PATCH][V2] PCI: Fix a potential unintentional integer overflow issue

2020-11-10 Thread Logan Gunthorpe



On 2020-11-10 3:10 p.m., Colin King wrote:
> From: Colin Ian King 
> 
> The shift of 1 by align_order is evaluated using 32 bit arithmetic
> and the result is assigned to a resource_size_t type variable that
> is a 64 bit unsigned integer on 64 bit platforms. Fix an overflow
> before widening issue by making the 1 a ULL.
> 
> Addresses-Coverity: ("Unintentional integer overflow")
> Fixes: 07d8d7e57c28 ("PCI: Make specifying PCI devices in kernel parameters 
> reusable")

I think this should probably be

Fixes: 32a9a682bef2 ("PCI: allow assignment of memory resources with a
specified alignment")

That is the commit where the original bug was introduced. 644a544fd9bcd
then extends the code a little bit and 07d8d7e57c28 only refactors it
into a reusable function. If we want this in older stable kernels then
we will probably need to make different patches for the other two vintages.

> Signed-off-by: Colin Ian King 

Besides that, the change makes sense to me.

Reviewed-by: Logan Gunthorpe 

Thanks,

Logan


Re: [RFC PATCH 03/15] PCI/P2PDMA: Introduce pci_p2pdma_should_map_bus() and pci_p2pdma_bus_offset()

2020-11-10 Thread Logan Gunthorpe



On 2020-11-10 4:25 p.m., Bjorn Helgaas wrote:
> On Fri, Nov 06, 2020 at 10:00:24AM -0700, Logan Gunthorpe wrote:
>> Introduce pci_p2pdma_should_map_bus() which is meant to be called by
>> dma map functions to determine how to map a given p2pdma page.
> 
> s/dma/DMA/ for consistency (also below in function comment)
> 
>> pci_p2pdma_bus_offset() is also added to allow callers to get the bus
>> offset if they need to map the bus address.
>>
>> Signed-off-by: Logan Gunthorpe 
>> ---
>>  drivers/pci/p2pdma.c   | 46 ++
>>  include/linux/pci-p2pdma.h | 11 +
>>  2 files changed, 57 insertions(+)
>>
>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>> index ea8472278b11..9961e779f430 100644
>> --- a/drivers/pci/p2pdma.c
>> +++ b/drivers/pci/p2pdma.c
>> @@ -930,6 +930,52 @@ void pci_p2pdma_unmap_sg_attrs(struct device *dev, 
>> struct scatterlist *sg,
>>  }
>>  EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
>>  
>> +/**
>> + * pci_p2pdma_bus_offset - returns the bus offset for a given page
>> + * @page: page to get the offset for
>> + *
>> + * Must be passed a pci p2pdma page.
> 
> s/pci/PCI/
> 
>> + */
>> +u64 pci_p2pdma_bus_offset(struct page *page)
>> +{
>> +struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page->pgmap);
>> +
>> +WARN_ON(!is_pci_p2pdma_page(page));
>> +
>> +return p2p_pgmap->bus_offset;
>> +}
>> +EXPORT_SYMBOL_GPL(pci_p2pdma_bus_offset);
>> +
>> +/**
>> + * pci_p2pdma_should_map_bus - determine if a dma mapping should use the
>> + *  bus address
>> + * @dev: device doing the DMA request
>> + * @pgmap: dev_pagemap structure for the mapping
>> + *
>> + * Returns 1 if the page should be mapped with a bus address, 0 otherwise
>> + * and -1 the device should not be mapping P2PDMA pages.
> 
> I think this is missing a word.
> 
> I'm not really sure how to interpret the "should" in
> pci_p2pdma_should_map_bus().  If this returns -1, does that mean the
> patches *cannot* be mapped?  They *could* be mapped, but you really
> *shouldn't*?  Something else?
> 
> 1 means page should be mapped with bus address.  0 means ... what,
> exactly?  It should be mapped with some different address?

1 means it must be mapped with a bus address
0 means it may be mapped normally (through the IOMMU or just with a
direct physical address)
-1 means it cannot be mapped and should fail (ie. if it must go through
the IOMMU, but the IOMMU is not in the whitelist).
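
In code form, a map_sg implementation consumes the tri-state roughly
like this (a sketch mirroring the dma-direct patch in this series):

	int map = pci_p2pdma_should_map_bus(dev, sg_page(sg)->pgmap);

	if (map < 0) {
		/* cannot be mapped at all; fail the mapping */
		sg->dma_address = DMA_MAPPING_ERROR;
	} else if (map) {
		/* must be mapped with the PCI bus address */
		sg->dma_address = sg_phys(sg) -
			pci_p2pdma_bus_offset(sg_page(sg));
	} else {
		/* may be mapped normally: through the IOMMU or with the
		 * direct physical address */
	}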

> Sorry these are naive questions because I don't know how all this
> works.

Thanks for the review. Definitely points out some questionable language
that I used. I'll reword this if/when it goes further.

Logan



Re: [RFC PATCH 15/15] nvme-pci: Allow mmaping the CMB in userspace

2020-11-09 Thread Logan Gunthorpe



On 2020-11-09 8:03 a.m., Keith Busch wrote:
> On Fri, Nov 06, 2020 at 10:00:36AM -0700, Logan Gunthorpe wrote:
>> Allow userspace to obtain CMB memory by mmaping the controller's
>> char device. The mmap call allocates and returns a hunk of CMB memory
>> (the offset is ignored) so userspace does not have control over the
>> address within the CMB.
>>
>> A VMA allocated in this way will only be usable by drivers that set
>> FOLL_PCI_P2PDMA when calling GUP. And inter-device support will be
>> checked the first time the pages are mapped for DMA.
>>
>> Currently this is only supported by O_DIRECT to a PCI NVMe device
>> or through the NVMe passthrough IOCTL.
> 
> Rather than make this be specific to nvme, could pci p2pdma create an
> mmap'able file for any resource registered with it?

It's certainly possible. However, other people have been arguing that
more of this should be specific to NVMe as some use cases do not want to
use the genalloc inside p2pdma.

Logan


Re: [RFC PATCH 04/15] lib/scatterlist: Add flag for indicating P2PDMA segments in an SGL

2020-11-09 Thread Logan Gunthorpe



On 2020-11-09 2:12 a.m., Christoph Hellwig wrote:
> On Fri, Nov 06, 2020 at 10:00:25AM -0700, Logan Gunthorpe wrote:
>> We make use of the top bit of the dma_length to indicate a P2PDMA
>> segment.
> 
> I don't think "we" can.  There is nothing limiting the size of a SGL
> segment.

Yes, I expected this would be the unacceptable part. Any alternative ideas?

Logan


Re: [RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()

2020-11-06 Thread Logan Gunthorpe



On 2020-11-06 5:14 p.m., Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 01:03:26PM -0700, Logan Gunthorpe wrote:
>> I don't think a function like that will work for the p2pmem use case. In
>> order to implement proper page freeing I expect I'll need a loop around
>> the allocator and vm_insert_mixed()... Something roughly like:
>>
>> for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
>> vaddr = pci_alloc_p2pmem(pdev, PAGE_SIZE);
>>  ret = vmf_insert_mixed(vma, addr,
>> __pfn_to_pfn_t(virt_to_pfn(vaddr), PFN_DEV | PFN_MAP));
>> }
>>
>> That way we can call pci_free_p2pmem() when a page's ref count goes to
>> zero. I suspect your use case will need to do something similar.
> 
> Yes, but I would say the pci_alloc_p2pmem() layer should be able to
> free pages on a page-by-page basis so you don't have to do stuff like
> the above.
> 
> There is often a lot of value in having physically contiguous addresses,
> so allocating page by page as well seems poor.

Agreed. But I'll have to dig to see if genalloc supports freeing blocks
in different sizes than the allocations.

Logan


Re: [RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()

2020-11-06 Thread Logan Gunthorpe




On 2020-11-06 12:53 p.m., Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 12:44:59PM -0700, Logan Gunthorpe wrote:
>>
>>
>> On 2020-11-06 12:30 p.m., Jason Gunthorpe wrote:
>>>> I certainly can't make decisions for code that isn't currently
>>>> upstream.
>>>
>>> The rdma drivers are all upstream, what are you thinking about?
>>
>> Really? I feel like you should know what I mean here...
>>
>> I mean upstream code that actually uses the APIs that I'd have to
>> introduce. I can't say here's an API feature that no code uses but the
>> already upstream rdma driver might use eventually. It's fairly easy to
>> send patches that make the necessary changes when someone adds a use of
>> those changes inside the rdma code.
> 
> Sure, but that doesn't mean you have to actively design things to be
> unusable beyond this narrow case. The RDMA drivers are all there, all
> upstream, if this work is accepted then the changes to insert P2P
> pages into their existing mmap flows is a reasonable usecase to
> consider at this point when building core code APIs.
> 
> You shouldn't add dead code, but at least have a design in mind for
> what it needs to look like and some allowance.

I don't see anything I've done that's at odds with that. You will still
need to make changes to the p2pdma code to implement your use case.

>>>> Ultimately, if you aren't using the genpool you will have to implement
>>>> your own mmap operation that somehow allocates the pages and your own
>>>> page_free hook. 
>>>
> Sure, the mlx5 driver already has a specialized allocator for its BAR
>>> pages.
>>
>> So it *might* make sense to carve out a common helper to setup a VMA for
>> P2PDMA to do the vm_flags check and set VM_MIXEDMAP... but besides that,
>> there's no code that would be common to the two cases.
> 
> I think the whole insertion of P2PDMA pages into a VMA should be
> similar to io_remap_pfn() so all the VM flags, pgprot_decrypted and
> other subtle details are all in one place. (also it needs a
> pgprot_decrypted before doing vmf_insert, I just learned that this
> month)

I don't think a function like that will work for the p2pmem use case. In
order to implement proper page freeing I expect I'll need a loop around
the allocator and vm_insert_mixed()... Something roughly like:

for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
vaddr = pci_alloc_p2pmem(pdev, PAGE_SIZE);
ret = vmf_insert_mixed(vma, addr,
   __pfn_to_pfn_t(virt_to_pfn(vaddr), PFN_DEV | PFN_MAP));
}

That way we can call pci_free_p2pmem() when a page's ref count goes to
zero. I suspect your use case will need to do something similar.

Logan


Re: [RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()

2020-11-06 Thread Logan Gunthorpe



On 2020-11-06 12:30 p.m., Jason Gunthorpe wrote:
>> I certainly can't make decisions for code that isn't currently
>> upstream.
> 
> The rdma drivers are all upstream, what are you thinking about?

Really? I feel like you should know what I mean here...

I mean upstream code that actually uses the APIs that I'd have to
introduce. I can't say here's an API feature that no code uses but the
already upstream rdma driver might use eventually. It's fairly easy to
send patches that make the necessary changes when someone adds a use of
those changes inside the rdma code.

>> Ultimately, if you aren't using the genpool you will have to implement
>> your own mmap operation that somehow allocates the pages and your own
>> page_free hook. 
> 
> Sure, the mlx5 driver already has a specialized allocator for its BAR
> pages.

So it *might* make sense to carve out a common helper to setup a VMA for
P2PDMA to do the vm_flags check and set VM_MIXEDMAP... but besides that,
there's no code that would be common to the two cases.

>> I also don't expect this to be going upstream in the near term so don't
>> get too excited about using it.
> 
> I don't know, it is actually not that horrible, the GUP and IOMMU
> related changes are simpler than I expected

I think the deal breaker is the SGL hack and the fact that there are
important IOMMU implementations that won't have support.

Logan



Re: [RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()

2020-11-06 Thread Logan Gunthorpe



On 2020-11-06 11:09 a.m., Jason Gunthorpe wrote:
>> Ah, hmm, yes. I guess the pages have to be hooked and returned to the
>> genalloc through free_devmap_managed_page(). 
> 
> That sounds about right, but in this case it doesn't need the VMA
> operations.
> 
>> Seems like it might be doable... but it will complicate things for
>> users that don't want to use the genpool (though no such users exist
>> upstream).
> 
> I would like to use this stuff in RDMA pretty much immediately and the
> genpool is harmful for those cases, so please don't make decisions
> that are tying thing to genpool

I certainly can't make decisions for code that isn't currently upstream.
So you will almost certainly have to make changes for the code you want
to add, as is the standard procedure. I can't and should not export APIs
that you might need that have no upstream users, but you are certainly
free to send patches that create them when you add the use case.

Ultimately, if you aren't using the genpool you will have to implement
your own mmap operation that somehow allocates the pages and your own
page_free hook. The latter can be accommodated for by a patch that
splits off pci_p2pdma_add_resource() into a function that doesn't use
the genpool (I've already seen two independent developers create a
similar patch for this, but with no upstream user they couldn't be
taken upstream).

I also don't expect this to be going upstream in the near term so don't
get too excited about using it.

Logan


Re: [RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()

2020-11-06 Thread Logan Gunthorpe



On 2020-11-06 10:42 a.m., Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 10:28:00AM -0700, Logan Gunthorpe wrote:
>>
>>
>> On 2020-11-06 10:22 a.m., Jason Gunthorpe wrote:
>>> On Fri, Nov 06, 2020 at 10:00:35AM -0700, Logan Gunthorpe wrote:
>>>> Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap
>>>> a hunk of p2pmem into userspace.
>>>>
>>>> Signed-off-by: Logan Gunthorpe 
>>>>  drivers/pci/p2pdma.c   | 104 +
>>>>  include/linux/pci-p2pdma.h |   6 +++
>>>>  2 files changed, 110 insertions(+)
>>>>
>>>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>>>> index 9961e779f430..8eab53ac59ae 100644
>>>> +++ b/drivers/pci/p2pdma.c
>>>> @@ -16,6 +16,7 @@
>>>>  #include 
>>>>  #include 
>>>>  #include 
>>>> +#include 
>>>>  #include 
>>>>  #include 
>>>>  #include 
>>>> @@ -1055,3 +1056,106 @@ ssize_t pci_p2pdma_enable_show(char *page, struct 
>>>> pci_dev *p2p_dev,
>>>>return sprintf(page, "%s\n", pci_name(p2p_dev));
>>>>  }
>>>>  EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
>>>> +
>>>> +struct pci_p2pdma_map {
>>>> +  struct kref ref;
>>>> +  struct pci_dev *pdev;
>>>> +  void *kaddr;
>>>> +  size_t len;
>>>> +};
>>>
>>> Why have this at all? Nothing uses it and no vm_operations ops are
>>> implemented?
>>
>> It's necessary to free the allocated p2pmem when the mapping is torn down.
> 
> That's suspicious.. Once in a VMA the lifetime of the page must be
> controlled by the page refcount, it can't be put back into the genpool
> just because the vma was destroed.

Ah, hmm, yes. I guess the pages have to be hooked and returned to the
genalloc through free_devmap_managed_page(). Seems like it might be
doable... but it will complicate things for users that don't want to use
the genpool (though no such users exist upstream).

Logan



Re: [RFC PATCH 15/15] nvme-pci: Allow mmaping the CMB in userspace

2020-11-06 Thread Logan Gunthorpe




On 2020-11-06 10:39 a.m., Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 10:00:36AM -0700, Logan Gunthorpe wrote:
>> Allow userspace to obtain CMB memory by mmaping the controller's
>> char device. The mmap call allocates and returns a hunk of CMB memory
>> (the offset is ignored) so userspace does not have control over the
>> address within the CMB.
>>
>> A VMA allocated in this way will only be usable by drivers that set
>> FOLL_PCI_P2PDMA when calling GUP. And inter-device support will be
>> checked the first time the pages are mapped for DMA.
>>
>> Currently this is only supported by O_DIRECT to a PCI NVMe device
>> or through the NVMe passthrough IOCTL.
>>
>> Signed-off-by: Logan Gunthorpe 
>>  drivers/nvme/host/core.c | 11 +++
>>  drivers/nvme/host/nvme.h |  1 +
>>  drivers/nvme/host/pci.c  |  9 +
>>  3 files changed, 21 insertions(+)
>>
>> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
>> index f14316c9b34a..fc642aba671d 100644
>> +++ b/drivers/nvme/host/core.c
>> @@ -3240,12 +3240,23 @@ static long nvme_dev_ioctl(struct file *file, 
>> unsigned int cmd,
>>  }
>>  }
>>  
>> +static int nvme_dev_mmap(struct file *file, struct vm_area_struct *vma)
>> +{
>> +struct nvme_ctrl *ctrl = file->private_data;
>> +
>> +if (!ctrl->ops->mmap_cmb)
>> +return -ENODEV;
>> +
>> +return ctrl->ops->mmap_cmb(ctrl, vma);
>> +}
> 
> This needs to ensure that the VMA created is destroyed before the
> driver is unprobed - ie the struct pages backing the BAR memory is
> destroyed.
> 
> I don't see anything that synchronizes this in the nvme_dev_release()?

Yup, looks like something that needs to be fixed. Though I'd probably do
it in the pci_p2pdma helper code instead.

Logan


Re: [RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()

2020-11-06 Thread Logan Gunthorpe



On 2020-11-06 10:22 a.m., Jason Gunthorpe wrote:
> On Fri, Nov 06, 2020 at 10:00:35AM -0700, Logan Gunthorpe wrote:
>> Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap
>> a hunk of p2pmem into userspace.
>>
>> Signed-off-by: Logan Gunthorpe 
>>  drivers/pci/p2pdma.c   | 104 +
>>  include/linux/pci-p2pdma.h |   6 +++
>>  2 files changed, 110 insertions(+)
>>
>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>> index 9961e779f430..8eab53ac59ae 100644
>> +++ b/drivers/pci/p2pdma.c
>> @@ -16,6 +16,7 @@
>>  #include 
>>  #include 
>>  #include 
>> +#include 
>>  #include 
>>  #include 
>>  #include 
>> @@ -1055,3 +1056,106 @@ ssize_t pci_p2pdma_enable_show(char *page, struct 
>> pci_dev *p2p_dev,
>>  return sprintf(page, "%s\n", pci_name(p2p_dev));
>>  }
>>  EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
>> +
>> +struct pci_p2pdma_map {
>> +struct kref ref;
>> +struct pci_dev *pdev;
>> +void *kaddr;
>> +size_t len;
>> +};
> 
> Why have this at all? Nothing uses it and no vm_operations ops are
> implemented?

It's necessary to free the allocated p2pmem when the mapping is torn down.

> This is very inflexable, it would be better if this is designed like
> io_remap_pfn where it just preps and fills the VMA, doesn't take
> ownership of the entire VMA

If someone wants to manage their own P2P memory and create their own
userspace mapping with vmf_insert_mixed they are free to do so. But this
helper is specifically for users of pci_p2pdma_map_alloc(). I know you
don't intend to use that but it doesn't make it any less valid.

Logan



[RFC PATCH 05/15] dma-direct: Support PCI P2PDMA pages in dma-direct map_sg

2020-11-06 Thread Logan Gunthorpe
Add PCI P2PDMA support for dma_direct_map_sg() so that it can map
PCI P2PDMA pages directly without a hack in the callers. This allows
for heterogeneous SGLs that contain both P2PDMA and regular pages.

The DMA_ATTR_P2PDMA flag is added to allow callers to indicate support
for P2PDMA pages. In order to support P2PDMA pages, callers must ensure
no segment is greater than 2GB so that the high bit of the dma length
can be used as a flag to indicate a P2PDMA segment. Such code must then
use sg_dma_p2pdma_len() instead of sg_dma_len() to filter out the flag.
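
For reference, the convention this implies (a sketch; taking the top bit
of the 32-bit dma_length as the flag follows the description above):

	#define SG_P2PDMA_FLAG		(1U << 31)

	#define sg_dma_is_p2pdma(sg)	(sg_dma_len(sg) & SG_P2PDMA_FLAG)
	#define sg_dma_p2pdma_len(sg)	(sg_dma_len(sg) & ~SG_P2PDMA_FLAG)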

Signed-off-by: Logan Gunthorpe 
---
 include/linux/dma-mapping.h | 11 +++
 kernel/dma/direct.c | 33 +++--
 2 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 956151052d45..8d028e15b531 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -61,6 +61,17 @@
  */
 #define DMA_ATTR_PRIVILEGED(1UL << 9)
 
+/*
+ * DMA_ATTR_P2PDMA: specifies that dma_map_sg() may return p2pdma
+ * bus addresses. Code that specifies this must ensure to
+ * use sg_dma_p2pdma_len() instead of sg_dma_len() as the high
+ * bit of the length will indicate a P2PDMA bus address.
+ *
+ * If this attribute is not set and P2PDMA pages are encountered,
+ * dma_map_sg() will return an error.
+ */
+#define DMA_ATTR_P2PDMA(1UL << 10)
+
 /*
  * A dma_addr_t can hold any valid DMA or bus address for the platform.  It can
  * be given to a device to use as a DMA source or target.  It is specific to a
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 06c111544f61..2fcb31789436 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -13,6 +13,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "direct.h"
 
 /*
@@ -387,19 +388,47 @@ void dma_direct_unmap_sg(struct device *dev, struct 
scatterlist *sgl,
struct scatterlist *sg;
int i;
 
-   for_each_sg(sgl, sg, nents, i)
+   for_each_sg(sgl, sg, nents, i) {
+   if (attrs & DMA_ATTR_P2PDMA && sg_dma_is_p2pdma(sg)) {
+   sg_dma_len(sg) &= ~SG_P2PDMA_FLAG;
+   continue;
+   }
+
dma_direct_unmap_page(dev, sg->dma_address, sg_dma_len(sg), dir,
 attrs);
+   }
 }
 #endif
 
 int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
enum dma_data_direction dir, unsigned long attrs)
 {
-   int i;
+   struct dev_pagemap *pgmap = NULL;
+   int i, map = -1;
struct scatterlist *sg;
+   u64 bus_off;
 
for_each_sg(sgl, sg, nents, i) {
+   if (is_pci_p2pdma_page(sg_page(sg))) {
+   if (sg_page(sg)->pgmap != pgmap) {
+   pgmap = sg_page(sg)->pgmap;
+   map = pci_p2pdma_should_map_bus(dev, pgmap);
+   bus_off = pci_p2pdma_bus_offset(sg_page(sg));
+   }
+
+   if (map < 0 || !(attrs & DMA_ATTR_P2PDMA)) {
+   sg->dma_address = DMA_MAPPING_ERROR;
+   goto out_unmap;
+   }
+
+   if (map) {
+   sg->dma_address = sg_phys(sg) + sg->offset -
+   bus_off;
+   sg_dma_len(sg) = sg->length | SG_P2PDMA_FLAG;
+   continue;
+   }
+   }
+
sg->dma_address = dma_direct_map_page(dev, sg_page(sg),
sg->offset, sg->length, dir, attrs);
if (sg->dma_address == DMA_MAPPING_ERROR)
-- 
2.20.1



[RFC PATCH 06/15] dma-mapping: Add flags to dma_map_ops to indicate PCI P2PDMA support

2020-11-06 Thread Logan Gunthorpe
Add a flags member to the dma_map_ops structure with one flag to
indicate support for PCI P2PDMA.

Also, add a helper to check if a device supports PCI P2PDMA.

Signed-off-by: Logan Gunthorpe 
---
 include/linux/dma-map-ops.h | 3 +++
 include/linux/dma-mapping.h | 5 +
 kernel/dma/mapping.c| 8 
 3 files changed, 16 insertions(+)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index a5f89fc4d6df..8c12dfa43ce1 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -12,6 +12,9 @@
 struct cma;
 
 struct dma_map_ops {
+   unsigned int flags;
+#define DMA_F_PCI_P2PDMA_SUPPORTED (1 << 0)
+
void *(*alloc)(struct device *dev, size_t size,
dma_addr_t *dma_handle, gfp_t gfp,
unsigned long attrs);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 8d028e15b531..3646c6ba6a00 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -149,6 +149,7 @@ int dma_mmap_attrs(struct device *dev, struct 
vm_area_struct *vma,
unsigned long attrs);
 bool dma_can_mmap(struct device *dev);
 int dma_supported(struct device *dev, u64 mask);
+int dma_pci_p2pdma_supported(struct device *dev);
 int dma_set_mask(struct device *dev, u64 mask);
 int dma_set_coherent_mask(struct device *dev, u64 mask);
 u64 dma_get_required_mask(struct device *dev);
@@ -244,6 +245,10 @@ static inline int dma_supported(struct device *dev, u64 
mask)
 {
return 0;
 }
+static inline int dma_pci_p2pdma_supported(struct device *dev)
+{
+   return 0;
+}
 static inline int dma_set_mask(struct device *dev, u64 mask)
 {
return -EIO;
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 51bb8fa8eb89..192418ef43f8 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -567,6 +567,14 @@ int dma_supported(struct device *dev, u64 mask)
 }
 EXPORT_SYMBOL(dma_supported);
 
+int dma_pci_p2pdma_supported(struct device *dev)
+{
+   const struct dma_map_ops *ops = get_dma_ops(dev);
+
+   return !ops || ops->flags & DMA_F_PCI_P2PDMA_SUPPORTED;
+}
+EXPORT_SYMBOL(dma_pci_p2pdma_supported);
+
 #ifdef CONFIG_ARCH_HAS_DMA_SET_MASK
 void arch_dma_set_mask(struct device *dev, u64 mask);
 #else
-- 
2.20.1



[RFC PATCH 02/15] PCI/P2PDMA: Attempt to set map_type if it has not been set

2020-11-06 Thread Logan Gunthorpe
Attempt to find the mapping type for P2PDMA pages on the first
DMA map attempt if it has not been done ahead of time.

Previously, the mapping type was expected to be calculated ahead of
time, but if pages are to come from userspace then there's no
way to ensure the path was checked ahead of time.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 94583779c36e..ea8472278b11 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -824,11 +824,17 @@ EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
 static enum pci_p2pdma_map_type pci_p2pdma_map_type(struct pci_dev *provider,
struct pci_dev *client)
 {
+   enum pci_p2pdma_map_type ret;
+
if (!provider->p2pdma)
return PCI_P2PDMA_MAP_NOT_SUPPORTED;
 
-	return xa_to_value(xa_load(&provider->p2pdma->map_types,
-  map_types_idx(client)));
+	ret = xa_to_value(xa_load(&provider->p2pdma->map_types,
+ map_types_idx(client)));
+   if (ret != PCI_P2PDMA_MAP_UNKNOWN)
+   return ret;
+
+   return upstream_bridge_distance_warn(provider, client, NULL);
 }
 
 static int __pci_p2pdma_map_sg(struct pci_p2pdma_pagemap *p2p_pgmap,
@@ -890,7 +896,6 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg,
case PCI_P2PDMA_MAP_BUS_ADDR:
return __pci_p2pdma_map_sg(p2p_pgmap, dev, sg, nents);
default:
-   WARN_ON_ONCE(1);
return 0;
}
 }
-- 
2.20.1



[RFC PATCH 13/15] block: Set FOLL_PCI_P2PDMA in bio_map_user_iov()

2020-11-06 Thread Logan Gunthorpe
When a bio's queue supports PCI P2PDMA, set FOLL_PCI_P2PDMA for
iov_iter_get_pages_alloc_flags(). This allows PCI P2PDMA pages to be
passed from userspace and enables NVMe passthru requests to
use P2PDMA pages.

Signed-off-by: Logan Gunthorpe 
---
 block/blk-map.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/blk-map.c b/block/blk-map.c
index 21630dccac62..cad8fcfe4f17 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -245,6 +245,7 @@ static int bio_map_user_iov(struct request *rq, struct 
iov_iter *iter,
 {
unsigned int max_sectors = queue_max_hw_sectors(rq->q);
struct bio *bio, *bounce_bio;
+   unsigned int flags = 0;
int ret;
int j;
 
@@ -256,13 +257,17 @@ static int bio_map_user_iov(struct request *rq, struct 
iov_iter *iter,
return -ENOMEM;
bio->bi_opf |= req_op(rq);
 
+   if (blk_queue_pci_p2pdma(rq->q))
+   flags |= FOLL_PCI_P2PDMA;
+
while (iov_iter_count(iter)) {
struct page **pages;
ssize_t bytes;
size_t offs, added = 0;
int npages;
 
-	bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
+	bytes = iov_iter_get_pages_alloc_flags(iter, &pages, LONG_MAX,
+					       &offs, flags);
if (unlikely(bytes <= 0)) {
ret = bytes ? bytes : -EFAULT;
goto out_unmap;
-- 
2.20.1



[RFC PATCH 12/15] block: Set FOLL_PCI_P2PDMA in __bio_iov_iter_get_pages()

2020-11-06 Thread Logan Gunthorpe
When a bio's queue supports PCI P2PDMA, set FOLL_PCI_P2PDMA for
iov_iter_get_pages_flags(). This allows PCI P2PDMA pages to be passed
from userspace and enables the O_DIRECT path in iomap-based filesystems
and direct I/O to block devices.

Signed-off-by: Logan Gunthorpe 
---
 block/bio.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/bio.c b/block/bio.c
index fa01bef35bb1..6b28256a5575 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -997,6 +997,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct 
iov_iter *iter)
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
bool same_page = false;
+   unsigned int flags = 0;
ssize_t size, left;
unsigned len, i;
size_t offset;
@@ -1009,7 +1010,11 @@ static int __bio_iov_iter_get_pages(struct bio *bio, 
struct iov_iter *iter)
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
 
-	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+   if (bio->bi_disk && blk_queue_pci_p2pdma(bio->bi_disk->queue))
+   flags |= FOLL_PCI_P2PDMA;
+
+   size = iov_iter_get_pages_flags(iter, pages, LONG_MAX, nr_pages,
+					&offset, flags);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
 
-- 
2.20.1



[RFC PATCH 09/15] nvme-pci: Convert to using dma_map_sg for p2pdma pages

2020-11-06 Thread Logan Gunthorpe
Switch to using sg_dma_p2pdma_len() in places where sg_dma_len() is
used. Then replace the calls to pci_p2pdma_[un]map_sg() with calls
to dma_[un]map_sg() with DMA_ATTR_P2PDMA.

This should be equivalent, though the coverage is somewhat narrower
(only dma-direct and dma-iommu currently support it).

Using DMA_ATTR_P2PDMA is safe because the block layer restricts
requests to be much smaller than 2GB, so no segment can ever be
greater than 2GB.

Signed-off-by: Logan Gunthorpe 
---
 drivers/nvme/host/pci.c | 30 --
 1 file changed, 12 insertions(+), 18 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ef7ce464a48d..26976bdf4af0 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -528,12 +528,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct 
request *req)
 
WARN_ON_ONCE(!iod->nents);
 
-   if (is_pci_p2pdma_page(sg_page(iod->sg)))
-   pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
-   rq_dma_dir(req));
-   else
-   dma_unmap_sg(dev->dev, iod->sg, iod->nents, rq_dma_dir(req));
-
+   dma_unmap_sg_attrs(dev->dev, iod->sg, iod->nents, rq_dma_dir(req),
+  DMA_ATTR_P2PDMA);
 
if (iod->npages == 0)
dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0],
@@ -570,7 +566,7 @@ static void nvme_print_sgl(struct scatterlist *sgl, int 
nents)
pr_warn("sg[%d] phys_addr:%pad offset:%d length:%d "
"dma_address:%pad dma_length:%d\n",
			i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
-   sg_dma_len(sg));
+   sg_dma_p2pdma_len(sg));
}
 }
 
@@ -581,7 +577,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev 
*dev,
struct dma_pool *pool;
int length = blk_rq_payload_bytes(req);
struct scatterlist *sg = iod->sg;
-   int dma_len = sg_dma_len(sg);
+   int dma_len = sg_dma_p2pdma_len(sg);
u64 dma_addr = sg_dma_address(sg);
int offset = dma_addr & (NVME_CTRL_PAGE_SIZE - 1);
__le64 *prp_list;
@@ -601,7 +597,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev 
*dev,
} else {
sg = sg_next(sg);
dma_addr = sg_dma_address(sg);
-   dma_len = sg_dma_len(sg);
+   dma_len = sg_dma_p2pdma_len(sg);
}
 
if (length <= NVME_CTRL_PAGE_SIZE) {
@@ -650,7 +646,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev 
*dev,
goto bad_sgl;
sg = sg_next(sg);
dma_addr = sg_dma_address(sg);
-   dma_len = sg_dma_len(sg);
+   dma_len = sg_dma_p2pdma_len(sg);
}
 
 done:
@@ -670,7 +666,7 @@ static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
struct scatterlist *sg)
 {
sge->addr = cpu_to_le64(sg_dma_address(sg));
-   sge->length = cpu_to_le32(sg_dma_len(sg));
+   sge->length = cpu_to_le32(sg_dma_p2pdma_len(sg));
sge->type = NVME_SGL_FMT_DATA_DESC << 4;
 }
 
@@ -814,14 +810,12 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, 
struct request *req,
if (!iod->nents)
goto out;
 
-   if (is_pci_p2pdma_page(sg_page(iod->sg)))
-   nr_mapped = pci_p2pdma_map_sg_attrs(dev->dev, iod->sg,
-   iod->nents, rq_dma_dir(req), DMA_ATTR_NO_WARN);
-   else
-   nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
-rq_dma_dir(req), DMA_ATTR_NO_WARN);
-   if (!nr_mapped)
+   nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
+   rq_dma_dir(req), DMA_ATTR_NO_WARN | DMA_ATTR_P2PDMA);
+   if (!nr_mapped) {
+   ret = BLK_STS_IOERR;
goto out;
+   }
 
iod->use_sgl = nvme_pci_use_sgls(dev, req);
if (iod->use_sgl)
-- 
2.20.1



[RFC PATCH 11/15] iov_iter: Introduce iov_iter_get_pages_[alloc_]flags()

2020-11-06 Thread Logan Gunthorpe
Add iov_iter_get_pages_flags() and iov_iter_get_pages_alloc_flags()
which take a flags argument that is passed to get_user_pages_fast().

This is so that FOLL_PCI_P2PDMA can be passed when appropriate.
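
A caller that may see P2PDMA pages then simply forwards the flag (usage
sketch; the per-queue condition is an assumption, but the block layer
patches in this series do exactly this):

	unsigned int gup_flags = 0;

	if (queue_supports_p2pdma)	/* assumed caller-side condition */
		gup_flags |= FOLL_PCI_P2PDMA;

	size = iov_iter_get_pages_flags(iter, pages, LONG_MAX, nr_pages,
					&offset, gup_flags);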

Signed-off-by: Logan Gunthorpe 
---
 include/linux/uio.h | 21 +
 lib/iov_iter.c  | 25 ++---
 2 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 72d88566694e..694caa868c05 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -221,10 +221,23 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int 
direction, const struct bio_
 void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct 
pipe_inode_info *pipe,
size_t count);
 void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t 
count);
-ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
-   size_t maxsize, unsigned maxpages, size_t *start);
-ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages,
-   size_t maxsize, size_t *start);
+ssize_t iov_iter_get_pages_flags(struct iov_iter *i, struct page **pages,
+   size_t maxsize, unsigned maxpages, size_t *start,
+   unsigned int gup_flags);
+ssize_t iov_iter_get_pages_alloc_flags(struct iov_iter *i,
+   struct page ***pages, size_t maxsize, size_t *start,
+   unsigned int gup_flags);
+static inline ssize_t iov_iter_get_pages(struct iov_iter *i,
+   struct page **pages, size_t maxsize, unsigned maxpages,
+   size_t *start)
+{
+   return iov_iter_get_pages_flags(i, pages, maxsize, maxpages, start, 0);
+}
+static inline ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
+   struct page ***pages, size_t maxsize, size_t *start)
+{
+   return iov_iter_get_pages_alloc_flags(i, pages, maxsize, start, 0);
+}
 int iov_iter_npages(const struct iov_iter *i, int maxpages);
 
 const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 1635111c5bd2..700effe0bb86 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1313,9 +1313,9 @@ static ssize_t pipe_get_pages(struct iov_iter *i,
return __pipe_get_pages(i, min(maxsize, capacity), pages, iter_head, 
start);
 }
 
-ssize_t iov_iter_get_pages(struct iov_iter *i,
+ssize_t iov_iter_get_pages_flags(struct iov_iter *i,
   struct page **pages, size_t maxsize, unsigned maxpages,
-  size_t *start)
+  size_t *start, unsigned int gup_flags)
 {
if (maxsize > i->count)
maxsize = i->count;
@@ -1325,6 +1325,9 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
if (unlikely(iov_iter_is_discard(i)))
return -EFAULT;
 
+   if (iov_iter_rw(i) != WRITE)
+   gup_flags |= FOLL_WRITE;
+
iterate_all_kinds(i, maxsize, v, ({
unsigned long addr = (unsigned long)v.iov_base;
size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
@@ -1335,9 +1338,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
len = maxpages * PAGE_SIZE;
addr &= ~(PAGE_SIZE - 1);
n = DIV_ROUND_UP(len, PAGE_SIZE);
-   res = get_user_pages_fast(addr, n,
-   iov_iter_rw(i) != WRITE ?  FOLL_WRITE : 0,
-   pages);
+   res = get_user_pages_fast(addr, n, gup_flags, pages);
if (unlikely(res < 0))
return res;
return (res == n ? len : res * PAGE_SIZE) - *start;
@@ -1352,7 +1353,7 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
)
return 0;
 }
-EXPORT_SYMBOL(iov_iter_get_pages);
+EXPORT_SYMBOL(iov_iter_get_pages_flags);
 
 static struct page **get_pages_array(size_t n)
 {
@@ -1392,9 +1393,9 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i,
return n;
 }
 
-ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
+ssize_t iov_iter_get_pages_alloc_flags(struct iov_iter *i,
   struct page ***pages, size_t maxsize,
-  size_t *start)
+  size_t *start, unsigned int gup_flags)
 {
struct page **p;
 
@@ -1406,6 +1407,9 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
if (unlikely(iov_iter_is_discard(i)))
return -EFAULT;
 
+   if (iov_iter_rw(i) != WRITE)
+   gup_flags |= FOLL_WRITE;
+
iterate_all_kinds(i, maxsize, v, ({
unsigned long addr = (unsigned long)v.iov_base;
size_t len = v.iov_len + (*start = addr & (PAGE_SIZE - 1));
@@ -1417,8 +1421,7 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
p = get_pages_array(n);
if (!p)
			return -ENOMEM;

[RFC PATCH 14/15] PCI/P2PDMA: Introduce pci_mmap_p2pmem()

2020-11-06 Thread Logan Gunthorpe
Introduce pci_mmap_p2pmem() which is a helper to allocate and mmap
a hunk of p2pmem into userspace.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c   | 104 +
 include/linux/pci-p2pdma.h |   6 +++
 2 files changed, 110 insertions(+)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 9961e779f430..8eab53ac59ae 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -1055,3 +1056,106 @@ ssize_t pci_p2pdma_enable_show(char *page, struct 
pci_dev *p2p_dev,
return sprintf(page, "%s\n", pci_name(p2p_dev));
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_enable_show);
+
+struct pci_p2pdma_map {
+   struct kref ref;
+   struct pci_dev *pdev;
+   void *kaddr;
+   size_t len;
+};
+
+static struct pci_p2pdma_map *pci_p2pdma_map_alloc(struct pci_dev *pdev,
+  size_t len)
+{
+   struct pci_p2pdma_map *pmap;
+
+   pmap = kzalloc(sizeof(*pmap), GFP_KERNEL);
+   if (!pmap)
+   return NULL;
+
+	kref_init(&pmap->ref);
+   pmap->pdev = pdev;
+   pmap->len = len;
+
+   pmap->kaddr = pci_alloc_p2pmem(pdev, len);
+   if (!pmap->kaddr) {
+   kfree(pmap);
+   return NULL;
+   }
+
+   return pmap;
+}
+
+static void pci_p2pdma_map_free(struct kref *ref)
+{
+   struct pci_p2pdma_map *pmap =
+   container_of(ref, struct pci_p2pdma_map, ref);
+
+   pci_free_p2pmem(pmap->pdev, pmap->kaddr, pmap->len);
+   kfree(pmap);
+}
+
+static void pci_p2pdma_vma_open(struct vm_area_struct *vma)
+{
+   struct pci_p2pdma_map *pmap = vma->vm_private_data;
+
+	kref_get(&pmap->ref);
+}
+
+static void pci_p2pdma_vma_close(struct vm_area_struct *vma)
+{
+   struct pci_p2pdma_map *pmap = vma->vm_private_data;
+
+	kref_put(&pmap->ref, pci_p2pdma_map_free);
+}
+
+const struct vm_operations_struct pci_p2pdma_vmops = {
+   .open = pci_p2pdma_vma_open,
+   .close = pci_p2pdma_vma_close,
+};
+
+/**
+ * pci_mmap_p2pmem - allocate peer-to-peer DMA memory
+ * @pdev: the device to allocate memory from
+ * @vma: the userspace vma to map the memory to
+ *
+ * Returns 0 on success, or a negative error code on failure
+ */
+int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma)
+{
+   struct pci_p2pdma_map *pmap;
+   unsigned long addr, pfn;
+   vm_fault_t ret;
+
+   /* prevent private mappings from being established */
+   if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+   pci_info_ratelimited(pdev,
+"%s: fail, attempted private mapping\n",
+current->comm);
+   return -EINVAL;
+   }
+
+   pmap = pci_p2pdma_map_alloc(pdev, vma->vm_end - vma->vm_start);
+   if (!pmap)
+   return -ENOMEM;
+
+   vma->vm_flags |= VM_MIXEDMAP;
+   vma->vm_private_data = pmap;
+	vma->vm_ops = &pci_p2pdma_vmops;
+
+   pfn = virt_to_phys(pmap->kaddr) >> PAGE_SHIFT;
+   for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
+   ret = vmf_insert_mixed(vma, addr,
+  __pfn_to_pfn_t(pfn, PFN_DEV | PFN_MAP));
+   if (ret & VM_FAULT_ERROR)
+   goto out_error;
+   pfn++;
+   }
+
+   return 0;
+
+out_error:
+	kref_put(&pmap->ref, pci_p2pdma_map_free);
+   return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(pci_mmap_p2pmem);
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index fc5de47eeac4..26fe40363d1c 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -40,6 +40,7 @@ int pci_p2pdma_enable_store(const char *page, struct pci_dev 
**p2p_dev,
bool *use_p2pdma);
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
   bool use_p2pdma);
+int pci_mmap_p2pmem(struct pci_dev *pdev, struct vm_area_struct *vma);
 #else /* CONFIG_PCI_P2PDMA */
 static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
size_t size, u64 offset)
@@ -116,6 +117,11 @@ static inline ssize_t pci_p2pdma_enable_show(char *page,
 {
return sprintf(page, "none\n");
 }
+static inline int pci_mmap_p2pmem(struct pci_dev *pdev,
+ struct vm_area_struct *vma)
+{
+   return -EOPNOTSUPP;
+}
 #endif /* CONFIG_PCI_P2PDMA */
 
 
-- 
2.20.1



[RFC PATCH 08/15] nvme-pci: Check DMA ops when indicating support for PCI P2PDMA

2020-11-06 Thread Logan Gunthorpe
Introduce a supports_pci_p2pdma() operation in nvme_ctrl_ops to
replace the fixed NVME_F_PCI_P2PDMA flag such that the dma_map_ops
flags can be checked for PCI P2PDMA support.

Signed-off-by: Logan Gunthorpe 
---
 drivers/nvme/host/core.c |  3 ++-
 drivers/nvme/host/nvme.h |  2 +-
 drivers/nvme/host/pci.c  | 11 +--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 376096bfc54a..f14316c9b34a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3824,7 +3824,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, 
unsigned nsid,
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, ns->queue);
 
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
-   if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
+   if (ctrl->ops->supports_pci_p2pdma &&
+   ctrl->ops->supports_pci_p2pdma(ctrl))
blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
 
ns->queue->queuedata = ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index cc36a981..a0bfe8709cfc 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -465,7 +465,6 @@ struct nvme_ctrl_ops {
unsigned int flags;
 #define NVME_F_FABRICS (1 << 0)
 #define NVME_F_METADATA_SUPPORTED  (1 << 1)
-#define NVME_F_PCI_P2PDMA  (1 << 2)
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
@@ -473,6 +472,7 @@ struct nvme_ctrl_ops {
void (*submit_async_event)(struct nvme_ctrl *ctrl);
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
+   bool (*supports_pci_p2pdma)(struct nvme_ctrl *ctrl);
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index df8f3612107f..ef7ce464a48d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2686,17 +2686,24 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, 
char *buf, int size)
	return snprintf(buf, size, "%s\n", dev_name(&pdev->dev));
 }
 
+static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl)
+{
+   struct nvme_dev *dev = to_nvme_dev(ctrl);
+
+   return dma_pci_p2pdma_supported(dev->dev);
+}
+
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name   = "pcie",
.module = THIS_MODULE,
-   .flags  = NVME_F_METADATA_SUPPORTED |
- NVME_F_PCI_P2PDMA,
+   .flags  = NVME_F_METADATA_SUPPORTED,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32= nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
.free_ctrl  = nvme_pci_free_ctrl,
.submit_async_event = nvme_pci_submit_async_event,
.get_address= nvme_pci_get_address,
+   .supports_pci_p2pdma= nvme_pci_supports_pci_p2pdma,
 };
 
 static int nvme_dev_map(struct nvme_dev *dev)
-- 
2.20.1



[RFC PATCH 07/15] iommu/dma: Support PCI P2PDMA pages in dma-iommu map_sg

2020-11-06 Thread Logan Gunthorpe
When a PCI P2PDMA page is seen, set the IOVA length of the segment
to zero so that it is not mapped into the IOVA. Then, in finalise_sg(),
apply the appropriate bus address to the segment. The IOVA is not
created if the scatterlist only consists of P2PDMA pages.

Similar to dma-direct, DMA_ATTR_P2PDMA is used to indicate caller
support, seeing as the high bit of the dma_length is used as a flag
to indicate P2PDMA segments.

On unmap, P2PDMA segments are skipped over when determining the
start and end IOVA addresses.

With this change, the flags variable in the dma_map_ops is
set to DMA_F_PCI_P2PDMA_SUPPORTED to indicate support for
P2PDMA pages.
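
As a rough sketch of the unmap-side scan described above (illustrative
only, using the helper names from the scatterlist patch in this series;
the function name is invented):

  /* Compute the IOVA range of a mapped SGL while skipping P2PDMA
   * segments, which hold bus addresses and were never in the IOVA. */
  static void example_iova_range(struct scatterlist *sgl, int nents,
                                 dma_addr_t *start, dma_addr_t *end)
  {
          struct scatterlist *sg;
          int i;

          *start = DMA_MAPPING_ERROR;
          *end = 0;

          for_each_sg(sgl, sg, nents, i) {
                  if (sg_dma_is_p2pdma(sg))
                          continue;
                  if (*start == DMA_MAPPING_ERROR)
                          *start = sg_dma_address(sg);
                  *end = sg_dma_address(sg) + sg_dma_p2pdma_len(sg);
          }
  }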

Signed-off-by: Logan Gunthorpe 
---
 drivers/iommu/dma-iommu.c | 63 ---
 1 file changed, 53 insertions(+), 10 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 5591d6593583..1c8402474376 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -872,13 +873,16 @@ static void iommu_dma_unmap_page(struct device *dev, dma_addr_t dma_handle,
  * segment's start address to avoid concatenating across one.
  */
 static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents,
-   dma_addr_t dma_addr)
+   dma_addr_t dma_addr, unsigned long attrs)
 {
struct scatterlist *s, *cur = sg;
unsigned long seg_mask = dma_get_seg_boundary(dev);
unsigned int cur_len = 0, max_len = dma_get_max_seg_size(dev);
int i, count = 0;
 
+   if (attrs & DMA_ATTR_P2PDMA && max_len >= SG_P2PDMA_FLAG)
+   max_len = SG_P2PDMA_FLAG - 1;
+
/*
	 * The Intel graphics driver is known to assume that the returned
	 * sg list is not combined. This blocks the efforts of converting
@@ -917,6 +921,19 @@ static int __finalise_sg(struct device *dev, struct scatterlist *sg, int nents,
sg_dma_address(s) = DMA_MAPPING_ERROR;
sg_dma_len(s) = 0;
 
+   if (is_pci_p2pdma_page(sg_page(s)) && !s_iova_len) {
+   if (i > 0)
+   cur = sg_next(cur);
+
+   sg_dma_address(cur) = sg_phys(s) + s->offset -
+   pci_p2pdma_bus_offset(sg_page(s));
+   sg_dma_len(cur) = s->length | SG_P2PDMA_FLAG;
+
+   count++;
+   cur_len = 0;
+   continue;
+   }
+
/*
 * Now fill in the real DMA data. If...
 * - there is a valid output segment to append to
@@ -1013,11 +1030,12 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
struct iommu_dma_cookie *cookie = domain->iova_cookie;
	struct iova_domain *iovad = &cookie->iovad;
struct scatterlist *s, *prev = NULL;
+   struct dev_pagemap *pgmap = NULL;
int prot = dma_info_to_prot(dir, dev_is_dma_coherent(dev), attrs);
dma_addr_t iova;
size_t iova_len = 0;
unsigned long mask = dma_get_seg_boundary(dev);
-   int i;
+   int i, map = -1;
 
if (unlikely(iommu_dma_deferred_attach(dev, domain)))
return 0;
@@ -1045,6 +1063,21 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
s_length = iova_align(iovad, s_length + s_iova_off);
s->length = s_length;
 
+   if (is_pci_p2pdma_page(sg_page(s))) {
+   if (sg_page(s)->pgmap != pgmap) {
+   pgmap = sg_page(s)->pgmap;
+   map = pci_p2pdma_should_map_bus(dev, pgmap);
+   }
+
+   if (map < 0 || !(attrs & DMA_ATTR_P2PDMA))
+   goto out_restore_sg;
+
+   if (map) {
+   s->length = 0;
+   continue;
+   }
+   }
+
/*
 * Due to the alignment of our single IOVA allocation, we can
 * depend on these assumptions about the segment boundary mask:
@@ -1067,6 +1100,9 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
prev = s;
}
 
+   if (!iova_len)
+   return __finalise_sg(dev, sg, nents, 0, attrs);
+
iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
if (!iova)
goto out_restore_sg;
@@ -1078,7 +1114,7 @@ static int iommu_dma_map_sg(struct device *dev, struct scatterlist *sg,
if (iommu_map_sg_atomic(domain, iova, sg, nents, prot) < iova_len)
goto out_free_iova;
 
-   return __finalise_sg(dev, sg, nents, iova);
+	return __finalise_sg(dev, sg, nents, iova, attrs);

[RFC PATCH 00/15] Userspace P2PDMA with O_DIRECT NVMe devices

2020-11-06 Thread Logan Gunthorpe
This RFC enables P2PDMA transfers in userspace between NVMe drives using
existing O_DIRECT operations or the NVMe passthrough IOCTL.

This is accomplished by allowing userspace to allocate chunks of any CMB
by mmaping the NVMe ctrl device (Patches 14 and 15). The resulting memory
will be backed by P2P pages and can be passed only to O_DIRECT
operations. A flag is added to GUP() in Patch 10 and Patches 11 through 13
wire this flag up based on whether the block queue indicates P2PDMA
support.

The above is pretty straightforward and (I hope) largely uncontroversial.
However, the one significant problem in all this is that, presently,
pci_p2pdma_map_sg() requires a homogeneous SGL with all P2PDMA pages or
none. Enhancing GUP to enforce this rule would require a huge hack that
I don't expect would be all that palatable. So this RFC takes the
approach of removing the requirement of having a homogeneous SGL.

With the new common dma-iommu infrastructure, this patchset adds
support for P2PDMA pages into dma_map_sg() which will support AMD,
Intel (soon) and dma-direct implementations. (Other IOMMU
implementations would then be unsupported, notably ARM and PowerPC).

The other major blocker is that in order to implement support for
P2PDMA pages in dma_map_sg(), a flag is necessary to determine if a
given dma_addr_t points to P2PDMA memory or to an IOVA so that it can
be unmapped appropriately in dma_unmap_sg(). The (ugly) approach this
RFC takes is to use the top bit in the dma_length field and ensure
callers are prepared for it using a new DMA_ATTR_P2PDMA flag.
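
To make the calling convention concrete, a caller prepared for this
would look roughly like the sketch below. The driver context is
invented; sg_dma_is_p2pdma() and sg_dma_p2pdma_len() come from patch 4:

  /* Hypothetical driver fragment showing the opt-in convention. */
  static int example_map(struct device *dev, struct scatterlist *sgl,
                         int nents)
  {
          struct scatterlist *sg;
          int i, count;

          /* Without DMA_ATTR_P2PDMA, an SGL with P2PDMA pages fails. */
          count = dma_map_sg_attrs(dev, sgl, nents, DMA_TO_DEVICE,
                                   DMA_ATTR_P2PDMA);
          if (!count)
                  return -ENOMEM;

          for_each_sg(sgl, sg, count, i) {
                  dma_addr_t addr = sg_dma_address(sg);
                  /* Mask off the flag bit before using the length. */
                  unsigned int len = sg_dma_p2pdma_len(sg);

                  /* ... program addr/len into the hardware ... */
          }

          return 0;
  }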

I suspect, the ultimate solution to this blocker will be to implement
some kind of new dma_op that doesn't use the SGL. Ideas have been
thrown around in the past for one that maps some kind of novel dma_vec
directly from a bio_vec. This will become a lot easier to implement if
more dma_ops providers get converted to the new dma-iommu
implementation, but this will take time.

Alternative ideas or other feedback welcome.

This series is based on v5.10-rc2 with Lu Baolu's (and Tom Murphy's)
v4 patchset for converting the Intel IOMMU to dma-iommu[1]. A git
branch is available here:

  https://github.com/sbates130272/linux-p2pmem/  p2pdma_user_cmb_rfc

Thanks,

Logan

[1] 
https://lkml.kernel.org/lkml/20200927063437.13988-1-baolu...@linux.intel.com/T/#u.


Logan Gunthorpe (15):
  PCI/P2PDMA: Don't sleep in upstream_bridge_distance_warn()
  PCI/P2PDMA: Attempt to set map_type if it has not been set
  PCI/P2PDMA: Introduce pci_p2pdma_should_map_bus() and
pci_p2pdma_bus_offset()
  lib/scatterlist: Add flag for indicating P2PDMA segments in an SGL
  dma-direct: Support PCI P2PDMA pages in dma-direct map_sg
  dma-mapping: Add flags to dma_map_ops to indicate PCI P2PDMA support
  iommu/dma: Support PCI P2PDMA pages in dma-iommu map_sg
  nvme-pci: Check DMA ops when indicating support for PCI P2PDMA
  nvme-pci: Convert to using dma_map_sg for p2pdma pages
  mm: Introduce FOLL_PCI_P2PDMA to gate getting PCI P2PDMA pages
  iov_iter: Introduce iov_iter_get_pages_[alloc_]flags()
  block: Set FOLL_PCI_P2PDMA in __bio_iov_iter_get_pages()
  block: Set FOLL_PCI_P2PDMA in bio_map_user_iov()
  PCI/P2PDMA: Introduce pci_mmap_p2pmem()
  nvme-pci: Allow mmaping the CMB in userspace

 block/bio.c |   7 +-
 block/blk-map.c |   7 +-
 drivers/dax/super.c |   7 +-
 drivers/iommu/dma-iommu.c   |  63 +++--
 drivers/nvme/host/core.c|  14 ++-
 drivers/nvme/host/nvme.h|   3 +-
 drivers/nvme/host/pci.c |  50 ++
 drivers/pci/p2pdma.c| 178 +---
 include/linux/dma-map-ops.h |   3 +
 include/linux/dma-mapping.h |  16 
 include/linux/memremap.h|   4 +-
 include/linux/mm.h  |   1 +
 include/linux/pci-p2pdma.h  |  17 
 include/linux/scatterlist.h |   4 +
 include/linux/uio.h |  21 -
 kernel/dma/direct.c |  33 ++-
 kernel/dma/mapping.c|   8 ++
 lib/iov_iter.c  |  25 ++---
 mm/gup.c|  28 +++---
 mm/huge_memory.c|   8 +-
 mm/memory-failure.c |   4 +-
 mm/memremap.c   |  14 ++-
 22 files changed, 427 insertions(+), 88 deletions(-)


base-commit: 5ba8a2512e8c5f5cf9b7309dc895612f0a77a399
--
2.20.1


[RFC PATCH 01/15] PCI/P2PDMA: Don't sleep in upstream_bridge_distance_warn()

2020-11-06 Thread Logan Gunthorpe
In order to call this function from a dma_map function, it must not sleep.
The only reason it currently sleeps is to allocate the seq_buf used to
print which devices are within the ACS path.

Switch the kmalloc call to use GFP_NOWAIT and simply not print that
message if the buffer fails to be allocated.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c | 17 +
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index de1c331dbed4..94583779c36e 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -267,7 +267,7 @@ static int pci_bridge_has_acs_redir(struct pci_dev *pdev)
 
 static void seq_buf_print_bus_devfn(struct seq_buf *buf, struct pci_dev *pdev)
 {
-   if (!buf)
+   if (!buf || !buf->buffer)
return;
 
seq_buf_printf(buf, "%s;", pci_name(pdev));
@@ -501,19 +501,20 @@ upstream_bridge_distance_warn(struct pci_dev *provider, struct pci_dev *client,
bool acs_redirects;
int ret;
 
-	seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
-   if (!acs_list.buffer)
-   return -ENOMEM;
+	seq_buf_init(&acs_list, kmalloc(PAGE_SIZE, GFP_NOWAIT), PAGE_SIZE);
 
	ret = upstream_bridge_distance(provider, client, dist, &acs_redirects,
				       &acs_list);
if (acs_redirects) {
		pci_warn(client, "ACS redirect is set between the client and provider (%s)\n",
			 pci_name(provider));
-   /* Drop final semicolon */
-   acs_list.buffer[acs_list.len-1] = 0;
-		pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n",
-			 acs_list.buffer);
+
+   if (acs_list.buffer) {
+   /* Drop final semicolon */
+   acs_list.buffer[acs_list.len - 1] = 0;
+			pci_warn(client, "to disable ACS redirect for this path, add the kernel parameter: pci=disable_acs_redir=%s\n",
+				 acs_list.buffer);
+   }
}
 
if (ret == PCI_P2PDMA_MAP_NOT_SUPPORTED) {
-- 
2.20.1



[RFC PATCH 10/15] mm: Introduce FOLL_PCI_P2PDMA to gate getting PCI P2PDMA pages

2020-11-06 Thread Logan Gunthorpe
Callers that expect PCI P2PDMA pages can now set FOLL_PCI_P2PDMA to
allow obtaining P2PDMA pages. If a caller does not set this flag
and tries to get P2PDMA pages, GUP will fail.

This is implemented by adding a flag and a check to get_dev_pagemap().
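
A caller opting in would then look something like this sketch (the
surrounding function is invented for illustration):

  /* Hypothetical GUP caller that accepts P2PDMA pages. */
  static int example_pin_pages(unsigned long uaddr, int npages,
                               struct page **pages)
  {
          unsigned int gup_flags = FOLL_WRITE | FOLL_PCI_P2PDMA;

          /* Without FOLL_PCI_P2PDMA, this returns an error when the
           * range contains P2PDMA pages. */
          return pin_user_pages_fast(uaddr, npages, gup_flags, pages);
  }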

Signed-off-by: Logan Gunthorpe 
---
 drivers/dax/super.c  |  7 ---
 include/linux/memremap.h |  4 ++--
 include/linux/mm.h   |  1 +
 mm/gup.c | 28 +---
 mm/huge_memory.c |  8 
 mm/memory-failure.c  |  4 ++--
 mm/memremap.c| 14 ++
 7 files changed, 40 insertions(+), 26 deletions(-)

diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index edc279be3e59..59c05839b3f8 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -132,9 +132,10 @@ bool __generic_fsdax_supported(struct dax_device *dax_dev,
} else if (pfn_t_devmap(pfn) && pfn_t_devmap(end_pfn)) {
struct dev_pagemap *pgmap, *end_pgmap;
 
-   pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL);
-   end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL);
-	if (pgmap && pgmap == end_pgmap && pgmap->type == MEMORY_DEVICE_FS_DAX
+   pgmap = get_dev_pagemap(pfn_t_to_pfn(pfn), NULL, false);
+   end_pgmap = get_dev_pagemap(pfn_t_to_pfn(end_pfn), NULL, false);
+   if (!IS_ERR_OR_NULL(pgmap) && pgmap == end_pgmap
+   && pgmap->type == MEMORY_DEVICE_FS_DAX
&& pfn_t_to_page(pfn)->pgmap == pgmap
&& pfn_t_to_page(end_pfn)->pgmap == pgmap
&& pfn_t_to_pfn(pfn) == PHYS_PFN(__pa(kaddr))
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 79c49e7f5c30..14f6d899c657 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -136,7 +136,7 @@ void memunmap_pages(struct dev_pagemap *pgmap);
 void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap);
 void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap);
 struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
-   struct dev_pagemap *pgmap);
+   struct dev_pagemap *pgmap, bool allow_pci_p2pdma);
 
 unsigned long vmem_altmap_offset(struct vmem_altmap *altmap);
 void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns);
@@ -160,7 +160,7 @@ static inline void devm_memunmap_pages(struct device *dev,
 }
 
 static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
-   struct dev_pagemap *pgmap)
+   struct dev_pagemap *pgmap, bool allow_pci_p2pdma)
 {
return NULL;
 }
diff --git a/include/linux/mm.h b/include/linux/mm.h
index ef360fe70aaf..c405af966a4e 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2790,6 +2790,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 #define FOLL_SPLIT_PMD 0x2 /* split huge pmd before returning */
 #define FOLL_PIN   0x4 /* pages must be released via unpin_user_page */
 #define FOLL_FAST_ONLY 0x8 /* gup_fast: prevent fall-back to slow gup */
+#define FOLL_PCI_P2PDMA0x10 /* allow returning PCI P2PDMA pages */
 
 /*
  * FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
diff --git a/mm/gup.c b/mm/gup.c
index 102877ed77a4..abbc0a06d241 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -449,11 +449,16 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 * case since they are only valid while holding the pgmap
 * reference.
 */
-   *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
-   if (*pgmap)
+   *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap,
+flags & FOLL_PCI_P2PDMA);
+   if (IS_ERR(*pgmap)) {
+   page = ERR_CAST(*pgmap);
+   goto out;
+   } else if (*pgmap) {
page = pte_page(pte);
-   else
+   } else {
goto no_page;
+   }
} else if (unlikely(!page)) {
if (flags & FOLL_DUMP) {
/* Avoid special (like zero) pages in core dumps */
@@ -794,7 +799,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
struct page *page;
 
page = follow_page_mask(vma, address, foll_flags, );
-   if (ctx.pgmap)
+   if (!IS_ERR_OR_NULL(ctx.pgmap))
put_dev_pagemap(ctx.pgmap);
return page;
 }
@@ -1138,7 +1143,7 @@ static long __get_user_pages(struct mm_struct *mm,
nr_pages -= page_increm;
} while (nr_pages);
 out:
-   if (ctx.pgmap)
+   if (!IS_ERR_OR_NULL(ctx.pgmap))
put_dev_pagemap(ctx.pgmap);
   

[RFC PATCH 03/15] PCI/P2PDMA: Introduce pci_p2pdma_should_map_bus() and pci_p2pdma_bus_offset()

2020-11-06 Thread Logan Gunthorpe
Introduce pci_p2pdma_should_map_bus() which is meant to be called by
dma map functions to determine how to map a given p2pdma page.

pci_p2pdma_bus_offset() is also added to allow callers to get the bus
offset if they need to map the bus address.
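
A dma_map implementation would use the pair roughly as follows. This is
a fragment for illustration only; the normal mapping path and error
handling are elided:

  /* Deciding how to map one P2PDMA page inside a map operation. */
  if (is_pci_p2pdma_page(page)) {
          int map = pci_p2pdma_should_map_bus(dev, page->pgmap);

          if (map < 0)
                  return DMA_MAPPING_ERROR; /* device can't reach it */
          if (map) /* 1: bypass the IOMMU, use the PCI bus address */
                  return page_to_phys(page) + offset -
                         pci_p2pdma_bus_offset(page);
          /* 0: fall through and map through the host bridge/IOMMU */
  }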

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c   | 46 ++
 include/linux/pci-p2pdma.h | 11 +
 2 files changed, 57 insertions(+)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index ea8472278b11..9961e779f430 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -930,6 +930,52 @@ void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg_attrs);
 
+/**
+ * pci_p2pdma_bus_offset - returns the bus offset for a given page
+ * @page: page to get the offset for
+ *
+ * Must be passed a pci p2pdma page.
+ */
+u64 pci_p2pdma_bus_offset(struct page *page)
+{
+   struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(page->pgmap);
+
+   WARN_ON(!is_pci_p2pdma_page(page));
+
+   return p2p_pgmap->bus_offset;
+}
+EXPORT_SYMBOL_GPL(pci_p2pdma_bus_offset);
+
+/**
+ * pci_p2pdma_should_map_bus - determine if a dma mapping should use the
+ * bus address
+ * @dev: device doing the DMA request
+ * @pgmap: dev_pagemap structure for the mapping
+ *
+ * Returns 1 if the page should be mapped with a bus address, 0 otherwise,
+ * and -1 if the device should not map P2PDMA pages.
+ */
+int pci_p2pdma_should_map_bus(struct device *dev, struct dev_pagemap *pgmap)
+{
+   struct pci_p2pdma_pagemap *p2p_pgmap = to_p2p_pgmap(pgmap);
+   struct pci_dev *client;
+
+   if (!dev_is_pci(dev))
+   return -1;
+
+   client = to_pci_dev(dev);
+
+   switch (pci_p2pdma_map_type(p2p_pgmap->provider, client)) {
+   case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+   return 0;
+   case PCI_P2PDMA_MAP_BUS_ADDR:
+   return 1;
+   default:
+   return -1;
+   }
+}
+EXPORT_SYMBOL_GPL(pci_p2pdma_should_map_bus);
+
 /**
  * pci_p2pdma_enable_store - parse a configfs/sysfs attribute store
  * to enable p2pdma
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 8318a97c9c61..fc5de47eeac4 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -34,6 +34,8 @@ int pci_p2pdma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs);
 void pci_p2pdma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
int nents, enum dma_data_direction dir, unsigned long attrs);
+u64 pci_p2pdma_bus_offset(struct page *page);
+int pci_p2pdma_should_map_bus(struct device *dev, struct dev_pagemap *pgmap);
 int pci_p2pdma_enable_store(const char *page, struct pci_dev **p2p_dev,
bool *use_p2pdma);
 ssize_t pci_p2pdma_enable_show(char *page, struct pci_dev *p2p_dev,
@@ -83,6 +85,15 @@ static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev,
 static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
 {
 }
+static inline u64 pci_p2pdma_bus_offset(struct page *page)
+{
+   return -1;
+}
+static inline int pci_p2pdma_should_map_bus(struct device *dev,
+   struct dev_pagemap *pgmap)
+{
+   return -1;
+}
 static inline int pci_p2pdma_map_sg_attrs(struct device *dev,
struct scatterlist *sg, int nents, enum dma_data_direction dir,
unsigned long attrs)
-- 
2.20.1



[RFC PATCH 15/15] nvme-pci: Allow mmaping the CMB in userspace

2020-11-06 Thread Logan Gunthorpe
Allow userspace to obtain CMB memory by mmaping the controller's
char device. The mmap call allocates and returns a hunk of CMB memory
(the offset is ignored), so userspace does not have control over the
address within the CMB.

A VMA allocated in this way will only be usable by drivers that set
FOLL_PCI_P2PDMA when calling GUP. And inter-device support will be
checked the first time the pages are mapped for DMA.

Currently this is only supported by O_DIRECT to a PCI NVMe device
or through the NVMe passthrough IOCTL.
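
From userspace, the intended flow is roughly the sketch below. The
device paths and sizes are made up and error handling is omitted:

  /* Hypothetical example: allocate CMB memory from one NVMe device
   * and use it as an O_DIRECT buffer for reads from another. */
  #define _GNU_SOURCE
  #include <fcntl.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
          int ctrl = open("/dev/nvme0", O_RDWR);
          int data = open("/dev/nvme1n1", O_RDONLY | O_DIRECT);
          size_t len = 2 * 1024 * 1024;

          /* The offset is ignored; the kernel picks the CMB chunk. */
          void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_SHARED, ctrl, 0);

          /* This DMAs directly from nvme1n1 into nvme0's CMB. */
          pread(data, buf, 4096, 0);

          munmap(buf, len);
          close(data);
          close(ctrl);
          return 0;
  }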

Signed-off-by: Logan Gunthorpe 
---
 drivers/nvme/host/core.c | 11 +++
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  |  9 +
 3 files changed, 21 insertions(+)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f14316c9b34a..fc642aba671d 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3240,12 +3240,23 @@ static long nvme_dev_ioctl(struct file *file, unsigned 
int cmd,
}
 }
 
+static int nvme_dev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+   struct nvme_ctrl *ctrl = file->private_data;
+
+   if (!ctrl->ops->mmap_cmb)
+   return -ENODEV;
+
+   return ctrl->ops->mmap_cmb(ctrl, vma);
+}
+
 static const struct file_operations nvme_dev_fops = {
.owner  = THIS_MODULE,
.open   = nvme_dev_open,
.release= nvme_dev_release,
.unlocked_ioctl = nvme_dev_ioctl,
.compat_ioctl   = compat_ptr_ioctl,
+   .mmap   = nvme_dev_mmap,
 };
 
 static ssize_t nvme_sysfs_reset(struct device *dev,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a0bfe8709cfc..3d790d849701 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -473,6 +473,7 @@ struct nvme_ctrl_ops {
void (*delete_ctrl)(struct nvme_ctrl *ctrl);
int (*get_address)(struct nvme_ctrl *ctrl, char *buf, int size);
bool (*supports_pci_p2pdma)(struct nvme_ctrl *ctrl);
+   int (*mmap_cmb)(struct nvme_ctrl *ctrl, struct vm_area_struct *vma);
 };
 
 #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 26976bdf4af0..9c61573111ea 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2687,6 +2687,14 @@ static bool nvme_pci_supports_pci_p2pdma(struct nvme_ctrl *ctrl)
return dma_pci_p2pdma_supported(dev->dev);
 }
 
+static int nvme_pci_mmap_cmb(struct nvme_ctrl *ctrl,
+struct vm_area_struct *vma)
+{
+   struct pci_dev *pdev = to_pci_dev(to_nvme_dev(ctrl)->dev);
+
+   return pci_mmap_p2pmem(pdev, vma);
+}
+
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name   = "pcie",
.module = THIS_MODULE,
@@ -2698,6 +2706,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.submit_async_event = nvme_pci_submit_async_event,
.get_address= nvme_pci_get_address,
.supports_pci_p2pdma= nvme_pci_supports_pci_p2pdma,
+   .mmap_cmb   = nvme_pci_mmap_cmb,
 };
 
 static int nvme_dev_map(struct nvme_dev *dev)
-- 
2.20.1



[RFC PATCH 04/15] lib/scatterlist: Add flag for indicating P2PDMA segments in an SGL

2020-11-06 Thread Logan Gunthorpe
We make use of the top bit of the dma_length to indicate a P2PDMA
segment. Code that uses this will need to use sg_dma_p2pdma_len() to
get the length and ensure no lengths exceed 2GB.

An sg_dma_is_p2pdma() helper is included to check whether a particular
segment is a P2PDMA segment.
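
In other words, producers and consumers of such an SGL interact with
the bit roughly as follows (fragment for illustration only):

  /* Producer side: mark a segment as carrying a PCI bus address.
   * The real length must stay below 2GB so the flag bit is free. */
  sg_dma_address(sg) = bus_addr;
  sg_dma_len(sg) = len | SG_P2PDMA_FLAG;

  /* Consumer side: test and strip the flag before using the length. */
  if (sg_dma_is_p2pdma(sg))
          len = sg_dma_p2pdma_len(sg);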

Signed-off-by: Logan Gunthorpe 
---
 include/linux/scatterlist.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h
index 36c47e7e66a2..e738159d56f9 100644
--- a/include/linux/scatterlist.h
+++ b/include/linux/scatterlist.h
@@ -39,6 +39,10 @@ struct scatterlist {
 #define sg_dma_len(sg) ((sg)->length)
 #endif
 
+#define SG_P2PDMA_FLAG (1U << 31)
+#define sg_dma_p2pdma_len(sg)  (sg_dma_len(sg) & ~SG_P2PDMA_FLAG)
+#define sg_dma_is_p2pdma(sg)   (sg_dma_len(sg) & SG_P2PDMA_FLAG)
+
 struct sg_table {
struct scatterlist *sgl;/* the list */
unsigned int nents; /* number of mapped entries */
-- 
2.20.1



Re: [PATCH blktests v3 00/11] NVMe Target Passthru Block Tests

2020-10-22 Thread Logan Gunthorpe



On 2020-10-22 4:04 p.m., Omar Sandoval wrote:
> On Thu, Oct 22, 2020 at 12:45:25PM -0600, Logan Gunthorpe wrote:
>>
>> On 2020-10-08 10:40 a.m., Logan Gunthorpe wrote:
>>> Hi,
>>>
>>> This series adds blktests for the nvmet passthru feature that was merged
>>> for 5.9. It's been reconciled with Sagi's blktest series that Omar
>>> recently merged.
>>
>> Bump. This has been around for a while now. Omar, can you please
>> consider picking this up?
> 
> There were a couple of shellcheck errors:
> 
> tests/nvme/rc:77:8: warning: Declare and assign separately to avoid masking return values. [SC2155]
> tests/nvme/rc:278:7: note: Useless echo? Instead of 'echo $(cmd)', just use 'cmd'. [SC2005]
> 
> I fixed those up and applied, thanks.

Oh, oops. I must have introduced those very recently.

Thanks for fixing it up and merging!

Logan


Re: [PATCH blktests v3 00/11] NVMe Target Passthru Block Tests

2020-10-22 Thread Logan Gunthorpe


On 2020-10-08 10:40 a.m., Logan Gunthorpe wrote:
> Hi,
> 
> This series adds blktests for the nvmet passthru feature that was merged
> for 5.9. It's been reconciled with Sagi's blktest series that Omar
> recently merged.

Bump. This has been around for a while now. Omar, can you please
consider picking this up?

Thanks,

Logan


[PATCH blktests v3 00/11] NVMe Target Passthru Block Tests

2020-10-08 Thread Logan Gunthorpe
Hi,

This series adds blktests for the nvmet passthru feature that was merged
for 5.9. It's been reconciled with Sagi's blktest series that Omar
recently merged.

This series is based off of the current blktests master and a git repo is
available for this here:

https://github.com/Eideticom/blktests nvmet_passthru_v3

Thanks,

Logan

--

Changes in v3:
- Fixed a nit with variable initialization in patch 5 (per Chaitanya)
- Replaced use of printf with echo (per Chaitanya)

Changes in v2:

- Rebased on latest blktests master and changed to use the common
  helpers Sagi introduced in his series
- Collected Chaitanya's reviewed-by tag

--


Logan Gunthorpe (11):
  common/fio: Remove state file in common helper
  common/xfs: Create common helper to check for XFS support
  common/xfs: Create common helper to verify block device with xfs
  nvme: Search for specific subsysnqn in _find_nvme_loop_dev
  nvme: Add common helpers for passthru tests
  nvme/033: Simple test to create and connect to a passthru target
  nvme/034: Add test for passthru data verification
  nvme/035: Add test to verify passthru controller with a filesystem
  nvme/036: Add test for testing reset command on nvme-passthru
  nvme/037: Add test which loops passthru connect and disconnect
  nvme/038: Test removal of un-enabled subsystem and ports

 common/fio |  1 +
 common/rc  |  8 +
 common/xfs | 33 +++
 tests/nvme/004 |  2 +-
 tests/nvme/005 |  2 +-
 tests/nvme/008 |  2 +-
 tests/nvme/009 |  2 +-
 tests/nvme/010 |  3 +-
 tests/nvme/011 |  3 +-
 tests/nvme/012 | 23 -
 tests/nvme/013 | 21 +++-
 tests/nvme/014 |  2 +-
 tests/nvme/015 |  2 +-
 tests/nvme/018 |  2 +-
 tests/nvme/019 |  2 +-
 tests/nvme/020 |  2 +-
 tests/nvme/021 |  2 +-
 tests/nvme/022 |  2 +-
 tests/nvme/023 |  2 +-
 tests/nvme/024 |  2 +-
 tests/nvme/025 |  2 +-
 tests/nvme/026 |  2 +-
 tests/nvme/027 |  2 +-
 tests/nvme/028 |  2 +-
 tests/nvme/029 |  2 +-
 tests/nvme/033 | 67 +
 tests/nvme/033.out |  7 
 tests/nvme/034 | 35 
 tests/nvme/034.out |  3 ++
 tests/nvme/035 | 37 +
 tests/nvme/035.out |  3 ++
 tests/nvme/036 | 37 +
 tests/nvme/036.out |  3 ++
 tests/nvme/037 | 35 
 tests/nvme/037.out |  2 ++
 tests/nvme/038 | 36 
 tests/nvme/038.out |  2 ++
 tests/nvme/rc  | 82 --
 38 files changed, 419 insertions(+), 58 deletions(-)
 create mode 100644 common/xfs
 create mode 100755 tests/nvme/033
 create mode 100644 tests/nvme/033.out
 create mode 100755 tests/nvme/034
 create mode 100644 tests/nvme/034.out
 create mode 100755 tests/nvme/035
 create mode 100644 tests/nvme/035.out
 create mode 100755 tests/nvme/036
 create mode 100644 tests/nvme/036.out
 create mode 100755 tests/nvme/037
 create mode 100644 tests/nvme/037.out
 create mode 100755 tests/nvme/038
 create mode 100644 tests/nvme/038.out


base-commit: 20445c5eb6456addca9131ec6917d2a2d7414e04
--
2.20.1


[PATCH blktests v3 01/11] common/fio: Remove state file in common helper

2020-10-08 Thread Logan Gunthorpe
Instead of each individual test removing this file, just do it
in the common helper.

Signed-off-by: Logan Gunthorpe 
Reviewed-by: Chaitanya Kulkarni 
---
 common/fio | 1 +
 tests/nvme/010 | 1 -
 tests/nvme/011 | 1 -
 tests/nvme/012 | 1 -
 tests/nvme/013 | 1 -
 5 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/common/fio b/common/fio
index 8bfad4238dda..94c65c107a14 100644
--- a/common/fio
+++ b/common/fio
@@ -181,6 +181,7 @@ _run_fio_rand_io() {
 _run_fio_verify_io() {
	_run_fio --name=verify --rw=randwrite --direct=1 --ioengine=libaio --bs=4k \
--iodepth=16 --verify=crc32c "$@"
+   rm -f local*verify*state
 }
 
 _fio_perf_report() {
diff --git a/tests/nvme/010 b/tests/nvme/010
index 9d96d7803be3..0188e842213e 100755
--- a/tests/nvme/010
+++ b/tests/nvme/010
@@ -52,7 +52,6 @@ test() {
losetup -d "${loop_dev}"
 
rm "${file_path}"
-   rm -f local*verify*state
 
echo "Test complete"
 }
diff --git a/tests/nvme/011 b/tests/nvme/011
index 06dc568fb6ea..543dbe840874 100755
--- a/tests/nvme/011
+++ b/tests/nvme/011
@@ -48,7 +48,6 @@ test() {
_remove_nvmet_port "${port}"
 
rm "${file_path}"
-   rm -f local-write-and-verify*state
 
echo "Test complete"
 }
diff --git a/tests/nvme/012 b/tests/nvme/012
index 8110430e49d4..1dbd59804ed7 100755
--- a/tests/nvme/012
+++ b/tests/nvme/012
@@ -63,7 +63,6 @@ test() {
 
losetup -d "${loop_dev}"
 
-   rm -f local*verify*state
rm "${file_path}"
rm -fr "${mount_dir}"
 
diff --git a/tests/nvme/013 b/tests/nvme/013
index 176b11b9ccb5..df7f23e69252 100755
--- a/tests/nvme/013
+++ b/tests/nvme/013
@@ -58,7 +58,6 @@ test() {
_remove_nvmet_subsystem "${subsys_name}"
_remove_nvmet_port "${port}"
 
-   rm -f local*verify*state
rm "${file_path}"
rm -fr "${mount_dir}"
 
-- 
2.20.1



[PATCH blktests v3 06/11] nvme/033: Simple test to create and connect to a passthru target

2020-10-08 Thread Logan Gunthorpe
This tests creates and connects to a passthru controller backed
by a test NVMe namespace. It then verifies that some common fields
in id-ctrl and id-ns are the same in the target and the original
device.

Signed-off-by: Logan Gunthorpe 
---
 tests/nvme/033 | 67 ++
 tests/nvme/033.out |  7 +
 2 files changed, 74 insertions(+)
 create mode 100755 tests/nvme/033
 create mode 100644 tests/nvme/033.out

diff --git a/tests/nvme/033 b/tests/nvme/033
new file mode 100755
index ..c6a3f7feb50e
--- /dev/null
+++ b/tests/nvme/033
@@ -0,0 +1,67 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2019 Logan Gunthorpe
+# Copyright (C) 2019 Eideticom Communications Inc.
+
+. tests/nvme/rc
+
+DESCRIPTION="create and connect to an NVMeOF target with a passthru controller"
+QUICK=1
+
+requires() {
+   _nvme_requires
+   _have_kernel_option NVME_TARGET_PASSTHRU
+}
+
+nvme_info() {
+   local ns=$1
+
+   nvme id-ctrl "${ns}" | grep -E '^(vid|sn|mn|fr) '
+   nvme id-ns "${ns}" | grep -E '^(nsze|ncap) '
+}
+
+compare_dev_info() {
+   local passthru_dev=$1
+   local testdev_info
+   local passthru_info
+
+   testdev_info=$(nvme_info "${TEST_DEV}")
+   passthru_info=$(nvme_info "${passthru_dev}")
+
+   cat >> "${FULL}" <<- EOF
+
+   Test Device ${TEST_DEV} Info:
+   $testdev_info
+
+   Passthru Device ${passthru_dev} Info:
+   $passthru_info
+
+   EOF
+
+   diff -u <(echo "${testdev_info}") <(echo "${passthru_info}")
+   if [[ "${testdev_info}" != "${passthru_info}" ]]; then
+   echo "ERROR: Device information does not match! (See ${FULL})"
+   fi
+}
+
+test_device() {
+   local subsys="blktests-subsystem-1"
+   local nsdev
+   local port
+
+   echo "Running ${TEST_NAME}"
+
+   _setup_nvmet
+   port=$(_nvmet_passthru_target_setup "${subsys}")
+
+   _nvme_discover "${nvme_trtype}" | _filter_discovery
+
+   nsdev=$(_nvmet_passthru_target_connect "${nvme_trtype}" "${subsys}")
+
+   compare_dev_info "${nsdev}"
+
+   _nvme_disconnect_subsys "${subsys}"
+   _nvmet_passthru_target_cleanup "${port}" "${subsys}"
+
+   echo "Test complete"
+}
diff --git a/tests/nvme/033.out b/tests/nvme/033.out
new file mode 100644
index ..6f45a1d5ec1d
--- /dev/null
+++ b/tests/nvme/033.out
@@ -0,0 +1,7 @@
+Running nvme/033
+Discovery Log Number of Records 1, Generation counter X
+=Discovery Log Entry 0==
+trtype:  loop
+subnqn:  blktests-subsystem-1
+NQN:blktests-subsystem-1 disconnected 1 controller(s)
+Test complete
-- 
2.20.1



[PATCH blktests v3 03/11] common/xfs: Create common helper to verify block device with xfs

2020-10-08 Thread Logan Gunthorpe
Make a common helper from the code in tests nvme/012 and nvme/013
to run an fio verify on a XFS file system backed by the
specified block device.

While we are at it, all the output is redirected to $FULL instead of
/dev/null.

Signed-off-by: Logan Gunthorpe 
---
 common/xfs | 22 ++
 tests/nvme/012 | 14 +-
 tests/nvme/013 | 14 +-
 3 files changed, 24 insertions(+), 26 deletions(-)

diff --git a/common/xfs b/common/xfs
index d1a603b8c7b5..210c924cdd41 100644
--- a/common/xfs
+++ b/common/xfs
@@ -9,3 +9,25 @@
 _have_xfs() {
_have_fs xfs && _have_program mkfs.xfs
 }
+
+_xfs_mkfs_and_mount() {
+   local bdev=$1
+   local mount_dir=$2
+
+   mkdir -p "${mount_dir}"
+   umount "${mount_dir}"
+   mkfs.xfs -l size=32m -f "${bdev}"
+   mount "${bdev}" "${mount_dir}"
+}
+
+_xfs_run_fio_verify_io() {
+   local mount_dir="/mnt/blktests"
+   local bdev=$1
+
+   _xfs_mkfs_and_mount "${bdev}" "${mount_dir}" >> "${FULL}" 2>&1
+
+   _run_fio_verify_io --size=950m --directory="${mount_dir}/"
+
+   umount "${mount_dir}" >> "${FULL}" 2>&1
+   rm -fr "${mount_dir}"
+}
diff --git a/tests/nvme/012 b/tests/nvme/012
index 1d8d8e3cc271..a13cd08ce6bf 100755
--- a/tests/nvme/012
+++ b/tests/nvme/012
@@ -26,12 +26,9 @@ test() {
local port
local nvmedev
local loop_dev
-   local mount_dir="/mnt/blktests"
local file_path="${TMPDIR}/img"
local subsys_name="blktests-subsystem-1"
 
-   mkdir -p "${mount_dir}" > /dev/null 2>&1
-
truncate -s 1G "${file_path}"
 
loop_dev="$(losetup -f --show "${file_path}")"
@@ -47,15 +44,7 @@ test() {
cat "/sys/block/${nvmedev}n1/uuid"
cat "/sys/block/${nvmedev}n1/wwid"
 
-   umount ${mount_dir} > /dev/null 2>&1
-
-   mkfs.xfs -l size=32m -f /dev/"${nvmedev}n1" > /dev/null 2>&1
-
-   mount /dev/"${nvmedev}n1" "${mount_dir}"
-
-   _run_fio_verify_io --size=950m --directory="${mount_dir}/"
-
-   umount "${mount_dir}" > /dev/null 2>&1
+   _xfs_run_fio_verify_io "/dev/${nvmedev}n1"
 
_nvme_disconnect_subsys "${subsys_name}"
 
@@ -66,7 +55,6 @@ test() {
losetup -d "${loop_dev}"
 
rm "${file_path}"
-   rm -fr "${mount_dir}"
 
echo "Test complete"
 }
diff --git a/tests/nvme/013 b/tests/nvme/013
index 3819a2730d9b..1ac725ea83f2 100755
--- a/tests/nvme/013
+++ b/tests/nvme/013
@@ -24,13 +24,10 @@ test() {
 
local port
local nvmedev
-   local mount_dir="/mnt/blktests/"
local file_path="${TMPDIR}/img"
 
local subsys_name="blktests-subsystem-1"
 
-   mkdir -p "${mount_dir}" > /dev/null 2>&1
-
truncate -s 1G "${file_path}"
 
_create_nvmet_subsystem "${subsys_name}" "${file_path}" \
@@ -44,15 +41,7 @@ test() {
cat "/sys/block/${nvmedev}n1/uuid"
cat "/sys/block/${nvmedev}n1/wwid"
 
-   umount ${mount_dir} > /dev/null 2>&1
-
-   mkfs.xfs -l size=32m -f /dev/"${nvmedev}n1" > /dev/null 2>&1
-
-   mount /dev/"${nvmedev}n1" "${mount_dir}"
-
-   _run_fio_verify_io --size=800m --directory="${mount_dir}/"
-
-   umount "${mount_dir}" > /dev/null 2>&1
+   _xfs_run_fio_verify_io "/dev/${nvmedev}n1"
 
_nvme_disconnect_subsys "${subsys_name}"
 
@@ -61,7 +50,6 @@ test() {
_remove_nvmet_port "${port}"
 
rm "${file_path}"
-   rm -fr "${mount_dir}"
 
echo "Test complete"
 }
-- 
2.20.1



[PATCH blktests v3 02/11] common/xfs: Create common helper to check for XFS support

2020-10-08 Thread Logan Gunthorpe
Two nvme tests create and mount XFS filesystems and check for mkfs.xfs.

They should also check for XFS support in the kernel so create a common
helper for this.

Signed-off-by: Logan Gunthorpe 
---
 common/rc  |  8 
 common/xfs | 11 +++
 tests/nvme/012 |  6 --
 tests/nvme/013 |  4 +++-
 4 files changed, 26 insertions(+), 3 deletions(-)
 create mode 100644 common/xfs

diff --git a/common/rc b/common/rc
index cdc0150ea5ea..44cb218c2fac 100644
--- a/common/rc
+++ b/common/rc
@@ -181,6 +181,14 @@ _have_tracepoint() {
return 0
 }
 
+_have_fs() {
+   modprobe "$1" >/dev/null 2>&1
+   if [[ ! -d "/sys/fs/$1" ]]; then
+   SKIP_REASON="kernel does not support filesystem $1"
+   return 1
+   fi
+}
+
 _test_dev_is_rotational() {
[[ $(cat "${TEST_DEV_SYSFS}/queue/rotational") -ne 0 ]]
 }
diff --git a/common/xfs b/common/xfs
new file mode 100644
index ..d1a603b8c7b5
--- /dev/null
+++ b/common/xfs
@@ -0,0 +1,11 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2017 Omar Sandoval
+#
+# xfs helper functions.
+
+. common/shellcheck
+
+_have_xfs() {
+   _have_fs xfs && _have_program mkfs.xfs
+}
diff --git a/tests/nvme/012 b/tests/nvme/012
index 1dbd59804ed7..1d8d8e3cc271 100755
--- a/tests/nvme/012
+++ b/tests/nvme/012
@@ -5,14 +5,16 @@
 # Test mkfs with data verification for block device backed ns.
 
 . tests/nvme/rc
+. common/xfs
 
 DESCRIPTION="run mkfs and data verification fio job on NVMeOF block 
device-backed ns"
 TIMED=1
 
 requires() {
_nvme_requires
-   _have_program mkfs.xfs && _have_program fio && \
-   _have_modules loop
+   _have_xfs
+   _have_program fio
+   _have_modules loop
_require_nvme_trtype_is_fabrics
 }
 
diff --git a/tests/nvme/013 b/tests/nvme/013
index df7f23e69252..3819a2730d9b 100755
--- a/tests/nvme/013
+++ b/tests/nvme/013
@@ -5,13 +5,15 @@
 # Test mkfs with data verification for file backed ns.
 
 . tests/nvme/rc
+. common/xfs
 
 DESCRIPTION="run mkfs and data verification fio job on NVMeOF file-backed ns"
 TIMED=1
 
 requires() {
_nvme_requires
-   _have_program mkfs.xfs && _have_fio
+   _have_xfs
+   _have_fio
_require_nvme_trtype_is_fabrics
 }
 
-- 
2.20.1



[PATCH blktests v3 11/11] nvme/038: Test removal of un-enabled subsystem and ports

2020-10-08 Thread Logan Gunthorpe
Test that we can remove a subsystem that has not been enabled by
passthru or any ns. Do the same for ports while we are at it.

This was an issue in the original passthru patches and is
not commonly tested. So this test will ensure we don't regress this.

Signed-off-by: Logan Gunthorpe 
---
 tests/nvme/038 | 36 
 tests/nvme/038.out |  2 ++
 2 files changed, 38 insertions(+)
 create mode 100755 tests/nvme/038
 create mode 100644 tests/nvme/038.out

diff --git a/tests/nvme/038 b/tests/nvme/038
new file mode 100755
index ..24f02d4ad4d1
--- /dev/null
+++ b/tests/nvme/038
@@ -0,0 +1,36 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2019 Logan Gunthorpe
+# Copyright (C) 2019 Eideticom Communications Inc.
+#
+# Test that we can remove a subsystem that has not been enabled by
+# passthru or any ns. Do the same for ports while we are at it.
+#
+# This was an issue in the original passthru patches and is
+# not commonly tested. So this test will ensure we don't regress this.
+#
+. tests/nvme/rc
+
+DESCRIPTION="test deletion of NVMeOF subsystem without enabling"
+QUICK=1
+
+requires() {
+   _nvme_requires
+}
+
+test() {
+   local subsys_path="${NVMET_CFS}/subsystems/blktests-subsystem-1"
+   local port
+
+   echo "Running ${TEST_NAME}"
+
+   _setup_nvmet
+
+   mkdir -p "${subsys_path}"
+   rmdir "${subsys_path}"
+
+   port=$(_create_nvmet_port loop)
+   _remove_nvmet_port "${port}"
+
+   echo "Test complete"
+}
diff --git a/tests/nvme/038.out b/tests/nvme/038.out
new file mode 100644
index ..06bc98022c33
--- /dev/null
+++ b/tests/nvme/038.out
@@ -0,0 +1,2 @@
+Running nvme/038
+Test complete
-- 
2.20.1



[PATCH blktests v3 05/11] nvme: Add common helpers for passthru tests

2020-10-08 Thread Logan Gunthorpe
Add some simple helpers to set up a passthru target that passes through
to an NVMe test device.

Signed-off-by: Logan Gunthorpe 
---
 tests/nvme/rc | 75 +++
 1 file changed, 75 insertions(+)

diff --git a/tests/nvme/rc b/tests/nvme/rc
index dfa57a299625..f0b6f360da8c 100644
--- a/tests/nvme/rc
+++ b/tests/nvme/rc
@@ -73,6 +73,16 @@ _require_nvme_trtype_is_fabrics() {
return 0
 }
 
+_test_dev_nvme_ctrl() {
+   local dev=$(cat "${TEST_DEV_SYSFS}/device/dev")
+
+   echo "/dev/char/${dev}"
+}
+
+_test_dev_nvme_nsid() {
+   cat "${TEST_DEV_SYSFS}/nsid"
+}
+
 _cleanup_nvmet() {
local dev
local port
@@ -257,6 +267,27 @@ _remove_nvmet_subsystem() {
rmdir "${subsys_path}"
 }
 
+_create_nvmet_passthru() {
+   local nvmet_subsystem="$1"
+   local subsys_path="${NVMET_CFS}/subsystems/${nvmet_subsystem}"
+   local passthru_path="${subsys_path}/passthru"
+
+   mkdir -p "${subsys_path}"
+   echo 1 > "${subsys_path}/attr_allow_any_host"
+
+   echo "$(_test_dev_nvme_ctrl)" > "${passthru_path}/device_path"
+   echo 1 > "${passthru_path}/enable"
+}
+
+_remove_nvmet_passhtru() {
+   local nvmet_subsystem="$1"
+   local subsys_path="${NVMET_CFS}/subsystems/${nvmet_subsystem}"
+   local passthru_path="${subsys_path}/passthru"
+
+   echo 0 > "${passthru_path}/enable"
+   rmdir "${subsys_path}"
+}
+
 _add_nvmet_subsys_to_port() {
local port="$1"
local nvmet_subsystem="$2"
@@ -292,6 +323,50 @@ _find_nvme_dev() {
done
 }
 
+_find_nvme_passthru_loop_dev() {
+   local subsys=$1
+   local nsid
+   local dev
+
+   dev=$(_find_nvme_dev "${subsys}")
+   nsid=$(_test_dev_nvme_nsid)
+   echo "/dev/${dev}n${nsid}"
+}
+
+_nvmet_passthru_target_setup() {
+   local subsys_name=$1
+
+   _create_nvmet_passthru "${subsys_name}"
+   port="$(_create_nvmet_port "loop")"
+   _add_nvmet_subsys_to_port "${port}" "${subsys_name}"
+
+   echo "$port"
+}
+
+_nvmet_passthru_target_connect() {
+   local trtype=$1
+   local subsys_name=$2
+
+   _nvme_connect_subsys "${trtype}" "${subsys_name}"
+   nsdev=$(_find_nvme_passthru_loop_dev "${subsys_name}")
+
+   # The following tests can race with the creation
+   # of the device so ensure the block device exists
+   # before continuing
+   while [ ! -b "${nsdev}" ]; do sleep 1; done
+
+   echo "${nsdev}"
+}
+
+_nvmet_passthru_target_cleanup() {
+   local port=$1
+   local subsys_name=$2
+
+   _remove_nvmet_subsystem_from_port "${port}" "${subsys_name}"
+   _remove_nvmet_port "${port}"
+   _remove_nvmet_passhtru "${subsys_name}"
+}
+
 _filter_discovery() {
sed -n -r -e "s/Generation counter [0-9]+/Generation counter X/" \
  -e '/Discovery Log Number|Log Entry|trtype|subnqn/p'
-- 
2.20.1



[PATCH blktests v3 10/11] nvme/037: Add test which loops passthru connect and disconnect

2020-10-08 Thread Logan Gunthorpe
Similar to test nvme/031 except for passthru controllers.

Note: it's normal to get I/O errors in this test because, when the
controller disconnects, it races with the partition table read.

Signed-off-by: Logan Gunthorpe 
---
 tests/nvme/037 | 35 +++
 tests/nvme/037.out |  2 ++
 2 files changed, 37 insertions(+)
 create mode 100755 tests/nvme/037
 create mode 100644 tests/nvme/037.out

diff --git a/tests/nvme/037 b/tests/nvme/037
new file mode 100755
index ..fc6c21343652
--- /dev/null
+++ b/tests/nvme/037
@@ -0,0 +1,35 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2019 Logan Gunthorpe
+# Copyright (C) 2019 Eideticom Communications Inc.
+
+. tests/nvme/rc
+
+DESCRIPTION="test deletion of NVMeOF passthru controllers immediately after 
setup"
+
+requires() {
+   _nvme_requires
+   _have_kernel_option NVME_TARGET_PASSTHRU
+}
+
+test_device() {
+   local subsys="blktests-subsystem-"
+   local iterations=10
+   local ctrldev
+   local port
+
+   echo "Running ${TEST_NAME}"
+
+   _setup_nvmet
+
+   for ((i = 0; i < iterations; i++)); do
+   port=$(_nvmet_passthru_target_setup "${subsys}${i}")
+   nsdev=$(_nvmet_passthru_target_connect "${nvme_trtype}" \
+   "${subsys}${i}")
+
+   _nvme_disconnect_subsys "${subsys}${i}" >>"${FULL}" 2>&1
+   _nvmet_passthru_target_cleanup "${port}" "${subsys}${i}"
+   done
+
+   echo "Test complete"
+}
diff --git a/tests/nvme/037.out b/tests/nvme/037.out
new file mode 100644
index ..eaf903d0520e
--- /dev/null
+++ b/tests/nvme/037.out
@@ -0,0 +1,2 @@
+Running nvme/037
+Test complete
-- 
2.20.1



[PATCH blktests v3 04/11] nvme: Search for specific subsysnqn in _find_nvme_loop_dev

2020-10-08 Thread Logan Gunthorpe
This ensures we find the correct nvme loop device if others exist on a
given system (which is generally not expected on test systems).

Additionally, this will be required in the upcoming test nvme/037 which
will have controllers racing with ones being destroyed.

Signed-off-by: Logan Gunthorpe 
---
 tests/nvme/004 | 2 +-
 tests/nvme/005 | 2 +-
 tests/nvme/008 | 2 +-
 tests/nvme/009 | 2 +-
 tests/nvme/010 | 2 +-
 tests/nvme/011 | 2 +-
 tests/nvme/012 | 2 +-
 tests/nvme/013 | 2 +-
 tests/nvme/014 | 2 +-
 tests/nvme/015 | 2 +-
 tests/nvme/018 | 2 +-
 tests/nvme/019 | 2 +-
 tests/nvme/020 | 2 +-
 tests/nvme/021 | 2 +-
 tests/nvme/022 | 2 +-
 tests/nvme/023 | 2 +-
 tests/nvme/024 | 2 +-
 tests/nvme/025 | 2 +-
 tests/nvme/026 | 2 +-
 tests/nvme/027 | 2 +-
 tests/nvme/028 | 2 +-
 tests/nvme/029 | 2 +-
 tests/nvme/rc  | 7 ---
 23 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/tests/nvme/004 b/tests/nvme/004
index dfca79aab20c..4b0b7ae50a5e 100755
--- a/tests/nvme/004
+++ b/tests/nvme/004
@@ -37,7 +37,7 @@ test() {
_nvme_connect_subsys "${nvme_trtype}" blktests-subsystem-1
 
local nvmedev
-   nvmedev="$(_find_nvme_dev)"
+   nvmedev=$(_find_nvme_dev "blktests-subsystem-1")
cat "/sys/block/${nvmedev}n1/uuid"
cat "/sys/block/${nvmedev}n1/wwid"
 
diff --git a/tests/nvme/005 b/tests/nvme/005
index 0d5801868bc0..9f3e388dc695 100755
--- a/tests/nvme/005
+++ b/tests/nvme/005
@@ -37,7 +37,7 @@ test() {
_nvme_connect_subsys "${nvme_trtype}" blktests-subsystem-1
 
local nvmedev
-   nvmedev="$(_find_nvme_dev)"
+   nvmedev=$(_find_nvme_dev "blktests-subsystem-1")
 
udevadm settle
 
diff --git a/tests/nvme/008 b/tests/nvme/008
index 8616617ad398..219fe9b0ca6a 100755
--- a/tests/nvme/008
+++ b/tests/nvme/008
@@ -37,7 +37,7 @@ test() {
 
_nvme_connect_subsys "${nvme_trtype}" "${subsys_name}"
 
-   nvmedev="$(_find_nvme_dev)"
+   nvmedev=$(_find_nvme_dev "${subsys_name}")
cat "/sys/block/${nvmedev}n1/uuid"
cat "/sys/block/${nvmedev}n1/wwid"
 
diff --git a/tests/nvme/009 b/tests/nvme/009
index e91d79065cb1..2814c79164ee 100755
--- a/tests/nvme/009
+++ b/tests/nvme/009
@@ -33,7 +33,7 @@ test() {
 
_nvme_connect_subsys "${nvme_trtype}" "${subsys_name}"
 
-   nvmedev="$(_find_nvme_dev)"
+   nvmedev=$(_find_nvme_dev "${subsys_name}")
cat "/sys/block/${nvmedev}n1/uuid"
cat "/sys/block/${nvmedev}n1/wwid"
 
diff --git a/tests/nvme/010 b/tests/nvme/010
index 0188e842213e..150a4e540f3e 100755
--- a/tests/nvme/010
+++ b/tests/nvme/010
@@ -37,7 +37,7 @@ test() {
 
_nvme_connect_subsys "${nvme_trtype}" "${subsys_name}"
 
-   nvmedev="$(_find_nvme_dev)"
+   nvmedev=$(_find_nvme_dev "${subsys_name}")
cat "/sys/block/${nvmedev}n1/uuid"
cat "/sys/block/${nvmedev}n1/wwid"
 
diff --git a/tests/nvme/011 b/tests/nvme/011
index 543dbe840874..4bfe9af084e4 100755
--- a/tests/nvme/011
+++ b/tests/nvme/011
@@ -35,7 +35,7 @@ test() {
 
_nvme_connect_subsys "${nvme_trtype}" "${subsys_name}"
 
-   nvmedev="$(_find_nvme_dev)"
+   nvmedev=$(_find_nvme_dev "${subsys_name}")
cat "/sys/block/${nvmedev}n1/uuid"
cat "/sys/block/${nvmedev}n1/wwid"
 
diff --git a/tests/nvme/012 b/tests/nvme/012
index a13cd08ce6bf..c4e75b09796a 100755
--- a/tests/nvme/012
+++ b/tests/nvme/012
@@ -40,7 +40,7 @@ test() {
 
_nvme_connect_subsys "${nvme_trtype}" "${subsys_name}"
 
-   nvmedev="$(_find_nvme_dev)"
+   nvmedev=$(_find_nvme_dev "${subsys_name}")
cat "/sys/block/${nvmedev}n1/uuid"
cat "/sys/block/${nvmedev}n1/wwid"
 
diff --git a/tests/nvme/013 b/tests/nvme/013
index 1ac725ea83f2..265b6968fd34 100755
--- a/tests/nvme/013
+++ b/tests/nvme/013
@@ -37,7 +37,7 @@ test() {
 
_nvme_connect_subsys "${nvme_trtype}" "${subsys_name}"
 
-   nvmedev="$(_find_nvme_dev)"
+   nvmedev=$(_find_nvme_dev "${subsys_name}")
cat "/sys/block/${nvmedev}n1/uuid"
cat "/sys/block/${nvmedev}n1/wwid"
 
diff --git a/tests/nvme/014 b/tests/nvme/014
index e3c70364e332..48f8caaec0b3 100755
--- a/tests/nvme/014
+++ b/tests/nvme/014
@@ -37,7 +37,7 @@ test() {
 
_nvme_connect_subsys "${nvme_trtype}" "${subsys_name}"
 
-   nvmedev="$(_find_nvme_dev)"
+   nvmedev=$(_find_nvme_dev "${subsys_name}")
cat "/sys/block/${nvmedev}n1/uuid"
cat "/sys/block/${nvmedev}n1/wwid"
 
diff --git a/tests/nvme/015 b/tests/nvme/015

[PATCH blktests v3 08/11] nvme/035: Add test to verify passthru controller with a filesystem

2020-10-08 Thread Logan Gunthorpe
This is a similar test as nvme/012 and nvme/013, except with a
passthru controller.

Signed-off-by: Logan Gunthorpe 
---
 tests/nvme/035 | 37 +
 tests/nvme/035.out |  3 +++
 2 files changed, 40 insertions(+)
 create mode 100755 tests/nvme/035
 create mode 100644 tests/nvme/035.out

diff --git a/tests/nvme/035 b/tests/nvme/035
new file mode 100755
index ..ee78a7586f35
--- /dev/null
+++ b/tests/nvme/035
@@ -0,0 +1,37 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2019 Logan Gunthorpe
+# Copyright (C) 2019 Eideticom Communications Inc.
+
+. tests/nvme/rc
+. common/xfs
+
+DESCRIPTION="run mkfs and data verification fio job on an NVMeOF passthru 
controller"
+TIMED=1
+
+requires() {
+   _nvme_requires
+   _have_kernel_option NVME_TARGET_PASSTHRU
+   _have_xfs
+   _have_fio
+}
+
+test_device() {
+   local subsys="blktests-subsystem-1"
+   local ctrldev
+   local nsdev
+   local port
+
+   echo "Running ${TEST_NAME}"
+
+   _setup_nvmet
+   port=$(_nvmet_passthru_target_setup "${subsys}")
+   nsdev=$(_nvmet_passthru_target_connect "${nvme_trtype}" "${subsys}")
+
+   _xfs_run_fio_verify_io "${nsdev}"
+
+   _nvme_disconnect_subsys "${subsys}"
+   _nvmet_passthru_target_cleanup "${port}" "${subsys}"
+
+   echo "Test complete"
+}
diff --git a/tests/nvme/035.out b/tests/nvme/035.out
new file mode 100644
index ..a6027138fbe4
--- /dev/null
+++ b/tests/nvme/035.out
@@ -0,0 +1,3 @@
+Running nvme/035
+NQN:blktests-subsystem-1 disconnected 1 controller(s)
+Test complete
-- 
2.20.1



[PATCH blktests v3 09/11] nvme/036: Add test for testing reset command on nvme-passthru

2020-10-08 Thread Logan Gunthorpe
Similar to test 022 but for passthru controllers.

Signed-off-by: Logan Gunthorpe 
---
 tests/nvme/036 | 37 +
 tests/nvme/036.out |  3 +++
 2 files changed, 40 insertions(+)
 create mode 100755 tests/nvme/036
 create mode 100644 tests/nvme/036.out

diff --git a/tests/nvme/036 b/tests/nvme/036
new file mode 100755
index ..8218c6538dfd
--- /dev/null
+++ b/tests/nvme/036
@@ -0,0 +1,37 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2019 Logan Gunthorpe
+# Copyright (C) 2019 Eideticom Communications Inc.
+
+. tests/nvme/rc
+
+DESCRIPTION="test NVMe reset command on an NVMeOF target with a passthru 
controller"
+QUICK=1
+
+requires() {
+   _nvme_requires
+   _have_kernel_option NVME_TARGET_PASSTHRU
+}
+
+test_device() {
+   local subsys="blktests-subsystem-1"
+   local ctrldev
+   local port
+
+   echo "Running ${TEST_NAME}"
+
+   _setup_nvmet
+   port=$(_nvmet_passthru_target_setup "${subsys}")
+   nsdev=$(_nvmet_passthru_target_connect "${nvme_trtype}" "${subsys}")
+
+   ctrldev=$(_find_nvme_dev "${subsys}")
+
+   if ! nvme reset "/dev/${ctrldev}" >> "$FULL" 2>&1; then
+   echo "ERROR: reset failed"
+   fi
+
+   _nvme_disconnect_subsys "${subsys}"
+   _nvmet_passthru_target_cleanup "${port}" "${subsys}"
+
+   echo "Test complete"
+}
diff --git a/tests/nvme/036.out b/tests/nvme/036.out
new file mode 100644
index ..d294f8646b20
--- /dev/null
+++ b/tests/nvme/036.out
@@ -0,0 +1,3 @@
+Running nvme/036
+NQN:blktests-subsystem-1 disconnected 1 controller(s)
+Test complete
-- 
2.20.1



[PATCH blktests v3 07/11] nvme/034: Add test for passthru data verification

2020-10-08 Thread Logan Gunthorpe
Similar to test nvme/010 and nvme/011 but for a passthru controller

Signed-off-by: Logan Gunthorpe 
---
 tests/nvme/034 | 35 +++
 tests/nvme/034.out |  3 +++
 2 files changed, 38 insertions(+)
 create mode 100755 tests/nvme/034
 create mode 100644 tests/nvme/034.out

diff --git a/tests/nvme/034 b/tests/nvme/034
new file mode 100755
index ..f92e5e20865b
--- /dev/null
+++ b/tests/nvme/034
@@ -0,0 +1,35 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2019 Logan Gunthorpe
+# Copyright (C) 2019 Eideticom Communications Inc.
+
+. tests/nvme/rc
+
+DESCRIPTION="run data verification fio job on an NVMeOF passthru controller"
+TIMED=1
+
+requires() {
+   _nvme_requires
+   _have_kernel_option NVME_TARGET_PASSTHRU
+   _have_fio
+}
+
+test_device() {
+   local subsys="blktests-subsystem-1"
+   local ctrldev
+   local nsdev
+   local port
+
+   echo "Running ${TEST_NAME}"
+
+   _setup_nvmet
+   port=$(_nvmet_passthru_target_setup "${subsys}")
+   nsdev=$(_nvmet_passthru_target_connect "${nvme_trtype}" "${subsys}")
+
+   _run_fio_verify_io --size=950m --filename="${nsdev}"
+
+   _nvme_disconnect_subsys "${subsys}"
+   _nvmet_passthru_target_cleanup "${port}" "${subsys}"
+
+   echo "Test complete"
+}
diff --git a/tests/nvme/034.out b/tests/nvme/034.out
new file mode 100644
index ..0a7bd2f90dae
--- /dev/null
+++ b/tests/nvme/034.out
@@ -0,0 +1,3 @@
+Running nvme/034
+NQN:blktests-subsystem-1 disconnected 1 controller(s)
+Test complete
-- 
2.20.1



Re: [PATCH blktests v2 04/11] nvme: Search for specific subsysnqn in _find_nvme_loop_dev

2020-10-07 Thread Logan Gunthorpe



On 2020-10-06 6:24 p.m., Chaitanya Kulkarni wrote:
> On 10/6/20 17:10, Logan Gunthorpe wrote:
>>> With this patch or this series will I be able to write the testcase ?
>> This patch helps with that but other helpers introduced in this series
>> would require minor changes.
>>
>> As far as I can see, you'd only have to adjust _create_nvmet_passthru()
>> to take an optional argument because, presently, it always uses
>> $_test_dev_nvme_ctrl for the backing device.
>>
>> This can easily be done if and when someone writes such a test.
>>
>> However, I'm not even sure right now if that test would pass in the
>> kernel as is -- it seems like an odd thing to do.
>>
>> Logan
>>
> This test should pass if I remember the code correctly where we don't
> 
> have any PCIe specific checks for the passthru controller and it is an

Yes, there are no explicit restrictions, but that doesn't mean there are
no bugs with that particular stack.

> important to support this scenario in order to write device independent
> 
> testcases as rest of the testcases are.

Ok, feel free to write a test for this. It's not important to me.

Logan



Re: [PATCH blktests v2 03/11] common/xfs: Create common helper to verify block device with xfs

2020-10-07 Thread Logan Gunthorpe



On 2020-10-06 6:20 p.m., Chaitanya Kulkarni wrote:
> On 10/6/20 16:59, Logan Gunthorpe wrote:
>>> The mount dir should be a parameter and not the hardcode value
>>> to make it reusable.
>> I also disagree here. It is already reusable and is used in a number of
>> places; none of those places require changing the mount directory. If
>> and when a use comes up that requires a different directory (not sure
>> what that would be), a parameter can be added. It is typically standard
>> practice in the Linux community to not add features that have no users
>> as it's confusing to people reading the code.
>>
>> Logan
>>
> Well if you are making a helper it should be generic if you have a usecase,

"Generic" isn't a binary yes/no quality. Why add the mount option (that nobody 
is using) 
and not a size option as well that nobody uses? For that matter, fio has a ton 
of options
we could expose. (think io-method, read/write pattern, etc, etc). The criteria 
we
decide upon which options get exposed as arguments is what the code that's 
actually
using it needs -- not what's available or what you imagine future use cases 
might be.
If there are no users in the code it should not be exposed. If a use case comes 
along,
an argument can easily be added when the new test is added to the code base.

> mounted on different mount points not just one which is important testcase,
> 
> that will require a prep patch.

So? That's normal.
 
> Why can't we do that right now when we have a clear usecase ?

We don't have a clear use case that's being added to the code though... We 
have an imagined use case that hasn't been written. Add the feature when you
add this use case.

Logan


Re: [PATCH blktests v2 05/11] nvme: Add common helpers for passthru tests

2020-10-06 Thread Logan Gunthorpe



On 2020-10-06 6:02 p.m., Chaitanya Kulkarni wrote:
> On 9/30/20 12:01, Logan Gunthorpe wrote:
>> Add some simple helpers to setup a passthru target that passes through
>> to a nvme test device.
>>
>> Signed-off-by: Logan Gunthorpe 
>> ---
>>  tests/nvme/rc | 76 +++
>>  1 file changed, 76 insertions(+)
>>
>> diff --git a/tests/nvme/rc b/tests/nvme/rc
>> index dfa57a299625..1ea23308a3f7 100644
>> --- a/tests/nvme/rc
>> +++ b/tests/nvme/rc
>> @@ -73,6 +73,17 @@ _require_nvme_trtype_is_fabrics() {
>>  return 0
>>  }
>>  
>> +_test_dev_nvme_ctrl() {
>> +local dev
>> +
>> +dev=$(cat "${TEST_DEV_SYSFS}/device/dev")
>  can you initialize dev this at the time of declaration ?

Yup, will fix.

>> +echo "/dev/char/${dev}"
>> +}
>> +
>> +_test_dev_nvme_nsid() {
>> +cat "${TEST_DEV_SYSFS}/nsid"
>> +}
>> +
>>  _cleanup_nvmet() {
>>  local dev
>>  local port
>> @@ -257,6 +268,27 @@ _remove_nvmet_subsystem() {
>>  rmdir "${subsys_path}"
>>  }
>>  
>> +_create_nvmet_passthru() {
>> +local nvmet_subsystem="$1"
>> +local subsys_path="${NVMET_CFS}/subsystems/${nvmet_subsystem}"
>> +local passthru_path="${subsys_path}/passthru"
>> +
>> +mkdir -p "${subsys_path}"
>> +printf 1 > "${subsys_path}/attr_allow_any_host"
>> +
>> +printf "%s" "$(_test_dev_nvme_ctrl)" > "${passthru_path}/device_path"
>> +printf 1 > "${passthru_path}/enable"
> 
> can you please echo in general and printf only when it is needed ?
> I know existing code is a bit inconsistent I'll send a clenup to make it
> uniform.

Yes, I agree. I will fix it in v3.
Thanks,

Logan


Re: [PATCH blktests v2 04/11] nvme: Search for specific subsysnqn in _find_nvme_loop_dev

2020-10-06 Thread Logan Gunthorpe



On 2020-10-06 5:55 p.m., Chaitanya Kulkarni wrote:
> On 9/30/20 11:54, Logan Gunthorpe wrote:
>> This ensures we find the correct nvme loop device if others exist on a
>> given system (which is generally not expected on test systems).
>>
>> Additionally, this will be required in the upcomming test nvme/037 which
>> will have controllers racing with ones being destroyed.
>>
>> Signed-off-by: Logan Gunthorpe 
> 
> If I create a passthru testcase with :-
> 
> 1. Create nvme-loop based nvme ctrl backed by null_blk say /dev/nvme1
> 
> 2. Create a nvme-loop based passthru ctrl backed by /dev/nvme1 say nvme2.
> 
> 
> With this patch or this series will I be able to write the testcase ?

This patch helps with that, but other helpers introduced in this series
would require minor changes.

As far as I can see, you'd only have to adjust _create_nvmet_passthru()
to take an optional argument because, presently, it always uses
$_test_dev_nvme_ctrl for the backing device.
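A rough sketch of that change (untested; the second argument is
hypothetical and only illustrates the shape of it):

    _create_nvmet_passthru() {
        local nvmet_subsystem="$1"
        # hypothetical optional argument: default to the test device
        # when no explicit backing controller is given
        local backing_dev="${2:-$(_test_dev_nvme_ctrl)}"
        local subsys_path="${NVMET_CFS}/subsystems/${nvmet_subsystem}"
        local passthru_path="${subsys_path}/passthru"

        mkdir -p "${subsys_path}"
        printf 1 > "${subsys_path}/attr_allow_any_host"
        printf "%s" "${backing_dev}" > "${passthru_path}/device_path"
        printf 1 > "${passthru_path}/enable"
    }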

This can easily be done if and when someone writes such a test.

However, I'm not even sure right now if that test would pass in the
kernel as is -- it seems like an odd thing to do.

Logan


Re: [PATCH blktests v2 03/11] common/xfs: Create common helper to verify block device with xfs

2020-10-06 Thread Logan Gunthorpe



On 2020-10-06 5:50 p.m., Chaitanya Kulkarni wrote:
> On 9/30/20 11:54, Logan Gunthorpe wrote:
>> Make a common helper from the code in tests nvme/012 and nvme/013
>> to run an fio verify on a XFS file system backed by the
>> specified block device.
>>
>> While we are at it, all the output is redirected to $FULL instead of
>> /dev/null.
>>
>> Signed-off-by: Logan Gunthorpe 
>> ---
>>  common/xfs | 22 ++
>>  tests/nvme/012 | 14 +-
>>  tests/nvme/013 | 14 +-
>>  3 files changed, 24 insertions(+), 26 deletions(-)
> 
> The common namespace is getting cluttered. Can you please create
> 
> a subdirectory common/fs/xfs ?

I disagree. The common directory only has 9 files, and given there
appear to be no other files that would go in an fs directory, I don't
think now is the time to create one. We can do so if and when a number
of other fs helpers show up and there's a reason to group them
together.

>>
>> diff --git a/common/xfs b/common/xfs
>> index d1a603b8c7b5..210c924cdd41 100644
>> --- a/common/xfs
>> +++ b/common/xfs
>> @@ -9,3 +9,25 @@
>>  _have_xfs() {
>>  _have_fs xfs && _have_program mkfs.xfs
>>  }
>> +
>> +_xfs_mkfs_and_mount() {
>> +local bdev=$1
>> +local mount_dir=$2
>> +
>> +mkdir -p "${mount_dir}"
>> +umount "${mount_dir}"
>> +mkfs.xfs -l size=32m -f "${bdev}"
>> +mount "${bdev}" "${mount_dir}"
>> +}
>> +
>> +_xfs_run_fio_verify_io() {
>> +local mount_dir="/mnt/blktests"
> 
> The mount dir should be a parameter and not the hardcode value
> to make it reusable.

I also disagree here. It is already reusable and is used in a number of
places; none of those places require changing the mount directory. If
and when a use comes up that requires a different directory (not sure
what that would be), a parameter can be added. It is standard practice
in the Linux community not to add features that have no users, as it's
confusing to people reading the code.
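If such a use ever does come up, the change would be trivial -- a
sketch, not something I'm proposing to merge now:

    _xfs_run_fio_verify_io() {
        local bdev="$1"
        # hypothetical second parameter, defaulting to today's
        # hardcoded value so existing callers are unaffected
        local mount_dir="${2:-/mnt/blktests}"

        _xfs_mkfs_and_mount "${bdev}" "${mount_dir}"
        # ... run the fio verify job against ${mount_dir} and
        # unmount, exactly as the helper does today
    }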

Logan


Re: [PATCH blktests v2 02/11] common/xfs: Create common helper to check for XFS support

2020-10-06 Thread Logan Gunthorpe



On 2020-10-06 5:44 p.m., Chaitanya Kulkarni wrote:
> On 9/30/20 11:54, Logan Gunthorpe wrote:
>>  
>>  requires() {
>>  _nvme_requires
>> -_have_program mkfs.xfs && _have_fio
>> +_have_xfs
>> +_have_fio
> Can you make _have_xfs return true false ? so it can be used with && ?

_have_xfs() does return true/false and can be used with && or in a
conditional.
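For example, both of these forms work today:

    _have_xfs && echo "xfs available"

    if _have_xfs; then
        echo "xfs available"
    fi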

Per [1], my opinion is that using && in the requires() function (where
the return value is ignored) is confusing, so I prefer not to do it in
new code.

If we want to reconsider this, we should add a check to ensure the
return value of requires() matches the expectation of the global
variable it uses.
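Something along these lines, roughly (the placement and wording are
hypothetical, but SKIP_REASON is the variable in question):

    # after the framework calls requires() -- hypothetical check:
    if requires; then
        [[ -n "${SKIP_REASON}" ]] &&
            echo "BUG: requires() succeeded but set SKIP_REASON" >&2
    elif [[ -z "${SKIP_REASON}" ]]; then
        echo "BUG: requires() failed without setting SKIP_REASON" >&2
    fi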

Logan

[1]
https://lore.kernel.org/linux-block/92478e6f-622a-a1ae-6189-4009f9a30...@deltatee.com/


Re: [PATCH v4 0/7] Convert the intel iommu driver to the dma-iommu api

2020-10-01 Thread Logan Gunthorpe
Hi Lu,

On 2020-09-27 12:34 a.m., Lu Baolu wrote:
> Hi,
> 
> The previous post of this series could be found here.
> 
> https://lore.kernel.org/linux-iommu/20200912032200.11489-1-baolu...@linux.intel.com/
> 
> This version introduces a new patch [4/7] to fix an issue reported here.
> 
> https://lore.kernel.org/linux-iommu/51a1baec-48d1-c0ac-181b-1fba92aa4...@linux.intel.com/
> 
> There aren't any other changes.
> 
> Please help to test and review.

I've tested this patchset on my Sandy Bridge machine and found no issues
(while including a patch to ioat that I've sent to that maintainer).

Tested-By: Logan Gunthorpe 

Thanks,

Logan


[PATCH blktests v2 09/11] nvme/036: Add test for testing reset command on nvme-passthru

2020-09-30 Thread Logan Gunthorpe
Similar to test 022 but for passthru controllers.

Signed-off-by: Logan Gunthorpe 
---
 tests/nvme/036 | 37 +
 tests/nvme/036.out |  3 +++
 2 files changed, 40 insertions(+)
 create mode 100755 tests/nvme/036
 create mode 100644 tests/nvme/036.out

diff --git a/tests/nvme/036 b/tests/nvme/036
new file mode 100755
index ..8218c6538dfd
--- /dev/null
+++ b/tests/nvme/036
@@ -0,0 +1,37 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2019 Logan Gunthorpe
+# Copyright (C) 2019 Eideticom Communications Inc.
+
+. tests/nvme/rc
+
+DESCRIPTION="test NVMe reset command on an NVMeOF target with a passthru controller"
+QUICK=1
+
+requires() {
+   _nvme_requires
+   _have_kernel_option NVME_TARGET_PASSTHRU
+}
+
+test_device() {
+   local subsys="blktests-subsystem-1"
+   local ctrldev
+   local port
+
+   echo "Running ${TEST_NAME}"
+
+   _setup_nvmet
+   port=$(_nvmet_passthru_target_setup "${subsys}")
+   nsdev=$(_nvmet_passthru_target_connect "${nvme_trtype}" "${subsys}")
+
+   ctrldev=$(_find_nvme_dev "${subsys}")
+
+   if ! nvme reset "/dev/${ctrldev}" >> "$FULL" 2>&1; then
+   echo "ERROR: reset failed"
+   fi
+
+   _nvme_disconnect_subsys "${subsys}"
+   _nvmet_passthru_target_cleanup "${port}" "${subsys}"
+
+   echo "Test complete"
+}
diff --git a/tests/nvme/036.out b/tests/nvme/036.out
new file mode 100644
index ..d294f8646b20
--- /dev/null
+++ b/tests/nvme/036.out
@@ -0,0 +1,3 @@
+Running nvme/036
+NQN:blktests-subsystem-1 disconnected 1 controller(s)
+Test complete
-- 
2.20.1



[PATCH blktests v2 02/11] common/xfs: Create common helper to check for XFS support

2020-09-30 Thread Logan Gunthorpe
Two nvme tests create and mount XFS filesystems and check for mkfs.xfs.

They should also check for XFS support in the kernel, so create a
common helper for this.

Signed-off-by: Logan Gunthorpe 
---
 common/rc  |  8 
 common/xfs | 11 +++
 tests/nvme/012 |  6 --
 tests/nvme/013 |  4 +++-
 4 files changed, 26 insertions(+), 3 deletions(-)
 create mode 100644 common/xfs

diff --git a/common/rc b/common/rc
index cdc0150ea5ea..44cb218c2fac 100644
--- a/common/rc
+++ b/common/rc
@@ -181,6 +181,14 @@ _have_tracepoint() {
return 0
 }
 
+_have_fs() {
+   modprobe "$1" >/dev/null 2>&1
+   if [[ ! -d "/sys/fs/$1" ]]; then
+   SKIP_REASON="kernel does not support filesystem $1"
+   return 1
+   fi
+}
+
 _test_dev_is_rotational() {
[[ $(cat "${TEST_DEV_SYSFS}/queue/rotational") -ne 0 ]]
 }
diff --git a/common/xfs b/common/xfs
new file mode 100644
index ..d1a603b8c7b5
--- /dev/null
+++ b/common/xfs
@@ -0,0 +1,11 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2017 Omar Sandoval
+#
+# xfs helper functions.
+
+. common/shellcheck
+
+_have_xfs() {
+   _have_fs xfs && _have_program mkfs.xfs
+}
diff --git a/tests/nvme/012 b/tests/nvme/012
index 1dbd59804ed7..1d8d8e3cc271 100755
--- a/tests/nvme/012
+++ b/tests/nvme/012
@@ -5,14 +5,16 @@
 # Test mkfs with data verification for block device backed ns.
 
 . tests/nvme/rc
+. common/xfs
 
 DESCRIPTION="run mkfs and data verification fio job on NVMeOF block device-backed ns"
 TIMED=1
 
 requires() {
_nvme_requires
-   _have_program mkfs.xfs && _have_program fio && \
-   _have_modules loop
+   _have_xfs
+   _have_program fio
+   _have_modules loop
_require_nvme_trtype_is_fabrics
 }
 
diff --git a/tests/nvme/013 b/tests/nvme/013
index df7f23e69252..3819a2730d9b 100755
--- a/tests/nvme/013
+++ b/tests/nvme/013
@@ -5,13 +5,15 @@
 # Test mkfs with data verification for file backed ns.
 
 . tests/nvme/rc
+. common/xfs
 
 DESCRIPTION="run mkfs and data verification fio job on NVMeOF file-backed ns"
 TIMED=1
 
 requires() {
_nvme_requires
-   _have_program mkfs.xfs && _have_fio
+   _have_xfs
+   _have_fio
_require_nvme_trtype_is_fabrics
 }
 
-- 
2.20.1



[PATCH blktests v2 08/11] nvme/035: Add test to verify passthru controller with a filesystem

2020-09-30 Thread Logan Gunthorpe
This is a test similar to nvme/012 and nvme/013, except with a
passthru controller.

Signed-off-by: Logan Gunthorpe 
---
 tests/nvme/035 | 37 +
 tests/nvme/035.out |  3 +++
 2 files changed, 40 insertions(+)
 create mode 100755 tests/nvme/035
 create mode 100644 tests/nvme/035.out

diff --git a/tests/nvme/035 b/tests/nvme/035
new file mode 100755
index ..ee78a7586f35
--- /dev/null
+++ b/tests/nvme/035
@@ -0,0 +1,37 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-3.0+
+# Copyright (C) 2019 Logan Gunthorpe
+# Copyright (C) 2019 Eideticom Communications Inc.
+
+. tests/nvme/rc
+. common/xfs
+
+DESCRIPTION="run mkfs and data verification fio job on an NVMeOF passthru controller"
+TIMED=1
+
+requires() {
+   _nvme_requires
+   _have_kernel_option NVME_TARGET_PASSTHRU
+   _have_xfs
+   _have_fio
+}
+
+test_device() {
+   local subsys="blktests-subsystem-1"
+   local ctrldev
+   local nsdev
+   local port
+
+   echo "Running ${TEST_NAME}"
+
+   _setup_nvmet
+   port=$(_nvmet_passthru_target_setup "${subsys}")
+   nsdev=$(_nvmet_passthru_target_connect "${nvme_trtype}" "${subsys}")
+
+   _xfs_run_fio_verify_io "${nsdev}"
+
+   _nvme_disconnect_subsys "${subsys}"
+   _nvmet_passthru_target_cleanup "${port}" "${subsys}"
+
+   echo "Test complete"
+}
diff --git a/tests/nvme/035.out b/tests/nvme/035.out
new file mode 100644
index ..a6027138fbe4
--- /dev/null
+++ b/tests/nvme/035.out
@@ -0,0 +1,3 @@
+Running nvme/035
+NQN:blktests-subsystem-1 disconnected 1 controller(s)
+Test complete
-- 
2.20.1


