[PATCH v2 2/3] lightnvm: pblk: garbage collect lines with failed writes

2018-04-23 Thread Hans Holmberg
From: Hans Holmberg 

Write failures should not happen under normal circumstances,
so in order to bring the chunk back into a known state as soon
as possible, evacuate all the valid data out of the line and let the
fw judge if the block can be written to in the next reset cycle.

Do this by introducing a new gc list for lines with failed writes,
and ensure that the rate limiter allocates a small portion of
the write bandwidth to get the job done.

The lba list is saved in memory for use during gc as we
cannot guarantee that the emeta data is readable if a write
error occurred.

Signed-off-by: Hans Holmberg 
---
 drivers/lightnvm/pblk-core.c  |  45 ++-
 drivers/lightnvm/pblk-gc.c| 102 +++---
 drivers/lightnvm/pblk-init.c  |  45 ---
 drivers/lightnvm/pblk-rl.c|  29 ++--
 drivers/lightnvm/pblk-sysfs.c |  15 ++-
 drivers/lightnvm/pblk-write.c |   2 +
 drivers/lightnvm/pblk.h   |  25 +--
 7 files changed, 199 insertions(+), 64 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 7762e89..413cf3b 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -373,7 +373,13 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, 
struct pblk_line *line)
 
lockdep_assert_held(>lock);
 
-   if (!vsc) {
+   if (line->w_err_gc->has_write_err) {
+   if (line->gc_group != PBLK_LINEGC_WERR) {
+   line->gc_group = PBLK_LINEGC_WERR;
+   move_list = _mg->gc_werr_list;
+   pblk_rl_werr_line_in(>rl);
+   }
+   } else if (!vsc) {
if (line->gc_group != PBLK_LINEGC_FULL) {
line->gc_group = PBLK_LINEGC_FULL;
move_list = _mg->gc_full_list;
@@ -1603,8 +1609,13 @@ static void __pblk_line_put(struct pblk *pblk, struct 
pblk_line *line)
line->state = PBLK_LINESTATE_FREE;
line->gc_group = PBLK_LINEGC_NONE;
pblk_line_free(line);
-   spin_unlock(>lock);
 
+   if (line->w_err_gc->has_write_err) {
+   pblk_rl_werr_line_out(>rl);
+   line->w_err_gc->has_write_err = 0;
+   }
+
+   spin_unlock(>lock);
atomic_dec(>pipeline_gc);
 
spin_lock(_mg->free_lock);
@@ -1767,11 +1778,34 @@ void pblk_line_close_meta(struct pblk *pblk, struct 
pblk_line *line)
 
spin_lock(_mg->close_lock);
spin_lock(>lock);
+
+   /* Update the in-memory start address for emeta, in case it has
+* shifted due to write errors
+*/
+   if (line->emeta_ssec != line->cur_sec)
+   line->emeta_ssec = line->cur_sec;
+
list_add_tail(>list, _mg->emeta_list);
spin_unlock(>lock);
spin_unlock(_mg->close_lock);
 
pblk_line_should_sync_meta(pblk);
+
+
+}
+
+static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line)
+{
+   struct pblk_line_meta *lm = >lm;
+   struct pblk_line_mgmt *l_mg = >l_mg;
+   unsigned int lba_list_size = lm->emeta_len[2];
+   struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+   struct pblk_emeta *emeta = line->emeta;
+
+   w_err_gc->lba_list = pblk_malloc(lba_list_size,
+l_mg->emeta_alloc_type, GFP_KERNEL);
+   memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf),
+   lba_list_size);
 }
 
 void pblk_line_close_ws(struct work_struct *work)
@@ -1780,6 +1814,13 @@ void pblk_line_close_ws(struct work_struct *work)
ws);
struct pblk *pblk = line_ws->pblk;
struct pblk_line *line = line_ws->line;
+   struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
+
+   /* Write errors makes the emeta start address stored in smeta invalid,
+* so keep a copy of the lba list until we've gc'd the line
+*/
+   if (w_err_gc->has_write_err)
+   pblk_save_lba_list(pblk, line);
 
pblk_line_close(pblk, line);
mempool_free(line_ws, pblk->gen_ws_pool);
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index b0cc277..df88f1b 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -129,6 +129,53 @@ static void pblk_gc_line_ws(struct work_struct *work)
kfree(gc_rq_ws);
 }
 
+static __le64 *get_lba_list_from_emeta(struct pblk *pblk,
+  struct pblk_line *line)
+{
+   struct line_emeta *emeta_buf;
+   struct pblk_line_mgmt *l_mg = >l_mg;
+   struct pblk_line_meta *lm = >lm;
+   unsigned int lba_list_size = lm->emeta_len[2];
+   __le64 *lba_list;
+   int ret;
+
+   emeta_buf = pblk_malloc(lm->emeta_len[0],
+   l_mg->emeta_alloc_type, GFP_KERNEL);
+   if (!emeta_buf)
+   

[PATCH v2 3/3] lightnvm: pblk: fix smeta write error path

2018-04-23 Thread Hans Holmberg
From: Hans Holmberg 

Smeta write errors were previously ignored. Skip these
lines instead and throw them back on the free
list, so the chunks will go through a reset cycle
before we attempt to use the line again.

Signed-off-by: Hans Holmberg 
---
 drivers/lightnvm/pblk-core.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 413cf3b..dec1bb4 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -849,9 +849,10 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, 
struct pblk_line *line,
atomic_dec(>inflight_io);
 
if (rqd.error) {
-   if (dir == PBLK_WRITE)
+   if (dir == PBLK_WRITE) {
pblk_log_write_err(pblk, );
-   else if (dir == PBLK_READ)
+   ret = 1;
+   } else if (dir == PBLK_READ)
pblk_log_read_err(pblk, );
}
 
@@ -1120,7 +1121,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct 
pblk_line *line,
 
if (init && pblk_line_submit_smeta_io(pblk, line, off, PBLK_WRITE)) {
pr_debug("pblk: line smeta I/O failed. Retry\n");
-   return 1;
+   return 0;
}
 
bitmap_copy(line->invalid_bitmap, line->map_bitmap, lm->sec_per_line);
-- 
2.7.4



[PATCH v2 1/3] lightnvm: pblk: rework write error recovery path

2018-04-23 Thread Hans Holmberg
From: Hans Holmberg 

The write error recovery path is incomplete, so rework
the write error recovery handling to do resubmits directly
from the write buffer.

When a write error occurs, the remaining sectors in the chunk are
mapped out and invalidated and the request inserted in a resubmit list.

The writer thread checks if there are any requests to resubmit,
scans and invalidates any lbas that have been overwritten by later
writes and resubmits the failed entries.

Signed-off-by: Hans Holmberg 
---
 drivers/lightnvm/pblk-init.c |   2 +
 drivers/lightnvm/pblk-rb.c   |  39 --
 drivers/lightnvm/pblk-recovery.c |  91 -
 drivers/lightnvm/pblk-write.c| 267 ++-
 drivers/lightnvm/pblk.h  |  11 +-
 5 files changed, 181 insertions(+), 229 deletions(-)

diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index bfc488d..6f06727 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -426,6 +426,7 @@ static int pblk_core_init(struct pblk *pblk)
goto free_r_end_wq;
 
INIT_LIST_HEAD(>compl_list);
+   INIT_LIST_HEAD(>resubmit_list);
 
return 0;
 
@@ -1185,6 +1186,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct 
gendisk *tdisk,
pblk->state = PBLK_STATE_RUNNING;
pblk->gc.gc_enabled = 0;
 
+   spin_lock_init(>resubmit_lock);
spin_lock_init(>trans_lock);
spin_lock_init(>lock);
 
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index 024a366..00cd1f2 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -503,45 +503,6 @@ int pblk_rb_may_write_gc(struct pblk_rb *rb, unsigned int 
nr_entries,
 }
 
 /*
- * The caller of this function must ensure that the backpointer will not
- * overwrite the entries passed on the list.
- */
-unsigned int pblk_rb_read_to_bio_list(struct pblk_rb *rb, struct bio *bio,
- struct list_head *list,
- unsigned int max)
-{
-   struct pblk_rb_entry *entry, *tentry;
-   struct page *page;
-   unsigned int read = 0;
-   int ret;
-
-   list_for_each_entry_safe(entry, tentry, list, index) {
-   if (read > max) {
-   pr_err("pblk: too many entries on list\n");
-   goto out;
-   }
-
-   page = virt_to_page(entry->data);
-   if (!page) {
-   pr_err("pblk: could not allocate write bio page\n");
-   goto out;
-   }
-
-   ret = bio_add_page(bio, page, rb->seg_size, 0);
-   if (ret != rb->seg_size) {
-   pr_err("pblk: could not add page to write bio\n");
-   goto out;
-   }
-
-   list_del(>index);
-   read++;
-   }
-
-out:
-   return read;
-}
-
-/*
  * Read available entries on rb and add them to the given bio. To avoid a 
memory
  * copy, a page reference to the write buffer is used to be added to the bio.
  *
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 9cb6d5d..5983428 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -16,97 +16,6 @@
 
 #include "pblk.h"
 
-void pblk_submit_rec(struct work_struct *work)
-{
-   struct pblk_rec_ctx *recovery =
-   container_of(work, struct pblk_rec_ctx, ws_rec);
-   struct pblk *pblk = recovery->pblk;
-   struct nvm_rq *rqd = recovery->rqd;
-   struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
-   struct bio *bio;
-   unsigned int nr_rec_secs;
-   unsigned int pgs_read;
-   int ret;
-
-   nr_rec_secs = bitmap_weight((unsigned long int *)>ppa_status,
-   NVM_MAX_VLBA);
-
-   bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
-
-   bio->bi_iter.bi_sector = 0;
-   bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
-   rqd->bio = bio;
-   rqd->nr_ppas = nr_rec_secs;
-
-   pgs_read = pblk_rb_read_to_bio_list(>rwb, bio, >failed,
-   nr_rec_secs);
-   if (pgs_read != nr_rec_secs) {
-   pr_err("pblk: could not read recovery entries\n");
-   goto err;
-   }
-
-   if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
-   pr_err("pblk: could not setup recovery request\n");
-   goto err;
-   }
-
-#ifdef CONFIG_NVM_DEBUG
-   atomic_long_add(nr_rec_secs, >recov_writes);
-#endif
-
-   ret = pblk_submit_io(pblk, rqd);
-   if (ret) {
-   pr_err("pblk: I/O submission failed: %d\n", ret);
-   goto err;
-   }
-
-   mempool_free(recovery, pblk->rec_pool);
-   return;
-
-err:
-   bio_put(bio);
-   

[PATCH v2 0/3] Rework write error handling in pblk

2018-04-23 Thread Hans Holmberg
From: Hans Holmberg 

This patch series fixes the (currently incomplete) write error handling 
in pblk by:

 * queuing and re-submitting failed writes in the write buffer
 * evacuating valid data in lines with write failures, so the
   chunk(s) with write failures can be reset to a known state by the fw

Lines with failures in smeta are put back on the free list.
Failed chunks will be reset on the next use.

If a write fails in emeta, the lba list is cached so the line can be 
garbage collected without scanning the out-of-band area.

Changes in V2:
- Added the recov_writes counter increase to the new path
- Moved lba list emeta reading during gc to a separate function
- Allocating the saved lba list with pblk_malloc instead of kmalloc
- Fixed formatting issues
- Removed dead code

Hans Holmberg (3):
  lightnvm: pblk: rework write error recovery path
  lightnvm: pblk: garbage collect lines with failed writes
  lightnvm: pblk: fix smeta write error path

 drivers/lightnvm/pblk-core.c |  52 +++-
 drivers/lightnvm/pblk-gc.c   | 102 +--
 drivers/lightnvm/pblk-init.c |  47 ---
 drivers/lightnvm/pblk-rb.c   |  39 --
 drivers/lightnvm/pblk-recovery.c |  91 -
 drivers/lightnvm/pblk-rl.c   |  29 -
 drivers/lightnvm/pblk-sysfs.c|  15 ++-
 drivers/lightnvm/pblk-write.c| 269 ++-
 drivers/lightnvm/pblk.h  |  36 --
 9 files changed, 384 insertions(+), 296 deletions(-)

-- 
2.7.4



Re: [PATCH v4 04/14] PCI/P2PDMA: Clear ACS P2P flags for all devices behind switches

2018-04-23 Thread Randy Dunlap
On 04/23/2018 04:30 PM, Logan Gunthorpe wrote:
> Signed-off-by: Logan Gunthorpe 
> ---
>  drivers/pci/Kconfig|  9 +
>  drivers/pci/p2pdma.c   | 45 ++---
>  drivers/pci/pci.c  |  6 ++
>  include/linux/pci-p2pdma.h |  5 +
>  4 files changed, 50 insertions(+), 15 deletions(-)
> 
> diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
> index b2396c22b53e..b6db41d4b708 100644
> --- a/drivers/pci/Kconfig
> +++ b/drivers/pci/Kconfig
> @@ -139,6 +139,15 @@ config PCI_P2PDMA
> transations must be between devices behind the same root port.
> (Typically behind a network of PCIe switches).
>  
> +   Enabling this option will also disable ACS on all ports behind
> +   any PCIe switch. This effectively puts all devices behind any
> +   switch heirarchy into the same IOMMU group. Which implies that

 hierarchy group, which

and sames fixes in the commit description...

> +   individual devices behind any switch will not be able to be
> +   assigned to separate VMs because there is no isolation between
> +   them. Additionally, any malicious PCIe devices will be able to
> +   DMA to memory exposed by other EPs in the same domain as TLPs
> +   will not be checked by the IOMMU.
> +
> If unsure, say N.
>  
>  config PCI_LABEL


-- 
~Randy


Re: [PATCH 11/12] swiotlb: move the SWIOTLB config symbol to lib/Kconfig

2018-04-23 Thread Russell King - ARM Linux
On Mon, Apr 23, 2018 at 07:04:18PM +0200, Christoph Hellwig wrote:
> This way we have one central definition of it, and user can select it as
> needed.  Note that we also add a second ARCH_HAS_SWIOTLB symbol to
> indicate the architecture supports swiotlb at all, so that we can still
> make the usage optional for a few architectures that want this feature
> to be user selectable.
> 
> Signed-off-by: Christoph Hellwig 

Hmm, this looks like we end up with NEED_SG_DMA_LENGTH=y on ARM by
default, which probably isn't a good idea - ARM pre-dates the dma_length
parameter in scatterlists, and I don't think all code is guaranteed to
do the right thing if this is enabled.

For example, arch/arm/mach-rpc/dma.c doesn't use the dma_length
member of struct scatterlist.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line in suburbia: sync at 8.8Mbps down 630kbps up
According to speedtest.net: 8.21Mbps down 510kbps up


[PATCH v4 01/14] PCI/P2PDMA: Support peer-to-peer memory

2018-04-23 Thread Logan Gunthorpe
Some PCI devices may have memory mapped in a BAR space that's
intended for use in peer-to-peer transactions. In order to enable
such transactions the memory must be registered with ZONE_DEVICE pages
so it can be used by DMA interfaces in existing drivers.

Add an interface for other subsystems to find and allocate chunks of P2P
memory as necessary to facilitate transfers between two PCI peers:

int pci_p2pdma_add_client();
struct pci_dev *pci_p2pmem_find();
void *pci_alloc_p2pmem();

The new interface requires a driver to collect a list of client devices
involved in the transaction with the pci_p2pmem_add_client*() functions
then call pci_p2pmem_find() to obtain any suitable P2P memory. Once
this is done the list is bound to the memory and the calling driver is
free to add and remove clients as necessary (adding incompatible clients
will fail). With a suitable p2pmem device, memory can then be
allocated with pci_alloc_p2pmem() for use in DMA transactions.

Depending on hardware, using peer-to-peer memory may reduce the bandwidth
of the transfer but can significantly reduce pressure on system memory.
This may be desirable in many cases: for example a system could be designed
with a small CPU connected to a PCI switch by a small number of lanes
which would maximize the number of lanes available to connect to NVMe
devices.

The code is designed to only utilize the p2pmem device if all the devices
involved in a transfer are behind the same root port (typically through
a network of PCIe switches). This is because we have no way of knowing
whether peer-to-peer routing between PCIe Root Ports is supported
(PCIe r4.0, sec 1.3.1). Additionally, the benefits of P2P transfers that
go through the RC is limited to only reducing DRAM usage and, in some
cases, coding convenience. The PCI-SIG may be exploring adding a new
capability bit to advertise whether this is possible for future
hardware.

This commit includes significant rework and feedback from Christoph
Hellwig.

Signed-off-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/Kconfig|  17 ++
 drivers/pci/Makefile   |   1 +
 drivers/pci/p2pdma.c   | 694 +
 include/linux/memremap.h   |  18 ++
 include/linux/pci-p2pdma.h | 100 +++
 include/linux/pci.h|   4 +
 6 files changed, 834 insertions(+)
 create mode 100644 drivers/pci/p2pdma.c
 create mode 100644 include/linux/pci-p2pdma.h

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 34b56a8f8480..b2396c22b53e 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -124,6 +124,23 @@ config PCI_PASID
 
  If unsure, say N.
 
+config PCI_P2PDMA
+   bool "PCI peer-to-peer transfer support"
+   depends on PCI && ZONE_DEVICE && EXPERT
+   select GENERIC_ALLOCATOR
+   help
+ Enables drivers to do PCI peer-to-peer transactions to and from
+ BARs that are exposed in other devices that are the part of
+ the hierarchy where peer-to-peer DMA is guaranteed by the PCI
+ specification to work (ie. anything below a single PCI bridge).
+
+ Many PCIe root complexes do not support P2P transactions and
+ it's hard to tell which support it at all, so at this time, DMA
+ transations must be between devices behind the same root port.
+ (Typically behind a network of PCIe switches).
+
+ If unsure, say N.
+
 config PCI_LABEL
def_bool y if (DMI || ACPI)
depends on PCI
diff --git a/drivers/pci/Makefile b/drivers/pci/Makefile
index 952addc7bacf..050c1e19a1de 100644
--- a/drivers/pci/Makefile
+++ b/drivers/pci/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_X86_INTEL_MID)   += pci-mid.o
 obj-$(CONFIG_PCI_SYSCALL)  += syscall.o
 obj-$(CONFIG_PCI_STUB) += pci-stub.o
 obj-$(CONFIG_PCI_ECAM) += ecam.o
+obj-$(CONFIG_PCI_P2PDMA)   += p2pdma.o
 obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += xen-pcifront.o
 
 obj-y  += host/
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
new file mode 100644
index ..e524a12eca1f
--- /dev/null
+++ b/drivers/pci/p2pdma.c
@@ -0,0 +1,694 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PCI Peer 2 Peer DMA support.
+ *
+ * Copyright (c) 2016-2018, Logan Gunthorpe
+ * Copyright (c) 2016-2017, Microsemi Corporation
+ * Copyright (c) 2017, Christoph Hellwig
+ * Copyright (c) 2018, Eideticom Inc.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+struct pci_p2pdma {
+   struct percpu_ref devmap_ref;
+   struct completion devmap_ref_done;
+   struct gen_pool *pool;
+   bool p2pmem_published;
+};
+
+static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
+{
+   struct pci_p2pdma *p2p =
+   container_of(ref, struct pci_p2pdma, devmap_ref);
+
+   complete_all(>devmap_ref_done);
+}
+
+static void pci_p2pdma_percpu_kill(void *data)
+{
+   struct 

[PATCH v4 03/14] PCI/P2PDMA: Add PCI p2pmem dma mappings to adjust the bus offset

2018-04-23 Thread Logan Gunthorpe
The DMA address used when mapping PCI P2P memory must be the PCI bus
address. Thus, introduce pci_p2pmem_[un]map_sg() to map the correct
addresses when using P2P memory.

For this, we assume that an SGL passed to these functions contain all
P2P memory or no P2P memory.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/p2pdma.c   | 51 ++
 include/linux/memremap.h   |  1 +
 include/linux/pci-p2pdma.h | 13 
 3 files changed, 65 insertions(+)

diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 4daad6374869..ed9dce8552a2 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -190,6 +190,8 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, 
size_t size,
pgmap->res.flags = pci_resource_flags(pdev, bar);
pgmap->ref = >p2pdma->devmap_ref;
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
+   pgmap->pci_p2pdma_bus_offset = pci_bus_address(pdev, bar) -
+   pci_resource_start(pdev, bar);
 
addr = devm_memremap_pages(>dev, pgmap);
if (IS_ERR(addr)) {
@@ -746,3 +748,52 @@ void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
pdev->p2pdma->p2pmem_published = publish;
 }
 EXPORT_SYMBOL_GPL(pci_p2pmem_publish);
+
+/**
+ * pci_p2pdma_map_sg - map a PCI peer-to-peer sg for DMA
+ * @dev: device doing the DMA request
+ * @sg: scatter list to map
+ * @nents: elements in the scatterlist
+ * @dir: DMA direction
+ *
+ * Returns the number of SG entries mapped
+ */
+int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir)
+{
+   struct dev_pagemap *pgmap;
+   struct scatterlist *s;
+   phys_addr_t paddr;
+   int i;
+
+   /*
+* p2pdma mappings are not compatible with devices that use
+* dma_virt_ops.
+*/
+   if (IS_ENABLED(CONFIG_DMA_VIRT_OPS) && dev->dma_ops == _virt_ops)
+   return 0;
+
+   for_each_sg(sg, s, nents, i) {
+   pgmap = sg_page(s)->pgmap;
+   paddr = sg_phys(s);
+
+   s->dma_address = paddr - pgmap->pci_p2pdma_bus_offset;
+   sg_dma_len(s) = s->length;
+   }
+
+   return nents;
+}
+EXPORT_SYMBOL_GPL(pci_p2pdma_map_sg);
+
+/**
+ * pci_p2pdma_unmap_sg - unmap a PCI peer-to-peer sg for DMA
+ * @dev: device doing the DMA request
+ * @sg: scatter list to map
+ * @nents: elements in the scatterlist
+ * @dir: DMA direction
+ */
+void pci_p2pdma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+enum dma_data_direction dir)
+{
+}
+EXPORT_SYMBOL_GPL(pci_p2pdma_unmap_sg);
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 9e907c338a44..1660f64ce96f 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -125,6 +125,7 @@ struct dev_pagemap {
struct device *dev;
void *data;
enum memory_type type;
+   u64 pci_p2pdma_bus_offset;
 };
 
 #ifdef CONFIG_ZONE_DEVICE
diff --git a/include/linux/pci-p2pdma.h b/include/linux/pci-p2pdma.h
index 80e931cb1235..0cde88341eeb 100644
--- a/include/linux/pci-p2pdma.h
+++ b/include/linux/pci-p2pdma.h
@@ -35,6 +35,10 @@ struct scatterlist *pci_p2pmem_alloc_sgl(struct pci_dev 
*pdev,
 unsigned int *nents, u32 length);
 void pci_p2pmem_free_sgl(struct pci_dev *pdev, struct scatterlist *sgl);
 void pci_p2pmem_publish(struct pci_dev *pdev, bool publish);
+int pci_p2pdma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
+ enum dma_data_direction dir);
+void pci_p2pdma_unmap_sg(struct device *dev, struct scatterlist *sg, int nents,
+enum dma_data_direction dir);
 #else /* CONFIG_PCI_P2PDMA */
 static inline int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar,
size_t size, u64 offset)
@@ -96,5 +100,14 @@ static inline void pci_p2pmem_free_sgl(struct pci_dev *pdev,
 static inline void pci_p2pmem_publish(struct pci_dev *pdev, bool publish)
 {
 }
+static inline int pci_p2pdma_map_sg(struct device *dev,
+   struct scatterlist *sg, int nents, enum dma_data_direction dir)
+{
+   return 0;
+}
+static inline void pci_p2pdma_unmap_sg(struct device *dev,
+   struct scatterlist *sg, int nents, enum dma_data_direction dir)
+{
+}
 #endif /* CONFIG_PCI_P2PDMA */
 #endif /* _LINUX_PCI_P2P_H */
-- 
2.11.0



[PATCH v4 10/14] nvme-pci: Add support for P2P memory in requests

2018-04-23 Thread Logan Gunthorpe
For P2P requests, we must use the pci_p2pmem_[un]map_sg() functions
instead of the dma_map_sg functions.

With that, we can then indicate PCI_P2P support in the request queue.
For this, we create an NVME_F_PCI_P2P flag which tells the core to
set QUEUE_FLAG_PCI_P2P in the request queue.

Signed-off-by: Logan Gunthorpe 
Reviewed-by: Sagi Grimberg 
Reviewed-by: Christoph Hellwig 
---
 drivers/nvme/host/core.c |  4 
 drivers/nvme/host/nvme.h |  1 +
 drivers/nvme/host/pci.c  | 19 +++
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 9df4f71e58ca..2ca9debbcf2b 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -2977,7 +2977,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, 
unsigned nsid)
ns->queue = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ns->queue))
goto out_free_ns;
+
blk_queue_flag_set(QUEUE_FLAG_NONROT, ns->queue);
+   if (ctrl->ops->flags & NVME_F_PCI_P2PDMA)
+   blk_queue_flag_set(QUEUE_FLAG_PCI_P2PDMA, ns->queue);
+
ns->queue->queuedata = ns;
ns->ctrl = ctrl;
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 061fecfd44f5..9a689c13998f 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -306,6 +306,7 @@ struct nvme_ctrl_ops {
unsigned int flags;
 #define NVME_F_FABRICS (1 << 0)
 #define NVME_F_METADATA_SUPPORTED  (1 << 1)
+#define NVME_F_PCI_P2PDMA  (1 << 2)
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 514da4de3c85..09b6aba6ed28 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -798,8 +798,13 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, 
struct request *req,
goto out;
 
ret = BLK_STS_RESOURCE;
-   nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents, dma_dir,
-   DMA_ATTR_NO_WARN);
+
+   if (REQ_IS_PCI_P2PDMA(req))
+   nr_mapped = pci_p2pdma_map_sg(dev->dev, iod->sg, iod->nents,
+ dma_dir);
+   else
+   nr_mapped = dma_map_sg_attrs(dev->dev, iod->sg, iod->nents,
+dma_dir,  DMA_ATTR_NO_WARN);
if (!nr_mapped)
goto out;
 
@@ -844,7 +849,12 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct 
request *req)
DMA_TO_DEVICE : DMA_FROM_DEVICE;
 
if (iod->nents) {
-   dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
+   if (REQ_IS_PCI_P2PDMA(req))
+   pci_p2pdma_unmap_sg(dev->dev, iod->sg, iod->nents,
+   dma_dir);
+   else
+   dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir);
+
if (blk_integrity_rq(req)) {
if (req_op(req) == REQ_OP_READ)
nvme_dif_remap(req, nvme_dif_complete);
@@ -2439,7 +2449,8 @@ static int nvme_pci_get_address(struct nvme_ctrl *ctrl, 
char *buf, int size)
 static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name   = "pcie",
.module = THIS_MODULE,
-   .flags  = NVME_F_METADATA_SUPPORTED,
+   .flags  = NVME_F_METADATA_SUPPORTED |
+ NVME_F_PCI_P2PDMA,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32= nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
-- 
2.11.0



[PATCH v4 12/14] nvmet: Introduce helper functions to allocate and free request SGLs

2018-04-23 Thread Logan Gunthorpe
Add helpers to allocate and free the SGL in a struct nvmet_req:

int nvmet_req_alloc_sgl(struct nvmet_req *req, struct nvmet_sq *sq)
void nvmet_req_free_sgl(struct nvmet_req *req)

This will be expanded in a future patch to implement peer-to-peer
memory DMAs and should be common with all target drivers. The presently
unused 'sq' argument in the alloc function will be necessary to
decide whether to use peer-to-peer memory and obtain the correct
provider to allocate the memory.

Signed-off-by: Logan Gunthorpe 
Cc: Christoph Hellwig 
Cc: Sagi Grimberg 
---
 drivers/nvme/target/core.c  | 18 ++
 drivers/nvme/target/nvmet.h |  2 ++
 2 files changed, 20 insertions(+)

diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index e95424f172fd..75d44bc3e8d3 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -575,6 +575,24 @@ void nvmet_req_execute(struct nvmet_req *req)
 }
 EXPORT_SYMBOL_GPL(nvmet_req_execute);
 
+int nvmet_req_alloc_sgl(struct nvmet_req *req, struct nvmet_sq *sq)
+{
+   req->sg = sgl_alloc(req->transfer_len, GFP_KERNEL, >sg_cnt);
+   if (!req->sg)
+   return -ENOMEM;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(nvmet_req_alloc_sgl);
+
+void nvmet_req_free_sgl(struct nvmet_req *req)
+{
+   sgl_free(req->sg);
+   req->sg = NULL;
+   req->sg_cnt = 0;
+}
+EXPORT_SYMBOL_GPL(nvmet_req_free_sgl);
+
 static inline bool nvmet_cc_en(u32 cc)
 {
return (cc >> NVME_CC_EN_SHIFT) & 0x1;
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 15fd84ab21f8..10b162615a5e 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -273,6 +273,8 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq 
*cq,
 void nvmet_req_uninit(struct nvmet_req *req);
 void nvmet_req_execute(struct nvmet_req *req);
 void nvmet_req_complete(struct nvmet_req *req, u16 status);
+int nvmet_req_alloc_sgl(struct nvmet_req *req, struct nvmet_sq *sq);
+void nvmet_req_free_sgl(struct nvmet_req *req);
 
 void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid,
u16 size);
-- 
2.11.0



[PATCH v4 02/14] PCI/P2PDMA: Add sysfs group to display p2pmem stats

2018-04-23 Thread Logan Gunthorpe
Add a sysfs group to display statistics about P2P memory that is
registered in each PCI device.

Attributes in the group display the total amount of P2P memory, the
amount available and whether it is published or not.

Signed-off-by: Logan Gunthorpe 
---
 Documentation/ABI/testing/sysfs-bus-pci | 25 +++
 drivers/pci/p2pdma.c| 54 +
 2 files changed, 79 insertions(+)

diff --git a/Documentation/ABI/testing/sysfs-bus-pci 
b/Documentation/ABI/testing/sysfs-bus-pci
index 44d4b2be92fd..044812c816d0 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -323,3 +323,28 @@ Description:
 
This is similar to /sys/bus/pci/drivers_autoprobe, but
affects only the VFs associated with a specific PF.
+
+What:  /sys/bus/pci/devices/.../p2pmem/available
+Date:  November 2017
+Contact:   Logan Gunthorpe 
+Description:
+   If the device has any Peer-to-Peer memory registered, this
+   file contains the amount of memory that has not been
+   allocated (in decimal).
+
+What:  /sys/bus/pci/devices/.../p2pmem/size
+Date:  November 2017
+Contact:   Logan Gunthorpe 
+Description:
+   If the device has any Peer-to-Peer memory registered, this
+   file contains the total amount of memory that the device
+   provides (in decimal).
+
+What:  /sys/bus/pci/devices/.../p2pmem/published
+Date:  November 2017
+Contact:   Logan Gunthorpe 
+Description:
+   If the device has any Peer-to-Peer memory registered, this
+   file contains a '1' if the memory has been published for
+   use inside the kernel or a '0' if it is only intended
+   for use within the driver that published it.
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index e524a12eca1f..4daad6374869 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -24,6 +24,54 @@ struct pci_p2pdma {
bool p2pmem_published;
 };
 
+static ssize_t size_show(struct device *dev, struct device_attribute *attr,
+char *buf)
+{
+   struct pci_dev *pdev = to_pci_dev(dev);
+   size_t size = 0;
+
+   if (pdev->p2pdma->pool)
+   size = gen_pool_size(pdev->p2pdma->pool);
+
+   return snprintf(buf, PAGE_SIZE, "%zd\n", size);
+}
+static DEVICE_ATTR_RO(size);
+
+static ssize_t available_show(struct device *dev, struct device_attribute 
*attr,
+ char *buf)
+{
+   struct pci_dev *pdev = to_pci_dev(dev);
+   size_t avail = 0;
+
+   if (pdev->p2pdma->pool)
+   avail = gen_pool_avail(pdev->p2pdma->pool);
+
+   return snprintf(buf, PAGE_SIZE, "%zd\n", avail);
+}
+static DEVICE_ATTR_RO(available);
+
+static ssize_t published_show(struct device *dev, struct device_attribute 
*attr,
+ char *buf)
+{
+   struct pci_dev *pdev = to_pci_dev(dev);
+
+   return snprintf(buf, PAGE_SIZE, "%d\n",
+   pdev->p2pdma->p2pmem_published);
+}
+static DEVICE_ATTR_RO(published);
+
+static struct attribute *p2pmem_attrs[] = {
+   _attr_size.attr,
+   _attr_available.attr,
+   _attr_published.attr,
+   NULL,
+};
+
+static const struct attribute_group p2pmem_group = {
+   .attrs = p2pmem_attrs,
+   .name = "p2pmem",
+};
+
 static void pci_p2pdma_percpu_release(struct percpu_ref *ref)
 {
struct pci_p2pdma *p2p =
@@ -53,6 +101,7 @@ static void pci_p2pdma_release(void *data)
percpu_ref_exit(>p2pdma->devmap_ref);
 
gen_pool_destroy(pdev->p2pdma->pool);
+   sysfs_remove_group(>dev.kobj, _group);
pdev->p2pdma = NULL;
 }
 
@@ -83,9 +132,14 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
 
pdev->p2pdma = p2p;
 
+   error = sysfs_create_group(>dev.kobj, _group);
+   if (error)
+   goto out_pool_destroy;
+
return 0;
 
 out_pool_destroy:
+   pdev->p2pdma = NULL;
gen_pool_destroy(p2p->pool);
 out:
devm_kfree(>dev, p2p);
-- 
2.11.0



[PATCH v4 07/14] block: Introduce PCI P2P flags for request and request queue

2018-04-23 Thread Logan Gunthorpe
QUEUE_FLAG_PCI_P2P is introduced meaning a driver's request queue
supports targeting P2P memory.

REQ_PCI_P2P is introduced to indicate a particular bio request is
directed to/from PCI P2P memory. A request with this flag is not
accepted unless the corresponding queues have the QUEUE_FLAG_PCI_P2P
flag set.

Signed-off-by: Logan Gunthorpe 
Reviewed-by: Sagi Grimberg 
Reviewed-by: Christoph Hellwig 
---
 block/blk-core.c  |  3 +++
 include/linux/blk_types.h | 18 +-
 include/linux/blkdev.h|  3 +++
 3 files changed, 23 insertions(+), 1 deletion(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 806ce2442819..35680cbebaf4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2270,6 +2270,9 @@ generic_make_request_checks(struct bio *bio)
if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q))
goto not_supported;
 
+   if ((bio->bi_opf & REQ_PCI_P2PDMA) && !blk_queue_pci_p2pdma(q))
+   goto not_supported;
+
if (should_fail_bio(bio))
goto end_io;
 
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 17b18b91ebac..41194d54c45a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -279,6 +279,10 @@ enum req_flag_bits {
__REQ_BACKGROUND,   /* background IO */
__REQ_NOWAIT,   /* Don't wait if request will block */
 
+#ifdef CONFIG_PCI_P2PDMA
+   __REQ_PCI_P2PDMA,   /* request is to/from P2P memory */
+#endif
+
/* command specific flags for REQ_OP_WRITE_ZEROES: */
__REQ_NOUNMAP,  /* do not free blocks when zeroing */
 
@@ -303,6 +307,18 @@ enum req_flag_bits {
 #define REQ_BACKGROUND (1ULL << __REQ_BACKGROUND)
 #define REQ_NOWAIT (1ULL << __REQ_NOWAIT)
 
+#ifdef CONFIG_PCI_P2PDMA
+/*
+ * Currently SGLs do not support mixed P2P and regular memory so
+ * requests with P2P memory must not be merged.
+ */
+#define REQ_PCI_P2PDMA (1ULL << __REQ_PCI_P2PDMA)
+#define REQ_IS_PCI_P2PDMA(req) ((req)->cmd_flags & REQ_PCI_P2PDMA)
+#else
+#define REQ_PCI_P2PDMA 0
+#define REQ_IS_PCI_P2PDMA(req) 0
+#endif /* CONFIG_PCI_P2PDMA */
+
 #define REQ_NOUNMAP(1ULL << __REQ_NOUNMAP)
 
 #define REQ_DRV(1ULL << __REQ_DRV)
@@ -311,7 +327,7 @@ enum req_flag_bits {
(REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT | REQ_FAILFAST_DRIVER)
 
 #define REQ_NOMERGE_FLAGS \
-   (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA)
+   (REQ_NOMERGE | REQ_PREFLUSH | REQ_FUA | REQ_PCI_P2PDMA)
 
 #define bio_op(bio) \
((bio)->bi_opf & REQ_OP_MASK)
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9af3e0f430bc..116367babb39 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -698,6 +698,7 @@ struct request_queue {
 #define QUEUE_FLAG_SCSI_PASSTHROUGH 27 /* queue supports SCSI commands */
 #define QUEUE_FLAG_QUIESCED28  /* queue has been quiesced */
 #define QUEUE_FLAG_PREEMPT_ONLY29  /* only process REQ_PREEMPT 
requests */
+#define QUEUE_FLAG_PCI_P2PDMA  30  /* device supports pci p2p requests */
 
 #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) |\
 (1 << QUEUE_FLAG_SAME_COMP)|   \
@@ -730,6 +731,8 @@ bool blk_queue_flag_test_and_clear(unsigned int flag, 
struct request_queue *q);
 #define blk_queue_dax(q)   test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags)
 #define blk_queue_scsi_passthrough(q)  \
test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags)
+#define blk_queue_pci_p2pdma(q)\
+   test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags)
 
 #define blk_noretry_request(rq) \
((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
-- 
2.11.0



[PATCH v4 06/14] PCI/P2PDMA: Add P2P DMA driver writer's documentation

2018-04-23 Thread Logan Gunthorpe
Add a restructured text file describing how to write drivers
with support for P2P DMA transactions. The document describes
how to use the APIs that were added in the previous few
commits.

Also adds an index for the PCI documentation tree even though this
is the only PCI document that has been converted to restructured text
at this time.

Signed-off-by: Logan Gunthorpe 
Cc: Jonathan Corbet 
---
 Documentation/PCI/index.rst |  14 +++
 Documentation/driver-api/pci/index.rst  |   1 +
 Documentation/driver-api/pci/p2pdma.rst | 166 
 Documentation/index.rst |   3 +-
 4 files changed, 183 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/PCI/index.rst
 create mode 100644 Documentation/driver-api/pci/p2pdma.rst

diff --git a/Documentation/PCI/index.rst b/Documentation/PCI/index.rst
new file mode 100644
index ..2fdc4b3c291d
--- /dev/null
+++ b/Documentation/PCI/index.rst
@@ -0,0 +1,14 @@
+==
+Linux PCI Driver Developer's Guide
+==
+
+.. toctree::
+
+   p2pdma
+
+.. only::  subproject and html
+
+   Indices
+   ===
+
+   * :ref:`genindex`
diff --git a/Documentation/driver-api/pci/index.rst 
b/Documentation/driver-api/pci/index.rst
index 03b57cbf8cc2..d12eeafbfc90 100644
--- a/Documentation/driver-api/pci/index.rst
+++ b/Documentation/driver-api/pci/index.rst
@@ -10,6 +10,7 @@ The Linux PCI driver implementer's API guide
:maxdepth: 2
 
pci
+   p2pdma
 
 .. only::  subproject and html
 
diff --git a/Documentation/driver-api/pci/p2pdma.rst 
b/Documentation/driver-api/pci/p2pdma.rst
new file mode 100644
index ..49a512c405b2
--- /dev/null
+++ b/Documentation/driver-api/pci/p2pdma.rst
@@ -0,0 +1,166 @@
+
+PCI Peer-to-Peer DMA Support
+
+
+The PCI bus has pretty decent support for performing DMA transfers
+between two endpoints on the bus. This type of transaction is
+henceforth called Peer-to-Peer (or P2P). However, there are a number of
+issues that make P2P transactions tricky to do in a perfectly safe way.
+
+One of the biggest issues is that PCI Root Complexes are not required
+to support forwarding packets between Root Ports. To make things worse,
+there is no simple way to determine if a given Root Complex supports
+this or not. (See PCIe r4.0, sec 1.3.1). Therefore, as of this writing,
+the kernel only supports doing P2P when the endpoints involved are all
+behind the same PCIe root port as the spec guarantees that all
+packets will always be routable but does not require routing between
+root ports.
+
+The second issue is that to make use of existing interfaces in Linux,
+memory that is used for P2P transactions needs to be backed by struct
+pages. However, PCI BARs are not typically cache coherent so there are
+a few corner case gotchas with these pages so developers need to
+be careful about what they do with them.
+
+
+Driver Writer's Guide
+=
+
+In a given P2P implementation there may be three or more different
+types of kernel drivers in play:
+
+* Providers - A driver which provides or publishes P2P resources like
+  memory or doorbell registers to other drivers.
+* Clients - A driver which makes use of a resource by setting up a
+  DMA transaction to or from it.
+* Orchestrators - A driver which orchestrates the flow of data between
+  clients and providers
+
+In many cases there could be overlap between these three types (ie.
+it may be typical for a driver to be both a provider and a client).
+
+For example, in the NVMe Target Copy Offload implementation:
+
+* The NVMe PCI driver is both a client, provider and orchestrator
+  in that it exposes any CMB (Controller Memory Buffer) as a P2P memory
+  resource (provider), it accepts P2P memory pages as buffers in requests
+  to be used directly (client) and it can also make use the CMB as
+  submission queue entries.
+* The RDMA driver is a client in this arrangement so that an RNIC
+  can DMA directly to the memory exposed by the NVMe device.
+* The NVMe Target driver (nvmet) can orchestrate the data from the RNIC
+  to the P2P memory (CMB) and then to the NVMe device (and vice versa).
+
+This is currently the only arrangement supported by the kernel but
+one could imagine slight tweaks to this that would allow for the same
+functionality. For example, if a specific RNIC added a BAR with some
+memory behind it, its driver could add support as a P2P provider and
+then the NVMe Target could use the RNIC's memory instead of the CMB
+in cases where the NVMe cards in use do not have CMB support.
+
+
+Provider Drivers
+
+
+A provider simply needs to register a BAR (or a portion of a BAR)
+as a P2P DMA resource using :c:func:`pci_p2pdma_add_resource()`.
+This will register struct pages for all the specified memory.
+
+After that it may optionally publish all of its 

[PATCH v4 13/14] nvmet-rdma: Use new SGL alloc/free helper for requests

2018-04-23 Thread Logan Gunthorpe
Use the new helpers introduced in the previous patch to allocate
the SGLs for the request.

Seeing we use req.transfer_len as the length of the SGL it is
set earlier and cleared on any error. It also seems to be unnecessary
to accumulate the length as the map_sgl functions should only ever
be called once.

Signed-off-by: Logan Gunthorpe 
Cc: Christoph Hellwig 
Cc: Sagi Grimberg 
---
 drivers/nvme/target/rdma.c | 20 
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 52e0c5d579a7..f7a3459d618f 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -430,7 +430,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp 
*rsp)
}
 
if (rsp->req.sg != >cmd->inline_sg)
-   sgl_free(rsp->req.sg);
+   nvmet_req_free_sgl(>req);
 
if (unlikely(!list_empty_careful(>rsp_wr_wait_list)))
nvmet_rdma_process_wr_wait_list(queue);
@@ -564,24 +564,24 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp 
*rsp,
 {
struct rdma_cm_id *cm_id = rsp->queue->cm_id;
u64 addr = le64_to_cpu(sgl->addr);
-   u32 len = get_unaligned_le24(sgl->length);
u32 key = get_unaligned_le32(sgl->key);
int ret;
 
+   rsp->req.transfer_len = get_unaligned_le24(sgl->length);
+
/* no data command? */
-   if (!len)
+   if (!rsp->req.transfer_len)
return 0;
 
-   rsp->req.sg = sgl_alloc(len, GFP_KERNEL, >req.sg_cnt);
-   if (!rsp->req.sg)
-   return NVME_SC_INTERNAL;
+   ret = nvmet_req_alloc_sgl(>req, >queue->nvme_sq);
+   if (ret < 0)
+   goto error_out;
 
ret = rdma_rw_ctx_init(>rw, cm_id->qp, cm_id->port_num,
rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
nvmet_data_dir(>req));
if (ret < 0)
-   return NVME_SC_INTERNAL;
-   rsp->req.transfer_len += len;
+   goto error_out;
rsp->n_rdma += ret;
 
if (invalidate) {
@@ -590,6 +590,10 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp 
*rsp,
}
 
return 0;
+
+error_out:
+   rsp->req.transfer_len = 0;
+   return NVME_SC_INTERNAL;
 }
 
 static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
-- 
2.11.0



[PATCH v4 14/14] nvmet: Optionally use PCI P2P memory

2018-04-23 Thread Logan Gunthorpe
We create a configfs attribute in each nvme-fabrics target port to
enable p2p memory use. When enabled, the port will only then use the
p2p memory if a p2p memory device can be found which is behind the
same switch hierarchy as the RDMA port and all the block devices in
use. If the user enabled it and no devices are found, then the system
will silently fall back on using regular memory.

If appropriate, that port will allocate memory for the RDMA buffers
for queues from the p2pmem device falling back to system memory should
anything fail.

Ideally, we'd want to use an NVME CMB buffer as p2p memory. This would
save an extra PCI transfer as the NVME card could just take the data
out of its own memory. However, at this time, only a limited number
of cards with CMB buffers seem to be available.

Signed-off-by: Stephen Bates 
Signed-off-by: Steve Wise 
[hch: partial rewrite of the initial code]
Signed-off-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
---
 drivers/nvme/target/configfs.c |  67 ++
 drivers/nvme/target/core.c | 127 -
 drivers/nvme/target/io-cmd.c   |   3 +
 drivers/nvme/target/nvmet.h|  13 +
 drivers/nvme/target/rdma.c |   2 +
 5 files changed, 210 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index ad9ff27234b5..5efe0dae0ee7 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -17,6 +17,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 #include "nvmet.h"
 
@@ -864,12 +866,77 @@ static void nvmet_port_release(struct config_item *item)
kfree(port);
 }
 
+#ifdef CONFIG_PCI_P2PDMA
+static ssize_t nvmet_p2pmem_show(struct config_item *item, char *page)
+{
+   struct nvmet_port *port = to_nvmet_port(item);
+
+   if (!port->use_p2pmem)
+   return sprintf(page, "none\n");
+
+   if (!port->p2p_dev)
+   return sprintf(page, "auto\n");
+
+   return sprintf(page, "%s\n", pci_name(port->p2p_dev));
+}
+
+static ssize_t nvmet_p2pmem_store(struct config_item *item,
+ const char *page, size_t count)
+{
+   struct nvmet_port *port = to_nvmet_port(item);
+   struct device *dev;
+   struct pci_dev *p2p_dev = NULL;
+   bool use_p2pmem;
+
+   dev = bus_find_device_by_name(_bus_type, NULL, page);
+   if (dev) {
+   use_p2pmem = true;
+   p2p_dev = to_pci_dev(dev);
+
+   if (!pci_has_p2pmem(p2p_dev)) {
+   pr_err("PCI device has no peer-to-peer memory: %s\n",
+  page);
+   pci_dev_put(p2p_dev);
+   return -ENODEV;
+   }
+   } else if (sysfs_streq(page, "auto")) {
+   use_p2pmem = 1;
+   } else if ((page[0] == '0' || page[0] == '1') && !iscntrl(page[1])) {
+   /*
+* If the user enters a PCI device that  doesn't exist
+* like ":01:00.1", we don't want strtobool to think
+* it's a '0' when it's clearly not what the user wanted.
+* So we require 0's and 1's to be exactly one character.
+*/
+   goto no_such_pci_device;
+   } else if (strtobool(page, _p2pmem)) {
+   goto no_such_pci_device;
+   }
+
+   down_write(_config_sem);
+   port->use_p2pmem = use_p2pmem;
+   pci_dev_put(port->p2p_dev);
+   port->p2p_dev = p2p_dev;
+   up_write(_config_sem);
+
+   return count;
+
+no_such_pci_device:
+   pr_err("No such PCI device: %s\n", page);
+   return -ENODEV;
+}
+CONFIGFS_ATTR(nvmet_, p2pmem);
+#endif /* CONFIG_PCI_P2PDMA */
+
 static struct configfs_attribute *nvmet_port_attrs[] = {
_attr_addr_adrfam,
_attr_addr_treq,
_attr_addr_traddr,
_attr_addr_trsvcid,
_attr_addr_trtype,
+#ifdef CONFIG_PCI_P2PDMA
+   _attr_p2pmem,
+#endif
NULL,
 };
 
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 75d44bc3e8d3..b2b62cd36f6c 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "nvmet.h"
 
@@ -271,6 +272,25 @@ void nvmet_put_namespace(struct nvmet_ns *ns)
percpu_ref_put(>ref);
 }
 
+static int nvmet_p2pdma_add_client(struct nvmet_ctrl *ctrl,
+  struct nvmet_ns *ns)
+{
+   int ret;
+
+   if (!blk_queue_pci_p2pdma(ns->bdev->bd_queue)) {
+   pr_err("peer-to-peer DMA is not supported by %s\n",
+  ns->device_path);
+   return -EINVAL;
+   }
+
+   ret = pci_p2pdma_add_client(>p2p_clients, nvmet_ns_dev(ns));
+   if (ret)
+   pr_err("failed to add peer-to-peer DMA client %s: %d\n",
+

[PATCH v4 04/14] PCI/P2PDMA: Clear ACS P2P flags for all devices behind switches

2018-04-23 Thread Logan Gunthorpe
For peer-to-peer transactions to work the downstream ports in each
switch must not have the ACS flags set. At this time there is no way
to dynamically change the flags and update the corresponding IOMMU
groups so this is done at enumeration time before the groups are
assigned.

This effectively means that if CONFIG_PCI_P2PDMA is selected then
all devices behind any PCIe switch hierarchy will be in the same IOMMU
group. Which implies that individual devices behind any switch
hierarchy will not be able to be assigned to separate VMs because
there is no isolation between them. Additionally, any malicious PCIe
devices will be able to DMA to memory exposed by other EPs in the same
domain as TLPs will not be checked by the IOMMU.

Given that the intended use case of P2P Memory is for users with
custom hardware designed for purpose, we do not expect distributors
to ever need to enable this option. Users that want to use P2P
must have compiled a custom kernel with this configuration option
and understand the implications regarding ACS. They will either
not require ACS or will have designed the system in such a way that
devices that require isolation will be separate from those using P2P
transactions.

Signed-off-by: Logan Gunthorpe 
---
 drivers/pci/Kconfig|  9 +
 drivers/pci/p2pdma.c   | 45 ++---
 drivers/pci/pci.c  |  6 ++
 include/linux/pci-p2pdma.h |  5 +
 4 files changed, 50 insertions(+), 15 deletions(-)

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index b2396c22b53e..b6db41d4b708 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -139,6 +139,15 @@ config PCI_P2PDMA
  transations must be between devices behind the same root port.
  (Typically behind a network of PCIe switches).
 
+ Enabling this option will also disable ACS on all ports behind
+ any PCIe switch. This effectively puts all devices behind any
+ switch heirarchy into the same IOMMU group. Which implies that
+ individual devices behind any switch will not be able to be
+ assigned to separate VMs because there is no isolation between
+ them. Additionally, any malicious PCIe devices will be able to
+ DMA to memory exposed by other EPs in the same domain as TLPs
+ will not be checked by the IOMMU.
+
  If unsure, say N.
 
 config PCI_LABEL
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index ed9dce8552a2..e9f43b43acac 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -240,27 +240,42 @@ static struct pci_dev *find_parent_pci_dev(struct device 
*dev)
 }
 
 /*
- * If a device is behind a switch, we try to find the upstream bridge
- * port of the switch. This requires two calls to pci_upstream_bridge():
- * one for the upstream port on the switch, one on the upstream port
- * for the next level in the hierarchy. Because of this, devices connected
- * to the root port will be rejected.
+ * pci_p2pdma_disable_acs - disable ACS flags for all PCI bridges
+ * @pdev: device to disable ACS flags for
+ *
+ * The ACS flags for P2P Request Redirect and P2P Completion Redirect need
+ * to be disabled on any PCI bridge in order for the TLPS to not be forwarded
+ * up to the RC which is not what we want for P2P.
+ *
+ * This function is called when the devices are first enumerated and
+ * will result in all devices behind any bridge to be in the same IOMMU
+ * group. At this time, there is no way to "hotplug" IOMMU groups so we rely
+ * on this largish hammer. If you need the devices to be in separate groups
+ * don't enable CONFIG_PCI_P2PDMA.
+ *
+ * Returns 1 if the ACS bits for this device was cleared, otherwise 0.
  */
-static struct pci_dev *get_upstream_bridge_port(struct pci_dev *pdev)
+int pci_p2pdma_disable_acs(struct pci_dev *pdev)
 {
-   struct pci_dev *up1, *up2;
+   int pos;
+   u16 ctrl;
 
-   if (!pdev)
-   return NULL;
+   if (!pci_is_bridge(pdev))
+   return 0;
 
-   up1 = pci_dev_get(pci_upstream_bridge(pdev));
-   if (!up1)
-   return NULL;
+   pos = pci_find_ext_capability(pdev, PCI_EXT_CAP_ID_ACS);
+   if (!pos)
+   return 0;
+
+   pci_info(pdev, "disabling ACS flags for peer-to-peer DMA\n");
+
+   pci_read_config_word(pdev, pos + PCI_ACS_CTRL, );
+
+   ctrl &= ~(PCI_ACS_RR | PCI_ACS_CR);
 
-   up2 = pci_dev_get(pci_upstream_bridge(up1));
-   pci_dev_put(up1);
+   pci_write_config_word(pdev, pos + PCI_ACS_CTRL, ctrl);
 
-   return up2;
+   return 1;
 }
 
 /*
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index e597655a5643..7e2f5724ba22 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2835,6 +2836,11 @@ static void pci_std_enable_acs(struct pci_dev *dev)
  */
 void pci_enable_acs(struct pci_dev *dev)

[PATCH v4 08/14] IB/core: Ensure we map P2P memory correctly in rdma_rw_ctx_[init|destroy]()

2018-04-23 Thread Logan Gunthorpe
In order to use PCI P2P memory pci_p2pmem_[un]map_sg() functions must be
called to map the correct PCI bus address.

To do this, check the first page in the scatter list to see if it is P2P
memory or not. At the moment, scatter lists that contain P2P memory must
be homogeneous so if the first page is P2P the entire SGL should be P2P.

Signed-off-by: Logan Gunthorpe 
Reviewed-by: Christoph Hellwig 
---
 drivers/infiniband/core/rw.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/drivers/infiniband/core/rw.c b/drivers/infiniband/core/rw.c
index c8963e91f92a..f495e8a7f8ac 100644
--- a/drivers/infiniband/core/rw.c
+++ b/drivers/infiniband/core/rw.c
@@ -12,6 +12,7 @@
  */
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -280,7 +281,11 @@ int rdma_rw_ctx_init(struct rdma_rw_ctx *ctx, struct ib_qp 
*qp, u8 port_num,
struct ib_device *dev = qp->pd->device;
int ret;
 
-   ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+   if (is_pci_p2pdma_page(sg_page(sg)))
+   ret = pci_p2pdma_map_sg(dev->dma_device, sg, sg_cnt, dir);
+   else
+   ret = ib_dma_map_sg(dev, sg, sg_cnt, dir);
+
if (!ret)
return -ENOMEM;
sg_cnt = ret;
@@ -602,7 +607,11 @@ void rdma_rw_ctx_destroy(struct rdma_rw_ctx *ctx, struct 
ib_qp *qp, u8 port_num,
break;
}
 
-   ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
+   if (is_pci_p2pdma_page(sg_page(sg)))
+   pci_p2pdma_unmap_sg(qp->pd->device->dma_device, sg,
+   sg_cnt, dir);
+   else
+   ib_dma_unmap_sg(qp->pd->device, sg, sg_cnt, dir);
 }
 EXPORT_SYMBOL(rdma_rw_ctx_destroy);
 
-- 
2.11.0



[PATCH v4 09/14] nvme-pci: Use PCI p2pmem subsystem to manage the CMB

2018-04-23 Thread Logan Gunthorpe
Register the CMB buffer as p2pmem and use the appropriate allocation
functions to create and destroy the IO submission queues.

If the CMB supports WDS and RDS, publish it for use as P2P memory
by other devices.

We can now drop the __iomem safety on the buffer seeing that, by
convention, devm_memremap_pages() allocates regular memory without
side effects that's accessible without the iomem accessors.

Signed-off-by: Logan Gunthorpe 
---
 drivers/nvme/host/pci.c | 75 +++--
 1 file changed, 41 insertions(+), 34 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index fbc71fac6f1e..514da4de3c85 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -29,6 +29,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "nvme.h"
 
@@ -92,9 +93,8 @@ struct nvme_dev {
struct work_struct remove_work;
struct mutex shutdown_lock;
bool subsystem;
-   void __iomem *cmb;
-   pci_bus_addr_t cmb_bus_addr;
u64 cmb_size;
+   bool cmb_use_sqes;
u32 cmbsz;
u32 cmbloc;
struct nvme_ctrl ctrl;
@@ -149,7 +149,7 @@ struct nvme_queue {
struct nvme_dev *dev;
spinlock_t q_lock;
struct nvme_command *sq_cmds;
-   struct nvme_command __iomem *sq_cmds_io;
+   bool sq_cmds_is_io;
volatile struct nvme_completion *cqes;
struct blk_mq_tags **tags;
dma_addr_t sq_dma_addr;
@@ -431,10 +431,7 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
 {
u16 tail = nvmeq->sq_tail;
 
-   if (nvmeq->sq_cmds_io)
-   memcpy_toio(>sq_cmds_io[tail], cmd, sizeof(*cmd));
-   else
-   memcpy(>sq_cmds[tail], cmd, sizeof(*cmd));
+   memcpy(>sq_cmds[tail], cmd, sizeof(*cmd));
 
if (++tail == nvmeq->q_depth)
tail = 0;
@@ -1289,9 +1286,18 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
 {
dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
(void *)nvmeq->cqes, nvmeq->cq_dma_addr);
-   if (nvmeq->sq_cmds)
-   dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
-   nvmeq->sq_cmds, nvmeq->sq_dma_addr);
+
+   if (nvmeq->sq_cmds) {
+   if (nvmeq->sq_cmds_is_io)
+   pci_free_p2pmem(to_pci_dev(nvmeq->q_dmadev),
+   nvmeq->sq_cmds,
+   SQ_SIZE(nvmeq->q_depth));
+   else
+   dma_free_coherent(nvmeq->q_dmadev,
+ SQ_SIZE(nvmeq->q_depth),
+ nvmeq->sq_cmds,
+ nvmeq->sq_dma_addr);
+   }
 }
 
 static void nvme_free_queues(struct nvme_dev *dev, int lowest)
@@ -1371,12 +1377,21 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int 
nr_io_queues,
 static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
int qid, int depth)
 {
-   /* CMB SQEs will be mapped before creation */
-   if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS))
-   return 0;
+   struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+   if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
+   nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth));
+   nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev,
+   nvmeq->sq_cmds);
+   nvmeq->sq_cmds_is_io = true;
+   }
+
+   if (!nvmeq->sq_cmds) {
+   nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
+   >sq_dma_addr, GFP_KERNEL);
+   nvmeq->sq_cmds_is_io = false;
+   }
 
-   nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth),
-   >sq_dma_addr, GFP_KERNEL);
if (!nvmeq->sq_cmds)
return -ENOMEM;
return 0;
@@ -1451,13 +1466,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, 
int qid)
struct nvme_dev *dev = nvmeq->dev;
int result;
 
-   if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
-   unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth),
- dev->ctrl.page_size);
-   nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
-   nvmeq->sq_cmds_io = dev->cmb + offset;
-   }
-
/*
 * A queue's vector matches the queue identifier unless the controller
 * has only one vector available.
@@ -1691,9 +1699,6 @@ static void nvme_map_cmb(struct nvme_dev *dev)
return;
dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
 
-   if (!use_cmb_sqes)
-   return;
-
size = 

[PATCH v4 05/14] docs-rst: Add a new directory for PCI documentation

2018-04-23 Thread Logan Gunthorpe
Add a new directory in the driver API guide for PCI specific
documentation.

This is in preparation for adding a new PCI P2P DMA driver writers
guide which will go in this directory.

Signed-off-by: Logan Gunthorpe 
Cc: Jonathan Corbet 
Cc: Mauro Carvalho Chehab 
Cc: Greg Kroah-Hartman 
Cc: Vinod Koul 
Cc: Linus Walleij 
Cc: Logan Gunthorpe 
Cc: Thierry Reding 
Cc: Sanyog Kale 
Cc: Sagar Dharia 
---
 Documentation/driver-api/index.rst |  2 +-
 Documentation/driver-api/pci/index.rst | 19 +++
 Documentation/driver-api/{ => pci}/pci.rst |  0
 3 files changed, 20 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/driver-api/pci/index.rst
 rename Documentation/driver-api/{ => pci}/pci.rst (100%)

diff --git a/Documentation/driver-api/index.rst 
b/Documentation/driver-api/index.rst
index 6d8352c0f354..9e4cd4e91a49 100644
--- a/Documentation/driver-api/index.rst
+++ b/Documentation/driver-api/index.rst
@@ -27,7 +27,7 @@ available subsections can be seen below.
iio/index
input
usb/index
-   pci
+   pci/index
spi
i2c
hsi
diff --git a/Documentation/driver-api/pci/index.rst 
b/Documentation/driver-api/pci/index.rst
new file mode 100644
index ..03b57cbf8cc2
--- /dev/null
+++ b/Documentation/driver-api/pci/index.rst
@@ -0,0 +1,19 @@
+
+The Linux PCI driver implementer's API guide
+
+
+.. class:: toc-title
+
+  Table of contents
+
+.. toctree::
+   :maxdepth: 2
+
+   pci
+
+.. only::  subproject and html
+
+   Indices
+   ===
+
+   * :ref:`genindex`
diff --git a/Documentation/driver-api/pci.rst 
b/Documentation/driver-api/pci/pci.rst
similarity index 100%
rename from Documentation/driver-api/pci.rst
rename to Documentation/driver-api/pci/pci.rst
-- 
2.11.0



[PATCH v4 11/14] nvme-pci: Add a quirk for a pseudo CMB

2018-04-23 Thread Logan Gunthorpe
Introduce a quirk to use CMB-like memory on older devices that have
an exposed BAR but do not advertise support for using CMBLOC and
CMBSIZE.

We'd like to use some of these older cards to test P2P memory.

Signed-off-by: Logan Gunthorpe 
Reviewed-by: Sagi Grimberg 
---
 drivers/nvme/host/nvme.h |  7 +++
 drivers/nvme/host/pci.c  | 24 
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9a689c13998f..885e9ec9b889 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -84,6 +84,13 @@ enum nvme_quirks {
 * Supports the LighNVM command set if indicated in vs[1].
 */
NVME_QUIRK_LIGHTNVM = (1 << 6),
+
+   /*
+* Pseudo CMB Support on BAR 4. For adapters like the Microsemi
+* NVRAM that have CMB-like memory on a BAR but does not set
+* CMBLOC or CMBSZ.
+*/
+   NVME_QUIRK_PSEUDO_CMB_BAR4  = (1 << 7),
 };
 
 /*
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 09b6aba6ed28..e526e969680a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1685,6 +1685,13 @@ static ssize_t nvme_cmb_show(struct device *dev,
 }
 static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
 
+static u32 nvme_pseudo_cmbsz(struct pci_dev *pdev, int bar)
+{
+   return NVME_CMBSZ_WDS | NVME_CMBSZ_RDS |
+   (((ilog2(SZ_16M) - 12) / 4) << NVME_CMBSZ_SZU_SHIFT) |
+   ((pci_resource_len(pdev, bar) / SZ_16M) << NVME_CMBSZ_SZ_SHIFT);
+}
+
 static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
 {
u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
@@ -1704,10 +1711,15 @@ static void nvme_map_cmb(struct nvme_dev *dev)
struct pci_dev *pdev = to_pci_dev(dev->dev);
int bar;
 
-   dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
-   if (!dev->cmbsz)
-   return;
-   dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
+   if (dev->ctrl.quirks & NVME_QUIRK_PSEUDO_CMB_BAR4) {
+   dev->cmbsz = nvme_pseudo_cmbsz(pdev, 4);
+   dev->cmbloc = 4;
+   } else {
+   dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
+   if (!dev->cmbsz)
+   return;
+   dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
+   }
 
size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
@@ -2736,6 +2748,10 @@ static const struct pci_device_id nvme_id_table[] = {
.driver_data = NVME_QUIRK_LIGHTNVM, },
{ PCI_DEVICE(0x1d1d, 0x2807),   /* CNEX WL */
.driver_data = NVME_QUIRK_LIGHTNVM, },
+   { PCI_DEVICE(0x11f8, 0xf117),   /* Microsemi NVRAM adaptor */
+   .driver_data = NVME_QUIRK_PSEUDO_CMB_BAR4, },
+   { PCI_DEVICE(0x1db1, 0x0002),   /* Everspin nvNitro adaptor */
+   .driver_data = NVME_QUIRK_PSEUDO_CMB_BAR4,  },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
-- 
2.11.0



Re: general protection fault in wb_workfn

2018-04-23 Thread Tetsuo Handa
On 2018/04/23 19:09, Tetsuo Handa wrote:
> By the way, I got a newbie question regarding commit 5318ce7d46866e1d ("bdi:
> Shutdown writeback on all cgwbs in cgwb_bdi_destroy()"). It uses clear_bit()
> to clear WB_shutting_down bit so that threads waiting at wait_on_bit() will
> wake up. But clear_bit() itself does not wake up threads, does it? Who wakes
> them up (e.g. by calling wake_up_bit()) after clear_bit() was called?
> 

The report below might be waiting for wake_up_bit()?

  INFO: task hung in wb_shutdown (2)
  https://syzkaller.appspot.com/bug?id=b297474817af98d5796bc544e1bb806fc3da0e5e


Re: [PATCH 11/12] swiotlb: move the SWIOTLB config symbol to lib/Kconfig

2018-04-23 Thread Konrad Rzeszutek Wilk
On Mon, Apr 23, 2018 at 07:04:18PM +0200, Christoph Hellwig wrote:
> This way we have one central definition of it, and user can select it as
> needed.  Note that we also add a second ARCH_HAS_SWIOTLB symbol to
> indicate the architecture supports swiotlb at all, so that we can still
> make the usage optional for a few architectures that want this feature
> to be user selectable.

If I follow this select business this will enable it on ARM and x86 by default.

As such:
Reviewed-by: Konrad Rzeszutek Wilk 

Thank you!
> 
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/arm/Kconfig|  4 +---
>  arch/arm64/Kconfig  |  5 ++---
>  arch/ia64/Kconfig   |  9 +
>  arch/mips/Kconfig   |  3 +++
>  arch/mips/cavium-octeon/Kconfig |  5 -
>  arch/mips/loongson64/Kconfig|  8 
>  arch/powerpc/Kconfig|  9 -
>  arch/unicore32/mm/Kconfig   |  5 -
>  arch/x86/Kconfig| 14 +++---
>  lib/Kconfig | 15 +++
>  10 files changed, 25 insertions(+), 52 deletions(-)
> 
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 90b81a3a28a7..f91f69174630 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -106,6 +106,7 @@ config ARM
>   select REFCOUNT_FULL
>   select RTC_LIB
>   select SYS_SUPPORTS_APM_EMULATION
> + select ARCH_HAS_SWIOTLB
>   # Above selects are sorted alphabetically; please add new ones
>   # according to that.  Thanks.
>   help
> @@ -1773,9 +1774,6 @@ config SECCOMP
> and the task is only allowed to execute a few safe syscalls
> defined by each seccomp mode.
>  
> -config SWIOTLB
> - bool
> -
>  config PARAVIRT
>   bool "Enable paravirtualization code"
>   help
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 4d924eb32e7f..056bc7365adf 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -21,6 +21,7 @@ config ARM64
>   select ARCH_HAS_SG_CHAIN
>   select ARCH_HAS_STRICT_KERNEL_RWX
>   select ARCH_HAS_STRICT_MODULE_RWX
> + select ARCH_HAS_SWIOTLB
>   select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
>   select ARCH_HAVE_NMI_SAFE_CMPXCHG
>   select ARCH_INLINE_READ_LOCK if !PREEMPT
> @@ -144,6 +145,7 @@ config ARM64
>   select POWER_SUPPLY
>   select REFCOUNT_FULL
>   select SPARSE_IRQ
> + select SWIOTLB
>   select SYSCTL_EXCEPTION_TRACE
>   select THREAD_INFO_IN_TASK
>   help
> @@ -239,9 +241,6 @@ config HAVE_GENERIC_GUP
>  config SMP
>   def_bool y
>  
> -config SWIOTLB
> - def_bool y
> -
>  config KERNEL_MODE_NEON
>   def_bool y
>  
> diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
> index 685d557eea48..d396230913e6 100644
> --- a/arch/ia64/Kconfig
> +++ b/arch/ia64/Kconfig
> @@ -56,6 +56,7 @@ config IA64
>   select HAVE_ARCH_AUDITSYSCALL
>   select NEED_DMA_MAP_STATE
>   select NEED_SG_DMA_LENGTH
> + select ARCH_HAS_SWIOTLB
>   default y
>   help
> The Itanium Processor Family is Intel's 64-bit successor to
> @@ -80,9 +81,6 @@ config MMU
>   bool
>   default y
>  
> -config SWIOTLB
> -   bool
> -
>  config STACKTRACE_SUPPORT
>   def_bool y
>  
> @@ -139,7 +137,6 @@ config IA64_GENERIC
>   bool "generic"
>   select NUMA
>   select ACPI_NUMA
> - select DMA_DIRECT_OPS
>   select SWIOTLB
>   select PCI_MSI
>   help
> @@ -160,7 +157,6 @@ config IA64_GENERIC
>  
>  config IA64_DIG
>   bool "DIG-compliant"
> - select DMA_DIRECT_OPS
>   select SWIOTLB
>  
>  config IA64_DIG_VTD
> @@ -176,7 +172,6 @@ config IA64_HP_ZX1
>  
>  config IA64_HP_ZX1_SWIOTLB
>   bool "HP-zx1/sx1000 with software I/O TLB"
> - select DMA_DIRECT_OPS
>   select SWIOTLB
>   help
> Build a kernel that runs on HP zx1 and sx1000 systems even when they
> @@ -200,7 +195,6 @@ config IA64_SGI_UV
>   bool "SGI-UV"
>   select NUMA
>   select ACPI_NUMA
> - select DMA_DIRECT_OPS
>   select SWIOTLB
>   help
> Selecting this option will optimize the kernel for use on UV based
> @@ -211,7 +205,6 @@ config IA64_SGI_UV
>  
>  config IA64_HP_SIM
>   bool "Ski-simulator"
> - select DMA_DIRECT_OPS
>   select SWIOTLB
>   depends on !PM
>  
> diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
> index e10cc5c7be69..b6b4c1e154f8 100644
> --- a/arch/mips/Kconfig
> +++ b/arch/mips/Kconfig
> @@ -912,6 +912,8 @@ config CAVIUM_OCTEON_SOC
>   select MIPS_NR_CPU_NR_MAP_1024
>   select BUILTIN_DTB
>   select MTD_COMPLEX_MAPPINGS
> + select ARCH_HAS_SWIOTLB
> + select SWIOTLB
>   select SYS_SUPPORTS_RELOCATABLE
>   help
> This option supports all of the Octeon reference boards from Cavium
> @@ -1367,6 +1369,7 @@ config CPU_LOONGSON3
>   select MIPS_PGD_C0_CONTEXT
>   select MIPS_L1_CACHE_SHIFT_6
>   select 

Re: [PATCH 10/12] arm: don't build swiotlb by default

2018-04-23 Thread Konrad Rzeszutek Wilk
On Mon, Apr 23, 2018 at 07:04:17PM +0200, Christoph Hellwig wrote:
> swiotlb is only used as a library of helper for xen-swiotlb if Xen support
> is enabled on arm, so don't build it by default.
> 

CCing Stefano
> Signed-off-by: Christoph Hellwig 
> ---
>  arch/arm/Kconfig | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index aa1c187d756d..90b81a3a28a7 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -1774,7 +1774,7 @@ config SECCOMP
> defined by each seccomp mode.
>  
>  config SWIOTLB
> - def_bool y
> + bool
>  
>  config PARAVIRT
>   bool "Enable paravirtualization code"
> @@ -1807,6 +1807,7 @@ config XEN
>   depends on MMU
>   select ARCH_DMA_ADDR_T_64BIT
>   select ARM_PSCI
> + select SWIOTLB
>   select SWIOTLB_XEN
>   select PARAVIRT
>   help
> -- 
> 2.17.0
> 


Re: [PATCH] bsg referencing bus driver module

2018-04-23 Thread Anatoliy Glagolev
Thanks, James. The idea of cutting communications with Scsi_Host at
bsg_unregister_queue(..) time and leaving bsg_class_device to
its own fate makes a lot of sense, conceptually. But there are
implementation issues that are difficult to work around.

bsg.c creates bsg_class_device and takes a reference to Scsi_Host
at bsg_register_queue(..) time. The reference is dropped at
bsg_class_device's release(..) function. If the driver implementing the
Scsi_Host template is not around, we crash.
We could move the reference drop from bsg_class_device's release(..)
function to bsg_unregister_queue(..). That would be a small change in
bsg.c. But bsg.c sets Scsi_Host as the parent of bsg_class_device's
device. We cannot have a device around with a dangling parent.
A device's parent cannot be changed dynamically. Not setting
the device's parent at creation may affect software relying
on bsg_class_device - Scsi_Host child-parent relations.

It looks like I am out of options. Do you have suggestions on
how to work around Scsi_Host being bsg_class_device's parent?



[PATCH 10/12] arm: don't build swiotlb by default

2018-04-23 Thread Christoph Hellwig
swiotlb is only used as a library of helpers for xen-swiotlb if Xen support
is enabled on arm, so don't build it by default.

Signed-off-by: Christoph Hellwig 
---
 arch/arm/Kconfig | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index aa1c187d756d..90b81a3a28a7 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1774,7 +1774,7 @@ config SECCOMP
  defined by each seccomp mode.
 
 config SWIOTLB
-   def_bool y
+   bool
 
 config PARAVIRT
bool "Enable paravirtualization code"
@@ -1807,6 +1807,7 @@ config XEN
depends on MMU
select ARCH_DMA_ADDR_T_64BIT
select ARM_PSCI
+   select SWIOTLB
select SWIOTLB_XEN
select PARAVIRT
help
-- 
2.17.0



[PATCH 09/12] PCI: remove CONFIG_PCI_BUS_ADDR_T_64BIT

2018-04-23 Thread Christoph Hellwig
This symbol is now always identical to CONFIG_ARCH_DMA_ADDR_T_64BIT, so
remove it.

Signed-off-by: Christoph Hellwig 
Acked-by: Bjorn Helgaas 
---
 drivers/pci/Kconfig | 4 
 drivers/pci/bus.c   | 4 ++--
 include/linux/pci.h | 2 +-
 3 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 34b56a8f8480..29a487f31dae 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -5,10 +5,6 @@
 
 source "drivers/pci/pcie/Kconfig"
 
-config PCI_BUS_ADDR_T_64BIT
-   def_bool y if (ARCH_DMA_ADDR_T_64BIT || 64BIT)
-   depends on PCI
-
 config PCI_MSI
bool "Message Signaled Interrupts (MSI and MSI-X)"
depends on PCI
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index bc2ded4c451f..35b7fc87eac5 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -120,7 +120,7 @@ int devm_request_pci_bus_resources(struct device *dev,
 EXPORT_SYMBOL_GPL(devm_request_pci_bus_resources);
 
 static struct pci_bus_region pci_32_bit = {0, 0xULL};
-#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 static struct pci_bus_region pci_64_bit = {0,
(pci_bus_addr_t) 0xULL};
 static struct pci_bus_region pci_high = {(pci_bus_addr_t) 0x1ULL,
@@ -230,7 +230,7 @@ int pci_bus_alloc_resource(struct pci_bus *bus, struct 
resource *res,
  resource_size_t),
void *alignf_data)
 {
-#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
int rc;
 
if (res->flags & IORESOURCE_MEM_64) {
diff --git a/include/linux/pci.h b/include/linux/pci.h
index 73178a2fcee0..55371cb827ad 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -670,7 +670,7 @@ int raw_pci_read(unsigned int domain, unsigned int bus, 
unsigned int devfn,
 int raw_pci_write(unsigned int domain, unsigned int bus, unsigned int devfn,
  int reg, int len, u32 val);
 
-#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
+#ifdef CONFIG_ARCH_DMA_ADDR_T_64BIT
 typedef u64 pci_bus_addr_t;
 #else
 typedef u32 pci_bus_addr_t;
-- 
2.17.0



[PATCH 11/12] swiotlb: move the SWIOTLB config symbol to lib/Kconfig

2018-04-23 Thread Christoph Hellwig
This way we have one central definition of it, and users can select it as
needed.  Note that we also add a second ARCH_HAS_SWIOTLB symbol to
indicate the architecture supports swiotlb at all, so that we can still
make the usage optional for a few architectures that want this feature
to be user selectable.

Signed-off-by: Christoph Hellwig 
---
 arch/arm/Kconfig|  4 +---
 arch/arm64/Kconfig  |  5 ++---
 arch/ia64/Kconfig   |  9 +
 arch/mips/Kconfig   |  3 +++
 arch/mips/cavium-octeon/Kconfig |  5 -
 arch/mips/loongson64/Kconfig|  8 
 arch/powerpc/Kconfig|  9 -
 arch/unicore32/mm/Kconfig   |  5 -
 arch/x86/Kconfig| 14 +++---
 lib/Kconfig | 15 +++
 10 files changed, 25 insertions(+), 52 deletions(-)

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 90b81a3a28a7..f91f69174630 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -106,6 +106,7 @@ config ARM
select REFCOUNT_FULL
select RTC_LIB
select SYS_SUPPORTS_APM_EMULATION
+   select ARCH_HAS_SWIOTLB
# Above selects are sorted alphabetically; please add new ones
# according to that.  Thanks.
help
@@ -1773,9 +1774,6 @@ config SECCOMP
  and the task is only allowed to execute a few safe syscalls
  defined by each seccomp mode.
 
-config SWIOTLB
-   bool
-
 config PARAVIRT
bool "Enable paravirtualization code"
help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 4d924eb32e7f..056bc7365adf 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -21,6 +21,7 @@ config ARM64
select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
+   select ARCH_HAS_SWIOTLB
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select ARCH_HAVE_NMI_SAFE_CMPXCHG
select ARCH_INLINE_READ_LOCK if !PREEMPT
@@ -144,6 +145,7 @@ config ARM64
select POWER_SUPPLY
select REFCOUNT_FULL
select SPARSE_IRQ
+   select SWIOTLB
select SYSCTL_EXCEPTION_TRACE
select THREAD_INFO_IN_TASK
help
@@ -239,9 +241,6 @@ config HAVE_GENERIC_GUP
 config SMP
def_bool y
 
-config SWIOTLB
-   def_bool y
-
 config KERNEL_MODE_NEON
def_bool y
 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 685d557eea48..d396230913e6 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -56,6 +56,7 @@ config IA64
select HAVE_ARCH_AUDITSYSCALL
select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
+   select ARCH_HAS_SWIOTLB
default y
help
  The Itanium Processor Family is Intel's 64-bit successor to
@@ -80,9 +81,6 @@ config MMU
bool
default y
 
-config SWIOTLB
-   bool
-
 config STACKTRACE_SUPPORT
def_bool y
 
@@ -139,7 +137,6 @@ config IA64_GENERIC
bool "generic"
select NUMA
select ACPI_NUMA
-   select DMA_DIRECT_OPS
select SWIOTLB
select PCI_MSI
help
@@ -160,7 +157,6 @@ config IA64_GENERIC
 
 config IA64_DIG
bool "DIG-compliant"
-   select DMA_DIRECT_OPS
select SWIOTLB
 
 config IA64_DIG_VTD
@@ -176,7 +172,6 @@ config IA64_HP_ZX1
 
 config IA64_HP_ZX1_SWIOTLB
bool "HP-zx1/sx1000 with software I/O TLB"
-   select DMA_DIRECT_OPS
select SWIOTLB
help
  Build a kernel that runs on HP zx1 and sx1000 systems even when they
@@ -200,7 +195,6 @@ config IA64_SGI_UV
bool "SGI-UV"
select NUMA
select ACPI_NUMA
-   select DMA_DIRECT_OPS
select SWIOTLB
help
  Selecting this option will optimize the kernel for use on UV based
@@ -211,7 +205,6 @@ config IA64_SGI_UV
 
 config IA64_HP_SIM
bool "Ski-simulator"
-   select DMA_DIRECT_OPS
select SWIOTLB
depends on !PM
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index e10cc5c7be69..b6b4c1e154f8 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -912,6 +912,8 @@ config CAVIUM_OCTEON_SOC
select MIPS_NR_CPU_NR_MAP_1024
select BUILTIN_DTB
select MTD_COMPLEX_MAPPINGS
+   select ARCH_HAS_SWIOTLB
+   select SWIOTLB
select SYS_SUPPORTS_RELOCATABLE
help
  This option supports all of the Octeon reference boards from Cavium
@@ -1367,6 +1369,7 @@ config CPU_LOONGSON3
select MIPS_PGD_C0_CONTEXT
select MIPS_L1_CACHE_SHIFT_6
select GPIOLIB
+   select ARCH_HAS_SWIOTLB
help
The Loongson 3 processor implements the MIPS64R2 instruction
set with many extensions.
diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig
index 5d73041547a7..4984e462be30 100644
--- a/arch/mips/cavium-octeon/Kconfig
+++ b/arch/mips/cavium-octeon/Kconfig
@@ -67,11 +67,6 

[PATCH 12/12] swiotlb: remove the CONFIG_DMA_DIRECT_OPS ifdefs

2018-04-23 Thread Christoph Hellwig
swiotlb now selects the DMA_DIRECT_OPS config symbol, so this will
always be true.

Signed-off-by: Christoph Hellwig 
---
 lib/swiotlb.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/lib/swiotlb.c b/lib/swiotlb.c
index fece57566d45..6954f7ad200a 100644
--- a/lib/swiotlb.c
+++ b/lib/swiotlb.c
@@ -692,7 +692,6 @@ void swiotlb_tbl_sync_single(struct device *hwdev, 
phys_addr_t tlb_addr,
}
 }
 
-#ifdef CONFIG_DMA_DIRECT_OPS
 static inline bool dma_coherent_ok(struct device *dev, dma_addr_t addr,
size_t size)
 {
@@ -764,7 +763,6 @@ static bool swiotlb_free_buffer(struct device *dev, size_t 
size,
 DMA_ATTR_SKIP_CPU_SYNC);
return true;
 }
-#endif
 
 static void
 swiotlb_full(struct device *dev, size_t size, enum dma_data_direction dir,
@@ -1045,7 +1043,6 @@ swiotlb_dma_supported(struct device *hwdev, u64 mask)
return __phys_to_dma(hwdev, io_tlb_end - 1) <= mask;
 }
 
-#ifdef CONFIG_DMA_DIRECT_OPS
 void *swiotlb_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
gfp_t gfp, unsigned long attrs)
 {
@@ -1089,4 +1086,3 @@ const struct dma_map_ops swiotlb_dma_ops = {
.unmap_page = swiotlb_unmap_page,
.dma_supported  = dma_direct_supported,
 };
-#endif /* CONFIG_DMA_DIRECT_OPS */
-- 
2.17.0



[PATCH 05/12] scatterlist: move the NEED_SG_DMA_LENGTH config symbol to lib/Kconfig

2018-04-23 Thread Christoph Hellwig
This way we have one central definition of it, and users can select it as
needed.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 arch/alpha/Kconfig  | 4 +---
 arch/arm/Kconfig| 3 ---
 arch/arm64/Kconfig  | 4 +---
 arch/hexagon/Kconfig| 4 +---
 arch/ia64/Kconfig   | 4 +---
 arch/mips/cavium-octeon/Kconfig | 3 ---
 arch/mips/loongson64/Kconfig| 3 ---
 arch/mips/netlogic/Kconfig  | 3 ---
 arch/parisc/Kconfig | 4 +---
 arch/powerpc/Kconfig| 4 +---
 arch/s390/Kconfig   | 4 +---
 arch/sh/Kconfig | 5 ++---
 arch/sparc/Kconfig  | 4 +---
 arch/unicore32/mm/Kconfig   | 5 +
 arch/x86/Kconfig| 4 +---
 lib/Kconfig | 3 +++
 16 files changed, 15 insertions(+), 46 deletions(-)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 3ff735a722af..8e6a67ecf069 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -10,6 +10,7 @@ config ALPHA
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
+   select NEED_SG_DMA_LENGTH
select VIRT_TO_BUS
select GENERIC_IRQ_PROBE
select AUTO_IRQ_AFFINITY if SMP
@@ -70,9 +71,6 @@ config ARCH_DMA_ADDR_T_64BIT
 config NEED_DMA_MAP_STATE
def_bool y
 
-config NEED_SG_DMA_LENGTH
-   def_bool y
-
 config GENERIC_ISA_DMA
bool
default y
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 2f79222c5c02..602c8320282f 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -119,9 +119,6 @@ config ARM_HAS_SG_CHAIN
select ARCH_HAS_SG_CHAIN
bool
 
-config NEED_SG_DMA_LENGTH
-   bool
-
 config ARM_DMA_USE_IOMMU
bool
select ARM_HAS_SG_CHAIN
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index fbef5d3de83f..3b441c5587f1 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -133,6 +133,7 @@ config ARM64
select IRQ_FORCED_THREADING
select MODULES_USE_ELF_RELA
select MULTI_IRQ_HANDLER
+   select NEED_SG_DMA_LENGTH
select NO_BOOTMEM
select OF
select OF_EARLY_FLATTREE
@@ -243,9 +244,6 @@ config ARCH_DMA_ADDR_T_64BIT
 config NEED_DMA_MAP_STATE
def_bool y
 
-config NEED_SG_DMA_LENGTH
-   def_bool y
-
 config SMP
def_bool y
 
diff --git a/arch/hexagon/Kconfig b/arch/hexagon/Kconfig
index 76d2f20d525e..37adb2003033 100644
--- a/arch/hexagon/Kconfig
+++ b/arch/hexagon/Kconfig
@@ -19,6 +19,7 @@ config HEXAGON
select GENERIC_IRQ_SHOW
select HAVE_ARCH_KGDB
select HAVE_ARCH_TRACEHOOK
+   select NEED_SG_DMA_LENGTH
select NO_IOPORT_MAP
select GENERIC_IOMAP
select GENERIC_SMP_IDLE_THREAD
@@ -63,9 +64,6 @@ config GENERIC_CSUM
 config GENERIC_IRQ_PROBE
def_bool y
 
-config NEED_SG_DMA_LENGTH
-   def_bool y
-
 config RWSEM_GENERIC_SPINLOCK
def_bool n
 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 862c5160c09d..333917676f7f 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -54,6 +54,7 @@ config IA64
select MODULES_USE_ELF_RELA
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_AUDITSYSCALL
+   select NEED_SG_DMA_LENGTH
default y
help
  The Itanium Processor Family is Intel's 64-bit successor to
@@ -84,9 +85,6 @@ config ARCH_DMA_ADDR_T_64BIT
 config NEED_DMA_MAP_STATE
def_bool y
 
-config NEED_SG_DMA_LENGTH
-   def_bool y
-
 config SWIOTLB
bool
 
diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig
index 647ed158ac98..5d73041547a7 100644
--- a/arch/mips/cavium-octeon/Kconfig
+++ b/arch/mips/cavium-octeon/Kconfig
@@ -67,9 +67,6 @@ config CAVIUM_OCTEON_LOCK_L2_MEMCPY
help
  Lock the kernel's implementation of memcpy() into L2.
 
-config NEED_SG_DMA_LENGTH
-   bool
-
 config SWIOTLB
def_bool y
select DMA_DIRECT_OPS
diff --git a/arch/mips/loongson64/Kconfig b/arch/mips/loongson64/Kconfig
index 5efb2e63878e..641a1477031e 100644
--- a/arch/mips/loongson64/Kconfig
+++ b/arch/mips/loongson64/Kconfig
@@ -130,9 +130,6 @@ config LOONGSON_UART_BASE
default y
depends on EARLY_PRINTK || SERIAL_8250
 
-config NEED_SG_DMA_LENGTH
-   bool
-
 config SWIOTLB
bool "Soft IOMMU Support for All-Memory DMA"
default y
diff --git a/arch/mips/netlogic/Kconfig b/arch/mips/netlogic/Kconfig
index 5c5ee0e05a17..412351c5acc6 100644
--- a/arch/mips/netlogic/Kconfig
+++ b/arch/mips/netlogic/Kconfig
@@ -83,7 +83,4 @@ endif
 config NLM_COMMON
bool
 
-config NEED_SG_DMA_LENGTH
-   bool
-
 endif
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index fc5a574c3482..89caea87556e 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -51,6 +51,7 @@ config PARISC
select GENERIC_CLOCKEVENTS
select 

[PATCH 08/12] arch: define the ARCH_DMA_ADDR_T_64BIT config symbol in lib/Kconfig

2018-04-23 Thread Christoph Hellwig
Define this symbol if the architecture either uses 64-bit pointers or the
PHYS_ADDR_T_64BIT is set.  This covers 95% of the old arch magic.  We only
need an additional select for Xen on ARM (why anyway?), and we now always
set ARCH_DMA_ADDR_T_64BIT on mips boards with 64-bit physical addressing
instead of only doing it when highmem is set.

Signed-off-by: Christoph Hellwig 
---
 arch/alpha/Kconfig | 3 ---
 arch/arc/Kconfig   | 3 ---
 arch/arm/mach-axxia/Kconfig| 1 -
 arch/arm/mach-bcm/Kconfig  | 1 -
 arch/arm/mach-exynos/Kconfig   | 1 -
 arch/arm/mach-highbank/Kconfig | 1 -
 arch/arm/mach-rockchip/Kconfig | 1 -
 arch/arm/mach-shmobile/Kconfig | 1 -
 arch/arm/mach-tegra/Kconfig| 1 -
 arch/arm/mm/Kconfig| 3 ---
 arch/arm64/Kconfig | 3 ---
 arch/ia64/Kconfig  | 3 ---
 arch/mips/Kconfig  | 3 ---
 arch/powerpc/Kconfig   | 3 ---
 arch/riscv/Kconfig | 3 ---
 arch/s390/Kconfig  | 3 ---
 arch/sparc/Kconfig | 4 
 arch/x86/Kconfig   | 4 
 lib/Kconfig| 3 +++
 19 files changed, 3 insertions(+), 42 deletions(-)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 1fd9645b0c67..aa7df1a36fd0 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -66,9 +66,6 @@ config ZONE_DMA
bool
default y
 
-config ARCH_DMA_ADDR_T_64BIT
-   def_bool y
-
 config GENERIC_ISA_DMA
bool
default y
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index f94c61da682a..7498aca4b887 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -458,9 +458,6 @@ config ARC_HAS_PAE40
  Enable access to physical memory beyond 4G, only supported on
  ARC cores with 40 bit Physical Addressing support
 
-config ARCH_DMA_ADDR_T_64BIT
-   bool
-
 config ARC_KVADDR_SIZE
int "Kernel Virtual Address Space size (MB)"
range 0 512
diff --git a/arch/arm/mach-axxia/Kconfig b/arch/arm/mach-axxia/Kconfig
index bb2ce1c63fd9..d3eae6037913 100644
--- a/arch/arm/mach-axxia/Kconfig
+++ b/arch/arm/mach-axxia/Kconfig
@@ -2,7 +2,6 @@
 config ARCH_AXXIA
bool "LSI Axxia platforms"
depends on ARCH_MULTI_V7 && ARM_LPAE
-   select ARCH_DMA_ADDR_T_64BIT
select ARM_AMBA
select ARM_GIC
select ARM_TIMER_SP804
diff --git a/arch/arm/mach-bcm/Kconfig b/arch/arm/mach-bcm/Kconfig
index c2f3b0d216a4..c46a728df44e 100644
--- a/arch/arm/mach-bcm/Kconfig
+++ b/arch/arm/mach-bcm/Kconfig
@@ -211,7 +211,6 @@ config ARCH_BRCMSTB
select BRCMSTB_L2_IRQ
select BCM7120_L2_IRQ
select ARCH_HAS_HOLES_MEMORYMODEL
-   select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
select ZONE_DMA if ARM_LPAE
select SOC_BRCMSTB
select SOC_BUS
diff --git a/arch/arm/mach-exynos/Kconfig b/arch/arm/mach-exynos/Kconfig
index 647c319f9f5f..2ca405816846 100644
--- a/arch/arm/mach-exynos/Kconfig
+++ b/arch/arm/mach-exynos/Kconfig
@@ -112,7 +112,6 @@ config SOC_EXYNOS5440
bool "SAMSUNG EXYNOS5440"
default y
depends on ARCH_EXYNOS5
-   select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
select HAVE_ARM_ARCH_TIMER
select AUTO_ZRELADDR
select PINCTRL_EXYNOS5440
diff --git a/arch/arm/mach-highbank/Kconfig b/arch/arm/mach-highbank/Kconfig
index 81110ec34226..5552968f07f8 100644
--- a/arch/arm/mach-highbank/Kconfig
+++ b/arch/arm/mach-highbank/Kconfig
@@ -1,7 +1,6 @@
 config ARCH_HIGHBANK
bool "Calxeda ECX-1000/2000 (Highbank/Midway)"
depends on ARCH_MULTI_V7
-   select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
select ARCH_HAS_HOLES_MEMORYMODEL
select ARCH_SUPPORTS_BIG_ENDIAN
select ARM_AMBA
diff --git a/arch/arm/mach-rockchip/Kconfig b/arch/arm/mach-rockchip/Kconfig
index a4065966881a..fafd3d7f9f8c 100644
--- a/arch/arm/mach-rockchip/Kconfig
+++ b/arch/arm/mach-rockchip/Kconfig
@@ -3,7 +3,6 @@ config ARCH_ROCKCHIP
depends on ARCH_MULTI_V7
select PINCTRL
select PINCTRL_ROCKCHIP
-   select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
select ARCH_HAS_RESET_CONTROLLER
select ARM_AMBA
select ARM_GIC
diff --git a/arch/arm/mach-shmobile/Kconfig b/arch/arm/mach-shmobile/Kconfig
index 280e7312a9e1..fe60cd09a5ca 100644
--- a/arch/arm/mach-shmobile/Kconfig
+++ b/arch/arm/mach-shmobile/Kconfig
@@ -29,7 +29,6 @@ config ARCH_RMOBILE
 menuconfig ARCH_RENESAS
bool "Renesas ARM SoCs"
depends on ARCH_MULTI_V7 && MMU
-   select ARCH_DMA_ADDR_T_64BIT if ARM_LPAE
select ARCH_SHMOBILE
select ARM_GIC
select GPIOLIB
diff --git a/arch/arm/mach-tegra/Kconfig b/arch/arm/mach-tegra/Kconfig
index 1e0aeb47bac6..7f3b83e0d324 100644
--- a/arch/arm/mach-tegra/Kconfig
+++ b/arch/arm/mach-tegra/Kconfig
@@ -15,6 +15,5 @@ menuconfig ARCH_TEGRA
select RESET_CONTROLLER
select SOC_BUS
select ZONE_DMA if ARM_LPAE
-   select ARCH_DMA_ADDR_T_64BIT 

[PATCH 06/12] dma-mapping: move the NEED_DMA_MAP_STATE config symbol to lib/Kconfig

2018-04-23 Thread Christoph Hellwig
This way we have one central definition of it, and users can select it as
needed.  Note that we now also always select it when CONFIG_DMA_API_DEBUG
is selected, which fixes some incorrect checks in a few network drivers.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 arch/alpha/Kconfig  | 4 +---
 arch/arm/Kconfig| 4 +---
 arch/arm64/Kconfig  | 4 +---
 arch/ia64/Kconfig   | 4 +---
 arch/mips/Kconfig   | 3 ---
 arch/parisc/Kconfig | 4 +---
 arch/s390/Kconfig   | 4 +---
 arch/sh/Kconfig | 4 +---
 arch/sparc/Kconfig  | 4 +---
 arch/unicore32/Kconfig  | 4 +---
 arch/x86/Kconfig| 6 ++
 drivers/iommu/Kconfig   | 1 +
 include/linux/dma-mapping.h | 2 +-
 lib/Kconfig | 3 +++
 lib/Kconfig.debug   | 1 +
 15 files changed, 17 insertions(+), 35 deletions(-)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 8e6a67ecf069..1fd9645b0c67 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -10,6 +10,7 @@ config ALPHA
select HAVE_OPROFILE
select HAVE_PCSPKR_PLATFORM
select HAVE_PERF_EVENTS
+   select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
select VIRT_TO_BUS
select GENERIC_IRQ_PROBE
@@ -68,9 +69,6 @@ config ZONE_DMA
 config ARCH_DMA_ADDR_T_64BIT
def_bool y
 
-config NEED_DMA_MAP_STATE
-   def_bool y
-
 config GENERIC_ISA_DMA
bool
default y
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 602c8320282f..aa1c187d756d 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -96,6 +96,7 @@ config ARM
select HAVE_VIRT_CPU_ACCOUNTING_GEN
select IRQ_FORCED_THREADING
select MODULES_USE_ELF_REL
+   select NEED_DMA_MAP_STATE
select NO_BOOTMEM
select OF_EARLY_FLATTREE if OF
select OF_RESERVED_MEM if OF
@@ -221,9 +222,6 @@ config ARCH_MAY_HAVE_PC_FDC
 config ZONE_DMA
bool
 
-config NEED_DMA_MAP_STATE
-   def_bool y
-
 config ARCH_SUPPORTS_UPROBES
def_bool y
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 3b441c5587f1..940adfb9a2bc 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -133,6 +133,7 @@ config ARM64
select IRQ_FORCED_THREADING
select MODULES_USE_ELF_RELA
select MULTI_IRQ_HANDLER
+   select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
select NO_BOOTMEM
select OF
@@ -241,9 +242,6 @@ config HAVE_GENERIC_GUP
 config ARCH_DMA_ADDR_T_64BIT
def_bool y
 
-config NEED_DMA_MAP_STATE
-   def_bool y
-
 config SMP
def_bool y
 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 333917676f7f..0e42731adaf1 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -54,6 +54,7 @@ config IA64
select MODULES_USE_ELF_RELA
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_ARCH_AUDITSYSCALL
+   select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
default y
help
@@ -82,9 +83,6 @@ config MMU
 config ARCH_DMA_ADDR_T_64BIT
def_bool y
 
-config NEED_DMA_MAP_STATE
-   def_bool y
-
 config SWIOTLB
bool
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 225c95da23ce..47d72c64d687 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1122,9 +1122,6 @@ config DMA_NONCOHERENT
bool
select NEED_DMA_MAP_STATE
 
-config NEED_DMA_MAP_STATE
-   bool
-
 config SYS_HAS_EARLY_PRINTK
bool
 
diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig
index 89caea87556e..4d8f64d48597 100644
--- a/arch/parisc/Kconfig
+++ b/arch/parisc/Kconfig
@@ -51,6 +51,7 @@ config PARISC
select GENERIC_CLOCKEVENTS
select ARCH_NO_COHERENT_DMA_MMAP
select CPU_NO_EFFICIENT_FFS
+   select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
 
help
@@ -112,9 +113,6 @@ config PM
 config STACKTRACE_SUPPORT
def_bool y
 
-config NEED_DMA_MAP_STATE
-   def_bool y
-
 config ISA_DMA_API
bool
 
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index f80c6b983159..89a007672f70 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -711,6 +711,7 @@ menuconfig PCI
select PCI_MSI
select IOMMU_HELPER
select IOMMU_SUPPORT
+   select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
 
help
@@ -736,9 +737,6 @@ config PCI_DOMAINS
 config HAS_IOMEM
def_bool PCI
 
-config NEED_DMA_MAP_STATE
-   def_bool PCI
-
 config CHSC_SCH
def_tristate m
prompt "Support for CHSC subchannels"
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index e127e0cbe30f..9417f70e008e 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -50,6 +50,7 @@ config SUPERH
select HAVE_ARCH_AUDITSYSCALL
select HAVE_FUTEX_CMPXCHG if FUTEX
select HAVE_NMI
+   select NEED_DMA_MAP_STATE
select NEED_SG_DMA_LENGTH
 
help
@@ 

[PATCH 04/12] iommu-helper: move the IOMMU_HELPER config symbol to lib/

2018-04-23 Thread Christoph Hellwig
This way we have one central definition of it, and users can select it as
needed.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 arch/powerpc/Kconfig | 4 +---
 arch/s390/Kconfig| 5 ++---
 arch/sparc/Kconfig   | 5 +
 arch/x86/Kconfig | 6 ++
 lib/Kconfig  | 3 +++
 5 files changed, 9 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 43e3c8e4e7f4..7698cf89af9c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -223,6 +223,7 @@ config PPC
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_VIRT_CPU_ACCOUNTING
select HAVE_IRQ_TIME_ACCOUNTING
+   select IOMMU_HELPER if PPC64
select IRQ_DOMAIN
select IRQ_FORCED_THREADING
select MODULES_USE_ELF_RELA
@@ -478,9 +479,6 @@ config MPROFILE_KERNEL
depends on PPC64 && CPU_LITTLE_ENDIAN
def_bool !DISABLE_MPROFILE_KERNEL
 
-config IOMMU_HELPER
-   def_bool PPC64
-
 config SWIOTLB
bool "SWIOTLB support"
default n
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 199ac3e4da1d..60c4ab854182 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -709,7 +709,9 @@ config QDIO
 menuconfig PCI
bool "PCI support"
select PCI_MSI
+   select IOMMU_HELPER
select IOMMU_SUPPORT
+
help
  Enable PCI support.
 
@@ -733,9 +735,6 @@ config PCI_DOMAINS
 config HAS_IOMEM
def_bool PCI
 
-config IOMMU_HELPER
-   def_bool PCI
-
 config NEED_SG_DMA_LENGTH
def_bool PCI
 
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 8767e45f1b2b..44e0f3cd7988 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -67,6 +67,7 @@ config SPARC64
select HAVE_SYSCALL_TRACEPOINTS
select HAVE_CONTEXT_TRACKING
select HAVE_DEBUG_KMEMLEAK
+   select IOMMU_HELPER
select SPARSE_IRQ
select RTC_DRV_CMOS
select RTC_DRV_BQ4802
@@ -106,10 +107,6 @@ config ARCH_DMA_ADDR_T_64BIT
bool
default y if ARCH_ATU
 
-config IOMMU_HELPER
-   bool
-   default y if SPARC64
-
 config STACKTRACE_SUPPORT
bool
default y if SPARC64
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cb2c7ecc1fea..fe9713539166 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -871,6 +871,7 @@ config DMI
 
 config GART_IOMMU
bool "Old AMD GART IOMMU support"
+   select IOMMU_HELPER
select SWIOTLB
depends on X86_64 && PCI && AMD_NB
---help---
@@ -892,6 +893,7 @@ config GART_IOMMU
 
 config CALGARY_IOMMU
bool "IBM Calgary IOMMU support"
+   select IOMMU_HELPER
select SWIOTLB
depends on X86_64 && PCI
---help---
@@ -929,10 +931,6 @@ config SWIOTLB
  with more than 3 GB of memory.
  If unsure, say Y.
 
-config IOMMU_HELPER
-   def_bool y
-   depends on CALGARY_IOMMU || GART_IOMMU
-
 config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
depends on X86_64 && SMP && DEBUG_KERNEL
diff --git a/lib/Kconfig b/lib/Kconfig
index 5fe577673b98..2f6908577534 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -429,6 +429,9 @@ config SGL_ALLOC
bool
default n
 
+config IOMMU_HELPER
+   bool
+
 config DMA_DIRECT_OPS
bool
depends on HAS_DMA && (!64BIT || ARCH_DMA_ADDR_T_64BIT)
-- 
2.17.0



[PATCH 07/12] arch: remove the ARCH_PHYS_ADDR_T_64BIT config symbol

2018-04-23 Thread Christoph Hellwig
Instead select the PHYS_ADDR_T_64BIT for 32-bit architectures that need a
64-bit phys_addr_t type directly.

Signed-off-by: Christoph Hellwig 
---
 arch/arc/Kconfig   |  4 +---
 arch/arm/kernel/setup.c|  2 +-
 arch/arm/mm/Kconfig|  4 +---
 arch/arm64/Kconfig |  3 ---
 arch/mips/Kconfig  | 15 ++-
 arch/powerpc/Kconfig   |  5 +
 arch/powerpc/platforms/Kconfig.cputype |  1 +
 arch/riscv/Kconfig |  6 ++
 arch/x86/Kconfig   |  5 +
 mm/Kconfig |  2 +-
 10 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index d76bf4a83740..f94c61da682a 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -453,13 +453,11 @@ config ARC_HAS_PAE40
default n
depends on ISA_ARCV2
select HIGHMEM
+   select PHYS_ADDR_T_64BIT
help
  Enable access to physical memory beyond 4G, only supported on
  ARC cores with 40 bit Physical Addressing support
 
-config ARCH_PHYS_ADDR_T_64BIT
-   def_bool ARC_HAS_PAE40
-
 config ARCH_DMA_ADDR_T_64BIT
bool
 
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index fc40a2b40595..35ca494c028c 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -754,7 +754,7 @@ int __init arm_add_memory(u64 start, u64 size)
else
size -= aligned_start - start;
 
-#ifndef CONFIG_ARCH_PHYS_ADDR_T_64BIT
+#ifndef CONFIG_PHYS_ADDR_T_64BIT
if (aligned_start > ULONG_MAX) {
pr_crit("Ignoring memory at 0x%08llx outside 32-bit physical 
address space\n",
(long long)start);
diff --git a/arch/arm/mm/Kconfig b/arch/arm/mm/Kconfig
index 7f14acf67caf..2f77c6344ef1 100644
--- a/arch/arm/mm/Kconfig
+++ b/arch/arm/mm/Kconfig
@@ -661,6 +661,7 @@ config ARM_LPAE
bool "Support for the Large Physical Address Extension"
depends on MMU && CPU_32v7 && !CPU_32v6 && !CPU_32v5 && \
!CPU_32v4 && !CPU_32v3
+   select PHYS_ADDR_T_64BIT
help
  Say Y if you have an ARMv7 processor supporting the LPAE page
  table format and you would like to access memory beyond the
@@ -673,9 +674,6 @@ config ARM_PV_FIXUP
def_bool y
depends on ARM_LPAE && ARM_PATCH_PHYS_VIRT && ARCH_KEYSTONE
 
-config ARCH_PHYS_ADDR_T_64BIT
-   def_bool ARM_LPAE
-
 config ARCH_DMA_ADDR_T_64BIT
bool
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 940adfb9a2bc..b6aa33e642cc 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -152,9 +152,6 @@ config ARM64
 config 64BIT
def_bool y
 
-config ARCH_PHYS_ADDR_T_64BIT
-   def_bool y
-
 config MMU
def_bool y
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 47d72c64d687..985388078872 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -132,7 +132,7 @@ config MIPS_GENERIC
 
 config MIPS_ALCHEMY
bool "Alchemy processor based machines"
-   select ARCH_PHYS_ADDR_T_64BIT
+   select PHYS_ADDR_T_64BIT
select CEVT_R4K
select CSRC_R4K
select IRQ_MIPS_CPU
@@ -890,7 +890,7 @@ config CAVIUM_OCTEON_SOC
bool "Cavium Networks Octeon SoC based boards"
select CEVT_R4K
select ARCH_HAS_PHYS_TO_DMA
-   select ARCH_PHYS_ADDR_T_64BIT
+   select PHYS_ADDR_T_64BIT
select DMA_COHERENT
select SYS_SUPPORTS_64BIT_KERNEL
select SYS_SUPPORTS_BIG_ENDIAN
@@ -936,7 +936,7 @@ config NLM_XLR_BOARD
select SWAP_IO_SPACE
select SYS_SUPPORTS_32BIT_KERNEL
select SYS_SUPPORTS_64BIT_KERNEL
-   select ARCH_PHYS_ADDR_T_64BIT
+   select PHYS_ADDR_T_64BIT
select SYS_SUPPORTS_BIG_ENDIAN
select SYS_SUPPORTS_HIGHMEM
select DMA_COHERENT
@@ -962,7 +962,7 @@ config NLM_XLP_BOARD
select HW_HAS_PCI
select SYS_SUPPORTS_32BIT_KERNEL
select SYS_SUPPORTS_64BIT_KERNEL
-   select ARCH_PHYS_ADDR_T_64BIT
+   select PHYS_ADDR_T_64BIT
select GPIOLIB
select SYS_SUPPORTS_BIG_ENDIAN
select SYS_SUPPORTS_LITTLE_ENDIAN
@@ -1102,7 +1102,7 @@ config FW_CFE
bool
 
 config ARCH_DMA_ADDR_T_64BIT
-   def_bool (HIGHMEM && ARCH_PHYS_ADDR_T_64BIT) || 64BIT
+   def_bool (HIGHMEM && PHYS_ADDR_T_64BIT) || 64BIT
 
 config ARCH_SUPPORTS_UPROBES
bool
@@ -1767,7 +1767,7 @@ config CPU_MIPS32_R5_XPA
depends on SYS_SUPPORTS_HIGHMEM
select XPA
select HIGHMEM
-   select ARCH_PHYS_ADDR_T_64BIT
+   select PHYS_ADDR_T_64BIT
default n
help
  Choose this option if you want to enable the Extended Physical
@@ -2399,9 +2399,6 @@ config SB1_PASS_2_1_WORKAROUNDS
default y
 
 
-config ARCH_PHYS_ADDR_T_64BIT
-   bool
-
 choice
prompt "SmartMIPS or microMIPS ASE support"
 
diff 

[PATCH 03/12] iommu-helper: mark iommu_is_span_boundary as inline

2018-04-23 Thread Christoph Hellwig
This avoids selecting IOMMU_HELPER just for this function.  And we only
use it once or twice in normal builds so this often even is a size
reduction.

Signed-off-by: Christoph Hellwig 
---
 arch/alpha/Kconfig  |  3 ---
 arch/arm/Kconfig|  3 ---
 arch/arm64/Kconfig  |  3 ---
 arch/ia64/Kconfig   |  3 ---
 arch/mips/cavium-octeon/Kconfig |  4 
 arch/mips/loongson64/Kconfig|  4 
 arch/mips/netlogic/Kconfig  |  3 ---
 arch/powerpc/Kconfig|  1 -
 arch/unicore32/mm/Kconfig   |  3 ---
 arch/x86/Kconfig|  2 +-
 drivers/parisc/Kconfig  |  5 -
 include/linux/iommu-helper.h| 13 ++---
 lib/iommu-helper.c  | 12 +---
 13 files changed, 12 insertions(+), 47 deletions(-)

diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index b2022885ced8..3ff735a722af 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -345,9 +345,6 @@ config PCI_DOMAINS
 config PCI_SYSCALL
def_bool PCI
 
-config IOMMU_HELPER
-   def_bool PCI
-
 config ALPHA_NONAME
bool
depends on ALPHA_BOOK1 || ALPHA_NONAME_CH
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index a7f8e7f4b88f..2f79222c5c02 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -1781,9 +1781,6 @@ config SECCOMP
 config SWIOTLB
def_bool y
 
-config IOMMU_HELPER
-   def_bool SWIOTLB
-
 config PARAVIRT
bool "Enable paravirtualization code"
help
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index eb2cf4938f6d..fbef5d3de83f 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -252,9 +252,6 @@ config SMP
 config SWIOTLB
def_bool y
 
-config IOMMU_HELPER
-   def_bool SWIOTLB
-
 config KERNEL_MODE_NEON
def_bool y
 
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index bbe12a038d21..862c5160c09d 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -613,6 +613,3 @@ source "security/Kconfig"
 source "crypto/Kconfig"
 
 source "lib/Kconfig"
-
-config IOMMU_HELPER
-   def_bool (IA64_HP_ZX1 || IA64_HP_ZX1_SWIOTLB || IA64_GENERIC || SWIOTLB)
diff --git a/arch/mips/cavium-octeon/Kconfig b/arch/mips/cavium-octeon/Kconfig
index b5eee1a57d6c..647ed158ac98 100644
--- a/arch/mips/cavium-octeon/Kconfig
+++ b/arch/mips/cavium-octeon/Kconfig
@@ -67,16 +67,12 @@ config CAVIUM_OCTEON_LOCK_L2_MEMCPY
help
  Lock the kernel's implementation of memcpy() into L2.
 
-config IOMMU_HELPER
-   bool
-
 config NEED_SG_DMA_LENGTH
bool
 
 config SWIOTLB
def_bool y
select DMA_DIRECT_OPS
-   select IOMMU_HELPER
select NEED_SG_DMA_LENGTH
 
 config OCTEON_ILM
diff --git a/arch/mips/loongson64/Kconfig b/arch/mips/loongson64/Kconfig
index 72af0c183969..5efb2e63878e 100644
--- a/arch/mips/loongson64/Kconfig
+++ b/arch/mips/loongson64/Kconfig
@@ -130,9 +130,6 @@ config LOONGSON_UART_BASE
default y
depends on EARLY_PRINTK || SERIAL_8250
 
-config IOMMU_HELPER
-   bool
-
 config NEED_SG_DMA_LENGTH
bool
 
@@ -141,7 +138,6 @@ config SWIOTLB
default y
depends on CPU_LOONGSON3
select DMA_DIRECT_OPS
-   select IOMMU_HELPER
select NEED_SG_DMA_LENGTH
select NEED_DMA_MAP_STATE
 
diff --git a/arch/mips/netlogic/Kconfig b/arch/mips/netlogic/Kconfig
index 7fcfc7fe9f14..5c5ee0e05a17 100644
--- a/arch/mips/netlogic/Kconfig
+++ b/arch/mips/netlogic/Kconfig
@@ -83,9 +83,6 @@ endif
 config NLM_COMMON
bool
 
-config IOMMU_HELPER
-   bool
-
 config NEED_SG_DMA_LENGTH
bool
 
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index c32a181a7cbb..43e3c8e4e7f4 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -484,7 +484,6 @@ config IOMMU_HELPER
 config SWIOTLB
bool "SWIOTLB support"
default n
-   select IOMMU_HELPER
---help---
  Support for IO bounce buffering for systems without an IOMMU.
  This allows us to DMA to the full physical address space on
diff --git a/arch/unicore32/mm/Kconfig b/arch/unicore32/mm/Kconfig
index e9154a59d561..3f105e00c432 100644
--- a/arch/unicore32/mm/Kconfig
+++ b/arch/unicore32/mm/Kconfig
@@ -44,9 +44,6 @@ config SWIOTLB
def_bool y
select DMA_DIRECT_OPS
 
-config IOMMU_HELPER
-   def_bool SWIOTLB
-
 config NEED_SG_DMA_LENGTH
def_bool SWIOTLB
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 00fcf81f2c56..cb2c7ecc1fea 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -931,7 +931,7 @@ config SWIOTLB
 
 config IOMMU_HELPER
def_bool y
-   depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU
+   depends on CALGARY_IOMMU || GART_IOMMU
 
 config MAXSMP
bool "Enable Maximum number of SMP Processors and NUMA Nodes"
diff --git a/drivers/parisc/Kconfig b/drivers/parisc/Kconfig
index 3a102a84d637..5a48b5606110 100644
--- a/drivers/parisc/Kconfig
+++ b/drivers/parisc/Kconfig
@@ -103,11 

[PATCH 02/12] iommu-helper: unexport iommu_area_alloc

2018-04-23 Thread Christoph Hellwig
This function is only used by built-in code.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 lib/iommu-helper.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/lib/iommu-helper.c b/lib/iommu-helper.c
index 23633c0fda4a..ded1703e7e64 100644
--- a/lib/iommu-helper.c
+++ b/lib/iommu-helper.c
@@ -3,7 +3,6 @@
  * IOMMU helper functions for the free area management
  */
 
-#include 
 #include 
 #include 
 
@@ -38,4 +37,3 @@ unsigned long iommu_area_alloc(unsigned long *map, unsigned 
long size,
}
return -1;
 }
-EXPORT_SYMBOL(iommu_area_alloc);
-- 
2.17.0



[PATCH 01/12] iommu-common: move to arch/sparc

2018-04-23 Thread Christoph Hellwig
This code is only used by sparc, and all new iommu drivers should use the
drivers/iommu/ framework.  Also remove the unused exports.

Signed-off-by: Christoph Hellwig 
Reviewed-by: Anshuman Khandual 
---
 {include/linux => arch/sparc/include/asm}/iommu-common.h | 0
 arch/sparc/include/asm/iommu_64.h| 2 +-
 arch/sparc/kernel/Makefile   | 2 +-
 {lib => arch/sparc/kernel}/iommu-common.c| 5 +
 arch/sparc/kernel/iommu.c| 2 +-
 arch/sparc/kernel/ldc.c  | 2 +-
 arch/sparc/kernel/pci_sun4v.c| 2 +-
 lib/Makefile | 2 +-
 8 files changed, 7 insertions(+), 10 deletions(-)
 rename {include/linux => arch/sparc/include/asm}/iommu-common.h (100%)
 rename {lib => arch/sparc/kernel}/iommu-common.c (98%)

diff --git a/include/linux/iommu-common.h 
b/arch/sparc/include/asm/iommu-common.h
similarity index 100%
rename from include/linux/iommu-common.h
rename to arch/sparc/include/asm/iommu-common.h
diff --git a/arch/sparc/include/asm/iommu_64.h 
b/arch/sparc/include/asm/iommu_64.h
index 9ed6b54caa4b..0ef6dedf747e 100644
--- a/arch/sparc/include/asm/iommu_64.h
+++ b/arch/sparc/include/asm/iommu_64.h
@@ -17,7 +17,7 @@
 #define IOPTE_WRITE   0x0002UL
 
 #define IOMMU_NUM_CTXS 4096
-#include 
+#include 
 
 struct iommu_arena {
unsigned long   *map;
diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile
index 76cb57750dda..a284662b0e4c 100644
--- a/arch/sparc/kernel/Makefile
+++ b/arch/sparc/kernel/Makefile
@@ -59,7 +59,7 @@ obj-$(CONFIG_SPARC32)   += leon_pmc.o
 
 obj-$(CONFIG_SPARC64)   += reboot.o
 obj-$(CONFIG_SPARC64)   += sysfs.o
-obj-$(CONFIG_SPARC64)   += iommu.o
+obj-$(CONFIG_SPARC64)   += iommu.o iommu-common.o
 obj-$(CONFIG_SPARC64)   += central.o
 obj-$(CONFIG_SPARC64)   += starfire.o
 obj-$(CONFIG_SPARC64)   += power.o
diff --git a/lib/iommu-common.c b/arch/sparc/kernel/iommu-common.c
similarity index 98%
rename from lib/iommu-common.c
rename to arch/sparc/kernel/iommu-common.c
index 55b00de106b5..59cb16691322 100644
--- a/lib/iommu-common.c
+++ b/arch/sparc/kernel/iommu-common.c
@@ -8,9 +8,9 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
+#include 
 
 static unsigned long iommu_large_alloc = 15;
 
@@ -93,7 +93,6 @@ void iommu_tbl_pool_init(struct iommu_map_table *iommu,
p->hint = p->start;
p->end = num_entries;
 }
-EXPORT_SYMBOL(iommu_tbl_pool_init);
 
 unsigned long iommu_tbl_range_alloc(struct device *dev,
struct iommu_map_table *iommu,
@@ -224,7 +223,6 @@ unsigned long iommu_tbl_range_alloc(struct device *dev,
 
return n;
 }
-EXPORT_SYMBOL(iommu_tbl_range_alloc);
 
 static struct iommu_pool *get_pool(struct iommu_map_table *tbl,
   unsigned long entry)
@@ -264,4 +262,3 @@ void iommu_tbl_range_free(struct iommu_map_table *iommu, 
u64 dma_addr,
bitmap_clear(iommu->map, entry, npages);
spin_unlock_irqrestore(&(pool->lock), flags);
 }
-EXPORT_SYMBOL(iommu_tbl_range_free);
diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index b08dc3416f06..40d008b0bd3e 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -14,7 +14,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #ifdef CONFIG_PCI
 #include 
diff --git a/arch/sparc/kernel/ldc.c b/arch/sparc/kernel/ldc.c
index 86b625f9d8dc..c0fa3ef6cf01 100644
--- a/arch/sparc/kernel/ldc.c
+++ b/arch/sparc/kernel/ldc.c
@@ -16,7 +16,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #include 
 #include 
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 249367228c33..565d9ac883d0 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -16,7 +16,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #include 
 #include 
diff --git a/lib/Makefile b/lib/Makefile
index ce20696d5a92..94203b5eecd4 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -147,7 +147,7 @@ obj-$(CONFIG_AUDIT_GENERIC) += audit.o
 obj-$(CONFIG_AUDIT_COMPAT_GENERIC) += compat_audit.o
 
 obj-$(CONFIG_SWIOTLB) += swiotlb.o
-obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o iommu-common.o
+obj-$(CONFIG_IOMMU_HELPER) += iommu-helper.o
 obj-$(CONFIG_FAULT_INJECTION) += fault-inject.o
 obj-$(CONFIG_NOTIFIER_ERROR_INJECTION) += notifier-error-inject.o
 obj-$(CONFIG_PM_NOTIFIER_ERROR_INJECT) += pm-notifier-error-inject.o
-- 
2.17.0



centralize SWIOTLB config symbol and misc other cleanups V2

2018-04-23 Thread Christoph Hellwig
Hi all,

this series aims for a single definition of the Kconfig symbol.  To get
there various cleanups, mostly about config symbols, are included as well.

Changes since V2 are a fixed s/Reviewed/Signed-Off/ for me, and a few
reviewed-by tags.  I'd like to start merging this into the dma-mapping
tree rather sooner than later given that quite a bit of material for
this series depends on it.


Re: [RESEND PATCH v1 2/2] trace: events: block: Add tag in block trace events

2018-04-23 Thread Steven Rostedt
On Mon, 23 Apr 2018 14:43:13 +0200
Steffen Maier  wrote:

> > -   TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
> > +   TP_printk("[%s] %d %s", __entry->comm, __entry->nr_rq,
> > +  __entry->explicit ? "Sync" : "Async")
> >   );
> > 
> >   /**  
> 
> This entire hunk does not seem related to this patch description.
> Also, I'm not sure trace-cmd and perf et al. could format it accordingly.

You mean the "?:" operation? trace-cmd and perf can handle it fine.
Just look at the trace event irq_handler_exit:

 print fmt: "irq=%d ret=%s", REC->irq, REC->ret ? "handled" : "unhandled"

# trace-cmd record -e irq_handler_exit
# trace-cmd report
  -0 [001] 856960.382767: irq_handler_exit: irq=29 
ret=handled
  -0 [001] 856961.745640: irq_handler_exit: irq=29 
ret=handled
  -0 [001] 856961.865762: irq_handler_exit: irq=29 
ret=handled


-- Steve


Re: [RESEND PATCH v1 1/2] trace: events: scsi: Add tag in SCSI trace events

2018-04-23 Thread Steffen Maier


On 04/17/2018 12:00 PM, Bean Huo (beanhuo) wrote:


#Cat trace
iozone-4055  [000]    665.039276: block_unplug: [iozone] 1 Sync
iozone-4055  [000] ...1   665.039278: block_rq_insert: 8,48 WS 0 () 39604352 + 
128 tag=18 [iozone]
iozone-4055  [000] ...1   665.039280: block_rq_issue: 8,48 WS 0 () 39604352 + 
128 tag=18 [iozone]
iozone-4055  [000] ...1   665.039284: scsi_dispatch_cmd_start: host_no=0 
channel=0 id=0 lun=3 data_sgl=16 prot_sgl=0 prot_op=SCSI_PROT_NORMAL tag=18 
cmnd=(WRITE_10 lba=4950544 txlen=16 protect=0 raw=2a 00 00 4b 8a 10 00 00 10 00)
iozone-4056  [002]    665.039284: block_dirty_buffer: 8,62 sector=44375 
size=4096
-0 [000] d.h2   665.039319: scsi_dispatch_cmd_done: host_no=0 
channel=0 id=0 lun=3 data_sgl=16 prot_sgl=0 prot_op=SCSI_PROT_NORMAL tag=24 
cmnd=(WRITE_10 lba=4944016 txlen=16 protect=0 raw=2a 00 00 4b 70 90 00 00 10 00) 
result=(driver=DRIVER_OK host=DID_OK message=COMMAND_COMPLETE status=SAM_STAT_GOOD)
-0 [000] d.h3   665.039321: block_rq_complete: 8,48 WS () 39552128 + 
128 tag=24 [0]



iozone-4058  [003]    665.039362: block_bio_remap: 8,48 WS 39568768 + 128 
<- (8,62) 337280
iozone-4058  [003]    665.039364: block_bio_queue: 8,48 WS 39568768 + 128 
[iozone]
iozone-4058  [003] ...1   665.039366: block_getrq: 8,48 WS 39568768 + 128 
[iozone]


I'm not familiar with block/scsi command tagging.

Some block events now would get a tag field.
Some block events would not get a tag field (maybe because for some the 
tag is not (yet) known).


So all block events that belong to the same request would still need to 
be correlated by something like (devt, RWBS, LBA, length) because not 
all have a tag field.



Especially, the ftrace log with tag information, I can easily figure out one 
I/O request between block layer and SCSI.


Provided this is done correctly, I would be in favor of a solution.
Since
v4.11 commit 48b77ad60844 (``block: cleanup tracing'')\newline
v4.11 commit 82ed4db499b8 (``block: split scsi\_request out of struct 
request'')
we don't have the SCSI CDB in block traces (nor in blktrace traditional 
relayfs trace format, nor in ftrace 'blk' tracer binary synthesized 
output [1]) any more (empty Packet Command payload).
Being able to correlate block events with scsi events would indeed be 
very helpful for some cases.


Is a correlation between block and scsi only necessary for these pairs?:

block_rq_issue causes scsi_dispatch_cmd_start, and
scsi_dispatch_cmd_done causes block_rq_complete.

If so, only those two block trace events would need to get a new field?


[1] v2.6.30 commit 08a06b83ff8b (``blkftrace: binary tracing, 
synthesizing old format'')
v2.6.31 commit f3948f8857ef (``blktrace: fix context-info when 
mixed-using blk tracer and trace events'')


--
Mit freundlichen Grüßen / Kind regards
Steffen Maier

Linux on z Systems Development

IBM Deutschland Research & Development GmbH
Vorsitzende des Aufsichtsrats: Martina Koederitz
Geschaeftsfuehrung: Dirk Wittkopp
Sitz der Gesellschaft: Boeblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294



Re: [RESEND PATCH v1 2/2] trace: events: block: Add tag in block trace events

2018-04-23 Thread Steffen Maier


On 04/16/2018 04:33 PM, Bean Huo (beanhuo) wrote:

Print the request tag along with other information in block trace events
when tracing a request, and the unplug type (Sync / Async).

Signed-off-by: Bean Huo 
---
  include/trace/events/block.h | 36 +---
  1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index 81b43f5..f8c0b9e 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h



@@ -478,15 +486,18 @@ DECLARE_EVENT_CLASS(block_unplug,

TP_STRUCT__entry(
__field( int,   nr_rq   )
+   __field( bool,  explicit)
__array( char,  comm,   TASK_COMM_LEN   )
),

TP_fast_assign(
__entry->nr_rq = depth;
+   __entry->explicit = explicit;
memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
),

-   TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+   TP_printk("[%s] %d %s", __entry->comm, __entry->nr_rq,
+  __entry->explicit ? "Sync" : "Async")
  );

  /**


This entire hunk does not seem related to this patch description.
Also, I'm not sure trace-cmd and perf et al. could format it accordingly.
See also my patch for this same functionality:
https://www.spinics.net/lists/linux-block/msg24691.html
("[PATCH v2 1/2] tracing/events: block: track and print if unplug was 
explicit or schedule")




--
Mit freundlichen Grüßen / Kind regards
Steffen Maier

Linux on z Systems Development

IBM Deutschland Research & Development GmbH
Vorsitzende des Aufsichtsrats: Martina Koederitz
Geschaeftsfuehrung: Dirk Wittkopp
Sitz der Gesellschaft: Boeblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294



Re: [PATCH 2/3] lightnvm: pblk: garbage collect lines with failed writes

2018-04-23 Thread Hans Holmberg
On Fri, Apr 20, 2018 at 9:49 PM, Javier Gonzalez  wrote:
>> On 19 Apr 2018, at 09.39, Hans Holmberg  
>> wrote:
>>
>> From: Hans Holmberg 
>>
>> Write failures should not happen under normal circumstances,
>> so in order to bring the chunk back into a known state as soon
>> as possible, evacuate all the valid data out of the line and let the
>> fw judge if the block can be written to in the next reset cycle.
>>
>> Do this by introducing a new gc list for lines with failed writes,
>> and ensure that the rate limiter allocates a small portion of
>> the write bandwidth to get the job done.
>>
>> The lba list is saved in memory for use during gc as we
>> cannot gurantee that the emeta data is readable if a write
>> error occurred.
>>
>> Signed-off-by: Hans Holmberg 
>> ---
>> drivers/lightnvm/pblk-core.c  | 43 +--
>> drivers/lightnvm/pblk-gc.c| 79 
>> +++
>> drivers/lightnvm/pblk-init.c  | 39 ++---
>> drivers/lightnvm/pblk-rl.c| 29 +---
>> drivers/lightnvm/pblk-sysfs.c | 15 ++--
>> drivers/lightnvm/pblk-write.c |  2 ++
>> drivers/lightnvm/pblk.h   | 25 +++---
>> 7 files changed, 178 insertions(+), 54 deletions(-)
>>
>> diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
>> index 7762e89..f6135e4 100644
>> --- a/drivers/lightnvm/pblk-core.c
>> +++ b/drivers/lightnvm/pblk-core.c
>> @@ -373,7 +373,13 @@ struct list_head *pblk_line_gc_list(struct pblk *pblk, 
>> struct pblk_line *line)
>>
>>   lockdep_assert_held(>lock);
>>
>> - if (!vsc) {
>> + if (line->w_err_gc->has_write_err) {
>> + if (line->gc_group != PBLK_LINEGC_WERR) {
>> + line->gc_group = PBLK_LINEGC_WERR;
>> + move_list = _mg->gc_werr_list;
>> + pblk_rl_werr_line_in(>rl);
>> + }
>> + } else if (!vsc) {
>>   if (line->gc_group != PBLK_LINEGC_FULL) {
>>   line->gc_group = PBLK_LINEGC_FULL;
>>   move_list = _mg->gc_full_list;
>> @@ -1603,8 +1609,13 @@ static void __pblk_line_put(struct pblk *pblk, struct 
>> pblk_line *line)
>>   line->state = PBLK_LINESTATE_FREE;
>>   line->gc_group = PBLK_LINEGC_NONE;
>>   pblk_line_free(line);
>> - spin_unlock(>lock);
>>
>> + if (line->w_err_gc->has_write_err) {
>> + pblk_rl_werr_line_out(>rl);
>> + line->w_err_gc->has_write_err = 0;
>> + }
>> +
>> + spin_unlock(>lock);
>>   atomic_dec(>pipeline_gc);
>>
>>   spin_lock(_mg->free_lock);
>> @@ -1767,11 +1778,32 @@ void pblk_line_close_meta(struct pblk *pblk, struct 
>> pblk_line *line)
>>
>>   spin_lock(_mg->close_lock);
>>   spin_lock(>lock);
>> +
>> + /* Update the in-memory start address for emeta, in case it has
>> +  * shifted due to write errors
>> +  */
>> + if (line->emeta_ssec != line->cur_sec)
>> + line->emeta_ssec = line->cur_sec;
>> +
>>   list_add_tail(>list, _mg->emeta_list);
>>   spin_unlock(>lock);
>>   spin_unlock(_mg->close_lock);
>>
>>   pblk_line_should_sync_meta(pblk);
>> +
>> +
>> +}
>> +
>> +static void pblk_save_lba_list(struct pblk *pblk, struct pblk_line *line)
>> +{
>> + struct pblk_line_meta *lm = >lm;
>> + unsigned int lba_list_size = lm->emeta_len[2];
>> + struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
>> + struct pblk_emeta *emeta = line->emeta;
>> +
>> + w_err_gc->lba_list = kmalloc(lba_list_size, GFP_KERNEL);
>> + memcpy(w_err_gc->lba_list, emeta_to_lbas(pblk, emeta->buf),
>> + lba_list_size);
>> }
>>
>> void pblk_line_close_ws(struct work_struct *work)
>> @@ -1780,6 +1812,13 @@ void pblk_line_close_ws(struct work_struct *work)
>>   ws);
>>   struct pblk *pblk = line_ws->pblk;
>>   struct pblk_line *line = line_ws->line;
>> + struct pblk_w_err_gc *w_err_gc = line->w_err_gc;
>> +
>> + /* Write errors makes the emeta start address stored in smeta invalid,
>> +  * so keep a copy of the lba list until we've gc'd the line
>> +  */
>> + if (w_err_gc->has_write_err)
>> + pblk_save_lba_list(pblk, line);
>>
>>   pblk_line_close(pblk, line);
>>   mempool_free(line_ws, pblk->gen_ws_pool);
>> diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
>> index b0cc277..62f0548 100644
>> --- a/drivers/lightnvm/pblk-gc.c
>> +++ b/drivers/lightnvm/pblk-gc.c
>> @@ -138,10 +138,10 @@ static void pblk_gc_line_prepare_ws(struct work_struct 
>> *work)
>>   struct pblk_line_mgmt *l_mg = >l_mg;
>>   struct pblk_line_meta *lm = >lm;
>>   struct pblk_gc *gc = >gc;
>> - struct line_emeta *emeta_buf;
>> + struct line_emeta *emeta_buf = NULL;
>>   struct pblk_line_ws 

Re: [PATCH 1/3] lightnvm: pblk: rework write error recovery path

2018-04-23 Thread Hans Holmberg
On Fri, Apr 20, 2018 at 9:38 PM, Javier Gonzalez  wrote:
>> On 19 Apr 2018, at 09.39, Hans Holmberg  
>> wrote:
>>
>> From: Hans Holmberg 
>>
>> The write error recovery path is incomplete, so rework
>> the write error recovery handling to do resubmits directly
>> from the write buffer.
>>
>> When a write error occurs, the remaining sectors in the chunk are
>> mapped out and invalidated and the request inserted in a resubmit list.
>>
>> The writer thread checks if there are any requests to resubmit,
>> scans and invalidates any lbas that have been overwritten by later
>> writes and resubmits the failed entries.
>>
>> Signed-off-by: Hans Holmberg 
>> ---
>> drivers/lightnvm/pblk-init.c |   2 +
>> drivers/lightnvm/pblk-recovery.c |  91 ---
>> drivers/lightnvm/pblk-write.c| 241 
>> ---
>> drivers/lightnvm/pblk.h  |   8 +-
>> 4 files changed, 180 insertions(+), 162 deletions(-)
>>
>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>> index bfc488d..6f06727 100644
>> --- a/drivers/lightnvm/pblk-init.c
>> +++ b/drivers/lightnvm/pblk-init.c
>> @@ -426,6 +426,7 @@ static int pblk_core_init(struct pblk *pblk)
>>   goto free_r_end_wq;
>>
>>   INIT_LIST_HEAD(>compl_list);
>> + INIT_LIST_HEAD(>resubmit_list);
>>
>>   return 0;
>>
>> @@ -1185,6 +1186,7 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct 
>> gendisk *tdisk,
>>   pblk->state = PBLK_STATE_RUNNING;
>>   pblk->gc.gc_enabled = 0;
>>
>> + spin_lock_init(>resubmit_lock);
>>   spin_lock_init(>trans_lock);
>>   spin_lock_init(>lock);
>>
>> diff --git a/drivers/lightnvm/pblk-recovery.c 
>> b/drivers/lightnvm/pblk-recovery.c
>> index 9cb6d5d..5983428 100644
>> --- a/drivers/lightnvm/pblk-recovery.c
>> +++ b/drivers/lightnvm/pblk-recovery.c
>> @@ -16,97 +16,6 @@
>>
>> #include "pblk.h"
>>
>> -void pblk_submit_rec(struct work_struct *work)
>> -{
>> - struct pblk_rec_ctx *recovery =
>> - container_of(work, struct pblk_rec_ctx, ws_rec);
>> - struct pblk *pblk = recovery->pblk;
>> - struct nvm_rq *rqd = recovery->rqd;
>> - struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
>> - struct bio *bio;
>> - unsigned int nr_rec_secs;
>> - unsigned int pgs_read;
>> - int ret;
>> -
>> - nr_rec_secs = bitmap_weight((unsigned long int *)>ppa_status,
>> - NVM_MAX_VLBA);
>> -
>> - bio = bio_alloc(GFP_KERNEL, nr_rec_secs);
>> -
>> - bio->bi_iter.bi_sector = 0;
>> - bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
>> - rqd->bio = bio;
>> - rqd->nr_ppas = nr_rec_secs;
>> -
>> - pgs_read = pblk_rb_read_to_bio_list(>rwb, bio, >failed,
>> - nr_rec_secs);
>
> Please, remove functions that are not longer used. Doing a pass on the
> rest of the removed functions would be a good idea.

Yes, thanks.

>
>> - if (pgs_read != nr_rec_secs) {
>> - pr_err("pblk: could not read recovery entries\n");
>> - goto err;
>> - }
>> -
>> - if (pblk_setup_w_rec_rq(pblk, rqd, c_ctx)) {
>
> Same here

I'll clean it up.

>> -
>> -#ifdef CONFIG_NVM_DEBUG
>> - atomic_long_add(nr_rec_secs, >recov_writes);
>> -#endif
>
> Can you add this debug counter to the new path? I see you added other
> counters, if it is a rename, can you put it on a separate patch?

Thanks for catching the lost recov counter update, what other counters
are you referring to?

>
>> -
>> - ret = pblk_submit_io(pblk, rqd);
>> - if (ret) {
>> - pr_err("pblk: I/O submission failed: %d\n", ret);
>> - goto err;
>> - }
>> -
>> - mempool_free(recovery, pblk->rec_pool);
>> - return;
>> -
>> -err:
>> - bio_put(bio);
>> - pblk_free_rqd(pblk, rqd, PBLK_WRITE);
>> -}
>> -
>> -int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
>> - struct pblk_rec_ctx *recovery, u64 *comp_bits,
>> - unsigned int comp)
>> -{
>> - struct nvm_rq *rec_rqd;
>> - struct pblk_c_ctx *rec_ctx;
>> - int nr_entries = c_ctx->nr_valid + c_ctx->nr_padded;
>> -
>> - rec_rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
>> - rec_ctx = nvm_rq_to_pdu(rec_rqd);
>> -
>> - /* Copy completion bitmap, but exclude the first X completed entries */
>> - bitmap_shift_right((unsigned long int *)_rqd->ppa_status,
>> - (unsigned long int *)comp_bits,
>> - comp, NVM_MAX_VLBA);
>> -
>> - /* Save the context for the entries that need to be re-written and
>> -  * update current context with the completed entries.
>> -  */
>> - rec_ctx->sentry = pblk_rb_wrap_pos(>rwb, c_ctx->sentry + comp);
>> - if (comp >= c_ctx->nr_valid) {
>> - 

Re: [PATCH blktests] scsi/004: add regression test for false BLK_STS_OK with non good SAM status

2018-04-23 Thread Steffen Maier

On 04/19/2018 10:18 PM, Omar Sandoval wrote:
> On Thu, Apr 19, 2018 at 01:44:41PM -0600, Jens Axboe wrote:
>> On 4/19/18 1:41 PM, Bart Van Assche wrote:
>>> On Thu, 2018-04-19 at 12:13 -0700, Omar Sandoval wrote:
 On Thu, Apr 19, 2018 at 11:53:30AM -0700, Omar Sandoval wrote:
> Thanks for the test! Applied.

 Side note, it's unfortunate that this test takes 180 seconds to run only
 because we have to wait for the command timeout. We should be able to
 export request_queue->rq_timeout writeable in sysfs. Would you be
 interested in doing that?
>>>
>>> Hello Omar,
>>>
>>> Is this perhaps what you are looking for?
>>> # ls -l /sys/class/scsi_device/*/*/timeout
>>> -rw-r--r-- 1 root root 4096 Apr 19 08:52 
>>> /sys/class/scsi_device/2:0:0:0/device/timeout
>>> -rw-r--r-- 1 root root 4096 Apr 19 12:39 
>>> /sys/class/scsi_device/8:0:0:1/device/timeout
>>
>> We should have it generically available though, not just for SCSI. In
>> retrospect, it should have been under queue/ from the start, now we'll
>> end up with duplicate entries for SCSI.
> 
> For the sake of this test, I just decreased the timeout through SCSI.

Great idea.

>   echo 5 > "/sys/block/${SCSI_DEBUG_DEVICES[0]}/device/timeout"

However, the timeout should be sufficiently larger than scsi_debug/delay,
in order not to run into the command timeout.
It may be unfortunate that scsi_debug/delay uses jiffies as unit and
can thus differ in a range of an order of magnitude for different kernel 
configs.

>   # delay to reduce response repetition: around 1..10sec depending on HZ
>   echo 1000 > /sys/bus/pseudo/drivers/scsi_debug/delay

On s390, we typically have HZ=100, so 1000 jiffies are 10 seconds.

We can increase the sdev cmd timeout or decrease the scsi_debug/delay.
100 instead of 1000 for scsi_debug/delay worked for me;
but for some reason the loop checking for busy did not work (any more?)
causing an unexpected test case error:

> # ./check scsi/004
> scsi/004 (ensure repeated TASK SET FULL results in EIO on timing out command) 
> [failed]
> runtime  31.892s  ...  31.720s
> --- tests/scsi/004.out2018-04-16 11:47:19.105931872 +0200
> +++ results/nodev/scsi/004.out.bad2018-04-23 14:07:33.615445253 
> +0200
> @@ -1,3 +1,3 @@
>  Running scsi/004
> -Input/output error
> +modprobe: FATAL: Module scsi_debug is in use.
>  Test complete

so I added another sleep hack:

 # dd closing SCSI disk causes implicit TUR also being delayed once
+# sleep over time window where READ was done and TUR not yet queued
+sleep 2
 while grep -q -F "in_use_bm BUSY:" 
"/proc/scsi/scsi_debug/${SCSI_DEBUG_HOSTS[0]}"; do

What do you think?

-- 
Mit freundlichen Grüßen / Kind regards
Steffen Maier

Linux on z Systems Development

IBM Deutschland Research & Development GmbH
Vorsitzende des Aufsichtsrats: Martina Koederitz
Geschaeftsfuehrung: Dirk Wittkopp
Sitz der Gesellschaft: Boeblingen
Registergericht: Amtsgericht Stuttgart, HRB 243294



Re: general protection fault in wb_workfn

2018-04-23 Thread Tetsuo Handa
On 2018/04/20 1:05, syzbot wrote:
> kasan: CONFIG_KASAN_INLINE enabled
> kasan: GPF could be caused by NULL-ptr deref or user memory access
> general protection fault:  [#1] SMP KASAN
> Dumping ftrace buffer:
>    (ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 28 Comm: kworker/u4:2 Not tainted 4.16.0-rc7+ #368
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 01/01/2011
> Workqueue: writeback wb_workfn
> RIP: 0010:dev_name include/linux/device.h:981 [inline]
> RIP: 0010:wb_workfn+0x1a2/0x16b0 fs/fs-writeback.c:1936
> RSP: 0018:8801d951f038 EFLAGS: 00010206
> RAX: dc00 RBX:  RCX: 81bf6ea5
> RDX: 000a RSI: 87b44840 RDI: 0050
> RBP: 8801d951f558 R08: 11003b2a3def R09: 0004
> R10: 8801d951f438 R11: 0004 R12: 0100
> R13: 8801baee0dc0 R14: 8801d951f530 R15: 8801baee10d8
> FS:  () GS:8801db20() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 0047ff80 CR3: 07a22006 CR4: 001626f0
> DR0:  DR1:  DR2: 
> DR3:  DR6: fffe0ff0 DR7: 0400
> Call Trace:
>  process_one_work+0xc47/0x1bb0 kernel/workqueue.c:2113
>  process_scheduled_works kernel/workqueue.c:2173 [inline]
>  worker_thread+0xa4b/0x1990 kernel/workqueue.c:2252
>  kthread+0x33c/0x400 kernel/kthread.c:238
>  ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:406

This report says that wb->bdi->dev == NULL

  static inline const char *dev_name(const struct device *dev)
  {
/* Use the init name until the kobject becomes available */
if (dev->init_name)
  return dev->init_name;
  
return kobject_name(>kobj);
  }

  void wb_workfn(struct work_struct *work)
  {
  (...snipped...)
 set_worker_desc("flush-%s", dev_name(wb->bdi->dev));
  (...snipped...)
  }

immediately after ioctl(LOOP_CTL_REMOVE) was requested. It is plausible
because ioctl(LOOP_CTL_REMOVE) sets bdi->dev to NULL after returning from
wb_shutdown().

loop_control_ioctl(LOOP_CTL_REMOVE) {
  loop_remove(lo) {
del_gendisk(lo->lo_disk) {
  bdi_unregister(disk->queue->backing_dev_info) {
bdi_remove_from_list(bdi);
wb_shutdown(>wb);
cgwb_bdi_unregister(bdi);
if (bdi->dev) {
  bdi_debug_unregister(bdi);
  device_unregister(bdi->dev);
  bdi->dev = NULL;
}
  }
}
  }
}

For some reason wb_shutdown() is not waiting for wb_workfn() to complete
(or something queues again after the WB_registered bit was cleared)?

Anyway, I think that this is block layer problem rather than fs layer problem.



By the way, I got a newbie question regarding commit 5318ce7d46866e1d ("bdi:
Shutdown writeback on all cgwbs in cgwb_bdi_destroy()"). It uses clear_bit()
to clear WB_shutting_down bit so that threads waiting at wait_on_bit() will
wake up. But clear_bit() itself does not wake up threads, does it? Who wakes
them up (e.g. by calling wake_up_bit()) after clear_bit() was called?


Re: [PATCH 2/3] xen netback: add fault injection facility

2018-04-23 Thread Wei Liu
On Fri, Apr 20, 2018 at 10:47:31AM +, Stanislav Kinsburskii wrote:
>  
>  #include 
>  #include 
> @@ -1649,6 +1650,7 @@ static int __init netback_init(void)
>   PTR_ERR(xen_netback_dbg_root));
>  #endif /* CONFIG_DEBUG_FS */
>  
> + (void) xen_netbk_fi_init();

If you care about the return value, please propagate it to
netback_init's caller. Otherwise you can just make the function return
void.

> +
> +int xenvif_fi_init(struct xenvif *vif)
> +{
> + struct dentry *parent;
> + struct xenvif_fi *vfi;
> + int fi, err = -ENOMEM;
> +
> + parent = vif_fi_dir;
> + if (!parent)
> + return -ENOMEM;
> +
> + vfi = kmalloc(sizeof(*vfi), GFP_KERNEL);
> + if (!vfi)
> + return -ENOMEM;
> +
> + vfi->dir = debugfs_create_dir(vif->dev->name, parent);
> + if (!vfi->dir)
> + goto err_dir;
> +
> + for (fi = 0; fi < XENVIF_FI_MAX; fi++) {
> + vfi->faults[fi] = xen_fi_dir_add(vfi->dir,
> + xenvif_fi_names[fi]);
> + if (!vfi->faults[fi])
> + goto err_fault;
> + }
> +
> + vif->fi_info = vfi;
> + return 0;
> +
> +err_fault:
> + for (; fi > 0; fi--)

fi >= 0

Wei.


Re: testing io.low limit for blk-throttle

2018-04-23 Thread Joseph Qi


On 18/4/23 15:35, Paolo Valente wrote:
> 
> 
>> Il giorno 23 apr 2018, alle ore 08:05, Joseph Qi  ha 
>> scritto:
>>
>> Hi Paolo,
> 
> Hi Joseph,
> thanks for chiming in.
> 
>> What's your idle and latency config?
> 
> I didn't set them at all, as the only (explicit) requirement in my
> basic test is that one of the group is guaranteed a minimum bps.
> 
> 
>> IMO, io.low will allow others run more bandwidth if cgroup's average
>> idle time is high or latency is low.
> 
> What you say here makes me think that I simply misunderstood the
> purpose of io.low.  So, here is my problem/question: "I only need to
> guarantee at least a minimum bandwidth, in bps, to a group.  Is the
> io.low limit the way to go?"
> 
> I know that I can use just io.max (unless I misunderstood the goal of
> io.max too :( ), but my extra purpose would be to not waste bandwidth
> when some group is idle.  Yet, as for now, io.low is not working even
> for the first, simpler goal, i.e., guaranteeing a minimum bandwidth to
> one group when all groups are active.
> 
> Am I getting something wrong?
> 
> Otherwise, if there are some special values for idle and latency
> parameters that would make throttle work for my test, I'll be of
> course happy to try them.
> 
I think you can try idle time with 1000us for all cgroups, and latency
target 100us for cgroup with low limit 100MB/s and 2000us for cgroups
with low limit 10MB/s. That means cgroup with low latency target will
be preferred.
BTW, from my experience the parameters are not easy to set because
they are strongly correlated to the cgroup IO behavior.

Thanks,
Joseph


Re: [PATCH V4 0/2] blk-mq: fix race between completion and BLK_EH_RESET_TIMER

2018-04-23 Thread Martin Steigerwald
Hello Ming.

Ming Lei - 18.04.18, 18:46:
> On Mon, Apr 16, 2018 at 03:12:30PM +0200, Martin Steigerwald wrote:
> > Ming Lei - 16.04.18, 02:45:
> > > On Sun, Apr 15, 2018 at 06:31:44PM +0200, Martin Steigerwald 
wrote:
> > > > Hi Ming.
> > > > 
> > > > Ming Lei - 15.04.18, 17:43:
> > > > > Hi Jens,
> > > > > 
> > > > > This two patches fixes the recently discussed race between
> > > > > completion
> > > > > and BLK_EH_RESET_TIMER.
> > > > > 
> > > > > Israel & Martin, this one is a simpler fix on this issue and
> > > > > can
> > > > > cover the potencial hang of MQ_RQ_COMPLETE_IN_TIMEOUT request,
> > > > > could
> > > > > you test V4 and see if your issue can be fixed?
> > > > 
> > > > In replacement of all the three other patches I applied?
> > > > 
> > > > - '[PATCH] blk-mq_Directly schedule q->timeout_work when
> > > > aborting a
> > > > request.mbox'
> > > > 
> > > > - '[PATCH v2] block: Change a rcu_read_{lock,unlock}_sched()
> > > > pair
> > > > into rcu_read_{lock,unlock}().mbox'
> > > > 
> > > > - '[PATCH v4] blk-mq_Fix race conditions in request timeout
> > > > handling.mbox'
> > > 
> > > You only need to replace the above one '[PATCH v4] blk-mq_Fix race
> > > conditions in request timeout' with V4 in this thread.
> > 
> > Ming, a 4.16.2 with the patches:
> > 
> > '[PATCH] blk-mq_Directly schedule q->timeout_work when aborting a
> > request.mbox'
> > '[PATCH v2] block: Change a rcu_read_{lock,unlock}_sched() pair into
> > rcu_read_{lock,unlock}().mbox'
> > '[PATCH V4 1_2] blk-mq_set RQF_MQ_TIMEOUT_EXPIRED when the rq'\''s
> > timeout isn'\''t handled.mbox'
> > '[PATCH V4 2_2] blk-mq_fix race between complete and
> > BLK_EH_RESET_TIMER.mbox'
> > 
> > hung on boot 3 out of 4 times.
> > 
> > See
> > 
> > [Possible REGRESSION, 4.16-rc4] Error updating SMART data during
> > runtime and boot failures with blk_mq_terminate_expired in
> > backtrace https://bugzilla.kernel.org/show_bug.cgi?id=199077#c13
> > 
> > I tried to add your mail address to Cc of the bug report, but
> > Bugzilla did not know it.
> > 
> > Fortunately it booted on the fourth attempt, cause I forgot my GRUB
> > password.
> > 
> > Reverting back to previous 4.16.1 kernel with patches from Bart.
> > 
> > > > These patches worked reliably so far both for the hang on boot
> > > > and
> > > > error reading SMART data.
> > > 
> > > And you may see the reason in the following thread:
> > > 
> > > https://marc.info/?l=linux-block&m=152366441625786&w=2
> > 
> > So requests could never be completed?
> 
> Yes.
> 
> I guess Jianchao's patch("[PATCH] blk-mq: start request gstate with
> gen 1") may work for your issue because you are using
> blk_abort_request().
> 
> If it doesn't, please try the following V5 together the other two, and
> V5 fixes one big issue, in which the new rq state shouldn't be
> introduced, otherwise timeout is broken easily.
> 
> I have tested V5 by running blktests(block/011), in which both these
> code paths are covered: EH_HANDLED, EH_RESET_TIMER, and normal
> completion during timeout, and the patch V5 works as expected.

I tested 4.16.3 with just Jianchao's patch "[PATCH] blk-mq: start 
request gstate with gen 1" (+ the unrelated btrfs trimming fix I carry 
for a long time already) and it did at least 15 boots successfully 
(without hanging). So far also no "error loading smart data mail", but 
it takes a few days with suspend/hibernation + resume cycles in order to 
know for sure.

So if I read your mail correctly, there is no need to test your V5 
patch.

Thanks,
Martin

> --
> 
> From e81da316a953db999d155d08143fd5722b44e79e Mon Sep 17 00:00:00 2001
> From: Ming Lei 
> Date: Thu, 12 Apr 2018 04:23:09 +0800
> Subject: [PATCH] blk-mq: fix race between complete and
> BLK_EH_RESET_TIMER
> 
> The normal request completion can be done before or during handling
> BLK_EH_RESET_TIMER, and this race may cause the request to never
> be completed since driver's .timeout() may always return
> BLK_EH_RESET_TIMER.
> 
> This issue can't be fixed completely by driver, since the normal
> completion can be done between returning .timeout() and handling
> BLK_EH_RESET_TIMER.
> 
> This patch fixes the race by introducing rq state of
> MQ_RQ_COMPLETE_IN_RESET, and reading/writing rq's state by holding
> queue lock, which can be per-request actually, but just not necessary
> to introduce one lock for so unusual event.
> 
> Also handle the timeout requests in two steps:
> 
> 1) in 1st step, call .timeout(), and reset timer for
> BLK_EH_RESET_TIMER
> 
> 2) in 2nd step, sync with normal completion path by holding queue lock
> for avoiding race between BLK_EH_RESET_TIMER and normal completion.
> 
> Another change is that one request is always handled as time-out
> exclusively in this patch with help of queue lock.
> 
> Cc: "jianchao.wang" 
> Cc: Bart Van Assche 
> Cc: Tejun Heo 
> Cc: Christoph Hellwig 
> Cc: Ming Lei 
> Cc: 

Re: [PATCH] blk-mq: start request gstate with gen 1

2018-04-23 Thread Martin Steigerwald
Hi Jianchao.

jianchao.wang - 17.04.18, 16:34:
> On 04/17/2018 08:10 PM, Martin Steigerwald wrote:
> > For testing it I add it to 4.16.2 with the patches I have already?
> 
> You could try to only apply this patch to have a test. :)

I tested 4.16.3 with just your patch (+ the unrelated btrfs trimming fix 
I carry for a long time already) and it did at least 15 boots 
successfully (without hanging). So far also no "error loading smart data 
mail", but it takes a few days with suspend/hibernation + resume cycles 
in order to know for sure.

Thanks,
-- 
Martin




Re: testing io.low limit for blk-throttle

2018-04-23 Thread jianchao.wang
Hi Paolo

When I test execute the script, I got this
8:0 rbps=1000 wbps=0 riops=0 wiops=0 idle=0 latency=max

The idle is 0.
I'm afraid the io.low would not work.
Please refer to the following code in tg_set_limit

/* force user to configure all settings for low limit  */
if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] ||
  tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) ||
tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD ||//-> 
HERE
tg->latency_target_conf == DFL_LATENCY_TARGET) {
tg->bps[READ][LIMIT_LOW] = 0;
tg->bps[WRITE][LIMIT_LOW] = 0;
tg->iops[READ][LIMIT_LOW] = 0;
tg->iops[WRITE][LIMIT_LOW] = 0;
tg->idletime_threshold = DFL_IDLE_THRESHOLD;
tg->latency_target = DFL_LATENCY_TARGET;
} else if (index == LIMIT_LOW) {
tg->idletime_threshold = tg->idletime_threshold_conf;
tg->latency_target = tg->latency_target_conf;
}

blk_throtl_update_limit_valid(tg->td);


Thanks
Jianchao

On 04/23/2018 03:37 PM, Paolo Valente wrote:
> cd thr-lat-with-interference
> sudo ./thr-lat-with-interference.sh -b t -w 1 -W "1000 1000 
> 1000 1000 1000 1000" -n 6 -T "read read read read read read" 
> -R "0 0 0 0 0 0"


Re: testing io.low limit for blk-throttle

2018-04-23 Thread Paolo Valente


> Il giorno 23 apr 2018, alle ore 08:35, jianchao.wang 
>  ha scritto:
> 
> Hi Paolo
> 
> On 04/23/2018 01:32 PM, Paolo Valente wrote:
>> Thanks for sharing this fix.  I tried it too, but nothing changes in
>> my test :(> 
> 
> That's really sad.
> 
>> At this point, my doubt is still: am I getting io.low limit right?  I
>> understand that an I/O-bound group should be guaranteed a rbps at
>> least equal to the rbps set with io.low for that group (of course,
>> provided that the sum of io.low limits is lower than the rate at which
>> the device serves all the I/O generated by the groups).  Is this
>> really what io.low shall guarantee?
> 
> I agree with your point about this even if I'm not qualified to judge it.
> 

ok, thank for your feedback.

> On the other hand, could you share your test case and blk-throl config here ?
> 

I wrote the description of the test, and the way I made it (and so the way you 
can easily reproduce it exactly) in my first email. I'm repeating it here for 
your convenience.

With
- one group, the interfered, containing one process that does sequential
 reads with fio
- io.low set to 100MB/s for the interfered
- six other groups, the interferers, with each interferer containing one
 process doing sequential read with fio
- io.low set to 10MB/s for each interferer
- the workload executed on an SSD, with a 500MB/s of overall throughput
the interfered gets only 75MB/s.

In particular, the throughput of the interfered becomes lower and
lower as the number of interferers is increased.  So you can make it
become even much lower than the 75MB/s in the example above.  There
seems to be no control on bandwidth.

Am I doing something wrong?  Or did I simply misunderstand the goal of
io.low, and the only parameter for guaranteeing the desired bandwidth to
a group is io.max (to be used indirectly, by limiting the bandwidth of
the interferers)?

If useful for you, you can reproduce the above test very quickly, by
using the S suite [1] and typing:

cd thr-lat-with-interference
sudo ./thr-lat-with-interference.sh -b t -w 1 -W "1000 1000 
1000 1000 1000 1000" -n 6 -T "read read read read read read" -R 
"0 0 0 0 0 0"

[1] https://github.com/Algodev-github/S

> Thanks
> Jianchao



Re: testing io.low limit for blk-throttle

2018-04-23 Thread Paolo Valente


> Il giorno 23 apr 2018, alle ore 08:05, Joseph Qi  ha 
> scritto:
> 
> Hi Paolo,

Hi Joseph,
thanks for chiming in.

> What's your idle and latency config?

I didn't set them at all, as the only (explicit) requirement in my
basic test is that one of the group is guaranteed a minimum bps.


> IMO, io.low will allow others run more bandwidth if cgroup's average
> idle time is high or latency is low.

What you say here makes me think that I simply misunderstood the
purpose of io.low.  So, here is my problem/question: "I only need to
guarantee at least a minimum bandwidth, in bps, to a group.  Is the
io.low limit the way to go?"

I know that I can use just io.max (unless I misunderstood the goal of
io.max too :( ), but my extra purpose would be to not waste bandwidth
when some group is idle.  Yet, as for now, io.low is not working even
for the first, simpler goal, i.e., guaranteeing a minimum bandwidth to
one group when all groups are active.

Am I getting something wrong?

Otherwise, if there are some special values for idle and latency
parameters that would make throttle work for my test, I'll be of
course happy to try them.

Thanks,
Paolo

> In such cases, low limit won't get
> guaranteed.
> 
> Thanks,
> Joseph
> 
> On 18/4/22 17:23, Paolo Valente wrote:
>> Hi Shaohua, all,
>> at last, I started testing your io.low limit for blk-throttle.  One of
>> the things I'm interested in is how good throttling is in achieving a
>> high throughput in the presence of realistic, variable workloads.
>> 
>> However, I seem to have bumped into a totally different problem.  The
>> io.low parameter doesn't seem to guarantee what I understand it is meant
>> to guarantee: minimum per-group bandwidths.  For example, with
>> - one group, the interfered, containing one process that does sequential
>>  reads with fio
>> - io.low set to 100MB/s for the interfered
>> - six other groups, the interferers, with each interferer containing one
>>  process doing sequential read with fio
>> - io.low set to 10MB/s for each interferer
>> - the workload executed on an SSD, with a 500MB/s of overall throughput
>> the interfered gets only 75MB/s.
>> 
>> In particular, the throughput of the interfered becomes lower and
>> lower as the number of interferers is increased.  So you can make it
>> become even much lower than the 75MB/s in the example above.  There
>> seems to be no control on bandwidth.
>> 
>> Am I doing something wrong?  Or did I simply misunderstand the goal of
>> io.low, and the only parameter for guaranteeing the desired bandwidth to
>> a group is io.max (to be used indirectly, by limiting the bandwidth of
>> the interferers)?
>> 
>> If useful for you, you can reproduce the above test very quickly, by
>> using the S suite [1] and typing:
>> 
>> cd thr-lat-with-interference
>> sudo ./thr-lat-with-interference.sh -b t -w 1 -W "1000 1000 
>> 1000 1000 1000 1000" -n 6 -T "read read read read read read" 
>> -R "0 0 0 0 0 0"
>> 
>> Looking forward to your feedback,
>> Paolo
>> 
>> [1] 
>> 



Re: [PATCH] blk-mq: start request gstate with gen 1

2018-04-23 Thread Martin Steigerwald
Hi Jianchao.

jianchao.wang - 17.04.18, 16:34:
> On 04/17/2018 08:10 PM, Martin Steigerwald wrote:
> > For testing it I add it to 4.16.2 with the patches I have already?
> 
> You could try to only apply this patch to have a test. 

Compiling now to have a test.

Thanks,
-- 
Martin




Re: testing io.low limit for blk-throttle

2018-04-23 Thread jianchao.wang
Hi Paolo

On 04/23/2018 01:32 PM, Paolo Valente wrote:
> Thanks for sharing this fix.  I tried it too, but nothing changes in
> my test :(> 

That's really sad.

> At this point, my doubt is still: am I getting io.low limit right?  I
> understand that an I/O-bound group should be guaranteed a rbps at
> least equal to the rbps set with io.low for that group (of course,
> provided that the sum of io.low limits is lower than the rate at which
> the device serves all the I/O generated by the groups).  Is this
> really what io.low shall guarantee?

I agree with your point about this even if I'm not qualified to judge it.

On the other hand, could you share your test case and blk-throl config here ?

Thanks
Jianchao


Re: testing io.low limit for blk-throttle

2018-04-23 Thread Joseph Qi
Hi Paolo,
What's your idle and latency config?
IMO, io.low will allow others run more bandwidth if cgroup's average
idle time is high or latency is low. In such cases, low limit won't get
guaranteed.

Thanks,
Joseph

On 18/4/22 17:23, Paolo Valente wrote:
> Hi Shaohua, all,
> at last, I started testing your io.low limit for blk-throttle.  One of
> the things I'm interested in is how good throttling is in achieving a
> high throughput in the presence of realistic, variable workloads.
> 
> However, I seem to have bumped into a totally different problem.  The
> io.low parameter doesn't seem to guarantee what I understand it is meant
> to guarantee: minimum per-group bandwidths.  For example, with
> - one group, the interfered, containing one process that does sequential
>   reads with fio
> - io.low set to 100MB/s for the interfered
> - six other groups, the interferers, with each interferer containing one
>   process doing sequential read with fio
> - io.low set to 10MB/s for each interferer
> - the workload executed on an SSD, with a 500MB/s of overall throughput
> the interfered gets only 75MB/s.
> 
> In particular, the throughput of the interfered becomes lower and
> lower as the number of interferers is increased.  So you can make it
> become even much lower than the 75MB/s in the example above.  There
> seems to be no control on bandwidth.
> 
> Am I doing something wrong?  Or did I simply misunderstand the goal of
> io.low, and the only parameter for guaranteeing the desired bandwidth to
> a group is io.max (to be used indirectly, by limiting the bandwidth of
> the interferers)?
> 
> If useful for you, you can reproduce the above test very quickly, by
> using the S suite [1] and typing:
> 
> cd thr-lat-with-interference
> sudo ./thr-lat-with-interference.sh -b t -w 1 -W "1000 1000 
> 1000 1000 1000 1000" -n 6 -T "read read read read read read" 
> -R "0 0 0 0 0 0"
> 
> Looking forward to your feedback,
> Paolo
> 
> [1] 
>