[PATCH v3 05/21] alpha: return error code from alpha_pci_map_sg()

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

pci_map_single_1() can fail for different reasons, but since the only
supported type of error return is DMA_MAPPING_ERROR, we coalesce those
errors into -EIO.

-ENOMEM is returned when no page tables can be allocated.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Richard Henderson 
Cc: Ivan Kokshaysky 
Cc: Matt Turner 
---
 arch/alpha/kernel/pci_iommu.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index 35d7b3096d6e..21f9ac101324 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -649,7 +649,9 @@ static int alpha_pci_map_sg(struct device *dev, struct 
scatterlist *sg,
sg->dma_address
  = pci_map_single_1(pdev, SG_ENT_VIRT_ADDRESS(sg),
 sg->length, dac_allowed);
-   return sg->dma_address != DMA_MAPPING_ERROR;
+   if (sg->dma_address == DMA_MAPPING_ERROR)
+   return -EIO;
+   return 1;
}
 
start = sg;
@@ -685,8 +687,10 @@ static int alpha_pci_map_sg(struct device *dev, struct 
scatterlist *sg,
if (out < end)
out->dma_length = 0;
 
-   if (out - start == 0)
+   if (out - start == 0) {
printk(KERN_WARNING "pci_map_sg failed: no entries?\n");
+   return -ENOMEM;
+   }
DBGA("pci_map_sg: %ld entries\n", out - start);
 
return out - start;
@@ -699,7 +703,7 @@ static int alpha_pci_map_sg(struct device *dev, struct 
scatterlist *sg,
   entries.  Unmap them now.  */
if (out > start)
pci_unmap_sg(pdev, start, out - start, dir);
-   return 0;
+   return -ENOMEM;
 }
 
 /* Unmap a set of streaming mode DMA translations.  Again, cpu read
-- 
2.20.1



[PATCH v3 07/21] ARM/dma-mapping: don't set failed sg dma_address to DMA_MAPPING_ERROR

2021-07-29 Thread Logan Gunthorpe
Setting the ->dma_address to DMA_MAPPING_ERROR is not part of the
->map_sg calling convention, so remove it.

Link: https://lore.kernel.org/linux-mips/20210716063241.gc13...@lst.de/
Suggested-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
Cc: Russell King 
Cc: Thomas Bogendoerfer 
---
 arch/arm/mm/dma-mapping.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 113b9cb3701b..4b61541853ea 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1632,7 +1632,6 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
for (i = 1; i < nents; i++) {
s = sg_next(s);
 
-   s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
 
		if (s->offset || (size & ~PAGE_MASK) || size + s->length > max) {
-- 
2.20.1



[PATCH v3 06/21] ARM/dma-mapping: return error code from .map_sg() ops

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.
In the case of a DMA_MAPPING_ERROR, -EIO is returned. Otherwise,
-ENOMEM or -EINVAL is returned depending on the error from
__map_sg_chunk().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Russell King 
Cc: Thomas Bogendoerfer 
---
 arch/arm/mm/dma-mapping.c | 25 -
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index c4b8df2ad328..113b9cb3701b 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -980,7 +980,7 @@ int arm_dma_map_sg(struct device *dev, struct scatterlist 
*sg, int nents,
 {
const struct dma_map_ops *ops = get_dma_ops(dev);
struct scatterlist *s;
-   int i, j;
+   int i, j, ret;
 
for_each_sg(sg, s, nents, i) {
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
@@ -988,15 +988,17 @@ int arm_dma_map_sg(struct device *dev, struct scatterlist 
*sg, int nents,
 #endif
s->dma_address = ops->map_page(dev, sg_page(s), s->offset,
s->length, dir, attrs);
-   if (dma_mapping_error(dev, s->dma_address))
+   if (dma_mapping_error(dev, s->dma_address)) {
+   ret = -EIO;
goto bad_mapping;
+   }
}
return nents;
 
  bad_mapping:
for_each_sg(sg, s, i, j)
ops->unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir, 
attrs);
-   return 0;
+   return ret;
 }
 
 /**
@@ -1622,7 +1624,7 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
 bool is_coherent)
 {
struct scatterlist *s = sg, *dma = sg, *start = sg;
-   int i, count = 0;
+   int i, count = 0, ret;
unsigned int offset = s->offset;
unsigned int size = s->offset + s->length;
unsigned int max = dma_get_max_seg_size(dev);
@@ -1634,8 +1636,10 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
s->dma_length = 0;
 
		if (s->offset || (size & ~PAGE_MASK) || size + s->length > max) {
-   if (__map_sg_chunk(dev, start, size, &dma->dma_address,
-   dir, attrs, is_coherent) < 0)
+   ret = __map_sg_chunk(dev, start, size,
+    &dma->dma_address, dir, attrs,
+    is_coherent);
+   if (ret < 0)
goto bad_mapping;
 
dma->dma_address += offset;
@@ -1648,8 +1652,9 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
}
size += s->length;
}
-   if (__map_sg_chunk(dev, start, size, &dma->dma_address, dir, attrs,
-   is_coherent) < 0)
+   ret = __map_sg_chunk(dev, start, size, &dma->dma_address, dir, attrs,
+    is_coherent);
+   if (ret < 0)
goto bad_mapping;
 
dma->dma_address += offset;
@@ -1660,7 +1665,9 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
 bad_mapping:
for_each_sg(sg, s, count, i)
__iommu_remove_mapping(dev, sg_dma_address(s), sg_dma_len(s));
-   return 0;
+   if (ret == -ENOMEM)
+   return ret;
+   return -EINVAL;
 }
 
 /**
-- 
2.20.1



[PATCH v3 10/21] powerpc/iommu: return error code from .map_sg() ops

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

Propagate the error up if vio_dma_iommu_map_sg() fails.

ppc_iommu_map_sg() may fail either because of iommu_range_alloc() or
because of tbl->it_ops->set(). The former only supports returning an
error with DMA_MAPPING_ERROR and an examination of the latter indicates
that it may return arch-specific errors (for example,
tce_buildmulti_pSeriesLP()). Hence, coalesce all of those errors into
-EIO, per the documentation on dma_map_sgtable().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Geoff Levand 
---
 arch/powerpc/kernel/iommu.c | 4 ++--
 arch/powerpc/platforms/ps3/system-bus.c | 2 +-
 arch/powerpc/platforms/pseries/vio.c| 5 +++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2af89a5e379f..a8ec4fe42817 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -473,7 +473,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table 
*tbl,
BUG_ON(direction == DMA_NONE);
 
if ((nelems == 0) || !tbl)
-   return 0;
+   return -EINVAL;
 
outs = s = segstart = &sglist[0];
outcount = 1;
@@ -599,7 +599,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table 
*tbl,
if (s == outs)
break;
}
-   return 0;
+   return -EIO;
 }
 
 
diff --git a/arch/powerpc/platforms/ps3/system-bus.c 
b/arch/powerpc/platforms/ps3/system-bus.c
index 1a5665875165..c54eb46f0cfb 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -663,7 +663,7 @@ static int ps3_ioc0_map_sg(struct device *_dev, struct 
scatterlist *sg,
   unsigned long attrs)
 {
BUG();
-   return 0;
+   return -EINVAL;
 }
 
 static void ps3_sb_unmap_sg(struct device *_dev, struct scatterlist *sg,
diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index e00f3725ec96..e31e59c54f30 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -560,7 +560,8 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct 
scatterlist *sglist,
for_each_sg(sglist, sgl, nelems, count)
alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
 
-   if (vio_cmo_alloc(viodev, alloc_size))
+   ret = vio_cmo_alloc(viodev, alloc_size);
+   if (ret)
goto out_fail;
ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, dma_get_mask(dev),
direction, attrs);
@@ -577,7 +578,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct 
scatterlist *sglist,
vio_cmo_dealloc(viodev, alloc_size);
 out_fail:
atomic_inc(&viodev->cmo.allocs_failed);
-   return 0;
+   return ret;
 }
 
 static void vio_dma_iommu_unmap_sg(struct device *dev,
-- 
2.20.1



[PATCH v3 08/21] ia64/sba_iommu: return error code from sba_map_sg_attrs()

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

In the case of a dma_mapping_error(), return -EIO, as the actual cause
is opaque here.

sba_coalesce_chunks() may presently only fail if sba_alloc_range()
fails, which in turn only fails if the iommu is out of mapping
resources, hence -ENOMEM is returned in that case.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Michael Ellerman 
Cc: Niklas Schnelle 
Cc: Thomas Bogendoerfer 
---
 arch/ia64/hp/common/sba_iommu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 9148ddbf02e5..430c166b68cd 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1458,8 +1458,8 @@ static int sba_map_sg_attrs(struct device *dev, struct 
scatterlist *sglist,
sglist->dma_length = sglist->length;
sglist->dma_address = sba_map_page(dev, sg_page(sglist),
sglist->offset, sglist->length, dir, attrs);
-   if (dma_mapping_error(dev, sglist->dma_address))
-   return 0;
+   if (dma_mapping_error(dev, sglist->dma_address))
+   return -EIO;
return 1;
}
 
@@ -1486,7 +1486,7 @@ static int sba_map_sg_attrs(struct device *dev, struct 
scatterlist *sglist,
coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
if (coalesced < 0) {
sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs);
-   return 0;
+   return -ENOMEM;
}
 
/*
-- 
2.20.1



[PATCH v3 11/21] powerpc/iommu: don't set failed sg dma_address to DMA_MAPPING_ERROR

2021-07-29 Thread Logan Gunthorpe
Setting the ->dma_address to DMA_MAPPING_ERROR is not part of
the ->map_sg calling convention, so remove it.

Link: https://lore.kernel.org/linux-mips/20210716063241.gc13...@lst.de/
Suggested-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Geoff Levand 
---
 arch/powerpc/kernel/iommu.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index a8ec4fe42817..30b7736f0896 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -575,7 +575,6 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table 
*tbl,
 */
if (outcount < incount) {
outs = sg_next(outs);
-   outs->dma_address = DMA_MAPPING_ERROR;
outs->dma_length = 0;
}
 
@@ -593,7 +592,6 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table 
*tbl,
npages = iommu_num_pages(s->dma_address, s->dma_length,
 IOMMU_PAGE_SIZE(tbl));
__iommu_free(tbl, vaddr, npages);
-   s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
}
if (s == outs)
-- 
2.20.1



[PATCH v3 09/21] MIPS/jazzdma: return error code from jazz_dma_map_sg()

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

vdma_alloc() may fail for different reasons, but since it only supports
indicating an error via a return of DMA_MAPPING_ERROR, we coalesce the
different reasons into -EIO as is documented on dma_map_sgtable().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Thomas Bogendoerfer 
---
 arch/mips/jazz/jazzdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c
index 461457b28982..eabddb89d221 100644
--- a/arch/mips/jazz/jazzdma.c
+++ b/arch/mips/jazz/jazzdma.c
@@ -552,7 +552,7 @@ static int jazz_dma_map_sg(struct device *dev, struct 
scatterlist *sglist,
dir);
sg->dma_address = vdma_alloc(sg_phys(sg), sg->length);
if (sg->dma_address == DMA_MAPPING_ERROR)
-   return 0;
+   return -EIO;
sg_dma_len(sg) = sg->length;
}
 
-- 
2.20.1



[PATCH v3 13/21] s390/pci: don't set failed sg dma_address to DMA_MAPPING_ERROR

2021-07-29 Thread Logan Gunthorpe
Setting the ->dma_address to DMA_MAPPING_ERROR is not part of
the ->map_sg calling convention, so remove it.

Link: https://lore.kernel.org/linux-mips/20210716063241.gc13...@lst.de/
Suggested-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
Cc: Niklas Schnelle 
Cc: Gerald Schaefer 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
---
 arch/s390/pci/pci_dma.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index c78b02012764..be48e5b5bfcf 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -492,7 +492,6 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
for (i = 1; i < nr_elements; i++) {
s = sg_next(s);
 
-   s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
 
if (s->offset || (size & ~PAGE_MASK) ||
-- 
2.20.1



[PATCH v3 12/21] s390/pci: return error code from s390_dma_map_sg()

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

So propagate the error from __s390_dma_map_sg() up. __s390_dma_map_sg()
returns either -ENOMEM on allocation failure or -EINVAL, both of which
match what's expected by dma_map_sgtable().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Acked-by: Niklas Schnelle 
Cc: Gerald Schaefer 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
---
 arch/s390/pci/pci_dma.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index ebc9a49523aa..c78b02012764 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -487,7 +487,7 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
unsigned int max = dma_get_max_seg_size(dev);
unsigned int size = s->offset + s->length;
unsigned int offset = s->offset;
-   int count = 0, i;
+   int count = 0, i, ret;
 
for (i = 1; i < nr_elements; i++) {
s = sg_next(s);
@@ -497,8 +497,9 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
 
if (s->offset || (size & ~PAGE_MASK) ||
size + s->length > max) {
-   if (__s390_dma_map_sg(dev, start, size,
- &dma->dma_address, dir))
+   ret = __s390_dma_map_sg(dev, start, size,
+   &dma->dma_address, dir);
+   if (ret)
goto unmap;
 
dma->dma_address += offset;
@@ -511,7 +512,8 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
}
size += s->length;
}
-   if (__s390_dma_map_sg(dev, start, size, &dma->dma_address, dir))
+   ret = __s390_dma_map_sg(dev, start, size, &dma->dma_address, dir);
+   if (ret)
goto unmap;
 
dma->dma_address += offset;
@@ -523,7 +525,7 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s),
 dir, attrs);
 
-   return 0;
+   return ret;
 }
 
 static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-- 
2.20.1



[PATCH v3 14/21] sparc/iommu: return error codes from .map_sg() ops

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

Returning an errno from __sbus_iommu_map_sg() results in
sbus_iommu_map_sg_gflush() and sbus_iommu_map_sg_pflush() returning an
errno, as those functions are wrappers around __sbus_iommu_map_sg().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: "David S. Miller" 
Cc: Niklas Schnelle 
Cc: Michael Ellerman 
---
 arch/sparc/kernel/iommu.c | 4 ++--
 arch/sparc/kernel/pci_sun4v.c | 4 ++--
 arch/sparc/mm/iommu.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index a034f571d869..0589acd34201 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -448,7 +448,7 @@ static int dma_4u_map_sg(struct device *dev, struct 
scatterlist *sglist,
iommu = dev->archdata.iommu;
strbuf = dev->archdata.stc;
if (nelems == 0 || !iommu)
-   return 0;
+   return -EINVAL;
 
spin_lock_irqsave(&iommu->lock, flags);
 
@@ -580,7 +580,7 @@ static int dma_4u_map_sg(struct device *dev, struct 
scatterlist *sglist,
}
spin_unlock_irqrestore(&iommu->lock, flags);
 
-   return 0;
+   return -EINVAL;
 }
 
 /* If contexts are being used, they are the same in all of the mappings
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 9de57e88f7a1..d90e80fa5705 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -486,7 +486,7 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist,
 
iommu = dev->archdata.iommu;
if (nelems == 0 || !iommu)
-   return 0;
+   return -EINVAL;
atu = iommu->atu;
 
prot = HV_PCI_MAP_ATTR_READ;
@@ -619,7 +619,7 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist,
}
local_irq_restore(flags);
 
-   return 0;
+   return -EINVAL;
 }
 
 static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c
index 0c0342e5b10d..9e3f6933ca13 100644
--- a/arch/sparc/mm/iommu.c
+++ b/arch/sparc/mm/iommu.c
@@ -256,7 +256,7 @@ static int __sbus_iommu_map_sg(struct device *dev, struct 
scatterlist *sgl,
sg->dma_address =__sbus_iommu_map_page(dev, sg_page(sg),
sg->offset, sg->length, per_page_flush);
if (sg->dma_address == DMA_MAPPING_ERROR)
-   return 0;
+   return -EIO;
sg->dma_length = sg->length;
}
 
-- 
2.20.1



[PATCH v3 16/21] parisc: return error code from .map_sg() ops

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.
Return -EINVAL if the ioc cannot be obtained.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: "James E.J. Bottomley" 
Cc: Helge Deller 
---
 drivers/parisc/ccio-dma.c  | 2 +-
 drivers/parisc/sba_iommu.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index b5f9ee81a46c..452e72b7bd01 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -918,7 +918,7 @@ ccio_map_sg(struct device *dev, struct scatterlist *sglist, 
int nents,
BUG_ON(!dev);
ioc = GET_IOC(dev);
if (!ioc)
-   return 0;
+   return -EINVAL;

DBG_RUN_SG("%s() START %d entries\n", __func__, nents);
 
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index dce4cdf786cd..e60690d38d67 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -947,7 +947,7 @@ sba_map_sg(struct device *dev, struct scatterlist *sglist, 
int nents,
 
ioc = GET_IOC(dev);
if (!ioc)
-   return 0;
+   return -EINVAL;
 
/* Fast path single entry scatterlists. */
if (nents == 1) {
-- 
2.20.1



[PATCH v3 15/21] sparc/iommu: don't set failed sg dma_address to DMA_MAPPING_ERROR

2021-07-29 Thread Logan Gunthorpe
Setting the ->dma_address to DMA_MAPPING_ERROR is not part of
the ->map_sg calling convention, so remove it.

Link: https://lore.kernel.org/linux-mips/20210716063241.gc13...@lst.de/
Suggested-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
Cc: "David S. Miller" 
Cc: Niklas Schnelle 
Cc: Michael Ellerman 
---
 arch/sparc/kernel/iommu.c | 2 --
 arch/sparc/kernel/pci_sun4v.c | 2 --
 2 files changed, 4 deletions(-)

diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 0589acd34201..da0363692528 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -546,7 +546,6 @@ static int dma_4u_map_sg(struct device *dev, struct 
scatterlist *sglist,
 
if (outcount < incount) {
outs = sg_next(outs);
-   outs->dma_address = DMA_MAPPING_ERROR;
outs->dma_length = 0;
}
 
@@ -572,7 +571,6 @@ static int dma_4u_map_sg(struct device *dev, struct 
scatterlist *sglist,
iommu_tbl_range_free(&iommu->tbl, vaddr, npages,
 IOMMU_ERROR_CODE);
 
-   s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
}
if (s == outs)
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index d90e80fa5705..384480971805 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -594,7 +594,6 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist,
 
if (outcount < incount) {
outs = sg_next(outs);
-   outs->dma_address = DMA_MAPPING_ERROR;
outs->dma_length = 0;
}
 
@@ -611,7 +610,6 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist,
iommu_tbl_range_free(tbl, vaddr, npages,
 IOMMU_ERROR_CODE);
/* XXX demap? XXX */
-   s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
}
if (s == outs)
-- 
2.20.1



[PATCH v3 19/21] x86/amd_gart: don't set failed sg dma_address to DMA_MAPPING_ERROR

2021-07-29 Thread Logan Gunthorpe
Setting the ->dma_address to DMA_MAPPING_ERROR is not part of
the ->map_sg calling convention, so remove it.

Link: https://lore.kernel.org/linux-mips/20210716063241.gc13...@lst.de/
Suggested-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Niklas Schnelle 
Cc: Thomas Bogendoerfer 
Cc: Michael Ellerman 
---
 arch/x86/kernel/amd_gart_64.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 46aea9a4f26b..ed837383de5c 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -458,8 +458,6 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
panic("dma_map_sg: overflow on %lu pages\n", pages);
 
iommu_full(dev, pages << PAGE_SHIFT, dir);
-   for_each_sg(sg, s, nents, i)
-   s->dma_address = DMA_MAPPING_ERROR;
return ret;
 }
 
-- 
2.20.1



[PATCH v3 18/21] x86/amd_gart: return error code from gart_map_sg()

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

So make __dma_map_cont() return a valid errno (which is then propagated
to gart_map_sg() via dma_map_cont()) and return it in case of failure.

Also, return -EINVAL in case of invalid nents.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Niklas Schnelle 
Cc: Thomas Bogendoerfer 
Cc: Michael Ellerman 
---
 arch/x86/kernel/amd_gart_64.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 9ac696487b13..46aea9a4f26b 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -331,7 +331,7 @@ static int __dma_map_cont(struct device *dev, struct 
scatterlist *start,
int i;
 
if (iommu_start == -1)
-   return -1;
+   return -ENOMEM;
 
for_each_sg(start, s, nelems, i) {
unsigned long pages, addr;
@@ -380,13 +380,13 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
   enum dma_data_direction dir, unsigned long attrs)
 {
struct scatterlist *s, *ps, *start_sg, *sgmap;
-   int need = 0, nextneed, i, out, start;
+   int need = 0, nextneed, i, out, start, ret;
unsigned long pages = 0;
unsigned int seg_size;
unsigned int max_seg_size;
 
if (nents == 0)
-   return 0;
+   return -EINVAL;
 
out = 0;
start   = 0;
@@ -414,8 +414,9 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
if (!iommu_merge || !nextneed || !need || s->offset ||
(s->length + seg_size > max_seg_size) ||
(ps->offset + ps->length) % PAGE_SIZE) {
-   if (dma_map_cont(dev, start_sg, i - start,
-sgmap, pages, need) < 0)
+   ret = dma_map_cont(dev, start_sg, i - start,
+  sgmap, pages, need);
+   if (ret < 0)
goto error;
out++;
 
@@ -432,7 +433,8 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
ps = s;
}
-   if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
+   ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need);
+   if (ret < 0)
goto error;
out++;
flush_gart();
@@ -458,7 +460,7 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
iommu_full(dev, pages << PAGE_SHIFT, dir);
for_each_sg(sg, s, nents, i)
s->dma_address = DMA_MAPPING_ERROR;
-   return 0;
+   return ret;
 }
 
 /* allocate and map a coherent mapping */
-- 
2.20.1



[PATCH v3 00/21] .map_sg() error cleanup

2021-07-29 Thread Logan Gunthorpe
Hi,

This v3 of the series is spun out and expanded from my work to add
P2PDMA support to DMA map operations[1]. v2 is at [2]. The main changes
since v1 are to more carefully define the meaning of the error codes for
dma_map_sgtable().

The P2PDMA work requires distinguishing different error conditions in
a map_sg operation. dma_map_sgtable() already allows for returning an
error code (whereas dma_map_sg() is only allowed to return zero);
however, it currently only returns -EINVAL when a .map_sg() call returns
zero.

This series cleans up all .map_sg() implementations to return appropriate
error codes. After the cleanup, dma_map_sg() will still return zero on
error; however, dma_map_sgtable() will pass through the error code from
the .map_sg() call. Thanks go to Martin Oliveira for doing a lot of the
cleanup of the more obscure implementations.
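
For driver writers, the practical difference shows up at the call site.
Below is a minimal sketch of the two conventions after this series; the
example_map() helper and its buffer setup are hypothetical:

	/* Sketch only: how a hypothetical caller sees the two conventions. */
	static int example_map(struct device *dev, struct sg_table *sgt)
	{
		int ret;

		/* dma_map_sg() keeps its legacy contract: 0 on failure. */
		if (dma_map_sg(dev, sgt->sgl, sgt->orig_nents,
			       DMA_TO_DEVICE) == 0)
			return -EIO;	/* the cause is opaque to the caller */
		dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, DMA_TO_DEVICE);

		/* dma_map_sgtable() now passes the .map_sg() errno through. */
		ret = dma_map_sgtable(dev, sgt, DMA_TO_DEVICE, 0);
		if (ret)
			return ret;	/* -EINVAL, -ENOMEM or -EIO */
		dma_unmap_sgtable(dev, sgt, DMA_TO_DEVICE, 0);
		return 0;
	}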

The patch set is based off of v5.14-rc2 and a git repo can be found
here:

  https://github.com/sbates130272/linux-p2pmem map_sg_err_cleanup_v2

Thanks,

Logan

[1] 
https://lore.kernel.org/linux-block/20210513223203.5542-1-log...@deltatee.com/
[2] 
https://lore.kernel.org/linux-mips/20210723175008.22410-1-log...@deltatee.com/

--

Changes in v3:
  - Move the validation of errnos into __dma_map_sg_attrs() (Per
Christoph)
  - Fix the out of date commit message in patch 21 (Per Eike)
Changes in v2:
  - Attempt to define the meanings of the errors returned by
dma_map_sgtable() and restrict the valid return codes of
.map_sg implementations. (Per Christoph)
  - Change dma_map_sgtable() to EXPORT_SYMBOL_GPL() (Per Christoph)
  - Add patches to remove the erroneous setting of sg->dma_address
to DMA_MAPPING_ERROR in a few .map_sg() implementations. (Per
Christoph).

--

Logan Gunthorpe (10):
  dma-mapping: Allow map_sg() ops to return negative error codes
  dma-direct: Return appropriate error code from dma_direct_map_sg()
  iommu: Return full error code from iommu_map_sg[_atomic]()
  dma-iommu: Return error code from iommu_dma_map_sg()
  ARM/dma-mapping: don't set failed sg dma_address to DMA_MAPPING_ERROR
  powerpc/iommu: don't set failed sg dma_address to DMA_MAPPING_ERROR
  s390/pci: don't set failed sg dma_address to DMA_MAPPING_ERROR
  sparc/iommu: don't set failed sg dma_address to DMA_MAPPING_ERROR
  x86/amd_gart: don't set failed sg dma_address to DMA_MAPPING_ERROR
  dma-mapping: Disallow .map_sg operations from returning zero on error

Martin Oliveira (11):
  alpha: return error code from alpha_pci_map_sg()
  ARM/dma-mapping: return error code from .map_sg() ops
  ia64/sba_iommu: return error code from sba_map_sg_attrs()
  MIPS/jazzdma: return error code from jazz_dma_map_sg()
  powerpc/iommu: return error code from .map_sg() ops
  s390/pci: return error code from s390_dma_map_sg()
  sparc/iommu: return error codes from .map_sg() ops
  parisc: return error code from .map_sg() ops
  xen: swiotlb: return error code from xen_swiotlb_map_sg()
  x86/amd_gart: return error code from gart_map_sg()
  dma-mapping: return error code from dma_dummy_map_sg()

 arch/alpha/kernel/pci_iommu.c   | 10 ++-
 arch/arm/mm/dma-mapping.c   | 26 +---
 arch/ia64/hp/common/sba_iommu.c |  6 +-
 arch/mips/jazz/jazzdma.c|  2 +-
 arch/powerpc/kernel/iommu.c |  6 +-
 arch/powerpc/platforms/ps3/system-bus.c |  2 +-
 arch/powerpc/platforms/pseries/vio.c|  5 +-
 arch/s390/pci/pci_dma.c | 13 ++--
 arch/sparc/kernel/iommu.c   |  6 +-
 arch/sparc/kernel/pci_sun4v.c   |  6 +-
 arch/sparc/mm/iommu.c   |  2 +-
 arch/x86/kernel/amd_gart_64.c   | 18 +++---
 drivers/iommu/dma-iommu.c   | 23 ---
 drivers/iommu/iommu.c   | 15 +++--
 drivers/parisc/ccio-dma.c   |  2 +-
 drivers/parisc/sba_iommu.c  |  2 +-
 drivers/xen/swiotlb-xen.c   |  2 +-
 include/linux/dma-map-ops.h |  5 +-
 include/linux/dma-mapping.h | 35 +++
 include/linux/iommu.h   | 22 +++
 kernel/dma/direct.c |  2 +-
 kernel/dma/dummy.c  |  2 +-
 kernel/dma/mapping.c| 82 ++---
 23 files changed, 177 insertions(+), 117 deletions(-)


base-commit: ff1176468d368232b684f75e82563369208bc371
--
2.20.1


[PATCH v3 01/21] dma-mapping: Allow map_sg() ops to return negative error codes

2021-07-29 Thread Logan Gunthorpe
Allow dma_map_sgtable() to pass errors from the map_sg() ops. This
will be required for returning appropriate error codes when mapping
P2PDMA memory.

Introduce __dma_map_sg_attrs() which will return the raw error code
from the map_sg operation (whether it be negative or zero). Then add a
dma_map_sg_attrs() wrapper to convert any negative errors to zero to
satisfy the existing calling convention.

dma_map_sgtable() defines three error codes that .map_sg implementations
are allowed to return: -EINVAL, -ENOMEM and -EIO. The last of these
is a generic return for cases that are passing DMA_MAPPING_ERROR
through.

dma_map_sgtable() will convert a zero error return for old map_sg() ops
into a -EIO return and return any negative errors as reported.

This allows map_sg implementations to start returning multiple
negative error codes. Legacy map_sg implementations can continue
to return zero until they are all converted.

Signed-off-by: Logan Gunthorpe 
---
 include/linux/dma-map-ops.h |  5 ++-
 include/linux/dma-mapping.h | 35 
 kernel/dma/mapping.c| 84 +
 3 files changed, 86 insertions(+), 38 deletions(-)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 0d53a96a3d64..2f842498c448 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -41,8 +41,9 @@ struct dma_map_ops {
size_t size, enum dma_data_direction dir,
unsigned long attrs);
/*
-* map_sg returns 0 on error and a value > 0 on success.
-* It should never return a value < 0.
+* map_sg should return a negative error code on error. See
+* dma_map_sgtable() for a list of appropriate error codes
+* and their meanings.
 */
int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, unsigned long attrs);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 183e7103a66d..daa1e360f0ee 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -110,6 +110,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist 
*sg, int nents,
 void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
  int nents, enum dma_data_direction dir,
  unsigned long attrs);
+int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
+   enum dma_data_direction dir, unsigned long attrs);
 dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs);
 void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
@@ -174,6 +176,11 @@ static inline void dma_unmap_sg_attrs(struct device *dev,
unsigned long attrs)
 {
 }
+static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
+   enum dma_data_direction dir, unsigned long attrs)
+{
+   return -EOPNOTSUPP;
+}
 static inline dma_addr_t dma_map_resource(struct device *dev,
phys_addr_t phys_addr, size_t size, enum dma_data_direction dir,
unsigned long attrs)
@@ -343,34 +350,6 @@ static inline void dma_sync_single_range_for_device(struct 
device *dev,
return dma_sync_single_for_device(dev, addr + offset, size, dir);
 }
 
-/**
- * dma_map_sgtable - Map the given buffer for DMA
- * @dev:   The device for which to perform the DMA operation
- * @sgt:   The sg_table object describing the buffer
- * @dir:   DMA direction
- * @attrs: Optional DMA attributes for the map operation
- *
- * Maps a buffer described by a scatterlist stored in the given sg_table
- * object for the @dir DMA operation by the @dev device. After success the
- * ownership for the buffer is transferred to the DMA domain.  One has to
- * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the
- * ownership of the buffer back to the CPU domain before touching the
- * buffer by the CPU.
- *
- * Returns 0 on success or -EINVAL on error during mapping the buffer.
- */
-static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
-   enum dma_data_direction dir, unsigned long attrs)
-{
-   int nents;
-
-   nents = dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
-   if (nents <= 0)
-   return -EINVAL;
-   sgt->nents = nents;
-   return 0;
-}
-
 /**
  * dma_unmap_sgtable - Unmap the given buffer for DMA
  * @dev:   The device for which to perform the DMA operation
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 2b06a809d0b9..9f0bb56eb9aa 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -177,12 +177,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t 
addr, size_t size,
 }
 EXPORT_SYMBOL(dma_unmap_page_attrs);
 
-/*
- *

[PATCH v3 02/21] dma-direct: Return appropriate error code from dma_direct_map_sg()

2021-07-29 Thread Logan Gunthorpe
Now that the map_sg() op expects error codes instead of returning zero
on error, convert dma_direct_map_sg() to return an error code. Per the
documentation for dma_map_sgtable(), -EIO is returned for a
DMA_MAPPING_ERROR with an unknown cause.

Signed-off-by: Logan Gunthorpe 
---
 kernel/dma/direct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f737e3347059..f33ceb68aef2 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -411,7 +411,7 @@ int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl, int nents,
 
 out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
-   return 0;
+   return -EIO;
 }
 
 dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
-- 
2.20.1



[PATCH v3 04/21] dma-iommu: Return error code from iommu_dma_map_sg()

2021-07-29 Thread Logan Gunthorpe
Return the appropriate error code, -EINVAL or -ENOMEM, from
iommu_dma_map_sg(). If lower-level code returns -ENOMEM, then we
return it; other errors are coalesced into -EINVAL.

iommu_dma_map_sg_swiotlb() returns -EIO as it's an unknown error
from a call that returns DMA_MAPPING_ERROR.

Signed-off-by: Logan Gunthorpe 
Cc: Joerg Roedel 
Cc: Will Deacon 
---
 drivers/iommu/dma-iommu.c | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 98ba927aee1a..d9aaed080e68 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -972,7 +972,7 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, 
struct scatterlist *sg,
 
 out_unmap:
iommu_dma_unmap_sg_swiotlb(dev, sg, i, dir, attrs | 
DMA_ATTR_SKIP_CPU_SYNC);
-   return 0;
+   return -EIO;
 }
 
 /*
@@ -993,11 +993,13 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
dma_addr_t iova;
size_t iova_len = 0;
unsigned long mask = dma_get_seg_boundary(dev);
+   ssize_t ret;
int i;
 
-   if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
-   iommu_deferred_attach(dev, domain))
-   return 0;
+   if (static_branch_unlikely(&iommu_deferred_attach_enabled)) {
+   ret = iommu_deferred_attach(dev, domain);
+   if (ret)
+   goto out;
+   }
 
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
iommu_dma_sync_sg_for_device(dev, sg, nents, dir);
@@ -1045,14 +1047,17 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
}
 
iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
-   if (!iova)
+   if (!iova) {
+   ret = -ENOMEM;
goto out_restore_sg;
+   }
 
/*
 * We'll leave any physical concatenation to the IOMMU driver's
 * implementation - it knows better than we do.
 */
-   if (iommu_map_sg_atomic(domain, iova, sg, nents, prot) < iova_len)
+   ret = iommu_map_sg_atomic(domain, iova, sg, nents, prot);
+   if (ret < iova_len)
goto out_free_iova;
 
return __finalise_sg(dev, sg, nents, iova);
@@ -1061,7 +1066,11 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
iommu_dma_free_iova(cookie, iova, iova_len, NULL);
 out_restore_sg:
__invalidate_sg(sg, nents);
-   return 0;
+out:
+   if (ret == -ENOMEM)
+   return ret;
+   else
+   return -EINVAL;
 }
 
 static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-- 
2.20.1



[PATCH v3 21/21] dma-mapping: Disallow .map_sg operations from returning zero on error

2021-07-29 Thread Logan Gunthorpe
Now that all the .map_sg operations have been converted to returning
proper error codes, drop the code that handles a zero return value
and add a warning if a zero is returned.

Signed-off-by: Logan Gunthorpe 
---
 kernel/dma/mapping.c | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 9f0bb56eb9aa..cbcbdc877458 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -196,8 +196,8 @@ static int __dma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg,
 
if (ents > 0)
debug_dma_map_sg(dev, sg, nents, ents, dir);
-   else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM &&
- ents != -EIO && ents != 0))
+   else if (WARN_ON_ONCE((ents != -EINVAL && ents != -ENOMEM &&
+  ents != -EIO) || ents == 0))
return -EIO;
 
return ents;
@@ -262,9 +262,7 @@ int dma_map_sgtable(struct device *dev, struct sg_table 
*sgt,
int nents;
 
nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
-   if (nents == 0)
-   return -EIO;
-   else if (nents < 0)
+   if (nents < 0)
return nents;
 
sgt->nents = nents;
-- 
2.20.1



[PATCH v3 17/21] xen: swiotlb: return error code from xen_swiotlb_map_sg()

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

xen_swiotlb_map_sg() may only fail if xen_swiotlb_map_page() fails, but
xen_swiotlb_map_page() only supports returning errors as
DMA_MAPPING_ERROR. So coalesce all errors into -EIO, per the
documentation for dma_map_sgtable().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Reviewed-by: Boris Ostrovsky 
Cc: Konrad Rzeszutek Wilk 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
---
 drivers/xen/swiotlb-xen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 24d11861ac7d..85d58b720a24 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -509,7 +509,7 @@ xen_swiotlb_map_sg(struct device *dev, struct scatterlist 
*sgl, int nelems,
 out_unmap:
xen_swiotlb_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
sg_dma_len(sgl) = 0;
-   return 0;
+   return -EIO;
 }
 
 static void
-- 
2.20.1



[PATCH v3 03/21] iommu: Return full error code from iommu_map_sg[_atomic]()

2021-07-29 Thread Logan Gunthorpe
Convert to ssize_t return code so the return code from __iommu_map()
can be returned all the way down through dma_iommu_map_sg().
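
Caller-side, the ssize_t return lets a negative errno be distinguished
from a short mapping. A sketch, assuming domain/iova/sg/nents/prot are
set up elsewhere and iova_len is the total length expected:

	ssize_t mapped = iommu_map_sg_atomic(domain, iova, sg, nents, prot);

	if (mapped < 0)
		return mapped;		/* errno from __iommu_map() */
	if (mapped < iova_len)
		return -EINVAL;		/* mapped less than expected */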

Signed-off-by: Logan Gunthorpe 
Cc: Joerg Roedel 
Cc: Will Deacon 
---
 drivers/iommu/iommu.c | 15 +++
 include/linux/iommu.h | 22 +++---
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5419c4b9f27a..bf971b4e34aa 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2567,9 +2567,9 @@ size_t iommu_unmap_fast(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_unmap_fast);
 
-static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
-struct scatterlist *sg, unsigned int nents, int 
prot,
-gfp_t gfp)
+static ssize_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+   struct scatterlist *sg, unsigned int nents, int prot,
+   gfp_t gfp)
 {
const struct iommu_ops *ops = domain->ops;
size_t len = 0, mapped = 0;
@@ -2610,19 +2610,18 @@ static size_t __iommu_map_sg(struct iommu_domain 
*domain, unsigned long iova,
/* undo mappings already done */
iommu_unmap(domain, iova, mapped);
 
-   return 0;
-
+   return ret;
 }
 
-size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
-   struct scatterlist *sg, unsigned int nents, int prot)
+ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+struct scatterlist *sg, unsigned int nents, int prot)
 {
might_sleep();
return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_KERNEL);
 }
 EXPORT_SYMBOL_GPL(iommu_map_sg);
 
-size_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova,
+ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova,
struct scatterlist *sg, unsigned int nents, int prot)
 {
return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 32d448050bf7..9369458ba1bd 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -414,11 +414,11 @@ extern size_t iommu_unmap(struct iommu_domain *domain, 
unsigned long iova,
 extern size_t iommu_unmap_fast(struct iommu_domain *domain,
   unsigned long iova, size_t size,
   struct iommu_iotlb_gather *iotlb_gather);
-extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
-  struct scatterlist *sg,unsigned int nents, int prot);
-extern size_t iommu_map_sg_atomic(struct iommu_domain *domain,
- unsigned long iova, struct scatterlist *sg,
- unsigned int nents, int prot);
+extern ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+   struct scatterlist *sg, unsigned int nents, int prot);
+extern ssize_t iommu_map_sg_atomic(struct iommu_domain *domain,
+  unsigned long iova, struct scatterlist *sg,
+  unsigned int nents, int prot);
 extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t 
iova);
 extern void iommu_set_fault_handler(struct iommu_domain *domain,
iommu_fault_handler_t handler, void *token);
@@ -679,18 +679,18 @@ static inline size_t iommu_unmap_fast(struct iommu_domain 
*domain,
return 0;
 }
 
-static inline size_t iommu_map_sg(struct iommu_domain *domain,
- unsigned long iova, struct scatterlist *sg,
- unsigned int nents, int prot)
+static inline ssize_t iommu_map_sg(struct iommu_domain *domain,
+  unsigned long iova, struct scatterlist *sg,
+  unsigned int nents, int prot)
 {
-   return 0;
+   return -ENODEV;
 }
 
-static inline size_t iommu_map_sg_atomic(struct iommu_domain *domain,
+static inline ssize_t iommu_map_sg_atomic(struct iommu_domain *domain,
  unsigned long iova, struct scatterlist *sg,
  unsigned int nents, int prot)
 {
-   return 0;
+   return -ENODEV;
 }
 
 static inline void iommu_flush_iotlb_all(struct iommu_domain *domain)
-- 
2.20.1



[PATCH v3 20/21] dma-mapping: return error code from dma_dummy_map_sg()

2021-07-29 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

The only errno to return is -EINVAL in the case when DMA is not
supported.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
---
 kernel/dma/dummy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index eacd4c5b10bf..b492d59ac77e 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -22,7 +22,7 @@ static int dma_dummy_map_sg(struct device *dev, struct 
scatterlist *sgl,
int nelems, enum dma_data_direction dir,
unsigned long attrs)
 {
-   return 0;
+   return -EINVAL;
 }
 
 static int dma_dummy_supported(struct device *hwdev, u64 mask)
-- 
2.20.1



[PATCH v2 05/21] alpha: return error code from alpha_pci_map_sg()

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

pci_map_single_1() can fail for different reasons, but since the only
supported type of error return is DMA_MAPPING_ERROR, we coalesce those
errors into -EIO.

-ENOMEM is returned when no page tables can be allocated.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Richard Henderson 
Cc: Ivan Kokshaysky 
Cc: Matt Turner 
---
 arch/alpha/kernel/pci_iommu.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index 35d7b3096d6e..21f9ac101324 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -649,7 +649,9 @@ static int alpha_pci_map_sg(struct device *dev, struct 
scatterlist *sg,
sg->dma_address
  = pci_map_single_1(pdev, SG_ENT_VIRT_ADDRESS(sg),
 sg->length, dac_allowed);
-   return sg->dma_address != DMA_MAPPING_ERROR;
+   if (sg->dma_address == DMA_MAPPING_ERROR)
+   return -EIO;
+   return 1;
}
 
start = sg;
@@ -685,8 +687,10 @@ static int alpha_pci_map_sg(struct device *dev, struct 
scatterlist *sg,
if (out < end)
out->dma_length = 0;
 
-   if (out - start == 0)
+   if (out - start == 0) {
printk(KERN_WARNING "pci_map_sg failed: no entries?\n");
+   return -ENOMEM;
+   }
DBGA("pci_map_sg: %ld entries\n", out - start);
 
return out - start;
@@ -699,7 +703,7 @@ static int alpha_pci_map_sg(struct device *dev, struct 
scatterlist *sg,
   entries.  Unmap them now.  */
if (out > start)
pci_unmap_sg(pdev, start, out - start, dir);
-   return 0;
+   return -ENOMEM;
 }
 
 /* Unmap a set of streaming mode DMA translations.  Again, cpu read
-- 
2.20.1



[PATCH v2 07/21] ARM/dma-mapping: don't set failed sg dma_address to DMA_MAPPING_ERROR

2021-07-23 Thread Logan Gunthorpe
Setting the ->dma_address to DMA_MAPPING_ERROR is not part of the
->map_sg calling convention, so remove it.

Link: https://lore.kernel.org/linux-mips/20210716063241.gc13...@lst.de/
Suggested-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
Cc: Russell King 
Cc: Thomas Bogendoerfer 
---
 arch/arm/mm/dma-mapping.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 113b9cb3701b..4b61541853ea 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -1632,7 +1632,6 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
for (i = 1; i < nents; i++) {
s = sg_next(s);
 
-   s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
 
		if (s->offset || (size & ~PAGE_MASK) || size + s->length > max) {
-- 
2.20.1



[PATCH v2 06/21] ARM/dma-mapping: return error code from .map_sg() ops

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.
In the case of a DMA_MAPPING_ERROR, -EIO is returned. Otherwise,
-ENOMEM or -EINVAL is returned depending on the error from
__map_sg_chunk().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Russell King 
Cc: Thomas Bogendoerfer 
---
 arch/arm/mm/dma-mapping.c | 25 -
 1 file changed, 16 insertions(+), 9 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index c4b8df2ad328..113b9cb3701b 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -980,7 +980,7 @@ int arm_dma_map_sg(struct device *dev, struct scatterlist 
*sg, int nents,
 {
const struct dma_map_ops *ops = get_dma_ops(dev);
struct scatterlist *s;
-   int i, j;
+   int i, j, ret;
 
for_each_sg(sg, s, nents, i) {
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
@@ -988,15 +988,17 @@ int arm_dma_map_sg(struct device *dev, struct scatterlist 
*sg, int nents,
 #endif
s->dma_address = ops->map_page(dev, sg_page(s), s->offset,
s->length, dir, attrs);
-   if (dma_mapping_error(dev, s->dma_address))
+   if (dma_mapping_error(dev, s->dma_address)) {
+   ret = -EIO;
goto bad_mapping;
+   }
}
return nents;
 
  bad_mapping:
for_each_sg(sg, s, i, j)
ops->unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir, 
attrs);
-   return 0;
+   return ret;
 }
 
 /**
@@ -1622,7 +1624,7 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
 bool is_coherent)
 {
struct scatterlist *s = sg, *dma = sg, *start = sg;
-   int i, count = 0;
+   int i, count = 0, ret;
unsigned int offset = s->offset;
unsigned int size = s->offset + s->length;
unsigned int max = dma_get_max_seg_size(dev);
@@ -1634,8 +1636,10 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
s->dma_length = 0;
 
		if (s->offset || (size & ~PAGE_MASK) || size + s->length > max) {
-   if (__map_sg_chunk(dev, start, size, &dma->dma_address,
-   dir, attrs, is_coherent) < 0)
+   ret = __map_sg_chunk(dev, start, size,
+    &dma->dma_address, dir, attrs,
+    is_coherent);
+   if (ret < 0)
goto bad_mapping;
 
dma->dma_address += offset;
@@ -1648,8 +1652,9 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
}
size += s->length;
}
-   if (__map_sg_chunk(dev, start, size, &dma->dma_address, dir, attrs,
-   is_coherent) < 0)
+   ret = __map_sg_chunk(dev, start, size, &dma->dma_address, dir, attrs,
+is_coherent);
+   if (ret < 0)
goto bad_mapping;
 
dma->dma_address += offset;
@@ -1660,7 +1665,9 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
 bad_mapping:
for_each_sg(sg, s, count, i)
__iommu_remove_mapping(dev, sg_dma_address(s), sg_dma_len(s));
-   return 0;
+   if (ret == -ENOMEM)
+   return ret;
+   return -EINVAL;
 }
 
 /**
-- 
2.20.1



[PATCH v2 08/21] ia64/sba_iommu: return error code from sba_map_sg_attrs()

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

In the case of a dma_mapping_error(), return -EIO, as the actual cause
is opaque here.

sba_coalesce_chunks() may presently only fail if sba_alloc_range()
fails, which in turn only fails if the iommu is out of mapping
resources, hence -ENOMEM is returned in that case.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Michael Ellerman 
Cc: Niklas Schnelle 
Cc: Thomas Bogendoerfer 
---
 arch/ia64/hp/common/sba_iommu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 9148ddbf02e5..430c166b68cd 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1458,8 +1458,8 @@ static int sba_map_sg_attrs(struct device *dev, struct 
scatterlist *sglist,
sglist->dma_length = sglist->length;
sglist->dma_address = sba_map_page(dev, sg_page(sglist),
sglist->offset, sglist->length, dir, attrs);
-   if (dma_mapping_error(dev, sglist->dma_address))
-   return 0;
+   if (dma_mapping_error(dev, sglist->dma_address))
+   return -EIO;
return 1;
}
 
@@ -1486,7 +1486,7 @@ static int sba_map_sg_attrs(struct device *dev, struct 
scatterlist *sglist,
coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
if (coalesced < 0) {
sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs);
-   return 0;
+   return -ENOMEM;
}
 
/*
-- 
2.20.1



[PATCH v2 11/21] powerpc/iommu: don't set failed sg dma_address to DMA_MAPPING_ERROR

2021-07-23 Thread Logan Gunthorpe
Setting the ->dma_address to DMA_MAPPING_ERROR is not part of
the ->map_sg calling convention, so remove it.

Link: https://lore.kernel.org/linux-mips/20210716063241.gc13...@lst.de/
Suggested-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Geoff Levand 
---
 arch/powerpc/kernel/iommu.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index a8ec4fe42817..30b7736f0896 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -575,7 +575,6 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table 
*tbl,
 */
if (outcount < incount) {
outs = sg_next(outs);
-   outs->dma_address = DMA_MAPPING_ERROR;
outs->dma_length = 0;
}
 
@@ -593,7 +592,6 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table 
*tbl,
npages = iommu_num_pages(s->dma_address, s->dma_length,
 IOMMU_PAGE_SIZE(tbl));
__iommu_free(tbl, vaddr, npages);
-   s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
}
if (s == outs)
-- 
2.20.1



[PATCH v2 10/21] powerpc/iommu: return error code from .map_sg() ops

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

Propagate the error up if vio_dma_iommu_map_sg() fails.

ppc_iommu_map_sg() may fail either because of iommu_range_alloc() or
because of tbl->it_ops->set(). The former only supports returning an
error with DMA_MAPPING_ERROR and an examination of the latter indicates
that it may return arch-specific errors (for example,
tce_buildmulti_pSeriesLP()). Hence, coalesce all of those errors into
-EIO, per the documentation on dma_map_sgtable().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Geoff Levand 
---
 arch/powerpc/kernel/iommu.c | 4 ++--
 arch/powerpc/platforms/ps3/system-bus.c | 2 +-
 arch/powerpc/platforms/pseries/vio.c| 5 +++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2af89a5e379f..a8ec4fe42817 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -473,7 +473,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table 
*tbl,
BUG_ON(direction == DMA_NONE);
 
if ((nelems == 0) || !tbl)
-   return 0;
+   return -EINVAL;
 
outs = s = segstart = &sglist[0];
outcount = 1;
@@ -599,7 +599,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table 
*tbl,
if (s == outs)
break;
}
-   return 0;
+   return -EIO;
 }
 
 
diff --git a/arch/powerpc/platforms/ps3/system-bus.c 
b/arch/powerpc/platforms/ps3/system-bus.c
index 1a5665875165..c54eb46f0cfb 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -663,7 +663,7 @@ static int ps3_ioc0_map_sg(struct device *_dev, struct 
scatterlist *sg,
   unsigned long attrs)
 {
BUG();
-   return 0;
+   return -EINVAL;
 }
 
 static void ps3_sb_unmap_sg(struct device *_dev, struct scatterlist *sg,
diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index e00f3725ec96..e31e59c54f30 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -560,7 +560,8 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct 
scatterlist *sglist,
for_each_sg(sglist, sgl, nelems, count)
alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
 
-   if (vio_cmo_alloc(viodev, alloc_size))
+   ret = vio_cmo_alloc(viodev, alloc_size);
+   if (ret)
goto out_fail;
ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, dma_get_mask(dev),
direction, attrs);
@@ -577,7 +578,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct 
scatterlist *sglist,
vio_cmo_dealloc(viodev, alloc_size);
 out_fail:
atomic_inc(&viodev->cmo.allocs_failed);
-   return 0;
+   return ret;
 }
 
 static void vio_dma_iommu_unmap_sg(struct device *dev,
-- 
2.20.1



[PATCH v2 09/21] MIPS/jazzdma: return error code from jazz_dma_map_sg()

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

vdma_alloc() may fail for different reasons, but since it only supports
indicating an error via a return of DMA_MAPPING_ERROR, we coalesce the
different reasons into -EIO as is documented on dma_map_sgtable().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Thomas Bogendoerfer 
---
 arch/mips/jazz/jazzdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c
index 461457b28982..eabddb89d221 100644
--- a/arch/mips/jazz/jazzdma.c
+++ b/arch/mips/jazz/jazzdma.c
@@ -552,7 +552,7 @@ static int jazz_dma_map_sg(struct device *dev, struct 
scatterlist *sglist,
dir);
sg->dma_address = vdma_alloc(sg_phys(sg), sg->length);
if (sg->dma_address == DMA_MAPPING_ERROR)
-   return 0;
+   return -EIO;
sg_dma_len(sg) = sg->length;
}
 
-- 
2.20.1



[PATCH v2 03/21] iommu: Return full error code from iommu_map_sg[_atomic]()

2021-07-23 Thread Logan Gunthorpe
Convert to ssize_t return code so the return code from __iommu_map()
can be returned all the way down through dma_iommu_map_sg().

Signed-off-by: Logan Gunthorpe 
Cc: Joerg Roedel 
Cc: Will Deacon 
---
 drivers/iommu/iommu.c | 15 +++
 include/linux/iommu.h | 22 +++---
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5419c4b9f27a..bf971b4e34aa 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2567,9 +2567,9 @@ size_t iommu_unmap_fast(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_unmap_fast);
 
-static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
-struct scatterlist *sg, unsigned int nents, int 
prot,
-gfp_t gfp)
+static ssize_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+   struct scatterlist *sg, unsigned int nents, int prot,
+   gfp_t gfp)
 {
const struct iommu_ops *ops = domain->ops;
size_t len = 0, mapped = 0;
@@ -2610,19 +2610,18 @@ static size_t __iommu_map_sg(struct iommu_domain 
*domain, unsigned long iova,
/* undo mappings already done */
iommu_unmap(domain, iova, mapped);
 
-   return 0;
-
+   return ret;
 }
 
-size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
-   struct scatterlist *sg, unsigned int nents, int prot)
+ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+struct scatterlist *sg, unsigned int nents, int prot)
 {
might_sleep();
return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_KERNEL);
 }
 EXPORT_SYMBOL_GPL(iommu_map_sg);
 
-size_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova,
+ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova,
struct scatterlist *sg, unsigned int nents, int prot)
 {
return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 32d448050bf7..9369458ba1bd 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -414,11 +414,11 @@ extern size_t iommu_unmap(struct iommu_domain *domain, 
unsigned long iova,
 extern size_t iommu_unmap_fast(struct iommu_domain *domain,
   unsigned long iova, size_t size,
   struct iommu_iotlb_gather *iotlb_gather);
-extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
-  struct scatterlist *sg,unsigned int nents, int prot);
-extern size_t iommu_map_sg_atomic(struct iommu_domain *domain,
- unsigned long iova, struct scatterlist *sg,
- unsigned int nents, int prot);
+extern ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+   struct scatterlist *sg, unsigned int nents, int prot);
+extern ssize_t iommu_map_sg_atomic(struct iommu_domain *domain,
+  unsigned long iova, struct scatterlist *sg,
+  unsigned int nents, int prot);
 extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t 
iova);
 extern void iommu_set_fault_handler(struct iommu_domain *domain,
iommu_fault_handler_t handler, void *token);
@@ -679,18 +679,18 @@ static inline size_t iommu_unmap_fast(struct iommu_domain 
*domain,
return 0;
 }
 
-static inline size_t iommu_map_sg(struct iommu_domain *domain,
- unsigned long iova, struct scatterlist *sg,
- unsigned int nents, int prot)
+static inline ssize_t iommu_map_sg(struct iommu_domain *domain,
+  unsigned long iova, struct scatterlist *sg,
+  unsigned int nents, int prot)
 {
-   return 0;
+   return -ENODEV;
 }
 
-static inline size_t iommu_map_sg_atomic(struct iommu_domain *domain,
+static inline ssize_t iommu_map_sg_atomic(struct iommu_domain *domain,
  unsigned long iova, struct scatterlist *sg,
  unsigned int nents, int prot)
 {
-   return 0;
+   return -ENODEV;
 }
 
 static inline void iommu_flush_iotlb_all(struct iommu_domain *domain)
-- 
2.20.1



[PATCH v2 04/21] dma-iommu: Return error code from iommu_dma_map_sg()

2021-07-23 Thread Logan Gunthorpe
Return an appropriate error code, -EINVAL or -ENOMEM, from
iommu_dma_map_sg(). If the lower-level code returns -ENOMEM, we pass it
through; other errors are coalesced into -EINVAL.

iommu_dma_map_sg_swiotlb() returns -EIO, as it's an unknown error
from a call that returns DMA_MAPPING_ERROR.

Signed-off-by: Logan Gunthorpe 
Cc: Joerg Roedel 
Cc: Will Deacon 
---
 drivers/iommu/dma-iommu.c | 23 ---
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 98ba927aee1a..d9aaed080e68 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -972,7 +972,7 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, 
struct scatterlist *sg,
 
 out_unmap:
iommu_dma_unmap_sg_swiotlb(dev, sg, i, dir, attrs | 
DMA_ATTR_SKIP_CPU_SYNC);
-   return 0;
+   return -EIO;
 }
 
 /*
@@ -993,11 +993,14 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
dma_addr_t iova;
size_t iova_len = 0;
unsigned long mask = dma_get_seg_boundary(dev);
+   ssize_t ret;
int i;
 
-   if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
-   iommu_deferred_attach(dev, domain))
-   return 0;
+   if (static_branch_unlikely(&iommu_deferred_attach_enabled)) {
+   ret = iommu_deferred_attach(dev, domain);
+   if (ret)
+   goto out;
+   }
 
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
iommu_dma_sync_sg_for_device(dev, sg, nents, dir);
@@ -1045,14 +1047,17 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
}
 
iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
-   if (!iova)
+   if (!iova) {
+   ret = -ENOMEM;
goto out_restore_sg;
+   }
 
/*
 * We'll leave any physical concatenation to the IOMMU driver's
 * implementation - it knows better than we do.
 */
-   if (iommu_map_sg_atomic(domain, iova, sg, nents, prot) < iova_len)
+   ret = iommu_map_sg_atomic(domain, iova, sg, nents, prot);
+   if (ret < 0 || ret < iova_len)
goto out_free_iova;
 
return __finalise_sg(dev, sg, nents, iova);
@@ -1061,7 +1066,11 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
iommu_dma_free_iova(cookie, iova, iova_len, NULL);
 out_restore_sg:
__invalidate_sg(sg, nents);
-   return 0;
+out:
+   if (ret == -ENOMEM)
+   return ret;
+   else
+   return -EINVAL;
 }
 
 static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-- 
2.20.1



[PATCH v2 12/21] s390/pci: return error code from s390_dma_map_sg()

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

So propagate the error from __s390_dma_map_sg() up. __s390_dma_map_sg()
returns either -ENOMEM on allocation failure or -EINVAL, both of which
are expected by dma_map_sgtable().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Acked-by: Niklas Schnelle 
Cc: Gerald Schaefer 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
---
 arch/s390/pci/pci_dma.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index ebc9a49523aa..c78b02012764 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -487,7 +487,7 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
unsigned int max = dma_get_max_seg_size(dev);
unsigned int size = s->offset + s->length;
unsigned int offset = s->offset;
-   int count = 0, i;
+   int count = 0, i, ret;
 
for (i = 1; i < nr_elements; i++) {
s = sg_next(s);
@@ -497,8 +497,9 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
 
if (s->offset || (size & ~PAGE_MASK) ||
size + s->length > max) {
-   if (__s390_dma_map_sg(dev, start, size,
- &dma->dma_address, dir))
+   ret = __s390_dma_map_sg(dev, start, size,
+   &dma->dma_address, dir);
+   if (ret)
goto unmap;
 
dma->dma_address += offset;
@@ -511,7 +512,8 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
}
size += s->length;
}
-   if (__s390_dma_map_sg(dev, start, size, &dma->dma_address, dir))
+   ret = __s390_dma_map_sg(dev, start, size, &dma->dma_address, dir);
+   if (ret)
goto unmap;
 
dma->dma_address += offset;
@@ -523,7 +525,7 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s),
 dir, attrs);
 
-   return 0;
+   return ret;
 }
 
 static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-- 
2.20.1



[PATCH v2 13/21] s390/pci: don't set failed sg dma_address to DMA_MAPPING_ERROR

2021-07-23 Thread Logan Gunthorpe
Setting the ->dma_address to DMA_MAPPING_ERROR is not part of
the ->map_sg calling convention, so remove it.

Link: https://lore.kernel.org/linux-mips/20210716063241.gc13...@lst.de/
Suggested-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
Cc: Niklas Schnelle 
Cc: Gerald Schaefer 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
---
 arch/s390/pci/pci_dma.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index c78b02012764..be48e5b5bfcf 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -492,7 +492,6 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
for (i = 1; i < nr_elements; i++) {
s = sg_next(s);
 
-   s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
 
if (s->offset || (size & ~PAGE_MASK) ||
-- 
2.20.1



[PATCH v2 15/21] sparc/iommu: don't set failed sg dma_address to DMA_MAPPING_ERROR

2021-07-23 Thread Logan Gunthorpe
Setting the ->dma_address to DMA_MAPPING_ERROR is not part of
the ->map_sg calling convention, so remove it.

Link: https://lore.kernel.org/linux-mips/20210716063241.gc13...@lst.de/
Suggested-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
Cc: "David S. Miller" 
Cc: Niklas Schnelle 
Cc: Michael Ellerman 
---
 arch/sparc/kernel/iommu.c | 2 --
 arch/sparc/kernel/pci_sun4v.c | 2 --
 2 files changed, 4 deletions(-)

diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index 0589acd34201..da0363692528 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -546,7 +546,6 @@ static int dma_4u_map_sg(struct device *dev, struct 
scatterlist *sglist,
 
if (outcount < incount) {
outs = sg_next(outs);
-   outs->dma_address = DMA_MAPPING_ERROR;
outs->dma_length = 0;
}
 
@@ -572,7 +571,6 @@ static int dma_4u_map_sg(struct device *dev, struct 
scatterlist *sglist,
iommu_tbl_range_free(&iommu->tbl, vaddr, npages,
 IOMMU_ERROR_CODE);
 
-   s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
}
if (s == outs)
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index d90e80fa5705..384480971805 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -594,7 +594,6 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist,
 
if (outcount < incount) {
outs = sg_next(outs);
-   outs->dma_address = DMA_MAPPING_ERROR;
outs->dma_length = 0;
}
 
@@ -611,7 +610,6 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist,
iommu_tbl_range_free(tbl, vaddr, npages,
 IOMMU_ERROR_CODE);
/* XXX demap? XXX */
-   s->dma_address = DMA_MAPPING_ERROR;
s->dma_length = 0;
}
if (s == outs)
-- 
2.20.1



[PATCH v2 14/21] sparc/iommu: return error codes from .map_sg() ops

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

Returning an errno from __sbus_iommu_map_sg() results in
sbus_iommu_map_sg_gflush() and sbus_iommu_map_sg_pflush() returning an
errno, as those functions are wrappers around __sbus_iommu_map_sg().
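
For reference, the wrappers are thin; a condensed sketch of the sparc32
source (abbreviated here for illustration):

	static int sbus_iommu_map_sg_gflush(struct device *dev,
			struct scatterlist *sgl, int nents,
			enum dma_data_direction dir, unsigned long attrs)
	{
		flush_page_for_dma(0);
		return __sbus_iommu_map_sg(dev, sgl, nents, dir, attrs, false);
	}

so whatever errno the helper returns is returned verbatim by the op.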

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: "David S. Miller" 
Cc: Niklas Schnelle 
Cc: Michael Ellerman 
---
 arch/sparc/kernel/iommu.c | 4 ++--
 arch/sparc/kernel/pci_sun4v.c | 4 ++--
 arch/sparc/mm/iommu.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index a034f571d869..0589acd34201 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -448,7 +448,7 @@ static int dma_4u_map_sg(struct device *dev, struct 
scatterlist *sglist,
iommu = dev->archdata.iommu;
strbuf = dev->archdata.stc;
if (nelems == 0 || !iommu)
-   return 0;
+   return -EINVAL;
 
spin_lock_irqsave(&iommu->lock, flags);
 
@@ -580,7 +580,7 @@ static int dma_4u_map_sg(struct device *dev, struct 
scatterlist *sglist,
}
spin_unlock_irqrestore(&iommu->lock, flags);
 
-   return 0;
+   return -EINVAL;
 }
 
 /* If contexts are being used, they are the same in all of the mappings
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 9de57e88f7a1..d90e80fa5705 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -486,7 +486,7 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist,
 
iommu = dev->archdata.iommu;
if (nelems == 0 || !iommu)
-   return 0;
+   return -EINVAL;
atu = iommu->atu;
 
prot = HV_PCI_MAP_ATTR_READ;
@@ -619,7 +619,7 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist,
}
local_irq_restore(flags);
 
-   return 0;
+   return -EINVAL;
 }
 
 static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c
index 0c0342e5b10d..9e3f6933ca13 100644
--- a/arch/sparc/mm/iommu.c
+++ b/arch/sparc/mm/iommu.c
@@ -256,7 +256,7 @@ static int __sbus_iommu_map_sg(struct device *dev, struct 
scatterlist *sgl,
sg->dma_address =__sbus_iommu_map_page(dev, sg_page(sg),
sg->offset, sg->length, per_page_flush);
if (sg->dma_address == DMA_MAPPING_ERROR)
-   return 0;
+   return -EIO;
sg->dma_length = sg->length;
}
 
-- 
2.20.1



[PATCH v2 16/21] parisc: return error code from .map_sg() ops

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.
Return -EINVAL if the ioc cannot be obtained.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: "James E.J. Bottomley" 
Cc: Helge Deller 
---
 drivers/parisc/ccio-dma.c  | 2 +-
 drivers/parisc/sba_iommu.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index b5f9ee81a46c..452e72b7bd01 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -918,7 +918,7 @@ ccio_map_sg(struct device *dev, struct scatterlist *sglist, 
int nents,
BUG_ON(!dev);
ioc = GET_IOC(dev);
if (!ioc)
-   return 0;
+   return -EINVAL;

DBG_RUN_SG("%s() START %d entries\n", __func__, nents);
 
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index dce4cdf786cd..e60690d38d67 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -947,7 +947,7 @@ sba_map_sg(struct device *dev, struct scatterlist *sglist, 
int nents,
 
ioc = GET_IOC(dev);
if (!ioc)
-   return 0;
+   return -EINVAL;
 
/* Fast path single entry scatterlists. */
if (nents == 1) {
-- 
2.20.1



[PATCH v2 18/21] x86/amd_gart: return error code from gart_map_sg()

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

So make __dma_map_cont() return a valid errno (which is then propagated
to gart_map_sg() via dma_map_cont()) and return it in case of failure.

Also, return -EINVAL in case of invalid nents.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Niklas Schnelle 
Cc: Thomas Bogendoerfer 
Cc: Michael Ellerman 
---
 arch/x86/kernel/amd_gart_64.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 9ac696487b13..46aea9a4f26b 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -331,7 +331,7 @@ static int __dma_map_cont(struct device *dev, struct 
scatterlist *start,
int i;
 
if (iommu_start == -1)
-   return -1;
+   return -ENOMEM;
 
for_each_sg(start, s, nelems, i) {
unsigned long pages, addr;
@@ -380,13 +380,13 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
   enum dma_data_direction dir, unsigned long attrs)
 {
struct scatterlist *s, *ps, *start_sg, *sgmap;
-   int need = 0, nextneed, i, out, start;
+   int need = 0, nextneed, i, out, start, ret;
unsigned long pages = 0;
unsigned int seg_size;
unsigned int max_seg_size;
 
if (nents == 0)
-   return 0;
+   return -EINVAL;
 
out = 0;
start   = 0;
@@ -414,8 +414,9 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
if (!iommu_merge || !nextneed || !need || s->offset ||
(s->length + seg_size > max_seg_size) ||
(ps->offset + ps->length) % PAGE_SIZE) {
-   if (dma_map_cont(dev, start_sg, i - start,
-sgmap, pages, need) < 0)
+   ret = dma_map_cont(dev, start_sg, i - start,
+  sgmap, pages, need);
+   if (ret < 0)
goto error;
out++;
 
@@ -432,7 +433,8 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
ps = s;
}
-   if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
+   ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need);
+   if (ret < 0)
goto error;
out++;
flush_gart();
@@ -458,7 +460,7 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
iommu_full(dev, pages << PAGE_SHIFT, dir);
for_each_sg(sg, s, nents, i)
s->dma_address = DMA_MAPPING_ERROR;
-   return 0;
+   return ret;
 }
 
 /* allocate and map a coherent mapping */
-- 
2.20.1



[PATCH v2 19/21] x86/amd_gart: don't set failed sg dma_address to DMA_MAPPING_ERROR

2021-07-23 Thread Logan Gunthorpe
Setting the ->dma_address to DMA_MAPPING_ERROR is not part of
the ->map_sg calling convention, so remove it.

Link: https://lore.kernel.org/linux-mips/20210716063241.gc13...@lst.de/
Suggested-by: Christoph Hellwig 
Signed-off-by: Logan Gunthorpe 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Niklas Schnelle 
Cc: Thomas Bogendoerfer 
Cc: Michael Ellerman 
---
 arch/x86/kernel/amd_gart_64.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 46aea9a4f26b..ed837383de5c 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -458,8 +458,6 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
panic("dma_map_sg: overflow on %lu pages\n", pages);
 
iommu_full(dev, pages << PAGE_SHIFT, dir);
-   for_each_sg(sg, s, nents, i)
-   s->dma_address = DMA_MAPPING_ERROR;
return ret;
 }
 
-- 
2.20.1



[PATCH v2 17/21] xen: swiotlb: return error code from xen_swiotlb_map_sg()

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

xen_swiotlb_map_sg() may only fail if xen_swiotlb_map_page() fails, but
xen_swiotlb_map_page() only supports returning errors as
DMA_MAPPING_ERROR. So coalesce all errors into EIO per the documentation
for dma_map_sgtable().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Reviewed-by: Boris Ostrovsky 
Cc: Konrad Rzeszutek Wilk 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
---
 drivers/xen/swiotlb-xen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 24d11861ac7d..85d58b720a24 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -509,7 +509,7 @@ xen_swiotlb_map_sg(struct device *dev, struct scatterlist 
*sgl, int nelems,
 out_unmap:
xen_swiotlb_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
sg_dma_len(sgl) = 0;
-   return 0;
+   return -EIO;
 }
 
 static void
-- 
2.20.1



[PATCH v2 01/21] dma-mapping: Allow map_sg() ops to return negative error codes

2021-07-23 Thread Logan Gunthorpe
Allow dma_map_sgtable() to pass errors from the map_sg() ops. This
will be required for returning appropriate error codes when mapping
P2PDMA memory.

Introduce __dma_map_sg_attrs() which will return the raw error code
from the map_sg operation (whether it be negative or zero). Then add a
dma_map_sg_attrs() wrapper to convert any negative errors to zero to
satisfy the existing calling convention.

dma_map_sgtable() defines three error codes that .map_sg implementations
are allowed to return: -EINVAL, -ENOMEM and -EIO. The latter of which
is a generic return for cases that are passing DMA_MAPPING_ERROR
through.

dma_map_sgtable() will convert a zero error return for old map_sg() ops
into a -EIO return and return any negative errors as reported.

This allows map_sg implementations to start returning multiple
negative error codes. Legacy map_sg implementations can continue
to return zero until they are all converted.
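
For illustration only: with this convention a dma_map_sgtable() caller
can distinguish a resource shortage from a bad request
(retry_with_smaller_buffer() is a hypothetical helper, not part of this
series):

	ret = dma_map_sgtable(dev, sgt, DMA_TO_DEVICE, 0);
	if (ret == -ENOMEM)
		return retry_with_smaller_buffer(dev, sgt); /* hypothetical */
	else if (ret)
		return ret;	/* -EINVAL or -EIO: nothing to retry */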

Signed-off-by: Logan Gunthorpe 
---
 include/linux/dma-map-ops.h |  5 ++-
 include/linux/dma-mapping.h | 35 +++
 kernel/dma/mapping.c| 85 +
 3 files changed, 87 insertions(+), 38 deletions(-)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 0d53a96a3d64..2f842498c448 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -41,8 +41,9 @@ struct dma_map_ops {
size_t size, enum dma_data_direction dir,
unsigned long attrs);
/*
-* map_sg returns 0 on error and a value > 0 on success.
-* It should never return a value < 0.
+* map_sg should return a negative error code on error. See
+* dma_map_sgtable() for a list of appropriate error codes
+* and their meanings.
 */
int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, unsigned long attrs);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 183e7103a66d..daa1e360f0ee 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -110,6 +110,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist 
*sg, int nents,
 void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
  int nents, enum dma_data_direction dir,
  unsigned long attrs);
+int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
+   enum dma_data_direction dir, unsigned long attrs);
 dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs);
 void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
@@ -174,6 +176,11 @@ static inline void dma_unmap_sg_attrs(struct device *dev,
unsigned long attrs)
 {
 }
+static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
+   enum dma_data_direction dir, unsigned long attrs)
+{
+   return -EOPNOTSUPP;
+}
 static inline dma_addr_t dma_map_resource(struct device *dev,
phys_addr_t phys_addr, size_t size, enum dma_data_direction dir,
unsigned long attrs)
@@ -343,34 +350,6 @@ static inline void dma_sync_single_range_for_device(struct 
device *dev,
return dma_sync_single_for_device(dev, addr + offset, size, dir);
 }
 
-/**
- * dma_map_sgtable - Map the given buffer for DMA
- * @dev:   The device for which to perform the DMA operation
- * @sgt:   The sg_table object describing the buffer
- * @dir:   DMA direction
- * @attrs: Optional DMA attributes for the map operation
- *
- * Maps a buffer described by a scatterlist stored in the given sg_table
- * object for the @dir DMA operation by the @dev device. After success the
- * ownership for the buffer is transferred to the DMA domain.  One has to
- * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the
- * ownership of the buffer back to the CPU domain before touching the
- * buffer by the CPU.
- *
- * Returns 0 on success or -EINVAL on error during mapping the buffer.
- */
-static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
-   enum dma_data_direction dir, unsigned long attrs)
-{
-   int nents;
-
-   nents = dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
-   if (nents <= 0)
-   return -EINVAL;
-   sgt->nents = nents;
-   return 0;
-}
-
 /**
  * dma_unmap_sgtable - Unmap the given buffer for DMA
  * @dev:   The device for which to perform the DMA operation
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 2b06a809d0b9..b8dc8b1cb402 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -177,12 +177,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t 
addr, size_t size,
 }
 EXPORT_SYMBOL(dma_unmap_page_attrs);
 
-/*
- *

[PATCH v2 21/21] dma-mapping: Disallow .map_sg operations from returning zero on error

2021-07-23 Thread Logan Gunthorpe
Now that all the .map_sg operations have been converted to returning
proper error codes, drop the code to handle a zero return value,
add a warning if a zero is returned and update the comment for the
map_sg operation.

Signed-off-by: Logan Gunthorpe 
---
 kernel/dma/mapping.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index b8dc8b1cb402..86a8a421344a 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -194,6 +194,9 @@ static int __dma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg,
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
 
+   if (WARN_ON_ONCE(ents == 0))
+   return -EIO;
+
if (ents > 0)
debug_dma_map_sg(dev, sg, nents, ents, dir);
 
@@ -259,9 +262,7 @@ int dma_map_sgtable(struct device *dev, struct sg_table 
*sgt,
int nents;
 
nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
-   if (nents == 0)
-   return -EIO;
-   else if (nents < 0) {
+   if (nents < 0) {
if (WARN_ON_ONCE(nents != -EINVAL && nents != -ENOMEM &&
 nents != -EIO))
return -EIO;
-- 
2.20.1



[PATCH v2 02/21] dma-direct: Return appropriate error code from dma_direct_map_sg()

2021-07-23 Thread Logan Gunthorpe
Now that the map_sg() op expects error codes instead of return zero on
error, convert dma_direct_map_sg() to return an error code. Per the
documentation for dma_map_sgtable(), -EIO is returned due to an
DMA_MAPPING_ERROR with unknown cause.

Signed-off-by: Logan Gunthorpe 
---
 kernel/dma/direct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f737e3347059..f33ceb68aef2 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -411,7 +411,7 @@ int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl, int nents,
 
 out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
-   return 0;
+   return -EIO;
 }
 
 dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
-- 
2.20.1



[PATCH v2 00/21] .map_sg() error cleanup

2021-07-23 Thread Logan Gunthorpe
Hi,

This v2 of the series is spun out and expanded from my work to add
P2PDMA support to DMA map operations[1]. v1 is at [2]. The main changes
in v2 are to more carefully define the meaning of the error codes for
dma_map_sgtable().

The P2PDMA work requires distinguishing different error conditions in
a map_sg operation. dma_map_sgtable() already allows for returning an
error code (whereas dma_map_sg() is only allowed to return zero);
however, it currently only returns -EINVAL when a .map_sg() call returns
zero.

This series cleans up all .map_sg() implementations to return appropriate
error codes. After the cleanup, dma_map_sg() will still return zero on
error; dma_map_sgtable(), however, will pass the error code from the
.map_sg() call through. Thanks go to Martin Oliveira for doing a lot of
the cleanup of the obscure implementations.
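
For illustration, the two calling conventions after this series look
like:

	int n = dma_map_sg(dev, sgl, nents, dir);
	/* n > 0 on success; n == 0 on any failure (unchanged) */

	int ret = dma_map_sgtable(dev, sgt, dir, 0);
	/* ret == 0 on success; -EINVAL, -ENOMEM or -EIO on failure */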

The patch set is based off of v5.14-rc2 and a git repo can be found
here:

  https://github.com/sbates130272/linux-p2pmem map_sg_err_cleanup_v2

Thanks,

Logan

[1] 
https://lore.kernel.org/linux-block/20210513223203.5542-1-log...@deltatee.com/
[2] 
https://lore.kernel.org/linux-mips/20210715164544.6827-1-log...@deltatee.com/

--

Changes in v2:
  - Attempt to define the meanings of the errors returned by
dma_map_sgtable() and restrict the valid return codes of
.map_sg implementations. (Per Christoph)
  - Change dma_map_sgtable() to EXPORT_SYMBOL_GPL() (Per Christoph)
  - Add patches to remove the erroneous setting of sg->dma_address
to DMA_MAPPING_ERROR in a few .map_sg() implementations. (Per
Christoph).

--

Logan Gunthorpe (10):
  dma-mapping: Allow map_sg() ops to return negative error codes
  dma-direct: Return appropriate error code from dma_direct_map_sg()
  iommu: Return full error code from iommu_map_sg[_atomic]()
  dma-iommu: Return error code from iommu_dma_map_sg()
  ARM/dma-mapping: don't set failed sg dma_address to DMA_MAPPING_ERROR
  powerpc/iommu: don't set failed sg dma_address to DMA_MAPPING_ERROR
  s390/pci: don't set failed sg dma_address to DMA_MAPPING_ERROR
  sparc/iommu: don't set failed sg dma_address to DMA_MAPPING_ERROR
  x86/amd_gart: don't set failed sg dma_address to DMA_MAPPING_ERROR
  dma-mapping: Disallow .map_sg operations from returning zero on error

Martin Oliveira (11):
  alpha: return error code from alpha_pci_map_sg()
  ARM/dma-mapping: return error code from .map_sg() ops
  ia64/sba_iommu: return error code from sba_map_sg_attrs()
  MIPS/jazzdma: return error code from jazz_dma_map_sg()
  powerpc/iommu: return error code from .map_sg() ops
  s390/pci: return error code from s390_dma_map_sg()
  sparc/iommu: return error codes from .map_sg() ops
  parisc: return error code from .map_sg() ops
  xen: swiotlb: return error code from xen_swiotlb_map_sg()
  x86/amd_gart: return error code from gart_map_sg()
  dma-mapping: return error code from dma_dummy_map_sg()

 arch/alpha/kernel/pci_iommu.c   | 10 ++-
 arch/arm/mm/dma-mapping.c   | 26 +---
 arch/ia64/hp/common/sba_iommu.c |  6 +-
 arch/mips/jazz/jazzdma.c|  2 +-
 arch/powerpc/kernel/iommu.c |  6 +-
 arch/powerpc/platforms/ps3/system-bus.c |  2 +-
 arch/powerpc/platforms/pseries/vio.c|  5 +-
 arch/s390/pci/pci_dma.c | 13 ++--
 arch/sparc/kernel/iommu.c   |  6 +-
 arch/sparc/kernel/pci_sun4v.c   |  6 +-
 arch/sparc/mm/iommu.c   |  2 +-
 arch/x86/kernel/amd_gart_64.c   | 18 +++---
 drivers/iommu/dma-iommu.c   | 23 +--
 drivers/iommu/iommu.c   | 15 ++---
 drivers/parisc/ccio-dma.c   |  2 +-
 drivers/parisc/sba_iommu.c  |  2 +-
 drivers/xen/swiotlb-xen.c   |  2 +-
 include/linux/dma-map-ops.h |  5 +-
 include/linux/dma-mapping.h | 35 ++
 include/linux/iommu.h   | 22 +++
 kernel/dma/direct.c |  2 +-
 kernel/dma/dummy.c  |  2 +-
 kernel/dma/mapping.c| 86 ++---
 23 files changed, 181 insertions(+), 117 deletions(-)


base-commit: 2734d6c1b1a089fb593ef6a23d4b70903526fe0c
--
2.20.1


[PATCH v2 20/21] dma-mapping: return error code from dma_dummy_map_sg()

2021-07-23 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

The only errno to return is -EINVAL in the case when DMA is not
supported.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
---
 kernel/dma/dummy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index eacd4c5b10bf..b492d59ac77e 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -22,7 +22,7 @@ static int dma_dummy_map_sg(struct device *dev, struct 
scatterlist *sgl,
int nelems, enum dma_data_direction dir,
unsigned long attrs)
 {
-   return 0;
+   return -EINVAL;
 }
 
 static int dma_dummy_supported(struct device *hwdev, u64 mask)
-- 
2.20.1



Re: [PATCH v1 16/16] dma-mapping: Disallow .map_sg operations from returning zero on error

2021-07-16 Thread Logan Gunthorpe



On 2021-07-16 12:33 a.m., Christoph Hellwig wrote:
> On Thu, Jul 15, 2021 at 10:45:44AM -0600, Logan Gunthorpe wrote:
>> @@ -194,6 +194,8 @@ static int __dma_map_sg_attrs(struct device *dev, struct 
>> scatterlist *sg,
>>  else
>>  ents = ops->map_sg(dev, sg, nents, dir, attrs);
>>  
>> +WARN_ON_ONCE(ents == 0);
> 
> Turns this into a negative error code while we're at it, just to keep
> the callers sane?
> 

Sure thing. All the feedback makes sense, we'll fix it up and send a v2
in due course.

Thanks,

Logan


[PATCH v1 09/16] powerpc/iommu: return error code from .map_sg() ops

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

Propagate the error up if vio_dma_iommu_map_sg() fails.

ppc_iommu_map_sg() may fail either because of iommu_range_alloc() or
because of tbl->it_ops->set(). The former only supports returning an
error with DMA_MAPPING_ERROR and an examination of the latter indicates
that it may return arch-specific errors (for example,
tce_buildmulti_pSeriesLP()). Hence, coalesce all of those errors into
-EINVAL.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Michael Ellerman 
Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Geoff Levand 
---
 arch/powerpc/kernel/iommu.c | 4 ++--
 arch/powerpc/platforms/ps3/system-bus.c | 2 +-
 arch/powerpc/platforms/pseries/vio.c| 5 +++--
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 2af89a5e379f..bd0ed618bfa5 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -473,7 +473,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table 
*tbl,
BUG_ON(direction == DMA_NONE);
 
if ((nelems == 0) || !tbl)
-   return 0;
+   return -EINVAL;
 
outs = s = segstart = &sglist[0];
outcount = 1;
@@ -599,7 +599,7 @@ int ppc_iommu_map_sg(struct device *dev, struct iommu_table 
*tbl,
if (s == outs)
break;
}
-   return 0;
+   return -EINVAL;
 }
 
 
diff --git a/arch/powerpc/platforms/ps3/system-bus.c 
b/arch/powerpc/platforms/ps3/system-bus.c
index 1a5665875165..c54eb46f0cfb 100644
--- a/arch/powerpc/platforms/ps3/system-bus.c
+++ b/arch/powerpc/platforms/ps3/system-bus.c
@@ -663,7 +663,7 @@ static int ps3_ioc0_map_sg(struct device *_dev, struct 
scatterlist *sg,
   unsigned long attrs)
 {
BUG();
-   return 0;
+   return -EINVAL;
 }
 
 static void ps3_sb_unmap_sg(struct device *_dev, struct scatterlist *sg,
diff --git a/arch/powerpc/platforms/pseries/vio.c 
b/arch/powerpc/platforms/pseries/vio.c
index e00f3725ec96..e31e59c54f30 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -560,7 +560,8 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct 
scatterlist *sglist,
for_each_sg(sglist, sgl, nelems, count)
alloc_size += roundup(sgl->length, IOMMU_PAGE_SIZE(tbl));
 
-   if (vio_cmo_alloc(viodev, alloc_size))
+   ret = vio_cmo_alloc(viodev, alloc_size);
+   if (ret)
goto out_fail;
ret = ppc_iommu_map_sg(dev, tbl, sglist, nelems, dma_get_mask(dev),
direction, attrs);
@@ -577,7 +578,7 @@ static int vio_dma_iommu_map_sg(struct device *dev, struct 
scatterlist *sglist,
vio_cmo_dealloc(viodev, alloc_size);
 out_fail:
atomic_inc(&viodev->cmo.allocs_failed);
-   return 0;
+   return ret;
 }
 
 static void vio_dma_iommu_unmap_sg(struct device *dev,
-- 
2.20.1



[PATCH v1 06/16] ARM/dma-mapping: return error code from .map_sg() ops

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure,
so propagate any errors that may happen all the way up.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Russell King 
Cc: Thomas Bogendoerfer 
---
 arch/arm/mm/dma-mapping.c | 22 +-
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index c4b8df2ad328..8c286e690756 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -980,7 +980,7 @@ int arm_dma_map_sg(struct device *dev, struct scatterlist 
*sg, int nents,
 {
const struct dma_map_ops *ops = get_dma_ops(dev);
struct scatterlist *s;
-   int i, j;
+   int i, j, ret;
 
for_each_sg(sg, s, nents, i) {
 #ifdef CONFIG_NEED_SG_DMA_LENGTH
@@ -988,7 +988,8 @@ int arm_dma_map_sg(struct device *dev, struct scatterlist 
*sg, int nents,
 #endif
s->dma_address = ops->map_page(dev, sg_page(s), s->offset,
s->length, dir, attrs);
-   if (dma_mapping_error(dev, s->dma_address))
+   ret = dma_mapping_error(dev, s->dma_address);
+   if (ret)
goto bad_mapping;
}
return nents;
@@ -996,7 +997,7 @@ int arm_dma_map_sg(struct device *dev, struct scatterlist 
*sg, int nents,
  bad_mapping:
for_each_sg(sg, s, i, j)
ops->unmap_page(dev, sg_dma_address(s), sg_dma_len(s), dir, 
attrs);
-   return 0;
+   return ret;
 }
 
 /**
@@ -1622,7 +1623,7 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
 bool is_coherent)
 {
struct scatterlist *s = sg, *dma = sg, *start = sg;
-   int i, count = 0;
+   int i, count = 0, ret;
unsigned int offset = s->offset;
unsigned int size = s->offset + s->length;
unsigned int max = dma_get_max_seg_size(dev);
@@ -1634,8 +1635,10 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
s->dma_length = 0;
 
if (s->offset || (size & ~PAGE_MASK) || size + s->length > max) 
{
-   if (__map_sg_chunk(dev, start, size, &dma->dma_address,
-   dir, attrs, is_coherent) < 0)
+   ret = __map_sg_chunk(dev, start, size,
+   &dma->dma_address, dir, attrs,
+is_coherent);
+   if (ret < 0)
goto bad_mapping;
 
dma->dma_address += offset;
@@ -1648,8 +1651,9 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
}
size += s->length;
}
-   if (__map_sg_chunk(dev, start, size, &dma->dma_address, dir, attrs,
-   is_coherent) < 0)
+   ret = __map_sg_chunk(dev, start, size, &dma->dma_address, dir, attrs,
+is_coherent);
+   if (ret < 0)
goto bad_mapping;
 
dma->dma_address += offset;
@@ -1660,7 +1664,7 @@ static int __iommu_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
 bad_mapping:
for_each_sg(sg, s, count, i)
__iommu_remove_mapping(dev, sg_dma_address(s), sg_dma_len(s));
-   return 0;
+   return ret;
 }
 
 /**
-- 
2.20.1



[PATCH v1 03/16] iommu: Return full error code from iommu_map_sg[_atomic]()

2021-07-15 Thread Logan Gunthorpe
Convert to ssize_t return code so the return code from __iommu_map()
can be returned all the way down through dma_iommu_map_sg().

Signed-off-by: Logan Gunthorpe 
Cc: Joerg Roedel 
Cc: Will Deacon 
---
 drivers/iommu/iommu.c | 15 +++
 include/linux/iommu.h | 22 +++---
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index 5419c4b9f27a..bf971b4e34aa 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -2567,9 +2567,9 @@ size_t iommu_unmap_fast(struct iommu_domain *domain,
 }
 EXPORT_SYMBOL_GPL(iommu_unmap_fast);
 
-static size_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
-struct scatterlist *sg, unsigned int nents, int 
prot,
-gfp_t gfp)
+static ssize_t __iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+   struct scatterlist *sg, unsigned int nents, int prot,
+   gfp_t gfp)
 {
const struct iommu_ops *ops = domain->ops;
size_t len = 0, mapped = 0;
@@ -2610,19 +2610,18 @@ static size_t __iommu_map_sg(struct iommu_domain 
*domain, unsigned long iova,
/* undo mappings already done */
iommu_unmap(domain, iova, mapped);
 
-   return 0;
-
+   return ret;
 }
 
-size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
-   struct scatterlist *sg, unsigned int nents, int prot)
+ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+struct scatterlist *sg, unsigned int nents, int prot)
 {
might_sleep();
return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_KERNEL);
 }
 EXPORT_SYMBOL_GPL(iommu_map_sg);
 
-size_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova,
+ssize_t iommu_map_sg_atomic(struct iommu_domain *domain, unsigned long iova,
struct scatterlist *sg, unsigned int nents, int prot)
 {
return __iommu_map_sg(domain, iova, sg, nents, prot, GFP_ATOMIC);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 32d448050bf7..9369458ba1bd 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -414,11 +414,11 @@ extern size_t iommu_unmap(struct iommu_domain *domain, 
unsigned long iova,
 extern size_t iommu_unmap_fast(struct iommu_domain *domain,
   unsigned long iova, size_t size,
   struct iommu_iotlb_gather *iotlb_gather);
-extern size_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
-  struct scatterlist *sg,unsigned int nents, int prot);
-extern size_t iommu_map_sg_atomic(struct iommu_domain *domain,
- unsigned long iova, struct scatterlist *sg,
- unsigned int nents, int prot);
+extern ssize_t iommu_map_sg(struct iommu_domain *domain, unsigned long iova,
+   struct scatterlist *sg, unsigned int nents, int prot);
+extern ssize_t iommu_map_sg_atomic(struct iommu_domain *domain,
+  unsigned long iova, struct scatterlist *sg,
+  unsigned int nents, int prot);
 extern phys_addr_t iommu_iova_to_phys(struct iommu_domain *domain, dma_addr_t 
iova);
 extern void iommu_set_fault_handler(struct iommu_domain *domain,
iommu_fault_handler_t handler, void *token);
@@ -679,18 +679,18 @@ static inline size_t iommu_unmap_fast(struct iommu_domain 
*domain,
return 0;
 }
 
-static inline size_t iommu_map_sg(struct iommu_domain *domain,
- unsigned long iova, struct scatterlist *sg,
- unsigned int nents, int prot)
+static inline ssize_t iommu_map_sg(struct iommu_domain *domain,
+  unsigned long iova, struct scatterlist *sg,
+  unsigned int nents, int prot)
 {
-   return 0;
+   return -ENODEV;
 }
 
-static inline size_t iommu_map_sg_atomic(struct iommu_domain *domain,
+static inline ssize_t iommu_map_sg_atomic(struct iommu_domain *domain,
  unsigned long iova, struct scatterlist *sg,
  unsigned int nents, int prot)
 {
-   return 0;
+   return -ENODEV;
 }
 
 static inline void iommu_flush_iotlb_all(struct iommu_domain *domain)
-- 
2.20.1



[PATCH v1 07/16] ia64/sba_iommu: return error code from sba_map_sg_attrs()

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

Propagate the return of dma_mapping_error() up, if it is an errno.

sba_coalesce_chunks() may presently fail only if sba_alloc_range()
fails, which in turn happens only when the IOMMU is out of mapping
resources; hence -ENOMEM is used in that case.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Michael Ellerman 
Cc: Niklas Schnelle 
Cc: Thomas Bogendoerfer 
---
 arch/ia64/hp/common/sba_iommu.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/ia64/hp/common/sba_iommu.c b/arch/ia64/hp/common/sba_iommu.c
index 9148ddbf02e5..09dbe07a18c1 100644
--- a/arch/ia64/hp/common/sba_iommu.c
+++ b/arch/ia64/hp/common/sba_iommu.c
@@ -1431,7 +1431,7 @@ static int sba_map_sg_attrs(struct device *dev, struct 
scatterlist *sglist,
unsigned long attrs)
 {
struct ioc *ioc;
-   int coalesced, filled = 0;
+   int coalesced, filled = 0, ret;
 #ifdef ASSERT_PDIR_SANITY
unsigned long flags;
 #endif
@@ -1458,8 +1458,9 @@ static int sba_map_sg_attrs(struct device *dev, struct 
scatterlist *sglist,
sglist->dma_length = sglist->length;
sglist->dma_address = sba_map_page(dev, sg_page(sglist),
sglist->offset, sglist->length, dir, attrs);
-   if (dma_mapping_error(dev, sglist->dma_address))
-   return 0;
+   ret = dma_mapping_error(dev, sglist->dma_address);
+   if (ret)
+   return ret;
return 1;
}
 
@@ -1486,7 +1487,7 @@ static int sba_map_sg_attrs(struct device *dev, struct 
scatterlist *sglist,
coalesced = sba_coalesce_chunks(ioc, dev, sglist, nents);
if (coalesced < 0) {
sba_unmap_sg_attrs(dev, sglist, nents, dir, attrs);
-   return 0;
+   return -ENOMEM;
}
 
/*
-- 
2.20.1



[PATCH v1 13/16] xen: swiotlb: return error code from xen_swiotlb_map_sg()

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

xen_swiotlb_map_sg() may only fail if xen_swiotlb_map_page() fails, but
xen_swiotlb_map_page() only supports returning errors as
DMA_MAPPING_ERROR. So coalesce all errors into EINVAL.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Konrad Rzeszutek Wilk 
Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
---
 drivers/xen/swiotlb-xen.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c
index 24d11861ac7d..b5707127c9d7 100644
--- a/drivers/xen/swiotlb-xen.c
+++ b/drivers/xen/swiotlb-xen.c
@@ -509,7 +509,7 @@ xen_swiotlb_map_sg(struct device *dev, struct scatterlist 
*sgl, int nelems,
 out_unmap:
xen_swiotlb_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
sg_dma_len(sgl) = 0;
-   return 0;
+   return -EINVAL;
 }
 
 static void
-- 
2.20.1



[PATCH v1 00/16] .map_sg() error cleanup

2021-07-15 Thread Logan Gunthorpe
Hi,

This series is spun out and expanded from my work to add P2PDMA support
to DMA map operations[1].

The P2PDMA work requires distinguishing different error conditions in
a map_sg operation. dma_map_sgtable() already allows for returning an
error code (whereas dma_map_sg() is only allowed to return zero);
however, it currently only returns -EINVAL when a .map_sg() call returns
zero.

This series cleans up all .map_sg() implementations to return appropriate
error codes. After the cleanup, dma_map_sg() will still return zero on
error; dma_map_sgtable(), however, will pass the error code from the
.map_sg() call through. Thanks go to Martin Oliveira for doing a lot of
the cleanup of the obscure implementations.

The patch set is based off of v5.14-rc1 and a git repo can be found
here:

  https://github.com/sbates130272/linux-p2pmem map_sg_err_cleanup_v1

Thanks,

Logan

[1] 
https://lore.kernel.org/linux-block/20210513223203.5542-1-log...@deltatee.com/

--

Logan Gunthorpe (5):
  dma-mapping: Allow map_sg() ops to return negative error codes
  dma-direct: Return appropriate error code from dma_direct_map_sg()
  iommu: Return full error code from iommu_map_sg[_atomic]()
  dma-iommu: Return error code from iommu_dma_map_sg()
  dma-mapping: Disallow .map_sg operations from returning zero on error

Martin Oliveira (11):
  alpha: return error code from alpha_pci_map_sg()
  ARM/dma-mapping: return error code from .map_sg() ops
  ia64/sba_iommu: return error code from sba_map_sg_attrs()
  MIPS/jazzdma: return error code from jazz_dma_map_sg()
  powerpc/iommu: return error code from .map_sg() ops
  s390/pci: return error code from s390_dma_map_sg()
  sparc/iommu: return error codes from .map_sg() ops
  parisc: return error code from .map_sg() ops
  xen: swiotlb: return error code from xen_swiotlb_map_sg()
  x86/amd_gart: return error code from gart_map_sg()
  dma-mapping: return error code from dma_dummy_map_sg()

 arch/alpha/kernel/pci_iommu.c   | 10 +++-
 arch/arm/mm/dma-mapping.c   | 22 +---
 arch/ia64/hp/common/sba_iommu.c |  9 +--
 arch/mips/jazz/jazzdma.c|  2 +-
 arch/powerpc/kernel/iommu.c |  4 +-
 arch/powerpc/platforms/ps3/system-bus.c |  2 +-
 arch/powerpc/platforms/pseries/vio.c|  5 +-
 arch/s390/pci/pci_dma.c | 12 ++--
 arch/sparc/kernel/iommu.c   |  4 +-
 arch/sparc/kernel/pci_sun4v.c   |  4 +-
 arch/sparc/mm/iommu.c   |  2 +-
 arch/x86/kernel/amd_gart_64.c   | 16 +++---
 drivers/iommu/dma-iommu.c   | 20 ---
 drivers/iommu/iommu.c   | 15 +++--
 drivers/parisc/ccio-dma.c   |  2 +-
 drivers/parisc/sba_iommu.c  |  2 +-
 drivers/xen/swiotlb-xen.c   |  2 +-
 include/linux/dma-map-ops.h |  6 +-
 include/linux/dma-mapping.h | 35 +++-
 include/linux/iommu.h   | 22 
 kernel/dma/direct.c |  2 +-
 kernel/dma/dummy.c  |  2 +-
 kernel/dma/mapping.c| 73 ++---
 23 files changed, 165 insertions(+), 108 deletions(-)


base-commit: e73f0f0ee7541171d89f2e2491130c7771ba58d3
--
2.20.1


[PATCH v1 14/16] x86/amd_gart: return error code from gart_map_sg()

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

So make __dma_map_cont() return a valid errno (which is then propagated
to gart_map_sg() via dma_map_cont()) and return it in case of failure.

Also, return -EINVAL in case of invalid nents.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: Niklas Schnelle 
Cc: Thomas Bogendoerfer 
Cc: Michael Ellerman 
---
 arch/x86/kernel/amd_gart_64.c | 16 +---
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 9ac696487b13..46aea9a4f26b 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -331,7 +331,7 @@ static int __dma_map_cont(struct device *dev, struct 
scatterlist *start,
int i;
 
if (iommu_start == -1)
-   return -1;
+   return -ENOMEM;
 
for_each_sg(start, s, nelems, i) {
unsigned long pages, addr;
@@ -380,13 +380,13 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
   enum dma_data_direction dir, unsigned long attrs)
 {
struct scatterlist *s, *ps, *start_sg, *sgmap;
-   int need = 0, nextneed, i, out, start;
+   int need = 0, nextneed, i, out, start, ret;
unsigned long pages = 0;
unsigned int seg_size;
unsigned int max_seg_size;
 
if (nents == 0)
-   return 0;
+   return -EINVAL;
 
out = 0;
start   = 0;
@@ -414,8 +414,9 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
if (!iommu_merge || !nextneed || !need || s->offset ||
(s->length + seg_size > max_seg_size) ||
(ps->offset + ps->length) % PAGE_SIZE) {
-   if (dma_map_cont(dev, start_sg, i - start,
-sgmap, pages, need) < 0)
+   ret = dma_map_cont(dev, start_sg, i - start,
+  sgmap, pages, need);
+   if (ret < 0)
goto error;
out++;
 
@@ -432,7 +433,8 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
pages += iommu_num_pages(s->offset, s->length, PAGE_SIZE);
ps = s;
}
-   if (dma_map_cont(dev, start_sg, i - start, sgmap, pages, need) < 0)
+   ret = dma_map_cont(dev, start_sg, i - start, sgmap, pages, need);
+   if (ret < 0)
goto error;
out++;
flush_gart();
@@ -458,7 +460,7 @@ static int gart_map_sg(struct device *dev, struct 
scatterlist *sg, int nents,
iommu_full(dev, pages << PAGE_SHIFT, dir);
for_each_sg(sg, s, nents, i)
s->dma_address = DMA_MAPPING_ERROR;
-   return 0;
+   return ret;
 }
 
 /* allocate and map a coherent mapping */
-- 
2.20.1



[PATCH v1 12/16] parisc: return error code from .map_sg() ops

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: "James E.J. Bottomley" 
Cc: Helge Deller 
---
 drivers/parisc/ccio-dma.c  | 2 +-
 drivers/parisc/sba_iommu.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c
index b5f9ee81a46c..a3a5cfda3d93 100644
--- a/drivers/parisc/ccio-dma.c
+++ b/drivers/parisc/ccio-dma.c
@@ -918,7 +918,7 @@ ccio_map_sg(struct device *dev, struct scatterlist *sglist, 
int nents,
BUG_ON(!dev);
ioc = GET_IOC(dev);
if (!ioc)
-   return 0;
+   return -ENODEV;

DBG_RUN_SG("%s() START %d entries\n", __func__, nents);
 
diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c
index dce4cdf786cd..9a6671a230ee 100644
--- a/drivers/parisc/sba_iommu.c
+++ b/drivers/parisc/sba_iommu.c
@@ -947,7 +947,7 @@ sba_map_sg(struct device *dev, struct scatterlist *sglist, 
int nents,
 
ioc = GET_IOC(dev);
if (!ioc)
-   return 0;
+   return -ENODEV;
 
/* Fast path single entry scatterlists. */
if (nents == 1) {
-- 
2.20.1



[PATCH v1 02/16] dma-direct: Return appropriate error code from dma_direct_map_sg()

2021-07-15 Thread Logan Gunthorpe
Now that the map_sg() op expects error codes instead of return zero on
error, convert dma_direct_map_sg() to return an error code. The
only error to return presently is EINVAL if a page could not
be mapped.

Signed-off-by: Logan Gunthorpe 
---
 kernel/dma/direct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index f737e3347059..803ee9321170 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -411,7 +411,7 @@ int dma_direct_map_sg(struct device *dev, struct 
scatterlist *sgl, int nents,
 
 out_unmap:
dma_direct_unmap_sg(dev, sgl, i, dir, attrs | DMA_ATTR_SKIP_CPU_SYNC);
-   return 0;
+   return -EINVAL;
 }
 
 dma_addr_t dma_direct_map_resource(struct device *dev, phys_addr_t paddr,
-- 
2.20.1



[PATCH v1 11/16] sparc/iommu: return error codes from .map_sg() ops

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

Returning an errno from __sbus_iommu_map_sg() results in
sbus_iommu_map_sg_gflush() and sbus_iommu_map_sg_pflush() returning an
errno, as those functions are wrappers around __sbus_iommu_map_sg().

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: "David S. Miller" 
Cc: Niklas Schnelle 
Cc: Michael Ellerman 
---
 arch/sparc/kernel/iommu.c | 4 ++--
 arch/sparc/kernel/pci_sun4v.c | 4 ++--
 arch/sparc/mm/iommu.c | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/sparc/kernel/iommu.c b/arch/sparc/kernel/iommu.c
index a034f571d869..0589acd34201 100644
--- a/arch/sparc/kernel/iommu.c
+++ b/arch/sparc/kernel/iommu.c
@@ -448,7 +448,7 @@ static int dma_4u_map_sg(struct device *dev, struct 
scatterlist *sglist,
iommu = dev->archdata.iommu;
strbuf = dev->archdata.stc;
if (nelems == 0 || !iommu)
-   return 0;
+   return -EINVAL;
 
spin_lock_irqsave(&iommu->lock, flags);
 
@@ -580,7 +580,7 @@ static int dma_4u_map_sg(struct device *dev, struct 
scatterlist *sglist,
}
spin_unlock_irqrestore(&iommu->lock, flags);
 
-   return 0;
+   return -EINVAL;
 }
 
 /* If contexts are being used, they are the same in all of the mappings
diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c
index 9de57e88f7a1..d90e80fa5705 100644
--- a/arch/sparc/kernel/pci_sun4v.c
+++ b/arch/sparc/kernel/pci_sun4v.c
@@ -486,7 +486,7 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist,
 
iommu = dev->archdata.iommu;
if (nelems == 0 || !iommu)
-   return 0;
+   return -EINVAL;
atu = iommu->atu;
 
prot = HV_PCI_MAP_ATTR_READ;
@@ -619,7 +619,7 @@ static int dma_4v_map_sg(struct device *dev, struct 
scatterlist *sglist,
}
local_irq_restore(flags);
 
-   return 0;
+   return -EINVAL;
 }
 
 static void dma_4v_unmap_sg(struct device *dev, struct scatterlist *sglist,
diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c
index 0c0342e5b10d..01ffcedd159c 100644
--- a/arch/sparc/mm/iommu.c
+++ b/arch/sparc/mm/iommu.c
@@ -256,7 +256,7 @@ static int __sbus_iommu_map_sg(struct device *dev, struct 
scatterlist *sgl,
sg->dma_address =__sbus_iommu_map_page(dev, sg_page(sg),
sg->offset, sg->length, per_page_flush);
if (sg->dma_address == DMA_MAPPING_ERROR)
-   return 0;
+   return -EINVAL;
sg->dma_length = sg->length;
}
 
-- 
2.20.1



[PATCH v1 01/16] dma-mapping: Allow map_sg() ops to return negative error codes

2021-07-15 Thread Logan Gunthorpe
Allow dma_map_sgtable() to pass errors from the map_sg() ops. This
will be required for returning appropriate error codes when mapping
P2PDMA memory.

Introduce __dma_map_sg_attrs() which will return the raw error code
from the map_sg operation (whether it be negative or zero). Then add a
dma_map_sg_attrs() wrapper to convert any negative errors to zero to
satisfy the existing calling convention.

dma_map_sgtable() will convert a zero error return for old map_sg() ops
into a -EINVAL return and return any negative errors as reported.

This allows map_sg implementations to start returning multiple
negative error codes. Legacy map_sg implementations can continue
to return zero until they are all converted.
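
Condensed to its core, the wrapper pattern described above looks like
this (sketch only; the full implementation is in the diff):

	int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
			     int nents, enum dma_data_direction dir,
			     unsigned long attrs)
	{
		int ents = __dma_map_sg_attrs(dev, sg, nents, dir, attrs);

		return ents < 0 ? 0 : ents; /* legacy zero-on-error convention */
	}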

Signed-off-by: Logan Gunthorpe 
---
 include/linux/dma-map-ops.h |  8 +++-
 include/linux/dma-mapping.h | 35 --
 kernel/dma/mapping.c| 73 +
 3 files changed, 78 insertions(+), 38 deletions(-)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index 0d53a96a3d64..eaa969be8284 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -41,8 +41,12 @@ struct dma_map_ops {
size_t size, enum dma_data_direction dir,
unsigned long attrs);
/*
-* map_sg returns 0 on error and a value > 0 on success.
-* It should never return a value < 0.
+* map_sg should return a negative error code on error.
+* dma_map_sgtable() will return the error code returned and convert
+* a zero return (for legacy implementations) into -EINVAL.
+*
+* dma_map_sg() will always return zero on any negative or zero
+* return to satisfy its own calling convention.
 */
int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, unsigned long attrs);
diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h
index 183e7103a66d..daa1e360f0ee 100644
--- a/include/linux/dma-mapping.h
+++ b/include/linux/dma-mapping.h
@@ -110,6 +110,8 @@ int dma_map_sg_attrs(struct device *dev, struct scatterlist 
*sg, int nents,
 void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg,
  int nents, enum dma_data_direction dir,
  unsigned long attrs);
+int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
+   enum dma_data_direction dir, unsigned long attrs);
 dma_addr_t dma_map_resource(struct device *dev, phys_addr_t phys_addr,
size_t size, enum dma_data_direction dir, unsigned long attrs);
 void dma_unmap_resource(struct device *dev, dma_addr_t addr, size_t size,
@@ -174,6 +176,11 @@ static inline void dma_unmap_sg_attrs(struct device *dev,
unsigned long attrs)
 {
 }
+static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
+   enum dma_data_direction dir, unsigned long attrs)
+{
+   return -EOPNOTSUPP;
+}
 static inline dma_addr_t dma_map_resource(struct device *dev,
phys_addr_t phys_addr, size_t size, enum dma_data_direction dir,
unsigned long attrs)
@@ -343,34 +350,6 @@ static inline void dma_sync_single_range_for_device(struct 
device *dev,
return dma_sync_single_for_device(dev, addr + offset, size, dir);
 }
 
-/**
- * dma_map_sgtable - Map the given buffer for DMA
- * @dev:   The device for which to perform the DMA operation
- * @sgt:   The sg_table object describing the buffer
- * @dir:   DMA direction
- * @attrs: Optional DMA attributes for the map operation
- *
- * Maps a buffer described by a scatterlist stored in the given sg_table
- * object for the @dir DMA operation by the @dev device. After success the
- * ownership for the buffer is transferred to the DMA domain.  One has to
- * call dma_sync_sgtable_for_cpu() or dma_unmap_sgtable() to move the
- * ownership of the buffer back to the CPU domain before touching the
- * buffer by the CPU.
- *
- * Returns 0 on success or -EINVAL on error during mapping the buffer.
- */
-static inline int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
-   enum dma_data_direction dir, unsigned long attrs)
-{
-   int nents;
-
-   nents = dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
-   if (nents <= 0)
-   return -EINVAL;
-   sgt->nents = nents;
-   return 0;
-}
-
 /**
  * dma_unmap_sgtable - Unmap the given buffer for DMA
  * @dev:   The device for which to perform the DMA operation
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 2b06a809d0b9..30f89d244566 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -177,12 +177,8 @@ void dma_unmap_page_attrs(struct device *dev, dma_addr_t 
addr, size_t size,
 }
 EXPORT_SYMBOL(dma_unmap_page_attrs);
 
-/*
- * dma_maps_sg_attrs returns 0 on error and > 0 on success.
- * It should never return a value < 0.
- */
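
The rest of this diff is cut off here; reconstructed from the description
above and from the final form visible in patch 16/16 below, the reworked
kernel/dma/mapping.c helpers look roughly like this (a sketch, not the
verbatim patch):

static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg,
		int nents, enum dma_data_direction dir, unsigned long attrs)
{
	const struct dma_map_ops *ops = get_dma_ops(dev);
	int ents;

	if (dma_map_direct(dev, ops))
		ents = dma_direct_map_sg(dev, sg, nents, dir, attrs);
	else
		ents = ops->map_sg(dev, sg, nents, dir, attrs);

	if (ents > 0)
		debug_dma_map_sg(dev, sg, nents, ents, dir);

	return ents;	/* raw return: > 0, 0 (legacy failure) or -errno */
}

int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, int nents,
		enum dma_data_direction dir, unsigned long attrs)
{
	int ret = __dma_map_sg_attrs(dev, sg, nents, dir, attrs);

	return ret < 0 ? 0 : ret;	/* keep the zero-on-error convention */
}

int dma_map_sgtable(struct device *dev, struct sg_table *sgt,
		enum dma_data_direction dir, unsigned long attrs)
{
	int nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents,
				       dir, attrs);

	if (nents == 0)
		return -EINVAL;	/* legacy .map_sg() implementation failed */
	if (nents < 0)
		return nents;	/* new-style negative error code */

	sgt->nents = nents;
	return 0;
}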

[PATCH v1 05/16] alpha: return error code from alpha_pci_map_sg()

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

pci_map_single_1() can fail for different reasons, but since the only
supported type of error return is DMA_MAPPING_ERROR, we coalesce those
errors into EINVAL.

ENOMEM is returned when no page tables can be allocated.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Richard Henderson 
Cc: Ivan Kokshaysky 
Cc: Matt Turner 
---
 arch/alpha/kernel/pci_iommu.c | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index 35d7b3096d6e..72fc2465d13c 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -649,7 +649,9 @@ static int alpha_pci_map_sg(struct device *dev, struct 
scatterlist *sg,
sg->dma_address
  = pci_map_single_1(pdev, SG_ENT_VIRT_ADDRESS(sg),
 sg->length, dac_allowed);
-   return sg->dma_address != DMA_MAPPING_ERROR;
+   if (sg->dma_address == DMA_MAPPING_ERROR)
+   return -EINVAL;
+   return 1;
}
 
start = sg;
@@ -685,8 +687,10 @@ static int alpha_pci_map_sg(struct device *dev, struct 
scatterlist *sg,
if (out < end)
out->dma_length = 0;
 
-   if (out - start == 0)
+   if (out - start == 0) {
printk(KERN_WARNING "pci_map_sg failed: no entries?\n");
+   return -ENOMEM;
+   }
DBGA("pci_map_sg: %ld entries\n", out - start);
 
return out - start;
@@ -699,7 +703,7 @@ static int alpha_pci_map_sg(struct device *dev, struct 
scatterlist *sg,
   entries.  Unmap them now.  */
if (out > start)
pci_unmap_sg(pdev, start, out - start, dir);
-   return 0;
+   return -ENOMEM;
 }
 
 /* Unmap a set of streaming mode DMA translations.  Again, cpu read
-- 
2.20.1



[PATCH v1 10/16] s390/pci: return error code from s390_dma_map_sg()

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

So propagate the error from __s390_dma_map_sg() up.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Niklas Schnelle 
Cc: Gerald Schaefer 
Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
---
 arch/s390/pci/pci_dma.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c
index ebc9a49523aa..c78b02012764 100644
--- a/arch/s390/pci/pci_dma.c
+++ b/arch/s390/pci/pci_dma.c
@@ -487,7 +487,7 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
unsigned int max = dma_get_max_seg_size(dev);
unsigned int size = s->offset + s->length;
unsigned int offset = s->offset;
-   int count = 0, i;
+   int count = 0, i, ret;
 
for (i = 1; i < nr_elements; i++) {
s = sg_next(s);
@@ -497,8 +497,9 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
 
if (s->offset || (size & ~PAGE_MASK) ||
size + s->length > max) {
-   if (__s390_dma_map_sg(dev, start, size,
- &dma->dma_address, dir))
+   ret = __s390_dma_map_sg(dev, start, size,
+   &dma->dma_address, dir);
+   if (ret)
goto unmap;
 
dma->dma_address += offset;
@@ -511,7 +512,8 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
}
size += s->length;
}
-   if (__s390_dma_map_sg(dev, start, size, &dma->dma_address, dir))
+   ret = __s390_dma_map_sg(dev, start, size, &dma->dma_address, dir);
+   if (ret)
goto unmap;
 
dma->dma_address += offset;
@@ -523,7 +525,7 @@ static int s390_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
s390_dma_unmap_pages(dev, sg_dma_address(s), sg_dma_len(s),
 dir, attrs);
 
-   return 0;
+   return ret;
 }
 
 static void s390_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-- 
2.20.1



Re: [PATCH v1 00/16] .map_sg() error cleanup

2021-07-15 Thread Logan Gunthorpe




On 2021-07-15 10:53 a.m., Russell King (Oracle) wrote:
> On Thu, Jul 15, 2021 at 10:45:28AM -0600, Logan Gunthorpe wrote:
>> Hi,
>>
>> This series is spun out and expanded from my work to add P2PDMA support
>> to DMA map operations[1].
>>
>> The P2PDMA work requires distinguishing different error conditions in
>> a map_sg operation. dma_map_sgtable() already allows for returning an
>> error code (where as dma_map_sg() is only allowed to return zero)
>> however, it currently only returns -EINVAL when a .map_sg() call returns
>> zero.
>>
>> This series cleans up all .map_sg() implementations to return appropriate
>> error codes. After the cleanup, dma_map_sg() will still return zero,
>> however dma_map_sgtable() will pass the error code from the .map_sg()
>> call. Thanks go to Martin Oliveira for doing a lot of the cleanup of the
>> obscure implementations.
>>
>> The patch set is based off of v5.14-rc1 and a git repo can be found
>> here:
> 
> Have all the callers for dma_map_sg() been updated to check for error
> codes? If not, isn't that a prerequisite to this patch set?

No. Perhaps I wasn't clear enough: This series is changing only
implementations of .map_sg(). It does *not* change the return code of
dma_map_sg(). dma_map_sg() will continue to return zero on error for the
foreseeable future. The dma_map_sgtable() call already allows returning
error codes and it will pass the new error code through. This is what
will be used in the P2PDMA work.
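
To make the two calling conventions concrete (an illustrative sketch
only; example_map() is a made-up caller, not part of the series):

static int example_map(struct device *dev, struct sg_table *sgt)
{
	int ret;

	/* dma_map_sg(): zero means failure, with no reason available */
	if (!dma_map_sg(dev, sgt->sgl, sgt->orig_nents, DMA_TO_DEVICE))
		return -EIO;	/* the caller has to invent an errno */
	dma_unmap_sg(dev, sgt->sgl, sgt->orig_nents, DMA_TO_DEVICE);

	/* dma_map_sgtable(): negative errno straight from .map_sg() */
	ret = dma_map_sgtable(dev, sgt, DMA_TO_DEVICE, 0);
	if (ret)
		return ret;	/* e.g. -EINVAL, -ENOMEM, -EIO */
	dma_unmap_sgtable(dev, sgt, DMA_TO_DEVICE, 0);

	return 0;
}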

Logan


[PATCH v1 08/16] MIPS/jazzdma: return error code from jazz_dma_map_sg()

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

vdma_alloc() may fail for different reasons, but since it only supports
indicating an error via a return of DMA_MAPPING_ERROR, we coalesce the
different reasons into -EINVAL.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
Cc: Thomas Bogendoerfer 
---
 arch/mips/jazz/jazzdma.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/mips/jazz/jazzdma.c b/arch/mips/jazz/jazzdma.c
index 461457b28982..3b99743435db 100644
--- a/arch/mips/jazz/jazzdma.c
+++ b/arch/mips/jazz/jazzdma.c
@@ -552,7 +552,7 @@ static int jazz_dma_map_sg(struct device *dev, struct 
scatterlist *sglist,
dir);
sg->dma_address = vdma_alloc(sg_phys(sg), sg->length);
if (sg->dma_address == DMA_MAPPING_ERROR)
-   return 0;
+   return -EINVAL;
sg_dma_len(sg) = sg->length;
}

--
2.20.1


[PATCH v1 04/16] dma-iommu: Return error code from iommu_dma_map_sg()

2021-07-15 Thread Logan Gunthorpe
Pass through appropriate error codes from iommu_dma_map_sg() now that
the error code will be passed through dma_map_sgtable().

Signed-off-by: Logan Gunthorpe 
Cc: Joerg Roedel 
Cc: Will Deacon 
---
 drivers/iommu/dma-iommu.c | 20 +---
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c
index 98ba927aee1a..9d35e9994306 100644
--- a/drivers/iommu/dma-iommu.c
+++ b/drivers/iommu/dma-iommu.c
@@ -972,7 +972,7 @@ static int iommu_dma_map_sg_swiotlb(struct device *dev, 
struct scatterlist *sg,
 
 out_unmap:
iommu_dma_unmap_sg_swiotlb(dev, sg, i, dir, attrs | 
DMA_ATTR_SKIP_CPU_SYNC);
-   return 0;
+   return -EINVAL;
 }
 
 /*
@@ -993,11 +993,14 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
dma_addr_t iova;
size_t iova_len = 0;
unsigned long mask = dma_get_seg_boundary(dev);
+   ssize_t ret;
int i;
 
-   if (static_branch_unlikely(&iommu_deferred_attach_enabled) &&
-   iommu_deferred_attach(dev, domain))
-   return 0;
+   if (static_branch_unlikely(&iommu_deferred_attach_enabled)) {
+   ret = iommu_deferred_attach(dev, domain);
+   if (ret)
+   return ret;
+   }
 
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
iommu_dma_sync_sg_for_device(dev, sg, nents, dir);
@@ -1045,14 +1048,17 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
}
 
iova = iommu_dma_alloc_iova(domain, iova_len, dma_get_mask(dev), dev);
-   if (!iova)
+   if (!iova) {
+   ret = -ENOMEM;
goto out_restore_sg;
+   }
 
/*
 * We'll leave any physical concatenation to the IOMMU driver's
 * implementation - it knows better than we do.
 */
-   if (iommu_map_sg_atomic(domain, iova, sg, nents, prot) < iova_len)
+   ret = iommu_map_sg_atomic(domain, iova, sg, nents, prot);
+   if (ret < iova_len)
goto out_free_iova;
 
return __finalise_sg(dev, sg, nents, iova);
@@ -1061,7 +1067,7 @@ static int iommu_dma_map_sg(struct device *dev, struct 
scatterlist *sg,
iommu_dma_free_iova(cookie, iova, iova_len, NULL);
 out_restore_sg:
__invalidate_sg(sg, nents);
-   return 0;
+   return ret;
 }
 
 static void iommu_dma_unmap_sg(struct device *dev, struct scatterlist *sg,
-- 
2.20.1



[PATCH v1 16/16] dma-mapping: Disallow .map_sg operations from returning zero on error

2021-07-15 Thread Logan Gunthorpe
Now that all the .map_sg operations have been converted to returning
proper error codes, drop the code to handle a zero return value,
add a warning if a zero is returned and update the comment for the
map_sg operation.

Signed-off-by: Logan Gunthorpe 
---
 include/linux/dma-map-ops.h | 8 +++-
 kernel/dma/mapping.c| 6 +++---
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
index eaa969be8284..f299bc1e317b 100644
--- a/include/linux/dma-map-ops.h
+++ b/include/linux/dma-map-ops.h
@@ -42,11 +42,9 @@ struct dma_map_ops {
unsigned long attrs);
/*
 * map_sg should return a negative error code on error.
-* dma_map_sgtable() will return the error code returned and convert
-* a zero return (for legacy implementations) into -EINVAL.
-*
-* dma_map_sg() will always return zero on any negative or zero
-* return to satisfy its own calling convention.
+* dma_map_sgtable() will return the error code returned by the
+* operation and dma_map_sg() will always convert any error to zero
+* to satisfy its own calling convention.
 */
int (*map_sg)(struct device *dev, struct scatterlist *sg, int nents,
enum dma_data_direction dir, unsigned long attrs);
diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c
index 30f89d244566..978a6a16aaf7 100644
--- a/kernel/dma/mapping.c
+++ b/kernel/dma/mapping.c
@@ -194,6 +194,8 @@ static int __dma_map_sg_attrs(struct device *dev, struct 
scatterlist *sg,
else
ents = ops->map_sg(dev, sg, nents, dir, attrs);
 
+   WARN_ON_ONCE(ents == 0);
+
if (ents > 0)
debug_dma_map_sg(dev, sg, nents, ents, dir);
 
@@ -251,9 +253,7 @@ int dma_map_sgtable(struct device *dev, struct sg_table 
*sgt,
int nents;
 
nents = __dma_map_sg_attrs(dev, sgt->sgl, sgt->orig_nents, dir, attrs);
-   if (nents == 0)
-   return -EINVAL;
-   else if (nents < 0)
+   if (nents < 0)
return nents;
 
sgt->nents = nents;
-- 
2.20.1



[PATCH v1 15/16] dma-mapping: return error code from dma_dummy_map_sg()

2021-07-15 Thread Logan Gunthorpe
From: Martin Oliveira 

The .map_sg() op now expects an error code instead of zero on failure.

The only errno to return is -ENODEV in the case when DMA is not
supported.

Signed-off-by: Martin Oliveira 
Signed-off-by: Logan Gunthorpe 
---
 kernel/dma/dummy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/dma/dummy.c b/kernel/dma/dummy.c
index eacd4c5b10bf..ae9abebed0c4 100644
--- a/kernel/dma/dummy.c
+++ b/kernel/dma/dummy.c
@@ -22,7 +22,7 @@ static int dma_dummy_map_sg(struct device *dev, struct 
scatterlist *sgl,
int nelems, enum dma_data_direction dir,
unsigned long attrs)
 {
-   return 0;
+   return -ENODEV;
 }
 
 static int dma_dummy_supported(struct device *hwdev, u64 mask)
-- 
2.20.1



Re: [PATCH v2 01/14] PCI: Use sysfs_emit() and sysfs_emit_at() in "show" functions

2021-05-17 Thread Logan Gunthorpe



On 2021-05-14 11:24 p.m., Krzysztof Wilczyński wrote:
> The sysfs_emit() and sysfs_emit_at() functions were introduced to make
> it less ambiguous which function is preferred when writing to the output
> buffer in a device attribute's "show" callback [1].
> 
> Convert the PCI sysfs object "show" functions from sprintf(), snprintf()
> and scnprintf() to sysfs_emit() and sysfs_emit_at() accordingly, as the
> latter is aware of the PAGE_SIZE buffer and correctly returns the number
> of bytes written into the buffer.
> 
> No functional change intended.
> 
> [1] Documentation/filesystems/sysfs.rst
> 
> Related to:
>   commit ad025f8e46f3 ("PCI/sysfs: Use sysfs_emit() and sysfs_emit_at() in 
> "show" functions")
> 
> Signed-off-by: Krzysztof Wilczyński 
> Reviewed-by: Logan Gunthorpe 

I re-reviewed the whole series. It still looks good to me.

Very nice solution in patch 12 to the new line issue.

Reviewed-by: Logan Gunthorpe 

Thanks,

Logan


Re: [PATCH 01/11] PCI: Use sysfs_emit() and sysfs_emit_at() in "show" functions

2021-05-10 Thread Logan Gunthorpe



On 2021-05-09 10:14 p.m., Krzysztof Wilczyński wrote:
> The sysfs_emit() and sysfs_emit_at() functions were introduced to make
> it less ambiguous which function is preferred when writing to the output
> buffer in a device attribute's "show" callback [1].
> 
> Convert the PCI sysfs object "show" functions from sprintf(), snprintf()
> and scnprintf() to sysfs_emit() and sysfs_emit_at() accordingly, as the
> latter is aware of the PAGE_SIZE buffer and correctly returns the number
> of bytes written into the buffer.
> 
> No functional change intended.
> 
> [1] Documentation/filesystems/sysfs.rst
> 
> Related to:
>   commit ad025f8e46f3 ("PCI/sysfs: Use sysfs_emit() and sysfs_emit_at() in 
> "show" functions")
> 
> Signed-off-by: Krzysztof Wilczyński 

Thanks, this is a great cleanup. I've reviewed the entire series.

Reviewed-by: Logan Gunthorpe 

I agree that the new lines that are missing should be added.
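
For anyone following along, the conversion pattern looks like this (an
illustrative example modeled on the series, not an excerpt from it):

static ssize_t vendor_show(struct device *dev,
			   struct device_attribute *attr, char *buf)
{
	struct pci_dev *pdev = to_pci_dev(dev);

	/* Before: return sprintf(buf, "0x%04x\n", pdev->vendor); */

	/* After: sysfs_emit() knows buf is a PAGE_SIZE sysfs buffer
	 * and returns the number of bytes written into it. */
	return sysfs_emit(buf, "0x%04x\n", pdev->vendor);
}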

Logan


Re: [patch V2 11/15] completion: Use simple wait queues

2020-03-18 Thread Logan Gunthorpe



On 2020-03-18 2:43 p.m., Thomas Gleixner wrote:
> There is no semantical or functional change:
> 
>   - completions use the exclusive wait mode which is what swait provides
> 
>   - complete() wakes one exclusive waiter
> 
>   - complete_all() wakes all waiters while holding the lock which protects
> the wait queue against newly incoming waiters. The conversion to swait
> preserves this behaviour.
> 
> complete_all() might cause unbound latencies with a large number of waiters
> being woken at once, but most complete_all() usage sites are either in
> testing or initialization code or have only a really small number of
> concurrent waiters which for now does not cause a latency problem. Keep it
> simple for now.

Seems like it would be worth adding a note for this to the
complete_all() doc string. Otherwise developers will not likely find out
about this issue and may not keep it as simple as you'd like.
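
Something along these lines in the kernel-doc would do (suggested
wording only, not an actual patch):

/**
 * complete_all: - signals all threads waiting on this completion
 * @x:  holds the state of this particular completion
 *
 * NOTE: complete_all() wakes all waiters while holding the lock that
 * protects the wait queue, so a large number of concurrent waiters
 * can see unbounded wake-up latencies. Prefer it in initialization,
 * teardown and test paths with only a few waiters.
 */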

Logan


Re: [patch V2 02/15] pci/switchtec: Replace completion wait queue usage for poll

2020-03-18 Thread Logan Gunthorpe



On 2020-03-18 2:43 p.m., Thomas Gleixner wrote:
> From: Sebastian Andrzej Siewior 
> 
> The poll callback is using the completion wait queue and sticks it into
> poll_wait() to wake up pollers after a command has completed.
> 
> This works to some extent, but cannot provide EPOLLEXCLUSIVE support
> because the waker side uses complete_all() which unconditionally wakes up
> all waiters. complete_all() is required because completions internally use
> exclusive wait and complete() only wakes up one waiter by default.
> 
> This mixes conceptually different mechanisms and relies on internal
> implementation details of completions, which in turn puts constraints on
> changing the internal implementation of completions.
> 
> Replace it with a regular wait queue and store the state in struct
> switchtec_user.
> 
> Signed-off-by: Sebastian Andrzej Siewior 
> Acked-by: Peter Zijlstra (Intel) 

While I've been against open coding the completion in this driver for a
while, I'm convinced by the EPOLLEXCLUSIVE argument for this change.
I've reviewed and lightly tested the change with hardware:

Reviewed-by: Logan Gunthorpe 

Thanks,

Logan

> Cc: Kurt Schwemmer 
> Cc: Logan Gunthorpe 
> Cc: Bjorn Helgaas 
> Cc: linux-...@vger.kernel.org
> ---
> V2: Reworded changelog.
> ---
>  drivers/pci/switch/switchtec.c |   22 +-
>  1 file changed, 13 insertions(+), 9 deletions(-)
> 
> --- a/drivers/pci/switch/switchtec.c
> +++ b/drivers/pci/switch/switchtec.c
> @@ -52,10 +52,11 @@ struct switchtec_user {
>  
>   enum mrpc_state state;
>  
> - struct completion comp;
> + wait_queue_head_t cmd_comp;
>   struct kref kref;
>   struct list_head list;
>  
> + bool cmd_done;
>   u32 cmd;
>   u32 status;
>   u32 return_code;
> @@ -77,7 +78,7 @@ static struct switchtec_user *stuser_cre
>   stuser->stdev = stdev;
>   kref_init(&stuser->kref);
>   INIT_LIST_HEAD(&stuser->list);
> - init_completion(&stuser->comp);
> + init_waitqueue_head(&stuser->cmd_comp);
>   stuser->event_cnt = atomic_read(&stdev->event_cnt);
>  
>   dev_dbg(&stdev->dev, "%s: %p\n", __func__, stuser);
> @@ -175,7 +176,7 @@ static int mrpc_queue_cmd(struct switcht
>   kref_get(&stuser->kref);
>   stuser->read_len = sizeof(stuser->data);
>   stuser_set_state(stuser, MRPC_QUEUED);
> - reinit_completion(&stuser->comp);
> + stuser->cmd_done = false;
>   list_add_tail(&stuser->list, &stdev->mrpc_queue);
>  
>   mrpc_cmd_submit(stdev);
> @@ -222,7 +223,8 @@ static void mrpc_complete_cmd(struct swi
>   memcpy_fromio(stuser->data, &stdev->mmio_mrpc->output_data,
> stuser->read_len);
>  out:
> - complete_all(&stuser->comp);
> + stuser->cmd_done = true;
> + wake_up_interruptible(&stuser->cmd_comp);
>   list_del_init(&stuser->list);
>   stuser_put(stuser);
>   stdev->mrpc_busy = 0;
> @@ -529,10 +531,11 @@ static ssize_t switchtec_dev_read(struct
>   mutex_unlock(&stdev->mrpc_mutex);
>  
>   if (filp->f_flags & O_NONBLOCK) {
> - if (!try_wait_for_completion(&stuser->comp))
> + if (!stuser->cmd_done)
>   return -EAGAIN;
>   } else {
> - rc = wait_for_completion_interruptible(&stuser->comp);
> + rc = wait_event_interruptible(stuser->cmd_comp,
> +   stuser->cmd_done);
>   if (rc < 0)
>   return rc;
>   }
> @@ -580,7 +583,7 @@ static __poll_t switchtec_dev_poll(struc
>   struct switchtec_dev *stdev = stuser->stdev;
>   __poll_t ret = 0;
>  
> - poll_wait(filp, &stuser->comp.wait, wait);
> + poll_wait(filp, &stuser->cmd_comp, wait);
>   poll_wait(filp, &stdev->event_wq, wait);
>  
>   if (lock_mutex_and_test_alive(stdev))
> @@ -588,7 +591,7 @@ static __poll_t switchtec_dev_poll(struc
>  
>   mutex_unlock(&stdev->mrpc_mutex);
>  
> - if (try_wait_for_completion(&stuser->comp))
> + if (stuser->cmd_done)
>   ret |= EPOLLIN | EPOLLRDNORM;
>  
>   if (stuser->event_cnt != atomic_read(&stdev->event_cnt))
> @@ -1272,7 +1275,8 @@ static void stdev_kill(struct switchtec_
>  
>   /* Wake up and kill any users waiting on an MRPC request */
>   list_for_each_entry_safe(stuser, tmpuser, &stdev->mrpc_queue, list) {
> - complete_all(&stuser->comp);
> + stuser->cmd_done = true;
> + wake_up_interruptible(&stuser->cmd_comp);
>   list_del_init(&stuser->list);
>   stuser_put(stuser);
>   }
> 


[PATCH v4 7/7] mm/memremap: Set caching mode for PCI P2PDMA memory to WC

2020-03-06 Thread Logan Gunthorpe
PCI BAR IO memory should never be mapped as WB, however prior to this
the PAT bits were set WB and it was typically overridden by MTRR
registers set by the firmware.

Set PCI P2PDMA memory to be UC as this is what it currently, typically,
ends up being mapped as on x86 after the MTRR registers override the
cache setting.

Future use-cases may need to generalize this by adding flags to
select the caching type, as some P2PDMA cases may not want UC.
However, those use-cases are not upstream yet and this can be changed
when they arrive.

Cc: Christoph Hellwig 
Cc: Jason Gunthorpe 
Signed-off-by: Logan Gunthorpe 
Reviewed-by: Dan Williams 
---
 mm/memremap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/memremap.c b/mm/memremap.c
index 06742372a203..9033ae401448 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -190,7 +190,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
break;
case MEMORY_DEVICE_DEVDAX:
+   need_devmap_managed = false;
+   break;
case MEMORY_DEVICE_PCI_P2PDMA:
+   params.pgprot = pgprot_noncached(params.pgprot);
need_devmap_managed = false;
break;
default:
-- 
2.20.1



[PATCH v4 2/7] mm/memory_hotplug: Rename mhp_restrictions to mhp_params

2020-03-06 Thread Logan Gunthorpe
The mhp_restrictions struct really doesn't specify anything resembling
a restriction anymore so rename it to be mhp_params as it is a list
of extended parameters.

Signed-off-by: Logan Gunthorpe 
Reviewed-by: David Hildenbrand 
Reviewed-by: Dan Williams 
Acked-by: Michal Hocko 
---
 arch/arm64/mm/mmu.c|  4 ++--
 arch/ia64/mm/init.c|  4 ++--
 arch/powerpc/mm/mem.c  |  4 ++--
 arch/s390/mm/init.c|  6 +++---
 arch/sh/mm/init.c  |  4 ++--
 arch/x86/mm/init_32.c  |  4 ++--
 arch/x86/mm/init_64.c  |  8 
 include/linux/memory_hotplug.h | 16 
 mm/memory_hotplug.c|  8 
 mm/memremap.c  |  8 
 10 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 128f70852bf3..ee37bca8aba8 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1050,7 +1050,7 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_params *params)
 {
int flags = 0;
 
@@ -1063,7 +1063,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
memblock_clear_nomap(start, size);
 
return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
-  restrictions);
+  params);
 }
 void arch_remove_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap)
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b01d68a2d5d9..97bbc23ea1e3 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -670,13 +670,13 @@ mem_init (void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_params *params)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
-   ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
+   ret = __add_pages(nid, start_pfn, nr_pages, params);
if (ret)
printk("%s: Problem encountered in __add_pages() as ret=%d\n",
   __func__,  ret);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ef7b1119b2e2..b4bece53bec0 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -128,7 +128,7 @@ static void flush_dcache_range_chunked(unsigned long start, 
unsigned long stop,
 }
 
 int __ref arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -144,7 +144,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
return -EFAULT;
}
 
-   return __add_pages(nid, start_pfn, nr_pages, restrictions);
+   return __add_pages(nid, start_pfn, nr_pages, params);
 }
 
 void __ref arch_remove_memory(int nid, u64 start, u64 size,
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ac44bd76db4b..e9e4a7abd0cc 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -268,20 +268,20 @@ device_initcall(s390_cma_mem_init);
 #endif /* CONFIG_CMA */
 
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_params *params)
 {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
int rc;
 
-   if (WARN_ON_ONCE(restrictions->altmap))
+   if (WARN_ON_ONCE(params->altmap))
return -EINVAL;
 
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
 
-   rc = __add_pages(nid, start_pfn, size_pages, restrictions);
+   rc = __add_pages(nid, start_pfn, size_pages, params);
if (rc)
vmem_remove_mapping(start, size);
return rc;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index d1b1ff2be17a..e5114c053364 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -406,14 +406,14 @@ void __init mem_init(void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_params *params)
 {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
/* We only have ZONE_NORMAL, so this is easy.. */
-   ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
+   ret = __add_pages(nid, start_pfn, nr_pages, params);
if (unlikely(ret))
printk("%s: Failed, __a

[PATCH v4 6/7] mm/memory_hotplug: Add pgprot_t to mhp_params

2020-03-06 Thread Logan Gunthorpe
devm_memremap_pages() is currently used by the PCI P2PDMA code to create
struct page mappings for IO memory. At present, these mappings are created
with PAGE_KERNEL which implies setting the PAT bits to be WB. However, on
x86, an mtrr register will typically override this and force the cache
type to be UC-. In the case firmware doesn't set this register it is
effectively WB and will typically result in a machine check exception
when it's accessed.

Other arches are not currently likely to function correctly seeing they
don't have any MTRR registers to fall back on.

To solve this, provide a way to specify the pgprot value explicitly to
arch_add_memory().

Of the arches that support MEMORY_HOTPLUG: x86_64, and arm64 need a simple
change to pass the pgprot_t down to their respective functions which set
up the page tables. For x86_32, set the page tables explicitly using
_set_memory_prot() (seeing they are already mapped). For ia64, s390 and
sh, reject anything but PAGE_KERNEL settings -- this should be fine,
for now, seeing these architectures don't support ZONE_DEVICE.

A check in __add_pages() is also added to ensure the pgprot parameter was
set for all arches.

Signed-off-by: Logan Gunthorpe 
Acked-by: David Hildenbrand 
Acked-by: Michal Hocko 
Acked-by: Dan Williams 
---
 arch/arm64/mm/mmu.c|  3 ++-
 arch/ia64/mm/init.c|  3 +++
 arch/powerpc/mm/mem.c  |  3 ++-
 arch/s390/mm/init.c|  3 +++
 arch/sh/mm/init.c  |  3 +++
 arch/x86/mm/init_32.c  | 12 
 arch/x86/mm/init_64.c  |  2 +-
 include/linux/memory_hotplug.h |  3 +++
 mm/memory_hotplug.c|  5 -
 mm/memremap.c  |  6 +++---
 10 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ee37bca8aba8..ea3fa844a8a2 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1058,7 +1058,8 @@ int arch_add_memory(int nid, u64 start, u64 size,
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
-size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
+size, params->pgprot, __pgd_pgtable_alloc,
+flags);
 
memblock_clear_nomap(start, size);
 
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 97bbc23ea1e3..d637b4ea3147 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -676,6 +676,9 @@ int arch_add_memory(int nid, u64 start, u64 size,
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
+   if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
+   return -EINVAL;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
if (ret)
printk("%s: Problem encountered in __add_pages() as ret=%d\n",
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 19b1da5d7eca..832412bc7fad 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -138,7 +138,8 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
resize_hpt_for_hotplug(memblock_phys_mem_size());
 
start = (unsigned long)__va(start);
-   rc = create_section_mapping(start, start + size, nid, PAGE_KERNEL);
+   rc = create_section_mapping(start, start + size, nid,
+   params->pgprot);
if (rc) {
pr_warn("Unable to create mapping for hot added memory 
0x%llx..0x%llx: %d\n",
start, start + size, rc);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index e9e4a7abd0cc..87b2d024e75a 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -277,6 +277,9 @@ int arch_add_memory(int nid, u64 start, u64 size,
if (WARN_ON_ONCE(params->altmap))
return -EINVAL;
 
+   if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
+   return -EINVAL;
+
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index e5114c053364..b9de2d4fa57e 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -412,6 +412,9 @@ int arch_add_memory(int nid, u64 start, u64 size,
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
+   if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
+   return -EINVAL;
+
/* We only have ZONE_NORMAL, so this is easy.. */
ret = __add_pages(nid, start_pfn, nr_pages, params);
if (unlikely(ret))
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index e25a4218e6ff..69128f1a22ac 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -858,6 +858,18 @@ int arch_add_memory(int nid, u64 start, u64 size,
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;

[PATCH v4 4/7] x86/mm: Introduce __set_memory_prot()

2020-03-06 Thread Logan Gunthorpe
For use in the 32bit arch_add_memory() to set the pgprot type of the
memory to add.

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Signed-off-by: Logan Gunthorpe 
Reviewed-by: Dan Williams 
---
 arch/x86/include/asm/set_memory.h |  1 +
 arch/x86/mm/pat/set_memory.c  | 13 +
 2 files changed, 14 insertions(+)

diff --git a/arch/x86/include/asm/set_memory.h 
b/arch/x86/include/asm/set_memory.h
index 64c3dce374e5..034358da4837 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -34,6 +34,7 @@
  * The caller is required to take care of these.
  */
 
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
 int _set_memory_uc(unsigned long addr, int numpages);
 int _set_memory_wc(unsigned long addr, int numpages);
 int _set_memory_wt(unsigned long addr, int numpages);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index c4aedd00c1ba..a7b14dffeb0b 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1792,6 +1792,19 @@ static inline int cpa_clear_pages_array(struct page 
**pages, int numpages,
CPA_PAGES_ARRAY, pages);
 }
 
+/*
+ * _set_memory_prot is an internal helper for callers that have been passed
+ * a pgprot_t value from upper layers and a reservation has already been taken.
+ * If you want to set the pgprot to a specific page protocol, use the
+ * set_memory_xx() functions.
+ */
+int __set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
+{
+   return change_page_attr_set_clr(&addr, numpages, prot,
+   __pgprot(~pgprot_val(prot)), 0, 0,
+   NULL);
+}
+
 int _set_memory_uc(unsigned long addr, int numpages)
 {
/*
-- 
2.20.1
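
For context, the 32-bit arch_add_memory() from patch 6/7 ends up using
this helper roughly as follows (paraphrased, since that hunk is cut off
in patch 6/7 above; details approximate):

int arch_add_memory(int nid, u64 start, u64 size,
		    struct mhp_params *params)
{
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;
	int ret;

	/* The 32-bit linear map was built at boot, so a non-default
	 * caching mode means rewriting the existing page tables. */
	if (params->pgprot.pgprot != PAGE_KERNEL.pgprot) {
		ret = __set_memory_prot(start, nr_pages, params->pgprot);
		if (ret)
			return ret;
	}

	return __add_pages(nid, start_pfn, nr_pages, params);
}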



[PATCH v4 5/7] powerpc/mm: Thread pgprot_t through create_section_mapping()

2020-03-06 Thread Logan Gunthorpe
In preparation to support a pgprot_t argument for arch_add_memory().

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Signed-off-by: Logan Gunthorpe 
---
 arch/powerpc/include/asm/book3s/64/hash.h  |  3 ++-
 arch/powerpc/include/asm/book3s/64/radix.h |  3 ++-
 arch/powerpc/include/asm/sparsemem.h   |  3 ++-
 arch/powerpc/mm/book3s64/hash_utils.c  |  5 +++--
 arch/powerpc/mm/book3s64/pgtable.c |  7 ---
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 18 +++---
 arch/powerpc/mm/mem.c  |  5 +++--
 7 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 2781ebf6add4..6fc4520092c7 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -251,7 +251,8 @@ extern int __meminit hash__vmemmap_create_mapping(unsigned 
long start,
 extern void hash__vmemmap_remove_mapping(unsigned long start,
 unsigned long page_size);
 
-int hash__create_section_mapping(unsigned long start, unsigned long end, int 
nid);
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+int nid, pgprot_t prot);
 int hash__remove_section_mapping(unsigned long start, unsigned long end);
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index d97db3ad9aae..46799f3c3d1d 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -289,7 +289,8 @@ static inline unsigned long radix__get_tree_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int radix__create_section_mapping(unsigned long start, unsigned long end, int 
nid);
+int radix__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
 int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/sparsemem.h 
b/arch/powerpc/include/asm/sparsemem.h
index 3192d454a733..c89b32443cff 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -13,7 +13,8 @@
 #endif /* CONFIG_SPARSEMEM */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-extern int create_section_mapping(unsigned long start, unsigned long end, int 
nid);
+extern int create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
 extern int remove_section_mapping(unsigned long start, unsigned long end);
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 523d4d39d11e..201738e07a1d 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -809,7 +809,8 @@ int resize_hpt_for_hotplug(unsigned long new_mem_size)
return 0;
 }
 
-int hash__create_section_mapping(unsigned long start, unsigned long end, int 
nid)
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+int nid, pgprot_t prot)
 {
int rc;
 
@@ -819,7 +820,7 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end, int nid
}
 
rc = htab_bolt_mapping(start, end, __pa(start),
-  pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+  pgprot_val(prot), mmu_linear_psize,
   mmu_kernel_ssize);
 
if (rc < 0) {
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 2bf7e1b4fd82..e0bb69c616e4 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -171,12 +171,13 @@ void mmu_cleanup_all(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int __meminit create_section_mapping(unsigned long start, unsigned long end, 
int nid)
+int __meminit create_section_mapping(unsigned long start, unsigned long end,
+int nid, pgprot_t prot)
 {
if (radix_enabled())
-   return radix__create_section_mapping(start, end, nid);
+   return radix__create_section_mapping(start, end, nid, prot);
 
-   return hash__create_section_mapping(start, end, nid);
+   return hash__create_section_mapping(start, end, nid, prot);
 }
 
 int __meminit remove_section_mapping(unsigned long start, unsigned long end)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index dd1bea45325c..0ef10b5c26ba 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -253,7 +253,7 @@ static unsigned long next_boundary(unsigned long addr, 
unsigned long end)
 
 static int __meminit create_physical_mapping(unsigned long start,

[PATCH v4 3/7] x86/mm: Thread pgprot_t through init_memory_mapping()

2020-03-06 Thread Logan Gunthorpe
In preparation to support a pgprot_t argument for arch_add_memory().

It's required to move the prototype of init_memory_mapping() seeing
the original location came before the definition of pgprot_t.

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Signed-off-by: Logan Gunthorpe 
Reviewed-by: Dan Williams 
Acked-by: Michal Hocko 
---
 arch/x86/include/asm/page_types.h |  3 ---
 arch/x86/include/asm/pgtable.h|  3 +++
 arch/x86/kernel/amd_gart_64.c |  3 ++-
 arch/x86/mm/init.c|  9 +
 arch/x86/mm/init_32.c |  3 ++-
 arch/x86/mm/init_64.c | 32 +--
 arch/x86/mm/mm_internal.h |  3 ++-
 arch/x86/platform/uv/bios_uv.c|  3 ++-
 8 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index c85e15010f48..bf7aa2e290ef 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -73,9 +73,6 @@ static inline phys_addr_t get_max_mapped(void)
 
 bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
 
-extern unsigned long init_memory_mapping(unsigned long start,
-unsigned long end);
-
 extern void initmem_init(void);
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 7e118660bbd9..48d6a5960f28 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1046,6 +1046,9 @@ static inline void __meminit init_trampoline_default(void)
 
 void __init poking_init(void);
 
+unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end, pgprot_t prot);
+
 # ifdef CONFIG_RANDOMIZE_MEMORY
 void __meminit init_trampoline(void);
 # else
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 4e5f50236048..16133819415c 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -744,7 +744,8 @@ int __init gart_iommu_init(void)
 
start_pfn = PFN_DOWN(aper_base);
if (!pfn_range_is_mapped(start_pfn, end_pfn))
-   init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+   init_memory_mapping(start_pfn<<PAGE_SHIFT,
+   end_pfn<<PAGE_SHIFT, PAGE_KERNEL);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -467,7 +467,7 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned 
long end_pfn)
 unsigned long __ref init_memory_mapping(unsigned long start,
-   unsigned long end)
+   unsigned long end, pgprot_t prot)
 {
@@ -481,7 +481,8 @@ unsigned long __ref init_memory_mapping(unsigned long start,
for (i = 0; i < nr_range; i++)
ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
-  mr[i].page_size_mask);
+  mr[i].page_size_mask,
+  prot);
 
add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
 
@@ -521,7 +522,7 @@ static unsigned long __init init_range_memory_mapping(
 */
can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
-   init_memory_mapping(start, end);
+   init_memory_mapping(start, end, PAGE_KERNEL);
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -585,7 +585,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long paddr, 
unsigned long paddr_end,
set_pte_init((pte_t *)pmd,
 pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
-PAGE_KERNEL_LARGE),
+prot),
 init);
spin_unlock(&init_mm.page_table_lock);
paddr_last = paddr_next;
@@ -669,7 +672,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, 
unsigned long paddr_end,
 
 static unsigned long __meminit
 phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask, bool init)
+ unsigned long page_size_mask, pgprot_t prot, bool init)
 {
unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last;
 
@@ -679,7 +682,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, 
unsigned long paddr_end,
 
if (!pgtable_l5_enabled())
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
-page_size_mask, init);
+page_size_mask, prot, init);
 
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
p4d_t *p4d = p4d_page + p4d_index(vaddr);
@@ -702,13 +705,13 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, 
unsigned long paddr_end,
if (!p4d_none(*p4d)) {
pud = pud_offset(p4d, 0);
paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
-   page_size_mask, init);
+   page_size_mask, prot, init);
continue;
}
 
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
-  page_size_mask, init);
+  page_size_mask, prot, init);
 
spin_lock(&init_mm.page_table_lock);
p4d_populate_init(&init_mm, p4d, pud, init);
@@ -722,7 +725,7 @@ static unsigned long __meminit
 __kernel_physical_mapping_init(unsigned long paddr_start,
   unsigned long paddr_end,
   unsigned long page_size_mask,
-  bool init)
+  pgprot_t prot, bool init)
 {
bool pgd_changed = false;
unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;

[PATCH v4 1/7] mm/memory_hotplug: Drop the flags field from struct mhp_restrictions

2020-03-06 Thread Logan Gunthorpe
This variable is not used anywhere and should therefore be removed
from the structure.

Signed-off-by: Logan Gunthorpe 
Reviewed-by: David Hildenbrand 
Reviewed-by: Dan Williams 
Acked-by: Michal Hocko 
---
 include/linux/memory_hotplug.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index f4d59155f3d4..69ff3037528d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -55,11 +55,9 @@ enum {
 
 /*
  * Restrictions for the memory hotplug:
- * flags:  MHP_ flags
  * altmap: alternative allocator for memmap array
  */
 struct mhp_restrictions {
-   unsigned long flags;
struct vmem_altmap *altmap;
 };
 
-- 
2.20.1



[PATCH v4 0/7] Allow setting caching mode in arch_add_memory() for P2PDMA

2020-03-06 Thread Logan Gunthorpe
Hi,

This is v4 of the patchset which cleans up a number of minor issues
from the feedback of v3 and rebases onto v5.6-rc4. Additional feedback
is welcome.

Also worth noting, is that the kernel test robot reports[1] that Patch 3
in this series improves will-it-scale.per_process_ops by 36%. Though,
for the life of me, I can't understand why that would be. But it's
reported the same thing twice now for different versions of the series.

Thanks,

Logan

[1] 
https://lists.01.org/hyperkitty/list/l...@lists.01.org/thread/5APDKNBEJGVJTJRTI2IIA3P4OC2OEYPS/

--

Changes in v4:
 * Rebased onto v5.6-rc4
 * Collected tags form David, Dan and Michal
 * Minor changes to the new _set_memory_prot() function and added some
   comments as requested by Dan.
 * Changed the default caching type for P2PDMA memory to UC instead of
   WC per Jason's concerns that WC might be more generally unsafe.

Changes in v3:
 * Rebased onto v5.6-rc2
 * Rename mhp_modifiers to mhp_params per David with an updated kernel
   doc per Dan
 * Drop support for s390 per David seeing it does not support
   ZONE_DEVICE yet and there was a potential problem with huge pages.
 * Added WARN_ON_ONCE in cases where arches receive non PAGE_KERNEL
   parameters
 * Collected David and Michael's Reviewed-By and Acked-by Tags

Changes in v2:
 * Rebased onto v5.5-rc5
 * Renamed mhp_restrictions to mhp_modifiers and added the pgprot field
   to that structure instead of using an argument for
   arch_add_memory().
 * Add patch to drop the unused flags field in mhp_restrictions

A git branch is available here:

https://github.com/sbates130272/linux-p2pmem remap_pages_cache_v4

--

Currently, the page tables created using memremap_pages() are always
created with the PAGE_KERNEL caching mode. However, the P2PDMA code
is creating pages for PCI BAR memory which should never be accessed
through the cache and instead use either WC or UC. This still works in
most cases, on x86, because the MTRR registers typically override the
caching settings in the page tables for all of the IO memory to be
UC-. However, this tends not to work so well on other arches or
some rare x86 machines that have firmware which does not setup the
MTRR registers in this way.

Instead of this, this series proposes a change to arch_add_memory()
to take the pgprot required by the mapping which allows us to
explicitly set pagetable entries for P2PDMA memory to UC.

This changes is pretty routine for most of the arches: x86_64, arm64
and powerpc simply need to thread the pgprot through to where the page
tables are setup. x86_32 unfortunately sets up the page tables at boot so
must use _set_memory_prot() to change their caching mode. ia64, s390 and sh
don't appear to have an easy way to change the page tables so, for now
at least, we just return -EINVAL on such mappings and thus they will
not support P2PDMA memory until the work for this is done. This should
be fine as they don't yet support ZONE_DEVICE.
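
Sketched end to end (abbreviated from the patches in this series, not a
buildable excerpt; the local variable names are illustrative):

/* mhp_params (patch 2) gains a pgprot field (patch 6): */
struct mhp_params {
	struct vmem_altmap *altmap;
	pgprot_t pgprot;	/* caching mode for the new mapping */
};

/* memremap_pages() picks the pgprot per pagemap type (patch 7): */
params.pgprot = PAGE_KERNEL;
if (pgmap->type == MEMORY_DEVICE_PCI_P2PDMA)
	params.pgprot = pgprot_noncached(params.pgprot);	/* UC */

/* ...and each arch_add_memory() threads params->pgprot down into its
 * page-table setup instead of hard-coding PAGE_KERNEL. */
err = arch_add_memory(nid, range_start, range_size, &params);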

--

Logan Gunthorpe (7):
  mm/memory_hotplug: Drop the flags field from struct mhp_restrictions
  mm/memory_hotplug: Rename mhp_restrictions to mhp_params
  x86/mm: Thread pgprot_t through init_memory_mapping()
  x86/mm: Introduce __set_memory_prot()
  powerpc/mm: Thread pgprot_t through create_section_mapping()
  mm/memory_hotplug: Add pgprot_t to mhp_params
  mm/memremap: Set caching mode for PCI P2PDMA memory to WC

 arch/arm64/mm/mmu.c|  7 ++--
 arch/ia64/mm/init.c|  7 ++--
 arch/powerpc/include/asm/book3s/64/hash.h  |  3 +-
 arch/powerpc/include/asm/book3s/64/radix.h |  3 +-
 arch/powerpc/include/asm/sparsemem.h   |  3 +-
 arch/powerpc/mm/book3s64/hash_utils.c  |  5 +--
 arch/powerpc/mm/book3s64/pgtable.c |  7 ++--
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 18 ++
 arch/powerpc/mm/mem.c  | 10 +++---
 arch/s390/mm/init.c|  9 +++--
 arch/sh/mm/init.c  |  7 ++--
 arch/x86/include/asm/page_types.h  |  3 --
 arch/x86/include/asm/pgtable.h |  3 ++
 arch/x86/include/asm/set_memory.h  |  1 +
 arch/x86/kernel/amd_gart_64.c  |  3 +-
 arch/x86/mm/init.c |  9 ++---
 arch/x86/mm/init_32.c  | 19 --
 arch/x86/mm/init_64.c  | 40 --
 arch/x86/mm/mm_internal.h  |  3 +-
 arch/x86/mm/pat/set_memory.c   | 13 +++
 arch/x86/platform/uv/bios_uv.c |  3 +-
 include/linux/memory_hotplug.h | 21 ++--
 mm/memory_hotplug.c| 11 +++---
 mm/memremap.c  | 17 +
 24 files changed, 144 insertions(+), 81 deletions(-)


base-commit: 98d54f81e36ba3bf92172791eba5ca5bd813989b
--
2.20.1


Re: [PATCH v3 7/7] mm/memremap: Set caching mode for PCI P2PDMA memory to WC

2020-03-02 Thread Logan Gunthorpe



On 2020-02-29 3:47 p.m., Dan Williams wrote:
> On Fri, Feb 21, 2020 at 10:25 AM Logan Gunthorpe  wrote:
>>
>> PCI BAR IO memory should never be mapped as WB, however prior to this
>> the PAT bits were set WB and it was typically overridden by MTRR
>> registers set by the firmware.
>>
>> Set PCI P2PDMA memory to be WC (writecombining) as the only current
>> user (the NVMe CMB) was originally mapped WC before the P2PDMA code
>> replaced the mapping with devm_memremap_pages().
> 
> Will the change to UC regress this existing use case?

I don't think so. They've been essentially mapped UC for a long time now
(since the P2PDMA patch set was merged) and nobody has complained.


Re: [PATCH v3 6/7] mm/memory_hotplug: Add pgprot_t to mhp_params

2020-03-02 Thread Logan Gunthorpe



On 2020-02-29 3:44 p.m., Dan Williams wrote:
> On Fri, Feb 21, 2020 at 10:25 AM Logan Gunthorpe  wrote:
>>
>> devm_memremap_pages() is currently used by the PCI P2PDMA code to create
>> struct page mappings for IO memory. At present, these mappings are created
>> with PAGE_KERNEL which implies setting the PAT bits to be WB. However, on
>> x86, an mtrr register will typically override this and force the cache
>> type to be UC-. In the case firmware doesn't set this register it is
>> effectively WB and will typically result in a machine check exception
>> when it's accessed.
>>
>> Other arches are not currently likely to function correctly seeing they
>> don't have any MTRR registers to fall back on.
>>
>> To solve this, provide a way to specify the pgprot value explicitly to
>> arch_add_memory().
>>
>> Of the arches that support MEMORY_HOTPLUG: x86_64, and arm64 need a simple
>> change to pass the pgprot_t down to their respective functions which set
>> up the page tables. For x86_32, set the page tables explicitly using
>> _set_memory_prot() (seeing they are already mapped). For ia64, s390 and
>> sh, reject anything but PAGE_KERNEL settings -- this should be fine,
>> for now, seeing these architectures don't support ZONE_DEVICE.
>>
>> A check in __add_pages() is also added to ensure the pgprot parameter was
>> set for all arches.
>>
>> Cc: Dan Williams 
>> Signed-off-by: Logan Gunthorpe 
>> Acked-by: David Hildenbrand 
>> Acked-by: Michal Hocko 
>> ---
>>  arch/arm64/mm/mmu.c| 3 ++-
>>  arch/ia64/mm/init.c| 3 +++
>>  arch/powerpc/mm/mem.c  | 3 ++-
>>  arch/s390/mm/init.c| 3 +++
>>  arch/sh/mm/init.c  | 3 +++
>>  arch/x86/mm/init_32.c  | 5 +
>>  arch/x86/mm/init_64.c  | 2 +-
>>  include/linux/memory_hotplug.h | 2 ++
>>  mm/memory_hotplug.c| 5 -
>>  mm/memremap.c  | 6 +++---
>>  10 files changed, 28 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index ee37bca8aba8..ea3fa844a8a2 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -1058,7 +1058,8 @@ int arch_add_memory(int nid, u64 start, u64 size,
>> flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>>
>> __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>> -size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
>> +size, params->pgprot, __pgd_pgtable_alloc,
>> +flags);
>>
>> memblock_clear_nomap(start, size);
>>
>> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
>> index 97bbc23ea1e3..d637b4ea3147 100644
>> --- a/arch/ia64/mm/init.c
>> +++ b/arch/ia64/mm/init.c
>> @@ -676,6 +676,9 @@ int arch_add_memory(int nid, u64 start, u64 size,
>> unsigned long nr_pages = size >> PAGE_SHIFT;
>> int ret;
>>
>> +   if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
>> +   return -EINVAL;
>> +
>> ret = __add_pages(nid, start_pfn, nr_pages, params);
>> if (ret)
>> printk("%s: Problem encountered in __add_pages() as 
>> ret=%d\n",
>> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
>> index 19b1da5d7eca..832412bc7fad 100644
>> --- a/arch/powerpc/mm/mem.c
>> +++ b/arch/powerpc/mm/mem.c
>> @@ -138,7 +138,8 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
>> resize_hpt_for_hotplug(memblock_phys_mem_size());
>>
>> start = (unsigned long)__va(start);
>> -   rc = create_section_mapping(start, start + size, nid, PAGE_KERNEL);
>> +   rc = create_section_mapping(start, start + size, nid,
>> +   params->pgprot);
>> if (rc) {
>> pr_warn("Unable to create mapping for hot added memory 
>> 0x%llx..0x%llx: %d\n",
>> start, start + size, rc);
>> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
>> index e9e4a7abd0cc..87b2d024e75a 100644
>> --- a/arch/s390/mm/init.c
>> +++ b/arch/s390/mm/init.c
>> @@ -277,6 +277,9 @@ int arch_add_memory(int nid, u64 start, u64 size,
>> if (WARN_ON_ONCE(params->altmap))
>> return -EINVAL;
>>
>> +   if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
>> +   return -EINVAL;
>> +
>>

Re: [PATCH v3 4/7] x86/mm: Introduce _set_memory_prot()

2020-03-02 Thread Logan Gunthorpe



On 2020-02-29 3:33 p.m., Dan Williams wrote:
> On Fri, Feb 21, 2020 at 10:25 AM Logan Gunthorpe  wrote:
>>
>> For use in the 32bit arch_add_memory() to set the pgprot type of the
>> memory to add.
>>
>> Cc: Thomas Gleixner 
>> Cc: Ingo Molnar 
>> Cc: Borislav Petkov 
>> Cc: "H. Peter Anvin" 
>> Cc: x...@kernel.org
>> Cc: Dave Hansen 
>> Cc: Andy Lutomirski 
>> Cc: Peter Zijlstra 
>> Signed-off-by: Logan Gunthorpe 
>> ---
>>  arch/x86/include/asm/set_memory.h | 1 +
>>  arch/x86/mm/pat/set_memory.c  | 7 +++
>>  2 files changed, 8 insertions(+)
>>
>> diff --git a/arch/x86/include/asm/set_memory.h 
>> b/arch/x86/include/asm/set_memory.h
>> index 64c3dce374e5..0aca959cf9a4 100644
>> --- a/arch/x86/include/asm/set_memory.h
>> +++ b/arch/x86/include/asm/set_memory.h
>> @@ -34,6 +34,7 @@
>>   * The caller is required to take care of these.
>>   */
>>
>> +int _set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
> 
> I wonder if this should be separated from the naming convention of the
> other routines because this is only an internal helper for code paths
> where the prot was established by an upper layer. For example, I
> expect that the kernel does not want new usages to make the mistake of
> calling:
> 
>_set_memory_prot(..., pgprot_writecombine(pgprot))
> 
> ...instead of
> 
> _set_memory_wc()
> 
> I'm thinking just a double underscore rename (__set_memory_prot) and a
> kerneldoc comment for that  pointing people to use the direct
> _set_memory_ helpers.

Thanks! Will do. Note, though, that even _set_memory_wc() is an internal
x86-specific function. But the extra comment and underscore still make
sense.

> With that you can add:
> 
> Reviewed-by: Dan Williams 
> 


Re: [PATCH v3 0/7] Allow setting caching mode in arch_add_memory() for P2PDMA

2020-02-27 Thread Logan Gunthorpe



On 2020-02-27 10:43 a.m., Jason Gunthorpe wrote:
> Hm, AFAIK WC memory is not compatible with the spinlocks/mutexs/etc in
> Linux, so while it is true the memory has no side effects, there would
> be surprising concurrency risks if anything in the kernel tried to
> write to it.
> 
> Not compatible means the locks don't contain stores to WC memory the
> way you would expect. AFAIK on many CPUs extra barriers are required
> to keep WC stores ordered, the same way ARM already has extra barriers
> to keep UC stores ordered with locking..
> 
> The spinlocks are defined to contain UC stores though.
> 
> If there is no actual need today for WC I would suggest using UC as
> the default.

Ok, that sounds sensible. I'll do that in the next revision.

Thanks,

Logan


Re: [PATCH v3 0/7] Allow setting caching mode in arch_add_memory() for P2PDMA

2020-02-27 Thread Logan Gunthorpe



On 2020-02-27 10:17 a.m., Jason Gunthorpe wrote:
>> Instead of this, this series proposes a change to arch_add_memory()
>> to take the pgprot required by the mapping which allows us to
>> explicitly set pagetable entries for P2PDMA memory to WC.
> 
> Is there a particular reason why WC was selected here? I thought for
> the p2pdma cases there was no kernel user that touched the memory?

Yes, that's correct. I choose WC here because the existing users are
registering memory blocks without side effects which fit the WC
semantics well.

> I definitely foresee devices where we want UC instead.

Yes. My expectation is that once we have a kernel user that needs this,
we'd wire the option through struct dev_pagemap so the caller can choose
the mapping that makes sense.
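
(For illustration only -- no such field exists today; a future
interface might look something like:

struct dev_pagemap {
	/* ... existing fields ... */
	pgprot_t pgprot;	/* hypothetical: caller-requested caching mode */
};

with the P2PDMA setup filling it in before calling memremap_pages().)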

Logan


[PATCH v3 1/7] mm/memory_hotplug: Drop the flags field from struct mhp_restrictions

2020-02-21 Thread Logan Gunthorpe
This variable is not used anywhere and should therefore be removed
from the structure.

Signed-off-by: Logan Gunthorpe 
Reviewed-by: David Hildenbrand 
---
 include/linux/memory_hotplug.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index f4d59155f3d4..69ff3037528d 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -55,11 +55,9 @@ enum {
 
 /*
  * Restrictions for the memory hotplug:
- * flags:  MHP_ flags
  * altmap: alternative allocator for memmap array
  */
 struct mhp_restrictions {
-   unsigned long flags;
struct vmem_altmap *altmap;
 };
 
-- 
2.20.1



[PATCH v3 5/7] powerpc/mm: Thread pgprot_t through create_section_mapping()

2020-02-21 Thread Logan Gunthorpe
In preparation to support a pgprot_t argument for arch_add_memory().

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Signed-off-by: Logan Gunthorpe 
---
 arch/powerpc/include/asm/book3s/64/hash.h  |  3 ++-
 arch/powerpc/include/asm/book3s/64/radix.h |  3 ++-
 arch/powerpc/include/asm/sparsemem.h   |  3 ++-
 arch/powerpc/mm/book3s64/hash_utils.c  |  5 +++--
 arch/powerpc/mm/book3s64/pgtable.c |  7 ---
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 18 +++---
 arch/powerpc/mm/mem.c  |  5 +++--
 7 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 2781ebf6add4..6fc4520092c7 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -251,7 +251,8 @@ extern int __meminit hash__vmemmap_create_mapping(unsigned 
long start,
 extern void hash__vmemmap_remove_mapping(unsigned long start,
 unsigned long page_size);
 
-int hash__create_section_mapping(unsigned long start, unsigned long end, int 
nid);
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+int nid, pgprot_t prot);
 int hash__remove_section_mapping(unsigned long start, unsigned long end);
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index d97db3ad9aae..46799f3c3d1d 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -289,7 +289,8 @@ static inline unsigned long radix__get_tree_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int radix__create_section_mapping(unsigned long start, unsigned long end, int 
nid);
+int radix__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
 int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/sparsemem.h 
b/arch/powerpc/include/asm/sparsemem.h
index 3192d454a733..c89b32443cff 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -13,7 +13,8 @@
 #endif /* CONFIG_SPARSEMEM */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-extern int create_section_mapping(unsigned long start, unsigned long end, int 
nid);
+extern int create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
 extern int remove_section_mapping(unsigned long start, unsigned long end);
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index 523d4d39d11e..201738e07a1d 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -809,7 +809,8 @@ int resize_hpt_for_hotplug(unsigned long new_mem_size)
return 0;
 }
 
-int hash__create_section_mapping(unsigned long start, unsigned long end, int 
nid)
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+int nid, pgprot_t prot)
 {
int rc;
 
@@ -819,7 +820,7 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end, int nid
}
 
rc = htab_bolt_mapping(start, end, __pa(start),
-  pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+  pgprot_val(prot), mmu_linear_psize,
   mmu_kernel_ssize);
 
if (rc < 0) {
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 2bf7e1b4fd82..e0bb69c616e4 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -171,12 +171,13 @@ void mmu_cleanup_all(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int __meminit create_section_mapping(unsigned long start, unsigned long end, 
int nid)
+int __meminit create_section_mapping(unsigned long start, unsigned long end,
+int nid, pgprot_t prot)
 {
if (radix_enabled())
-   return radix__create_section_mapping(start, end, nid);
+   return radix__create_section_mapping(start, end, nid, prot);
 
-   return hash__create_section_mapping(start, end, nid);
+   return hash__create_section_mapping(start, end, nid, prot);
 }
 
 int __meminit remove_section_mapping(unsigned long start, unsigned long end)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index dd1bea45325c..0ef10b5c26ba 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -253,7 +253,7 @@ static unsigned long next_boundary(unsigned long addr, 
unsigned long end)
 
 static int __meminit create_physical_mapping(unsigned long st

[PATCH v3 3/7] x86/mm: Thread pgprot_t through init_memory_mapping()

2020-02-21 Thread Logan Gunthorpe
In preparation for supporting a pgprot_t argument for arch_add_memory().

It's required to move the prototype of init_memory_mapping() because
its original location came before the definition of pgprot_t.

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Signed-off-by: Logan Gunthorpe 
---
 arch/x86/include/asm/page_types.h |  3 ---
 arch/x86/include/asm/pgtable.h|  3 +++
 arch/x86/kernel/amd_gart_64.c |  3 ++-
 arch/x86/mm/init.c|  9 +
 arch/x86/mm/init_32.c |  3 ++-
 arch/x86/mm/init_64.c | 32 +--
 arch/x86/mm/mm_internal.h |  3 ++-
 arch/x86/platform/uv/bios_uv.c|  3 ++-
 8 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index c85e15010f48..bf7aa2e290ef 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -73,9 +73,6 @@ static inline phys_addr_t get_max_mapped(void)
 
 bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
 
-extern unsigned long init_memory_mapping(unsigned long start,
-unsigned long end);
-
 extern void initmem_init(void);
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 7e118660bbd9..48d6a5960f28 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1046,6 +1046,9 @@ static inline void __meminit init_trampoline_default(void)
 
 void __init poking_init(void);
 
+unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end, pgprot_t prot);
+
 # ifdef CONFIG_RANDOMIZE_MEMORY
 void __meminit init_trampoline(void);
 # else
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 4e5f50236048..16133819415c 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -744,7 +744,8 @@ int __init gart_iommu_init(void)
 
 	start_pfn = PFN_DOWN(aper_base);
 	if (!pfn_range_is_mapped(start_pfn, end_pfn))
-		init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+		init_memory_mapping(start_pfn<<PAGE_SHIFT,
+				    end_pfn<<PAGE_SHIFT, PAGE_KERNEL);
 
 	add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
 
@@ -521,7 +522,7 @@ static unsigned long __init init_range_memory_mapping(
 	 */
 	can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
			  min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
-	init_memory_mapping(start, end);
+	init_memory_mapping(start, end, PAGE_KERNEL);
 
 			pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
-				     PAGE_KERNEL_LARGE),
+				     prot),
 				     init);
 	spin_unlock(&init_mm.page_table_lock);
 	paddr_last = paddr_next;
@@ -669,7 +672,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, 
unsigned long paddr_end,
 
 static unsigned long __meminit
 phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask, bool init)
+ unsigned long page_size_mask, pgprot_t prot, bool init)
 {
unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last;
 
@@ -679,7 +682,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, 
unsigned long paddr_end,
 
if (!pgtable_l5_enabled())
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
-page_size_mask, init);
+page_size_mask, prot, init);
 
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
p4d_t *p4d = p4d_page + p4d_index(vaddr);
@@ -702,13 +705,13 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, 
unsigned long paddr_end,
if (!p4d_none(*p4d)) {
pud = pud_offset(p4d, 0);
paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
-   page_size_mask, init);
+   page_size_mask, prot, init);
continue;
}
 
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
-  page_size_mask, init);
+  page_size_mask, prot, init);
 
spin_lock(&init_mm.page_table_lock);
p4d_populate_init(&init_mm, p4d, pud, init);
@@ -722,7 +725,7 @@ static unsigned long __meminit
 __kernel_physical_mapping_init(unsigned long paddr_start,
   unsigned long paddr_end,
   unsigned long page_size_mask,
-  bool init)
+  pgprot_t prot, bool init)
 {
bool pgd_changed = false;
unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
@@ -743,13 +746,13 @@ __kernel_p

[PATCH v3 4/7] x86/mm: Introduce _set_memory_prot()

2020-02-21 Thread Logan Gunthorpe
For use in the 32bit arch_add_memory() to set the pgprot type of the
memory to add.

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Signed-off-by: Logan Gunthorpe 
---
 arch/x86/include/asm/set_memory.h | 1 +
 arch/x86/mm/pat/set_memory.c  | 7 +++
 2 files changed, 8 insertions(+)

diff --git a/arch/x86/include/asm/set_memory.h 
b/arch/x86/include/asm/set_memory.h
index 64c3dce374e5..0aca959cf9a4 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -34,6 +34,7 @@
  * The caller is required to take care of these.
  */
 
+int _set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
 int _set_memory_uc(unsigned long addr, int numpages);
 int _set_memory_wc(unsigned long addr, int numpages);
 int _set_memory_wt(unsigned long addr, int numpages);
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index c4aedd00c1ba..2ba83d53d835 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1792,6 +1792,13 @@ static inline int cpa_clear_pages_array(struct page 
**pages, int numpages,
CPA_PAGES_ARRAY, pages);
 }
 
+int _set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
+{
+   return change_page_attr_set_clr(&addr, numpages, prot,
+   __pgprot(~pgprot_val(prot)), 0, 0,
+   NULL);
+}
+
 int _set_memory_uc(unsigned long addr, int numpages)
 {
/*
-- 
2.20.1
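
For context, a sketch of how this helper is meant to be used by the
32-bit arch_add_memory() later in the series (patch 6/7); the exact
hunk is not shown here, so treat this as illustrative:

	/* x86_32 builds its page tables at boot, so arch_add_memory()
	 * can only rewrite the protection bits of the existing entries */
	if (params->pgprot.pgprot != PAGE_KERNEL.pgprot)
		ret = _set_memory_prot((unsigned long)__va(start),
				       nr_pages, params->pgprot);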



[PATCH v3 2/7] mm/memory_hotplug: Rename mhp_restrictions to mhp_params

2020-02-21 Thread Logan Gunthorpe
The mhp_restrictions struct really doesn't specify anything resembling
a restriction anymore, so rename it to mhp_params, as it is now a list
of extended parameters.

Signed-off-by: Logan Gunthorpe 
---
 arch/arm64/mm/mmu.c|  4 ++--
 arch/ia64/mm/init.c|  4 ++--
 arch/powerpc/mm/mem.c  |  4 ++--
 arch/s390/mm/init.c|  6 +++---
 arch/sh/mm/init.c  |  4 ++--
 arch/x86/mm/init_32.c  |  4 ++--
 arch/x86/mm/init_64.c  |  8 
 include/linux/memory_hotplug.h | 16 
 mm/memory_hotplug.c|  8 
 mm/memremap.c  |  8 
 10 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 128f70852bf3..ee37bca8aba8 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1050,7 +1050,7 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_params *params)
 {
int flags = 0;
 
@@ -1063,7 +1063,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
memblock_clear_nomap(start, size);
 
return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
-  restrictions);
+  params);
 }
 void arch_remove_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap)
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b01d68a2d5d9..97bbc23ea1e3 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -670,13 +670,13 @@ mem_init (void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_params *params)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
-   ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
+   ret = __add_pages(nid, start_pfn, nr_pages, params);
if (ret)
printk("%s: Problem encountered in __add_pages() as ret=%d\n",
   __func__,  ret);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index ef7b1119b2e2..b4bece53bec0 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -128,7 +128,7 @@ static void flush_dcache_range_chunked(unsigned long start, 
unsigned long stop,
 }
 
 int __ref arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -144,7 +144,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
return -EFAULT;
}
 
-   return __add_pages(nid, start_pfn, nr_pages, restrictions);
+   return __add_pages(nid, start_pfn, nr_pages, params);
 }
 
 void __ref arch_remove_memory(int nid, u64 start, u64 size,
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ac44bd76db4b..e9e4a7abd0cc 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -268,20 +268,20 @@ device_initcall(s390_cma_mem_init);
 #endif /* CONFIG_CMA */
 
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_params *params)
 {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
int rc;
 
-   if (WARN_ON_ONCE(restrictions->altmap))
+   if (WARN_ON_ONCE(params->altmap))
return -EINVAL;
 
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
 
-   rc = __add_pages(nid, start_pfn, size_pages, restrictions);
+   rc = __add_pages(nid, start_pfn, size_pages, params);
if (rc)
vmem_remove_mapping(start, size);
return rc;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index d1b1ff2be17a..e5114c053364 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -406,14 +406,14 @@ void __init mem_init(void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_params *params)
 {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
/* We only have ZONE_NORMAL, so this is easy.. */
-   ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
+   ret = __add_pages(nid, start_pfn, nr_pages, params);
if (unlikely(ret))
printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
 
diff --git a/arch/x86/mm/init_32.c b/arch

[PATCH v3 6/7] mm/memory_hotplug: Add pgprot_t to mhp_params

2020-02-21 Thread Logan Gunthorpe
devm_memremap_pages() is currently used by the PCI P2PDMA code to create
struct page mappings for IO memory. At present, these mappings are created
with PAGE_KERNEL which implies setting the PAT bits to be WB. However, on
x86, an mtrr register will typically override this and force the cache
type to be UC-. In the case firmware doesn't set this register it is
effectively WB and will typically result in a machine check exception
when it's accessed.

Other arches are currently unlikely to function correctly, seeing they
don't have any MTRR registers to fall back on.

To solve this, provide a way to specify the pgprot value explicitly to
arch_add_memory().

Of the arches that support MEMORY_HOTPLUG: x86_64 and arm64 need a simple
change to pass the pgprot_t down to their respective functions which set
up the page tables. For x86_32, set the page tables explicitly using
_set_memory_prot() (seeing they are already mapped). For ia64, s390 and
sh, reject anything but PAGE_KERNEL settings -- this should be fine,
for now, seeing these architectures don't support ZONE_DEVICE.

A check in __add_pages() is also added to ensure the pgprot parameter was
set for all arches.

Cc: Dan Williams 
Signed-off-by: Logan Gunthorpe 
Acked-by: David Hildenbrand 
Acked-by: Michal Hocko 
---
 arch/arm64/mm/mmu.c| 3 ++-
 arch/ia64/mm/init.c| 3 +++
 arch/powerpc/mm/mem.c  | 3 ++-
 arch/s390/mm/init.c| 3 +++
 arch/sh/mm/init.c  | 3 +++
 arch/x86/mm/init_32.c  | 5 +
 arch/x86/mm/init_64.c  | 2 +-
 include/linux/memory_hotplug.h | 2 ++
 mm/memory_hotplug.c| 5 -
 mm/memremap.c  | 6 +++---
 10 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index ee37bca8aba8..ea3fa844a8a2 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1058,7 +1058,8 @@ int arch_add_memory(int nid, u64 start, u64 size,
flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
 
__create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
-size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
+size, params->pgprot, __pgd_pgtable_alloc,
+flags);
 
memblock_clear_nomap(start, size);
 
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 97bbc23ea1e3..d637b4ea3147 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -676,6 +676,9 @@ int arch_add_memory(int nid, u64 start, u64 size,
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
+   if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
+   return -EINVAL;
+
ret = __add_pages(nid, start_pfn, nr_pages, params);
if (ret)
printk("%s: Problem encountered in __add_pages() as ret=%d\n",
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index 19b1da5d7eca..832412bc7fad 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -138,7 +138,8 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
resize_hpt_for_hotplug(memblock_phys_mem_size());
 
start = (unsigned long)__va(start);
-   rc = create_section_mapping(start, start + size, nid, PAGE_KERNEL);
+   rc = create_section_mapping(start, start + size, nid,
+   params->pgprot);
if (rc) {
pr_warn("Unable to create mapping for hot added memory 
0x%llx..0x%llx: %d\n",
start, start + size, rc);
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index e9e4a7abd0cc..87b2d024e75a 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -277,6 +277,9 @@ int arch_add_memory(int nid, u64 start, u64 size,
if (WARN_ON_ONCE(params->altmap))
return -EINVAL;
 
+   if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
+   return -EINVAL;
+
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index e5114c053364..b9de2d4fa57e 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -412,6 +412,9 @@ int arch_add_memory(int nid, u64 start, u64 size,
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
+   if (WARN_ON_ONCE(params->pgprot.pgprot != PAGE_KERNEL.pgprot))
+   return -EINVAL;
+
/* We only have ZONE_NORMAL, so this is easy.. */
ret = __add_pages(nid, start_pfn, nr_pages, params);
if (unlikely(ret))
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index e25a4218e6ff..96d8e4fb1cc8 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -858,6 +858,11 @@ int arch_add_memory(int nid, u64 start, u64 size,
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
+  

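The __add_pages() guard mentioned above amounts to roughly the
following (a sketch; the mm/memory_hotplug.c hunk is cut off in this
message):

	/* In __add_pages(): a pgprot of zero means the caller never
	 * initialized mhp_params, catching arches that miss the new field */
	if (WARN_ON_ONCE(!pgprot_val(params->pgprot)))
		return -EINVAL;
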
[PATCH v3 0/7] Allow setting caching mode in arch_add_memory() for P2PDMA

2020-02-21 Thread Logan Gunthorpe
Hi,

This is v3 of the patchset which cleans up a number of minor issues
from the feedback of v2 and rebases onto v5.6-rc2. Additional feedback
is welcome.

Thanks,

Logan

--

Changes in v3:
 * Rebased onto v5.6-rc2
 * Rename mhp_modifiers to mhp_params per David with an updated kernel
   doc per Dan
 * Drop support for s390 per David seeing it does not support
   ZONE_DEVICE yet and there was a potential problem with huge pages.
 * Added WARN_ON_ONCE in cases where arches receive non-PAGE_KERNEL
   parameters
 * Collected David and Michael's Reviewed-by and Acked-by tags

Changes in v2:
 * Rebased onto v5.5-rc5
 * Renamed mhp_restrictions to mhp_modifiers and added the pgprot field
   to that structure instead of using an argument for
   arch_add_memory().
 * Add patch to drop the unused flags field in mhp_restrictions

A git branch is available here:

https://github.com/sbates130272/linux-p2pmem remap_pages_cache_v3

--

Currently, the page tables created using memremap_pages() are always
created with the PAGE_KERNEL caching mode. However, the P2PDMA code
is creating pages for PCI BAR memory which should never be accessed
through the cache and should instead use either WC or UC. This still
works in most cases, on x86, because the MTRR registers typically
override the caching settings in the page tables, making all of the
IO memory UC-. However, this tends not to work so well on other
arches or some rare x86 machines whose firmware does not set up the
MTRR registers in this way.

Instead of this, this series proposes a change to arch_add_memory()
to take the pgprot required by the mapping which allows us to
explicitly set pagetable entries for P2PDMA memory to WC.

This change is pretty routine for most of the arches: x86_64, s390, arm64
and powerpc simply need to thread the pgprot through to where the page
tables are set up. x86_32 unfortunately sets up the page tables at boot, so
it must use _set_memory_prot() to change their caching mode. ia64 and sh
don't appear to have an easy way to change the page tables so, for now
at least, we just return -EINVAL on such mappings and thus they will
not support P2PDMA memory until the work for this is done.
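
As a minimal sketch of the resulting interface (variable names are
illustrative), a hotplug caller that wants a non-default caching mode
now does something like:

	struct mhp_params params = {
		.pgprot = pgprot_writecombine(PAGE_KERNEL),  /* e.g. P2PDMA */
	};

	ret = arch_add_memory(nid, start, size, &params);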

--

Logan Gunthorpe (7):
  mm/memory_hotplug: Drop the flags field from struct mhp_restrictions
  mm/memory_hotplug: Rename mhp_restrictions to mhp_params
  x86/mm: Thread pgprot_t through init_memory_mapping()
  x86/mm: Introduce _set_memory_prot()
  powerpc/mm: Thread pgprot_t through create_section_mapping()
  mm/memory_hotplug: Add pgprot_t to mhp_params
  mm/memremap: Set caching mode for PCI P2PDMA memory to WC

 arch/arm64/mm/mmu.c|  7 ++--
 arch/ia64/mm/init.c|  7 ++--
 arch/powerpc/include/asm/book3s/64/hash.h  |  3 +-
 arch/powerpc/include/asm/book3s/64/radix.h |  3 +-
 arch/powerpc/include/asm/sparsemem.h   |  3 +-
 arch/powerpc/mm/book3s64/hash_utils.c  |  5 +--
 arch/powerpc/mm/book3s64/pgtable.c |  7 ++--
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 18 ++
 arch/powerpc/mm/mem.c  | 10 +++---
 arch/s390/mm/init.c|  9 +++--
 arch/sh/mm/init.c  |  7 ++--
 arch/x86/include/asm/page_types.h  |  3 --
 arch/x86/include/asm/pgtable.h |  3 ++
 arch/x86/include/asm/set_memory.h  |  1 +
 arch/x86/kernel/amd_gart_64.c  |  3 +-
 arch/x86/mm/init.c |  9 ++---
 arch/x86/mm/init_32.c  | 12 +--
 arch/x86/mm/init_64.c  | 40 --
 arch/x86/mm/mm_internal.h  |  3 +-
 arch/x86/mm/pat/set_memory.c   |  7 
 arch/x86/platform/uv/bios_uv.c |  3 +-
 include/linux/memory_hotplug.h | 20 +--
 mm/memory_hotplug.c| 11 +++---
 mm/memremap.c  | 17 +
 24 files changed, 130 insertions(+), 81 deletions(-)


base-commit: 11a48a5a18c63fd7621bb050228cebf13566e4d8
--
2.20.1


[PATCH v3 7/7] mm/memremap: Set caching mode for PCI P2PDMA memory to WC

2020-02-21 Thread Logan Gunthorpe
PCI BAR IO memory should never be mapped as WB; however, prior to this,
the PAT bits were set to WB and were typically overridden by MTRR
registers set by the firmware.

Set PCI P2PDMA memory to be WC (writecombining) as the only current
user (the NVMe CMB) was originally mapped WC before the P2PDMA code
replaced the mapping with devm_memremap_pages().

Future use-cases may need to generalize this by adding flags to
select the caching type, as some P2PDMA cases will not want WC.
However, those use-cases are not upstream yet and this can be changed
when they arrive.

Cc: Dan Williams 
Cc: Christoph Hellwig 
Cc: Jason Gunthorpe 
Signed-off-by: Logan Gunthorpe 
---
 mm/memremap.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mm/memremap.c b/mm/memremap.c
index 06742372a203..8d141c3e3364 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -190,7 +190,10 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
}
break;
case MEMORY_DEVICE_DEVDAX:
+   need_devmap_managed = false;
+   break;
case MEMORY_DEVICE_PCI_P2PDMA:
+   params.pgprot = pgprot_writecombine(params.pgprot);
need_devmap_managed = false;
break;
default:
-- 
2.20.1
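
For reference, a sketch of the existing P2PDMA path that ends up here,
modeled on pci_p2pdma_add_resource() as of this era (error handling
elided; treat the exact field usage as illustrative):

	pgmap->res.start = pci_resource_start(pdev, bar) + offset;
	pgmap->res.end = pgmap->res.start + size - 1;
	pgmap->res.flags = pci_resource_flags(pdev, bar);
	pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;

	/* with this patch the struct pages created here are mapped WC */
	addr = devm_memremap_pages(&pdev->dev, pgmap);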



Re: [PATCH v2 2/8] mm/memory_hotplug: Rename mhp_restrictions to mhp_modifiers

2020-01-08 Thread Logan Gunthorpe



On 2020-01-08 12:13 p.m., Dan Williams wrote:
> On Wed, Jan 8, 2020 at 11:08 AM David Hildenbrand  wrote:
>>
>>
>>
>>> Am 08.01.2020 um 20:00 schrieb Dan Williams :
>>>
>>> On Wed, Jan 8, 2020 at 9:17 AM Logan Gunthorpe  wrote:
>>>>
>>>>
>>>>
>>>>> On 2020-01-08 5:28 a.m., David Hildenbrand wrote:
>>>>> On 07.01.20 21:59, Logan Gunthorpe wrote:
>>>>>> The mhp_restrictions struct really doesn't specify anything resembling
>>>>>> a restriction anymore so rename it to be mhp_modifiers.
>>>>>
>>>>> I wonder if something like "mhp_params" would be even better. It's
>>>>> essentially just a way to avoid changing call chains throughout all archs
>>>>> whenever we want to add a new parameter.
>>>>
>>>> Sure, that does sound a bit nicer to me. I can change it for v3.
>>>
>>> Oh, I was just about to chime in to support "modifiers" because I
>>> would expect all parameters to be folded into a "params" struct. The
>>> modifiers seem to be limited to the set of items that are only
>>> considered in non-default / expert memory hotplug use cases.

>>
>> It's a set of extended parameters I'd say.

> Sure, we can call them "mhp_params" and just clarify that they are
> optional / extended in the kernel-doc.

Well pgprot isn't going to be optional... But I'll add something to the
kernel-doc.

Logan



Re: [PATCH v2 6/8] s390/mm: Thread pgprot_t through vmem_add_mapping()

2020-01-08 Thread Logan Gunthorpe



On 2020-01-08 5:43 a.m., David Hildenbrand wrote:
> On 07.01.20 21:59, Logan Gunthorpe wrote:
>> In preparation for supporting a pgprot_t argument for arch_add_memory().
>>
>> Cc: Heiko Carstens 
>> Cc: Vasily Gorbik 
>> Cc: Christian Borntraeger 
>> Signed-off-by: Logan Gunthorpe 
>> ---
>>  arch/s390/include/asm/pgtable.h |  3 ++-
>>  arch/s390/mm/extmem.c   |  3 ++-
>>  arch/s390/mm/init.c |  2 +-
>>  arch/s390/mm/vmem.c | 10 +-
>>  4 files changed, 10 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/s390/include/asm/pgtable.h 
>> b/arch/s390/include/asm/pgtable.h
>> index 7b03037a8475..e667a1a96879 100644
>> --- a/arch/s390/include/asm/pgtable.h
>> +++ b/arch/s390/include/asm/pgtable.h
>> @@ -1640,7 +1640,8 @@ static inline swp_entry_t __swp_entry(unsigned long 
>> type, unsigned long offset)
>>  
>>  #define kern_addr_valid(addr)   (1)
>>  
>> -extern int vmem_add_mapping(unsigned long start, unsigned long size);
>> +extern int vmem_add_mapping(unsigned long start, unsigned long size,
>> +pgprot_t prot);
>>  extern int vmem_remove_mapping(unsigned long start, unsigned long size);
>>  extern int s390_enable_sie(void);
>>  extern int s390_enable_skey(void);
>> diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
>> index fd0dae9d10f4..6cf7029a7b35 100644
>> --- a/arch/s390/mm/extmem.c
>> +++ b/arch/s390/mm/extmem.c
>> @@ -313,7 +313,8 @@ __segment_load (char *name, int do_nonshared, unsigned 
>> long *addr, unsigned long
>>  goto out_free;
>>  }
>>  
>> -rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
>> +rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1,
>> +  PAGE_KERNEL);
>>  
>>  if (rc)
>>  goto out_free;
>> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
>> index a0c88c1c9ad0..ef19522ddad2 100644
>> --- a/arch/s390/mm/init.c
>> +++ b/arch/s390/mm/init.c
>> @@ -277,7 +277,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
>>  if (WARN_ON_ONCE(modifiers->altmap))
>>  return -EINVAL;
>>  
>> -rc = vmem_add_mapping(start, size);
>> +rc = vmem_add_mapping(start, size, PAGE_KERNEL);
>>  if (rc)
>>  return rc;
>>  
>> diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
>> index b403fa14847d..8a5e95f184a2 100644
>> --- a/arch/s390/mm/vmem.c
>> +++ b/arch/s390/mm/vmem.c
>> @@ -66,7 +66,7 @@ pte_t __ref *vmem_pte_alloc(void)
>>  /*
>>   * Add a physical memory range to the 1:1 mapping.
>>   */
>> -static int vmem_add_mem(unsigned long start, unsigned long size)
>> +static int vmem_add_mem(unsigned long start, unsigned long size, pgprot_t 
>> prot)
>>  {
>>  unsigned long pgt_prot, sgt_prot, r3_prot;
>>  unsigned long pages4k, pages1m, pages2g;
>> @@ -79,7 +79,7 @@ static int vmem_add_mem(unsigned long start, unsigned long 
>> size)
>>  pte_t *pt_dir;
>>  int ret = -ENOMEM;
>>  
>> -pgt_prot = pgprot_val(PAGE_KERNEL);
>> +pgt_prot = pgprot_val(prot);
>>  sgt_prot = pgprot_val(SEGMENT_KERNEL);
>>  r3_prot = pgprot_val(REGION3_KERNEL);
> 
> So, if we map as huge/gigantic pages, the protection would be discarded?
> That looks wrong.
> 
> s390x does not support ZONE_DEVICE yet. Maybe simply bail out for s390x
> as you do for sh to make your life easier?

Yeah, ok, makes sense to me; I'll change it for v3.

Logan


Re: [PATCH v2 7/8] mm/memory_hotplug: Add pgprot_t to mhp_modifiers

2020-01-08 Thread Logan Gunthorpe



On 2020-01-08 5:42 a.m., Michal Hocko wrote:
> On Tue 07-01-20 13:59:58, Logan Gunthorpe wrote:
>> devm_memremap_pages() is currently used by the PCI P2PDMA code to create
>> struct page mappings for IO memory. At present, these mappings are created
>> with PAGE_KERNEL which implies setting the PAT bits to be WB. However, on
>> x86, an mtrr register will typically override this and force the cache
>> type to be UC-. In the case firmware doesn't set this register it is
>> effectively WB and will typically result in a machine check exception
>> when it's accessed.
>>
>> Other arches are not currently likely to function correctly seeing they
>> don't have any MTRR registers to fall back on.
>>
>> To solve this, add an argument to arch_add_memory() to explicitly
>> set the pgprot value to a specific value.
>>
>> Of the arches that support MEMORY_HOTPLUG: x86_64, s390 and arm64 is a
>> simple change to pass the pgprot_t down to their respective functions
>> which set up the page tables. For x86_32, set the page tables explicitly
>> using _set_memory_prot() (seeing they are already mapped). For sh, reject
>> anything but PAGE_KERNEL settings -- this should be fine, for now, seeing
>> sh doesn't support ZONE_DEVICE anyway.
>>
>> Cc: Dan Williams 
>> Cc: David Hildenbrand 
>> Cc: Michal Hocko 
>> Signed-off-by: Logan Gunthorpe 
> 
> OK, this is less code churn than I expected. Having pgprot as an implicit
> parameter de-facto is a bit fragile though. Should we add a WARN_ON_ONCE
> (e.g. into the add_pages to catch all arches) for value 0?

Sure, I can add that for v3.

Logan

> Other than that
> Acked-by: Michal Hocko 
> 
>> ---
>>  arch/arm64/mm/mmu.c| 3 ++-
>>  arch/ia64/mm/init.c| 4 
>>  arch/powerpc/mm/mem.c  | 3 ++-
>>  arch/s390/mm/init.c| 2 +-
>>  arch/sh/mm/init.c  | 3 +++
>>  arch/x86/mm/init_32.c  | 5 +
>>  arch/x86/mm/init_64.c  | 2 +-
>>  include/linux/memory_hotplug.h | 2 ++
>>  mm/memory_hotplug.c| 2 +-
>>  mm/memremap.c  | 6 +++---
>>  10 files changed, 24 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 3320406579c3..9b214b0d268f 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -1058,7 +1058,8 @@ int arch_add_memory(int nid, u64 start, u64 size,
>>  flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>>  
>>  __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>> - size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
>> + size, modifiers->pgprot, __pgd_pgtable_alloc,
>> + flags);
>>  
>>  memblock_clear_nomap(start, size);
>>  
>> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
>> index daf438e08b96..5fd6ae4929c9 100644
>> --- a/arch/ia64/mm/init.c
>> +++ b/arch/ia64/mm/init.c
>> @@ -677,6 +677,10 @@ int arch_add_memory(int nid, u64 start, u64 size,
>>  int ret;
>>  
>>  ret = __add_pages(nid, start_pfn, nr_pages, modifiers);
>> +if (modifiers->pgprot != PAGE_KERNEL)
>> +return -EINVAL;
>> +
>> +ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
>>  if (ret)
>>  printk("%s: Problem encountered in __add_pages() as ret=%d\n",
>> __func__,  ret);
>> diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
>> index 631ee684721f..fddeaee53198 100644
>> --- a/arch/powerpc/mm/mem.c
>> +++ b/arch/powerpc/mm/mem.c
>> @@ -137,7 +137,8 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
>>  resize_hpt_for_hotplug(memblock_phys_mem_size());
>>  
>>  start = (unsigned long)__va(start);
>> -rc = create_section_mapping(start, start + size, nid, PAGE_KERNEL);
>> +rc = create_section_mapping(start, start + size, nid,
>> +modifiers->pgprot);
>>  if (rc) {
>>  pr_warn("Unable to create mapping for hot added memory 
>> 0x%llx..0x%llx: %d\n",
>>  start, start + size, rc);
>> diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
>> index ef19522ddad2..c65fb33f6a89 100644
>> --- a/arch/s390/mm/init.c
>> +++ b/arch/s390/mm/init.c
>> @@ -277,7 +277,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
>>  if (WARN_ON_ONCE(modifiers->altmap))
>>  return -EINVAL;
>>  

Re: [PATCH v2 7/8] mm/memory_hotplug: Add pgprot_t to mhp_modifiers

2020-01-08 Thread Logan Gunthorpe



On 2020-01-08 5:39 a.m., David Hildenbrand wrote:
> On 07.01.20 21:59, Logan Gunthorpe wrote:
>> devm_memremap_pages() is currently used by the PCI P2PDMA code to create
>> struct page mappings for IO memory. At present, these mappings are created
>> with PAGE_KERNEL which implies setting the PAT bits to be WB. However, on
>> x86, an mtrr register will typically override this and force the cache
>> type to be UC-. In the case firmware doesn't set this register it is
>> effectively WB and will typically result in a machine check exception
>> when it's accessed.
>>
>> Other arches are not currently likely to function correctly seeing they
>> don't have any MTRR registers to fall back on.
>>
>> To solve this, add an argument to arch_add_memory() to explicitly
>> set the pgprot value to a specific value.
> 
> You're adding a parameter indirectly by adding it to the structure.
> Maybe "provide a way to specify the pgprot value explicitly to
> arch_add_memory()"
> 
>>
>> Of the arches that support MEMORY_HOTPLUG: x86_64, s390 and arm64 is a
> 
> s/is/need/
> 
>> simple change to pass the pgprot_t down to their respective functions
>> which set up the page tables. For x86_32, set the page tables explicitly
> 
> "page table protection" ?
> 
>> using _set_memory_prot() (seeing they are already mapped). For sh, reject
>> anything but PAGE_KERNEL settings -- this should be fine, for now, seeing
>> sh doesn't support ZONE_DEVICE anyway.
>>
>> Cc: Dan Williams 
>> Cc: David Hildenbrand 
>> Cc: Michal Hocko 
>> Signed-off-by: Logan Gunthorpe 
>> ---
>>  arch/arm64/mm/mmu.c| 3 ++-
>>  arch/ia64/mm/init.c| 4 
>>  arch/powerpc/mm/mem.c  | 3 ++-
>>  arch/s390/mm/init.c| 2 +-
>>  arch/sh/mm/init.c  | 3 +++
>>  arch/x86/mm/init_32.c  | 5 +
>>  arch/x86/mm/init_64.c  | 2 +-
>>  include/linux/memory_hotplug.h | 2 ++
>>  mm/memory_hotplug.c| 2 +-
>>  mm/memremap.c  | 6 +++---
>>  10 files changed, 24 insertions(+), 8 deletions(-)
>>
>> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
>> index 3320406579c3..9b214b0d268f 100644
>> --- a/arch/arm64/mm/mmu.c
>> +++ b/arch/arm64/mm/mmu.c
>> @@ -1058,7 +1058,8 @@ int arch_add_memory(int nid, u64 start, u64 size,
>>  flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS;
>>  
>>  __create_pgd_mapping(swapper_pg_dir, start, __phys_to_virt(start),
>> - size, PAGE_KERNEL, __pgd_pgtable_alloc, flags);
>> + size, modifiers->pgprot, __pgd_pgtable_alloc,
>> + flags);
>>  
>>  memblock_clear_nomap(start, size);
>>  
>> diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
>> index daf438e08b96..5fd6ae4929c9 100644
>> --- a/arch/ia64/mm/init.c
>> +++ b/arch/ia64/mm/init.c
>> @@ -677,6 +677,10 @@ int arch_add_memory(int nid, u64 start, u64 size,
>>  int ret;
>>  
>>  ret = __add_pages(nid, start_pfn, nr_pages, modifiers);
>> +if (modifiers->pgprot != PAGE_KERNEL)
>> +return -EINVAL;
> 
> ... maybe better "if (WARN_ON_ONCE(...))"
> [...]
> 
>> --- a/include/linux/memory_hotplug.h
>> +++ b/include/linux/memory_hotplug.h
>> @@ -56,9 +56,11 @@ enum {
>>  /*
>>   * Restrictions for the memory hotplug:
>>   * altmap: alternative allocator for memmap array
>> + * pgprot: page protection flags to apply to newly added page tables
>>   */
>>  struct mhp_modifiers {
>>  struct vmem_altmap *altmap;
>> +pgprot_t pgprot;
>>  };
>>  
>>  /*
>> diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
>> index 1bb3f92e087d..0888f821af06 100644
>> --- a/mm/memory_hotplug.c
>> +++ b/mm/memory_hotplug.c
>> @@ -1027,7 +1027,7 @@ static int online_memory_block(struct memory_block 
>> *mem, void *arg)
>>   */
>>  int __ref add_memory_resource(int nid, struct resource *res)
>>  {
>> -struct mhp_modifiers modifiers = {};
>> +struct mhp_modifiers modifiers = {.pgprot = PAGE_KERNEL};
> 
> I think we usually use spaces like
> 
> = { .pgprot = PAGE_KERNEL };
> 
> t480s: ~/git/linux virtio-mem-v1 $ git grep "= {\." | wc -l
> 978
> t480s: ~/git/linux virtio-mem-v1 $ git grep "= { " | wc -l
> 35447
> 
>>  u64 start, size;
>>  bool new_node = false;
>>  int ret;
>> diff --git a/mm/memremap.c b/mm/memremap.c

Re: [PATCH v2 2/8] mm/memory_hotplug: Rename mhp_restrictions to mhp_modifiers

2020-01-08 Thread Logan Gunthorpe



On 2020-01-08 5:28 a.m., David Hildenbrand wrote:
> On 07.01.20 21:59, Logan Gunthorpe wrote:
>> The mhp_restrictions struct really doesn't specify anything resembling
>> a restriction anymore so rename it to be mhp_modifiers.
> 
> I wonder if something like "mhp_params" would be even better. It's
> essentially just a way to avoid changing call chains throughout all archs
> whenever we want to add a new parameter.

Sure, that does sound a bit nicer to me. I can change it for v3.

Logan


[PATCH v2 0/8] Allow setting caching mode in arch_add_memory() for P2PDMA

2020-01-07 Thread Logan Gunthorpe
Hi,

The main feedback from v1 was around the interface for arch_add_memory().
Per Dan's suggestions I've renamed the mhp_restrictions structure to
mhp_modifiers and put a pgprot_t field in that structure. I've also
included a patch to drop the unused flags field.

Thanks,

Logan

--

Changes in v2:
 * Rebased onto v5.5-rc5
 * Renamed mhp_restrictions to mhp_modifiers and added the pgprot field
   to that structure instead of using an argument for
   arch_add_memory().
 * Add patch to drop the unused flags field in mhp_restrictions

A git branch is available here:

https://github.com/sbates130272/linux-p2pmem remap_pages_cache_v2

--

Currently, the page tables created using memremap_pages() are always
created with the PAGE_KERNEL caching mode. However, the P2PDMA code
is creating pages for PCI BAR memory which should never be accessed
through the cache and should instead use either WC or UC. This still
works in most cases, on x86, because the MTRR registers typically
override the caching settings in the page tables, making all of the
IO memory UC-. However, this tends not to work so well on other
arches or some rare x86 machines whose firmware does not set up the
MTRR registers in this way.

Instead of this, this series proposes a change to arch_add_memory()
to take the pgprot required by the mapping which allows us to
explicitly set pagetable entries for P2PDMA memory to WC.

This change is pretty routine for most of the arches: x86_64, s390, arm64
and powerpc simply need to thread the pgprot through to where the page
tables are set up. x86_32 unfortunately sets up the page tables at boot, so
it must use _set_memory_prot() to change their caching mode. ia64 and sh
don't appear to have an easy way to change the page tables so, for now
at least, we just return -EINVAL on such mappings and thus they will
not support P2PDMA memory until the work for this is done.

--

Logan Gunthorpe (8):
  mm/memory_hotplug: Drop the flags field from struct mhp_restrictions
  mm/memory_hotplug: Rename mhp_restrictions to mhp_modifiers
  x86/mm: Thread pgprot_t through init_memory_mapping()
  x86/mm: Introduce _set_memory_prot()
  powerpc/mm: Thread pgprot_t through create_section_mapping()
  s390/mm: Thread pgprot_t through vmem_add_mapping()
  mm/memory_hotplug: Add pgprot_t to mhp_modifiers
  mm/memremap: Set caching mode for PCI P2PDMA memory to WC

 arch/arm64/mm/mmu.c|  7 ++--
 arch/ia64/mm/init.c|  6 +++-
 arch/powerpc/include/asm/book3s/64/hash.h  |  3 +-
 arch/powerpc/include/asm/book3s/64/radix.h |  3 +-
 arch/powerpc/include/asm/sparsemem.h   |  3 +-
 arch/powerpc/mm/book3s64/hash_utils.c  |  5 +--
 arch/powerpc/mm/book3s64/pgtable.c |  7 ++--
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 18 ++
 arch/powerpc/mm/mem.c  | 10 +++---
 arch/s390/include/asm/pgtable.h|  3 +-
 arch/s390/mm/extmem.c  |  3 +-
 arch/s390/mm/init.c|  8 ++---
 arch/s390/mm/vmem.c| 10 +++---
 arch/sh/mm/init.c  |  7 ++--
 arch/x86/include/asm/page_types.h  |  3 --
 arch/x86/include/asm/pgtable.h |  3 ++
 arch/x86/include/asm/set_memory.h  |  1 +
 arch/x86/kernel/amd_gart_64.c  |  3 +-
 arch/x86/mm/init.c |  9 ++---
 arch/x86/mm/init_32.c  | 12 +--
 arch/x86/mm/init_64.c  | 40 --
 arch/x86/mm/mm_internal.h  |  3 +-
 arch/x86/mm/pageattr.c |  7 
 arch/x86/platform/efi/efi_64.c |  3 +-
 include/linux/memory_hotplug.h | 16 -
 mm/memory_hotplug.c|  8 ++---
 mm/memremap.c  | 17 +
 27 files changed, 132 insertions(+), 86 deletions(-)

--
2.20.1


[PATCH v2 3/8] x86/mm: Thread pgprot_t through init_memory_mapping()

2020-01-07 Thread Logan Gunthorpe
In preparation for supporting a pgprot_t argument for arch_add_memory().

It's required to move the prototype of init_memory_mapping() because
its original location came before the definition of pgprot_t.

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Signed-off-by: Logan Gunthorpe 
---
 arch/x86/include/asm/page_types.h |  3 ---
 arch/x86/include/asm/pgtable.h|  3 +++
 arch/x86/kernel/amd_gart_64.c |  3 ++-
 arch/x86/mm/init.c|  9 +
 arch/x86/mm/init_32.c |  3 ++-
 arch/x86/mm/init_64.c | 32 +--
 arch/x86/mm/mm_internal.h |  3 ++-
 arch/x86/platform/efi/efi_64.c|  3 ++-
 8 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index c85e15010f48..bf7aa2e290ef 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -73,9 +73,6 @@ static inline phys_addr_t get_max_mapped(void)
 
 bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
 
-extern unsigned long init_memory_mapping(unsigned long start,
-unsigned long end);
-
 extern void initmem_init(void);
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index ad97dc155195..844635e02da5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1041,6 +1041,9 @@ static inline void __meminit init_trampoline_default(void)
 
 void __init poking_init(void);
 
+unsigned long init_memory_mapping(unsigned long start,
+ unsigned long end, pgprot_t prot);
+
 # ifdef CONFIG_RANDOMIZE_MEMORY
 void __meminit init_trampoline(void);
 # else
diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 4e5f50236048..16133819415c 100644
--- a/arch/x86/kernel/amd_gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -744,7 +744,8 @@ int __init gart_iommu_init(void)
 
 	start_pfn = PFN_DOWN(aper_base);
 	if (!pfn_range_is_mapped(start_pfn, end_pfn))
-		init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
+		init_memory_mapping(start_pfn<<PAGE_SHIFT,
+				    end_pfn<<PAGE_SHIFT, PAGE_KERNEL);
 
 	add_pfn_range_mapped(start >> PAGE_SHIFT, ret >> PAGE_SHIFT);
 
@@ -521,7 +522,7 @@ static unsigned long __init init_range_memory_mapping(
 	 */
 	can_use_brk_pgt = max(start, (u64)pgt_buf_end<<PAGE_SHIFT) >=
			  min(end, (u64)pgt_buf_top<<PAGE_SHIFT);
-	init_memory_mapping(start, end);
+	init_memory_mapping(start, end, PAGE_KERNEL);
 
 			pfn_pte((paddr & PMD_MASK) >> PAGE_SHIFT,
-				     PAGE_KERNEL_LARGE),
+				     prot),
 				     init);
 	spin_unlock(&init_mm.page_table_lock);
 	paddr_last = paddr_next;
@@ -669,7 +672,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, 
unsigned long paddr_end,
 
 static unsigned long __meminit
 phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
- unsigned long page_size_mask, bool init)
+ unsigned long page_size_mask, pgprot_t prot, bool init)
 {
unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last;
 
@@ -679,7 +682,7 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, 
unsigned long paddr_end,
 
if (!pgtable_l5_enabled())
return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
-page_size_mask, init);
+page_size_mask, prot, init);
 
for (; vaddr < vaddr_end; vaddr = vaddr_next) {
p4d_t *p4d = p4d_page + p4d_index(vaddr);
@@ -702,13 +705,13 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, 
unsigned long paddr_end,
if (!p4d_none(*p4d)) {
pud = pud_offset(p4d, 0);
paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
-   page_size_mask, init);
+   page_size_mask, prot, init);
continue;
}
 
pud = alloc_low_page();
paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
-  page_size_mask, init);
+  page_size_mask, prot, init);
 
spin_lock(&init_mm.page_table_lock);
p4d_populate_init(&init_mm, p4d, pud, init);
@@ -722,7 +725,7 @@ static unsigned long __meminit
 __kernel_physical_mapping_init(unsigned long paddr_start,
   unsigned long paddr_end,
   unsigned long page_size_mask,
-  bool init)
+  pgprot_t prot, bool init)
 {
bool pgd_changed = false;
unsigned long vaddr, vaddr_start, vaddr_end, vaddr_next, paddr_last;
@@ -743,13 +746,13 @@ __kernel_p

[PATCH v2 5/8] powerpc/mm: Thread pgprot_t through create_section_mapping()

2020-01-07 Thread Logan Gunthorpe
In preparation for supporting a pgprot_t argument for arch_add_memory().

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Signed-off-by: Logan Gunthorpe 
---
 arch/powerpc/include/asm/book3s/64/hash.h  |  3 ++-
 arch/powerpc/include/asm/book3s/64/radix.h |  3 ++-
 arch/powerpc/include/asm/sparsemem.h   |  3 ++-
 arch/powerpc/mm/book3s64/hash_utils.c  |  5 +++--
 arch/powerpc/mm/book3s64/pgtable.c |  7 ---
 arch/powerpc/mm/book3s64/radix_pgtable.c   | 18 +++---
 arch/powerpc/mm/mem.c  |  5 +++--
 7 files changed, 27 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/64/hash.h 
b/arch/powerpc/include/asm/book3s/64/hash.h
index 2781ebf6add4..6fc4520092c7 100644
--- a/arch/powerpc/include/asm/book3s/64/hash.h
+++ b/arch/powerpc/include/asm/book3s/64/hash.h
@@ -251,7 +251,8 @@ extern int __meminit hash__vmemmap_create_mapping(unsigned 
long start,
 extern void hash__vmemmap_remove_mapping(unsigned long start,
 unsigned long page_size);
 
-int hash__create_section_mapping(unsigned long start, unsigned long end, int 
nid);
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+int nid, pgprot_t prot);
 int hash__remove_section_mapping(unsigned long start, unsigned long end);
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/book3s/64/radix.h 
b/arch/powerpc/include/asm/book3s/64/radix.h
index d97db3ad9aae..46799f3c3d1d 100644
--- a/arch/powerpc/include/asm/book3s/64/radix.h
+++ b/arch/powerpc/include/asm/book3s/64/radix.h
@@ -289,7 +289,8 @@ static inline unsigned long radix__get_tree_size(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int radix__create_section_mapping(unsigned long start, unsigned long end, int 
nid);
+int radix__create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
 int radix__remove_section_mapping(unsigned long start, unsigned long end);
 #endif /* CONFIG_MEMORY_HOTPLUG */
 #endif /* __ASSEMBLY__ */
diff --git a/arch/powerpc/include/asm/sparsemem.h 
b/arch/powerpc/include/asm/sparsemem.h
index 3192d454a733..c89b32443cff 100644
--- a/arch/powerpc/include/asm/sparsemem.h
+++ b/arch/powerpc/include/asm/sparsemem.h
@@ -13,7 +13,8 @@
 #endif /* CONFIG_SPARSEMEM */
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-extern int create_section_mapping(unsigned long start, unsigned long end, int 
nid);
+extern int create_section_mapping(unsigned long start, unsigned long end,
+ int nid, pgprot_t prot);
 extern int remove_section_mapping(unsigned long start, unsigned long end);
 
 #ifdef CONFIG_PPC_BOOK3S_64
diff --git a/arch/powerpc/mm/book3s64/hash_utils.c 
b/arch/powerpc/mm/book3s64/hash_utils.c
index b30435c7d804..276e353d5264 100644
--- a/arch/powerpc/mm/book3s64/hash_utils.c
+++ b/arch/powerpc/mm/book3s64/hash_utils.c
@@ -800,7 +800,8 @@ int resize_hpt_for_hotplug(unsigned long new_mem_size)
return 0;
 }
 
-int hash__create_section_mapping(unsigned long start, unsigned long end, int 
nid)
+int hash__create_section_mapping(unsigned long start, unsigned long end,
+int nid, pgprot_t prot)
 {
int rc;
 
@@ -810,7 +811,7 @@ int hash__create_section_mapping(unsigned long start, 
unsigned long end, int nid
}
 
rc = htab_bolt_mapping(start, end, __pa(start),
-  pgprot_val(PAGE_KERNEL), mmu_linear_psize,
+  pgprot_val(prot), mmu_linear_psize,
   mmu_kernel_ssize);
 
if (rc < 0) {
diff --git a/arch/powerpc/mm/book3s64/pgtable.c 
b/arch/powerpc/mm/book3s64/pgtable.c
index 75483b40fcb1..b60c18d2e5c9 100644
--- a/arch/powerpc/mm/book3s64/pgtable.c
+++ b/arch/powerpc/mm/book3s64/pgtable.c
@@ -171,12 +171,13 @@ void mmu_cleanup_all(void)
 }
 
 #ifdef CONFIG_MEMORY_HOTPLUG
-int __meminit create_section_mapping(unsigned long start, unsigned long end, 
int nid)
+int __meminit create_section_mapping(unsigned long start, unsigned long end,
+int nid, pgprot_t prot)
 {
if (radix_enabled())
-   return radix__create_section_mapping(start, end, nid);
+   return radix__create_section_mapping(start, end, nid, prot);
 
-   return hash__create_section_mapping(start, end, nid);
+   return hash__create_section_mapping(start, end, nid, prot);
 }
 
 int __meminit remove_section_mapping(unsigned long start, unsigned long end)
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c 
b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 974109bb85db..2a21fb4a22b2 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -253,7 +253,7 @@ static unsigned long next_boundary(unsigned long addr, 
unsigned long end)
 
 static int __meminit create_physical_mapping(unsigned long st

[PATCH v2 4/8] x86/mm: Introduce _set_memory_prot()

2020-01-07 Thread Logan Gunthorpe
For use in the 32bit arch_add_memory() to set the pgprot type of the
memory to add.

Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: "H. Peter Anvin" 
Cc: x...@kernel.org
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Signed-off-by: Logan Gunthorpe 
---
 arch/x86/include/asm/set_memory.h | 1 +
 arch/x86/mm/pageattr.c| 7 +++
 2 files changed, 8 insertions(+)

diff --git a/arch/x86/include/asm/set_memory.h 
b/arch/x86/include/asm/set_memory.h
index 2ee8e469dcf5..d728c2f3ad96 100644
--- a/arch/x86/include/asm/set_memory.h
+++ b/arch/x86/include/asm/set_memory.h
@@ -34,6 +34,7 @@
  * The caller is required to take care of these.
  */
 
+int _set_memory_prot(unsigned long addr, int numpages, pgprot_t prot);
 int _set_memory_uc(unsigned long addr, int numpages);
 int _set_memory_wc(unsigned long addr, int numpages);
 int _set_memory_wt(unsigned long addr, int numpages);
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 1b99ad05b117..2f1934d7b8d9 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1781,6 +1781,13 @@ static inline int cpa_clear_pages_array(struct page 
**pages, int numpages,
CPA_PAGES_ARRAY, pages);
 }
 
+int _set_memory_prot(unsigned long addr, int numpages, pgprot_t prot)
+{
+   return change_page_attr_set_clr(&addr, numpages, prot,
+   __pgprot(~pgprot_val(prot)), 0, 0,
+   NULL);
+}
+
 int _set_memory_uc(unsigned long addr, int numpages)
 {
/*
-- 
2.20.1



[PATCH v2 1/8] mm/memory_hotplug: Drop the flags field from struct mhp_restrictions

2020-01-07 Thread Logan Gunthorpe
This variable is not used anywhere and should therefore be removed
from the structure.

Signed-off-by: Logan Gunthorpe 
---
 include/linux/memory_hotplug.h | 2 --
 1 file changed, 2 deletions(-)

diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h
index ba0dca6aac6e..e47a29761088 100644
--- a/include/linux/memory_hotplug.h
+++ b/include/linux/memory_hotplug.h
@@ -55,11 +55,9 @@ enum {
 
 /*
  * Restrictions for the memory hotplug:
- * flags:  MHP_ flags
  * altmap: alternative allocator for memmap array
  */
 struct mhp_restrictions {
-   unsigned long flags;
struct vmem_altmap *altmap;
 };
 
-- 
2.20.1



[PATCH v2 6/8] s390/mm: Thread pgprot_t through vmem_add_mapping()

2020-01-07 Thread Logan Gunthorpe
In preparation for supporting a pgprot_t argument for arch_add_memory().

Cc: Heiko Carstens 
Cc: Vasily Gorbik 
Cc: Christian Borntraeger 
Signed-off-by: Logan Gunthorpe 
---
 arch/s390/include/asm/pgtable.h |  3 ++-
 arch/s390/mm/extmem.c   |  3 ++-
 arch/s390/mm/init.c |  2 +-
 arch/s390/mm/vmem.c | 10 +-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 7b03037a8475..e667a1a96879 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -1640,7 +1640,8 @@ static inline swp_entry_t __swp_entry(unsigned long type, 
unsigned long offset)
 
 #define kern_addr_valid(addr)   (1)
 
-extern int vmem_add_mapping(unsigned long start, unsigned long size);
+extern int vmem_add_mapping(unsigned long start, unsigned long size,
+   pgprot_t prot);
 extern int vmem_remove_mapping(unsigned long start, unsigned long size);
 extern int s390_enable_sie(void);
 extern int s390_enable_skey(void);
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index fd0dae9d10f4..6cf7029a7b35 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -313,7 +313,8 @@ __segment_load (char *name, int do_nonshared, unsigned long 
*addr, unsigned long
goto out_free;
}
 
-   rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1);
+   rc = vmem_add_mapping(seg->start_addr, seg->end - seg->start_addr + 1,
+ PAGE_KERNEL);
 
if (rc)
goto out_free;
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index a0c88c1c9ad0..ef19522ddad2 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -277,7 +277,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
if (WARN_ON_ONCE(modifiers->altmap))
return -EINVAL;
 
-   rc = vmem_add_mapping(start, size);
+   rc = vmem_add_mapping(start, size, PAGE_KERNEL);
if (rc)
return rc;
 
diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c
index b403fa14847d..8a5e95f184a2 100644
--- a/arch/s390/mm/vmem.c
+++ b/arch/s390/mm/vmem.c
@@ -66,7 +66,7 @@ pte_t __ref *vmem_pte_alloc(void)
 /*
  * Add a physical memory range to the 1:1 mapping.
  */
-static int vmem_add_mem(unsigned long start, unsigned long size)
+static int vmem_add_mem(unsigned long start, unsigned long size, pgprot_t prot)
 {
unsigned long pgt_prot, sgt_prot, r3_prot;
unsigned long pages4k, pages1m, pages2g;
@@ -79,7 +79,7 @@ static int vmem_add_mem(unsigned long start, unsigned long 
size)
pte_t *pt_dir;
int ret = -ENOMEM;
 
-   pgt_prot = pgprot_val(PAGE_KERNEL);
+   pgt_prot = pgprot_val(prot);
sgt_prot = pgprot_val(SEGMENT_KERNEL);
r3_prot = pgprot_val(REGION3_KERNEL);
if (!MACHINE_HAS_NX) {
@@ -362,7 +362,7 @@ int vmem_remove_mapping(unsigned long start, unsigned long 
size)
return ret;
 }
 
-int vmem_add_mapping(unsigned long start, unsigned long size)
+int vmem_add_mapping(unsigned long start, unsigned long size, pgprot_t prot)
 {
struct memory_segment *seg;
int ret;
@@ -379,7 +379,7 @@ int vmem_add_mapping(unsigned long start, unsigned long 
size)
if (ret)
goto out_free;
 
-   ret = vmem_add_mem(start, size);
+   ret = vmem_add_mem(start, size, prot);
if (ret)
goto out_remove;
goto out;
@@ -403,7 +403,7 @@ void __init vmem_map_init(void)
struct memblock_region *reg;
 
for_each_memblock(memory, reg)
-   vmem_add_mem(reg->base, reg->size);
+   vmem_add_mem(reg->base, reg->size, PAGE_KERNEL);
__set_memory((unsigned long)_stext,
 (unsigned long)(_etext - _stext) >> PAGE_SHIFT,
 SET_MEMORY_RO | SET_MEMORY_X);
-- 
2.20.1



[PATCH v2 2/8] mm/memory_hotplug: Rename mhp_restrictions to mhp_modifiers

2020-01-07 Thread Logan Gunthorpe
The mhp_restrictions struct really doesn't specify anything resembling
a restriction anymore so rename it to be mhp_modifiers.

Signed-off-by: Logan Gunthorpe 
---
 arch/arm64/mm/mmu.c|  4 ++--
 arch/ia64/mm/init.c|  4 ++--
 arch/powerpc/mm/mem.c  |  4 ++--
 arch/s390/mm/init.c|  6 +++---
 arch/sh/mm/init.c  |  4 ++--
 arch/x86/mm/init_32.c  |  4 ++--
 arch/x86/mm/init_64.c  |  8 
 include/linux/memory_hotplug.h | 12 ++--
 mm/memory_hotplug.c|  8 
 mm/memremap.c  |  8 
 10 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 40797cbfba2d..3320406579c3 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1050,7 +1050,7 @@ int p4d_free_pud_page(p4d_t *p4d, unsigned long addr)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_modifiers *modifiers)
 {
int flags = 0;
 
@@ -1063,7 +1063,7 @@ int arch_add_memory(int nid, u64 start, u64 size,
memblock_clear_nomap(start, size);
 
return __add_pages(nid, start >> PAGE_SHIFT, size >> PAGE_SHIFT,
-  restrictions);
+  modifiers);
 }
 void arch_remove_memory(int nid, u64 start, u64 size,
struct vmem_altmap *altmap)
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index b01d68a2d5d9..daf438e08b96 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -670,13 +670,13 @@ mem_init (void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_modifiers *modifiers)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
-   ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
+   ret = __add_pages(nid, start_pfn, nr_pages, modifiers);
if (ret)
printk("%s: Problem encountered in __add_pages() as ret=%d\n",
   __func__,  ret);
diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c
index f5535eae637f..9dd9c3c1be7f 100644
--- a/arch/powerpc/mm/mem.c
+++ b/arch/powerpc/mm/mem.c
@@ -127,7 +127,7 @@ static void flush_dcache_range_chunked(unsigned long start, 
unsigned long stop,
 }
 
 int __ref arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+ struct mhp_modifiers *modifiers)
 {
unsigned long start_pfn = start >> PAGE_SHIFT;
unsigned long nr_pages = size >> PAGE_SHIFT;
@@ -143,7 +143,7 @@ int __ref arch_add_memory(int nid, u64 start, u64 size,
return -EFAULT;
}
 
-   return __add_pages(nid, start_pfn, nr_pages, restrictions);
+   return __add_pages(nid, start_pfn, nr_pages, modifiers);
 }
 
 void __ref arch_remove_memory(int nid, u64 start, u64 size,
diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c
index ac44bd76db4b..a0c88c1c9ad0 100644
--- a/arch/s390/mm/init.c
+++ b/arch/s390/mm/init.c
@@ -268,20 +268,20 @@ device_initcall(s390_cma_mem_init);
 #endif /* CONFIG_CMA */
 
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_modifiers *modifiers)
 {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long size_pages = PFN_DOWN(size);
int rc;
 
-   if (WARN_ON_ONCE(restrictions->altmap))
+   if (WARN_ON_ONCE(modifiers->altmap))
return -EINVAL;
 
rc = vmem_add_mapping(start, size);
if (rc)
return rc;
 
-   rc = __add_pages(nid, start_pfn, size_pages, restrictions);
+   rc = __add_pages(nid, start_pfn, size_pages, modifiers);
if (rc)
vmem_remove_mapping(start, size);
return rc;
diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index d1b1ff2be17a..7e64f42fb570 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -406,14 +406,14 @@ void __init mem_init(void)
 
 #ifdef CONFIG_MEMORY_HOTPLUG
 int arch_add_memory(int nid, u64 start, u64 size,
-   struct mhp_restrictions *restrictions)
+   struct mhp_modifiers *modifiers)
 {
unsigned long start_pfn = PFN_DOWN(start);
unsigned long nr_pages = size >> PAGE_SHIFT;
int ret;
 
/* We only have ZONE_NORMAL, so this is easy.. */
-   ret = __add_pages(nid, start_pfn, nr_pages, restrictions);
+   ret = __add_pages(nid, start_pfn, nr_pages, modifiers);
if (unlikely(ret))
printk("%s: Failed, __add_pages() == %d\n", __func__, ret);
 
diff --git a/arch/x86/mm/init_
