[PATCH 1/1] iommu/vtd: Cleanup dma_remapping.h header

2018-11-09 Thread Lu Baolu
Commit e61d98d8dad00 ("x64, x2apic/intr-remap: Intel vt-d, IOMMU
code reorganization") moved dma_remapping.h from drivers/pci/ to
its current place. It is entirely VT-d specific, but uses a generic
name. This merges dma_remapping.h into include/linux/intel-iommu.h
and removes dma_remapping.h as a result.

Cc: Ashok Raj 
Cc: Jacob Pan 
Cc: Sohil Mehta 
Suggested-by: Christoph Hellwig 
Signed-off-by: Lu Baolu 
---
 arch/x86/kernel/tboot.c|  2 +-
 drivers/gpu/drm/i915/i915_gem_execbuffer.c |  2 +-
 drivers/gpu/drm/i915/intel_display.c   |  2 +-
 drivers/gpu/drm/vmwgfx/vmwgfx_drv.c|  2 +-
 drivers/misc/mic/scif/scif_rma.c   |  2 +-
 drivers/misc/mic/scif/scif_rma.h   |  2 +-
 include/linux/dma_remapping.h  | 58 --
 include/linux/intel-iommu.h| 49 +-
 8 files changed, 53 insertions(+), 66 deletions(-)
 delete mode 100644 include/linux/dma_remapping.h

diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index a2486f444073..6e5ef8fb8a02 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -19,7 +19,7 @@
  *
  */
 
-#include 
+#include 
 #include 
 #include 
 #include 
diff --git a/drivers/gpu/drm/i915/i915_gem_execbuffer.c 
b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
index 09187286d346..d8ccc77fdbaf 100644
--- a/drivers/gpu/drm/i915/i915_gem_execbuffer.c
+++ b/drivers/gpu/drm/i915/i915_gem_execbuffer.c
@@ -26,7 +26,7 @@
  *
  */
 
-#include 
+#include 
 #include 
 #include 
 #include 
diff --git a/drivers/gpu/drm/i915/intel_display.c 
b/drivers/gpu/drm/i915/intel_display.c
index 9741cc419e1b..45334a84fab1 100644
--- a/drivers/gpu/drm/i915/intel_display.c
+++ b/drivers/gpu/drm/i915/intel_display.c
@@ -47,7 +47,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 #include 
 
 /* Primary plane formats for gen <= 3 */
diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c 
b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
index 61a84b958d67..c3e80a3b09fc 100644
--- a/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
+++ b/drivers/gpu/drm/vmwgfx/vmwgfx_drv.c
@@ -34,7 +34,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 #define VMWGFX_DRIVER_DESC "Linux drm driver for VMware graphics devices"
 #define VMWGFX_CHIP_SVGAII 0
diff --git a/drivers/misc/mic/scif/scif_rma.c b/drivers/misc/mic/scif/scif_rma.c
index c824329f7012..b441f6b0c743 100644
--- a/drivers/misc/mic/scif/scif_rma.c
+++ b/drivers/misc/mic/scif/scif_rma.c
@@ -15,7 +15,7 @@
  * Intel SCIF driver.
  *
  */
-#include 
+#include 
 #include 
 #include 
 #include 
diff --git a/drivers/misc/mic/scif/scif_rma.h b/drivers/misc/mic/scif/scif_rma.h
index fa6722279196..d90a06d4e93b 100644
--- a/drivers/misc/mic/scif/scif_rma.h
+++ b/drivers/misc/mic/scif/scif_rma.h
@@ -53,7 +53,7 @@
 #ifndef SCIF_RMA_H
 #define SCIF_RMA_H
 
-#include 
+#include 
 #include 
 
 #include "../bus/scif_bus.h"
diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h
deleted file mode 100644
index 21b3e7d33d68..
--- a/include/linux/dma_remapping.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _DMA_REMAPPING_H
-#define _DMA_REMAPPING_H
-
-/*
- * VT-d hardware uses 4KiB page size regardless of host page size.
- */
-#define VTD_PAGE_SHIFT (12)
-#define VTD_PAGE_SIZE  (1UL << VTD_PAGE_SHIFT)
-#define VTD_PAGE_MASK  (((u64)-1) << VTD_PAGE_SHIFT)
-#define VTD_PAGE_ALIGN(addr)   (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK)
-
-#define VTD_STRIDE_SHIFT        (9)
-#define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT)
-
-#define DMA_PTE_READ (1)
-#define DMA_PTE_WRITE (2)
-#define DMA_PTE_LARGE_PAGE (1 << 7)
-#define DMA_PTE_SNP (1 << 11)
-
-#define CONTEXT_TT_MULTI_LEVEL 0
-#define CONTEXT_TT_DEV_IOTLB   1
-#define CONTEXT_TT_PASS_THROUGH 2
-/* Extended context entry types */
-#define CONTEXT_TT_PT_PASID    4
-#define CONTEXT_TT_PT_PASID_DEV_IOTLB 5
-#define CONTEXT_TT_MASK (7ULL << 2)
-
-#define CONTEXT_DINVE  (1ULL << 8)
-#define CONTEXT_PRS            (1ULL << 9)
-#define CONTEXT_PASIDE (1ULL << 11)
-
-struct intel_iommu;
-struct dmar_domain;
-struct root_entry;
-
-
-#ifdef CONFIG_INTEL_IOMMU
-extern int iommu_calculate_agaw(struct intel_iommu *iommu);
-extern int iommu_calculate_max_sagaw(struct intel_iommu *iommu);
-extern int dmar_disabled;
-extern int intel_iommu_enabled;
-extern int intel_iommu_tboot_noforce;
-#else
-static inline int iommu_calculate_agaw(struct intel_iommu *iommu)
-{
-   return 0;
-}
-static inline int iommu_calculate_max_sagaw(struct intel_iommu *iommu)
-{
-   return 0;
-}
-#define dmar_disabled  (1)
-#define intel_iommu_enabled (0)
-#endif
-
-
-#endif
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index b0ae25837361..a58bc05d6798 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -26,7 +26,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ 

Re: [RFC] remove the ->mapping_error method from dma_map_ops

2018-11-09 Thread David Miller
From: Christoph Hellwig 
Date: Fri,  9 Nov 2018 09:46:30 +0100

> Error reporting for the dma_map_single and dma_map_page operations is
> currently a mess.  Both APIs directly return the dma_addr_t to be used for
> the DMA, with a magic error escape that is specific to the instance and
> checked by another method provided.  This has a few downsides:
> 
>  - the error check is easily forgotten and a __must_check marker doesn't
>help as the value always is consumed anyway
>  - the error checking requires another indirect call, which have gotten
>incredibly expensive
>  - a lot of code is wasted on implementing these methods
> 
> The historical reason for this is that people thought DMA mappings would
> not fail when the API was created, which sounds like a really bad
> assumption in retrospective, and then we tried to cram error handling
> onto it later on.
> 
> There basically are two variants:  the error code is 0 because the
> implementation will never return 0 as a valid DMA address, or the error
> code is all-F as the implementation won't ever return an address that
> high.  The old AMD GART is the only one not falling into these two camps
> as it picks sort of a relative zero relative to where it is mapped.
> 
> The 0 return doesn't work for a lot of IOMMUs that start allocating
> bus space from address zero, so we can't consolidate on that, but I think
> we can move everyone to all-Fs, which the patch here does.  The reason
> for that is that there is only one way to ever get this address: by
> doing a 1-byte long, 1-byte aligned mapping, but all our mappings
> are not only longer but generally aligned, and the mappings have to
> keep at least the basic alignment.  Please try to poke holes into this
> idea as it would simplify our logic a lot, and also allow to change
> at least the not so commonly used yet dma_map_single_attrs and
> dma_map_page_attrs to actually return an error code and further improve
> the error handling flow in the drivers.
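
For reference, the consolidation described above amounts to roughly the
following (a sketch of the idea, not the patch text): a single all-ones
sentinel plus one inline check, instead of a per-instance ->mapping_error
method.

#define DMA_MAPPING_ERROR	(~(dma_addr_t)0)

static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
	return dma_addr == DMA_MAPPING_ERROR;
}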

I have no problem with patch #1, it looks great.

But patch #2 on the other hand, not so much.

I hate seeing values returned by reference, it adds cost especially
on cpus where all arguments and return values fit in registers (we end
up forcing a stack slot and memory references).

And we don't need it here.

DMA addresses are like pointers, and therefore we can return errors and
valid success values in the same dma_addr_t just fine.  PTR_ERR() --> DMA_ERR(),
IS_PTR_ERR() --> IS_DMA_ERR(), etc.
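
Purely as an illustration of that idea (these helpers do not exist; the
names and the errno-style encoding are assumptions mirroring
include/linux/err.h):

#include <linux/err.h>		/* MAX_ERRNO */
#include <linux/types.h>	/* dma_addr_t, bool */

/* Encode a -errno in the top MAX_ERRNO values of the dma_addr_t range,
 * in the spirit of ERR_PTR()/IS_ERR().
 */
static inline dma_addr_t DMA_ERR(long error)
{
	return (dma_addr_t)error;
}

static inline bool IS_DMA_ERR(dma_addr_t addr)
{
	return addr >= (dma_addr_t)-MAX_ERRNO;
}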



Re: [RFC] iommu/vt-d: Group and domain relationship

2018-11-09 Thread Jacob Pan
On Thu, 8 Nov 2018 11:30:04 +
James Sewart  wrote:

> Hey,
> 
> > On 8 Nov 2018, at 01:42, Lu Baolu  wrote:
> > 
> > Hi,
> > 
> > On 11/8/18 1:55 AM, James Sewart wrote:  
> >> Hey,  
> >>> On 7 Nov 2018, at 02:10, Lu Baolu 
> >>> wrote:
> >>> 
> >>> Hi,
> >>> 
> >>> On 11/6/18 6:40 PM, James Sewart wrote:  
>  Hey Lu,
>  Would you be able to go into more detail about the issues with
>  allowing IOMMU_DOMAIN_DMA to be allocated via domain_alloc?  
> >>> 
> >>> This door is closed because intel iommu driver does everything for
> >>> IOMMU_DOMAIN_DMA: allocating a domain and setup the context
> >>> entries for the domain.  
> >> As far as I can tell, attach_device in the intel driver will handle
> >> cleaning up any old domain context mapping and ensure the new
> >> domain is mapped with calls to dmar_remove_one_dev_info and
> >> domain_add_dev_info.  
> > 
> > That's only for domains of IOMMU_DOMAIN_UNMANAGED, right?  
> 
> attach_device has logic for cleaning up domains that are allocated by
> the intel driver.
> 
>   old_domain = find_domain(dev);
>   if (old_domain) {
>   rcu_read_lock();
>   dmar_remove_one_dev_info(old_domain, dev);
>   rcu_read_unlock();
> 
>   if (!domain_type_is_vm_or_si(old_domain) &&
>       list_empty(&old_domain->devices))
>   domain_exit(old_domain);
>   }
> 
> This is checking the type of the old domain only, freeing it if is
> not attached to any devices. Looking at this now, maybe the solution
> would be to distinguish between internally allocated dma domains and
> dma domains allocated via the external api, so we avoid freeing a
> domain a driver has reference to.
> 
I think this can also be solved by adopting a default domain and moving
the internal domain handling to the generic code at group level, i.e. when a
device is added to a group, the group gets a default domain (also set as the
current domain) for DMA APIs. When a device is attached to an unmanaged
domain (VFIO container and IOMMU API), it will switch the current domain to
the new domain. The old domain will be inactive but not exited.
> >>> 
> >>> Why do we want to open this door? Probably we want the generic
> >>> iommu layer to handle these things (it's called default domain).  
> >> I’d like to allocate a domain and attach it to multiple devices in
> >> a group/multiple groups so that they share address translation,
> >> but still allow drivers for devices in those groups to use the
> >> dma_map_ops api.  
> > 
> > Just out of curiosity, why do you want to share a single domain
> > across multiple groups? By default, the groups and DMA domains are
> > normally 1-1 mapped, right?  
> 
> Currently we see each device in a group with their own domain. 
> find_or_alloc_domain looks at dma aliases to determine who shares
> domains whereas pci_device_group in the generic iommu code determines
> groups using a few other checks. We have observed internally that
> devices under a pcie switch will be put in the same group but they do
> not share a domain within that group.
> 
> Getting every device within a group to share a domain would get us
> 90% of the way to what we want. But we have some configurations where
> there exist devices put in other groups that we want to share
> translations with.
> 
This is indeed a huge disconnect between the IOMMU API and VT-d's handling
of the DMA API. The IOMMU API operates at group granularity (per-group
domain), which is based on ACS, whereas the DMA API in VT-d has per-device
domains and sharing that is not based on ACS. So I agree you could have
multiple domains under the same group for DMA API use.

I am working on a patch to align the two, e.g. use per group
default domain for DMA APIs. My idea is as follows:
- support DOMAIN_DMA type in intel_iommu_domain_alloc()
- add device to group will attach to the default DMA domain
- have to inherit any RMRR or identity mapping holes set up
  prior to IOMMU group setup.
- Or perhaps implement apply_resv_region() in VT-d driver and
  move RMRR setup here.
- DMA map API, will use per group default domain instead of per device
  private domain

The change is rather pervasive, so I am trying to strike a balance between
short-term functionality and a complete cleanup. Any ideas welcome.
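
As a rough illustration of the first item only (not the actual patch), the
allocation entry point would stop rejecting IOMMU_DOMAIN_DMA, e.g.:

/* Illustrative sketch: let the generic core allocate the per-group default
 * domain through the driver. The RMRR/identity-hole inheritance from the
 * items above is deliberately left out here.
 */
static struct iommu_domain *intel_iommu_domain_alloc(unsigned type)
{
	struct dmar_domain *dmar_domain;

	if (type != IOMMU_DOMAIN_UNMANAGED && type != IOMMU_DOMAIN_DMA)
		return NULL;

	dmar_domain = alloc_domain(DOMAIN_FLAG_VIRTUAL_MACHINE);
	if (!dmar_domain)
		return NULL;

	if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) {
		domain_exit(dmar_domain);
		return NULL;
	}

	return &dmar_domain->domain;
}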

> >   
> >>> So we can't just open the door but not cleanup the things right?  
> >> A user of domain_alloc and attach_device is responsible for
> >> detaching a domain if it is no longer needed and calling
> >> domain_free.  
> > 
> > Currently DMA API calls get_valid_domain_for_dev() to retrieve a DMA
> > domain. If the domain has already been allocated, return directly.
> > Otherwise, allocate and initialize a new one for the device. Let's
> > call domains allocated by get_valid_domain_for_dev() as "A".
> > 
> > If we open the door and allow another component to manage the DMA
> > domains through domain 

Re: [PATCH] iommu: arm-smmu: Set SCTLR.HUPCF bit

2018-11-09 Thread Rob Clark
On Mon, Oct 29, 2018 at 3:09 PM Will Deacon  wrote:
>
> On Thu, Sep 27, 2018 at 06:46:07PM -0400, Rob Clark wrote:
> > We seem to need to set either this or CFCFG (stall), otherwise gpu
> > faults trigger problems with other in-flight transactions from the
> > GPU causing CP errors, etc.
> >
> > In the ARM SMMU spec, the 'Hit under previous context fault' bit is
> > described as:
> >
> >  '0' - Stall or terminate subsequent transactions in the presence
> >of an outstanding context fault
> >  '1' - Process all subsequent transactions independently of any
> >outstanding context fault.
> >
> > Since we don't enable CFCFG (stall) the behavior of terminating
> > other transactions makes sense.  And is probably not what we want
> > (and definitely not what we want for the GPU).
> >
> > Signed-off-by: Rob Clark 
> > ---
> > So I hit this issue a long time back on 820 (msm8996) and at the
> > time I solved it with a patch that enabled CFCFG.  And it resurfaced
> > more recently on sdm845.  But at the time CFCFG was rejected, iirc
> > because of concern that it would cause problems on other non-qcom
> > arm smmu implementations.  And I think I forgot to send this version
> > of the solution.
> >
> > If enabling HUPCF is anticipated to cause problems on other ARM
> > SMMU implementations, I think I can come up with a variant of this
> > patch which conditionally enables it for snapdragon.
> >
> > Either way, I'd really like to get some variant of this fix merged
> > (and probably it would be a good idea for stable kernel branches
> > too), since current behaviour with the GPU means faults turn into
> > a fantastic cascade of fail.
>
> Can you describe how this fantastic cascade of fail improves with this
> patch, please? If you're getting context faults then something has already
> gone horribly wrong, so I'm trying to work out how this improves things.
>

There are plenty of cases where getting iommu faults with a GPU is
"normal", or at least not something the kernel or even GL driver can
control.

With this patch, you still get the iommu fault, but it doesn't cause
the GPU to crash.  Without it, other memory accesses in flight
while the fault occurs, like the GPU command processor reading further
ahead in the cmdstream to set up the next draw, would return zeroes,
causing the GPU to crash or get into a bad state.
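
Roughly what the snapdragon-only variant could look like in
arm_smmu_write_context_bank(), with a made-up ARM_SMMU_FEAT_HUPCF feature
bit standing in for whatever quirk/match mechanism ends up being used:

	reg = SCTLR_CFIE | SCTLR_CFRE | SCTLR_AFE | SCTLR_TRE | SCTLR_M;
	/* ARM_SMMU_FEAT_HUPCF is invented for this sketch; it would only be
	 * set on implementations (e.g. qcom) that want hit-under-previous-
	 * context-fault behaviour instead of terminated transactions.
	 */
	if (smmu->features & ARM_SMMU_FEAT_HUPCF)
		reg |= SCTLR_HUPCF;
	if (stage1)
		reg |= SCTLR_S1_ASIDPNE;
	if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
		reg |= SCTLR_E;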

BR,
-R

> > diff --git a/drivers/iommu/arm-smmu-regs.h b/drivers/iommu/arm-smmu-regs.h
> > index a1226e4ab5f8..2291925eb800 100644
> > --- a/drivers/iommu/arm-smmu-regs.h
> > +++ b/drivers/iommu/arm-smmu-regs.h
> > @@ -178,6 +178,7 @@ enum arm_smmu_s2cr_privcfg {
> >  #define ARM_SMMU_CB_ATSR 0x8f0
> >
> >  #define SCTLR_S1_ASIDPNE (1 << 12)
> > +#define SCTLR_HUPCF  (1 << 8)
> >  #define SCTLR_CFCFG  (1 << 7)
> >  #define SCTLR_CFIE   (1 << 6)
> >  #define SCTLR_CFRE   (1 << 5)
> > diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
> > index f7a96bcf94a6..47ffc9aade72 100644
> > --- a/drivers/iommu/arm-smmu.c
> > +++ b/drivers/iommu/arm-smmu.c
> > @@ -713,9 +713,9 @@ static void arm_smmu_write_context_bank(struct 
> > arm_smmu_device *smmu, int idx)
> >   reg = SCTLR_CFIE | SCTLR_CFRE | SCTLR_AFE | SCTLR_TRE | SCTLR_M;
> >   if (stage1)
> >   reg |= SCTLR_S1_ASIDPNE;
> > + reg |= SCTLR_HUPCF;
>
> Maybe just throw this into the assignment a couple of lines above?
>
> >   if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
> >   reg |= SCTLR_E;
> > -
>
> Random whitespace change.
>
> Will


Re: [PATCH 06/10] swiotlb: use swiotlb_map_page in swiotlb_map_sg_attrs

2018-11-09 Thread Robin Murphy

On 09/11/2018 07:49, Christoph Hellwig wrote:

On Tue, Nov 06, 2018 at 05:27:14PM -0800, John Stultz wrote:

But at that point if I just re-apply "swiotlb: use swiotlb_map_page in
swiotlb_map_sg_attrs", I reproduce the hangs.

Any suggestions for how to further debug what might be going wrong
would be appreciated!


Very odd.  In the end map_sg and map_page are defined to do the same
things to start with.  The only real issue we had in this area was:

"[PATCH v2] of/device: Really only set bus DMA mask when appropriate"

so with current mainline + that you still see a problem, and if you
revert the commit we are replying to it still goes away?


OK, after quite a bit of trying I have managed to provoke a 
similar-looking problem with straight 4.20-rc1 on my Juno board - so far 
my "reproducer" is to decompress a ~10GB .tar.xz off an external USB 
hard disk, wherein after somewhere between 5 minutes and half an hour or 
so it tends to fall over with xz choking on corrupt data and/or a USB 
error.


From the presentation, this really smells like there's some corner in 
which we're either missing cache maintenance or doing it to the wrong 
address - I've not seen any issues with Juno's main PCIe-attached I/O, 
but the EHCI here is non-coherent (and 32-bit, so the bus_dma_mask thing 
doesn't matter) as are the HiKey UFS and SD controller.


I'll keep digging...

Robin.


Re: [PATCH 7/7] vfio/type1: Remove map_try_harder() code path

2018-11-09 Thread Alex Williamson
On Fri,  9 Nov 2018 12:07:12 +0100
Joerg Roedel  wrote:

> From: Joerg Roedel 
> 
> The AMD IOMMU driver can now map a huge-page where smaller
> mappings existed before, so this code-path is no longer
> triggered.
> 
> Signed-off-by: Joerg Roedel 
> ---
>  drivers/vfio/vfio_iommu_type1.c | 33 ++---
>  1 file changed, 2 insertions(+), 31 deletions(-)

Cool, glad to see this finally fixed.  My "should be fixed soon"
comment turned out to be a little optimistic with the fix finally
coming 5 years later.  We could of course keep this code as it really
doesn't harm anything, but I'm in favor of trying to remove it if we think
it's dead now.  In order to expedite this into one pull:

Acked-by: Alex Williamson 

Thanks,
Alex
 
> diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
> index d9fd3188615d..7651cfb14836 100644
> --- a/drivers/vfio/vfio_iommu_type1.c
> +++ b/drivers/vfio/vfio_iommu_type1.c
> @@ -978,32 +978,6 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
>   return ret;
>  }
>  
> -/*
> - * Turns out AMD IOMMU has a page table bug where it won't map large pages
> - * to a region that previously mapped smaller pages.  This should be fixed
> - * soon, so this is just a temporary workaround to break mappings down into
> - * PAGE_SIZE.  Better to map smaller pages than nothing.
> - */
> -static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
> -   unsigned long pfn, long npage, int prot)
> -{
> - long i;
> - int ret = 0;
> -
> - for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
> - ret = iommu_map(domain->domain, iova,
> - (phys_addr_t)pfn << PAGE_SHIFT,
> - PAGE_SIZE, prot | domain->prot);
> - if (ret)
> - break;
> - }
> -
> - for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
> - iommu_unmap(domain->domain, iova, PAGE_SIZE);
> -
> - return ret;
> -}
> -
>  static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
> unsigned long pfn, long npage, int prot)
>  {
> @@ -1013,11 +987,8 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, 
> dma_addr_t iova,
>   list_for_each_entry(d, &iommu->domain_list, next) {
>   ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
>   npage << PAGE_SHIFT, prot | d->prot);
> - if (ret) {
> - if (ret != -EBUSY ||
> - map_try_harder(d, iova, pfn, npage, prot))
> - goto unwind;
> - }
> + if (ret)
> + goto unwind;
>  
>   cond_resched();
>   }



Re: [PATCH 1/2] dma-mapping: remove ->mapping_error

2018-11-09 Thread Robin Murphy

On 09/11/2018 08:46, Christoph Hellwig wrote:
[...]

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 1167ff0416cf..cfb422e17049 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -55,8 +55,6 @@
  #include "amd_iommu_types.h"
  #include "irq_remapping.h"
  
-#define AMD_IOMMU_MAPPING_ERROR	0

-
  #define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
  
  #define LOOP_TIMEOUT	10

@@ -2339,7 +2337,7 @@ static dma_addr_t __map_single(struct device *dev,
paddr &= PAGE_MASK;
  
  	address = dma_ops_alloc_iova(dev, dma_dom, pages, dma_mask);

-   if (address == AMD_IOMMU_MAPPING_ERROR)
+   if (address == DMA_MAPPING_ERROR)


This for one is clearly broken, because the IOVA allocator still returns 
0 on failure here...



goto out;
  
  	prot = dir2prot(direction);

@@ -2376,7 +2374,7 @@ static dma_addr_t __map_single(struct device *dev,
  
  	dma_ops_free_iova(dma_dom, address, pages);
  
-	return AMD_IOMMU_MAPPING_ERROR;

+   return DMA_MAPPING_ERROR;
  }
  
  /*

@@ -2427,7 +2425,7 @@ static dma_addr_t map_page(struct device *dev, struct 
page *page,
if (PTR_ERR(domain) == -EINVAL)
return (dma_addr_t)paddr;
else if (IS_ERR(domain))
-   return AMD_IOMMU_MAPPING_ERROR;
+   return DMA_MAPPING_ERROR;
  
  	dma_mask = *dev->dma_mask;

dma_dom = to_dma_ops_domain(domain);
@@ -2504,7 +2502,7 @@ static int map_sg(struct device *dev, struct scatterlist 
*sglist,
npages = sg_num_pages(dev, sglist, nelems);
  
  	address = dma_ops_alloc_iova(dev, dma_dom, npages, dma_mask);

-   if (address == AMD_IOMMU_MAPPING_ERROR)
+   if (address == DMA_MAPPING_ERROR)


..and here.
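
i.e. keeping the check correct would need something like the following,
assuming dma_ops_alloc_iova() keeps signalling failure with 0 (a sketch
only, not the actual fix):

/* Translate the allocator's 0-on-failure convention into the common
 * sentinel instead of comparing the returned address against it.
 */
static dma_addr_t alloc_iova_or_error(struct device *dev,
				      struct dma_ops_domain *dma_dom,
				      unsigned int pages, u64 dma_mask)
{
	unsigned long address = dma_ops_alloc_iova(dev, dma_dom, pages,
						   dma_mask);

	return address ? (dma_addr_t)address : DMA_MAPPING_ERROR;
}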

I very much agree with the concept, but I think the way to go about it 
is to convert the implementations which need it to the standardised 
*_MAPPING_ERROR value one-by-one, and only then do the big sweep to 
remove them all. That has more of a chance of getting worthwhile review 
and testing from the respective relevant parties (I'll confess I came 
looking for this bug specifically, since I happened to recall amd_iommu 
having a tricky implicit reliance on the old DMA_ERROR_CODE being 0 on x86).


In terms of really minimising the error-checking overhead it's a bit of 
a shame that DMA_MAPPING_ERROR = 0 doesn't seem viable as the thing to 
standardise on, since that has advantages at the micro-optimisation 
level for many ISAs - fixing up the legacy IOMMU code doesn't seem 
insurmountable, but I suspect there may well be non-IOMMU platforms 
where DMA to physical address 0 is a thing :(


(yeah, I know saving a couple of instructions and potential register 
allocations is down in the noise when we're already going from an 
indirect call to an inline comparison; I'm mostly just thinking out loud 
there)


Robin.


goto out_err;
  
  	prot = dir2prot(direction);

@@ -2627,7 +2625,7 @@ static void *alloc_coherent(struct device *dev, size_t 
size,
*dma_addr = __map_single(dev, dma_dom, page_to_phys(page),
 size, DMA_BIDIRECTIONAL, dma_mask);
  
-	if (*dma_addr == AMD_IOMMU_MAPPING_ERROR)

+   if (*dma_addr == DMA_MAPPING_ERROR)
goto out_free;
  
  	return page_address(page);

@@ -2678,11 +2676,6 @@ static int amd_iommu_dma_supported(struct device *dev, 
u64 mask)
return check_device(dev);
  }
  
-static int amd_iommu_mapping_error(struct device *dev, dma_addr_t dma_addr)

-{
-   return dma_addr == AMD_IOMMU_MAPPING_ERROR;
-}
-
  static const struct dma_map_ops amd_iommu_dma_ops = {
.alloc  = alloc_coherent,
.free   = free_coherent,
@@ -2691,7 +2684,6 @@ static const struct dma_map_ops amd_iommu_dma_ops = {
.map_sg = map_sg,
.unmap_sg   = unmap_sg,
.dma_supported  = amd_iommu_dma_supported,
-   .mapping_error  = amd_iommu_mapping_error,
  };
  
  static int init_reserved_iova_ranges(void)



Re: [PATCH RFC 1/3] mm: When CONFIG_ZONE_DMA32 is set, use DMA32 for SLAB_CACHE_DMA

2018-11-09 Thread Vlastimil Babka
On 11/9/18 12:57 PM, Nicolas Boichat wrote:
> On Fri, Nov 9, 2018 at 6:43 PM Vlastimil Babka  wrote:
>> Also I'm probably missing the point of this all. In patch 3 you use
>> __get_dma32_pages() thus __get_free_pages(__GFP_DMA32), which uses
>> alloc_pages, thus the page allocator directly, and there's no slab
>> caches involved.
> 
> __get_dma32_pages fixes level 1 page allocations in the patch 3.
> 
> This change fixes level 2 page allocations
> (kmem_cache_zalloc(data->l2_tables, gfp | GFP_DMA)), by transparently
> remapping GFP_DMA to an underlying ZONE_DMA32.
> 
> The alternative would be to create a new SLAB_CACHE_DMA32 when
> CONFIG_ZONE_DMA32 is defined, but then I'm concerned that the callers
> would need to choose between the 2 (GFP_DMA or GFP_DMA32...), and also
> need to use some ifdefs (but maybe that's not a valid concern?).
> 
>> It makes little sense to involve slab for page table
>> allocations anyway, as those tend to be aligned to a page size (or
>> high-order page size). So what am I missing?
> 
> Level 2 tables are ARM_V7S_TABLE_SIZE(2) => 1kb, so we'd waste 3kb if
> we allocated a full page.

Oh, I see.

Well, I think indeed the most transparent would be to support
SLAB_CACHE_DMA32. The callers of kmem_cache_zalloc() would then not need to
add anything special to gfp, as that's stored internally upon
kmem_cache_create(). Of course SLAB_BUG_MASK would no longer have to
treat __GFP_DMA32 as unexpected. It would be unexpected when passed to
kmalloc() which doesn't have special dma32 caches, but for a cache
explicitly created to allocate from ZONE_DMA32, I don't see why not. I'm
somewhat surprised that there wouldn't be a need for this earlier, so
maybe I'm still missing something.

> Thanks,
> 
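
For illustration, from the io-pgtable side the suggested interface would
look something like this (SLAB_CACHE_DMA32 is hypothetical at this point):

	/* The zone is chosen once, at cache-creation time ... */
	data->l2_tables = kmem_cache_create("io-pgtable_armv7s_l2",
					    ARM_V7S_TABLE_SIZE(2),
					    ARM_V7S_TABLE_SIZE(2),
					    SLAB_CACHE_DMA32, NULL);

	/* ... so the allocation site needs no special GFP flag at all. */
	table = kmem_cache_zalloc(data->l2_tables, gfp);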



Re: [PATCH RFC 1/3] mm: When CONFIG_ZONE_DMA32 is set, use DMA32 for SLAB_CACHE_DMA

2018-11-09 Thread Nicolas Boichat
On Fri, Nov 9, 2018 at 6:43 PM Vlastimil Babka  wrote:
>
> On 11/9/18 9:24 AM, Nicolas Boichat wrote:
> > Some callers, namely iommu/io-pgtable-arm-v7s, expect the physical
> > address returned by kmem_cache_alloc with GFP_DMA parameter to be
> > a 32-bit address.
> >
> > Instead of adding a separate SLAB_CACHE_DMA32 (and then audit
> > all the calls to check if they require memory from DMA or DMA32
> > zone), we simply allocate SLAB_CACHE_DMA cache in DMA32 region,
> > if CONFIG_ZONE_DMA32 is set.
> >
> > Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
> > Signed-off-by: Nicolas Boichat 
> > ---
> >  include/linux/slab.h | 13 -
> >  mm/slab.c|  2 +-
> >  mm/slub.c|  2 +-
> >  3 files changed, 14 insertions(+), 3 deletions(-)
> >
> > diff --git a/include/linux/slab.h b/include/linux/slab.h
> > index 918f374e7156f4..390afe90c5dec0 100644
> > --- a/include/linux/slab.h
> > +++ b/include/linux/slab.h
> > @@ -30,7 +30,7 @@
> >  #define SLAB_POISON  ((slab_flags_t __force)0x0800U)
> >  /* Align objs on cache lines */
> >  #define SLAB_HWCACHE_ALIGN   ((slab_flags_t __force)0x2000U)
> > -/* Use GFP_DMA memory */
> > +/* Use GFP_DMA or GFP_DMA32 memory */
> >  #define SLAB_CACHE_DMA   ((slab_flags_t __force)0x4000U)
> >  /* DEBUG: Store the last owner for bug hunting */
> >  #define SLAB_STORE_USER  ((slab_flags_t __force)0x0001U)
> > @@ -126,6 +126,17 @@
> >  #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
> >   (unsigned long)ZERO_SIZE_PTR)
> >
> > +/*
> > + * When ZONE_DMA32 is defined, have SLAB_CACHE_DMA allocate memory with
> > + * GFP_DMA32 instead of GFP_DMA, as this is what some of the callers
> > + * require (instead of duplicating cache for DMA and DMA32 zones).
> > + */
> > +#ifdef CONFIG_ZONE_DMA32
> > +#define SLAB_CACHE_DMA_GFP GFP_DMA32
> > +#else
> > +#define SLAB_CACHE_DMA_GFP GFP_DMA
> > +#endif
>
> AFAICS this will break e.g. x86 which can have both ZONE_DMA and
> ZONE_DMA32, and now you would make kmalloc(__GFP_DMA) return objects
> from ZONE_DMA32 instead of __ZONE_DMA, which can break something.

Oh, I was not aware that both ZONE_DMA and ZONE_DMA32 can be defined
at the same time. I guess the test should be inverted, something like
this (can be simplified...):
#ifdef CONFIG_ZONE_DMA
#define SLAB_CACHE_DMA_GFP GFP_DMA
#elif defined(CONFIG_ZONE_DMA32)
#define SLAB_CACHE_DMA_GFP GFP_DMA32
#else
#define SLAB_CACHE_DMA_GFP GFP_DMA // ?
#endif

> Also I'm probably missing the point of this all. In patch 3 you use
> __get_dma32_pages() thus __get_free_pages(__GFP_DMA32), which uses
> alloc_pages, thus the page allocator directly, and there's no slab
> caches involved.

__get_dma32_pages fixes level 1 page allocations in the patch 3.

This change fixes level 2 page allocations
(kmem_cache_zalloc(data->l2_tables, gfp | GFP_DMA)), by transparently
remapping GFP_DMA to an underlying ZONE_DMA32.

The alternative would be to create a new SLAB_CACHE_DMA32 when
CONFIG_ZONE_DMA32 is defined, but then I'm concerned that the callers
would need to choose between the 2 (GFP_DMA or GFP_DMA32...), and also
need to use some ifdefs (but maybe that's not a valid concern?).

> It makes little sense to involve slab for page table
> allocations anyway, as those tend to be aligned to a page size (or
> high-order page size). So what am I missing?

Level 2 tables are ARM_V7S_TABLE_SIZE(2) => 1kb, so we'd waste 3kb if
we allocated a full page.

Thanks,


Re: [RFC] iommu/vt-d: Group and domain relationship

2018-11-09 Thread James Sewart via iommu
Hey Yi,

> On 9 Nov 2018, at 06:54, Liu, Yi L  wrote:
> 
> Hi James,
> 
> Regarding the relationship of iommu group and domain, the blog written by Alex
> may help you. The blog explains very well how an iommu group is determined
> and why.
> 
> http://vfio.blogspot.com/2014/08/iommu-groups-inside-and-out.html

Thanks for the link, this explains how I expected the group domain 
relationship to work.

> 
>> From: iommu-boun...@lists.linux-foundation.org [mailto:iommu-
>> boun...@lists.linux-foundation.org] On Behalf Of James Sewart via iommu
>> Sent: Thursday, November 8, 2018 7:30 PM
>> Subject: Re: [RFC] iommu/vt-d: Group and domain relationship
>> 
>> Hey,
>> 
>>> On 8 Nov 2018, at 01:42, Lu Baolu  wrote:
>>> 
>>> Hi,
>>> 
>>> On 11/8/18 1:55 AM, James Sewart wrote:
 Hey,
> On 7 Nov 2018, at 02:10, Lu Baolu  wrote:
> 
> Hi,
> 
> On 11/6/18 6:40 PM, James Sewart wrote:
>> Hey Lu,
>> Would you be able to go into more detail about the issues with
> 
> [...]
> 
> 
> Why do we want to open this door? Probably we want the generic iommu
> layer to handle these things (it's called default domain).
 I’d like to allocate a domain and attach it to multiple devices in a
 group/multiple groups so that they share address translation, but still
 allow drivers for devices in those groups to use the dma_map_ops api.
>>> 
>>> Just out of curiosity, why do you want to share a single domain across
>>> multiple groups? By default, the groups and DMA domains are normally
>>> 1-1 mapped, right?
>> 
>> Currently we see each device in a group with their own domain.
>> find_or_alloc_domain looks at dma aliases to determine who shares domains
>> whereas pci_device_group in the generic iommu code determines groups using
>> a few other checks. We have observed internally that devices under a pcie
>> switch will be put in the same group but they do not share a domain within
>> that group.
> 
> Really? An iommu group is the DMA isolation unit. You said they are not
> sharing a domain. Do you mean, by that, that they have different IOVA
> address spaces? Are there any special things (e.g. special IOVA allocation
> to avoid unexpected P2P) done on your system? Normally, devices within an
> iommu group should share an IOVA address space.

We see a unique set of mappings for each device within a group. Adding 
debug output to the kernel shows find_or_alloc_domain allocating a new 
domain for each device in a group. We don’t have any special configuration 
with regard to IOVA allocation.

I think the issue is that find_or_alloc_domain only uses 
pci_for_each_dma_alias to determine domain sharing. This doesn’t check the 
configuration of upstream pcie-pcie switches and doesn’t look to see if ACS 
is enabled or disabled. pci_device_group in drivers/iommu/iommu.c, which 
is used to determine which group a device is assigned to, performs an extra 
walk of the upstream buses to see if there exists a bus without ACS which 
would allow peer-to-peer dma and groups the device accordingly.
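
In outline, that extra walk is roughly the following (simplified from
pci_device_group(), not quoted verbatim):

	/* Walk upstream from the device; the group boundary ends up at the
	 * first point below which ACS does not isolate peer-to-peer DMA.
	 */
	for (bus = pdev->bus; !pci_is_root_bus(bus); bus = bus->parent) {
		if (!bus->self)
			continue;
		if (pci_acs_path_enabled(bus->self, NULL, REQ_ACS_FLAGS))
			break;
		pdev = bus->self;
	}
	/* pdev (and its DMA aliases) now determine the group. */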

One solution would be to allow the generic iommu code to assign domains, 
which it does based on the device_group function of the iommu_ops. The 
reason it doesn’t currently is that the default domain type is 
IOMMU_DOMAIN_DMA which the intel driver currently disallows allocating via 
the domain_alloc api.

Cheers,
James.



[PATCH 0/7] iommu/amd: Always allow to map huge pages

2018-11-09 Thread Joerg Roedel
Hi,

the AMD IOMMU driver has had an issue for a long time where it
didn't allow mapping a huge page when smaller mappings
already existed in that address range. The VFIO driver even
had a workaround for that behavior.

These patches fix the issue and remove the workaround from
the VFIO driver.

Please review.

Thanks,

Joerg

Joerg Roedel (7):
  iommu/amd: Collect page-table pages in freelist
  iommu/amd: Introduce free_sub_pt() function
  iommu/amd: Ignore page-mode 7 in free_sub_pt()
  iommu/amd: Allow downgrading page-sizes in alloc_pte()
  iommu/amd: Restart loop if cmpxchg64 succeeded in alloc_pte()
  iommu/amd: Allow to upgrade page-size
  vfio/type1: Remove map_try_harder() code path

 drivers/iommu/amd_iommu.c   | 204 +---
 drivers/iommu/amd_iommu_types.h |   1 +
 drivers/vfio/vfio_iommu_type1.c |  33 +-
 3 files changed, 137 insertions(+), 101 deletions(-)

-- 
2.17.1



[PATCH 7/7] vfio/type1: Remove map_try_harder() code path

2018-11-09 Thread Joerg Roedel
From: Joerg Roedel 

The AMD IOMMU driver can now map a huge-page where smaller
mappings existed before, so this code-path is no longer
triggered.

Signed-off-by: Joerg Roedel 
---
 drivers/vfio/vfio_iommu_type1.c | 33 ++---
 1 file changed, 2 insertions(+), 31 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index d9fd3188615d..7651cfb14836 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -978,32 +978,6 @@ static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
return ret;
 }
 
-/*
- * Turns out AMD IOMMU has a page table bug where it won't map large pages
- * to a region that previously mapped smaller pages.  This should be fixed
- * soon, so this is just a temporary workaround to break mappings down into
- * PAGE_SIZE.  Better to map smaller pages than nothing.
- */
-static int map_try_harder(struct vfio_domain *domain, dma_addr_t iova,
- unsigned long pfn, long npage, int prot)
-{
-   long i;
-   int ret = 0;
-
-   for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
-   ret = iommu_map(domain->domain, iova,
-   (phys_addr_t)pfn << PAGE_SHIFT,
-   PAGE_SIZE, prot | domain->prot);
-   if (ret)
-   break;
-   }
-
-   for (; i < npage && i > 0; i--, iova -= PAGE_SIZE)
-   iommu_unmap(domain->domain, iova, PAGE_SIZE);
-
-   return ret;
-}
-
 static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
  unsigned long pfn, long npage, int prot)
 {
@@ -1013,11 +987,8 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, 
dma_addr_t iova,
list_for_each_entry(d, &iommu->domain_list, next) {
ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
npage << PAGE_SHIFT, prot | d->prot);
-   if (ret) {
-   if (ret != -EBUSY ||
-   map_try_harder(d, iova, pfn, npage, prot))
-   goto unwind;
-   }
+   if (ret)
+   goto unwind;
 
cond_resched();
}
-- 
2.17.1



[PATCH 5/7] iommu/amd: Restart loop if cmpxchg64 succeeded in alloc_pte()

2018-11-09 Thread Joerg Roedel
From: Joerg Roedel 

This makes sure that __pte always contains the correct value
when the pointer to the next page-table level is derived.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/amd_iommu.c | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 6a88ba9321d1..c539b2a59019 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1474,13 +1474,12 @@ static u64 *alloc_pte(struct protection_domain *domain,
__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));
 
/* pte could have been changed somewhere. */
-   if (cmpxchg64(pte, __pte, __npte) != __pte) {
+   if (cmpxchg64(pte, __pte, __npte) != __pte)
free_page((unsigned long)page);
-   continue;
-   }
-
-   if (pte_level == PAGE_MODE_7_LEVEL)
+   else if (pte_level == PAGE_MODE_7_LEVEL)
domain->updated = true;
+
+   continue;
}
 
/* No level skipping support yet */
@@ -1489,7 +1488,7 @@ static u64 *alloc_pte(struct protection_domain *domain,
 
level -= 1;
 
-   pte = IOMMU_PTE_PAGE(*pte);
+   pte = IOMMU_PTE_PAGE(__pte);
 
if (pte_page && level == end_lvl)
*pte_page = pte;
-- 
2.17.1



[PATCH 6/7] iommu/amd: Allow to upgrade page-size

2018-11-09 Thread Joerg Roedel
From: Joerg Roedel 

Before this patch the iommu_map_page() function failed when
it tried to map a huge-page where smaller mappings existed
before.

With this change the page-table pages of the old mappings
are torn down, so that the huge page can be mapped.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/amd_iommu.c | 29 +++--
 1 file changed, 27 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index c539b2a59019..71797b3d67e5 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1557,6 +1557,25 @@ static u64 *fetch_pte(struct protection_domain *domain,
return pte;
 }
 
+static struct page *free_clear_pte(u64 *pte, u64 pteval, struct page *freelist)
+{
+   unsigned long pt;
+   int mode;
+
+   while (cmpxchg64(pte, pteval, 0) != pteval) {
+   pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");
+   pteval = *pte;
+   }
+
+   if (!IOMMU_PTE_PRESENT(pteval))
+   return freelist;
+
+   pt   = (unsigned long)IOMMU_PTE_PAGE(pteval);
+   mode = IOMMU_PTE_MODE(pteval);
+
+   return free_sub_pt(pt, mode, freelist);
+}
+
 /*
  * Generic mapping functions. It maps a physical address into a DMA
  * address space. It allocates the page table pages if necessary.
@@ -1571,6 +1590,7 @@ static int iommu_map_page(struct protection_domain *dom,
  int prot,
  gfp_t gfp)
 {
+   struct page *freelist = NULL;
u64 __pte, *pte;
int i, count;
 
@@ -1587,8 +1607,10 @@ static int iommu_map_page(struct protection_domain *dom,
return -ENOMEM;
 
for (i = 0; i < count; ++i)
-   if (IOMMU_PTE_PRESENT(pte[i]))
-   return -EBUSY;
+   freelist = free_clear_pte([i], pte[i], freelist);
+
+   if (freelist != NULL)
+   dom->updated = true;
 
if (count > 1) {
__pte = PAGE_SIZE_PTE(__sme_set(phys_addr), page_size);
@@ -1606,6 +1628,9 @@ static int iommu_map_page(struct protection_domain *dom,
 
update_domain(dom);
 
+   /* Everything flushed out, free pages now */
+   free_page_list(freelist);
+
return 0;
 }
 
-- 
2.17.1



[PATCH 1/7] iommu/amd: Collect page-table pages in freelist

2018-11-09 Thread Joerg Roedel
From: Joerg Roedel 

Collect all pages that belong to a page-table in a list and
free them after the tree has been traversed. This makes it
possible to implement safer page-table updates in subsequent patches.
Also move the functions for page-table freeing a bit upwards
in the file so that they are usable from the iommu_map() path.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/amd_iommu.c | 144 ++
 1 file changed, 83 insertions(+), 61 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 1167ff0416cf..2655bd91af93 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1317,6 +1317,89 @@ static void domain_flush_devices(struct 
protection_domain *domain)
  *
  /
 
+static void free_page_list(struct page *freelist)
+{
+   while (freelist != NULL) {
+   unsigned long p = (unsigned long)page_address(freelist);
+   freelist = freelist->freelist;
+   free_page(p);
+   }
+}
+
+static struct page *free_pt_page(unsigned long pt, struct page *freelist)
+{
+   struct page *p = virt_to_page((void *)pt);
+
+   p->freelist = freelist;
+
+   return p;
+}
+
+#define DEFINE_FREE_PT_FN(LVL, FN)                                             \
+static struct page *free_pt_##LVL (unsigned long __pt, struct page *freelist)  \
+{                                                                              \
+       unsigned long p;                                                        \
+       u64 *pt;                                                                \
+       int i;                                                                  \
+                                                                               \
+       pt = (u64 *)__pt;                                                       \
+                                                                               \
+       for (i = 0; i < 512; ++i) {                                             \
+               /* PTE present? */                                              \
+               if (!IOMMU_PTE_PRESENT(pt[i]))                                  \
+                       continue;                                               \
+                                                                               \
+               /* Large PTE? */                                                \
+               if (PM_PTE_LEVEL(pt[i]) == 0 ||                                 \
+                   PM_PTE_LEVEL(pt[i]) == 7)                                   \
+                       continue;                                               \
+                                                                               \
+               p = (unsigned long)IOMMU_PTE_PAGE(pt[i]);                       \
+               freelist = FN(p, freelist);                                     \
+       }                                                                       \
+                                                                               \
+       return free_pt_page((unsigned long)pt, freelist);                       \
+}
+
+DEFINE_FREE_PT_FN(l2, free_pt_page)
+DEFINE_FREE_PT_FN(l3, free_pt_l2)
+DEFINE_FREE_PT_FN(l4, free_pt_l3)
+DEFINE_FREE_PT_FN(l5, free_pt_l4)
+DEFINE_FREE_PT_FN(l6, free_pt_l5)
+
+static void free_pagetable(struct protection_domain *domain)
+{
+   unsigned long root = (unsigned long)domain->pt_root;
+   struct page *freelist = NULL;
+
+   switch (domain->mode) {
+   case PAGE_MODE_NONE:
+   break;
+   case PAGE_MODE_1_LEVEL:
+   freelist = free_pt_page(root, freelist);
+   break;
+   case PAGE_MODE_2_LEVEL:
+   freelist = free_pt_l2(root, freelist);
+   break;
+   case PAGE_MODE_3_LEVEL:
+   freelist = free_pt_l3(root, freelist);
+   break;
+   case PAGE_MODE_4_LEVEL:
+   freelist = free_pt_l4(root, freelist);
+   break;
+   case PAGE_MODE_5_LEVEL:
+   freelist = free_pt_l5(root, freelist);
+   break;
+   case PAGE_MODE_6_LEVEL:
+   freelist = free_pt_l6(root, freelist);
+   break;
+   default:
+   BUG();
+   }
+
+   free_page_list(freelist);
+}
+
 /*
  * This function is used to add another level to an IO page table. Adding
  * another level increases the size of the address space by 9 bits to a size up
@@ -1638,67 +1721,6 @@ static void domain_id_free(int id)
spin_unlock(_bitmap_lock);
 }
 
-#define DEFINE_FREE_PT_FN(LVL, FN) \
-static void free_pt_##LVL (unsigned long __pt) \
-{  \
-   unsigned long p;\
-   u64 *pt;   

[PATCH 3/7] iommu/amd: Ignore page-mode 7 in free_sub_pt()

2018-11-09 Thread Joerg Roedel
From: Joerg Roedel 

Page-mode 7 is a special one, as it marks a final PTE that maps
a page with an intermediate size.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/amd_iommu.c   | 4 
 drivers/iommu/amd_iommu_types.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 1186571f77e1..49b5d3115e56 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1372,6 +1372,7 @@ static struct page *free_sub_pt(unsigned long root, int 
mode,
 {
switch (mode) {
case PAGE_MODE_NONE:
+   case PAGE_MODE_7_LEVEL:
break;
case PAGE_MODE_1_LEVEL:
freelist = free_pt_page(root, freelist);
@@ -1403,6 +1404,9 @@ static void free_pagetable(struct protection_domain 
*domain)
unsigned long root = (unsigned long)domain->pt_root;
struct page *freelist = NULL;
 
+   BUG_ON(domain->mode < PAGE_MODE_NONE ||
+  domain->mode > PAGE_MODE_6_LEVEL);
+
free_sub_pt(root, domain->mode, freelist);
 
free_page_list(freelist);
diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h
index e2b342e65a7b..eae0741f72dc 100644
--- a/drivers/iommu/amd_iommu_types.h
+++ b/drivers/iommu/amd_iommu_types.h
@@ -269,6 +269,7 @@
 #define PAGE_MODE_4_LEVEL 0x04
 #define PAGE_MODE_5_LEVEL 0x05
 #define PAGE_MODE_6_LEVEL 0x06
+#define PAGE_MODE_7_LEVEL 0x07
 
 #define PM_LEVEL_SHIFT(x)  (12 + ((x) * 9))
 #define PM_LEVEL_SIZE(x)   (((x) < 6) ? \
-- 
2.17.1



[PATCH 2/7] iommu/amd: Introduce free_sub_pt() function

2018-11-09 Thread Joerg Roedel
From: Joerg Roedel 

The function is a more generic version of free_pagetable()
and will be used to free only specific sub-trees of a
page-table.

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/amd_iommu.c | 18 +-
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 2655bd91af93..1186571f77e1 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1367,12 +1367,10 @@ DEFINE_FREE_PT_FN(l4, free_pt_l3)
 DEFINE_FREE_PT_FN(l5, free_pt_l4)
 DEFINE_FREE_PT_FN(l6, free_pt_l5)
 
-static void free_pagetable(struct protection_domain *domain)
+static struct page *free_sub_pt(unsigned long root, int mode,
+   struct page *freelist)
 {
-   unsigned long root = (unsigned long)domain->pt_root;
-   struct page *freelist = NULL;
-
-   switch (domain->mode) {
+   switch (mode) {
case PAGE_MODE_NONE:
break;
case PAGE_MODE_1_LEVEL:
@@ -1397,6 +1395,16 @@ static void free_pagetable(struct protection_domain 
*domain)
BUG();
}
 
+   return freelist;
+}
+
+static void free_pagetable(struct protection_domain *domain)
+{
+   unsigned long root = (unsigned long)domain->pt_root;
+   struct page *freelist = NULL;
+
+   free_sub_pt(root, domain->mode, freelist);
+
free_page_list(freelist);
 }
 
-- 
2.17.1



[PATCH 4/7] iommu/amd: Allow downgrading page-sizes in alloc_pte()

2018-11-09 Thread Joerg Roedel
From: Joerg Roedel 

Before this patch it was not possible to downgrade a
mapping established with page-mode 7 to a mapping using
smaller page-sizes, because the pte_level != level check
prevented that.

Treat page-mode 7 like a non-present mapping and allow it
to be overwritten in alloc_pte().

Signed-off-by: Joerg Roedel 
---
 drivers/iommu/amd_iommu.c | 12 +---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c
index 49b5d3115e56..6a88ba9321d1 100644
--- a/drivers/iommu/amd_iommu.c
+++ b/drivers/iommu/amd_iommu.c
@@ -1460,10 +1460,13 @@ static u64 *alloc_pte(struct protection_domain *domain,
 
while (level > end_lvl) {
u64 __pte, __npte;
+   int pte_level;
 
-   __pte = *pte;
+   __pte = *pte;
+   pte_level = PM_PTE_LEVEL(__pte);
 
-   if (!IOMMU_PTE_PRESENT(__pte)) {
+   if (!IOMMU_PTE_PRESENT(__pte) ||
+   pte_level == PAGE_MODE_7_LEVEL) {
page = (u64 *)get_zeroed_page(gfp);
if (!page)
return NULL;
@@ -1475,10 +1478,13 @@ static u64 *alloc_pte(struct protection_domain *domain,
free_page((unsigned long)page);
continue;
}
+
+   if (pte_level == PAGE_MODE_7_LEVEL)
+   domain->updated = true;
}
 
/* No level skipping support yet */
-   if (PM_PTE_LEVEL(*pte) != level)
+   if (pte_level != level)
return NULL;
 
level -= 1;
-- 
2.17.1



Re: [PATCH RFC 1/3] mm: When CONFIG_ZONE_DMA32 is set, use DMA32 for SLAB_CACHE_DMA

2018-11-09 Thread Vlastimil Babka
On 11/9/18 9:24 AM, Nicolas Boichat wrote:
> Some callers, namely iommu/io-pgtable-arm-v7s, expect the physical
> address returned by kmem_cache_alloc with GFP_DMA parameter to be
> a 32-bit address.
> 
> Instead of adding a separate SLAB_CACHE_DMA32 (and then audit
> all the calls to check if they require memory from DMA or DMA32
> zone), we simply allocate SLAB_CACHE_DMA cache in DMA32 region,
> if CONFIG_ZONE_DMA32 is set.
> 
> Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
> Signed-off-by: Nicolas Boichat 
> ---
>  include/linux/slab.h | 13 -
>  mm/slab.c|  2 +-
>  mm/slub.c|  2 +-
>  3 files changed, 14 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/slab.h b/include/linux/slab.h
> index 918f374e7156f4..390afe90c5dec0 100644
> --- a/include/linux/slab.h
> +++ b/include/linux/slab.h
> @@ -30,7 +30,7 @@
>  #define SLAB_POISON  ((slab_flags_t __force)0x0800U)
>  /* Align objs on cache lines */
>  #define SLAB_HWCACHE_ALIGN   ((slab_flags_t __force)0x2000U)
> -/* Use GFP_DMA memory */
> +/* Use GFP_DMA or GFP_DMA32 memory */
>  #define SLAB_CACHE_DMA   ((slab_flags_t __force)0x4000U)
>  /* DEBUG: Store the last owner for bug hunting */
>  #define SLAB_STORE_USER  ((slab_flags_t __force)0x0001U)
> @@ -126,6 +126,17 @@
>  #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
>   (unsigned long)ZERO_SIZE_PTR)
>  
> +/*
> + * When ZONE_DMA32 is defined, have SLAB_CACHE_DMA allocate memory with
> + * GFP_DMA32 instead of GFP_DMA, as this is what some of the callers
> + * require (instead of duplicating cache for DMA and DMA32 zones).
> + */
> +#ifdef CONFIG_ZONE_DMA32
> +#define SLAB_CACHE_DMA_GFP GFP_DMA32
> +#else
> +#define SLAB_CACHE_DMA_GFP GFP_DMA
> +#endif

AFAICS this will break e.g. x86 which can have both ZONE_DMA and
ZONE_DMA32, and now you would make kmalloc(__GFP_DMA) return objects
from ZONE_DMA32 instead of __ZONE_DMA, which can break something.

Also I'm probably missing the point of this all. In patch 3 you use
__get_dma32_pages() thus __get_free_pages(__GFP_DMA32), which uses
alloc_pages, thus the page allocator directly, and there's no slab
caches involved. It makes little sense to involve slab for page table
allocations anyway, as those tend to be aligned to a page size (or
high-order page size). So what am I missing?


Re: iommu/io-pgtable-arm-v7s: About pagetable 33bit PA

2018-11-09 Thread Nicolas Boichat
Hi Robin/Yong,

On Fri, Nov 9, 2018 at 3:51 PM Yong Wu  wrote:
>
> On Thu, 2018-11-08 at 13:49 +, Robin Murphy wrote:
> > On 08/11/2018 07:31, Yong Wu wrote:
> > > Hi Robin,
> > >
> > > After the commit ad67f5a6545f ("arm64: replace ZONE_DMA with
> > > ZONE_DMA32"), we don't have ZONE_DMA in aarch64, but
> > > __arm_v7s_alloc_table[1] uses GFP_DMA, expecting the physical address
> > > of the pagetable to be 32-bit.
> > >
> > > Right now we meet an issue where the lvl1 pagetable PA is 0x1_3ab6_ on
> > > the 4GB board; the IOMMU initialization then failed. (This issue can be
> > > fixed if we revert ad67f5a6545f.)
> >
> > But that shouldn't actually be failing on MTK platforms anyway, right,
> > since your IOMMU is still capable of addressing that? Seems like we
> > overlooked that check in __arm_v7s_alloc_table(), where the conversion
> > by casting will want updating to something like
> > "iopte_to_paddr(paddr_to_iopte(...))" in patch #4 of your series.
>
> The current mt8183 IOMMU supports pagetable addresses over 4GB.
> Unfortunately the previous mt8173 doesn't. (The mt8173 IOMMU supports
> mapping DRAM over 4GB while its pagetable still needs to be located
> below 4GB.)
>
> In order to unify the code, we still expect the lvl2 pagetable base not to
> be over 4GB; thus I didn't change it in the current patchset.
>
> >
> > > I planned to add GFP_DMA32 for pagetable allocation. Level 1 was
> > > OK but level 2 failed. It looks like slab doesn't like GFP_DMA32[2].
> > > This is the warning log:
> > > =
> > > Unexpected gfp: 0x4 (GFP_DMA32). Fixing up to gfp: 0x488020 (GFP_ATOMIC|
> > > __GFP_ZERO). Fix your code!
> > > =
> > > I don't know why slab doesn't allow GFP_DMA32; the gfp flags seem to only
> > > be passed through to alloc_page. Is it possible to let slab allow GFP_DMA32
> > > for aarch64?
> > I had a bit of a look into it some time ago, and I don't recall seeing
> > any obvious reason that the kmem_cache API couldn't support ZONE_DMA32
> > in general (either via a separate SLAB_CACHE_DMA32, or just overloading
> > SLAB_CACHE_DMA depending on which of CONFIG_ZONE_DMA/CONFIG_ZONE_DMA32
> > are enabled) - I guess that's just never been implemented since nobody
> > needed it before.
>
> Thanks for the comment. We could try a patch for this.

I made an attempt here, if that's helpful:
https://lore.kernel.org/patchwork/cover/1009116/ .

I'm still a bit uncomfortable with GFP_DMA passed to kmem_cache_alloc
suddenly becoming GFP_DMA32 in the underlying call, but I'm not sure
if there is a better way (apart from duplicating the caches, and a lot
of ifdefs in callers).

Thanks,

> >
> > Robin.
> >
> > > We noticed that this has been discussed[3], but if we use alloc_page for
> > > the level2 pagetable, it may waste lots of memory.
> > >
> > > Any suggestion is appreciated.
> > >
> > >
> > > Reported-by: Nicolas Boichat 
> > >
> > > [1]
> > > https://elixir.bootlin.com/linux/v4.20-rc1/source/drivers/iommu/io-pgtable-arm-v7s.c#L200
> > > [2] https://elixir.bootlin.com/linux/v4.20-rc1/source/mm/internal.h#L37
> > > [3] https://patchwork.kernel.org/patch/10474289/
> > >
>
>


[PATCH RFC 1/3] mm: When CONFIG_ZONE_DMA32 is set, use DMA32 for SLAB_CACHE_DMA

2018-11-09 Thread Nicolas Boichat
Some callers, namely iommu/io-pgtable-arm-v7s, expect the physical
address returned by kmem_cache_alloc with GFP_DMA parameter to be
a 32-bit address.

Instead of adding a separate SLAB_CACHE_DMA32 (and then audit
all the calls to check if they require memory from DMA or DMA32
zone), we simply allocate SLAB_CACHE_DMA cache in DMA32 region,
if CONFIG_ZONE_DMA32 is set.

Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
Signed-off-by: Nicolas Boichat 
---
 include/linux/slab.h | 13 -
 mm/slab.c|  2 +-
 mm/slub.c|  2 +-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/linux/slab.h b/include/linux/slab.h
index 918f374e7156f4..390afe90c5dec0 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -30,7 +30,7 @@
 #define SLAB_POISON((slab_flags_t __force)0x0800U)
 /* Align objs on cache lines */
 #define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x2000U)
-/* Use GFP_DMA memory */
+/* Use GFP_DMA or GFP_DMA32 memory */
 #define SLAB_CACHE_DMA ((slab_flags_t __force)0x4000U)
 /* DEBUG: Store the last owner for bug hunting */
 #define SLAB_STORE_USER((slab_flags_t __force)0x0001U)
@@ -126,6 +126,17 @@
 #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
(unsigned long)ZERO_SIZE_PTR)
 
+/*
+ * When ZONE_DMA32 is defined, have SLAB_CACHE_DMA allocate memory with
+ * GFP_DMA32 instead of GFP_DMA, as this is what some of the callers
+ * require (instead of duplicating cache for DMA and DMA32 zones).
+ */
+#ifdef CONFIG_ZONE_DMA32
+#define SLAB_CACHE_DMA_GFP GFP_DMA32
+#else
+#define SLAB_CACHE_DMA_GFP GFP_DMA
+#endif
+
 #include 
 
 struct mem_cgroup;
diff --git a/mm/slab.c b/mm/slab.c
index 2a5654bb3b3ff3..8810daa052dcdc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2121,7 +2121,7 @@ int __kmem_cache_create(struct kmem_cache *cachep, 
slab_flags_t flags)
cachep->flags = flags;
cachep->allocflags = __GFP_COMP;
if (flags & SLAB_CACHE_DMA)
-   cachep->allocflags |= GFP_DMA;
+   cachep->allocflags |= SLAB_CACHE_DMA_GFP;
if (flags & SLAB_RECLAIM_ACCOUNT)
cachep->allocflags |= __GFP_RECLAIMABLE;
cachep->size = size;
diff --git a/mm/slub.c b/mm/slub.c
index e3629cd7aff164..fdd05323e54cbd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -3575,7 +3575,7 @@ static int calculate_sizes(struct kmem_cache *s, int 
forced_order)
s->allocflags |= __GFP_COMP;
 
if (s->flags & SLAB_CACHE_DMA)
-   s->allocflags |= GFP_DMA;
+   s->allocflags |= SLAB_CACHE_DMA_GFP;
 
if (s->flags & SLAB_RECLAIM_ACCOUNT)
s->allocflags |= __GFP_RECLAIMABLE;
-- 
2.19.1.930.g4563a0d9d0-goog



[PATCH RFC 3/3] iommu/io-pgtable-arm-v7s: Request DMA32 memory, and improve debugging

2018-11-09 Thread Nicolas Boichat
For level 1 pages, use __get_dma32_pages to make sure the physical
memory address is 32-bit.

For level 2 pages, kmem_cache_zalloc with GFP_DMA has been modified
in a previous patch to be allocated in DMA32 zone.

Also, print an error when the physical address does not fit in
32-bit, to make debugging easier in the future.
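
For reference, the overflow check below works because arm_v7s_iopte is
a 32-bit type, so a truncating cast round-trip detects any physical
address above 4 GiB.  A minimal standalone illustration (the typedef is
copied from io-pgtable-arm-v7s.c, the helper name is made up):

typedef u32 arm_v7s_iopte;

static bool fits_in_pte(phys_addr_t phys)
{
	/* True only if the address survives truncation to 32 bits. */
	return phys == (arm_v7s_iopte)phys;
}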

Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
Signed-off-by: Nicolas Boichat 
---
 drivers/iommu/io-pgtable-arm-v7s.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm-v7s.c 
b/drivers/iommu/io-pgtable-arm-v7s.c
index 445c3bde04800c..862fd404de84d8 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -198,13 +198,15 @@ static void *__arm_v7s_alloc_table(int lvl, gfp_t gfp,
void *table = NULL;
 
if (lvl == 1)
-   table = (void *)__get_dma_pages(__GFP_ZERO, get_order(size));
+   table = (void *)__get_dma32_pages(__GFP_ZERO, get_order(size));
else if (lvl == 2)
table = kmem_cache_zalloc(data->l2_tables, gfp | GFP_DMA);
phys = virt_to_phys(table);
-   if (phys != (arm_v7s_iopte)phys)
+   if (phys != (arm_v7s_iopte)phys) {
/* Doesn't fit in PTE */
+   dev_err(dev, "Page table does not fit: %pa", &phys);
goto out_free;
+   }
if (table && !(cfg->quirks & IO_PGTABLE_QUIRK_NO_DMA)) {
dma = dma_map_single(dev, table, size, DMA_TO_DEVICE);
if (dma_mapping_error(dev, dma))
-- 
2.19.1.930.g4563a0d9d0-goog



[PATCH RFC 2/3] include/linux/gfp.h: Add __get_dma32_pages macro

2018-11-09 Thread Nicolas Boichat
Some callers (e.g. iommu/io-pgtable-arm-v7s) require DMA32 memory
when calling __get_dma_pages. Add a new macro for that purpose.
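
A minimal usage sketch (mirroring the io-pgtable-arm-v7s caller in
patch 3/3; the wrapper function here is made up):

static void *alloc_lvl1_table(size_t size)
{
	/* Zeroed allocation whose physical address lies in ZONE_DMA32. */
	return (void *)__get_dma32_pages(__GFP_ZERO, get_order(size));
}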

Fixes: ad67f5a6545f ("arm64: replace ZONE_DMA with ZONE_DMA32")
Signed-off-by: Nicolas Boichat 
---
 include/linux/gfp.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index 76f8db0b0e715c..50e04896b78017 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -535,6 +535,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t 
size, gfp_t gfp_mask);
 
 #define __get_dma_pages(gfp_mask, order) \
__get_free_pages((gfp_mask) | GFP_DMA, (order))
+#define __get_dma32_pages(gfp_mask, order) \
+   __get_free_pages((gfp_mask) | GFP_DMA32, (order))
 
 extern void __free_pages(struct page *page, unsigned int order);
 extern void free_pages(unsigned long addr, unsigned int order);
-- 
2.19.1.930.g4563a0d9d0-goog



[PATCH RFC 0/3] iommu/io-pgtable-arm-v7s: Use DMA32 zone for page tables

2018-11-09 Thread Nicolas Boichat
This is a follow-up to the discussion in [1], to make sure that the page tables
allocated by iommu/io-pgtable-arm-v7s are contained within the 32-bit physical
address space.

[1] https://lists.linuxfoundation.org/pipermail/iommu/2018-November/030876.html

Nicolas Boichat (3):
  mm: When CONFIG_ZONE_DMA32 is set, use DMA32 for SLAB_CACHE_DMA
  include/linux/gfp.h: Add __get_dma32_pages macro
  iommu/io-pgtable-arm-v7s: Request DMA32 memory, and improve debugging

 drivers/iommu/io-pgtable-arm-v7s.c |  6 --
 include/linux/gfp.h|  2 ++
 include/linux/slab.h   | 13 -
 mm/slab.c  |  2 +-
 mm/slub.c  |  2 +-
 5 files changed, 20 insertions(+), 5 deletions(-)

-- 
2.19.1.930.g4563a0d9d0-goog



[PATCH 2/2] arch: switch the default on ARCH_HAS_SG_CHAIN

2018-11-09 Thread Christoph Hellwig
These days architectures are mostly out of the business of dealing with
struct scatterlist at all, unless they have architecture-specific iommu
drivers.  Replace the ARCH_HAS_SG_CHAIN symbol with an ARCH_NO_SG_CHAIN
one that is only enabled for architectures with horrible legacy iommu
drivers like alpha and parisc, and conditionally for arm, which wants to
keep it disabled for legacy platforms.
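
A sketch of the resulting pattern in common code (this is not the exact
scatterlist.h hunk, just the general shape of an opt-out symbol; the
helper name is made up):

static inline bool sg_chaining_supported(void)
{
	/* Chaining is assumed unless the architecture opts out. */
	return !IS_ENABLED(CONFIG_ARCH_NO_SG_CHAIN);
}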

Signed-off-by: Christoph Hellwig 
---
 .../features/io/sg-chain/arch-support.txt | 33 ---
 arch/alpha/Kconfig|  1 +
 arch/arc/Kconfig  |  1 -
 arch/arm/Kconfig  |  2 +-
 arch/arm64/Kconfig|  1 -
 arch/ia64/Kconfig |  1 -
 arch/parisc/Kconfig   |  1 +
 arch/powerpc/Kconfig  |  1 -
 arch/s390/Kconfig |  1 -
 arch/sparc/Kconfig|  1 -
 arch/x86/Kconfig  |  1 -
 arch/xtensa/Kconfig   |  1 -
 include/linux/scatterlist.h   |  6 ++--
 lib/Kconfig   |  2 +-
 lib/scatterlist.c |  2 +-
 15 files changed, 8 insertions(+), 47 deletions(-)
 delete mode 100644 Documentation/features/io/sg-chain/arch-support.txt

diff --git a/Documentation/features/io/sg-chain/arch-support.txt 
b/Documentation/features/io/sg-chain/arch-support.txt
deleted file mode 100644
index 6554f0372c3f..
--- a/Documentation/features/io/sg-chain/arch-support.txt
+++ /dev/null
@@ -1,33 +0,0 @@
-#
-# Feature name:  sg-chain
-# Kconfig:   ARCH_HAS_SG_CHAIN
-# description:   arch supports chained scatter-gather lists
-#
----
-| arch |status|
----
-|   alpha: | TODO |
-| arc: |  ok  |
-| arm: |  ok  |
-|   arm64: |  ok  |
-| c6x: | TODO |
-|   h8300: | TODO |
-| hexagon: | TODO |
-|ia64: |  ok  |
-|m68k: | TODO |
-|  microblaze: | TODO |
-|mips: | TODO |
-|   nds32: | TODO |
-|   nios2: | TODO |
-|openrisc: | TODO |
-|  parisc: | TODO |
-| powerpc: |  ok  |
-|   riscv: | TODO |
-|s390: |  ok  |
-|  sh: | TODO |
-|   sparc: |  ok  |
-|  um: | TODO |
-|   unicore32: | TODO |
-| x86: |  ok  |
-|  xtensa: | TODO |
----
diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig
index 5b4f88363453..a7e748a46c18 100644
--- a/arch/alpha/Kconfig
+++ b/arch/alpha/Kconfig
@@ -5,6 +5,7 @@ config ALPHA
select ARCH_MIGHT_HAVE_PC_PARPORT
select ARCH_MIGHT_HAVE_PC_SERIO
select ARCH_NO_PREEMPT
+   select ARCH_NO_SG_CHAIN
select ARCH_USE_CMPXCHG_LOCKREF
select HAVE_AOUT
select HAVE_IDE
diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig
index c9e2a1323536..fd48d698da29 100644
--- a/arch/arc/Kconfig
+++ b/arch/arc/Kconfig
@@ -13,7 +13,6 @@ config ARC
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SYNC_DMA_FOR_CPU
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
-   select ARCH_HAS_SG_CHAIN
select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 91be74d8df65..199eb62230b0 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -19,6 +19,7 @@ config ARM
select ARCH_HAVE_CUSTOM_GPIO_H
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_MIGHT_HAVE_PC_PARPORT
+   select ARCH_NO_SG_CHAIN if !ARM_HAS_SG_CHAIN
select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7
select ARCH_SUPPORTS_ATOMIC_RMW
@@ -118,7 +119,6 @@ config ARM
  .
 
 config ARM_HAS_SG_CHAIN
-   select ARCH_HAS_SG_CHAIN
bool
 
 config ARM_DMA_USE_IOMMU
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 787d7850e064..4c851b3a7b7c 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -23,7 +23,6 @@ config ARM64
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_PTE_SPECIAL
select ARCH_HAS_SET_MEMORY
-   select ARCH_HAS_SG_CHAIN
select ARCH_HAS_STRICT_KERNEL_RWX
select ARCH_HAS_STRICT_MODULE_RWX
select ARCH_HAS_SYNC_DMA_FOR_DEVICE
diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig
index 36773def6920..d6f203658994 100644
--- a/arch/ia64/Kconfig
+++ b/arch/ia64/Kconfig
@@ -29,7 +29,6 @@ config IA64
select HAVE_MEMBLOCK_NODE_MAP
select HAVE_VIRT_CPU_ACCOUNTING
select ARCH_HAS_DMA_MARK_CLEAN
-   select ARCH_HAS_SG_CHAIN
select VIRT_TO_BUS
select ARCH_DISCARD_MEMBLOCK
select 

scatterlist arch cleanups

2018-11-09 Thread Christoph Hellwig
Remove scatterlist leftovers, and switch the default so that SG chaining
is enabled unless an architecture opts out.


[PATCH 1/2] csky, h8300, riscv: remove leftovers

2018-11-09 Thread Christoph Hellwig
There has been no <asm-generic/scatterlist.h> for a long time, which also
means there is no point in using it from asm-generic.

Signed-off-by: Christoph Hellwig 
---
 arch/csky/include/asm/Kbuild  | 1 -
 arch/h8300/include/asm/Kbuild | 1 -
 arch/riscv/include/asm/Kbuild | 1 -
 3 files changed, 3 deletions(-)

diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild
index 2a0abe8f2a35..7c48a123300d 100644
--- a/arch/csky/include/asm/Kbuild
+++ b/arch/csky/include/asm/Kbuild
@@ -34,7 +34,6 @@ generic-y += pci.h
 generic-y += percpu.h
 generic-y += preempt.h
 generic-y += qrwlock.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += serial.h
 generic-y += shm.h
diff --git a/arch/h8300/include/asm/Kbuild b/arch/h8300/include/asm/Kbuild
index a5d0b2991f47..32f0c8952147 100644
--- a/arch/h8300/include/asm/Kbuild
+++ b/arch/h8300/include/asm/Kbuild
@@ -36,7 +36,6 @@ generic-y += parport.h
 generic-y += percpu.h
 generic-y += pgalloc.h
 generic-y += preempt.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += serial.h
 generic-y += sizes.h
diff --git a/arch/riscv/include/asm/Kbuild b/arch/riscv/include/asm/Kbuild
index 6a646d9ea780..011cc7c7b3ad 100644
--- a/arch/riscv/include/asm/Kbuild
+++ b/arch/riscv/include/asm/Kbuild
@@ -37,7 +37,6 @@ generic-y += poll.h
 generic-y += posix_types.h
 generic-y += preempt.h
 generic-y += resource.h
-generic-y += scatterlist.h
 generic-y += sections.h
 generic-y += sembuf.h
 generic-y += serial.h
-- 
2.19.1



[RFC] remove the ->mapping_error method from dma_map_ops

2018-11-09 Thread Christoph Hellwig
Error reporting for the dma_map_single and dma_map_page operations is
currently a mess.  Both APIs directly return the dma_addr_t to be used for
the DMA, with a magic error escape that is specific to the instance and
checked by another method provided.  This has a few downsides:

 - the error check is easily forgotten, and a __must_check marker doesn't
   help as the value is always consumed anyway
 - the error check requires another indirect call, which has become
   incredibly expensive
 - a lot of code is wasted on implementing these methods

The historical reason for this is that people thought DMA mappings would
not fail when the API was created, which sounds like a really bad
assumption in retrospect, and then we tried to cram error handling
onto it later on.

There are basically two variants:  either the error code is 0, because the
implementation will never return 0 as a valid DMA address, or the error
code is all-Fs, because the implementation won't ever return an address
that high.  The old AMD GART is the only one not falling into these two
camps, as it picks a sort of relative zero, relative to where it is mapped.

The 0 return doesn't work for a lot of IOMMUs that start allocating
bus space from address zero, so we can't consolidate on that, but I think
we can move everyone to all-Fs, which the patch here does.  The reason
is that there is only one way to ever get this address: a 1-byte long,
1-byte aligned mapping at the very top of the address space, but all our
mappings are not only longer but also keep at least the basic alignment.
Please try to poke holes into this idea, as it would simplify our logic
a lot, and also allow us to change at least the not yet commonly used
dma_map_single_attrs and dma_map_page_attrs to actually return an error
code and further improve the error handling flow in the drivers.
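
For illustration, the pattern under discussion boils down to this sketch
(not a hunk from the series):

	dma_addr_t addr = dma_map_single(dev, buf, len, DMA_TO_DEVICE);

	if (dma_mapping_error(dev, addr))	/* extra indirect call today */
		return -ENOMEM;

The call sites stay exactly like this; the proposal just turns
dma_mapping_error() into a plain compare against the all-Fs sentinel
instead of an indirect ->mapping_error call.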


[PATCH 1/2] dma-mapping: remove ->mapping_error

2018-11-09 Thread Christoph Hellwig
There is no need to perform an indirect function call to check whether a
DMA mapping resulted in an error if we always return the last possible
dma address as the error code.  While that address could in theory be
part of a valid DMAable region, treating it as such would mean supporting
unaligned DMA mappings of size 1 at the very top of the address space,
which is rather pointless.

Instead, simplify the whole machinery by using (~(dma_addr_t)0x0)
as the common error return value.
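
The consolidated check then boils down to something like the following
sketch (per the diffstat, the real change is in include/linux/dma-mapping.h;
this simplified version omits any debug hooks):

#define DMA_MAPPING_ERROR	(~(dma_addr_t)0x0)

static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
	return dma_addr == DMA_MAPPING_ERROR;
}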

Signed-off-by: Christoph Hellwig 
---
 arch/alpha/kernel/pci_iommu.c| 10 ++-
 arch/arm/common/dmabounce.c  | 12 +++--
 arch/arm/include/asm/dma-iommu.h |  2 --
 arch/arm/mm/dma-mapping.c| 39 +++-
 arch/arm64/mm/dma-mapping.c  | 15 +++
 arch/ia64/hp/common/sba_iommu.c  |  8 +-
 arch/ia64/sn/pci/pci_dma.c   |  8 +-
 arch/mips/include/asm/jazzdma.h  |  6 -
 arch/mips/jazz/jazzdma.c | 16 
 arch/powerpc/include/asm/iommu.h |  3 ---
 arch/powerpc/kernel/dma-iommu.c  |  6 -
 arch/powerpc/kernel/dma-swiotlb.c|  1 -
 arch/powerpc/kernel/iommu.c  | 28 ++--
 arch/powerpc/platforms/cell/iommu.c  |  1 -
 arch/powerpc/platforms/pseries/vio.c |  3 +--
 arch/s390/pci/pci_dma.c  | 18 -
 arch/sparc/kernel/iommu.c| 12 +++--
 arch/sparc/kernel/iommu_common.h |  2 --
 arch/sparc/kernel/pci_sun4v.c| 14 +++---
 arch/x86/kernel/amd_gart_64.c| 19 --
 arch/x86/kernel/pci-calgary_64.c | 24 ++---
 drivers/iommu/amd_iommu.c| 18 -
 drivers/iommu/dma-iommu.c| 23 ++--
 drivers/iommu/intel-iommu.c  | 13 +-
 drivers/parisc/ccio-dma.c| 10 +--
 drivers/parisc/sba_iommu.c   | 10 +--
 drivers/pci/controller/vmd.c |  6 -
 drivers/xen/swiotlb-xen.c| 12 ++---
 include/linux/dma-direct.h   |  3 ---
 include/linux/dma-iommu.h|  1 -
 include/linux/dma-mapping.h  | 12 -
 kernel/dma/direct.c  |  8 +-
 kernel/dma/swiotlb.c |  9 +++
 33 files changed, 105 insertions(+), 267 deletions(-)

diff --git a/arch/alpha/kernel/pci_iommu.c b/arch/alpha/kernel/pci_iommu.c
index 46e08e0d9181..30339a0369ce 100644
--- a/arch/alpha/kernel/pci_iommu.c
+++ b/arch/alpha/kernel/pci_iommu.c
@@ -291,7 +291,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, 
size_t size,
   use direct_map above, it now must be considered an error. */
if (! alpha_mv.mv_pci_tbi) {
printk_once(KERN_WARNING "pci_map_single: no HW sg\n");
-   return 0;
+   return DMA_MAPPING_ERROR;
}
 
arena = hose->sg_pci;
@@ -307,7 +307,7 @@ pci_map_single_1(struct pci_dev *pdev, void *cpu_addr, 
size_t size,
if (dma_ofs < 0) {
printk(KERN_WARNING "pci_map_single failed: "
   "could not allocate dma page tables\n");
-   return 0;
+   return DMA_MAPPING_ERROR;
}
 
paddr &= PAGE_MASK;
@@ -935,11 +935,6 @@ iommu_unbind(struct pci_iommu_arena *arena, long pg_start, 
long pg_count)
return 0;
 }
 
-static int alpha_pci_mapping_error(struct device *dev, dma_addr_t dma_addr)
-{
-   return dma_addr == 0;
-}
-
 const struct dma_map_ops alpha_pci_ops = {
.alloc  = alpha_pci_alloc_coherent,
.free   = alpha_pci_free_coherent,
@@ -947,7 +942,6 @@ const struct dma_map_ops alpha_pci_ops = {
.unmap_page = alpha_pci_unmap_page,
.map_sg = alpha_pci_map_sg,
.unmap_sg   = alpha_pci_unmap_sg,
-   .mapping_error  = alpha_pci_mapping_error,
.dma_supported  = alpha_pci_supported,
 };
 EXPORT_SYMBOL(alpha_pci_ops);
diff --git a/arch/arm/common/dmabounce.c b/arch/arm/common/dmabounce.c
index 9a92de63426f..5ba4622030ca 100644
--- a/arch/arm/common/dmabounce.c
+++ b/arch/arm/common/dmabounce.c
@@ -257,7 +257,7 @@ static inline dma_addr_t map_single(struct device *dev, 
void *ptr, size_t size,
if (buf == NULL) {
dev_err(dev, "%s: unable to map unsafe buffer %p!\n",
   __func__, ptr);
-   return ARM_MAPPING_ERROR;
+   return DMA_MAPPING_ERROR;
}
 
dev_dbg(dev, "%s: unsafe buffer %p (dma=%#x) mapped to %p (dma=%#x)\n",
@@ -327,7 +327,7 @@ static dma_addr_t dmabounce_map_page(struct device *dev, 
struct page *page,
 
ret = needs_bounce(dev, dma_addr, size);
if (ret < 0)
-   return ARM_MAPPING_ERROR;
+   return DMA_MAPPING_ERROR;
 
if (ret == 0) {
arm_dma_ops.sync_single_for_device(dev, dma_addr, size, dir);
@@ -336,7 +336,7 @@ static dma_addr_t dmabounce_map_page(struct device *dev, 
struct page *page,
 
if 

[PATCH 2/2] dma-mapping: return errors from dma_map_page and dma_map_attrs

2018-11-09 Thread Christoph Hellwig
The current DMA API map_page and map_single routines use a very bad API
pattern that makes error checking hard.  Their callers are far too
many and too complex to easily change that, but the relatively new _attrs
variants that take an additional attributes argument only have a few
callers and can be changed easily.  So here we change them to return
an errno value, and return the dma address by reference, to allow for
much saner error checking.

In the long run we should move existing callers of dma_map_page and
dma_map_single to these _attrs variants with a 0 attrs argument to get
the better API everywhere, but without a single flag day.
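
The before/after calling convention, in sketch form (variable names as
used in the hunks below):

	/* old: dma_addr_t returned directly, error checked separately */
	dma_addr = dma_map_single_attrs(dev, buf, len, dir, attrs);
	if (dma_mapping_error(dev, dma_addr))
		return -ENOMEM;

	/* new: errno returned, dma address filled in by reference */
	if (dma_map_single_attrs(dev, buf, len, dir, attrs, &dma_addr))
		return -ENOMEM;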

Signed-off-by: Christoph Hellwig 
---
 Documentation/DMA-API.txt |  4 +-
 drivers/crypto/caam/caamalg.c |  8 +--
 drivers/crypto/caam/caamalg_qi2.c | 13 ++--
 drivers/crypto/caam/caamhash.c|  8 +--
 drivers/crypto/talitos.c  |  4 +-
 drivers/gpu/drm/i915/i915_gem_gtt.c   | 19 +++---
 drivers/net/ethernet/broadcom/bnxt/bnxt.c | 20 +++---
 .../ethernet/cavium/thunder/nicvf_queues.c| 27 
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 11 +---
 drivers/net/ethernet/intel/i40e/i40e_xsk.c|  5 +-
 drivers/net/ethernet/intel/iavf/iavf_txrx.c   | 11 +---
 drivers/net/ethernet/intel/igb/igb_main.c | 11 +---
 drivers/net/ethernet/intel/igc/igc_main.c | 11 +---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 12 +---
 drivers/net/ethernet/intel/ixgbe/ixgbe_xsk.c  |  5 +-
 .../net/ethernet/intel/ixgbevf/ixgbevf_main.c | 11 +---
 .../ethernet/netronome/nfp/nfp_net_common.c   | 12 ++--
 include/linux/dma-mapping.h   | 62 +--
 18 files changed, 109 insertions(+), 145 deletions(-)

diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index ac66ae2509a9..2a9069907470 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -439,10 +439,10 @@ See also dma_map_single().
 
 ::
 
-   dma_addr_t
+   int 
dma_map_single_attrs(struct device *dev, void *cpu_addr, size_t size,
 enum dma_data_direction dir,
-unsigned long attrs)
+unsigned long attrs, dma_addr_t *dma_handle);
 
void
dma_unmap_single_attrs(struct device *dev, dma_addr_t dma_addr,
diff --git a/drivers/crypto/caam/caamalg.c b/drivers/crypto/caam/caamalg.c
index 869f092432de..cbcd55fda24f 100644
--- a/drivers/crypto/caam/caamalg.c
+++ b/drivers/crypto/caam/caamalg.c
@@ -3022,11 +3022,9 @@ static int caam_init_common(struct caam_ctx *ctx, struct 
caam_alg_entry *caam,
else
ctx->dir = DMA_TO_DEVICE;
 
-   dma_addr = dma_map_single_attrs(ctx->jrdev, ctx->sh_desc_enc,
-   offsetof(struct caam_ctx,
-sh_desc_enc_dma),
-   ctx->dir, DMA_ATTR_SKIP_CPU_SYNC);
-   if (dma_mapping_error(ctx->jrdev, dma_addr)) {
+   if (dma_map_single_attrs(ctx->jrdev, ctx->sh_desc_enc,
+   offsetof(struct caam_ctx, sh_desc_enc_dma),
+   ctx->dir, DMA_ATTR_SKIP_CPU_SYNC, &dma_addr)) {
dev_err(ctx->jrdev, "unable to map key, shared descriptors\n");
caam_jr_free(ctx->jrdev);
return -ENOMEM;
diff --git a/drivers/crypto/caam/caamalg_qi2.c 
b/drivers/crypto/caam/caamalg_qi2.c
index 7d8ac0222fa3..9453dca56798 100644
--- a/drivers/crypto/caam/caamalg_qi2.c
+++ b/drivers/crypto/caam/caamalg_qi2.c
@@ -1336,10 +1336,9 @@ static int caam_cra_init(struct caam_ctx *ctx, struct 
caam_alg_entry *caam,
ctx->dev = caam->dev;
ctx->dir = uses_dkp ? DMA_BIDIRECTIONAL : DMA_TO_DEVICE;
 
-   dma_addr = dma_map_single_attrs(ctx->dev, ctx->flc,
-   offsetof(struct caam_ctx, flc_dma),
-   ctx->dir, DMA_ATTR_SKIP_CPU_SYNC);
-   if (dma_mapping_error(ctx->dev, dma_addr)) {
+   if (dma_map_single_attrs(ctx->dev, ctx->flc,
+   offsetof(struct caam_ctx, flc_dma),
+   ctx->dir, DMA_ATTR_SKIP_CPU_SYNC, &dma_addr)) {
dev_err(ctx->dev, "unable to map key, shared descriptors\n");
return -ENOMEM;
}
@@ -4272,10 +4271,8 @@ static int caam_hash_cra_init(struct crypto_tfm *tfm)
 
ctx->dev = caam_hash->dev;
 
-   dma_addr = dma_map_single_attrs(ctx->dev, ctx->flc, sizeof(ctx->flc),
-   DMA_BIDIRECTIONAL,
-   DMA_ATTR_SKIP_CPU_SYNC);
-   if (dma_mapping_error(ctx->dev, dma_addr)) {
+   if (dma_map_single_attrs(ctx->dev, ctx->flc, sizeof(ctx->flc),
+   DMA_BIDIRECTIONAL, DMA_ATTR_SKIP_CPU_SYNC, &dma_addr)) {
dev_err(ctx->dev, "unable to map shared descriptors\n");