[PATCH v5 2/3] dt-bindings: memory: mediatek: Add mt8192 support

2020-11-02 Thread Yong Wu
Add mt8192 smi support in the bindings.

Signed-off-by: Yong Wu 
Reviewed-by: Rob Herring 
---
 .../bindings/memory-controllers/mediatek,smi-common.yaml  | 4 +++-
 .../bindings/memory-controllers/mediatek,smi-larb.yaml| 2 ++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git 
a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.yaml 
b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.yaml
index 56c78317f9b7..a08a32340987 100644
--- 
a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.yaml
+++ 
b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.yaml
@@ -16,7 +16,7 @@ description: |
   MediaTek SMI have two generations of HW architecture, here is the list
   which generation the SoCs use:
   generation 1: mt2701 and mt7623.
-  generation 2: mt2712, mt6779, mt8167, mt8173 and mt8183.
+  generation 2: mt2712, mt6779, mt8167, mt8173, mt8183 and mt8192.
 
   There's slight differences between the two SMI, for generation 2, the
   register which control the iommu port is at each larb's register base. But
@@ -35,6 +35,7 @@ properties:
   - mediatek,mt8167-smi-common
   - mediatek,mt8173-smi-common
   - mediatek,mt8183-smi-common
+  - mediatek,mt8192-smi-common
 
   - description: for mt7623
 items:
@@ -98,6 +99,7 @@ allOf:
   enum:
 - mediatek,mt6779-smi-common
 - mediatek,mt8183-smi-common
+- mediatek,mt8192-smi-common
 
 then:
   properties:
diff --git 
a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.yaml 
b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.yaml
index 06b623b34f48..7ed7839ff0a7 100644
--- 
a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.yaml
+++ 
b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.yaml
@@ -23,6 +23,7 @@ properties:
   - mediatek,mt8167-smi-larb
   - mediatek,mt8173-smi-larb
   - mediatek,mt8183-smi-larb
+  - mediatek,mt8192-smi-larb
 
   - description: for mt7623
 items:
@@ -107,6 +108,7 @@ allOf:
   - mediatek,mt2712-smi-larb
   - mediatek,mt6779-smi-larb
   - mediatek,mt8167-smi-larb
+  - mediatek,mt8192-smi-larb
 
 then:
   required:
-- 
2.18.0
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v5 3/3] memory: mtk-smi: Add mt8192 support

2020-11-02 Thread Yong Wu
Add mt8192 smi support.

Signed-off-by: Yong Wu 
---
 drivers/memory/mtk-smi.c | 19 +++
 1 file changed, 19 insertions(+)

diff --git a/drivers/memory/mtk-smi.c b/drivers/memory/mtk-smi.c
index 691e4c344cf8..ac350f8d1e20 100644
--- a/drivers/memory/mtk-smi.c
+++ b/drivers/memory/mtk-smi.c
@@ -268,6 +268,10 @@ static const struct mtk_smi_larb_gen mtk_smi_larb_mt8183 = 
{
  /* IPU0 | IPU1 | CCU */
 };
 
+static const struct mtk_smi_larb_gen mtk_smi_larb_mt8192 = {
+   .config_port= mtk_smi_larb_config_port_gen2_general,
+};
+
 static const struct of_device_id mtk_smi_larb_of_ids[] = {
{
.compatible = "mediatek,mt8167-smi-larb",
@@ -293,6 +297,10 @@ static const struct of_device_id mtk_smi_larb_of_ids[] = {
.compatible = "mediatek,mt8183-smi-larb",
.data = &mtk_smi_larb_mt8183
},
+   {
+   .compatible = "mediatek,mt8192-smi-larb",
+   .data = &mtk_smi_larb_mt8192
+   },
{}
 };
 
@@ -432,6 +440,13 @@ static const struct mtk_smi_common_plat 
mtk_smi_common_mt8183 = {
F_MMU1_LARB(7),
 };
 
+static const struct mtk_smi_common_plat mtk_smi_common_mt8192 = {
+   .gen  = MTK_SMI_GEN2,
+   .has_gals = true,
+   .bus_sel  = F_MMU1_LARB(1) | F_MMU1_LARB(2) | F_MMU1_LARB(5) |
+   F_MMU1_LARB(6),
+};
+
 static const struct of_device_id mtk_smi_common_of_ids[] = {
{
.compatible = "mediatek,mt8173-smi-common",
@@ -457,6 +472,10 @@ static const struct of_device_id mtk_smi_common_of_ids[] = 
{
.compatible = "mediatek,mt8183-smi-common",
.data = &mtk_smi_common_mt8183,
},
+   {
+   .compatible = "mediatek,mt8192-smi-common",
+   .data = &mtk_smi_common_mt8192,
+   },
{}
 };
 
-- 
2.18.0
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v5 0/3] MT8192 SMI support

2020-11-02 Thread Yong Wu
This patchset mainly adds SMI support for mt8192.

It comes from the patchset[1]. I separated the SMI part into this patchset.
The two patchsets (IOMMU/SMI) don't depend on each other.

Rebase on v5.10-rc1.

changenote:
 v5: Fix complaint from yamllint.

 v4: 
https://lore.kernel.org/linux-mediatek/20201030091254.26382-1-yong...@mediatek.com/T/#meb03b3f4018894bf40c47fece52fe9b386409934
 
add if-then segment in the binding.

 v3: [1]

[1] 
https://lore.kernel.org/linux-iommu/20200930070647.10188-1-yong...@mediatek.com/


Yong Wu (3):
  dt-bindings: memory: mediatek: Convert SMI to DT schema
  dt-bindings: memory: mediatek: Add mt8192 support
  memory: mtk-smi: Add mt8192 support

 .../mediatek,smi-common.txt   |  50 --
 .../mediatek,smi-common.yaml  | 142 ++
 .../memory-controllers/mediatek,smi-larb.txt  |  50 --
 .../memory-controllers/mediatek,smi-larb.yaml | 132 
 drivers/memory/mtk-smi.c  |  19 +++
 5 files changed, 293 insertions(+), 100 deletions(-)
 delete mode 100644 
Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.txt
 create mode 100644 
Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.yaml
 delete mode 100644 
Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.txt
 create mode 100644 
Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.yaml

-- 
2.18.0

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v5 1/3] dt-bindings: memory: mediatek: Convert SMI to DT schema

2020-11-02 Thread Yong Wu
Convert MediaTek SMI to DT schema.

Signed-off-by: Yong Wu 
---
 .../mediatek,smi-common.txt   |  50 ---
 .../mediatek,smi-common.yaml  | 140 ++
 .../memory-controllers/mediatek,smi-larb.txt  |  50 ---
 .../memory-controllers/mediatek,smi-larb.yaml | 130 
 4 files changed, 270 insertions(+), 100 deletions(-)
 delete mode 100644 
Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.txt
 create mode 100644 
Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.yaml
 delete mode 100644 
Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.txt
 create mode 100644 
Documentation/devicetree/bindings/memory-controllers/mediatek,smi-larb.yaml

diff --git 
a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.txt 
b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.txt
deleted file mode 100644
index dbafffe3f41e..
--- 
a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-SMI (Smart Multimedia Interface) Common
-
-The hardware block diagram please check bindings/iommu/mediatek,iommu.txt
-
-Mediatek SMI have two generations of HW architecture, here is the list
-which generation the SoCs use:
-generation 1: mt2701 and mt7623.
-generation 2: mt2712, mt6779, mt8167, mt8173 and mt8183.
-
-There's slight differences between the two SMI, for generation 2, the
-register which control the iommu port is at each larb's register base. But
-for generation 1, the register is at smi ao base(smi always on register
-base). Besides that, the smi async clock should be prepared and enabled for
-SMI generation 1 to transform the smi clock into emi clock domain, but that is
-not needed for SMI generation 2.
-
-Required properties:
-- compatible : must be one of :
-   "mediatek,mt2701-smi-common"
-   "mediatek,mt2712-smi-common"
-   "mediatek,mt6779-smi-common"
-   "mediatek,mt7623-smi-common", "mediatek,mt2701-smi-common"
-   "mediatek,mt8167-smi-common"
-   "mediatek,mt8173-smi-common"
-   "mediatek,mt8183-smi-common"
-- reg : the register and size of the SMI block.
-- power-domains : a phandle to the power domain of this local arbiter.
-- clocks : Must contain an entry for each entry in clock-names.
-- clock-names : must contain 3 entries for generation 1 smi HW and 2 entries
-  for generation 2 smi HW as follows:
-  - "apb" : Advanced Peripheral Bus clock, It's the clock for setting
-   the register.
-  - "smi" : It's the clock for transfer data and command.
-   They may be the same if both source clocks are the same.
-  - "async" : asynchronous clock, it help transform the smi clock into the emi
- clock domain, this clock is only needed by generation 1 smi HW.
-  and these 2 option clocks for generation 2 smi HW:
-  - "gals0": the path0 clock of GALS(Global Async Local Sync).
-  - "gals1": the path1 clock of GALS(Global Async Local Sync).
-  Here is the list which has this GALS: mt6779 and mt8183.
-
-Example:
-   smi_common: smi@14022000 {
-   compatible = "mediatek,mt8173-smi-common";
-   reg = <0 0x14022000 0 0x1000>;
-   power-domains = <&scpsys MT8173_POWER_DOMAIN_MM>;
-   clocks = <&mmsys CLK_MM_SMI_COMMON>,
-<&mmsys CLK_MM_SMI_COMMON>;
-   clock-names = "apb", "smi";
-   };
diff --git 
a/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.yaml 
b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.yaml
new file mode 100644
index ..56c78317f9b7
--- /dev/null
+++ 
b/Documentation/devicetree/bindings/memory-controllers/mediatek,smi-common.yaml
@@ -0,0 +1,140 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+# Copyright (c) 2020 MediaTek Inc.
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/memory-controllers/mediatek,smi-common.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: SMI (Smart Multimedia Interface) Common
+
+maintainers:
+  - Yong Wu 
+
+description: |
+  The hardware block diagram please check bindings/iommu/mediatek,iommu.yaml
+
+  MediaTek SMI have two generations of HW architecture, here is the list
+  which generation the SoCs use:
+  generation 1: mt2701 and mt7623.
+  generation 2: mt2712, mt6779, mt8167, mt8173 and mt8183.
+
+  There's slight differences between the two SMI, for generation 2, the
+  register which control the iommu port is at each larb's register base. But
+  for generation 1, the register is at smi ao base(smi always on register
+  base). Besides that, the smi async clock should be prepared and enabled for
+  SMI generation 1 to transform the smi clock into emi clock domain, but that 
is
+  not needed for SMI generation 2.
+
+properties:
+  compatible:
+oneOf:
+  - enum:
+  - mediatek,mt2701-smi-common
+ 

Re: [PATCH v6 5/5] vfio/type1: Use mdev bus iommu_ops for IOMMU callbacks

2020-11-02 Thread Lu Baolu

Hi Alex,

On 10/31/20 5:06 AM, Alex Williamson wrote:

On Fri, 30 Oct 2020 06:16:28 +
"Tian, Kevin"  wrote:


From: Lu Baolu 
Sent: Friday, October 30, 2020 12:58 PM

With the IOMMU driver registering iommu_ops for the mdev_bus, the
IOMMU
operations on an mdev could be done in the same way as any normal device
(for example, PCI/PCIe). There's no need to distinguish an mdev from
others for iommu operations. Remove the unnecessary code.


This is really a nice cleanup as the output of this change! :)


It's easy to remove a bunch of code when the result is breaking
everyone else.  Please share with me how SR-IOV backed mdevs continue
to work on AMD platforms, or how they might work on ARM platforms, when
siov_iommu_ops (VT-d only) becomes the one and only provider of
iommu_ops on the mdev bus.  Hard NAK on this series.  Thanks,


I focused too much on a feature and forgot about universality. I should
apologize for this. Sorry about it!

Back to the original intention of this series. The aux domain was
allocated in vfio/mdev, but it's also needed by the vDCM component of a
device driver for mediated callbacks. Currently vfio/mdev or iommu core
has no support for this.

We had a proposal when we first did aux-domain support, but it was not
discussed since there was no consumer at that time.

https://lore.kernel.org/linux-iommu/20181105073408.21815-7-baolu...@linux.intel.com/

Does it look good to you? I can send patches for such a solution for
discussion if you think it's the right way.

Extending the iommu core for subdevice passthrough support sounds like an
interesting topic, but it will take much time before we reach a
consensus. It sounds like a good topic for next year's LPC/MC :-).

Best regards,
baolu
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Some questions about arm_lpae_install_table

2020-11-02 Thread Kunkun Jiang

Hi Robin,

Recently, I have read and studied the code in io-pgtable-arm.c. I have
two questions about arm_lpae_install_table.

1、the first


static arm_lpae_iopte arm_lpae_install_table(arm_lpae_iopte *table,
 arm_lpae_iopte *ptep,
 arm_lpae_iopte curr,
 struct io_pgtable_cfg *cfg)
{
    arm_lpae_iopte old, new;

    new = __pa(table) | ARM_LPAE_PTE_TYPE_TABLE;
    if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS)
    new |= ARM_LPAE_PTE_NSTABLE;

   /*
 * Ensure the table itself is visible before its PTE can be.
 * Whilst we could get away with cmpxchg64_release below, this
 * doesn't have any ordering semantics when !CONFIG_SMP.
 */
    dma_wmb();

    old = cmpxchg64_relaxed(ptep, curr, new);

    if (cfg->coherent_walk || (old & ARM_LPAE_PTE_SW_SYNC))
    return old;

    /* Even if it's not ours, there's no point waiting; just kick 
it */

    __arm_lpae_sync_pte(ptep, cfg);
    if (old == curr)
    WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC);

    return old;
}


If another thread changes the ptep between cmpxchg64_relaxed and
WRITE_ONCE(*ptep, new | ARM_LPAE_PTE_SW_SYNC), the operation
WRITE_ONCE will overwrite the change.

2、the second


for (i = 0; i < tablesz / sizeof(pte); i++, blk_paddr += split_sz) {
    /* Unmap! */
    if (i == unmap_idx)
continue;

    __arm_lpae_init_pte(data, blk_paddr, pte, lvl, 
&tablep[i]);

}

pte = arm_lpae_install_table(tablep, ptep, blk_pte, cfg);


When altering a translation table descriptor, including splitting a block into
constituent granules, the Armv8-A and SMMUv3 architectures require
a break-before-make procedure. But the function arm_lpae_split_blk_unmap
changes a block descriptor to an equivalent span of page translations
directly. Is it appropriate to do so?

I may be misunderstanding something in the two questions above. Could you please
give me more hints?

Thanks,
Kunkun Jiang






___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH v4 0/7] Convert the intel iommu driver to the dma-iommu api

2020-11-02 Thread Lu Baolu

On 11/2/20 7:52 PM, Tvrtko Ursulin wrote:


On 02/11/2020 02:00, Lu Baolu wrote:

Hi Tvrtko,
On 10/12/20 4:44 PM, Tvrtko Ursulin wrote:


On 29/09/2020 01:11, Lu Baolu wrote:

Hi Tvrtko,

On 9/28/20 5:44 PM, Tvrtko Ursulin wrote:


On 27/09/2020 07:34, Lu Baolu wrote:

Hi,

The previous post of this series could be found here.

https://lore.kernel.org/linux-iommu/20200912032200.11489-1-baolu...@linux.intel.com/ 



This version introduces a new patch [4/7] to fix an issue reported
here.


https://lore.kernel.org/linux-iommu/51a1baec-48d1-c0ac-181b-1fba92aa4...@linux.intel.com/ 



There aren't any other changes.

Please help to test and review.

Best regards,
baolu

Lu Baolu (3):
   iommu: Add quirk for Intel graphic devices in map_sg


Since I do have patches to fix i915 to handle this, do we want to 
co-ordinate the two and avoid having to add this quirk and then 
later remove it? Or you want to go the staged approach?


I have no preference. It depends on which patch goes first. Let the
maintainers help here.


FYI we have merged the required i915 patches to our tree last week or 
so. I *think* this means they will go into 5.11. So the i915 specific 
workaround patch will not be needed in Intel IOMMU.


Do you mind telling me what's the status of this fix patch? I tried this
series on v5.10-rc1 with the graphic quirk patch dropped. I am still
seeing dma faults from graphic device.


Hmm back then I thought i915 fixes for this would land in 5.11 so I will 
stick with that. :) (See my quoted text a paragraph above yours.)


What size are those fixes? I am considering pushing this series for
v5.11. Is it possible to get some acks for those patches and let them
go to Linus through iommu tree?

Best regards,
baolu
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH v3 01/14] docs: Document IO Address Space ID (IOASID) APIs

2020-11-02 Thread Jacob Pan
Hi Jean-Philippe,

On Fri, 30 Oct 2020 11:18:27 +0100, Jean-Philippe Brucker
 wrote:

> On Mon, Oct 26, 2020 at 02:05:06PM -0700, Jacob Pan wrote:
> > > This looks good to me, with small comments below.
> > >   
> > Can I add your Reviewed-by tag after addressing the comments?  
> 
> Yes sure, this took forever to review so I'm happy not to do another
> pass :)
> 
I am afraid I have to ask for another round of reviews since it was
suggested to keep IOASID allocation interface independent, instead of being
part of VFIO UAPI. Yi and I are working out the details to come up with a
PoC. As you might be aware, the need for this independent interface is that
we may have multiple users of PASID, e.g VDPA, user space drivers, etc.
The IOASID user interface also has slight impact on the IOASID core code,
which is why I am slow in response to your code review. Will incorporate
your review in the next round with support of independent user API.
Much appreciated!

> 
> > > > +Each IOASID set is created with a token, which can be one of the
> > > > +following token types:
> > > > +
> > > > + - IOASID_SET_TYPE_NULL (Arbitrary u64 value)
> > > 
> > > Maybe NULL isn't the best name then. NONE?
> > >   
> > Agreed, 'NONE' makes more sense.  
> 
> Although patch 5 only allows a NULL token for this type. So the name seems
> fine, you could just fix this description.
> 
OK.

> 
> > > > +IOASID core has the notion of "custom allocator" such that guest
> > > > can +register virtual command allocator that precedes the default
> > > > one.
> > > 
> > > "Supersedes", rather than "precedes"?
> > >   
> > My understanding is that 'supersede' means replace something but
> > 'precede' means get in front of something. I do want to emphasize that
> > the custom allocator takes precedence over the default allocator.  
> 
> Right it's ambiguous. The custom allocator does entirely replace the
> allocation action, but the default one is still used for storage. Anyway,
> you can leave this.
> 
OK

> 
> > > > +Let's examine the IOASID life cycle again when free happens
> > > > *before* +unbind. This could be a result of misbehaving guests or
> > > > crash. Assuming +VFIO cannot enforce unbind->free order. Notice
> > > > that the setup part up +until step #12 is identical to the normal
> > > > case, the flow below starts +with step 13.
> > > > +
> > > > +::
> > > > +
> > > > + VFIOIOMMUKVMVDCMIOASID
> > > > Ref
> > > > +   ..
> > > > +   13  GUEST STARTS DMA --
> > > > +   14  *GUEST MISBEHAVES!!!* 
> > > > +   15 ioasid_free()
> > > > +   16
> > > > ioasid_notify(FREE)
> > > > +   17 mark_free_pending
> > > > (1)
> > > 
> > > Could we use superscript ¹²³⁴ for footnotes? These look like function
> > > parameters
> > >   
> > yes, much better
> >   
> > > > +   18  kvm_nb_handler(FREE)
> > > > +   19  vmcs_update_atomic()
> > > > +   20  ioasid_put_locked()   ->   3
> > > > +   21   vdcm_nb_handler(FREE)
> > > > +   22iomm_nb_handler(FREE)
> > > 
> > > iommu_nb_handler
> > >   
> > got it
> >   
> > > > +   23 ioasid_free() returns(2)  schedule_work()
> > > > 2
> > > 
> > > I completely lost track here, couldn't figure out in which direction
> > > to read the diagram. What work is scheduled?  
> > The time line goes downward but we only control the notification order
> > in terms of when the events are received. Some completions are async
> > thus out of order done by work items. The only in-order completion is
> > the KVM update of its PASID translation table.
> > 
> > After #23, the async works are scheduled to complete clean up work
> > outside the spinlock(held by the caller of the atomic notifier).
> > 
> > Any suggestions to improve the readability of the time line?  
> 
> Maybe explain what happens from line 23: ioasid_free() schedules... a FREE
> notification? Which happens on line 24 (corresponding to the second
> schedule_work()?) and is handled by (a) VDCM to clear the device context
> and (b) IOMMU to clear the PASID context, both ending up dropping their
> ref.
> 
Got it, I will add that.

> >   
> > > Why does the IOMMU driver drop
> > > its reference to the IOASID before unbind_gpasid()?
> > >   
> > This is the exception case where userspace issues IOASID free before
> > unbind_gpasid(). The equivalent of unbind is performed in the
> > IOASID_FREE notification handler. In IOASID_FREE handler, reference is
> > dropped and private data deleted. After that, if unbind comes to IOMMU
> > driver, it will not find IOASID private data therefore just return.  
> 
> Right ok. As you noted below the damage is caused by and limited to the
> guest, so I think it's fine.
> 
OK.

> >   
> > > > +  

Re: [PATCH v5 3/7] of/address: Introduce of_dma_get_max_cpu_address()

2020-11-02 Thread Rob Herring
On Thu, Oct 29, 2020 at 12:26 PM Nicolas Saenz Julienne
 wrote:
>
> Introduce of_dma_get_max_cpu_address(), which provides the highest CPU
> physical address addressable by all DMA masters in the system. It's
> specially useful for setting memory zones sizes at early boot time.
>
> Signed-off-by: Nicolas Saenz Julienne 
>
> ---
>
> Changes since v4:
>  - Return max address, not address limit (one-off difference)
>
> Changes since v3:
>  - use u64 with cpu_end
>
> Changes since v2:
>  - Use PHYS_ADDR_MAX
>  - return phys_dma_t
>  - Rename function
>  - Correct subject
>  - Add support to start parsing from an arbitrary device node in order
>for the function to work with unit tests
>
>  drivers/of/address.c | 42 ++
>  include/linux/of.h   |  7 +++
>  2 files changed, 49 insertions(+)

Reviewed-by: Rob Herring 
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v18 3/4] dt-bindings: arm-smmu: Add compatible string for Adreno GPU SMMU

2020-11-02 Thread Robin Murphy

On 2020-11-02 18:22, Robin Murphy wrote:

On 2020-11-02 17:14, Jordan Crouse wrote:

Every Qcom Adreno GPU has an embedded SMMU for its own use. These
devices depend on unique features such as split pagetables,
different stall/halt requirements and other settings. Identify them
with a compatible string so that they can be identified in the
arm-smmu implementation specific code.

Signed-off-by: Jordan Crouse 
Reviewed-by: Rob Herring 
Signed-off-by: Rob Clark 
Reviewed-by: Bjorn Andersson 
---

  Documentation/devicetree/bindings/iommu/arm,smmu.yaml | 9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml 
b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml

index 503160a7b9a0..3b63f2ae24db 100644
--- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
+++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
@@ -28,8 +28,6 @@ properties:
    - enum:
    - qcom,msm8996-smmu-v2
    - qcom,msm8998-smmu-v2
-  - qcom,sc7180-smmu-v2
-  - qcom,sdm845-smmu-v2


What about the "Apps SMMU" instances? Those are distinct and don't 
have/need the GPU special behaviour, right?


Oh, having looked at patch #4, which prompted me to go and look at the 845 
DTSI in context, now I realise the subtlety I overlooked. So I guess it 
really was worth resending, ha! Sorry for being thick :)


Reviewed-by: Robin Murphy 



Robin.


    - const: qcom,smmu-v2
    - description: Qcom SoCs implementing "arm,mmu-500"
@@ -40,6 +38,13 @@ properties:
    - qcom,sm8150-smmu-500
    - qcom,sm8250-smmu-500
    - const: arm,mmu-500
+  - description: Qcom Adreno GPUs implementing "arm,smmu-v2"
+    items:
+  - enum:
+  - qcom,sc7180-smmu-v2
+  - qcom,sdm845-smmu-v2
+  - const: qcom,adreno-smmu
+  - const: qcom,smmu-v2
    - description: Marvell SoCs implementing "arm,mmu-500"
  items:
    - const: marvell,ap806-smmu-500


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [PATCH v18 3/4] dt-bindings: arm-smmu: Add compatible string for Adreno GPU SMMU

2020-11-02 Thread Robin Murphy

On 2020-11-02 17:14, Jordan Crouse wrote:

Every Qcom Adreno GPU has an embedded SMMU for its own use. These
devices depend on unique features such as split pagetables,
different stall/halt requirements and other settings. Identify them
with a compatible string so that they can be identified in the
arm-smmu implementation specific code.

Signed-off-by: Jordan Crouse 
Reviewed-by: Rob Herring 
Signed-off-by: Rob Clark 
Reviewed-by: Bjorn Andersson 
---

  Documentation/devicetree/bindings/iommu/arm,smmu.yaml | 9 +++--
  1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml 
b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
index 503160a7b9a0..3b63f2ae24db 100644
--- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
+++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
@@ -28,8 +28,6 @@ properties:
- enum:
- qcom,msm8996-smmu-v2
- qcom,msm8998-smmu-v2
-  - qcom,sc7180-smmu-v2
-  - qcom,sdm845-smmu-v2


What about the "Apps SMMU" instances? Those are distinct and don't 
have/need the GPU special behaviour, right?


Robin.


- const: qcom,smmu-v2
  
- description: Qcom SoCs implementing "arm,mmu-500"

@@ -40,6 +38,13 @@ properties:
- qcom,sm8150-smmu-500
- qcom,sm8250-smmu-500
- const: arm,mmu-500
+  - description: Qcom Adreno GPUs implementing "arm,smmu-v2"
+items:
+  - enum:
+  - qcom,sc7180-smmu-v2
+  - qcom,sdm845-smmu-v2
+  - const: qcom,adreno-smmu
+  - const: qcom,smmu-v2
- description: Marvell SoCs implementing "arm,mmu-500"
  items:
- const: marvell,ap806-smmu-500


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v18 2/4] iommu/arm-smmu: Add a way for implementations to influence SCTLR

2020-11-02 Thread Robin Murphy

On 2020-11-02 17:14, Jordan Crouse wrote:

From: Rob Clark 

For the Adreno GPU's SMMU, we want SCTLR.HUPCF set to ensure that
pending translations are not terminated on iova fault.  Otherwise
a terminated CP read could hang the GPU by returning invalid
command-stream data.

Signed-off-by: Rob Clark 
Reviewed-by: Bjorn Andersson 
Signed-off-by: Jordan Crouse 
---

  drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 6 ++
  drivers/iommu/arm/arm-smmu/arm-smmu.c  | 3 +++
  drivers/iommu/arm/arm-smmu/arm-smmu.h  | 3 +++
  3 files changed, 12 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 1e942eed2dfc..0663d7d26908 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -129,6 +129,12 @@ static int qcom_adreno_smmu_init_context(struct 
arm_smmu_domain *smmu_domain,
(smmu_domain->cfg.fmt == ARM_SMMU_CTX_FMT_AARCH64))
pgtbl_cfg->quirks |= IO_PGTABLE_QUIRK_ARM_TTBR1;
  
+	/*

+* On the GPU device we want to process subsequent transactions after a
+* fault to keep the GPU from hanging
+*/
+   smmu_domain->cfg.sctlr_set |= ARM_SMMU_SCTLR_HUPCF;
+
/*
 * Initialize private interface with GPU:
 */
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index dad7fa86fbd4..1f06ab219819 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -617,6 +617,9 @@ void arm_smmu_write_context_bank(struct arm_smmu_device 
*smmu, int idx)
if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
reg |= ARM_SMMU_SCTLR_E;
  
+	reg |= cfg->sctlr_set;

+   reg &= ~cfg->sctlr_clr;


Since we now have a write_s2cr hook, I'm inclined to think that the 
consistency of a write_sctlr hook that could similarly apply its own 
arbitrary tweaks would make sense for this. Does anyone have any strong 
opinions?


Robin.


+
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, reg);
  }
  
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h b/drivers/iommu/arm/arm-smmu/arm-smmu.h

index 6c5ffeae..ddf2ca4c923d 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -144,6 +144,7 @@ enum arm_smmu_cbar_type {
  #define ARM_SMMU_CB_SCTLR 0x0
  #define ARM_SMMU_SCTLR_S1_ASIDPNE BIT(12)
  #define ARM_SMMU_SCTLR_CFCFG  BIT(7)
+#define ARM_SMMU_SCTLR_HUPCF   BIT(8)
  #define ARM_SMMU_SCTLR_CFIE   BIT(6)
  #define ARM_SMMU_SCTLR_CFRE   BIT(5)
  #define ARM_SMMU_SCTLR_E  BIT(4)
@@ -341,6 +342,8 @@ struct arm_smmu_cfg {
u16 asid;
u16 vmid;
};
+   u32 sctlr_set;/* extra bits to set in 
SCTLR */
+   u32 sctlr_clr;/* bits to mask in SCTLR 
*/
enum arm_smmu_cbar_type cbar;
enum arm_smmu_context_fmt   fmt;
  };


___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v18 1/4] iommu/arm-smmu-qcom: Add implementation for the adreno GPU SMMU

2020-11-02 Thread Robin Murphy

On 2020-11-02 17:14, Jordan Crouse wrote:

Add a special implementation for the SMMU attached to most Adreno GPU
targets, triggered by the qcom,adreno-smmu compatible string.

The new Adreno SMMU implementation will enable split pagetables
(TTBR1) for the domain attached to the GPU device (SID 0) and
hard-code it to context bank 0 so the GPU hardware can implement
per-instance pagetables.

Co-developed-by: Rob Clark 
Signed-off-by: Jordan Crouse 
Signed-off-by: Rob Clark 
Reviewed-by: Bjorn Andersson 
---

  drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |   3 +
  drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 151 -
  drivers/iommu/arm/arm-smmu/arm-smmu.h  |   1 +
  3 files changed, 153 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 88f17cc33023..d199b4bff15d 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -223,6 +223,9 @@ struct arm_smmu_device *arm_smmu_impl_init(struct 
arm_smmu_device *smmu)
of_device_is_compatible(np, "qcom,sm8250-smmu-500"))
return qcom_smmu_impl_init(smmu);
  
+	if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu"))

+   return qcom_adreno_smmu_impl_init(smmu);
+
if (of_device_is_compatible(np, "marvell,ap806-smmu-500"))
smmu->impl = &mrvl_mmu500_impl;
  
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c

index be4318044f96..1e942eed2dfc 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -3,6 +3,7 @@
   * Copyright (c) 2019, The Linux Foundation. All rights reserved.
   */
  
+#include 

  #include 
  #include 
  
@@ -12,6 +13,134 @@ struct qcom_smmu {

struct arm_smmu_device smmu;
  };
  
+#define QCOM_ADRENO_SMMU_GPU_SID 0

+
+static bool qcom_adreno_smmu_is_gpu_device(struct device *dev)
+{
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+   int i;
+
+   /*
+* The GPU will always use SID 0 so that is a handy way to uniquely
+* identify it and configure it for per-instance pagetables
+*/
+   for (i = 0; i < fwspec->num_ids; i++) {
+   u16 sid = FIELD_GET(ARM_SMMU_SMR_ID, fwspec->ids[i]);
+
+   if (sid == QCOM_ADRENO_SMMU_GPU_SID)
+   return true;
+   }
+
+   return false;
+}
+
+static const struct io_pgtable_cfg *qcom_adreno_smmu_get_ttbr1_cfg(
+   const void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = (void *)cookie;
+   struct io_pgtable *pgtable =
+   io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops);
+   return &pgtable->cfg;
+}
+
+/*
+ * Local implementation to configure TTBR0 with the specified pagetable config.
+ * The GPU driver will call this to enable TTBR0 when per-instance pagetables
+ * are active
+ */
+
+static int qcom_adreno_smmu_set_ttbr0_cfg(const void *cookie,
+   const struct io_pgtable_cfg *pgtbl_cfg)
+{
+   struct arm_smmu_domain *smmu_domain = (void *)cookie;
+   struct io_pgtable *pgtable = 
io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops);
+   struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+   struct arm_smmu_cb *cb = &smmu_domain->smmu->cbs[cfg->cbndx];
+
+   /* The domain must have split pagetables already enabled */
+   if (cb->tcr[0] & ARM_SMMU_TCR_EPD1)
+   return -EINVAL;
+
+   /* If the pagetable config is NULL, disable TTBR0 */
+   if (!pgtbl_cfg) {
+   /* Do nothing if it is already disabled */
+   if ((cb->tcr[0] & ARM_SMMU_TCR_EPD0))
+   return -EINVAL;
+
+   /* Set TCR to the original configuration */
+   cb->tcr[0] = arm_smmu_lpae_tcr(&pgtable->cfg);
+   cb->ttbr[0] = FIELD_PREP(ARM_SMMU_TTBRn_ASID, cb->cfg->asid);
+   } else {
+   u32 tcr = cb->tcr[0];
+
+   /* Don't call this again if TTBR0 is already enabled */
+   if (!(cb->tcr[0] & ARM_SMMU_TCR_EPD0))
+   return -EINVAL;
+
+   tcr |= arm_smmu_lpae_tcr(pgtbl_cfg);
+   tcr &= ~(ARM_SMMU_TCR_EPD0 | ARM_SMMU_TCR_EPD1);
+
+   cb->tcr[0] = tcr;
+   cb->ttbr[0] = pgtbl_cfg->arm_lpae_s1_cfg.ttbr;
+   cb->ttbr[0] |= FIELD_PREP(ARM_SMMU_TTBRn_ASID, cb->cfg->asid);
+   }
+
+   arm_smmu_write_context_bank(smmu_domain->smmu, cb->cfg->cbndx);
+
+   return 0;
+}
+
+static int qcom_adreno_smmu_alloc_context_bank(struct arm_smmu_domain 
*smmu_domain,
+  struct arm_smmu_device *smmu,
+  struct device *dev, int start)
+{
+   int count;
+
+   /*
+* Assign context bank 0 to the GPU device so the GPU hardware can
+* switch pagetables

Re: [PATCH v18 0/4] iommu/arm-smmu: Add adreno-smmu implementation and bindings

2020-11-02 Thread Jordan Crouse
On Mon, Nov 02, 2020 at 10:08:23AM -0700, Jordan Crouse wrote:
> On Thu, Oct 29, 2020 at 05:26:08PM +, Will Deacon wrote:
> > On Tue, Oct 27, 2020 at 04:34:04PM -0600, Jordan Crouse wrote:
> > > This short series adds support for the adreno-smmu implementation of the
> > > arm-smmu driver and the device-tree bindings to turn on the implementation
> > > for the sm845 and sc7180 GPUs. These changes are the last ones needed to 
> > > enable
> > > per-instance pagetables in the drm/msm driver.
> > > 
> > > No deltas in this patchset since the last go-around for 5.10 [1].
> > > 
> > > [1] https://patchwork.freedesktop.org/series/81393/
> > > 
> > > Jordan Crouse (3):
> > >   iommu/arm-smmu-qcom: Add implementation for the adreno GPU SMMU
> > >   dt-bindings: arm-smmu: Add compatible string for Adreno GPU SMMU
> > >   arm: dts: qcom: sm845: Set the compatible string for the GPU SMMU
> > > 
> > > Rob Clark (1):
> > >   iommu/arm-smmu: Add a way for implementations to influence SCTLR
> > 
> > FYI: this patch (patch 4/4) doesn't seem to have made it anywhere (I don't
> > have it, and neither does the archive).
> > 
> > Will
> 
> Patch 4/4 was the bindings for sdm845 and I didn't explicitly add IOMMU to the
> CC list and so patman did what patman does.
> 
> I'll resend.

Stack re-sent with you and Robin and the list on the CC for the bindings. I
expect that Bjorn can pick up the bindings patches once the adreno-smmu patch is
accepted but it is good for everybody to get the full picture.

Jordan

-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v18 0/4] iommu/arm-smmu: Add adreno-smmu implementation and bindings

2020-11-02 Thread Jordan Crouse
(resend with expanded CC list so everybody can see all the patches)

This short series adds support for the adreno-smmu implementation of the
arm-smmu driver and the device-tree bindings to turn on the implementation
for the sm845 and sc7180 GPUs. These changes are the last ones needed to enable
per-instance pagetables in the drm/msm driver.

No deltas in this patchset since the last go-around for 5.10 [1].

[1] https://patchwork.freedesktop.org/series/81393/

Jordan Crouse (3):
  iommu/arm-smmu-qcom: Add implementation for the adreno GPU SMMU
  dt-bindings: arm-smmu: Add compatible string for Adreno GPU SMMU
  arm: dts: qcom: sm845: Set the compatible string for the GPU SMMU

Rob Clark (1):
  iommu/arm-smmu: Add a way for implementations to influence SCTLR

 .../devicetree/bindings/iommu/arm,smmu.yaml   |   9 +-
 arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi|   9 +
 arch/arm64/boot/dts/qcom/sdm845.dtsi  |   2 +-
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c|   3 +
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c| 157 +-
 drivers/iommu/arm/arm-smmu/arm-smmu.c |   3 +
 drivers/iommu/arm/arm-smmu/arm-smmu.h |   4 +
 7 files changed, 182 insertions(+), 5 deletions(-)

-- 
2.25.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v18 3/4] dt-bindings: arm-smmu: Add compatible string for Adreno GPU SMMU

2020-11-02 Thread Jordan Crouse
Every Qcom Adreno GPU has an embedded SMMU for its own use. These
devices depend on unique features such as split pagetables,
different stall/halt requirements and other settings. Identify them
with a compatible string so that they can be identified in the
arm-smmu implementation specific code.

Signed-off-by: Jordan Crouse 
Reviewed-by: Rob Herring 
Signed-off-by: Rob Clark 
Reviewed-by: Bjorn Andersson 
---

 Documentation/devicetree/bindings/iommu/arm,smmu.yaml | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml 
b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
index 503160a7b9a0..3b63f2ae24db 100644
--- a/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
+++ b/Documentation/devicetree/bindings/iommu/arm,smmu.yaml
@@ -28,8 +28,6 @@ properties:
   - enum:
   - qcom,msm8996-smmu-v2
   - qcom,msm8998-smmu-v2
-  - qcom,sc7180-smmu-v2
-  - qcom,sdm845-smmu-v2
   - const: qcom,smmu-v2
 
   - description: Qcom SoCs implementing "arm,mmu-500"
@@ -40,6 +38,13 @@ properties:
   - qcom,sm8150-smmu-500
   - qcom,sm8250-smmu-500
   - const: arm,mmu-500
+  - description: Qcom Adreno GPUs implementing "arm,smmu-v2"
+items:
+  - enum:
+  - qcom,sc7180-smmu-v2
+  - qcom,sdm845-smmu-v2
+  - const: qcom,adreno-smmu
+  - const: qcom,smmu-v2
   - description: Marvell SoCs implementing "arm,mmu-500"
 items:
   - const: marvell,ap806-smmu-500
-- 
2.25.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v18 4/4] arm: dts: qcom: sm845: Set the compatible string for the GPU SMMU

2020-11-02 Thread Jordan Crouse
Set the qcom,adreno-smmu compatible string for the GPU SMMU to enable
split pagetables and per-instance pagetables for drm/msm.

Signed-off-by: Jordan Crouse 
Signed-off-by: Rob Clark 
Reviewed-by: Bjorn Andersson 
---

 arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi | 9 +
 arch/arm64/boot/dts/qcom/sdm845.dtsi   | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi 
b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi
index 64fc1bfd66fa..39f23cdcbd02 100644
--- a/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi
+++ b/arch/arm64/boot/dts/qcom/sdm845-cheza.dtsi
@@ -633,6 +633,15 @@ &mdss_mdp {
status = "okay";
 };
 
+/*
+ * Cheza fw does not properly program the GPU aperture to allow the
+ * GPU to update the SMMU pagetables for context switches.  Work
+ * around this by dropping the "qcom,adreno-smmu" compat string.
+ */
+&adreno_smmu {
+   compatible = "qcom,sdm845-smmu-v2", "qcom,smmu-v2";
+};
+
 &mss_pil {
iommus = <&apps_smmu 0x781 0x0>,
 <&apps_smmu 0x724 0x3>;
diff --git a/arch/arm64/boot/dts/qcom/sdm845.dtsi 
b/arch/arm64/boot/dts/qcom/sdm845.dtsi
index 40e8c11f23ab..0508e86140bd 100644
--- a/arch/arm64/boot/dts/qcom/sdm845.dtsi
+++ b/arch/arm64/boot/dts/qcom/sdm845.dtsi
@@ -4103,7 +4103,7 @@ opp-25700 {
};
 
adreno_smmu: iommu@504 {
-   compatible = "qcom,sdm845-smmu-v2", "qcom,smmu-v2";
+   compatible = "qcom,sdm845-smmu-v2", "qcom,adreno-smmu", 
"qcom,smmu-v2";
reg = <0 0x504 0 0x1>;
#iommu-cells = <1>;
#global-interrupts = <2>;
-- 
2.25.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v18 1/4] iommu/arm-smmu-qcom: Add implementation for the adreno GPU SMMU

2020-11-02 Thread Jordan Crouse
Add a special implementation for the SMMU attached to most Adreno GPU
targets, triggered by the qcom,adreno-smmu compatible string.

The new Adreno SMMU implementation will enable split pagetables
(TTBR1) for the domain attached to the GPU device (SID 0) and
hard code it to context bank 0 so the GPU hardware can implement
per-instance pagetables.

Co-developed-by: Rob Clark 
Signed-off-by: Jordan Crouse 
Signed-off-by: Rob Clark 
Reviewed-by: Bjorn Andersson 
---

 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |   3 +
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 151 -
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |   1 +
 3 files changed, 153 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 88f17cc33023..d199b4bff15d 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -223,6 +223,9 @@ struct arm_smmu_device *arm_smmu_impl_init(struct 
arm_smmu_device *smmu)
of_device_is_compatible(np, "qcom,sm8250-smmu-500"))
return qcom_smmu_impl_init(smmu);
 
+   if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu"))
+   return qcom_adreno_smmu_impl_init(smmu);
+
if (of_device_is_compatible(np, "marvell,ap806-smmu-500"))
smmu->impl = &mrvl_mmu500_impl;
 
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index be4318044f96..1e942eed2dfc 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -3,6 +3,7 @@
  * Copyright (c) 2019, The Linux Foundation. All rights reserved.
  */
 
+#include 
 #include 
 #include 
 
@@ -12,6 +13,134 @@ struct qcom_smmu {
struct arm_smmu_device smmu;
 };
 
+#define QCOM_ADRENO_SMMU_GPU_SID 0
+
+static bool qcom_adreno_smmu_is_gpu_device(struct device *dev)
+{
+   struct iommu_fwspec *fwspec = dev_iommu_fwspec_get(dev);
+   int i;
+
+   /*
+* The GPU will always use SID 0 so that is a handy way to uniquely
+* identify it and configure it for per-instance pagetables
+*/
+   for (i = 0; i < fwspec->num_ids; i++) {
+   u16 sid = FIELD_GET(ARM_SMMU_SMR_ID, fwspec->ids[i]);
+
+   if (sid == QCOM_ADRENO_SMMU_GPU_SID)
+   return true;
+   }
+
+   return false;
+}
+
+static const struct io_pgtable_cfg *qcom_adreno_smmu_get_ttbr1_cfg(
+   const void *cookie)
+{
+   struct arm_smmu_domain *smmu_domain = (void *)cookie;
+   struct io_pgtable *pgtable =
+   io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops);
+   return &pgtable->cfg;
+}
+
+/*
+ * Local implementation to configure TTBR0 with the specified pagetable config.
+ * The GPU driver will call this to enable TTBR0 when per-instance pagetables
+ * are active
+ */
+
+static int qcom_adreno_smmu_set_ttbr0_cfg(const void *cookie,
+   const struct io_pgtable_cfg *pgtbl_cfg)
+{
+   struct arm_smmu_domain *smmu_domain = (void *)cookie;
+   struct io_pgtable *pgtable = 
io_pgtable_ops_to_pgtable(smmu_domain->pgtbl_ops);
+   struct arm_smmu_cfg *cfg = &smmu_domain->cfg;
+   struct arm_smmu_cb *cb = &smmu_domain->smmu->cbs[cfg->cbndx];
+
+   /* The domain must have split pagetables already enabled */
+   if (cb->tcr[0] & ARM_SMMU_TCR_EPD1)
+   return -EINVAL;
+
+   /* If the pagetable config is NULL, disable TTBR0 */
+   if (!pgtbl_cfg) {
+   /* Do nothing if it is already disabled */
+   if ((cb->tcr[0] & ARM_SMMU_TCR_EPD0))
+   return -EINVAL;
+
+   /* Set TCR to the original configuration */
+   cb->tcr[0] = arm_smmu_lpae_tcr(&pgtable->cfg);
+   cb->ttbr[0] = FIELD_PREP(ARM_SMMU_TTBRn_ASID, cb->cfg->asid);
+   } else {
+   u32 tcr = cb->tcr[0];
+
+   /* Don't call this again if TTBR0 is already enabled */
+   if (!(cb->tcr[0] & ARM_SMMU_TCR_EPD0))
+   return -EINVAL;
+
+   tcr |= arm_smmu_lpae_tcr(pgtbl_cfg);
+   tcr &= ~(ARM_SMMU_TCR_EPD0 | ARM_SMMU_TCR_EPD1);
+
+   cb->tcr[0] = tcr;
+   cb->ttbr[0] = pgtbl_cfg->arm_lpae_s1_cfg.ttbr;
+   cb->ttbr[0] |= FIELD_PREP(ARM_SMMU_TTBRn_ASID, cb->cfg->asid);
+   }
+
+   arm_smmu_write_context_bank(smmu_domain->smmu, cb->cfg->cbndx);
+
+   return 0;
+}
+
+static int qcom_adreno_smmu_alloc_context_bank(struct arm_smmu_domain 
*smmu_domain,
+  struct arm_smmu_device *smmu,
+  struct device *dev, int start)
+{
+   int count;
+
+   /*
+* Assign context bank 0 to the GPU device so the GPU hardware can
+* switch pagetables
+*/
+   if (qcom_adreno_smmu_is_gpu_device(d

[PATCH v18 2/4] iommu/arm-smmu: Add a way for implementations to influence SCTLR

2020-11-02 Thread Jordan Crouse
From: Rob Clark 

For the Adreno GPU's SMMU, we want SCTLR.HUPCF set to ensure that
pending translations are not terminated on iova fault.  Otherwise
a terminated CP read could hang the GPU by returning invalid
command-stream data.

Signed-off-by: Rob Clark 
Reviewed-by: Bjorn Andersson 
Signed-off-by: Jordan Crouse 
---

 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 6 ++
 drivers/iommu/arm/arm-smmu/arm-smmu.c  | 3 +++
 drivers/iommu/arm/arm-smmu/arm-smmu.h  | 3 +++
 3 files changed, 12 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 1e942eed2dfc..0663d7d26908 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -129,6 +129,12 @@ static int qcom_adreno_smmu_init_context(struct 
arm_smmu_domain *smmu_domain,
(smmu_domain->cfg.fmt == ARM_SMMU_CTX_FMT_AARCH64))
pgtbl_cfg->quirks |= IO_PGTABLE_QUIRK_ARM_TTBR1;
 
+   /*
+* On the GPU device we want to process subsequent transactions after a
+* fault to keep the GPU from hanging
+*/
+   smmu_domain->cfg.sctlr_set |= ARM_SMMU_SCTLR_HUPCF;
+
/*
 * Initialize private interface with GPU:
 */
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index dad7fa86fbd4..1f06ab219819 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -617,6 +617,9 @@ void arm_smmu_write_context_bank(struct arm_smmu_device 
*smmu, int idx)
if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN))
reg |= ARM_SMMU_SCTLR_E;
 
+   reg |= cfg->sctlr_set;
+   reg &= ~cfg->sctlr_clr;
+
arm_smmu_cb_write(smmu, idx, ARM_SMMU_CB_SCTLR, reg);
 }
 
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 6c5ffeae..ddf2ca4c923d 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -144,6 +144,7 @@ enum arm_smmu_cbar_type {
 #define ARM_SMMU_CB_SCTLR  0x0
 #define ARM_SMMU_SCTLR_S1_ASIDPNE  BIT(12)
 #define ARM_SMMU_SCTLR_CFCFG   BIT(7)
+#define ARM_SMMU_SCTLR_HUPCF   BIT(8)
 #define ARM_SMMU_SCTLR_CFIEBIT(6)
 #define ARM_SMMU_SCTLR_CFREBIT(5)
 #define ARM_SMMU_SCTLR_E   BIT(4)
@@ -341,6 +342,8 @@ struct arm_smmu_cfg {
u16 asid;
u16 vmid;
};
+   u32 sctlr_set;/* extra bits to set in 
SCTLR */
+   u32 sctlr_clr;/* bits to mask in SCTLR 
*/
enum arm_smmu_cbar_type cbar;
enum arm_smmu_context_fmt   fmt;
 };
-- 
2.25.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v18 0/4] iommu/arm-smmu: Add adreno-smmu implementation and bindings

2020-11-02 Thread Jordan Crouse
On Thu, Oct 29, 2020 at 05:26:08PM +, Will Deacon wrote:
> On Tue, Oct 27, 2020 at 04:34:04PM -0600, Jordan Crouse wrote:
> > This short series adds support for the adreno-smmu implementation of the
> > arm-smmu driver and the device-tree bindings to turn on the implementation
> > for the sm845 and sc7180 GPUs. These changes are the last ones needed to 
> > enable
> > per-instance pagetables in the drm/msm driver.
> > 
> > No deltas in this patchset since the last go-around for 5.10 [1].
> > 
> > [1] https://patchwork.freedesktop.org/series/81393/
> > 
> > Jordan Crouse (3):
> >   iommu/arm-smmu-qcom: Add implementation for the adreno GPU SMMU
> >   dt-bindings: arm-smmu: Add compatible string for Adreno GPU SMMU
> >   arm: dts: qcom: sm845: Set the compatible string for the GPU SMMU
> > 
> > Rob Clark (1):
> >   iommu/arm-smmu: Add a way for implementations to influence SCTLR
> 
> FYI: this patch (patch 4/4) doesn't seem to have made it anywhere (I don't
> have it, and neither does the archive).
> 
> Will

Patch 4/4 was the bindings for sdm845 and I didn't explicitly add IOMMU to the
CC list and so patman did what patman does.

I'll resend.

Jordan
-- 
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum,
a Linux Foundation Collaborative Project
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v4 0/7] Convert the intel iommu driver to the dma-iommu api

2020-11-02 Thread Tvrtko Ursulin


On 02/11/2020 02:00, Lu Baolu wrote:

Hi Tvrtko,
On 10/12/20 4:44 PM, Tvrtko Ursulin wrote:


On 29/09/2020 01:11, Lu Baolu wrote:

Hi Tvrtko,

On 9/28/20 5:44 PM, Tvrtko Ursulin wrote:


On 27/09/2020 07:34, Lu Baolu wrote:

Hi,

The previous post of this series could be found here.

https://lore.kernel.org/linux-iommu/20200912032200.11489-1-baolu...@linux.intel.com/ 



This version introduces a new patch [4/7] to fix an issue reported 
here.


https://lore.kernel.org/linux-iommu/51a1baec-48d1-c0ac-181b-1fba92aa4...@linux.intel.com/ 



There aren't any other changes.

Please help to test and review.

Best regards,
baolu

Lu Baolu (3):
   iommu: Add quirk for Intel graphic devices in map_sg


Since I do have patches to fix i915 to handle this, do we want to 
co-ordinate the two and avoid having to add this quirk and then 
later remove it? Or you want to go the staged approach?


I have no preference. It depends on which patch goes first. Let the
maintainers help here.


FYI we have merged the required i915 patches to our tree last week or 
so. I *think* this means they will go into 5.11. So the i915 specific 
workaround patch will not be needed in Intel IOMMU.


Do you mind telling me what's the status of this fix patch? I tried this
series on v5.10-rc1 with the graphic quirk patch dropped. I am still
seeing dma faults from graphic device.


Hmm back then I thought i915 fixes for this would land in 5.11 so I will 
stick with that. :) (See my quoted text a paragraph above yours.)


Regards,

Tvrtko
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

Re: [GIT PULL] dma-mapping fix for 5.10

2020-11-02 Thread Geert Uytterhoeven
Hi Linus,

On Sat, Oct 31, 2020 at 8:51 PM Linus Torvalds
 wrote:
> On Sat, Oct 31, 2020 at 2:40 AM Christoph Hellwig  wrote:
> > dma-mapping fix for 5.10:
> >
> >  - fix an integer overflow on 32-bit platforms in the new DMA range code
> >(Geert Uytterhoeven)
>
> So this is just a stylistic nit, and has no impact on this pull (which
> I've done). But looking at the patch, it triggers one of my "this is
> wrong" patterns.
>
> In particular, this:
>
> u64 dma_start = 0;
> ...
> for (dma_start = ~0ULL; r->size; r++) {
>
> is actually completely bogus in theory, and it's a horribly horribly
> bad pattern to have.
>
> The thing that I hate about that parttern is "~0ULL", which is simply wrong.
>
> The correct pattern for "all bits set" is ~0. NOTHING ELSE. No extra
> letters at the end.
>
> Why? Because using an unsigned type is wrong, and will not extend the
> bits up to a potentially bigger size.
>
> So adding that "ULL" is not just three extra characters to type, it
> actually _detracts_ from the code and makes it more fragile and
> potentially wrong.
>
> It so happens, that yes, in the kernel, "ull" us 64-bit, and you get
> the right results. So the code _works_. But it's wrong, and it now
> requires that the types match exactly (ie it would not be broken if
> somebody ever were to say "I want to use use 128-bit dma addresses and
> u128").

Thanks, you're right, the "ULL" suffix is not needed, and could cause
future issues.

> Another example is using "~0ul", which would give different results on
> a 32-bit kernel and a 64-bit kernel. Again: DON'T DO THAT.

Definitely.

> I repeat: the right thing to do for "all bits set" is just a plain ~0
> or -1. Either of those are fine (technically assumes a 2's complement
> machine, but let's just be honest: that's a perfectly fine assumption,
> and -1 might be preferred by some because it makes that sign extension
> behavior of the integer constant more obvious).

"-1" definitely causes warnings, depending on your compiler (not with
the gcc 9.3.0 I'm currently using, though).

> Don't try to do anything clever or anything else, because it's going
> to be strictly worse.
>
> The old code that that patch removed was "technically correct", but
> just pointless, and actually shows the problem:
>
> for (dma_start = ~(dma_addr_t)0; r->size; r++) {
>
> the above is indeed a correct way to say "I want all bits set in a
> dma_addr_t", but while correct, it is - once again - strictly inferior
> to just using "~0".

Obviously I was misled by the old code, and instead of changing
the cast, I replaced the cast ("casts are evil") by a suffix. Doh.

Anyway, I've just sent a patch. Thanks!

Gr{oetje,eeting}s,

Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Intel VT-d IOMMU group regression in v5.9

2020-11-02 Thread Tom Yan
Hi,

Please check out https://bugzilla.kernel.org/show_bug.cgi?id=209857, thanks!

Regards,
Tom
___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


RE: [PATCH v3 1/2] dma-mapping: add benchmark support for streaming DMA APIs

2020-11-02 Thread Song Bao Hua (Barry Song)



> -Original Message-
> From: John Garry
> Sent: Monday, November 2, 2020 10:19 PM
> To: Song Bao Hua (Barry Song) ;
> iommu@lists.linux-foundation.org; h...@lst.de; robin.mur...@arm.com;
> m.szyprow...@samsung.com
> Cc: linux-kselft...@vger.kernel.org; Shuah Khan ; Joerg
> Roedel ; Linuxarm ; xuwei (O)
> ; Will Deacon 
> Subject: Re: [PATCH v3 1/2] dma-mapping: add benchmark support for
> streaming DMA APIs
> 
> On 02/11/2020 08:06, Barry Song wrote:
> > Nowadays, there are increasing requirements to benchmark the performance
> > of dma_map and dma_unmap particually while the device is attached to an
> > IOMMU.
> >
> > This patch enables the support. Users can run specified number of threads
> > to do dma_map_page and dma_unmap_page on a specific NUMA node with
> the
> > specified duration. Then dma_map_benchmark will calculate the average
> > latency for map and unmap.
> >
> > A difficulity for this benchmark is that dma_map/unmap APIs must run on
> > a particular device. Each device might have different backend of IOMMU or
> > non-IOMMU.
> >
> > So we use the driver_override to bind dma_map_benchmark to a particual
> > device by:
> > For platform devices:
> > echo dma_map_benchmark > /sys/bus/platform/devices/xxx/driver_override
> > echo xxx > /sys/bus/platform/drivers/xxx/unbind
> > echo xxx > /sys/bus/platform/drivers/dma_map_benchmark/bind
> >
> > For PCI devices:
> > echo dma_map_benchmark >
> /sys/bus/pci/devices/:00:01.0/driver_override
> > echo :00:01.0 > /sys/bus/pci/drivers/xxx/unbind
> > echo :00:01.0 > /sys/bus/pci/drivers/dma_map_benchmark/bind
> >
> > Cc: Joerg Roedel 
> > Cc: Will Deacon 
> > Cc: Shuah Khan 
> > Cc: Christoph Hellwig 
> > Cc: Marek Szyprowski 
> > Cc: Robin Murphy 
> > Signed-off-by: Barry Song 
> > ---
> > -v3:
> >* fix build issues reported by 0day kernel test robot
> > -v2:
> >* add PCI support; v1 supported platform devices only
> >* replace ssleep by msleep_interruptible() to permit users to exit
> >  benchmark before it is completed
> >* many changes according to Robin's suggestions, thanks! Robin
> >  - add standard deviation output to reflect the worst case
> >  - check users' parameters strictly like the number of threads
> >  - make cache dirty before dma_map
> >  - fix unpaired dma_map_page and dma_unmap_single;
> >  - remove redundant "long long" before ktime_to_ns();
> >  - use devm_add_action()
> >
> >   kernel/dma/Kconfig |   8 +
> >   kernel/dma/Makefile|   1 +
> >   kernel/dma/map_benchmark.c | 296
> +
> >   3 files changed, 305 insertions(+)
> >   create mode 100644 kernel/dma/map_benchmark.c
> >
> > diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
> > index c99de4a21458..949c53da5991 100644
> > --- a/kernel/dma/Kconfig
> > +++ b/kernel/dma/Kconfig
> > @@ -225,3 +225,11 @@ config DMA_API_DEBUG_SG
> >   is technically out-of-spec.
> >
> >   If unsure, say N.
> > +
> > +config DMA_MAP_BENCHMARK
> > +   bool "Enable benchmarking of streaming DMA mapping"
> > +   help
> > + Provides /sys/kernel/debug/dma_map_benchmark that helps with
> testing
> > + performance of dma_(un)map_page.
> 
> Since this is a driver, any reason for which it cannot be loadable? If
> so, it seems any functionality would depend on DEBUG FS, I figure that's
> just how we work for debugfs.

We depend on kthread_bind_mask which isn't an export_symbol.
Maybe worth to send a patch to export it?

> 
> Thanks,
> John
> 
> > +
> > + See tools/testing/selftests/dma/dma_map_benchmark.c
> > diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
> > index dc755ab68aab..7aa6b26b1348 100644
> > --- a/kernel/dma/Makefile
> > +++ b/kernel/dma/Makefile

Thanks
Barry

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


Re: [PATCH v3 1/2] dma-mapping: add benchmark support for streaming DMA APIs

2020-11-02 Thread John Garry

On 02/11/2020 08:06, Barry Song wrote:

Nowadays, there are increasing requirements to benchmark the performance
of dma_map and dma_unmap particularly while the device is attached to an
IOMMU.

This patch enables the support. Users can run specified number of threads
to do dma_map_page and dma_unmap_page on a specific NUMA node with the
specified duration. Then dma_map_benchmark will calculate the average
latency for map and unmap.

A difficulty for this benchmark is that dma_map/unmap APIs must run on
a particular device. Each device might have different backend of IOMMU or
non-IOMMU.

So we use the driver_override to bind dma_map_benchmark to a particular
device by:
For platform devices:
echo dma_map_benchmark > /sys/bus/platform/devices/xxx/driver_override
echo xxx > /sys/bus/platform/drivers/xxx/unbind
echo xxx > /sys/bus/platform/drivers/dma_map_benchmark/bind

For PCI devices:
echo dma_map_benchmark > /sys/bus/pci/devices/:00:01.0/driver_override
echo :00:01.0 > /sys/bus/pci/drivers/xxx/unbind
echo :00:01.0 > /sys/bus/pci/drivers/dma_map_benchmark/bind

Cc: Joerg Roedel 
Cc: Will Deacon 
Cc: Shuah Khan 
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Signed-off-by: Barry Song 
---
-v3:
   * fix build issues reported by 0day kernel test robot
-v2:
   * add PCI support; v1 supported platform devices only
   * replace ssleep by msleep_interruptible() to permit users to exit
 benchmark before it is completed
   * many changes according to Robin's suggestions, thanks! Robin
 - add standard deviation output to reflect the worst case
 - check users' parameters strictly like the number of threads
 - make cache dirty before dma_map
 - fix unpaired dma_map_page and dma_unmap_single;
 - remove redundant "long long" before ktime_to_ns();
 - use devm_add_action()

  kernel/dma/Kconfig |   8 +
  kernel/dma/Makefile|   1 +
  kernel/dma/map_benchmark.c | 296 +
  3 files changed, 305 insertions(+)
  create mode 100644 kernel/dma/map_benchmark.c

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index c99de4a21458..949c53da5991 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -225,3 +225,11 @@ config DMA_API_DEBUG_SG
  is technically out-of-spec.
  
  	  If unsure, say N.

+
+config DMA_MAP_BENCHMARK
+   bool "Enable benchmarking of streaming DMA mapping"
+   help
+ Provides /sys/kernel/debug/dma_map_benchmark that helps with testing
+ performance of dma_(un)map_page.


Since this is a driver, any reason for which it cannot be loadable? If 
so, it seems any functionality would depend on DEBUG FS, I figure that's 
just how we work for debugfs.


Thanks,
John


+
+ See tools/testing/selftests/dma/dma_map_benchmark.c
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index dc755ab68aab..7aa6b26b1348 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v3 1/2] dma-mapping: add benchmark support for streaming DMA APIs

2020-11-02 Thread Barry Song
Nowadays, there are increasing requirements to benchmark the performance
of dma_map and dma_unmap particularly while the device is attached to an
IOMMU.

This patch enables the support. Users can run specified number of threads
to do dma_map_page and dma_unmap_page on a specific NUMA node with the
specified duration. Then dma_map_benchmark will calculate the average
latency for map and unmap.

A difficulty for this benchmark is that dma_map/unmap APIs must run on
a particular device. Each device might have different backend of IOMMU or
non-IOMMU.

So we use the driver_override to bind dma_map_benchmark to a particular
device by:
For platform devices:
echo dma_map_benchmark > /sys/bus/platform/devices/xxx/driver_override
echo xxx > /sys/bus/platform/drivers/xxx/unbind
echo xxx > /sys/bus/platform/drivers/dma_map_benchmark/bind

For PCI devices:
echo dma_map_benchmark > /sys/bus/pci/devices/:00:01.0/driver_override
echo :00:01.0 > /sys/bus/pci/drivers/xxx/unbind
echo :00:01.0 > /sys/bus/pci/drivers/dma_map_benchmark/bind

Cc: Joerg Roedel 
Cc: Will Deacon 
Cc: Shuah Khan 
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Signed-off-by: Barry Song 
---
-v3:
  * fix build issues reported by 0day kernel test robot
-v2:
  * add PCI support; v1 supported platform devices only
  * replace ssleep by msleep_interruptible() to permit users to exit
benchmark before it is completed
  * many changes according to Robin's suggestions, thanks! Robin
- add standard deviation output to reflect the worst case
- check users' parameters strictly like the number of threads
- make cache dirty before dma_map
- fix unpaired dma_map_page and dma_unmap_single;
- remove redundant "long long" before ktime_to_ns();
- use devm_add_action()

 kernel/dma/Kconfig |   8 +
 kernel/dma/Makefile|   1 +
 kernel/dma/map_benchmark.c | 296 +
 3 files changed, 305 insertions(+)
 create mode 100644 kernel/dma/map_benchmark.c

diff --git a/kernel/dma/Kconfig b/kernel/dma/Kconfig
index c99de4a21458..949c53da5991 100644
--- a/kernel/dma/Kconfig
+++ b/kernel/dma/Kconfig
@@ -225,3 +225,11 @@ config DMA_API_DEBUG_SG
  is technically out-of-spec.
 
  If unsure, say N.
+
+config DMA_MAP_BENCHMARK
+   bool "Enable benchmarking of streaming DMA mapping"
+   help
+ Provides /sys/kernel/debug/dma_map_benchmark that helps with testing
+ performance of dma_(un)map_page.
+
+ See tools/testing/selftests/dma/dma_map_benchmark.c
diff --git a/kernel/dma/Makefile b/kernel/dma/Makefile
index dc755ab68aab..7aa6b26b1348 100644
--- a/kernel/dma/Makefile
+++ b/kernel/dma/Makefile
@@ -10,3 +10,4 @@ obj-$(CONFIG_DMA_API_DEBUG)   += debug.o
 obj-$(CONFIG_SWIOTLB)  += swiotlb.o
 obj-$(CONFIG_DMA_COHERENT_POOL)+= pool.o
 obj-$(CONFIG_DMA_REMAP)+= remap.o
+obj-$(CONFIG_DMA_MAP_BENCHMARK)+= map_benchmark.o
diff --git a/kernel/dma/map_benchmark.c b/kernel/dma/map_benchmark.c
new file mode 100644
index ..dc4e5ff48a2d
--- /dev/null
+++ b/kernel/dma/map_benchmark.c
@@ -0,0 +1,296 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Hisilicon Limited.
+ */
+
+#define pr_fmt(fmt)KBUILD_MODNAME ": " fmt
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define DMA_MAP_BENCHMARK  _IOWR('d', 1, struct map_benchmark)
+#define DMA_MAP_MAX_THREADS1024
+#define DMA_MAP_MAX_SECONDS300
+
+struct map_benchmark {
+   __u64 avg_map_100ns; /* average map latency in 100ns */
+   __u64 map_stddev; /* standard deviation of map latency */
+   __u64 avg_unmap_100ns; /* as above */
+   __u64 unmap_stddev;
+   __u32 threads; /* how many threads will do map/unmap in parallel */
+   __u32 seconds; /* how long the test will last */
+   int node; /* which numa node this benchmark will run on */
+   __u64 expansion[10];/* For future use */
+};
+
+struct map_benchmark_data {
+   struct map_benchmark bparam;
+   struct device *dev;
+   struct dentry  *debugfs;
+   atomic64_t sum_map_100ns;
+   atomic64_t sum_unmap_100ns;
+   atomic64_t sum_square_map;
+   atomic64_t sum_square_unmap;
+   atomic64_t loops;
+};
+
+static int map_benchmark_thread(void *data)
+{
+   void *buf;
+   dma_addr_t dma_addr;
+   struct map_benchmark_data *map = data;
+   int ret = 0;
+
+   buf = (void *)__get_free_page(GFP_KERNEL);
+   if (!buf)
+   return -ENOMEM;
+
+   while (!kthread_should_stop())  {
+   __u64 map_100ns, unmap_100ns, map_square, unmap_square;
+   ktime_t map_stime, map_etime, unmap_stime, unmap_etime;
+
+   /*
+* for a non-coherent device, if we don't stain them in the 
cache,
+* this will

[PATCH v3 0/2] dma-mapping: provide a benchmark for streaming DMA mapping

2020-11-02 Thread Barry Song
Nowadays, there are increasing requirements to benchmark the performance
of dma_map and dma_unmap, particularly while the device is attached to an
IOMMU.

This patchset provides the benchmark infrastructure for streaming DMA
mapping. The architecture of the code is very similar to that of the GUP
benchmark:
* mm/gup_benchmark.c provides kernel interface;
* tools/testing/selftests/vm/gup_benchmark.c provides user program to
call the interface provided by mm/gup_benchmark.c.

In our case, kernel/dma/map_benchmark.c is like mm/gup_benchmark.c;
tools/testing/selftests/dma/dma_map_benchmark.c is like tools/testing/
selftests/vm/gup_benchmark.c

A major difference from the GUP benchmark is that the DMA_MAP benchmark
needs to run on a device. Consider one board with the devices and IOMMUs below:
device A  --- IOMMU 1
device B  --- IOMMU 2
device C  --- non-IOMMU

Different devices might attach to different IOMMU or non-IOMMU. To make
benchmark run, we can either
* create a virtual device and hack the kernel code to attach the virtual
device to IOMMU1, IOMMU2 or non-IOMMU.
* use the existing driver_override mechanism: unbind device A, B, or C from
its original driver and bind it to the dma_map_benchmark platform driver or
pci driver for benchmarking.

In this patchset, I prefer to use the driver_override and avoid the ugly
hack in kernel. We can dynamically switch device behind different IOMMUs
to get the performance of IOMMU or non-IOMMU.

-v3:
  * fix build issues reported by 0day kernel test robot
-v2:
  * add PCI support; v1 supported platform devices only
  * replace ssleep by msleep_interruptible() to permit users to exit
benchmark before it is completed
  * many changes according to Robin's suggestions, thanks! Robin
- add standard deviation output to reflect the worst case
- check users' parameters strictly like the number of threads
- make cache dirty before dma_map
- fix unpaired dma_map_page and dma_unmap_single;
- remove redundant "long long" before ktime_to_ns();
- use devm_add_action()

Barry Song (2):
  dma-mapping: add benchmark support for streaming DMA APIs
  selftests/dma: add test application for DMA_MAP_BENCHMARK

 MAINTAINERS   |   6 +
 kernel/dma/Kconfig|   8 +
 kernel/dma/Makefile   |   1 +
 kernel/dma/map_benchmark.c| 296 ++
 tools/testing/selftests/dma/Makefile  |   6 +
 tools/testing/selftests/dma/config|   1 +
 .../testing/selftests/dma/dma_map_benchmark.c |  87 +
 7 files changed, 405 insertions(+)
 create mode 100644 kernel/dma/map_benchmark.c
 create mode 100644 tools/testing/selftests/dma/Makefile
 create mode 100644 tools/testing/selftests/dma/config
 create mode 100644 tools/testing/selftests/dma/dma_map_benchmark.c

-- 
2.25.1

___
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu


[PATCH v3 2/2] selftests/dma: add test application for DMA_MAP_BENCHMARK

2020-11-02 Thread Barry Song
This patch provides the test application for DMA_MAP_BENCHMARK.

Before running the test application, we need to bind a device to the
dma_map_benchmark driver. For example, unbind "xxx" from its original
driver and bind it to dma_map_benchmark:

echo dma_map_benchmark > /sys/bus/platform/devices/xxx/driver_override
echo xxx > /sys/bus/platform/drivers/xxx/unbind
echo xxx > /sys/bus/platform/drivers/dma_map_benchmark/bind

Another example for PCI devices:
echo dma_map_benchmark > /sys/bus/pci/devices/0000:00:01.0/driver_override
echo 0000:00:01.0 > /sys/bus/pci/drivers/xxx/unbind
echo 0000:00:01.0 > /sys/bus/pci/drivers/dma_map_benchmark/bind

The below command will run 16 threads on numa node 0 for 10 seconds on
the device bound to dma_map_benchmark platform_driver or pci_driver:
./dma_map_benchmark -t 16 -s 10 -n 0
dma mapping benchmark: threads:16 seconds:10
average map latency(us):1.1 standard deviation:1.9
average unmap latency(us):0.5 standard deviation:0.8

Cc: Joerg Roedel 
Cc: Will Deacon 
Cc: Shuah Khan 
Cc: Christoph Hellwig 
Cc: Marek Szyprowski 
Cc: Robin Murphy 
Signed-off-by: Barry Song 
---
 MAINTAINERS   |  6 ++
 tools/testing/selftests/dma/Makefile  |  6 ++
 tools/testing/selftests/dma/config|  1 +
 .../testing/selftests/dma/dma_map_benchmark.c | 87 +++
 4 files changed, 100 insertions(+)
 create mode 100644 tools/testing/selftests/dma/Makefile
 create mode 100644 tools/testing/selftests/dma/config
 create mode 100644 tools/testing/selftests/dma/dma_map_benchmark.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 608fc8484c02..a1e38d5e14f6 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5247,6 +5247,12 @@ F:   include/linux/dma-mapping.h
 F: include/linux/dma-map-ops.h
 F: kernel/dma/
 
+DMA MAPPING BENCHMARK
+M: Barry Song 
+L: iommu@lists.linux-foundation.org
+F: kernel/dma/map_benchmark.c
+F: tools/testing/selftests/dma/
+
 DMA-BUF HEAPS FRAMEWORK
 M: Sumit Semwal 
 R: Benjamin Gaignard 
diff --git a/tools/testing/selftests/dma/Makefile 
b/tools/testing/selftests/dma/Makefile
new file mode 100644
index ..aa8e8b5b3864
--- /dev/null
+++ b/tools/testing/selftests/dma/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0
+CFLAGS += -I../../../../usr/include/
+
+TEST_GEN_PROGS := dma_map_benchmark
+
+include ../lib.mk
diff --git a/tools/testing/selftests/dma/config 
b/tools/testing/selftests/dma/config
new file mode 100644
index ..6102ee3c43cd
--- /dev/null
+++ b/tools/testing/selftests/dma/config
@@ -0,0 +1 @@
+CONFIG_DMA_MAP_BENCHMARK=y
diff --git a/tools/testing/selftests/dma/dma_map_benchmark.c 
b/tools/testing/selftests/dma/dma_map_benchmark.c
new file mode 100644
index ..4778df0c458f
--- /dev/null
+++ b/tools/testing/selftests/dma/dma_map_benchmark.c
@@ -0,0 +1,87 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2020 Hisilicon Limited.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#define DMA_MAP_BENCHMARK  _IOWR('d', 1, struct map_benchmark)
+#define DMA_MAP_MAX_THREADS1024
+#define DMA_MAP_MAX_SECONDS 300
+
+struct map_benchmark {
+   __u64 avg_map_100ns; /* average map latency in 100ns */
+   __u64 map_stddev; /* standard deviation of map latency */
+   __u64 avg_unmap_100ns; /* as above */
+   __u64 unmap_stddev;
+   __u32 threads; /* how many threads will do map/unmap in parallel */
+   __u32 seconds; /* how long the test will last */
+   int node; /* which numa node this benchmark will run on */
+   __u64 expansion[10];/* For future use */
+};
+
+int main(int argc, char **argv)
+{
+   struct map_benchmark map;
+   int fd, opt;
+   /* default single thread, run 20 seconds on NUMA_NO_NODE */
+   int threads = 1, seconds = 20, node = -1;
+   int cmd = DMA_MAP_BENCHMARK;
+   char *p;
+
+   while ((opt = getopt(argc, argv, "t:s:n:")) != -1) {
+   switch (opt) {
+   case 't':
+   threads = atoi(optarg);
+   break;
+   case 's':
+   seconds = atoi(optarg);
+   break;
+   case 'n':
+   node = atoi(optarg);
+   break;
+   default:
+   return -1;
+   }
+   }
+
+   if (threads <= 0 || threads > DMA_MAP_MAX_THREADS) {
+   fprintf(stderr, "invalid number of threads, must be in 1-%d\n",
+   DMA_MAP_MAX_THREADS);
+   exit(1);
+   }
+
+   if (seconds <= 0 || seconds > DMA_MAP_MAX_SECONDS) {
+   fprintf(stderr, "invalid number of seconds, must be in 1-%d\n",
+   DMA_MAP_MAX_SECONDS);
+   exit(1);
+   }
+
+   fd = open("/sys/kernel/debug/dma_map_benchmark", O_RDWR);
+   if (fd == -1) {
+   perror(