Re: [PATCH] drm/msm/iommu: optimize map/unmap

2022-08-24 Thread Sai Prakash Ranjan

Hi Rob,

On 8/23/2022 12:17 AM, Rob Clark wrote:

From: Rob Clark 

Using map_pages/unmap_pages cuts down on the # of pgtable walks needed
in the process of finding where to insert/remove an entry.  The end
result is ~5-10x faster than mapping a single page at a time.

Signed-off-by: Rob Clark 
---
  drivers/gpu/drm/msm/msm_iommu.c | 91 -
  1 file changed, 79 insertions(+), 12 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
index a54ed354578b..0f3f60da3314 100644
--- a/drivers/gpu/drm/msm/msm_iommu.c
+++ b/drivers/gpu/drm/msm/msm_iommu.c
@@ -21,6 +21,7 @@ struct msm_iommu_pagetable {
struct msm_mmu base;
struct msm_mmu *parent;
struct io_pgtable_ops *pgtbl_ops;
+   unsigned long pgsize_bitmap; /* Bitmap of page sizes in use */
phys_addr_t ttbr;
u32 asid;
  };
@@ -29,23 +30,85 @@ static struct msm_iommu_pagetable *to_pagetable(struct msm_mmu *mmu)
return container_of(mmu, struct msm_iommu_pagetable, base);
  }
  
+/* based on iommu_pgsize() in iommu.c: */

+static size_t iommu_pgsize(struct msm_iommu_pagetable *pagetable,


Maybe call this msm_iommu_pgsize()? There won't be any namespace conflict since
it is static in both places, but a distinct name would still be better.


+  unsigned long iova, phys_addr_t paddr,
+  size_t size, size_t *count)
+{
+   unsigned int pgsize_idx, pgsize_idx_next;
+   unsigned long pgsizes;
+   size_t offset, pgsize, pgsize_next;
+   unsigned long addr_merge = paddr | iova;
+
+   /* Page sizes supported by the hardware and small enough for @size */
+   pgsizes = pagetable->pgsize_bitmap & GENMASK(__fls(size), 0);
+
+   /* Constrain the page sizes further based on the maximum alignment */
+   if (likely(addr_merge))
+   pgsizes &= GENMASK(__ffs(addr_merge), 0);
+
+   /* Make sure we have at least one suitable page size */
+   BUG_ON(!pgsizes);
+
+   /* Pick the biggest page size remaining */
+   pgsize_idx = __fls(pgsizes);
+   pgsize = BIT(pgsize_idx);
+   if (!count)
+   return pgsize;
+
+   /* Find the next biggest support page size, if it exists */
+   pgsizes = pagetable->pgsize_bitmap & ~GENMASK(pgsize_idx, 0);
+   if (!pgsizes)
+   goto out_set_count;
+
+   pgsize_idx_next = __ffs(pgsizes);
+   pgsize_next = BIT(pgsize_idx_next);
+
+   /*
+* There's no point trying a bigger page size unless the virtual
+* and physical addresses are similarly offset within the larger page.
+*/
+   if ((iova ^ paddr) & (pgsize_next - 1))
+   goto out_set_count;
+
+   /* Calculate the offset to the next page size alignment boundary */
+   offset = pgsize_next - (addr_merge & (pgsize_next - 1));
+
+   /*
+* If size is big enough to accommodate the larger page, reduce
+* the number of smaller pages.
+*/
+   if (offset + pgsize_next <= size)
+   size = offset;
+
+out_set_count:
+   *count = size >> pgsize_idx;
+   return pgsize;
+}
+
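
To make the new helper concrete (numbers are only an illustration, assuming a
pgsize_bitmap of SZ_4K | SZ_2M | SZ_1G, as is typical for a 4K-granule ARM
long-descriptor table): for iova = 0x10200000, paddr = 0x80200000 and
size = 4 MiB, GENMASK(__fls(size), 0) drops SZ_1G (a 1 GiB page cannot fit in
4 MiB), the lowest set bit of addr_merge = iova | paddr is bit 21 so the
combined alignment allows up to 2 MiB, and since iova and paddr are not
equally offset within a 1 GiB page the count is not capped. The helper
therefore returns pgsize = SZ_2M with *count = 2. Note that the unmap path
below passes iova for both the virtual and physical address, so only the
iova alignment constrains the choice there.
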
  static int msm_iommu_pagetable_unmap(struct msm_mmu *mmu, u64 iova,
size_t size)
  {
struct msm_iommu_pagetable *pagetable = to_pagetable(mmu);
struct io_pgtable_ops *ops = pagetable->pgtbl_ops;
-   size_t unmapped = 0;
  
  	/* Unmap the block one page at a time */


This comment will need an update.


while (size) {
-   unmapped += ops->unmap(ops, iova, 4096, NULL);
-   iova += 4096;
-   size -= 4096;
+   size_t unmapped, pgsize, count;
+
+   pgsize = iommu_pgsize(pagetable, iova, iova, size, &count);
+
+   unmapped = ops->unmap_pages(ops, iova, pgsize, count, NULL);
+   if (!unmapped)
+   break;
+
+   iova += unmapped;
+   size -= unmapped;
}
  
  	iommu_flush_iotlb_all(to_msm_iommu(pagetable->parent)->domain);
  
-	return (unmapped == size) ? 0 : -EINVAL;

+   return (size == 0) ? 0 : -EINVAL;
  }
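
For reference, here is a minimal user-space sketch of the same greedy
page-size selection and the chunked walk that consumes it. This is for
illustration only, not the driver code: it assumes the 4K | 2M | 1G bitmap
from the example above, and pick_pgsize() and the helpers are made-up names.

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

#define SZ_4K	(1UL << 12)
#define SZ_2M	(1UL << 21)
#define SZ_1G	(1UL << 30)

static const unsigned long pgsize_bitmap = SZ_4K | SZ_2M | SZ_1G;

static unsigned int fls_ul(unsigned long x)  { return 63 - __builtin_clzl(x); }
static unsigned int ffs_ul(unsigned long x)  { return __builtin_ctzl(x); }
/* GENMASK(h, 0): bits h..0 set */
static unsigned long genmask(unsigned int h) { return h >= 63 ? ~0UL : (1UL << (h + 1)) - 1; }

static size_t pick_pgsize(unsigned long iova, unsigned long paddr,
			  size_t size, size_t *count)
{
	unsigned long addr_merge = paddr | iova;
	unsigned long pgsizes;
	unsigned int idx, idx_next;
	size_t pgsize_next, offset;

	/* Page sizes in the bitmap that are no larger than @size ... */
	pgsizes = pgsize_bitmap & genmask(fls_ul(size));

	/* ... and no larger than the combined iova/paddr alignment */
	if (addr_merge)
		pgsizes &= genmask(ffs_ul(addr_merge));

	assert(pgsizes);		/* at least the smallest size must remain */
	idx = fls_ul(pgsizes);		/* biggest page size that still fits */

	/* Is there an even bigger supported page size to grow into later? */
	pgsizes = pgsize_bitmap & ~genmask(idx);
	if (pgsizes) {
		idx_next = ffs_ul(pgsizes);
		pgsize_next = (size_t)1 << idx_next;

		/* Only worth it if iova and paddr are equally offset in it */
		if (!((iova ^ paddr) & (pgsize_next - 1))) {
			/* Stop this run at the next bigger-page boundary */
			offset = pgsize_next - (addr_merge & (pgsize_next - 1));
			if (offset + pgsize_next <= size)
				size = offset;
		}
	}

	*count = size >> idx;
	return (size_t)1 << idx;
}

int main(void)
{
	unsigned long iova  = 0x10200000UL;
	unsigned long paddr = 0x80200000UL;
	size_t size = 4UL << 20;	/* 4 MiB */

	while (size) {
		size_t count, pgsize = pick_pgsize(iova, paddr, size, &count);

		printf("iova=0x%lx pgsize=0x%zx count=%zu\n", iova, pgsize, count);
		iova  += pgsize * count;
		paddr += pgsize * count;
		size  -= pgsize * count;
	}
	return 0;
}

For the example input this prints a single chunk (pgsize=0x200000, count=2),
i.e. the rewritten loop above makes one unmap_pages() call for the whole
4 MiB where the old loop made 1024 unmap() calls of 4 KiB each.
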
  
  static int msm_iommu_pagetable_map(struct msm_mmu *mmu, u64 iova,

@@ -54,7 +117,6 @@ static int msm_iommu_pagetable_map(struct msm_mmu *mmu, u64 iova,
struct msm_iommu_pagetable *pagetable = to_pagetable(mmu);
struct io_pgtable_ops *ops = pagetable->pgtbl_ops;
struct scatterlist *sg;
-   size_t mapped = 0;
u64 addr = iova;
unsigned int i;
  
@@ -64,15 +126,19 @@ static int msm_iommu_pagetable_map(struct msm_mmu *mmu, u64 iova,
  
  		/* Map the block one page at a time */


This comment will need an update.


while (size) {
-   if (ops->map(ops, addr, phys, 4096, prot, GFP_KERNEL)) {
-   msm_iommu_pagetable_unmap(mmu, iova, mapped);
+   size_t pgsize, count, mapped;
+
+

Re: [PATCH v2] drm/msm/iommu: optimize map/unmap

2022-08-24 Thread Sai Prakash Ranjan

On 8/23/2022 10:07 PM, Rob Clark wrote:

From: Rob Clark 

Using map_pages/unmap_pages cuts down on the # of pgtable walks needed
in the process of finding where to insert/remove an entry.  The end
result is ~5-10x faster than mapping a single page at a time.

v2: Rename iommu_pgsize(), drop obsolete comments, fix error handling
 in msm_iommu_pagetable_map()

Signed-off-by: Rob Clark 
---
  drivers/gpu/drm/msm/msm_iommu.c | 101 +++-
  1 file changed, 86 insertions(+), 15 deletions(-)


Reviewed-by: Sai Prakash Ranjan 

Thanks,
Sai



Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache

2021-08-10 Thread Sai Prakash Ranjan

On 2021-08-10 14:46, Will Deacon wrote:

On Mon, Aug 09, 2021 at 11:17:40PM +0530, Sai Prakash Ranjan wrote:

On 2021-08-09 23:10, Will Deacon wrote:
> On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote:
> > On Mon, Aug 9, 2021 at 10:05 AM Will Deacon  wrote:
> > > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote:
> > > > But I suppose we could call it instead IOMMU_QCOM_LLC or something
> > > > like that to make it more clear that it is not necessarily something
> > > > that would work with a different outer level cache implementation?
> > >
> > > ... or we could just deal with the problem so that other people can reuse
> > > the code. I haven't really understood the reluctance to solve this 
properly.
> > >
> > > Am I missing some reason this isn't solvable?
> >
> > Oh, was there another way to solve it (other than foregoing setting
> > INC_OCACHE in the pgtables)?  Maybe I misunderstood, is there a
> > corresponding setting on the MMU pgtables side of things?
>
> Right -- we just need to program the CPU's MMU with the matching memory
> attributes! It's a bit more fiddly if you're just using ioremap_wc()
> though, as it's usually the DMA API which handles the attributes under
> the
> hood.
>
> Anyway, sorry, I should've said that explicitly earlier on. We've done
> this
> sort of thing in the Android tree so I assumed Sai knew what needed to
> be
> done and then I didn't think to explain to you :(
>

Right I was aware of that but even in the android tree there is no 
user :)


I'm assuming there are vendor modules using it there, otherwise we wouldn't
have been asked to put it in. Since you work at Qualcomm, maybe you could
talk to your colleagues (Isaac and Patrick) directly?



Right, I will check with them regarding the vendor modules in android.

I think we can't have a new memory type without any user in upstream like
the android tree, right?


Correct. But I don't think we should be adding IOMMU_* anything upstream
if we don't have a user.



Agreed, once we have the fix for the GPU crash I can continue further on
using this properly.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation


Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache

2021-08-09 Thread Sai Prakash Ranjan

On 2021-08-10 00:00, Rob Clark wrote:

On Mon, Aug 9, 2021 at 11:11 AM Sai Prakash Ranjan
 wrote:


On 2021-08-09 23:37, Rob Clark wrote:
> On Mon, Aug 9, 2021 at 10:47 AM Sai Prakash Ranjan
>  wrote:
>>
>> On 2021-08-09 23:10, Will Deacon wrote:
>> > On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote:
>> >> On Mon, Aug 9, 2021 at 10:05 AM Will Deacon  wrote:
>> >> >
>> >> > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote:
>> >> > > On Mon, Aug 9, 2021 at 7:56 AM Will Deacon  wrote:
>> >> > > > On Mon, Aug 02, 2021 at 06:36:04PM -0700, Rob Clark wrote:
>> >> > > > > On Mon, Aug 2, 2021 at 8:14 AM Will Deacon  
wrote:
>> >> > > > > > On Mon, Aug 02, 2021 at 08:08:07AM -0700, Rob Clark wrote:
>> >> > > > > > > On Mon, Aug 2, 2021 at 3:55 AM Will Deacon  
wrote:
>> >> > > > > > > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash 
Ranjan wrote:
>> >> > > > > > > > > On 2021-07-28 19:30, Georgi Djakov wrote:
>> >> > > > > > > > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash 
Ranjan wrote:
>> >> > > > > > > > > > > commit ecd7274fb4cd ("iommu: Remove unused 
IOMMU_SYS_CACHE_ONLY flag")
>> >> > > > > > > > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and 
along with it went
>> >> > > > > > > > > > > the memory type setting required for the non-coherent 
masters to use
>> >> > > > > > > > > > > system cache. Now that system cache support for GPU 
is added, we will
>> >> > > > > > > > > > > need to set the right PTE attribute for GPU buffers 
to be sys cached.
>> >> > > > > > > > > > > Without this, the system cache lines are not 
allocated for GPU.
>> >> > > > > > > > > > >
>> >> > > > > > > > > > > So the patches in this series introduces a new prot 
flag IOMMU_LLC,
>> >> > > > > > > > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to 
IO_PGTABLE_QUIRK_PTW_LLC
>> >> > > > > > > > > > > and makes GPU the user of this protection flag.
>> >> > > > > > > > > >
>> >> > > > > > > > > > Thank you for the patchset! Are you planning to refresh 
it, as it does
>> >> > > > > > > > > > not apply anymore?
>> >> > > > > > > > > >
>> >> > > > > > > > >
>> >> > > > > > > > > I was waiting on Will's reply [1]. If there are no 
changes needed, then
>> >> > > > > > > > > I can repost the patch.
>> >> > > > > > > >
>> >> > > > > > > > I still think you need to handle the mismatched alias, no? 
You're adding
>> >> > > > > > > > a new memory type to the SMMU which doesn't exist on the 
CPU side. That
>> >> > > > > > > > can't be right.
>> >> > > > > > > >
>> >> > > > > > >
>> >> > > > > > > Just curious, and maybe this is a dumb question, but what is 
your
>> >> > > > > > > concern about mismatched aliases?  I mean the cache hierarchy 
on the
>> >> > > > > > > GPU device side (anything beyond the LLC) is pretty different 
and
>> >> > > > > > > doesn't really care about the smmu pgtable attributes..
>> >> > > > > >
>> >> > > > > > If the CPU accesses a shared buffer with different attributes 
to those which
>> >> > > > > > the device is using then you fall into the "mismatched memory 
attributes"
>> >> > > > > > part of the Arm architecture. It's reasonably unforgiving (you 
should go and
>> >> > > > > > read it) and in some cases can apply to speculative accesses as 
well, but
>> >> > > > > > the end result is typically loss of coherency.
>> >> > > > >
>> >> > > > > Ok, I might have a few other sections to read first to decipher 
the
>> >> > > > > terminology..

Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache

2021-08-09 Thread Sai Prakash Ranjan

On 2021-08-09 23:37, Rob Clark wrote:

On Mon, Aug 9, 2021 at 10:47 AM Sai Prakash Ranjan
 wrote:


On 2021-08-09 23:10, Will Deacon wrote:
> On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote:
>> On Mon, Aug 9, 2021 at 10:05 AM Will Deacon  wrote:
>> >
>> > On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote:
>> > > On Mon, Aug 9, 2021 at 7:56 AM Will Deacon  wrote:
>> > > > On Mon, Aug 02, 2021 at 06:36:04PM -0700, Rob Clark wrote:
>> > > > > On Mon, Aug 2, 2021 at 8:14 AM Will Deacon  wrote:
>> > > > > > On Mon, Aug 02, 2021 at 08:08:07AM -0700, Rob Clark wrote:
>> > > > > > > On Mon, Aug 2, 2021 at 3:55 AM Will Deacon  
wrote:
>> > > > > > > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash Ranjan 
wrote:
>> > > > > > > > > On 2021-07-28 19:30, Georgi Djakov wrote:
>> > > > > > > > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash 
Ranjan wrote:
>> > > > > > > > > > > commit ecd7274fb4cd ("iommu: Remove unused 
IOMMU_SYS_CACHE_ONLY flag")
>> > > > > > > > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along 
with it went
>> > > > > > > > > > > the memory type setting required for the non-coherent 
masters to use
>> > > > > > > > > > > system cache. Now that system cache support for GPU is 
added, we will
>> > > > > > > > > > > need to set the right PTE attribute for GPU buffers to 
be sys cached.
>> > > > > > > > > > > Without this, the system cache lines are not allocated 
for GPU.
>> > > > > > > > > > >
>> > > > > > > > > > > So the patches in this series introduces a new prot flag 
IOMMU_LLC,
>> > > > > > > > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to 
IO_PGTABLE_QUIRK_PTW_LLC
>> > > > > > > > > > > and makes GPU the user of this protection flag.
>> > > > > > > > > >
>> > > > > > > > > > Thank you for the patchset! Are you planning to refresh 
it, as it does
>> > > > > > > > > > not apply anymore?
>> > > > > > > > > >
>> > > > > > > > >
>> > > > > > > > > I was waiting on Will's reply [1]. If there are no changes 
needed, then
>> > > > > > > > > I can repost the patch.
>> > > > > > > >
>> > > > > > > > I still think you need to handle the mismatched alias, no? 
You're adding
>> > > > > > > > a new memory type to the SMMU which doesn't exist on the CPU 
side. That
>> > > > > > > > can't be right.
>> > > > > > > >
>> > > > > > >
>> > > > > > > Just curious, and maybe this is a dumb question, but what is your
>> > > > > > > concern about mismatched aliases?  I mean the cache hierarchy on 
the
>> > > > > > > GPU device side (anything beyond the LLC) is pretty different and
>> > > > > > > doesn't really care about the smmu pgtable attributes..
>> > > > > >
>> > > > > > If the CPU accesses a shared buffer with different attributes to 
those which
>> > > > > > the device is using then you fall into the "mismatched memory 
attributes"
>> > > > > > part of the Arm architecture. It's reasonably unforgiving (you 
should go and
>> > > > > > read it) and in some cases can apply to speculative accesses as 
well, but
>> > > > > > the end result is typically loss of coherency.
>> > > > >
>> > > > > Ok, I might have a few other sections to read first to decipher the
>> > > > > terminology..
>> > > > >
>> > > > > But my understanding of LLC is that it looks just like system memory
>> > > > > to the CPU and GPU (I think that would make it "the point of
>> > > > > coherence" between the GPU and CPU?)  If that is true, shouldn't it 
be
>> > > > > invisible from the point of view of different CPU mapping options?
>> > > >
>> > > > You could certainly build a system where mismatched attributes don't 
cause
>> > > > 

Re: [Freedreno] [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache

2021-08-09 Thread Sai Prakash Ranjan

On 2021-08-09 23:10, Will Deacon wrote:

On Mon, Aug 09, 2021 at 10:18:21AM -0700, Rob Clark wrote:

On Mon, Aug 9, 2021 at 10:05 AM Will Deacon  wrote:
>
> On Mon, Aug 09, 2021 at 09:57:08AM -0700, Rob Clark wrote:
> > On Mon, Aug 9, 2021 at 7:56 AM Will Deacon  wrote:
> > > On Mon, Aug 02, 2021 at 06:36:04PM -0700, Rob Clark wrote:
> > > > On Mon, Aug 2, 2021 at 8:14 AM Will Deacon  wrote:
> > > > > On Mon, Aug 02, 2021 at 08:08:07AM -0700, Rob Clark wrote:
> > > > > > On Mon, Aug 2, 2021 at 3:55 AM Will Deacon  wrote:
> > > > > > > On Thu, Jul 29, 2021 at 10:08:22AM +0530, Sai Prakash Ranjan 
wrote:
> > > > > > > > On 2021-07-28 19:30, Georgi Djakov wrote:
> > > > > > > > > On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan 
wrote:
> > > > > > > > > > commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY 
flag")
> > > > > > > > > > removed unused IOMMU_SYS_CACHE_ONLY prot flag and along 
with it went
> > > > > > > > > > the memory type setting required for the non-coherent 
masters to use
> > > > > > > > > > system cache. Now that system cache support for GPU is 
added, we will
> > > > > > > > > > need to set the right PTE attribute for GPU buffers to be 
sys cached.
> > > > > > > > > > Without this, the system cache lines are not allocated for 
GPU.
> > > > > > > > > >
> > > > > > > > > > So the patches in this series introduces a new prot flag 
IOMMU_LLC,
> > > > > > > > > > renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to 
IO_PGTABLE_QUIRK_PTW_LLC
> > > > > > > > > > and makes GPU the user of this protection flag.
> > > > > > > > >
> > > > > > > > > Thank you for the patchset! Are you planning to refresh it, 
as it does
> > > > > > > > > not apply anymore?
> > > > > > > > >
> > > > > > > >
> > > > > > > > I was waiting on Will's reply [1]. If there are no changes 
needed, then
> > > > > > > > I can repost the patch.
> > > > > > >
> > > > > > > I still think you need to handle the mismatched alias, no? You're 
adding
> > > > > > > a new memory type to the SMMU which doesn't exist on the CPU 
side. That
> > > > > > > can't be right.
> > > > > > >
> > > > > >
> > > > > > Just curious, and maybe this is a dumb question, but what is your
> > > > > > concern about mismatched aliases?  I mean the cache hierarchy on the
> > > > > > GPU device side (anything beyond the LLC) is pretty different and
> > > > > > doesn't really care about the smmu pgtable attributes..
> > > > >
> > > > > If the CPU accesses a shared buffer with different attributes to 
those which
> > > > > the device is using then you fall into the "mismatched memory 
attributes"
> > > > > part of the Arm architecture. It's reasonably unforgiving (you should 
go and
> > > > > read it) and in some cases can apply to speculative accesses as well, 
but
> > > > > the end result is typically loss of coherency.
> > > >
> > > > Ok, I might have a few other sections to read first to decipher the
> > > > terminology..
> > > >
> > > > But my understanding of LLC is that it looks just like system memory
> > > > to the CPU and GPU (I think that would make it "the point of
> > > > coherence" between the GPU and CPU?)  If that is true, shouldn't it be
> > > > invisible from the point of view of different CPU mapping options?
> > >
> > > You could certainly build a system where mismatched attributes don't cause
> > > loss of coherence, but as it's not guaranteed by the architecture and the
> > > changes proposed here affect APIs which are exposed across SoCs, then I
> > > don't think it helps much.
> > >
> >
> > Hmm, the description of the new mapping flag is that it applies only
> > to transparent outer level cache:
> >
> > +/*
> > + * Non-coherent masters can use this page protection flag to set cacheable
> > + * memory attributes for only a transparent outer level of cache, also 
known as
> > + * the last-level or system cache.

Re: [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache

2021-07-28 Thread Sai Prakash Ranjan

Hi Georgi,

On 2021-07-28 19:30, Georgi Djakov wrote:

On Mon, Jan 11, 2021 at 07:45:02PM +0530, Sai Prakash Ranjan wrote:

commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag")
removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went
the memory type setting required for the non-coherent masters to use
system cache. Now that system cache support for GPU is added, we will
need to set the right PTE attribute for GPU buffers to be sys cached.
Without this, the system cache lines are not allocated for GPU.

So the patches in this series introduces a new prot flag IOMMU_LLC,
renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC
and makes GPU the user of this protection flag.


Hi Sai,

Thank you for the patchset! Are you planning to refresh it, as it does
not apply anymore?



I was waiting on Will's reply [1]. If there are no changes needed, then
I can repost the patch.

[1] 
https://lore.kernel.org/lkml/21239ba603d0bdc4e4c696588a905...@codeaurora.org/


Thanks,
Sai





The series slightly depends on following 2 patches posted earlier and
is based on msm-next branch:
 * https://lore.kernel.org/patchwork/patch/1363008/
 * https://lore.kernel.org/patchwork/patch/1363010/

Sai Prakash Ranjan (3):
  iommu/io-pgtable: Rename last-level cache quirk to
IO_PGTABLE_QUIRK_PTW_LLC
  iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
  drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 3 +++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +-
 drivers/gpu/drm/msm/msm_iommu.c | 3 +++
 drivers/gpu/drm/msm/msm_mmu.h   | 4 
 drivers/iommu/io-pgtable-arm.c  | 9 ++---
 include/linux/io-pgtable.h  | 6 +++---
 include/linux/iommu.h   | 6 ++
 7 files changed, 26 insertions(+), 7 deletions(-)


base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation


Re: [v1] drm/msm/disp/dpu1: add safe lut config in dpu driver

2021-07-11 Thread Sai Prakash Ranjan

On 2021-07-09 16:10, Kalyan Thota wrote:

Add safe lut configuration for all the targets in dpu
driver as per QOS recommendation.

Issue reported on SC7280:

With wait-for-safe feature in smmu enabled, RT client
buffer levels are checked to be safe before smmu invalidation.
Since display was always set to unsafe it was delaying the
invalidaiton process thus impacting the performance on NRT clients
such as eMMC and NVMe.

Validated this change on SC7280, With this change eMMC performance
has improved significantly.

Signed-off-by: Kalyan Thota 
---
 drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c
index d01c4c9..2e482cd 100644
--- a/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c
+++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_hw_catalog.c
@@ -974,6 +974,7 @@ static const struct dpu_perf_cfg sdm845_perf_data = {

.amortizable_threshold = 25,
.min_prefill_lines = 24,
.danger_lut_tbl = {0xf, 0x, 0x0},
+   .safe_lut_tbl = {0xfff0, 0xf000, 0x},
.qos_lut_tbl = {
{.nentry = ARRAY_SIZE(sdm845_qos_linear),
.entries = sdm845_qos_linear
@@ -1001,6 +1002,7 @@ static const struct dpu_perf_cfg sc7180_perf_data = {

.min_dram_ib = 160,
.min_prefill_lines = 24,
.danger_lut_tbl = {0xff, 0x, 0x0},
+   .safe_lut_tbl = {0xfff0, 0xff00, 0x},
.qos_lut_tbl = {
{.nentry = ARRAY_SIZE(sc7180_qos_linear),
.entries = sc7180_qos_linear
@@ -1028,6 +1030,7 @@ static const struct dpu_perf_cfg sm8150_perf_data = {

.min_dram_ib = 80,
.min_prefill_lines = 24,
.danger_lut_tbl = {0xf, 0x, 0x0},
+   .safe_lut_tbl = {0xfff8, 0xf000, 0x},
.qos_lut_tbl = {
{.nentry = ARRAY_SIZE(sm8150_qos_linear),
.entries = sm8150_qos_linear
@@ -1056,6 +1059,7 @@ static const struct dpu_perf_cfg sm8250_perf_data = {

.min_dram_ib = 80,
.min_prefill_lines = 35,
.danger_lut_tbl = {0xf, 0x, 0x0},
+   .safe_lut_tbl = {0xfff0, 0xff00, 0x},
.qos_lut_tbl = {
{.nentry = ARRAY_SIZE(sc7180_qos_linear),
.entries = sc7180_qos_linear
@@ -1084,6 +1088,7 @@ static const struct dpu_perf_cfg sc7280_perf_data = {

.min_dram_ib = 160,
.min_prefill_lines = 24,
.danger_lut_tbl = {0x, 0x, 0x0},
+   .safe_lut_tbl = {0xff00, 0xff00, 0x},
.qos_lut_tbl = {
{.nentry = ARRAY_SIZE(sc7180_qos_macrotile),
.entries = sc7180_qos_macrotile


Tested-by: Sai Prakash Ranjan  
(sc7280, sc7180)


This will need Fixes and stable tags, and I think this should also fix the
wait-for-safe issue with sdm845 (UFS/USB speed slowdown with display active)
which we have in arm-smmu-qcom.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation


Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag

2021-06-30 Thread Sai Prakash Ranjan

Hi Will,

On 2021-03-25 23:03, Will Deacon wrote:

On Tue, Mar 09, 2021 at 12:10:44PM +0530, Sai Prakash Ranjan wrote:

On 2021-02-05 17:38, Sai Prakash Ranjan wrote:
> On 2021-02-04 03:16, Will Deacon wrote:
> > On Tue, Feb 02, 2021 at 11:56:27AM +0530, Sai Prakash Ranjan wrote:
> > > On 2021-02-01 23:50, Jordan Crouse wrote:
> > > > On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote:
> > > > > On Mon, Feb 1, 2021 at 3:16 AM Will Deacon  wrote:
> > > > > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote:
> > > > > > > On 2021-01-29 14:35, Will Deacon wrote:
> > > > > > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan 
wrote:
> > > > > > > > > +#define IOMMU_LLC(1 << 6)
> > > > > > > >
> > > > > > > > On reflection, I'm a bit worried about exposing this because I 
think it
> > > > > > > > will
> > > > > > > > introduce a mismatched virtual alias with the CPU (we don't 
even have a
> > > > > > > > MAIR
> > > > > > > > set up for this memory type). Now, we also have that issue for 
the PTW,
> > > > > > > > but
> > > > > > > > since we always use cache maintenance (i.e. the streaming API) 
for
> > > > > > > > publishing the page-tables to a non-coheren walker, it works 
out.
> > > > > > > > However,
> > > > > > > > if somebody expects IOMMU_LLC to be coherent with a DMA API 
coherent
> > > > > > > > allocation, then they're potentially in for a nasty surprise 
due to the
> > > > > > > > mismatched outer-cacheability attributes.
> > > > > > > >
> > > > > > >
> > > > > > > Can't we add the syscached memory type similar to what is done on 
android?
> > > > > >
> > > > > > Maybe. How does the GPU driver map these things on the CPU side?
> > > > >
> > > > > Currently we use writecombine mappings for everything, although there
> > > > > are some cases that we'd like to use cached (but have not merged
> > > > > patches that would give userspace a way to flush/invalidate)
> > > > >
> > > >
> > > > LLC/system cache doesn't have a relationship with the CPU cache.  Its
> > > > just a
> > > > little accelerator that sits on the connection from the GPU to DDR and
> > > > caches
> > > > accesses. The hint that Sai is suggesting is used to mark the buffers as
> > > > 'no-write-allocate' to prevent GPU write operations from being cached in
> > > > the LLC
> > > > which a) isn't interesting and b) takes up cache space for read
> > > > operations.
> > > >
> > > > Its easiest to think of the LLC as a bonus accelerator that has no cost
> > > > for
> > > > us to use outside of the unfortunate per buffer hint.
> > > >
> > > > We do have to worry about the CPU cache w.r.t I/O coherency (which is a
> > > > different hint) and in that case we have all of concerns that Will
> > > > identified.
> > > >
> > >
> > > For mismatched outer cacheability attributes which Will
> > > mentioned, I was
> > > referring to [1] in android kernel.
> >
> > I've lost track of the conversation here :/
> >
> > When the GPU has a buffer mapped with IOMMU_LLC, is the buffer also
> > mapped
> > into the CPU and with what attributes? Rob said "writecombine for
> > everything" -- does that mean ioremap_wc() / MEMREMAP_WC?
> >
>
> Rob answered this.
>
> > Finally, we need to be careful when we use the word "hint" as
> > "allocation
> > hint" has a specific meaning in the architecture, and if we only
> > mismatch on
> > those then we're actually ok. But I think IOMMU_LLC is more than
> > just a
> > hint, since it actually drives eviction policy (i.e. it enables
> > writeback).
> >
> > Sorry for the pedantry, but I just want to make sure we're all talking
> > about the same things!
> >
>
> Sorry for the confusion which probably was caused by my mentioning of
> android, NWA(no write allocate) is an allocation hint which we can
> ignore
> for now as it is not introduced yet in upstream.
>

Any chance of taking this forward? We do not want to miss out on the small
fps gain when the product gets released.


Do we have a solution to the mismatched virtual alias?



Sorry for the long delay on this thread.

For the mismatched virtual alias question, wasn't this already discussed at
length when initial support for system cache [1] (which was reverted by you)
was added?


Excerpt from there,

"As seen in downstream kernels there are few non-coherent devices which
would not want to allocate in system cache, and therefore would want
Inner/Outer non-cached memory. So, we may want to either override the
attributes per-device, or as you suggested we may want to introduce
another memory type 'sys-cached' that can be added with its separate
infra."

As for DMA API usage, we do not have any upstream users (video will be
one if they decide to upstream that).

[1] 
https://patchwork.kernel.org/project/linux-arm-msm/patch/20180615105329.26800-1-vivek.gau...@codeaurora.org/


Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation


Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag

2021-03-17 Thread Sai Prakash Ranjan

Hi Rob,

On 2021-03-16 22:46, Rob Clark wrote:

...


> >
> > When the GPU has a buffer mapped with IOMMU_LLC, is the buffer also mapped
> > into the CPU and with what attributes? Rob said "writecombine for
> > everything" -- does that mean ioremap_wc() / MEMREMAP_WC?
>
> Currently userspace asks for everything WC, so pgprot_writecombine()
>
> The kernel doesn't enforce this, but so far provides no UAPI to do
> anything useful with non-coherent cached mappings (although there is
> interest to support this)
>

btw, I'm looking at a benchmark (gl_driver2_off) where (after some
other in-flight optimizations land) we end up bottlenecked on writing
to WC cmdstream buffers.  I assume in the current state, WC goes all
the way to main memory rather than just to system cache?



oh, I guess this (mentioned earlier in thread) is what I really want
for this benchmark:

https://android-review.googlesource.com/c/kernel/common/+/1549097/3



You can also check if the system cache lines are allocated for GPU
or not with patch in https://crrev.com/c/2766723

With the above patch applied,
cat /sys/kernel/debug/llcc_stats/llcc_scid_status

The SCIDs for GPU are listed in include/linux/soc/qcom/llcc-qcom.h

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag

2021-03-08 Thread Sai Prakash Ranjan

Hi,

On 2021-02-05 17:38, Sai Prakash Ranjan wrote:

On 2021-02-04 03:16, Will Deacon wrote:

On Tue, Feb 02, 2021 at 11:56:27AM +0530, Sai Prakash Ranjan wrote:

On 2021-02-01 23:50, Jordan Crouse wrote:
> On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote:
> > On Mon, Feb 1, 2021 at 3:16 AM Will Deacon  wrote:
> > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote:
> > > > On 2021-01-29 14:35, Will Deacon wrote:
> > > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote:
> > > > > > +#define IOMMU_LLC(1 << 6)
> > > > >
> > > > > On reflection, I'm a bit worried about exposing this because I think 
it
> > > > > will
> > > > > introduce a mismatched virtual alias with the CPU (we don't even have 
a
> > > > > MAIR
> > > > > set up for this memory type). Now, we also have that issue for the 
PTW,
> > > > > but
> > > > > since we always use cache maintenance (i.e. the streaming API) for
> > > > > publishing the page-tables to a non-coheren walker, it works out.
> > > > > However,
> > > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent
> > > > > allocation, then they're potentially in for a nasty surprise due to 
the
> > > > > mismatched outer-cacheability attributes.
> > > > >
> > > >
> > > > Can't we add the syscached memory type similar to what is done on 
android?
> > >
> > > Maybe. How does the GPU driver map these things on the CPU side?
> >
> > Currently we use writecombine mappings for everything, although there
> > are some cases that we'd like to use cached (but have not merged
> > patches that would give userspace a way to flush/invalidate)
> >
>
> LLC/system cache doesn't have a relationship with the CPU cache.  Its
> just a
> little accelerator that sits on the connection from the GPU to DDR and
> caches
> accesses. The hint that Sai is suggesting is used to mark the buffers as
> 'no-write-allocate' to prevent GPU write operations from being cached in
> the LLC
> which a) isn't interesting and b) takes up cache space for read
> operations.
>
> Its easiest to think of the LLC as a bonus accelerator that has no cost
> for
> us to use outside of the unfortunate per buffer hint.
>
> We do have to worry about the CPU cache w.r.t I/O coherency (which is a
> different hint) and in that case we have all of concerns that Will
> identified.
>

For mismatched outer cacheability attributes which Will mentioned, I 
was

referring to [1] in android kernel.


I've lost track of the conversation here :/

When the GPU has a buffer mapped with IOMMU_LLC, is the buffer also 
mapped

into the CPU and with what attributes? Rob said "writecombine for
everything" -- does that mean ioremap_wc() / MEMREMAP_WC?



Rob answered this.

Finally, we need to be careful when we use the word "hint" as 
"allocation
hint" has a specific meaning in the architecture, and if we only 
mismatch on
those then we're actually ok. But I think IOMMU_LLC is more than just 
a
hint, since it actually drives eviction policy (i.e. it enables 
writeback).


Sorry for the pedantry, but I just want to make sure we're all talking
about the same things!



Sorry for the confusion which probably was caused by my mentioning of
android, NWA(no write allocate) is an allocation hint which we can 
ignore

for now as it is not introduced yet in upstream.



Any chance of taking this forward? We do not want to miss out on the small
fps gain when the product gets released.

Thanks,
Sai
--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag

2021-02-05 Thread Sai Prakash Ranjan

On 2021-02-04 03:16, Will Deacon wrote:

On Tue, Feb 02, 2021 at 11:56:27AM +0530, Sai Prakash Ranjan wrote:

On 2021-02-01 23:50, Jordan Crouse wrote:
> On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote:
> > On Mon, Feb 1, 2021 at 3:16 AM Will Deacon  wrote:
> > > On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote:
> > > > On 2021-01-29 14:35, Will Deacon wrote:
> > > > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote:
> > > > > > +#define IOMMU_LLC(1 << 6)
> > > > >
> > > > > On reflection, I'm a bit worried about exposing this because I think 
it
> > > > > will
> > > > > introduce a mismatched virtual alias with the CPU (we don't even have 
a
> > > > > MAIR
> > > > > set up for this memory type). Now, we also have that issue for the 
PTW,
> > > > > but
> > > > > since we always use cache maintenance (i.e. the streaming API) for
> > > > > publishing the page-tables to a non-coheren walker, it works out.
> > > > > However,
> > > > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent
> > > > > allocation, then they're potentially in for a nasty surprise due to 
the
> > > > > mismatched outer-cacheability attributes.
> > > > >
> > > >
> > > > Can't we add the syscached memory type similar to what is done on 
android?
> > >
> > > Maybe. How does the GPU driver map these things on the CPU side?
> >
> > Currently we use writecombine mappings for everything, although there
> > are some cases that we'd like to use cached (but have not merged
> > patches that would give userspace a way to flush/invalidate)
> >
>
> LLC/system cache doesn't have a relationship with the CPU cache.  Its
> just a
> little accelerator that sits on the connection from the GPU to DDR and
> caches
> accesses. The hint that Sai is suggesting is used to mark the buffers as
> 'no-write-allocate' to prevent GPU write operations from being cached in
> the LLC
> which a) isn't interesting and b) takes up cache space for read
> operations.
>
> Its easiest to think of the LLC as a bonus accelerator that has no cost
> for
> us to use outside of the unfortunate per buffer hint.
>
> We do have to worry about the CPU cache w.r.t I/O coherency (which is a
> different hint) and in that case we have all of concerns that Will
> identified.
>

For mismatched outer cacheability attributes which Will mentioned, I 
was

referring to [1] in android kernel.


I've lost track of the conversation here :/

When the GPU has a buffer mapped with IOMMU_LLC, is the buffer also 
mapped

into the CPU and with what attributes? Rob said "writecombine for
everything" -- does that mean ioremap_wc() / MEMREMAP_WC?



Rob answered this.

Finally, we need to be careful when we use the word "hint" as 
"allocation
hint" has a specific meaning in the architecture, and if we only 
mismatch on

those then we're actually ok. But I think IOMMU_LLC is more than just a
hint, since it actually drives eviction policy (i.e. it enables 
writeback).


Sorry for the pedantry, but I just want to make sure we're all talking
about the same things!



Sorry for the confusion, which probably was caused by my mentioning of
android. NWA (no write allocate) is an allocation hint which we can ignore
for now as it is not introduced yet in upstream.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag

2021-02-02 Thread Sai Prakash Ranjan

On 2021-02-01 23:50, Jordan Crouse wrote:

On Mon, Feb 01, 2021 at 08:20:44AM -0800, Rob Clark wrote:

On Mon, Feb 1, 2021 at 3:16 AM Will Deacon  wrote:
>
> On Fri, Jan 29, 2021 at 03:12:59PM +0530, Sai Prakash Ranjan wrote:
> > On 2021-01-29 14:35, Will Deacon wrote:
> > > On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote:
> > > > Add a new page protection flag IOMMU_LLC which can be used
> > > > by non-coherent masters to set cacheable memory attributes
> > > > for an outer level of cache called as last-level cache or
> > > > system cache. Initial user of this page protection flag is
> > > > the adreno gpu and then can later be used by other clients
> > > > such as video where this can be used for per-buffer based
> > > > mapping.
> > > >
> > > > Signed-off-by: Sai Prakash Ranjan 
> > > > ---
> > > >  drivers/iommu/io-pgtable-arm.c | 3 +++
> > > >  include/linux/iommu.h  | 6 ++
> > > >  2 files changed, 9 insertions(+)
> > > >
> > > > diff --git a/drivers/iommu/io-pgtable-arm.c
> > > > b/drivers/iommu/io-pgtable-arm.c
> > > > index 7439ee7fdcdb..ebe653ef601b 100644
> > > > --- a/drivers/iommu/io-pgtable-arm.c
> > > > +++ b/drivers/iommu/io-pgtable-arm.c
> > > > @@ -415,6 +415,9 @@ static arm_lpae_iopte
> > > > arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,
> > > >   else if (prot & IOMMU_CACHE)
> > > >   pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
> > > >   << ARM_LPAE_PTE_ATTRINDX_SHIFT);
> > > > + else if (prot & IOMMU_LLC)
> > > > + pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE
> > > > + << ARM_LPAE_PTE_ATTRINDX_SHIFT);
> > > >   }
> > > >
> > > >   if (prot & IOMMU_CACHE)
> > > > diff --git a/include/linux/iommu.h b/include/linux/iommu.h
> > > > index ffaa389ea128..1f82057df531 100644
> > > > --- a/include/linux/iommu.h
> > > > +++ b/include/linux/iommu.h
> > > > @@ -31,6 +31,12 @@
> > > >   * if the IOMMU page table format is equivalent.
> > > >   */
> > > >  #define IOMMU_PRIV   (1 << 5)
> > > > +/*
> > > > + * Non-coherent masters can use this page protection flag to set
> > > > cacheable
> > > > + * memory attributes for only a transparent outer level of cache,
> > > > also known as
> > > > + * the last-level or system cache.
> > > > + */
> > > > +#define IOMMU_LLC(1 << 6)
> > >
> > > On reflection, I'm a bit worried about exposing this because I think it
> > > will
> > > introduce a mismatched virtual alias with the CPU (we don't even have a
> > > MAIR
> > > set up for this memory type). Now, we also have that issue for the PTW,
> > > but
> > > since we always use cache maintenance (i.e. the streaming API) for
> > > publishing the page-tables to a non-coheren walker, it works out.
> > > However,
> > > if somebody expects IOMMU_LLC to be coherent with a DMA API coherent
> > > allocation, then they're potentially in for a nasty surprise due to the
> > > mismatched outer-cacheability attributes.
> > >
> >
> > Can't we add the syscached memory type similar to what is done on android?
>
> Maybe. How does the GPU driver map these things on the CPU side?

Currently we use writecombine mappings for everything, although there
are some cases that we'd like to use cached (but have not merged
patches that would give userspace a way to flush/invalidate)

BR,
-R


LLC/system cache doesn't have a relationship with the CPU cache.  Its 
just a
little accelerator that sits on the connection from the GPU to DDR and 
caches
accesses. The hint that Sai is suggesting is used to mark the buffers 
as
'no-write-allocate' to prevent GPU write operations from being cached 
in the LLC
which a) isn't interesting and b) takes up cache space for read 
operations.


Its easiest to think of the LLC as a bonus accelerator that has no cost 
for

us to use outside of the unfortunate per buffer hint.

We do have to worry about the CPU cache w.r.t I/O coherency (which is a
different hint) and in that case we have all of concerns that Will 
identified.




For mismatched outer cacheability attributes which Will mentioned, I was
referring to [1] in android kernel.

[1] https://android-review.googlesource.com/c/kernel/common/+/1549097/3

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel



Re: [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache

2021-01-30 Thread Sai Prakash Ranjan

On 2021-01-20 10:48, Sai Prakash Ranjan wrote:

On 2021-01-11 19:45, Sai Prakash Ranjan wrote:

commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag")
removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went
the memory type setting required for the non-coherent masters to use
system cache. Now that system cache support for GPU is added, we will
need to set the right PTE attribute for GPU buffers to be sys cached.
Without this, the system cache lines are not allocated for GPU.

So the patches in this series introduces a new prot flag IOMMU_LLC,
renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC
and makes GPU the user of this protection flag.

The series slightly depends on following 2 patches posted earlier and
is based on msm-next branch:
 * https://lore.kernel.org/patchwork/patch/1363008/
 * https://lore.kernel.org/patchwork/patch/1363010/

Sai Prakash Ranjan (3):
  iommu/io-pgtable: Rename last-level cache quirk to
IO_PGTABLE_QUIRK_PTW_LLC
  iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
  drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 3 +++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +-
 drivers/gpu/drm/msm/msm_iommu.c | 3 +++
 drivers/gpu/drm/msm/msm_mmu.h   | 4 
 drivers/iommu/io-pgtable-arm.c  | 9 ++---
 include/linux/io-pgtable.h  | 6 +++---
 include/linux/iommu.h   | 6 ++
 7 files changed, 26 insertions(+), 7 deletions(-)


base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709



Gentle Ping!



Gentle Ping!!

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag

2021-01-30 Thread Sai Prakash Ranjan

On 2021-01-29 14:35, Will Deacon wrote:

On Mon, Jan 11, 2021 at 07:45:04PM +0530, Sai Prakash Ranjan wrote:

Add a new page protection flag IOMMU_LLC which can be used
by non-coherent masters to set cacheable memory attributes
for an outer level of cache called as last-level cache or
system cache. Initial user of this page protection flag is
the adreno gpu and then can later be used by other clients
such as video where this can be used for per-buffer based
mapping.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/io-pgtable-arm.c | 3 +++
 include/linux/iommu.h  | 6 ++
 2 files changed, 9 insertions(+)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c

index 7439ee7fdcdb..ebe653ef601b 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct arm_lpae_io_pgtable *data,

else if (prot & IOMMU_CACHE)
pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+   else if (prot & IOMMU_LLC)
+   pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE
+   << ARM_LPAE_PTE_ATTRINDX_SHIFT);
}

if (prot & IOMMU_CACHE)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ffaa389ea128..1f82057df531 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -31,6 +31,12 @@
  * if the IOMMU page table format is equivalent.
  */
 #define IOMMU_PRIV (1 << 5)
+/*
+ * Non-coherent masters can use this page protection flag to set cacheable
+ * memory attributes for only a transparent outer level of cache, also known as
+ * the last-level or system cache.
+ */
+#define IOMMU_LLC  (1 << 6)


On reflection, I'm a bit worried about exposing this because I think it will
introduce a mismatched virtual alias with the CPU (we don't even have a MAIR
set up for this memory type). Now, we also have that issue for the PTW, but
since we always use cache maintenance (i.e. the streaming API) for
publishing the page-tables to a non-coherent walker, it works out. However,
if somebody expects IOMMU_LLC to be coherent with a DMA API coherent
allocation, then they're potentially in for a nasty surprise due to the
mismatched outer-cacheability attributes.



Can't we add the syscached memory type similar to what is done on 
android?


So I can take patch (1) as a trivial rename, but unfortunately I think this
needs more thought before exposing it beyond the PTW.



That wouldn't be of much use, would it? :) We would be losing the perf
gain for GPU usecases without the rest of the patches.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache

2021-01-21 Thread Sai Prakash Ranjan

On 2021-01-11 19:45, Sai Prakash Ranjan wrote:

commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag")
removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went
the memory type setting required for the non-coherent masters to use
system cache. Now that system cache support for GPU is added, we will
need to set the right PTE attribute for GPU buffers to be sys cached.
Without this, the system cache lines are not allocated for GPU.

So the patches in this series introduces a new prot flag IOMMU_LLC,
renames IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC
and makes GPU the user of this protection flag.

The series slightly depends on following 2 patches posted earlier and
is based on msm-next branch:
 * https://lore.kernel.org/patchwork/patch/1363008/
 * https://lore.kernel.org/patchwork/patch/1363010/

Sai Prakash Ranjan (3):
  iommu/io-pgtable: Rename last-level cache quirk to
IO_PGTABLE_QUIRK_PTW_LLC
  iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
  drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 3 +++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +-
 drivers/gpu/drm/msm/msm_iommu.c | 3 +++
 drivers/gpu/drm/msm/msm_mmu.h   | 4 
 drivers/iommu/io-pgtable-arm.c  | 9 ++---
 include/linux/io-pgtable.h  | 6 +++---
 include/linux/iommu.h   | 6 ++
 7 files changed, 26 insertions(+), 7 deletions(-)


base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709



Gentle Ping!

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Freedreno] [PATCH] drm/msm: Only enable A6xx LLCC code on A6xx

2021-01-11 Thread Sai Prakash Ranjan

Hi Jordan,

On 2021-01-11 21:41, Jordan Crouse wrote:

On Mon, Jan 11, 2021 at 09:54:12AM +0530, Sai Prakash Ranjan wrote:

Hi Rob,

On 2021-01-08 22:16, Rob Clark wrote:
>On Fri, Jan 8, 2021 at 6:05 AM Sai Prakash Ranjan
> wrote:
>>
>>On 2021-01-08 19:09, Konrad Dybcio wrote:
>>>> Konrad, can you please test this below change without your change?
>>>
>>> This brings no difference, a BUG still happens. We're still calling
>>> to_a6xx_gpu on ANY device that's probed! Too bad it won't turn my A330
>>> into an A640..
>>>
>>> Also, relying on disabling LLCC in the config is out of question as it
>>> makes the arm32 kernel not compile with DRM/MSM and it just removes
>>> the functionality on devices with a6xx.. (unless somebody removes the
>>> dependency on it, which in my opinion is even worse and will cause
>>> more problems for developers!).
>>>
>>
>>Disabling LLCC is not the suggestion, I was under the impression that
>>was the cause here for the smmu bug. Anyways, the check for llc slice
>>in case llcc is disabled is not correct as well. I will send a patch for
>>that as well.
>>
>>> The bigger question is how and why did that piece of code ever make it
>>> to adreno_gpu.c and not a6xx_gpu.c?
>>>
>>
>>My mistake, I will move it.
>
>Thanks, since we don't have kernel-CI coverage for gpu, and there
>probably isn't one person who has all the different devices supported
>(or enough hours in the day to test them all), it is probably
>better/safer to keep things in the backend code that is specific to a
>given generation.
>

Agreed, I will post this change soon and will introduce some feature
check as well because we will need it for iommu prot flag as per 
discussion
here - 
https://lore.kernel.org/lkml/20210108181830.GA5457@willie-the-truck/


>>> To solve it in a cleaner way I propose to move it to an a6xx-specific
>>> file, or if it's going to be used with next-gen GPUs, perhaps manage
>>> calling of this code via an adreno quirk/feature in adreno_device.c.
>>> Now that I think about it, A5xx GPMU en/disable could probably managed
>>> like that, instead of using tons of if-statements for each GPU model
>>> that has it..
>>>
>>> While we're at it, do ALL (and I truly do mean ALL, including the
>>> low-end ones, this will be important later on) A6xx GPUs make use of
>>> that feature?
>>>
>>
>>I do not have a list of all A6XX GPUs with me currently, but from what
>>I know, A618, A630, A640, A650 has the support.
>>
>
>From the PoV of bringing up new a6xx, we should probably consider that
>some of them may not *yet* have LLCC enabled.  I have an 8cx laptop
>and once I find time to get the display working, the next step would
>be bringing up a680.. and I'd probably like to start without LLCC..
>

Right, once I move the LLCC code to a6xx specific address space 
creation,

without LLCC slices for GPU specified in qcom llcc driver, we will not
be using it.


Right. The problem here was that we were assuming an a6xx container in 
generic
code. Testing the existence of LLCC or not is a different problem but 
it is my
understanding that if we set the attribute without LLCC enabled it just 
gets

ignored. Is that correct Sai?



Yes that is correct, I just confirmed now with LLCC team.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCH 3/3] drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers

2021-01-11 Thread Sai Prakash Ranjan
Use the newly introduced IOMMU_LLC page protection flag to map
GPU buffers. This makes sure that the proper stage-1 PTE
attributes are set for GPU buffers to use the system cache. It
also introduces an MMU_FEATURE_USE_LLC feature bit to mark GPUs
that support LLC; the bit is set in the target-specific address
space creation, in this case for A6XX GPUs.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 3 +++
 drivers/gpu/drm/msm/msm_iommu.c   | 3 +++
 drivers/gpu/drm/msm/msm_mmu.h | 4 
 3 files changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 3c7ad51732bb..23da21b6f0ff 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1266,6 +1266,9 @@ a6xx_create_address_space(struct msm_gpu *gpu, struct 
platform_device *pdev)
return ERR_CAST(mmu);
}
 
+   if (!IS_ERR_OR_NULL(a6xx_gpu->llc_slice))
+   mmu->features |= MMU_FEATURE_USE_LLC;
+
/*
 * Use the aperture start or SZ_16M, whichever is greater. This will
 * ensure that we align with the allocated pagetable range while still
diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
index 22ac7c692a81..a329f9836422 100644
--- a/drivers/gpu/drm/msm/msm_iommu.c
+++ b/drivers/gpu/drm/msm/msm_iommu.c
@@ -235,6 +235,9 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova,
if (iova & BIT_ULL(48))
iova |= GENMASK_ULL(63, 49);
 
+   if (mmu->features & MMU_FEATURE_USE_LLC)
+   prot |= IOMMU_LLC;
+
ret = iommu_map_sgtable(iommu->domain, iova, sgt, prot);
WARN_ON(!ret);
 
diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h
index 61ade89d9e48..efcd1939c98e 100644
--- a/drivers/gpu/drm/msm/msm_mmu.h
+++ b/drivers/gpu/drm/msm/msm_mmu.h
@@ -23,12 +23,16 @@ enum msm_mmu_type {
MSM_MMU_IOMMU_PAGETABLE,
 };
 
+/* MMU features */
+#define MMU_FEATURE_USE_LLCBIT(0)
+
 struct msm_mmu {
const struct msm_mmu_funcs *funcs;
struct device *dev;
int (*handler)(void *arg, unsigned long iova, int flags);
void *arg;
enum msm_mmu_type type;
+   u32 features;
 };
 
 static inline void msm_mmu_init(struct msm_mmu *mmu, struct device *dev,
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCH 2/3] iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag

2021-01-11 Thread Sai Prakash Ranjan
Add a new page protection flag IOMMU_LLC which can be used
by non-coherent masters to set cacheable memory attributes
for an outer level of cache known as the last-level cache or
system cache. The initial user of this page protection flag is
the Adreno GPU; it can later be used by other clients such as
video, where it allows per-buffer mappings into the system
cache.
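
For illustration only (not part of this patch), a non-coherent master
that wants its buffers allocated in the system cache would simply OR
the new flag into the IOMMU protection bits when mapping; the domain,
iova and physical address below are placeholders:

#include <linux/iommu.h>
#include <linux/sizes.h>

/* Sketch: map one 4K buffer as LLC-cacheable for a non-coherent master */
static int example_map_llc(struct iommu_domain *domain,
			   unsigned long iova, phys_addr_t paddr)
{
	int prot = IOMMU_READ | IOMMU_WRITE | IOMMU_LLC;

	return iommu_map(domain, iova, paddr, SZ_4K, prot);
}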

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/io-pgtable-arm.c | 3 +++
 include/linux/iommu.h  | 6 ++
 2 files changed, 9 insertions(+)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 7439ee7fdcdb..ebe653ef601b 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -415,6 +415,9 @@ static arm_lpae_iopte arm_lpae_prot_to_pte(struct 
arm_lpae_io_pgtable *data,
else if (prot & IOMMU_CACHE)
pte |= (ARM_LPAE_MAIR_ATTR_IDX_CACHE
<< ARM_LPAE_PTE_ATTRINDX_SHIFT);
+   else if (prot & IOMMU_LLC)
+   pte |= (ARM_LPAE_MAIR_ATTR_IDX_INC_OCACHE
+   << ARM_LPAE_PTE_ATTRINDX_SHIFT);
}
 
if (prot & IOMMU_CACHE)
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index ffaa389ea128..1f82057df531 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -31,6 +31,12 @@
  * if the IOMMU page table format is equivalent.
  */
 #define IOMMU_PRIV (1 << 5)
+/*
+ * Non-coherent masters can use this page protection flag to set cacheable
+ * memory attributes for only a transparent outer level of cache, also known as
+ * the last-level or system cache.
+ */
+#define IOMMU_LLC  (1 << 6)
 
 struct iommu_ops;
 struct iommu_group;
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCH 0/3] iommu/drm/msm: Allow non-coherent masters to use system cache

2021-01-11 Thread Sai Prakash Ranjan
commit ecd7274fb4cd ("iommu: Remove unused IOMMU_SYS_CACHE_ONLY flag")
removed unused IOMMU_SYS_CACHE_ONLY prot flag and along with it went
the memory type setting required for the non-coherent masters to use
system cache. Now that system cache support for GPU is added, we will
need to set the right PTE attribute for GPU buffers to be sys cached.
Without this, the system cache lines are not allocated for GPU.

So the patches in this series introduce a new prot flag IOMMU_LLC,
rename IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to IO_PGTABLE_QUIRK_PTW_LLC
and make the GPU the user of this protection flag.

The series slightly depends on following 2 patches posted earlier and
is based on msm-next branch:
 * https://lore.kernel.org/patchwork/patch/1363008/
 * https://lore.kernel.org/patchwork/patch/1363010/

Sai Prakash Ranjan (3):
  iommu/io-pgtable: Rename last-level cache quirk to
IO_PGTABLE_QUIRK_PTW_LLC
  iommu/io-pgtable-arm: Add IOMMU_LLC page protection flag
  drm/msm: Use IOMMU_LLC page protection flag to map gpu buffers

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 3 +++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +-
 drivers/gpu/drm/msm/msm_iommu.c | 3 +++
 drivers/gpu/drm/msm/msm_mmu.h   | 4 
 drivers/iommu/io-pgtable-arm.c  | 9 ++---
 include/linux/io-pgtable.h  | 6 +++---
 include/linux/iommu.h   | 6 ++
 7 files changed, 26 insertions(+), 7 deletions(-)


base-commit: 00fd44a1a4700718d5d962432b55c09820f7e709
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCH 1/3] iommu/io-pgtable: Rename last-level cache quirk to IO_PGTABLE_QUIRK_PTW_LLC

2021-01-11 Thread Sai Prakash Ranjan
Rename the last-level cache quirk IO_PGTABLE_QUIRK_ARM_OUTER_WBWA,
which is used to set the required TCR attributes for a non-coherent
page table walker, to IO_PGTABLE_QUIRK_PTW_LLC so that it is more
generic and in sync with the upcoming IOMMU_LLC page protection flag.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 2 +-
 drivers/iommu/io-pgtable-arm.c  | 6 +++---
 include/linux/io-pgtable.h  | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c 
b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 0f184c3dd9d9..82b5e4969195 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -190,7 +190,7 @@ void adreno_set_llc_attributes(struct iommu_domain *iommu)
 {
struct io_pgtable_domain_attr pgtbl_cfg;
 
-   pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA;
+   pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_PTW_LLC;
	iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg);
 }
 
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 7c9ea9d7874a..7439ee7fdcdb 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -762,7 +762,7 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NON_STRICT |
IO_PGTABLE_QUIRK_ARM_TTBR1 |
-   IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
+   IO_PGTABLE_QUIRK_PTW_LLC))
return NULL;
 
data = arm_lpae_alloc_pgtable(cfg);
@@ -774,12 +774,12 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
tcr->sh = ARM_LPAE_TCR_SH_IS;
tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
-   if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)
+   if (cfg->quirks & IO_PGTABLE_QUIRK_PTW_LLC)
goto out_free_data;
} else {
tcr->sh = ARM_LPAE_TCR_SH_OS;
tcr->irgn = ARM_LPAE_TCR_RGN_NC;
-   if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
+   if (!(cfg->quirks & IO_PGTABLE_QUIRK_PTW_LLC))
tcr->orgn = ARM_LPAE_TCR_RGN_NC;
else
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index fb4d5a763e0c..6f996a817441 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -87,8 +87,8 @@ struct io_pgtable_cfg {
 * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
 *  for use in the upper half of a split address space.
 *
-* IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability
-*  attributes set in the TCR for a non-coherent page-table walker.
+* IO_PGTABLE_QUIRK_PTW_LLC: Override the outer-cacheability attributes
+*  set in the TCR for a non-coherent page-table walker.
 */
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS   BIT(1)
@@ -96,7 +96,7 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_MTK_EXTBIT(3)
#define IO_PGTABLE_QUIRK_NON_STRICT BIT(4)
#define IO_PGTABLE_QUIRK_ARM_TTBR1  BIT(5)
-   #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6)
+   #define IO_PGTABLE_QUIRK_PTW_LLCBIT(6)
unsigned long   quirks;
unsigned long   pgsize_bitmap;
unsigned intias;
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [Freedreno] [PATCH] drm/msm: Only enable A6xx LLCC code on A6xx

2021-01-11 Thread Sai Prakash Ranjan

Hi Rob,

On 2021-01-08 22:16, Rob Clark wrote:

On Fri, Jan 8, 2021 at 6:05 AM Sai Prakash Ranjan
 wrote:


On 2021-01-08 19:09, Konrad Dybcio wrote:
>> Konrad, can you please test this below change without your change?
>
> This brings no difference, a BUG still happens. We're still calling
> to_a6xx_gpu on ANY device that's probed! Too bad it won't turn my A330
> into an A640..
>
> Also, relying on disabling LLCC in the config is out of question as it
> makes the arm32 kernel not compile with DRM/MSM and it just removes
> the functionality on devices with a6xx.. (unless somebody removes the
> dependency on it, which in my opinion is even worse and will cause
> more problems for developers!).
>

Disabling LLCC is not the suggestion, I was under the impression that
was the cause of the smmu bug here. Anyway, the check for the llc slice
in case llcc is disabled is not correct either; I will send a patch
for that as well.

> The bigger question is how and why did that piece of code ever make it
> to adreno_gpu.c and not a6xx_gpu.c?
>

My mistake, I will move it.


Thanks, since we don't have kernel-CI coverage for gpu, and there
probably isn't one person who has all the different devices supported
(or enough hours in the day to test them all), it is probably
better/safer to keep things in the backend code that is specific to a
given generation.



Agreed, I will post this change soon and will introduce some feature
check as well because we will need it for iommu prot flag as per discussion
here - https://lore.kernel.org/lkml/20210108181830.GA5457@willie-the-truck/



> To solve it in a cleaner way I propose to move it to an a6xx-specific
> file, or if it's going to be used with next-gen GPUs, perhaps manage
> calling of this code via an adreno quirk/feature in adreno_device.c.
> Now that I think about it, A5xx GPMU en/disable could probably managed
> like that, instead of using tons of if-statements for each GPU model
> that has it..
>
> While we're at it, do ALL (and I truly do mean ALL, including the
> low-end ones, this will be important later on) A6xx GPUs make use of
> that feature?
>

I do not have a list of all A6XX GPUs with me currently, but from what
I know, A618, A630, A640 and A650 have support.



From the PoV of bringing up new a6xx, we should probably consider that
some of them may not *yet* have LLCC enabled.  I have an 8cx laptop
and once I find time to get the display working, the next step would
be bringing up a680.. and I'd probably like to start without LLCC..



Right, once I move the LLCC code to a6xx specific address space creation,
without LLCC slices for GPU specified in qcom llcc driver, we will not
be using it.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH] drm/msm: Only enable A6xx LLCC code on A6xx

2021-01-09 Thread Sai Prakash Ranjan
Hi Rob, Konrad,

On 2021-01-07 22:56, Rob Clark wrote:
> On Wed, Jan 6, 2021 at 8:50 PM Sai Prakash Ranjan
>  wrote:
>>
>> On 2021-01-05 01:00, Konrad Dybcio wrote:
>> > Using this code on A5xx (and probably older too) causes a
>> > smmu bug.
>> >
>> > Fixes: 474dadb8b0d5 ("drm/msm/a6xx: Add support for using system
>> > cache(LLC)")
>> > Signed-off-by: Konrad Dybcio 
>> > Tested-by: AngeloGioacchino Del Regno
>> > 
>> > ---
>>
>> Reviewed-by: Sai Prakash Ranjan 
>>
>> >  drivers/gpu/drm/msm/adreno/adreno_gpu.c | 21 -
>> >  drivers/gpu/drm/msm/adreno/adreno_gpu.h |  5 +
>> >  2 files changed, 17 insertions(+), 9 deletions(-)
>> >
>> > diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> > b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> > index 6cf9975e951e..f09175698827 100644
>> > --- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> > +++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
>> > @@ -191,8 +191,6 @@ adreno_iommu_create_address_space(struct msm_gpu
>> > *gpu,
>> >   struct platform_device *pdev)
>> >  {
>> >   struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>> > - struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>> > - struct io_pgtable_domain_attr pgtbl_cfg;
>> >   struct iommu_domain *iommu;
>> >   struct msm_mmu *mmu;
> >   struct msm_gem_address_space *aspace;
>> > @@ -202,13 +200,18 @@ adreno_iommu_create_address_space(struct msm_gpu
>> > *gpu,
>> >   if (!iommu)
>> >   return NULL;
>> >
>> > - /*
>> > -  * This allows GPU to set the bus attributes required to use system
>> > -  * cache on behalf of the iommu page table walker.
>> > -  */
>> > - if (!IS_ERR(a6xx_gpu->htw_llc_slice)) {
>> > - pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA;
>> > - iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG,
>> > &pgtbl_cfg);
>> > +
>> > + if (adreno_is_a6xx(adreno_gpu)) {
>> > + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>> > + struct io_pgtable_domain_attr pgtbl_cfg;
>> > + /*
>> > + * This allows GPU to set the bus attributes required to use 
>> > system
>> > + * cache on behalf of the iommu page table walker.
>> > + */
>> > + if (!IS_ERR(a6xx_gpu->htw_llc_slice)) {
>> > + pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA;
>> > + iommu_domain_set_attr(iommu, 
>> > DOMAIN_ATTR_IO_PGTABLE_CFG,
>> > &pgtbl_cfg);
>> > + }
>
> I'm applying for -fixes as this is an obvious problem..  But kinda
> thinking that we should try to move it into an a6xx specific
> create_address_space() (or wrapper for the generic fxn)
>
> Sai/Jordan, could I talk one of you into trying to clean this up
> better for next cycle?
>

Looking more closely (sorry, I should have done this before), the quirk
setting is already guarded by the htw_llc_slice check, but that check is
not correct when LLCC is disabled, i.e. CONFIG_QCOM_LLCC=n. When LLCC is
disabled, htw_llc_slice is set to NULL and the !IS_ERR() check passes
because it does not handle NULL, so the quirk is set and causes the bug.
The proper fix is to use IS_ERR_OR_NULL() for the check.
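
To spell out why (a sketch, assuming the usual !CONFIG_QCOM_LLCC stub in
llcc-qcom.h where llcc_slice_getd() returns NULL rather than an ERR_PTR):

#include <linux/bug.h>
#include <linux/err.h>

/* Sketch of the failure mode, not actual driver code */
static void llc_check_example(void)
{
	void *htw_llc_slice = NULL;	/* what the disabled-LLCC stub hands back */

	WARN_ON(!IS_ERR(htw_llc_slice));	 /* fires: old check lets NULL through */
	WARN_ON(!IS_ERR_OR_NULL(htw_llc_slice)); /* quiet: fixed check rejects NULL */
}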

Konrad, can you please test this below change without your change?

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 130661898546..3b798e883f82 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1117,7 +1117,7 @@ static void a6xx_llc_slices_init(struct platform_device 
*pdev,
a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);

-   if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice))
+   if (IS_ERR_OR_NULL(a6xx_gpu->llc_slice) && 
IS_ERR_OR_NULL(a6xx_gpu->htw_llc_slice))
a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL);
 }

diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c 
b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 6cf9975e951e..dbd5cacddb9c 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -206,7 +206,7 @@ adreno_iommu_create_address_space(struct msm_gpu *gpu,
 * This allows GPU to set the bus attributes required to use system
 * cache on behalf of the iommu page table walker.
 */
-   if (!IS_ERR(a6xx_gpu->htw_llc_slice)) {
+   if (!IS_ERR_OR_NULL(a6xx_gpu->htw_llc_slice)) {
pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA;
iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, 
&pgtbl_cfg);
}


Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH] drm/msm: Only enable A6xx LLCC code on A6xx

2021-01-09 Thread Sai Prakash Ranjan

On 2021-01-08 19:09, Konrad Dybcio wrote:

Konrad, can you please test this below change without your change?


This brings no difference, a BUG still happens. We're still calling
to_a6xx_gpu on ANY device that's probed! Too bad it won't turn my A330
into an A640..

Also, relying on disabling LLCC in the config is out of question as it
makes the arm32 kernel not compile with DRM/MSM and it just removes
the functionality on devices with a6xx.. (unless somebody removes the
dependency on it, which in my opinion is even worse and will cause
more problems for developers!).



Disabling LLCC is not the suggestion, I was under the impression that
was the cause here for the smmu bug. Anyways, the check for llc slice
in case llcc is disabled is not correct as well. I will send a patch for
that as well.


The bigger question is how and why did that piece of code ever make it
to adreno_gpu.c and not a6xx_gpu.c?



My mistake, I will move it.


To solve it in a cleaner way I propose to move it to an a6xx-specific
file, or if it's going to be used with next-gen GPUs, perhaps manage
calling of this code via an adreno quirk/feature in adreno_device.c.
Now that I think about it, A5xx GPMU en/disable could probably managed
like that, instead of using tons of if-statements for each GPU model
that has it..

While we're at it, do ALL (and I truly do mean ALL, including the
low-end ones, this will be important later on) A6xx GPUs make use of
that feature?



I do not have a list of all A6XX GPUs with me currently, but from what
I know, A618, A630, A640 and A650 have support.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCH] drm/msm: Only enable A6xx LLCC code on A6xx

2021-01-07 Thread Sai Prakash Ranjan

On 2021-01-05 01:00, Konrad Dybcio wrote:

Using this code on A5xx (and probably older too) causes a
smmu bug.

Fixes: 474dadb8b0d5 ("drm/msm/a6xx: Add support for using system 
cache(LLC)")

Signed-off-by: Konrad Dybcio 
Tested-by: AngeloGioacchino Del Regno 


---


Reviewed-by: Sai Prakash Ranjan 


 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 21 -
 drivers/gpu/drm/msm/adreno/adreno_gpu.h |  5 +
 2 files changed, 17 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
index 6cf9975e951e..f09175698827 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.c
@@ -191,8 +191,6 @@ adreno_iommu_create_address_space(struct msm_gpu 
*gpu,

struct platform_device *pdev)
 {
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
-   struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
-   struct io_pgtable_domain_attr pgtbl_cfg;
struct iommu_domain *iommu;
struct msm_mmu *mmu;
struct msm_gem_address_space *aspace;
@@ -202,13 +200,18 @@ adreno_iommu_create_address_space(struct msm_gpu 
*gpu,

if (!iommu)
return NULL;

-   /*
-* This allows GPU to set the bus attributes required to use system
-* cache on behalf of the iommu page table walker.
-*/
-   if (!IS_ERR(a6xx_gpu->htw_llc_slice)) {
-   pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA;
-		iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, 
&pgtbl_cfg);

+
+   if (adreno_is_a6xx(adreno_gpu)) {
+   struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
+   struct io_pgtable_domain_attr pgtbl_cfg;
+   /*
+   * This allows GPU to set the bus attributes required to use 
system
+   * cache on behalf of the iommu page table walker.
+   */
+   if (!IS_ERR(a6xx_gpu->htw_llc_slice)) {
+   pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA;
+			iommu_domain_set_attr(iommu, DOMAIN_ATTR_IO_PGTABLE_CFG, 
&pgtbl_cfg);

+   }
}

	mmu = msm_iommu_new(&pdev->dev, iommu);
diff --git a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
index 4574d85c5680..08421fa54a50 100644
--- a/drivers/gpu/drm/msm/adreno/adreno_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/adreno_gpu.h
@@ -226,6 +226,11 @@ static inline int adreno_is_a540(struct adreno_gpu 
*gpu)

return gpu->revn == 540;
 }

+static inline bool adreno_is_a6xx(struct adreno_gpu *gpu)
+{
+   return ((gpu->revn < 700 && gpu->revn > 599));
+}
+
 static inline int adreno_is_a618(struct adreno_gpu *gpu)
 {
return gpu->revn == 618;


--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv10 4/9] iommu/arm-smmu: Move non-strict mode to use io_pgtable_domain_attr

2020-11-25 Thread Sai Prakash Ranjan
Now that we have a struct io_pgtable_domain_attr with quirks,
use that for non_strict mode as well thereby removing the need
for more members of arm_smmu_domain in the future.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 15 +--
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 -
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4b9b10fe50ed..d8979bb71fc0 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -786,9 +786,6 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
goto out_clear_smmu;
}
 
-   if (smmu_domain->non_strict)
-   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
-
if (smmu_domain->pgtbl_cfg.quirks)
pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;
 
@@ -1526,9 +1523,12 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
break;
case IOMMU_DOMAIN_DMA:
switch (attr) {
-   case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-   *(int *)data = smmu_domain->non_strict;
+   case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE: {
+   bool non_strict = smmu_domain->pgtbl_cfg.quirks &
+ IO_PGTABLE_QUIRK_NON_STRICT;
+   *(int *)data = non_strict;
return 0;
+   }
default:
return -ENODEV;
}
@@ -1578,7 +1578,10 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
case IOMMU_DOMAIN_DMA:
switch (attr) {
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-   smmu_domain->non_strict = *(int *)data;
+   if (*(int *)data)
+   smmu_domain->pgtbl_cfg.quirks |= 
IO_PGTABLE_QUIRK_NON_STRICT;
+   else
+   smmu_domain->pgtbl_cfg.quirks &= 
~IO_PGTABLE_QUIRK_NON_STRICT;
break;
default:
ret = -ENODEV;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index bb5a419f240f..cb7ca3a444c9 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -368,7 +368,6 @@ struct arm_smmu_domain {
const struct iommu_flush_ops*flush_ops;
struct arm_smmu_cfg cfg;
enum arm_smmu_domain_stage  stage;
-   boolnon_strict;
struct mutexinit_mutex; /* Protects smmu pointer */
spinlock_t  cb_lock; /* Serialises ATS1* ops and 
TLB syncs */
struct iommu_domain domain;
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv10 7/9] drm/msm/a6xx: Add support for using system cache on MMU500 based targets

2020-11-25 Thread Sai Prakash Ranjan
From: Jordan Crouse 

GPU targets with an MMU-500 attached have a slightly different process for
enabling system cache. Use the compatible string on the IOMMU phandle
to see if an MMU-500 is attached and modify the programming sequence
accordingly.
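
Condensed, the detection is just a check on the compatible of the GPU's
"iommus" phandle; a sketch of the same logic as in the hunk below (the
helper name here is made up):

#include <linux/of.h>
#include <linux/platform_device.h>

/* Sketch: return true if the GPU sits behind an ARM MMU-500 */
static bool a6xx_has_mmu500(struct platform_device *pdev)
{
	struct device_node *phandle;
	bool ret;

	phandle = of_parse_phandle(pdev->dev.of_node, "iommus", 0);
	ret = phandle && of_device_is_compatible(phandle, "arm,mmu-500");
	of_node_put(phandle);

	return ret;
}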

Signed-off-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 +--
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h |  1 +
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 95c98c642876..3f8b92da8cba 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1042,6 +1042,8 @@ static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
 
 static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
 {
+   struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
+   struct msm_gpu *gpu = &adreno_gpu->base;
u32 cntl1_regval = 0;
 
if (IS_ERR(a6xx_gpu->llc_mmio))
@@ -1055,11 +1057,17 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
   (gpu_scid << 15) | (gpu_scid << 20);
}
 
+   /*
+* For targets with a MMU500, activate the slice but don't program the
+* register.  The XBL will take care of that.
+*/
if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
-   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+   if (!a6xx_gpu->have_mmu500) {
+   u32 gpuhtw_scid = 
llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
 
-   gpuhtw_scid &= 0x1f;
-   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), 
gpuhtw_scid);
+   }
}
 
if (cntl1_regval) {
@@ -1067,13 +1075,20 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
 * Program the slice IDs for the various GPU blocks and GPU MMU
 * pagetables
 */
-   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
-
-   /*
-* Program cacheability overrides to not allocate cache lines on
-* a write miss
-*/
-   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 
0xF, 0x03);
+   if (a6xx_gpu->have_mmu500)
+   gpu_rmw(gpu, REG_A6XX_GBIF_SCACHE_CNTL1, GENMASK(24, 0),
+   cntl1_regval);
+   else {
+   a6xx_llc_write(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache
+* lines on a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 
0x03);
+   }
}
 }
 
@@ -1086,10 +1101,21 @@ static void a6xx_llc_slices_destroy(struct a6xx_gpu 
*a6xx_gpu)
 static void a6xx_llc_slices_init(struct platform_device *pdev,
struct a6xx_gpu *a6xx_gpu)
 {
+   struct device_node *phandle;
+
a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
if (IS_ERR(a6xx_gpu->llc_mmio))
return;
 
+   /*
+* There is a different programming path for targets with an mmu500
+* attached, so detect if that is the case
+*/
+   phandle = of_parse_phandle(pdev->dev.of_node, "iommus", 0);
+   a6xx_gpu->have_mmu500 = (phandle &&
+   of_device_is_compatible(phandle, "arm,mmu-500"));
+   of_node_put(phandle);
+
a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
 
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
index 9e6079af679c..e793d329e77b 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
@@ -32,6 +32,7 @@ struct a6xx_gpu {
void __iomem *llc_mmio;
void *llc_slice;
void *htw_llc_slice;
+   bool have_mmu500;
 };
 
 #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv10 8/9] iommu: arm-smmu-impl: Use table to list QCOM implementations

2020-11-25 Thread Sai Prakash Ranjan
Use a table and of_match_node() to match the qcom implementation
instead of multiple of_device_is_compatible() calls for each
QCOM SMMU implementation.

Signed-off-by: Sai Prakash Ranjan 
Acked-by: Will Deacon 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |  9 +
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 -
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |  1 -
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 7fed89c9d18a..26e2734eb4d7 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -214,14 +214,7 @@ struct arm_smmu_device *arm_smmu_impl_init(struct 
arm_smmu_device *smmu)
if (of_device_is_compatible(np, "nvidia,tegra194-smmu"))
return nvidia_smmu_impl_init(smmu);
 
-   if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sc7180-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8150-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8250-smmu-500"))
-   return qcom_smmu_impl_init(smmu);
-
-   if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu"))
-   return qcom_adreno_smmu_impl_init(smmu);
+   smmu = qcom_smmu_impl_init(smmu);
 
if (of_device_is_compatible(np, "marvell,ap806-smmu-500"))
	smmu->impl = &mrvl_mmu500_impl;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index d0636c803a36..add1859b2899 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -318,12 +318,23 @@ static struct arm_smmu_device *qcom_smmu_create(struct 
arm_smmu_device *smmu,
	return &qsmmu->smmu;
 }
 
+static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = {
+   { .compatible = "qcom,sc7180-smmu-500" },
+   { .compatible = "qcom,sdm845-smmu-500" },
+   { .compatible = "qcom,sm8150-smmu-500" },
+   { .compatible = "qcom,sm8250-smmu-500" },
+   { }
+};
+
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
 {
-   return qcom_smmu_create(smmu, &qcom_smmu_impl);
-}
+   const struct device_node *np = smmu->dev->of_node;
 
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device 
*smmu)
-{
-   return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl);
+   if (of_match_node(qcom_smmu_impl_of_match, np))
+   return qcom_smmu_create(smmu, &qcom_smmu_impl);
+
+   if (of_device_is_compatible(np, "qcom,adreno-smmu"))
+   return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl);
+
+   return smmu;
 }
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index cb7ca3a444c9..d2a2d1bc58ba 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -523,7 +523,6 @@ static inline void arm_smmu_writeq(struct arm_smmu_device 
*smmu, int page,
 struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu);
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device 
*smmu);
 
 void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx);
 int arm_mmu500_reset(struct arm_smmu_device *smmu);
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCHv9 2/8] iommu/arm-smmu: Add domain attribute for pagetable configuration

2020-11-25 Thread Sai Prakash Ranjan

On 2020-11-25 03:11, Will Deacon wrote:

On Mon, Nov 23, 2020 at 10:35:55PM +0530, Sai Prakash Ranjan wrote:

Add iommu domain attribute for pagetable configuration which
initially will be used to set quirks like for system cache aka
last level cache to be used by client drivers like GPU to set
right attributes for caching the hardware pagetables into the
system cache and later can be extended to include other page
table configuration data.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 +
 include/linux/io-pgtable.h|  4 
 include/linux/iommu.h |  1 +
 4 files changed, 26 insertions(+)


Given that we're heading for a v10 to address my comments on patch 3,
then I guess you may as well split this into two patches so that I can
share just the attribute with Rob rather than the driver parts.

Please keep it all as one series though, with the common parts at the
beginning, and I'll figure it out.



Ok I will split up and send v10.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv10 9/9] iommu: arm-smmu-impl: Add a space before open parenthesis

2020-11-25 Thread Sai Prakash Ranjan
Fix the checkpatch warning for space required before the open
parenthesis.

Signed-off-by: Sai Prakash Ranjan 
Acked-by: Will Deacon 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 26e2734eb4d7..136872e77195 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -12,7 +12,7 @@
 
 static int arm_smmu_gr0_ns(int offset)
 {
-   switch(offset) {
+   switch (offset) {
case ARM_SMMU_GR0_sCR0:
case ARM_SMMU_GR0_sACR:
case ARM_SMMU_GR0_sGFSR:
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv10 6/9] drm/msm/a6xx: Add support for using system cache(LLC)

2020-11-25 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The last level system cache can be partitioned into 32 different
slices, of which the GPU has two slices preallocated. One slice is
used for caching GPU buffers and the other slice is used for
caching the GPU SMMU pagetables. This patch talks to the core system
cache driver to acquire the slice handles, configures the SCIDs
for those slices, and activates and deactivates the slices upon
GPU power collapse and restore.

Some support is also needed from the IOMMU driver to set the right
TCR attributes so that the system cache can be used. The GPU then
has the ability to override a few cacheability parameters, which it
does to change write-allocate to write-no-allocate, as the GPU
hardware does not benefit much from it.

DOMAIN_ATTR_IO_PGTABLE_CFG is another domain level attribute used
by the IOMMU driver for pagetable configuration which will be used
to set a quirk initially to set the right attributes to cache the
hardware pagetables into the system cache.
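
The SCID programming mentioned above amounts to replicating the 5-bit
slice ID into several fields of SYSTEM_CACHE_CNTL_1; a small sketch of
the packing used below (e.g. a slice ID of 2 yields 0x00210842):

#include <linux/types.h>

/* Sketch: replicate a 5-bit GPU slice ID into the five sub-fields of
 * the A6xx SYSTEM_CACHE_CNTL_1 register value. */
static u32 pack_gpu_scid(u32 gpu_scid)
{
	gpu_scid &= 0x1f;

	return (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) |
	       (gpu_scid << 15) | (gpu_scid << 20);
}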

Signed-off-by: Sharat Masetty 
[saiprakash.ranjan: fix to set attr before device attach to iommu and rebase]
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 83 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h   |  4 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +
 3 files changed, 104 insertions(+)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 948f3656c20c..95c98c642876 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -8,7 +8,9 @@
 #include "a6xx_gpu.h"
 #include "a6xx_gmu.xml.h"
 
+#include <linux/bitfield.h>
 #include <linux/devfreq.h>
+#include <linux/soc/qcom/llcc-qcom.h>
 
 #define GPU_PAS_ID 13
 
@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
return IRQ_HANDLED;
 }
 
+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or)
+{
+   return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or);
+}
+
+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value)
+{
+   return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2));
+}
+
+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_deactivate(a6xx_gpu->llc_slice);
+   llcc_slice_deactivate(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
+{
+   u32 cntl1_regval = 0;
+
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   if (!llcc_slice_activate(a6xx_gpu->llc_slice)) {
+   u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice);
+
+   gpu_scid &= 0x1f;
+   cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 
10) |
+  (gpu_scid << 15) | (gpu_scid << 20);
+   }
+
+   if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
+   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   }
+
+   if (cntl1_regval) {
+   /*
+* Program the slice IDs for the various GPU blocks and GPU MMU
+* pagetables
+*/
+   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache lines on
+* a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 
0xF, 0x03);
+   }
+}
+
+static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_putd(a6xx_gpu->llc_slice);
+   llcc_slice_putd(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_slices_init(struct platform_device *pdev,
+   struct a6xx_gpu *a6xx_gpu)
+{
+   a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
+   a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
+
+   if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice))
+   a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL);
+}
+
 static int a6xx_pm_resume(struct msm_gpu *gpu)
 {
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
 
msm_gpu_resume_devfreq(gpu);
 
+   a6xx_llc_activate(a6xx_gpu);
+
return 0;
 }
 
@@ -1048,6 +1125,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
 
trace_msm_gpu_suspend(0);
 
+   a6xx_llc_deactivate(a6xx_gpu);
+
devfreq_suspend_device(gpu->devfreq.devfreq);
 
return a6xx_gmu_stop(a6xx_gpu);
@@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
drm_ge

Re: [PATCHv9 3/8] iommu/arm-smmu: Move non-strict mode to use io_pgtable_domain_attr

2020-11-25 Thread Sai Prakash Ranjan

On 2020-11-25 03:09, Will Deacon wrote:

On Mon, Nov 23, 2020 at 10:35:56PM +0530, Sai Prakash Ranjan wrote:

Now that we have a struct io_pgtable_domain_attr with quirks,
use that for non_strict mode as well thereby removing the need
for more members of arm_smmu_domain in the future.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 8 +++-
 drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 -
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c

index 4b9b10fe50ed..f56f266ebdf7 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -786,9 +786,6 @@ static int arm_smmu_init_domain_context(struct 
iommu_domain *domain,

goto out_clear_smmu;
}

-   if (smmu_domain->non_strict)
-   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
-
if (smmu_domain->pgtbl_cfg.quirks)
pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;

@@ -1527,7 +1524,8 @@ static int arm_smmu_domain_get_attr(struct 
iommu_domain *domain,

case IOMMU_DOMAIN_DMA:
switch (attr) {
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-   *(int *)data = smmu_domain->non_strict;
+   if (smmu_domain->pgtbl_cfg.quirks & 
IO_PGTABLE_QUIRK_NON_STRICT)
+   *(int *)data = smmu_domain->pgtbl_cfg.quirks;


I still don't think this is right :(
We need to set *data to 1 or 0 depending on whether or not the non-strict
quirk is set, i.e:

	bool non_strict = smmu_domain->pgtbl_cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT;
	*(int *)data = non_strict;

Your code above leaves *data uninitialised if non_strict is not set.


Ugh sorry, I should have looked at this some more before hurrying up
to post, will fix it.




return 0;
default:
return -ENODEV;
@@ -1578,7 +1576,7 @@ static int arm_smmu_domain_set_attr(struct 
iommu_domain *domain,

case IOMMU_DOMAIN_DMA:
switch (attr) {
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-   smmu_domain->non_strict = *(int *)data;
+   smmu_domain->pgtbl_cfg.quirks |= 
IO_PGTABLE_QUIRK_NON_STRICT;


And this is broken because if *data is 0, then you _set_ the quirk, which is
the opposite of what we should be doing.

In other words, although the implementation has changed, the semantics have
not.



Will fix this to have quirk set only when *data = 1 and unset in case of 0.


Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv10 3/9] iommu/arm-smmu: Add support for pagetable config domain attribute

2020-11-25 Thread Sai Prakash Ranjan
Add support for domain attribute DOMAIN_ATTR_IO_PGTABLE_CFG
to get/set pagetable configuration data which initially will
be used to set quirks and later can be extended to include
other pagetable configuration data.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 +
 2 files changed, 21 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 0f28a8614da3..4b9b10fe50ed 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
+   if (smmu_domain->pgtbl_cfg.quirks)
+   pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;
+
	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops) {
ret = -ENOMEM;
@@ -1511,6 +1514,12 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
case DOMAIN_ATTR_NESTING:
*(int *)data = (smmu_domain->stage == 
ARM_SMMU_DOMAIN_NESTED);
return 0;
+   case DOMAIN_ATTR_IO_PGTABLE_CFG: {
+   struct io_pgtable_domain_attr *pgtbl_cfg = data;
+   *pgtbl_cfg = smmu_domain->pgtbl_cfg;
+
+   return 0;
+   }
default:
return -ENODEV;
}
@@ -1551,6 +1560,17 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+   case DOMAIN_ATTR_IO_PGTABLE_CFG: {
+   struct io_pgtable_domain_attr *pgtbl_cfg = data;
+
+   if (smmu_domain->smmu) {
+   ret = -EPERM;
+   goto out_unlock;
+   }
+
+   smmu_domain->pgtbl_cfg = *pgtbl_cfg;
+   break;
+   }
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 04288b6fc619..bb5a419f240f 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -364,6 +364,7 @@ enum arm_smmu_domain_stage {
 struct arm_smmu_domain {
struct arm_smmu_device  *smmu;
struct io_pgtable_ops   *pgtbl_ops;
+   struct io_pgtable_domain_attr   pgtbl_cfg;
const struct iommu_flush_ops*flush_ops;
struct arm_smmu_cfg cfg;
enum arm_smmu_domain_stage  stage;
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv10 1/9] iommu/io-pgtable: Add a domain attribute for pagetable configuration

2020-11-25 Thread Sai Prakash Ranjan
Add a new iommu domain attribute DOMAIN_ATTR_IO_PGTABLE_CFG
for pagetable configuration. Initially it will be used to set
quirks such as the one for the system cache, aka last level
cache, so that client drivers like the GPU can set the right
attributes for caching the hardware pagetables into the system
cache. Later it can be extended to include other page table
configuration data.
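
For instance, a client such as the GPU driver fills the struct and hands
it to the IOMMU layer before attaching the device; a rough sketch along
the lines of what the adreno change in this series does (the domain here
is a placeholder):

#include <linux/iommu.h>
#include <linux/io-pgtable.h>

/* Sketch: request the outer-WBWA page-table-walker quirk on a domain.
 * This must be done before the device is attached to the domain. */
static int example_set_ptw_quirk(struct iommu_domain *domain)
{
	struct io_pgtable_domain_attr pgtbl_cfg;

	pgtbl_cfg.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA;

	return iommu_domain_set_attr(domain, DOMAIN_ATTR_IO_PGTABLE_CFG,
				     &pgtbl_cfg);
}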

Signed-off-by: Sai Prakash Ranjan 
---
 include/linux/io-pgtable.h | 4 
 include/linux/iommu.h  | 1 +
 2 files changed, 5 insertions(+)

diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 4cde111e425b..215fd9d69540 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -208,6 +208,10 @@ struct io_pgtable {
 
 #define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops)
 
+struct io_pgtable_domain_attr {
+   unsigned long quirks;
+};
+
 static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
 {
iop->cfg.tlb->tlb_flush_all(iop->cookie);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b95a6f8db6ff..ffaa389ea128 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -118,6 +118,7 @@ enum iommu_attr {
DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_NESTING,/* two stages of translation */
DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
+   DOMAIN_ATTR_IO_PGTABLE_CFG,
DOMAIN_ATTR_MAX,
 };
 
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv10 2/9] iommu/io-pgtable-arm: Add support to use system cache

2020-11-25 Thread Sai Prakash Ranjan
Add a quirk IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to override
the outer-cacheability attributes set in the TCR for a
non-coherent page table walker when using system cache.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/io-pgtable-arm.c | 10 --
 include/linux/io-pgtable.h |  4 
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index a7a9bc08dcd1..7c9ea9d7874a 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -761,7 +761,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
 
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NON_STRICT |
-   IO_PGTABLE_QUIRK_ARM_TTBR1))
+   IO_PGTABLE_QUIRK_ARM_TTBR1 |
+   IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
return NULL;
 
data = arm_lpae_alloc_pgtable(cfg);
@@ -773,10 +774,15 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
tcr->sh = ARM_LPAE_TCR_SH_IS;
tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
+   if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)
+   goto out_free_data;
} else {
tcr->sh = ARM_LPAE_TCR_SH_OS;
tcr->irgn = ARM_LPAE_TCR_RGN_NC;
-   tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+   if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
+   tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+   else
+   tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
}
 
tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 215fd9d69540..fb4d5a763e0c 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -86,6 +86,9 @@ struct io_pgtable_cfg {
 *
 * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
 *  for use in the upper half of a split address space.
+*
+* IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability
+*  attributes set in the TCR for a non-coherent page-table walker.
 */
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS   BIT(1)
@@ -93,6 +96,7 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_MTK_EXTBIT(3)
#define IO_PGTABLE_QUIRK_NON_STRICT BIT(4)
#define IO_PGTABLE_QUIRK_ARM_TTBR1  BIT(5)
+   #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6)
unsigned long   quirks;
unsigned long   pgsize_bitmap;
unsigned intias;
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv10 0/9] System Cache support for GPU and required SMMU support

2020-11-25 Thread Sai Prakash Ranjan
Some hardware variants contain a system cache, or last level
cache (LLC). This cache is typically a large block which is shared
by multiple clients on the SOC. The GPU uses the system cache to cache
both the GPU data buffers (like textures) as well as the SMMU pagetables.
This helps with improved render performance as well as lower power
consumption by reducing the bus traffic to system memory.

The system cache architecture allows the cache to be split into slices
which can then be used by multiple SOC clients. This patch series is an
effort to enable and use two of those slices preallocated for the GPU,
one for the GPU data buffers and another for the GPU SMMU hardware
pagetables.
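
At a high level the GPU side of the series revolves around the qcom LLCC
slice API; a stripped-down sketch of that flow (error handling and the
actual register programming omitted, names here are illustrative):

#include <linux/printk.h>
#include <linux/soc/qcom/llcc-qcom.h>

/* Sketch: get the two preallocated GPU slices, activate them around GPU
 * power-up and read back the slice IDs (SCIDs) that get programmed into
 * the GPU's system-cache control registers. */
static void llcc_flow_sketch(void)
{
	struct llcc_slice_desc *buf_slice, *htw_slice;

	buf_slice = llcc_slice_getd(LLCC_GPU);	  /* GPU data buffers */
	htw_slice = llcc_slice_getd(LLCC_GPUHTW); /* SMMU pagetable walks */

	if (!llcc_slice_activate(buf_slice))
		pr_debug("GPU SCID: %d\n", llcc_get_slice_id(buf_slice));
	if (!llcc_slice_activate(htw_slice))
		pr_debug("GPU HTW SCID: %d\n", llcc_get_slice_id(htw_slice));

	/* ... on GPU power collapse ... */
	llcc_slice_deactivate(buf_slice);
	llcc_slice_deactivate(htw_slice);

	/* ... on teardown ... */
	llcc_slice_putd(buf_slice);
	llcc_slice_putd(htw_slice);
}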

Patch 1 - Patch 7 adds system cache support in SMMU and GPU driver.
Patch 8 and 9 are minor cleanups for arm-smmu impl.

Changes in v10:
 * Fix non-strict mode domain attr handling (Will)
 * Split the domain attribute patch into two (Will)

Changes in v9:
 * Change name from domain_attr_io_pgtbl_cfg to io_pgtable_domain_attr (Will)
 * Modify comment for the quirk as suggested (Will)
 * Compare with IO_PGTABLE_QUIRK_NON_STRICT for non-strict mode (Will)

Changes in v8:
 * Introduce a generic domain attribute for pagetable config (Will)
 * Rename quirk to more generic IO_PGTABLE_QUIRK_ARM_OUTER_WBWA (Will)
 * Move non-strict mode to use new struct domain_attr_io_pgtbl_config (Will)

Changes in v7:
 * Squash Jordan's patch to support MMU500 targets
 * Rebase on top of for-joerg/arm-smmu/updates and Jordan's short series for 
adreno-smmu impl

Changes in v6:
 * Move table to arm-smmu-qcom (Robin)

Changes in v5:
 * Drop cleanup of blank lines since it was intentional (Robin)
 * Rebase again on top of msm-next-pgtables as it moves pretty fast

Changes in v4:
 * Drop IOMMU_SYS_CACHE prot flag
 * Rebase on top of 
https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables

Changes in v3:
 * Fix domain attribute setting to before iommu_attach_device()
 * Fix few code style and checkpatch warnings
 * Rebase on top of Jordan's latest split pagetables and per-instance
   pagetables support

Changes in v2:
 * Addressed review comments and rebased on top of Jordan's split
   pagetables series

Jordan Crouse (1):
  drm/msm/a6xx: Add support for using system cache on MMU500 based
targets

Sai Prakash Ranjan (6):
  iommu/io-pgtable: Add a domain attribute for pagetable configuration
  iommu/io-pgtable-arm: Add support to use system cache
  iommu/arm-smmu: Add support for pagetable config domain attribute
  iommu/arm-smmu: Move non-strict mode to use io_pgtable_domain_attr
  iommu: arm-smmu-impl: Use table to list QCOM implementations
  iommu: arm-smmu-impl: Add a space before open parenthesis

Sharat Masetty (2):
  drm/msm: rearrange the gpu_rmw() function
  drm/msm/a6xx: Add support for using system cache(LLC)

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c  | 109 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h  |   5 +
 drivers/gpu/drm/msm/adreno/adreno_gpu.c|  17 
 drivers/gpu/drm/msm/msm_drv.c  |   8 ++
 drivers/gpu/drm/msm/msm_drv.h  |   1 +
 drivers/gpu/drm/msm/msm_gpu.h  |   5 +-
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |  11 +--
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c |  21 +++-
 drivers/iommu/arm/arm-smmu/arm-smmu.c  |  33 ++-
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |   3 +-
 drivers/iommu/io-pgtable-arm.c |  10 +-
 include/linux/io-pgtable.h |   8 ++
 include/linux/iommu.h  |   1 +
 13 files changed, 205 insertions(+), 27 deletions(-)


base-commit: a29bbb0861f487a5e144dc997a9f71a36c7a2404
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv10 5/9] drm/msm: rearrange the gpu_rmw() function

2020-11-25 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The register read-modify-write construct is generic enough
that it can be used by other subsystems as needed, so create
a more generic rmw() function and have gpu_rmw() use
this new function.
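
For example (illustrative register offset only), any code with a mapped
register block can now do a masked update in one line:

#include "msm_drv.h"

/* Sketch: clear bits [3:0] of a register and set them to 0x3 with the
 * new helper, much like the LLC code does for cacheability overrides. */
static void example_rmw(void __iomem *base, u32 reg)
{
	msm_rmw(base + (reg << 2), 0xF, 0x3);
}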

Signed-off-by: Sharat Masetty 
Reviewed-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/msm_drv.c | 8 
 drivers/gpu/drm/msm/msm_drv.h | 1 +
 drivers/gpu/drm/msm/msm_gpu.h | 5 +
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 49685571dc0e..a1e22b974b77 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr)
return val;
 }
 
+void msm_rmw(void __iomem *addr, u32 mask, u32 or)
+{
+   u32 val = msm_readl(addr);
+
+   val &= ~mask;
+   msm_writel(val | or, addr);
+}
+
 struct msm_vblank_work {
struct work_struct work;
int crtc_id;
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index b9dd8f8f4887..655b3b0424a1 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -478,6 +478,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device 
*pdev, const char *name,
const char *dbgname);
 void msm_writel(u32 data, void __iomem *addr);
 u32 msm_readl(const void __iomem *addr);
+void msm_rmw(void __iomem *addr, u32 mask, u32 or);
 
 struct msm_gpu_submitqueue;
 int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 6c9e1fdc1a76..b2b419277953 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -246,10 +246,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
 
 static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or)
 {
-   uint32_t val = gpu_read(gpu, reg);
-
-   val &= ~mask;
-   gpu_write(gpu, reg, val | or);
+   msm_rmw(gpu->mmio + (reg << 2), mask, or);
 }
 
 static inline u64 gpu_read64(struct msm_gpu *gpu, u32 lo, u32 hi)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCHv8 1/8] iommu/io-pgtable-arm: Add support to use system cache

2020-11-24 Thread Sai Prakash Ranjan

On 2020-11-23 20:36, Will Deacon wrote:

On Tue, Nov 17, 2020 at 08:00:40PM +0530, Sai Prakash Ranjan wrote:

Add a quirk IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to override
the attributes set in TCR for the page table walker when
using system cache.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/io-pgtable-arm.c | 10 --
 include/linux/io-pgtable.h |  4 
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm.c 
b/drivers/iommu/io-pgtable-arm.c

index a7a9bc08dcd1..7c9ea9d7874a 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -761,7 +761,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg 
*cfg, void *cookie)


if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NON_STRICT |
-   IO_PGTABLE_QUIRK_ARM_TTBR1))
+   IO_PGTABLE_QUIRK_ARM_TTBR1 |
+   IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
return NULL;

data = arm_lpae_alloc_pgtable(cfg);
@@ -773,10 +774,15 @@ arm_64_lpae_alloc_pgtable_s1(struct 
io_pgtable_cfg *cfg, void *cookie)

tcr->sh = ARM_LPAE_TCR_SH_IS;
tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
+   if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)
+   goto out_free_data;
} else {
tcr->sh = ARM_LPAE_TCR_SH_OS;
tcr->irgn = ARM_LPAE_TCR_RGN_NC;
-   tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+   if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
+   tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+   else
+   tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
}

tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 4cde111e425b..a9a2c59fab37 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -86,6 +86,9 @@ struct io_pgtable_cfg {
 *
 * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
 *  for use in the upper half of a split address space.
+*
+	 * IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the attributes set in TCR for
+	 *  the page table walker when using system cache.

Please can you reword this to say:

  "Override the outer-cacheability attributes set in the TCR for a 
non-coherent

   page-table walker."



Sure, thanks.

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a 
member

of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv9 4/8] drm/msm: rearrange the gpu_rmw() function

2020-11-24 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The register read-modify-write construct is generic enough
that it can be used by other subsystems as needed, so create
a more generic rmw() function and have gpu_rmw() use this
new function.

Signed-off-by: Sharat Masetty 
Reviewed-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/msm_drv.c | 8 
 drivers/gpu/drm/msm/msm_drv.h | 1 +
 drivers/gpu/drm/msm/msm_gpu.h | 5 +
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 49685571dc0e..a1e22b974b77 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr)
return val;
 }
 
+void msm_rmw(void __iomem *addr, u32 mask, u32 or)
+{
+   u32 val = msm_readl(addr);
+
+   val &= ~mask;
+   msm_writel(val | or, addr);
+}
+
 struct msm_vblank_work {
struct work_struct work;
int crtc_id;
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index b9dd8f8f4887..655b3b0424a1 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -478,6 +478,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device 
*pdev, const char *name,
const char *dbgname);
 void msm_writel(u32 data, void __iomem *addr);
 u32 msm_readl(const void __iomem *addr);
+void msm_rmw(void __iomem *addr, u32 mask, u32 or);
 
 struct msm_gpu_submitqueue;
 int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 6c9e1fdc1a76..b2b419277953 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -246,10 +246,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
 
 static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or)
 {
-   uint32_t val = gpu_read(gpu, reg);
-
-   val &= ~mask;
-   gpu_write(gpu, reg, val | or);
+   msm_rmw(gpu->mmio + (reg << 2), mask, or);
 }
 
 static inline u64 gpu_read64(struct msm_gpu *gpu, u32 lo, u32 hi)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv9 3/8] iommu/arm-smmu: Move non-strict mode to use io_pgtable_domain_attr

2020-11-24 Thread Sai Prakash Ranjan
Now that we have a struct io_pgtable_domain_attr with quirks,
use that for non_strict mode as well, thereby removing the need
for more members of arm_smmu_domain in the future.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 8 +++-
 drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 -
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 4b9b10fe50ed..f56f266ebdf7 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -786,9 +786,6 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
goto out_clear_smmu;
}
 
-   if (smmu_domain->non_strict)
-   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
-
if (smmu_domain->pgtbl_cfg.quirks)
pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;
 
@@ -1527,7 +1524,8 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
case IOMMU_DOMAIN_DMA:
switch (attr) {
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-   *(int *)data = smmu_domain->non_strict;
+   if (smmu_domain->pgtbl_cfg.quirks & IO_PGTABLE_QUIRK_NON_STRICT)
+   *(int *)data = smmu_domain->pgtbl_cfg.quirks;
return 0;
default:
return -ENODEV;
@@ -1578,7 +1576,7 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
case IOMMU_DOMAIN_DMA:
switch (attr) {
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-   smmu_domain->non_strict = *(int *)data;
+   smmu_domain->pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
break;
default:
ret = -ENODEV;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index bb5a419f240f..cb7ca3a444c9 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -368,7 +368,6 @@ struct arm_smmu_domain {
const struct iommu_flush_ops*flush_ops;
struct arm_smmu_cfg cfg;
enum arm_smmu_domain_stage  stage;
-   boolnon_strict;
struct mutexinit_mutex; /* Protects smmu pointer */
spinlock_t  cb_lock; /* Serialises ATS1* ops and 
TLB syncs */
struct iommu_domain domain;
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv9 6/8] drm/msm/a6xx: Add support for using system cache on MMU500 based targets

2020-11-24 Thread Sai Prakash Ranjan
From: Jordan Crouse 

GPU targets with an MMU-500 attached have a slightly different process for
enabling system cache. Use the compatible string on the IOMMU phandle
to see if an MMU-500 is attached and modify the programming sequence
accordingly.

Signed-off-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 +--
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h |  1 +
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 95c98c642876..3f8b92da8cba 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1042,6 +1042,8 @@ static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
 
 static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
 {
+   struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
+   struct msm_gpu *gpu = &adreno_gpu->base;
u32 cntl1_regval = 0;
 
if (IS_ERR(a6xx_gpu->llc_mmio))
@@ -1055,11 +1057,17 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
   (gpu_scid << 15) | (gpu_scid << 20);
}
 
+   /*
+* For targets with a MMU500, activate the slice but don't program the
+* register.  The XBL will take care of that.
+*/
if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
-   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+   if (!a6xx_gpu->have_mmu500) {
+   u32 gpuhtw_scid = 
llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
 
-   gpuhtw_scid &= 0x1f;
-   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), 
gpuhtw_scid);
+   }
}
 
if (cntl1_regval) {
@@ -1067,13 +1075,20 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
 * Program the slice IDs for the various GPU blocks and GPU MMU
 * pagetables
 */
-   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
-
-   /*
-* Program cacheability overrides to not allocate cache lines on
-* a write miss
-*/
-   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 
0xF, 0x03);
+   if (a6xx_gpu->have_mmu500)
+   gpu_rmw(gpu, REG_A6XX_GBIF_SCACHE_CNTL1, GENMASK(24, 0),
+   cntl1_regval);
+   else {
+   a6xx_llc_write(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache
+* lines on a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 
0x03);
+   }
}
 }
 
@@ -1086,10 +1101,21 @@ static void a6xx_llc_slices_destroy(struct a6xx_gpu 
*a6xx_gpu)
 static void a6xx_llc_slices_init(struct platform_device *pdev,
struct a6xx_gpu *a6xx_gpu)
 {
+   struct device_node *phandle;
+
a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
if (IS_ERR(a6xx_gpu->llc_mmio))
return;
 
+   /*
+* There is a different programming path for targets with an mmu500
+* attached, so detect if that is the case
+*/
+   phandle = of_parse_phandle(pdev->dev.of_node, "iommus", 0);
+   a6xx_gpu->have_mmu500 = (phandle &&
+   of_device_is_compatible(phandle, "arm,mmu-500"));
+   of_node_put(phandle);
+
a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
 
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
index 9e6079af679c..e793d329e77b 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
@@ -32,6 +32,7 @@ struct a6xx_gpu {
void __iomem *llc_mmio;
void *llc_slice;
void *htw_llc_slice;
+   bool have_mmu500;
 };
 
 #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv9 5/8] drm/msm/a6xx: Add support for using system cache(LLC)

2020-11-24 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The last level system cache can be partitioned into 32 different
slices, of which the GPU has two slices preallocated. One slice is
used for caching GPU buffers and the other slice is used for
caching the GPU SMMU pagetables. This patch talks to the core system
cache driver to acquire the slice handles, configures the SCIDs for
those slices, and activates and deactivates the slices upon GPU
power collapse and restore.

Some support from the IOMMU driver is also needed to set the right
TCR attributes in order to make use of the system cache. The GPU then
has the ability to override a few cacheability parameters, which it
does to change write-allocate to write-no-allocate, as the GPU
hardware does not benefit much from it.

DOMAIN_ATTR_IO_PGTABLE_CFG is another domain level attribute used by
the IOMMU driver for pagetable configuration; it will initially be
used to set a quirk so that the right attributes are used to cache
the hardware pagetables into the system cache.
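
As a rough, hedged sketch of that GPU-side usage (illustrative only, not the
exact posted adreno_gpu.c hunk): the quirk is staged through the new domain
attribute before the device is attached, since the SMMU driver rejects the
attribute once the domain is live:

#include <linux/iommu.h>
#include <linux/io-pgtable.h>

/* Sketch: request outer-WBWA pagetable walks via the new domain attribute.
 * Must run before iommu_attach_device(); the SMMU returns -EPERM afterwards.
 */
static int hyp_set_pgtable_llc(struct iommu_domain *domain, struct device *dev)
{
	struct io_pgtable_domain_attr pgtbl_cfg = {
		.quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA,
	};
	int ret;

	ret = iommu_domain_set_attr(domain, DOMAIN_ATTR_IO_PGTABLE_CFG,
				    &pgtbl_cfg);
	if (ret)
		return ret;

	return iommu_attach_device(domain, dev);
}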

Signed-off-by: Sharat Masetty 
[saiprakash.ranjan: fix to set attr before device attach to iommu and rebase]
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 83 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h   |  4 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +
 3 files changed, 104 insertions(+)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 948f3656c20c..95c98c642876 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -8,7 +8,9 @@
 #include "a6xx_gpu.h"
 #include "a6xx_gmu.xml.h"
 
+#include 
 #include 
+#include 
 
 #define GPU_PAS_ID 13
 
@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
return IRQ_HANDLED;
 }
 
+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or)
+{
+   return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or);
+}
+
+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value)
+{
+   return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2));
+}
+
+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_deactivate(a6xx_gpu->llc_slice);
+   llcc_slice_deactivate(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
+{
+   u32 cntl1_regval = 0;
+
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   if (!llcc_slice_activate(a6xx_gpu->llc_slice)) {
+   u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice);
+
+   gpu_scid &= 0x1f;
+   cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 
10) |
+  (gpu_scid << 15) | (gpu_scid << 20);
+   }
+
+   if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
+   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   }
+
+   if (cntl1_regval) {
+   /*
+* Program the slice IDs for the various GPU blocks and GPU MMU
+* pagetables
+*/
+   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache lines on
+* a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 
0xF, 0x03);
+   }
+}
+
+static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_putd(a6xx_gpu->llc_slice);
+   llcc_slice_putd(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_slices_init(struct platform_device *pdev,
+   struct a6xx_gpu *a6xx_gpu)
+{
+   a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
+   a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
+
+   if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice))
+   a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL);
+}
+
 static int a6xx_pm_resume(struct msm_gpu *gpu)
 {
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
 
msm_gpu_resume_devfreq(gpu);
 
+   a6xx_llc_activate(a6xx_gpu);
+
return 0;
 }
 
@@ -1048,6 +1125,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
 
trace_msm_gpu_suspend(0);
 
+   a6xx_llc_deactivate(a6xx_gpu);
+
devfreq_suspend_device(gpu->devfreq.devfreq);
 
return a6xx_gmu_stop(a6xx_gpu);
@@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
drm_ge

[PATCHv9 7/8] iommu: arm-smmu-impl: Use table to list QCOM implementations

2020-11-24 Thread Sai Prakash Ranjan
Use a table and of_match_node() to match the qcom implementation
instead of multiple of_device_is_compatible() calls for each
QCOM SMMU implementation.
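
As a generic, hedged illustration of the pattern (the compatible strings and
function name below are made up, not from this patch), of_match_node() checks
a sentinel-terminated of_device_id table in a single call:

#include <linux/of.h>

static const struct of_device_id hyp_impl_of_match[] = {
	{ .compatible = "vendor,hyp-smmu-a" },
	{ .compatible = "vendor,hyp-smmu-b" },
	{ } /* sentinel */
};

static bool hyp_needs_special_impl(struct device_node *np)
{
	/* One table lookup replaces a chain of of_device_is_compatible() calls */
	return of_match_node(hyp_impl_of_match, np) != NULL;
}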

Signed-off-by: Sai Prakash Ranjan 
Acked-by: Will Deacon 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |  9 +
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 -
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |  1 -
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 7fed89c9d18a..26e2734eb4d7 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -214,14 +214,7 @@ struct arm_smmu_device *arm_smmu_impl_init(struct 
arm_smmu_device *smmu)
if (of_device_is_compatible(np, "nvidia,tegra194-smmu"))
return nvidia_smmu_impl_init(smmu);
 
-   if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sc7180-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8150-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8250-smmu-500"))
-   return qcom_smmu_impl_init(smmu);
-
-   if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu"))
-   return qcom_adreno_smmu_impl_init(smmu);
+   smmu = qcom_smmu_impl_init(smmu);
 
if (of_device_is_compatible(np, "marvell,ap806-smmu-500"))
		smmu->impl = &arm_mmu500_impl;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index d0636c803a36..add1859b2899 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -318,12 +318,23 @@ static struct arm_smmu_device *qcom_smmu_create(struct 
arm_smmu_device *smmu,
	return &qsmmu->smmu;
 }
 
+static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = {
+   { .compatible = "qcom,sc7180-smmu-500" },
+   { .compatible = "qcom,sdm845-smmu-500" },
+   { .compatible = "qcom,sm8150-smmu-500" },
+   { .compatible = "qcom,sm8250-smmu-500" },
+   { }
+};
+
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
 {
-   return qcom_smmu_create(smmu, &qcom_smmu_impl);
-}
+   const struct device_node *np = smmu->dev->of_node;
 
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu)
-{
-   return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl);
+   if (of_match_node(qcom_smmu_impl_of_match, np))
+   return qcom_smmu_create(smmu, &qcom_smmu_impl);
+
+   if (of_device_is_compatible(np, "qcom,adreno-smmu"))
+   return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl);
+
+   return smmu;
 }
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index cb7ca3a444c9..d2a2d1bc58ba 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -523,7 +523,6 @@ static inline void arm_smmu_writeq(struct arm_smmu_device 
*smmu, int page,
 struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu);
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device 
*smmu);
 
 void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx);
 int arm_mmu500_reset(struct arm_smmu_device *smmu);
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCHv8 3/8] iommu/arm-smmu: Move non-strict mode to use domain_attr_io_pgtbl_cfg

2020-11-24 Thread Sai Prakash Ranjan

On 2020-11-23 20:49, Will Deacon wrote:

On Tue, Nov 17, 2020 at 08:00:42PM +0530, Sai Prakash Ranjan wrote:

Now that we have a struct domain_attr_io_pgtbl_cfg with quirks,
use that for non_strict mode as well thereby removing the need
for more members of arm_smmu_domain in the future.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 7 ++-
 drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 -
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c

index 7b05782738e2..5f066a1b7221 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -786,9 +786,6 @@ static int arm_smmu_init_domain_context(struct 
iommu_domain *domain,

goto out_clear_smmu;
}

-   if (smmu_domain->non_strict)
-   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
-
if (smmu_domain->pgtbl_cfg.quirks)
pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;

@@ -1527,7 +1524,7 @@ static int arm_smmu_domain_get_attr(struct 
iommu_domain *domain,

case IOMMU_DOMAIN_DMA:
switch (attr) {
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-   *(int *)data = smmu_domain->non_strict;
+   *(int *)data = smmu_domain->pgtbl_cfg.quirks;


Probably better to compare with IO_PGTABLE_QUIRK_NON_STRICT here even though
we only support this one quirk for DMA domains atm.



Ok will do, thanks.

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a 
member

of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCHv8 0/8] System Cache support for GPU and required SMMU support

2020-11-24 Thread Sai Prakash Ranjan

On 2020-11-23 20:51, Will Deacon wrote:

On Tue, Nov 17, 2020 at 08:00:39PM +0530, Sai Prakash Ranjan wrote:

Some hardware variants contain a system cache or the last level
cache(llc). This cache is typically a large block which is shared
by multiple clients on the SOC. GPU uses the system cache to cache
both the GPU data buffers(like textures) as well the SMMU pagetables.
This helps with improved render performance as well as lower power
consumption by reducing the bus traffic to the system memory.

The system cache architecture allows the cache to be split into slices
which then be used by multiple SOC clients. This patch series is an
effort to enable and use two of those slices preallocated for the GPU,
one for the GPU data buffers and another for the GPU SMMU hardware
pagetables.

Patch 1 - Patch 6 adds system cache support in SMMU and GPU driver.
Patch 7 and 8 are minor cleanups for arm-smmu impl.

Changes in v8:
 * Introduce a generic domain attribute for pagetable config (Will)
 * Rename quirk to more generic IO_PGTABLE_QUIRK_ARM_OUTER_WBWA (Will)
 * Move non-strict mode to use new struct domain_attr_io_pgtbl_config 
(Will)


Modulo some minor comments I've made, this looks good to me. What is the
plan for merging it? I can take the IOMMU parts, but patches 4-6 touch the
MSM GPU driver and I'd like to avoid conflicts with that.



SMMU bits are pretty much independent and GPU relies on the domain attribute
and the quirk exposed, so as long as SMMU changes go in first it should
be good.

Rob?

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a 
member

of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCHv8 0/8] System Cache support for GPU and required SMMU support

2020-11-24 Thread Sai Prakash Ranjan

On 2020-11-24 00:52, Rob Clark wrote:

On Mon, Nov 23, 2020 at 9:01 AM Sai Prakash Ranjan
 wrote:


On 2020-11-23 20:51, Will Deacon wrote:
> On Tue, Nov 17, 2020 at 08:00:39PM +0530, Sai Prakash Ranjan wrote:
>> Some hardware variants contain a system cache or the last level
>> cache(llc). This cache is typically a large block which is shared
>> by multiple clients on the SOC. GPU uses the system cache to cache
>> both the GPU data buffers(like textures) as well the SMMU pagetables.
>> This helps with improved render performance as well as lower power
>> consumption by reducing the bus traffic to the system memory.
>>
>> The system cache architecture allows the cache to be split into slices
>> which then be used by multiple SOC clients. This patch series is an
>> effort to enable and use two of those slices preallocated for the GPU,
>> one for the GPU data buffers and another for the GPU SMMU hardware
>> pagetables.
>>
>> Patch 1 - Patch 6 adds system cache support in SMMU and GPU driver.
>> Patch 7 and 8 are minor cleanups for arm-smmu impl.
>>
>> Changes in v8:
>>  * Introduce a generic domain attribute for pagetable config (Will)
>>  * Rename quirk to more generic IO_PGTABLE_QUIRK_ARM_OUTER_WBWA (Will)
>>  * Move non-strict mode to use new struct domain_attr_io_pgtbl_config
>> (Will)
>
> Modulo some minor comments I've made, this looks good to me. What is
> the
> plan for merging it? I can take the IOMMU parts, but patches 4-6 touch
> the
> MSM GPU driver and I'd like to avoid conflicts with that.
>

SMMU bits are pretty much independent and GPU relies on the domain
attribute
and the quirk exposed, so as long as SMMU changes go in first it 
should

be good.
Rob?


I suppose one option would be to split out the patch that adds the
attribute into it's own patch, and merge that both thru drm and iommu?



Ok I can split out domain attr and quirk into its own patch if Will is
fine with that approach.


If Will/Robin dislike that approach, I'll pick up the parts of the drm
patches which don't depend on the new attribute for v5.11 and the rest
for v5.12.. or possibly a second late v5.11 pull req if airlied
doesn't hate me too much for it.

Going forward, I think we will have one or two more co-dependent
series, like the smmu iova fault handler improvements that Jordan
posted.  So I would like to hear how Will and Robin prefer to handle
those.

BR,
-R



Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a 
member

of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv9 0/8] System Cache support for GPU and required SMMU support

2020-11-24 Thread Sai Prakash Ranjan
Some hardware variants contain a system cache, or last level
cache (LLC). This cache is typically a large block which is shared
by multiple clients on the SOC. The GPU uses the system cache to cache
both the GPU data buffers (like textures) as well as the SMMU pagetables.
This helps with improved render performance as well as lower power
consumption by reducing the bus traffic to the system memory.

The system cache architecture allows the cache to be split into slices
which can then be used by multiple SOC clients. This patch series is an
effort to enable and use two of those slices preallocated for the GPU,
one for the GPU data buffers and another for the GPU SMMU hardware
pagetables.

Patch 1 - Patch 6 adds system cache support in SMMU and GPU driver.
Patch 7 and 8 are minor cleanups for arm-smmu impl.

Changes in v9:
 * Change name from domain_attr_io_pgtbl_cfg to io_pgtable_domain_attr (Will)
 * Modify comment for the quirk as suggested (Will)
 * Compare with IO_PGTABLE_QUIRK_NON_STRICT for non-strict mode (Will)

Changes in v8:
 * Introduce a generic domain attribute for pagetable config (Will)
 * Rename quirk to more generic IO_PGTABLE_QUIRK_ARM_OUTER_WBWA (Will)
 * Move non-strict mode to use new struct domain_attr_io_pgtbl_config (Will)

Changes in v7:
 * Squash Jordan's patch to support MMU500 targets
 * Rebase on top of for-joerg/arm-smmu/updates and Jordan's short series for 
adreno-smmu impl

Changes in v6:
 * Move table to arm-smmu-qcom (Robin)

Changes in v5:
 * Drop cleanup of blank lines since it was intentional (Robin)
 * Rebase again on top of msm-next-pgtables as it moves pretty fast

Changes in v4:
 * Drop IOMMU_SYS_CACHE prot flag
 * Rebase on top of 
https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables

Changes in v3:
 * Fix domain attribute setting to before iommu_attach_device()
 * Fix few code style and checkpatch warnings
 * Rebase on top of Jordan's latest split pagetables and per-instance
   pagetables support

Changes in v2:
 * Addressed review comments and rebased on top of Jordan's split
   pagetables series

Jordan Crouse (1):
  drm/msm/a6xx: Add support for using system cache on MMU500 based
targets

Sai Prakash Ranjan (5):
  iommu/io-pgtable-arm: Add support to use system cache
  iommu/arm-smmu: Add domain attribute for pagetable configuration
  iommu/arm-smmu: Move non-strict mode to use io_pgtable_domain_attr
  iommu: arm-smmu-impl: Use table to list QCOM implementations
  iommu: arm-smmu-impl: Add a space before open parenthesis

Sharat Masetty (2):
  drm/msm: rearrange the gpu_rmw() function
  drm/msm/a6xx: Add support for using system cache(LLC)

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c  | 109 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h  |   5 +
 drivers/gpu/drm/msm/adreno/adreno_gpu.c|  17 
 drivers/gpu/drm/msm/msm_drv.c  |   8 ++
 drivers/gpu/drm/msm/msm_drv.h  |   1 +
 drivers/gpu/drm/msm/msm_gpu.h  |   5 +-
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |  11 +--
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c |  21 +++-
 drivers/iommu/arm/arm-smmu/arm-smmu.c  |  26 -
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |   3 +-
 drivers/iommu/io-pgtable-arm.c |  10 +-
 include/linux/io-pgtable.h |   8 ++
 include/linux/iommu.h  |   1 +
 13 files changed, 199 insertions(+), 26 deletions(-)


base-commit: a29bbb0861f487a5e144dc997a9f71a36c7a2404
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv9 8/8] iommu: arm-smmu-impl: Add a space before open parenthesis

2020-11-24 Thread Sai Prakash Ranjan
Fix the checkpatch warning for space required before the open
parenthesis.

Signed-off-by: Sai Prakash Ranjan 
Acked-by: Will Deacon 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 26e2734eb4d7..136872e77195 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -12,7 +12,7 @@
 
 static int arm_smmu_gr0_ns(int offset)
 {
-   switch(offset) {
+   switch (offset) {
case ARM_SMMU_GR0_sCR0:
case ARM_SMMU_GR0_sACR:
case ARM_SMMU_GR0_sGFSR:
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv9 2/8] iommu/arm-smmu: Add domain attribute for pagetable configuration

2020-11-24 Thread Sai Prakash Ranjan
Add an iommu domain attribute for pagetable configuration. Initially
it will be used to set quirks, such as the one for the system cache
(aka last level cache), so that client drivers like the GPU can set
the right attributes for caching the hardware pagetables into the
system cache. Later it can be extended to include other page table
configuration data.
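
As a hedged illustration from the caller's side (the helper name is
hypothetical), the staged quirks can be read back through the matching get
path added below:

#include <linux/iommu.h>
#include <linux/io-pgtable.h>

/* Sketch: query the pagetable quirks currently staged on a domain */
static unsigned long hyp_get_pgtbl_quirks(struct iommu_domain *domain)
{
	struct io_pgtable_domain_attr pgtbl_cfg = { };

	if (iommu_domain_get_attr(domain, DOMAIN_ATTR_IO_PGTABLE_CFG,
				  &pgtbl_cfg))
		return 0;

	return pgtbl_cfg.quirks;
}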

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 20 
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 +
 include/linux/io-pgtable.h|  4 
 include/linux/iommu.h |  1 +
 4 files changed, 26 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 0f28a8614da3..4b9b10fe50ed 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
+   if (smmu_domain->pgtbl_cfg.quirks)
+   pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;
+
	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops) {
ret = -ENOMEM;
@@ -1511,6 +1514,12 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
case DOMAIN_ATTR_NESTING:
*(int *)data = (smmu_domain->stage == 
ARM_SMMU_DOMAIN_NESTED);
return 0;
+   case DOMAIN_ATTR_IO_PGTABLE_CFG: {
+   struct io_pgtable_domain_attr *pgtbl_cfg = data;
+   *pgtbl_cfg = smmu_domain->pgtbl_cfg;
+
+   return 0;
+   }
default:
return -ENODEV;
}
@@ -1551,6 +1560,17 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+   case DOMAIN_ATTR_IO_PGTABLE_CFG: {
+   struct io_pgtable_domain_attr *pgtbl_cfg = data;
+
+   if (smmu_domain->smmu) {
+   ret = -EPERM;
+   goto out_unlock;
+   }
+
+   smmu_domain->pgtbl_cfg = *pgtbl_cfg;
+   break;
+   }
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 04288b6fc619..bb5a419f240f 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -364,6 +364,7 @@ enum arm_smmu_domain_stage {
 struct arm_smmu_domain {
struct arm_smmu_device  *smmu;
struct io_pgtable_ops   *pgtbl_ops;
+   struct io_pgtable_domain_attr   pgtbl_cfg;
const struct iommu_flush_ops*flush_ops;
struct arm_smmu_cfg cfg;
enum arm_smmu_domain_stage  stage;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 6b8bb4f4afef..fb4d5a763e0c 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -212,6 +212,10 @@ struct io_pgtable {
 
 #define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops)
 
+struct io_pgtable_domain_attr {
+   unsigned long quirks;
+};
+
 static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
 {
iop->cfg.tlb->tlb_flush_all(iop->cookie);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b95a6f8db6ff..ffaa389ea128 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -118,6 +118,7 @@ enum iommu_attr {
DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_NESTING,/* two stages of translation */
DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
+   DOMAIN_ATTR_IO_PGTABLE_CFG,
DOMAIN_ATTR_MAX,
 };
 
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


Re: [PATCHv8 2/8] iommu/arm-smmu: Add domain attribute for pagetable configuration

2020-11-24 Thread Sai Prakash Ranjan

On 2020-11-23 20:48, Will Deacon wrote:

On Tue, Nov 17, 2020 at 08:00:41PM +0530, Sai Prakash Ranjan wrote:

Add iommu domain attribute for pagetable configuration which
initially will be used to set quirks like for system cache aka
last level cache to be used by client drivers like GPU to set
right attributes for caching the hardware pagetables into the
system cache and later can be extended to include other page
table configuration data.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 25 +
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 +
 include/linux/io-pgtable.h|  4 
 include/linux/iommu.h |  1 +
 4 files changed, 31 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c

index 0f28a8614da3..7b05782738e2 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct 
iommu_domain *domain,

if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;

+   if (smmu_domain->pgtbl_cfg.quirks)
+   pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;
+
pgtbl_ops = alloc_io_pgtable_ops(fmt, _cfg, smmu_domain);
if (!pgtbl_ops) {
ret = -ENOMEM;
@@ -1511,6 +1514,12 @@ static int arm_smmu_domain_get_attr(struct 
iommu_domain *domain,

case DOMAIN_ATTR_NESTING:
*(int *)data = (smmu_domain->stage == 
ARM_SMMU_DOMAIN_NESTED);
return 0;
+   case DOMAIN_ATTR_IO_PGTABLE_CFG: {
+   struct domain_attr_io_pgtbl_cfg *pgtbl_cfg = data;
+   *pgtbl_cfg = smmu_domain->pgtbl_cfg;
+
+   return 0;
+   }
default:
return -ENODEV;
}
@@ -1551,6 +1560,22 @@ static int arm_smmu_domain_set_attr(struct 
iommu_domain *domain,

else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+   case DOMAIN_ATTR_IO_PGTABLE_CFG: {
+   struct domain_attr_io_pgtbl_cfg *pgtbl_cfg = data;
+
+   if (smmu_domain->smmu) {
+   ret = -EPERM;
+   goto out_unlock;
+   }
+
+   if (!pgtbl_cfg) {


Do we really need to check this? If somebody passed us a NULL pointer then
they have a bug and we don't check this for other domain attributes afaict.




True, I'll drop it.


+   ret = -ENODEV;
+   goto out_unlock;
+   }
+
+   smmu_domain->pgtbl_cfg = *pgtbl_cfg;
+   break;
+   }
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h

index 04288b6fc619..18fbed376afb 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -364,6 +364,7 @@ enum arm_smmu_domain_stage {
 struct arm_smmu_domain {
struct arm_smmu_device  *smmu;
struct io_pgtable_ops   *pgtbl_ops;
+   struct domain_attr_io_pgtbl_cfg pgtbl_cfg;
const struct iommu_flush_ops*flush_ops;
struct arm_smmu_cfg cfg;
enum arm_smmu_domain_stage  stage;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index a9a2c59fab37..686b37d48743 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -212,6 +212,10 @@ struct io_pgtable {

 #define io_pgtable_ops_to_pgtable(x) container_of((x), struct 
io_pgtable, ops)


+struct domain_attr_io_pgtbl_cfg {
+   unsigned long quirks;
+};


nit: Can you rename this to 'struct io_pgtable_domain_attr' please?



Done, thanks.

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a 
member

of Code Aurora Forum, hosted by The Linux Foundation
___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv9 1/8] iommu/io-pgtable-arm: Add support to use system cache

2020-11-24 Thread Sai Prakash Ranjan
Add a quirk IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to override
the outer-cacheability attributes set in the TCR for a
non-coherent page table walker when using system cache.
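
For context, a hedged sketch of how an io-pgtable user could request this
behaviour when allocating its page-table ops (the function name and cookie
handling are illustrative, not part of this patch):

#include <linux/io-pgtable.h>

/* Sketch: a non-coherent walker asking for outer write-back write-allocate
 * TCR attributes. As the hunk below shows, the quirk is rejected (NULL is
 * returned) when the walker is coherent.
 */
static struct io_pgtable_ops *hyp_alloc_wbwa_pgtable(struct io_pgtable_cfg *cfg,
						     void *cookie)
{
	cfg->quirks |= IO_PGTABLE_QUIRK_ARM_OUTER_WBWA;

	return alloc_io_pgtable_ops(ARM_64_LPAE_S1, cfg, cookie);
}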

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/io-pgtable-arm.c | 10 --
 include/linux/io-pgtable.h |  4 
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index a7a9bc08dcd1..7c9ea9d7874a 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -761,7 +761,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
 
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NON_STRICT |
-   IO_PGTABLE_QUIRK_ARM_TTBR1))
+   IO_PGTABLE_QUIRK_ARM_TTBR1 |
+   IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
return NULL;
 
data = arm_lpae_alloc_pgtable(cfg);
@@ -773,10 +774,15 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
tcr->sh = ARM_LPAE_TCR_SH_IS;
tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
+   if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)
+   goto out_free_data;
} else {
tcr->sh = ARM_LPAE_TCR_SH_OS;
tcr->irgn = ARM_LPAE_TCR_RGN_NC;
-   tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+   if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
+   tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+   else
+   tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
}
 
tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 4cde111e425b..6b8bb4f4afef 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -86,6 +86,9 @@ struct io_pgtable_cfg {
 *
 * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
 *  for use in the upper half of a split address space.
+*
+* IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the outer-cacheability
+*  attributes set in the TCR for a non-coherent page-table walker.
 */
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS   BIT(1)
@@ -93,6 +96,7 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_MTK_EXTBIT(3)
#define IO_PGTABLE_QUIRK_NON_STRICT BIT(4)
#define IO_PGTABLE_QUIRK_ARM_TTBR1  BIT(5)
+   #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6)
unsigned long   quirks;
unsigned long   pgsize_bitmap;
unsigned intias;
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv8 4/8] drm/msm: rearrange the gpu_rmw() function

2020-11-18 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The register read-modify-write construct is generic enough
that it can be used by other subsystems as needed, so create
a more generic rmw() function and have gpu_rmw() use this
new function.

Signed-off-by: Sharat Masetty 
Reviewed-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/msm_drv.c | 8 
 drivers/gpu/drm/msm/msm_drv.h | 1 +
 drivers/gpu/drm/msm/msm_gpu.h | 5 +
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 49685571dc0e..a1e22b974b77 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr)
return val;
 }
 
+void msm_rmw(void __iomem *addr, u32 mask, u32 or)
+{
+   u32 val = msm_readl(addr);
+
+   val &= ~mask;
+   msm_writel(val | or, addr);
+}
+
 struct msm_vblank_work {
struct work_struct work;
int crtc_id;
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index b9dd8f8f4887..655b3b0424a1 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -478,6 +478,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device 
*pdev, const char *name,
const char *dbgname);
 void msm_writel(u32 data, void __iomem *addr);
 u32 msm_readl(const void __iomem *addr);
+void msm_rmw(void __iomem *addr, u32 mask, u32 or);
 
 struct msm_gpu_submitqueue;
 int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 6c9e1fdc1a76..b2b419277953 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -246,10 +246,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
 
 static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or)
 {
-   uint32_t val = gpu_read(gpu, reg);
-
-   val &= ~mask;
-   gpu_write(gpu, reg, val | or);
+   msm_rmw(gpu->mmio + (reg << 2), mask, or);
 }
 
 static inline u64 gpu_read64(struct msm_gpu *gpu, u32 lo, u32 hi)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv8 2/8] iommu/arm-smmu: Add domain attribute for pagetable configuration

2020-11-18 Thread Sai Prakash Ranjan
Add an iommu domain attribute for pagetable configuration. Initially
it will be used to set quirks, such as the one for the system cache
(aka last level cache), so that client drivers like the GPU can set
the right attributes for caching the hardware pagetables into the
system cache. Later it can be extended to include other page table
configuration data.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 25 +
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 +
 include/linux/io-pgtable.h|  4 
 include/linux/iommu.h |  1 +
 4 files changed, 31 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 0f28a8614da3..7b05782738e2 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
+   if (smmu_domain->pgtbl_cfg.quirks)
+   pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;
+
	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops) {
ret = -ENOMEM;
@@ -1511,6 +1514,12 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
case DOMAIN_ATTR_NESTING:
*(int *)data = (smmu_domain->stage == 
ARM_SMMU_DOMAIN_NESTED);
return 0;
+   case DOMAIN_ATTR_IO_PGTABLE_CFG: {
+   struct domain_attr_io_pgtbl_cfg *pgtbl_cfg = data;
+   *pgtbl_cfg = smmu_domain->pgtbl_cfg;
+
+   return 0;
+   }
default:
return -ENODEV;
}
@@ -1551,6 +1560,22 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+   case DOMAIN_ATTR_IO_PGTABLE_CFG: {
+   struct domain_attr_io_pgtbl_cfg *pgtbl_cfg = data;
+
+   if (smmu_domain->smmu) {
+   ret = -EPERM;
+   goto out_unlock;
+   }
+
+   if (!pgtbl_cfg) {
+   ret = -ENODEV;
+   goto out_unlock;
+   }
+
+   smmu_domain->pgtbl_cfg = *pgtbl_cfg;
+   break;
+   }
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 04288b6fc619..18fbed376afb 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -364,6 +364,7 @@ enum arm_smmu_domain_stage {
 struct arm_smmu_domain {
struct arm_smmu_device  *smmu;
struct io_pgtable_ops   *pgtbl_ops;
+   struct domain_attr_io_pgtbl_cfg pgtbl_cfg;
const struct iommu_flush_ops*flush_ops;
struct arm_smmu_cfg cfg;
enum arm_smmu_domain_stage  stage;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index a9a2c59fab37..686b37d48743 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -212,6 +212,10 @@ struct io_pgtable {
 
 #define io_pgtable_ops_to_pgtable(x) container_of((x), struct io_pgtable, ops)
 
+struct domain_attr_io_pgtbl_cfg {
+   unsigned long quirks;
+};
+
 static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
 {
iop->cfg.tlb->tlb_flush_all(iop->cookie);
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b95a6f8db6ff..ffaa389ea128 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -118,6 +118,7 @@ enum iommu_attr {
DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_NESTING,/* two stages of translation */
DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
+   DOMAIN_ATTR_IO_PGTABLE_CFG,
DOMAIN_ATTR_MAX,
 };
 
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv8 8/8] iommu: arm-smmu-impl: Add a space before open parenthesis

2020-11-18 Thread Sai Prakash Ranjan
Fix the checkpatch warning for space required before the open
parenthesis.

Signed-off-by: Sai Prakash Ranjan 
Acked-by: Will Deacon 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 26e2734eb4d7..136872e77195 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -12,7 +12,7 @@
 
 static int arm_smmu_gr0_ns(int offset)
 {
-   switch(offset) {
+   switch (offset) {
case ARM_SMMU_GR0_sCR0:
case ARM_SMMU_GR0_sACR:
case ARM_SMMU_GR0_sGFSR:
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv8 0/8] System Cache support for GPU and required SMMU support

2020-11-18 Thread Sai Prakash Ranjan
Some hardware variants contain a system cache, or last level
cache (LLC). This cache is typically a large block which is shared
by multiple clients on the SOC. The GPU uses the system cache to cache
both the GPU data buffers (like textures) as well as the SMMU pagetables.
This helps with improved render performance as well as lower power
consumption by reducing the bus traffic to the system memory.

The system cache architecture allows the cache to be split into slices
which can then be used by multiple SOC clients. This patch series is an
effort to enable and use two of those slices preallocated for the GPU,
one for the GPU data buffers and another for the GPU SMMU hardware
pagetables.

Patch 1 - Patch 6 adds system cache support in SMMU and GPU driver.
Patch 7 and 8 are minor cleanups for arm-smmu impl.

Changes in v8:
 * Introduce a generic domain attribute for pagetable config (Will)
 * Rename quirk to more generic IO_PGTABLE_QUIRK_ARM_OUTER_WBWA (Will)
 * Move non-strict mode to use new struct domain_attr_io_pgtbl_config (Will)

Changes in v7:
 * Squash Jordan's patch to support MMU500 targets
 * Rebase on top of for-joerg/arm-smmu/updates and Jordan's short series for 
adreno-smmu impl

Changes in v6:
 * Move table to arm-smmu-qcom (Robin)

Changes in v5:
 * Drop cleanup of blank lines since it was intentional (Robin)
 * Rebase again on top of msm-next-pgtables as it moves pretty fast

Changes in v4:
 * Drop IOMMU_SYS_CACHE prot flag
 * Rebase on top of 
https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables

Changes in v3:
 * Fix domain attribute setting to before iommu_attach_device()
 * Fix few code style and checkpatch warnings
 * Rebase on top of Jordan's latest split pagetables and per-instance
   pagetables support

Changes in v2:
 * Addressed review comments and rebased on top of Jordan's split
   pagetables series

Jordan Crouse (1):
  drm/msm/a6xx: Add support for using system cache on MMU500 based
targets

Sai Prakash Ranjan (5):
  iommu/io-pgtable-arm: Add support to use system cache
  iommu/arm-smmu: Add domain attribute for pagetable configuration
  iommu/arm-smmu: Move non-strict mode to use domain_attr_io_pgtbl_cfg
  iommu: arm-smmu-impl: Use table to list QCOM implementations
  iommu: arm-smmu-impl: Add a space before open parenthesis

Sharat Masetty (2):
  drm/msm: rearrange the gpu_rmw() function
  drm/msm/a6xx: Add support for using system cache(LLC)

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c  | 109 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h  |   5 +
 drivers/gpu/drm/msm/adreno/adreno_gpu.c|  17 
 drivers/gpu/drm/msm/msm_drv.c  |   8 ++
 drivers/gpu/drm/msm/msm_drv.h  |   1 +
 drivers/gpu/drm/msm/msm_gpu.h  |   5 +-
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |  11 +--
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c |  21 +++-
 drivers/iommu/arm/arm-smmu/arm-smmu.c  |  30 +-
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |   3 +-
 drivers/iommu/io-pgtable-arm.c |  10 +-
 include/linux/io-pgtable.h |   8 ++
 include/linux/iommu.h  |   1 +
 13 files changed, 203 insertions(+), 26 deletions(-)


base-commit: a29bbb0861f487a5e144dc997a9f71a36c7a2404
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv8 3/8] iommu/arm-smmu: Move non-strict mode to use domain_attr_io_pgtbl_cfg

2020-11-18 Thread Sai Prakash Ranjan
Now that we have a struct domain_attr_io_pgtbl_cfg with quirks,
use that for non_strict mode as well, thereby removing the need
for more members of arm_smmu_domain in the future.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 7 ++-
 drivers/iommu/arm/arm-smmu/arm-smmu.h | 1 -
 2 files changed, 2 insertions(+), 6 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 7b05782738e2..5f066a1b7221 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -786,9 +786,6 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
goto out_clear_smmu;
}
 
-   if (smmu_domain->non_strict)
-   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
-
if (smmu_domain->pgtbl_cfg.quirks)
pgtbl_cfg.quirks |= smmu_domain->pgtbl_cfg.quirks;
 
@@ -1527,7 +1524,7 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
case IOMMU_DOMAIN_DMA:
switch (attr) {
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-   *(int *)data = smmu_domain->non_strict;
+   *(int *)data = smmu_domain->pgtbl_cfg.quirks;
return 0;
default:
return -ENODEV;
@@ -1583,7 +1580,7 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
case IOMMU_DOMAIN_DMA:
switch (attr) {
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
-   smmu_domain->non_strict = *(int *)data;
+   smmu_domain->pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
break;
default:
ret = -ENODEV;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 18fbed376afb..caae543ea077 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -368,7 +368,6 @@ struct arm_smmu_domain {
const struct iommu_flush_ops*flush_ops;
struct arm_smmu_cfg cfg;
enum arm_smmu_domain_stage  stage;
-   boolnon_strict;
struct mutexinit_mutex; /* Protects smmu pointer */
spinlock_t  cb_lock; /* Serialises ATS1* ops and 
TLB syncs */
struct iommu_domain domain;
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv8 7/8] iommu: arm-smmu-impl: Use table to list QCOM implementations

2020-11-18 Thread Sai Prakash Ranjan
Use a table and of_match_node() to match the qcom implementation
instead of multiple of_device_is_compatible() calls for each
QCOM SMMU implementation.

Signed-off-by: Sai Prakash Ranjan 
Acked-by: Will Deacon 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |  9 +
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 -
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |  1 -
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index 7fed89c9d18a..26e2734eb4d7 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -214,14 +214,7 @@ struct arm_smmu_device *arm_smmu_impl_init(struct 
arm_smmu_device *smmu)
if (of_device_is_compatible(np, "nvidia,tegra194-smmu"))
return nvidia_smmu_impl_init(smmu);
 
-   if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sc7180-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8150-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8250-smmu-500"))
-   return qcom_smmu_impl_init(smmu);
-
-   if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu"))
-   return qcom_adreno_smmu_impl_init(smmu);
+   smmu = qcom_smmu_impl_init(smmu);
 
if (of_device_is_compatible(np, "marvell,ap806-smmu-500"))
		smmu->impl = &arm_mmu500_impl;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index d0636c803a36..add1859b2899 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -318,12 +318,23 @@ static struct arm_smmu_device *qcom_smmu_create(struct 
arm_smmu_device *smmu,
	return &qsmmu->smmu;
 }
 
+static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = {
+   { .compatible = "qcom,sc7180-smmu-500" },
+   { .compatible = "qcom,sdm845-smmu-500" },
+   { .compatible = "qcom,sm8150-smmu-500" },
+   { .compatible = "qcom,sm8250-smmu-500" },
+   { }
+};
+
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
 {
-   return qcom_smmu_create(smmu, &qcom_smmu_impl);
-}
+   const struct device_node *np = smmu->dev->of_node;
 
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu)
-{
-   return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl);
+   if (of_match_node(qcom_smmu_impl_of_match, np))
+   return qcom_smmu_create(smmu, &qcom_smmu_impl);
+
+   if (of_device_is_compatible(np, "qcom,adreno-smmu"))
+   return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl);
+
+   return smmu;
 }
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index caae543ea077..7db81c7c7833 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -523,7 +523,6 @@ static inline void arm_smmu_writeq(struct arm_smmu_device 
*smmu, int page,
 struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu);
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device 
*smmu);
 
 void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx);
 int arm_mmu500_reset(struct arm_smmu_device *smmu);
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

___
dri-devel mailing list
dri-devel@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/dri-devel


[PATCHv8 6/8] drm/msm/a6xx: Add support for using system cache on MMU500 based targets

2020-11-18 Thread Sai Prakash Ranjan
From: Jordan Crouse 

GPU targets with an MMU-500 attached have a slightly different process for
enabling system cache. Use the compatible string on the IOMMU phandle
to see if an MMU-500 is attached and modify the programming sequence
accordingly.

Signed-off-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 +--
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h |  1 +
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 95c98c642876..3f8b92da8cba 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1042,6 +1042,8 @@ static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
 
 static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
 {
+   struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
+   struct msm_gpu *gpu = &adreno_gpu->base;
u32 cntl1_regval = 0;
 
if (IS_ERR(a6xx_gpu->llc_mmio))
@@ -1055,11 +1057,17 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
   (gpu_scid << 15) | (gpu_scid << 20);
}
 
+   /*
+* For targets with a MMU500, activate the slice but don't program the
+* register.  The XBL will take care of that.
+*/
if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
-   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+   if (!a6xx_gpu->have_mmu500) {
+   u32 gpuhtw_scid = 
llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
 
-   gpuhtw_scid &= 0x1f;
-   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), 
gpuhtw_scid);
+   }
}
 
if (cntl1_regval) {
@@ -1067,13 +1075,20 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
 * Program the slice IDs for the various GPU blocks and GPU MMU
 * pagetables
 */
-   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
-
-   /*
-* Program cacheability overrides to not allocate cache lines on
-* a write miss
-*/
-   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 
0xF, 0x03);
+   if (a6xx_gpu->have_mmu500)
+   gpu_rmw(gpu, REG_A6XX_GBIF_SCACHE_CNTL1, GENMASK(24, 0),
+   cntl1_regval);
+   else {
+   a6xx_llc_write(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache
+* lines on a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 
0x03);
+   }
}
 }
 
@@ -1086,10 +1101,21 @@ static void a6xx_llc_slices_destroy(struct a6xx_gpu 
*a6xx_gpu)
 static void a6xx_llc_slices_init(struct platform_device *pdev,
struct a6xx_gpu *a6xx_gpu)
 {
+   struct device_node *phandle;
+
a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
if (IS_ERR(a6xx_gpu->llc_mmio))
return;
 
+   /*
+* There is a different programming path for targets with an mmu500
+* attached, so detect if that is the case
+*/
+   phandle = of_parse_phandle(pdev->dev.of_node, "iommus", 0);
+   a6xx_gpu->have_mmu500 = (phandle &&
+   of_device_is_compatible(phandle, "arm,mmu-500"));
+   of_node_put(phandle);
+
a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
 
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
index 9e6079af679c..e793d329e77b 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
@@ -32,6 +32,7 @@ struct a6xx_gpu {
void __iomem *llc_mmio;
void *llc_slice;
void *htw_llc_slice;
+   bool have_mmu500;
 };
 
 #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)
-- 


[PATCHv8 5/8] drm/msm/a6xx: Add support for using system cache(LLC)

2020-11-18 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The last level system cache can be partitioned into 32 different
slices, of which the GPU has two preallocated. One slice is used
for caching GPU data buffers and the other slice is used for
caching the GPU SMMU pagetables. This patch talks to the core system
cache driver to acquire the slice handles, configures the SCIDs for
those slices, and activates and deactivates the slices on GPU power
collapse and restore.

Some support from the IOMMU driver is also needed to set the right
TCR attributes so the system cache can be used. The GPU can then
override a few cacheability parameters, which it does to turn
write-allocate into write-no-allocate since the GPU hardware does
not benefit much from it.

DOMAIN_ATTR_IO_PGTABLE_CFG is a domain-level attribute used by the
IOMMU driver for pagetable configuration. Initially it carries a
single quirk that sets the right attributes for caching the hardware
pagetables in the system cache.
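
For illustration only (not part of this patch), a client driver would then set
the attribute on its domain before attaching the device; a rough sketch,
assuming the attribute payload is a small struct carrying just the io-pgtable
quirks as defined by the IOMMU patches in this series, and with dev standing in
for the GPU's struct device:

    struct io_pgtable_domain_attr pgtbl_cfg = {
        .quirks = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA,
    };
    struct iommu_domain *domain = iommu_domain_alloc(&platform_bus_type);

    /* must happen before iommu_attach_device(), while the domain is still unused */
    iommu_domain_set_attr(domain, DOMAIN_ATTR_IO_PGTABLE_CFG, &pgtbl_cfg);
    iommu_attach_device(domain, dev);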

Signed-off-by: Sharat Masetty 
[saiprakash.ranjan: fix to set attr before device attach to iommu and rebase]
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 83 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h   |  4 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +
 3 files changed, 104 insertions(+)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 948f3656c20c..95c98c642876 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -8,7 +8,9 @@
 #include "a6xx_gpu.h"
 #include "a6xx_gmu.xml.h"
 
+#include 
 #include 
+#include 
 
 #define GPU_PAS_ID 13
 
@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
return IRQ_HANDLED;
 }
 
+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or)
+{
+   return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or);
+}
+
+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value)
+{
+   return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2));
+}
+
+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_deactivate(a6xx_gpu->llc_slice);
+   llcc_slice_deactivate(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
+{
+   u32 cntl1_regval = 0;
+
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   if (!llcc_slice_activate(a6xx_gpu->llc_slice)) {
+   u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice);
+
+   gpu_scid &= 0x1f;
+   cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 
10) |
+  (gpu_scid << 15) | (gpu_scid << 20);
+   }
+
+   if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
+   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   }
+
+   if (cntl1_regval) {
+   /*
+* Program the slice IDs for the various GPU blocks and GPU MMU
+* pagetables
+*/
+   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache lines on
+* a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 
0xF, 0x03);
+   }
+}
+
+static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_putd(a6xx_gpu->llc_slice);
+   llcc_slice_putd(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_slices_init(struct platform_device *pdev,
+   struct a6xx_gpu *a6xx_gpu)
+{
+   a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
+   a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
+
+   if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice))
+   a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL);
+}
+
 static int a6xx_pm_resume(struct msm_gpu *gpu)
 {
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
 
msm_gpu_resume_devfreq(gpu);
 
+   a6xx_llc_activate(a6xx_gpu);
+
return 0;
 }
 
@@ -1048,6 +1125,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
 
trace_msm_gpu_suspend(0);
 
+   a6xx_llc_deactivate(a6xx_gpu);
+
devfreq_suspend_device(gpu->devfreq.devfreq);
 
return a6xx_gmu_stop(a6xx_gpu);
@@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
drm_ge

[PATCHv8 1/8] iommu/io-pgtable-arm: Add support to use system cache

2020-11-18 Thread Sai Prakash Ranjan
Add a quirk IO_PGTABLE_QUIRK_ARM_OUTER_WBWA to override
the attributes set in TCR for the page table walker when
using system cache.
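
For illustration only (not part of this patch), a caller of the io-pgtable code
would request the quirk through io_pgtable_cfg; a minimal sketch, where the
flush ops, cookie, device and geometry are placeholder assumptions:

    struct io_pgtable_cfg cfg = {
        .quirks         = IO_PGTABLE_QUIRK_ARM_OUTER_WBWA,
        .pgsize_bitmap  = SZ_4K | SZ_2M | SZ_1G,
        .ias            = 48,
        .oas            = 48,
        .coherent_walk  = false,  /* the quirk only makes sense for a non-coherent walker */
        .tlb            = &my_flush_ops,
        .iommu_dev      = dev,
    };
    struct io_pgtable_ops *ops;

    ops = alloc_io_pgtable_ops(ARM_64_LPAE_S1, &cfg, cookie);
    if (!ops)
        return -EINVAL;  /* unsupported quirk combinations are rejected here */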

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/io-pgtable-arm.c | 10 --
 include/linux/io-pgtable.h |  4 
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index a7a9bc08dcd1..7c9ea9d7874a 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -761,7 +761,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
 
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NON_STRICT |
-   IO_PGTABLE_QUIRK_ARM_TTBR1))
+   IO_PGTABLE_QUIRK_ARM_TTBR1 |
+   IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
return NULL;
 
data = arm_lpae_alloc_pgtable(cfg);
@@ -773,10 +774,15 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
tcr->sh = ARM_LPAE_TCR_SH_IS;
tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
+   if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA)
+   goto out_free_data;
} else {
tcr->sh = ARM_LPAE_TCR_SH_OS;
tcr->irgn = ARM_LPAE_TCR_RGN_NC;
-   tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+   if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
+   tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+   else
+   tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
}
 
tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 4cde111e425b..a9a2c59fab37 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -86,6 +86,9 @@ struct io_pgtable_cfg {
 *
 * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
 *  for use in the upper half of a split address space.
+*
+* IO_PGTABLE_QUIRK_ARM_OUTER_WBWA: Override the attributes set in TCR 
for
+*  the page table walker when using system cache.
 */
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS   BIT(1)
@@ -93,6 +96,7 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_MTK_EXTBIT(3)
#define IO_PGTABLE_QUIRK_NON_STRICT BIT(4)
#define IO_PGTABLE_QUIRK_ARM_TTBR1  BIT(5)
+   #define IO_PGTABLE_QUIRK_ARM_OUTER_WBWA BIT(6)
unsigned long   quirks;
unsigned long   pgsize_bitmap;
unsigned intias;
-- 


Re: [PATCHv7 2/7] iommu/arm-smmu: Add domain attribute for system cache

2020-11-10 Thread Sai Prakash Ranjan

On 2020-11-10 17:48, Will Deacon wrote:

On Fri, Oct 30, 2020 at 02:53:09PM +0530, Sai Prakash Ranjan wrote:

Add iommu domain attribute for using system cache aka last level
cache by client drivers like GPU to set right attributes for caching
the hardware pagetables into the system cache.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 +
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 +
 include/linux/iommu.h |  1 +
 3 files changed, 19 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c

index b1cf8f0abc29..070d13f80c7e 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct 
iommu_domain *domain,

if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;

+   if (smmu_domain->sys_cache)
+   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
+
pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops) {
ret = -ENOMEM;
@@ -1520,6 +1523,9 @@ static int arm_smmu_domain_get_attr(struct 
iommu_domain *domain,

case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
*(int *)data = smmu_domain->non_strict;
return 0;
+   case DOMAIN_ATTR_SYS_CACHE:
+   *((int *)data) = smmu_domain->sys_cache;
+   return 0;
default:
return -ENODEV;
}
@@ -1551,6 +1557,17 @@ static int arm_smmu_domain_set_attr(struct 
iommu_domain *domain,

else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+   case DOMAIN_ATTR_SYS_CACHE:
+   if (smmu_domain->smmu) {
+   ret = -EPERM;
+   goto out_unlock;
+   }
+
+   if (*((int *)data))
+   smmu_domain->sys_cache = true;
+   else
+   smmu_domain->sys_cache = false;
+   break;
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h

index 885840f3bec8..dfc44d806671 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -373,6 +373,7 @@ struct arm_smmu_domain {
struct mutexinit_mutex; /* Protects smmu pointer */
spinlock_t  cb_lock; /* Serialises ATS1* ops and 
TLB syncs */
struct iommu_domain domain;
+   boolsys_cache;
 };

 struct arm_smmu_master_cfg {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b95a6f8db6ff..4f4bb9c6f8f6 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -118,6 +118,7 @@ enum iommu_attr {
DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_NESTING,/* two stages of translation */
DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
+   DOMAIN_ATTR_SYS_CACHE,


I think you're trying to make this look generic, but it's really not.
If we need to funnel io-pgtable quirks through domain attributes, then I
think we should be open about that and add something like
DOMAIN_ATTR_IO_PGTABLE_CFG which could take a struct of page-table
configuration data for the domain (this could just be quirks initially,
but maybe it's worth extending to take ias, oas and page size)



Actually the initial versions used DOMAIN_ATTR_QCOM_SYS_CACHE
to make it QCOM specific and not generic, I don't see anyone else
using this attribute, would that work?

Thanks,
Sai

--


Re: [PATCHv7 1/7] iommu/io-pgtable-arm: Add support to use system cache

2020-11-10 Thread Sai Prakash Ranjan

On 2020-11-10 17:48, Will Deacon wrote:

On Fri, Oct 30, 2020 at 02:53:08PM +0530, Sai Prakash Ranjan wrote:

Add a quirk IO_PGTABLE_QUIRK_SYS_CACHE to override the
attributes set in TCR for the page table walker when
using system cache.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/io-pgtable-arm.c | 7 ++-
 include/linux/io-pgtable.h | 4 
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/io-pgtable-arm.c 
b/drivers/iommu/io-pgtable-arm.c

index a7a9bc08dcd1..a356caf1683a 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -761,7 +761,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg 
*cfg, void *cookie)


if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NON_STRICT |
-   IO_PGTABLE_QUIRK_ARM_TTBR1))
+   IO_PGTABLE_QUIRK_ARM_TTBR1 |
+   IO_PGTABLE_QUIRK_SYS_CACHE))
return NULL;

data = arm_lpae_alloc_pgtable(cfg);
@@ -773,6 +774,10 @@ arm_64_lpae_alloc_pgtable_s1(struct 
io_pgtable_cfg *cfg, void *cookie)

tcr->sh = ARM_LPAE_TCR_SH_IS;
tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
+   } else if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) {
+   tcr->sh = ARM_LPAE_TCR_SH_OS;
+   tcr->irgn = ARM_LPAE_TCR_RGN_NC;
+   tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;


Given that this only applies in the case where the page-table walker is
non-coherent, I think we'd be better off renaming the quirk to something
like IO_PGTABLE_QUIRK_ARM_OUTER_WBWA and then rejecting it in the
non-coherent case.



Do you mean like below?

diff --git a/drivers/iommu/io-pgtable-arm.c 
b/drivers/iommu/io-pgtable-arm.c

index a7a9bc08dcd1..94de1f71db42 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -776,7 +776,10 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg 
*cfg, void *cookie)

} else {
tcr->sh = ARM_LPAE_TCR_SH_OS;
tcr->irgn = ARM_LPAE_TCR_RGN_NC;
-   tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+   if (!(cfg->quirks & IO_PGTABLE_QUIRK_ARM_OUTER_WBWA))
+   tcr->orgn = ARM_LPAE_TCR_RGN_NC;
+   else
+   tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
}

tg1 = cfg->quirks & IO_PGTABLE_QUIRK_ARM_TTBR1;


Thanks,
Sai

--


Re: [PATCHv7 0/7] System Cache support for GPU and required SMMU support

2020-11-09 Thread Sai Prakash Ranjan

On 2020-10-30 14:53, Sai Prakash Ranjan wrote:

Some hardware variants contain a system cache or the last level
cache(llc). This cache is typically a large block which is shared
by multiple clients on the SOC. GPU uses the system cache to cache
both the GPU data buffers(like textures) as well the SMMU pagetables.
This helps with improved render performance as well as lower power
consumption by reducing the bus traffic to the system memory.

The system cache architecture allows the cache to be split into slices
which then be used by multiple SOC clients. This patch series is an
effort to enable and use two of those slices preallocated for the GPU,
one for the GPU data buffers and another for the GPU SMMU hardware
pagetables.

Patch 1 - Patch 5 adds system cache support in SMMU and GPU driver.
Patch 6 and 7 are minor cleanups for arm-smmu impl.

Changes in v7:
 * Squash Jordan's patch to support MMU500 targets
 * Rebase on top of for-joerg/arm-smmu/updates and Jordan's short
series for adreno-smmu impl

Changes in v6:
 * Move table to arm-smmu-qcom (Robin)

Changes in v5:
 * Drop cleanup of blank lines since it was intentional (Robin)
 * Rebase again on top of msm-next-pgtables as it moves pretty fast

Changes in v4:
 * Drop IOMMU_SYS_CACHE prot flag
 * Rebase on top of
https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables

Changes in v3:
 * Fix domain attribute setting to before iommu_attach_device()
 * Fix few code style and checkpatch warnings
 * Rebase on top of Jordan's latest split pagetables and per-instance
   pagetables support

Changes in v2:
 * Addressed review comments and rebased on top of Jordan's split
   pagetables series

Jordan Crouse (1):
  drm/msm/a6xx: Add support for using system cache on MMU500 based
targets

Sai Prakash Ranjan (4):
  iommu/io-pgtable-arm: Add support to use system cache
  iommu/arm-smmu: Add domain attribute for system cache
  iommu: arm-smmu-impl: Use table to list QCOM implementations
  iommu: arm-smmu-impl: Add a space before open parenthesis

Sharat Masetty (2):
  drm/msm: rearrange the gpu_rmw() function
  drm/msm/a6xx: Add support for using system cache(LLC)



Hi,

Gentle Ping!

Thanks,
Sai

--


[PATCHv7 1/7] iommu/io-pgtable-arm: Add support to use system cache

2020-11-01 Thread Sai Prakash Ranjan
Add a quirk IO_PGTABLE_QUIRK_SYS_CACHE to override the
attributes set in TCR for the page table walker when
using system cache.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/io-pgtable-arm.c | 7 ++-
 include/linux/io-pgtable.h | 4 
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index a7a9bc08dcd1..a356caf1683a 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -761,7 +761,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
 
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NON_STRICT |
-   IO_PGTABLE_QUIRK_ARM_TTBR1))
+   IO_PGTABLE_QUIRK_ARM_TTBR1 |
+   IO_PGTABLE_QUIRK_SYS_CACHE))
return NULL;
 
data = arm_lpae_alloc_pgtable(cfg);
@@ -773,6 +774,10 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
tcr->sh = ARM_LPAE_TCR_SH_IS;
tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
+   } else if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) {
+   tcr->sh = ARM_LPAE_TCR_SH_OS;
+   tcr->irgn = ARM_LPAE_TCR_RGN_NC;
+   tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
} else {
tcr->sh = ARM_LPAE_TCR_SH_OS;
tcr->irgn = ARM_LPAE_TCR_RGN_NC;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 4cde111e425b..86631f711e05 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -86,6 +86,9 @@ struct io_pgtable_cfg {
 *
 * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
 *  for use in the upper half of a split address space.
+*
+* IO_PGTABLE_QUIRK_SYS_CACHE: Override the attributes set in TCR for
+*  the page table walker when using system cache.
 */
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS   BIT(1)
@@ -93,6 +96,7 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_MTK_EXTBIT(3)
#define IO_PGTABLE_QUIRK_NON_STRICT BIT(4)
#define IO_PGTABLE_QUIRK_ARM_TTBR1  BIT(5)
+   #define IO_PGTABLE_QUIRK_SYS_CACHE  BIT(6)
unsigned long   quirks;
unsigned long   pgsize_bitmap;
unsigned intias;
-- 


[PATCHv7 0/7] System Cache support for GPU and required SMMU support

2020-11-01 Thread Sai Prakash Ranjan
Some hardware variants contain a system cache, or last level
cache (LLC). This cache is typically a large block which is shared
by multiple clients on the SoC. The GPU uses the system cache to cache
both the GPU data buffers (like textures) as well as the SMMU pagetables.
This helps with improved render performance as well as lower power
consumption by reducing the bus traffic to the system memory.

The system cache architecture allows the cache to be split into slices
which can then be used by multiple SoC clients. This patch series is an
effort to enable and use two of those slices preallocated for the GPU,
one for the GPU data buffers and another for the GPU SMMU hardware
pagetables.

Patch 1 - Patch 5 adds system cache support in SMMU and GPU driver.
Patch 6 and 7 are minor cleanups for arm-smmu impl.

Changes in v7:
 * Squash Jordan's patch to support MMU500 targets
 * Rebase on top of for-joerg/arm-smmu/updates and Jordan's short series for 
adreno-smmu impl

Changes in v6:
 * Move table to arm-smmu-qcom (Robin)

Changes in v5:
 * Drop cleanup of blank lines since it was intentional (Robin)
 * Rebase again on top of msm-next-pgtables as it moves pretty fast

Changes in v4:
 * Drop IOMMU_SYS_CACHE prot flag
 * Rebase on top of 
https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables

Changes in v3:
 * Fix domain attribute setting to before iommu_attach_device()
 * Fix few code style and checkpatch warnings
 * Rebase on top of Jordan's latest split pagetables and per-instance
   pagetables support

Changes in v2:
 * Addressed review comments and rebased on top of Jordan's split
   pagetables series

Jordan Crouse (1):
  drm/msm/a6xx: Add support for using system cache on MMU500 based
targets

Sai Prakash Ranjan (4):
  iommu/io-pgtable-arm: Add support to use system cache
  iommu/arm-smmu: Add domain attribute for system cache
  iommu: arm-smmu-impl: Use table to list QCOM implementations
  iommu: arm-smmu-impl: Add a space before open parenthesis

Sharat Masetty (2):
  drm/msm: rearrange the gpu_rmw() function
  drm/msm/a6xx: Add support for using system cache(LLC)

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c  | 109 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h  |   5 +
 drivers/gpu/drm/msm/adreno/adreno_gpu.c|  17 
 drivers/gpu/drm/msm/msm_drv.c  |   8 ++
 drivers/gpu/drm/msm/msm_drv.h  |   1 +
 drivers/gpu/drm/msm/msm_gpu.h  |   5 +-
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |  11 +--
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c |  21 +++-
 drivers/iommu/arm/arm-smmu/arm-smmu.c  |  17 
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |   2 +-
 drivers/iommu/io-pgtable-arm.c |   7 +-
 include/linux/io-pgtable.h |   4 +
 include/linux/iommu.h  |   1 +
 13 files changed, 188 insertions(+), 20 deletions(-)


base-commit: f9081b8ff5934b8d69c748d0200e844cadd2c667
prerequisite-patch-id: db09851f375ca5efde35f2e5c21b3959eed7d8a8
prerequisite-patch-id: 55c6af17808c2047b67cdbd04af5541156ef496e
prerequisite-patch-id: e82c1e678da701e112ac255ea966c6797d975692
prerequisite-patch-id: f7978f5f2fb06528b7a1f75fa4255e386a30b91a
-- 


[PATCHv7 6/7] iommu: arm-smmu-impl: Use table to list QCOM implementations

2020-11-01 Thread Sai Prakash Ranjan
Use a table and of_match_node() to match the QCOM implementation
instead of a chain of of_device_is_compatible() calls, one for each
QCOM SMMU implementation.
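
As a side note (this is not what the patch below does), of_match_node() also
returns the matching entry, so the impl pointer could alternatively be carried
in the table's .data field and looked up in one step; a rough sketch:

    static const struct of_device_id qcom_smmu_impl_of_match[] = {
        { .compatible = "qcom,sdm845-smmu-500", .data = &qcom_smmu_impl },
        { .compatible = "qcom,adreno-smmu",     .data = &qcom_adreno_smmu_impl },
        { }
    };

    const struct of_device_id *match = of_match_node(qcom_smmu_impl_of_match, np);

    if (match)
        return qcom_smmu_create(smmu, match->data);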

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |  9 +
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 -
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |  1 -
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index d199b4bff15d..ffaf3f91ba52 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -217,14 +217,7 @@ struct arm_smmu_device *arm_smmu_impl_init(struct 
arm_smmu_device *smmu)
if (of_device_is_compatible(np, "nvidia,tegra194-smmu"))
return nvidia_smmu_impl_init(smmu);
 
-   if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sc7180-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8150-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8250-smmu-500"))
-   return qcom_smmu_impl_init(smmu);
-
-   if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu"))
-   return qcom_adreno_smmu_impl_init(smmu);
+   smmu = qcom_smmu_impl_init(smmu);
 
if (of_device_is_compatible(np, "marvell,ap806-smmu-500"))
smmu->impl = &mrvl_mmu500_impl;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 0f763d555c92..221e2a6a3231 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -314,12 +314,23 @@ static struct arm_smmu_device *qcom_smmu_create(struct 
arm_smmu_device *smmu,
return &qsmmu->smmu;
 }
 
+static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = {
+   { .compatible = "qcom,sc7180-smmu-500" },
+   { .compatible = "qcom,sdm845-smmu-500" },
+   { .compatible = "qcom,sm8150-smmu-500" },
+   { .compatible = "qcom,sm8250-smmu-500" },
+   { }
+};
+
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
 {
-   return qcom_smmu_create(smmu, &qcom_smmu_impl);
-}
+   const struct device_node *np = smmu->dev->of_node;
 
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device 
*smmu)
-{
-   return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl);
+   if (of_match_node(qcom_smmu_impl_of_match, np))
+   return qcom_smmu_create(smmu, &qcom_smmu_impl);
+
+   if (of_device_is_compatible(np, "qcom,adreno-smmu"))
+   return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl);
+
+   return smmu;
 }
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index dfc44d806671..43b2411e65cc 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -525,7 +525,6 @@ static inline void arm_smmu_writeq(struct arm_smmu_device 
*smmu, int page,
 struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu);
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device 
*smmu);
 
 void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx);
 int arm_mmu500_reset(struct arm_smmu_device *smmu);
-- 


[PATCHv7 3/7] drm/msm: rearrange the gpu_rmw() function

2020-11-01 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The register read-modify-write construct is generic enough that it
can be used by other subsystems as needed. Create a more generic
msm_rmw() function and have gpu_rmw() use this new function.
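
For example (illustrative register name only), both helpers read the register
back, clear the mask bits and OR in the new value:

    /* set bits [7:4] of a hypothetical register to 0x3, leaving the rest untouched */
    msm_rmw(gpu->mmio + (REG_EXAMPLE << 2), GENMASK(7, 4), FIELD_PREP(GENMASK(7, 4), 0x3));

    /* which is exactly what gpu_rmw() now does internally */
    gpu_rmw(gpu, REG_EXAMPLE, GENMASK(7, 4), FIELD_PREP(GENMASK(7, 4), 0x3));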

Signed-off-by: Sharat Masetty 
Reviewed-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/msm_drv.c | 8 
 drivers/gpu/drm/msm/msm_drv.h | 1 +
 drivers/gpu/drm/msm/msm_gpu.h | 5 +
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 49685571dc0e..a1e22b974b77 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr)
return val;
 }
 
+void msm_rmw(void __iomem *addr, u32 mask, u32 or)
+{
+   u32 val = msm_readl(addr);
+
+   val &= ~mask;
+   msm_writel(val | or, addr);
+}
+
 struct msm_vblank_work {
struct work_struct work;
int crtc_id;
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index b9dd8f8f4887..655b3b0424a1 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -478,6 +478,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device 
*pdev, const char *name,
const char *dbgname);
 void msm_writel(u32 data, void __iomem *addr);
 u32 msm_readl(const void __iomem *addr);
+void msm_rmw(void __iomem *addr, u32 mask, u32 or);
 
 struct msm_gpu_submitqueue;
 int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 6c9e1fdc1a76..b2b419277953 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -246,10 +246,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
 
 static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or)
 {
-   uint32_t val = gpu_read(gpu, reg);
-
-   val &= ~mask;
-   gpu_write(gpu, reg, val | or);
+   msm_rmw(gpu->mmio + (reg << 2), mask, or);
 }
 
 static inline u64 gpu_read64(struct msm_gpu *gpu, u32 lo, u32 hi)
-- 


[PATCHv7 2/7] iommu/arm-smmu: Add domain attribute for system cache

2020-11-01 Thread Sai Prakash Ranjan
Add an iommu domain attribute for using the system cache, aka last
level cache, so that client drivers like the GPU can set the right
attributes for caching the hardware pagetables in the system cache.
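
For illustration only (not part of this patch), a client driver would set the
attribute before attaching the device, since the SMMU driver rejects it once
the domain has been initialised; a rough sketch (dev stands for the client's
struct device):

    struct iommu_domain *domain = iommu_domain_alloc(&platform_bus_type);
    int use_llc = 1;

    if (iommu_domain_set_attr(domain, DOMAIN_ATTR_SYS_CACHE, &use_llc))
        dev_warn(dev, "system cache hint not honoured\n");

    iommu_attach_device(domain, dev);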

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 +
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 +
 include/linux/iommu.h |  1 +
 3 files changed, 19 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index b1cf8f0abc29..070d13f80c7e 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
+   if (smmu_domain->sys_cache)
+   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
+
pgtbl_ops = alloc_io_pgtable_ops(fmt, _cfg, smmu_domain);
if (!pgtbl_ops) {
ret = -ENOMEM;
@@ -1520,6 +1523,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
*(int *)data = smmu_domain->non_strict;
return 0;
+   case DOMAIN_ATTR_SYS_CACHE:
+   *((int *)data) = smmu_domain->sys_cache;
+   return 0;
default:
return -ENODEV;
}
@@ -1551,6 +1557,17 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+   case DOMAIN_ATTR_SYS_CACHE:
+   if (smmu_domain->smmu) {
+   ret = -EPERM;
+   goto out_unlock;
+   }
+
+   if (*((int *)data))
+   smmu_domain->sys_cache = true;
+   else
+   smmu_domain->sys_cache = false;
+   break;
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 885840f3bec8..dfc44d806671 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -373,6 +373,7 @@ struct arm_smmu_domain {
struct mutexinit_mutex; /* Protects smmu pointer */
spinlock_t  cb_lock; /* Serialises ATS1* ops and 
TLB syncs */
struct iommu_domain domain;
+   boolsys_cache;
 };
 
 struct arm_smmu_master_cfg {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index b95a6f8db6ff..4f4bb9c6f8f6 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -118,6 +118,7 @@ enum iommu_attr {
DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_NESTING,/* two stages of translation */
DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
+   DOMAIN_ATTR_SYS_CACHE,
DOMAIN_ATTR_MAX,
 };
 
-- 


[PATCHv7 5/7] drm/msm/a6xx: Add support for using system cache on MMU500 based targets

2020-11-01 Thread Sai Prakash Ranjan
From: Jordan Crouse 

This is an extension to the series [1] to enable the System Cache (LLC) for
Adreno a6xx targets.

GPU targets with an MMU-500 attached have a slightly different process for
enabling system cache. Use the compatible string on the IOMMU phandle
to see if an MMU-500 is attached and modify the programming sequence
accordingly.

[1] https://patchwork.freedesktop.org/series/83037/

Signed-off-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 +--
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h |  1 +
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 95c98c642876..3f8b92da8cba 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1042,6 +1042,8 @@ static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
 
 static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
 {
+   struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
+   struct msm_gpu *gpu = &adreno_gpu->base;
u32 cntl1_regval = 0;
 
if (IS_ERR(a6xx_gpu->llc_mmio))
@@ -1055,11 +1057,17 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
   (gpu_scid << 15) | (gpu_scid << 20);
}
 
+   /*
+* For targets with a MMU500, activate the slice but don't program the
+* register.  The XBL will take care of that.
+*/
if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
-   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+   if (!a6xx_gpu->have_mmu500) {
+   u32 gpuhtw_scid = 
llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
 
-   gpuhtw_scid &= 0x1f;
-   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), 
gpuhtw_scid);
+   }
}
 
if (cntl1_regval) {
@@ -1067,13 +1075,20 @@ static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
 * Program the slice IDs for the various GPU blocks and GPU MMU
 * pagetables
 */
-   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
-
-   /*
-* Program cacheability overrides to not allocate cache lines on
-* a write miss
-*/
-   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 
0xF, 0x03);
+   if (a6xx_gpu->have_mmu500)
+   gpu_rmw(gpu, REG_A6XX_GBIF_SCACHE_CNTL1, GENMASK(24, 0),
+   cntl1_regval);
+   else {
+   a6xx_llc_write(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache
+* lines on a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 
0x03);
+   }
}
 }
 
@@ -1086,10 +1101,21 @@ static void a6xx_llc_slices_destroy(struct a6xx_gpu 
*a6xx_gpu)
 static void a6xx_llc_slices_init(struct platform_device *pdev,
struct a6xx_gpu *a6xx_gpu)
 {
+   struct device_node *phandle;
+
a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
if (IS_ERR(a6xx_gpu->llc_mmio))
return;
 
+   /*
+* There is a different programming path for targets with an mmu500
+* attached, so detect if that is the case
+*/
+   phandle = of_parse_phandle(pdev->dev.of_node, "iommus", 0);
+   a6xx_gpu->have_mmu500 = (phandle &&
+   of_device_is_compatible(phandle, "arm,mmu-500"));
+   of_node_put(phandle);
+
a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
 
diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
index 9e6079af679c..e793d329e77b 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
@@ -32,6 +32,7 @@ struct a6xx_gpu {
void __iomem *llc_mmio;
void *llc_slice;
void *htw_llc_slice;
+   bool have_mmu500;
 };
 
 #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)
-- 


[PATCHv7 7/7] iommu: arm-smmu-impl: Add a space before open parenthesis

2020-11-01 Thread Sai Prakash Ranjan
Fix the checkpatch warning for space required before the open
parenthesis.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index ffaf3f91ba52..f16da4a21270 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -12,7 +12,7 @@
 
 static int arm_smmu_gr0_ns(int offset)
 {
-   switch(offset) {
+   switch (offset) {
case ARM_SMMU_GR0_sCR0:
case ARM_SMMU_GR0_sACR:
case ARM_SMMU_GR0_sGFSR:
-- 


[PATCHv7 4/7] drm/msm/a6xx: Add support for using system cache(LLC)

2020-11-01 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The last level system cache can be partitioned into 32 different
slices, of which the GPU has two preallocated. One slice is used
for caching GPU data buffers and the other slice is used for
caching the GPU SMMU pagetables. This patch talks to the core system
cache driver to acquire the slice handles, configures the SCIDs for
those slices, and activates and deactivates the slices on GPU power
collapse and restore.

Some support from the IOMMU driver is also needed to set the right
TCR attributes so the system cache can be used. The GPU can then
override a few cacheability parameters, which it does to turn
write-allocate into write-no-allocate since the GPU hardware does
not benefit much from it.

DOMAIN_ATTR_SYS_CACHE is a domain-level attribute used by the IOMMU
driver to set the right attributes for caching the hardware
pagetables in the system cache.
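
For reference, the slice handling implemented below follows the LLCC API flow
roughly like this (error handling omitted, handles kept as void pointers as in
the driver; the HTW slice is activated the same way as the GPU slice):

    void *gpu_slice, *htw_slice;
    u32 gpu_scid;

    /* probe: acquire the two preallocated GPU slices */
    gpu_slice = llcc_slice_getd(LLCC_GPU);
    htw_slice = llcc_slice_getd(LLCC_GPUHTW);

    /* resume: activate the slice and fetch the SCID to program */
    if (!llcc_slice_activate(gpu_slice))
        gpu_scid = llcc_get_slice_id(gpu_slice) & 0x1f;

    /* power collapse: deactivate the slices again */
    llcc_slice_deactivate(gpu_slice);
    llcc_slice_deactivate(htw_slice);

    /* teardown: release the handles */
    llcc_slice_putd(gpu_slice);
    llcc_slice_putd(htw_slice);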

Signed-off-by: Sharat Masetty 
[saiprakash.ranjan: fix to set attr before device attach to iommu and rebase]
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 83 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h   |  4 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +
 3 files changed, 104 insertions(+)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 948f3656c20c..95c98c642876 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -8,7 +8,9 @@
 #include "a6xx_gpu.h"
 #include "a6xx_gmu.xml.h"
 
+#include 
 #include 
+#include 
 
 #define GPU_PAS_ID 13
 
@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
return IRQ_HANDLED;
 }
 
+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or)
+{
+   return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or);
+}
+
+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value)
+{
+   return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2));
+}
+
+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_deactivate(a6xx_gpu->llc_slice);
+   llcc_slice_deactivate(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
+{
+   u32 cntl1_regval = 0;
+
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   if (!llcc_slice_activate(a6xx_gpu->llc_slice)) {
+   u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice);
+
+   gpu_scid &= 0x1f;
+   cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 
10) |
+  (gpu_scid << 15) | (gpu_scid << 20);
+   }
+
+   if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
+   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   }
+
+   if (cntl1_regval) {
+   /*
+* Program the slice IDs for the various GPU blocks and GPU MMU
+* pagetables
+*/
+   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache lines on
+* a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 
0xF, 0x03);
+   }
+}
+
+static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_putd(a6xx_gpu->llc_slice);
+   llcc_slice_putd(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_slices_init(struct platform_device *pdev,
+   struct a6xx_gpu *a6xx_gpu)
+{
+   a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
+   a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
+
+   if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice))
+   a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL);
+}
+
 static int a6xx_pm_resume(struct msm_gpu *gpu)
 {
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
 
msm_gpu_resume_devfreq(gpu);
 
+   a6xx_llc_activate(a6xx_gpu);
+
return 0;
 }
 
@@ -1048,6 +1125,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
 
trace_msm_gpu_suspend(0);
 
+   a6xx_llc_deactivate(a6xx_gpu);
+
devfreq_suspend_device(gpu->devfreq.devfreq);
 
return a6xx_gmu_stop(a6xx_gpu);
@@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
drm_gem_object_put(a6xx_gpu->shadow_bo);
}
 
+   a6xx_llc_slices_destroy(a

Re: [PATCH] drm/msm/a6xx: Add support for using system cache on MMU500 based targets

2020-10-28 Thread Sai Prakash Ranjan

On 2020-10-27 20:09, Jordan Crouse wrote:

On Tue, Oct 27, 2020 at 12:38:02PM +0530, Sai Prakash Ranjan wrote:

On 2020-10-27 00:24, Jordan Crouse wrote:
>This is an extension to the series [1] to enable the System Cache (LLC)
>for
>Adreno a6xx targets.
>
>GPU targets with an MMU-500 attached have a slightly different process for
>enabling system cache. Use the compatible string on the IOMMU phandle
>to see if an MMU-500 is attached and modify the programming sequence
>accordingly.
>
>[1] https://patchwork.freedesktop.org/series/83037/
>
>Signed-off-by: Jordan Crouse 
>---
>
> drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 +--
> drivers/gpu/drm/msm/adreno/a6xx_gpu.h |  1 +
> 2 files changed, 37 insertions(+), 10 deletions(-)
>
>diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>index 95c98c642876..b7737732fbb6 100644
>--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>@@ -1042,6 +1042,8 @@ static void a6xx_llc_deactivate(struct a6xx_gpu
>*a6xx_gpu)
>
> static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
> {
>+   struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
>+   struct msm_gpu *gpu = &adreno_gpu->base;
>u32 cntl1_regval = 0;
>
>if (IS_ERR(a6xx_gpu->llc_mmio))
>@@ -1055,11 +1057,17 @@ static void a6xx_llc_activate(struct a6xx_gpu
>*a6xx_gpu)
>   (gpu_scid << 15) | (gpu_scid << 20);
>}
>
>+   /*
>+* For targets with a MMU500, activate the slice but don't program the
>+* register.  The XBL will take care of that.
>+*/
>if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
>-   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
>+   if (!a6xx_gpu->have_mmu500) {
>+   u32 gpuhtw_scid = 
llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
>
>-   gpuhtw_scid &= 0x1f;
>-   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
>+   gpuhtw_scid &= 0x1f;
>+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
>+   }
>}
>
>if (cntl1_regval) {
>@@ -1067,13 +1075,20 @@ static void a6xx_llc_activate(struct a6xx_gpu
>*a6xx_gpu)
> * Program the slice IDs for the various GPU blocks and GPU MMU
> * pagetables
> */
>-   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1,
>cntl1_regval);
>-
>-   /*
>-* Program cacheability overrides to not allocate cache lines on
>-* a write miss
>-*/
>-   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF,
>0x03);
>+   if (a6xx_gpu->have_mmu500)
>+   gpu_rmw(gpu, REG_A6XX_GBIF_SCACHE_CNTL1, GENMASK(24, 0),
>+   cntl1_regval);
>+   else {
>+   a6xx_llc_write(a6xx_gpu,
>+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
>+
>+   /*
>+* Program cacheability overrides to not allocate cache
>+* lines on a write miss
>+*/
>+   a6xx_llc_rmw(a6xx_gpu,
>+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03);
>+   }
>}
> }
>
>@@ -1086,10 +1101,21 @@ static void a6xx_llc_slices_destroy(struct
>a6xx_gpu *a6xx_gpu)
> static void a6xx_llc_slices_init(struct platform_device *pdev,
>struct a6xx_gpu *a6xx_gpu)
> {
>+   struct device_node *phandle;
>+
>a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
>if (IS_ERR(a6xx_gpu->llc_mmio))
>return;
>
>+   /*
>+* There is a different programming path for targets with an mmu500
>+* attached, so detect if that is the case
>+*/
>+   phandle = of_parse_phandle(pdev->dev.of_node, "iommus", 0);
>+   a6xx_gpu->have_mmu500 = (phandle &&
>+   of_device_is_compatible(phandle, "arm,mmu500"));
>+   of_node_put(phandle);
>+
>a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
>a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
>
>diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>index 9e6079af679c..e793d329e77b 100644
>--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>@@ -32,6 +32,7 @@ struct a6xx_gpu {
>void __iomem *llc_mmio;
>void *llc_slice;
>void *htw_llc_slice;
>+   bool have_mmu500;
> };
>
> #define to_a6xx_gpu(x) container_of(x,

Re: [PATCH] drm/msm/a6xx: Add support for using system cache on MMU500 based targets

2020-10-28 Thread Sai Prakash Ranjan

On 2020-10-27 21:10, Robin Murphy wrote:

On 2020-10-26 18:54, Jordan Crouse wrote:
This is an extension to the series [1] to enable the System Cache (LLC)
for Adreno a6xx targets.

GPU targets with an MMU-500 attached have a slightly different process
for enabling system cache. Use the compatible string on the IOMMU phandle
to see if an MMU-500 is attached and modify the programming sequence
accordingly.

[1] https://patchwork.freedesktop.org/series/83037/

Signed-off-by: Jordan Crouse 
---

  drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 
+--

  drivers/gpu/drm/msm/adreno/a6xx_gpu.h |  1 +
  2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c

index 95c98c642876..b7737732fbb6 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1042,6 +1042,8 @@ static void a6xx_llc_deactivate(struct a6xx_gpu 
*a6xx_gpu)

static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
  {
+   struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
+   struct msm_gpu *gpu = &adreno_gpu->base;
u32 cntl1_regval = 0;
if (IS_ERR(a6xx_gpu->llc_mmio))
@@ -1055,11 +1057,17 @@ static void a6xx_llc_activate(struct a6xx_gpu 
*a6xx_gpu)

   (gpu_scid << 15) | (gpu_scid << 20);
}
  + /*
+	 * For targets with a MMU500, activate the slice but don't program 
the

+* register.  The XBL will take care of that.
+*/
if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
-   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+   if (!a6xx_gpu->have_mmu500) {
+   u32 gpuhtw_scid = 
llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
  - gpuhtw_scid &= 0x1f;
-   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), 
gpuhtw_scid);
+   }
}
if (cntl1_regval) {
@@ -1067,13 +1075,20 @@ static void a6xx_llc_activate(struct a6xx_gpu 
*a6xx_gpu)

 * Program the slice IDs for the various GPU blocks and GPU MMU
 * pagetables
 */
-		a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);

-
-   /*
-* Program cacheability overrides to not allocate cache lines on
-* a write miss
-*/
-		a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 
0x03);

+   if (a6xx_gpu->have_mmu500)
+   gpu_rmw(gpu, REG_A6XX_GBIF_SCACHE_CNTL1, GENMASK(24, 0),
+   cntl1_regval);
+   else {
+   a6xx_llc_write(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache
+* lines on a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 
0x03);
+   }
}
  }
  @@ -1086,10 +1101,21 @@ static void a6xx_llc_slices_destroy(struct 
a6xx_gpu *a6xx_gpu)

  static void a6xx_llc_slices_init(struct platform_device *pdev,
struct a6xx_gpu *a6xx_gpu)
  {
+   struct device_node *phandle;
+
a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
if (IS_ERR(a6xx_gpu->llc_mmio))
return;
  + /*
+* There is a different programming path for targets with an mmu500
+* attached, so detect if that is the case
+*/
+   phandle = of_parse_phandle(pdev->dev.of_node, "iommus", 0);
+   a6xx_gpu->have_mmu500 = (phandle &&
+   of_device_is_compatible(phandle, "arm,mmu500"));


Note that this should never match, since the compatible string defined
by the binding is "arm,mmu-500" ;)



Hawk eye ;) I will fix it in the next version when I squash
this with my series.

Thanks,
Sai

--


Re: [PATCH] drm/msm/a6xx: Add support for using system cache on MMU500 based targets

2020-10-27 Thread Sai Prakash Ranjan

On 2020-10-27 00:24, Jordan Crouse wrote:
This is an extension to the series [1] to enable the System Cache (LLC)
for Adreno a6xx targets.

GPU targets with an MMU-500 attached have a slightly different process
for enabling system cache. Use the compatible string on the IOMMU phandle
to see if an MMU-500 is attached and modify the programming sequence
accordingly.

[1] https://patchwork.freedesktop.org/series/83037/

Signed-off-by: Jordan Crouse 
---

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 46 +--
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h |  1 +
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 95c98c642876..b7737732fbb6 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -1042,6 +1042,8 @@ static void a6xx_llc_deactivate(struct a6xx_gpu 
*a6xx_gpu)


 static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
 {
+   struct adreno_gpu *adreno_gpu = &a6xx_gpu->base;
+   struct msm_gpu *gpu = &adreno_gpu->base;
u32 cntl1_regval = 0;

if (IS_ERR(a6xx_gpu->llc_mmio))
@@ -1055,11 +1057,17 @@ static void a6xx_llc_activate(struct a6xx_gpu 
*a6xx_gpu)

   (gpu_scid << 15) | (gpu_scid << 20);
}

+   /*
+	 * For targets with a MMU500, activate the slice but don't program 
the

+* register.  The XBL will take care of that.
+*/
if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
-   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+   if (!a6xx_gpu->have_mmu500) {
+   u32 gpuhtw_scid = 
llcc_get_slice_id(a6xx_gpu->htw_llc_slice);

-   gpuhtw_scid &= 0x1f;
-   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), 
gpuhtw_scid);
+   }
}

if (cntl1_regval) {
@@ -1067,13 +1075,20 @@ static void a6xx_llc_activate(struct a6xx_gpu 
*a6xx_gpu)

 * Program the slice IDs for the various GPU blocks and GPU MMU
 * pagetables
 */
-		a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);

-
-   /*
-* Program cacheability overrides to not allocate cache lines on
-* a write miss
-*/
-		a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 
0x03);

+   if (a6xx_gpu->have_mmu500)
+   gpu_rmw(gpu, REG_A6XX_GBIF_SCACHE_CNTL1, GENMASK(24, 0),
+   cntl1_regval);
+   else {
+   a6xx_llc_write(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache
+* lines on a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu,
+   REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 
0x03);
+   }
}
 }

@@ -1086,10 +1101,21 @@ static void a6xx_llc_slices_destroy(struct
a6xx_gpu *a6xx_gpu)
 static void a6xx_llc_slices_init(struct platform_device *pdev,
struct a6xx_gpu *a6xx_gpu)
 {
+   struct device_node *phandle;
+
a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
if (IS_ERR(a6xx_gpu->llc_mmio))
return;

+   /*
+* There is a different programming path for targets with an mmu500
+* attached, so detect if that is the case
+*/
+   phandle = of_parse_phandle(pdev->dev.of_node, "iommus", 0);
+   a6xx_gpu->have_mmu500 = (phandle &&
+   of_device_is_compatible(phandle, "arm,mmu500"));
+   of_node_put(phandle);
+
a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
index 9e6079af679c..e793d329e77b 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
@@ -32,6 +32,7 @@ struct a6xx_gpu {
void __iomem *llc_mmio;
void *llc_slice;
void *htw_llc_slice;
+   bool have_mmu500;
 };

 #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)


Thanks Jordan for the patch.

--


[PATCHv6 0/6] System Cache support for GPU and required SMMU support

2020-10-26 Thread Sai Prakash Ranjan
Some hardware variants contain a system cache, or last level
cache (LLC). This cache is typically a large block which is shared
by multiple clients on the SoC. The GPU uses the system cache to cache
both the GPU data buffers (like textures) as well as the SMMU pagetables.
This helps with improved render performance as well as lower power
consumption by reducing the bus traffic to the system memory.

The system cache architecture allows the cache to be split into slices
which can then be used by multiple SoC clients. This patch series is an
effort to enable and use two of those slices preallocated for the GPU,
one for the GPU data buffers and another for the GPU SMMU hardware
pagetables.

Patch 1 - Patch 4 adds system cache support in SMMU and GPU driver.
Patch 5 and 6 are minor cleanups for arm-smmu impl.

The series is based on top of 
https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables

Changes in v6:
 * Move table to arm-smmu-qcom (Robin)

Changes in v5:
 * Drop cleanup of blank lines since it was intentional (Robin)
 * Rebase again on top of msm-next-pgtables as it moves pretty fast

Changes in v4:
 * Drop IOMMU_SYS_CACHE prot flag
 * Rebase on top of 
https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables

Changes in v3:
 * Fix domain attribute setting to before iommu_attach_device()
 * Fix few code style and checkpatch warnings
 * Rebase on top of Jordan's latest split pagetables and per-instance
   pagetables support

Changes in v2:
 * Addressed review comments and rebased on top of Jordan's split
   pagetables series

Sai Prakash Ranjan (4):
  iommu/io-pgtable-arm: Add support to use system cache
  iommu/arm-smmu: Add domain attribute for system cache
  iommu: arm-smmu-impl: Use table to list QCOM implementations
  iommu: arm-smmu-impl: Add a space before open parenthesis

Sharat Masetty (2):
  drm/msm: rearrange the gpu_rmw() function
  drm/msm/a6xx: Add support for using system cache(LLC)

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c  | 83 ++
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h  |  4 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c| 17 +
 drivers/gpu/drm/msm/msm_drv.c  |  8 +++
 drivers/gpu/drm/msm/msm_drv.h  |  1 +
 drivers/gpu/drm/msm/msm_gpu.h  |  5 +-
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 11 +--
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 --
 drivers/iommu/arm/arm-smmu/arm-smmu.c  | 17 +
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |  2 +-
 drivers/iommu/io-pgtable-arm.c |  7 +-
 include/linux/io-pgtable.h |  4 ++
 include/linux/iommu.h  |  1 +
 13 files changed, 161 insertions(+), 20 deletions(-)


base-commit: ea95e543fd6201aceff96a0dd95530b2085874c4
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv6 6/6] iommu: arm-smmu-impl: Add a space before open parenthesis

2020-10-26 Thread Sai Prakash Ranjan
Fix the checkpatch warning for space required before the open
parenthesis.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index ffaf3f91ba52..f16da4a21270 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -12,7 +12,7 @@
 
 static int arm_smmu_gr0_ns(int offset)
 {
-   switch(offset) {
+   switch (offset) {
case ARM_SMMU_GR0_sCR0:
case ARM_SMMU_GR0_sACR:
case ARM_SMMU_GR0_sGFSR:
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv6 2/6] iommu/arm-smmu: Add domain attribute for system cache

2020-10-26 Thread Sai Prakash Ranjan
Add an IOMMU domain attribute for using the system cache, aka last
level cache, so that client drivers like the GPU can set the right
attributes for caching the hardware pagetables in the system cache.
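
For illustration, a client driver would set this attribute on its domain
before attaching the device (sketch only, error handling omitted; "dev"
stands for the client device, and the GPU side of this lands in patch 4/6):

    struct iommu_domain *domain = iommu_domain_alloc(&platform_bus_type);
    int sys_cache = 1;

    /* Must happen before attach; the SMMU returns -EPERM afterwards */
    iommu_domain_set_attr(domain, DOMAIN_ATTR_SYS_CACHE, &sys_cache);
    iommu_attach_device(domain, dev);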

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 +
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 +
 include/linux/iommu.h |  1 +
 3 files changed, 19 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 1f06ab219819..d449c895ba16 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
+   if (smmu_domain->sys_cache)
+   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
+
	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops) {
ret = -ENOMEM;
@@ -1513,6 +1516,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
*(int *)data = smmu_domain->non_strict;
return 0;
+   case DOMAIN_ATTR_SYS_CACHE:
+   *((int *)data) = smmu_domain->sys_cache;
+   return 0;
default:
return -ENODEV;
}
@@ -1544,6 +1550,17 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+   case DOMAIN_ATTR_SYS_CACHE:
+   if (smmu_domain->smmu) {
+   ret = -EPERM;
+   goto out_unlock;
+   }
+
+   if (*((int *)data))
+   smmu_domain->sys_cache = true;
+   else
+   smmu_domain->sys_cache = false;
+   break;
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index ddf2ca4c923d..93593e164e44 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -373,6 +373,7 @@ struct arm_smmu_domain {
struct mutexinit_mutex; /* Protects smmu pointer */
spinlock_t  cb_lock; /* Serialises ATS1* ops and 
TLB syncs */
struct iommu_domain domain;
+   boolsys_cache;
 };
 
 struct arm_smmu_master_cfg {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index fee209efb756..a580dfe9c68d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -118,6 +118,7 @@ enum iommu_attr {
DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_NESTING,/* two stages of translation */
DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
+   DOMAIN_ATTR_SYS_CACHE,
DOMAIN_ATTR_MAX,
 };
 
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv6 3/6] drm/msm: rearrange the gpu_rmw() function

2020-10-26 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The register read-modify-write construct is generic enough
that it can be used by other subsystems as needed, so create
a more generic msm_rmw() function and have gpu_rmw() use
this new function.
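
Usage stays the same for GPU code, and other code can now do the same on
any mapped region (register name and field below are only an example):

    /* set bits [1:0] of a control register to 0x1, leaving the rest intact */
    gpu_rmw(gpu, REG_EXAMPLE_CNTL, 0x3, 0x1);

    /* equivalent open-coded form on a raw mapping */
    msm_rmw(mmio_base + (REG_EXAMPLE_CNTL << 2), 0x3, 0x1);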

Signed-off-by: Sharat Masetty 
Reviewed-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/msm_drv.c | 8 
 drivers/gpu/drm/msm/msm_drv.h | 1 +
 drivers/gpu/drm/msm/msm_gpu.h | 5 +
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 49685571dc0e..a1e22b974b77 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr)
return val;
 }
 
+void msm_rmw(void __iomem *addr, u32 mask, u32 or)
+{
+   u32 val = msm_readl(addr);
+
+   val &= ~mask;
+   msm_writel(val | or, addr);
+}
+
 struct msm_vblank_work {
struct work_struct work;
int crtc_id;
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index b9dd8f8f4887..655b3b0424a1 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -478,6 +478,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device 
*pdev, const char *name,
const char *dbgname);
 void msm_writel(u32 data, void __iomem *addr);
 u32 msm_readl(const void __iomem *addr);
+void msm_rmw(void __iomem *addr, u32 mask, u32 or);
 
 struct msm_gpu_submitqueue;
 int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 6c9e1fdc1a76..b2b419277953 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -246,10 +246,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
 
 static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or)
 {
-   uint32_t val = gpu_read(gpu, reg);
-
-   val &= ~mask;
-   gpu_write(gpu, reg, val | or);
+   msm_rmw(gpu->mmio + (reg << 2), mask, or);
 }
 
 static inline u64 gpu_read64(struct msm_gpu *gpu, u32 lo, u32 hi)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv6 4/6] drm/msm/a6xx: Add support for using system cache(LLC)

2020-10-26 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The last level system cache can be partitioned into 32 different
slices, of which the GPU has two slices preallocated. One slice is
used for caching GPU buffers and the other slice is used for
caching the GPU SMMU pagetables. This patch talks to the core system
cache driver to acquire the slice handles, configure the SCIDs for
those slices, and activate and deactivate the slices upon GPU power
collapse and restore.
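
As a worked example (the SCID values are made up purely for
illustration): if the LLCC driver hands back SCID 2 for the buffer slice
and SCID 3 for the pagetable slice, the value programmed into
SYSTEM_CACHE_CNTL_1 becomes

    (2 << 0) | (2 << 5) | (2 << 10) | (2 << 15) | (2 << 20)   /* = 0x00210842 */
                                                | (3 << 25)   /* = 0x06210842 */

i.e. the buffer SCID is replicated into five 5-bit fields and the
pagetable-walk SCID sits in bits [29:25].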

Some support from the IOMMU driver is also needed to set the right
TCR attributes for making use of the system cache. The GPU then has
the ability to override a few cacheability parameters, which it uses
to turn write-allocate into write-no-allocate since the GPU hardware
does not benefit much from it.

DOMAIN_ATTR_SYS_CACHE is another domain-level attribute used by the
IOMMU driver to set the right attributes for caching the hardware
pagetables in the system cache.

Signed-off-by: Sharat Masetty 
[saiprakash.ranjan: fix to set attr before device attach to iommu and rebase]
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 83 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h   |  4 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +
 3 files changed, 104 insertions(+)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 8915882e..151190ff62f7 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -8,7 +8,9 @@
 #include "a6xx_gpu.h"
 #include "a6xx_gmu.xml.h"
 
+#include 
 #include 
+#include 
 
 #define GPU_PAS_ID 13
 
@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
return IRQ_HANDLED;
 }
 
+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or)
+{
+   return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or);
+}
+
+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value)
+{
+   return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2));
+}
+
+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_deactivate(a6xx_gpu->llc_slice);
+   llcc_slice_deactivate(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
+{
+   u32 cntl1_regval = 0;
+
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   if (!llcc_slice_activate(a6xx_gpu->llc_slice)) {
+   u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice);
+
+   gpu_scid &= 0x1f;
+   cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) |
+  (gpu_scid << 15) | (gpu_scid << 20);
+   }
+
+   if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
+   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   }
+
+   if (cntl1_regval) {
+   /*
+* Program the slice IDs for the various GPU blocks and GPU MMU
+* pagetables
+*/
+   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache lines on
+* a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03);
+   }
+}
+
+static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_putd(a6xx_gpu->llc_slice);
+   llcc_slice_putd(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_slices_init(struct platform_device *pdev,
+   struct a6xx_gpu *a6xx_gpu)
+{
+   a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
+   a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
+
+   if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice))
+   a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL);
+}
+
 static int a6xx_pm_resume(struct msm_gpu *gpu)
 {
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
 
msm_gpu_resume_devfreq(gpu);
 
+   a6xx_llc_activate(a6xx_gpu);
+
return 0;
 }
 
@@ -1048,6 +1125,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
 
trace_msm_gpu_suspend(0);
 
+   a6xx_llc_deactivate(a6xx_gpu);
+
devfreq_suspend_device(gpu->devfreq.devfreq);
 
return a6xx_gmu_stop(a6xx_gpu);
@@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
drm_gem_object_put(a6xx_gpu->shadow_bo);
}
 
+   a6xx_llc_slices_destroy(a6xx_gpu);

[PATCHv6 1/6] iommu/io-pgtable-arm: Add support to use system cache

2020-10-26 Thread Sai Prakash Ranjan
Add a quirk IO_PGTABLE_QUIRK_SYS_CACHE to override the
attributes set in TCR for the page table walker when
using system cache.
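
An io-pgtable user opts in by setting the quirk in its io_pgtable_cfg
before allocating the pgtable ops; a minimal sketch (the flush ops,
device and cookie are placeholders; the arm-smmu driver wires this up
from a domain attribute in patch 2/6):

    struct io_pgtable_cfg cfg = {
        .quirks        = IO_PGTABLE_QUIRK_SYS_CACHE,
        .pgsize_bitmap = SZ_4K | SZ_2M | SZ_1G,
        .ias           = 48,
        .oas           = 48,
        .coherent_walk = false,
        .tlb           = &example_flush_ops,   /* hypothetical iommu_flush_ops */
        .iommu_dev     = dev,
    };
    struct io_pgtable_ops *ops = alloc_io_pgtable_ops(ARM_64_LPAE_S1, &cfg, cookie);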

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/io-pgtable-arm.c | 7 ++-
 include/linux/io-pgtable.h | 4 
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index dc7bcf858b6d..828426c16fa9 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -789,7 +789,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
 
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NON_STRICT |
-   IO_PGTABLE_QUIRK_ARM_TTBR1))
+   IO_PGTABLE_QUIRK_ARM_TTBR1 |
+   IO_PGTABLE_QUIRK_SYS_CACHE))
return NULL;
 
data = arm_lpae_alloc_pgtable(cfg);
@@ -801,6 +802,10 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
tcr->sh = ARM_LPAE_TCR_SH_IS;
tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
+   } else if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) {
+   tcr->sh = ARM_LPAE_TCR_SH_OS;
+   tcr->irgn = ARM_LPAE_TCR_RGN_NC;
+   tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
} else {
tcr->sh = ARM_LPAE_TCR_SH_OS;
tcr->irgn = ARM_LPAE_TCR_RGN_NC;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 23285ba645db..ecc9d2248b84 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -86,6 +86,9 @@ struct io_pgtable_cfg {
 *
 * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
 *  for use in the upper half of a split address space.
+*
+* IO_PGTABLE_QUIRK_SYS_CACHE: Override the attributes set in TCR for
+*  the page table walker when using system cache.
 */
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS   BIT(1)
@@ -93,6 +96,7 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_MTK_EXTBIT(3)
#define IO_PGTABLE_QUIRK_NON_STRICT BIT(4)
#define IO_PGTABLE_QUIRK_ARM_TTBR1  BIT(5)
+   #define IO_PGTABLE_QUIRK_SYS_CACHE  BIT(6)
unsigned long   quirks;
unsigned long   pgsize_bitmap;
unsigned intias;
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv6 5/6] iommu: arm-smmu-impl: Use table to list QCOM implementations

2020-10-26 Thread Sai Prakash Ranjan
Use a table and of_match_node() to match the QCOM implementation
instead of multiple of_device_is_compatible() calls for each
QCOM SMMU implementation.
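
With the table in place, supporting a new SoC only needs one more entry,
e.g. (the compatible string below is just an illustration, not part of
this patch):

    { .compatible = "qcom,example-smmu-500" },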

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c |  9 +
 drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 21 -
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |  1 -
 3 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index d199b4bff15d..ffaf3f91ba52 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -217,14 +217,7 @@ struct arm_smmu_device *arm_smmu_impl_init(struct 
arm_smmu_device *smmu)
if (of_device_is_compatible(np, "nvidia,tegra194-smmu"))
return nvidia_smmu_impl_init(smmu);
 
-   if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sc7180-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8150-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8250-smmu-500"))
-   return qcom_smmu_impl_init(smmu);
-
-   if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu"))
-   return qcom_adreno_smmu_impl_init(smmu);
+   smmu = qcom_smmu_impl_init(smmu);
 
if (of_device_is_compatible(np, "marvell,ap806-smmu-500"))
	smmu->impl = &mmu500_impl;
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
index 0663d7d26908..9c40fb7a2241 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c
@@ -224,12 +224,23 @@ static struct arm_smmu_device *qcom_smmu_create(struct 
arm_smmu_device *smmu,
	return &qsmmu->smmu;
 }
 
+static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = {
+   { .compatible = "qcom,sc7180-smmu-500" },
+   { .compatible = "qcom,sdm845-smmu-500" },
+   { .compatible = "qcom,sm8150-smmu-500" },
+   { .compatible = "qcom,sm8250-smmu-500" },
+   { }
+};
+
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu)
 {
-   return qcom_smmu_create(smmu, &qcom_smmu_impl);
-}
+   const struct device_node *np = smmu->dev->of_node;
 
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu)
-{
-   return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl);
+   if (of_match_node(qcom_smmu_impl_of_match, np))
+   return qcom_smmu_create(smmu, &qcom_smmu_impl);
+
+   if (of_device_is_compatible(np, "qcom,adreno-smmu"))
+   return qcom_smmu_create(smmu, &qcom_adreno_smmu_impl);
+
+   return smmu;
 }
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index 93593e164e44..94f3e439c082 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -524,7 +524,6 @@ static inline void arm_smmu_writeq(struct arm_smmu_device 
*smmu, int page,
 struct arm_smmu_device *arm_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *nvidia_smmu_impl_init(struct arm_smmu_device *smmu);
 struct arm_smmu_device *qcom_smmu_impl_init(struct arm_smmu_device *smmu);
-struct arm_smmu_device *qcom_adreno_smmu_impl_init(struct arm_smmu_device *smmu);
 
 void arm_smmu_write_context_bank(struct arm_smmu_device *smmu, int idx);
 int arm_mmu500_reset(struct arm_smmu_device *smmu);
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



Re: [PATCHv5 4/6] drm/msm/a6xx: Add support for using system cache(LLC)

2020-09-29 Thread Sai Prakash Ranjan

On 2020-09-28 21:41, Jordan Crouse wrote:

On Mon, Sep 28, 2020 at 05:56:55PM +0530, Sai Prakash Ranjan wrote:

Hi Jordan,

On 2020-09-23 20:33, Jordan Crouse wrote:
>On Tue, Sep 22, 2020 at 11:48:17AM +0530, Sai Prakash Ranjan wrote:
>>From: Sharat Masetty 
>>
>>The last level system cache can be partitioned to 32 different
>>slices of which GPU has two slices preallocated. One slice is
>>used for caching GPU buffers and the other slice is used for
>>caching the GPU SMMU pagetables. This talks to the core system
>>cache driver to acquire the slice handles, configure the SCID's
>>to those slices and activates and deactivates the slices upon
>>GPU power collapse and restore.
>>
>>Some support from the IOMMU driver is also needed to make use
>>of the system cache to set the right TCR attributes. GPU then
>>has the ability to override a few cacheability parameters which
>>it does to override write-allocate to write-no-allocate as the
>>GPU hardware does not benefit much from it.
>>
>>DOMAIN_ATTR_SYS_CACHE is another domain level attribute used by the
>>IOMMU driver to set the right attributes to cache the hardware
>>pagetables into the system cache.
>>
>>Signed-off-by: Sharat Masetty 
>>[saiprakash.ranjan: fix to set attr before device attach to iommu and
>>rebase]
>>Signed-off-by: Sai Prakash Ranjan 
>>---
>> drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 83 +
>> drivers/gpu/drm/msm/adreno/a6xx_gpu.h   |  4 ++
>> drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +
>> 3 files changed, 104 insertions(+)
>>
>>diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>index 8915882e..151190ff62f7 100644
>>--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>>@@ -8,7 +8,9 @@
>> #include "a6xx_gpu.h"
>> #include "a6xx_gmu.xml.h"
>>
>>+#include 
>> #include 
>>+#include 
>>
>> #define GPU_PAS_ID 13
>>
>>@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
>>return IRQ_HANDLED;
>> }
>>
>>+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask,
>>u32 or)
>>+{
>>+   return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or);
>>+}
>>+
>>+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32
>>value)
>>+{
>>+   return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2));
>>+}
>>+
>>+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
>>+{
>>+   llcc_slice_deactivate(a6xx_gpu->llc_slice);
>>+   llcc_slice_deactivate(a6xx_gpu->htw_llc_slice);
>>+}
>>+
>>+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
>>+{
>>+   u32 cntl1_regval = 0;
>>+
>>+   if (IS_ERR(a6xx_gpu->llc_mmio))
>>+   return;
>>+
>>+   if (!llcc_slice_activate(a6xx_gpu->llc_slice)) {
>>+   u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice);
>>+
>>+   gpu_scid &= 0x1f;
>>+   cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid 
<< 10) |
>>+  (gpu_scid << 15) | (gpu_scid << 20);
>>+   }
>>+
>>+   if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
>>+   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
>>+
>>+   gpuhtw_scid &= 0x1f;
>>+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
>>+   }
>>+
>>+   if (cntl1_regval) {
>>+   /*
>>+* Program the slice IDs for the various GPU blocks and GPU 
MMU
>>+* pagetables
>>+*/
>>+   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1,
>>cntl1_regval);
>>+
>>+   /*
>>+* Program cacheability overrides to not allocate cache lines 
on
>>+* a write miss
>>+*/
>>+   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 
0xF,
>>0x03);
>>+   }
>>+}
>
>This code has been around long enough that it pre-dates a650. On a650 and
>other
>MMU-500 targets the htw_llc is configured by the firmware and the
>llc_slice is
>configured in a different register.
>
>I don't think we need to pause everything and add support for the MMU-500
>path,
>but we do need a way to 

Re: [PATCHv5 4/6] drm/msm/a6xx: Add support for using system cache(LLC)

2020-09-29 Thread Sai Prakash Ranjan

Hi Jordan,

On 2020-09-23 20:33, Jordan Crouse wrote:

On Tue, Sep 22, 2020 at 11:48:17AM +0530, Sai Prakash Ranjan wrote:

From: Sharat Masetty 

The last level system cache can be partitioned to 32 different
slices of which GPU has two slices preallocated. One slice is
used for caching GPU buffers and the other slice is used for
caching the GPU SMMU pagetables. This talks to the core system
cache driver to acquire the slice handles, configure the SCID's
to those slices and activates and deactivates the slices upon
GPU power collapse and restore.

Some support from the IOMMU driver is also needed to make use
of the system cache to set the right TCR attributes. GPU then
has the ability to override a few cacheability parameters which
it does to override write-allocate to write-no-allocate as the
GPU hardware does not benefit much from it.

DOMAIN_ATTR_SYS_CACHE is another domain level attribute used by the
IOMMU driver to set the right attributes to cache the hardware
pagetables into the system cache.

Signed-off-by: Sharat Masetty 
[saiprakash.ranjan: fix to set attr before device attach to iommu and 
rebase]

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 83 
+

 drivers/gpu/drm/msm/adreno/a6xx_gpu.h   |  4 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +
 3 files changed, 104 insertions(+)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c

index 8915882e..151190ff62f7 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -8,7 +8,9 @@
 #include "a6xx_gpu.h"
 #include "a6xx_gmu.xml.h"

+#include 
 #include 
+#include 

 #define GPU_PAS_ID 13

@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu 
*gpu)

return IRQ_HANDLED;
 }

+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 
mask, u32 or)

+{
+   return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or);
+}
+
+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 
value)

+{
+   return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2));
+}
+
+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_deactivate(a6xx_gpu->llc_slice);
+   llcc_slice_deactivate(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
+{
+   u32 cntl1_regval = 0;
+
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   if (!llcc_slice_activate(a6xx_gpu->llc_slice)) {
+   u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice);
+
+   gpu_scid &= 0x1f;
+		cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) 
|

+  (gpu_scid << 15) | (gpu_scid << 20);
+   }
+
+   if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
+   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   }
+
+   if (cntl1_regval) {
+   /*
+* Program the slice IDs for the various GPU blocks and GPU MMU
+* pagetables
+*/
+		a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, 
cntl1_regval);

+
+   /*
+* Program cacheability overrides to not allocate cache lines on
+* a write miss
+*/
+		a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 
0x03);

+   }
+}


This code has been around long enough that it pre-dates a650. On a650 and
other MMU-500 targets the htw_llc is configured by the firmware and the
llc_slice is configured in a different register.

I don't think we need to pause everything and add support for the MMU-500
path, but we do need a way to disallow LLCC on affected targets until such
time that we can get it fixed up.



Thanks for taking a close look, does something like the below look ok or
is something else needed here?

+ /* Till the time we get in LLCC support for A650 */
+ if (!(info && info->revn == 650))
+ a6xx_llc_slices_init(pdev, a6xx_gpu);
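
For context, a sketch of where such a check could live in a6xx_gpu_init()
(names follow the existing adreno code; purely illustrative, not a final
patch):

    struct adreno_platform_config *config = pdev->dev.platform_data;
    const struct adreno_info *info = adreno_info(config->rev);

    /* Skip LLC setup until the MMU-500 (e.g. A650) path is supported */
    if (!(info && info->revn == 650))
        a6xx_llc_slices_init(pdev, a6xx_gpu);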

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation


Re: [PATCHv5 5/6] iommu: arm-smmu-impl: Use table to list QCOM implementations

2020-09-29 Thread Sai Prakash Ranjan

On 2020-09-23 20:54, Robin Murphy wrote:

On 2020-09-22 07:18, Sai Prakash Ranjan wrote:

Use table and of_match_node() to match qcom implementation
instead of multiple of_device_compatible() calls for each
QCOM SMMU implementation.

Signed-off-by: Sai Prakash Ranjan 
---
  drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 12 
  1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c

index d199b4bff15d..ce78295cfa78 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -9,6 +9,13 @@
#include "arm-smmu.h"
  +static const struct of_device_id __maybe_unused 
qcom_smmu_impl_of_match[] = {

+   { .compatible = "qcom,sc7180-smmu-500" },
+   { .compatible = "qcom,sdm845-smmu-500" },
+   { .compatible = "qcom,sm8150-smmu-500" },
+   { .compatible = "qcom,sm8250-smmu-500" },
+   { }
+};


Can you push the table itself into arm-smmu-qcom? That way you'll be
free to add new SoCs willy-nilly without any possibility of
conflicting with anything else.

Bonus points if you can fold in the Adreno variant and keep everything
together ;)



Sure I can get bonus points :)

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation


[PATCHv5 0/6] System Cache support for GPU and required SMMU support

2020-09-22 Thread Sai Prakash Ranjan
Some hardware variants contain a system cache, also known as the last
level cache (LLC). This cache is typically a large block which is shared
by multiple clients on the SoC. The GPU uses the system cache to cache
both the GPU data buffers (like textures) as well as the SMMU pagetables.
This helps with improved render performance as well as lower power
consumption by reducing the bus traffic to system memory.

The system cache architecture allows the cache to be split into slices
which can then be used by multiple SoC clients. This patch series is an
effort to enable and use two of those slices preallocated for the GPU,
one for the GPU data buffers and another for the GPU SMMU hardware
pagetables.

Patch 1 - Patch 4 adds system cache support in SMMU and GPU driver.
Patch 5 and 6 are minor cleanups for arm-smmu impl.

The series is based on top of 
https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables

Changes in v5:
 * Drop cleanup of blank lines since it was intentional (Robin)
 * Rebase again on top of msm-next-pgtables as it moves pretty fast

Changes in v4:
 * Drop IOMMU_SYS_CACHE prot flag
 * Rebase on top of 
https://gitlab.freedesktop.org/drm/msm/-/tree/msm-next-pgtables

Changes in v3:
 * Fix domain attribute setting to before iommu_attach_device()
 * Fix few code style and checkpatch warnings
 * Rebase on top of Jordan's latest split pagetables and per-instance
   pagetables support

Changes in v2:
 * Addressed review comments and rebased on top of Jordan's split
   pagetables series

Sai Prakash Ranjan (4):
  iommu/io-pgtable-arm: Add support to use system cache
  iommu/arm-smmu: Add domain attribute for system cache
  iommu: arm-smmu-impl: Use table to list QCOM implementations
  iommu: arm-smmu-impl: Add a space before open parenthesis

Sharat Masetty (2):
  drm/msm: rearrange the gpu_rmw() function
  drm/msm/a6xx: Add support for using system cache(LLC)

 drivers/gpu/drm/msm/adreno/a6xx_gpu.c  | 83 ++
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h  |  4 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c| 17 +
 drivers/gpu/drm/msm/msm_drv.c  |  8 +++
 drivers/gpu/drm/msm/msm_drv.h  |  1 +
 drivers/gpu/drm/msm/msm_gpu.h  |  5 +-
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 14 ++--
 drivers/iommu/arm/arm-smmu/arm-smmu.c  | 17 +
 drivers/iommu/arm/arm-smmu/arm-smmu.h  |  1 +
 drivers/iommu/io-pgtable-arm.c |  7 +-
 include/linux/io-pgtable.h |  4 ++
 include/linux/iommu.h  |  1 +
 12 files changed, 152 insertions(+), 10 deletions(-)


base-commit: 115b1aca7a2a9c0649b1f5f6cffee6873c7efd89
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv5 4/6] drm/msm/a6xx: Add support for using system cache(LLC)

2020-09-22 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The last level system cache can be partitioned into 32 different
slices, of which the GPU has two slices preallocated. One slice is
used for caching GPU buffers and the other slice is used for
caching the GPU SMMU pagetables. This patch talks to the core system
cache driver to acquire the slice handles, configure the SCIDs for
those slices, and activate and deactivate the slices upon GPU power
collapse and restore.

Some support from the IOMMU driver is also needed to set the right
TCR attributes for making use of the system cache. The GPU then has
the ability to override a few cacheability parameters, which it uses
to turn write-allocate into write-no-allocate since the GPU hardware
does not benefit much from it.

DOMAIN_ATTR_SYS_CACHE is another domain-level attribute used by the
IOMMU driver to set the right attributes for caching the hardware
pagetables in the system cache.

Signed-off-by: Sharat Masetty 
[saiprakash.ranjan: fix to set attr before device attach to iommu and rebase]
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 83 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h   |  4 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 17 +
 3 files changed, 104 insertions(+)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c 
b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index 8915882e..151190ff62f7 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -8,7 +8,9 @@
 #include "a6xx_gpu.h"
 #include "a6xx_gmu.xml.h"
 
+#include 
 #include 
+#include 
 
 #define GPU_PAS_ID 13
 
@@ -1022,6 +1024,79 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
return IRQ_HANDLED;
 }
 
+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or)
+{
+   return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or);
+}
+
+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value)
+{
+   return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2));
+}
+
+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_deactivate(a6xx_gpu->llc_slice);
+   llcc_slice_deactivate(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
+{
+   u32 cntl1_regval = 0;
+
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   if (!llcc_slice_activate(a6xx_gpu->llc_slice)) {
+   u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice);
+
+   gpu_scid &= 0x1f;
+   cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) |
+  (gpu_scid << 15) | (gpu_scid << 20);
+   }
+
+   if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
+   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   }
+
+   if (cntl1_regval) {
+   /*
+* Program the slice IDs for the various GPU blocks and GPU MMU
+* pagetables
+*/
+   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache lines on
+* a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03);
+   }
+}
+
+static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_putd(a6xx_gpu->llc_slice);
+   llcc_slice_putd(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_slices_init(struct platform_device *pdev,
+   struct a6xx_gpu *a6xx_gpu)
+{
+   a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
+   a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
+
+   if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice))
+   a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL);
+}
+
 static int a6xx_pm_resume(struct msm_gpu *gpu)
 {
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -1038,6 +1113,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
 
msm_gpu_resume_devfreq(gpu);
 
+   a6xx_llc_activate(a6xx_gpu);
+
return 0;
 }
 
@@ -1048,6 +1125,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
 
trace_msm_gpu_suspend(0);
 
+   a6xx_llc_deactivate(a6xx_gpu);
+
devfreq_suspend_device(gpu->devfreq.devfreq);
 
return a6xx_gmu_stop(a6xx_gpu);
@@ -1091,6 +1170,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
drm_gem_object_put(a6xx_gpu->shadow_bo);
}
 
+   a6xx_llc_slices_destroy(a6xx_gpu);

[PATCHv5 3/6] drm/msm: rearrange the gpu_rmw() function

2020-09-22 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The register read-modify-write construct is generic enough
that it can be used by other subsystems as needed, so create
a more generic msm_rmw() function and have gpu_rmw() use
this new function.

Signed-off-by: Sharat Masetty 
Reviewed-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/msm_drv.c | 8 
 drivers/gpu/drm/msm/msm_drv.h | 1 +
 drivers/gpu/drm/msm/msm_gpu.h | 5 +
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index 49685571dc0e..a1e22b974b77 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr)
return val;
 }
 
+void msm_rmw(void __iomem *addr, u32 mask, u32 or)
+{
+   u32 val = msm_readl(addr);
+
+   val &= ~mask;
+   msm_writel(val | or, addr);
+}
+
 struct msm_vblank_work {
struct work_struct work;
int crtc_id;
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 319327462b29..079e33680a6c 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -474,6 +474,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device 
*pdev, const char *name,
const char *dbgname);
 void msm_writel(u32 data, void __iomem *addr);
 u32 msm_readl(const void __iomem *addr);
+void msm_rmw(void __iomem *addr, u32 mask, u32 or);
 
 struct msm_gpu_submitqueue;
 int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 6c9e1fdc1a76..b2b419277953 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -246,10 +246,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
 
 static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or)
 {
-   uint32_t val = gpu_read(gpu, reg);
-
-   val &= ~mask;
-   gpu_write(gpu, reg, val | or);
+   msm_rmw(gpu->mmio + (reg << 2), mask, or);
 }
 
 static inline u64 gpu_read64(struct msm_gpu *gpu, u32 lo, u32 hi)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv5 1/6] iommu/io-pgtable-arm: Add support to use system cache

2020-09-22 Thread Sai Prakash Ranjan
Add a quirk IO_PGTABLE_QUIRK_SYS_CACHE to override the
attributes set in TCR for the page table walker when
using system cache.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/io-pgtable-arm.c | 7 ++-
 include/linux/io-pgtable.h | 4 
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index dc7bcf858b6d..828426c16fa9 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -789,7 +789,8 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
 
if (cfg->quirks & ~(IO_PGTABLE_QUIRK_ARM_NS |
IO_PGTABLE_QUIRK_NON_STRICT |
-   IO_PGTABLE_QUIRK_ARM_TTBR1))
+   IO_PGTABLE_QUIRK_ARM_TTBR1 |
+   IO_PGTABLE_QUIRK_SYS_CACHE))
return NULL;
 
data = arm_lpae_alloc_pgtable(cfg);
@@ -801,6 +802,10 @@ arm_64_lpae_alloc_pgtable_s1(struct io_pgtable_cfg *cfg, 
void *cookie)
tcr->sh = ARM_LPAE_TCR_SH_IS;
tcr->irgn = ARM_LPAE_TCR_RGN_WBWA;
tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
+   } else if (cfg->quirks & IO_PGTABLE_QUIRK_SYS_CACHE) {
+   tcr->sh = ARM_LPAE_TCR_SH_OS;
+   tcr->irgn = ARM_LPAE_TCR_RGN_NC;
+   tcr->orgn = ARM_LPAE_TCR_RGN_WBWA;
} else {
tcr->sh = ARM_LPAE_TCR_SH_OS;
tcr->irgn = ARM_LPAE_TCR_RGN_NC;
diff --git a/include/linux/io-pgtable.h b/include/linux/io-pgtable.h
index 23285ba645db..ecc9d2248b84 100644
--- a/include/linux/io-pgtable.h
+++ b/include/linux/io-pgtable.h
@@ -86,6 +86,9 @@ struct io_pgtable_cfg {
 *
 * IO_PGTABLE_QUIRK_ARM_TTBR1: (ARM LPAE format) Configure the table
 *  for use in the upper half of a split address space.
+*
+* IO_PGTABLE_QUIRK_SYS_CACHE: Override the attributes set in TCR for
+*  the page table walker when using system cache.
 */
#define IO_PGTABLE_QUIRK_ARM_NS BIT(0)
#define IO_PGTABLE_QUIRK_NO_PERMS   BIT(1)
@@ -93,6 +96,7 @@ struct io_pgtable_cfg {
#define IO_PGTABLE_QUIRK_ARM_MTK_EXTBIT(3)
#define IO_PGTABLE_QUIRK_NON_STRICT BIT(4)
#define IO_PGTABLE_QUIRK_ARM_TTBR1  BIT(5)
+   #define IO_PGTABLE_QUIRK_SYS_CACHE  BIT(6)
unsigned long   quirks;
unsigned long   pgsize_bitmap;
unsigned intias;
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv5 6/6] iommu: arm-smmu-impl: Add a space before open parenthesis

2020-09-22 Thread Sai Prakash Ranjan
Fix the checkpatch warning for space required before the open
parenthesis.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index ce78295cfa78..eda852111706 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -19,7 +19,7 @@ static const struct of_device_id __maybe_unused 
qcom_smmu_impl_of_match[] = {
 
 static int arm_smmu_gr0_ns(int offset)
 {
-   switch(offset) {
+   switch (offset) {
case ARM_SMMU_GR0_sCR0:
case ARM_SMMU_GR0_sACR:
case ARM_SMMU_GR0_sGFSR:
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv5 5/6] iommu: arm-smmu-impl: Use table to list QCOM implementations

2020-09-22 Thread Sai Prakash Ranjan
Use a table and of_match_node() to match the QCOM implementation
instead of multiple of_device_is_compatible() calls for each
QCOM SMMU implementation.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index d199b4bff15d..ce78295cfa78 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -9,6 +9,13 @@
 
 #include "arm-smmu.h"
 
+static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = {
+   { .compatible = "qcom,sc7180-smmu-500" },
+   { .compatible = "qcom,sdm845-smmu-500" },
+   { .compatible = "qcom,sm8150-smmu-500" },
+   { .compatible = "qcom,sm8250-smmu-500" },
+   { }
+};
 
 static int arm_smmu_gr0_ns(int offset)
 {
@@ -217,10 +224,7 @@ struct arm_smmu_device *arm_smmu_impl_init(struct 
arm_smmu_device *smmu)
if (of_device_is_compatible(np, "nvidia,tegra194-smmu"))
return nvidia_smmu_impl_init(smmu);
 
-   if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sc7180-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8150-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8250-smmu-500"))
+   if (of_match_node(qcom_smmu_impl_of_match, np))
return qcom_smmu_impl_init(smmu);
 
if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu"))
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv5 2/6] iommu/arm-smmu: Add domain attribute for system cache

2020-09-22 Thread Sai Prakash Ranjan
Add an IOMMU domain attribute for using the system cache, aka last
level cache, so that client drivers like the GPU can set the right
attributes for caching the hardware pagetables in the system cache.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 +
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 +
 include/linux/iommu.h |  1 +
 3 files changed, 19 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 1f06ab219819..d449c895ba16 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
+   if (smmu_domain->sys_cache)
+   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
+
	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops) {
ret = -ENOMEM;
@@ -1513,6 +1516,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
*(int *)data = smmu_domain->non_strict;
return 0;
+   case DOMAIN_ATTR_SYS_CACHE:
+   *((int *)data) = smmu_domain->sys_cache;
+   return 0;
default:
return -ENODEV;
}
@@ -1544,6 +1550,17 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+   case DOMAIN_ATTR_SYS_CACHE:
+   if (smmu_domain->smmu) {
+   ret = -EPERM;
+   goto out_unlock;
+   }
+
+   if (*((int *)data))
+   smmu_domain->sys_cache = true;
+   else
+   smmu_domain->sys_cache = false;
+   break;
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index ddf2ca4c923d..93593e164e44 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -373,6 +373,7 @@ struct arm_smmu_domain {
struct mutexinit_mutex; /* Protects smmu pointer */
spinlock_t  cb_lock; /* Serialises ATS1* ops and 
TLB syncs */
struct iommu_domain domain;
+   boolsys_cache;
 };
 
 struct arm_smmu_master_cfg {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index fee209efb756..a580dfe9c68d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -118,6 +118,7 @@ enum iommu_attr {
DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_NESTING,/* two stages of translation */
DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
+   DOMAIN_ATTR_SYS_CACHE,
DOMAIN_ATTR_MAX,
 };
 
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



Re: [PATCHv4 1/6] iommu/io-pgtable-arm: Add support to use system cache

2020-09-22 Thread Sai Prakash Ranjan

Hi Will,

On 2020-09-21 23:33, Will Deacon wrote:

On Fri, Sep 11, 2020 at 07:57:18PM +0530, Sai Prakash Ranjan wrote:

Add a quirk IO_PGTABLE_QUIRK_SYS_CACHE to override the
attributes set in TCR for the page table walker when
using system cache.


I wonder if the panfrost folks can reuse this for the issue discussed
over at:

https://lore.kernel.org/r/cover.1600213517.git.robin.mur...@arm.com

However, Sai, your email setup went wrong when you posted this so you
probably need to repost now that you have that fixed.



I have sent a v5 [1] now since I had to drop cleanup of blank lines
as Robin said it was intentional and also had to rebase over new
gpu changes since it moves pretty fast.

[1] https://lore.kernel.org/patchwork/cover/131/

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation


Re: [PATCHv4 6/6] iommu: arm-smmu-impl: Remove unwanted extra blank lines

2020-09-15 Thread Sai Prakash Ranjan

On 2020-09-11 22:20, Sai Prakash Ranjan wrote:

On 2020-09-11 22:04, Robin Murphy wrote:

On 2020-09-11 17:21, Sai Prakash Ranjan wrote:

On 2020-09-11 21:37, Will Deacon wrote:

On Fri, Sep 11, 2020 at 05:03:06PM +0100, Robin Murphy wrote:
BTW am I supposed to have received 3 copies of everything? Because 
I did...


Yeah, this seems to be happening for all of Sai's emails :/



Sorry, I am not sure what went wrong as I only sent this once
and there are no recent changes to any of my configs, I'll
check it further.


Actually on closer inspection it appears to be "correct" behaviour.
I'm still subscribed to LAKML and the IOMMU list on this account, but
normally Office 365 deduplicates so aggressively that I have rules set
up to copy list mails that I'm cc'ed on back to my inbox, in case they
arrive first and cause the direct copy to get eaten - apparently
there's something unique about your email setup that manages to defeat
the deduplicator and make it deliver all 3 copies intact... :/



No changes in my local setup at least, but in the past we have
had cases with codeaurora mail acting weird or it could be my vpn,
will have to check.



This was an issue with codeaurora servers and I am told that it is
fixed now.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation


Re: [PATCHv4 6/6] iommu: arm-smmu-impl: Remove unwanted extra blank lines

2020-09-14 Thread Sai Prakash Ranjan

On 2020-09-11 21:37, Will Deacon wrote:

On Fri, Sep 11, 2020 at 05:03:06PM +0100, Robin Murphy wrote:
BTW am I supposed to have received 3 copies of everything? Because I 
did...


Yeah, this seems to be happening for all of Sai's emails :/



Sorry, I am not sure what went wrong as I only sent this once
and there are no recent changes to any of my configs, I'll
check it further.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation


[PATCHv4 2/6] iommu/arm-smmu: Add domain attribute for system cache

2020-09-14 Thread Sai Prakash Ranjan
Add an IOMMU domain attribute for using the system cache, aka last
level cache, so that client drivers like the GPU can set the right
attributes for caching the hardware pagetables in the system cache.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu.c | 17 +
 drivers/iommu/arm/arm-smmu/arm-smmu.h |  1 +
 include/linux/iommu.h |  1 +
 3 files changed, 19 insertions(+)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu.c
index 1f06ab219819..d449c895ba16 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.c
@@ -789,6 +789,9 @@ static int arm_smmu_init_domain_context(struct iommu_domain 
*domain,
if (smmu_domain->non_strict)
pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_NON_STRICT;
 
+   if (smmu_domain->sys_cache)
+   pgtbl_cfg.quirks |= IO_PGTABLE_QUIRK_SYS_CACHE;
+
	pgtbl_ops = alloc_io_pgtable_ops(fmt, &pgtbl_cfg, smmu_domain);
if (!pgtbl_ops) {
ret = -ENOMEM;
@@ -1513,6 +1516,9 @@ static int arm_smmu_domain_get_attr(struct iommu_domain 
*domain,
case DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE:
*(int *)data = smmu_domain->non_strict;
return 0;
+   case DOMAIN_ATTR_SYS_CACHE:
+   *((int *)data) = smmu_domain->sys_cache;
+   return 0;
default:
return -ENODEV;
}
@@ -1544,6 +1550,17 @@ static int arm_smmu_domain_set_attr(struct iommu_domain 
*domain,
else
smmu_domain->stage = ARM_SMMU_DOMAIN_S1;
break;
+   case DOMAIN_ATTR_SYS_CACHE:
+   if (smmu_domain->smmu) {
+   ret = -EPERM;
+   goto out_unlock;
+   }
+
+   if (*((int *)data))
+   smmu_domain->sys_cache = true;
+   else
+   smmu_domain->sys_cache = false;
+   break;
default:
ret = -ENODEV;
}
diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu.h 
b/drivers/iommu/arm/arm-smmu/arm-smmu.h
index ddf2ca4c923d..93593e164e44 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu.h
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu.h
@@ -373,6 +373,7 @@ struct arm_smmu_domain {
struct mutexinit_mutex; /* Protects smmu pointer */
spinlock_t  cb_lock; /* Serialises ATS1* ops and 
TLB syncs */
struct iommu_domain domain;
+   boolsys_cache;
 };
 
 struct arm_smmu_master_cfg {
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index fee209efb756..a580dfe9c68d 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -118,6 +118,7 @@ enum iommu_attr {
DOMAIN_ATTR_FSL_PAMUV1,
DOMAIN_ATTR_NESTING,/* two stages of translation */
DOMAIN_ATTR_DMA_USE_FLUSH_QUEUE,
+   DOMAIN_ATTR_SYS_CACHE,
DOMAIN_ATTR_MAX,
 };
 
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv4 5/6] iommu: arm-smmu-impl: Use table to list QCOM implementations

2020-09-14 Thread Sai Prakash Ranjan
Use a table and of_match_node() to match the QCOM implementation
instead of multiple of_device_is_compatible() calls for each
QCOM SMMU implementation.

Signed-off-by: Sai Prakash Ranjan 
---
 drivers/iommu/arm/arm-smmu/arm-smmu-impl.c | 12 
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c 
b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
index d199b4bff15d..ce78295cfa78 100644
--- a/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
+++ b/drivers/iommu/arm/arm-smmu/arm-smmu-impl.c
@@ -9,6 +9,13 @@
 
 #include "arm-smmu.h"
 
+static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = {
+   { .compatible = "qcom,sc7180-smmu-500" },
+   { .compatible = "qcom,sdm845-smmu-500" },
+   { .compatible = "qcom,sm8150-smmu-500" },
+   { .compatible = "qcom,sm8250-smmu-500" },
+   { }
+};
 
 static int arm_smmu_gr0_ns(int offset)
 {
@@ -217,10 +224,7 @@ struct arm_smmu_device *arm_smmu_impl_init(struct 
arm_smmu_device *smmu)
if (of_device_is_compatible(np, "nvidia,tegra194-smmu"))
return nvidia_smmu_impl_init(smmu);
 
-   if (of_device_is_compatible(np, "qcom,sdm845-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sc7180-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8150-smmu-500") ||
-   of_device_is_compatible(np, "qcom,sm8250-smmu-500"))
+   if (of_match_node(qcom_smmu_impl_of_match, np))
return qcom_smmu_impl_init(smmu);
 
if (of_device_is_compatible(smmu->dev->of_node, "qcom,adreno-smmu"))
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv4 3/6] drm/msm: rearrange the gpu_rmw() function

2020-09-14 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The register read-modify-write construct is generic enough
that it can be used by other subsystems as needed, so create
a more generic msm_rmw() function and have gpu_rmw() use
this new function.

Signed-off-by: Sharat Masetty 
Reviewed-by: Jordan Crouse 
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/msm_drv.c | 8 
 drivers/gpu/drm/msm/msm_drv.h | 1 +
 drivers/gpu/drm/msm/msm_gpu.h | 5 +
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/msm/msm_drv.c b/drivers/gpu/drm/msm/msm_drv.c
index abf5799d9a22..03caafa7c7b2 100644
--- a/drivers/gpu/drm/msm/msm_drv.c
+++ b/drivers/gpu/drm/msm/msm_drv.c
@@ -180,6 +180,14 @@ u32 msm_readl(const void __iomem *addr)
return val;
 }
 
+void msm_rmw(void __iomem *addr, u32 mask, u32 or)
+{
+   u32 val = msm_readl(addr);
+
+   val &= ~mask;
+   msm_writel(val | or, addr);
+}
+
 struct msm_vblank_work {
struct work_struct work;
int crtc_id;
diff --git a/drivers/gpu/drm/msm/msm_drv.h b/drivers/gpu/drm/msm/msm_drv.h
index 2ca9c3c03845..aa07900d9ac4 100644
--- a/drivers/gpu/drm/msm/msm_drv.h
+++ b/drivers/gpu/drm/msm/msm_drv.h
@@ -419,6 +419,7 @@ void __iomem *msm_ioremap_quiet(struct platform_device 
*pdev, const char *name,
const char *dbgname);
 void msm_writel(u32 data, void __iomem *addr);
 u32 msm_readl(const void __iomem *addr);
+void msm_rmw(void __iomem *addr, u32 mask, u32 or);
 
 struct msm_gpu_submitqueue;
 int msm_submitqueue_init(struct drm_device *drm, struct msm_file_private *ctx);
diff --git a/drivers/gpu/drm/msm/msm_gpu.h b/drivers/gpu/drm/msm/msm_gpu.h
index 5ee358b480e6..1d446b2e5746 100644
--- a/drivers/gpu/drm/msm/msm_gpu.h
+++ b/drivers/gpu/drm/msm/msm_gpu.h
@@ -245,10 +245,7 @@ static inline u32 gpu_read(struct msm_gpu *gpu, u32 reg)
 
 static inline void gpu_rmw(struct msm_gpu *gpu, u32 reg, u32 mask, u32 or)
 {
-   uint32_t val = gpu_read(gpu, reg);
-
-   val &= ~mask;
-   gpu_write(gpu, reg, val | or);
+   msm_rmw(gpu->mmio + (reg << 2), mask, or);
 }
 
 static inline u64 gpu_read64(struct msm_gpu *gpu, u32 lo, u32 hi)
-- 
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation



[PATCHv4 4/6] drm/msm/a6xx: Add support for using system cache(LLC)

2020-09-14 Thread Sai Prakash Ranjan
From: Sharat Masetty 

The last level system cache can be partitioned to 32 different
slices of which GPU has two slices preallocated. One slice is
used for caching GPU buffers and the other slice is used for
caching the GPU SMMU pagetables. This talks to the core system
cache driver to acquire the slice handles, configure the SCID's
to those slices and activates and deactivates the slices upon
GPU power collapse and restore.

Some support from the IOMMU driver is also needed to make use
of the system cache to set the right TCR attributes. GPU then
has the ability to override a few cacheability parameters which
it does to override write-allocate to write-no-allocate as the
GPU hardware does not benefit much from it.

DOMAIN_ATTR_SYS_CACHE is another domain-level attribute used by the
IOMMU driver to set the right attributes so that the hardware
pagetables are cached in the system cache.
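
A minimal sketch of how the GPU side could hand this off to the IOMMU layer,
assuming the DOMAIN_ATTR_SYS_CACHE attribute added by this series and the
iommu_domain_set_attr() interface of this kernel generation; the int payload
and the helper name are assumptions, not taken from the patch:

	#include <linux/iommu.h>

	/* Hypothetical helper: request system-cacheable pagetable walks,
	 * then attach.  The attribute must be set before
	 * iommu_attach_device(), as noted in the changelog above.
	 */
	static int example_attach_with_sys_cache(struct iommu_domain *domain,
						 struct device *dev)
	{
		int use_sys_cache = 1;
		int ret;

		ret = iommu_domain_set_attr(domain, DOMAIN_ATTR_SYS_CACHE,
					    &use_sys_cache);
		if (ret)
			return ret;

		return iommu_attach_device(domain, dev);
	}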

Signed-off-by: Sharat Masetty 
(saiprakash.ranjan: fix to set attr before device attach to iommu and rebase)
Signed-off-by: Sai Prakash Ranjan 
---
 drivers/gpu/drm/msm/adreno/a6xx_gpu.c   | 83 +
 drivers/gpu/drm/msm/adreno/a6xx_gpu.h   |  4 ++
 drivers/gpu/drm/msm/adreno/adreno_gpu.c | 21 ++-
 3 files changed, 107 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
index a3a8d6fd06bb..6241b93ed375 100644
--- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
+++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
@@ -9,6 +9,8 @@
 #include "a6xx_gmu.xml.h"
 
 #include <linux/devfreq.h>
+#include <linux/bitfield.h>
+#include <linux/soc/qcom/llcc-qcom.h>
 
 #define GPU_PAS_ID 13
 
@@ -969,6 +971,79 @@ static const u32 a6xx_register_offsets[REG_ADRENO_REGISTER_MAX] = {
REG_ADRENO_DEFINE(REG_ADRENO_CP_RB_CNTL, REG_A6XX_CP_RB_CNTL),
 };
 
+static void a6xx_llc_rmw(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 mask, u32 or)
+{
+   return msm_rmw(a6xx_gpu->llc_mmio + (reg << 2), mask, or);
+}
+
+static void a6xx_llc_write(struct a6xx_gpu *a6xx_gpu, u32 reg, u32 value)
+{
+   return msm_writel(value, a6xx_gpu->llc_mmio + (reg << 2));
+}
+
+static void a6xx_llc_deactivate(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_deactivate(a6xx_gpu->llc_slice);
+   llcc_slice_deactivate(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_activate(struct a6xx_gpu *a6xx_gpu)
+{
+   u32 cntl1_regval = 0;
+
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   if (!llcc_slice_activate(a6xx_gpu->llc_slice)) {
+   u32 gpu_scid = llcc_get_slice_id(a6xx_gpu->llc_slice);
+
+   gpu_scid &= 0x1f;
+   cntl1_regval = (gpu_scid << 0) | (gpu_scid << 5) | (gpu_scid << 10) |
+  (gpu_scid << 15) | (gpu_scid << 20);
+   }
+
+   if (!llcc_slice_activate(a6xx_gpu->htw_llc_slice)) {
+   u32 gpuhtw_scid = llcc_get_slice_id(a6xx_gpu->htw_llc_slice);
+
+   gpuhtw_scid &= 0x1f;
+   cntl1_regval |= FIELD_PREP(GENMASK(29, 25), gpuhtw_scid);
+   }
+
+   if (cntl1_regval) {
+   /*
+* Program the slice IDs for the various GPU blocks and GPU MMU
+* pagetables
+*/
+   a6xx_llc_write(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_1, cntl1_regval);
+
+   /*
+* Program cacheability overrides to not allocate cache lines on
+* a write miss
+*/
+   a6xx_llc_rmw(a6xx_gpu, REG_A6XX_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF, 0x03);
+   }
+}
+
+static void a6xx_llc_slices_destroy(struct a6xx_gpu *a6xx_gpu)
+{
+   llcc_slice_putd(a6xx_gpu->llc_slice);
+   llcc_slice_putd(a6xx_gpu->htw_llc_slice);
+}
+
+static void a6xx_llc_slices_init(struct platform_device *pdev,
+   struct a6xx_gpu *a6xx_gpu)
+{
+   a6xx_gpu->llc_mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
+   if (IS_ERR(a6xx_gpu->llc_mmio))
+   return;
+
+   a6xx_gpu->llc_slice = llcc_slice_getd(LLCC_GPU);
+   a6xx_gpu->htw_llc_slice = llcc_slice_getd(LLCC_GPUHTW);
+
+   if (IS_ERR(a6xx_gpu->llc_slice) && IS_ERR(a6xx_gpu->htw_llc_slice))
+   a6xx_gpu->llc_mmio = ERR_PTR(-EINVAL);
+}
+
 static int a6xx_pm_resume(struct msm_gpu *gpu)
 {
struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
@@ -985,6 +1060,8 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
 
msm_gpu_resume_devfreq(gpu);
 
+   a6xx_llc_activate(a6xx_gpu);
+
return 0;
 }
 
@@ -995,6 +1072,8 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
 
trace_msm_gpu_suspend(0);
 
+   a6xx_llc_deactivate(a6xx_gpu);
+
devfreq_suspend_device(gpu->devfreq.devfreq);
 
return a6xx_gmu_stop(a6xx_gpu);
@@ -1033,6 +1112,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
drm_gem_object_put(a6xx_gpu->sqe_bo);

Re: [PATCHv4 6/6] iommu: arm-smmu-impl: Remove unwanted extra blank lines

2020-09-14 Thread Sai Prakash Ranjan

On 2020-09-11 22:04, Robin Murphy wrote:

On 2020-09-11 17:21, Sai Prakash Ranjan wrote:

On 2020-09-11 21:37, Will Deacon wrote:

On Fri, Sep 11, 2020 at 05:03:06PM +0100, Robin Murphy wrote:
BTW am I supposed to have received 3 copies of everything? Because I 
did...


Yeah, this seems to be happening for all of Sai's emails :/



Sorry, I am not sure what went wrong, as I only sent this once
and there are no recent changes to any of my configs; I'll
check it further.


Actually on closer inspection it appears to be "correct" behaviour.
I'm still subscribed to LAKML and the IOMMU list on this account, but
normally Office 365 deduplicates so aggressively that I have rules set
up to copy list mails that I'm cc'ed on back to my inbox, in case they
arrive first and cause the direct copy to get eaten - apparently
there's something unique about your email setup that manages to defeat
the deduplicator and make it deliver all 3 copies intact... :/



No changes in my local setup at least, but in the past we have
had cases of codeaurora mail acting weird, or it could be my VPN;
I will have to check.

Thanks,
Sai

--
QUALCOMM INDIA, on behalf of Qualcomm Innovation Center, Inc. is a member
of Code Aurora Forum, hosted by The Linux Foundation

