[PATCH][next] drm/amdgpu: sdma: clean up indentation

2021-09-02 Thread Colin King
From: Colin Ian King 

There is a statement that is indented incorrectly. Clean it up.

Signed-off-by: Colin Ian King 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 779f5c911e11..e4a96e7e386d 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -375,10 +375,10 @@ static void sdma_v5_2_ring_emit_ib(struct amdgpu_ring *ring,
  */
 static void sdma_v5_2_ring_emit_mem_sync(struct amdgpu_ring *ring)
 {
-uint32_t gcr_cntl =
-   SDMA_GCR_GL2_INV | SDMA_GCR_GL2_WB | SDMA_GCR_GLM_INV |
-   SDMA_GCR_GL1_INV | SDMA_GCR_GLV_INV | SDMA_GCR_GLK_INV |
-   SDMA_GCR_GLI_INV(1);
+   uint32_t gcr_cntl = SDMA_GCR_GL2_INV | SDMA_GCR_GL2_WB |
+   SDMA_GCR_GLM_INV | SDMA_GCR_GL1_INV |
+   SDMA_GCR_GLV_INV | SDMA_GCR_GLK_INV |
+   SDMA_GCR_GLI_INV(1);
 
	/* flush entire cache L0/L1/L2, this can be optimized by performance requirement */
amdgpu_ring_write(ring, SDMA_PKT_HEADER_OP(SDMA_OP_GCR_REQ));
-- 
2.32.0



[PATCH][next] drm/amdgpu: clean up inconsistent indenting

2021-09-02 Thread Colin King
From: Colin Ian King 

There are a couple of statements that are indented one character
too deeply; clean these up.

Signed-off-by: Colin Ian King 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index d6aa032890ee..a573424a6e0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -60,10 +60,9 @@ static vm_fault_t amdgpu_gem_fault(struct vm_fault *vmf)
goto unlock;
}
 
-ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
-   TTM_BO_VM_NUM_PREFAULT, 1);
-
-drm_dev_exit(idx);
+   ret = ttm_bo_vm_fault_reserved(vmf, vmf->vma->vm_page_prot,
+  TTM_BO_VM_NUM_PREFAULT, 1);
+   drm_dev_exit(idx);
} else {
ret = ttm_bo_vm_dummy_page(vmf, vmf->vma->vm_page_prot);
}
-- 
2.32.0



[PATCH v2 1/1] drm/amdkfd: make needs_pcie_atomics FW-version dependent

2021-09-02 Thread Felix Kuehling
On some GPUs the PCIe atomic requirement for KFD depends on the MEC
firmware version. Add a firmware version check for this. The minimum
firmware version that works without atomics can be updated in the
device_info structure for each GPU type.

Signed-off-by: Felix Kuehling 
Reviewed-by: Oak Zeng 
---
 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 13 +++++++++++--
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h   |  1 +
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 16a57b70cc1a..d5379c79c739 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
@@ -525,6 +525,7 @@ static const struct kfd_device_info sienna_cichlid_device_info = {
.needs_iommu_device = false,
.supports_cwsr = true,
.needs_pci_atomics = true,
+   .no_atomic_fw_version = 92,
.num_sdma_engines = 4,
.num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 8,
@@ -544,6 +545,7 @@ static const struct kfd_device_info navy_flounder_device_info = {
.needs_iommu_device = false,
.supports_cwsr = true,
.needs_pci_atomics = true,
+   .no_atomic_fw_version = 92,
.num_sdma_engines = 2,
.num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 8,
@@ -563,6 +565,7 @@ static const struct kfd_device_info vangogh_device_info = {
.needs_iommu_device = false,
.supports_cwsr = true,
.needs_pci_atomics = false,
+   .no_atomic_fw_version = 92,
.num_sdma_engines = 1,
.num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
@@ -582,6 +585,7 @@ static const struct kfd_device_info dimgrey_cavefish_device_info = {
.needs_iommu_device = false,
.supports_cwsr = true,
.needs_pci_atomics = true,
+   .no_atomic_fw_version = 92,
.num_sdma_engines = 2,
.num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 8,
@@ -601,6 +605,7 @@ static const struct kfd_device_info beige_goby_device_info = {
.needs_iommu_device = false,
.supports_cwsr = true,
.needs_pci_atomics = true,
+   .no_atomic_fw_version = 92,
.num_sdma_engines = 1,
.num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 8,
@@ -620,6 +625,7 @@ static const struct kfd_device_info yellow_carp_device_info = {
.needs_iommu_device = false,
.supports_cwsr = true,
.needs_pci_atomics = false,
+   .no_atomic_fw_version = 92,
.num_sdma_engines = 1,
.num_xgmi_sdma_engines = 0,
.num_sdma_queues_per_engine = 2,
@@ -713,8 +719,11 @@ struct kfd_dev *kgd2kfd_probe(struct kgd_dev *kgd,
 * supported.
 */
kfd->pci_atomic_requested = amdgpu_amdkfd_have_atomics_support(kgd);
-   if (device_info->needs_pci_atomics &&
-   !kfd->pci_atomic_requested) {
+   if (!kfd->pci_atomic_requested &&
+   device_info->needs_pci_atomics &&
+   (!device_info->no_atomic_fw_version ||
+ amdgpu_amdkfd_get_fw_version(kgd, KGD_ENGINE_MEC1) <
+   device_info->no_atomic_fw_version)) {
dev_info(kfd_device,
 "skipped device %x:%x, PCI rejects atomics\n",
 pdev->vendor, pdev->device);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index ab83b0de6b22..6d8f9bb2d905 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -207,6 +207,7 @@ struct kfd_device_info {
bool supports_cwsr;
bool needs_iommu_device;
bool needs_pci_atomics;
+   uint32_t no_atomic_fw_version;
unsigned int num_sdma_engines;
unsigned int num_xgmi_sdma_engines;
unsigned int num_sdma_queues_per_engine;
-- 
2.32.0
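
For readers parsing the combined condition in the kgd2kfd_probe() hunk above, a
minimal standalone restatement of the gating logic may help (the helper below
and its standalone form are illustrative, not part of the patch):

#include <linux/types.h>

/* Sketch of the check added above: a device that needs PCIe atomics is
 * only skipped when the platform lacks them AND the MEC firmware is too
 * old to work without them. A no_atomic_fw_version of 0 means no
 * firmware version lifts the requirement (the old behaviour).
 */
static bool kfd_skip_device_sketch(bool pci_atomics_available,
				   bool needs_pci_atomics,
				   u32 no_atomic_fw_version,
				   u32 mec_fw_version)
{
	if (pci_atomics_available || !needs_pci_atomics)
		return false;	/* atomics present, or never required */

	/* Skip unless a new-enough MEC firmware removes the requirement. */
	return !no_atomic_fw_version ||
	       mec_fw_version < no_atomic_fw_version;
}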



Re: [PATCH v1 03/14] mm: add iomem vma selection for memory migration

2021-09-02 Thread Dan Williams
On Thu, Sep 2, 2021 at 1:18 AM Christoph Hellwig  wrote:
>
> On Wed, Sep 01, 2021 at 11:40:43AM -0400, Felix Kuehling wrote:
> > >>> It looks like I'm totally misunderstanding what you are adding here
> > >>> then.  Why do we need any special treatment at all for memory that
> > >>> has normal struct pages and is part of the direct kernel map?
> > >> The pages are like normal memory for purposes of mapping them in CPU
> > >> page tables and for coherent access from the CPU.
> > > That's the user page tables.  What about the kernel direct map?
> > > If there is a normal kernel struct page backing there really should
> > > be no need for the pgmap.
> >
> > I'm not sure. The physical address ranges are in the UEFI system address
> > map as special-purpose memory. Does Linux create the struct pages and
> > kernel direct map for that without a pgmap call? I didn't see that last
> > time I went digging through that code.
>
> So doing some googling finds a patch from Dan that claims to hand EFI
> special purpose memory to the device dax driver.  But when I try to
> follow the version that got merged, it looks like it is treated simply as an
> MMIO region to be claimed by drivers, which would not get a struct page.
>
> Dan, did I misunderstand how E820_TYPE_SOFT_RESERVED works?

The original implementation of "soft reserve" support depended on the
combination of the EFI special purpose memory type and the ACPI HMAT
to define the device ranges. The requirement for ACPI HMAT was relaxed
later with commit:

5ccac54f3e12 ACPI: HMAT: attach a device for each soft-reserved range

The expectation is that system software policy can then either use the
device interface, assign a portion of the reservation back to the page
allocator, or ignore the reservation altogether. Is this discussion
asking for a way to assign this memory to the GPU driver to manage?
device-dax already knows how to hand off to the page allocator; it seems
reasonable for it to be able to hand off to another driver.


Re: [diagnostic TDR mode patches] unify our solution opinions/suggestions in one thread

2021-09-02 Thread Daniel Stone
Hi Monk,

On Thu, 2 Sept 2021 at 06:52, Liu, Monk  wrote:
> I didn't mean your changes on AMD driver need my personal approval or review 
> ... and  I'm totally already get used that our driver is not 100% under 
> control by AMDers,
> but supposedly any one from community (including you) who tend to change 
> AMD's driver need at least to get approvement from someone in AMD, e.g.: 
> AlexD or Christian, doesn't that reasonable?
> just like we need your approve if we try to modify DRM-sched, or need 
> panfrost's approval if we need to change panfrost code ...
>
> by only CC AMD's engineers looks not quite properly, how do you know if your 
> changes (on AMD code part) are conflicting with AMD's on-going internal 
> features/refactoring or not ?

Looking at the patches in question, they were (at least mostly) CCed
both to the amd-gfx@ mailing list and also to ckoenig. Unfortunately
it is not possible for every single patch to get mandatory signoff
from every single stakeholder - e.g. if every AMD patch which touched
the scheduler required explicit approval from Etnaviv, Freedreno,
Lima, Panfrost, and V3D teams, it would become very difficult for AMD
to merge any code.

So the approach is that patches are sent for approval, they are CCed
to people who should be interested, and after some time with no
comments, they may be merged if it seems like a reasonable thing to
do.

The problem with internal work is that, well, it's internal. If the
community sends patches to amd-gfx@, there is no comment from AMD, and
then months later we are told that it should not have happened because
it conflicts with development that AMD has been doing - how should the
rest of the community have known about this? So unfortunately this is
the compromise: if you decide to do private development, not inform
anyone about your plans, and not join in any common discussion, then
it is your responsibility to deal with any changes or conflicts that
happen whilst you are developing privately.

The only way we can successfully have support in the same ecosystem
for AMD, Arm, Broadcom, Intel, NVIDIA, Qualcomm, and VeriSilicon, is
that we are all working together openly. If community development had
to stop because each of these vendors had been doing internal
development for several months without even informing the community of
their plans, any kind of shared development is clearly impossible.

Cheers,
Daniel


Re: [PATCH] drm/amd/amdgpu: Increase HWIP_MAX_INSTANCE to 10

2021-09-02 Thread Alex Deucher
Applied.  Thanks!

Alex

On Thu, Sep 2, 2021 at 3:50 AM Ernst Sjöstrand  wrote:
>
> Seems like newer cards can have even more instances now.
> Found by UBSAN: array-index-out-of-bounds in
> drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c:318:29
> index 8 is out of range for type 'uint32_t *[8]'
>
> Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1697
> Cc: sta...@vger.kernel.org
> Signed-off-by: Ernst Sjöstrand 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> index dc3c6b3a00e5..d356e329e6f8 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> @@ -758,7 +758,7 @@ enum amd_hw_ip_block_type {
> MAX_HWIP
>  };
>
> -#define HWIP_MAX_INSTANCE  8
> +#define HWIP_MAX_INSTANCE  10
>
>  struct amd_powerplay {
> void *pp_handle;
> --
> 2.30.2
>
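
As a quick aside on the report format: the 'uint32_t *[8]' UBSAN names is the
inner (per-instance) dimension of the discovery register-offset table in
struct amdgpu_device. A compressed, hypothetical illustration (names
abbreviated, not the real layout):

#include <stdint.h>

#define HWIP_MAX_INSTANCE 8	/* value before this patch */

/* Simplified stand-in for the per-instance table that
 * amdgpu_discovery.c fills in.
 */
static uint32_t *reg_offset[HWIP_MAX_INSTANCE];

static void record_instance(unsigned int instance, uint32_t *offsets)
{
	/* Newer cards report 9+ instances, so instance == 8 writes one
	 * slot past the end of the array - exactly what UBSAN flagged.
	 */
	reg_offset[instance] = offsets;
}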


Re: [diagnostic TDR mode patches] unify our solution opinions/suggestions in one thread

2021-09-02 Thread Daniel Vetter
On Thu, Sep 2, 2021 at 1:00 PM Christian König  wrote:
>
> Hi Monk,
>
> Am 02.09.21 um 07:52 schrieb Liu, Monk:
> > [AMD Official Use Only]
> >
> > I'm not sure I can add much to help this along, I'm sure Alex has some 
> > internal training,
> > Once your driver is upstream, it belongs to upstream, you can maintain it, 
> > but you no longer control it 100%, it's a tradeoff, it's not one companies 
> > always understand.
> > Usually people are fine developing away internally, but once interaction 
> > with other parts of the kernel/subsystem is required they have the 
> > realisation that they needed to work upstream 6 months earlier.
> > The best time to interact with upstream was 6 months ago, the second best 
> > time is now.
> > <<<
> >
> > Daniel/AlexD
> >
> > I didn't mean your changes on AMD driver need my personal approval or 
> > review ... and  I'm totally already get used that our driver is not 100% 
> > under control by AMDers,
> > but supposedly any one from community (including you) who tend to change 
> > AMD's driver need at least to get approvement from someone in AMD, e.g.: 
> > AlexD or Christian, doesn't that reasonable?
>
> I'm fearing that just repeating what Alex said, but to make it clear
> once more: That is *not* necessary!
>
> The shared repository is owned by upstream maintainers and they are
> usually free to do restructuring work without getting acknowledge from
> every single driver maintainer.
>
> Anybody can of course technically object to upstream design decisions,
> but that means that you need to pay attention to the mailing lists in
> the first place.
>
> > just like we need your approve if we try to modify DRM-sched, or need 
> > panfrost's approval if we need to change panfrost code ...
> >
> > by only CC AMD's engineers looks not quite properly, how do you know if 
> > your changes (on AMD code part) are conflicting with AMD's on-going 
> > internal features/refactoring or not ?
>
> Well because AMD is supposed to work in public as much as possible and
> ask upstream before doing changes to the code base.
>
> Additional to that design decisions are supposed to be discussed on the
> mailing list and *not* internally.

Yeah, I'm honestly really surprised about the course of this discussion
here. With Alex, Christian and others, AMD has a lot of folks with
years/decades of experience in how to collaborate in upstream, when to
pull in others proactively and when that's not needed, and in general
how to plan upstream work with the least amount of risk and surprises.

I think step zero here needs to be some training at AMD and then
re-planning this effort, before we get back to technical stuff.
Otherwise we'll just get bogged down in pain because expectations
about the process don't pan out.
-Daniel

>
> Regards,
> Christian.
>
> >
> > Thanks
> >
> > --
> > Monk Liu | Cloud-GPU Core team
> > --
> >
> > -Original Message-
> > From: Dave Airlie 
> > Sent: Thursday, September 2, 2021 2:51 AM
> > To: Alex Deucher 
> > Cc: Liu, Monk ; Daniel Vetter ; Koenig, 
> > Christian ; Grodzovsky, Andrey 
> > ; Chen, JingWen ; DRI 
> > Development ; amd-gfx@lists.freedesktop.org
> > Subject: Re: [diagnostic TDR mode patches] unify our solution 
> > opinions/suggestions in one thread
> >
> > On Thu, 2 Sept 2021 at 01:20, Alex Deucher  wrote:
> >> On Wed, Sep 1, 2021 at 6:19 AM Liu, Monk  wrote:
> >>> [AMD Official Use Only]
> >>>
> >>> Daniel
> >>>
> >>>  From the link you share it looks you(or someone else) have quite a bunch 
> >>> patches that changes DRM_SCHED or even amdgpu, by that case before they 
> >>> are merged to kernel tree I'm wondering if any AMD develop reviewed them ?
> >>>
> >>> They looks to me somehow conflicting with what we changed in our repo
> >>>
> >>> It is really a chaos for AMDer if someone else out side of AMD changes 
> >>> our kernel driver (or/and scheduler) without reviewed by AMDer, just like 
> >>> we are requiring your review if we tend to change scheduler's logic here 
> >>> 
> >>>
> >>> This one changes AMD's code:
> >>> https://lore.kernel.org/dri-devel/20210625133327.2598825-2-boris.brezillon@collabora.com/
> >>> And I didn't see any reviewed-by from AMDers ...
> >>>
> >>> This one also touches AMD's code:
> >>> https://lore.kernel.org/dri-devel/20200604081224.863494-12-daniel.vetter@ffwll.ch/

Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-09-02 Thread Andrey Grodzovsky



On 2021-09-02 10:28 a.m., Daniel Vetter wrote:

On Tue, Aug 31, 2021 at 02:24:52PM -0400, Andrey Grodzovsky wrote:

On 2021-08-31 9:11 a.m., Daniel Vetter wrote:

On Thu, Aug 26, 2021 at 11:04:14AM +0200, Daniel Vetter wrote:

On Thu, Aug 19, 2021 at 11:25:09AM -0400, Andrey Grodzovsky wrote:

On 2021-08-19 5:30 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:51:00AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:42 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:36:32AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:32 a.m., Daniel Vetter wrote:

On Wed, Aug 18, 2021 at 10:26:25AM -0400, Andrey Grodzovsky wrote:

On 2021-08-18 10:02 a.m., Alex Deucher wrote:


+ dri-devel

Since scheduler is a shared component, please add dri-devel on all
scheduler patches.

On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen  wrote:

[Why]
For a bailing job, this commit will delete it from the pending list, thus the
bailing job will never have a chance to be resubmitted, even in advance
tdr mode.

[How]
After embedding the hw_fence into amdgpu_job is done, the race condition that
this commit tried to work around is completely solved. So revert this
commit.
This reverts commit 135517d3565b48f4def3b1b82008bc17eb5d1c90.
v2:
add dma_fence_get/put() around timedout_job to avoid concurrent delete
during processing timedout_job

Signed-off-by: Jingwen Chen 
---
   drivers/gpu/drm/scheduler/sched_main.c | 23 +++++------------------
   1 file changed, 5 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c
index a2a953693b45..f9b9b3aefc4a 100644
--- a/drivers/gpu/drm/scheduler/sched_main.c
+++ b/drivers/gpu/drm/scheduler/sched_main.c
@@ -314,6 +314,7 @@ static void drm_sched_job_timedout(struct work_struct *work)
   {
  struct drm_gpu_scheduler *sched;
  struct drm_sched_job *job;
+   struct dma_fence *fence;
  enum drm_gpu_sched_stat status = DRM_GPU_SCHED_STAT_NOMINAL;

	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
@@ -325,11 +326,10 @@ static void drm_sched_job_timedout(struct work_struct *work)

  if (job) {
  /*
-		 * Remove the bad job so it cannot be freed by concurrent
-		 * drm_sched_cleanup_jobs. It will be reinserted back after sched->thread
-		 * is parked at which point it's safe.
+		 * Get job->s_fence->parent here to avoid concurrent delete during
+		 * processing timedout_job
 		 */
-		list_del_init(&job->list);
+		fence = dma_fence_get(job->s_fence->parent);

While this is true for amdgpu, it has no meaning for other drivers for whom
we haven't
done the refactoring of embedding HW fence (parent) into the job structure.
In fact thinking
about it, unless you do the HW fence embedding for all the drivers using the
scheduler you cannot
revert this patch or you will just break them.

btw, why did you do that embedding? I do still have my patches with
dma_fence annotations floating around, but my idea at least was to fix
that issue with a mempool, not with embedding. What was the motivation
for embedding the hw fence?
-Daniel

The motivation was 2-fold: avoid memory allocation during job submission
(HW fence allocation) because, as Christian explained, this leads to deadlocks
with mm code during evictions due to memory pressure (Christian can clarify if I
messed

Yeah that's the exact same thing I've chased with my dma_fence
annotations, but thus far zero to no interest in getting it sorted. I
think it'd be good to have some cross-driver agreement on how this should
be solved before someone just charges ahead ...


this explanation). Second is to exactly revert this patch, because while it
solved the issue described in the patch it created another one with drivers
who bailed out early during TDR handling for various reasons, and the job
would just leak because it was already removed from the pending list.

Can't we reinsert it before we restart the scheduler thread? It might need
a separate list for that due to the lockless queue tricks. Or am I
thinking about the wrong kind of "we lost the job"?
-Daniel

If you look at the original patch it would reinsert it even earlier - right
after stopping the SW scheduler thread, and even then it was too late for
some drivers as they would decide to return back from their TDR handler even
before that. It is solvable, but in an ugly way as far as I can see: you need
to require each driver, in its own code, to put the job back in the list if it
does so before reaching the place where the scheduler framework does it. Kind
of spaghetti code, it seems to me.

Hm yeah I didn't realize this all happens before we stop the scheduler
thread.

Why can't we stop the scheduler thread first, so that there's guaranteed
no race? I've recently had a lot of discussions with panfrost folks about
their reset that spawns 
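
For reference, the pattern v2 of the revert adds around the timeout handler is
plain dma-fence reference counting. A minimal sketch, simplified from the diff
earlier in this thread (locking and the handling of the callback's return
value are elided):

#include <linux/dma-fence.h>
#include <drm/gpu_scheduler.h>

/* Sketch: pin the HW fence for the duration of the driver's
 * timedout_job callback instead of unlinking the job from the pending
 * list, so a concurrent free cannot race with the handler.
 */
static void sched_timeout_sketch(struct drm_gpu_scheduler *sched,
				 struct drm_sched_job *job)
{
	struct dma_fence *fence;

	/* dma_fence_get() is NULL-safe and takes a reference. */
	fence = dma_fence_get(job->s_fence->parent);

	sched->ops->timedout_job(job);

	/* Drop our reference; dma_fence_put() is also NULL-safe. */
	dma_fence_put(fence);
}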

Re: [PATCH 1/2] drm/amdgpu: use IS_ERR for debugfs APIs

2021-09-02 Thread Sharma, Shashank



On 9/2/2021 5:14 PM, Nirmoy Das wrote:

debugfs APIs return an encoded error, so use
IS_ERR for checking the return value.

References: https://gitlab.freedesktop.org/drm/amd/-/issues/1686
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 6 ++----
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 2 +-
  2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index d256215ab2c7..077f9baf74fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1696,18 +1696,16 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
struct dentry *ent;
int r, i;
  
-

-
ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
				  &fops_ib_preempt);
-   if (!ent) {
+   if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs file\n");
return -EIO;
}
  
  	ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, adev,

  _sclk_set);
-   if (!ent) {
+   if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_set_sclk debugsfs file\n");
return -EIO;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 7b634a1517f9..f40753e1a60d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -428,7 +428,7 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
ent = debugfs_create_file(name,
  S_IFREG | S_IRUGO, root,
				  ring, &amdgpu_debugfs_ring_fops);
-   if (!ent)
+   if (IS_ERR(ent))
return -ENOMEM;
  
  	i_size_write(ent->d_inode, ring->ring_size + 12);




ACK: Shashank Sharma 

Regards
Shashank


Re: [PATCH v2] Revert "drm/scheduler: Avoid accessing freed bad job."

2021-09-02 Thread Daniel Vetter
On Tue, Aug 31, 2021 at 02:24:52PM -0400, Andrey Grodzovsky wrote:
> 
> On 2021-08-31 9:11 a.m., Daniel Vetter wrote:
> > On Thu, Aug 26, 2021 at 11:04:14AM +0200, Daniel Vetter wrote:
> > > On Thu, Aug 19, 2021 at 11:25:09AM -0400, Andrey Grodzovsky wrote:
> > > > On 2021-08-19 5:30 a.m., Daniel Vetter wrote:
> > > > > On Wed, Aug 18, 2021 at 10:51:00AM -0400, Andrey Grodzovsky wrote:
> > > > > > On 2021-08-18 10:42 a.m., Daniel Vetter wrote:
> > > > > > > On Wed, Aug 18, 2021 at 10:36:32AM -0400, Andrey Grodzovsky wrote:
> > > > > > > > On 2021-08-18 10:32 a.m., Daniel Vetter wrote:
> > > > > > > > > On Wed, Aug 18, 2021 at 10:26:25AM -0400, Andrey Grodzovsky 
> > > > > > > > > wrote:
> > > > > > > > > > On 2021-08-18 10:02 a.m., Alex Deucher wrote:
> > > > > > > > > > 
> > > > > > > > > > > + dri-devel
> > > > > > > > > > > 
> > > > > > > > > > > Since scheduler is a shared component, please add 
> > > > > > > > > > > dri-devel on all
> > > > > > > > > > > scheduler patches.
> > > > > > > > > > > 
> > > > > > > > > > > On Wed, Aug 18, 2021 at 7:21 AM Jingwen Chen 
> > > > > > > > > > >  wrote:
> > > > > > > > > > > > [Why]
> > > > > > > > > > > > for bailing job, this commit will delete it from 
> > > > > > > > > > > > pending list thus the
> > > > > > > > > > > > bailing job will never have a chance to be resubmitted 
> > > > > > > > > > > > even in advance
> > > > > > > > > > > > tdr mode.
> > > > > > > > > > > > 
> > > > > > > > > > > > [How]
> > > > > > > > > > > > after embeded hw_fence into amdgpu_job is done, the 
> > > > > > > > > > > > race condition that
> > > > > > > > > > > > this commit tries to work around is completely 
> > > > > > > > > > > > solved.So revert this
> > > > > > > > > > > > commit.
> > > > > > > > > > > > This reverts commit 
> > > > > > > > > > > > 135517d3565b48f4def3b1b82008bc17eb5d1c90.
> > > > > > > > > > > > v2:
> > > > > > > > > > > > add dma_fence_get/put() around timedout_job to avoid 
> > > > > > > > > > > > concurrent delete
> > > > > > > > > > > > during processing timedout_job
> > > > > > > > > > > > 
> > > > > > > > > > > > Signed-off-by: Jingwen Chen 
> > > > > > > > > > > > ---
> > > > > > > > > > > >   drivers/gpu/drm/scheduler/sched_main.c | 23 
> > > > > > > > > > > > +--
> > > > > > > > > > > >   1 file changed, 5 insertions(+), 18 deletions(-)
> > > > > > > > > > > > 
> > > > > > > > > > > > diff --git a/drivers/gpu/drm/scheduler/sched_main.c 
> > > > > > > > > > > > b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > > > index a2a953693b45..f9b9b3aefc4a 100644
> > > > > > > > > > > > --- a/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > > > +++ b/drivers/gpu/drm/scheduler/sched_main.c
> > > > > > > > > > > > @@ -314,6 +314,7 @@ static void 
> > > > > > > > > > > > drm_sched_job_timedout(struct work_struct *work)
> > > > > > > > > > > >   {
> > > > > > > > > > > >  struct drm_gpu_scheduler *sched;
> > > > > > > > > > > >  struct drm_sched_job *job;
> > > > > > > > > > > > +   struct dma_fence *fence;
> > > > > > > > > > > >  enum drm_gpu_sched_stat status = 
> > > > > > > > > > > > DRM_GPU_SCHED_STAT_NOMINAL;
> > > > > > > > > > > > 
> > > > > > > > > > > >  sched = container_of(work, struct 
> > > > > > > > > > > > drm_gpu_scheduler, work_tdr.work);
> > > > > > > > > > > > @@ -325,11 +326,10 @@ static void 
> > > > > > > > > > > > drm_sched_job_timedout(struct work_struct *work)
> > > > > > > > > > > > 
> > > > > > > > > > > >  if (job) {
> > > > > > > > > > > >  /*
> > > > > > > > > > > > -* Remove the bad job so it cannot be 
> > > > > > > > > > > > freed by concurrent
> > > > > > > > > > > > -* drm_sched_cleanup_jobs. It will be 
> > > > > > > > > > > > reinserted back after sched->thread
> > > > > > > > > > > > -* is parked at which point it's safe.
> > > > > > > > > > > > +* Get job->s_fence->parent here to 
> > > > > > > > > > > > avoid concurrent delete during
> > > > > > > > > > > > +* processing timedout_job
> > > > > > > > > > > >   */
> > > > > > > > > > > > -   list_del_init(&job->list);
> > > > > > > > > > > > +   fence = 
> > > > > > > > > > > > dma_fence_get(job->s_fence->parent);
> > > > > > > > > > While this is true for amdgpu, it has no meaning for other 
> > > > > > > > > > drivers for whom
> > > > > > > > > > we haven't
> > > > > > > > > > done the refactoring of embedding HW fence (parent) into 
> > > > > > > > > > the job structure.
> > > > > > > > > > In fact thinking
> > > > > > > > > > about it, unless you do the HW fence embedding for all the 
> > > > > > > > > > drivers using the
> > > > > > > > > > scheduler you cannot
> > > > > > > > > > revert this patch or you will just break them.
> > > > > > > > > btw, why did you do that embedding? I do still have my 
> > > > > > > > 

[PATCH v2 1/2] drm/amdgpu: use IS_ERR for debugfs APIs

2021-09-02 Thread Nirmoy Das
debugfs APIs return an encoded error, so use
IS_ERR for checking the return value.

v2: return PTR_ERR(ent)

References: https://gitlab.freedesktop.org/drm/amd/-/issues/1686
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 ++++------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c|  4 ++--
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index d256215ab2c7..60f46a4b0144 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1696,20 +1696,18 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
struct dentry *ent;
int r, i;

-
-
ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
				  &fops_ib_preempt);
-   if (!ent) {
+   if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs file\n");
-   return -EIO;
+   return PTR_ERR(ent);
}

ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, adev,
				  &fops_sclk_set);
-   if (!ent) {
+   if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_set_sclk debugsfs file\n");
-   return -EIO;
+   return PTR_ERR(ent);
}

/* Register debugfs entries for amdgpu_ttm */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 7b634a1517f9..0554576d3695 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -428,8 +428,8 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
ent = debugfs_create_file(name,
  S_IFREG | S_IRUGO, root,
				  ring, &amdgpu_debugfs_ring_fops);
-   if (!ent)
-   return -ENOMEM;
+   if (IS_ERR(ent))
+   return PTR_ERR(ent);

i_size_write(ent->d_inode, ring->ring_size + 12);
ring->ent = ent;
--
2.32.0
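
For anyone wondering why the old !ent checks could never fire: in modern
kernels, debugfs creation failures are encoded in the returned pointer itself
rather than signalled with NULL. A small sketch of the pattern (the file name
and wrapper are made up for illustration):

#include <linux/debugfs.h>
#include <linux/err.h>

/* Hypothetical wrapper showing the IS_ERR/PTR_ERR pattern this patch
 * switches to. debugfs_create_file() returns an ERR_PTR-encoded errno
 * on failure (e.g. when debugfs is disabled), never NULL, so a NULL
 * check silently passes broken dentries through.
 */
static int create_debug_file_sketch(struct dentry *root, void *data,
				    const struct file_operations *fops)
{
	struct dentry *ent;

	ent = debugfs_create_file("example_file", 0400, root, data, fops);
	if (IS_ERR(ent))
		return PTR_ERR(ent);	/* decode to a negative errno */

	return 0;
}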



[PATCH 2/2] drm/amdgpu: cleanup debugfs for amdgpu rings

2021-09-02 Thread Nirmoy Das
Use the debugfs_create_file_size API to create the ring debugfs
file; also clean up the surrounding code.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 18 ++++++------------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    | 10 ++--------
 3 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 60f46a4b0144..97d88f3e1c4c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1734,9 +1734,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
if (!ring)
continue;
 
-   if (amdgpu_debugfs_ring_init(adev, ring)) {
-			DRM_ERROR("Failed to register debugfs file for rings !\n");
-   }
+   amdgpu_debugfs_ring_init(adev, ring);
}
 
amdgpu_ras_debugfs_create_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 0554576d3695..ab2351ba9574 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -415,26 +415,20 @@ static const struct file_operations amdgpu_debugfs_ring_fops = {
 
 #endif
 
-int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
-struct amdgpu_ring *ring)
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+ struct amdgpu_ring *ring)
 {
 #if defined(CONFIG_DEBUG_FS)
struct drm_minor *minor = adev_to_drm(adev)->primary;
-   struct dentry *ent, *root = minor->debugfs_root;
+   struct dentry *root = minor->debugfs_root;
char name[32];
 
sprintf(name, "amdgpu_ring_%s", ring->name);
+   debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, ring,
+				 &amdgpu_debugfs_ring_fops,
+ring->ring_size + 12);
 
-   ent = debugfs_create_file(name,
- S_IFREG | S_IRUGO, root,
-			  ring, &amdgpu_debugfs_ring_fops);
-   if (IS_ERR(ent))
-   return PTR_ERR(ent);
-
-   i_size_write(ent->d_inode, ring->ring_size + 12);
-   ring->ent = ent;
 #endif
-   return 0;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 88d80eb3fea1..4d380e79752c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,10 +253,6 @@ struct amdgpu_ring {
boolhas_compute_vm_bug;
boolno_scheduler;
int hw_prio;
-
-#if defined(CONFIG_DEBUG_FS)
-   struct dentry *ent;
-#endif
 };
 
 #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
@@ -356,8 +352,6 @@ static inline void amdgpu_ring_write_multiple(struct amdgpu_ring *ring,
 
 int amdgpu_ring_test_helper(struct amdgpu_ring *ring);
 
-int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
-struct amdgpu_ring *ring);
-void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
-
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+ struct amdgpu_ring *ring);
 #endif
-- 
2.32.0
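
The API being switched to here folds the i_size_write() into file creation and
returns void, which is what lets both the error check and the cached ring->ent
dentry go away. A hedged usage sketch (names are illustrative):

#include <linux/debugfs.h>

/* Illustrative caller: debugfs_create_file_size() sets the i_size at
 * creation time and returns void, so there is no dentry to check or
 * store. Ignoring debugfs failures is the recommended style.
 */
static void ring_debugfs_sketch(struct dentry *root, void *ring,
				const struct file_operations *fops,
				loff_t ring_size)
{
	debugfs_create_file_size("example_ring", 0444, root, ring, fops,
				 ring_size + 12);	/* size as in the patch */
}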



Re: [diagnostic TDR mode patches] unify our solution opinions/suggestions in one thread

2021-09-02 Thread Alex Deucher
On Thu, Sep 2, 2021 at 1:52 AM Liu, Monk  wrote:
>
> [AMD Official Use Only]
>
> >>>
> I'm not sure I can add much to help this along, I'm sure Alex has some 
> internal training,
> Once your driver is upstream, it belongs to upstream, you can maintain it, 
> but you no longer control it 100%, it's a tradeoff, it's not one companies 
> always understand.
> Usually people are fine developing away internally, but once interaction with 
> other parts of the kernel/subsystem is required they have the realisation 
> that they needed to work upstream 6 months earlier.
> The best time to interact with upstream was 6 months ago, the second best 
> time is now.
> <<<
>
> Daniel/AlexD
>
> I didn't mean your changes on AMD driver need my personal approval or review 
> ... and  I'm totally already get used that our driver is not 100% under 
> control by AMDers,
> but supposedly any one from community (including you) who tend to change 
> AMD's driver need at least to get approvement from someone in AMD, e.g.: 
> AlexD or Christian, doesn't that reasonable?
> just like we need your approve if we try to modify DRM-sched, or need 
> panfrost's approval if we need to change panfrost code ...
>
> by only CC AMD's engineers looks not quite properly, how do you know if your 
> changes (on AMD code part) are conflicting with AMD's on-going internal 
> features/refactoring or not ?
>

We keep as up to date as possible with upstream.  I don't have the
bandwidth to verify every patch, but in most cases I try and look at
them.  In your first example, the patch basically just adds a new
parameter to some common functions.  Drivers that don't need that
parameter don't use it.  It shouldn't really affect the functionality.
There are lots of changes that touch our driver that we are largely
not aware of.  E.g., APIs that we may use may change the function
signatures with no intended functional changes.  If problems are found
they are reported and resolved.  It is a collective effort.  If there
are changes that would conflict with stuff we are doing in our tree we
should bring them up when the relevant patches are being discussed.
We can also make changes to core functionality like scheduler, ttm,
etc. that would affect other drivers.  When we send out the patches we
cc the relevant maintainers, but ultimately the ones who participate
in the discussion set the direction.  That's why participation is
important.

Alex


> Thanks
>
> --
> Monk Liu | Cloud-GPU Core team
> --
>
> -Original Message-
> From: Dave Airlie 
> Sent: Thursday, September 2, 2021 2:51 AM
> To: Alex Deucher 
> Cc: Liu, Monk ; Daniel Vetter ; Koenig, 
> Christian ; Grodzovsky, Andrey 
> ; Chen, JingWen ; DRI 
> Development ; amd-gfx@lists.freedesktop.org
> Subject: Re: [diagnostic TDR mode patches] unify our solution 
> opinions/suggestions in one thread
>
> On Thu, 2 Sept 2021 at 01:20, Alex Deucher  wrote:
> >
> > On Wed, Sep 1, 2021 at 6:19 AM Liu, Monk  wrote:
> > >
> > > [AMD Official Use Only]
> > >
> > > Daniel
> > >
> > > From the link you share it looks you(or someone else) have quite a bunch 
> > > patches that changes DRM_SCHED or even amdgpu, by that case before they 
> > > are merged to kernel tree I'm wondering if any AMD develop reviewed them ?
> > >
> > > They looks to me somehow conflicting with what we changed in our repo
> > >
> > > It is really a chaos for AMDer if someone else out side of AMD changes 
> > > our kernel driver (or/and scheduler) without reviewed by AMDer, just like 
> > > we are requiring your review if we tend to change scheduler's logic here 
> > > 
> > >
> > > This one changes AMD's code:
> > > https://lore.kernel.org/dri-devel/20210625133327.2598825-2-boris.brezillon@collabora.com/
> > > And I didn't see any reviewed-by from AMDers ...
> > >
> > > This one also touches AMD's code:
> > > https://lore.kernel.org/dri-devel/20200604081224.863494-12-daniel.vetter@ffwll.ch/
> > > Which is conflicting with one patch we submitted (in our repo
> > > rightnow), and neither see AMDder gave a review-by on this one (let
> > > me know if I missed it)
> > >
> 

Re: [PATCH v1 03/14] mm: add iomem vma selection for memory migration

2021-09-02 Thread Christoph Hellwig
On Wed, Sep 01, 2021 at 11:40:43AM -0400, Felix Kuehling wrote:
> >>> It looks like I'm totally misunderstanding what you are adding here
> >>> then.  Why do we need any special treatment at all for memory that
> >>> has normal struct pages and is part of the direct kernel map?
> >> The pages are like normal memory for purposes of mapping them in CPU
> >> page tables and for coherent access from the CPU.
> > That's the user page tables.  What about the kernel direct map?
> > If there is a normal kernel struct page backing there really should
> > be no need for the pgmap.
> 
> I'm not sure. The physical address ranges are in the UEFI system address
> map as special-purpose memory. Does Linux create the struct pages and
> kernel direct map for that without a pgmap call? I didn't see that last
> time I went digging through that code.

So doing some googling finds a patch from Dan that claims to hand EFI
special purpose memory to the device dax driver.  But when I try to
follow the version that got merged, it looks like it is treated simply as an
MMIO region to be claimed by drivers, which would not get a struct page.

Dan, did I misunderstand how E820_TYPE_SOFT_RESERVED works?

> >> From an application
> >> perspective, we want file-backed and anonymous mappings to be able to
> >> use DEVICE_PUBLIC pages with coherent CPU access. The goal is to
> >> optimize performance for GPU heavy workloads while minimizing the need
> >> to migrate data back-and-forth between system memory and device memory.
> > I don't really understand that part.  file backed pages are always
> > allocated by the file system using the pagecache helpers, that is
> > using the page allocator.  Anonymouns memory also always comes from
> > the page allocator.
> 
> I'm coming at this from my experience with DEVICE_PRIVATE. Both
> anonymous and file-backed pages should be migrateable to DEVICE_PRIVATE
> memory by the migrate_vma_* helpers for more efficient access by our
> GPU. (*) It's part of the basic premise of HMM as I understand it. I
> would expect the same thing to work for DEVICE_PUBLIC memory.

Ok, so you want to migrate to and from them.  Not use DEVICE_PUBLIC
for the actual page cache pages.  That makes a lot more sense.

> I see DEVICE_PUBLIC as an improved version of DEVICE_PRIVATE that allows
> the CPU to map the device memory coherently to minimize the need for
> migrations when CPU and GPU access the same memory concurrently or
> alternatingly. But we're not going as far as putting that memory
> entirely under the management of the Linux memory manager and VM
> subsystem. Our (and HPE's) system architects decided that this memory is
> not suitable to be used like regular NUMA system memory by the Linux
> memory manager.

So yes.  It is a Memory Mapped I/O region, which unlike the PCIe BARs
that people typically deal with is fully cache coherent.  I think this
does make more sense as a description.

But to go back to what started this discussion:  If these are memory
mapped I/O, pfn_valid should generally not return true for them.

And as you already pointed out in reply to Alex we need to tighten the
selection criteria one way or another.
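
For readers following the pfn_valid/struct page question: getting struct pages
for a driver-managed range without handing it to the page allocator is
normally done by registering a dev_pagemap. A rough sketch under stated
assumptions - MEMORY_DEVICE_GENERIC is the type device-dax uses today, while
the DEVICE_PUBLIC type debated in this thread was still only a proposal:

#include <linux/device.h>
#include <linux/err.h>
#include <linux/memremap.h>

/* Hypothetical registration of a coherent, driver-managed physical
 * range so that struct pages (and hence pfn_valid()) exist for it,
 * without the range ever reaching the page allocator.
 */
static void *register_coherent_range_sketch(struct device *dev,
					    u64 start, u64 size)
{
	struct dev_pagemap *pgmap;

	pgmap = devm_kzalloc(dev, sizeof(*pgmap), GFP_KERNEL);
	if (!pgmap)
		return ERR_PTR(-ENOMEM);

	pgmap->type = MEMORY_DEVICE_GENERIC;
	pgmap->range.start = start;
	pgmap->range.end = start + size - 1;
	pgmap->nr_range = 1;

	/* Creates the struct pages and a kernel mapping for the range. */
	return devm_memremap_pages(dev, pgmap);
}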


[PATCH 2/2] drm/amdgpu: cleanup debugfs for amdgpu rings

2021-09-02 Thread Nirmoy Das
Use the debugfs_create_file_size API to create the ring debugfs
file; also clean up the surrounding code.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 16 +++++-----------
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  8 +-------
 3 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 077f9baf74fe..dee56ab19a8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1734,9 +1734,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
if (!ring)
continue;
 
-   if (amdgpu_debugfs_ring_init(adev, ring)) {
-			DRM_ERROR("Failed to register debugfs file for rings !\n");
-   }
+   amdgpu_debugfs_ring_init(adev, ring);
}
 
amdgpu_ras_debugfs_create_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index f40753e1a60d..968521d80514 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -415,26 +415,20 @@ static const struct file_operations amdgpu_debugfs_ring_fops = {
 
 #endif
 
-int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
 struct amdgpu_ring *ring)
 {
 #if defined(CONFIG_DEBUG_FS)
struct drm_minor *minor = adev_to_drm(adev)->primary;
-   struct dentry *ent, *root = minor->debugfs_root;
+   struct dentry *root = minor->debugfs_root;
char name[32];
 
sprintf(name, "amdgpu_ring_%s", ring->name);
 
-   ent = debugfs_create_file(name,
- S_IFREG | S_IRUGO, root,
-			  ring, &amdgpu_debugfs_ring_fops);
-   if (IS_ERR(ent))
-   return -ENOMEM;
-
-   i_size_write(ent->d_inode, ring->ring_size + 12);
-   ring->ent = ent;
+   debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, ring,
+				 &amdgpu_debugfs_ring_fops,
+ring->ring_size + 12);
 #endif
-   return 0;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 88d80eb3fea1..c29fbce0a5b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,10 +253,6 @@ struct amdgpu_ring {
boolhas_compute_vm_bug;
boolno_scheduler;
int hw_prio;
-
-#if defined(CONFIG_DEBUG_FS)
-   struct dentry *ent;
-#endif
 };
 
 #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
@@ -356,8 +352,6 @@ static inline void amdgpu_ring_write_multiple(struct amdgpu_ring *ring,
 
 int amdgpu_ring_test_helper(struct amdgpu_ring *ring);
 
-int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
 struct amdgpu_ring *ring);
-void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
-
 #endif
-- 
2.32.0



[PATCH 1/2] drm/amdgpu: use IS_ERR for debugfs APIs

2021-09-02 Thread Nirmoy Das
debugfs APIs return an encoded error, so use
IS_ERR for checking the return value.

References: https://gitlab.freedesktop.org/drm/amd/-/issues/1686
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 6 ++----
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index d256215ab2c7..077f9baf74fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1696,18 +1696,16 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
struct dentry *ent;
int r, i;
 
-
-
ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
				  &fops_ib_preempt);
-   if (!ent) {
+   if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs file\n");
return -EIO;
}
 
ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, adev,
				  &fops_sclk_set);
-   if (!ent) {
+   if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_set_sclk debugsfs file\n");
return -EIO;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 7b634a1517f9..f40753e1a60d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -428,7 +428,7 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
ent = debugfs_create_file(name,
  S_IFREG | S_IRUGO, root,
				  ring, &amdgpu_debugfs_ring_fops);
-   if (!ent)
+   if (IS_ERR(ent))
return -ENOMEM;
 
i_size_write(ent->d_inode, ring->ring_size + 12);
-- 
2.32.0



Re: [diagnostic TDR mode patches] unify our solution opinions/suggestions in one thread

2021-09-02 Thread Christian König

Hi Monk,

Am 02.09.21 um 07:52 schrieb Liu, Monk:

[AMD Official Use Only]

I'm not sure I can add much to help this along, I'm sure Alex has some internal 
training,
Once your driver is upstream, it belongs to upstream, you can maintain it, but 
you no longer control it 100%, it's a tradeoff, it's not one companies always 
understand.
Usually people are fine developing away internally, but once interaction with 
other parts of the kernel/subsystem is required they have the realisation that 
they needed to work upstream 6 months earlier.
The best time to interact with upstream was 6 months ago, the second best time 
is now.
<<<

Daniel/AlexD

I didn't mean your changes on AMD driver need my personal approval or review 
... and  I'm totally already get used that our driver is not 100% under control 
by AMDers,
but supposedly any one from community (including you) who tend to change AMD's 
driver need at least to get approvement from someone in AMD, e.g.: AlexD or 
Christian, doesn't that reasonable?


I fear I'm just repeating what Alex said, but to make it clear
once more: that is *not* necessary!


The shared repository is owned by upstream maintainers and they are 
usually free to do restructuring work without getting acknowledge from 
every single driver maintainer.


Anybody can of course technically object to upstream design decisions, 
but that means that you need to pay attention to the mailing lists in 
the first place.



just like we need your approve if we try to modify DRM-sched, or need 
panfrost's approval if we need to change panfrost code ...

by only CC AMD's engineers looks not quite properly, how do you know if your 
changes (on AMD code part) are conflicting with AMD's on-going internal 
features/refactoring or not ?


Well because AMD is supposed to work in public as much as possible and 
ask upstream before doing changes to the code base.


Additional to that design decisions are supposed to be discussed on the 
mailing list and *not* internally.


Regards,
Christian.



Thanks

--
Monk Liu | Cloud-GPU Core team
--

-Original Message-
From: Dave Airlie 
Sent: Thursday, September 2, 2021 2:51 AM
To: Alex Deucher 
Cc: Liu, Monk ; Daniel Vetter ; Koenig, Christian 
; Grodzovsky, Andrey ; Chen, JingWen 
; DRI Development ; 
amd-gfx@lists.freedesktop.org
Subject: Re: [diagnostic TDR mode patches] unify our solution 
opinions/suggestions in one thread

On Thu, 2 Sept 2021 at 01:20, Alex Deucher  wrote:

On Wed, Sep 1, 2021 at 6:19 AM Liu, Monk  wrote:

[AMD Official Use Only]

Daniel

 From the link you share it looks you(or someone else) have quite a bunch 
patches that changes DRM_SCHED or even amdgpu, by that case before they are 
merged to kernel tree I'm wondering if any AMD develop reviewed them ?

They looks to me somehow conflicting with what we changed in our repo

It is really a chaos for AMDer if someone else out side of AMD changes our 
kernel driver (or/and scheduler) without reviewed by AMDer, just like we are 
requiring your review if we tend to change scheduler's logic here 

This one changes AMD's code:
https://lore.kernel.org/dri-devel/20210625133327.2598825-2-boris.brezillon@collabora.com/
And I didn't see any reviewed-by from AMDers ...

This one also touches AMD's code:
https://lore.kernel.org/dri-devel/20200604081224.863494-12-daniel.vetter@ffwll.ch/
Which is conflicting with one patch we submitted (in our repo
rightnow), and neither see AMDder gave a review-by on this one (let
me know if I missed it)


Monk, this is not how upstream works.  You need to participate.
That's how communities work.  There's a reason all these discussions
happen on public mailing lists.  The patch author can't be expected to
know every person on every vendor team to CC with a patch.  If you
have concerns, you need to raise them when the patches are being
discussed.


I'm not sure I can add much to help this along, I'm sure Alex has some internal 
training,

Once your driver is upstream, it belongs to upstream, you can maintain it, but 
you no longer control it 100%, it's a tradeoff, it's not one companies always 
understand.

Usually people are fine developing away 

[PATCH] drm/amd/amdgpu: Increase HWIP_MAX_INSTANCE to 10

2021-09-02 Thread Ernst Sjöstrand
Seems like newer cards can have even more instances now.
Found by UBSAN: array-index-out-of-bounds in
drivers/gpu/drm/amd/amdgpu/amdgpu_discovery.c:318:29
index 8 is out of range for type 'uint32_t *[8]'

Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1697
Cc: sta...@vger.kernel.org
Signed-off-by: Ernst Sjöstrand 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index dc3c6b3a00e5..d356e329e6f8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -758,7 +758,7 @@ enum amd_hw_ip_block_type {
MAX_HWIP
 };
 
-#define HWIP_MAX_INSTANCE  8
+#define HWIP_MAX_INSTANCE  10
 
 struct amd_powerplay {
void *pp_handle;
-- 
2.30.2