Re: [PATCH 2/2] drm_print: Remove deprecated DRM_DEBUG_KMS_RATELIMITED()

2023-01-18 Thread Das, Nirmoy



On 1/18/2023 7:27 AM, Christian König wrote:



Am 17.01.23 um 19:12 schrieb Das, Nirmoy:

Hi Alex,

On 1/17/2023 7:06 PM, Alex Deucher wrote:
On Tue, Jan 17, 2023 at 1:05 PM Nirmoy Das  
wrote:

There are no current users of DRM_DEBUG_KMS_RATELIMITED()
so remove it.

Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: Thomas Zimmermann 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Sam Ravnborg 

Signed-off-by: Nirmoy Das 
Reviewed-by: Sam Ravnborg 

Series is:
Reviewed-by: Alex Deucher 

Feel free to take the patches through whatever tree you want.



Please help me with this, I don't have committer rights for any tree.


Going to push that into drm-misc-next later today.



Thanks, Christian.




Thanks,
Christian.




Nirmoy




Alex


---
  include/drm/drm_print.h | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index a44fb7ef257f..c3753da97c4e 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -605,9 +605,6 @@ void __drm_err(const char *format, ...);
  #define drm_dbg_kms_ratelimited(drm, fmt, ...) \
 __DRM_DEFINE_DBG_RATELIMITED(KMS, drm, fmt, ## __VA_ARGS__)

-/* NOTE: this is deprecated in favor of 
drm_dbg_kms_ratelimited(NULL, ...). */
-#define DRM_DEBUG_KMS_RATELIMITED(fmt, ...) 
drm_dbg_kms_ratelimited(NULL, fmt, ## __VA_ARGS__)

-
  /*
   * struct drm_device based WARNs
   *
--
2.39.0





Re: [PATCH 2/2] drm_print: Remove deprecated DRM_DEBUG_KMS_RATELIMITED()

2023-01-17 Thread Das, Nirmoy

Hi Alex,

On 1/17/2023 7:06 PM, Alex Deucher wrote:

On Tue, Jan 17, 2023 at 1:05 PM Nirmoy Das  wrote:

There are no current users of DRM_DEBUG_KMS_RATELIMITED()
so remove it.

Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: Thomas Zimmermann 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Sam Ravnborg 

Signed-off-by: Nirmoy Das 
Reviewed-by: Sam Ravnborg 

Series is:
Reviewed-by: Alex Deucher 

Feel free to take the patches through whatever tree you want.



Please help me with this, I don't have committer rights for any tree.


Nirmoy




Alex


---
  include/drm/drm_print.h | 3 ---
  1 file changed, 3 deletions(-)

diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index a44fb7ef257f..c3753da97c4e 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -605,9 +605,6 @@ void __drm_err(const char *format, ...);
  #define drm_dbg_kms_ratelimited(drm, fmt, ...) \
 __DRM_DEFINE_DBG_RATELIMITED(KMS, drm, fmt, ## __VA_ARGS__)

-/* NOTE: this is deprecated in favor of drm_dbg_kms_ratelimited(NULL, ...). */
-#define DRM_DEBUG_KMS_RATELIMITED(fmt, ...) drm_dbg_kms_ratelimited(NULL, fmt, 
## __VA_ARGS__)
-
  /*
   * struct drm_device based WARNs
   *
--
2.39.0



[PATCH v2] drm/radeon: Do not use deprecated drm log API

2023-01-17 Thread Nirmoy Das
Replace deprecated DRM_DEBUG_KMS_RATELIMITED() and DRM_ERROR()
with proper APIs.

v2: replace pr_err with dev_err(Alex).

Cc: Alex Deucher 
Cc: Christian König 

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/radeon/radeon_dp_auxch.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_dp_auxch.c 
b/drivers/gpu/drm/radeon/radeon_dp_auxch.c
index 69379b95146e..1e5b6baf76a1 100644
--- a/drivers/gpu/drm/radeon/radeon_dp_auxch.c
+++ b/drivers/gpu/drm/radeon/radeon_dp_auxch.c
@@ -158,7 +158,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, 
struct drm_dp_aux_msg *msg
} while (retry_count++ < 1000);
 
if (retry_count >= 1000) {
-   DRM_ERROR("auxch hw never signalled completion, error %08x\n", 
tmp);
+   dev_err(rdev->dev, "auxch hw never signalled completion, error 
%08x\n", tmp);
ret = -EIO;
goto done;
}
@@ -168,8 +168,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, 
struct drm_dp_aux_msg *msg
goto done;
}
if (tmp & AUX_RX_ERROR_FLAGS) {
-   DRM_DEBUG_KMS_RATELIMITED("dp_aux_ch flags not zero: %08x\n",
- tmp);
+   drm_dbg_kms_ratelimited(dev, "dp_aux_ch flags not zero: 
%08x\n", tmp);
ret = -EIO;
goto done;
}
-- 
2.39.0



[PATCH 2/2] drm_print: Remove deprecated DRM_DEBUG_KMS_RATELIMITED()

2023-01-17 Thread Nirmoy Das
There are no current users of DRM_DEBUG_KMS_RATELIMITED()
so remove it.

Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: Thomas Zimmermann 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Sam Ravnborg 

Signed-off-by: Nirmoy Das 
Reviewed-by: Sam Ravnborg 
---
 include/drm/drm_print.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index a44fb7ef257f..c3753da97c4e 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -605,9 +605,6 @@ void __drm_err(const char *format, ...);
 #define drm_dbg_kms_ratelimited(drm, fmt, ...) \
__DRM_DEFINE_DBG_RATELIMITED(KMS, drm, fmt, ## __VA_ARGS__)
 
-/* NOTE: this is deprecated in favor of drm_dbg_kms_ratelimited(NULL, ...). */
-#define DRM_DEBUG_KMS_RATELIMITED(fmt, ...) drm_dbg_kms_ratelimited(NULL, fmt, 
## __VA_ARGS__)
-
 /*
  * struct drm_device based WARNs
  *
-- 
2.39.0



Re: [PATCH 1/2] drm/radeon: Do not use deprecated drm log API

2023-01-17 Thread Das, Nirmoy



On 1/17/2023 6:48 PM, Alex Deucher wrote:

On Tue, Jan 17, 2023 at 12:45 PM Nirmoy Das  wrote:

Replace deprecated DRM_DEBUG_KMS_RATELIMITED() and DRM_ERROR()
with proper APIs.

Cc: Alex Deucher 
Cc: Christian König 

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/radeon/radeon_dp_auxch.c | 5 ++---
  1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_dp_auxch.c 
b/drivers/gpu/drm/radeon/radeon_dp_auxch.c
index 69379b95146e..76ce66efb5f8 100644
--- a/drivers/gpu/drm/radeon/radeon_dp_auxch.c
+++ b/drivers/gpu/drm/radeon/radeon_dp_auxch.c
@@ -158,7 +158,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, 
struct drm_dp_aux_msg *msg
 } while (retry_count++ < 1000);

 if (retry_count >= 1000) {
-   DRM_ERROR("auxch hw never signalled completion, error %08x\n", 
tmp);
+   pr_err("auxch hw never signalled completion, error %08x\n", 
tmp);

Please use dev_err() instead so we get device identification on error
messages.  Makes it much easier when you have multiple GPUs in a
system.



Thanks for your quick review, Alex. I will resend with dev_err().


Nirmoy



Alex


 ret = -EIO;
 goto done;
 }
@@ -168,8 +168,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, 
struct drm_dp_aux_msg *msg
 goto done;
 }
 if (tmp & AUX_RX_ERROR_FLAGS) {
-   DRM_DEBUG_KMS_RATELIMITED("dp_aux_ch flags not zero: %08x\n",
- tmp);
+   drm_dbg_kms_ratelimited(dev, "dp_aux_ch flags not zero: 
%08x\n", tmp);
 ret = -EIO;
 goto done;
 }
--
2.39.0



[PATCH 1/2] drm/radeon: Do not use deprecated drm log API

2023-01-17 Thread Nirmoy Das
Replace deprecated DRM_DEBUG_KMS_RATELIMITED() and DRM_ERROR()
with proper APIs.

Cc: Alex Deucher 
Cc: Christian König 

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/radeon/radeon_dp_auxch.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_dp_auxch.c 
b/drivers/gpu/drm/radeon/radeon_dp_auxch.c
index 69379b95146e..76ce66efb5f8 100644
--- a/drivers/gpu/drm/radeon/radeon_dp_auxch.c
+++ b/drivers/gpu/drm/radeon/radeon_dp_auxch.c
@@ -158,7 +158,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, 
struct drm_dp_aux_msg *msg
} while (retry_count++ < 1000);
 
if (retry_count >= 1000) {
-   DRM_ERROR("auxch hw never signalled completion, error %08x\n", 
tmp);
+   pr_err("auxch hw never signalled completion, error %08x\n", 
tmp);
ret = -EIO;
goto done;
}
@@ -168,8 +168,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, 
struct drm_dp_aux_msg *msg
goto done;
}
if (tmp & AUX_RX_ERROR_FLAGS) {
-   DRM_DEBUG_KMS_RATELIMITED("dp_aux_ch flags not zero: %08x\n",
- tmp);
+   drm_dbg_kms_ratelimited(dev, "dp_aux_ch flags not zero: 
%08x\n", tmp);
ret = -EIO;
goto done;
}
-- 
2.39.0



[PATCH 2/2] drm_print: Remove deprecated DRM_DEBUG_KMS_RATELIMITED()

2023-01-17 Thread Nirmoy Das
There are no current users of DRM_DEBUG_KMS_RATELIMITED()
so remove it.

Cc: Maarten Lankhorst 
Cc: Maxime Ripard 
Cc: Thomas Zimmermann 
Cc: David Airlie 
Cc: Daniel Vetter 
Cc: Sam Ravnborg 

Signed-off-by: Nirmoy Das 
---
 include/drm/drm_print.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/drm/drm_print.h b/include/drm/drm_print.h
index a44fb7ef257f..c3753da97c4e 100644
--- a/include/drm/drm_print.h
+++ b/include/drm/drm_print.h
@@ -605,9 +605,6 @@ void __drm_err(const char *format, ...);
 #define drm_dbg_kms_ratelimited(drm, fmt, ...) \
__DRM_DEFINE_DBG_RATELIMITED(KMS, drm, fmt, ## __VA_ARGS__)
 
-/* NOTE: this is deprecated in favor of drm_dbg_kms_ratelimited(NULL, ...). */
-#define DRM_DEBUG_KMS_RATELIMITED(fmt, ...) drm_dbg_kms_ratelimited(NULL, fmt, 
## __VA_ARGS__)
-
 /*
  * struct drm_device based WARNs
  *
-- 
2.39.0



Re: amd-staging-drm-next breaks suspend

2022-01-19 Thread Das, Nirmoy



On 1/19/2022 10:59 PM, Limonciello, Mario wrote:

[Public]


-Original Message-
From: Bert Karwatzki 
Sent: Wednesday, January 19, 2022 15:52
To: amd-gfx@lists.freedesktop.org
Cc: Limonciello, Mario ; Kazlauskas, Nicholas
; Zhuo, Qingqing (Lillian)
; Scott Bruce ; Alex Deucher
; Chris Hixon 
Subject: amd-staging-drm-next breaks suspend

I just tested drm-staging-drm-next with HEAD
f1b2924ee6929cb431440e6f961f06eb65d52beb:
Going into suspend leads to a hang again:
This is probably caused by
[ 1.310551] trying to bind memory to uninitialized GART !
and/or
[ 3.976438] trying to bind memory to uninitialized GART !



Could you please also try https://patchwork.freedesktop.org/patch/469907/ ?


Regards,

Nirmoy





+@Das, Nirmoy

The only thing that touched that file recently was
72f686438de13f121c52f58d7445570a33dfdc61

Could you see if backing that out helps?


Here's the complete dmesg:
[ 0.00] Linux version 5.13.0+ (bert@lisa) (gcc (Debian 11.2.0-14)
11.2.0, GNU ld (GNU Binutils for Debian) 2.37.50.20220106) #4 SMP Wed
Jan 19 22:19:19 CET 2022
[ 0.00] Command line: BOOT_IMAGE=/boot/vmlinuz-5.13.0+
root=UUID=78dcbf14-902d-49c0-9d4d-b7ad84550d9a ro
mt7921e.disable_aspm=1 quiet
[ 0.00] x86/fpu: Supporting XSAVE feature 0x001: 'x87 floating
point registers'
[ 0.00] x86/fpu: Supporting XSAVE feature 0x002: 'SSE registers'
[ 0.00] x86/fpu: Supporting XSAVE feature 0x004: 'AVX registers'
[ 0.00] x86/fpu: Supporting XSAVE feature 0x200: 'Protection Keys
User registers'
[ 0.00] x86/fpu: xstate_offset[2]: 576, xstate_sizes[2]: 256
[ 0.00] x86/fpu: xstate_offset[9]: 832, xstate_sizes[9]: 8
[ 0.00] x86/fpu: Enabled xstate features 0x207, context size is 840
bytes, using 'compacted' format.
[ 0.00] BIOS-provided physical RAM map:
[ 0.00] BIOS-e820: [mem 0x-0x0009]
usable
[ 0.00] BIOS-e820: [mem 0x000a-0x000f]
reserved
[ 0.00] BIOS-e820: [mem 0x0010-0x09bfefff]
usable
[ 0.00] BIOS-e820: [mem 0x09bff000-0x0a000fff]
reserved
[ 0.00] BIOS-e820: [mem 0x0a001000-0x0a1f]
usable
[ 0.00] BIOS-e820: [mem 0x0a20-0x0a20efff] ACPI
NVS
[ 0.00] BIOS-e820: [mem 0x0a20f000-0xe9e1]
usable
[ 0.00] BIOS-e820: [mem 0xe9e2-0xeb33efff]
reserved
[ 0.00] BIOS-e820: [mem 0xeb33f000-0xeb39efff] ACPI
data
[ 0.00] BIOS-e820: [mem 0xeb39f000-0xeb556fff] ACPI
NVS
[ 0.00] BIOS-e820: [mem 0xeb557000-0xed17cfff]
reserved
[ 0.00] BIOS-e820: [mem 0xed17d000-0xed1fefff] type
20
[ 0.00] BIOS-e820: [mem 0xed1ff000-0xedff]
usable
[ 0.00] BIOS-e820: [mem 0xee00-0xf7ff]
reserved
[ 0.00] BIOS-e820: [mem 0xfd00-0xfdff]
reserved
[ 0.00] BIOS-e820: [mem 0xfeb8-0xfec01fff]
reserved
[ 0.00] BIOS-e820: [mem 0xfec1-0xfec10fff]
reserved
[ 0.00] BIOS-e820: [mem 0xfed0-0xfed00fff]
reserved
[ 0.00] BIOS-e820: [mem 0xfed4-0xfed44fff]
reserved
[ 0.00] BIOS-e820: [mem 0xfed8-0xfed8]
reserved
[ 0.00] BIOS-e820: [mem 0xfedc4000-0xfedc9fff]
reserved
[ 0.00] BIOS-e820: [mem 0xfedcc000-0xfedcefff]
reserved
[ 0.00] BIOS-e820: [mem 0xfedd5000-0xfedd5fff]
reserved
[ 0.00] BIOS-e820: [mem 0xff00-0x]
reserved
[ 0.00] BIOS-e820: [mem 0x0001-0x0003ee2f]
usable
[ 0.00] BIOS-e820: [mem 0x0003ee30-0x00040fff]
reserved
[ 0.00] NX (Execute Disable) protection: active
[ 0.00] efi: EFI v2.70 by American Megatrends
[ 0.00] efi: ACPI=0xeb54 ACPI 2.0=0xeb540014
TPMFinalLog=0xeb50c000 SMBIOS=0xed02 SMBIOS 3.0=0xed01f000
MEMATTR=0xe6fa3018 ESRT=0xe87cb918 MOKvar=0xe6fa
[ 0.00] SMBIOS 3.3.0 present.
[ 0.00] DMI: Micro-Star International Co., Ltd. Alpha 15 B5EEK/MS-
158L, BIOS E158LAMS.107 11/10/2021
[ 0.00] tsc: Fast TSC calibration using PIT
[ 0.00] tsc: Detected 3194.034 MHz processor
[ 0.000125] e820: update [mem 0x-0x0fff] usable ==>
reserved
[ 0.000126] e820: remove [mem 0x000a-0x000f] usable
[ 0.000131] last_pfn = 0x3ee300 max_arch_pfn = 0x4
[ 0.000363] x86/PAT: Configuration [0-7]: WB WC UC- UC WB WP UC- WT
[ 0.000577] e820: update [mem 0xf000-0x] usable ==>
reserved
[ 0.000582] last_pfn = 0xee000 max_arch_pfn = 0x4
[ 0.003213] esrt: Reserving ESRT space from 0xe87cb918 to
0xe87cb950.
[ 0.003217] e820: update [mem 0xe87cb000-0xe87cbfff] usable ==>
reserved
[ 0.003225] e820: update [mem 0xe6fa-0xe6fa2fff] usable ==>
reserved
[ 0.003235] Using GB pages for direct mapping
[ 0.003498] Secure boot disabled
[ 0.003499] R

Re: [PATCH] drm/amdgpu: Indent some if statements

2022-01-13 Thread Das, Nirmoy

Reviewed-by: Nirmoy Das 

On 1/13/2022 7:17 AM, Dan Carpenter wrote:

These if statements need to be indented.

Signed-off-by: Dan Carpenter 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 5 ++---
  1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index d4d9b9ea8bbd..777def770dc8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1714,8 +1714,7 @@ static void amdgpu_ras_error_status_query(struct 
amdgpu_device *adev,
}
  
  	if (block_obj->hw_ops->query_ras_error_status)

-   block_obj->hw_ops->query_ras_error_status(adev);
-
+   block_obj->hw_ops->query_ras_error_status(adev);
  }
  
  static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)

@@ -2722,7 +2721,7 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct 
amdgpu_device *adev)
  int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras* 
ras_con)
  {
if (!adev)
-   return -EINVAL;;
+   return -EINVAL;
  
  	adev->psp.ras_context.ras = ras_con;

return 0;


Re: [PATCH 1/1] drm/amdgpu: move bo_va ref counting to internal funcs

2022-01-13 Thread Das, Nirmoy



On 1/13/2022 1:12 PM, Christian König wrote:



Am 13.01.22 um 13:06 schrieb Nirmoy Das:

GEM code should not deal with struct amdgpu_bo_va's ref_count.
Move ref counting to amdgpu_vm.c.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 11 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 38 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  2 ++
  3 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c

index 4a11a2f4fa73..691f0a879c90 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -176,12 +176,9 @@ static int amdgpu_gem_object_open(struct 
drm_gem_object *obj,

  if (r)
  return r;
  -    bo_va = amdgpu_vm_bo_find(vm, abo);
-    if (!bo_va) {
-    bo_va = amdgpu_vm_bo_add(adev, vm, abo);
-    } else {
-    ++bo_va->ref_count;
-    }
+    if (!amdgpu_vm_bo_get(vm, abo))
+    amdgpu_vm_bo_add(adev, vm, abo);
+
  amdgpu_bo_unreserve(abo);
  return 0;
  }
@@ -218,7 +215,7 @@ static void amdgpu_gem_object_close(struct 
drm_gem_object *obj,

  return;
  }
  bo_va = amdgpu_vm_bo_find(vm, bo);
-    if (!bo_va || --bo_va->ref_count)
+    if (!bo_va)
  goto out_unlock;
    amdgpu_vm_bo_rmv(adev, bo_va);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c

index b23cb463b106..9d60de6a6697 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1290,16 +1290,49 @@ struct amdgpu_bo_va *amdgpu_vm_bo_find(struct 
amdgpu_vm *vm,

 struct amdgpu_bo *bo)
  {
  struct amdgpu_vm_bo_base *base;
+    struct amdgpu_bo_va *bo_va = NULL;
    for (base = bo->vm_bo; base; base = base->next) {
  if (base->vm != vm)
  continue;
  -    return container_of(base, struct amdgpu_bo_va, base);
+    bo_va = container_of(base, struct amdgpu_bo_va, base);
  }
-    return NULL;
+
+    if (bo_va && bo_va->ref_count <= 0)
+    return NULL;
+
+    return bo_va;
  }
  +/**
+ * amdgpu_vm_bo_get - find the bo_va for a specific vm & bo and 
increase

+ * the ref_count
+ *
+ * @vm: requested vm
+ * @bo: requested buffer object
+ *
+ * Find @bo inside the requested vm.
+ * Search inside the @bos vm list for the requested vm
+ * Returns the found bo_va with +1 ref_count or NULL if none is found
+ *
+ * Object has to be reserved!
+ *
+ * Returns:
+ * Found bo_va or NULL.
+ */
+struct amdgpu_bo_va *amdgpu_vm_bo_get(struct amdgpu_vm *vm,
+  struct amdgpu_bo *bo)
+{
+    struct amdgpu_bo_va *bo_va = amdgpu_vm_bo_find(vm, bo);
+
+    if (bo_va)
+    ++bo_va->ref_count;
+
+    return bo_va;
+}
+
+
  /**
   * amdgpu_vm_map_gart - Resolve gart mapping of addr
   *
@@ -2704,6 +2737,7 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
  if (bo && bo_va->is_xgmi)
  amdgpu_xgmi_set_pstate(adev, AMDGPU_XGMI_PSTATE_MIN);
  +    --bo_va->ref_count;
  kfree(bo_va);


That here won't work, you are removing and freeing the bo_va even if 
the refcount is not zero yet.


I suggest to have a matching amdgpu_vm_bo_put() function instead.



Right, let me resend v2.


Thanks,

Nirmoy



Regards,
Christian.


  }
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h

index 85fcfb8c5efd..6d936fb1b934 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -415,6 +415,8 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device 
*adev,
  uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t 
addr);

  struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
 struct amdgpu_bo *bo);
+struct amdgpu_bo_va *amdgpu_vm_bo_get(struct amdgpu_vm *vm,
+   struct amdgpu_bo *bo);
  struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
    struct amdgpu_vm *vm,
    struct amdgpu_bo *bo);




[PATCH 1/1] drm/amdgpu: move bo_va ref counting to internal funcs

2022-01-13 Thread Nirmoy Das
GEM code should not deal with struct amdgpu_bo_va's ref_count.
Move ref counting to amdgpu_vm.c.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 11 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 38 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  2 ++
 3 files changed, 42 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
index 4a11a2f4fa73..691f0a879c90 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c
@@ -176,12 +176,9 @@ static int amdgpu_gem_object_open(struct drm_gem_object 
*obj,
if (r)
return r;
 
-   bo_va = amdgpu_vm_bo_find(vm, abo);
-   if (!bo_va) {
-   bo_va = amdgpu_vm_bo_add(adev, vm, abo);
-   } else {
-   ++bo_va->ref_count;
-   }
+   if (!amdgpu_vm_bo_get(vm, abo))
+   amdgpu_vm_bo_add(adev, vm, abo);
+
amdgpu_bo_unreserve(abo);
return 0;
 }
@@ -218,7 +215,7 @@ static void amdgpu_gem_object_close(struct drm_gem_object 
*obj,
return;
}
bo_va = amdgpu_vm_bo_find(vm, bo);
-   if (!bo_va || --bo_va->ref_count)
+   if (!bo_va)
goto out_unlock;
 
amdgpu_vm_bo_rmv(adev, bo_va);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index b23cb463b106..9d60de6a6697 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -1290,16 +1290,49 @@ struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm 
*vm,
   struct amdgpu_bo *bo)
 {
struct amdgpu_vm_bo_base *base;
+   struct amdgpu_bo_va *bo_va = NULL;
 
for (base = bo->vm_bo; base; base = base->next) {
if (base->vm != vm)
continue;
 
-   return container_of(base, struct amdgpu_bo_va, base);
+   bo_va = container_of(base, struct amdgpu_bo_va, base);
}
-   return NULL;
+
+   if (bo_va && bo_va->ref_count <= 0)
+   return NULL;
+
+   return bo_va;
 }
 
+/**
+ * amdgpu_vm_bo_get - find the bo_va for a specific vm & bo and increase
+ * the ref_count
+ *
+ * @vm: requested vm
+ * @bo: requested buffer object
+ *
+ * Find @bo inside the requested vm.
+ * Search inside the @bos vm list for the requested vm
+ * Returns the found bo_va with +1 ref_count or NULL if none is found
+ *
+ * Object has to be reserved!
+ *
+ * Returns:
+ * Found bo_va or NULL.
+ */
+struct amdgpu_bo_va *amdgpu_vm_bo_get(struct amdgpu_vm *vm,
+ struct amdgpu_bo *bo)
+{
+   struct amdgpu_bo_va *bo_va = amdgpu_vm_bo_find(vm, bo);
+
+   if (bo_va)
+   ++bo_va->ref_count;
+
+   return bo_va;
+}
+
+
 /**
  * amdgpu_vm_map_gart - Resolve gart mapping of addr
  *
@@ -2704,6 +2737,7 @@ void amdgpu_vm_bo_rmv(struct amdgpu_device *adev,
if (bo && bo_va->is_xgmi)
amdgpu_xgmi_set_pstate(adev, AMDGPU_XGMI_PSTATE_MIN);
 
+   --bo_va->ref_count;
kfree(bo_va);
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
index 85fcfb8c5efd..6d936fb1b934 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
@@ -415,6 +415,8 @@ void amdgpu_vm_bo_invalidate(struct amdgpu_device *adev,
 uint64_t amdgpu_vm_map_gart(const dma_addr_t *pages_addr, uint64_t addr);
 struct amdgpu_bo_va *amdgpu_vm_bo_find(struct amdgpu_vm *vm,
   struct amdgpu_bo *bo);
+struct amdgpu_bo_va *amdgpu_vm_bo_get(struct amdgpu_vm *vm,
+  struct amdgpu_bo *bo);
 struct amdgpu_bo_va *amdgpu_vm_bo_add(struct amdgpu_device *adev,
  struct amdgpu_vm *vm,
  struct amdgpu_bo *bo);
-- 
2.33.1



Re: [PATCH] drm/amdgpu: cleanup ttm debug sdma vram access function

2022-01-12 Thread Das, Nirmoy

LGTM acked-by: Nirmoy Das 


On 1/12/2022 7:52 PM, Jonathan Kim wrote:

Some suggested cleanups to declutter ttm when doing debug VRAM access over
SDMA.

v2: rename post_mortem_allowed func to has_timeouts_enable.

Signed-off-by: Jonathan Kim 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  9 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 23 +++
  2 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index a675dde81ce0..747d310aa72f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1448,6 +1448,15 @@ int amdgpu_device_set_cg_state(struct amdgpu_device 
*adev,
  int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
   enum amd_powergating_state state);
  
+static inline bool amdgpu_device_has_timeouts_enabled(struct amdgpu_device *adev)

+{
+   return amdgpu_gpu_recovery != 0 &&
+   adev->gfx_timeout != MAX_SCHEDULE_TIMEOUT &&
+   adev->compute_timeout != MAX_SCHEDULE_TIMEOUT &&
+   adev->sdma_timeout != MAX_SCHEDULE_TIMEOUT &&
+   adev->video_timeout != MAX_SCHEDULE_TIMEOUT;
+}
+
  #include "amdgpu_object.h"
  
  static inline bool amdgpu_is_tmz(struct amdgpu_device *adev)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 33781509838c..b489cd8abe31 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -1460,10 +1460,11 @@ static int amdgpu_ttm_access_memory_sdma(struct 
ttm_buffer_object *bo,
if (r)
goto out;
  
-	src_addr = write ? amdgpu_bo_gpu_offset(adev->mman.sdma_access_bo) :

-   amdgpu_bo_gpu_offset(abo);
-   dst_addr = write ? amdgpu_bo_gpu_offset(abo) :
-   amdgpu_bo_gpu_offset(adev->mman.sdma_access_bo);
+   src_addr = amdgpu_bo_gpu_offset(abo);
+   dst_addr = amdgpu_bo_gpu_offset(adev->mman.sdma_access_bo);
+   if (write)
+   swap(src_addr, dst_addr);
+
	amdgpu_emit_copy_buffer(adev, &job->ibs[0], src_addr, dst_addr, 
PAGE_SIZE, false);
  
  	amdgpu_ring_pad_ib(adev->mman.buffer_funcs_ring, &job->ibs[0]);

@@ -1486,15 +1487,6 @@ static int amdgpu_ttm_access_memory_sdma(struct 
ttm_buffer_object *bo,
return r;
  }
  
-static inline bool amdgpu_ttm_allow_post_mortem_debug(struct amdgpu_device *adev)

-{
-   return amdgpu_gpu_recovery == 0 ||
-   adev->gfx_timeout == MAX_SCHEDULE_TIMEOUT ||
-   adev->compute_timeout == MAX_SCHEDULE_TIMEOUT ||
-   adev->sdma_timeout == MAX_SCHEDULE_TIMEOUT ||
-   adev->video_timeout == MAX_SCHEDULE_TIMEOUT;
-}
-
  /**
   * amdgpu_ttm_access_memory - Read or Write memory that backs a buffer object.
   *
@@ -1519,7 +1511,7 @@ static int amdgpu_ttm_access_memory(struct 
ttm_buffer_object *bo,
if (bo->resource->mem_type != TTM_PL_VRAM)
return -EIO;
  
-	if (!amdgpu_ttm_allow_post_mortem_debug(adev) &&

+   if (amdgpu_device_has_timeouts_enabled(adev) &&
!amdgpu_ttm_access_memory_sdma(bo, offset, buf, len, 
write))
return len;
  
@@ -1909,8 +1901,7 @@ void amdgpu_ttm_fini(struct amdgpu_device *adev)

	ttm_range_man_fini(&adev->mman.bdev, AMDGPU_PL_OA);
	ttm_device_fini(&adev->mman.bdev);
adev->mman.initialized = false;
-   if (adev->mman.sdma_access_ptr)
-   amdgpu_bo_free_kernel(&adev->mman.sdma_access_bo, NULL,
+   amdgpu_bo_free_kernel(&adev->mman.sdma_access_bo, NULL,
  &adev->mman.sdma_access_ptr);
DRM_INFO("amdgpu: ttm finalized\n");
  }


Re: [PATCH 2/4] drm/amdkfd: remove unused function

2022-01-07 Thread Nirmoy

Found the commit that removed usages of this function.


Fixes: dfcbe6d5f ("drm/amdgpu: Remove unused function pointers")

On 1/7/22 09:51, Nirmoy Das wrote:

Remove unused amdgpu_amdkfd_get_vram_usage()

CC: felix.kuehl...@amd.com

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 7 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 -
  2 files changed, 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 776a947b45df..6ca1db3c243f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -514,13 +514,6 @@ int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device 
*adev, int dma_buf_fd,
return r;
  }

-uint64_t amdgpu_amdkfd_get_vram_usage(struct amdgpu_device *adev)
-{
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
-
-   return amdgpu_vram_mgr_usage(vram_man);
-}
-
  uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct amdgpu_device *dst,
  struct amdgpu_device *src)
  {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 61f899e54fd5..ac841ae8f5cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -223,7 +223,6 @@ int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device 
*adev, int dma_buf_fd,
  uint64_t *bo_size, void *metadata_buffer,
  size_t buffer_size, uint32_t *metadata_size,
  uint32_t *flags);
-uint64_t amdgpu_amdkfd_get_vram_usage(struct amdgpu_device *adev);
  uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct amdgpu_device *dst,
  struct amdgpu_device *src);
  int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
--
2.33.1



[PATCH REBASED 4/4] drm/amdgpu: recover gart table at resume

2022-01-07 Thread Nirmoy Das
Get rid off pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

v3: remove gart recovery from other places
v2: pin gart at amdgpu_gart_table_vram_alloc()

Reviewed-by: Christian König 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 84 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  3 +-
 7 files changed, 11 insertions(+), 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 58b9a5176082..a89e08e44fea 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4004,16 +4004,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (!adev->in_s0ix)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);
 
-   /* First evict vram memory */
amdgpu_device_evict_resources(adev);
 
amdgpu_fence_driver_hw_fini(adev);
 
amdgpu_device_ip_suspend_phase2(adev);
-   /* This second call to evict device resources is to evict
-* the gart page table using the CPU.
-*/
-   amdgpu_device_evict_resources(adev);
 
return 0;
 }
@@ -4356,8 +4351,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
goto error;
 
amdgpu_virt_init_data_exchange(adev);
-   /* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(&adev->mman.gtt_mgr);
 
r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4677,10 +4670,6 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}
 
-   r = 
amdgpu_gtt_mgr_recover(&tmp_adev->mman.gtt_mgr);
-   if (r)
-   goto out;
-
r = amdgpu_device_fw_loading(tmp_adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index d3e4203f6217..645950a653a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -114,80 +114,12 @@ void amdgpu_gart_dummy_page_fini(struct amdgpu_device 
*adev)
  */
 int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
 {
-   int r;
-
-   if (adev->gart.bo == NULL) {
-   struct amdgpu_bo_param bp;
-
-   memset(&bp, 0, sizeof(bp));
-   bp.size = adev->gart.table_size;
-   bp.byte_align = PAGE_SIZE;
-   bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-   bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-   bp.type = ttm_bo_type_kernel;
-   bp.resv = NULL;
-   bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-   r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
-   if (r) {
-   return r;
-   }
-   }
-   return 0;
-}
-
-/**
- * amdgpu_gart_table_vram_pin - pin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Pin the GART page table in vram so it will not be moved
- * by the memory manager (pcie r4xx, r5xx+).  These asics require the
- * gart table to be in video memory.
- * Returns 0 for success, error for failure.
- */
-int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev)
-{
-   int r;
-
-   r = amdgpu_bo_reserve(adev->gart.bo, false);
-   if (unlikely(r != 0))
-   return r;
-   r = amdgpu_bo_pin(adev->gart.bo, AMDGPU_GEM_DOMAIN_VRAM);
-   if (r) {
-   amdgpu_bo_unreserve(adev->gart.bo);
-   return r;
-   }
-   r = amdgpu_bo_kmap(adev->gart.bo, &adev->gart.ptr);
-   if (r)
-   amdgpu_bo_unpin(adev->gart.bo);
-   amdgpu_bo_unreserve(adev->gart.bo);
-   return r;
-}
-
-/**
- * amdgpu_gart_table_vram_unpin - unpin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Unpin the GART page table in vram (pcie r4xx, r5xx+).
- * These asics require the gart table to be in video memory.
- */
-void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev)
-{
-   int r;
+   if (adev->gart.bo != NULL)
+   return 0;
 
-   if (adev->gart.bo == NULL) {
-   return;
-   }
-   r = amdgpu_bo_re

[PATCH 2/4] drm/amdkfd: remove unused function

2022-01-07 Thread Nirmoy Das
Remove unused amdgpu_amdkfd_get_vram_usage()

CC: felix.kuehl...@amd.com

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 7 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 1 -
 2 files changed, 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 776a947b45df..6ca1db3c243f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -514,13 +514,6 @@ int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device 
*adev, int dma_buf_fd,
return r;
 }

-uint64_t amdgpu_amdkfd_get_vram_usage(struct amdgpu_device *adev)
-{
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
-
-   return amdgpu_vram_mgr_usage(vram_man);
-}
-
 uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct amdgpu_device *dst,
  struct amdgpu_device *src)
 {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index 61f899e54fd5..ac841ae8f5cc 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -223,7 +223,6 @@ int amdgpu_amdkfd_get_dmabuf_info(struct amdgpu_device 
*adev, int dma_buf_fd,
  uint64_t *bo_size, void *metadata_buffer,
  size_t buffer_size, uint32_t *metadata_size,
  uint32_t *flags);
-uint64_t amdgpu_amdkfd_get_vram_usage(struct amdgpu_device *adev);
 uint8_t amdgpu_amdkfd_get_xgmi_hops_count(struct amdgpu_device *dst,
  struct amdgpu_device *src);
 int amdgpu_amdkfd_get_xgmi_bandwidth_mbytes(struct amdgpu_device *dst,
--
2.33.1



[PATCH REBASED 3/4] drm/amdgpu: do not pass ttm_resource_manager to vram_mgr

2022-01-07 Thread Nirmoy Das
Do not allow exported amdgpu_vram_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call vram_mgr functions.

v2: pass adev's vram_mgr instead of adev

Reviewed-by: Christian König 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   |  5 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c  | 10 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h  |  8 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c |  5 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 40 
 6 files changed, 30 insertions(+), 44 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 76fe5b71e35d..7e745164a624 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -298,7 +298,6 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
 {
s64 time_us, increment_us;
u64 free_vram, total_vram, used_vram;
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
 * throttling.
 *
@@ -315,7 +314,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
}
 
total_vram = adev->gmc.real_vram_size - 
atomic64_read(>vram_pin_size);
-   used_vram = amdgpu_vram_mgr_usage(vram_man);
+   used_vram = amdgpu_vram_mgr_usage(>mman.vram_mgr);
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
 
spin_lock(>mm_stats.lock);
@@ -362,7 +361,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
if (!amdgpu_gmc_vram_full_visible(>gmc)) {
u64 total_vis_vram = adev->gmc.visible_vram_size;
u64 used_vis_vram =
- amdgpu_vram_mgr_vis_usage(vram_man);
+ amdgpu_vram_mgr_vis_usage(>mman.vram_mgr);
 
if (used_vis_vram < total_vis_vram) {
u64 free_vis_vram = total_vis_vram - used_vis_vram;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 763de822afa1..289521aafb79 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -672,10 +672,10 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
ui64 = atomic64_read(>num_vram_cpu_page_faults);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VRAM_USAGE:
-   ui64 = amdgpu_vram_mgr_usage(ttm_manager_type(>mman.bdev, 
TTM_PL_VRAM));
+   ui64 = amdgpu_vram_mgr_usage(>mman.vram_mgr);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VIS_VRAM_USAGE:
-   ui64 = 
amdgpu_vram_mgr_vis_usage(ttm_manager_type(>mman.bdev, TTM_PL_VRAM));
+   ui64 = amdgpu_vram_mgr_vis_usage(>mman.vram_mgr);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_GTT_USAGE:
ui64 = amdgpu_gtt_mgr_usage(>mman.gtt_mgr);
@@ -709,8 +709,6 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
}
case AMDGPU_INFO_MEMORY: {
struct drm_amdgpu_memory_info mem;
-   struct ttm_resource_manager *vram_man =
-   ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
struct ttm_resource_manager *gtt_man =
ttm_manager_type(>mman.bdev, TTM_PL_TT);
memset(, 0, sizeof(mem));
@@ -719,7 +717,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
atomic64_read(>vram_pin_size) -
AMDGPU_VM_RESERVED_VRAM;
mem.vram.heap_usage =
-   amdgpu_vram_mgr_usage(vram_man);
+   amdgpu_vram_mgr_usage(>mman.vram_mgr);
mem.vram.max_allocation = mem.vram.usable_heap_size * 3 / 4;
 
mem.cpu_accessible_vram.total_heap_size =
@@ -729,7 +727,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
atomic64_read(>visible_pin_size),
mem.vram.usable_heap_size);
mem.cpu_accessible_vram.heap_usage =
-   amdgpu_vram_mgr_vis_usage(vram_man);
+   amdgpu_vram_mgr_vis_usage(>mman.vram_mgr);
mem.cpu_accessible_vram.max_allocation =
mem.cpu_accessible_vram.usable_heap_size * 3 / 4;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

[PATCH REBASED 1/4] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr

2022-01-07 Thread Nirmoy Das
Do not allow exported amdgpu_gtt_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call gtt_mgr functions.

v4: remove unused adev.
v3: upcast mgr from ttm resource manager instead of
getting it from adev.
v2: pass adev's gtt_mgr instead of adev.

Reviewed-by: Christian König 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 22 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  4 ++--
 4 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 3aab187520c6..58b9a5176082 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4357,7 +4357,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
 
amdgpu_virt_init_data_exchange(adev);
/* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev, TTM_PL_TT));
+   amdgpu_gtt_mgr_recover(>mman.gtt_mgr);
 
r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4677,7 +4677,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}
 
-   r = 
amdgpu_gtt_mgr_recover(ttm_manager_type(_adev->mman.bdev, TTM_PL_TT));
+   r = 
amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
if (r)
goto out;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
index c18f16b3be9c..9151950e0cc3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -77,10 +77,8 @@ static ssize_t amdgpu_mem_info_gtt_used_show(struct device 
*dev,
 {
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
-   struct ttm_resource_manager *man;
 
-   man = ttm_manager_type(>mman.bdev, TTM_PL_TT);
-   return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
+   return sysfs_emit(buf, "%llu\n", 
amdgpu_gtt_mgr_usage(>mman.gtt_mgr));
 }
 
 static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -206,14 +204,15 @@ static void amdgpu_gtt_mgr_del(struct 
ttm_resource_manager *man,
 /**
  * amdgpu_gtt_mgr_usage - return usage of GTT domain
  *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
  *
  * Return how many bytes are used in the GTT domain
  */
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_gtt_mgr *mgr)
 {
-   struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-   s64 result = man->size - atomic64_read(>available);
+   s64 result;
+
+   result = mgr->manager.size - atomic64_read(>available);
 
return (result > 0 ? result : 0) * PAGE_SIZE;
 }
@@ -221,16 +220,15 @@ uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager 
*man)
 /**
  * amdgpu_gtt_mgr_recover - re-init gart
  *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
  *
  * Re-init the gart for each known BO in the GTT.
  */
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
+int amdgpu_gtt_mgr_recover(struct amdgpu_gtt_mgr *mgr)
 {
-   struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-   struct amdgpu_device *adev;
struct amdgpu_gtt_node *node;
struct drm_mm_node *mm_node;
+   struct amdgpu_device *adev;
int r = 0;
 
adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
@@ -267,7 +265,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,
 
drm_printf(printer, "man size:%llu pages, gtt available:%lld pages, 
usage:%lluMB\n",
   man->size, (u64)atomic64_read(>available),
-  amdgpu_gtt_mgr_usage(man) >> 20);
+  amdgpu_gtt_mgr_usage(mgr) >> 20);
 }
 
 static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 651c7abfde03..763de822afa1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
ui64 = 
amdgpu_vram_mgr_vis_usage(ttm_manager_type(>mman.bdev, TTM_PL_VRAM));
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_GTT_USAGE:
-   ui64 = amdgpu_gtt_mgr_usage(ttm_manager_type

Re: [PATCH] drm/amdgpu: avoid NULL pointer dereference

2021-12-29 Thread Das, Nirmoy

LGTM Acked-by: Nirmoy Das 

On 12/22/2021 3:19 PM, Guchun Chen wrote:

amdgpu_umc_poison_handler for UMC RAS consumption gets
called in KFD queue reset, but it needs to return early when
RAS context is NULL. This can guarantee lower access to
RAS context like in amdgpu_umc_do_page_retirement. Also
improve coding style in amdgpu_umc_poison_handler.

Signed-off-by: Guchun Chen 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c | 12 
  1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 46264a4002f7..b455fc7d1546 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -112,16 +112,20 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
void *ras_error_status,
bool reset)
  {
-   int ret;
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
struct ras_common_if head = {
.block = AMDGPU_RAS_BLOCK__UMC,
};
-   struct ras_manager *obj = amdgpu_ras_find_obj(adev, );
+   struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+   struct ras_manager *obj;
+   int ret;
+
+   if (!con)
+   return 0;
  
-	ret =

-   amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, 
reset);
+   ret = amdgpu_umc_do_page_retirement(adev, ras_error_status, NULL, 
reset);
  
+	obj = amdgpu_ras_find_obj(adev, );

if (ret == AMDGPU_RAS_SUCCESS && obj) {
obj->err_data.ue_count += err_data->ue_count;
obj->err_data.ce_count += err_data->ce_count;


Re: [RFC PATCH 3/3] drm/amdgpu: enable HIQ in amdgpu without kfd

2021-11-05 Thread Das, Nirmoy



On 11/5/2021 3:17 PM, Alex Deucher wrote:

On Fri, Nov 5, 2021 at 10:09 AM Nirmoy Das  wrote:

There is a HW bug which prevents CP to read secure buffers
with HIQ being configured and mapped using KIQ. KFD already
does this for amdgpu but when kfd is not enabled amdgpu
should do that for itself.

Can we just move the HIQ init/fini into the KGD and then have KFD call
into the KGD when it needs to interact with it?  I'd rather not have
two code paths to maintain to handle the HIQ ring.



I looked into the kfd code a bit, AFAIU kfd deals with struct 
v{9|10}_mqd instead of amdgpu_ring.


I could try to expose a function in KGD to map HIQ with a mqd struct 
which kfd can use.



Regards,

Nirmoy




Alex


Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 14 -
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 77 
  drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 80 +
  3 files changed, 170 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 053a1119ebfe..837f76550242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -519,7 +519,7 @@ int amdgpu_gfx_mqd_sw_init(struct amdgpu_device *adev,
 AMDGPU_GEM_DOMAIN_VRAM, 
>mqd_obj,
 >mqd_gpu_addr, 
>mqd_ptr);
 if (r) {
-   dev_warn(adev->dev, "failed to create ring mqd ob 
(%d)", r);
+   dev_warn(adev->dev, "failed to create KIQ ring mqd ob 
(%d)", r);
 return r;
 }

@@ -569,6 +569,18 @@ int amdgpu_gfx_mqd_sw_init(struct amdgpu_device *adev,
 }
 }

+   /* create MQD for HIQ */
+   ring = >gfx.hiq.ring;
+   if (!ring->mqd_obj) {
+   r = amdgpu_bo_create_kernel(adev, mqd_size, PAGE_SIZE,
+   AMDGPU_GEM_DOMAIN_VRAM, 
>mqd_obj,
+   >mqd_gpu_addr, 
>mqd_ptr);
+   if (r) {
+   dev_warn(adev->dev, "failed to create HIQ ring mqd ob 
(%d)", r);
+   return r;
+   }
+   }
+
 return 0;
  }

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 538130c453a6..9532f013128f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4794,6 +4794,7 @@ static int gfx_v10_0_sw_init(void *handle)
  {
 int i, j, k, r, ring_id = 0;
 struct amdgpu_kiq *kiq;
+   struct amdgpu_hiq *hiq;
 struct amdgpu_device *adev = (struct amdgpu_device *)handle;

 switch (adev->ip_versions[GC_HWIP][0]) {
@@ -4923,6 +4924,18 @@ static int gfx_v10_0_sw_init(void *handle)
 if (r)
 return r;

+   if (!adev->kfd.dev) {
+   r = amdgpu_gfx_hiq_init(adev, GFX10_MEC_HPD_SIZE);
+   if (r) {
+   DRM_ERROR("Failed to init HIQ BOs!\n");
+   return r;
+   }
+
+   hiq = >gfx.hiq;
+   r = amdgpu_gfx_hiq_init_ring(adev, >ring, >irq);
+   if (r)
+   return r;
+   }
 r = amdgpu_gfx_mqd_sw_init(adev, sizeof(struct v10_compute_mqd));
 if (r)
 return r;
@@ -7215,6 +7228,54 @@ static int gfx_v10_0_kcq_resume(struct amdgpu_device 
*adev)
 return r;
  }

+static int gfx_v10_0_hiq_init_queue(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+   struct v10_compute_mqd *mqd = ring->mqd_ptr;
+
+
+   if (amdgpu_in_reset(adev)) {
+   /* reset ring buffer */
+   ring->wptr = 0;
+   amdgpu_ring_clear_ring(ring);
+
+   } else {
+   memset((void *)mqd, 0, sizeof(*mqd));
+   mutex_lock(>srbm_mutex);
+   nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+   gfx_v10_0_compute_mqd_init(ring);
+   nv_grbm_select(adev, 0, 0, 0, 0);
+   mutex_unlock(>srbm_mutex);
+   }
+
+   return 0;
+}
+
+static int gfx_v10_0_hiq_resume(struct amdgpu_device *adev)
+{
+   struct amdgpu_ring *ring;
+   int r;
+
+   ring = >gfx.hiq.ring;
+
+   r = amdgpu_bo_reserve(ring->mqd_obj, false);
+   if (unlikely(r != 0))
+   return r;
+
+   r = amdgpu_bo_kmap(ring->mqd_obj, (void **)>mqd_ptr);
+   if (unlikely(r != 0))
+   return r;
+
+   gfx_v10_0_hiq_init_queue(ring);
+   amdgpu_bo_kunmap(ring->mqd_obj);
+   ring->mqd_ptr = NULL;
+   amdgpu_bo_unreserve(ring->mqd_obj);
+   ring->sched.ready = true;
+
+ 

[RFC PATCH 2/3] drm/amdgpu: add HIQ eng_sel to KIQ packets

2021-11-05 Thread Nirmoy Das
Allow KIQ to map/unmap HIQ MQD as well.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 14 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   |  4 ++--
 4 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 5b8cb76e35a0..053a1119ebfe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -1010,3 +1010,17 @@ void amdgpu_gfx_state_change_set(struct amdgpu_device 
*adev, enum gfx_change_sta
(adev)->powerplay.pp_handle, state));
mutex_unlock(>pm.mutex);
 }
+
+int amdgpu_kiq_get_eng_num(struct amdgpu_ring *ring)
+{
+
+   switch (ring->funcs->type) {
+   case AMDGPU_RING_TYPE_GFX:
+   return 4;
+   case AMDGPU_RING_TYPE_HIQ:
+   return 1;
+   default:
+   return 0;
+   }
+
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 4d9c91f4400d..88d942b1ef08 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -373,6 +373,8 @@ static inline u32 amdgpu_gfx_create_bitmask(u32 bit_width)
return (u32)((1ULL << bit_width) - 1);
 }
 
+int amdgpu_kiq_get_eng_num(struct amdgpu_ring *ring);
+
 int amdgpu_gfx_scratch_get(struct amdgpu_device *adev, uint32_t *reg);
 void amdgpu_gfx_scratch_free(struct amdgpu_device *adev, uint32_t reg);
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 90a834dc4008..538130c453a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -3633,7 +3633,7 @@ static void gfx10_kiq_unmap_queues(struct amdgpu_ring 
*kiq_ring,
   enum amdgpu_unmap_queues_action action,
   u64 gpu_addr, u64 seq)
 {
-   uint32_t eng_sel = ring->funcs->type == AMDGPU_RING_TYPE_GFX ? 4 : 0;
+   uint32_t eng_sel = amdgpu_kiq_get_eng_num(ring);
 
amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
@@ -3660,7 +3660,7 @@ static void gfx10_kiq_query_status(struct amdgpu_ring 
*kiq_ring,
   u64 addr,
   u64 seq)
 {
-   uint32_t eng_sel = ring->funcs->type == AMDGPU_RING_TYPE_GFX ? 4 : 0;
+   uint32_t eng_sel = amdgpu_kiq_get_eng_num(ring);
 
amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_QUERY_STATUS, 5));
amdgpu_ring_write(kiq_ring,
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 7f944bb11298..2b29e42bde62 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -847,7 +847,7 @@ static void gfx_v9_0_kiq_map_queues(struct amdgpu_ring 
*kiq_ring,
struct amdgpu_device *adev = kiq_ring->adev;
uint64_t mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj);
uint64_t wptr_addr = adev->wb.gpu_addr + (ring->wptr_offs * 4);
-   uint32_t eng_sel = ring->funcs->type == AMDGPU_RING_TYPE_GFX ? 4 : 0;
+   uint32_t eng_sel = amdgpu_kiq_get_eng_num(ring);
 
amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
/* Q_sel:0, vmid:0, vidmem: 1, engine:0, num_Q:1*/
@@ -877,7 +877,7 @@ static void gfx_v9_0_kiq_unmap_queues(struct amdgpu_ring 
*kiq_ring,
   enum amdgpu_unmap_queues_action action,
   u64 gpu_addr, u64 seq)
 {
-   uint32_t eng_sel = ring->funcs->type == AMDGPU_RING_TYPE_GFX ? 4 : 0;
+   uint32_t eng_sel = amdgpu_kiq_get_eng_num(ring);
 
amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
amdgpu_ring_write(kiq_ring, /* Q_sel: 0, vmid: 0, engine: 0, num_Q: 1 */
-- 
2.31.1



[RFC PATCH 3/3] drm/amdgpu: enable HIQ in amdgpu without kfd

2021-11-05 Thread Nirmoy Das
There is a HW bug which prevents CP to read secure buffers
with HIQ being configured and mapped using KIQ. KFD already
does this for amdgpu but when kfd is not enabled amdgpu
should do that for itself.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 14 -
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c  | 77 
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c   | 80 +
 3 files changed, 170 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 053a1119ebfe..837f76550242 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -519,7 +519,7 @@ int amdgpu_gfx_mqd_sw_init(struct amdgpu_device *adev,
AMDGPU_GEM_DOMAIN_VRAM, 
>mqd_obj,
>mqd_gpu_addr, 
>mqd_ptr);
if (r) {
-   dev_warn(adev->dev, "failed to create ring mqd ob 
(%d)", r);
+   dev_warn(adev->dev, "failed to create KIQ ring mqd ob 
(%d)", r);
return r;
}
 
@@ -569,6 +569,18 @@ int amdgpu_gfx_mqd_sw_init(struct amdgpu_device *adev,
}
}
 
+   /* create MQD for HIQ */
+   ring = >gfx.hiq.ring;
+   if (!ring->mqd_obj) {
+   r = amdgpu_bo_create_kernel(adev, mqd_size, PAGE_SIZE,
+   AMDGPU_GEM_DOMAIN_VRAM, 
>mqd_obj,
+   >mqd_gpu_addr, 
>mqd_ptr);
+   if (r) {
+   dev_warn(adev->dev, "failed to create HIQ ring mqd ob 
(%d)", r);
+   return r;
+   }
+   }
+
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 538130c453a6..9532f013128f 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -4794,6 +4794,7 @@ static int gfx_v10_0_sw_init(void *handle)
 {
int i, j, k, r, ring_id = 0;
struct amdgpu_kiq *kiq;
+   struct amdgpu_hiq *hiq;
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
switch (adev->ip_versions[GC_HWIP][0]) {
@@ -4923,6 +4924,18 @@ static int gfx_v10_0_sw_init(void *handle)
if (r)
return r;
 
+   if (!adev->kfd.dev) {
+   r = amdgpu_gfx_hiq_init(adev, GFX10_MEC_HPD_SIZE);
+   if (r) {
+   DRM_ERROR("Failed to init HIQ BOs!\n");
+   return r;
+   }
+
+   hiq = >gfx.hiq;
+   r = amdgpu_gfx_hiq_init_ring(adev, >ring, >irq);
+   if (r)
+   return r;
+   }
r = amdgpu_gfx_mqd_sw_init(adev, sizeof(struct v10_compute_mqd));
if (r)
return r;
@@ -7215,6 +7228,54 @@ static int gfx_v10_0_kcq_resume(struct amdgpu_device 
*adev)
return r;
 }
 
+static int gfx_v10_0_hiq_init_queue(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+   struct v10_compute_mqd *mqd = ring->mqd_ptr;
+
+
+   if (amdgpu_in_reset(adev)) {
+   /* reset ring buffer */
+   ring->wptr = 0;
+   amdgpu_ring_clear_ring(ring);
+
+   } else {
+   memset((void *)mqd, 0, sizeof(*mqd));
+   mutex_lock(>srbm_mutex);
+   nv_grbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+   gfx_v10_0_compute_mqd_init(ring);
+   nv_grbm_select(adev, 0, 0, 0, 0);
+   mutex_unlock(>srbm_mutex);
+   }
+
+   return 0;
+}
+
+static int gfx_v10_0_hiq_resume(struct amdgpu_device *adev)
+{
+   struct amdgpu_ring *ring;
+   int r;
+
+   ring = >gfx.hiq.ring;
+
+   r = amdgpu_bo_reserve(ring->mqd_obj, false);
+   if (unlikely(r != 0))
+   return r;
+
+   r = amdgpu_bo_kmap(ring->mqd_obj, (void **)>mqd_ptr);
+   if (unlikely(r != 0))
+   return r;
+
+   gfx_v10_0_hiq_init_queue(ring);
+   amdgpu_bo_kunmap(ring->mqd_obj);
+   ring->mqd_ptr = NULL;
+   amdgpu_bo_unreserve(ring->mqd_obj);
+   ring->sched.ready = true;
+
+   amdgpu_gfx_enable_hiq(adev);
+   return 0;
+}
+
 static int gfx_v10_0_cp_resume(struct amdgpu_device *adev)
 {
int r, i;
@@ -7252,6 +7313,12 @@ static int gfx_v10_0_cp_resume(struct amdgpu_device 
*adev)
return r;
}
 
+   if (!adev->kfd.dev) {
+   r = gfx_v10_0_hiq_resume(adev);
+   if (r)
+   return r;
+   }
+
for (i = 0; i < adev->gfx.num_gfx_rings; i++) {
ring

[RFC PATCH 1/3] drm/amdgpu: add HIQ ring to amdgpu

2021-11-05 Thread Nirmoy Das
Add HIQ ring structs and functions that will map HIQ
using KIQ.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_doorbell.h |   1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c  | 142 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  24 
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |   3 +-
 4 files changed, 169 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_doorbell.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_doorbell.h
index 89e6ad30396f..2d9295adac06 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_doorbell.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_doorbell.h
@@ -40,6 +40,7 @@ struct amdgpu_doorbell {
  */
 struct amdgpu_doorbell_index {
uint32_t kiq;
+   uint32_t hiq;
uint32_t mec_ring0;
uint32_t mec_ring1;
uint32_t mec_ring2;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
index 1916ec84dd71..5b8cb76e35a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c
@@ -256,6 +256,148 @@ void amdgpu_gfx_graphics_queue_acquire(struct 
amdgpu_device *adev)
bitmap_weight(adev->gfx.me.queue_bitmap, AMDGPU_MAX_GFX_QUEUES);
 }
 
+int amdgpu_gfx_hiq_acquire(struct amdgpu_device *adev, struct amdgpu_ring 
*ring)
+{
+   int queue_bit;
+   int mec, pipe, queue;
+
+   queue_bit = adev->gfx.mec.num_mec
+   * adev->gfx.mec.num_pipe_per_mec
+   * adev->gfx.mec.num_queue_per_pipe;
+
+   while (queue_bit-- >= 0) {
+   if (test_bit(queue_bit, adev->gfx.mec.queue_bitmap))
+   continue;
+
+   amdgpu_queue_mask_bit_to_mec_queue(adev, queue_bit, , 
, );
+
+   if (mec == 1 && pipe > 1)
+   continue;
+
+   ring->me = mec + 1;
+   ring->pipe = pipe;
+   ring->queue = queue;
+
+   return 0;
+   }
+
+   dev_err(adev->dev, "Failed to find a queue for HIQ\n");
+   return -EINVAL;
+}
+
+int amdgpu_gfx_hiq_init_ring(struct amdgpu_device *adev,
+struct amdgpu_ring *ring,
+struct amdgpu_irq_src *irq)
+{
+   struct amdgpu_hiq *hiq = >gfx.hiq;
+   int r = 0;
+
+   ring->adev = NULL;
+   ring->ring_obj = NULL;
+   ring->use_doorbell = true;
+   ring->doorbell_index = adev->doorbell_index.hiq;
+
+   r = amdgpu_gfx_hiq_acquire(adev, ring);
+   if (r)
+   return r;
+
+   ring->eop_gpu_addr = hiq->eop_gpu_addr;
+   ring->no_scheduler = true;
+   sprintf(ring->name, "hiq_%d.%d.%d", ring->me, ring->pipe, ring->queue);
+   r = amdgpu_ring_init(adev, ring, 1024, irq, 
AMDGPU_CP_IRQ_COMPUTE_MEC2_PIPE0_EOP,
+AMDGPU_RING_PRIO_DEFAULT, NULL);
+   if (r)
+   dev_warn(adev->dev, "(%d) failed to init hiq ring\n", r);
+
+   return r;
+}
+
+void amdgpu_gfx_hiq_free_ring(struct amdgpu_ring *ring)
+{
+   amdgpu_ring_fini(ring);
+}
+
+void amdgpu_gfx_hiq_init_ring_fini(struct amdgpu_device *adev)
+{
+   struct amdgpu_hiq *hiq = >gfx.hiq;
+
+   amdgpu_bo_free_kernel(>eop_obj, >eop_gpu_addr, NULL);
+}
+
+int amdgpu_gfx_hiq_init(struct amdgpu_device *adev,
+   unsigned hpd_size)
+{
+   int r;
+   u32 *hpd;
+   struct amdgpu_hiq *hiq = >gfx.hiq;
+
+   r = amdgpu_bo_create_kernel(adev, hpd_size, PAGE_SIZE,
+   AMDGPU_GEM_DOMAIN_GTT, >eop_obj,
+   >eop_gpu_addr, (void **));
+   if (r) {
+   dev_warn(adev->dev, "failed to create HIQ bo (%d).\n", r);
+   return r;
+   }
+
+   memset(hpd, 0, hpd_size);
+
+   r = amdgpu_bo_reserve(hiq->eop_obj, true);
+   if (unlikely(r != 0))
+   dev_warn(adev->dev, "(%d) reserve hiq eop bo failed\n", r);
+   amdgpu_bo_kunmap(hiq->eop_obj);
+   amdgpu_bo_unreserve(hiq->eop_obj);
+
+   return 0;
+}
+
+int amdgpu_gfx_disable_hiq(struct amdgpu_device *adev)
+{
+   struct amdgpu_kiq *kiq = >gfx.kiq;
+   struct amdgpu_ring *kiq_ring = >ring;
+   int r;
+
+   if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues)
+   return -EINVAL;
+
+   spin_lock(>gfx.kiq.ring_lock);
+   if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size)) {
+   spin_unlock(>gfx.kiq.ring_lock);
+   return -ENOMEM;
+   }
+
+   kiq->pmf->kiq_unmap_queues(kiq_ring, >gfx.kiq.ring, RESET_QUEUES,
+  0, 0);
+   r = amdgpu_ring_test_helper(kiq_ring);
+   spin_unlock(>gfx.kiq.ring_lock);
+
+   return r;
+}
+
+int amdgpu_gfx_enable_hiq(struct amdg

Re: [PATCH 6/6] drm/radeon: use dma_resv_wait_timeout() instead of manually waiting

2021-11-03 Thread Das, Nirmoy

Acked-by: Nirmoy Das 

On 10/28/2021 3:26 PM, Christian König wrote:

Don't touch the exclusive fence manually here, but rather use the
general dma_resv function. We did that for better hw reset handling but
this doesn't necessary work correctly.

Signed-off-by: Christian König 
---
  drivers/gpu/drm/radeon/radeon_uvd.c | 13 +
  1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/radeon/radeon_uvd.c 
b/drivers/gpu/drm/radeon/radeon_uvd.c
index 2ea86919d953..377f9cdb5b53 100644
--- a/drivers/gpu/drm/radeon/radeon_uvd.c
+++ b/drivers/gpu/drm/radeon/radeon_uvd.c
@@ -469,7 +469,6 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
  {
int32_t *msg, msg_type, handle;
unsigned img_size = 0;
-   struct dma_fence *f;
void *ptr;
  
  	int i, r;

@@ -479,13 +478,11 @@ static int radeon_uvd_cs_msg(struct radeon_cs_parser *p, 
struct radeon_bo *bo,
return -EINVAL;
}
  
-	f = dma_resv_excl_fence(bo->tbo.base.resv);

-   if (f) {
-   r = radeon_fence_wait((struct radeon_fence *)f, false);
-   if (r) {
-   DRM_ERROR("Failed waiting for UVD message (%d)!\n", r);
-   return r;
-   }
+   r = dma_resv_wait_timeout(bo->tbo.base.resv, false, false,
+ MAX_SCHEDULE_TIMEOUT);
+   if (r <= 0) {
+   DRM_ERROR("Failed waiting for UVD message (%d)!\n", r);
+   return r ? r : -ETIME;
}
  
  	r = radeon_bo_kmap(bo, );


Re: [PATCH 1/1] drm/amdgpu: return early on error while setting bar0 memtype

2021-11-02 Thread Das, Nirmoy



On 11/2/2021 12:33 PM, Christian König wrote:

Am 02.11.21 um 12:18 schrieb Lazar, Lijo:



On 11/2/2021 4:39 PM, Christian König wrote:

Am 02.11.21 um 11:11 schrieb Das, Nirmoy:


On 11/2/2021 9:00 AM, Lazar, Lijo wrote:



On 10/29/2021 8:39 PM, Nirmoy Das wrote:

We set WC memtype for aper_base but don't check return value
of arch_io_reserve_memtype_wc(). Be more defensive and return
early on error.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 7 ++-
  1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 073ba2af0b9c..6b25982a9077 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1032,9 +1032,14 @@ int amdgpu_bo_init(struct amdgpu_device 
*adev)

  /* On A+A platform, VRAM can be mapped as WB */
  if (!adev->gmc.xgmi.connected_to_cpu) {
  /* reserve PAT memory space to WC for VRAM */
- arch_io_reserve_memtype_wc(adev->gmc.aper_base,
+    int r = arch_io_reserve_memtype_wc(adev->gmc.aper_base,
  adev->gmc.aper_size);


BTW, isn't it more appropriate to use visible vram size? There are 
cases where pci resize rounds aperture to the next higher size > 
size of actual VRAM.



Good point, I will update this one and send again.


Not a good idea at all.

The aperture size is rounded up to the next power of two and that's 
exactly what we should stick to if we don't want to get an error 
code in return.


PCI rebar sizes have its restrictions. It jumps from 4G to 8G to 16G 
and so on. Why we need to map 16G for a card with 12G VRAM? BTW, how 
it increases the failure chance - this mapping happens in page sizes, 
right?




Exactly that's the point. This mapping usually happens in power of two 
in the same way as the PCI BAR sizes. So we should use 16GiB even for 
a 12GiB card here.


Only some architectures work with page size mappings (e.g. x86 with 
PAT enabled). Then we can indeed use the real VRAM size, but that is 
absolutely not guaranteed as far as I know.



Thanks for clarifying this. I will push the patch with your suggested 
changes.



Regards,

Nirmoy



Regards,
Christian.


Thanks,
Lijo


Regards,
Christian.




Regards,

Nirmoy




Thanks,
Lijo

  +    if (r) {
+    DRM_ERROR("Unable to set WC memtype for the aperture 
base\n");

+    return r;
+    }
+
  /* Add an MTRR for the VRAM */
  adev->gmc.vram_mtrr = 
arch_phys_wc_add(adev->gmc.aper_base,

  adev->gmc.aper_size);







Re: [PATCH 1/1] drm/amdgpu: return early on error while setting bar0 memtype

2021-11-02 Thread Das, Nirmoy



On 11/2/2021 9:00 AM, Lazar, Lijo wrote:



On 10/29/2021 8:39 PM, Nirmoy Das wrote:

We set WC memtype for aper_base but don't check return value
of arch_io_reserve_memtype_wc(). Be more defensive and return
early on error.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 7 ++-
  1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 073ba2af0b9c..6b25982a9077 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1032,9 +1032,14 @@ int amdgpu_bo_init(struct amdgpu_device *adev)
  /* On A+A platform, VRAM can be mapped as WB */
  if (!adev->gmc.xgmi.connected_to_cpu) {
  /* reserve PAT memory space to WC for VRAM */
-    arch_io_reserve_memtype_wc(adev->gmc.aper_base,
+    int r = arch_io_reserve_memtype_wc(adev->gmc.aper_base,
  adev->gmc.aper_size);


BTW, isn't it more appropriate to use visible vram size? There are 
cases where pci resize rounds aperture to the next higher size > size 
of actual VRAM.



Good point, I will update this one and send again.


Regards,

Nirmoy




Thanks,
Lijo

  +    if (r) {
+    DRM_ERROR("Unable to set WC memtype for the aperture 
base\n");

+    return r;
+    }
+
  /* Add an MTRR for the VRAM */
  adev->gmc.vram_mtrr = arch_phys_wc_add(adev->gmc.aper_base,
  adev->gmc.aper_size);



Re: [PATCH 1/1] drm/amdgpu: remove unnecessary checks

2021-10-29 Thread Das, Nirmoy

ping!

On 10/22/2021 1:03 PM, Nirmoy Das wrote:

amdgpu_ttm_backend_bind() only needed for TTM_PL_TT
and AMDGPU_PL_PREEMPT.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 5 -
  1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index d784f8d3a834..eb872fc4ad92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -914,11 +914,6 @@ static int amdgpu_ttm_backend_bind(struct ttm_device *bdev,
 ttm->num_pages, bo_mem, ttm);
}
  
-	if (bo_mem->mem_type == AMDGPU_PL_GDS ||

-   bo_mem->mem_type == AMDGPU_PL_GWS ||
-   bo_mem->mem_type == AMDGPU_PL_OA)
-   return -EINVAL;
-
if (bo_mem->mem_type != TTM_PL_TT ||
!amdgpu_gtt_mgr_has_gart_addr(bo_mem)) {
gtt->offset = AMDGPU_BO_INVALID_OFFSET;


[PATCH 1/1] drm/amdgpu: return early on error while setting bar0 memtype

2021-10-29 Thread Nirmoy Das
We set WC memtype for aper_base but don't check return value
of arch_io_reserve_memtype_wc(). Be more defensive and return
early on error.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 073ba2af0b9c..6b25982a9077 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1032,9 +1032,14 @@ int amdgpu_bo_init(struct amdgpu_device *adev)
/* On A+A platform, VRAM can be mapped as WB */
if (!adev->gmc.xgmi.connected_to_cpu) {
/* reserve PAT memory space to WC for VRAM */
-   arch_io_reserve_memtype_wc(adev->gmc.aper_base,
+   int r = arch_io_reserve_memtype_wc(adev->gmc.aper_base,
adev->gmc.aper_size);
 
+   if (r) {
+   DRM_ERROR("Unable to set WC memtype for the aperture 
base\n");
+   return r;
+   }
+
/* Add an MTRR for the VRAM */
adev->gmc.vram_mtrr = arch_phys_wc_add(adev->gmc.aper_base,
adev->gmc.aper_size);
-- 
2.33.1



Re: [PATCH v4 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr

2021-10-22 Thread Das, Nirmoy



On 10/22/2021 1:46 PM, Christian König wrote:
Reviewed-by: Christian König  for the entire 
series.


But please keep in mind that this here needs extensive testing.



I will do multiple round of test again on gfx7-10 cards and gfx9 APU 
that I have but I don't have gfx6 and sriov cards to test.



Regards,

Nirmoy



Thanks,
Christian.

Am 22.10.21 um 12:54 schrieb Nirmoy Das:

Do not allow exported amdgpu_gtt_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call gtt_mgr functions.

v4: remove unused adev.
v3: upcast mgr from ttm resource manager instead of
getting it from adev.
v2: pass adev's gtt_mgr instead of adev.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 22 ++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  4 ++--
  4 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 41ce86244144..2b53d86aebac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,


  amdgpu_virt_init_data_exchange(adev);
  /* we need recover gart prior to run SMC/CP/SDMA resume */
- amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev, TTM_PL_TT));
+    amdgpu_gtt_mgr_recover(>mman.gtt_mgr);

  r = amdgpu_device_fw_loading(adev);
  if (r)
@@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,

  amdgpu_inc_vram_lost(tmp_adev);
  }

-    r = 
amdgpu_gtt_mgr_recover(ttm_manager_type(_adev->mman.bdev, 
TTM_PL_TT));

+    r = amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
  if (r)
  goto out;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c

index c18f16b3be9c..9151950e0cc3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -77,10 +77,8 @@ static ssize_t 
amdgpu_mem_info_gtt_used_show(struct device *dev,

  {
  struct drm_device *ddev = dev_get_drvdata(dev);
  struct amdgpu_device *adev = drm_to_adev(ddev);
-    struct ttm_resource_manager *man;

-    man = ttm_manager_type(>mman.bdev, TTM_PL_TT);
-    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
+    return sysfs_emit(buf, "%llu\n", 
amdgpu_gtt_mgr_usage(>mman.gtt_mgr));

  }

  static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -206,14 +204,15 @@ static void amdgpu_gtt_mgr_del(struct 
ttm_resource_manager *man,

  /**
   * amdgpu_gtt_mgr_usage - return usage of GTT domain
   *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
   *
   * Return how many bytes are used in the GTT domain
   */
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_gtt_mgr *mgr)
  {
-    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-    s64 result = man->size - atomic64_read(>available);
+    s64 result;
+
+    result = mgr->manager.size - atomic64_read(>available);

  return (result > 0 ? result : 0) * PAGE_SIZE;
  }
@@ -221,16 +220,15 @@ uint64_t amdgpu_gtt_mgr_usage(struct 
ttm_resource_manager *man)

  /**
   * amdgpu_gtt_mgr_recover - re-init gart
   *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
   *
   * Re-init the gart for each known BO in the GTT.
   */
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
+int amdgpu_gtt_mgr_recover(struct amdgpu_gtt_mgr *mgr)
  {
-    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-    struct amdgpu_device *adev;
  struct amdgpu_gtt_node *node;
  struct drm_mm_node *mm_node;
+    struct amdgpu_device *adev;
  int r = 0;

  adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
@@ -267,7 +265,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,


  drm_printf(printer, "man size:%llu pages, gtt available:%lld 
pages, usage:%lluMB\n",

 man->size, (u64)atomic64_read(>available),
-   amdgpu_gtt_mgr_usage(man) >> 20);
+   amdgpu_gtt_mgr_usage(mgr) >> 20);
  }

  static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

index d2955ea4a62b..603ce32db5c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, 
void *data, struct drm_file *filp)
  ui64 = 
amdgpu_vram_mgr_vis_usage(ttm_manager_type(>mman.bdev, 
TTM_PL_VR

[PATCH 1/1] drm/amdgpu: remove unnecessary checks

2021-10-22 Thread Nirmoy Das
amdgpu_ttm_backend_bind() only needed for TTM_PL_TT
and AMDGPU_PL_PREEMPT.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index d784f8d3a834..eb872fc4ad92 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -914,11 +914,6 @@ static int amdgpu_ttm_backend_bind(struct ttm_device *bdev,
 ttm->num_pages, bo_mem, ttm);
}
 
-   if (bo_mem->mem_type == AMDGPU_PL_GDS ||
-   bo_mem->mem_type == AMDGPU_PL_GWS ||
-   bo_mem->mem_type == AMDGPU_PL_OA)
-   return -EINVAL;
-
if (bo_mem->mem_type != TTM_PL_TT ||
!amdgpu_gtt_mgr_has_gart_addr(bo_mem)) {
gtt->offset = AMDGPU_BO_INVALID_OFFSET;
-- 
2.32.0



Re: [PATCH v4 3/3] drm/amdgpu: recover gart table at resume

2021-10-22 Thread Nirmoy



On 10/22/21 12:54 PM, Nirmoy Das wrote:

Get rid of pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

v4: return amdgpu_bo_create_kernel() directly without checking
its return value.
v3: remove gart recovery from other places
v2: pin gart at amdgpu_gart_table_vram_alloc()
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 82 ++
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c  |  3 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  |  3 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  |  3 +-
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  3 +-
  7 files changed, 11 insertions(+), 97 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2b53d86aebac..f0c70e9d37fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3935,16 +3935,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (!adev->in_s0ix)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);

-   /* First evict vram memory */
amdgpu_device_evict_resources(adev);

amdgpu_fence_driver_hw_fini(adev);

amdgpu_device_ip_suspend_phase2(adev);
-   /* This second call to evict device resources is to evict
-* the gart page table using the CPU.
-*/
-   amdgpu_device_evict_resources(adev);

return 0;
  }
@@ -4286,8 +4281,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
goto error;

amdgpu_virt_init_data_exchange(adev);
-   /* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(>mman.gtt_mgr);

r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4604,10 +4597,6 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}

-   r = 
amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
-   if (r)
-   goto out;
-
r = amdgpu_device_fw_loading(tmp_adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index d3e4203f6217..3525f87dc1af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -116,78 +116,12 @@ int amdgpu_gart_table_vram_alloc(struct amdgpu_device 
*adev)
  {
int r;



I forgot to remove this unused r and now resent[1] this patch again but 
forgot add in-reply-to.



https://patchwork.freedesktop.org/patch/460939/


Regards,

Nirmoy



-   if (adev->gart.bo == NULL) {
-   struct amdgpu_bo_param bp;
-
-   memset(, 0, sizeof(bp));
-   bp.size = adev->gart.table_size;
-   bp.byte_align = PAGE_SIZE;
-   bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-   bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-   bp.type = ttm_bo_type_kernel;
-   bp.resv = NULL;
-   bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-   r = amdgpu_bo_create(adev, , >gart.bo);
-   if (r) {
-   return r;
-   }
-   }
-   return 0;
-}
-
-/**
- * amdgpu_gart_table_vram_pin - pin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Pin the GART page table in vram so it will not be moved
- * by the memory manager (pcie r4xx, r5xx+).  These asics require the
- * gart table to be in video memory.
- * Returns 0 for success, error for failure.
- */
-int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev)
-{
-   int r;
-
-   r = amdgpu_bo_reserve(adev->gart.bo, false);
-   if (unlikely(r != 0))
-   return r;
-   r = amdgpu_bo_pin(adev->gart.bo, AMDGPU_GEM_DOMAIN_VRAM);
-   if (r) {
-   amdgpu_bo_unreserve(adev->gart.bo);
-   return r;
-   }
-   r = amdgpu_bo_kmap(adev->gart.bo, >gart.ptr);
-   if (r)
-   amdgpu_bo_unpin(adev->gart.bo);
-   amdgpu_bo_unreserve(adev->gart.bo);
-   return r;
-}
-
-/**
- * amdgpu_gart_table_vram_unpin - unpin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Unpin the GART page table in vram (pcie r4xx, r5xx+).
- * These asics require the gart table to be in video memory.
- */
-void amdgpu_gart_table_vram_unpin(st

[PATCH 3/3] drm/amdgpu: recover gart table at resume

2021-10-22 Thread Nirmoy Das
Get rid of pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

v3: remove gart recovery from other places
v2: pin gart at amdgpu_gart_table_vram_alloc()
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 84 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  3 +-
 7 files changed, 11 insertions(+), 99 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2b53d86aebac..f0c70e9d37fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3935,16 +3935,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (!adev->in_s0ix)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);
 
-   /* First evict vram memory */
amdgpu_device_evict_resources(adev);
 
amdgpu_fence_driver_hw_fini(adev);
 
amdgpu_device_ip_suspend_phase2(adev);
-   /* This second call to evict device resources is to evict
-* the gart page table using the CPU.
-*/
-   amdgpu_device_evict_resources(adev);
 
return 0;
 }
@@ -4286,8 +4281,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
goto error;
 
amdgpu_virt_init_data_exchange(adev);
-   /* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(>mman.gtt_mgr);
 
r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4604,10 +4597,6 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}
 
-   r = 
amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
-   if (r)
-   goto out;
-
r = amdgpu_device_fw_loading(tmp_adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index d3e4203f6217..645950a653a0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -114,80 +114,12 @@ void amdgpu_gart_dummy_page_fini(struct amdgpu_device 
*adev)
  */
 int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
 {
-   int r;
-
-   if (adev->gart.bo == NULL) {
-   struct amdgpu_bo_param bp;
-
-   memset(, 0, sizeof(bp));
-   bp.size = adev->gart.table_size;
-   bp.byte_align = PAGE_SIZE;
-   bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-   bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-   bp.type = ttm_bo_type_kernel;
-   bp.resv = NULL;
-   bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-   r = amdgpu_bo_create(adev, , >gart.bo);
-   if (r) {
-   return r;
-   }
-   }
-   return 0;
-}
-
-/**
- * amdgpu_gart_table_vram_pin - pin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Pin the GART page table in vram so it will not be moved
- * by the memory manager (pcie r4xx, r5xx+).  These asics require the
- * gart table to be in video memory.
- * Returns 0 for success, error for failure.
- */
-int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev)
-{
-   int r;
-
-   r = amdgpu_bo_reserve(adev->gart.bo, false);
-   if (unlikely(r != 0))
-   return r;
-   r = amdgpu_bo_pin(adev->gart.bo, AMDGPU_GEM_DOMAIN_VRAM);
-   if (r) {
-   amdgpu_bo_unreserve(adev->gart.bo);
-   return r;
-   }
-   r = amdgpu_bo_kmap(adev->gart.bo, >gart.ptr);
-   if (r)
-   amdgpu_bo_unpin(adev->gart.bo);
-   amdgpu_bo_unreserve(adev->gart.bo);
-   return r;
-}
-
-/**
- * amdgpu_gart_table_vram_unpin - unpin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Unpin the GART page table in vram (pcie r4xx, r5xx+).
- * These asics require the gart table to be in video memory.
- */
-void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev)
-{
-   int r;
+   if (adev->gart.bo != NULL)
+   return 0;
 
-   if (adev->gart.bo == NULL) {
-   return;
-   }
-   r = amdgpu_bo_reserve

[PATCH 2/3] drm/amdgpu: do not pass ttm_resource_manager to vram_mgr

2021-10-22 Thread Nirmoy Das
Do not allow exported amdgpu_vram_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call vram_mgr functions.

v2: pass adev's vram_mgr instead of adev
Signed-off-by: Nirmoy Das 
Reviewed-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   |  5 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c  | 10 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h  |  8 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c |  5 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 40 
 7 files changed, 31 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7077f21f0021..df818e145d9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -531,9 +531,8 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int 
dma_buf_fd,
 uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(>mman.bdev, TTM_PL_VRAM);

-   return amdgpu_vram_mgr_usage(vram_man);
+   return amdgpu_vram_mgr_usage(>mman.vram_mgr);
 }

 uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 76fe5b71e35d..7e745164a624 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -298,7 +298,6 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
 {
s64 time_us, increment_us;
u64 free_vram, total_vram, used_vram;
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
 * throttling.
 *
@@ -315,7 +314,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
}

total_vram = adev->gmc.real_vram_size - 
atomic64_read(>vram_pin_size);
-   used_vram = amdgpu_vram_mgr_usage(vram_man);
+   used_vram = amdgpu_vram_mgr_usage(>mman.vram_mgr);
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;

spin_lock(>mm_stats.lock);
@@ -362,7 +361,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
if (!amdgpu_gmc_vram_full_visible(>gmc)) {
u64 total_vis_vram = adev->gmc.visible_vram_size;
u64 used_vis_vram =
- amdgpu_vram_mgr_vis_usage(vram_man);
+ amdgpu_vram_mgr_vis_usage(>mman.vram_mgr);

if (used_vis_vram < total_vis_vram) {
u64 free_vis_vram = total_vis_vram - used_vis_vram;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 603ce32db5c5..b426e03ad630 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -672,10 +672,10 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
ui64 = atomic64_read(>num_vram_cpu_page_faults);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VRAM_USAGE:
-   ui64 = amdgpu_vram_mgr_usage(ttm_manager_type(>mman.bdev, 
TTM_PL_VRAM));
+   ui64 = amdgpu_vram_mgr_usage(>mman.vram_mgr);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VIS_VRAM_USAGE:
-   ui64 = 
amdgpu_vram_mgr_vis_usage(ttm_manager_type(>mman.bdev, TTM_PL_VRAM));
+   ui64 = amdgpu_vram_mgr_vis_usage(>mman.vram_mgr);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_GTT_USAGE:
ui64 = amdgpu_gtt_mgr_usage(>mman.gtt_mgr);
@@ -709,8 +709,6 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
}
case AMDGPU_INFO_MEMORY: {
struct drm_amdgpu_memory_info mem;
-   struct ttm_resource_manager *vram_man =
-   ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
struct ttm_resource_manager *gtt_man =
ttm_manager_type(>mman.bdev, TTM_PL_TT);
memset(, 0, sizeof(mem));
@@ -719,7 +717,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
atomic64_read(>vram_pin_size) -
AMDGPU_VM_RESERVED_VRAM;
mem.vram.heap_usage =
-   amdgpu_vram_mgr_usage(vram_man);
+   amdgpu_vram_mgr_usage(>mman.vram_mgr);
 

[PATCH v4 3/3] drm/amdgpu: recover gart table at resume

2021-10-22 Thread Nirmoy Das
Get rid of pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

v4: return amdgpu_bo_create_kernel() directly without checking
its return value.
v3: remove gart recovery from other places
v2: pin gart at amdgpu_gart_table_vram_alloc()
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 82 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  3 +-
 7 files changed, 11 insertions(+), 97 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2b53d86aebac..f0c70e9d37fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3935,16 +3935,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (!adev->in_s0ix)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);

-   /* First evict vram memory */
amdgpu_device_evict_resources(adev);

amdgpu_fence_driver_hw_fini(adev);

amdgpu_device_ip_suspend_phase2(adev);
-   /* This second call to evict device resources is to evict
-* the gart page table using the CPU.
-*/
-   amdgpu_device_evict_resources(adev);

return 0;
 }
@@ -4286,8 +4281,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
goto error;

amdgpu_virt_init_data_exchange(adev);
-   /* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(>mman.gtt_mgr);

r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4604,10 +4597,6 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}

-   r = 
amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
-   if (r)
-   goto out;
-
r = amdgpu_device_fw_loading(tmp_adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index d3e4203f6217..3525f87dc1af 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -116,78 +116,12 @@ int amdgpu_gart_table_vram_alloc(struct amdgpu_device 
*adev)
 {
int r;

-   if (adev->gart.bo == NULL) {
-   struct amdgpu_bo_param bp;
-
-   memset(, 0, sizeof(bp));
-   bp.size = adev->gart.table_size;
-   bp.byte_align = PAGE_SIZE;
-   bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-   bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-   bp.type = ttm_bo_type_kernel;
-   bp.resv = NULL;
-   bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-   r = amdgpu_bo_create(adev, , >gart.bo);
-   if (r) {
-   return r;
-   }
-   }
-   return 0;
-}
-
-/**
- * amdgpu_gart_table_vram_pin - pin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Pin the GART page table in vram so it will not be moved
- * by the memory manager (pcie r4xx, r5xx+).  These asics require the
- * gart table to be in video memory.
- * Returns 0 for success, error for failure.
- */
-int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev)
-{
-   int r;
-
-   r = amdgpu_bo_reserve(adev->gart.bo, false);
-   if (unlikely(r != 0))
-   return r;
-   r = amdgpu_bo_pin(adev->gart.bo, AMDGPU_GEM_DOMAIN_VRAM);
-   if (r) {
-   amdgpu_bo_unreserve(adev->gart.bo);
-   return r;
-   }
-   r = amdgpu_bo_kmap(adev->gart.bo, >gart.ptr);
-   if (r)
-   amdgpu_bo_unpin(adev->gart.bo);
-   amdgpu_bo_unreserve(adev->gart.bo);
-   return r;
-}
-
-/**
- * amdgpu_gart_table_vram_unpin - unpin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Unpin the GART page table in vram (pcie r4xx, r5xx+).
- * These asics require the gart table to be in video memory.
- */
-void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev)
-{
-   int r;
+   if (adev->gart.bo != NULL)
+   return 0;

-   if (adev->gart.bo == NULL) {
-   return;
-   }
-   r = amdgpu_bo_reserve

[PATCH v4 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr

2021-10-22 Thread Nirmoy Das
Do not allow exported amdgpu_gtt_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call gtt_mgr functions.

v4: remove unused adev.
v3: upcast mgr from ttm resource manager instead of
getting it from adev.
v2: pass adev's gtt_mgr instead of adev.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 22 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  4 ++--
 4 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..2b53d86aebac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,

amdgpu_virt_init_data_exchange(adev);
/* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev, TTM_PL_TT));
+   amdgpu_gtt_mgr_recover(>mman.gtt_mgr);

r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}

-   r = 
amdgpu_gtt_mgr_recover(ttm_manager_type(_adev->mman.bdev, TTM_PL_TT));
+   r = 
amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
if (r)
goto out;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
index c18f16b3be9c..9151950e0cc3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -77,10 +77,8 @@ static ssize_t amdgpu_mem_info_gtt_used_show(struct device 
*dev,
 {
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
-   struct ttm_resource_manager *man;

-   man = ttm_manager_type(>mman.bdev, TTM_PL_TT);
-   return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
+   return sysfs_emit(buf, "%llu\n", 
amdgpu_gtt_mgr_usage(>mman.gtt_mgr));
 }

 static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -206,14 +204,15 @@ static void amdgpu_gtt_mgr_del(struct 
ttm_resource_manager *man,
 /**
  * amdgpu_gtt_mgr_usage - return usage of GTT domain
  *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
  *
  * Return how many bytes are used in the GTT domain
  */
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_gtt_mgr *mgr)
 {
-   struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-   s64 result = man->size - atomic64_read(>available);
+   s64 result;
+
+   result = mgr->manager.size - atomic64_read(>available);

return (result > 0 ? result : 0) * PAGE_SIZE;
 }
@@ -221,16 +220,15 @@ uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager 
*man)
 /**
  * amdgpu_gtt_mgr_recover - re-init gart
  *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
  *
  * Re-init the gart for each known BO in the GTT.
  */
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
+int amdgpu_gtt_mgr_recover(struct amdgpu_gtt_mgr *mgr)
 {
-   struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-   struct amdgpu_device *adev;
struct amdgpu_gtt_node *node;
struct drm_mm_node *mm_node;
+   struct amdgpu_device *adev;
int r = 0;

adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
@@ -267,7 +265,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,

drm_printf(printer, "man size:%llu pages, gtt available:%lld pages, 
usage:%lluMB\n",
   man->size, (u64)atomic64_read(>available),
-  amdgpu_gtt_mgr_usage(man) >> 20);
+  amdgpu_gtt_mgr_usage(mgr) >> 20);
 }

 static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index d2955ea4a62b..603ce32db5c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
ui64 = 
amdgpu_vram_mgr_vis_usage(ttm_manager_type(>mman.bdev, TTM_PL_VRAM));
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_GTT_USAGE:
-   ui64 = amdgpu_gtt_mgr_usage(ttm_manager_type(>mman.bdev, 
TTM_PL_TT));
+   ui64 = amdgp

Re: [PATCH v3 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr

2021-10-22 Thread Nirmoy



On 10/22/21 11:44 AM, Christian König wrote:

Am 22.10.21 um 11:32 schrieb Nirmoy Das:

Do not allow exported amdgpu_gtt_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call gtt_mgr functions.

v3: upcast mgr from ttm resource manager instead of
getting it from adev.
v2: pass adev's gtt_mgr instead of adev

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 23 ++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 ++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  4 ++--
  4 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 41ce86244144..2b53d86aebac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,


  amdgpu_virt_init_data_exchange(adev);
  /* we need recover gart prior to run SMC/CP/SDMA resume */
- amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev, TTM_PL_TT));
+    amdgpu_gtt_mgr_recover(>mman.gtt_mgr);

  r = amdgpu_device_fw_loading(adev);
  if (r)
@@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,

  amdgpu_inc_vram_lost(tmp_adev);
  }

-    r = 
amdgpu_gtt_mgr_recover(ttm_manager_type(_adev->mman.bdev, 
TTM_PL_TT));

+    r = amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
  if (r)
  goto out;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c

index c18f16b3be9c..62e14efa61fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -77,10 +77,8 @@ static ssize_t 
amdgpu_mem_info_gtt_used_show(struct device *dev,

  {
  struct drm_device *ddev = dev_get_drvdata(dev);
  struct amdgpu_device *adev = drm_to_adev(ddev);
-    struct ttm_resource_manager *man;

-    man = ttm_manager_type(>mman.bdev, TTM_PL_TT);
-    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
+    return sysfs_emit(buf, "%llu\n", 
amdgpu_gtt_mgr_usage(>mman.gtt_mgr));

  }

  static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -206,14 +204,15 @@ static void amdgpu_gtt_mgr_del(struct 
ttm_resource_manager *man,

  /**
   * amdgpu_gtt_mgr_usage - return usage of GTT domain
   *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
   *
   * Return how many bytes are used in the GTT domain
   */
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_gtt_mgr *mgr)
  {
-    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-    s64 result = man->size - atomic64_read(>available);
+    s64 result;
+
+    result = mgr->manager.size - atomic64_read(>available);

  return (result > 0 ? result : 0) * PAGE_SIZE;
  }
@@ -221,16 +220,15 @@ uint64_t amdgpu_gtt_mgr_usage(struct 
ttm_resource_manager *man)

  /**
   * amdgpu_gtt_mgr_recover - re-init gart
   *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
   *
   * Re-init the gart for each known BO in the GTT.
   */
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
+int amdgpu_gtt_mgr_recover(struct amdgpu_gtt_mgr *mgr)
  {
-    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-    struct amdgpu_device *adev;
  struct amdgpu_gtt_node *node;
  struct drm_mm_node *mm_node;
+    struct amdgpu_device *adev;
  int r = 0;

  adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
@@ -260,6 +258,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,

   struct drm_printer *printer)
  {
  struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
+    struct amdgpu_device *adev = container_of(mgr, typeof(*adev), 
mman.gtt_mgr);


I think that variable is now unused, isn't it?

Apart from that looks good to me now.



Yes it is. I will remove it. Strangely GCC didn't complain.


Nirmoy




Christian.



  spin_lock(>lock);
  drm_mm_print(>mm, printer);
@@ -267,7 +266,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,


  drm_printf(printer, "man size:%llu pages, gtt available:%lld 
pages, usage:%lluMB\n",

 man->size, (u64)atomic64_read(>available),
-   amdgpu_gtt_mgr_usage(man) >> 20);
+   amdgpu_gtt_mgr_usage(mgr) >> 20);
  }

  static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

index d2955ea4a62b..603ce32db5c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c

[PATCH v2 2/3] drm/amdgpu: do not pass ttm_resource_manager to vram_mgr

2021-10-22 Thread Nirmoy Das
Do not allow exported amdgpu_vram_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call vram_mgr functions.

v2: pass adev's vram_mgr instead of adev

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   |  5 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c  | 10 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h  |  8 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c |  5 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 40 
 7 files changed, 31 insertions(+), 46 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7077f21f0021..df818e145d9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -531,9 +531,8 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int 
dma_buf_fd,
 uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(>mman.bdev, TTM_PL_VRAM);

-   return amdgpu_vram_mgr_usage(vram_man);
+   return amdgpu_vram_mgr_usage(>mman.vram_mgr);
 }

 uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 76fe5b71e35d..7e745164a624 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -298,7 +298,6 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
 {
s64 time_us, increment_us;
u64 free_vram, total_vram, used_vram;
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
 * throttling.
 *
@@ -315,7 +314,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
}

total_vram = adev->gmc.real_vram_size - 
atomic64_read(>vram_pin_size);
-   used_vram = amdgpu_vram_mgr_usage(vram_man);
+   used_vram = amdgpu_vram_mgr_usage(>mman.vram_mgr);
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;

spin_lock(>mm_stats.lock);
@@ -362,7 +361,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
if (!amdgpu_gmc_vram_full_visible(>gmc)) {
u64 total_vis_vram = adev->gmc.visible_vram_size;
u64 used_vis_vram =
- amdgpu_vram_mgr_vis_usage(vram_man);
+ amdgpu_vram_mgr_vis_usage(>mman.vram_mgr);

if (used_vis_vram < total_vis_vram) {
u64 free_vis_vram = total_vis_vram - used_vis_vram;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 603ce32db5c5..b426e03ad630 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -672,10 +672,10 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
ui64 = atomic64_read(>num_vram_cpu_page_faults);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VRAM_USAGE:
-   ui64 = amdgpu_vram_mgr_usage(ttm_manager_type(>mman.bdev, 
TTM_PL_VRAM));
+   ui64 = amdgpu_vram_mgr_usage(>mman.vram_mgr);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VIS_VRAM_USAGE:
-   ui64 = 
amdgpu_vram_mgr_vis_usage(ttm_manager_type(>mman.bdev, TTM_PL_VRAM));
+   ui64 = amdgpu_vram_mgr_vis_usage(>mman.vram_mgr);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_GTT_USAGE:
ui64 = amdgpu_gtt_mgr_usage(>mman.gtt_mgr);
@@ -709,8 +709,6 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
}
case AMDGPU_INFO_MEMORY: {
struct drm_amdgpu_memory_info mem;
-   struct ttm_resource_manager *vram_man =
-   ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
struct ttm_resource_manager *gtt_man =
ttm_manager_type(>mman.bdev, TTM_PL_TT);
memset(, 0, sizeof(mem));
@@ -719,7 +717,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
atomic64_read(>vram_pin_size) -
AMDGPU_VM_RESERVED_VRAM;
mem.vram.heap_usage =
-   amdgpu_vram_mgr_usage(vram_man);
+   amdgpu_vram_mgr_usage(>mman.vram_mgr);
 

[PATCH v3 3/3] drm/amdgpu: recover gart table at resume

2021-10-22 Thread Nirmoy Das
Get rid of pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

v3: remove gart recovery from other places
v2: pin gart at amdgpu_gart_table_vram_alloc()

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 80 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  3 +-
 7 files changed, 12 insertions(+), 94 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2b53d86aebac..f0c70e9d37fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3935,16 +3935,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (!adev->in_s0ix)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);

-   /* First evict vram memory */
amdgpu_device_evict_resources(adev);

amdgpu_fence_driver_hw_fini(adev);

amdgpu_device_ip_suspend_phase2(adev);
-   /* This second call to evict device resources is to evict
-* the gart page table using the CPU.
-*/
-   amdgpu_device_evict_resources(adev);

return 0;
 }
@@ -4286,8 +4281,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
goto error;

amdgpu_virt_init_data_exchange(adev);
-   /* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(>mman.gtt_mgr);

r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4604,10 +4597,6 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}

-   r = 
amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
-   if (r)
-   goto out;
-
r = amdgpu_device_fw_loading(tmp_adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index d3e4203f6217..679eec122bb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -116,78 +116,16 @@ int amdgpu_gart_table_vram_alloc(struct amdgpu_device 
*adev)
 {
int r;

-   if (adev->gart.bo == NULL) {
-   struct amdgpu_bo_param bp;
-
-   memset(, 0, sizeof(bp));
-   bp.size = adev->gart.table_size;
-   bp.byte_align = PAGE_SIZE;
-   bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-   bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-   bp.type = ttm_bo_type_kernel;
-   bp.resv = NULL;
-   bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-   r = amdgpu_bo_create(adev, , >gart.bo);
-   if (r) {
-   return r;
-   }
-   }
-   return 0;
-}
-
-/**
- * amdgpu_gart_table_vram_pin - pin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Pin the GART page table in vram so it will not be moved
- * by the memory manager (pcie r4xx, r5xx+).  These asics require the
- * gart table to be in video memory.
- * Returns 0 for success, error for failure.
- */
-int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev)
-{
-   int r;
+   if (adev->gart.bo != NULL)
+   return 0;

-   r = amdgpu_bo_reserve(adev->gart.bo, false);
-   if (unlikely(r != 0))
-   return r;
-   r = amdgpu_bo_pin(adev->gart.bo, AMDGPU_GEM_DOMAIN_VRAM);
+   r = amdgpu_bo_create_kernel(adev,  adev->gart.table_size, PAGE_SIZE,
+   AMDGPU_GEM_DOMAIN_VRAM, >gart.bo,
+   NULL, (void *)>gart.ptr);
if (r) {
-   amdgpu_bo_unreserve(adev->gart.bo);
return r;
}
-   r = amdgpu_bo_kmap(adev->gart.bo, >gart.ptr);
-   if (r)
-   amdgpu_bo_unpin(adev->gart.bo);
-   amdgpu_bo_unreserve(adev->gart.bo);
-   return r;
-}
-
-/**
- * amdgpu_gart_table_vram_unpin - unpin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Unpin the GART page table in vram (pcie r4xx, r5xx+).
- * These asics require the gart table to be in video memory.
- */
-void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev)

[PATCH v3 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr

2021-10-22 Thread Nirmoy Das
Do not allow exported amdgpu_gtt_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call gtt_mgr functions.

v3: upcast mgr from ttm resource manager instead of
getting it from adev.
v2: pass adev's gtt_mgr instead of adev

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 23 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  4 ++--
 4 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..2b53d86aebac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,

amdgpu_virt_init_data_exchange(adev);
/* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev, TTM_PL_TT));
+   amdgpu_gtt_mgr_recover(>mman.gtt_mgr);

r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}

-   r = 
amdgpu_gtt_mgr_recover(ttm_manager_type(_adev->mman.bdev, TTM_PL_TT));
+   r = 
amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
if (r)
goto out;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
index c18f16b3be9c..62e14efa61fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -77,10 +77,8 @@ static ssize_t amdgpu_mem_info_gtt_used_show(struct device 
*dev,
 {
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
-   struct ttm_resource_manager *man;

-   man = ttm_manager_type(>mman.bdev, TTM_PL_TT);
-   return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
+   return sysfs_emit(buf, "%llu\n", 
amdgpu_gtt_mgr_usage(>mman.gtt_mgr));
 }

 static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -206,14 +204,15 @@ static void amdgpu_gtt_mgr_del(struct 
ttm_resource_manager *man,
 /**
  * amdgpu_gtt_mgr_usage - return usage of GTT domain
  *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
  *
  * Return how many bytes are used in the GTT domain
  */
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_gtt_mgr *mgr)
 {
-   struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-   s64 result = man->size - atomic64_read(&mgr->available);
+   s64 result;
+
+   result = mgr->manager.size - atomic64_read(>available);

return (result > 0 ? result : 0) * PAGE_SIZE;
 }
@@ -221,16 +220,15 @@ uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager 
*man)
 /**
  * amdgpu_gtt_mgr_recover - re-init gart
  *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
  *
  * Re-init the gart for each known BO in the GTT.
  */
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
+int amdgpu_gtt_mgr_recover(struct amdgpu_gtt_mgr *mgr)
 {
-   struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-   struct amdgpu_device *adev;
struct amdgpu_gtt_node *node;
struct drm_mm_node *mm_node;
+   struct amdgpu_device *adev;
int r = 0;

adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
@@ -260,6 +258,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,
 struct drm_printer *printer)
 {
struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
+   struct amdgpu_device *adev = container_of(mgr, typeof(*adev), 
mman.gtt_mgr);

spin_lock(>lock);
drm_mm_print(>mm, printer);
@@ -267,7 +266,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,

drm_printf(printer, "man size:%llu pages, gtt available:%lld pages, 
usage:%lluMB\n",
   man->size, (u64)atomic64_read(>available),
-  amdgpu_gtt_mgr_usage(man) >> 20);
+  amdgpu_gtt_mgr_usage(mgr) >> 20);
 }

 static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index d2955ea4a62b..603ce32db5c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_fi

[PATCH v2 2/3] drm/amdgpu: do not pass ttm_resource_manager to vram_mgr

2021-10-21 Thread Nirmoy Das
Do not allow exported amdgpu_vram_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call vram_mgr functions.

v2: pass adev's vram_mgr instead of adev
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   |  5 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c  | 10 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  6 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h  |  8 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c |  5 ++-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 38 
 7 files changed, 30 insertions(+), 45 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7077f21f0021..df818e145d9a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -531,9 +531,8 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int 
dma_buf_fd,
 uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(>mman.bdev, TTM_PL_VRAM);

-   return amdgpu_vram_mgr_usage(vram_man);
+   return amdgpu_vram_mgr_usage(>mman.vram_mgr);
 }

 uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 76fe5b71e35d..7e745164a624 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -298,7 +298,6 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
 {
s64 time_us, increment_us;
u64 free_vram, total_vram, used_vram;
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
 * throttling.
 *
@@ -315,7 +314,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
}

total_vram = adev->gmc.real_vram_size - 
atomic64_read(>vram_pin_size);
-   used_vram = amdgpu_vram_mgr_usage(vram_man);
+   used_vram = amdgpu_vram_mgr_usage(>mman.vram_mgr);
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;

spin_lock(>mm_stats.lock);
@@ -362,7 +361,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
if (!amdgpu_gmc_vram_full_visible(>gmc)) {
u64 total_vis_vram = adev->gmc.visible_vram_size;
u64 used_vis_vram =
- amdgpu_vram_mgr_vis_usage(vram_man);
+ amdgpu_vram_mgr_vis_usage(>mman.vram_mgr);

if (used_vis_vram < total_vis_vram) {
u64 free_vis_vram = total_vis_vram - used_vis_vram;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index 603ce32db5c5..b426e03ad630 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -672,10 +672,10 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
ui64 = atomic64_read(>num_vram_cpu_page_faults);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VRAM_USAGE:
-   ui64 = amdgpu_vram_mgr_usage(ttm_manager_type(>mman.bdev, 
TTM_PL_VRAM));
+   ui64 = amdgpu_vram_mgr_usage(>mman.vram_mgr);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VIS_VRAM_USAGE:
-   ui64 = 
amdgpu_vram_mgr_vis_usage(ttm_manager_type(>mman.bdev, TTM_PL_VRAM));
+   ui64 = amdgpu_vram_mgr_vis_usage(>mman.vram_mgr);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_GTT_USAGE:
ui64 = amdgpu_gtt_mgr_usage(>mman.gtt_mgr);
@@ -709,8 +709,6 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
}
case AMDGPU_INFO_MEMORY: {
struct drm_amdgpu_memory_info mem;
-   struct ttm_resource_manager *vram_man =
-   ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
struct ttm_resource_manager *gtt_man =
ttm_manager_type(>mman.bdev, TTM_PL_TT);
memset(, 0, sizeof(mem));
@@ -719,7 +717,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
atomic64_read(>vram_pin_size) -
AMDGPU_VM_RESERVED_VRAM;
mem.vram.heap_usage =
-   amdgpu_vram_mgr_usage(vram_man);
+   amdgpu_vram_mgr_usage(>mman.vram_mgr);
 

[PATCH v3 3/3] drm/amdgpu: recover gart table at resume

2021-10-21 Thread Nirmoy Das
Get rid of pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

v3: remove gart recovery from other places
v2: pin gart at amdgpu_gart_table_vram_alloc()
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 80 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v6_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  |  3 +-
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  3 +-
 7 files changed, 12 insertions(+), 94 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 2b53d86aebac..f0c70e9d37fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3935,16 +3935,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (!adev->in_s0ix)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);

-   /* First evict vram memory */
amdgpu_device_evict_resources(adev);

amdgpu_fence_driver_hw_fini(adev);

amdgpu_device_ip_suspend_phase2(adev);
-   /* This second call to evict device resources is to evict
-* the gart page table using the CPU.
-*/
-   amdgpu_device_evict_resources(adev);

return 0;
 }
@@ -4286,8 +4281,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
goto error;

amdgpu_virt_init_data_exchange(adev);
-   /* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(>mman.gtt_mgr);

r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4604,10 +4597,6 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}

-   r = 
amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
-   if (r)
-   goto out;
-
r = amdgpu_device_fw_loading(tmp_adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index d3e4203f6217..679eec122bb5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -116,78 +116,16 @@ int amdgpu_gart_table_vram_alloc(struct amdgpu_device 
*adev)
 {
int r;

-   if (adev->gart.bo == NULL) {
-   struct amdgpu_bo_param bp;
-
-   memset(, 0, sizeof(bp));
-   bp.size = adev->gart.table_size;
-   bp.byte_align = PAGE_SIZE;
-   bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-   bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-   bp.type = ttm_bo_type_kernel;
-   bp.resv = NULL;
-   bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-   r = amdgpu_bo_create(adev, , >gart.bo);
-   if (r) {
-   return r;
-   }
-   }
-   return 0;
-}
-
-/**
- * amdgpu_gart_table_vram_pin - pin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Pin the GART page table in vram so it will not be moved
- * by the memory manager (pcie r4xx, r5xx+).  These asics require the
- * gart table to be in video memory.
- * Returns 0 for success, error for failure.
- */
-int amdgpu_gart_table_vram_pin(struct amdgpu_device *adev)
-{
-   int r;
+   if (adev->gart.bo != NULL)
+   return 0;

-   r = amdgpu_bo_reserve(adev->gart.bo, false);
-   if (unlikely(r != 0))
-   return r;
-   r = amdgpu_bo_pin(adev->gart.bo, AMDGPU_GEM_DOMAIN_VRAM);
+   r = amdgpu_bo_create_kernel(adev,  adev->gart.table_size, PAGE_SIZE,
+   AMDGPU_GEM_DOMAIN_VRAM, >gart.bo,
+   NULL, (void *)>gart.ptr);
if (r) {
-   amdgpu_bo_unreserve(adev->gart.bo);
return r;
}
-   r = amdgpu_bo_kmap(adev->gart.bo, >gart.ptr);
-   if (r)
-   amdgpu_bo_unpin(adev->gart.bo);
-   amdgpu_bo_unreserve(adev->gart.bo);
-   return r;
-}
-
-/**
- * amdgpu_gart_table_vram_unpin - unpin gart page table in vram
- *
- * @adev: amdgpu_device pointer
- *
- * Unpin the GART page table in vram (pcie r4xx, r5xx+).
- * These asics require the gart table to be in video memory.
- */
-void amdgpu_gart_table_vram_unpin(struct amdgpu_device *adev)

[PATCH v2 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr

2021-10-21 Thread Nirmoy Das
Do not allow exported amdgpu_gtt_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call gtt_mgr functions.

v2: pass adev's gtt_mgr instead of adev

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 23 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  4 ++--
 4 files changed, 17 insertions(+), 18 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..2b53d86aebac 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,

amdgpu_virt_init_data_exchange(adev);
/* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev, TTM_PL_TT));
+   amdgpu_gtt_mgr_recover(>mman.gtt_mgr);

r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}

-   r = 
amdgpu_gtt_mgr_recover(ttm_manager_type(_adev->mman.bdev, TTM_PL_TT));
+   r = 
amdgpu_gtt_mgr_recover(_adev->mman.gtt_mgr);
if (r)
goto out;

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
index c18f16b3be9c..e429f2df73be 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -77,10 +77,8 @@ static ssize_t amdgpu_mem_info_gtt_used_show(struct device 
*dev,
 {
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
-   struct ttm_resource_manager *man;

-   man = ttm_manager_type(>mman.bdev, TTM_PL_TT);
-   return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
+   return sysfs_emit(buf, "%llu\n", 
amdgpu_gtt_mgr_usage(>mman.gtt_mgr));
 }

 static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -206,14 +204,15 @@ static void amdgpu_gtt_mgr_del(struct 
ttm_resource_manager *man,
 /**
  * amdgpu_gtt_mgr_usage - return usage of GTT domain
  *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
  *
  * Return how many bytes are used in the GTT domain
  */
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_gtt_mgr *mgr)
 {
-   struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-   s64 result = man->size - atomic64_read(>available);
+   s64 result;
+
+   result = mgr->manager.size - atomic64_read(>available);

return (result > 0 ? result : 0) * PAGE_SIZE;
 }
@@ -221,16 +220,15 @@ uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager 
*man)
 /**
  * amdgpu_gtt_mgr_recover - re-init gart
  *
- * @man: TTM memory type manager
+ * @mgr: amdgpu_gtt_mgr pointer
  *
  * Re-init the gart for each known BO in the GTT.
  */
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
+int amdgpu_gtt_mgr_recover(struct amdgpu_gtt_mgr *mgr)
 {
-   struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-   struct amdgpu_device *adev;
struct amdgpu_gtt_node *node;
struct drm_mm_node *mm_node;
+   struct amdgpu_device *adev;
int r = 0;

adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
@@ -260,6 +258,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,
 struct drm_printer *printer)
 {
struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
+   struct amdgpu_device *adev = container_of(mgr, typeof(*adev), 
mman.gtt_mgr);

spin_lock(>lock);
drm_mm_print(>mm, printer);
@@ -267,7 +266,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,

drm_printf(printer, "man size:%llu pages, gtt available:%lld pages, 
usage:%lluMB\n",
   man->size, (u64)atomic64_read(>available),
-  amdgpu_gtt_mgr_usage(man) >> 20);
+  amdgpu_gtt_mgr_usage(>mman.gtt_mgr) >> 20);
 }

 static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func = {
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index d2955ea4a62b..603ce32db5c5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -678,7 +678,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
ui64 = 
amdgpu_vram_mgr_vis_usage(

Re: [PATCH 1/1] drm/amdgpu: fix BO leak after successful move test

2021-10-20 Thread Das, Nirmoy



On 10/20/2021 1:51 PM, Christian König wrote:

Am 20.10.21 um 13:50 schrieb Christian König:



Am 13.10.21 um 17:09 schrieb Nirmoy Das:

GTT BO cleanup code is with in the test for loop and
we would skip cleaning up GTT BO on success.

Reported-by: zhang 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_test.c | 25 


  1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c

index 909d830b513e..5fe7ff680c29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
@@ -35,6 +35,7 @@ static void amdgpu_do_test_moves(struct 
amdgpu_device *adev)

  struct amdgpu_bo *vram_obj = NULL;
  struct amdgpu_bo **gtt_obj = NULL;
  struct amdgpu_bo_param bp;
+    struct dma_fence *fence = NULL;
  uint64_t gart_addr, vram_addr;
  unsigned n, size;
  int i, r;
@@ -82,7 +83,6 @@ static void amdgpu_do_test_moves(struct 
amdgpu_device *adev)

  void *gtt_map, *vram_map;
  void **gart_start, **gart_end;
  void **vram_start, **vram_end;
-    struct dma_fence *fence = NULL;
    bp.domain = AMDGPU_GEM_DOMAIN_GTT;
  r = amdgpu_bo_create(adev, , gtt_obj + i);
@@ -212,24 +212,23 @@ static void amdgpu_do_test_moves(struct 
amdgpu_device *adev)
    DRM_INFO("Tested GTT->VRAM and VRAM->GTT copy for GTT 
offset 0x%llx\n",

   gart_addr - adev->gmc.gart_start);
-    continue;
+    }
  +    --i;
  out_lclean_unpin:
-    amdgpu_bo_unpin(gtt_obj[i]);
+    amdgpu_bo_unpin(gtt_obj[i]);
  out_lclean_unres:
-    amdgpu_bo_unreserve(gtt_obj[i]);
+    amdgpu_bo_unreserve(gtt_obj[i]);
  out_lclean_unref:
-    amdgpu_bo_unref(&gtt_obj[i]);
+    amdgpu_bo_unref(&gtt_obj[i]);
  out_lclean:
-    for (--i; i >= 0; --i) {
-    amdgpu_bo_unpin(gtt_obj[i]);
-    amdgpu_bo_unreserve(gtt_obj[i]);
-    amdgpu_bo_unref(_obj[i]);
-    }
-    if (fence)
-    dma_fence_put(fence);
-    break;
+    for (--i; i >= 0; --i) {


The usual idiom for cleanups like that is "while (i--)..." because 
that also works with an unsigned i.


Apart from that looks good to me.


But I'm not sure that we would want to keep the in kernel tests around 
anyway.


We now have my amdgpu_stress tool to test memory bandwidth and mesa 
has an option for that for a long time as well.



Shall I then remove amdgpu_test.c ?


Nirmoy




Christian.



Christian.


+    amdgpu_bo_unpin(gtt_obj[i]);
+    amdgpu_bo_unreserve(gtt_obj[i]);
+    amdgpu_bo_unref(_obj[i]);
  }
+    if (fence)
+    dma_fence_put(fence);
    amdgpu_bo_unpin(vram_obj);
  out_unres:






Re: [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr

2021-10-20 Thread Das, Nirmoy



On 10/20/2021 12:49 PM, Christian König wrote:

Am 20.10.21 um 11:19 schrieb Lazar, Lijo:



On 10/20/2021 2:18 PM, Das, Nirmoy wrote:


On 10/20/2021 8:49 AM, Christian König wrote:

Am 19.10.21 um 20:14 schrieb Nirmoy Das:

Do not allow exported amdgpu_gtt_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call gtt_mgr functions.


That's a rather bad idea I think.

The GTT and VRAM manager work on their respective objects and not 
on the adev directly.



What is bothering me is : it is obvious that  the 
amdgpu_gtt_mgr_usage() for example should only calculate


usages for TTM_PL_TT type resource manager, why to pass that 
explicitly. I am trying to leverage the fact that


we only have one gtt/vram manager for a adev and the functions that 
I changed  work on whole gtt/vram manager


as a unit.



Don't know about the functional aspects. From a sofware perspective, 
amdgpu_gtt_mgr_*() operating on struct amdgpu_gtt_mgr *mgr seems more 
logical.


What we could do is to pass in amdgpu_gtt_mgr instead of 
ttm_resource_manager and then use >mman.gtt_mgr.



Sounds good, I will try this way then.


Regards,

Nirmoy



Regards,
Christian.




Thanks,
Lijo



Regards,

Nirmoy




Christian.



Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 31 
-

  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  4 +--
  4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 41ce86244144..5807df52031c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,

    amdgpu_virt_init_data_exchange(adev);
  /* we need recover gart prior to run SMC/CP/SDMA resume */
- amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, 
TTM_PL_TT));

+    amdgpu_gtt_mgr_recover(adev);
    r = amdgpu_device_fw_loading(adev);
  if (r)
@@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,

  amdgpu_inc_vram_lost(tmp_adev);
  }
  -    r = 
amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, 
TTM_PL_TT));

+    r = amdgpu_gtt_mgr_recover(tmp_adev);
  if (r)
  goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c

index c18f16b3be9c..5e41f8ef743a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -77,10 +77,8 @@ static ssize_t 
amdgpu_mem_info_gtt_used_show(struct device *dev,

  {
  struct drm_device *ddev = dev_get_drvdata(dev);
  struct amdgpu_device *adev = drm_to_adev(ddev);
-    struct ttm_resource_manager *man;
  -    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
-    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
+    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(adev));
  }
    static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -206,14 +204,19 @@ static void amdgpu_gtt_mgr_del(struct 
ttm_resource_manager *man,

  /**
   * amdgpu_gtt_mgr_usage - return usage of GTT domain
   *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
   *
   * Return how many bytes are used in the GTT domain
   */
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev)
  {
-    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-    s64 result = man->size - atomic64_read(&mgr->available);
+    struct ttm_resource_manager *man;
+    struct amdgpu_gtt_mgr *mgr;
+    s64 result;
+
+    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
+    mgr = to_gtt_mgr(man);
+    result = man->size - atomic64_read(&mgr->available);
    return (result > 0 ? result : 0) * PAGE_SIZE;
  }
@@ -221,19 +224,20 @@ uint64_t amdgpu_gtt_mgr_usage(struct 
ttm_resource_manager *man)

  /**
   * amdgpu_gtt_mgr_recover - re-init gart
   *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
   *
   * Re-init the gart for each known BO in the GTT.
   */
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
+int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev)
  {
-    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-    struct amdgpu_device *adev;
+    struct ttm_resource_manager *man;
+    struct amdgpu_gtt_mgr *mgr;
  struct amdgpu_gtt_node *node;
  struct drm_mm_node *mm_node;
  int r = 0;
  -    adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
+    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
+    mgr = to_gtt_mgr(man);
  spin_lock(&mgr->lock);
 

Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume

2021-10-20 Thread Das, Nirmoy



On 10/20/2021 12:51 PM, Christian König wrote:



Am 20.10.21 um 12:21 schrieb Das, Nirmoy:


On 10/20/2021 12:15 PM, Lazar, Lijo wrote:



On 10/20/2021 3:42 PM, Das, Nirmoy wrote:


On 10/20/2021 12:03 PM, Lazar, Lijo wrote:



On 10/20/2021 3:23 PM, Das, Nirmoy wrote:


On 10/20/2021 11:11 AM, Lazar, Lijo wrote:



On 10/19/2021 11:44 PM, Nirmoy Das wrote:

Get rid off pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 
--

  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  9 ++---
  drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  | 10 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 ++---
  6 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 5807df52031c..f69e613805db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct 
drm_device *dev, bool fbcon)

  amdgpu_fence_driver_hw_fini(adev);

  amdgpu_device_ip_suspend_phase2(adev);
-    /* This second call to evict device resources is to evict
- * the gart page table using the CPU.
- */
-    amdgpu_device_evict_resources(adev);

  return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c

index d3e4203f6217..97a9f61fa106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
amdgpu_device *adev)

   *
   * @adev: amdgpu_device pointer
   *
- * Allocate video memory for GART page table
+ * Allocate and pin video memory for GART page table
   * (pcie r4xx, r5xx+).  These asics require the
   * gart table to be in video memory.
   * Returns 0 for success, error for failure.
   */
  int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
  {
+    struct amdgpu_bo_param bp;
  int r;

-    if (adev->gart.bo == NULL) {
-    struct amdgpu_bo_param bp;
-
-    memset(&bp, 0, sizeof(bp));
-    bp.size = adev->gart.table_size;
-    bp.byte_align = PAGE_SIZE;
-    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-    bp.type = ttm_bo_type_kernel;
-    bp.resv = NULL;
-    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
-    if (r) {
-    return r;
-    }
-    }
+    if (adev->gart.bo != NULL)
+    return 0;
+
+    memset(&bp, 0, sizeof(bp));
+    bp.size = adev->gart.table_size;
+    bp.byte_align = PAGE_SIZE;
+    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
+    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
+    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+    bp.type = ttm_bo_type_kernel;
+    bp.resv = NULL;
+    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
+
+    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
+    if (r)
+    return r;
+
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;
+
  return 0;
  }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

index 3ec5ff5a6dbe..75d584e1b0e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
amdgpu_device *adev)

  return -EINVAL;
  }

-    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+    if (adev->in_suspend) {
+    r = amdgpu_gtt_mgr_recover(adev);


When the existing usage of this function is checked, this is 
called during reset recovery after ip resume phase1. Can't the 
same thing be done in ip_resume() to place this after phase1 
resume instead of repeating in every IP version?



Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
gmc_v10_0_gart_enable() seems to be correct place  to do this


gart specific work.



I see. In that case probably the patch needs to change other call 
places also as this step is already taken care in gart enable.



Do you mean amdgpu_do_asic_reset() ?



Yes, and saw it called in one more place related to sriov reset 
(didn't track the sriov reset path though).



True, hmm looks like this patch going to need multiple tested-by tags 
for gfx6,7 and sriov. I only have gfx8,9,10.


You also need to test this on APUs as well, when it works won 
Raven/gfx9 I'm pretty sure it will work on other generations as well 
(except for typos of course).



I have a r

Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume

2021-10-20 Thread Das, Nirmoy



On 10/20/2021 12:15 PM, Lazar, Lijo wrote:



On 10/20/2021 3:42 PM, Das, Nirmoy wrote:


On 10/20/2021 12:03 PM, Lazar, Lijo wrote:



On 10/20/2021 3:23 PM, Das, Nirmoy wrote:


On 10/20/2021 11:11 AM, Lazar, Lijo wrote:



On 10/19/2021 11:44 PM, Nirmoy Das wrote:

Get rid off pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 
--

  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  9 ++---
  drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  | 10 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 ++---
  6 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 5807df52031c..f69e613805db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct 
drm_device *dev, bool fbcon)

  amdgpu_fence_driver_hw_fini(adev);

  amdgpu_device_ip_suspend_phase2(adev);
-    /* This second call to evict device resources is to evict
- * the gart page table using the CPU.
- */
-    amdgpu_device_evict_resources(adev);

  return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c

index d3e4203f6217..97a9f61fa106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
amdgpu_device *adev)

   *
   * @adev: amdgpu_device pointer
   *
- * Allocate video memory for GART page table
+ * Allocate and pin video memory for GART page table
   * (pcie r4xx, r5xx+).  These asics require the
   * gart table to be in video memory.
   * Returns 0 for success, error for failure.
   */
  int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
  {
+    struct amdgpu_bo_param bp;
  int r;

-    if (adev->gart.bo == NULL) {
-    struct amdgpu_bo_param bp;
-
-    memset(&bp, 0, sizeof(bp));
-    bp.size = adev->gart.table_size;
-    bp.byte_align = PAGE_SIZE;
-    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-    bp.type = ttm_bo_type_kernel;
-    bp.resv = NULL;
-    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
-    if (r) {
-    return r;
-    }
-    }
+    if (adev->gart.bo != NULL)
+    return 0;
+
+    memset(&bp, 0, sizeof(bp));
+    bp.size = adev->gart.table_size;
+    bp.byte_align = PAGE_SIZE;
+    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
+    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
+    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+    bp.type = ttm_bo_type_kernel;
+    bp.resv = NULL;
+    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
+
+    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
+    if (r)
+    return r;
+
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;
+
  return 0;
  }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

index 3ec5ff5a6dbe..75d584e1b0e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
amdgpu_device *adev)

  return -EINVAL;
  }

-    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+    if (adev->in_suspend) {
+    r = amdgpu_gtt_mgr_recover(adev);


When the existing usage of this function is checked, this is 
called during reset recovery after ip resume phase1. Can't the 
same thing be done in ip_resume() to place this after phase1 
resume instead of repeating in every IP version?



Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
gmc_v10_0_gart_enable() seems to be correct place  to do this


gart specific work.



I see. In that case probably the patch needs to change other call 
places also as this step is already taken care in gart enable.



Do you mean amdgpu_do_asic_reset() ?



Yes, and saw it called in one more place related to sriov reset 
(didn't track the sriov reset path though).



True, hmm looks like this patch going to need multiple tested-by tags 
for gfx6,7 and sriov. I only have gfx8,9,10.



Regards,

Nirmoy




Thanks,
Lijo



Nirmoy




Thanks,
Lijo



Regards,

Nirmoy





Thanks,
Lijo


+    if (r)
+    return r;
+    }

  r = adev->gfxhub.funcs->gart_enable(adev);
  if (r)
@@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_di

Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume

2021-10-20 Thread Das, Nirmoy



On 10/20/2021 12:03 PM, Lazar, Lijo wrote:



On 10/20/2021 3:23 PM, Das, Nirmoy wrote:


On 10/20/2021 11:11 AM, Lazar, Lijo wrote:



On 10/19/2021 11:44 PM, Nirmoy Das wrote:

Get rid off pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 
--

  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  9 ++---
  drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  | 10 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 ++---
  6 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 5807df52031c..f69e613805db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device 
*dev, bool fbcon)

  amdgpu_fence_driver_hw_fini(adev);

  amdgpu_device_ip_suspend_phase2(adev);
-    /* This second call to evict device resources is to evict
- * the gart page table using the CPU.
- */
-    amdgpu_device_evict_resources(adev);

  return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c

index d3e4203f6217..97a9f61fa106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
amdgpu_device *adev)

   *
   * @adev: amdgpu_device pointer
   *
- * Allocate video memory for GART page table
+ * Allocate and pin video memory for GART page table
   * (pcie r4xx, r5xx+).  These asics require the
   * gart table to be in video memory.
   * Returns 0 for success, error for failure.
   */
  int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
  {
+    struct amdgpu_bo_param bp;
  int r;

-    if (adev->gart.bo == NULL) {
-    struct amdgpu_bo_param bp;
-
-    memset(&bp, 0, sizeof(bp));
-    bp.size = adev->gart.table_size;
-    bp.byte_align = PAGE_SIZE;
-    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-    bp.type = ttm_bo_type_kernel;
-    bp.resv = NULL;
-    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
-    if (r) {
-    return r;
-    }
-    }
+    if (adev->gart.bo != NULL)
+    return 0;
+
+    memset(&bp, 0, sizeof(bp));
+    bp.size = adev->gart.table_size;
+    bp.byte_align = PAGE_SIZE;
+    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
+    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
+    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+    bp.type = ttm_bo_type_kernel;
+    bp.resv = NULL;
+    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
+
+    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
+    if (r)
+    return r;
+
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;
+
  return 0;
  }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

index 3ec5ff5a6dbe..75d584e1b0e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
amdgpu_device *adev)

  return -EINVAL;
  }

-    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+    if (adev->in_suspend) {
+    r = amdgpu_gtt_mgr_recover(adev);


When the existing usage of this function is checked, this is called 
during reset recovery after ip resume phase1. Can't the same thing 
be done in ip_resume() to place this after phase1 resume instead of 
repeating in every IP version?



Placing amdgpu_gtt_mgr_recover() after phase1 generally works but 
gmc_v10_0_gart_enable() seems to be correct  place  to do this


gart specific work.



I see. In that case probably the patch needs to change other call 
places also as this step is already taken care in gart enable.



Do you mean amdgpu_do_asic_reset() ?


Nirmoy




Thanks,
Lijo



Regards,

Nirmoy





Thanks,
Lijo


+    if (r)
+    return r;
+    }

  r = adev->gfxhub.funcs->gart_enable(adev);
  if (r)
@@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
amdgpu_device *adev)

  {
  adev->gfxhub.funcs->gart_disable(adev);
  adev->mmhub.funcs->gart_disable(adev);
-    amdgpu_gart_table_vram_unpin(adev);
  }

  static int gmc_v10_0_hw_fini(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c

index 0a50fdaced7e..02e90d9443c1 

Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume

2021-10-20 Thread Das, Nirmoy



On 10/20/2021 11:11 AM, Lazar, Lijo wrote:



On 10/19/2021 11:44 PM, Nirmoy Das wrote:

Get rid off pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 --
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  9 ++---
  drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  | 10 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 ++---
  6 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 5807df52031c..f69e613805db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device 
*dev, bool fbcon)

  amdgpu_fence_driver_hw_fini(adev);

  amdgpu_device_ip_suspend_phase2(adev);
-    /* This second call to evict device resources is to evict
- * the gart page table using the CPU.
- */
-    amdgpu_device_evict_resources(adev);

  return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c

index d3e4203f6217..97a9f61fa106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
amdgpu_device *adev)

   *
   * @adev: amdgpu_device pointer
   *
- * Allocate video memory for GART page table
+ * Allocate and pin video memory for GART page table
   * (pcie r4xx, r5xx+).  These asics require the
   * gart table to be in video memory.
   * Returns 0 for success, error for failure.
   */
  int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
  {
+    struct amdgpu_bo_param bp;
  int r;

-    if (adev->gart.bo == NULL) {
-    struct amdgpu_bo_param bp;
-
-    memset(&bp, 0, sizeof(bp));
-    bp.size = adev->gart.table_size;
-    bp.byte_align = PAGE_SIZE;
-    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-    bp.type = ttm_bo_type_kernel;
-    bp.resv = NULL;
-    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
-    if (r) {
-    return r;
-    }
-    }
+    if (adev->gart.bo != NULL)
+    return 0;
+
+    memset(&bp, 0, sizeof(bp));
+    bp.size = adev->gart.table_size;
+    bp.byte_align = PAGE_SIZE;
+    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
+    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
+    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+    bp.type = ttm_bo_type_kernel;
+    bp.resv = NULL;
+    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
+
+    r = amdgpu_bo_create(adev, &bp, &adev->gart.bo);
+    if (r)
+    return r;
+
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;
+
  return 0;
  }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

index 3ec5ff5a6dbe..75d584e1b0e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
amdgpu_device *adev)

  return -EINVAL;
  }

-    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+    if (adev->in_suspend) {
+    r = amdgpu_gtt_mgr_recover(adev);


When the existing usage of this function is checked, this is called 
during reset recovery after ip resume phase1. Can't the same thing be 
done in ip_resume() to place this after phase1 resume instead of 
repeating in every IP version?



Placing amdgpu_gtt_mgr_recover() after phase1 generally works but  
gmc_v10_0_gart_enable() seems to be correct  place  to do this


gart specific work.


Regards,

Nirmoy





Thanks,
Lijo


+    if (r)
+    return r;
+    }

  r = adev->gfxhub.funcs->gart_enable(adev);
  if (r)
@@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
amdgpu_device *adev)

  {
  adev->gfxhub.funcs->gart_disable(adev);
  adev->mmhub.funcs->gart_disable(adev);
-    amdgpu_gart_table_vram_unpin(adev);
  }

  static int gmc_v10_0_hw_fini(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c

index 0a50fdaced7e..02e90d9443c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
amdgpu_device *adev)

  dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
  return -EINVAL;
  }
-

Re: [PATCH 1/1] drm/amdgpu: fix BO leak after successful move test

2021-10-20 Thread Das, Nirmoy

ping.

On 10/13/2021 5:09 PM, Nirmoy Das wrote:

GTT BO cleanup code is with in the test for loop and
we would skip cleaning up GTT BO on success.

Reported-by: zhang 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_test.c | 25 
  1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
index 909d830b513e..5fe7ff680c29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
@@ -35,6 +35,7 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
struct amdgpu_bo *vram_obj = NULL;
struct amdgpu_bo **gtt_obj = NULL;
struct amdgpu_bo_param bp;
+   struct dma_fence *fence = NULL;
uint64_t gart_addr, vram_addr;
unsigned n, size;
int i, r;
@@ -82,7 +83,6 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
void *gtt_map, *vram_map;
void **gart_start, **gart_end;
void **vram_start, **vram_end;
-   struct dma_fence *fence = NULL;
  
  		bp.domain = AMDGPU_GEM_DOMAIN_GTT;

r = amdgpu_bo_create(adev, &bp, gtt_obj + i);
@@ -212,24 +212,23 @@ static void amdgpu_do_test_moves(struct amdgpu_device 
*adev)
  
  		DRM_INFO("Tested GTT->VRAM and VRAM->GTT copy for GTT offset 0x%llx\n",

 gart_addr - adev->gmc.gart_start);
-   continue;
+   }
  
+	--i;

  out_lclean_unpin:
-   amdgpu_bo_unpin(gtt_obj[i]);
+   amdgpu_bo_unpin(gtt_obj[i]);
  out_lclean_unres:
-   amdgpu_bo_unreserve(gtt_obj[i]);
+   amdgpu_bo_unreserve(gtt_obj[i]);
  out_lclean_unref:
-   amdgpu_bo_unref(&gtt_obj[i]);
+   amdgpu_bo_unref(&gtt_obj[i]);
  out_lclean:
-   for (--i; i >= 0; --i) {
-   amdgpu_bo_unpin(gtt_obj[i]);
-   amdgpu_bo_unreserve(gtt_obj[i]);
-   amdgpu_bo_unref(&gtt_obj[i]);
-   }
-   if (fence)
-   dma_fence_put(fence);
-   break;
+   for (--i; i >= 0; --i) {
+   amdgpu_bo_unpin(gtt_obj[i]);
+   amdgpu_bo_unreserve(gtt_obj[i]);
+   amdgpu_bo_unref(&gtt_obj[i]);
}
+   if (fence)
+   dma_fence_put(fence);
  
  	amdgpu_bo_unpin(vram_obj);

  out_unres:


Re: [PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr

2021-10-20 Thread Das, Nirmoy



On 10/20/2021 8:49 AM, Christian König wrote:

Am 19.10.21 um 20:14 schrieb Nirmoy Das:

Do not allow exported amdgpu_gtt_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other module to call a ttm function just to
eventually call gtt_mgr functions.


That's a rather bad idea I think.

The GTT and VRAM manager work on their respective objects and not on 
the adev directly.



What is bothering me is : it is obvious that  the amdgpu_gtt_mgr_usage() 
for example should only calculate


usages for TTM_PL_TT type resource manager, why to pass that explicitly. 
I am trying to leverage the fact that


we only have one gtt/vram manager for a adev and the functions that I 
changed  work on whole gtt/vram manager


as a unit.


Regards,

Nirmoy




Christian.



Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 31 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  4 +--
  4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 41ce86244144..5807df52031c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct 
amdgpu_device *adev,

    amdgpu_virt_init_data_exchange(adev);
  /* we need recover gart prior to run SMC/CP/SDMA resume */
- amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
+    amdgpu_gtt_mgr_recover(adev);
    r = amdgpu_device_fw_loading(adev);
  if (r)
@@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,

  amdgpu_inc_vram_lost(tmp_adev);
  }
  -    r = 
amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, 
TTM_PL_TT));

+    r = amdgpu_gtt_mgr_recover(tmp_adev);
  if (r)
  goto out;
  diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c

index c18f16b3be9c..5e41f8ef743a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -77,10 +77,8 @@ static ssize_t 
amdgpu_mem_info_gtt_used_show(struct device *dev,

  {
  struct drm_device *ddev = dev_get_drvdata(dev);
  struct amdgpu_device *adev = drm_to_adev(ddev);
-    struct ttm_resource_manager *man;
  -    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
-    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
+    return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(adev));
  }
    static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -206,14 +204,19 @@ static void amdgpu_gtt_mgr_del(struct 
ttm_resource_manager *man,

  /**
   * amdgpu_gtt_mgr_usage - return usage of GTT domain
   *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
   *
   * Return how many bytes are used in the GTT domain
   */
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev)
  {
-    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-    s64 result = man->size - atomic64_read(&mgr->available);
+    struct ttm_resource_manager *man;
+    struct amdgpu_gtt_mgr *mgr;
+    s64 result;
+
+    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
+    mgr = to_gtt_mgr(man);
+    result = man->size - atomic64_read(&mgr->available);
    return (result > 0 ? result : 0) * PAGE_SIZE;
  }
@@ -221,19 +224,20 @@ uint64_t amdgpu_gtt_mgr_usage(struct 
ttm_resource_manager *man)

  /**
   * amdgpu_gtt_mgr_recover - re-init gart
   *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
   *
   * Re-init the gart for each known BO in the GTT.
   */
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
+int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev)
  {
-    struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-    struct amdgpu_device *adev;
+    struct ttm_resource_manager *man;
+    struct amdgpu_gtt_mgr *mgr;
  struct amdgpu_gtt_node *node;
  struct drm_mm_node *mm_node;
  int r = 0;
  -    adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
+    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
+    mgr = to_gtt_mgr(man);
  spin_lock(&mgr->lock);
  drm_mm_for_each_node(mm_node, &mgr->mm) {
  node = container_of(mm_node, typeof(*node), base.mm_nodes[0]);
@@ -260,6 +264,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,

   struct drm_printer *printer)
  {
  struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
+    struct amdgpu_device *adev = container_of(mgr, typeof(*adev), 
mman.gtt_mgr);

    spin_lock(&mgr->lock);
  drm_mm_print(&mgr->mm, printer);
@@ -267,7 +272,7 @@ static void amdgpu_gtt_mgr_debug

Re: [PATCH v2 3/3] drm/amdgpu: recover gart table at resume

2021-10-20 Thread Das, Nirmoy



On 10/20/2021 8:52 AM, Christian König wrote:

Am 19.10.21 um 20:14 schrieb Nirmoy Das:

Get rid off pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 --
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  9 ++---
  drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  | 10 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++---
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 ++---
  6 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 5807df52031c..f69e613805db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device 
*dev, bool fbcon)

  amdgpu_fence_driver_hw_fini(adev);

  amdgpu_device_ip_suspend_phase2(adev);
-    /* This second call to evict device resources is to evict
- * the gart page table using the CPU.
- */
-    amdgpu_device_evict_resources(adev);

  return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c

index d3e4203f6217..97a9f61fa106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct 
amdgpu_device *adev)

   *
   * @adev: amdgpu_device pointer
   *
- * Allocate video memory for GART page table
+ * Allocate and pin video memory for GART page table
   * (pcie r4xx, r5xx+).  These asics require the
   * gart table to be in video memory.
   * Returns 0 for success, error for failure.
   */
  int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
  {
+    struct amdgpu_bo_param bp;
  int r;

-    if (adev->gart.bo == NULL) {
-    struct amdgpu_bo_param bp;
-
-    memset(, 0, sizeof(bp));
-    bp.size = adev->gart.table_size;
-    bp.byte_align = PAGE_SIZE;
-    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-    bp.type = ttm_bo_type_kernel;
-    bp.resv = NULL;
-    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-    r = amdgpu_bo_create(adev, , >gart.bo);
-    if (r) {
-    return r;
-    }
-    }
+    if (adev->gart.bo != NULL)
+    return 0;
+
+    memset(, 0, sizeof(bp));
+    bp.size = adev->gart.table_size;
+    bp.byte_align = PAGE_SIZE;
+    bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
+    bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
+    AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+    bp.type = ttm_bo_type_kernel;
+    bp.resv = NULL;
+    bp.bo_ptr_size = sizeof(struct amdgpu_bo);
+
+    r = amdgpu_bo_create(adev, , >gart.bo);
+    if (r)
+    return r;
+
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;
+


Instead of all this you should be able to use amdgpu_bo_create_kernel().



OK, with that we can remove amdgpu_gart_table_vram_pin() completely.





  return 0;
  }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

index 3ec5ff5a6dbe..75d584e1b0e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct 
amdgpu_device *adev)

  return -EINVAL;
  }

-    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+    if (adev->in_suspend) {
+    r = amdgpu_gtt_mgr_recover(adev);
+    if (r)
+    return r;
+    }


Please drop the in_suspend check here.

If I'm not completely mistaken the GTT domain should already be 
initialized here and if it's not then we can easily check for that in 
amdgpu_gtt_mgr_recover.



Yes it is. I will remove that.


Thanks,

Nirmoy




Christian.



  r = adev->gfxhub.funcs->gart_enable(adev);
  if (r)
@@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct 
amdgpu_device *adev)

  {
  adev->gfxhub.funcs->gart_disable(adev);
  adev->mmhub.funcs->gart_disable(adev);
-    amdgpu_gart_table_vram_unpin(adev);
  }

  static int gmc_v10_0_hw_fini(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c

index 0a50fdaced7e..02e90d9443c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct 
amdgpu_device *adev)

  dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
  return -EINVAL;
  }
-

[PATCH 1/3] drm/amdgpu: do not pass ttm_resource_manager to gtt_mgr

2021-10-19 Thread Nirmoy Das
Do not allow exported amdgpu_gtt_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other modules to call a ttm function just to
eventually call gtt_mgr functions.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c | 31 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c |  4 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  4 +--
 4 files changed, 24 insertions(+), 19 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..5807df52031c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4287,7 +4287,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
 
amdgpu_virt_init_data_exchange(adev);
/* we need recover gart prior to run SMC/CP/SDMA resume */
-   amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev, TTM_PL_TT));
+   amdgpu_gtt_mgr_recover(adev);
 
r = amdgpu_device_fw_loading(adev);
if (r)
@@ -4604,7 +4604,7 @@ int amdgpu_do_asic_reset(struct list_head 
*device_list_handle,
amdgpu_inc_vram_lost(tmp_adev);
}
 
-   r = 
amdgpu_gtt_mgr_recover(ttm_manager_type(_adev->mman.bdev, TTM_PL_TT));
+   r = amdgpu_gtt_mgr_recover(tmp_adev);
if (r)
goto out;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
index c18f16b3be9c..5e41f8ef743a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gtt_mgr.c
@@ -77,10 +77,8 @@ static ssize_t amdgpu_mem_info_gtt_used_show(struct device 
*dev,
 {
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = drm_to_adev(ddev);
-   struct ttm_resource_manager *man;
 
-   man = ttm_manager_type(>mman.bdev, TTM_PL_TT);
-   return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(man));
+   return sysfs_emit(buf, "%llu\n", amdgpu_gtt_mgr_usage(adev));
 }
 
 static DEVICE_ATTR(mem_info_gtt_total, S_IRUGO,
@@ -206,14 +204,19 @@ static void amdgpu_gtt_mgr_del(struct 
ttm_resource_manager *man,
 /**
  * amdgpu_gtt_mgr_usage - return usage of GTT domain
  *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
  *
  * Return how many bytes are used in the GTT domain
  */
-uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager *man)
+uint64_t amdgpu_gtt_mgr_usage(struct amdgpu_device *adev)
 {
-   struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-   s64 result = man->size - atomic64_read(>available);
+   struct ttm_resource_manager *man;
+   struct amdgpu_gtt_mgr *mgr;
+   s64 result;
+
+   man = ttm_manager_type(>mman.bdev, TTM_PL_TT);
+   mgr = to_gtt_mgr(man);
+   result = man->size - atomic64_read(>available);
 
return (result > 0 ? result : 0) * PAGE_SIZE;
 }
@@ -221,19 +224,20 @@ uint64_t amdgpu_gtt_mgr_usage(struct ttm_resource_manager 
*man)
 /**
  * amdgpu_gtt_mgr_recover - re-init gart
  *
- * @man: TTM memory type manager
+ * @adev: amdgpu_device pointer
  *
  * Re-init the gart for each known BO in the GTT.
  */
-int amdgpu_gtt_mgr_recover(struct ttm_resource_manager *man)
+int amdgpu_gtt_mgr_recover(struct amdgpu_device *adev)
 {
-   struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
-   struct amdgpu_device *adev;
+   struct ttm_resource_manager *man;
+   struct amdgpu_gtt_mgr *mgr;
struct amdgpu_gtt_node *node;
struct drm_mm_node *mm_node;
int r = 0;
 
-   adev = container_of(mgr, typeof(*adev), mman.gtt_mgr);
+   man = ttm_manager_type(>mman.bdev, TTM_PL_TT);
+   mgr = to_gtt_mgr(man);
spin_lock(>lock);
drm_mm_for_each_node(mm_node, >mm) {
node = container_of(mm_node, typeof(*node), base.mm_nodes[0]);
@@ -260,6 +264,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,
 struct drm_printer *printer)
 {
struct amdgpu_gtt_mgr *mgr = to_gtt_mgr(man);
+   struct amdgpu_device *adev = container_of(mgr, typeof(*adev), 
mman.gtt_mgr);
 
spin_lock(>lock);
drm_mm_print(>mm, printer);
@@ -267,7 +272,7 @@ static void amdgpu_gtt_mgr_debug(struct 
ttm_resource_manager *man,
 
drm_printf(printer, "man size:%llu pages, gtt available:%lld pages, 
usage:%lluMB\n",
   man->size, (u64)atomic64_read(>available),
-  amdgpu_gtt_mgr_usage(man) >> 20);
+  amdgpu_gtt_mgr_usage(adev) >> 20);
 }
 
 static const struct ttm_resource_manager_func amdgpu_gtt_mgr_func = {
diff --git 

[PATCH v2 3/3] drm/amdgpu: recover gart table at resume

2021-10-19 Thread Nirmoy Das
Get rid of pin/unpin of gart BO at resume/suspend and
instead pin only once and try to recover gart content
at resume time. This is much more stable in case there
is OOM situation at 2nd call to amdgpu_device_evict_resources()
while evicting GART table.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  4 ---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c   | 42 --
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c |  9 ++---
 drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c  | 10 +++---
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 10 +++---
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  |  9 ++---
 6 files changed, 45 insertions(+), 39 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 5807df52031c..f69e613805db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,10 +3941,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
amdgpu_fence_driver_hw_fini(adev);

amdgpu_device_ip_suspend_phase2(adev);
-   /* This second call to evict device resources is to evict
-* the gart page table using the CPU.
-*/
-   amdgpu_device_evict_resources(adev);

return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
index d3e4203f6217..97a9f61fa106 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gart.c
@@ -107,33 +107,37 @@ void amdgpu_gart_dummy_page_fini(struct amdgpu_device 
*adev)
  *
  * @adev: amdgpu_device pointer
  *
- * Allocate video memory for GART page table
+ * Allocate and pin video memory for GART page table
  * (pcie r4xx, r5xx+).  These asics require the
  * gart table to be in video memory.
  * Returns 0 for success, error for failure.
  */
 int amdgpu_gart_table_vram_alloc(struct amdgpu_device *adev)
 {
+   struct amdgpu_bo_param bp;
int r;

-   if (adev->gart.bo == NULL) {
-   struct amdgpu_bo_param bp;
-
-   memset(, 0, sizeof(bp));
-   bp.size = adev->gart.table_size;
-   bp.byte_align = PAGE_SIZE;
-   bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
-   bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
-   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
-   bp.type = ttm_bo_type_kernel;
-   bp.resv = NULL;
-   bp.bo_ptr_size = sizeof(struct amdgpu_bo);
-
-   r = amdgpu_bo_create(adev, , >gart.bo);
-   if (r) {
-   return r;
-   }
-   }
+   if (adev->gart.bo != NULL)
+   return 0;
+
+   memset(, 0, sizeof(bp));
+   bp.size = adev->gart.table_size;
+   bp.byte_align = PAGE_SIZE;
+   bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
+   bp.flags = AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED |
+   AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS;
+   bp.type = ttm_bo_type_kernel;
+   bp.resv = NULL;
+   bp.bo_ptr_size = sizeof(struct amdgpu_bo);
+
+   r = amdgpu_bo_create(adev, , >gart.bo);
+   if (r)
+   return r;
+
+   r = amdgpu_gart_table_vram_pin(adev);
+   if (r)
+   return r;
+
return 0;
 }

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 3ec5ff5a6dbe..75d584e1b0e9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,11 @@ static int gmc_v10_0_gart_enable(struct amdgpu_device 
*adev)
return -EINVAL;
}

-   r = amdgpu_gart_table_vram_pin(adev);
-   if (r)
-   return r;
+   if (adev->in_suspend) {
+   r = amdgpu_gtt_mgr_recover(adev);
+   if (r)
+   return r;
+   }

r = adev->gfxhub.funcs->gart_enable(adev);
if (r)
@@ -1062,7 +1064,6 @@ static void gmc_v10_0_gart_disable(struct amdgpu_device 
*adev)
 {
adev->gfxhub.funcs->gart_disable(adev);
adev->mmhub.funcs->gart_disable(adev);
-   amdgpu_gart_table_vram_unpin(adev);
 }

 static int gmc_v10_0_hw_fini(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
index 0a50fdaced7e..02e90d9443c1 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v7_0.c
@@ -620,9 +620,12 @@ static int gmc_v7_0_gart_enable(struct amdgpu_device *adev)
dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
return -EINVAL;
}
-   r = amdgpu_gart_table_vram_pin(adev);
-   if (r)
-   return r;
+
+   if (adev->in_suspend) {
+   r = amdgpu_gtt_mgr_recover(adev);
+   if (r)
+   return r;
+   }

table_addr = amdgpu_bo_gpu_

[PATCH 2/3] drm/amdgpu: do not pass ttm_resource_manager to vram_mgr

2021-10-19 Thread Nirmoy Das
Do not allow exported amdgpu_vram_mgr_*() to accept
any ttm_resource_manager pointer. Also there is no need
to force other modules to call a ttm function just to
eventually call vram_mgr functions.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c   |  3 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c   |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c  | 10 ++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  |  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h  |  8 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c |  5 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c | 54 
 7 files changed, 49 insertions(+), 42 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
index 7077f21f0021..4837c579a787 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
@@ -531,9 +531,8 @@ int amdgpu_amdkfd_get_dmabuf_info(struct kgd_dev *kgd, int 
dma_buf_fd,
 uint64_t amdgpu_amdkfd_get_vram_usage(struct kgd_dev *kgd)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)kgd;
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
 
-   return amdgpu_vram_mgr_usage(vram_man);
+   return amdgpu_vram_mgr_usage(adev);
 }
 
 uint64_t amdgpu_amdkfd_get_hive_id(struct kgd_dev *kgd)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
index 76fe5b71e35d..f4084ca8b614 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c
@@ -298,7 +298,6 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
 {
s64 time_us, increment_us;
u64 free_vram, total_vram, used_vram;
-   struct ttm_resource_manager *vram_man = 
ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
/* Allow a maximum of 200 accumulated ms. This is basically per-IB
 * throttling.
 *
@@ -315,7 +314,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
}
 
total_vram = adev->gmc.real_vram_size - 
atomic64_read(>vram_pin_size);
-   used_vram = amdgpu_vram_mgr_usage(vram_man);
+   used_vram = amdgpu_vram_mgr_usage(adev);
free_vram = used_vram >= total_vram ? 0 : total_vram - used_vram;
 
spin_lock(>mm_stats.lock);
@@ -362,7 +361,7 @@ static void amdgpu_cs_get_threshold_for_moves(struct 
amdgpu_device *adev,
if (!amdgpu_gmc_vram_full_visible(>gmc)) {
u64 total_vis_vram = adev->gmc.visible_vram_size;
u64 used_vis_vram =
- amdgpu_vram_mgr_vis_usage(vram_man);
+ amdgpu_vram_mgr_vis_usage(adev);
 
if (used_vis_vram < total_vis_vram) {
u64 free_vis_vram = total_vis_vram - used_vis_vram;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index b9b38f70e416..34674ccabd67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -672,10 +672,10 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
ui64 = atomic64_read(>num_vram_cpu_page_faults);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VRAM_USAGE:
-   ui64 = amdgpu_vram_mgr_usage(ttm_manager_type(>mman.bdev, 
TTM_PL_VRAM));
+   ui64 = amdgpu_vram_mgr_usage(adev);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_VIS_VRAM_USAGE:
-   ui64 = 
amdgpu_vram_mgr_vis_usage(ttm_manager_type(>mman.bdev, TTM_PL_VRAM));
+   ui64 = amdgpu_vram_mgr_vis_usage(adev);
return copy_to_user(out, , min(size, 8u)) ? -EFAULT : 0;
case AMDGPU_INFO_GTT_USAGE:
ui64 = amdgpu_gtt_mgr_usage(adev);
@@ -709,8 +709,6 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
}
case AMDGPU_INFO_MEMORY: {
struct drm_amdgpu_memory_info mem;
-   struct ttm_resource_manager *vram_man =
-   ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
struct ttm_resource_manager *gtt_man =
ttm_manager_type(>mman.bdev, TTM_PL_TT);
memset(, 0, sizeof(mem));
@@ -719,7 +717,7 @@ int amdgpu_info_ioctl(struct drm_device *dev, void *data, 
struct drm_file *filp)
atomic64_read(>vram_pin_size) -
AMDGPU_VM_RESERVED_VRAM;
mem.vram.heap_usage =
-   amdgpu_vram_mgr_usage(vram_man);
+   amdgpu_vram_mgr_usage(adev);
mem.vram.max_allocation = mem.vram.usable_heap_size * 3 / 4;
 
mem.cpu_accessible_vram.total_heap_size =
@@ -729,7 +727,7 @@

Re: [PATCH 1/1] drm/amdgpu: recover gart table at resume

2021-10-19 Thread Das, Nirmoy



On 10/19/2021 5:43 PM, Christian König wrote:

Am 19.10.21 um 15:22 schrieb Nirmoy Das:

Get rid of pin/unpin and evict and swap back gart
page table which should make things less likely to break.

Also remove 2nd call to amdgpu_device_evict_resources()
as we don't need it.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 -
  drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 16 
  drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 17 +
  drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 16 
  4 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 41ce86244144..22ff229ab981 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,11 +3941,6 @@ int amdgpu_device_suspend(struct drm_device 
*dev, bool fbcon)

  amdgpu_fence_driver_hw_fini(adev);
    amdgpu_device_ip_suspend_phase2(adev);
-    /* This second call to evict device resources is to evict
- * the gart page table using the CPU.
- */
-    amdgpu_device_evict_resources(adev);
-
  return 0;
  }
  diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c

index 3ec5ff5a6dbe..18e3f3c5aae6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,16 @@ static int gmc_v10_0_gart_enable(struct 
amdgpu_device *adev)

  return -EINVAL;
  }
  -    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+    if (!adev->in_suspend) {
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;


I think you can move the functionality of pinning into 
amdgpu_gart_table_vram_alloc().



+    } else {
+    r = amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev,
+    TTM_PL_TT));
+    if (r)
+    return r;
+    }


And add a wrapper around this call here. Something like 
amdgpu_gart_recover() or similar.



Thanks Christian, I will resend with your suggested changes.



Regards,
Christian.


    r = adev->gfxhub.funcs->gart_enable(adev);
  if (r)
@@ -1062,7 +1069,8 @@ static void gmc_v10_0_gart_disable(struct 
amdgpu_device *adev)

  {
  adev->gfxhub.funcs->gart_disable(adev);
  adev->mmhub.funcs->gart_disable(adev);
-    amdgpu_gart_table_vram_unpin(adev);
+    if (!adev->in_suspend)
+    amdgpu_gart_table_vram_unpin(adev);
  }
    static int gmc_v10_0_hw_fini(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c

index 492ebed2915b..0ef50ad3d7d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -837,9 +837,17 @@ static int gmc_v8_0_gart_enable(struct 
amdgpu_device *adev)

  dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
  return -EINVAL;
  }
-    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+
+    if (!adev->in_suspend) {
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;
+    } else {
+    r = amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev,
+    TTM_PL_TT));
+    if (r)
+    return r;
+    }
    table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
  @@ -992,7 +1000,8 @@ static void gmc_v8_0_gart_disable(struct 
amdgpu_device *adev)

  tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
  WREG32(mmVM_L2_CNTL, tmp);
  WREG32(mmVM_L2_CNTL2, 0);
-    amdgpu_gart_table_vram_unpin(adev);
+    if (!adev->in_suspend)
+    amdgpu_gart_table_vram_unpin(adev);
  }
    /**
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c

index cb82404df534..1bbcefd53974 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1714,9 +1714,16 @@ static int gmc_v9_0_gart_enable(struct 
amdgpu_device *adev)

  return -EINVAL;
  }
  -    r = amdgpu_gart_table_vram_pin(adev);
-    if (r)
-    return r;
+    if (!adev->in_suspend) {
+    r = amdgpu_gart_table_vram_pin(adev);
+    if (r)
+    return r;
+    } else {
+    r = amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev,
+    TTM_PL_TT));
+    if (r)
+    return r;
+    }
    r = adev->gfxhub.funcs->gart_enable(adev);
  if (r)
@@ -1793,7 +1800,8 @@ static void gmc_v9_0_gart_disable(struct 
amdgpu_device *adev)

  {
  adev->gfxhub.funcs->gart_disable(adev);
  adev->mmhub.funcs->gart_disable(adev);
-    amdgpu_gart_table_vram_unpin(adev);
+    if (!adev->in_suspend)
+    amdgpu_gart_table_vram_unpin(adev);
  }
    static int gmc_v9_0_hw_fini(void *handle)




[PATCH 1/1] drm/amdgpu: recover gart table at resume

2021-10-19 Thread Nirmoy Das
Get rid of pin/unpin and evict and swap back gart
page table which should make things less likely to break.

Also remove 2nd call to amdgpu_device_evict_resources()
as we don't need it.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  5 -
 drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 16 
 drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c  | 17 +
 drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 16 
 4 files changed, 37 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 41ce86244144..22ff229ab981 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3941,11 +3941,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
amdgpu_fence_driver_hw_fini(adev);
 
amdgpu_device_ip_suspend_phase2(adev);
-   /* This second call to evict device resources is to evict
-* the gart page table using the CPU.
-*/
-   amdgpu_device_evict_resources(adev);
-
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
index 3ec5ff5a6dbe..18e3f3c5aae6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
@@ -992,9 +992,16 @@ static int gmc_v10_0_gart_enable(struct amdgpu_device 
*adev)
return -EINVAL;
}
 
-   r = amdgpu_gart_table_vram_pin(adev);
-   if (r)
-   return r;
+   if (!adev->in_suspend) {
+   r = amdgpu_gart_table_vram_pin(adev);
+   if (r)
+   return r;
+   } else {
+   r = amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev,
+   TTM_PL_TT));
+   if (r)
+   return r;
+   }
 
r = adev->gfxhub.funcs->gart_enable(adev);
if (r)
@@ -1062,7 +1069,8 @@ static void gmc_v10_0_gart_disable(struct amdgpu_device 
*adev)
 {
adev->gfxhub.funcs->gart_disable(adev);
adev->mmhub.funcs->gart_disable(adev);
-   amdgpu_gart_table_vram_unpin(adev);
+   if (!adev->in_suspend)
+   amdgpu_gart_table_vram_unpin(adev);
 }
 
 static int gmc_v10_0_hw_fini(void *handle)
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
index 492ebed2915b..0ef50ad3d7d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v8_0.c
@@ -837,9 +837,17 @@ static int gmc_v8_0_gart_enable(struct amdgpu_device *adev)
dev_err(adev->dev, "No VRAM object for PCIE GART.\n");
return -EINVAL;
}
-   r = amdgpu_gart_table_vram_pin(adev);
-   if (r)
-   return r;
+
+   if (!adev->in_suspend) {
+   r = amdgpu_gart_table_vram_pin(adev);
+   if (r)
+   return r;
+   } else {
+   r = amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev,
+   TTM_PL_TT));
+   if (r)
+   return r;
+   }
 
table_addr = amdgpu_bo_gpu_offset(adev->gart.bo);
 
@@ -992,7 +1000,8 @@ static void gmc_v8_0_gart_disable(struct amdgpu_device 
*adev)
tmp = REG_SET_FIELD(tmp, VM_L2_CNTL, ENABLE_L2_CACHE, 0);
WREG32(mmVM_L2_CNTL, tmp);
WREG32(mmVM_L2_CNTL2, 0);
-   amdgpu_gart_table_vram_unpin(adev);
+   if (!adev->in_suspend)
+   amdgpu_gart_table_vram_unpin(adev);
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
index cb82404df534..1bbcefd53974 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
@@ -1714,9 +1714,16 @@ static int gmc_v9_0_gart_enable(struct amdgpu_device 
*adev)
return -EINVAL;
}
 
-   r = amdgpu_gart_table_vram_pin(adev);
-   if (r)
-   return r;
+   if (!adev->in_suspend) {
+   r = amdgpu_gart_table_vram_pin(adev);
+   if (r)
+   return r;
+   } else {
+   r = amdgpu_gtt_mgr_recover(ttm_manager_type(>mman.bdev,
+   TTM_PL_TT));
+   if (r)
+   return r;
+   }
 
r = adev->gfxhub.funcs->gart_enable(adev);
if (r)
@@ -1793,7 +1800,8 @@ static void gmc_v9_0_gart_disable(struct amdgpu_device 
*adev)
 {
adev->gfxhub.funcs->gart_disable(adev);
adev->mmhub.funcs->gart_disable(adev);
-   amdgpu_gart_table_vram_unpin(adev);
+   if (!adev->in_suspend)
+   amdgpu_gart_table_vram_unpin(adev);
 }
 
 static int gmc_v9_0_hw_fini(void *handle)
-- 
2.32.0



Re: [PATCH Review 1/1] drm/ttm: fix debugfs node create failed

2021-10-13 Thread Das, Nirmoy



On 10/13/2021 2:29 PM, Christian König wrote:

Am 12.10.21 um 15:12 schrieb Das, Nirmoy:


On 10/12/2021 1:58 PM, Stanley.Yang wrote:

Test scenario:
 modprobe amdgpu -> rmmod amdgpu -> modprobe amdgpu
Error log:
 [   54.396807] debugfs: File 'page_pool' in directory 'amdttm' 
already present!
 [   54.396833] debugfs: File 'page_pool_shrink' in directory 
'amdttm' already present!
 [   54.396848] debugfs: File 'buffer_objects' in directory 
'amdttm' already present!



We should instead add a check if those debugfs files already 
exist/created in ttm debugfs dir using debugfs_lookup() before creating.


No, IIRC the Intel guys had fixed that already by adding/removing the 
debugfs file on module load/unload.



Adding/removing on ttm module load/unload is nicer.


Nirmoy




Christian.




Regards,

Nirmoy




Reason:
 page_pool, page_pool_shrink and buffer_objects can be removed when
 rmmod amdttm, in the above test scenario only rmmod amdgpu, so 
those

 debugfs node will not be removed, this caused file create failed.
Solution:
 create ttm_page directory under ttm_root directory when insmod 
amdgpu,
 page_pool, page_pool_shrink and buffer_objects are stored in 
ttm_page directory,
 remove ttm_page directory when do rmmod amdgpu, this can fix 
above issue.


Signed-off-by: Stanley.Yang 
---
  drivers/gpu/drm/ttm/ttm_device.c | 12 +++-
  drivers/gpu/drm/ttm/ttm_module.c |  1 +
  drivers/gpu/drm/ttm/ttm_module.h |  1 +
  drivers/gpu/drm/ttm/ttm_pool.c   |  4 ++--
  4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_device.c 
b/drivers/gpu/drm/ttm/ttm_device.c

index 1de23edbc182..ad170328f0c8 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -55,6 +55,10 @@ static void ttm_global_release(void)
    ttm_pool_mgr_fini();
  +#ifdef CONFIG_DEBUG_FS
+    debugfs_remove(ttm_debugfs_page);
+#endif
+
  __free_page(glob->dummy_read_page);
  memset(glob, 0, sizeof(*glob));
  out:
@@ -85,6 +89,10 @@ static int ttm_global_init(void)
  >> PAGE_SHIFT;
  num_dma32 = min(num_dma32, 2UL << (30 - PAGE_SHIFT));
  +#ifdef CONFIG_DEBUG_FS
+    ttm_debugfs_page = debugfs_create_dir("ttm_page", 
ttm_debugfs_root);

+#endif
+
  ttm_pool_mgr_init(num_pages);
  ttm_tt_mgr_init(num_pages, num_dma32);
  @@ -98,8 +106,10 @@ static int ttm_global_init(void)
  INIT_LIST_HEAD(>device_list);
  atomic_set(>bo_count, 0);
  -    debugfs_create_atomic_t("buffer_objects", 0444, 
ttm_debugfs_root,

+#ifdef CONFIG_DEBUG_FS
+    debugfs_create_atomic_t("buffer_objects", 0444, ttm_debugfs_page,
  >bo_count);
+#endif
  out:
  mutex_unlock(_global_mutex);
  return ret;
diff --git a/drivers/gpu/drm/ttm/ttm_module.c 
b/drivers/gpu/drm/ttm/ttm_module.c

index 88970a6b8e32..66595e6e7087 100644
--- a/drivers/gpu/drm/ttm/ttm_module.c
+++ b/drivers/gpu/drm/ttm/ttm_module.c
@@ -38,6 +38,7 @@
  #include "ttm_module.h"
    struct dentry *ttm_debugfs_root;
+struct dentry *ttm_debugfs_page;
    static int __init ttm_init(void)
  {
diff --git a/drivers/gpu/drm/ttm/ttm_module.h 
b/drivers/gpu/drm/ttm/ttm_module.h

index d7cac5d4b835..6007dc66f44e 100644
--- a/drivers/gpu/drm/ttm/ttm_module.h
+++ b/drivers/gpu/drm/ttm/ttm_module.h
@@ -36,5 +36,6 @@
  struct dentry;
    extern struct dentry *ttm_debugfs_root;
+extern struct dentry *ttm_debugfs_page;
    #endif /* _TTM_MODULE_H_ */
diff --git a/drivers/gpu/drm/ttm/ttm_pool.c 
b/drivers/gpu/drm/ttm/ttm_pool.c

index 8be7fd7161fd..ecb33daad7b5 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -709,9 +709,9 @@ int ttm_pool_mgr_init(unsigned long num_pages)
  }
    #ifdef CONFIG_DEBUG_FS
-    debugfs_create_file("page_pool", 0444, ttm_debugfs_root, NULL,
+    debugfs_create_file("page_pool", 0444, ttm_debugfs_page, NULL,
  _pool_debugfs_globals_fops);
-    debugfs_create_file("page_pool_shrink", 0400, ttm_debugfs_root, 
NULL,
+    debugfs_create_file("page_pool_shrink", 0400, ttm_debugfs_page, 
NULL,

  _pool_debugfs_shrink_fops);
  #endif




Re: [PATCH 1/1] drm/amdgpu: release gtt bo after each move test

2021-10-13 Thread Das, Nirmoy

Please ignore this!

On 10/13/2021 5:04 PM, Nirmoy Das wrote:

When gart size is < gtt size this test will fail with
-ENOMEM as we are not freeing gtt bo after each move test.
This is generally not an issue when gart size >= gtt size.

Reported-by: zhang 
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_test.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
index 909d830b513e..0cf2a560d673 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
@@ -212,7 +212,6 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
  
  		DRM_INFO("Tested GTT->VRAM and VRAM->GTT copy for GTT offset 0x%llx\n",

 gart_addr - adev->gmc.gart_start);
-   continue;
  
  out_lclean_unpin:

amdgpu_bo_unpin(gtt_obj[i]);
@@ -220,6 +219,7 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
amdgpu_bo_unreserve(gtt_obj[i]);
  out_lclean_unref:
amdgpu_bo_unref(_obj[i]);
+   continue;
  out_lclean:
for (--i; i >= 0; --i) {
amdgpu_bo_unpin(gtt_obj[i]);


[PATCH 1/1] drm/amdgpu: fix BO leak after successful move test

2021-10-13 Thread Nirmoy Das
GTT BO cleanup code is with in the test for loop and
we would skip cleaning up GTT BO on success.

Reported-by: zhang 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_test.c | 25 
 1 file changed, 12 insertions(+), 13 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
index 909d830b513e..5fe7ff680c29 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
@@ -35,6 +35,7 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
struct amdgpu_bo *vram_obj = NULL;
struct amdgpu_bo **gtt_obj = NULL;
struct amdgpu_bo_param bp;
+   struct dma_fence *fence = NULL;
uint64_t gart_addr, vram_addr;
unsigned n, size;
int i, r;
@@ -82,7 +83,6 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
void *gtt_map, *vram_map;
void **gart_start, **gart_end;
void **vram_start, **vram_end;
-   struct dma_fence *fence = NULL;
 
bp.domain = AMDGPU_GEM_DOMAIN_GTT;
r = amdgpu_bo_create(adev, , gtt_obj + i);
@@ -212,24 +212,23 @@ static void amdgpu_do_test_moves(struct amdgpu_device 
*adev)
 
DRM_INFO("Tested GTT->VRAM and VRAM->GTT copy for GTT offset 
0x%llx\n",
 gart_addr - adev->gmc.gart_start);
-   continue;
+   }
 
+   --i;
 out_lclean_unpin:
-   amdgpu_bo_unpin(gtt_obj[i]);
+   amdgpu_bo_unpin(gtt_obj[i]);
 out_lclean_unres:
-   amdgpu_bo_unreserve(gtt_obj[i]);
+   amdgpu_bo_unreserve(gtt_obj[i]);
 out_lclean_unref:
-   amdgpu_bo_unref(&gtt_obj[i]);
+   amdgpu_bo_unref(_obj[i]);
 out_lclean:
-   for (--i; i >= 0; --i) {
-   amdgpu_bo_unpin(gtt_obj[i]);
-   amdgpu_bo_unreserve(gtt_obj[i]);
-   amdgpu_bo_unref(&gtt_obj[i]);
-   }
-   if (fence)
-   dma_fence_put(fence);
-   break;
+   for (--i; i >= 0; --i) {
+   amdgpu_bo_unpin(gtt_obj[i]);
+   amdgpu_bo_unreserve(gtt_obj[i]);
+   amdgpu_bo_unref(&gtt_obj[i]);
}
+   if (fence)
+   dma_fence_put(fence);
 
amdgpu_bo_unpin(vram_obj);
 out_unres:
-- 
2.32.0



Re: [PATCH 1/1] drm/amdgpu: release gtt bo after each move test

2021-10-13 Thread Das, Nirmoy


On 10/13/2021 12:42 PM, Das, Nirmoy wrote:



On 10/13/2021 3:22 AM, zhang wrote:


Hi . Nirmoy


If you let continue to unpin, this will always test the same va for gtt

I think we should recalculate the value n



Right, I guess then the test should only run till gart size.



Actually the test size calculation was fine, it is just that we wouldn't 
release BO after a successful test as the cleanup code is inside the 
test for loop.



Regards,

Nirmoy



Regards,

Nirmoy



On 2021/10/12 20:10, Nirmoy Das wrote:

When gart size is < gtt size this test will fail with
-ENOMEM as we are not freeing gtt bo after each move test.
This is generally not an issue when gart size >= gtt size.

Reported-by: zhang
Signed-off-by: Nirmoy Das
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_test.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
index 909d830b513e..0cf2a560d673 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
@@ -212,7 +212,6 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
  
  		DRM_INFO("Tested GTT->VRAM and VRAM->GTT copy for GTT offset 0x%llx\n",

 gart_addr - adev->gmc.gart_start);
-   continue;
  
  out_lclean_unpin:

amdgpu_bo_unpin(gtt_obj[i]);
@@ -220,6 +219,7 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
amdgpu_bo_unreserve(gtt_obj[i]);
  out_lclean_unref:
amdgpu_bo_unref(&gtt_obj[i]);
+   continue;
  out_lclean:
for (--i; i >= 0; --i) {
amdgpu_bo_unpin(gtt_obj[i]);


[PATCH 1/1] drm/amdgpu: release gtt bo after each move test

2021-10-13 Thread Nirmoy Das
When gart size is < gtt size this test will fail with
-ENOMEM as we are not freeing gtt bo after each move test.
This is generally not an issue when gart size >= gtt size.

Reported-by: zhang 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
index 909d830b513e..0cf2a560d673 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
@@ -212,7 +212,6 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
 
DRM_INFO("Tested GTT->VRAM and VRAM->GTT copy for GTT offset 
0x%llx\n",
 gart_addr - adev->gmc.gart_start);
-   continue;
 
 out_lclean_unpin:
amdgpu_bo_unpin(gtt_obj[i]);
@@ -220,6 +219,7 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
amdgpu_bo_unreserve(gtt_obj[i]);
 out_lclean_unref:
amdgpu_bo_unref(&gtt_obj[i]);
+   continue;
 out_lclean:
for (--i; i >= 0; --i) {
amdgpu_bo_unpin(gtt_obj[i]);
-- 
2.32.0



Re: [PATCH] drm/amdkfd: Fix an inappropriate error handling in allloc memory of gpu

2021-10-13 Thread Das, Nirmoy
LGTM as we create a gem object 1st and retrieve amdgpu_bo from the gem 
object.



Acked-by: Nirmoy Das 

On 10/13/2021 9:28 AM, Lang Yu wrote:

We should unreference a gem object instead of an amdgpu bo here.

Fixes: 5ae0283e831a ("drm/amdgpu: Add userptr support for KFD")

Signed-off-by: Lang Yu 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 054c1a224def..cdf46bd0d8d5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1503,7 +1503,7 @@ int amdgpu_amdkfd_gpuvm_alloc_memory_of_gpu(
remove_kgd_mem_from_kfd_bo_list(*mem, avm->process_info);
	drm_vma_node_revoke(&gobj->vma_node, drm_priv);
  err_node_allow:
-   amdgpu_bo_unref(&bo);
+   drm_gem_object_put(gobj);
/* Don't unreserve system mem limit twice */
goto err_reserve_limit;
  err_bo_create:


Re: [PATCH 1/1] drm/amdgpu: release gtt bo after each move test

2021-10-13 Thread Das, Nirmoy


On 10/13/2021 3:22 AM, zhang wrote:


Hi . Nirmoy


If you let continue to unpin, this will always test the same va for gtt

I think we should recalculate the value n



Right, I guess then the test should only run till gart size.


Regards,

Nirmoy



On 2021/10/12 20:10, Nirmoy Das wrote:

When gart size is < gtt size this test will fail with
-ENOMEM as we are not freeing gtt bo after each move test.
This is generally not an issue when gart size >= gtt size.

Reported-by: zhang
Signed-off-by: Nirmoy Das
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_test.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
index 909d830b513e..0cf2a560d673 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
@@ -212,7 +212,6 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
  
  		DRM_INFO("Tested GTT->VRAM and VRAM->GTT copy for GTT offset 0x%llx\n",

 gart_addr - adev->gmc.gart_start);
-   continue;
  
  out_lclean_unpin:

amdgpu_bo_unpin(gtt_obj[i]);
@@ -220,6 +219,7 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
amdgpu_bo_unreserve(gtt_obj[i]);
  out_lclean_unref:
amdgpu_bo_unref(&gtt_obj[i]);
+   continue;
  out_lclean:
for (--i; i >= 0; --i) {
amdgpu_bo_unpin(gtt_obj[i]);


Re: [PATCH Review 1/1] drm/ttm: fix debugfs node create failed

2021-10-12 Thread Das, Nirmoy



On 10/12/2021 1:58 PM, Stanley.Yang wrote:

Test scenario:
 modprobe amdgpu -> rmmod amdgpu -> modprobe amdgpu
Error log:
 [   54.396807] debugfs: File 'page_pool' in directory 'amdttm' already 
present!
 [   54.396833] debugfs: File 'page_pool_shrink' in directory 'amdttm' 
already present!
 [   54.396848] debugfs: File 'buffer_objects' in directory 'amdttm' 
already present!



We should instead check whether those debugfs files already exist in the 
ttm debugfs dir using debugfs_lookup() before creating them.



Regards,

Nirmoy




Reason:
 page_pool, page_pool_shrink and buffer_objects can be removed when
 rmmod amdttm, in the above test scenario only rmmod amdgpu, so those
 debugfs node will not be removed, this caused file create failed.
Solution:
 create ttm_page directory under ttm_root directory when insmod amdgpu,
 page_pool, page_pool_shrink and buffer_objects are stored in ttm_page 
directory,
 remove ttm_page directory when doing rmmod amdgpu, this can fix the above issue.

Signed-off-by: Stanley.Yang 
---
  drivers/gpu/drm/ttm/ttm_device.c | 12 +++-
  drivers/gpu/drm/ttm/ttm_module.c |  1 +
  drivers/gpu/drm/ttm/ttm_module.h |  1 +
  drivers/gpu/drm/ttm/ttm_pool.c   |  4 ++--
  4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/ttm/ttm_device.c b/drivers/gpu/drm/ttm/ttm_device.c
index 1de23edbc182..ad170328f0c8 100644
--- a/drivers/gpu/drm/ttm/ttm_device.c
+++ b/drivers/gpu/drm/ttm/ttm_device.c
@@ -55,6 +55,10 @@ static void ttm_global_release(void)
  
  	ttm_pool_mgr_fini();
  
+#ifdef CONFIG_DEBUG_FS

+   debugfs_remove(ttm_debugfs_page);
+#endif
+
__free_page(glob->dummy_read_page);
memset(glob, 0, sizeof(*glob));
  out:
@@ -85,6 +89,10 @@ static int ttm_global_init(void)
>> PAGE_SHIFT;
num_dma32 = min(num_dma32, 2UL << (30 - PAGE_SHIFT));
  
+#ifdef CONFIG_DEBUG_FS

+   ttm_debugfs_page = debugfs_create_dir("ttm_page", ttm_debugfs_root);
+#endif
+
ttm_pool_mgr_init(num_pages);
ttm_tt_mgr_init(num_pages, num_dma32);
  
@@ -98,8 +106,10 @@ static int ttm_global_init(void)

INIT_LIST_HEAD(&glob->device_list);
atomic_set(&glob->bo_count, 0);
  
-	debugfs_create_atomic_t("buffer_objects", 0444, ttm_debugfs_root,

+#ifdef CONFIG_DEBUG_FS
+   debugfs_create_atomic_t("buffer_objects", 0444, ttm_debugfs_page,
&glob->bo_count);
+#endif
  out:
	mutex_unlock(&ttm_global_mutex);
return ret;
diff --git a/drivers/gpu/drm/ttm/ttm_module.c b/drivers/gpu/drm/ttm/ttm_module.c
index 88970a6b8e32..66595e6e7087 100644
--- a/drivers/gpu/drm/ttm/ttm_module.c
+++ b/drivers/gpu/drm/ttm/ttm_module.c
@@ -38,6 +38,7 @@
  #include "ttm_module.h"
  
  struct dentry *ttm_debugfs_root;

+struct dentry *ttm_debugfs_page;
  
  static int __init ttm_init(void)

  {
diff --git a/drivers/gpu/drm/ttm/ttm_module.h b/drivers/gpu/drm/ttm/ttm_module.h
index d7cac5d4b835..6007dc66f44e 100644
--- a/drivers/gpu/drm/ttm/ttm_module.h
+++ b/drivers/gpu/drm/ttm/ttm_module.h
@@ -36,5 +36,6 @@
  struct dentry;
  
  extern struct dentry *ttm_debugfs_root;

+extern struct dentry *ttm_debugfs_page;
  
  #endif /* _TTM_MODULE_H_ */

diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c
index 8be7fd7161fd..ecb33daad7b5 100644
--- a/drivers/gpu/drm/ttm/ttm_pool.c
+++ b/drivers/gpu/drm/ttm/ttm_pool.c
@@ -709,9 +709,9 @@ int ttm_pool_mgr_init(unsigned long num_pages)
}
  
  #ifdef CONFIG_DEBUG_FS

-   debugfs_create_file("page_pool", 0444, ttm_debugfs_root, NULL,
+   debugfs_create_file("page_pool", 0444, ttm_debugfs_page, NULL,
&ttm_pool_debugfs_globals_fops);
-   debugfs_create_file("page_pool_shrink", 0400, ttm_debugfs_root, NULL,
+   debugfs_create_file("page_pool_shrink", 0400, ttm_debugfs_page, NULL,
&ttm_pool_debugfs_shrink_fops);
  #endif
  


[PATCH 1/1] drm/amdgpu: release gtt bo after each move test

2021-10-12 Thread Nirmoy Das
When gart size is < gtt size this test will fail with
-ENOMEM as we are not freeing gtt bo after each move test.
This is generally not an issue when gart size >= gtt size.

Reported-by: zhang 
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_test.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
index 909d830b513e..0cf2a560d673 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_test.c
@@ -212,7 +212,6 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
 
DRM_INFO("Tested GTT->VRAM and VRAM->GTT copy for GTT offset 
0x%llx\n",
 gart_addr - adev->gmc.gart_start);
-   continue;
 
 out_lclean_unpin:
amdgpu_bo_unpin(gtt_obj[i]);
@@ -220,6 +219,7 @@ static void amdgpu_do_test_moves(struct amdgpu_device *adev)
amdgpu_bo_unreserve(gtt_obj[i]);
 out_lclean_unref:
amdgpu_bo_unref(&gtt_obj[i]);
+   continue;
 out_lclean:
for (--i; i >= 0; --i) {
amdgpu_bo_unpin(gtt_obj[i]);
-- 
2.32.0



Re: [PATCH 1/1] drm/amdgpu: unify BO evicting method in amdgpu_ttm

2021-10-07 Thread Das, Nirmoy



On 10/7/2021 12:38 PM, Christian König wrote:

Am 07.10.21 um 12:00 schrieb Nirmoy Das:

Unify BO evicting functionality for possible memory
types in amdgpu_ttm.c.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  8 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 30 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 23 
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  1 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 30 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  1 +
  6 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 5497e2d31d1a..164d6a9e9fbb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1328,7 +1328,7 @@ static int amdgpu_debugfs_evict_vram(void 
*data, u64 *val)

  return r;
  }
  -    *val = amdgpu_bo_evict_vram(adev);
+    *val = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
    pm_runtime_mark_last_busy(dev->dev);
  pm_runtime_put_autosuspend(dev->dev);
@@ -1341,17 +1341,15 @@ static int amdgpu_debugfs_evict_gtt(void 
*data, u64 *val)

  {
  struct amdgpu_device *adev = (struct amdgpu_device *)data;
  struct drm_device *dev = adev_to_drm(adev);
-    struct ttm_resource_manager *man;
  int r;
    r = pm_runtime_get_sync(dev->dev);
  if (r < 0) {
-    pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
+    pm_runtime_put_autosuspend(dev->dev);
  return r;
  }
-    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
-    *val = ttm_resource_manager_evict_all(&adev->mman.bdev, man);
+    *val = amdgpu_ttm_evict_resources(adev, TTM_PL_TT);
    pm_runtime_mark_last_busy(dev->dev);
  pm_runtime_put_autosuspend(dev->dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 57638fe9cfc2..032deca4cea2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3880,6 +3880,25 @@ void amdgpu_device_fini_sw(struct 
amdgpu_device *adev)

    }
  +/**
+ * amdgpu_device_evict_resources - evict device resources
+ * @adev: amdgpu device object
+ *
+ * Evicts all ttm device resources(vram BOs, gart table) from the 
lru list

+ * of the vram memory type. Mainly used for evicting device resources
+ * at suspend time.
+ *
+ */
+void amdgpu_device_evict_resources(struct amdgpu_device *adev)


Please add static here, apart from that the patch is Reviewed-by: 
Christian König 



Thanks, I will add that and push the commit.


Nirmoy



Thanks,
Christian.


+{
+    /* No need to evict vram on APUs for suspend to ram */
+    if (adev->in_s3 && (adev->flags & AMD_IS_APU))
+    return;
+
+    if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
+    DRM_WARN("evicting device resources failed\n");
+
+}
    /*
   * Suspend & resume.
@@ -3920,17 +3939,16 @@ int amdgpu_device_suspend(struct drm_device 
*dev, bool fbcon)

  if (!adev->in_s0ix)
  amdgpu_amdkfd_suspend(adev, adev->in_runpm);
  -    /* evict vram memory */
-    amdgpu_bo_evict_vram(adev);
+    /* First evict vram memory */
+    amdgpu_device_evict_resources(adev);
    amdgpu_fence_driver_hw_fini(adev);
    amdgpu_device_ip_suspend_phase2(adev);
-    /* evict remaining vram memory
- * This second call to evict vram is to evict the gart page table
- * using the CPU.
+    /* This second call to evict device resources is to evict
+ * the gart page table using the CPU.
   */
-    amdgpu_bo_evict_vram(adev);
+    amdgpu_device_evict_resources(adev);
    return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 4ec904f36ceb..073ba2af0b9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1004,29 +1004,6 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
  }
  }
  -/**
- * amdgpu_bo_evict_vram - evict VRAM buffers
- * @adev: amdgpu device object
- *
- * Evicts all VRAM buffers on the lru list of the memory type.
- * Mainly used for evicting vram at suspend time.
- *
- * Returns:
- * 0 for success or a negative error code on failure.
- */
-int amdgpu_bo_evict_vram(struct amdgpu_device *adev)
-{
-    struct ttm_resource_manager *man;
-
-    if (adev->in_s3 && (adev->flags & AMD_IS_APU)) {
-    /* No need to evict vram on APUs for suspend to ram */
-    return 0;
-    }
-
-    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
-    return ttm_resource_manager_evict_all(&adev->mman.bdev, man);
-}
-
  static const char *amdgpu_vram_names[] = {
  "UNKNOWN",
  "GDDR1",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
b/drivers/gpu/drm/amd/amdgpu

[PATCH 1/1] drm/amdgpu: unify BO evicting method in amdgpu_ttm

2021-10-07 Thread Nirmoy Das
Unify BO evicting functionality for possible memory
types in amdgpu_ttm.c.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  8 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  | 30 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 23 
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 30 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  1 +
 6 files changed, 58 insertions(+), 35 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 5497e2d31d1a..164d6a9e9fbb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1328,7 +1328,7 @@ static int amdgpu_debugfs_evict_vram(void *data, u64 *val)
return r;
}
 
-   *val = amdgpu_bo_evict_vram(adev);
+   *val = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1341,17 +1341,15 @@ static int amdgpu_debugfs_evict_gtt(void *data, u64 
*val)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)data;
struct drm_device *dev = adev_to_drm(adev);
-   struct ttm_resource_manager *man;
int r;
 
r = pm_runtime_get_sync(dev->dev);
if (r < 0) {
-   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
+   pm_runtime_put_autosuspend(dev->dev);
return r;
}
 
-   man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
-   *val = ttm_resource_manager_evict_all(&adev->mman.bdev, man);
+   *val = amdgpu_ttm_evict_resources(adev, TTM_PL_TT);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 57638fe9cfc2..032deca4cea2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3880,6 +3880,25 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
 
 }
 
+/**
+ * amdgpu_device_evict_resources - evict device resources
+ * @adev: amdgpu device object
+ *
+ * Evicts all ttm device resources(vram BOs, gart table) from the lru list
+ * of the vram memory type. Mainly used for evicting device resources
+ * at suspend time.
+ *
+ */
+void amdgpu_device_evict_resources(struct amdgpu_device *adev)
+{
+   /* No need to evict vram on APUs for suspend to ram */
+   if (adev->in_s3 && (adev->flags & AMD_IS_APU))
+   return;
+
+   if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
+   DRM_WARN("evicting device resources failed\n");
+
+}
 
 /*
  * Suspend & resume.
@@ -3920,17 +3939,16 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
if (!adev->in_s0ix)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);
 
-   /* evict vram memory */
-   amdgpu_bo_evict_vram(adev);
+   /* First evict vram memory */
+   amdgpu_device_evict_resources(adev);
 
amdgpu_fence_driver_hw_fini(adev);
 
amdgpu_device_ip_suspend_phase2(adev);
-   /* evict remaining vram memory
-* This second call to evict vram is to evict the gart page table
-* using the CPU.
+   /* This second call to evict device resources is to evict
+* the gart page table using the CPU.
 */
-   amdgpu_bo_evict_vram(adev);
+   amdgpu_device_evict_resources(adev);
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 4ec904f36ceb..073ba2af0b9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1004,29 +1004,6 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
}
 }
 
-/**
- * amdgpu_bo_evict_vram - evict VRAM buffers
- * @adev: amdgpu device object
- *
- * Evicts all VRAM buffers on the lru list of the memory type.
- * Mainly used for evicting vram at suspend time.
- *
- * Returns:
- * 0 for success or a negative error code on failure.
- */
-int amdgpu_bo_evict_vram(struct amdgpu_device *adev)
-{
-   struct ttm_resource_manager *man;
-
-   if (adev->in_s3 && (adev->flags & AMD_IS_APU)) {
-   /* No need to evict vram on APUs for suspend to ram */
-   return 0;
-   }
-
-   man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
-   return ttm_resource_manager_evict_all(&adev->mman.bdev, man);
-}
-
 static const char *amdgpu_vram_names[] = {
"UNKNOWN",
"GDDR1",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 8ff61bad4138..d787e0e89e0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/d

Re: [PATCH 1/1] drm/amdgpu: unify BO evicting method in amdgpu_ttm

2021-10-07 Thread Nirmoy



On 10/7/21 8:08 AM, Christian König wrote:



Am 06.10.21 um 18:04 schrieb Nirmoy Das:

Unify BO evicting functionality for possible memory
types in amdgpu_ttm.c and remove corresponding function
from amdgpu_object.c.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  8 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 23 
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  1 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 30 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  1 +
  6 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 5497e2d31d1a..22f3de29d783 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1328,7 +1328,7 @@ static int amdgpu_debugfs_evict_vram(void 
*data, u64 *val)

  return r;
  }
  -    *val = amdgpu_bo_evict_vram(adev);
+    *val = amdgpu_bo_evict_memory(adev, TTM_PL_VRAM);
    pm_runtime_mark_last_busy(dev->dev);
  pm_runtime_put_autosuspend(dev->dev);
@@ -1341,17 +1341,15 @@ static int amdgpu_debugfs_evict_gtt(void 
*data, u64 *val)

  {
  struct amdgpu_device *adev = (struct amdgpu_device *)data;
  struct drm_device *dev = adev_to_drm(adev);
-    struct ttm_resource_manager *man;
  int r;
    r = pm_runtime_get_sync(dev->dev);
  if (r < 0) {
-    pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
+    pm_runtime_put_autosuspend(dev->dev);
  return r;
  }
-    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
-    *val = ttm_resource_manager_evict_all(&adev->mman.bdev, man);
+    *val = amdgpu_bo_evict_memory(adev, TTM_PL_TT);
    pm_runtime_mark_last_busy(dev->dev);
  pm_runtime_put_autosuspend(dev->dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

index 57638fe9cfc2..c441ebe9da11 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3921,7 +3921,7 @@ int amdgpu_device_suspend(struct drm_device 
*dev, bool fbcon)

  amdgpu_amdkfd_suspend(adev, adev->in_runpm);
    /* evict vram memory */
-    amdgpu_bo_evict_vram(adev);
+    amdgpu_bo_evict_memory(adev, TTM_PL_VRAM);
    amdgpu_fence_driver_hw_fini(adev);
  @@ -3930,7 +3930,7 @@ int amdgpu_device_suspend(struct drm_device 
*dev, bool fbcon)

   * This second call to evict vram is to evict the gart page table
   * using the CPU.
   */
-    amdgpu_bo_evict_vram(adev);
+    amdgpu_bo_evict_memory(adev, TTM_PL_VRAM);


Those two call are now missing the "(adev->in_s3 && (adev->flags & 
AMD_IS_APU))" check.



Thanks, not sure how I always miss such details :/


I will resend a v3.


Nirmoy



Probably best if you move that into a amdgpu_device_evict_vram() helper.


    return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 4ec904f36ceb..073ba2af0b9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1004,29 +1004,6 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
  }
  }
  -/**
- * amdgpu_bo_evict_vram - evict VRAM buffers
- * @adev: amdgpu device object
- *
- * Evicts all VRAM buffers on the lru list of the memory type.
- * Mainly used for evicting vram at suspend time.
- *
- * Returns:
- * 0 for success or a negative error code on failure.
- */
-int amdgpu_bo_evict_vram(struct amdgpu_device *adev)
-{
-    struct ttm_resource_manager *man;
-
-    if (adev->in_s3 && (adev->flags & AMD_IS_APU)) {
-    /* No need to evict vram on APUs for suspend to ram */
-    return 0;
-    }
-
-    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
-    return ttm_resource_manager_evict_all(&adev->mman.bdev, man);
-}
-
  static const char *amdgpu_vram_names[] = {
  "UNKNOWN",
  "GDDR1",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

index 8ff61bad4138..d787e0e89e0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -305,7 +305,6 @@ int amdgpu_bo_pin(struct amdgpu_bo *bo, u32 domain);
  int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
   u64 min_offset, u64 max_offset);
  void amdgpu_bo_unpin(struct amdgpu_bo *bo);
-int amdgpu_bo_evict_vram(struct amdgpu_device *adev);
  int amdgpu_bo_init(struct amdgpu_device *adev);
  void amdgpu_bo_fini(struct amdgpu_device *adev);
  int amdgpu_bo_set_tiling_flags(struct amdgpu_bo *bo, u64 
tiling_flags);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

index e2896ac2c9ce..545b4bdeae07 1006

[PATCH 1/1] drm/amdgpu: unify BO evicting method in amdgpu_ttm

2021-10-06 Thread Nirmoy Das
Unify BO evicting functionality for possible memory
types in amdgpu_ttm.c and remove corresponding function
from amdgpu_object.c.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  8 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 23 
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  1 -
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 30 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h |  1 +
 6 files changed, 36 insertions(+), 31 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 5497e2d31d1a..22f3de29d783 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1328,7 +1328,7 @@ static int amdgpu_debugfs_evict_vram(void *data, u64 *val)
return r;
}
 
-   *val = amdgpu_bo_evict_vram(adev);
+   *val = amdgpu_bo_evict_memory(adev, TTM_PL_VRAM);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
@@ -1341,17 +1341,15 @@ static int amdgpu_debugfs_evict_gtt(void *data, u64 
*val)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)data;
struct drm_device *dev = adev_to_drm(adev);
-   struct ttm_resource_manager *man;
int r;
 
r = pm_runtime_get_sync(dev->dev);
if (r < 0) {
-   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
+   pm_runtime_put_autosuspend(dev->dev);
return r;
}
 
-   man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
-   *val = ttm_resource_manager_evict_all(&adev->mman.bdev, man);
+   *val = amdgpu_bo_evict_memory(adev, TTM_PL_TT);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 57638fe9cfc2..c441ebe9da11 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3921,7 +3921,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
amdgpu_amdkfd_suspend(adev, adev->in_runpm);
 
/* evict vram memory */
-   amdgpu_bo_evict_vram(adev);
+   amdgpu_bo_evict_memory(adev, TTM_PL_VRAM);
 
amdgpu_fence_driver_hw_fini(adev);
 
@@ -3930,7 +3930,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool 
fbcon)
 * This second call to evict vram is to evict the gart page table
 * using the CPU.
 */
-   amdgpu_bo_evict_vram(adev);
+   amdgpu_bo_evict_memory(adev, TTM_PL_VRAM);
 
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 4ec904f36ceb..073ba2af0b9c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1004,29 +1004,6 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
}
 }
 
-/**
- * amdgpu_bo_evict_vram - evict VRAM buffers
- * @adev: amdgpu device object
- *
- * Evicts all VRAM buffers on the lru list of the memory type.
- * Mainly used for evicting vram at suspend time.
- *
- * Returns:
- * 0 for success or a negative error code on failure.
- */
-int amdgpu_bo_evict_vram(struct amdgpu_device *adev)
-{
-   struct ttm_resource_manager *man;
-
-   if (adev->in_s3 && (adev->flags & AMD_IS_APU)) {
-   /* No need to evict vram on APUs for suspend to ram */
-   return 0;
-   }
-
-   man = ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM);
-   return ttm_resource_manager_evict_all(&adev->mman.bdev, man);
-}
-
 static const char *amdgpu_vram_names[] = {
"UNKNOWN",
"GDDR1",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 8ff61bad4138..d787e0e89e0b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -305,7 +305,6 @@ int amdgpu_bo_pin(struct amdgpu_bo *bo, u32 domain);
 int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 domain,
 u64 min_offset, u64 max_offset);
 void amdgpu_bo_unpin(struct amdgpu_bo *bo);
-int amdgpu_bo_evict_vram(struct amdgpu_device *adev);
 int amdgpu_bo_init(struct amdgpu_device *adev);
 void amdgpu_bo_fini(struct amdgpu_device *adev);
 int amdgpu_bo_set_tiling_flags(struct amdgpu_bo *bo, u64 tiling_flags);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index e2896ac2c9ce..545b4bdeae07 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -2034,6 +2034,36 @@ int amdgpu_fill_buffer(struct amdgpu_bo *bo,
return r;
 }
 
+/**
+ * amdgpu_bo_evict_memory - evict memory buffers
+ * @adev: am

Re: [PATCH 1/1] drm/amdgpu: add and use amdgpu_bo_evict_gtt

2021-10-06 Thread Das, Nirmoy



On 10/6/2021 4:58 PM, Christian König wrote:

Am 06.10.21 um 16:45 schrieb Nirmoy Das:

Unify BO evicting functionality for VRAM and TT memory
types in amdgpu_object.c. Use amdgpu_bo_evict_gtt()
for evicting gtt memory similar to how we do that
for amdgpu_debugfs_evict_vram().

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  6 +--
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 52 +++--
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  1 +
  3 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 5497e2d31d1a..67045983d63d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1341,17 +1341,15 @@ static int amdgpu_debugfs_evict_gtt(void 
*data, u64 *val)

  {
  struct amdgpu_device *adev = (struct amdgpu_device *)data;
  struct drm_device *dev = adev_to_drm(adev);
-    struct ttm_resource_manager *man;
  int r;
    r = pm_runtime_get_sync(dev->dev);
  if (r < 0) {
-    pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
+    pm_runtime_put_autosuspend(dev->dev);
  return r;
  }
-    man = ttm_manager_type(&adev->mman.bdev, TTM_PL_TT);
-    *val = ttm_resource_manager_evict_all(&adev->mman.bdev, man);
+    *val = amdgpu_bo_evict_gtt(adev);
    pm_runtime_mark_last_busy(dev->dev);
  pm_runtime_put_autosuspend(dev->dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

index 4ec904f36ceb..3b8c9cf44d74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1005,10 +1005,37 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
  }
    /**
- * amdgpu_bo_evict_vram - evict VRAM buffers
+ * amdgpu_bo_evict_memory - evict memory buffers
   * @adev: amdgpu device object
+ * @mem_type: evicted BO's memory type
   *
- * Evicts all VRAM buffers on the lru list of the memory type.
+ * Evicts all @mem_type buffers on the lru list of the memory type.
+ *
+ * Returns:
+ * 0 for success or a negative error code on failure.
+ */
+static int amdgpu_bo_evict_memory(struct amdgpu_device *adev, int 
mem_type)


That function should probably be inside amdgpu_ttm.c instead.


+{
+    struct ttm_resource_manager *man;
+
+    switch (mem_type) {
+    case TTM_PL_VRAM:
+    case TTM_PL_TT:
+    man = ttm_manager_type(>mman.bdev, mem_type);
+    break;
+    default:
+    DRM_ERROR("Trying to evict invalid memory type\n");
+    return -EINVAL;


At least in theory we could do that for OA, GWS and GDS as well.



I will add those and take care of other comments and resend v2.


Thanks,

Nirmoy





+    }
+
+    return ttm_resource_manager_evict_all(>mman.bdev, man);
+}
+
+/**
+ * amdgpu_bo_evict_vram - evict vram buffers
+ * @adev: amdgpu device object
+ *
+ * Evicts all vram buffers on the lru list of the memory type.
   * Mainly used for evicting vram at suspend time.
   *
   * Returns:
@@ -1016,17 +1043,32 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
   */
  int amdgpu_bo_evict_vram(struct amdgpu_device *adev)
  {
-    struct ttm_resource_manager *man;
    if (adev->in_s3 && (adev->flags & AMD_IS_APU)) {
  /* No need to evict vram on APUs for suspend to ram */
  return 0;
  }
  -    man = ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
-    return ttm_resource_manager_evict_all(>mman.bdev, man);
+    return amdgpu_bo_evict_memory(adev, TTM_PL_VRAM);
+}
+
+/**
+ * amdgpu_bo_evict_gtt - evict gtt buffers
+ * @adev: amdgpu device object
+ *
+ * Evicts all gtt buffers on the lru list of the memory type.
+ * Mainly used for evicting gtt buffers through debugfs.
+ *
+ * Returns:
+ * 0 for success or a negative error code on failure.
+ */
+
+int amdgpu_bo_evict_gtt(struct amdgpu_device *adev)
+{


I won't add a wrapper for that. This looks like misplaced and overkill.

Christian.


+    return amdgpu_bo_evict_memory(adev, TTM_PL_TT);
  }
  +
  static const char *amdgpu_vram_names[] = {
  "UNKNOWN",
  "GDDR1",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

index 8ff61bad4138..5e9b7710b8e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -306,6 +306,7 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo 
*bo, u32 domain,

   u64 min_offset, u64 max_offset);
  void amdgpu_bo_unpin(struct amdgpu_bo *bo);
  int amdgpu_bo_evict_vram(struct amdgpu_device *adev);
+int amdgpu_bo_evict_gtt(struct amdgpu_device *adev);
  int amdgpu_bo_init(struct amdgpu_device *adev);
  void amdgpu_bo_fini(struct amdgpu_device *adev);
  int amdgpu_bo_set_tiling_flags(struct amdgpu_bo *bo, u64 
tiling_flags);




[PATCH 1/1] drm/amdgpu: add and use amdgpu_bo_evict_gtt

2021-10-06 Thread Nirmoy Das
Unify BO evicting functionality for VRAM and TT memory
types in amdgpu_object.c. Use amdgpu_bo_evict_gtt()
for evicting gtt memory similar to how we do that
for amdgpu_debugfs_evict_vram().

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  6 +--
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c  | 52 +++--
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.h  |  1 +
 3 files changed, 50 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 5497e2d31d1a..67045983d63d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1341,17 +1341,15 @@ static int amdgpu_debugfs_evict_gtt(void *data, u64 
*val)
 {
struct amdgpu_device *adev = (struct amdgpu_device *)data;
struct drm_device *dev = adev_to_drm(adev);
-   struct ttm_resource_manager *man;
int r;
 
r = pm_runtime_get_sync(dev->dev);
if (r < 0) {
-   pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
+   pm_runtime_put_autosuspend(dev->dev);
return r;
}
 
-   man = ttm_manager_type(>mman.bdev, TTM_PL_TT);
-   *val = ttm_resource_manager_evict_all(>mman.bdev, man);
+   *val = amdgpu_bo_evict_gtt(adev);
 
pm_runtime_mark_last_busy(dev->dev);
pm_runtime_put_autosuspend(dev->dev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index 4ec904f36ceb..3b8c9cf44d74 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1005,10 +1005,37 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
 }
 
 /**
- * amdgpu_bo_evict_vram - evict VRAM buffers
+ * amdgpu_bo_evict_memory - evict memory buffers
  * @adev: amdgpu device object
+ * @mem_type: evicted BO's memory type
  *
- * Evicts all VRAM buffers on the lru list of the memory type.
+ * Evicts all @mem_type buffers on the lru list of the memory type.
+ *
+ * Returns:
+ * 0 for success or a negative error code on failure.
+ */
+static int amdgpu_bo_evict_memory(struct amdgpu_device *adev, int mem_type)
+{
+   struct ttm_resource_manager *man;
+
+   switch (mem_type) {
+   case TTM_PL_VRAM:
+   case TTM_PL_TT:
+   man = ttm_manager_type(>mman.bdev, mem_type);
+   break;
+   default:
+   DRM_ERROR("Trying to evict invalid memory type\n");
+   return -EINVAL;
+   }
+
+   return ttm_resource_manager_evict_all(>mman.bdev, man);
+}
+
+/**
+ * amdgpu_bo_evict_vram - evict vram buffers
+ * @adev: amdgpu device object
+ *
+ * Evicts all vram buffers on the lru list of the memory type.
  * Mainly used for evicting vram at suspend time.
  *
  * Returns:
@@ -1016,17 +1043,32 @@ void amdgpu_bo_unpin(struct amdgpu_bo *bo)
  */
 int amdgpu_bo_evict_vram(struct amdgpu_device *adev)
 {
-   struct ttm_resource_manager *man;
 
if (adev->in_s3 && (adev->flags & AMD_IS_APU)) {
/* No need to evict vram on APUs for suspend to ram */
return 0;
}
 
-   man = ttm_manager_type(>mman.bdev, TTM_PL_VRAM);
-   return ttm_resource_manager_evict_all(>mman.bdev, man);
+   return amdgpu_bo_evict_memory(adev, TTM_PL_VRAM);
+}
+
+/**
+ * amdgpu_bo_evict_gtt - evict gtt buffers
+ * @adev: amdgpu device object
+ *
+ * Evicts all gtt buffers on the lru list of the memory type.
+ * Mainly used for evicting gtt buffers through debugfs.
+ *
+ * Returns:
+ * 0 for success or a negative error code on failure.
+ */
+
+int amdgpu_bo_evict_gtt(struct amdgpu_device *adev)
+{
+   return amdgpu_bo_evict_memory(adev, TTM_PL_TT);
 }
 
+
 static const char *amdgpu_vram_names[] = {
"UNKNOWN",
"GDDR1",
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
index 8ff61bad4138..5e9b7710b8e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h
@@ -306,6 +306,7 @@ int amdgpu_bo_pin_restricted(struct amdgpu_bo *bo, u32 
domain,
 u64 min_offset, u64 max_offset);
 void amdgpu_bo_unpin(struct amdgpu_bo *bo);
 int amdgpu_bo_evict_vram(struct amdgpu_device *adev);
+int amdgpu_bo_evict_gtt(struct amdgpu_device *adev);
 int amdgpu_bo_init(struct amdgpu_device *adev);
 void amdgpu_bo_fini(struct amdgpu_device *adev);
 int amdgpu_bo_set_tiling_flags(struct amdgpu_bo *bo, u64 tiling_flags);
-- 
2.32.0



Re: [PATCH 1/1] drm/amdgpu: return early if debugfs is not initialized

2021-10-06 Thread Das, Nirmoy



On 10/6/2021 1:55 PM, Lazar, Lijo wrote:



On 10/6/2021 3:21 PM, Nirmoy Das wrote:

Check first if debugfs is initialized before creating
amdgpu debugfs files.

References: https://gitlab.freedesktop.org/drm/amd/-/issues/1686
Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 3 +++
  1 file changed, 3 insertions(+)



Sorry about another miss. There is one other option added in the patch.

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=a24c6f7bc923d5e2f3139855eb09b0d480d6b410 



"
config DEBUG_FS_DISALLOW_MOUNT
bool "Do not register debugfs as filesystem"
help
  The API is open but filesystem is not loaded. Clients can still 
do their work and read with debug tools that do not need debugfs 
filesystem.

"

This doesn't work under this mode. Guess, we are not worried about this.



It does work with DEBUG_FS_DISALLOW_MOUNT. I tested it with that option.



Reviewed-by: Lijo Lazar 



Thanks,

Nirmoy



Thanks,
Lijo

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 6611b3c7c149..5497e2d31d1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1617,6 +1617,9 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

  struct dentry *ent;
  int r, i;
  +    if (!debugfs_initialized())
+    return 0;
+
  ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
    _ib_preempt);
  if (IS_ERR(ent)) {



[PATCH 1/1] drm/amdgpu: return early if debugfs is not initialized

2021-10-06 Thread Nirmoy Das
Check first if debugfs is initialized before creating
amdgpu debugfs files.

References: https://gitlab.freedesktop.org/drm/amd/-/issues/1686
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 6611b3c7c149..5497e2d31d1a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1617,6 +1617,9 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
struct dentry *ent;
int r, i;
 
+   if (!debugfs_initialized())
+   return 0;
+
ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
  _ib_preempt);
if (IS_ERR(ent)) {
-- 
2.32.0



Re: [PATCH 1/1] drm/amdgpu: ignore -EPERM error from debugfs

2021-10-06 Thread Das, Nirmoy



On 10/6/2021 8:59 AM, Christian König wrote:

Am 06.10.21 um 08:55 schrieb Lazar, Lijo:



On 10/6/2021 12:05 PM, Christian König wrote:

Am 06.10.21 um 08:32 schrieb Lazar, Lijo:



On 10/6/2021 11:49 AM, Christian König wrote:

Am 06.10.21 um 06:51 schrieb Lazar, Lijo:



On 10/5/2021 10:15 PM, Christian König wrote:

Am 05.10.21 um 15:49 schrieb Das, Nirmoy:


On 10/5/2021 3:22 PM, Christian König wrote:



Am 05.10.21 um 15:11 schrieb Nirmoy Das:

Debugfs core APIs will throw -EPERM when user disables debugfs
using CONFIG_DEBUG_FS_ALLOW_NONE or with kernel param. We 
shouldn't
see that as an error. Also validate drm root dentry before 
creating

amdgpu debugfs files.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 ++
  1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 6611b3c7c149..d786072e918b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1617,6 +1617,16 @@ int amdgpu_debugfs_init(struct 
amdgpu_device *adev)

  struct dentry *ent;
  int r, i;
  +    if (IS_ERR(root)) {
+    /* When debugfs is disabled we get -EPERM which is 
not an

+ * error as this is user controllable.
+ */


Well setting primary->debugfs_root to an error code is 
probably not a good idea to begin with.


When debugfs is disabled that should most likely be NULL.



If we set primary->debugfs_root to  NULL then we need to add 
bunch of NULL checks everywhere before creating any debugfs files


because debugfs_create_{file|dir}() with NULL root is still 
valid. I am assuming a hypothetical case when debugfs_root dir 
creation fails even with debugfs enabled


but further calls are successful.  This won't be a problem if we 
propagate the error code.


Yeah, but an error code in members is ugly like hell and 
potentially causes crashes instead.


I strongly suggest to fix this so that root is NULL when debugfs 
isn't available and we add proper checks for that instead.


This shouldn't be done. A NULL is a valid parent for debugfs API. 
An invalid parent is always checked like this

  if (IS_ERR(parent))
    return parent;

Instead of adding redundant work like NULL checks, let the API do 
its work and don't break the API contract. For ex: usage of 
sample client, you may look at the drm usage; it does the same.


Yeah, but that is horrible API design and should be avoided.

ERR_PTR(), PTR_ERR(), IS_ERR() and similar are supposed to be used 
as alternative to signaling errors as return values from functions 
and should *never* ever be used to signal errors in pointer members.




One escape route may be - add another export from debugfs like 
debugfs_is_valid_node() which adheres to the current logic in 
debugfs API and use that in client code. Whenever debugfs changes 
to a different logic from IS_ERR, let that be changed.


Well that would then rather be drm_is_debugfs_enabled(), because 
that we separate debugfs handling into a drm core and individual 
drivers is drm specific.




Had one more look and looks like this will do the job. In other 
cases, API usage is allowed.


if (!debugfs_initialized())
    return;


Yeah, that might work as well.

Potentially a good idea to add that to both the core drm function and 
the amdgpu function. and not attempt to create debugfs files in the 
first place.



Sounds good, I will send patches to add this check.


Thanks,

Nirmoy




Christian.



Thanks,
Lijo


Christian.



Thanks,
Lijo


Regards,
Christian.



Thanks,
Lijo



Regards,
Christian.




Regards,

Nirmoy



Regards,
Christian.


+    if (PTR_ERR(root) == -EPERM)
+    return 0;
+
+    return PTR_ERR(ent);
+    }
+
  ent = debugfs_create_file("amdgpu_preempt_ib", 0600, 
root, adev,

    _ib_preempt);
  if (IS_ERR(ent)) {












Re: [PATCH 1/1] drm/amdgpu: ignore -EPERM error from debugfs

2021-10-05 Thread Das, Nirmoy



On 10/5/2021 3:22 PM, Christian König wrote:



Am 05.10.21 um 15:11 schrieb Nirmoy Das:

Debugfs core APIs will throw -EPERM when user disables debugfs
using CONFIG_DEBUG_FS_ALLOW_NONE or with kernel param. We shouldn't
see that as an error. Also validate drm root dentry before creating
amdgpu debugfs files.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 ++
  1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 6611b3c7c149..d786072e918b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1617,6 +1617,16 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

  struct dentry *ent;
  int r, i;
  +    if (IS_ERR(root)) {
+    /* When debugfs is disabled we get -EPERM which is not an
+ * error as this is user controllable.
+ */


Well setting primary->debugfs_root to an error code is probably not a 
good idea to begin with.


When debugfs is disabled that should most likely be NULL.



If we set primary->debugfs_root to  NULL then we need to add bunch of 
NULL checks everywhere before creating any debugfs files


because debugfs_create_{file|dir}() with NULL root is still valid.  I am 
assuming a hypothetical case when debugfs_root dir creation fails  even 
with debugfs enabled


but further calls are successful.  This won't be a problem if we 
propagate the error code.



Regards,

Nirmoy



Regards,
Christian.


+    if (PTR_ERR(root) == -EPERM)
+    return 0;
+
+    return PTR_ERR(ent);
+    }
+
  ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
    _ib_preempt);
  if (IS_ERR(ent)) {




[PATCH 1/1] drm/amdgpu: ignore -EPERM error from debugfs

2021-10-05 Thread Nirmoy Das
Debugfs core APIs will throw -EPERM when user disables debugfs
using CONFIG_DEBUG_FS_ALLOW_NONE or with kernel param. We shouldn't
see that as an error. Also validate drm root dentry before creating
amdgpu debugfs files.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 6611b3c7c149..d786072e918b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1617,6 +1617,16 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
struct dentry *ent;
int r, i;
 
+   if (IS_ERR(root)) {
+   /* When debugfs is disabled we get -EPERM which is not an
+* error as this is user controllable.
+*/
+   if (PTR_ERR(root) == -EPERM)
+   return 0;
+
+   return PTR_ERR(ent);
+   }
+
ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
  _ib_preempt);
if (IS_ERR(ent)) {
-- 
2.32.0



Re: [PATCH 1/1] drm/amdgpu: init debugfs drm driver callback

2021-10-05 Thread Das, Nirmoy



On 10/5/2021 2:41 PM, Christian König wrote:



Am 05.10.21 um 14:20 schrieb Das, Nirmoy:

Hi Christian,

On 10/5/2021 2:01 PM, Christian König wrote:

Am 05.10.21 um 13:58 schrieb Nirmoy Das:

drm_dev_register() will try to init driver's debugfs using
drm_driver.debugfs_init callback function. Use that callback
also for amdgpu to initialize debugfs.


Mhm, why is that useful? We rather wanted to get rid of all this DRM 
midlayering.



Actually main issue I am trying to solve is:

When user disables debugfs with CONFIG_DEBUG_FS_ALLOW_NONE, amdgpu 
gets EPERM and throws a DRM_ERROR even though it is not an error as 
this is user controllable.


Shall I just make all debugfs error logs to DRM_WARN ?

ref: 
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fdrm%2Famd%2F-%2Fissues%2F1686%23note_1052168data=04%7C01%7Cnirmoy.das%40amd.com%7C63032bdca1394c92e88808d987fd867b%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637690345246933980%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=eXyj6wlxD0YGiNsjB4smmRAm2KKGRtq%2FWiDSMzEWTo8%3Dreserved=0


Why not just add an "if (!root) return" at the beginning of 
amdgpu_debugfs_init() ?



This is fine too, I will resend.


Nirmoy



Regards,
Christian.



Regards,

Nirmoy



Christian.



Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  7 +++
  3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 6611b3c7c149..3076742f8f85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1611,8 +1611,9 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
  amdgpu_debugfs_sclk_set, "%llu\n");
  -int amdgpu_debugfs_init(struct amdgpu_device *adev)
+void amdgpu_debugfs_init(struct drm_minor *minor)
  {
+    struct amdgpu_device *adev = drm_to_adev(minor->dev);
  struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
  struct dentry *ent;
  int r, i;
@@ -1621,14 +1622,14 @@ int amdgpu_debugfs_init(struct 
amdgpu_device *adev)

    _ib_preempt);
  if (IS_ERR(ent)) {
  DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs 
file\n");

-    return PTR_ERR(ent);
+    return;
  }
    ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, 
adev,

    _sclk_set);
  if (IS_ERR(ent)) {
  DRM_ERROR("unable to create amdgpu_set_sclk debugsfs 
file\n");

-    return PTR_ERR(ent);
+    return;
  }
    /* Register debugfs entries for amdgpu_ttm */
@@ -1682,11 +1683,10 @@ int amdgpu_debugfs_init(struct 
amdgpu_device *adev)

  debugfs_create_blob("amdgpu_discovery", 0444, root,
  >debugfs_discovery_blob);
  -    return 0;
  }
    #else
-int amdgpu_debugfs_init(struct amdgpu_device *adev)
+void amdgpu_debugfs_init(struct drm_minor *minor)
  {
  return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h

index 371a6f0deb29..06b68e16e35d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -27,7 +27,7 @@
   */
    int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
-int amdgpu_debugfs_init(struct amdgpu_device *adev);
+void amdgpu_debugfs_init(struct drm_minor *minor);
  void amdgpu_debugfs_fini(struct amdgpu_device *adev);
  void amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
  void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index df83b1f438b6..ceda650895db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2012,10 +2012,6 @@ static int amdgpu_pci_probe(struct pci_dev 
*pdev,

  drm_fbdev_generic_setup(adev_to_drm(adev), 32);
  }
  -    ret = amdgpu_debugfs_init(adev);
-    if (ret)
-    DRM_ERROR("Creating debugfs files failed (%d).\n", ret);
-
  return 0;
    err_pci:
@@ -2479,6 +2475,9 @@ static const struct drm_driver 
amdgpu_kms_driver = {

  .dumb_map_offset = amdgpu_mode_dumb_mmap,
  .fops = _driver_kms_fops,
  .release = _driver_release_kms,
+#if defined(CONFIG_DEBUG_FS)
+    .debugfs_init = amdgpu_debugfs_init,
+#endif
    .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
  .prime_fd_to_handle = drm_gem_prime_fd_to_handle,






Re: [PATCH 1/1] drm/amdgpu: init debugfs drm driver callback

2021-10-05 Thread Das, Nirmoy

Hi Christian,

On 10/5/2021 2:01 PM, Christian König wrote:

Am 05.10.21 um 13:58 schrieb Nirmoy Das:

drm_dev_register() will try to init driver's debugfs using
drm_driver.debugfs_init callback function. Use that callback
also for amdgpu to initialize debugfs.


Mhm, why is that useful? We rather wanted to get rid of all this DRM 
midlayering.



Actually main issue I am trying to solve is:

When user disables debugfs with CONFIG_DEBUG_FS_ALLOW_NONE, amdgpu gets 
EPERM and throws a DRM_ERROR even though it is not an error as this is 
user controllable.


Shall I just make all debugfs error logs to DRM_WARN ?

ref: https://gitlab.freedesktop.org/drm/amd/-/issues/1686#note_1052168

Regards,

Nirmoy



Christian.



Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  7 +++
  3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 6611b3c7c149..3076742f8f85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1611,8 +1611,9 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
  amdgpu_debugfs_sclk_set, "%llu\n");
  -int amdgpu_debugfs_init(struct amdgpu_device *adev)
+void amdgpu_debugfs_init(struct drm_minor *minor)
  {
+    struct amdgpu_device *adev = drm_to_adev(minor->dev);
  struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
  struct dentry *ent;
  int r, i;
@@ -1621,14 +1622,14 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

    _ib_preempt);
  if (IS_ERR(ent)) {
  DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs 
file\n");

-    return PTR_ERR(ent);
+    return;
  }
    ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, adev,
    _sclk_set);
  if (IS_ERR(ent)) {
  DRM_ERROR("unable to create amdgpu_set_sclk debugsfs file\n");
-    return PTR_ERR(ent);
+    return;
  }
    /* Register debugfs entries for amdgpu_ttm */
@@ -1682,11 +1683,10 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

  debugfs_create_blob("amdgpu_discovery", 0444, root,
  >debugfs_discovery_blob);
  -    return 0;
  }
    #else
-int amdgpu_debugfs_init(struct amdgpu_device *adev)
+void amdgpu_debugfs_init(struct drm_minor *minor)
  {
  return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h

index 371a6f0deb29..06b68e16e35d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -27,7 +27,7 @@
   */
    int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
-int amdgpu_debugfs_init(struct amdgpu_device *adev);
+void amdgpu_debugfs_init(struct drm_minor *minor);
  void amdgpu_debugfs_fini(struct amdgpu_device *adev);
  void amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
  void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index df83b1f438b6..ceda650895db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2012,10 +2012,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
  drm_fbdev_generic_setup(adev_to_drm(adev), 32);
  }
  -    ret = amdgpu_debugfs_init(adev);
-    if (ret)
-    DRM_ERROR("Creating debugfs files failed (%d).\n", ret);
-
  return 0;
    err_pci:
@@ -2479,6 +2475,9 @@ static const struct drm_driver 
amdgpu_kms_driver = {

  .dumb_map_offset = amdgpu_mode_dumb_mmap,
  .fops = _driver_kms_fops,
  .release = _driver_release_kms,
+#if defined(CONFIG_DEBUG_FS)
+    .debugfs_init = amdgpu_debugfs_init,
+#endif
    .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
  .prime_fd_to_handle = drm_gem_prime_fd_to_handle,




Re: [PATCH 1/1] drm/amdgpu: init debugfs drm driver callback

2021-10-05 Thread Das, Nirmoy



On 10/5/2021 2:01 PM, Christian König wrote:

Am 05.10.21 um 13:58 schrieb Nirmoy Das:

drm_dev_register() will try to init driver's debugfs using
drm_driver.debugfs_init callback function. Use that callback
also for amdgpu to initialize debugfs.


Mhm, why is that useful? We rather wanted to get rid of all this DRM 
midlayering.



I was thinking of not calling further debugfs  APIs if we are unable to 
create the root dentry itself by adding another


patch in drm_debugfs_init(). But I agree with removing DRM midlayering, 
I  will then add a  IS_ERR(root) check in amdgpu_debugfs_init()



Nirmoy



Christian.



Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  7 +++
  3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 6611b3c7c149..3076742f8f85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1611,8 +1611,9 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
  amdgpu_debugfs_sclk_set, "%llu\n");
  -int amdgpu_debugfs_init(struct amdgpu_device *adev)
+void amdgpu_debugfs_init(struct drm_minor *minor)
  {
+    struct amdgpu_device *adev = drm_to_adev(minor->dev);
  struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
  struct dentry *ent;
  int r, i;
@@ -1621,14 +1622,14 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

    _ib_preempt);
  if (IS_ERR(ent)) {
  DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs 
file\n");

-    return PTR_ERR(ent);
+    return;
  }
    ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, adev,
    _sclk_set);
  if (IS_ERR(ent)) {
  DRM_ERROR("unable to create amdgpu_set_sclk debugsfs file\n");
-    return PTR_ERR(ent);
+    return;
  }
    /* Register debugfs entries for amdgpu_ttm */
@@ -1682,11 +1683,10 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

  debugfs_create_blob("amdgpu_discovery", 0444, root,
  >debugfs_discovery_blob);
  -    return 0;
  }
    #else
-int amdgpu_debugfs_init(struct amdgpu_device *adev)
+void amdgpu_debugfs_init(struct drm_minor *minor)
  {
  return 0;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h

index 371a6f0deb29..06b68e16e35d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -27,7 +27,7 @@
   */
    int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
-int amdgpu_debugfs_init(struct amdgpu_device *adev);
+void amdgpu_debugfs_init(struct drm_minor *minor);
  void amdgpu_debugfs_fini(struct amdgpu_device *adev);
  void amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
  void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c

index df83b1f438b6..ceda650895db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2012,10 +2012,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
  drm_fbdev_generic_setup(adev_to_drm(adev), 32);
  }
  -    ret = amdgpu_debugfs_init(adev);
-    if (ret)
-    DRM_ERROR("Creating debugfs files failed (%d).\n", ret);
-
  return 0;
    err_pci:
@@ -2479,6 +2475,9 @@ static const struct drm_driver 
amdgpu_kms_driver = {

  .dumb_map_offset = amdgpu_mode_dumb_mmap,
  .fops = _driver_kms_fops,
  .release = _driver_release_kms,
+#if defined(CONFIG_DEBUG_FS)
+    .debugfs_init = amdgpu_debugfs_init,
+#endif
    .prime_handle_to_fd = drm_gem_prime_handle_to_fd,
  .prime_fd_to_handle = drm_gem_prime_fd_to_handle,




Re: [PATCH 1/1] drm/amdgpu: init debugfs drm driver callback

2021-10-05 Thread Das, Nirmoy



On 10/5/2021 1:58 PM, Nirmoy Das wrote:

drm_dev_register() will try to init driver's debugfs using
drm_driver.debugfs_init callback function. Use that callback
also for amdgpu to initialize debugfs.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  7 +++
  3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 6611b3c7c149..3076742f8f85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1611,8 +1611,9 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
  DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
amdgpu_debugfs_sclk_set, "%llu\n");
  
-int amdgpu_debugfs_init(struct amdgpu_device *adev)

+void amdgpu_debugfs_init(struct drm_minor *minor)
  {
+   struct amdgpu_device *adev = drm_to_adev(minor->dev);
struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
struct dentry *ent;
int r, i;
@@ -1621,14 +1622,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
  _ib_preempt);
if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs file\n");
-   return PTR_ERR(ent);
+   return;
}
  
  	ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, adev,

  _sclk_set);
if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_set_sclk debugsfs file\n");
-   return PTR_ERR(ent);
+   return;
}
  
  	/* Register debugfs entries for amdgpu_ttm */

@@ -1682,11 +1683,10 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
debugfs_create_blob("amdgpu_discovery", 0444, root,
>debugfs_discovery_blob);
  
-	return 0;

  }
  
  #else

-int amdgpu_debugfs_init(struct amdgpu_device *adev)
+void amdgpu_debugfs_init(struct drm_minor *minor)
  {
return 0;



Ah, this should be just "return".


  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index 371a6f0deb29..06b68e16e35d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -27,7 +27,7 @@
   */
  
  int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);

-int amdgpu_debugfs_init(struct amdgpu_device *adev);
+void amdgpu_debugfs_init(struct drm_minor *minor);
  void amdgpu_debugfs_fini(struct amdgpu_device *adev);
  void amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
  void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index df83b1f438b6..ceda650895db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2012,10 +2012,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
drm_fbdev_generic_setup(adev_to_drm(adev), 32);
}
  
-	ret = amdgpu_debugfs_init(adev);

-   if (ret)
-   DRM_ERROR("Creating debugfs files failed (%d).\n", ret);
-
return 0;
  
  err_pci:

@@ -2479,6 +2475,9 @@ static const struct drm_driver amdgpu_kms_driver = {
.dumb_map_offset = amdgpu_mode_dumb_mmap,
.fops = _driver_kms_fops,
.release = _driver_release_kms,
+#if defined(CONFIG_DEBUG_FS)
+   .debugfs_init = amdgpu_debugfs_init,
+#endif
  
  	.prime_handle_to_fd = drm_gem_prime_handle_to_fd,

.prime_fd_to_handle = drm_gem_prime_fd_to_handle,


[PATCH 1/1] drm/amdgpu: init debugfs drm driver callback

2021-10-05 Thread Nirmoy Das
drm_dev_register() will try to init driver's debugfs using
drm_driver.debugfs_init call back function. Use that callback
also for amdgpu to initialize debugfs.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c |  7 +++
 3 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 6611b3c7c149..3076742f8f85 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1611,8 +1611,9 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ib_preempt, NULL,
 DEFINE_DEBUGFS_ATTRIBUTE(fops_sclk_set, NULL,
amdgpu_debugfs_sclk_set, "%llu\n");
 
-int amdgpu_debugfs_init(struct amdgpu_device *adev)
+void amdgpu_debugfs_init(struct drm_minor *minor)
 {
+   struct amdgpu_device *adev = drm_to_adev(minor->dev);
struct dentry *root = adev_to_drm(adev)->primary->debugfs_root;
struct dentry *ent;
int r, i;
@@ -1621,14 +1622,14 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
  _ib_preempt);
if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs file\n");
-   return PTR_ERR(ent);
+   return;
}
 
ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, adev,
  _sclk_set);
if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_set_sclk debugsfs file\n");
-   return PTR_ERR(ent);
+   return;
}
 
/* Register debugfs entries for amdgpu_ttm */
@@ -1682,11 +1683,10 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
debugfs_create_blob("amdgpu_discovery", 0444, root,
>debugfs_discovery_blob);
 
-   return 0;
 }
 
 #else
-int amdgpu_debugfs_init(struct amdgpu_device *adev)
+void amdgpu_debugfs_init(struct drm_minor *minor)
 {
return 0;
 }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index 371a6f0deb29..06b68e16e35d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -27,7 +27,7 @@
  */
 
 int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);
-int amdgpu_debugfs_init(struct amdgpu_device *adev);
+void amdgpu_debugfs_init(struct drm_minor *minor);
 void amdgpu_debugfs_fini(struct amdgpu_device *adev);
 void amdgpu_debugfs_fence_init(struct amdgpu_device *adev);
 void amdgpu_debugfs_firmware_init(struct amdgpu_device *adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
index df83b1f438b6..ceda650895db 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
@@ -2012,10 +2012,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev,
drm_fbdev_generic_setup(adev_to_drm(adev), 32);
}
 
-   ret = amdgpu_debugfs_init(adev);
-   if (ret)
-   DRM_ERROR("Creating debugfs files failed (%d).\n", ret);
-
return 0;
 
 err_pci:
@@ -2479,6 +2475,9 @@ static const struct drm_driver amdgpu_kms_driver = {
.dumb_map_offset = amdgpu_mode_dumb_mmap,
.fops = _driver_kms_fops,
.release = _driver_release_kms,
+#if defined(CONFIG_DEBUG_FS)
+   .debugfs_init = amdgpu_debugfs_init,
+#endif
 
.prime_handle_to_fd = drm_gem_prime_handle_to_fd,
.prime_fd_to_handle = drm_gem_prime_fd_to_handle,
-- 
2.32.0



Re: [PATCH] drm/amdgpu: revert "Add autodump debugfs node for gpu reset v8"

2021-09-30 Thread Das, Nirmoy

Acked-by: Nirmoy Das 

On 9/30/2021 11:26 AM, Christian König wrote:

This reverts commit 728e7e0cd61899208e924472b9e641dbeb0775c4.

Further discussion reveals that this feature is severely broken
and needs to be reverted ASAP.

GPU reset can never be delayed by userspace even for debugging or
otherwise we can run into in kernel deadlocks.

Signed-off-by: Christian König 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h |  2 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 80 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h |  5 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c  |  4 --
  4 files changed, 91 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index dc3c6b3a00e5..6a1928a720a6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1078,8 +1078,6 @@ struct amdgpu_device {
charproduct_name[32];
charserial[20];
  
-	struct amdgpu_autodump		autodump;

-
atomic_tthrottling_logging_enabled;
struct ratelimit_state  throttling_logging_rs;
uint32_tras_hw_enabled;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 277128846dd1..0b89ba142a59 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -27,7 +27,6 @@
  #include 
  #include 
  #include 
-#include 
  
  #include "amdgpu.h"

  #include "amdgpu_pm.h"
@@ -37,85 +36,7 @@
  #include "amdgpu_securedisplay.h"
  #include "amdgpu_fw_attestation.h"
  
-int amdgpu_debugfs_wait_dump(struct amdgpu_device *adev)

-{
  #if defined(CONFIG_DEBUG_FS)
-   unsigned long timeout = 600 * HZ;
-   int ret;
-
-   wake_up_interruptible(>autodump.gpu_hang);
-
-   ret = 
wait_for_completion_interruptible_timeout(>autodump.dumping, timeout);
-   if (ret == 0) {
-   pr_err("autodump: timeout, move on to gpu recovery\n");
-   return -ETIMEDOUT;
-   }
-#endif
-   return 0;
-}
-
-#if defined(CONFIG_DEBUG_FS)
-
-static int amdgpu_debugfs_autodump_open(struct inode *inode, struct file *file)
-{
-   struct amdgpu_device *adev = inode->i_private;
-   int ret;
-
-   file->private_data = adev;
-
-   ret = down_read_killable(>reset_sem);
-   if (ret)
-   return ret;
-
-   if (adev->autodump.dumping.done) {
-   reinit_completion(>autodump.dumping);
-   ret = 0;
-   } else {
-   ret = -EBUSY;
-   }
-
-   up_read(>reset_sem);
-
-   return ret;
-}
-
-static int amdgpu_debugfs_autodump_release(struct inode *inode, struct file 
*file)
-{
-   struct amdgpu_device *adev = file->private_data;
-
-   complete_all(>autodump.dumping);
-   return 0;
-}
-
-static unsigned int amdgpu_debugfs_autodump_poll(struct file *file, struct 
poll_table_struct *poll_table)
-{
-   struct amdgpu_device *adev = file->private_data;
-
-   poll_wait(file, >autodump.gpu_hang, poll_table);
-
-   if (amdgpu_in_reset(adev))
-   return POLLIN | POLLRDNORM | POLLWRNORM;
-
-   return 0;
-}
-
-static const struct file_operations autodump_debug_fops = {
-   .owner = THIS_MODULE,
-   .open = amdgpu_debugfs_autodump_open,
-   .poll = amdgpu_debugfs_autodump_poll,
-   .release = amdgpu_debugfs_autodump_release,
-};
-
-static void amdgpu_debugfs_autodump_init(struct amdgpu_device *adev)
-{
-   init_completion(>autodump.dumping);
-   complete_all(>autodump.dumping);
-   init_waitqueue_head(>autodump.gpu_hang);
-
-   debugfs_create_file("amdgpu_autodump", 0600,
-   adev_to_drm(adev)->primary->debugfs_root,
-   adev, _debug_fops);
-}
  
  /**

   * amdgpu_debugfs_process_reg_op - Handle MMIO register reads/writes
@@ -1590,7 +1511,6 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
}
  
  	amdgpu_ras_debugfs_create_all(adev);

-   amdgpu_debugfs_autodump_init(adev);
amdgpu_rap_debugfs_init(adev);
amdgpu_securedisplay_debugfs_init(adev);
amdgpu_fw_attestation_debugfs_init(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
index 141a8474e24f..8b641f40fdf6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.h
@@ -26,10 +26,6 @@
  /*
   * Debugfs
   */
-struct amdgpu_autodump {
-   struct completion   dumping;
-   struct wait_queue_head  gpu_hang;
-};
  
  int amdgpu_debugfs_regs_init(struct amdgpu_device *adev);

  int amdgpu_debugfs_init(struct amdgpu_device *adev);
@@ -37,4 +33,3 @@ void amdgpu_debugfs_fini(struct amdgpu_device *adev);
  void 

[PATCH 1/1] drm/radeon: pass drm dev radeon_agp_head_init directly

2021-09-13 Thread Nirmoy Das
Pass drm dev directly as rdev->ddev gets initialized later on
at radeon_device_init().

Bug: https://bugzilla.kernel.org/show_bug.cgi?id=214375
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/radeon/radeon_kms.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/radeon/radeon_kms.c 
b/drivers/gpu/drm/radeon/radeon_kms.c
index 0473583dcdac..482fb0ae6cb5 100644
--- a/drivers/gpu/drm/radeon/radeon_kms.c
+++ b/drivers/gpu/drm/radeon/radeon_kms.c
@@ -119,7 +119,7 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned 
long flags)
 #endif
 
if (pci_find_capability(pdev, PCI_CAP_ID_AGP))
-   rdev->agp = radeon_agp_head_init(rdev->ddev);
+   rdev->agp = radeon_agp_head_init(dev);
if (rdev->agp) {
rdev->agp->agp_mtrr = arch_phys_wc_add(
rdev->agp->agp_info.aper_base,
-- 
2.32.0



Re: [PATCH 2/2] drm/amdgpu: cleanup debugfs for amdgpu rings

2021-09-08 Thread Das, Nirmoy



On 9/6/2021 6:45 PM, Sharma, Shashank wrote:



On 9/5/2021 5:01 PM, Das, Nirmoy wrote:


On 9/5/2021 10:03 AM, Sharma, Shashank wrote:



On 9/3/2021 9:44 PM, Das, Nirmoy wrote:

Hi Shashank,

On 9/3/2021 5:51 PM, Das, Nirmoy wrote:


On 9/3/2021 5:26 PM, Sharma, Shashank wrote:



On 9/3/2021 1:39 PM, Das, Nirmoy wrote:


On 9/3/2021 8:36 AM, Sharma, Shashank wrote:



On 9/2/2021 5:14 PM, Nirmoy Das wrote:

Use debugfs_create_file_size API for creating ring debugfs
file, also cleanup surrounding code.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 16 
+---

  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  8 +---
  3 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 077f9baf74fe..dee56ab19a8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1734,9 +1734,7 @@ int amdgpu_debugfs_init(struct 
amdgpu_device *adev)

  if (!ring)
  continue;
  -    if (amdgpu_debugfs_ring_init(adev, ring)) {
-    DRM_ERROR("Failed to register debugfs file for 
rings !\n");

-    }
+    amdgpu_debugfs_ring_init(adev, ring);
  }
    amdgpu_ras_debugfs_create_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

index f40753e1a60d..968521d80514 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -415,26 +415,20 @@ static const struct file_operations 
amdgpu_debugfs_ring_fops = {

    #endif
  -int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
   struct amdgpu_ring *ring)
  {
  #if defined(CONFIG_DEBUG_FS)
  struct drm_minor *minor = adev_to_drm(adev)->primary;
-    struct dentry *ent, *root = minor->debugfs_root;
+    struct dentry *root = minor->debugfs_root;
  char name[32];
    sprintf(name, "amdgpu_ring_%s", ring->name);
  -    ent = debugfs_create_file(name,
-  S_IFREG | S_IRUGO, root,
-  ring, _debugfs_ring_fops);
-    if (IS_ERR(ent))
-    return -ENOMEM;


Why are we doing this ? Why to make it void from int ?



We tend to ignore debugfs return values as those are not serious 
errors. This is to sync with the rest of our


debugfs calls.


Regards,

Nirmoy




I am not sure if completely removing the provision of return 
value is a good way of doing it, we can always ignore it at the 
caller side, isn't it ?




I just realized while making the change debugfs_create_file_size() 
is void return, so we don't have anything useful to return in 
amdgpu_debugfs_ring_init()





Ah, it makes better sense now. Probably just a mention in the body 
of the message that we are moving from debugfs_create_file() to 
debugfs_create_file_size(), will make this change of return type 
more logical.



Yes, I have that "Use debugfs_create_file_size API for creating ring 
debugfs file,..."





My bad, I was too focused (and a bit confused due to usage of 
clean-up) around the code change.


Suggestion for message: Use debugfs_create_file_size API for creating 
ring debugfs, and as its a NULL returning API, change the return type 
for amdgpu_debugfs_ring_init API as well.



Thanks Shashank, I pushed the change with your suggested commit message.


Nirmoy




With (or even without) this change, please feel free to use:

Reviewed-by: Shashank Sharma 

- Shashank


Nirmoy



- Shashank


Regards,

Nirmoy





Yes, we are currently throwing an error msg and ignoring it. I 
don't have a strong opinion regarding this, I will send a v2 
restoring previous behavior.



Thanks,

Nirmoy




- Shashank



- Shashank



-
-    i_size_write(ent->d_inode, ring->ring_size + 12);
-    ring->ent = ent;
+    debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, 
ring,

+ _debugfs_ring_fops,
+ ring->ring_size + 12);
  #endif
-    return 0;
  }
    /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index 88d80eb3fea1..c29fbce0a5b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,10 +253,6 @@ struct amdgpu_ring {
  bool    has_compute_vm_bug;
  bool    no_scheduler;
  int    hw_prio;
-
-#if defined(CONFIG_DEBUG_FS)
-    struct dentry *ent;
-#endif
  };
    #define amdgpu_ring_parse_cs(r, p, ib) 
((r)->funcs->parse_cs((p), (ib)))
@@ -356,8 +352,6 @@ static inline void 
amdgpu_ring_write_multiple(struct amdgpu_ring *ring,

    int amdgpu_ring_test_helper(struct amdgpu_ring *ring);
  -int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_rin

Re: [PATCH 1/2] drm/amdgpu: fix use after free during BO move

2021-09-07 Thread Das, Nirmoy
 Acked-by: Nirmoy Das  for the 1st patch and second 
patch is


Reviewed-by: Nirmoy Das 


On 9/7/2021 10:14 AM, Christian König wrote:

The memory backing old_mem is already freed at that point, move the
check a bit more up.

Signed-off-by: Christian König 
Fixes: bfa3357ef9ab ("drm/ttm: allocate resource object instead of embedding it 
v2")
Bug: 
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Fgitlab.freedesktop.org%2Fdrm%2Famd%2F-%2Fissues%2F1699data=04%7C01%7Cnirmoy.das%40amd.com%7Ce76c4a0ac29e480fcf7108d971d79344%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637665992971099794%7CUnknown%7CTWFpbGZsb3d8eyJWIjoiMC4wLjAwMDAiLCJQIjoiV2luMzIiLCJBTiI6Ik1haWwiLCJXVCI6Mn0%3D%7C1000sdata=JttDenpA2ZII0Ttktn3HMVodWWU0kJoPVPvQ3%2BnN4sw%3Dreserved=0
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 18 +-
  1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 446943e32e3e..e2896ac2c9ce 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -513,6 +513,15 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, 
bool evict,
goto out;
}
  
+	if (bo->type == ttm_bo_type_device &&

+   new_mem->mem_type == TTM_PL_VRAM &&
+   old_mem->mem_type != TTM_PL_VRAM) {
+   /* amdgpu_bo_fault_reserve_notify will re-set this if the CPU
+* accesses the BO after it's moved.
+*/
+   abo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
+   }
+
if (adev->mman.buffer_funcs_enabled) {
if (((old_mem->mem_type == TTM_PL_SYSTEM &&
  new_mem->mem_type == TTM_PL_VRAM) ||
@@ -543,15 +552,6 @@ static int amdgpu_bo_move(struct ttm_buffer_object *bo, 
bool evict,
return r;
}
  
-	if (bo->type == ttm_bo_type_device &&

-   new_mem->mem_type == TTM_PL_VRAM &&
-   old_mem->mem_type != TTM_PL_VRAM) {
-   /* amdgpu_bo_fault_reserve_notify will re-set this if the CPU
-* accesses the BO after it's moved.
-*/
-   abo->flags &= ~AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED;
-   }
-
  out:
/* update statistics */
atomic64_add(bo->base.size, >num_bytes_moved);


Re: [PATCH 2/2] drm/amdgpu: cleanup debugfs for amdgpu rings

2021-09-05 Thread Das, Nirmoy



On 9/5/2021 10:03 AM, Sharma, Shashank wrote:



On 9/3/2021 9:44 PM, Das, Nirmoy wrote:

Hi Shashank,

On 9/3/2021 5:51 PM, Das, Nirmoy wrote:


On 9/3/2021 5:26 PM, Sharma, Shashank wrote:



On 9/3/2021 1:39 PM, Das, Nirmoy wrote:


On 9/3/2021 8:36 AM, Sharma, Shashank wrote:



On 9/2/2021 5:14 PM, Nirmoy Das wrote:

Use debugfs_create_file_size API for creating ring debugfs
file, also cleanup surrounding code.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 16 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  8 +---
  3 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 077f9baf74fe..dee56ab19a8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1734,9 +1734,7 @@ int amdgpu_debugfs_init(struct 
amdgpu_device *adev)

  if (!ring)
  continue;
  -    if (amdgpu_debugfs_ring_init(adev, ring)) {
-    DRM_ERROR("Failed to register debugfs file for 
rings !\n");

-    }
+    amdgpu_debugfs_ring_init(adev, ring);
  }
    amdgpu_ras_debugfs_create_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

index f40753e1a60d..968521d80514 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -415,26 +415,20 @@ static const struct file_operations 
amdgpu_debugfs_ring_fops = {

    #endif
  -int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
   struct amdgpu_ring *ring)
  {
  #if defined(CONFIG_DEBUG_FS)
  struct drm_minor *minor = adev_to_drm(adev)->primary;
-    struct dentry *ent, *root = minor->debugfs_root;
+    struct dentry *root = minor->debugfs_root;
  char name[32];
    sprintf(name, "amdgpu_ring_%s", ring->name);
  -    ent = debugfs_create_file(name,
-  S_IFREG | S_IRUGO, root,
-  ring, _debugfs_ring_fops);
-    if (IS_ERR(ent))
-    return -ENOMEM;


Why are we doing this ? Why to make it void from int ?



We tend to ignore debugfs return values as those are not serious 
errors. This is to sync with the rest of our


debugfs calls.


Regards,

Nirmoy




I am not sure if completely removing the provision of return value 
is a good way of doing it, we can always ignore it at the caller 
side, isn't it ?




I just realized while making the change debugfs_create_file_size() is 
void return, so we don't have anything useful to return in 
amdgpu_debugfs_ring_init()





Ah, it makes better sense now. Probably just a mention in the body of 
the message that we are moving from debugfs_create_file() to 
debugfs_create_file_size(), will make this change of return type more 
logical.



Yes, I have that "Use debugfs_create_file_size API for creating ring 
debugfs file,..."



Nirmoy



- Shashank


Regards,

Nirmoy





Yes, we are currently throwing an error msg and ignoring it. I don't 
have a strong opinion regarding this, I will send a v2 restoring 
previous behavior.



Thanks,

Nirmoy




- Shashank



- Shashank



-
-    i_size_write(ent->d_inode, ring->ring_size + 12);
-    ring->ent = ent;
+    debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, ring,
+ _debugfs_ring_fops,
+ ring->ring_size + 12);
  #endif
-    return 0;
  }
    /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index 88d80eb3fea1..c29fbce0a5b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,10 +253,6 @@ struct amdgpu_ring {
  bool    has_compute_vm_bug;
  bool    no_scheduler;
  int    hw_prio;
-
-#if defined(CONFIG_DEBUG_FS)
-    struct dentry *ent;
-#endif
  };
    #define amdgpu_ring_parse_cs(r, p, ib) 
((r)->funcs->parse_cs((p), (ib)))
@@ -356,8 +352,6 @@ static inline void 
amdgpu_ring_write_multiple(struct amdgpu_ring *ring,

    int amdgpu_ring_test_helper(struct amdgpu_ring *ring);
  -int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
   struct amdgpu_ring *ring);
-void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
-
  #endif



Re: [PATCH 2/2] drm/amdgpu: cleanup debugfs for amdgpu rings

2021-09-03 Thread Das, Nirmoy

Hi Shashank,

On 9/3/2021 5:51 PM, Das, Nirmoy wrote:


On 9/3/2021 5:26 PM, Sharma, Shashank wrote:



On 9/3/2021 1:39 PM, Das, Nirmoy wrote:


On 9/3/2021 8:36 AM, Sharma, Shashank wrote:



On 9/2/2021 5:14 PM, Nirmoy Das wrote:

Use debugfs_create_file_size API for creating ring debugfs
file, also cleanup surrounding code.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 16 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  8 +---
  3 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 077f9baf74fe..dee56ab19a8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1734,9 +1734,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

  if (!ring)
  continue;
  -    if (amdgpu_debugfs_ring_init(adev, ring)) {
-    DRM_ERROR("Failed to register debugfs file for rings 
!\n");

-    }
+    amdgpu_debugfs_ring_init(adev, ring);
  }
    amdgpu_ras_debugfs_create_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

index f40753e1a60d..968521d80514 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -415,26 +415,20 @@ static const struct file_operations 
amdgpu_debugfs_ring_fops = {

    #endif
  -int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
   struct amdgpu_ring *ring)
  {
  #if defined(CONFIG_DEBUG_FS)
  struct drm_minor *minor = adev_to_drm(adev)->primary;
-    struct dentry *ent, *root = minor->debugfs_root;
+    struct dentry *root = minor->debugfs_root;
  char name[32];
    sprintf(name, "amdgpu_ring_%s", ring->name);
  -    ent = debugfs_create_file(name,
-  S_IFREG | S_IRUGO, root,
-  ring, _debugfs_ring_fops);
-    if (IS_ERR(ent))
-    return -ENOMEM;


Why are we doing this ? Why to make it void from int ?



We tend to ignore debugfs return values as those are not serious 
errors. This is to sync with the rest of our


debugfs calls.


Regards,

Nirmoy




I am not sure if completely removing the provision of return value 
is a good way of doing it, we can always ignore it at the caller 
side, isn't it ?




I just realized while making the change debugfs_create_file_size() is 
void return, so we don't have anything useful to return in 
amdgpu_debugfs_ring_init()



Regards,

Nirmoy





Yes, we are currently throwing an error msg and ignoring it. I don't 
have a strong opinion regarding this, I will send a v2 restoring  
previous behavior.



Thanks,

Nirmoy




- Shashank



- Shashank



-
-    i_size_write(ent->d_inode, ring->ring_size + 12);
-    ring->ent = ent;
+    debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, ring,
+ _debugfs_ring_fops,
+ ring->ring_size + 12);
  #endif
-    return 0;
  }
    /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index 88d80eb3fea1..c29fbce0a5b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,10 +253,6 @@ struct amdgpu_ring {
  bool    has_compute_vm_bug;
  bool    no_scheduler;
  int    hw_prio;
-
-#if defined(CONFIG_DEBUG_FS)
-    struct dentry *ent;
-#endif
  };
    #define amdgpu_ring_parse_cs(r, p, ib) 
((r)->funcs->parse_cs((p), (ib)))
@@ -356,8 +352,6 @@ static inline void 
amdgpu_ring_write_multiple(struct amdgpu_ring *ring,

    int amdgpu_ring_test_helper(struct amdgpu_ring *ring);
  -int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
   struct amdgpu_ring *ring);
-void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
-
  #endif



Re: [PATCH 2/2] drm/amdgpu: cleanup debugfs for amdgpu rings

2021-09-03 Thread Das, Nirmoy



On 9/3/2021 5:26 PM, Sharma, Shashank wrote:



On 9/3/2021 1:39 PM, Das, Nirmoy wrote:


On 9/3/2021 8:36 AM, Sharma, Shashank wrote:



On 9/2/2021 5:14 PM, Nirmoy Das wrote:

Use debugfs_create_file_size API for creating ring debugfs
file, also cleanup surrounding code.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 16 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  8 +---
  3 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 077f9baf74fe..dee56ab19a8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1734,9 +1734,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

  if (!ring)
  continue;
  -    if (amdgpu_debugfs_ring_init(adev, ring)) {
-    DRM_ERROR("Failed to register debugfs file for rings 
!\n");

-    }
+    amdgpu_debugfs_ring_init(adev, ring);
  }
    amdgpu_ras_debugfs_create_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

index f40753e1a60d..968521d80514 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -415,26 +415,20 @@ static const struct file_operations 
amdgpu_debugfs_ring_fops = {

    #endif
  -int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
   struct amdgpu_ring *ring)
  {
  #if defined(CONFIG_DEBUG_FS)
  struct drm_minor *minor = adev_to_drm(adev)->primary;
-    struct dentry *ent, *root = minor->debugfs_root;
+    struct dentry *root = minor->debugfs_root;
  char name[32];
    sprintf(name, "amdgpu_ring_%s", ring->name);
  -    ent = debugfs_create_file(name,
-  S_IFREG | S_IRUGO, root,
-  ring, _debugfs_ring_fops);
-    if (IS_ERR(ent))
-    return -ENOMEM;


Why are we doing this ? Why to make it void from int ?



We tend to ignore debugfs return values as those are not serious 
errors. This is to sync with the rest of our


debugfs calls.


Regards,

Nirmoy




I am not sure if completely removing the provision of return value is 
a good way of doing it, we can always ignore it at the caller side, 
isn't it ?



Yes, we are currently throwing an error msg and ignoring it. I don't 
have a strong opinion regarding this, I will send a v2 restoring  
previous behavior.



Thanks,

Nirmoy




- Shashank



- Shashank



-
-    i_size_write(ent->d_inode, ring->ring_size + 12);
-    ring->ent = ent;
+    debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, ring,
+ _debugfs_ring_fops,
+ ring->ring_size + 12);
  #endif
-    return 0;
  }
    /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index 88d80eb3fea1..c29fbce0a5b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,10 +253,6 @@ struct amdgpu_ring {
  bool    has_compute_vm_bug;
  bool    no_scheduler;
  int    hw_prio;
-
-#if defined(CONFIG_DEBUG_FS)
-    struct dentry *ent;
-#endif
  };
    #define amdgpu_ring_parse_cs(r, p, ib) 
((r)->funcs->parse_cs((p), (ib)))
@@ -356,8 +352,6 @@ static inline void 
amdgpu_ring_write_multiple(struct amdgpu_ring *ring,

    int amdgpu_ring_test_helper(struct amdgpu_ring *ring);
  -int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
   struct amdgpu_ring *ring);
-void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
-
  #endif



Re: [PATCH 1/2] drm/amdgpu: use IS_ERR for debugfs APIs

2021-09-03 Thread Das, Nirmoy

Hi Christian and Shashank,


Please review the v2 : https://patchwork.freedesktop.org/patch/452175/

In v2, I am returning "PTR_ERR(ent)" instead of -EIO which I think makes more 
sense.

Regards,
Nirmoy

On 9/3/2021 9:53 AM, Christian König wrote:

Am 02.09.21 um 13:44 schrieb Nirmoy Das:

debugfs APIs returns encoded error so use
IS_ERR for checking return value.

References: https://gitlab.freedesktop.org/drm/amd/-/issues/1686
Signed-off-by: Nirmoy Das 


Reviewed-by: Christian König 


---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 6 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 2 +-
  2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index d256215ab2c7..077f9baf74fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1696,18 +1696,16 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

  struct dentry *ent;
  int r, i;
  -
-
  ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
    _ib_preempt);
-    if (!ent) {
+    if (IS_ERR(ent)) {
  DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs 
file\n");

  return -EIO;
  }
    ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, adev,
    _sclk_set);
-    if (!ent) {
+    if (IS_ERR(ent)) {
  DRM_ERROR("unable to create amdgpu_set_sclk debugsfs file\n");
  return -EIO;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

index 7b634a1517f9..f40753e1a60d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -428,7 +428,7 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device 
*adev,

  ent = debugfs_create_file(name,
    S_IFREG | S_IRUGO, root,
    ring, _debugfs_ring_fops);
-    if (!ent)
+    if (IS_ERR(ent))
  return -ENOMEM;
    i_size_write(ent->d_inode, ring->ring_size + 12);




Re: [PATCH 2/2] drm/amdgpu: cleanup debugfs for amdgpu rings

2021-09-03 Thread Das, Nirmoy



On 9/3/2021 8:36 AM, Sharma, Shashank wrote:



On 9/2/2021 5:14 PM, Nirmoy Das wrote:

Use debugfs_create_file_size API for creating ring debugfs
file, also cleanup surrounding code.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c    | 16 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h    |  8 +---
  3 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c

index 077f9baf74fe..dee56ab19a8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1734,9 +1734,7 @@ int amdgpu_debugfs_init(struct amdgpu_device 
*adev)

  if (!ring)
  continue;
  -    if (amdgpu_debugfs_ring_init(adev, ring)) {
-    DRM_ERROR("Failed to register debugfs file for rings !\n");
-    }
+    amdgpu_debugfs_ring_init(adev, ring);
  }
    amdgpu_ras_debugfs_create_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

index f40753e1a60d..968521d80514 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -415,26 +415,20 @@ static const struct file_operations 
amdgpu_debugfs_ring_fops = {

    #endif
  -int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
   struct amdgpu_ring *ring)
  {
  #if defined(CONFIG_DEBUG_FS)
  struct drm_minor *minor = adev_to_drm(adev)->primary;
-    struct dentry *ent, *root = minor->debugfs_root;
+    struct dentry *root = minor->debugfs_root;
  char name[32];
    sprintf(name, "amdgpu_ring_%s", ring->name);
  -    ent = debugfs_create_file(name,
-  S_IFREG | S_IRUGO, root,
-  ring, _debugfs_ring_fops);
-    if (IS_ERR(ent))
-    return -ENOMEM;


Why are we doing this ? Why to make it void from int ?



We tend to ignore debugfs return values as those are not serious errors. 
This is to sync with the rest of our


debugfs calls.


Regards,

Nirmoy



- Shashank



-
-    i_size_write(ent->d_inode, ring->ring_size + 12);
-    ring->ent = ent;
+    debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, ring,
+ _debugfs_ring_fops,
+ ring->ring_size + 12);
  #endif
-    return 0;
  }
    /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index 88d80eb3fea1..c29fbce0a5b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,10 +253,6 @@ struct amdgpu_ring {
  bool    has_compute_vm_bug;
  bool    no_scheduler;
  int    hw_prio;
-
-#if defined(CONFIG_DEBUG_FS)
-    struct dentry *ent;
-#endif
  };
    #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), 
(ib)))
@@ -356,8 +352,6 @@ static inline void 
amdgpu_ring_write_multiple(struct amdgpu_ring *ring,

    int amdgpu_ring_test_helper(struct amdgpu_ring *ring);
  -int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
   struct amdgpu_ring *ring);
-void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
-
  #endif



[PATCH v2 1/2] drm/amdgpu: use IS_ERR for debugfs APIs

2021-09-02 Thread Nirmoy Das
debugfs APIs returns encoded error so use
IS_ERR for checking return value.

v2: return PTR_ERR(ent)

References: https://gitlab.freedesktop.org/drm/amd/-/issues/1686
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 10 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c|  4 ++--
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index d256215ab2c7..60f46a4b0144 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1696,20 +1696,18 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
struct dentry *ent;
int r, i;

-
-
ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
  _ib_preempt);
-   if (!ent) {
+   if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs file\n");
-   return -EIO;
+   return PTR_ERR(ent);
}

ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, adev,
  _sclk_set);
-   if (!ent) {
+   if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_set_sclk debugsfs file\n");
-   return -EIO;
+   return PTR_ERR(ent);
}

/* Register debugfs entries for amdgpu_ttm */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 7b634a1517f9..0554576d3695 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -428,8 +428,8 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
ent = debugfs_create_file(name,
  S_IFREG | S_IRUGO, root,
  ring, _debugfs_ring_fops);
-   if (!ent)
-   return -ENOMEM;
+   if (IS_ERR(ent))
+   return PTR_ERR(ent);

i_size_write(ent->d_inode, ring->ring_size + 12);
ring->ent = ent;
--
2.32.0



[PATCH 2/2] drm/amdgpu: cleanup debugfs for amdgpu rings

2021-09-02 Thread Nirmoy Das
Use debugfs_create_file_size API for creating ring debugfs
file, also cleanup surrounding code.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c| 18 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h| 10 ++
 3 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 60f46a4b0144..97d88f3e1c4c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1734,9 +1734,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
if (!ring)
continue;
 
-   if (amdgpu_debugfs_ring_init(adev, ring)) {
-   DRM_ERROR("Failed to register debugfs file for rings 
!\n");
-   }
+   amdgpu_debugfs_ring_init(adev, ring);
}
 
amdgpu_ras_debugfs_create_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 0554576d3695..ab2351ba9574 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -415,26 +415,20 @@ static const struct file_operations 
amdgpu_debugfs_ring_fops = {
 
 #endif
 
-int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
-struct amdgpu_ring *ring)
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+ struct amdgpu_ring *ring)
 {
 #if defined(CONFIG_DEBUG_FS)
struct drm_minor *minor = adev_to_drm(adev)->primary;
-   struct dentry *ent, *root = minor->debugfs_root;
+   struct dentry *root = minor->debugfs_root;
char name[32];
 
sprintf(name, "amdgpu_ring_%s", ring->name);
+   debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, ring,
+_debugfs_ring_fops,
+ring->ring_size + 12);
 
-   ent = debugfs_create_file(name,
- S_IFREG | S_IRUGO, root,
- ring, _debugfs_ring_fops);
-   if (IS_ERR(ent))
-   return PTR_ERR(ent);
-
-   i_size_write(ent->d_inode, ring->ring_size + 12);
-   ring->ent = ent;
 #endif
-   return 0;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 88d80eb3fea1..4d380e79752c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,10 +253,6 @@ struct amdgpu_ring {
boolhas_compute_vm_bug;
boolno_scheduler;
int hw_prio;
-
-#if defined(CONFIG_DEBUG_FS)
-   struct dentry *ent;
-#endif
 };
 
 #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
@@ -356,8 +352,6 @@ static inline void amdgpu_ring_write_multiple(struct 
amdgpu_ring *ring,
 
 int amdgpu_ring_test_helper(struct amdgpu_ring *ring);
 
-int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
-struct amdgpu_ring *ring);
-void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
-
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+ struct amdgpu_ring *ring);
 #endif
-- 
2.32.0



[PATCH 2/2] drm/amdgpu: cleanup debugfs for amdgpu rings

2021-09-02 Thread Nirmoy Das
Use debugfs_create_file_size API for creating ring debugfs
file, also cleanup surrounding code.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c |  4 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c| 16 +---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h|  8 +---
 3 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 077f9baf74fe..dee56ab19a8f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1734,9 +1734,7 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
if (!ring)
continue;
 
-   if (amdgpu_debugfs_ring_init(adev, ring)) {
-   DRM_ERROR("Failed to register debugfs file for rings 
!\n");
-   }
+   amdgpu_debugfs_ring_init(adev, ring);
}
 
amdgpu_ras_debugfs_create_all(adev);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index f40753e1a60d..968521d80514 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -415,26 +415,20 @@ static const struct file_operations 
amdgpu_debugfs_ring_fops = {
 
 #endif
 
-int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
 struct amdgpu_ring *ring)
 {
 #if defined(CONFIG_DEBUG_FS)
struct drm_minor *minor = adev_to_drm(adev)->primary;
-   struct dentry *ent, *root = minor->debugfs_root;
+   struct dentry *root = minor->debugfs_root;
char name[32];
 
sprintf(name, "amdgpu_ring_%s", ring->name);
 
-   ent = debugfs_create_file(name,
- S_IFREG | S_IRUGO, root,
- ring, _debugfs_ring_fops);
-   if (IS_ERR(ent))
-   return -ENOMEM;
-
-   i_size_write(ent->d_inode, ring->ring_size + 12);
-   ring->ent = ent;
+   debugfs_create_file_size(name, S_IFREG | S_IRUGO, root, ring,
+_debugfs_ring_fops,
+ring->ring_size + 12);
 #endif
-   return 0;
 }
 
 /**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 88d80eb3fea1..c29fbce0a5b4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -253,10 +253,6 @@ struct amdgpu_ring {
boolhas_compute_vm_bug;
boolno_scheduler;
int hw_prio;
-
-#if defined(CONFIG_DEBUG_FS)
-   struct dentry *ent;
-#endif
 };
 
 #define amdgpu_ring_parse_cs(r, p, ib) ((r)->funcs->parse_cs((p), (ib)))
@@ -356,8 +352,6 @@ static inline void amdgpu_ring_write_multiple(struct 
amdgpu_ring *ring,
 
 int amdgpu_ring_test_helper(struct amdgpu_ring *ring);
 
-int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
+void amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
 struct amdgpu_ring *ring);
-void amdgpu_debugfs_ring_fini(struct amdgpu_ring *ring);
-
 #endif
-- 
2.32.0



[PATCH 1/2] drm/amdgpu: use IS_ERR for debugfs APIs

2021-09-02 Thread Nirmoy Das
debugfs APIs returns encoded error so use
IS_ERR for checking return value.

References: https://gitlab.freedesktop.org/drm/amd/-/issues/1686
Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 6 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c| 2 +-
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index d256215ab2c7..077f9baf74fe 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -1696,18 +1696,16 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev)
struct dentry *ent;
int r, i;
 
-
-
ent = debugfs_create_file("amdgpu_preempt_ib", 0600, root, adev,
  _ib_preempt);
-   if (!ent) {
+   if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_preempt_ib debugsfs file\n");
return -EIO;
}
 
ent = debugfs_create_file("amdgpu_force_sclk", 0200, root, adev,
  _sclk_set);
-   if (!ent) {
+   if (IS_ERR(ent)) {
DRM_ERROR("unable to create amdgpu_set_sclk debugsfs file\n");
return -EIO;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 7b634a1517f9..f40753e1a60d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -428,7 +428,7 @@ int amdgpu_debugfs_ring_init(struct amdgpu_device *adev,
ent = debugfs_create_file(name,
  S_IFREG | S_IRUGO, root,
  ring, _debugfs_ring_fops);
-   if (!ent)
+   if (IS_ERR(ent))
return -ENOMEM;
 
i_size_write(ent->d_inode, ring->ring_size + 12);
-- 
2.32.0



[PATCH v3 1/1] drm/amdgpu: detach ring priority from gfx priority

2021-08-26 Thread Nirmoy Das
Currently AMDGPU_RING_PRIO_MAX is redefinition of a
max gfx hwip priority, this won't work well when we will
have a hwip with different set of priorities than gfx.
Also, HW ring priorities are different from ring priorities.

Create a global enum for ring priority levels which each
HWIP can use to define its own priority levels.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c  | 2 +-
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  | 7 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 9 +++--
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
index c88c5c6c54a2..0d1928260650 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -109,7 +109,7 @@ static int amdgpu_ctx_priority_permit(struct drm_file *filp,
return -EACCES;
 }

-static enum gfx_pipe_priority amdgpu_ctx_prio_to_compute_prio(int32_t prio)
+static enum amdgpu_gfx_pipe_priority amdgpu_ctx_prio_to_compute_prio(int32_t 
prio)
 {
switch (prio) {
case AMDGPU_CTX_PRIORITY_HIGH:
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index d43fe2ed8116..f851196c83a5 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -42,10 +42,9 @@
 #define AMDGPU_MAX_GFX_QUEUES KGD_MAX_QUEUES
 #define AMDGPU_MAX_COMPUTE_QUEUES KGD_MAX_QUEUES

-enum gfx_pipe_priority {
-   AMDGPU_GFX_PIPE_PRIO_NORMAL = 1,
-   AMDGPU_GFX_PIPE_PRIO_HIGH,
-   AMDGPU_GFX_PIPE_PRIO_MAX
+enum amdgpu_gfx_pipe_priority {
+   AMDGPU_GFX_PIPE_PRIO_NORMAL = AMDGPU_RING_PRIO_1,
+   AMDGPU_GFX_PIPE_PRIO_HIGH = AMDGPU_RING_PRIO_2
 };

 /* Argument for PPSMC_MSG_GpuChangeState */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index e713d31619fe..88d80eb3fea1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -36,8 +36,13 @@
 #define AMDGPU_MAX_VCE_RINGS   3
 #define AMDGPU_MAX_UVD_ENC_RINGS   2

-#define AMDGPU_RING_PRIO_DEFAULT   1
-#define AMDGPU_RING_PRIO_MAX   AMDGPU_GFX_PIPE_PRIO_MAX
+enum amdgpu_ring_priority_level {
+   AMDGPU_RING_PRIO_0,
+   AMDGPU_RING_PRIO_1,
+   AMDGPU_RING_PRIO_DEFAULT = 1,
+   AMDGPU_RING_PRIO_2,
+   AMDGPU_RING_PRIO_MAX
+};

 /* some special values for the owner field */
 #define AMDGPU_FENCE_OWNER_UNDEFINED   ((void *)0ul)
--
2.32.0



Re: [PATCH v2 1/1] drm/amdgpu: detach ring priority from gfx priority

2021-08-26 Thread Das, Nirmoy



On 8/26/2021 12:48 PM, Christian König wrote:



Am 26.08.21 um 12:08 schrieb Nirmoy Das:

Currently AMDGPU_RING_PRIO_MAX is redefinition of a
max gfx hwip priority, this won't work well when we will
have a hwip with different set of priorities than gfx.
Also, HW ring priorities are different from ring priorities.

Create a global enum for ring priority levels which each
HWIP can use to define its own priority levels.

Signed-off-by: Nirmoy Das 
Reviewed-by: Lijo Lazar 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  | 5 ++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 9 +++--
  2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h

index d43fe2ed8116..7f747a4291f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -43,9 +43,8 @@
  #define AMDGPU_MAX_COMPUTE_QUEUES KGD_MAX_QUEUES

  enum gfx_pipe_priority {


While at it can you add an amdgpu_ prefix before the enum name?

And if the enum isn't really used maybe even replace the enum with 
defines?



Yes makes sense, I will resend with defines.



Thanks,
Christian.


-    AMDGPU_GFX_PIPE_PRIO_NORMAL = 1,
-    AMDGPU_GFX_PIPE_PRIO_HIGH,
-    AMDGPU_GFX_PIPE_PRIO_MAX
+    AMDGPU_GFX_PIPE_PRIO_NORMAL = AMDGPU_RING_PRIO_1,
+    AMDGPU_GFX_PIPE_PRIO_HIGH = AMDGPU_RING_PRIO_2
  };

  /* Argument for PPSMC_MSG_GpuChangeState */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index e713d31619fe..88d80eb3fea1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -36,8 +36,13 @@
  #define AMDGPU_MAX_VCE_RINGS    3
  #define AMDGPU_MAX_UVD_ENC_RINGS    2

-#define AMDGPU_RING_PRIO_DEFAULT    1
-#define AMDGPU_RING_PRIO_MAX    AMDGPU_GFX_PIPE_PRIO_MAX
+enum amdgpu_ring_priority_level {
+    AMDGPU_RING_PRIO_0,
+    AMDGPU_RING_PRIO_1,
+    AMDGPU_RING_PRIO_DEFAULT = 1,
+    AMDGPU_RING_PRIO_2,
+    AMDGPU_RING_PRIO_MAX
+};

  /* some special values for the owner field */
  #define AMDGPU_FENCE_OWNER_UNDEFINED    ((void *)0ul)
--
2.32.0





[PATCH v2 1/1] drm/amdgpu: detach ring priority from gfx priority

2021-08-26 Thread Nirmoy Das
Currently AMDGPU_RING_PRIO_MAX is redefinition of a
max gfx hwip priority, this won't work well when we will
have a hwip with different set of priorities than gfx.
Also, HW ring priorities are different from ring priorities.

Create a global enum for ring priority levels which each
HWIP can use to define its own priority levels.

Signed-off-by: Nirmoy Das 
Reviewed-by: Lijo Lazar 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  | 5 ++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 9 +++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index d43fe2ed8116..7f747a4291f3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -43,9 +43,8 @@
 #define AMDGPU_MAX_COMPUTE_QUEUES KGD_MAX_QUEUES

 enum gfx_pipe_priority {
-   AMDGPU_GFX_PIPE_PRIO_NORMAL = 1,
-   AMDGPU_GFX_PIPE_PRIO_HIGH,
-   AMDGPU_GFX_PIPE_PRIO_MAX
+   AMDGPU_GFX_PIPE_PRIO_NORMAL = AMDGPU_RING_PRIO_1,
+   AMDGPU_GFX_PIPE_PRIO_HIGH = AMDGPU_RING_PRIO_2
 };

 /* Argument for PPSMC_MSG_GpuChangeState */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index e713d31619fe..88d80eb3fea1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -36,8 +36,13 @@
 #define AMDGPU_MAX_VCE_RINGS   3
 #define AMDGPU_MAX_UVD_ENC_RINGS   2

-#define AMDGPU_RING_PRIO_DEFAULT   1
-#define AMDGPU_RING_PRIO_MAX   AMDGPU_GFX_PIPE_PRIO_MAX
+enum amdgpu_ring_priority_level {
+   AMDGPU_RING_PRIO_0,
+   AMDGPU_RING_PRIO_1,
+   AMDGPU_RING_PRIO_DEFAULT = 1,
+   AMDGPU_RING_PRIO_2,
+   AMDGPU_RING_PRIO_MAX
+};

 /* some special values for the owner field */
 #define AMDGPU_FENCE_OWNER_UNDEFINED   ((void *)0ul)
--
2.32.0



Re: [PATCH 1/1] drm/amdgpu: detach ring priority from gfx priority

2021-08-26 Thread Das, Nirmoy



On 8/26/2021 11:54 AM, Christian König wrote:

Am 26.08.21 um 11:27 schrieb Lazar, Lijo:

On 8/25/2021 9:12 PM, Nirmoy Das wrote:

Currently AMDGPU_RING_PRIO_MAX is redefinition of a
max gfx hwip priority, this won't work well when we will
have a hwip with different set of priorities than gfx.
Also, HW ring priorities are different from ring priorities.

Create a global enum for ring priority levels which each
HWIP can use to define its own priority levels.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  6 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 10 --
  2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h

index d43fe2ed8116..937320293029 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -43,9 +43,9 @@
  #define AMDGPU_MAX_COMPUTE_QUEUES KGD_MAX_QUEUES
    enum gfx_pipe_priority {
-    AMDGPU_GFX_PIPE_PRIO_NORMAL = 1,
-    AMDGPU_GFX_PIPE_PRIO_HIGH,
-    AMDGPU_GFX_PIPE_PRIO_MAX
+    AMDGPU_GFX_PIPE_PRIO_NORMAL = AMDGPU_RING_PRIO_1,
+    AMDGPU_GFX_PIPE_PRIO_HIGH = AMDGPU_RING_PRIO_2,
+    AMDGPU_GFX_PIPE_PRIO_MAX = AMDGPU_RING_PRIO_3


Is this a valid priority level? If not, better avoid it.

Reviewed-by: Lijo Lazar 


Is the _MAX define even used here any more? As far as I can see you 
removed the only use case for that below.



Yes, not used anymore. Sending a v2.



If it's unused just drop it completely.

Christian.




  };
    /* Argument for PPSMC_MSG_GpuChangeState */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index e713d31619fe..85541005c1ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -36,8 +36,14 @@
  #define AMDGPU_MAX_VCE_RINGS    3
  #define AMDGPU_MAX_UVD_ENC_RINGS    2
  -#define AMDGPU_RING_PRIO_DEFAULT    1
-#define AMDGPU_RING_PRIO_MAX    AMDGPU_GFX_PIPE_PRIO_MAX
+enum amdgpu_ring_priority_level {
+    AMDGPU_RING_PRIO_0,
+    AMDGPU_RING_PRIO_1,
+    AMDGPU_RING_PRIO_DEFAULT = 1,
+    AMDGPU_RING_PRIO_2,
+    AMDGPU_RING_PRIO_3,
+    AMDGPU_RING_PRIO_MAX
+};
    /* some special values for the owner field */
  #define AMDGPU_FENCE_OWNER_UNDEFINED    ((void *)0ul)





Re: [PATCH 1/1] drm/amdgpu: detach ring priority from gfx priority

2021-08-26 Thread Das, Nirmoy



On 8/26/2021 11:27 AM, Lazar, Lijo wrote:



On 8/25/2021 9:12 PM, Nirmoy Das wrote:

Currently AMDGPU_RING_PRIO_MAX is redefinition of a
max gfx hwip priority, this won't work well when we will
have a hwip with different set of priorities than gfx.
Also, HW ring priorities are different from ring priorities.

Create a global enum for ring priority levels which each
HWIP can use to define its own priority levels.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  6 +++---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 10 --
  2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h

index d43fe2ed8116..937320293029 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -43,9 +43,9 @@
  #define AMDGPU_MAX_COMPUTE_QUEUES KGD_MAX_QUEUES
    enum gfx_pipe_priority {
-    AMDGPU_GFX_PIPE_PRIO_NORMAL = 1,
-    AMDGPU_GFX_PIPE_PRIO_HIGH,
-    AMDGPU_GFX_PIPE_PRIO_MAX
+    AMDGPU_GFX_PIPE_PRIO_NORMAL = AMDGPU_RING_PRIO_1,
+    AMDGPU_GFX_PIPE_PRIO_HIGH = AMDGPU_RING_PRIO_2,
+    AMDGPU_GFX_PIPE_PRIO_MAX = AMDGPU_RING_PRIO_3


Is this a valid priority level? If not, better avoid it.



Yes, it is not. I will resend. Thanks!


Reviewed-by: Lijo Lazar 


  };
    /* Argument for PPSMC_MSG_GpuChangeState */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h

index e713d31619fe..85541005c1ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -36,8 +36,14 @@
  #define AMDGPU_MAX_VCE_RINGS    3
  #define AMDGPU_MAX_UVD_ENC_RINGS    2
  -#define AMDGPU_RING_PRIO_DEFAULT    1
-#define AMDGPU_RING_PRIO_MAX    AMDGPU_GFX_PIPE_PRIO_MAX
+enum amdgpu_ring_priority_level {
+    AMDGPU_RING_PRIO_0,
+    AMDGPU_RING_PRIO_1,
+    AMDGPU_RING_PRIO_DEFAULT = 1,
+    AMDGPU_RING_PRIO_2,
+    AMDGPU_RING_PRIO_3,
+    AMDGPU_RING_PRIO_MAX
+};
    /* some special values for the owner field */
  #define AMDGPU_FENCE_OWNER_UNDEFINED    ((void *)0ul)



[PATCH 1/1] drm/amdgpu: detach ring priority from gfx priority

2021-08-25 Thread Nirmoy Das
Currently AMDGPU_RING_PRIO_MAX is redefinition of a
max gfx hwip priority, this won't work well when we will
have a hwip with different set of priorities than gfx.
Also, HW ring priorities are different from ring priorities.

Create a global enum for ring priority levels which each
HWIP can use to define its own priority levels.

Signed-off-by: Nirmoy Das 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h  |  6 +++---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 10 --
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index d43fe2ed8116..937320293029 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -43,9 +43,9 @@
 #define AMDGPU_MAX_COMPUTE_QUEUES KGD_MAX_QUEUES
 
 enum gfx_pipe_priority {
-   AMDGPU_GFX_PIPE_PRIO_NORMAL = 1,
-   AMDGPU_GFX_PIPE_PRIO_HIGH,
-   AMDGPU_GFX_PIPE_PRIO_MAX
+   AMDGPU_GFX_PIPE_PRIO_NORMAL = AMDGPU_RING_PRIO_1,
+   AMDGPU_GFX_PIPE_PRIO_HIGH = AMDGPU_RING_PRIO_2,
+   AMDGPU_GFX_PIPE_PRIO_MAX = AMDGPU_RING_PRIO_3
 };
 
 /* Argument for PPSMC_MSG_GpuChangeState */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index e713d31619fe..85541005c1ad 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -36,8 +36,14 @@
 #define AMDGPU_MAX_VCE_RINGS   3
 #define AMDGPU_MAX_UVD_ENC_RINGS   2
 
-#define AMDGPU_RING_PRIO_DEFAULT   1
-#define AMDGPU_RING_PRIO_MAX   AMDGPU_GFX_PIPE_PRIO_MAX
+enum amdgpu_ring_priority_level {
+   AMDGPU_RING_PRIO_0,
+   AMDGPU_RING_PRIO_1,
+   AMDGPU_RING_PRIO_DEFAULT = 1,
+   AMDGPU_RING_PRIO_2,
+   AMDGPU_RING_PRIO_3,
+   AMDGPU_RING_PRIO_MAX
+};
 
 /* some special values for the owner field */
 #define AMDGPU_FENCE_OWNER_UNDEFINED   ((void *)0ul)
-- 
2.32.0



Re: [PATCH 1/1] drm/amdgpu: rework context priority handling

2021-08-25 Thread Das, Nirmoy



On 8/25/2021 2:29 PM, Christian König wrote:

Am 25.08.21 um 14:20 schrieb Lazar, Lijo:

On 8/25/2021 4:52 PM, Nirmoy Das wrote:

To get a hardware queue priority for a context, we are currently
mapping AMDGPU_CTX_PRIORITY_* to DRM_SCHED_PRIORITY_* and then
to hardware queue priority, which is not the right way to do that
as DRM_SCHED_PRIORITY_* is software scheduler's priority and it is
independent from a hardware queue priority.

Use userspace provided context priority, AMDGPU_CTX_PRIORITY_* to
map a context to proper hardware queue priority.

Signed-off-by: Nirmoy Das 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c   | 127 
--

  drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.h   |   8 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_sched.c |  44 ++--
  3 files changed, 105 insertions(+), 74 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c

index e7a010b7ca1f..c88c5c6c54a2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ctx.c
@@ -43,14 +43,61 @@ const unsigned int 
amdgpu_ctx_num_entities[AMDGPU_HW_IP_NUM] = {

  [AMDGPU_HW_IP_VCN_JPEG]    =    1,
  };
  +bool amdgpu_ctx_priority_is_valid(int32_t ctx_prio)
+{
+    switch (ctx_prio) {
+    case AMDGPU_CTX_PRIORITY_UNSET:
+    case AMDGPU_CTX_PRIORITY_VERY_LOW:
+    case AMDGPU_CTX_PRIORITY_LOW:
+    case AMDGPU_CTX_PRIORITY_NORMAL:
+    case AMDGPU_CTX_PRIORITY_HIGH:
+    case AMDGPU_CTX_PRIORITY_VERY_HIGH:
+    return true;
+    default:
+    return false;
+    }
+}
+
+static enum drm_sched_priority
+amdgpu_ctx_to_drm_sched_prio(int32_t ctx_prio)
+{
+    switch (ctx_prio) {
+    case AMDGPU_CTX_PRIORITY_UNSET:
+    return DRM_SCHED_PRIORITY_UNSET;
+
+    case AMDGPU_CTX_PRIORITY_VERY_LOW:
+    return DRM_SCHED_PRIORITY_MIN;
+
+    case AMDGPU_CTX_PRIORITY_LOW:
+    return DRM_SCHED_PRIORITY_MIN;
+
+    case AMDGPU_CTX_PRIORITY_NORMAL:
+    return DRM_SCHED_PRIORITY_NORMAL;
+
+    case AMDGPU_CTX_PRIORITY_HIGH:
+    return DRM_SCHED_PRIORITY_HIGH;
+
+    case AMDGPU_CTX_PRIORITY_VERY_HIGH:
+    return DRM_SCHED_PRIORITY_HIGH;
+
+    /* This should not happen as we sanitized userspace provided 
priority

+ * already, WARN if this happens.
+ */
+    default:
+    WARN(1, "Invalid context priority %d\n", ctx_prio);
+    return DRM_SCHED_PRIORITY_NORMAL;
+    }
+
+}
+
  static int amdgpu_ctx_priority_permit(struct drm_file *filp,
-  enum drm_sched_priority priority)
+  int32_t priority)
  {
-    if (priority < 0 || priority >= DRM_SCHED_PRIORITY_COUNT)
+    if (!amdgpu_ctx_priority_is_valid(priority))
  return -EINVAL;
    /* NORMAL and below are accessible by everyone */
-    if (priority <= DRM_SCHED_PRIORITY_NORMAL)
+    if (priority <= AMDGPU_CTX_PRIORITY_NORMAL)
  return 0;
    if (capable(CAP_SYS_NICE))
@@ -62,26 +109,35 @@ static int amdgpu_ctx_priority_permit(struct 
drm_file *filp,

  return -EACCES;
  }
  -static enum gfx_pipe_priority 
amdgpu_ctx_sched_prio_to_compute_prio(enum drm_sched_priority prio)
+static enum gfx_pipe_priority 
amdgpu_ctx_prio_to_compute_prio(int32_t prio)

  {
  switch (prio) {
-    case DRM_SCHED_PRIORITY_HIGH:
-    case DRM_SCHED_PRIORITY_KERNEL:
+    case AMDGPU_CTX_PRIORITY_HIGH:
+    case AMDGPU_CTX_PRIORITY_VERY_HIGH:
  return AMDGPU_GFX_PIPE_PRIO_HIGH;
  default:
  return AMDGPU_GFX_PIPE_PRIO_NORMAL;
  }
  }
  -static unsigned int amdgpu_ctx_prio_sched_to_hw(struct 
amdgpu_device *adev,

- enum drm_sched_priority prio,
- u32 hw_ip)
+static unsigned int amdgpu_ctx_get_hw_prio(struct amdgpu_ctx *ctx, 
u32 hw_ip)

  {
+    struct amdgpu_device *adev = ctx->adev;
+    int32_t ctx_prio;
  unsigned int hw_prio;
  -    hw_prio = (hw_ip == AMDGPU_HW_IP_COMPUTE) ?
-    amdgpu_ctx_sched_prio_to_compute_prio(prio) :
-    AMDGPU_RING_PRIO_DEFAULT;
+    ctx_prio = (ctx->override_priority == AMDGPU_CTX_PRIORITY_UNSET) ?
+    ctx->init_priority : ctx->override_priority;
+
+    switch (hw_ip) {
+    case AMDGPU_HW_IP_COMPUTE:
+    hw_prio = amdgpu_ctx_prio_to_compute_prio(ctx_prio);
+    break;
+    default:
+    hw_prio = AMDGPU_RING_PRIO_DEFAULT;
+    break;
+    }
+
  hw_ip = array_index_nospec(hw_ip, AMDGPU_HW_IP_NUM);
  if (adev->gpu_sched[hw_ip][hw_prio].num_scheds == 0)
  hw_prio = AMDGPU_RING_PRIO_DEFAULT;
@@ -89,15 +145,17 @@ static unsigned int 
amdgpu_ctx_prio_sched_to_hw(struct amdgpu_device *adev,

  return hw_prio;
  }
  +
  static int amdgpu_ctx_init_entity(struct amdgpu_ctx *ctx, u32 hw_ip,
-   const u32 ring)
+  const u32 ring)
  {
  struct amdgpu_device *adev = ctx->adev;
  struct amdgpu_ctx_entity *entity;
  struct drm_gpu_scheduler **scheds = NULL, *sch

  1   2   3   4   5   6   7   8   9   >