[linux-next:master] BUILD REGRESSION 48e8992e33abf054bcc0bb2e77b2d43bb899212e

2023-12-13 Thread kernel test robot
tree/branch: 
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
branch HEAD: 48e8992e33abf054bcc0bb2e77b2d43bb899212e  Add linux-next specific 
files for 20231213

Error/Warning reports:

https://lore.kernel.org/oe-kbuild-all/202312131758.ned6tvqk-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202312131823.ho2np34f-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202312131907.mnytunno-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202312140633.nmhowpih-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202312140939.za3tdsi2-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202312141227.c2h1izfi-...@intel.com

Error/Warning: (recently discovered and may have been fixed)

drivers/hid/hid-nintendo.c:371:13: error: initializer element is not constant
fs/afs/callback.c:106:(.text+0x308): undefined reference to 
`__generic_xchg_called_with_bad_pointer'
fs/afs/rotate.c:69:(.text+0x26c): undefined reference to 
`__generic_xchg_called_with_bad_pointer'
fs/bcachefs/btree_iter.c:1442:28: warning: variable 'path' set but not used 
[-Wunused-but-set-variable]
include/asm-generic/cmpxchg.h:76:(.text+0x1f6): undefined reference to 
`__generic_xchg_called_with_bad_pointer'
include/linux/rcupdate.h:301:(.text+0x5cb2): relocation truncated to fit: 
R_CKCORE_PCREL_IMM16BY4 against `__jump_table'
kernel/rcu/tree.c:2965:(.text+0x5b44): relocation truncated to fit: 
R_CKCORE_PCREL_IMM16BY4 against `__jump_table'
ld.lld: error: undefined symbol: __bad_xchg

Unverified Error/Warning (likely false positive, please contact us if 
interested):

callback.c:(.text+0x214): relocation truncated to fit: R_NIOS2_CALL26 against 
`__generic_xchg_called_with_bad_pointer'
drivers/gpu/drm/amd/amdgpu/../display/dc/hwss/dcn35/dcn35_hwseq.c:1127: 
warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer 
Documentation/doc-guide/kernel-doc.rst
drivers/scsi/mpi3mr/mpi3mr_app.c:1091 mpi3mr_map_data_buffer_dma() warn: 
returning -1 instead of -ENOMEM is sloppy
drivers/scsi/mpi3mr/mpi3mr_app.c:712 mpi3mr_bsg_build_sgl() warn: missing 
unwind goto?
fs/bcachefs/inode.c:1171 bch2_delete_dead_inodes() error: potentially using 
uninitialized 'ret'.
rotate.c:(.text+0x20c): relocation truncated to fit: R_NIOS2_CALL26 against 
`__generic_xchg_called_with_bad_pointer'

Error/Warning ids grouped by kconfigs:

gcc_recent_errors
|-- alpha-allyesconfig
|   |-- 
fs-bcachefs-chardev.c:warning:function-run_thread_with_file-might-be-a-candidate-for-gnu_printf-format-attribute
|   `-- 
fs-bcachefs-super.c:warning:function-__bch2_print-might-be-a-candidate-for-gnu_printf-format-attribute
|-- arc-allmodconfig
|   |-- 
fs-bcachefs-chardev.c:warning:function-run_thread_with_file-might-be-a-candidate-for-gnu_printf-format-attribute
|   `-- 
fs-bcachefs-super.c:warning:function-__bch2_print-might-be-a-candidate-for-gnu_printf-format-attribute
|-- arc-allyesconfig
|   |-- 
fs-bcachefs-chardev.c:warning:function-run_thread_with_file-might-be-a-candidate-for-gnu_printf-format-attribute
|   `-- 
fs-bcachefs-super.c:warning:function-__bch2_print-might-be-a-candidate-for-gnu_printf-format-attribute
|-- arc-randconfig-r112-20231213
|   `-- 
lib-zstd-compress-zstd_fast.c:sparse:sparse:Using-plain-integer-as-NULL-pointer
|-- arc-randconfig-r131-20231212
|   `-- drivers-hwmon-max31827.c:sparse:sparse:dubious:x-y
|-- arm-allmodconfig
|   |-- 
fs-bcachefs-chardev.c:warning:function-run_thread_with_file-might-be-a-candidate-for-gnu_printf-format-attribute
|   `-- 
fs-bcachefs-super.c:warning:function-__bch2_print-might-be-a-candidate-for-gnu_printf-format-attribute
|-- arm-allyesconfig
|   |-- 
fs-bcachefs-chardev.c:warning:function-run_thread_with_file-might-be-a-candidate-for-gnu_printf-format-attribute
|   `-- 
fs-bcachefs-super.c:warning:function-__bch2_print-might-be-a-candidate-for-gnu_printf-format-attribute
|-- csky-allmodconfig
|   |-- 
fs-bcachefs-chardev.c:warning:function-run_thread_with_file-might-be-a-candidate-for-gnu_printf-format-attribute
|   `-- 
fs-bcachefs-super.c:warning:function-__bch2_print-might-be-a-candidate-for-gnu_printf-format-attribute
|-- csky-allyesconfig
|   |-- 
fs-bcachefs-chardev.c:warning:function-run_thread_with_file-might-be-a-candidate-for-gnu_printf-format-attribute
|   `-- 
fs-bcachefs-super.c:warning:function-__bch2_print-might-be-a-candidate-for-gnu_printf-format-attribute
|-- csky-buildonly-randconfig-r005-20221115
|   |-- 
fs-afs-callback.c:(.text):undefined-reference-to-__generic_xchg_called_with_bad_pointer
|   |-- 
fs-afs-rotate.c:(.text):undefined-reference-to-__generic_xchg_called_with_bad_pointer
|   `-- 
include-asm-generic-cmpxchg.h:(.text):undefined-reference-to-__generic_xchg_called_with_bad_pointer
|-- csky-randconfig-002-20231213
|   |-- 
fs-bcachefs-chardev.c:warning:function-run_thread_with_file-might-be-a-candidate-for-gnu_printf-format-attribute
|   `-- 
fs-bcachefs-super.c:warning:function-__bch2_print-might-be-a-candidate-for-gnu_printf-format-attribute

RE: [PATCH] drm/amd/pm: add power save mode workload for smu 13.0.10

2023-12-13 Thread Gao, Likun
[AMD Official Use Only - General]

This patch is
Reviewed-by: Likun Gao .

Regards,
Likun

-Original Message-
From: amd-gfx  On Behalf Of Kenneth Feng
Sent: Wednesday, December 13, 2023 3:02 PM
To: amd-gfx@lists.freedesktop.org
Cc: Feng, Kenneth 
Subject: [PATCH] drm/amd/pm: add power save mode workload for smu 13.0.10

add power save mode workload for smu 13.0.10, so that in compute mode, pmfw 
will add 35mv voltage margin since some applications require higher voltages.

Signed-off-by: Kenneth Feng 
---
 .../drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c  | 23 +++
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
index a24aa886c636..231122622a9c 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c
@@ -2545,16 +2545,19 @@ static int smu_v13_0_0_set_power_profile_mode(struct 
smu_context *smu,

workload_mask = 1 << workload_type;

-   /* Add optimizations for SMU13.0.0.  Reuse the power saving profile */
-   if (smu->power_profile_mode == PP_SMC_POWER_PROFILE_COMPUTE &&
-   (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == IP_VERSION(13, 0, 0)) 
&&
-   ((smu->adev->pm.fw_version == 0x004e6601) ||
-(smu->adev->pm.fw_version >= 0x004e7400))) {
-   workload_type = smu_cmn_to_asic_specific_index(smu,
-  
CMN2ASIC_MAPPING_WORKLOAD,
-  
PP_SMC_POWER_PROFILE_POWERSAVING);
-   if (workload_type >= 0)
-   workload_mask |= 1 << workload_type;
+   /* Add optimizations for SMU13.0.0/10.  Reuse the power saving profile 
*/
+   if (smu->power_profile_mode == PP_SMC_POWER_PROFILE_COMPUTE) {
+   if ((amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == 
IP_VERSION(13, 0, 0) &&
+   ((smu->adev->pm.fw_version == 0x004e6601) ||
+   (smu->adev->pm.fw_version >= 0x004e7300))) ||
+   (amdgpu_ip_version(smu->adev, MP1_HWIP, 0) == 
IP_VERSION(13, 0, 10) &&
+smu->adev->pm.fw_version >= 0x00504500)) {
+   workload_type = smu_cmn_to_asic_specific_index(smu,
+   
   CMN2ASIC_MAPPING_WORKLOAD,
+   
   PP_SMC_POWER_PROFILE_POWERSAVING);
+   if (workload_type >= 0)
+   workload_mask |= 1 << workload_type;
+   }
}

return smu_cmn_send_smc_msg_with_param(smu,
--
2.34.1



Re: [RFC PATCH 10/12] drm/amd/display: Use ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-13 Thread Timothy Pearson



- Original Message -
> From: "Samuel Holland" 
> To: "Michael Ellerman" 
> Cc: "linux-kernel" , "amd-gfx" 
> , "linux-arch"
> , "linux-arm-kernel" 
> , loonga...@lists.linux.dev,
> "linuxppc-dev" , "x86" , 
> linux-ri...@lists.infradead.org, "Christoph
> Hellwig" , "Timothy Pearson" 
> 
> Sent: Wednesday, December 13, 2023 7:03:20 PM
> Subject: Re: [RFC PATCH 10/12] drm/amd/display: Use 
> ARCH_HAS_KERNEL_FPU_SUPPORT

> On 2023-12-11 6:23 AM, Michael Ellerman wrote:
>> Hi Samuel,
>> 
>> Thanks for trying to clean all this up.
>> 
>> One problem below.
>> 
>> Samuel Holland  writes:
>>> Now that all previously-supported architectures select
>>> ARCH_HAS_KERNEL_FPU_SUPPORT, this code can depend on that symbol instead
>>> of the existing list of architectures. It can also take advantage of the
>>> common kernel-mode FPU API and method of adjusting CFLAGS.
>>>
>>> Signed-off-by: Samuel Holland 
>> ...
>>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
>>> b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
>>> index 4ae4720535a5..b64f917174ca 100644
>>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
>>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
>>> @@ -87,20 +78,9 @@ void dc_fpu_begin(const char *function_name, const int 
>>> line)
>>> WARN_ON_ONCE(!in_task());
>>> preempt_disable();
>>> depth = __this_cpu_inc_return(fpu_recursion_depth);
>>> -
>>> if (depth == 1) {
>>> -#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
>>> +   BUG_ON(!kernel_fpu_available());
>>> kernel_fpu_begin();
>>> -#elif defined(CONFIG_PPC64)
>>> -   if (cpu_has_feature(CPU_FTR_VSX_COMP))
>>> -   enable_kernel_vsx();
>>> -   else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP))
>>> -   enable_kernel_altivec();
>>  
>> Note altivec.
>> 
>>> -   else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE))
>>> -   enable_kernel_fp();
>>> -#elif defined(CONFIG_ARM64)
>>> -   kernel_neon_begin();
>>> -#endif
>>> }
>>>  
>>> TRACE_DCN_FPU(true, function_name, line, depth);
>>> diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile
>>> b/drivers/gpu/drm/amd/display/dc/dml/Makefile
>>> index ea7d60f9a9b4..5aad0f572ba3 100644
>>> --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile
>>> +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile
>>> @@ -25,40 +25,8 @@
>>>  # It provides the general basic services required by other DAL
>>>  # subcomponents.
>>>  
>>> -ifdef CONFIG_X86
>>> -dml_ccflags-$(CONFIG_CC_IS_GCC) := -mhard-float
>>> -dml_ccflags := $(dml_ccflags-y) -msse
>>> -endif
>>> -
>>> -ifdef CONFIG_PPC64
>>> -dml_ccflags := -mhard-float -maltivec
>>> -endif
>> 
>> And altivec is enabled in the flags there.
>> 
>> That doesn't match your implementation for powerpc in patch 7, which
>> only deals with float.
>> 
>> I suspect the AMD driver actually doesn't need altivec enabled, but I
>> don't know that for sure. It compiles without it, but I don't have a GPU
>> to actually test. I've added Timothy on Cc who added the support for
>> powerpc to the driver originally, hopefully he has a test system.

If you would like me to test I'm happy to do so, but I am travelling until 
Friday so would need to wait until then.

Thanks!


[PATCH] drm/amdkfd: only flush mes process context if mes support is there

2023-12-13 Thread Jonathan Kim
Fix up on mes process context flush to prevent non-mes devices from
spamming error messages or running into undefined behaviour during
process termination.

Fixes: 73204d028eb5 ("drm/amdkfd: fix mes set shader debugger process 
management")
Signed-off-by: Jonathan Kim 
---
 drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index 8e55e78fce4e..43eff221eae5 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -87,7 +87,8 @@ void kfd_process_dequeue_from_device(struct 
kfd_process_device *pdd)
return;
 
dev->dqm->ops.process_termination(dev->dqm, >qpd);
-   amdgpu_mes_flush_shader_debugger(dev->adev, pdd->proc_ctx_gpu_addr);
+   if (dev->kfd->shared_resources.enable_mes)
+   amdgpu_mes_flush_shader_debugger(dev->adev, 
pdd->proc_ctx_gpu_addr);
pdd->already_dequeued = true;
 }
 
-- 
2.34.1



RE: [PATCH 2/2] drm/amdgpu/vpe: enable vpe dpm

2023-12-13 Thread Yu, Lang
[Public]

The series is.

Reviewed-by: Lang Yu 

>-Original Message-
>From: Lee, Peyton 
>Sent: Wednesday, December 13, 2023 10:49 AM
>To: amd-gfx@lists.freedesktop.org
>Cc: Deucher, Alexander ; Zhang, Yifan
>; Ma, Li ; Yu, Lang
>; Lee, Peyton 
>Subject: [PATCH 2/2] drm/amdgpu/vpe: enable vpe dpm
>
>enable vpe dpm
>
>Signed-off-by: Peyton Lee 
>---
> drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 250
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.h |
>12 ++
> drivers/gpu/drm/amd/amdgpu/vpe_v6_1.c   |  15 ++
> 3 files changed, 277 insertions(+)
>
>diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>index e81579708e96..2020ddb4182a 100644
>--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
>@@ -26,6 +26,7 @@
> #include "amdgpu.h"
> #include "amdgpu_ucode.h"
> #include "amdgpu_vpe.h"
>+#include "amdgpu_smu.h"
> #include "soc15_common.h"
> #include "vpe_v6_1.h"
>
>@@ -33,8 +34,186 @@
> /* VPE CSA resides in the 4th page of CSA */
> #define AMDGPU_CSA_VPE_OFFSET (4096 * 3)
>
>+/* 1 second timeout */
>+#define VPE_IDLE_TIMEOUT  msecs_to_jiffies(1000)
>+
>+#define VPE_MAX_DPM_LEVEL 4
>+#define FIXED1_8_BITS_PER_FRACTIONAL_PART 8
>+#define GET_PRATIO_INTEGER_PART(x)((x) >>
>FIXED1_8_BITS_PER_FRACTIONAL_PART)
>+
> static void vpe_set_ring_funcs(struct amdgpu_device *adev);
>
>+static inline uint16_t div16_u16_rem(uint16_t dividend, uint16_t
>+divisor, uint16_t *remainder) {
>+  *remainder = dividend % divisor;
>+  return dividend / divisor;
>+}
>+
>+static inline uint16_t complete_integer_division_u16(
>+  uint16_t dividend,
>+  uint16_t divisor,
>+  uint16_t *remainder)
>+{
>+  return div16_u16_rem(dividend, divisor, (uint16_t *)remainder); }
>+
>+static uint16_t vpe_u1_8_from_fraction(uint16_t numerator, uint16_t
>+denominator) {
>+  bool arg1_negative = numerator < 0;
>+  bool arg2_negative = denominator < 0;
>+
>+  uint16_t arg1_value = (uint16_t)(arg1_negative ? -numerator :
>numerator);
>+  uint16_t arg2_value = (uint16_t)(arg2_negative ? -denominator :
>+denominator);
>+
>+  uint16_t remainder;
>+
>+  /* determine integer part */
>+  uint16_t res_value = complete_integer_division_u16(
>+  arg1_value, arg2_value, );
>+
>+  if (res_value > 127 /* CHAR_MAX */)
>+  return 0;
>+
>+  /* determine fractional part */
>+  {
>+  unsigned int i = FIXED1_8_BITS_PER_FRACTIONAL_PART;
>+
>+  do {
>+  remainder <<= 1;
>+
>+  res_value <<= 1;
>+
>+  if (remainder >= arg2_value) {
>+  res_value |= 1;
>+  remainder -= arg2_value;
>+  }
>+  } while (--i != 0);
>+  }
>+
>+  /* round up LSB */
>+  {
>+  uint16_t summand = (remainder << 1) >= arg2_value;
>+
>+  if ((res_value + summand) > 32767 /* SHRT_MAX */)
>+  return 0;
>+
>+  res_value += summand;
>+  }
>+
>+  if (arg1_negative ^ arg2_negative)
>+  res_value = -res_value;
>+
>+  return res_value;
>+}
>+
>+static uint16_t vpe_internal_get_pratio(uint16_t from_frequency,
>+uint16_t to_frequency) {
>+  uint16_t pratio = vpe_u1_8_from_fraction(from_frequency,
>+to_frequency);
>+
>+  if (GET_PRATIO_INTEGER_PART(pratio) > 1)
>+  pratio = 0;
>+
>+  return pratio;
>+}
>+
>+/*
>+ * VPE has 4 DPM levels from level 0 (lowerest) to 3 (highest),
>+ * VPE FW will dynamically decide which level should be used according to
>current loading.
>+ *
>+ * Get VPE and SOC clocks from PM, and select the appropriate four
>+clock values,
>+ * calculate the ratios of adjusting from one clock to another.
>+ * The VPE FW can then request the appropriate frequency from the PMFW.
>+ */
>+int amdgpu_vpe_configure_dpm(struct amdgpu_vpe *vpe) {
>+  struct amdgpu_device *adev = vpe->ring.adev;
>+  uint32_t dpm_ctl;
>+
>+  if (adev->pm.dpm_enabled) {
>+  struct dpm_clocks clock_table = { 0 };
>+  struct dpm_clock *VPEClks;
>+  struct dpm_clock *SOCClks;
>+  uint32_t idx;
>+  uint32_t pratio_vmax_vnorm = 0, pratio_vnorm_vmid = 0,
>pratio_vmid_vmin = 0;
>+  uint16_t pratio_vmin_freq = 0, pratio_vmid_freq = 0,
>+pratio_vnorm_freq = 0, pratio_vmax_freq = 0;
>+
>+  dpm_ctl = RREG32(vpe_get_reg_offset(vpe, 0, vpe-
>>regs.dpm_enable));
>+  dpm_ctl |= 1; /* DPM enablement */
>+  WREG32(vpe_get_reg_offset(vpe, 0, vpe->regs.dpm_enable),
>dpm_ctl);
>+
>+  /* Get VPECLK and SOCCLK */
>+  if (amdgpu_dpm_get_dpm_clock_table(adev, _table)) {
>+  dev_dbg(adev->dev, "%s: get clock failed!\n", __func__);
>+  goto 

[PATCH v4] drm/amdkfd: Use partial hmm page walk during buffer validation in SVM

2023-12-13 Thread Xiaogang . Chen
From: Xiaogang Chen 

v2:
-not need calculate vram page number for new registered svm range, only
do it for split vram pages.

v3:
-use dma address to calculate vram page number of split svm range;
use migrate_vma from hmm to calculate page number that migrate to vram.

v4:
-combine calculating of vram page number of split svm range and page dma
address copy in same loop if original svm range includes vram pages.

SVM uses an hmm page walk to validate buffers before mapping them to the gpu vm.
After partial migration/mapping, do validation on the same vm range as the
migration/map did, instead of the whole svm range which can be very large. This
change is expected to improve svm

Signed-off-by: Xiaogang Chen
---
 drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 35 ---
 drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 79 +++-
 2 files changed, 48 insertions(+), 66 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
index b854cbf06dce..3fb8e59acfbf 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c
@@ -260,19 +260,6 @@ static void svm_migrate_put_sys_page(unsigned long addr)
put_page(page);
 }
 
-static unsigned long svm_migrate_successful_pages(struct migrate_vma *migrate)
-{
-   unsigned long cpages = 0;
-   unsigned long i;
-
-   for (i = 0; i < migrate->npages; i++) {
-   if (migrate->src[i] & MIGRATE_PFN_VALID &&
-   migrate->src[i] & MIGRATE_PFN_MIGRATE)
-   cpages++;
-   }
-   return cpages;
-}
-
 static unsigned long svm_migrate_unsuccessful_pages(struct migrate_vma 
*migrate)
 {
unsigned long upages = 0;
@@ -402,6 +389,7 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct 
svm_range *prange,
struct dma_fence *mfence = NULL;
struct migrate_vma migrate = { 0 };
unsigned long cpages = 0;
+   unsigned long mpages = 0;
dma_addr_t *scratch;
void *buf;
int r = -ENOMEM;
@@ -450,12 +438,13 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct 
svm_range *prange,
r = svm_migrate_copy_to_vram(node, prange, , , scratch, 
ttm_res_offset);
migrate_vma_pages();
 
-   pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
-   svm_migrate_successful_pages(), cpages, migrate.npages);
-
svm_migrate_copy_done(adev, mfence);
migrate_vma_finalize();
 
+   mpages = cpages - svm_migrate_unsuccessful_pages();
+   pr_debug("successful/cpages/npages 0x%lx/0x%lx/0x%lx\n",
+mpages, cpages, migrate.npages);
+
kfd_smi_event_migration_end(node, p->lead_thread->pid,
start >> PAGE_SHIFT, end >> PAGE_SHIFT,
0, node->id, trigger);
@@ -465,12 +454,12 @@ svm_migrate_vma_to_vram(struct kfd_node *node, struct 
svm_range *prange,
 out_free:
kvfree(buf);
 out:
-   if (!r && cpages) {
+   if (!r && mpages) {
pdd = svm_range_get_pdd_by_node(prange, node);
if (pdd)
-   WRITE_ONCE(pdd->page_in, pdd->page_in + cpages);
+   WRITE_ONCE(pdd->page_in, pdd->page_in + mpages);
 
-   return cpages;
+   return mpages;
}
return r;
 }
@@ -498,7 +487,7 @@ svm_migrate_ram_to_vram(struct svm_range *prange, uint32_t 
best_loc,
struct vm_area_struct *vma;
uint64_t ttm_res_offset;
struct kfd_node *node;
-   unsigned long cpages = 0;
+   unsigned long mpages = 0;
long r = 0;
 
if (start_mgr < prange->start || last_mgr > prange->last) {
@@ -540,15 +529,15 @@ svm_migrate_ram_to_vram(struct svm_range *prange, 
uint32_t best_loc,
pr_debug("failed %ld to migrate\n", r);
break;
} else {
-   cpages += r;
+   mpages += r;
}
ttm_res_offset += next - addr;
addr = next;
}
 
-   if (cpages) {
+   if (mpages) {
prange->actual_loc = best_loc;
-   prange->vram_pages = prange->vram_pages + cpages;
+   prange->vram_pages = prange->vram_pages + mpages;
} else if (!prange->actual_loc) {
/* if no page migrated and all pages from prange are at
 * sys ram drop svm_bo got from svm_range_vram_node_new
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
index 2834fb351818..61e363e388f8 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -158,13 +158,12 @@ svm_is_valid_dma_mapping_addr(struct device *dev, 
dma_addr_t dma_addr)
 static int
 svm_range_dma_map_dev(struct amdgpu_device *adev, struct svm_range *prange,
  unsigned long offset, unsigned long npages,

RE: [PATCH] drm/amdkfd: fix mes set shader debugger process management

2023-12-13 Thread Liu, Shaoyun
[Public]

Check the MES API,  It's  my fault , originally I think it will use  trap_en as 
parameter to tell MES to enable/disable shader debugger , but actually it's not 
.  So we either need to add a parameter for this (ex . add flag for 
enable/disable)  or as your solution add flag for flush.  Consider it's 
possible user process can  be killed after call set_shader but before any 
add_queue , then notify MES to do a process context flush after process 
termination seems  more reasonable .

Regards
Shaoyun.liu



From: Kim, Jonathan 
Sent: Tuesday, December 12, 2023 11:43 PM
To: Liu, Shaoyun ; Huang, JinHuiEric 
; amd-gfx@lists.freedesktop.org
Cc: Wong, Alice ; Kuehling, Felix 
; Kasiviswanathan, Harish 

Subject: RE: [PATCH] drm/amdkfd: fix mes set shader debugger process management


[Public]

Again, MES only knows to flush if there was something enqueued in the first 
place.
SET_SHADER dictates what's on the process list.
SET_SHADER can be the last call prior to process termination with nothing 
enqueued, hence no MES auto flush occurs.

MES doesn't block anything on the flush flag request.
The driver guarantees that flush is only done on process termination after 
device dequeue, whether there were queues or not.
MES has no idea what an invalid context is.
It just has a value stored in its linked list that's associated with a driver 
allocated BO that no longer exists after process termination.

If you're still not sure about this solution, then this should be discussed 
offline with the MES team.
We're not going to gain ground discussing this here.  The solution has already 
been merged.
Feel free to propose a better solution if you're not satisfied with this one.

Jon

From: Liu, Shaoyun mailto:shaoyun@amd.com>>
Sent: Tuesday, December 12, 2023 11:08 PM
To: Kim, Jonathan mailto:jonathan@amd.com>>; Huang, 
JinHuiEric mailto:jinhuieric.hu...@amd.com>>; 
amd-gfx@lists.freedesktop.org
Cc: Wong, Alice mailto:shiwei.w...@amd.com>>; Kuehling, 
Felix mailto:felix.kuehl...@amd.com>>; Kasiviswanathan, 
Harish mailto:harish.kasiviswanat...@amd.com>>
Subject: Re: [PATCH] drm/amdkfd: fix mes set shader debugger process management


[Public]

You try to add one new interface to inform mes about the context flush after 
driver side finish process termination , from my understanding, mes already 
know the process context need to be purged after all the related queues been 
removed even without this notification. What do you expect mes to do about this 
context flush flag ? Mes should block this process context for next set_sched 
command? Mes can achieve  this by ignore the set_sched command with trap 
disable parameter on an invalid process context .

Shaoyun.liu

Get Outlook for iOS

From: Kim, Jonathan mailto:jonathan@amd.com>>
Sent: Tuesday, December 12, 2023 8:19:09 PM
To: Liu, Shaoyun mailto:shaoyun@amd.com>>; Huang, 
JinHuiEric mailto:jinhuieric.hu...@amd.com>>; 
amd-gfx@lists.freedesktop.org 
mailto:amd-gfx@lists.freedesktop.org>>
Cc: Wong, Alice mailto:shiwei.w...@amd.com>>; Kuehling, 
Felix mailto:felix.kuehl...@amd.com>>; Kasiviswanathan, 
Harish mailto:harish.kasiviswanat...@amd.com>>
Subject: RE: [PATCH] drm/amdkfd: fix mes set shader debugger process management

[Public]

> -Original Message-
> From: Liu, Shaoyun mailto:shaoyun@amd.com>>
> Sent: Tuesday, December 12, 2023 7:08 PM
> To: Kim, Jonathan mailto:jonathan@amd.com>>; Huang, 
> JinHuiEric
> mailto:jinhuieric.hu...@amd.com>>; 
> amd-gfx@lists.freedesktop.org
> Cc: Wong, Alice mailto:shiwei.w...@amd.com>>; Kuehling, 
> Felix
> mailto:felix.kuehl...@amd.com>>; Kasiviswanathan, 
> Harish
> mailto:harish.kasiviswanat...@amd.com>>
> Subject: RE: [PATCH] drm/amdkfd: fix mes set shader debugger process
> management
>
> [Public]
>
> I see,  so the  problem is after process context , set_shader been  called 
> with
> disable parameter,  do you know the  reason why  MES re-added the
> process context into the  list ?

Because MES has no idea what disable means.

All it knows is that without the flush flag, set_shader should update the 
necessary per-VMID (process) registers as requested by the driver, which 
requires persistent per-process HW settings so that potential future waves can 
inherit those settings i.e. ADD_QUEUE.skip_process_ctx_clear is set (why 
ADD_QUEUE auto clears the process context otherwise is another long story, 
basically an unsolvable MES cache bug problem).

Common use case example:
add_queue -> set_shader call either transiently stalls the SPI per-VMID or 
transiently dequeues the HWS per-VMID depending on the request settings -> 
fulfils the per-VMID register write updates -> resumes process queues so that 
potential waves on those queues inherit new debug settings.

You can't do this kind of operation at the queue level alone.

[pull] amdgpu drm-fixes-6.7

2023-12-13 Thread Alex Deucher
Hi Dave, Sima,

Fixes for 6.7.

The following changes since commit b7b5a56acec819bb8dcd03c687e97a091b29d28f:

  Merge tag 'exynos-drm-next-for-v6.7-rc5' of 
git://git.kernel.org/pub/scm/linux/kernel/git/daeinki/drm-exynos into drm-fixes 
(2023-12-08 13:55:32 +1000)

are available in the Git repository at:

  https://gitlab.freedesktop.org/agd5f/linux.git 
tags/amd-drm-fixes-6.7-2023-12-13

for you to fetch changes up to a4236c4b410857a70647c410e886c8a0455ec4fb:

  drm/amdgpu: warn when there are still mappings when a BO is destroyed v2 
(2023-12-13 16:50:46 -0500)


amd-drm-fixes-6.7-2023-12-13:

amdgpu:
- Fix suspend fix that got accidently mangled last week
- Fix OD regression
- PSR fixes
- OLED Backlight regression fix
- JPEG 4.0.5 fix
- Misc display fixes
- SDMA 5.2 fix
- SDMA 2.4 regression fix
- GPUVM race fix


Alex Deucher (2):
  drm/amdgpu: fix buffer funcs setting order on suspend harder
  drm/amdgpu/sdma5.2: add begin/end_use ring callbacks

Christian König (2):
  drm/amdgpu: fix tear down order in amdgpu_vm_pt_free
  drm/amdgpu: warn when there are still mappings when a BO is destroyed v2

Dmitrii Galantsev (1):
  drm/amd/pm: fix pp_*clk_od typo

Fangzhi Zuo (1):
  drm/amd/display: Populate dtbclk from bounding box

Hamza Mahfooz (1):
  drm/amd/display: fix hw rotated modes when PSR-SU is enabled

Mario Limonciello (3):
  drm/amd/display: Restore guard against default backlight value < 1 nit
  drm/amd/display: Disable PSR-SU on Parade 0803 TCON again
  drm/amd: Fix a probing order problem on SDMA 2.4

Saleemkhan Jamadar (1):
  drm/amdgpu/jpeg: configure doorbell for each playback

Taimur Hassan (1):
  drm/amd/display: Revert "Fix conversions between bytes and KB"

 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c |  2 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c |  2 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c  |  3 ++-
 drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_5.c   | 15 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c |  4 ++--
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 28 ++
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c  |  3 +++
 drivers/gpu/drm/amd/display/dc/dc_hw_types.h   |  1 +
 drivers/gpu/drm/amd/display/dc/dcn20/dcn20_hubp.c  | 12 --
 .../gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c   | 14 +++
 .../drm/amd/display/dc/dml2/display_mode_core.c| 16 ++---
 .../amd/display/dc/dml2/dml2_translation_helper.c  |  5 ++--
 .../drm/amd/display/dc/hwss/dcn10/dcn10_hwseq.c|  3 ++-
 .../dc/link/protocols/link_edp_panel_control.c |  4 ++--
 .../drm/amd/display/modules/power/power_helpers.c  |  2 ++
 drivers/gpu/drm/amd/pm/amdgpu_pm.c |  4 ++--
 16 files changed, 84 insertions(+), 34 deletions(-)


Re: [PATCH 1/2] drm: update drm_show_memory_stats() for dma-bufs

2023-12-13 Thread Felix Kuehling

On 2023-12-07 13:02, Alex Deucher wrote:

Show buffers as shared if they are shared via dma-buf as well
(e.g., shared with v4l or some other subsystem).


You can add KFD to that list. With the in-progress CUDA11 VM changes and 
improved interop between KFD and render nodes, sharing DMABufs between 
KFD and render nodes will become much more common.


Regards,
  Felix




Signed-off-by: Alex Deucher 
Cc: Rob Clark 
---
  drivers/gpu/drm/drm_file.c | 2 +-
  1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/drm_file.c b/drivers/gpu/drm/drm_file.c
index 5ddaffd32586..5d5f93b9c263 100644
--- a/drivers/gpu/drm/drm_file.c
+++ b/drivers/gpu/drm/drm_file.c
@@ -973,7 +973,7 @@ void drm_show_memory_stats(struct drm_printer *p, struct 
drm_file *file)
DRM_GEM_OBJECT_PURGEABLE;
}
  
-		if (obj->handle_count > 1) {

+   if ((obj->handle_count > 1) || obj->dma_buf) {
status.shared += obj->size;
} else {
status.private += obj->size;


[PATCH 3/4] drm/amdgpu: add new INFO IOCTL query for input power

2023-12-13 Thread Alex Deucher
Some chips provide both average and input power.  Previously
we just exposed average power, add a new query for input
power.

Example userspace:
https://github.com/Umio-Yasuno/libdrm-amdgpu-sys-rs/tree/input_power

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 9 +
 include/uapi/drm/amdgpu_drm.h   | 2 ++
 2 files changed, 11 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index bf4f48fe438d..48496bb585c7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1114,6 +1114,15 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file *filp)
}
ui32 >>= 8;
break;
+   case AMDGPU_INFO_SENSOR_GPU_INPUT_POWER:
+   /* get input GPU power */
+   if (amdgpu_dpm_read_sensor(adev,
+  
AMDGPU_PP_SENSOR_GPU_INPUT_POWER,
+  (void *), _size)) {
+   return -EINVAL;
+   }
+   ui32 >>= 8;
+   break;
case AMDGPU_INFO_SENSOR_VDDNB:
/* get VDDNB in millivolts */
if (amdgpu_dpm_read_sensor(adev,
diff --git a/include/uapi/drm/amdgpu_drm.h b/include/uapi/drm/amdgpu_drm.h
index ad21c613fec8..96e32dafd4f0 100644
--- a/include/uapi/drm/amdgpu_drm.h
+++ b/include/uapi/drm/amdgpu_drm.h
@@ -865,6 +865,8 @@ struct drm_amdgpu_cs_chunk_cp_gfx_shadow {
#define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_SCLK 0xa
/* Subquery id: Query GPU peak pstate memory clock */
#define AMDGPU_INFO_SENSOR_PEAK_PSTATE_GFX_MCLK 0xb
+   /* Subquery id: Query input GPU power   */
+   #define AMDGPU_INFO_SENSOR_GPU_INPUT_POWER  0xc
 /* Number of VRAM page faults on CPU access. */
 #define AMDGPU_INFO_NUM_VRAM_CPU_PAGE_FAULTS   0x1E
 #define AMDGPU_INFO_VRAM_LOST_COUNTER  0x1F
-- 
2.42.0



[PATCH 4/4] drm/amdgpu/pm: clarify debugfs pm output

2023-12-13 Thread Alex Deucher
On APUs power is SoC power, not just GPU.
Clarify that for UVD/VCE/VCN the IP is powered down,
not disabled, which can be confusing and lead to concerns
that the IP is actually not available.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/pm/amdgpu_pm.c | 28 ++--
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
index 83907c60c70e..4fff4677e13f 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c
@@ -4353,11 +4353,19 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file 
*m, struct amdgpu_device *a
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VDDNB, (void 
*)&value, &size))
seq_printf(m, "\t%u mV (VDDNB)\n", value);
size = sizeof(uint32_t);
-   if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*)&query, &size))
-   seq_printf(m, "\t%u.%02u W (average GPU)\n", query >> 8, query 
& 0xff);
+   if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_AVG_POWER, (void 
*)&query, &size)) {
+   if (adev->flags & AMD_IS_APU)
+   seq_printf(m, "\t%u.%02u W (average SoC including 
CPU)\n", query >> 8, query & 0xff);
+   else
+   seq_printf(m, "\t%u.%02u W (average SoC)\n", query >> 
8, query & 0xff);
+   }
size = sizeof(uint32_t);
-   if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *)&query, &size))
-   seq_printf(m, "\t%u.%02u W (current GPU)\n", query >> 8, query 
& 0xff);
+   if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_GPU_INPUT_POWER, 
(void *)&query, &size)) {
+   if (adev->flags & AMD_IS_APU)
+   seq_printf(m, "\t%u.%02u W (current SoC including 
CPU)\n", query >> 8, query & 0xff);
+   else
+   seq_printf(m, "\t%u.%02u W (current SoC)\n", query >> 
8, query & 0xff);
+   }
size = sizeof(value);
seq_printf(m, "\n");
 
@@ -4383,9 +4391,9 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file *m, 
struct amdgpu_device *a
/* VCN clocks */
if (!amdgpu_dpm_read_sensor(adev, 
AMDGPU_PP_SENSOR_VCN_POWER_STATE, (void *)&value, &size)) {
if (!value) {
-   seq_printf(m, "VCN: Disabled\n");
+   seq_printf(m, "VCN: Powered down\n");
} else {
-   seq_printf(m, "VCN: Enabled\n");
+   seq_printf(m, "VCN: Powered up\n");
if (!amdgpu_dpm_read_sensor(adev, 
AMDGPU_PP_SENSOR_UVD_DCLK, (void *)&value, &size))
seq_printf(m, "\t%u MHz (DCLK)\n", 
value/100);
if (!amdgpu_dpm_read_sensor(adev, 
AMDGPU_PP_SENSOR_UVD_VCLK, (void *)&value, &size))
@@ -4397,9 +4405,9 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file *m, 
struct amdgpu_device *a
/* UVD clocks */
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_UVD_POWER, 
(void *)&value, &size)) {
if (!value) {
-   seq_printf(m, "UVD: Disabled\n");
+   seq_printf(m, "UVD: Powered down\n");
} else {
-   seq_printf(m, "UVD: Enabled\n");
+   seq_printf(m, "UVD: Powered up\n");
if (!amdgpu_dpm_read_sensor(adev, 
AMDGPU_PP_SENSOR_UVD_DCLK, (void *)&value, &size))
seq_printf(m, "\t%u MHz (DCLK)\n", 
value/100);
if (!amdgpu_dpm_read_sensor(adev, 
AMDGPU_PP_SENSOR_UVD_VCLK, (void *)&value, &size))
@@ -4411,9 +4419,9 @@ static int amdgpu_debugfs_pm_info_pp(struct seq_file *m, 
struct amdgpu_device *a
/* VCE clocks */
if (!amdgpu_dpm_read_sensor(adev, AMDGPU_PP_SENSOR_VCE_POWER, 
(void *)&value, &size)) {
if (!value) {
-   seq_printf(m, "VCE: Disabled\n");
+   seq_printf(m, "VCE: Powered down\n");
} else {
-   seq_printf(m, "VCE: Enabled\n");
+   seq_printf(m, "VCE: Powered up\n");
if (!amdgpu_dpm_read_sensor(adev, 
AMDGPU_PP_SENSOR_VCE_ECCLK, (void *)&value, &size))
seq_printf(m, "\t%u MHz (ECCLK)\n", 
value/100);
}
-- 
2.42.0



[PATCH 2/4] drm/amdgpu: fall back to INPUT power for AVG power via INFO IOCTL

2023-12-13 Thread Alex Deucher
For backwards compatibility with userspace.

Fixes: 47f1724db4fe ("drm/amd: Introduce `AMDGPU_PP_SENSOR_GPU_INPUT_POWER`")
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2897
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
index b5ebafd4a3ad..bf4f48fe438d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c
@@ -1105,7 +1105,12 @@ int amdgpu_info_ioctl(struct drm_device *dev, void 
*data, struct drm_file *filp)
if (amdgpu_dpm_read_sensor(adev,
   
AMDGPU_PP_SENSOR_GPU_AVG_POWER,
(void *)&ui32, &ui32_size)) {
-   return -EINVAL;
+   /* fall back to input power for backwards 
compat */
+   if (amdgpu_dpm_read_sensor(adev,
+  
AMDGPU_PP_SENSOR_GPU_INPUT_POWER,
+  (void *)&ui32, 
&ui32_size)) {
+   return -EINVAL;
+   }
}
ui32 >>= 8;
break;
-- 
2.42.0



[PATCH 1/4] drm/amdgpu: fix avg vs input power reporting on smu7

2023-12-13 Thread Alex Deucher
Hawaii, Bonaire, Fiji, and Tonga support average power, the others
support current power.

Signed-off-by: Alex Deucher 
---
 .../gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c 
b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
index 11372fcc59c8..a2c7b2e111fa 100644
--- a/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
+++ b/drivers/gpu/drm/amd/pm/powerplay/hwmgr/smu7_hwmgr.c
@@ -3995,6 +3995,7 @@ static int smu7_read_sensor(struct pp_hwmgr *hwmgr, int 
idx,
uint32_t sclk, mclk, activity_percent;
uint32_t offset, val_vid;
struct smu7_hwmgr *data = (struct smu7_hwmgr *)(hwmgr->backend);
+   struct amdgpu_device *adev = hwmgr->adev;
 
/* size must be at least 4 bytes for all sensors */
if (*size < 4)
@@ -4038,7 +4039,21 @@ static int smu7_read_sensor(struct pp_hwmgr *hwmgr, int 
idx,
*size = 4;
return 0;
case AMDGPU_PP_SENSOR_GPU_INPUT_POWER:
-   return smu7_get_gpu_power(hwmgr, (uint32_t *)value);
+   if ((adev->asic_type != CHIP_HAWAII) &&
+   (adev->asic_type != CHIP_BONAIRE) &&
+   (adev->asic_type != CHIP_FIJI) &&
+   (adev->asic_type != CHIP_TONGA))
+   return smu7_get_gpu_power(hwmgr, (uint32_t *)value);
+   else
+   return -EOPNOTSUPP;
+   case AMDGPU_PP_SENSOR_GPU_AVG_POWER:
+   if ((adev->asic_type != CHIP_HAWAII) &&
+   (adev->asic_type != CHIP_BONAIRE) &&
+   (adev->asic_type != CHIP_FIJI) &&
+   (adev->asic_type != CHIP_TONGA))
+   return -EOPNOTSUPP;
+   else
+   return smu7_get_gpu_power(hwmgr, (uint32_t *)value);
case AMDGPU_PP_SENSOR_VDDGFX:
if ((data->vr_config & VRCONF_VDDGFX_MASK) ==
(VR_SVI2_PLANE_2 << VRCONF_VDDGFX_SHIFT))
-- 
2.42.0



[PATCH 2/2] drm/amdgpu/atom: make amdgpu_atomfirmware_allocate_fb parsing consistent

2023-12-13 Thread Alex Deucher
For 2.1, ATOM_VRAM_BLOCK_SRIOV_MSG_SHARE_RESERVATION is SR-IOV only.
For 2.2, return usage_bytes properly for the non-SR-IOV case.

Fixes: 4864f2ee9ee2 ("drm/amdgpu: add vram reservation based on 
vram_usagebyfirmware_v2_2")
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1215802
Signed-off-by: Alex Deucher 
---
 .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c  | 55 ++-
 1 file changed, 30 insertions(+), 25 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
index d8393e3f2778..b1c1fafa2d8a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
@@ -115,18 +115,21 @@ static int amdgpu_atomfirmware_allocate_fb_v2_1(struct 
amdgpu_device *adev,
  fw_size,
  drv_size);
 
-   if ((start_addr & ATOM_VRAM_OPERATION_FLAGS_MASK) ==
-   (u32)(ATOM_VRAM_BLOCK_SRIOV_MSG_SHARE_RESERVATION <<
-   ATOM_VRAM_OPERATION_FLAGS_SHIFT)) {
-   /* Firmware request VRAM reservation for SR-IOV */
-   adev->mman.fw_vram_usage_start_offset = (start_addr &
-   (~ATOM_VRAM_OPERATION_FLAGS_MASK)) << 10;
-   adev->mman.fw_vram_usage_size = fw_size << 10;
-   /* Use the default scratch size */
-   *usage_bytes = 0;
+   if (amdgpu_sriov_vf(adev)) {
+   if ((start_addr & ATOM_VRAM_OPERATION_FLAGS_MASK) ==
+   (u32)(ATOM_VRAM_BLOCK_SRIOV_MSG_SHARE_RESERVATION <<
+ ATOM_VRAM_OPERATION_FLAGS_SHIFT)) {
+   /* Firmware request VRAM reservation for SR-IOV */
+   adev->mman.fw_vram_usage_start_offset = (start_addr &
+
(~ATOM_VRAM_OPERATION_FLAGS_MASK)) << 10;
+   adev->mman.fw_vram_usage_size = fw_size << 10;
+   /* Use the default scratch size */
+   *usage_bytes = 0;
+   }
} else {
*usage_bytes = drv_size << 10;
}
+
return 0;
 }
 
@@ -147,25 +150,27 @@ static int amdgpu_atomfirmware_allocate_fb_v2_2(struct 
amdgpu_device *adev,
  drv_start_addr,
  drv_size);
 
-   if (amdgpu_sriov_vf(adev) &&
-   ((fw_start_addr & (ATOM_VRAM_BLOCK_NEEDS_NO_RESERVATION <<
-   ATOM_VRAM_OPERATION_FLAGS_SHIFT)) == 0)) {
-   /* Firmware request VRAM reservation for SR-IOV */
-   adev->mman.fw_vram_usage_start_offset = (fw_start_addr &
-   (~ATOM_VRAM_OPERATION_FLAGS_MASK)) << 10;
-   adev->mman.fw_vram_usage_size = fw_size << 10;
-   }
+   if (amdgpu_sriov_vf(adev)) {
+   if ((fw_start_addr & (ATOM_VRAM_BLOCK_NEEDS_NO_RESERVATION <<
+ ATOM_VRAM_OPERATION_FLAGS_SHIFT)) == 0) {
+   /* Firmware request VRAM reservation for SR-IOV */
+   adev->mman.fw_vram_usage_start_offset = (fw_start_addr &
+
(~ATOM_VRAM_OPERATION_FLAGS_MASK)) << 10;
+   adev->mman.fw_vram_usage_size = fw_size << 10;
+   }
 
-   if (amdgpu_sriov_vf(adev) &&
-   ((drv_start_addr & (ATOM_VRAM_BLOCK_NEEDS_NO_RESERVATION <<
-   ATOM_VRAM_OPERATION_FLAGS_SHIFT)) == 0)) {
-   /* driver request VRAM reservation for SR-IOV */
-   adev->mman.drv_vram_usage_start_offset = (drv_start_addr &
-   (~ATOM_VRAM_OPERATION_FLAGS_MASK)) << 10;
-   adev->mman.drv_vram_usage_size = drv_size << 10;
+   if ((drv_start_addr & (ATOM_VRAM_BLOCK_NEEDS_NO_RESERVATION <<
+  ATOM_VRAM_OPERATION_FLAGS_SHIFT)) == 0) {
+   /* driver request VRAM reservation for SR-IOV */
+   adev->mman.drv_vram_usage_start_offset = 
(drv_start_addr &
+ 
(~ATOM_VRAM_OPERATION_FLAGS_MASK)) << 10;
+   adev->mman.drv_vram_usage_size = drv_size << 10;
+   }
+   *usage_bytes = 0;
+   } else {
+   *usage_bytes = drv_size << 10;
}
 
-   *usage_bytes = 0;
return 0;
 }
 
-- 
2.42.0



[PATCH 1/2] drm/amdgpu/atom: fix vram_usagebyfirmware parsing

2023-12-13 Thread Alex Deucher
The changes to support vram_usagebyfirmware v2.2 changed the behavior
to explicitly match 2.1 for everything older rather than just using it
by default.  If the version is 2.2 or newer, use the 2.2 parsing, for
everything else, use the 2.1 parsing.  This restores the previous
behavior for tables that didn't explicitly match 2.1.

Fixes: 4864f2ee9ee2 ("drm/amdgpu: add vram reservation based on 
vram_usagebyfirmware_v2_2")
Link: https://bugzilla.opensuse.org/show_bug.cgi?id=1215802
Signed-off-by: Alex Deucher 
---
 .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c   | 18 +-
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
index fb2681dd6b33..d8393e3f2778 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c
@@ -181,18 +181,18 @@ int amdgpu_atomfirmware_allocate_fb_scratch(struct 
amdgpu_device *adev)
int usage_bytes = 0;
 
if (amdgpu_atom_parse_data_header(ctx, index, NULL, &frev, &crev, 
&data_offset)) {
-   if (frev == 2 && crev == 1) {
-   fw_usage_v2_1 =
-   (struct vram_usagebyfirmware_v2_1 *)(ctx->bios 
+ data_offset);
-   amdgpu_atomfirmware_allocate_fb_v2_1(adev,
-   fw_usage_v2_1,
-   _bytes);
-   } else if (frev >= 2 && crev >= 2) {
+   if (frev >= 2 && crev >= 2) {
fw_usage_v2_2 =
(struct vram_usagebyfirmware_v2_2 *)(ctx->bios 
+ data_offset);
amdgpu_atomfirmware_allocate_fb_v2_2(adev,
-   fw_usage_v2_2,
-   _bytes);
+fw_usage_v2_2,
+_bytes);
+   } else {
+   fw_usage_v2_1 =
+   (struct vram_usagebyfirmware_v2_1 *)(ctx->bios 
+ data_offset);
+   amdgpu_atomfirmware_allocate_fb_v2_1(adev,
+fw_usage_v2_1,
+_bytes);
}
}
 
-- 
2.42.0



[PATCH] drm/amdgpu/debugfs: check if pcie register callbacks are valid

2023-12-13 Thread Alex Deucher
Before trying to use them in the debugfs register access functions.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 2cebf2145d9a..16e2c34fc18f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -525,6 +525,9 @@ static ssize_t amdgpu_debugfs_regs_pcie_read(struct file 
*f, char __user *buf,
if (size & 0x3 || *pos & 0x3)
return -EINVAL;
 
+   if (!adev->pcie_rreg)
+   return -EOPNOTSUPP;
+
r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
if (r < 0) {
pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
@@ -581,6 +584,9 @@ static ssize_t amdgpu_debugfs_regs_pcie_write(struct file 
*f, const char __user
if (size & 0x3 || *pos & 0x3)
return -EINVAL;
 
+   if (!adev->pcie_wreg)
+   return -EOPNOTSUPP;
+
r = pm_runtime_get_sync(adev_to_drm(adev)->dev);
if (r < 0) {
pm_runtime_put_autosuspend(adev_to_drm(adev)->dev);
-- 
2.42.0



Re: [PATCH 0/2] fdinfo shared stats

2023-12-13 Thread Alex Deucher
On Thu, Dec 7, 2023 at 1:03 PM Alex Deucher  wrote:
>
> We had a request to add shared buffer stats to fdinfo for amdgpu and
> while implementing that, Christian mentioned that just looking at
> the GEM handle count doesn't take into account buffers shared with other
> subsystems like V4L or RDMA.  Those subsystems don't use GEM, so it
> doesn't really matter from a GPU top perspective, but it's more
> correct if you actually want to see shared buffers.

Any thoughts on this?  Seem reasonable?

Alex

>
> Alex Deucher (2):
>   drm: update drm_show_memory_stats() for dma-bufs
>   drm/amdgpu: add shared fdinfo stats
>
>  drivers/gpu/drm/amd/amdgpu/amdgpu_fdinfo.c |  4 
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 11 +++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  6 ++
>  drivers/gpu/drm/drm_file.c |  2 +-
>  4 files changed, 22 insertions(+), 1 deletion(-)
>
> --
> 2.42.0
>


[PATCH v2] drm/amd: Add a workaround for GFX11 systems that fail to flush TLB

2023-12-13 Thread Mario Limonciello
Some systems with MP1 13.0.4 or 13.0.11 have a firmware bug that
causes the first MES packet after resume to fail. Typically this
packet is used to flush the TLB when GART is enabled.

This issue is fixed in newer firmware, but as OEMs may not roll this
out to the field, introduce a workaround that will add an extra dummy
read on resume that the result is discarded.

Cc: sta...@vger.kernel.org # 6.1+
Cc: Tim Huang 
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3045
Signed-off-by: Mario Limonciello 
---
v1->v2:
 * Add a dummy read callback instead and use that.
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 19 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  3 +++
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 11 +++
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  8 ++--
 4 files changed, 39 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 9ddbf1494326..cd5e1a027bdf 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -868,6 +868,25 @@ int amdgpu_mes_reg_wait(struct amdgpu_device *adev, 
uint32_t reg,
return r;
 }
 
+void amdgpu_mes_reg_dummy_read(struct amdgpu_device *adev)
+{
+   struct mes_misc_op_input op_input = {
+   .op = MES_MISC_OP_READ_REG,
+   .read_reg.reg_offset = 0,
+   .read_reg.buffer_addr = adev->mes.read_val_gpu_addr,
+   };
+
+   if (!adev->mes.funcs->misc_op) {
+   DRM_ERROR("mes misc op is not supported!\n");
+   return;
+   }
+
+   adev->mes.silent_errors = true;
+   if (adev->mes.funcs->misc_op(&adev->mes, &op_input))
+   DRM_DEBUG("failed to amdgpu_mes_reg_dummy_read\n");
+   adev->mes.silent_errors = false;
+}
+
 int amdgpu_mes_set_shader_debugger(struct amdgpu_device *adev,
uint64_t process_context_addr,
uint32_t spi_gdbg_per_vmid_cntl,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index a27b424ffe00..d208e60c1d99 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -135,6 +135,8 @@ struct amdgpu_mes {
 
/* ip specific functions */
const struct amdgpu_mes_funcs   *funcs;
+
+   boolsilent_errors;
 };
 
 struct amdgpu_mes_process {
@@ -356,6 +358,7 @@ int amdgpu_mes_unmap_legacy_queue(struct amdgpu_device 
*adev,
  u64 gpu_addr, u64 seq);
 
 uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg);
+void amdgpu_mes_reg_dummy_read(struct amdgpu_device *adev);
 int amdgpu_mes_wreg(struct amdgpu_device *adev,
uint32_t reg, uint32_t val);
 int amdgpu_mes_reg_wait(struct amdgpu_device *adev, uint32_t reg,
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 23d7b548d13f..a2ba45f859ea 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -960,6 +960,17 @@ static int gmc_v11_0_resume(void *handle)
int r;
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+   switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+   case IP_VERSION(13, 0, 4):
+   case IP_VERSION(13, 0, 11):
+   /* avoid a lost packet @ first GFXOFF exit after resume */
+   if ((adev->pm.fw_version & 0x00FF) < 0x004c4900 && 
adev->in_s0ix)
+   amdgpu_mes_reg_dummy_read(adev);
+   break;
+   default:
+   break;
+   }
+
r = gmc_v11_0_hw_init(adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 4dfec56e1b7f..71df5cb65485 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -137,8 +137,12 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct 
amdgpu_mes *mes,
r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
  timeout);
if (r < 1) {
-   DRM_ERROR("MES failed to response msg=%d\n",
- x_pkt->header.opcode);
+   if (mes->silent_errors)
+   DRM_DEBUG("MES failed to response msg=%d\n",
+ x_pkt->header.opcode);
+   else
+   DRM_ERROR("MES failed to response msg=%d\n",
+ x_pkt->header.opcode);
 
while (halt_if_hws_hang)
schedule();
-- 
2.34.1



Re: [PATCH] drm/amd: Add a workaround for GFX11 systems that fail to flush TLB

2023-12-13 Thread Alex Deucher
On Wed, Dec 13, 2023 at 2:32 PM Mario Limonciello
 wrote:
>
> On 12/13/2023 13:12, Mario Limonciello wrote:
> > On 12/13/2023 13:07, Alex Deucher wrote:
> >> On Wed, Dec 13, 2023 at 1:00 PM Mario Limonciello
> >>  wrote:
> >>>
> >>> Some systems with MP1 13.0.4 or 13.0.11 have a firmware bug that
> >>> causes the first MES packet after resume to fail. This packet is
> >>> used to flush the TLB when GART is enabled.
> >>>
> >>> This issue is fixed in newer firmware, but as OEMs may not roll this
> >>> out to the field, introduce a workaround that will retry the flush
> >>> when detecting running on an older firmware and decrease relevant
> >>> error messages to debug while workaround is in use.
> >>>
> >>> Cc: sta...@vger.kernel.org # 6.1+
> >>> Cc: Tim Huang 
> >>> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3045
> >>> Signed-off-by: Mario Limonciello 
> >>> ---
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 10 --
> >>>   drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  2 ++
> >>>   drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 17 -
> >>>   drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  8 ++--
> >>>   4 files changed, 32 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> >>> index 9ddbf1494326..6ce3f6e6b6de 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> >>> @@ -836,8 +836,14 @@ int amdgpu_mes_reg_write_reg_wait(struct
> >>> amdgpu_device *adev,
> >>>  }
> >>>
> >>>  r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
> >>> -   if (r)
> >>> -   DRM_ERROR("failed to reg_write_reg_wait\n");
> >>> +   if (r) {
> >>> +   const char *msg = "failed to reg_write_reg_wait\n";
> >>> +
> >>> +   if (adev->mes.suspend_workaround)
> >>> +   DRM_DEBUG(msg);
> >>> +   else
> >>> +   DRM_ERROR(msg);
> >>> +   }
> >>>
> >>>   error:
> >>>  return r;
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> >>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> >>> index a27b424ffe00..90f2bba3b12b 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> >>> @@ -135,6 +135,8 @@ struct amdgpu_mes {
> >>>
> >>>  /* ip specific functions */
> >>>  const struct amdgpu_mes_funcs   *funcs;
> >>> +
> >>> +   boolsuspend_workaround;
> >>>   };
> >>>
> >>>   struct amdgpu_mes_process {
> >>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> >>> b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> >>> index 23d7b548d13f..e810c7bb3156 100644
> >>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> >>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> >>> @@ -889,7 +889,11 @@ static int gmc_v11_0_gart_enable(struct
> >>> amdgpu_device *adev)
> >>>  false : true;
> >>>
> >>>  adev->mmhub.funcs->set_fault_enable_default(adev, value);
> >>> -   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
> >>> +
> >>> +   do {
> >>> +   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
> >>> +   adev->mes.suspend_workaround = false;
> >>> +   } while (adev->mes.suspend_workaround);
> >>
> >> Shouldn't this be something like:
> >>
> >>> +   do {
> >>> +   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
> >>> +   adev->mes.suspend_workaround = false;
> >>> +   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
> >>> +   } while (adev->mes.suspend_workaround);
> >>
> >> If we actually need the flush.  Maybe a better approach would be to
> >> check if we are in s0ix in
> >
> > Ah you're right; I had shifted this around to keep less stateful
> > variables and push them up the stack from when I first made it and that
> > logic is wrong now.
> >
> > I don't think the one you suggested is right either; it's going to apply
> > twice on ASICs that only need it once.
> >
> > I guess pending on what Christian comments on below I'll respin to logic
> > that only calls twice on resume for these ASICs.
>
> One more comment.  Tim and I both did an experiment for this (skipping
> the flush) separately.  The problem isn't the flush itself, rather it's
> the first MES packet after exiting GFXOFF.
>
> So it seems that it pushes off the issue to the next thing which is a
> ring buffer test:
>
> [drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on comp_1.0.0
> (-110).
> [drm:process_one_work] *ERROR* ib ring test failed (-110).
>
> So maybe a better workaround is a "dummy" command that is only sent for
> the broken firmware that we don't care about the outcome and discard errors.
>
> Then the workaround doesn't need to get as entangled with correct flow.

Yeah. Something like that seems cleaner.  Just a question of where to
put it since we skip GC and MES 

Re: [PATCH] drm/amd: Add a workaround for GFX11 systems that fail to flush TLB

2023-12-13 Thread Mario Limonciello

On 12/13/2023 13:12, Mario Limonciello wrote:

On 12/13/2023 13:07, Alex Deucher wrote:

On Wed, Dec 13, 2023 at 1:00 PM Mario Limonciello
 wrote:


Some systems with MP1 13.0.4 or 13.0.11 have a firmware bug that
causes the first MES packet after resume to fail. This packet is
used to flush the TLB when GART is enabled.

This issue is fixed in newer firmware, but as OEMs may not roll this
out to the field, introduce a workaround that will retry the flush
when detecting running on an older firmware and decrease relevant
error messages to debug while workaround is in use.

Cc: sta...@vger.kernel.org # 6.1+
Cc: Tim Huang 
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3045
Signed-off-by: Mario Limonciello 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 10 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  2 ++
  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 17 -
  drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  8 ++--
  4 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c

index 9ddbf1494326..6ce3f6e6b6de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -836,8 +836,14 @@ int amdgpu_mes_reg_write_reg_wait(struct 
amdgpu_device *adev,

 }

 r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
-   if (r)
-   DRM_ERROR("failed to reg_write_reg_wait\n");
+   if (r) {
+   const char *msg = "failed to reg_write_reg_wait\n";
+
+   if (adev->mes.suspend_workaround)
+   DRM_DEBUG(msg);
+   else
+   DRM_ERROR(msg);
+   }

  error:
 return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h

index a27b424ffe00..90f2bba3b12b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -135,6 +135,8 @@ struct amdgpu_mes {

 /* ip specific functions */
 const struct amdgpu_mes_funcs   *funcs;
+
+   bool    suspend_workaround;
  };

  struct amdgpu_mes_process {
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c

index 23d7b548d13f..e810c7bb3156 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -889,7 +889,11 @@ static int gmc_v11_0_gart_enable(struct 
amdgpu_device *adev)

 false : true;

 adev->mmhub.funcs->set_fault_enable_default(adev, value);
-   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+
+   do {
+   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+   adev->mes.suspend_workaround = false;
+   } while (adev->mes.suspend_workaround);


Shouldn't this be something like:


+   do {
+   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+   adev->mes.suspend_workaround = false;
+   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+   } while (adev->mes.suspend_workaround);


If we actually need the flush.  Maybe a better approach would be to
check if we are in s0ix in


Ah you're right; I had shifted this around to keep less stateful 
variables and push them up the stack from when I first made it and that 
logic is wrong now.


I don't think the one you suggested is right either; it's going to apply 
twice on ASICs that only need it once.


I guess pending on what Christian comments on below I'll respin to logic 
that only calls twice on resume for these ASICs.


One more comment.  Tim and I both did an experiment for this (skipping 
the flush) separately.  The problem isn't the flush itself, rather it's 
the first MES packet after exiting GFXOFF.


So it seems that it pushes off the issue to the next thing which is a 
ring buffer test:


[drm:amdgpu_ib_ring_tests [amdgpu]] *ERROR* IB test failed on comp_1.0.0 
(-110).

[drm:process_one_work] *ERROR* ib ring test failed (-110).

So maybe a better workaround is a "dummy" command that is only sent for 
the broken firmware that we don't care about the outcome and discard errors.


Then the workaround doesn't need to get as entangled with correct flow.





diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c in gmc_v11_0_flush_gpu_tlb():
index 23d7b548d13f..bd6d9953a80e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -227,7 +227,8 @@ static void gmc_v11_0_flush_gpu_tlb(struct
amdgpu_device *adev, uint32_t vmid,
  * Directly use kiq to do the vm invalidation instead
  */
 if ((adev->gfx.kiq[0].ring.sched.ready || 
adev->mes.ring.sched.ready) &&

-   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) {
+   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) ||
+   !adev->in_s0ix) {
  

Re: [PATCH] drm/amd: Add a workaround for GFX11 systems that fail to flush TLB

2023-12-13 Thread Mario Limonciello

On 12/13/2023 13:07, Alex Deucher wrote:

On Wed, Dec 13, 2023 at 1:00 PM Mario Limonciello
 wrote:


Some systems with MP1 13.0.4 or 13.0.11 have a firmware bug that
causes the first MES packet after resume to fail. This packet is
used to flush the TLB when GART is enabled.

This issue is fixed in newer firmware, but as OEMs may not roll this
out to the field, introduce a workaround that will retry the flush
when detecting running on an older firmware and decrease relevant
error messages to debug while workaround is in use.

Cc: sta...@vger.kernel.org # 6.1+
Cc: Tim Huang 
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3045
Signed-off-by: Mario Limonciello 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 10 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  2 ++
  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 17 -
  drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  8 ++--
  4 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 9ddbf1494326..6ce3f6e6b6de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -836,8 +836,14 @@ int amdgpu_mes_reg_write_reg_wait(struct amdgpu_device 
*adev,
 }

 r = adev->mes.funcs->misc_op(&adev->mes, &op_input);
-   if (r)
-   DRM_ERROR("failed to reg_write_reg_wait\n");
+   if (r) {
+   const char *msg = "failed to reg_write_reg_wait\n";
+
+   if (adev->mes.suspend_workaround)
+   DRM_DEBUG(msg);
+   else
+   DRM_ERROR(msg);
+   }

  error:
 return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index a27b424ffe00..90f2bba3b12b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -135,6 +135,8 @@ struct amdgpu_mes {

 /* ip specific functions */
 const struct amdgpu_mes_funcs   *funcs;
+
+   boolsuspend_workaround;
  };

  struct amdgpu_mes_process {
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 23d7b548d13f..e810c7bb3156 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -889,7 +889,11 @@ static int gmc_v11_0_gart_enable(struct amdgpu_device 
*adev)
 false : true;

 adev->mmhub.funcs->set_fault_enable_default(adev, value);
-   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+
+   do {
+   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+   adev->mes.suspend_workaround = false;
+   } while (adev->mes.suspend_workaround);


Shouldn't this be something like:


+   do {
+   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+   adev->mes.suspend_workaround = false;
+   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+   } while (adev->mes.suspend_workaround);


If we actually need the flush.  Maybe a better approach would be to
check if we are in s0ix in


Ah you're right; I had shifted this around to keep less stateful 
variables and push them up the stack from when I first made it and that 
logic is wrong now.


I don't think the one you suggested is right either; it's going to apply 
twice on ASICs that only need it once.


I guess pending on what Christian comments on below I'll respin to logic 
that only calls twice on resume for these ASICs.




diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c in gmc_v11_0_flush_gpu_tlb():
index 23d7b548d13f..bd6d9953a80e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -227,7 +227,8 @@ static void gmc_v11_0_flush_gpu_tlb(struct
amdgpu_device *adev, uint32_t vmid,
  * Directly use kiq to do the vm invalidation instead
  */
 if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring.sched.ready) 
&&
-   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) {
+   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) ||
+   !adev->in_s0ix) {
 amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
 1 << vmid, GET_INST(GC, 0));
 return;

@Christian Koenig is this logic correct?

 /* For SRIOV run time, driver shouldn't access the register
through MMIO
  * Directly use kiq to do the vm invalidation instead
  */
 if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring.sched.ready) 
&&
 (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) {
 amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
 1 << vmid, GET_INST(GC, 0));
 return;
 }

We basically always 

Re: [PATCH] drm/amd: Add a workaround for GFX11 systems that fail to flush TLB

2023-12-13 Thread Alex Deucher
On Wed, Dec 13, 2023 at 1:00 PM Mario Limonciello
 wrote:
>
> Some systems with MP1 13.0.4 or 13.0.11 have a firmware bug that
> causes the first MES packet after resume to fail. This packet is
> used to flush the TLB when GART is enabled.
>
> This issue is fixed in newer firmware, but as OEMs may not roll this
> out to the field, introduce a workaround that will retry the flush
> when running on an older firmware is detected, and demote relevant
> error messages to debug level while the workaround is in use.
>
> Cc: sta...@vger.kernel.org # 6.1+
> Cc: Tim Huang 
> Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3045
> Signed-off-by: Mario Limonciello 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 10 --
>  drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  2 ++
>  drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 17 -
>  drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  8 ++--
>  4 files changed, 32 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> index 9ddbf1494326..6ce3f6e6b6de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
> @@ -836,8 +836,14 @@ int amdgpu_mes_reg_write_reg_wait(struct amdgpu_device 
> *adev,
> }
>
> r = adev->mes.funcs->misc_op(>mes, _input);
> -   if (r)
> -   DRM_ERROR("failed to reg_write_reg_wait\n");
> +   if (r) {
> +   const char *msg = "failed to reg_write_reg_wait\n";
> +
> +   if (adev->mes.suspend_workaround)
> +   DRM_DEBUG(msg);
> +   else
> +   DRM_ERROR(msg);
> +   }
>
>  error:
> return r;
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> index a27b424ffe00..90f2bba3b12b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
> @@ -135,6 +135,8 @@ struct amdgpu_mes {
>
> /* ip specific functions */
> const struct amdgpu_mes_funcs   *funcs;
> +
> +   boolsuspend_workaround;
>  };
>
>  struct amdgpu_mes_process {
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
> b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> index 23d7b548d13f..e810c7bb3156 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
> @@ -889,7 +889,11 @@ static int gmc_v11_0_gart_enable(struct amdgpu_device 
> *adev)
> false : true;
>
> adev->mmhub.funcs->set_fault_enable_default(adev, value);
> -   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
> +
> +   do {
> +   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
> +   adev->mes.suspend_workaround = false;
> +   } while (adev->mes.suspend_workaround);

Shouldn't this be something like:

> +   do {
> +   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
> +   adev->mes.suspend_workaround = false;
> +   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
> +   } while (adev->mes.suspend_workaround);

If we actually need the flush.  Maybe a better approach would be to
check if we are in s0ix in

diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c in gmc_v11_0_flush_gpu_tlb():
index 23d7b548d13f..bd6d9953a80e 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -227,7 +227,8 @@ static void gmc_v11_0_flush_gpu_tlb(struct
amdgpu_device *adev, uint32_t vmid,
 * Directly use kiq to do the vm invalidation instead
 */
if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring.sched.ready) &&
-   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) {
+   (amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev)) ||
+   !adev->in_s0ix) {
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
1 << vmid, GET_INST(GC, 0));
return;

@Christian Koenig is this logic correct?

/* For SRIOV run time, driver shouldn't access the register
through MMIO
 * Directly use kiq to do the vm invalidation instead
 */
if ((adev->gfx.kiq[0].ring.sched.ready || adev->mes.ring.sched.ready) &&
(amdgpu_sriov_runtime(adev) || !amdgpu_sriov_vf(adev))) {
amdgpu_virt_kiq_reg_write_reg_wait(adev, req, ack, inv_req,
1 << vmid, GET_INST(GC, 0));
return;
}

We basically always use the MES with that logic.  If that is the case,
we should just drop the rest of that function.  Shouldn't we only use
KIQ or MES for SR-IOV?  gmc v10 has similar logic which also seems
wrong.

Alex


>
> DRM_INFO("PCIE GART of %uM enabled (table at 0x%016llX).\n",
>  (unsigned int)(adev->gmc.gart_size >> 

Re: [PATCH v2] drm/radeon: Prevent multiple debug error lines on suspend

2023-12-13 Thread Alex Deucher
Applied manually.  Please double check how you are sending the
patches.  git complained about a malformed patch.  I'd suggest using
git-send-email.

Thanks,

Alex

On Wed, Dec 13, 2023 at 1:45 AM Christian König
 wrote:
>
> Am 13.12.23 um 00:31 schrieb Woody Suwalski:
> > Fix to avoid multiple debug error lines printed on every suspend by
> > Radeon driver's debugfs.
> >
> > radeon_debugfs_init() calls debugfs_create_file() for every ring.
> >
> > This results in printing multiple error lines to the screen and dmesg
> > similar to this:
> >
> > [   92.378726] debugfs: File 'radeon_ring_gfx' in directory
> > ':00:01.0' already present!
> > [   92.378732] debugfs: File 'radeon_ring_cp1' in directory
> > ':00:01.0' already present!
> > [   92.378734] debugfs: File 'radeon_ring_cp2' in directory
> > ':00:01.0' already present!
> > [   92.378737] debugfs: File 'radeon_ring_dma1' in directory
> > ':00:01.0' already present!
> > [   92.378739] debugfs: File 'radeon_ring_dma2' in directory
> > ':00:01.0' already present!
> > [   92.380775] debugfs: File 'radeon_ring_uvd' in directory
> > ':00:01.0' already present!
> > [   92.406620] debugfs: File 'radeon_ring_vce1' in directory
> > ':00:01.0' already present!
> > [   92.406624] debugfs: File 'radeon_ring_vce2' in directory
> > ':00:01.0' already present!
> >
> >
> > Patch v1: The fix was to run lookup() for the file before trying to
> > (re)create that debug file.
> > Patch v2: Call the radeon_debugfs_init() only once, when the radeon ring is
> > initialized (as suggested by Christian K. - thanks)
> >
> > Signed-off-by: Woody Suwalski 
>
> Reviewed-by: Christian König 
>
> Thanks for the help,
> Christian.
>
> >
> > diff --git a/drivers/gpu/drm/radeon/radeon_ring.c
> > b/drivers/gpu/drm/radeon/radeon_ring.c
> > index e6534fa9f1fb..38048593bb4a 100644
> > --- a/drivers/gpu/drm/radeon/radeon_ring.c
> > +++ b/drivers/gpu/drm/radeon/radeon_ring.c
> > @@ -413,6 +413,7 @@ int radeon_ring_init(struct radeon_device *rdev,
> > struct radeon_ring *ring, unsig
> >  dev_err(rdev->dev, "(%d) ring map failed\n", r);
> >  return r;
> >  }
> > +radeon_debugfs_ring_init(rdev, ring);
> >  }
> >  ring->ptr_mask = (ring->ring_size / 4) - 1;
> >  ring->ring_free_dw = ring->ring_size / 4;
> > @@ -421,7 +422,6 @@ int radeon_ring_init(struct radeon_device *rdev,
> > struct radeon_ring *ring, unsig
> >  ring->next_rptr_gpu_addr = rdev->wb.gpu_addr + index;
> >  ring->next_rptr_cpu_addr = >wb.wb[index/4];
> >  }
> > -radeon_debugfs_ring_init(rdev, ring);
> >  radeon_ring_lockup_update(rdev, ring);
> >  return 0;
> >  }
> >
>


[PATCH] drm/amd: Add a workaround for GFX11 systems that fail to flush TLB

2023-12-13 Thread Mario Limonciello
Some systems with MP1 13.0.4 or 13.0.11 have a firmware bug that
causes the first MES packet after resume to fail. This packet is
used to flush the TLB when GART is enabled.

This issue is fixed in newer firmware, but as OEMs may not roll this
out to the field, introduce a workaround that will retry the flush
when running on an older firmware is detected, and demote relevant
error messages to debug level while the workaround is in use.

Cc: sta...@vger.kernel.org # 6.1+
Cc: Tim Huang 
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3045
Signed-off-by: Mario Limonciello 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 10 --
 drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h |  2 ++
 drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c  | 17 -
 drivers/gpu/drm/amd/amdgpu/mes_v11_0.c  |  8 ++--
 4 files changed, 32 insertions(+), 5 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 9ddbf1494326..6ce3f6e6b6de 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -836,8 +836,14 @@ int amdgpu_mes_reg_write_reg_wait(struct amdgpu_device 
*adev,
}
 
r = adev->mes.funcs->misc_op(>mes, _input);
-   if (r)
-   DRM_ERROR("failed to reg_write_reg_wait\n");
+   if (r) {
+   const char *msg = "failed to reg_write_reg_wait\n";
+
+   if (adev->mes.suspend_workaround)
+   DRM_DEBUG(msg);
+   else
+   DRM_ERROR(msg);
+   }
 
 error:
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index a27b424ffe00..90f2bba3b12b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -135,6 +135,8 @@ struct amdgpu_mes {
 
/* ip specific functions */
const struct amdgpu_mes_funcs   *funcs;
+
+   boolsuspend_workaround;
 };
 
 struct amdgpu_mes_process {
diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
index 23d7b548d13f..e810c7bb3156 100644
--- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c
@@ -889,7 +889,11 @@ static int gmc_v11_0_gart_enable(struct amdgpu_device 
*adev)
false : true;
 
adev->mmhub.funcs->set_fault_enable_default(adev, value);
-   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+
+   do {
+   gmc_v11_0_flush_gpu_tlb(adev, 0, AMDGPU_MMHUB0(0), 0);
+   adev->mes.suspend_workaround = false;
+   } while (adev->mes.suspend_workaround);
 
DRM_INFO("PCIE GART of %uM enabled (table at 0x%016llX).\n",
 (unsigned int)(adev->gmc.gart_size >> 20),
@@ -960,6 +964,17 @@ static int gmc_v11_0_resume(void *handle)
int r;
struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+   switch (amdgpu_ip_version(adev, MP1_HWIP, 0)) {
+   case IP_VERSION(13, 0, 4):
+   case IP_VERSION(13, 0, 11):
+   /* avoid problems with first TLB flush after resume */
+   if ((adev->pm.fw_version & 0x00FF) < 0x004c4900)
+   adev->mes.suspend_workaround = adev->in_s0ix;
+   break;
+   default:
+   break;
+   }
+
r = gmc_v11_0_hw_init(adev);
if (r)
return r;
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
index 4dfec56e1b7f..84ab8c611e5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_v11_0.c
@@ -137,8 +137,12 @@ static int mes_v11_0_submit_pkt_and_poll_completion(struct 
amdgpu_mes *mes,
r = amdgpu_fence_wait_polling(ring, ring->fence_drv.sync_seq,
  timeout);
if (r < 1) {
-   DRM_ERROR("MES failed to response msg=%d\n",
- x_pkt->header.opcode);
+   if (mes->suspend_workaround)
+   DRM_DEBUG("MES failed to response msg=%d\n",
+ x_pkt->header.opcode);
+   else
+   DRM_ERROR("MES failed to response msg=%d\n",
+ x_pkt->header.opcode);
 
while (halt_if_hws_hang)
schedule();
-- 
2.34.1



Re: [PATCH 1/2] drm/amdgpu: increase hmm range get pages timeout

2023-12-13 Thread James Zhu



On 2023-12-13 11:23, Felix Kuehling wrote:


On 2023-12-13 10:24, James Zhu wrote:

Ping ...

On 2023-12-08 18:01, James Zhu wrote:

When an application tries to allocate all system memory and causes memory
to swap out, hmm_range_fault needs more time to validate the
remaining pages for allocation. To be safe, increase the timeout value to
1 second per 64MB range.

Signed-off-by: James Zhu 


This is not the first time we're incrementing this timeout. Eventually 
we should get rid of that and find a way to make this work reliably 
without a timeout. There can always be situations where faults take 
longer, and we should not fail randomly in those cases.


There are also some FIXMEs in this code that should be addressed at 
the same time.


That said, as a short-term fix, this patch is

[JZ] Yes, it is just a short-term fix. The root cause is still under study.


Acked-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c

index 081267161d40..b24eb5821fd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
@@ -190,8 +190,8 @@ int amdgpu_hmm_range_get_pages(struct 
mmu_interval_notifier *notifier,

  pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
  hmm_range->start, hmm_range->end);
  -    /* Assuming 128MB takes maximum 1 second to fault page 
address */

-    timeout = max((hmm_range->end - hmm_range->start) >> 27, 1UL);
+    /* Assuming 64MB takes maximum 1 second to fault page 
address */

+    timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
  timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
  timeout = jiffies + msecs_to_jiffies(timeout);


Re: [RFC PATCH 06/12] LoongArch: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-13 Thread WANG Xuerui

On 12/8/23 13:54, Samuel Holland wrote:

LoongArch already provides kernel_fpu_begin() and kernel_fpu_end() in
asm/fpu.h, so it only needs to add kernel_fpu_available() and export
the CFLAGS adjustments.

Signed-off-by: Samuel Holland 
---

  arch/loongarch/Kconfig   | 1 +
  arch/loongarch/Makefile  | 5 -
  arch/loongarch/include/asm/fpu.h | 1 +
  3 files changed, 6 insertions(+), 1 deletion(-)


This is all intuitive wrapping, so:

Acked-by: WANG Xuerui 

Thanks!

--
WANG "xen0n" Xuerui

Linux/LoongArch mailing list: https://lore.kernel.org/loongarch/



Re: [PATCH 1/2] drm/amdgpu: increase hmm range get pages timeout

2023-12-13 Thread Felix Kuehling



On 2023-12-13 10:24, James Zhu wrote:

Ping ...

On 2023-12-08 18:01, James Zhu wrote:

When an application tries to allocate all system memory and causes memory
to swap out, hmm_range_fault needs more time to validate the
remaining pages for allocation. To be safe, increase the timeout value to
1 second per 64MB range.

Signed-off-by: James Zhu 


This is not the first time we're incrementing this timeout. Eventually 
we should get rid of that and find a way to make this work reliably 
without a timeout. There can always be situations where faults take 
longer, and we should not fail randomly in those cases.


There are also some FIXMEs in this code that should be addressed at the 
same time.


That said, as a short-term fix, this patch is

Acked-by: Felix Kuehling 



---
  drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c

index 081267161d40..b24eb5821fd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
@@ -190,8 +190,8 @@ int amdgpu_hmm_range_get_pages(struct 
mmu_interval_notifier *notifier,

  pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
  hmm_range->start, hmm_range->end);
  -    /* Assuming 128MB takes maximum 1 second to fault page 
address */

-    timeout = max((hmm_range->end - hmm_range->start) >> 27, 1UL);
+    /* Assuming 64MB takes maximum 1 second to fault page 
address */

+    timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
  timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
  timeout = jiffies + msecs_to_jiffies(timeout);


Re: [RFC PATCH 04/12] arm64: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-13 Thread Will Deacon
On Thu, Dec 07, 2023 at 09:54:34PM -0800, Samuel Holland wrote:
> arm64 provides an equivalent to the common kernel-mode FPU API, but in a
> different header and using different function names. Add a wrapper
> header, and export CFLAGS adjustments as found in lib/raid6/Makefile.
> 
> Signed-off-by: Samuel Holland 
> ---
> 
>  arch/arm64/Kconfig   |  1 +
>  arch/arm64/Makefile  |  9 -
>  arch/arm64/include/asm/fpu.h | 17 +
>  3 files changed, 26 insertions(+), 1 deletion(-)
>  create mode 100644 arch/arm64/include/asm/fpu.h
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 7b071a00425d..485ac389ac11 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -30,6 +30,7 @@ config ARM64
>   select ARCH_HAS_GCOV_PROFILE_ALL
>   select ARCH_HAS_GIGANTIC_PAGE
>   select ARCH_HAS_KCOV
> + select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON
>   select ARCH_HAS_KEEPINITRD
>   select ARCH_HAS_MEMBARRIER_SYNC_CORE
>   select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
> diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
> index 9a2d3723cd0f..4a65f24c7998 100644
> --- a/arch/arm64/Makefile
> +++ b/arch/arm64/Makefile
> @@ -36,7 +36,14 @@ ifeq ($(CONFIG_BROKEN_GAS_INST),y)
>  $(warning Detected assembler with broken .inst; disassembly will be 
> unreliable)
>  endif
>  
> -KBUILD_CFLAGS+= -mgeneral-regs-only  \
> +# The GCC option -ffreestanding is required in order to compile code 
> containing
> +# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
> +CC_FLAGS_FPU := -ffreestanding
> +# Enable 
> +CC_FLAGS_FPU += -isystem $(shell $(CC) -print-file-name=include)
> +CC_FLAGS_NO_FPU  := -mgeneral-regs-only
> +
> +KBUILD_CFLAGS+= $(CC_FLAGS_NO_FPU) \
>  $(compat_vdso) $(cc_has_k_constraint)
>  KBUILD_CFLAGS+= $(call cc-disable-warning, psabi)
>  KBUILD_AFLAGS+= $(compat_vdso)

Can you use this to replace the same logic in arch/arm64/lib/Makefile,
like you do for arch/arm/?

Will


RE: [PATCH v2 03/23] drm/amdkfd: enable pc sampling query

2023-12-13 Thread Yat Sin, David
[AMD Official Use Only - General]



From: Zhu, James 
Sent: Wednesday, December 13, 2023 10:41 AM
To: Yat Sin, David ; Zhu, James ; 
amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix ; Greathouse, Joseph 

Subject: Re: [PATCH v2 03/23] drm/amdkfd: enable pc sampling query



On 2023-12-12 19:55, Yat Sin, David wrote:

[AMD Official Use Only - General]



-Original Message-

From: Zhu, James 

Sent: Thursday, December 7, 2023 5:54 PM

To: amd-gfx@lists.freedesktop.org

Cc: Kuehling, Felix ; 
Greathouse, Joseph

; Yat Sin, David 
;

Zhu, James 

Subject: [PATCH v2 03/23] drm/amdkfd: enable pc sampling query



From: David Yat Sin 



Enable pc sampling to query system capability.



Co-developed-by: James Zhu 

Signed-off-by: James Zhu 

Signed-off-by: David Yat Sin 

---

 drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 54

+++-

 1 file changed, 53 insertions(+), 1 deletion(-)



diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

index a7e78ff42d07..49fecbc7013e 100644

--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c

@@ -25,10 +25,62 @@

 #include "amdgpu_amdkfd.h"

 #include "kfd_pc_sampling.h"



+struct supported_pc_sample_info {

+ uint32_t ip_version;

+ const struct kfd_pc_sample_info *sample_info; };

+

+const struct kfd_pc_sample_info sample_info_hosttrap_9_0_0 = {

+ 0, 1, ~0ULL, 0, KFD_IOCTL_PCS_METHOD_HOSTTRAP,

+KFD_IOCTL_PCS_TYPE_TIME_US };

+

+struct supported_pc_sample_info supported_formats[] = {

+ { IP_VERSION(9, 4, 1), _info_hosttrap_9_0_0 },

+ { IP_VERSION(9, 4, 2), _info_hosttrap_9_0_0 }, };

+

 static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,

  struct kfd_ioctl_pc_sample_args

__user *user_args)  {

- return -EINVAL;

+ uint64_t sample_offset;

+ int num_method = 0;

+ int i;

+

+ for (i = 0; i < ARRAY_SIZE(supported_formats); i++)

+ if (KFD_GC_VERSION(pdd->dev) ==

supported_formats[i].ip_version)

+ num_method++;

+

+ if (!num_method) {

+ pr_debug("PC Sampling not supported on GC_HWIP:0x%x.",

+ pdd->dev->adev->ip_versions[GC_HWIP][0]);

+ return -EOPNOTSUPP;

+ }

+

+ if (!user_args->sample_info_ptr) {

Should be:

if (!user_args->sample_info_ptr || !user_args->num_sample_info) {



+ user_args->num_sample_info = num_method;

+ return 0;

+ }

+

+ if (user_args->num_sample_info < num_method) {

+ user_args->num_sample_info = num_method;

+ pr_debug("Sample info buffer is not large enough, "

+  "ASIC requires space for %d kfd_pc_sample_info

entries.", num_method);

+ return -ENOSPC;

+ }

+

+ sample_offset = user_args->sample_info_ptr;



If there is another active PC Sampling session that is active, I thought we 
were planning to have code to

return a reduced list with only the methods that are compatible with the 
current active session. Did we

decide to drop this behavior?
[JZ] Has the design changed here? I thought we allow sharing the same active 
PC Sampling session between multiple processes.

[DavidYS] That was part of the original design.
For example, if you already have an active session that is taking samples at 
interval of 500ms, and a user does a capabilities query, then the
capabilities query should only return a range of 500ms, instead of the full 
interval range.





Regards,

David



+ for (i = 0; i < ARRAY_SIZE(supported_formats); i++) {

+ if (KFD_GC_VERSION(pdd->dev) ==

supported_formats[i].ip_version) {

+ int ret = copy_to_user((void __user *) sample_offset,

+ supported_formats[i].sample_info,

sizeof(struct kfd_pc_sample_info));

+ if (ret) {

+ pr_debug("Failed to copy PC sampling info to

user.");

+ return -EFAULT;

+ }

+ sample_offset += sizeof(struct kfd_pc_sample_info);

+ }

+ }

+

+ return 0;

 }



 static int kfd_pc_sample_start(struct kfd_process_device *pdd)

--

2.25.1




Re: [PATCH 2/2] drm/amdgpu: Enable clear page functionality

2023-12-13 Thread Michel Dänzer
On 2023-12-13 16:39, Felix Kuehling wrote:
> On 2023-12-13 9:20, Christian König wrote:
>> Am 12.12.23 um 00:32 schrieb Felix Kuehling:
>>> On 2023-12-11 04:50, Christian König wrote:
 Am 08.12.23 um 20:53 schrieb Alex Deucher:
> [SNIP]
>> You also need a functionality which resets all cleared blocks to
>> uncleared after suspend/resume.
>>
>> No idea how to do this, maybe Alex knows of hand.
> Since the buffers are cleared on creation, is there actually anything to 
> do?

 Well exactly that's the problem, the buffers are no longer always cleared 
 on creation with this patch.

 Instead we clear on free, track which areas are cleared and clear only the 
 ones which aren't cleared yet on creation.
>>>
>>> The code I added for clearing-on-free a long time ago, does not clear to 0, 
>>> but to a non-0 poison value. That was meant to make it easier to catch 
>>> applications incorrectly relying on 0-initialized memory. Is that being 
>>> changed? I didn't see it in this patch series.
>>
>> Yeah, Arun stumbled over that as well. Any objections that we fill with 
>> zeros instead or is that poison value something necessary for debugging?
> 
> I don't think it's strictly necessary. But it may encourage sloppy user mode 
> programming to rely on 0-initialized memory that ends up breaking in corner 
> cases or on older kernels.

From a security PoV, the kernel should never return uncleared memory to (at 
least unprivileged) user space. This series seems like a big step in that 
direction.


-- 
Earthling Michel Dänzer|  https://redhat.com
Libre software enthusiast  | Mesa and Xwayland developer



Re: [PATCH 2/2] drm/amdgpu: Enable clear page functionality

2023-12-13 Thread Christian König

Am 13.12.23 um 16:39 schrieb Felix Kuehling:

On 2023-12-13 9:20, Christian König wrote:

Am 12.12.23 um 00:32 schrieb Felix Kuehling:


On 2023-12-11 04:50, Christian König wrote:

Am 08.12.23 um 20:53 schrieb Alex Deucher:

[SNIP]

You also need a functionality which resets all cleared blocks to
uncleared after suspend/resume.

No idea how to do this, maybe Alex knows of hand.
Since the buffers are cleared on creation, is there actually 
anything to do?


Well exactly that's the problem, the buffers are no longer always 
cleared on creation with this patch.


Instead we clear on free, track which areas are cleared and clear 
only the ones which aren't cleared yet on creation.


The code I added for clearing-on-free a long time ago, does not 
clear to 0, but to a non-0 poison value. That was meant to make it 
easier to catch applications incorrectly relying on 0-initialized 
memory. Is that being changed? I didn't see it in this patch series.


Yeah, Arun stumbled over that as well. Any objections that we fill 
with zeros instead or is that poison value something necessary for 
debugging?


I don't think it's strictly necessary. But it may encourage sloppy 
user mode programming to rely on 0-initialized memory that ends up 
breaking in corner cases or on older kernels.


That said, I see that this patch series adds clearing of memory in the 
VRAM manager, but it doesn't remove the clearing for 
AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE in amdgpu_bo_release_notify and 
amdgpu_move_blit. This will lead to duplicate work.


I'm also not sure how the clearing added in this patch series will 
affect free-latency observed in user mode. Will this be synchronous 
and cause the user mode thread to stall while the memory is being 
cleared?


Yeah, that's not fully working at the moment. I already pointed out as 
well that Arun should remove the clearing in the VRAM manager.


Regards,
Christian.



Regards,
  Felix




Regards,
Christian.



Regards,
  Felix




So some cases need special handling. E.g. when the engine is not 
initialized yet or suspend/resume.


In theory after a suspend/resume cycle the VRAM is cleared to 
zeros, but in practice that's not always true.


Christian.


Alex






Re: [PATCH v2 03/23] drm/amdkfd: enable pc sampling query

2023-12-13 Thread James Zhu


On 2023-12-12 19:55, Yat Sin, David wrote:

[AMD Official Use Only - General]


-Original Message-
From: Zhu, James
Sent: Thursday, December 7, 2023 5:54 PM
To:amd-gfx@lists.freedesktop.org
Cc: Kuehling, Felix; Greathouse, Joseph
; Yat Sin, David;
Zhu, James
Subject: [PATCH v2 03/23] drm/amdkfd: enable pc sampling query

From: David Yat Sin

Enable pc sampling to query system capability.

Co-developed-by: James Zhu
Signed-off-by: James Zhu
Signed-off-by: David Yat Sin
---
  drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c | 54
+++-
  1 file changed, 53 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
index a7e78ff42d07..49fecbc7013e 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_pc_sampling.c
@@ -25,10 +25,62 @@
  #include "amdgpu_amdkfd.h"
  #include "kfd_pc_sampling.h"

+struct supported_pc_sample_info {
+ uint32_t ip_version;
+ const struct kfd_pc_sample_info *sample_info; };
+
+const struct kfd_pc_sample_info sample_info_hosttrap_9_0_0 = {
+ 0, 1, ~0ULL, 0, KFD_IOCTL_PCS_METHOD_HOSTTRAP,
+KFD_IOCTL_PCS_TYPE_TIME_US };
+
+struct supported_pc_sample_info supported_formats[] = {
+ { IP_VERSION(9, 4, 1), _info_hosttrap_9_0_0 },
+ { IP_VERSION(9, 4, 2), _info_hosttrap_9_0_0 }, };
+
  static int kfd_pc_sample_query_cap(struct kfd_process_device *pdd,
   struct kfd_ioctl_pc_sample_args
__user *user_args)  {
- return -EINVAL;
+ uint64_t sample_offset;
+ int num_method = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(supported_formats); i++)
+ if (KFD_GC_VERSION(pdd->dev) ==
supported_formats[i].ip_version)
+ num_method++;
+
+ if (!num_method) {
+ pr_debug("PC Sampling not supported on GC_HWIP:0x%x.",
+ pdd->dev->adev->ip_versions[GC_HWIP][0]);
+ return -EOPNOTSUPP;
+ }
+
+ if (!user_args->sample_info_ptr) {

Should be:
if (!user_args->sample_info_ptr || !user_args->num_sample_info) {


+ user_args->num_sample_info = num_method;
+ return 0;
+ }
+
+ if (user_args->num_sample_info < num_method) {
+ user_args->num_sample_info = num_method;
+ pr_debug("Sample info buffer is not large enough, "
+  "ASIC requires space for %d kfd_pc_sample_info
entries.", num_method);
+ return -ENOSPC;
+ }
+
+ sample_offset = user_args->sample_info_ptr;

If there is another active PC Sampling session that is active, I thought we 
were planning to have code to
return a reduced list with only the methods that are compatible with the 
current active session. Did we
decide to drop this behavior?
[JZ] Has the design changed here? I thought we allow sharing the 
same active PC Sampling session between multiple processes.


Regards,
David


+ for (i = 0; i < ARRAY_SIZE(supported_formats); i++) {
+ if (KFD_GC_VERSION(pdd->dev) ==
supported_formats[i].ip_version) {
+ int ret = copy_to_user((void __user *) sample_offset,
+ supported_formats[i].sample_info,
sizeof(struct kfd_pc_sample_info));
+ if (ret) {
+ pr_debug("Failed to copy PC sampling info to
user.");
+ return -EFAULT;
+ }
+ sample_offset += sizeof(struct kfd_pc_sample_info);
+ }
+ }
+
+ return 0;
  }

  static int kfd_pc_sample_start(struct kfd_process_device *pdd)
--
2.25.1

Re: [PATCH 2/2] drm/amdgpu: Enable clear page functionality

2023-12-13 Thread Felix Kuehling

On 2023-12-13 9:20, Christian König wrote:

Am 12.12.23 um 00:32 schrieb Felix Kuehling:


On 2023-12-11 04:50, Christian König wrote:

Am 08.12.23 um 20:53 schrieb Alex Deucher:

[SNIP]

You also need a functionality which resets all cleared blocks to
uncleared after suspend/resume.

No idea how to do this, maybe Alex knows of hand.
Since the buffers are cleared on creation, is there actually 
anything to do?


Well exactly that's the problem, the buffers are no longer always 
cleared on creation with this patch.


Instead we clear on free, track which areas are cleared and clear 
only the ones which aren't cleared yet on creation.


The code I added for clearing-on-free a long time ago, does not clear 
to 0, but to a non-0 poison value. That was meant to make it easier 
to catch applications incorrectly relying on 0-initialized memory. Is 
that being changed? I didn't see it in this patch series.


Yeah, Arun stumbled over that as well. Any objections that we fill 
with zeros instead or is that poison value something necessary for 
debugging?


I don't think it's strictly necessary. But it may encourage sloppy user 
mode programming to rely on 0-initialized memory that ends up breaking 
in corner cases or on older kernels.


That said, I see that this patch series adds clearing of memory in the 
VRAM manager, but it doesn't remove the clearing for 
AMDGPU_GEM_CREATE_VRAM_WIPE_ON_RELEASE in amdgpu_bo_release_notify and 
amdgpu_move_blit. This will lead to duplicate work.


I'm also not sure how the clearing added in this patch series will 
affect free-latency observed in user mode. Will this be synchronous and 
cause the user mode thread to stall while the memory is being cleared?


Regards,
  Felix




Regards,
Christian.



Regards,
  Felix




So some cases need special handling. E.g. when the engine is not 
initialized yet or suspend/resume.


In theory after a suspend/resume cycle the VRAM is cleared to zeros, 
but in practice that's not always true.


Christian.


Alex




Re: [PATCH 1/2] drm/amdgpu: increase hmm range get pages timeout

2023-12-13 Thread James Zhu

Ping ...

On 2023-12-08 18:01, James Zhu wrote:

When an application tries to allocate all system memory and causes memory
to swap out, hmm_range_fault needs more time to validate the
remaining pages for allocation. To be safe, increase the timeout value to
1 second per 64MB range.

Signed-off-by: James Zhu 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
index 081267161d40..b24eb5821fd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
@@ -190,8 +190,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier 
*notifier,
pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
hmm_range->start, hmm_range->end);
  
-		/* Assuming 128MB takes maximum 1 second to fault page address */

-   timeout = max((hmm_range->end - hmm_range->start) >> 27, 1UL);
+   /* Assuming 64MB takes maximum 1 second to fault page address */
+   timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
timeout = jiffies + msecs_to_jiffies(timeout);
  


Re: [PATCH] Revert "drm/amd/display: Adjust the MST resume flow"

2023-12-13 Thread Mario Limonciello

On 12/13/2023 08:17, Alex Deucher wrote:

On Tue, Dec 12, 2023 at 9:00 PM Mario Limonciello
 wrote:


On 12/12/2023 18:08, Oliver Schmidt wrote:

Hi Wayne,

On 12.12.23 17:06, Mario Limonciello wrote:

I looked through your bugs related to this and I didn't see a reference to the
specific docking station model.
The logs mentioned "Thinkpad dock" but no model.
Could you share more about it so that AMD can try to reproduce it?


Yes, it is a ThinkPad Ultra Dockingstation, part number 40AJ0135EU, see also
https://support.lenovo.com/us/en/solutions/pd500173-thinkpad-ultra-docking-station-overview-and-service-parts



By chance do you have access to any other dock or monitor combinations
that you can conclude it only happens on this dock or only a certain
monitor, or only a certain monitor connected to this dock?


IIRC, Wayne's patch was to fix an HP dock, I suspect reverting this
will just break one dock to fix another.  Wayne, do you have the other
problematic dock that this patch was needed to fix or even the
thinkpad dock?  Would be nice to properly sort this out if possible.



Oliver responded back that a firmware update for the problematic dock 
fixed it.


So in that case I'll revert it in amd-staging-drm-next.


Alex




Best regards,
Oliver

On 12.12.23 17:06, Mario Limonciello wrote:

On 12/12/2023 04:10, Lin, Wayne wrote:

[Public]

Hi Mario,

Thanks for the help.
My feeling is like this problem probably relates to specific dock. Need time
to take
further look.


Oliver,

I looked through your bugs related to this and I didn't see a reference to the
specific docking station model.
The logs mentioned "Thinkpad dock" but no model.

Could you share more about it so that AMD can try to reproduce it?



Since reverting solves the issue now, feel free to add:
Acked-by: Wayne Lin 


Sure, thanks.



Thanks,
Wayne


-Original Message-
From: Limonciello, Mario 
Sent: Tuesday, December 12, 2023 12:15 AM
To: amd-gfx@lists.freedesktop.org; Wentland, Harry

Cc: Linux Regressions ; sta...@vger.kernel.org;
Wheeler, Daniel ; Lin, Wayne
; Oliver Schmidt 
Subject: Re: [PATCH] Revert "drm/amd/display: Adjust the MST resume flow"

Ping on this one.

On 12/5/2023 13:54, Mario Limonciello wrote:

This reverts commit ec5fa9fcdeca69edf7dab5ca3b2e0ceb1c08fe9a.

Reports are that this causes problems with external monitors after
wake up from suspend, which is something it was directly supposed to help.

Cc: Linux Regressions 
Cc: sta...@vger.kernel.org
Cc: Daniel Wheeler 
Cc: Wayne Lin 
Reported-by: Oliver Schmidt 
Link: https://bugzilla.kernel.org/show_bug.cgi?id=218211
Link:
https://forum.manjaro.org/t/problems-with-external-monitor-wake-up-aft
er-suspend/151840
Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3023
Signed-off-by: Mario Limonciello 
---
 .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 93 +++--

--

 1 file changed, 13 insertions(+), 80 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index c146dc9cba92..1ba58e4ecab3 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -2363,62 +2363,14 @@ static int dm_late_init(void *handle)
   return detect_mst_link_for_all_connectors(adev_to_drm(adev));
 }

-static void resume_mst_branch_status(struct drm_dp_mst_topology_mgr
*mgr) -{
-   int ret;
-   u8 guid[16];
-   u64 tmp64;
-
-   mutex_lock(>lock);
-   if (!mgr->mst_primary)
-   goto out_fail;
-
-   if (drm_dp_read_dpcd_caps(mgr->aux, mgr->dpcd) < 0) {
-   drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during

suspend?\n");

-   goto out_fail;
-   }
-
-   ret = drm_dp_dpcd_writeb(mgr->aux, DP_MSTM_CTRL,
-DP_MST_EN |
-DP_UP_REQ_EN |
-DP_UPSTREAM_IS_SRC);
-   if (ret < 0) {
-   drm_dbg_kms(mgr->dev, "mst write failed - undocked during

suspend?\n");

-   goto out_fail;
-   }
-
-   /* Some hubs forget their guids after they resume */
-   ret = drm_dp_dpcd_read(mgr->aux, DP_GUID, guid, 16);
-   if (ret != 16) {
-   drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during

suspend?\n");

-   goto out_fail;
-   }
-
-   if (memchr_inv(guid, 0, 16) == NULL) {
-   tmp64 = get_jiffies_64();
-   memcpy([0], , sizeof(u64));
-   memcpy([8], , sizeof(u64));
-
-   ret = drm_dp_dpcd_write(mgr->aux, DP_GUID, guid, 16);
-
-   if (ret != 16) {
-   drm_dbg_kms(mgr->dev, "check mstb guid failed -

undocked during suspend?\n");

-   goto out_fail;
-   }
-   }
-
-   memcpy(mgr->mst_primary->guid, guid, 16);
-
-out_fail:
-   mutex_unlock(>lock);
-}
-
 static void s3_handle_mst(struct drm_device *dev, bool suspend)
 {
   struct amdgpu_dm_connector *aconnector;
   struct drm_connector *connector;
   struct 

Re: [PATCH v2 1/2] drm/amdgpu: Auto-validate DMABuf imports in compute VMs

2023-12-13 Thread Christian König

Am 06.12.23 um 22:44 schrieb Felix Kuehling:

DMABuf imports in compute VMs are not wrapped in a kgd_mem object on the
process_info->kfd_bo_list. There is no explicit KFD API call to validate
them or add eviction fences to them.

This patch automatically validates and fences dynamic DMABuf imports when
they are added to a compute VM. Revalidation after evictions is handled
in the VM code.

v2:
* Renamed amdgpu_vm_validate_evicted_bos to amdgpu_vm_validate
* Eliminated evicted_user state, use evicted state for VM BOs and user BOs
* Fixed and simplified amdgpu_vm_fence_imports, depends on reserved BOs
* Moved dma_resv_reserve_fences for amdgpu_vm_fence_imports into
   amdgpu_vm_validate, outside the vm->status_lock
* Added dummy version of amdgpu_amdkfd_bo_validate_and_fence for builds
   without KFD

Signed-off-by: Felix Kuehling 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h|  10 ++
  .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  |  44 +---
  drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c|   6 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c   |   4 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c   |  28 -
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c| 101 --
  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h|  10 +-
  drivers/gpu/drm/amd/amdkfd/kfd_svm.c  |  10 +-
  8 files changed, 177 insertions(+), 36 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
index f262b9d89541..584a0cea5572 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -191,6 +191,9 @@ struct amdgpu_amdkfd_fence *to_amdgpu_amdkfd_fence(struct 
dma_fence *f);
  int amdgpu_amdkfd_remove_fence_on_pt_pd_bos(struct amdgpu_bo *bo);
  int amdgpu_amdkfd_evict_userptr(struct mmu_interval_notifier *mni,
unsigned long cur_seq, struct kgd_mem *mem);
+int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
+   uint32_t domain,
+   struct dma_fence *fence);
  #else
  static inline
  bool amdkfd_fence_check_mm(struct dma_fence *f, struct mm_struct *mm)
@@ -216,6 +219,13 @@ int amdgpu_amdkfd_evict_userptr(struct 
mmu_interval_notifier *mni,
  {
return 0;
  }
+static inline
+int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
+   uint32_t domain,
+   struct dma_fence *fence)
+{
+   return 0;
+}
  #endif
  /* Shared API */
  int amdgpu_amdkfd_alloc_gtt_mem(struct amdgpu_device *adev, size_t size,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 5f445d856769..fbabb1e63a67 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -426,9 +426,9 @@ static int amdgpu_amdkfd_bo_validate(struct amdgpu_bo *bo, 
uint32_t domain,
return ret;
  }
  
-static int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,

-  uint32_t domain,
-  struct dma_fence *fence)
+int amdgpu_amdkfd_bo_validate_and_fence(struct amdgpu_bo *bo,
+   uint32_t domain,
+   struct dma_fence *fence)
  {
int ret = amdgpu_bo_reserve(bo, false);
  
@@ -464,13 +464,15 @@ static int amdgpu_amdkfd_validate_vm_bo(void *_unused, struct amdgpu_bo *bo)

   * again. Page directories are only updated after updating page
   * tables.
   */
-static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm)
+static int vm_validate_pt_pd_bos(struct amdgpu_vm *vm,
+struct ww_acquire_ctx *ticket)
  {
struct amdgpu_bo *pd = vm->root.bo;
struct amdgpu_device *adev = amdgpu_ttm_adev(pd->tbo.bdev);
int ret;
  
-	ret = amdgpu_vm_validate_pt_bos(adev, vm, amdgpu_amdkfd_validate_vm_bo, NULL);

+   ret = amdgpu_vm_validate(adev, vm, ticket,
+amdgpu_amdkfd_validate_vm_bo, NULL);
if (ret) {
pr_err("failed to validate PT BOs\n");
return ret;
@@ -1310,14 +1312,15 @@ static int map_bo_to_gpuvm(struct kgd_mem *mem,
return ret;
  }
  
-static int process_validate_vms(struct amdkfd_process_info *process_info)

+static int process_validate_vms(struct amdkfd_process_info *process_info,
+   struct ww_acquire_ctx *ticket)
  {
struct amdgpu_vm *peer_vm;
int ret;
  
  	list_for_each_entry(peer_vm, _info->vm_list_head,

vm_list_node) {
-   ret = vm_validate_pt_pd_bos(peer_vm);
+   ret = vm_validate_pt_pd_bos(peer_vm, ticket);
if (ret)
return ret;
}
@@ -1402,7 +1405,7 @@ static int 

RE: [PATCH 1/2] drm/amd/pm: support return vpe clock table

2023-12-13 Thread Ma, Li
[AMD Official Use Only - General]

Hi Peyton,

After the format problem fixed, this patch is
Reviewed-by: Li Ma 

BRs,
Li
-Original Message-
From: Lee, Peyton 
Sent: Wednesday, December 13, 2023 10:49 AM
To: amd-gfx@lists.freedesktop.org
Cc: Deucher, Alexander ; Zhang, Yifan 
; Ma, Li ; Yu, Lang ; 
Lee, Peyton 
Subject: [PATCH 1/2] drm/amd/pm: support return vpe clock table

pm supports return vpe clock table and soc clock table

Signed-off-by: Peyton Lee 
---
 drivers/gpu/drm/amd/display/dc/dm_pp_smu.h|  2 ++
 drivers/gpu/drm/amd/pm/amdgpu_dpm.c   | 10 ++
 drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h   |  1 +
 .../drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c  | 20 +++
 4 files changed, 33 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/dc/dm_pp_smu.h 
b/drivers/gpu/drm/amd/display/dc/dm_pp_smu.h
index 4440d08743aa..bd7ba0a25198 100644
--- a/drivers/gpu/drm/amd/display/dc/dm_pp_smu.h
+++ b/drivers/gpu/drm/amd/display/dc/dm_pp_smu.h
@@ -247,6 +247,7 @@ struct pp_smu_funcs_nv {  #define 
PP_SMU_NUM_MEMCLK_DPM_LEVELS  4
 #define PP_SMU_NUM_DCLK_DPM_LEVELS8
 #define PP_SMU_NUM_VCLK_DPM_LEVELS8
+#define PP_SMU_NUM_VPECLK_DPM_LEVELS  8

 struct dpm_clock {
   uint32_t  Freq;// In MHz
@@ -262,6 +263,7 @@ struct dpm_clocks {
struct dpm_clock MemClocks[PP_SMU_NUM_MEMCLK_DPM_LEVELS];
struct dpm_clock VClocks[PP_SMU_NUM_VCLK_DPM_LEVELS];
struct dpm_clock DClocks[PP_SMU_NUM_DCLK_DPM_LEVELS];
+   struct dpm_clock VPEClocks[PP_SMU_NUM_VPECLK_DPM_LEVELS];
 };


diff --git a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c 
b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
index 97b40c5fa1ff..6627ee07d52d 100644
--- a/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
+++ b/drivers/gpu/drm/amd/pm/amdgpu_dpm.c
@@ -616,6 +616,16 @@ void amdgpu_dpm_enable_jpeg(struct amdgpu_device *adev, 
bool enable)
  enable ? "enable" : "disable", ret);  }

+void amdgpu_dpm_enable_vpe(struct amdgpu_device *adev, bool enable) {
+   int ret = 0;
+
+   ret = amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_VPE, 
!enable);
+   if (ret)
+   DRM_ERROR("Dpm %s vpe failed, ret = %d.\n",
+ enable ? "enable" : "disable", ret); }
[Ma, Li]: here the location of "}" may need to be modified.
+
 int amdgpu_pm_load_smu_firmware(struct amdgpu_device *adev, uint32_t 
*smu_version)  {
const struct amd_pm_funcs *pp_funcs = adev->powerplay.pp_funcs; diff 
--git a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h 
b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
index d76b0a60db44..3047ffe7f244 100644
--- a/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
+++ b/drivers/gpu/drm/amd/pm/inc/amdgpu_dpm.h
@@ -445,6 +445,7 @@ void amdgpu_dpm_compute_clocks(struct amdgpu_device *adev); 
 void amdgpu_dpm_enable_uvd(struct amdgpu_device *adev, bool enable);  void 
amdgpu_dpm_enable_vce(struct amdgpu_device *adev, bool enable);  void 
amdgpu_dpm_enable_jpeg(struct amdgpu_device *adev, bool enable);
+void amdgpu_dpm_enable_vpe(struct amdgpu_device *adev, bool enable);
 int amdgpu_pm_load_smu_firmware(struct amdgpu_device *adev, uint32_t 
*smu_version);  int amdgpu_dpm_handle_passthrough_sbr(struct amdgpu_device 
*adev, bool enable);  int amdgpu_dpm_send_hbm_bad_pages_num(struct 
amdgpu_device *adev, uint32_t size); diff --git 
a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c 
b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c
index 94ccdbfd7090..47fdbae4adfc 100644
--- a/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c
+++ b/drivers/gpu/drm/amd/pm/swsmu/smu14/smu_v14_0_0_ppt.c
@@ -1085,6 +1085,25 @@ static int smu_v14_0_0_set_umsch_mm_enable(struct 
smu_context *smu,
   0, NULL);
 }

+static int smu_14_0_0_get_dpm_table(struct smu_context *smu, struct
+dpm_clocks *clock_table) {
+   DpmClocks_t *clk_table = smu->smu_table.clocks_table;
+   uint8_t idx;
+
+   /* Only the Clock information of SOC and VPE is copied to provide VPE 
DPM settings for use. */
+   for (idx = 0; idx < NUM_SOCCLK_DPM_LEVELS; idx++) {
+   clock_table->SocClocks[idx].Freq = (idx < 
clk_table->NumSocClkLevelsEnabled) ? clk_table->SocClocks[idx]:0;
+   clock_table->SocClocks[idx].Vol = 0;
+   }
+
+   for (idx = 0; idx < NUM_VPE_DPM_LEVELS; idx++) {
+   clock_table->VPEClocks[idx].Freq = (idx < 
clk_table->VpeClkLevelsEnabled) ? clk_table->VPEClocks[idx]:0;
+   clock_table->VPEClocks[idx].Vol = 0;
+   }
+
+   return 0;
+}
+
 static const struct pptable_funcs smu_v14_0_0_ppt_funcs = {
.check_fw_status = smu_v14_0_check_fw_status,
.check_fw_version = smu_v14_0_check_fw_version, @@ -1115,6 +1134,7 @@ 
static const struct pptable_funcs smu_v14_0_0_ppt_funcs = {
.set_gfx_power_up_by_imu = smu_v14_0_set_gfx_power_up_by_imu,
.dpm_set_vpe_enable = smu_v14_0_0_set_vpe_enable,
.dpm_set_umsch_mm_enable = 

Re: [PATCH 2/2] drm/amdgpu: Enable clear page functionality

2023-12-13 Thread Christian König

Am 12.12.23 um 00:32 schrieb Felix Kuehling:


On 2023-12-11 04:50, Christian König wrote:

Am 08.12.23 um 20:53 schrieb Alex Deucher:

[SNIP]

You also need a functionality which resets all cleared blocks to
uncleared after suspend/resume.

No idea how to do this, maybe Alex knows of hand.
Since the buffers are cleared on creation, is there actually 
anything to do?


Well exactly that's the problem, the buffers are no longer always 
cleared on creation with this patch.


Instead we clear on free, track which areas are cleared and clear 
only the ones which aren't cleared yet on creation.


The code I added for clearing-on-free a long time ago, does not clear 
to 0, but to a non-0 poison value. That was meant to make it easier to 
catch applications incorrectly relying on 0-initialized memory. Is 
that being changed? I didn't see it in this patch series.


Yeah, Arun stumbled over that as well. Any objections that we fill with 
zeros instead or is that poison value something necessary for debugging?


Regards,
Christian.



Regards,
  Felix




So some cases need special handling. E.g. when the engine is not 
initialized yet or suspend/resume.


In theory after a suspend/resume cycle the VRAM is cleared to zeros, 
but in practice that's not always true.


Christian.


Alex




Re: [PATCH 03/25] drm/amd/display: Revert " drm/amd/display: Use channel_width = 2 for vram table 3.0"

2023-12-13 Thread Alex Deucher
On Wed, Dec 13, 2023 at 3:40 AM Wayne Lin  wrote:
>
> From: Alvin Lee 
>
> [Description]
> Revert commit 8c5660987ee1 ("drm/amd/display: Use channel_width = 2 for vram 
> table 3.0")
> Because the issue is being fixed from VBIOS side.
>
> Reviewed-by: Samson Tam 
> Acked-by: Wayne Lin 
> Signed-off-by: Alvin Lee 

Acked-by: Alex Deucher 

> ---
>  drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c | 8 +---
>  1 file changed, 1 insertion(+), 7 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c 
> b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c
> index 875a064bb9a5..fcd65a2057ad 100644
> --- a/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c
> +++ b/drivers/gpu/drm/amd/display/dc/bios/bios_parser2.c
> @@ -2386,13 +2386,7 @@ static enum bp_result get_vram_info_v30(
> return BP_RESULT_BADBIOSTABLE;
>
> info->num_chans = info_v30->channel_num;
> -   /* As suggested by VBIOS we should always use
> -* dram_channel_width_bytes = 2 when using VRAM
> -* table version 3.0. This is because the channel_width
> -* param in the VRAM info table is changed in 7000 series and
> -* no longer represents the memory channel width.
> -*/
> -   info->dram_channel_width_bytes = 2;
> +   info->dram_channel_width_bytes = (1 << info_v30->channel_width) / 8;
>
> return result;
>  }
> --
> 2.37.3
>


Re: [PATCH] Revert "drm/amd/display: Adjust the MST resume flow"

2023-12-13 Thread Alex Deucher
On Tue, Dec 12, 2023 at 9:00 PM Mario Limonciello
 wrote:
>
> On 12/12/2023 18:08, Oliver Schmidt wrote:
> > Hi Wayne,
> >
> > On 12.12.23 17:06, Mario Limonciello wrote:
> >> I looked through your bugs related to this and I didn't see a reference to 
> >> the
> >> specific docking station model.
> >> The logs mentioned "Thinkpad dock" but no model.
> >> Could you share more about it so that AMD can try to reproduce it?
> >
> > Yes, it is a ThinkPad Ultra Dockingstation, part number 40AJ0135EU, see also
> > https://support.lenovo.com/us/en/solutions/pd500173-thinkpad-ultra-docking-station-overview-and-service-parts
> >
>
> By chance do you have access to any other dock or monitor combinations
> that you can conclude it only happens on this dock or only a certain
> monitor, or only a certain monitor connected to this dock?

IIRC, Wayne's patch was to fix an HP dock, I suspect reverting this
will just break one dock to fix another.  Wayne, do you have the other
problematic dock that this patch was needed to fix or even the
thinkpad dock?  Would be nice to properly sort this out if possible.

Alex

>
> > Best regards,
> > Oliver
> >
> > On 12.12.23 17:06, Mario Limonciello wrote:
> >> On 12/12/2023 04:10, Lin, Wayne wrote:
> >>> [Public]
> >>>
> >>> Hi Mario,
> >>>
> >>> Thanks for the help.
> >>> My feeling is like this problem probably relates to specific dock. Need 
> >>> time
> >>> to take
> >>> further look.
> >>
> >> Oliver,
> >>
> >> I looked through your bugs related to this and I didn't see a reference to 
> >> the
> >> specific docking station model.
> >> The logs mentioned "Thinkpad dock" but no model.
> >>
> >> Could you share more about it so that AMD can try to reproduce it?
> >>
> >>>
> >>> Since reverting solves the issue now, feel free to add:
> >>> Acked-by: Wayne Lin 
> >>
> >> Sure, thanks.
> >>
> >>>
> >>> Thanks,
> >>> Wayne
> >>>
>  -Original Message-
>  From: Limonciello, Mario 
>  Sent: Tuesday, December 12, 2023 12:15 AM
>  To: amd-gfx@lists.freedesktop.org; Wentland, Harry
>  
>  Cc: Linux Regressions ; 
>  sta...@vger.kernel.org;
>  Wheeler, Daniel ; Lin, Wayne
>  ; Oliver Schmidt 
>  Subject: Re: [PATCH] Revert "drm/amd/display: Adjust the MST resume flow"
> 
>  Ping on this one.
> 
>  On 12/5/2023 13:54, Mario Limonciello wrote:
> > This reverts commit ec5fa9fcdeca69edf7dab5ca3b2e0ceb1c08fe9a.
> >
> > Reports are that this causes problems with external monitors after
> > wake up from suspend, which is something it was directly supposed to 
> > help.
> >
> > Cc: Linux Regressions 
> > Cc: sta...@vger.kernel.org
> > Cc: Daniel Wheeler 
> > Cc: Wayne Lin 
> > Reported-by: Oliver Schmidt 
> > Link: https://bugzilla.kernel.org/show_bug.cgi?id=218211
> > Link:
> > https://forum.manjaro.org/t/problems-with-external-monitor-wake-up-aft
> > er-suspend/151840
> > Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3023
> > Signed-off-by: Mario Limonciello  > 
> > ---
> > .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 93 +++--
>  --
> > 1 file changed, 13 insertions(+), 80 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > index c146dc9cba92..1ba58e4ecab3 100644
> > --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
> > @@ -2363,62 +2363,14 @@ static int dm_late_init(void *handle)
> >   return detect_mst_link_for_all_connectors(adev_to_drm(adev));
> > }
> >
> > -static void resume_mst_branch_status(struct drm_dp_mst_topology_mgr
> > *mgr) -{
> > -   int ret;
> > -   u8 guid[16];
> > -   u64 tmp64;
> > -
> > -   mutex_lock(>lock);
> > -   if (!mgr->mst_primary)
> > -   goto out_fail;
> > -
> > -   if (drm_dp_read_dpcd_caps(mgr->aux, mgr->dpcd) < 0) {
> > -   drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during
>  suspend?\n");
> > -   goto out_fail;
> > -   }
> > -
> > -   ret = drm_dp_dpcd_writeb(mgr->aux, DP_MSTM_CTRL,
> > -DP_MST_EN |
> > -DP_UP_REQ_EN |
> > -DP_UPSTREAM_IS_SRC);
> > -   if (ret < 0) {
> > -   drm_dbg_kms(mgr->dev, "mst write failed - undocked during
>  suspend?\n");
> > -   goto out_fail;
> > -   }
> > -
> > -   /* Some hubs forget their guids after they resume */
> > -   ret = drm_dp_dpcd_read(mgr->aux, DP_GUID, guid, 16);
> > -   if (ret != 16) {
> > -   drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during
>  suspend?\n");
> > -   goto out_fail;
> > -   }
> > -
> > -   if (memchr_inv(guid, 0, 16) == NULL) {
> 

Re: [PATCH] Revert "drm/amd/display: Adjust the MST resume flow"

2023-12-13 Thread Oliver Schmidt
On 13.12.23 02:21, Mario Limonciello wrote:
> By chance do you have access to any other dock or monitor combinations that
> you can conclude it only happens on this dock or only a certain monitor, or
> only a certain monitor connected to this dock?

Mario, that was a good suggestion! I indeed had access to another docking
station, although it's the same type. I then tried this docking station with my
AMD X13 Thinkpad and it was working there.

It turned out, the working docking station had newer firmware. So I updated the
firmware and now it works :-)

Firmware Update: ThinkPad Docking Station Firmware Utility v3.3.4
(cs18dkfw334_web.exe) from https://pcsupport.lenovo.com/us/en/downloads/DS505699

How to resolve my issues on freedesktop.org and bugzilla.kernel.org?

Thank you for your support!

On 13.12.23 02:21, Mario Limonciello wrote:
> On 12/12/2023 18:08, Oliver Schmidt wrote:
>> Hi Wayne,
>>
>> On 12.12.23 17:06, Mario Limonciello wrote:
>>> I looked through your bugs related to this and I didn't see a reference to 
>>> the
>>> specific docking station model.
>>> The logs mentioned "Thinkpad dock" but no model.
>>> Could you share more about it so that AMD can try to reproduce it?
>>
>> Yes, it is a ThinkPad Ultra Dockingstation, part number 40AJ0135EU, see also
>> https://support.lenovo.com/us/en/solutions/pd500173-thinkpad-ultra-docking-station-overview-and-service-parts
>>
>>
> 
> By chance do you have access to any other dock or monitor combinations that 
> you
> can conclude it only happens on this dock or only a certain monitor, or only a
> certain monitor connected to this dock?
> 
>> Best regards,
>> Oliver
>>
>> On 12.12.23 17:06, Mario Limonciello wrote:
>>> On 12/12/2023 04:10, Lin, Wayne wrote:
 [Public]

 Hi Mario,

 Thanks for the help.
 My feeling is like this problem probably relates to specific dock. Need 
 time
 to take
 further look.
>>>
>>> Oliver,
>>>
>>> I looked through your bugs related to this and I didn't see a reference to 
>>> the
>>> specific docking station model.
>>> The logs mentioned "Thinkpad dock" but no model.
>>>
>>> Could you share more about it so that AMD can try to reproduce it?
>>>

 Since reverting solves the issue now, feel free to add:
 Acked-by: Wayne Lin 
>>>
>>> Sure, thanks.
>>>

 Thanks,
 Wayne

> -Original Message-
> From: Limonciello, Mario 
> Sent: Tuesday, December 12, 2023 12:15 AM
> To: amd-gfx@lists.freedesktop.org; Wentland, Harry
> 
> Cc: Linux Regressions ; 
> sta...@vger.kernel.org;
> Wheeler, Daniel ; Lin, Wayne
> ; Oliver Schmidt 
> Subject: Re: [PATCH] Revert "drm/amd/display: Adjust the MST resume flow"
>
> Ping on this one.
>
> On 12/5/2023 13:54, Mario Limonciello wrote:
>> This reverts commit ec5fa9fcdeca69edf7dab5ca3b2e0ceb1c08fe9a.
>>
>> Reports are that this causes problems with external monitors after
>> wake up from suspend, which is something it was directly supposed to 
>> help.
>>
>> Cc: Linux Regressions 
>> Cc: sta...@vger.kernel.org
>> Cc: Daniel Wheeler 
>> Cc: Wayne Lin 
>> Reported-by: Oliver Schmidt 
>> Link: https://bugzilla.kernel.org/show_bug.cgi?id=218211
>> Link:
>> https://forum.manjaro.org/t/problems-with-external-monitor-wake-up-aft
>> er-suspend/151840
>> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3023
>> Signed-off-by: Mario Limonciello > 
>> ---
>>     .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 93 +++--
> -- 
>>     1 file changed, 13 insertions(+), 80 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> index c146dc9cba92..1ba58e4ecab3 100644
>> --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
>> @@ -2363,62 +2363,14 @@ static int dm_late_init(void *handle)
>>   return detect_mst_link_for_all_connectors(adev_to_drm(adev));
>>     }
>>
>> -static void resume_mst_branch_status(struct drm_dp_mst_topology_mgr
>> *mgr) -{
>> -   int ret;
>> -   u8 guid[16];
>> -   u64 tmp64;
>> -
>> -   mutex_lock(>lock);
>> -   if (!mgr->mst_primary)
>> -   goto out_fail;
>> -
>> -   if (drm_dp_read_dpcd_caps(mgr->aux, mgr->dpcd) < 0) {
>> -   drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during
> suspend?\n");
>> -   goto out_fail;
>> -   }
>> -
>> -   ret = drm_dp_dpcd_writeb(mgr->aux, DP_MSTM_CTRL,
>> -    DP_MST_EN |
>> -    DP_UP_REQ_EN |
>> -    DP_UPSTREAM_IS_SRC);
>> -   if (ret < 0) {
>> -   drm_dbg_kms(mgr->dev, "mst write failed - undocked during
> suspend?\n");
>> -   goto 

Re: [PATCH v3] drm/amdgpu: fix ftrace event amdgpu_bo_move always move on same heap

2023-12-13 Thread Christian König

Am 13.12.23 um 08:27 schrieb Wang, Beyond:


[AMD Official Use Only - General]


Issue: during an evict or validate on an amdgpu_bo, the 'from' and

'to' are always the same in the ftrace event amdgpu_bo_move

where calling the 'trace_amdgpu_bo_move', the comment says move_notify

is called before move happens, but actually it is called after move

happens, here the new_mem is same as bo->resource

Fix: move trace_amdgpu_bo_move from move_notify to amdgpu_bo_move

Signed-off-by: Wang, Beyond wang.bey...@amd.com



Yeah, that makes much more sense. Reviewed-by: Christian König 



Regards,
Christian.


---

drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 13 +

drivers/gpu/drm/amd/amdgpu/amdgpu_object.h |  4 +---

drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c |  5 +++--

3 files changed, 5 insertions(+), 17 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c


index 7416799750c1..1870775d582c 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c

@@ -1282,19 +1282,15 @@ int amdgpu_bo_get_metadata(struct amdgpu_bo 
*bo, void *buffer,


  * amdgpu_bo_move_notify - notification about a memory move

  * @bo: pointer to a buffer object

  * @evict: if this move is evicting the buffer from the graphics 
address space


- * @new_mem: new information of the bufer object

  *

  * Marks the corresponding _bo buffer object as invalid, also 
performs


  * bookkeeping.

  * TTM driver callback which is called when ttm moves a buffer.

  */

-void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,

-  bool evict,

-  struct ttm_resource *new_mem)

+void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict)

{

    struct amdgpu_device *adev = amdgpu_ttm_adev(bo->bdev);

    struct amdgpu_bo *abo;

-   struct ttm_resource *old_mem = bo->resource;

    if (!amdgpu_bo_is_amdgpu_bo(bo))

    return;

@@ -1313,13 +1309,6 @@ void amdgpu_bo_move_notify(struct 
ttm_buffer_object *bo,


    /* remember the eviction */

    if (evict)

atomic64_inc(>num_evictions);

-

-   /* update statistics */

-   if (!new_mem)

-   return;

-

-   /* move_notify is called before move happens */

-   trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);

}

void amdgpu_bo_get_memory(struct amdgpu_bo *bo,

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h


index 876acde6b10a..dee2c577427e 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.h

@@ -360,9 +360,7 @@ int amdgpu_bo_set_metadata (struct amdgpu_bo *bo, 
void *metadata,


int amdgpu_bo_get_metadata(struct amdgpu_bo *bo, void *buffer,

   size_t buffer_size, uint32_t *metadata_size,

   uint64_t *flags);

-void amdgpu_bo_move_notify(struct ttm_buffer_object *bo,

-  bool evict,

-  struct ttm_resource *new_mem);

+void amdgpu_bo_move_notify(struct ttm_buffer_object *bo, bool evict);

void amdgpu_bo_release_notify(struct ttm_buffer_object *bo);

vm_fault_t amdgpu_bo_fault_reserve_notify(struct ttm_buffer_object *bo);

void amdgpu_bo_fence(struct amdgpu_bo *bo, struct dma_fence *fence,

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c


index 41ed6a3e5a06..f0fffbf2bdd5 100644

--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c

@@ -576,10 +576,11 @@ static int amdgpu_bo_move(struct 
ttm_buffer_object *bo, bool evict,


    return r;

    }

+   trace_amdgpu_bo_move(abo, new_mem->mem_type, old_mem->mem_type);

out:

    /* update statistics */

    atomic64_add(bo->base.size, >num_bytes_moved);

-   amdgpu_bo_move_notify(bo, evict, new_mem);

+   amdgpu_bo_move_notify(bo, evict);

    return 0;

}

@@ -1852,7 +1853,7 @@ static int amdgpu_ttm_access_memory(struct 
ttm_buffer_object *bo,


static void

amdgpu_bo_delete_mem_notify(struct ttm_buffer_object *bo)

{

-   amdgpu_bo_move_notify(bo, false, NULL);

+   amdgpu_bo_move_notify(bo, false);

}

static struct ttm_device_funcs amdgpu_bo_driver = {

--

2.34.1



[PATCH v2] drm/radeon: Prevent multiple debug error lines on suspend

2023-12-13 Thread Woody Suwalski
Fix to avoid multiple debug error lines printed on every suspend by 
Radeon driver's debugfs.


radeon_debugfs_init() calls debugfs_create_file() for every ring.

This results in printing multiple error lines to the screen and dmesg 
similar to this:


[   92.378726] debugfs: File 'radeon_ring_gfx' in directory 
':00:01.0' already present!
[   92.378732] debugfs: File 'radeon_ring_cp1' in directory 
':00:01.0' already present!
[   92.378734] debugfs: File 'radeon_ring_cp2' in directory 
':00:01.0' already present!
[   92.378737] debugfs: File 'radeon_ring_dma1' in directory 
':00:01.0' already present!
[   92.378739] debugfs: File 'radeon_ring_dma2' in directory 
':00:01.0' already present!
[   92.380775] debugfs: File 'radeon_ring_uvd' in directory 
':00:01.0' already present!
[   92.406620] debugfs: File 'radeon_ring_vce1' in directory 
':00:01.0' already present!
[   92.406624] debugfs: File 'radeon_ring_vce2' in directory 
':00:01.0' already present!



Patch v1: The fix was to run lookup() for the file before trying to 
(re)create that debug file.
Patch v2: Call the radeon_debugfs_init() only once when radeon ring is 
initialized (as suggested  by Christian K. - thanks)


Signed-off-by: Woody Suwalski 

diff --git a/drivers/gpu/drm/radeon/radeon_ring.c 
b/drivers/gpu/drm/radeon/radeon_ring.c

index e6534fa9f1fb..38048593bb4a 100644
--- a/drivers/gpu/drm/radeon/radeon_ring.c
+++ b/drivers/gpu/drm/radeon/radeon_ring.c
@@ -413,6 +413,7 @@ int radeon_ring_init(struct radeon_device *rdev, 
struct radeon_ring *ring, unsig

         dev_err(rdev->dev, "(%d) ring map failed\n", r);
         return r;
     }
+        radeon_debugfs_ring_init(rdev, ring);
 }
 ring->ptr_mask = (ring->ring_size / 4) - 1;
 ring->ring_free_dw = ring->ring_size / 4;
@@ -421,7 +422,6 @@ int radeon_ring_init(struct radeon_device *rdev, 
struct radeon_ring *ring, unsig

     ring->next_rptr_gpu_addr = rdev->wb.gpu_addr + index;
     ring->next_rptr_cpu_addr = &rdev->wb.wb[index/4];
 }
-    radeon_debugfs_ring_init(rdev, ring);
 radeon_ring_lockup_update(rdev, ring);
 return 0;
 }



Re: [PATCH] Revert "drm/amd/display: Adjust the MST resume flow"

2023-12-13 Thread Oliver Schmidt
Hi Wayne,

On 12.12.23 17:06, Mario Limonciello wrote:
> I looked through your bugs related to this and I didn't see a reference to the
> specific docking station model.
> The logs mentioned "Thinkpad dock" but no model.
> Could you share more about it so that AMD can try to reproduce it?

Yes, it is a ThinkPad Ultra Dockingstation, part number 40AJ0135EU, see also
https://support.lenovo.com/us/en/solutions/pd500173-thinkpad-ultra-docking-station-overview-and-service-parts

Best regards,
Oliver

On 12.12.23 17:06, Mario Limonciello wrote:
> On 12/12/2023 04:10, Lin, Wayne wrote:
>> [Public]
>>
>> Hi Mario,
>>
>> Thanks for the help.
>> My feeling is like this problem probably relates to specific dock. Need time
>> to take
>> further look.
> 
> Oliver,
> 
> I looked through your bugs related to this and I didn't see a reference to the
> specific docking station model.
> The logs mentioned "Thinkpad dock" but no model.
> 
> Could you share more about it so that AMD can try to reproduce it?
> 
>>
>> Since reverting solves the issue now, feel free to add:
>> Acked-by: Wayne Lin 
> 
> Sure, thanks.
> 
>>
>> Thanks,
>> Wayne
>>
>>> -Original Message-
>>> From: Limonciello, Mario 
>>> Sent: Tuesday, December 12, 2023 12:15 AM
>>> To: amd-gfx@lists.freedesktop.org; Wentland, Harry
>>> 
>>> Cc: Linux Regressions ; sta...@vger.kernel.org;
>>> Wheeler, Daniel ; Lin, Wayne
>>> ; Oliver Schmidt 
>>> Subject: Re: [PATCH] Revert "drm/amd/display: Adjust the MST resume flow"
>>>
>>> Ping on this one.
>>>
>>> On 12/5/2023 13:54, Mario Limonciello wrote:
 This reverts commit ec5fa9fcdeca69edf7dab5ca3b2e0ceb1c08fe9a.

 Reports are that this causes problems with external monitors after
 wake up from suspend, which is something it was directly supposed to help.

 Cc: Linux Regressions 
 Cc: sta...@vger.kernel.org
 Cc: Daniel Wheeler 
 Cc: Wayne Lin 
 Reported-by: Oliver Schmidt 
 Link: https://bugzilla.kernel.org/show_bug.cgi?id=218211
 Link:
 https://forum.manjaro.org/t/problems-with-external-monitor-wake-up-aft
 er-suspend/151840
 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3023
 Signed-off-by: Mario Limonciello >>> 
 ---
    .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 93 +++--
>>> -- 
    1 file changed, 13 insertions(+), 80 deletions(-)

 diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
 b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
 index c146dc9cba92..1ba58e4ecab3 100644
 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
 +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
 @@ -2363,62 +2363,14 @@ static int dm_late_init(void *handle)
  return detect_mst_link_for_all_connectors(adev_to_drm(adev));
    }

 -static void resume_mst_branch_status(struct drm_dp_mst_topology_mgr
 *mgr) -{
 -   int ret;
 -   u8 guid[16];
 -   u64 tmp64;
 -
 -   mutex_lock(&mgr->lock);
 -   if (!mgr->mst_primary)
 -   goto out_fail;
 -
 -   if (drm_dp_read_dpcd_caps(mgr->aux, mgr->dpcd) < 0) {
 -   drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during
>>> suspend?\n");
 -   goto out_fail;
 -   }
 -
 -   ret = drm_dp_dpcd_writeb(mgr->aux, DP_MSTM_CTRL,
 -    DP_MST_EN |
 -    DP_UP_REQ_EN |
 -    DP_UPSTREAM_IS_SRC);
 -   if (ret < 0) {
 -   drm_dbg_kms(mgr->dev, "mst write failed - undocked during
>>> suspend?\n");
 -   goto out_fail;
 -   }
 -
 -   /* Some hubs forget their guids after they resume */
 -   ret = drm_dp_dpcd_read(mgr->aux, DP_GUID, guid, 16);
 -   if (ret != 16) {
 -   drm_dbg_kms(mgr->dev, "dpcd read failed - undocked during
>>> suspend?\n");
 -   goto out_fail;
 -   }
 -
 -   if (memchr_inv(guid, 0, 16) == NULL) {
 -   tmp64 = get_jiffies_64();
 -   memcpy(&guid[0], &tmp64, sizeof(u64));
 -   memcpy(&guid[8], &tmp64, sizeof(u64));
 -
 -   ret = drm_dp_dpcd_write(mgr->aux, DP_GUID, guid, 16);
 -
 -   if (ret != 16) {
 -   drm_dbg_kms(mgr->dev, "check mstb guid failed -
>>> undocked during suspend?\n");
 -   goto out_fail;
 -   }
 -   }
 -
 -   memcpy(mgr->mst_primary->guid, guid, 16);
 -
 -out_fail:
 -   mutex_unlock(&mgr->lock);
 -}
 -
    static void s3_handle_mst(struct drm_device *dev, bool suspend)
    {
  struct amdgpu_dm_connector *aconnector;
  struct drm_connector *connector;
  struct drm_connector_list_iter iter;
  struct drm_dp_mst_topology_mgr *mgr;
 +   int ret;
 +   bool need_hotplug = false;

  drm_connector_list_iter_begin(dev, &iter);
  drm_for_each_connector_iter(connector, &iter) { @@ -2444,15