[PATCH] drm/amdgpu: add SPM golden settings for Navi10
From: "Tianci.Yin" Add RLC_SPM golden settings Change-Id: I616e127171293d915cb3a05dee02f51cec8d8f6f Signed-off-by: Tianci.Yin --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c|9 + .../gpu/drm/amd/amdgpu/golden_gc_spm_10_1_0.h | 1058 + 2 files changed, 1067 insertions(+) create mode 100644 drivers/gpu/drm/amd/amdgpu/golden_gc_spm_10_1_0.h diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 70edbbf84338..7c96a894ad37 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -48,6 +48,7 @@ #include "v10_structs.h" #include "gfx_v10_0.h" #include "nbio_v2_3.h" +#include "golden_gc_spm_10_1_0.h" /** * Navi10 has two graphic rings to share each graphic pipe. @@ -138,6 +139,11 @@ static const struct soc15_reg_golden golden_settings_gc_10_0_nv10[] = /* Pending on emulation bring up */ }; +static const struct soc15_reg_golden golden_settings_gc_rlc_spm_10_0_nv10[] = +{ + GOLDEN_GC_SPM_10_1_0 +}; + static const struct soc15_reg_golden golden_settings_gc_10_1_1[] = { SOC15_REG_GOLDEN_VALUE(GC, 0, mmCB_HW_CONTROL_4, 0x, 0x003c0014), @@ -388,6 +394,9 @@ static void gfx_v10_0_init_golden_registers(struct amdgpu_device *adev) soc15_program_register_sequence(adev, golden_settings_gc_10_0_nv10, (const u32)ARRAY_SIZE(golden_settings_gc_10_0_nv10)); + soc15_program_register_sequence(adev, + golden_settings_gc_rlc_spm_10_0_nv10, + (const u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_0_nv10)); break; case CHIP_NAVI14: soc15_program_register_sequence(adev, diff --git a/drivers/gpu/drm/amd/amdgpu/golden_gc_spm_10_1_0.h b/drivers/gpu/drm/amd/amdgpu/golden_gc_spm_10_1_0.h new file mode 100644 index ..e65af4a6fcdd --- /dev/null +++ b/drivers/gpu/drm/amd/amdgpu/golden_gc_spm_10_1_0.h @@ -0,0 +1,1058 @@ +#ifndef __GOLDEN_GC_SPM_10_1_0_H__ +#define __GOLDEN_GC_SPM_10_1_0_H__ + +#define GOLDEN_GC_SPM_10_1_0 \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xe000, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 0x28), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 0x9), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_SE_SAMPLEDELAY_IND_ADDR, 0x, 0x8), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_SE_SAMPLEDELAY_IND_DATA, 0x, 0x1b), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x1), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_SE_SAMPLEDELAY_IND_ADDR, 0x, 0x8), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x1), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_SE_SAMPLEDELAY_IND_DATA, 0x, 0x1b), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 0x20), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 0xe), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 0xc8), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 0xf), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 0xcc), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 0xf), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 0xd0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 0xf), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 0xd4), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 0xf), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \ +SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 0x24), \ +SOC15_REG_GOLDEN_VALUE(GC, 0,
Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection
On Thu, 2 Apr 2020 16:13:08 +0200 Peter Zijlstra wrote: > On Thu, Apr 02, 2020 at 09:33:54AM +0200, Christian König wrote: > > Hi Jann, > > > > Am 02.04.20 um 04:34 schrieb Jann Horn: > > > [x86 folks in CC so that they can chime in on the precise rules for this > > > stuff] > > > > > > Hi! > > > > > > I noticed that several makefiles under drivers/gpu/drm/amd/display/dc/ > > > turn on floating-point instructions in the compiler flags > > > (-mhard-float, -msse and -msse2) in order to make the "float" and > > > "double" types usable from C code without requiring helper functions. > > > > > > However, as far as I know, code running in normal kernel context isn't > > > allowed to use floating-point registers without special protection > > > using helpers like kernel_fpu_begin() and kernel_fpu_end() (which also > > > require that the protected code never blocks). If you violate that > > > rule, that can lead to various issues - among other things, I think > > > the kernel will clobber userspace FPU register state, and I think the > > > kernel code can blow up if a context switch happens at the wrong time, > > > since in-kernel task switches don't preserve FPU state. > > > > > > Is there some hidden trick I'm missing that makes it okay to use FPU > > > registers here? > > > > > > I would try testing this, but unfortunately none of the AMD devices I > > > have here have the appropriate graphics hardware... > > > > yes, using the floating point calculations in the display code has been a > > source of numerous problems and confusion in the past. > > > > The calls to kernel_fpu_begin() and kernel_fpu_end() are hidden behind the > > DC_FP_START() and DC_FP_END() macros which are supposed to hide the > > architecture depend handling for x86 and PPC64. > > > > This originated from the graphics block integrated into AMD CPU (where we > > knew which fp unit we had), but as far as I know is now also used for > > dedicated AMD GPUs as well. > > > > I'm not really a fan of this either, but so far we weren't able to convince > > the hardware engineers to not use floating point calculations for the > > display stuff. > > Might I complain that: > > make O=allmodconfig-build drivers/gpu/drm/amd/display/dc/ > > does not in fact work? > > Anyway, how do people feel about something like the below? > > Masami, Boris, is there any semi-sane way we can have insn_is_fpu() ? > While digging through various opcode manuals is of course forever fun, I > do feel like it might not be the best way. Yes, it is possible to add INAT_FPU and insn_is_fpu(). But it seems that the below patch needs more classification based on nmemonic or opcodes. IMHO, it is the time to expand gen-insn-attr.awk or clone it to generate another opcode map, so that user will easily extend the insn infrastructure. (e.g. I had made an in-kernel disassembler, which generates a mnemonic maps from x86-opcode-map.txt) https://github.com/mhiramat/linux/commits/inkernel-disasm-20130414 Thank you, > > --- > arch/x86/include/asm/fpu/api.h | 7 > arch/x86/include/asm/fpu/internal.h | 11 ++ > arch/x86/kernel/fpu/init.c | 5 +++ > tools/objtool/arch.h| 1 + > tools/objtool/arch/x86/decode.c | 71 > ++--- > tools/objtool/check.c | 58 ++ > tools/objtool/check.h | 3 +- > 7 files changed, 134 insertions(+), 22 deletions(-) > > diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h > index b774c52e5411..b9bca1cdc875 100644 > --- a/arch/x86/include/asm/fpu/api.h > +++ b/arch/x86/include/asm/fpu/api.h > @@ -12,6 +12,13 @@ > #define _ASM_X86_FPU_API_H > #include > > +#define annotate_fpu() ({\ > + asm volatile("%c0:\n\t" \ > + ".pushsection .discard.fpu_safe\n\t" \ > + ".long %c0b - .\n\t" \ > + ".popsection\n\t" : : "i" (__COUNTER__)); \ > +}) > + > /* > * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It > * disables preemption so be careful if you intend to use it for long periods > diff --git a/arch/x86/include/asm/fpu/internal.h > b/arch/x86/include/asm/fpu/internal.h > index 44c48e34d799..bc436213a0d4 100644 > --- a/arch/x86/include/asm/fpu/internal.h > +++ b/arch/x86/include/asm/fpu/internal.h > @@ -102,6 +102,11 @@ static inline void fpstate_init_fxstate(struct > fxregs_state *fx) > } > extern void fpstate_sanitize_xstate(struct fpu *fpu); > > +#define _ASM_ANNOTATE_FPU(at) > \ > + ".pushsection .discard.fpu_safe\n" \ > + ".long " #at " - .\n" \ > + ".popsection\n"\ > + > #define
RE: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset
Thanks Monk, I just updated the patch and it could passed 1000 rounds TDR test. Sent out an review email. Regards, Jack -Original Message- From: Liu, Monk Sent: Friday, April 3, 2020 11:38 AM To: Kuehling, Felix ; Zhang, Jack (Jian) ; amd-gfx@lists.freedesktop.org Subject: RE: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset Thanks Felix Hi Jack I think below changes can resolve your problem , we had this on our customer branch already, it fix the memory leak, and also fix my previous bug . Can you make this change applied to gfx_v10/v9 ? thanks ! diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 29749502..532258445 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -543,6 +543,8 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v10_compute_mqd *m = get_mqd(mqd); + if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset) + return 0; #if 0 unsigned long flags; int retry; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 35b32ad..f6479e1 100755 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3653,6 +3653,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; + amdgpu_amdkfd_pre_reset(adev); + /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) _ Monk Liu|GPU Virtualization Team |AMD -Original Message- From: Kuehling, Felix Sent: Friday, April 3, 2020 1:26 AM To: Zhang, Jack (Jian) ; amd-gfx@lists.freedesktop.org; Liu, Monk Subject: Re: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset [+Monk] This looks reasonable to me. However, you're effectively reverting this commit by Monk: a03eb637d2a5 drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV In hind-sight, Monk's commit was broken. Removing the call to pre_reset has other consequences, such as breaking notifications about reset to user mode, and probably invalidating some assumptions in kfd_post_reset. Can you coordinate with Monk to work out why his change was needed, and whether you'll need a different solution for the problem he was trying to address? In the meanwhile, this patch is Acked-by: Felix Kuehling Am 2020-04-02 um 3:20 a.m. schrieb Jack Zhang: > kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate > > Without this change, sriov tdr code path will never free those > allocated memories and get memory leak. > > Signed-off-by: Jack Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 8faaa17..832daf7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3847,6 +3847,8 @@ static int amdgpu_device_reset_sriov(struct > amdgpu_device *adev, { > int r; > > + amdgpu_amdkfd_pre_reset(adev); > + > if (from_hypervisor) > r = amdgpu_virt_request_full_gpu(adev, true); > else ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset
kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate Without this change, sriov tdr code path will never free those allocated memories and get memory leak. v2:add a bugfix for kiq ring test fail Signed-off-by: Jack Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 3 +++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 3 files changed, 8 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 4ec6d0c..bdc1f5a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -543,6 +543,9 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v10_compute_mqd *m = get_mqd(mqd); + if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset) + return 0; + #if 0 unsigned long flags; int retry; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index df841c2..c2562d6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -541,6 +541,9 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v9_mqd *m = get_mqd(mqd); + if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset) + return 0; + if (adev->in_gpu_reset) return -EIO; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8faaa17..e3f7441 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3854,6 +3854,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; + amdgpu_amdkfd_pre_reset(adev); + /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH] drm/amd/powerplay: avoid using pm_en before it is initialized
[AMD Official Use Only - Internal Distribution Only] Ping... -Original Message- From: Tiecheng Zhou Sent: Thursday, April 2, 2020 5:29 PM To: amd-gfx@lists.freedesktop.org Cc: Zhou, Tiecheng ; Tao, Yintian Subject: [PATCH] drm/amd/powerplay: avoid using pm_en before it is initialized hwmgr->pm_en is initialized at hwmgr_hw_init. during amdgpu_device_init, there is amdgpu_asic_reset that calls to pp_get_asic_baco_capability, while hwmgr->pm_en has not yet been initialized. so avoid using pm_en in pp_get_asic_baco_capability. Signed-off-by: Tiecheng Zhou Signed-off-by: Yintian Tao --- drivers/gpu/drm/amd/powerplay/amd_powerplay.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c index 71b843f542d8..fdff3e1c5e95 100644 --- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c +++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c @@ -1455,7 +1455,8 @@ static int pp_get_asic_baco_state(void *handle, int *state) if (!hwmgr) return -EINVAL; - if (!hwmgr->pm_en || !hwmgr->hwmgr_func->get_asic_baco_state) + if (!(hwmgr->not_vf && amdgpu_dpm) || + !hwmgr->hwmgr_func->get_asic_baco_state) return 0; mutex_lock(>smu_lock); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu: fix gfx hang during suspend with video playback
The system will be hang up during S3 as SMU is pending at GC not respose the register CP_HQD_ACTIVE access request and this issue can be fixed by adding RLC safe mode guard before each HQD map/unmap retrive opt. Signed-off-by: Prike Liang Tested-by: Mengbing Wang --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 6 ++ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 2 files changed, 10 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c index df841c2..e265063 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c @@ -232,6 +232,7 @@ int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, uint32_t *mqd_hqd; uint32_t reg, hqd_base, data; + amdgpu_gfx_rlc_enter_safe_mode(adev); m = get_mqd(mqd); acquire_queue(kgd, pipe_id, queue_id); @@ -299,6 +300,7 @@ int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, uint32_t pipe_id, release_queue(kgd); + amdgpu_gfx_rlc_exit_safe_mode(adev); return 0; } @@ -497,6 +499,7 @@ bool kgd_gfx_v9_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, bool retval = false; uint32_t low, high; + amdgpu_gfx_rlc_enter_safe_mode(adev); acquire_queue(kgd, pipe_id, queue_id); act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE)); if (act) { @@ -508,6 +511,7 @@ bool kgd_gfx_v9_hqd_is_occupied(struct kgd_dev *kgd, uint64_t queue_address, retval = true; } release_queue(kgd); + amdgpu_gfx_rlc_exit_safe_mode(adev); return retval; } @@ -541,6 +545,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v9_mqd *m = get_mqd(mqd); + amdgpu_gfx_rlc_enter_safe_mode(adev); if (adev->in_gpu_reset) return -EIO; @@ -577,6 +582,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd, } release_queue(kgd); + amdgpu_gfx_rlc_exit_safe_mode(adev); return 0; } diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index 1fea077..ee107d9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -3533,6 +3533,7 @@ static int gfx_v9_0_kiq_init_register(struct amdgpu_ring *ring) struct v9_mqd *mqd = ring->mqd_ptr; int j; + amdgpu_gfx_rlc_enter_safe_mode(adev); /* disable wptr polling */ WREG32_FIELD15(GC, 0, CP_PQ_WPTR_POLL_CNTL, EN, 0); @@ -3629,6 +3630,7 @@ static int gfx_v9_0_kiq_init_register(struct amdgpu_ring *ring) if (ring->use_doorbell) WREG32_FIELD15(GC, 0, CP_PQ_STATUS, DOORBELL_ENABLE, 1); + amdgpu_gfx_rlc_exit_safe_mode(adev); return 0; } @@ -3637,6 +3639,7 @@ static int gfx_v9_0_kiq_fini_register(struct amdgpu_ring *ring) struct amdgpu_device *adev = ring->adev; int j; + amdgpu_gfx_rlc_enter_safe_mode(adev); /* disable the queue if it's active */ if (RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE) & 1) { @@ -3668,6 +3671,7 @@ static int gfx_v9_0_kiq_fini_register(struct amdgpu_ring *ring) WREG32_SOC15_RLC(GC, 0, mmCP_HQD_PQ_WPTR_HI, 0); WREG32_SOC15_RLC(GC, 0, mmCP_HQD_PQ_WPTR_LO, 0); + amdgpu_gfx_rlc_exit_safe_mode(adev); return 0; } -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset
Thanks Felix Hi Jack I think below changes can resolve your problem , we had this on our customer branch already, it fix the memory leak, and also fix my previous bug . Can you make this change applied to gfx_v10/v9 ? thanks ! diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c index 29749502..532258445 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c @@ -543,6 +543,8 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd, uint32_t temp; struct v10_compute_mqd *m = get_mqd(mqd); + if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset) + return 0; #if 0 unsigned long flags; int retry; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 35b32ad..f6479e1 100755 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3653,6 +3653,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, if (r) return r; + amdgpu_amdkfd_pre_reset(adev); + /* Resume IP prior to SMC */ r = amdgpu_device_ip_reinit_early_sriov(adev); if (r) _ Monk Liu|GPU Virtualization Team |AMD -Original Message- From: Kuehling, Felix Sent: Friday, April 3, 2020 1:26 AM To: Zhang, Jack (Jian) ; amd-gfx@lists.freedesktop.org; Liu, Monk Subject: Re: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset [+Monk] This looks reasonable to me. However, you're effectively reverting this commit by Monk: a03eb637d2a5 drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV In hind-sight, Monk's commit was broken. Removing the call to pre_reset has other consequences, such as breaking notifications about reset to user mode, and probably invalidating some assumptions in kfd_post_reset. Can you coordinate with Monk to work out why his change was needed, and whether you'll need a different solution for the problem he was trying to address? In the meanwhile, this patch is Acked-by: Felix Kuehling Am 2020-04-02 um 3:20 a.m. schrieb Jack Zhang: > kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate > > Without this change, sriov tdr code path will never free those > allocated memories and get memory leak. > > Signed-off-by: Jack Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 8faaa17..832daf7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3847,6 +3847,8 @@ static int amdgpu_device_reset_sriov(struct > amdgpu_device *adev, { > int r; > > + amdgpu_amdkfd_pre_reset(adev); > + > if (from_hypervisor) > r = amdgpu_virt_request_full_gpu(adev, true); > else ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH] drm/amd/powerplay: determine pm_en at amd_powerplay_create
Reviewed-by: Evan Quan -Original Message- From: amd-gfx On Behalf Of Tiecheng Zhou Sent: Thursday, April 2, 2020 1:40 PM To: amd-gfx@lists.freedesktop.org Cc: Zhou, Tiecheng Subject: [PATCH] drm/amd/powerplay: determine pm_en at amd_powerplay_create Need to determine pm_en at amd_powerplay_create of early_init stage. Signed-off-by: Tiecheng Zhou --- drivers/gpu/drm/amd/powerplay/amd_powerplay.c | 3 +++ drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c index 71b843f542d8..a37dc37dfe49 100644 --- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c +++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c @@ -48,6 +48,9 @@ static int amd_powerplay_create(struct amdgpu_device *adev) hwmgr->adev = adev; hwmgr->not_vf = !amdgpu_sriov_vf(adev); + hwmgr->pp_one_vf = amdgpu_sriov_is_pp_one_vf(adev); + hwmgr->pm_en = (amdgpu_dpm && (hwmgr->not_vf || hwmgr->pp_one_vf)) + ? true : false; hwmgr->device = amdgpu_cgs_create_device(adev); mutex_init(>smu_lock); mutex_init(>msg_lock); diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c b/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c index f48fdc7f0382..7aee382fc1f9 100644 --- a/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c +++ b/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c @@ -221,9 +221,6 @@ int hwmgr_hw_init(struct pp_hwmgr *hwmgr) { int ret = 0; - hwmgr->pp_one_vf = amdgpu_sriov_is_pp_one_vf((struct amdgpu_device *)hwmgr->adev); - hwmgr->pm_en = (amdgpu_dpm && (hwmgr->not_vf || hwmgr->pp_one_vf)) - ? true : false; if (!hwmgr->pm_en) return 0; -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cevan.quan%40amd.com%7C879eafc288894347147c08d7d6c845b0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637214028875355960sdata=35oaGAN8RHzU4biNt7wWUZpskOehIQ0h%2Fao5HiVduBM%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH v2] drm/amdkfd: Provide SMI events watch
On 2020-04-02 4:46 p.m., Amber Lin wrote: When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register events they are interested. After the event is registered, the user can use annoymous file descriptor's poll function with wait-time specified to wait for the event to happen. Once the event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 12 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31 include/uapi/linux/kfd_ioctl.h | 30 +++- 9 files changed, 286 insertions(+), 2 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..cc98b4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ - $(AMDKFD_PATH)/kfd_crat.o + $(AMDKFD_PATH)/kfd_crat.o \ + $(AMDKFD_PATH)/kfd_smi_events.o ifneq ($(CONFIG_AMD_IOMMU_V2),) AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..591ac28 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, return ret; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + switch (args->op) { + case KFD_SMI_EVENTS_REGISTER: + /* register the device */ + return kfd_smi_event_register(dev, >data); + case KFD_SMI_EVENTS_ENABLE: + /* subscribe events to the device */ + return kfd_smi_event_enable(dev, args->events); + case KFD_SMI_EVENTS_DISABLE: + /* unsubscribe events */ + return kfd_smi_event_disable(dev, args->events); + } + + return -EINVAL; +} + bool kfd_dev_is_large_bar(struct kfd_dev *dev) { struct kfd_local_mem_info mem_info; @@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNT ARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index
[PATCH v2] drm/amdkfd: Provide SMI events watch
When the compute is malfunctioning or performance drops, the system admin will use SMI (System Management Interface) tool to monitor/diagnostic what went wrong. This patch provides an event watch interface for the user space to register events they are interested. After the event is registered, the user can use annoymous file descriptor's poll function with wait-time specified to wait for the event to happen. Once the event happens, the user can use read() to retrieve information related to the event. VM fault event is done in this patch. v2: - remove UNREGISTER and add event ENABLE/DISABLE - correct kfifo usage - move event message API to kfd_ioctl.h Signed-off-by: Amber Lin --- drivers/gpu/drm/amd/amdkfd/Makefile | 3 +- drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 30 drivers/gpu/drm/amd/amdkfd/kfd_device.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c | 2 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h| 12 ++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c | 177 +++ drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h | 31 include/uapi/linux/kfd_ioctl.h | 30 +++- 9 files changed, 286 insertions(+), 2 deletions(-) create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile b/drivers/gpu/drm/amd/amdkfd/Makefile index 6147462..cc98b4a 100644 --- a/drivers/gpu/drm/amd/amdkfd/Makefile +++ b/drivers/gpu/drm/amd/amdkfd/Makefile @@ -53,7 +53,8 @@ AMDKFD_FILES := $(AMDKFD_PATH)/kfd_module.o \ $(AMDKFD_PATH)/kfd_int_process_v9.o \ $(AMDKFD_PATH)/kfd_dbgdev.o \ $(AMDKFD_PATH)/kfd_dbgmgr.o \ - $(AMDKFD_PATH)/kfd_crat.o + $(AMDKFD_PATH)/kfd_crat.o \ + $(AMDKFD_PATH)/kfd_smi_events.o ifneq ($(CONFIG_AMD_IOMMU_V2),) AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c index 9f59ba9..24b4717 100644 --- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c +++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c @@ -24,6 +24,7 @@ #include "kfd_events.h" #include "cik_int.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static bool cik_event_interrupt_isr(struct kfd_dev *dev, const uint32_t *ih_ring_entry, @@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev, ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) { struct kfd_vm_fault_info info; + kfd_smi_event_update_vmfault(dev, pasid); kfd_process_vm_fault(dev->dqm, pasid); memset(, 0, sizeof(info)); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index f8fa03a..591ac28 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -39,6 +39,7 @@ #include "kfd_device_queue_manager.h" #include "kfd_dbgmgr.h" #include "amdgpu_amdkfd.h" +#include "kfd_smi_events.h" static long kfd_ioctl(struct file *, unsigned int, unsigned long); static int kfd_open(struct inode *, struct file *); @@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p, return ret; } +/* Handle requests for watching SMI events */ +static int kfd_ioctl_smi_events(struct file *filep, + struct kfd_process *p, void *data) +{ + struct kfd_ioctl_smi_events_args *args = data; + struct kfd_dev *dev; + + dev = kfd_device_by_id(args->gpu_id); + if (!dev) + return -EINVAL; + + switch (args->op) { + case KFD_SMI_EVENTS_REGISTER: + /* register the device */ + return kfd_smi_event_register(dev, >data); + case KFD_SMI_EVENTS_ENABLE: + /* subscribe events to the device */ + return kfd_smi_event_enable(dev, args->events); + case KFD_SMI_EVENTS_DISABLE: + /* unsubscribe events */ + return kfd_smi_event_disable(dev, args->events); + } + + return -EINVAL; +} + bool kfd_dev_is_large_bar(struct kfd_dev *dev) { struct kfd_local_mem_info mem_info; @@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = { AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS, kfd_ioctl_alloc_queue_gws, 0), + + AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS, + kfd_ioctl_smi_events, 0), }; #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 0491ab2..6ac6f31 100644 ---
Re: [PATCH] drm/amd/display: re-order asic declarations
[AMD Official Use Only - Internal Distribution Only] From: amd-gfx on behalf of Shirish S Sent: Thursday, April 2, 2020 5:15 AM To: Deucher, Alexander ; Wentland, Harry ; Li, Sun peng (Leo) Cc: amd-gfx@lists.freedesktop.org ; S, Shirish Subject: [PATCH] drm/amd/display: re-order asic declarations "1382d6409891 drm/amd/display: Fix RV2 Variant Detection" introduces build error of: "use of undeclared identifier 'RENOIR_A0'" To fix the same, this patch re-orders the ASIC declarations accordingly. Signed-off-by: Shirish S Reviewed-by: Zhan Liu --- drivers/gpu/drm/amd/display/include/dal_asic_id.h | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/include/dal_asic_id.h b/drivers/gpu/drm/amd/display/include/dal_asic_id.h index 8a87d0ed90ae..2359e88d6029 100644 --- a/drivers/gpu/drm/amd/display/include/dal_asic_id.h +++ b/drivers/gpu/drm/amd/display/include/dal_asic_id.h @@ -136,6 +136,7 @@ #define RAVEN2_A0 0x81 #define RAVEN1_F0 0xF0 #define RAVEN_UNKNOWN 0xFF +#define RENOIR_A0 0x91 #ifndef ASICREV_IS_RAVEN #define ASICREV_IS_RAVEN(eChipRev) ((eChipRev >= RAVEN_A0) && eChipRev < RAVEN_UNKNOWN) #endif @@ -171,8 +172,6 @@ enum { #define ASICREV_IS_NAVI10_P(eChipRev)(eChipRev < NV_NAVI12_P_A0) #define ASICREV_IS_NAVI12_P(eChipRev)((eChipRev >= NV_NAVI12_P_A0) && (eChipRev < NV_NAVI14_M_A0)) #define ASICREV_IS_NAVI14_M(eChipRev)((eChipRev >= NV_NAVI14_M_A0) && (eChipRev < NV_UNKNOWN)) -#define RENOIR_A0 0x91 -#define DEVICE_ID_RENOIR_1636 0x1636 // Renoir #define ASICREV_IS_RENOIR(eChipRev) ((eChipRev >= RENOIR_A0) && (eChipRev < RAVEN1_F0)) /* @@ -183,6 +182,9 @@ enum { #define DEVICE_ID_TEMASH_9839 0x9839 #define DEVICE_ID_TEMASH_983D 0x983D +/* RENOIR */ +#define DEVICE_ID_RENOIR_1636 0x1636 + /* Asic Family IDs for different asic family. */ #define FAMILY_CI 120 /* Sea Islands: Hawaii (P), Bonaire (M) */ #define FAMILY_KV 125 /* Fusion => Kaveri: Spectre, Spooky; Kabini: Kalindi */ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Czhan.liu%40amd.com%7C5d6f323ad05349e58ef808d7d6e66ade%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637214158105886093sdata=tjvt8iuzXrd1rg0y%2FIzVNfSGpAFEJL59R04fcqi3UzE%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset
[+Monk] This looks reasonable to me. However, you're effectively reverting this commit by Monk: a03eb637d2a5 drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV In hind-sight, Monk's commit was broken. Removing the call to pre_reset has other consequences, such as breaking notifications about reset to user mode, and probably invalidating some assumptions in kfd_post_reset. Can you coordinate with Monk to work out why his change was needed, and whether you'll need a different solution for the problem he was trying to address? In the meanwhile, this patch is Acked-by: Felix Kuehling Am 2020-04-02 um 3:20 a.m. schrieb Jack Zhang: > kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate > > Without this change, sriov tdr code path will never free those allocated > memories and get memory leak. > > Signed-off-by: Jack Zhang > --- > drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > index 8faaa17..832daf7 100644 > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c > @@ -3847,6 +3847,8 @@ static int amdgpu_device_reset_sriov(struct > amdgpu_device *adev, > { > int r; > > + amdgpu_amdkfd_pre_reset(adev); > + > if (from_hypervisor) > r = amdgpu_virt_request_full_gpu(adev, true); > else ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection
On Thu, Apr 2, 2020 at 11:36 AM Thomas Gleixner wrote: > Jann Horn writes: > > On Thu, Apr 2, 2020 at 9:34 AM Christian König > > wrote: > >> Am 02.04.20 um 04:34 schrieb Jann Horn: > >> > [x86 folks in CC so that they can chime in on the precise rules for > >> > this stuff] > > They are pretty simple. > > Any code using FPU needs to be completely isolated from regular code > either by using inline asm or by moving it to a different compilation > unit. The invocations need fpu_begin/end() of course. [...] > We really need objtool support to validate that. > > Peter, now that we know how to do it (noinstr, clac/stac) we can emit > annotations (see patch below) and validate that any FPU instruction is > inside a safe region. Hmm? One annoying aspect is that for the "move it to a different compilation unit" method, objtool needs to know at compile time (before linking) which functions are in FPU-enabled object files, right? So we'd need to have some sort of function annotation that gets plumbed from the function declaration in a header file through the compiler into the ELF file, and then let objtool verify that calls to FPU-enabled methods occur only when the FPU is available? (Ideally something that covers indirect calls... but this would probably get really complicated unless we can get the compiler to include that annotation in its type checking.) ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection
On Thu, Apr 02, 2020 at 09:33:54AM +0200, Christian König wrote: > Hi Jann, > > Am 02.04.20 um 04:34 schrieb Jann Horn: > > [x86 folks in CC so that they can chime in on the precise rules for this > > stuff] > > > > Hi! > > > > I noticed that several makefiles under drivers/gpu/drm/amd/display/dc/ > > turn on floating-point instructions in the compiler flags > > (-mhard-float, -msse and -msse2) in order to make the "float" and > > "double" types usable from C code without requiring helper functions. > > > > However, as far as I know, code running in normal kernel context isn't > > allowed to use floating-point registers without special protection > > using helpers like kernel_fpu_begin() and kernel_fpu_end() (which also > > require that the protected code never blocks). If you violate that > > rule, that can lead to various issues - among other things, I think > > the kernel will clobber userspace FPU register state, and I think the > > kernel code can blow up if a context switch happens at the wrong time, > > since in-kernel task switches don't preserve FPU state. > > > > Is there some hidden trick I'm missing that makes it okay to use FPU > > registers here? > > > > I would try testing this, but unfortunately none of the AMD devices I > > have here have the appropriate graphics hardware... > > yes, using the floating point calculations in the display code has been a > source of numerous problems and confusion in the past. > > The calls to kernel_fpu_begin() and kernel_fpu_end() are hidden behind the > DC_FP_START() and DC_FP_END() macros which are supposed to hide the > architecture depend handling for x86 and PPC64. > > This originated from the graphics block integrated into AMD CPU (where we > knew which fp unit we had), but as far as I know is now also used for > dedicated AMD GPUs as well. > > I'm not really a fan of this either, but so far we weren't able to convince > the hardware engineers to not use floating point calculations for the > display stuff. Might I complain that: make O=allmodconfig-build drivers/gpu/drm/amd/display/dc/ does not in fact work? Anyway, how do people feel about something like the below? Masami, Boris, is there any semi-sane way we can have insn_is_fpu() ? While digging through various opcode manuals is of course forever fun, I do feel like it might not be the best way. --- arch/x86/include/asm/fpu/api.h | 7 arch/x86/include/asm/fpu/internal.h | 11 ++ arch/x86/kernel/fpu/init.c | 5 +++ tools/objtool/arch.h| 1 + tools/objtool/arch/x86/decode.c | 71 ++--- tools/objtool/check.c | 58 ++ tools/objtool/check.h | 3 +- 7 files changed, 134 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index b774c52e5411..b9bca1cdc875 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -12,6 +12,13 @@ #define _ASM_X86_FPU_API_H #include +#define annotate_fpu() ({ \ + asm volatile("%c0:\n\t" \ +".pushsection .discard.fpu_safe\n\t" \ +".long %c0b - .\n\t" \ +".popsection\n\t" : : "i" (__COUNTER__)); \ +}) + /* * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It * disables preemption so be careful if you intend to use it for long periods diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 44c48e34d799..bc436213a0d4 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -102,6 +102,11 @@ static inline void fpstate_init_fxstate(struct fxregs_state *fx) } extern void fpstate_sanitize_xstate(struct fpu *fpu); +#define _ASM_ANNOTATE_FPU(at) \ +".pushsection .discard.fpu_safe\n" \ +".long " #at " - .\n" \ +".popsection\n"\ + #define user_insn(insn, output, input...) \ ({ \ int err;\ @@ -116,6 +121,7 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu); "jmp 2b\n"\ ".previous\n" \ _ASM_EXTABLE(1b, 3b) \ +_ASM_ANNOTATE_FPU(1b) \ : [err] "=r" (err), output \ : "0"(0), input); \
Re: Possibility of RX570 responsible for spontaneous reboots (MCE) with Ryzen 3700x?
Hi Someguy, Your findings sound very familiar, my machine is also rock-solid running Windows-10 - most of the MCEs happened for me with low-load situations but firefox playing youtube in background. First I didn't care that much - but now having experienced corrupted firefox profiles and lost spreadsheet work it starts to get annoying. I've filed a kernel bug regarding this issue: https://bugzilla.kernel.org/show_bug.cgi?id=206903 I would appreciate if you could report your findings there to give the issue more data / weight. Thanks, Clemens Am Do., 2. Apr. 2020 um 15:11 Uhr schrieb someguy108 : > > Hello! I saw Clemens Eisserer email regarding MCE errors with his RX 570 and > 3700x, and I like to add to that list of MCE spontaneous reboots as well. > This is my configuration: > -Ryzen 3900x + Noctua D15 > -MSI X570 Unify (latest agesa as of writing) > -DDR4 3200mhz 32GB kit > -Sapphire Pulse 5700 XT > -Corsair RMX 850 Watt > -Arch Linux with kernel 5.5.13 > -Mesa 20.0.3 > -Early KMS enabled > > I've had this system up and running since November 2019 but initially with a > Nvidia 1060 and Windows 10. Everything was running smoothly. About a month > ago I switched back over to Linux after purchasing my 5700 XT as my initial > plan was to go back to Linux. Since returning I've experienced multiple > spontaneous MCE reboots. All happened while I was playing one particular > game, Warcraft 3 Reforged. The MCE event is the following: > > kernel: mce: [Hardware Error]: Machine check events logged > kernel: mce: [Hardware Error]: CPU 1: Machine Check: 0 Bank 5: > bea00108 > kernel: mce: [Hardware Error]: TSC 0 ADDR 1ad66d6fe MISC d0120001 > SYND 4d00 IPID 500b0 > kernel: mce: [Hardware Error]: PROCESSOR 2:870f10 TIME 1585120217 SOCKET 0 > APIC 2 microcode 8701013 > kernel: #2 #3 #4 #5 #6 #7 #8 #9 #10 #11 #12 #13 #14 #15 > kernel: mce: [Hardware Error]: Machine check events logged > kernel: mce: [Hardware Error]: CPU 15: Machine Check: 0 Bank 5: > bea00108 > kernel: mce: [Hardware Error]: TSC 0 ADDR 1c1196eb6 MISC d0120001 > SYND 4d00 IPID 500b0 > kernel: mce: [Hardware Error]: PROCESSOR 2:870f10 TIME 1585120217 SOCKET 0 > APIC 9 microcode 8701013 > kernel: #16 #17 #18 #19 #20 #21 #22 #23 > > Initially I figured it could be ram so I performed the usual test with no > problems. Also tested with standard JEDEC as well and eventually received a > MCE during Warcraft 3 reforged. After consulting with a few friends I decided > to try a different power supply to no avail. I then bit the bullet and bought > a brand new 3900x. I also cleared CMOS before getting my new 3900x and after. > All cpu values are on auto with no PBO or manual overclocking. The only fancy > is the ram. Yesterday, after owning the new 3900x for three days, I had a MCE > while I was playing Warcraft 3 Reforged. I have tested other games but none > of them caused a MCE or any crashes / freezes for that matter. World of > Warcraft, The Outer Worlds, Stellaris, and Counter-Strike: Global Offensive. > > As the same with Clemens, using the same decoder he used, MCE-Ryzen-Decoder, > from github, it reports the MCE to be the following: > Bank: Execution Unit (EX) > Error: Watchdog Timeout error (WDT 0x0) > > One thing to note is I haven't received it during desktop usage. Only in > Warcraft 3. I do have desktop compositing in both Xfce and KDE disabled and > always have. Both of which used, tested, and received the MCE's during those > sessions. I have noticed a pattern with the MCE crashes with Warcraft 3. They > always happen during a GPU load drop off or increase transition. By that I > mean when exiting a match to return to the lobby, or loading a map and when > it switches from the loading screen to the match itself is when these MCE's > happen. The entire screen quickly turns black, everything is hard locked, and > then after about a minute or so the machine reboots on its own. It hasn't > happened yet while in a middle of a match session, sitting in the lobby or at > the main menu screen. Its consistently been during a transition. My theory is > that this could possibly be a GPU hang from switching from one power state to > another power state. With the GPU hanging, causes the CPU to stall, and thus > a MCE. The GPU hanging could explain the quick solid black screen as well as all output is stopped. But I'm really just assuming here form my own observations from my limited understanding. Possible reason why this triggers in Warcraft is because the other games have few moments of switching power states heavily. The Outer Worlds, World of Warcraft, Stellaris, and Counter-Strike Global Offensive all keep a constant high load on the GPU and the match sessions are long. > > From what its worth, I've had no major issues in Windows 10. The only quirks > where initially a few TDR's that recovered from alt tabing out of most games > with Google
Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection
On Thu, Apr 2, 2020 at 9:34 AM Christian König wrote: > Am 02.04.20 um 04:34 schrieb Jann Horn: > > [x86 folks in CC so that they can chime in on the precise rules for this > > stuff] > > I noticed that several makefiles under drivers/gpu/drm/amd/display/dc/ > > turn on floating-point instructions in the compiler flags > > (-mhard-float, -msse and -msse2) in order to make the "float" and > > "double" types usable from C code without requiring helper functions. > > > > However, as far as I know, code running in normal kernel context isn't > > allowed to use floating-point registers without special protection > > using helpers like kernel_fpu_begin() and kernel_fpu_end() (which also > > require that the protected code never blocks). If you violate that > > rule, that can lead to various issues - among other things, I think > > the kernel will clobber userspace FPU register state, and I think the > > kernel code can blow up if a context switch happens at the wrong time, > > since in-kernel task switches don't preserve FPU state. > > > > Is there some hidden trick I'm missing that makes it okay to use FPU > > registers here? > > > > I would try testing this, but unfortunately none of the AMD devices I > > have here have the appropriate graphics hardware... > > yes, using the floating point calculations in the display code has been > a source of numerous problems and confusion in the past. > > The calls to kernel_fpu_begin() and kernel_fpu_end() are hidden behind > the DC_FP_START() and DC_FP_END() macros which are supposed to hide the > architecture depend handling for x86 and PPC64. Hmm... but as far as I can tell, you're using those macros from inside functions that are already compiled with the FPU on: - drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c uses the macros, but is already compiled with calcs_ccflags - drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c uses the macros, but is already compiled with "-mhard-float -msse -msse2" - drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c uses the macros, but is already compiled with "-mhard-float -msse -msse2" AFAIK as soon as you enter any function in any file compiled with FPU instructions, you may encounter SSE instructions, e.g. via things like compiler-generated memory-zeroing code - not just when you're actually using doubles or floats. ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: Possibility of RX570 responsible for spontaneous reboots (MCE) with Ryzen 3700x?
Hello! I saw Clemens Eisserer email regarding MCE errors with his RX 570 and 3700x, and I like to add to that list of MCE spontaneous reboots as well. This is my configuration: -Ryzen 3900x + Noctua D15 -MSI X570 Unify (latest agesa as of writing) -DDR4 3200mhz 32GB kit -Sapphire Pulse 5700 XT -Corsair RMX 850 Watt -Arch Linux with kernel 5.5.13 -Mesa 20.0.3 -Early KMS enabled I've had this system up and running since November 2019 but initially with a Nvidia 1060 and Windows 10. Everything was running smoothly. About a month ago I switched back over to Linux after purchasing my 5700 XT as my initial plan was to go back to Linux. Since returning I've experienced multiple spontaneous MCE reboots. All happened while I was playing one particular game, Warcraft 3 Reforged. The MCE event is the following: kernel: mce: [Hardware Error]: Machine check events logged kernel: mce: [Hardware Error]: CPU 1: Machine Check: 0 Bank 5: bea00108 kernel: mce: [Hardware Error]: TSC 0 ADDR 1ad66d6fe MISC d0120001 SYND 4d00 IPID 500b0 kernel: mce: [Hardware Error]: PROCESSOR 2:870f10 TIME 1585120217 SOCKET 0 APIC 2 microcode 8701013 kernel: #2 #3 #4 #5 #6 #7 #8 #9 #10 #11 #12 #13 #14 #15 kernel: mce: [Hardware Error]: Machine check events logged kernel: mce: [Hardware Error]: CPU 15: Machine Check: 0 Bank 5: bea00108 kernel: mce: [Hardware Error]: TSC 0 ADDR 1c1196eb6 MISC d0120001 SYND 4d00 IPID 500b0 kernel: mce: [Hardware Error]: PROCESSOR 2:870f10 TIME 1585120217 SOCKET 0 APIC 9 microcode 8701013 kernel: #16 #17 #18 #19 #20 #21 #22 #23 Initially I figured it could be ram so I performed the usual test with no problems. Also tested with standard JEDEC as well and eventually received a MCE during Warcraft 3 reforged. After consulting with a few friends I decided to try a different power supply to no avail. I then bit the bullet and bought a brand new 3900x. I also cleared CMOS before getting my new 3900x and after. All cpu values are on auto with no PBO or manual overclocking. The only fancy is the ram. Yesterday, after owning the new 3900x for three days, I had a MCE while I was playing Warcraft 3 Reforged. I have tested other games but none of them caused a MCE or any crashes / freezes for that matter. World of Warcraft, The Outer Worlds, Stellaris, and Counter-Strike: Global Offensive. As the same with Clemens, using the same decoder he used, MCE-Ryzen-Decoder, from github, it reports the MCE to be the following: Bank: Execution Unit (EX) Error: Watchdog Timeout error (WDT 0x0) One thing to note is I haven't received it during desktop usage. Only in Warcraft 3. I do have desktop compositing in both Xfce and KDE disabled and always have. Both of which used, tested, and received the MCE's during those sessions. I have noticed a pattern with the MCE crashes with Warcraft 3. They always happen during a GPU load drop off or increase transition. By that I mean when exiting a match to return to the lobby, or loading a map and when it switches from the loading screen to the match itself is when these MCE's happen. The entire screen quickly turns black, everything is hard locked, and then after about a minute or so the machine reboots on its own. It hasn't happened yet while in a middle of a match session, sitting in the lobby or at the main menu screen. Its consistently been during a transition. My theory is that this could possibly be a GPU hang from switching from one power state to another power state. With the GPU hanging, causes the CPU to stall, and thus a MCE. The GPU hanging could explain the quick solid black screen as well as all output is stopped. But I'm really just assuming here form my own observations from my limited understanding. Possible reason why this triggers in Warcraft is because the other games have few moments of switching power states heavily. The Outer Worlds, World of Warcraft, Stellaris, and Counter-Strike Global Offensive all keep a constant high load on the GPU and the match sessions are long. >From what its worth, I've had no major issues in Windows 10. The only quirks where initially a few TDR's that recovered from alt tabing out of most games with Google Chrome running. Disabling hardware acceleration in Chrome fixed those TDR's while alt-tabing out of games. >From searching, the way I found this mailing list report, I've found quite a few reports of people talking about receiving MCE's that isn't the typical first generation MCE's reports from 2017 involving Ryzen.Where those where fixed by disabling c-states, ram, and changing power supply current from low to typical. These ones within the past year appear to all have a AMD GPU in common. I did notice a few with Intel CPU's as well paired up with a AMD GPU. Any feedback would be greatly appreciated. ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection
Jann Horn writes: > On Thu, Apr 2, 2020 at 9:34 AM Christian König > wrote: >> Am 02.04.20 um 04:34 schrieb Jann Horn: >> > [x86 folks in CC so that they can chime in on the precise rules for >> > this stuff] They are pretty simple. Any code using FPU needs to be completely isolated from regular code either by using inline asm or by moving it to a different compilation unit. The invocations need fpu_begin/end() of course. >> > I noticed that several makefiles under drivers/gpu/drm/amd/display/dc/ >> > turn on floating-point instructions in the compiler flags >> > (-mhard-float, -msse and -msse2) in order to make the "float" and >> > "double" types usable from C code without requiring helper functions. >> > >> > However, as far as I know, code running in normal kernel context isn't >> > allowed to use floating-point registers without special protection >> > using helpers like kernel_fpu_begin() and kernel_fpu_end() (which also >> > require that the protected code never blocks). If you violate that >> > rule, that can lead to various issues - among other things, I think >> > the kernel will clobber userspace FPU register state, and I think the >> > kernel code can blow up if a context switch happens at the wrong time, >> > since in-kernel task switches don't preserve FPU state. >> > >> > Is there some hidden trick I'm missing that makes it okay to use FPU >> > registers here? >> > >> > I would try testing this, but unfortunately none of the AMD devices I >> > have here have the appropriate graphics hardware... >> >> yes, using the floating point calculations in the display code has been >> a source of numerous problems and confusion in the past. >> >> The calls to kernel_fpu_begin() and kernel_fpu_end() are hidden behind >> the DC_FP_START() and DC_FP_END() macros which are supposed to hide the >> architecture depend handling for x86 and PPC64. > > Hmm... but as far as I can tell, you're using those macros from inside > functions that are already compiled with the FPU on: > > - drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c uses the macros, > but is already compiled with calcs_ccflags > - drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c uses the > macros, but is already compiled with "-mhard-float -msse -msse2" > - drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c uses the > macros, but is already compiled with "-mhard-float -msse -msse2" > > AFAIK as soon as you enter any function in any file compiled with FPU > instructions, you may encounter SSE instructions, e.g. via things like > compiler-generated memory-zeroing code - not just when you're actually > using doubles or floats. That's correct and this will silently cause FPU state corruption ... We really need objtool support to validate that. Peter, now that we know how to do it (noinstr, clac/stac) we can emit annotations (see patch below) and validate that any FPU instruction is inside a safe region. Hmm? Thanks, tglx 8<--- --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -19,8 +19,27 @@ * If you intend to use the FPU in softirq you need to check first with * irq_fpu_usable() if it is possible. */ -extern void kernel_fpu_begin(void); -extern void kernel_fpu_end(void); +extern void __kernel_fpu_begin(void); +extern void __kernel_fpu_end(void); + +static inline void kernel_fpu_begin(void) +{ + asm volatile("%c0:\n\t" +".pushsection .discard.fpu_begin\n\t" +".long %c0b - .\n\t" +".popsection\n\t" : : "i" (__COUNTER__)); + __kernel_fpu_begin(); +} + +static inline void kernel_fpu_end(void) +{ + __kernel_fpu_end(); + asm volatile("%c0:\n\t" +".pushsection .discard.fpu_end\n\t" +".long %c0b - .\n\t" +".popsection\n\t" : : "i" (__COUNTER__)); +} + extern bool irq_fpu_usable(void); extern void fpregs_mark_activate(void); --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -82,7 +82,7 @@ bool irq_fpu_usable(void) } EXPORT_SYMBOL(irq_fpu_usable); -void kernel_fpu_begin(void) +void __kernel_fpu_begin(void) { preempt_disable(); @@ -102,16 +102,16 @@ void kernel_fpu_begin(void) } __cpu_invalidate_fpregs_state(); } -EXPORT_SYMBOL_GPL(kernel_fpu_begin); +EXPORT_SYMBOL_GPL(__kernel_fpu_begin); -void kernel_fpu_end(void) +void __kernel_fpu_end(void) { WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); preempt_enable(); } -EXPORT_SYMBOL_GPL(kernel_fpu_end); +EXPORT_SYMBOL_GPL(__kernel_fpu_end); /* * Save the FPU state (mark it for reload if necessary): ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amd/powerplay: avoid using pm_en before it is initialized
hwmgr->pm_en is initialized at hwmgr_hw_init. during amdgpu_device_init, there is amdgpu_asic_reset that calls to pp_get_asic_baco_capability, while hwmgr->pm_en has not yet been initialized. so avoid using pm_en in pp_get_asic_baco_capability. Signed-off-by: Tiecheng Zhou Signed-off-by: Yintian Tao --- drivers/gpu/drm/amd/powerplay/amd_powerplay.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c index 71b843f542d8..fdff3e1c5e95 100644 --- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c +++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c @@ -1455,7 +1455,8 @@ static int pp_get_asic_baco_state(void *handle, int *state) if (!hwmgr) return -EINVAL; - if (!hwmgr->pm_en || !hwmgr->hwmgr_func->get_asic_baco_state) + if (!(hwmgr->not_vf && amdgpu_dpm) || + !hwmgr->hwmgr_func->get_asic_baco_state) return 0; mutex_lock(>smu_lock); -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amd/display: re-order asic declarations
"1382d6409891 drm/amd/display: Fix RV2 Variant Detection" introduces build error of: "use of undeclared identifier 'RENOIR_A0'" To fix the same, this patch re-orders the ASIC declarations accordingly. Signed-off-by: Shirish S --- drivers/gpu/drm/amd/display/include/dal_asic_id.h | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/include/dal_asic_id.h b/drivers/gpu/drm/amd/display/include/dal_asic_id.h index 8a87d0ed90ae..2359e88d6029 100644 --- a/drivers/gpu/drm/amd/display/include/dal_asic_id.h +++ b/drivers/gpu/drm/amd/display/include/dal_asic_id.h @@ -136,6 +136,7 @@ #define RAVEN2_A0 0x81 #define RAVEN1_F0 0xF0 #define RAVEN_UNKNOWN 0xFF +#define RENOIR_A0 0x91 #ifndef ASICREV_IS_RAVEN #define ASICREV_IS_RAVEN(eChipRev) ((eChipRev >= RAVEN_A0) && eChipRev < RAVEN_UNKNOWN) #endif @@ -171,8 +172,6 @@ enum { #define ASICREV_IS_NAVI10_P(eChipRev)(eChipRev < NV_NAVI12_P_A0) #define ASICREV_IS_NAVI12_P(eChipRev)((eChipRev >= NV_NAVI12_P_A0) && (eChipRev < NV_NAVI14_M_A0)) #define ASICREV_IS_NAVI14_M(eChipRev)((eChipRev >= NV_NAVI14_M_A0) && (eChipRev < NV_UNKNOWN)) -#define RENOIR_A0 0x91 -#define DEVICE_ID_RENOIR_1636 0x1636 // Renoir #define ASICREV_IS_RENOIR(eChipRev) ((eChipRev >= RENOIR_A0) && (eChipRev < RAVEN1_F0)) /* @@ -183,6 +182,9 @@ enum { #define DEVICE_ID_TEMASH_9839 0x9839 #define DEVICE_ID_TEMASH_983D 0x983D +/* RENOIR */ +#define DEVICE_ID_RENOIR_1636 0x1636 + /* Asic Family IDs for different asic family. */ #define FAMILY_CI 120 /* Sea Islands: Hawaii (P), Bonaire (M) */ #define FAMILY_KV 125 /* Fusion => Kaveri: Spectre, Spooky; Kabini: Kalindi */ -- 2.17.1 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection
Hi Jann, Am 02.04.20 um 04:34 schrieb Jann Horn: [x86 folks in CC so that they can chime in on the precise rules for this stuff] Hi! I noticed that several makefiles under drivers/gpu/drm/amd/display/dc/ turn on floating-point instructions in the compiler flags (-mhard-float, -msse and -msse2) in order to make the "float" and "double" types usable from C code without requiring helper functions. However, as far as I know, code running in normal kernel context isn't allowed to use floating-point registers without special protection using helpers like kernel_fpu_begin() and kernel_fpu_end() (which also require that the protected code never blocks). If you violate that rule, that can lead to various issues - among other things, I think the kernel will clobber userspace FPU register state, and I think the kernel code can blow up if a context switch happens at the wrong time, since in-kernel task switches don't preserve FPU state. Is there some hidden trick I'm missing that makes it okay to use FPU registers here? I would try testing this, but unfortunately none of the AMD devices I have here have the appropriate graphics hardware... yes, using the floating point calculations in the display code has been a source of numerous problems and confusion in the past. The calls to kernel_fpu_begin() and kernel_fpu_end() are hidden behind the DC_FP_START() and DC_FP_END() macros which are supposed to hide the architecture depend handling for x86 and PPC64. This originated from the graphics block integrated into AMD CPU (where we knew which fp unit we had), but as far as I know is now also used for dedicated AMD GPUs as well. I'm not really a fan of this either, but so far we weren't able to convince the hardware engineers to not use floating point calculations for the display stuff. Regards, Christian. ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset
-Original Message- From: Jack Zhang Sent: Thursday, April 2, 2020 3:20 PM To: amd-gfx@lists.freedesktop.org Cc: Zhang, Jack (Jian) Subject: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate Without this change, sriov tdr code path will never free those allocated memories and get memory leak. Signed-off-by: Jack Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8faaa17..832daf7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3847,6 +3847,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, { int r; + amdgpu_amdkfd_pre_reset(adev); + if (from_hypervisor) r = amdgpu_virt_request_full_gpu(adev, true); else -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
[PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset
kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate Without this change, sriov tdr code path will never free those allocated memories and get memory leak. Signed-off-by: Jack Zhang --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8faaa17..832daf7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3847,6 +3847,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, { int r; + amdgpu_amdkfd_pre_reset(adev); + if (from_hypervisor) r = amdgpu_virt_request_full_gpu(adev, true); else -- 2.7.4 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
RE: [PATCH] drm/amdgpu/powerplay: using the FCLK DPM table to set the MCLK
[AMD Official Use Only - Internal Distribution Only] On Wed, Apr 01, 2020 at 07:41:12PM +0800, Yuxian Dai wrote: > 1.Using the FCLK DPM table to set the MCLK for DPM states consist of > three entities: > FCLK > UCLK > MEMCLK > All these three clk change together, MEMCLK from FCLK, so use the fclk > frequency. > 2.we should show the current working clock freqency from clock table > metric > > Signed-off-by: Yuxian Dai > Reviewed-by: Alex Deucher > Reviewed-by: Huang Rui > Reviewed-by: Kevin Wang > --- Next time, if you submit the V2 patch, you can generate it as below command: git format-patch --subject-prefix="PATCH v2" And describe the changes from v1 -> v2 in the commit message. This will help everyone to understand your change. > I got it. Reviewed-by: Huang Rui > drivers/gpu/drm/amd/powerplay/renoir_ppt.c | 6 ++ > drivers/gpu/drm/amd/powerplay/renoir_ppt.h | 2 +- > 2 files changed, 7 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c > b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c > index 7bf52ecba01d..c6b39a7026a8 100644 > --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c > +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c > @@ -239,6 +239,7 @@ static int renoir_print_clk_levels(struct smu_context > *smu, > uint32_t cur_value = 0, value = 0, count = 0, min = 0, max = 0; > DpmClocks_t *clk_table = smu->smu_table.clocks_table; > SmuMetrics_t metrics; > + bool cur_value_match_level = false; > > if (!clk_table || clk_type >= SMU_CLK_COUNT) > return -EINVAL; > @@ -297,8 +298,13 @@ static int renoir_print_clk_levels(struct smu_context > *smu, > GET_DPM_CUR_FREQ(clk_table, clk_type, i, value); > size += sprintf(buf + size, "%d: %uMhz %s\n", i, value, > cur_value == value ? "*" : ""); > + if (cur_value == value) > + cur_value_match_level = true; > } > > + if (!cur_value_match_level) > + size += sprintf(buf + size, " %uMhz *\n", cur_value); > + > return size; > } > > diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h > b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h > index 2a390ddd37dd..89cd6da118a3 100644 > --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h > +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h > @@ -37,7 +37,7 @@ extern void renoir_set_ppt_funcs(struct smu_context *smu); > freq = table->SocClocks[dpm_level].Freq;\ > break; \ > case SMU_MCLK: \ > - freq = table->MemClocks[dpm_level].Freq;\ > + freq = table->FClocks[dpm_level].Freq; \ > break; \ > case SMU_DCEFCLK: \ > freq = table->DcfClocks[dpm_level].Freq;\ > -- > 2.17.1 > > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist > s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cra > y.huang%40amd.com%7Cfa81baf94d2c409e4f0308d7d6319a8f%7C3dd8961fe4884e6 > 08e11a82d994e183d%7C0%7C0%7C637213381443101089sdata=LF2pLZj%2Fq0C > wiMSfvDiofXFKuGVfgje7o4Iti%2FtoNj0%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx
Re: [PATCH] drm/amdgpu/powerplay: using the FCLK DPM table to set the MCLK
On Wed, Apr 01, 2020 at 07:41:12PM +0800, Yuxian Dai wrote: > 1.Using the FCLK DPM table to set the MCLK for DPM states consist of > three entities: > FCLK > UCLK > MEMCLK > All these three clk change together, MEMCLK from FCLK, so use the fclk > frequency. > 2.we should show the current working clock freqency from clock table metric > > Signed-off-by: Yuxian Dai > Reviewed-by: Alex Deucher > Reviewed-by: Huang Rui > Reviewed-by: Kevin Wang > --- Next time, if you submit the V2 patch, you can generate it as below command: git format-patch --subject-prefix="PATCH v2" And describe the changes from v1 -> v2 in the commit message. This will help everyone to understand your change. Reviewed-by: Huang Rui > drivers/gpu/drm/amd/powerplay/renoir_ppt.c | 6 ++ > drivers/gpu/drm/amd/powerplay/renoir_ppt.h | 2 +- > 2 files changed, 7 insertions(+), 1 deletion(-) > > diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c > b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c > index 7bf52ecba01d..c6b39a7026a8 100644 > --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c > +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c > @@ -239,6 +239,7 @@ static int renoir_print_clk_levels(struct smu_context > *smu, > uint32_t cur_value = 0, value = 0, count = 0, min = 0, max = 0; > DpmClocks_t *clk_table = smu->smu_table.clocks_table; > SmuMetrics_t metrics; > + bool cur_value_match_level = false; > > if (!clk_table || clk_type >= SMU_CLK_COUNT) > return -EINVAL; > @@ -297,8 +298,13 @@ static int renoir_print_clk_levels(struct smu_context > *smu, > GET_DPM_CUR_FREQ(clk_table, clk_type, i, value); > size += sprintf(buf + size, "%d: %uMhz %s\n", i, value, > cur_value == value ? "*" : ""); > + if (cur_value == value) > + cur_value_match_level = true; > } > > + if (!cur_value_match_level) > + size += sprintf(buf + size, " %uMhz *\n", cur_value); > + > return size; > } > > diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h > b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h > index 2a390ddd37dd..89cd6da118a3 100644 > --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h > +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h > @@ -37,7 +37,7 @@ extern void renoir_set_ppt_funcs(struct smu_context *smu); > freq = table->SocClocks[dpm_level].Freq;\ > break; \ > case SMU_MCLK: \ > - freq = table->MemClocks[dpm_level].Freq;\ > + freq = table->FClocks[dpm_level].Freq; \ > break; \ > case SMU_DCEFCLK: \ > freq = table->DcfClocks[dpm_level].Freq;\ > -- > 2.17.1 > > ___ > amd-gfx mailing list > amd-gfx@lists.freedesktop.org > https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cray.huang%40amd.com%7Cfa81baf94d2c409e4f0308d7d6319a8f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637213381443101089sdata=LF2pLZj%2Fq0CwiMSfvDiofXFKuGVfgje7o4Iti%2FtoNj0%3Dreserved=0 ___ amd-gfx mailing list amd-gfx@lists.freedesktop.org https://lists.freedesktop.org/mailman/listinfo/amd-gfx