[PATCH] drm/amdgpu: add SPM golden settings for Navi10

2020-04-02 Thread Tianci Yin
From: "Tianci.Yin" 

Add RLC_SPM golden settings

Change-Id: I616e127171293d915cb3a05dee02f51cec8d8f6f
Signed-off-by: Tianci.Yin 
---
 drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c|9 +
 .../gpu/drm/amd/amdgpu/golden_gc_spm_10_1_0.h | 1058 +
 2 files changed, 1067 insertions(+)
 create mode 100644 drivers/gpu/drm/amd/amdgpu/golden_gc_spm_10_1_0.h

diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index 70edbbf84338..7c96a894ad37 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -48,6 +48,7 @@
 #include "v10_structs.h"
 #include "gfx_v10_0.h"
 #include "nbio_v2_3.h"
+#include "golden_gc_spm_10_1_0.h"
 
 /**
  * Navi10 has two graphic rings to share each graphic pipe.
@@ -138,6 +139,11 @@ static const struct soc15_reg_golden 
golden_settings_gc_10_0_nv10[] =
/* Pending on emulation bring up */
 };
 
+static const struct soc15_reg_golden golden_settings_gc_rlc_spm_10_0_nv10[] =
+{
+   GOLDEN_GC_SPM_10_1_0
+};
+
 static const struct soc15_reg_golden golden_settings_gc_10_1_1[] =
 {
SOC15_REG_GOLDEN_VALUE(GC, 0, mmCB_HW_CONTROL_4, 0x, 
0x003c0014),
@@ -388,6 +394,9 @@ static void gfx_v10_0_init_golden_registers(struct 
amdgpu_device *adev)
soc15_program_register_sequence(adev,
golden_settings_gc_10_0_nv10,
(const 
u32)ARRAY_SIZE(golden_settings_gc_10_0_nv10));
+   soc15_program_register_sequence(adev,
+   
golden_settings_gc_rlc_spm_10_0_nv10,
+   (const 
u32)ARRAY_SIZE(golden_settings_gc_rlc_spm_10_0_nv10));
break;
case CHIP_NAVI14:
soc15_program_register_sequence(adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/golden_gc_spm_10_1_0.h 
b/drivers/gpu/drm/amd/amdgpu/golden_gc_spm_10_1_0.h
new file mode 100644
index ..e65af4a6fcdd
--- /dev/null
+++ b/drivers/gpu/drm/amd/amdgpu/golden_gc_spm_10_1_0.h
@@ -0,0 +1,1058 @@
+#ifndef __GOLDEN_GC_SPM_10_1_0_H__
+#define __GOLDEN_GC_SPM_10_1_0_H__
+
+#define GOLDEN_GC_SPM_10_1_0 \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xe000, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 
0x28), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 
0x9), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_SE_SAMPLEDELAY_IND_ADDR, 0x, 
0x8), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_SE_SAMPLEDELAY_IND_DATA, 0x, 
0x1b), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x1), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_SE_SAMPLEDELAY_IND_ADDR, 0x, 
0x8), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x1), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_SE_SAMPLEDELAY_IND_DATA, 0x, 
0x1b), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 
0x20), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 
0xe), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 
0xc8), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 
0xf), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 
0xcc), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 
0xf), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 
0xd0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 
0xf), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 
0xd4), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_DATA, 0x, 
0xf), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmGRBM_GFX_INDEX, 0xff, 0x0), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, mmRLC_SPM_GLB_SAMPLEDELAY_IND_ADDR, 0x, 
0x24), \
+SOC15_REG_GOLDEN_VALUE(GC, 0, 

Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection

2020-04-02 Thread Masami Hiramatsu
On Thu, 2 Apr 2020 16:13:08 +0200
Peter Zijlstra  wrote:

> On Thu, Apr 02, 2020 at 09:33:54AM +0200, Christian König wrote:
> > Hi Jann,
> > 
> > Am 02.04.20 um 04:34 schrieb Jann Horn:
> > > [x86 folks in CC so that they can chime in on the precise rules for this 
> > > stuff]
> > > 
> > > Hi!
> > > 
> > > I noticed that several makefiles under drivers/gpu/drm/amd/display/dc/
> > > turn on floating-point instructions in the compiler flags
> > > (-mhard-float, -msse and -msse2) in order to make the "float" and
> > > "double" types usable from C code without requiring helper functions.
> > > 
> > > However, as far as I know, code running in normal kernel context isn't
> > > allowed to use floating-point registers without special protection
> > > using helpers like kernel_fpu_begin() and kernel_fpu_end() (which also
> > > require that the protected code never blocks). If you violate that
> > > rule, that can lead to various issues - among other things, I think
> > > the kernel will clobber userspace FPU register state, and I think the
> > > kernel code can blow up if a context switch happens at the wrong time,
> > > since in-kernel task switches don't preserve FPU state.
> > > 
> > > Is there some hidden trick I'm missing that makes it okay to use FPU
> > > registers here?
> > > 
> > > I would try testing this, but unfortunately none of the AMD devices I
> > > have here have the appropriate graphics hardware...
> > 
> > yes, using the floating point calculations in the display code has been a
> > source of numerous problems and confusion in the past.
> > 
> > The calls to kernel_fpu_begin() and kernel_fpu_end() are hidden behind the
> > DC_FP_START() and DC_FP_END() macros which are supposed to hide the
> > architecture depend handling for x86 and PPC64.
> > 
> > This originated from the graphics block integrated into AMD CPU (where we
> > knew which fp unit we had), but as far as I know is now also used for
> > dedicated AMD GPUs as well.
> > 
> > I'm not really a fan of this either, but so far we weren't able to convince
> > the hardware engineers to not use floating point calculations for the
> > display stuff.
> 
> Might I complain that:
> 
>   make O=allmodconfig-build drivers/gpu/drm/amd/display/dc/
> 
> does not in fact work?
> 
> Anyway, how do people feel about something like the below?
> 
> Masami, Boris, is there any semi-sane way we can have insn_is_fpu() ?
> While digging through various opcode manuals is of course forever fun, I
> do feel like it might not be the best way.

Yes, it is possible to add INAT_FPU and insn_is_fpu().
But it seems that the below patch needs more classification based on
nmemonic or opcodes.

IMHO, it is the time to expand gen-insn-attr.awk or clone it to
generate another opcode map, so that user will easily extend the
insn infrastructure.
(e.g. I had made an in-kernel disassembler, which generates a mnemonic
 maps from x86-opcode-map.txt)
 https://github.com/mhiramat/linux/commits/inkernel-disasm-20130414

Thank you,

> 
> ---
>  arch/x86/include/asm/fpu/api.h  |  7 
>  arch/x86/include/asm/fpu/internal.h | 11 ++
>  arch/x86/kernel/fpu/init.c  |  5 +++
>  tools/objtool/arch.h|  1 +
>  tools/objtool/arch/x86/decode.c | 71 
> ++---
>  tools/objtool/check.c   | 58 ++
>  tools/objtool/check.h   |  3 +-
>  7 files changed, 134 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
> index b774c52e5411..b9bca1cdc875 100644
> --- a/arch/x86/include/asm/fpu/api.h
> +++ b/arch/x86/include/asm/fpu/api.h
> @@ -12,6 +12,13 @@
>  #define _ASM_X86_FPU_API_H
>  #include 
>  
> +#define annotate_fpu() ({\
> + asm volatile("%c0:\n\t" \
> +  ".pushsection .discard.fpu_safe\n\t"   \
> +  ".long %c0b - .\n\t"   \
> +  ".popsection\n\t" : : "i" (__COUNTER__));  \
> +})
> +
>  /*
>   * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
>   * disables preemption so be careful if you intend to use it for long periods
> diff --git a/arch/x86/include/asm/fpu/internal.h 
> b/arch/x86/include/asm/fpu/internal.h
> index 44c48e34d799..bc436213a0d4 100644
> --- a/arch/x86/include/asm/fpu/internal.h
> +++ b/arch/x86/include/asm/fpu/internal.h
> @@ -102,6 +102,11 @@ static inline void fpstate_init_fxstate(struct 
> fxregs_state *fx)
>  }
>  extern void fpstate_sanitize_xstate(struct fpu *fpu);
>  
> +#define _ASM_ANNOTATE_FPU(at)
> \
> +  ".pushsection .discard.fpu_safe\n" \
> +  ".long " #at " - .\n"  \
> +  ".popsection\n"\
> +
>  #define 

RE: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset

2020-04-02 Thread Zhang, Jack (Jian)
Thanks Monk,

I just updated the patch and it could passed 1000 rounds TDR test.

Sent out an review email.

Regards,
Jack
-Original Message-
From: Liu, Monk  
Sent: Friday, April 3, 2020 11:38 AM
To: Kuehling, Felix ; Zhang, Jack (Jian) 
; amd-gfx@lists.freedesktop.org
Subject: RE: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset

Thanks Felix

Hi Jack

I think below changes can resolve your problem , we had this on our customer 
branch already, it fix the memory leak, and also fix my previous bug .
Can you make this change applied to gfx_v10/v9 ? thanks !

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 29749502..532258445 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -543,6 +543,8 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);

+   if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset)
+   return 0;
 #if 0
unsigned long flags;
int retry;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 35b32ad..f6479e1 100755
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3653,6 +3653,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
if (r)
return r;

+   amdgpu_amdkfd_pre_reset(adev);
+
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
if (r)

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Kuehling, Felix 
Sent: Friday, April 3, 2020 1:26 AM
To: Zhang, Jack (Jian) ; amd-gfx@lists.freedesktop.org; 
Liu, Monk 
Subject: Re: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset

[+Monk]

This looks reasonable to me. However, you're effectively reverting this commit 
by Monk:

a03eb637d2a5 drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV

In hind-sight, Monk's commit was broken. Removing the call to pre_reset has 
other consequences, such as breaking notifications about reset to user mode, 
and probably invalidating some assumptions in kfd_post_reset.
Can you coordinate with Monk to work out why his change was needed, and whether 
you'll need a different solution for the problem he was trying to address?

In the meanwhile, this patch is

Acked-by: Felix Kuehling 


Am 2020-04-02 um 3:20 a.m. schrieb Jack Zhang:

> kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate
>
> Without this change, sriov tdr code path will never free those 
> allocated memories and get memory leak.
>
> Signed-off-by: Jack Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 8faaa17..832daf7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3847,6 +3847,8 @@ static int amdgpu_device_reset_sriov(struct 
> amdgpu_device *adev,  {
>   int r;
>  
> + amdgpu_amdkfd_pre_reset(adev);
> +
>   if (from_hypervisor)
>   r = amdgpu_virt_request_full_gpu(adev, true);
>   else
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset

2020-04-02 Thread Jack Zhang
kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate

Without this change, sriov tdr code path will never free those allocated
memories and get memory leak.

v2:add a bugfix for kiq ring test fail

Signed-off-by: Jack Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c | 3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c  | 3 +++
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
 3 files changed, 8 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 4ec6d0c..bdc1f5a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -543,6 +543,9 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);
 
+   if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset)
+   return 0;
+
 #if 0
unsigned long flags;
int retry;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index df841c2..c2562d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -541,6 +541,9 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v9_mqd *m = get_mqd(mqd);
 
+   if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset)
+   return 0;
+
if (adev->in_gpu_reset)
return -EIO;
 
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8faaa17..e3f7441 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3854,6 +3854,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
if (r)
return r;
 
+   amdgpu_amdkfd_pre_reset(adev);
+
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
if (r)
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amd/powerplay: avoid using pm_en before it is initialized

2020-04-02 Thread Zhou, Tiecheng
[AMD Official Use Only - Internal Distribution Only]

Ping...

-Original Message-
From: Tiecheng Zhou  
Sent: Thursday, April 2, 2020 5:29 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhou, Tiecheng ; Tao, Yintian 
Subject: [PATCH] drm/amd/powerplay: avoid using pm_en before it is initialized

hwmgr->pm_en is initialized at hwmgr_hw_init.
during amdgpu_device_init, there is amdgpu_asic_reset that calls to 
pp_get_asic_baco_capability, while hwmgr->pm_en has not yet been initialized.

so avoid using pm_en in pp_get_asic_baco_capability.

Signed-off-by: Tiecheng Zhou 
Signed-off-by: Yintian Tao 
---
 drivers/gpu/drm/amd/powerplay/amd_powerplay.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c 
b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
index 71b843f542d8..fdff3e1c5e95 100644
--- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
@@ -1455,7 +1455,8 @@ static int pp_get_asic_baco_state(void *handle, int 
*state)
if (!hwmgr)
return -EINVAL;
 
-   if (!hwmgr->pm_en || !hwmgr->hwmgr_func->get_asic_baco_state)
+   if (!(hwmgr->not_vf && amdgpu_dpm) ||
+   !hwmgr->hwmgr_func->get_asic_baco_state)
return 0;
 
mutex_lock(>smu_lock);
--
2.17.1
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu: fix gfx hang during suspend with video playback

2020-04-02 Thread Prike Liang
The system will be hang up during S3 as SMU is pending at GC not
respose the register CP_HQD_ACTIVE access request and this issue
can be fixed by adding RLC safe mode guard before each HQD
map/unmap retrive opt.

Signed-off-by: Prike Liang 
Tested-by: Mengbing Wang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c | 6 ++
 drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 
 2 files changed, 10 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
index df841c2..e265063 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v9.c
@@ -232,6 +232,7 @@ int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, 
uint32_t pipe_id,
uint32_t *mqd_hqd;
uint32_t reg, hqd_base, data;
 
+   amdgpu_gfx_rlc_enter_safe_mode(adev);
m = get_mqd(mqd);
 
acquire_queue(kgd, pipe_id, queue_id);
@@ -299,6 +300,7 @@ int kgd_gfx_v9_hqd_load(struct kgd_dev *kgd, void *mqd, 
uint32_t pipe_id,
 
release_queue(kgd);
 
+   amdgpu_gfx_rlc_exit_safe_mode(adev);
return 0;
 }
 
@@ -497,6 +499,7 @@ bool kgd_gfx_v9_hqd_is_occupied(struct kgd_dev *kgd, 
uint64_t queue_address,
bool retval = false;
uint32_t low, high;
 
+   amdgpu_gfx_rlc_enter_safe_mode(adev);
acquire_queue(kgd, pipe_id, queue_id);
act = RREG32(SOC15_REG_OFFSET(GC, 0, mmCP_HQD_ACTIVE));
if (act) {
@@ -508,6 +511,7 @@ bool kgd_gfx_v9_hqd_is_occupied(struct kgd_dev *kgd, 
uint64_t queue_address,
retval = true;
}
release_queue(kgd);
+   amdgpu_gfx_rlc_exit_safe_mode(adev);
return retval;
 }
 
@@ -541,6 +545,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v9_mqd *m = get_mqd(mqd);
 
+   amdgpu_gfx_rlc_enter_safe_mode(adev);
if (adev->in_gpu_reset)
return -EIO;
 
@@ -577,6 +582,7 @@ int kgd_gfx_v9_hqd_destroy(struct kgd_dev *kgd, void *mqd,
}
 
release_queue(kgd);
+   amdgpu_gfx_rlc_exit_safe_mode(adev);
return 0;
 }
 
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
index 1fea077..ee107d9 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
@@ -3533,6 +3533,7 @@ static int gfx_v9_0_kiq_init_register(struct amdgpu_ring 
*ring)
struct v9_mqd *mqd = ring->mqd_ptr;
int j;
 
+   amdgpu_gfx_rlc_enter_safe_mode(adev);
/* disable wptr polling */
WREG32_FIELD15(GC, 0, CP_PQ_WPTR_POLL_CNTL, EN, 0);
 
@@ -3629,6 +3630,7 @@ static int gfx_v9_0_kiq_init_register(struct amdgpu_ring 
*ring)
if (ring->use_doorbell)
WREG32_FIELD15(GC, 0, CP_PQ_STATUS, DOORBELL_ENABLE, 1);
 
+   amdgpu_gfx_rlc_exit_safe_mode(adev);
return 0;
 }
 
@@ -3637,6 +3639,7 @@ static int gfx_v9_0_kiq_fini_register(struct amdgpu_ring 
*ring)
struct amdgpu_device *adev = ring->adev;
int j;
 
+   amdgpu_gfx_rlc_enter_safe_mode(adev);
/* disable the queue if it's active */
if (RREG32_SOC15(GC, 0, mmCP_HQD_ACTIVE) & 1) {
 
@@ -3668,6 +3671,7 @@ static int gfx_v9_0_kiq_fini_register(struct amdgpu_ring 
*ring)
WREG32_SOC15_RLC(GC, 0, mmCP_HQD_PQ_WPTR_HI, 0);
WREG32_SOC15_RLC(GC, 0, mmCP_HQD_PQ_WPTR_LO, 0);
 
+   amdgpu_gfx_rlc_exit_safe_mode(adev);
return 0;
 }
 
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset

2020-04-02 Thread Liu, Monk
Thanks Felix

Hi Jack

I think below changes can resolve your problem , we had this on our customer 
branch already, it fix the memory leak, and also fix my previous bug .
Can you make this change applied to gfx_v10/v9 ? thanks !

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
index 29749502..532258445 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gfx_v10.c
@@ -543,6 +543,8 @@ static int kgd_hqd_destroy(struct kgd_dev *kgd, void *mqd,
uint32_t temp;
struct v10_compute_mqd *m = get_mqd(mqd);

+   if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset)
+   return 0;
 #if 0
unsigned long flags;
int retry;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 35b32ad..f6479e1 100755
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3653,6 +3653,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
if (r)
return r;

+   amdgpu_amdkfd_pre_reset(adev);
+
/* Resume IP prior to SMC */
r = amdgpu_device_ip_reinit_early_sriov(adev);
if (r)

_
Monk Liu|GPU Virtualization Team |AMD


-Original Message-
From: Kuehling, Felix  
Sent: Friday, April 3, 2020 1:26 AM
To: Zhang, Jack (Jian) ; amd-gfx@lists.freedesktop.org; 
Liu, Monk 
Subject: Re: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset

[+Monk]

This looks reasonable to me. However, you're effectively reverting this commit 
by Monk:

a03eb637d2a5 drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV

In hind-sight, Monk's commit was broken. Removing the call to pre_reset has 
other consequences, such as breaking notifications about reset to user mode, 
and probably invalidating some assumptions in kfd_post_reset.
Can you coordinate with Monk to work out why his change was needed, and whether 
you'll need a different solution for the problem he was trying to address?

In the meanwhile, this patch is

Acked-by: Felix Kuehling 


Am 2020-04-02 um 3:20 a.m. schrieb Jack Zhang:

> kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate
>
> Without this change, sriov tdr code path will never free those 
> allocated memories and get memory leak.
>
> Signed-off-by: Jack Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 8faaa17..832daf7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3847,6 +3847,8 @@ static int amdgpu_device_reset_sriov(struct 
> amdgpu_device *adev,  {
>   int r;
>  
> + amdgpu_amdkfd_pre_reset(adev);
> +
>   if (from_hypervisor)
>   r = amdgpu_virt_request_full_gpu(adev, true);
>   else
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amd/powerplay: determine pm_en at amd_powerplay_create

2020-04-02 Thread Quan, Evan
Reviewed-by: Evan Quan 

-Original Message-
From: amd-gfx  On Behalf Of Tiecheng Zhou
Sent: Thursday, April 2, 2020 1:40 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhou, Tiecheng 
Subject: [PATCH] drm/amd/powerplay: determine pm_en at amd_powerplay_create

Need to determine pm_en at amd_powerplay_create of early_init stage.

Signed-off-by: Tiecheng Zhou 
---
 drivers/gpu/drm/amd/powerplay/amd_powerplay.c | 3 +++
 drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c   | 3 ---
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c 
b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
index 71b843f542d8..a37dc37dfe49 100644
--- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
@@ -48,6 +48,9 @@ static int amd_powerplay_create(struct amdgpu_device *adev)
 
hwmgr->adev = adev;
hwmgr->not_vf = !amdgpu_sriov_vf(adev);
+   hwmgr->pp_one_vf = amdgpu_sriov_is_pp_one_vf(adev);
+   hwmgr->pm_en = (amdgpu_dpm && (hwmgr->not_vf || hwmgr->pp_one_vf))
+   ? true : false;
hwmgr->device = amdgpu_cgs_create_device(adev);
mutex_init(>smu_lock);
mutex_init(>msg_lock);
diff --git a/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c 
b/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c
index f48fdc7f0382..7aee382fc1f9 100644
--- a/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c
+++ b/drivers/gpu/drm/amd/powerplay/hwmgr/hwmgr.c
@@ -221,9 +221,6 @@ int hwmgr_hw_init(struct pp_hwmgr *hwmgr)  {
int ret = 0;
 
-   hwmgr->pp_one_vf = amdgpu_sriov_is_pp_one_vf((struct amdgpu_device 
*)hwmgr->adev);
-   hwmgr->pm_en = (amdgpu_dpm && (hwmgr->not_vf || hwmgr->pp_one_vf))
-   ? true : false;
if (!hwmgr->pm_en)
return 0;
 
--
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cevan.quan%40amd.com%7C879eafc288894347147c08d7d6c845b0%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637214028875355960sdata=35oaGAN8RHzU4biNt7wWUZpskOehIQ0h%2Fao5HiVduBM%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH v2] drm/amdkfd: Provide SMI events watch

2020-04-02 Thread Felix Kuehling

On 2020-04-02 4:46 p.m., Amber Lin wrote:

When the compute is malfunctioning or performance drops, the system admin
will use SMI (System Management Interface) tool to monitor/diagnostic what
went wrong. This patch provides an event watch interface for the user
space to register events they are interested. After the event is
registered, the user can use annoymous file descriptor's poll function
with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
 - correct kfifo usage
 - move event message API to kfd_ioctl.h

Signed-off-by: Amber Lin 
---
  drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
  drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
  drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
  drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  12 ++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 177 +++
  drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  31 
  include/uapi/linux/kfd_ioctl.h   |  30 +++-
  9 files changed, 286 insertions(+), 2 deletions(-)
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
  create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o
  
  ifneq ($(CONFIG_AMD_IOMMU_V2),)

  AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
  #include "kfd_events.h"
  #include "cik_int.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static bool cik_event_interrupt_isr(struct kfd_dev *dev,

const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
  
+		kfd_smi_event_update_vmfault(dev, pasid);

kfd_process_vm_fault(dev->dqm, pasid);
  
  		memset(, 0, sizeof(info));

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
  #include "kfd_device_queue_manager.h"
  #include "kfd_dbgmgr.h"
  #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
  
  static long kfd_ioctl(struct file *, unsigned int, unsigned long);

  static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
  }
  
+/* Handle requests for watching SMI events */

+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   /* register the device */
+   return kfd_smi_event_register(dev, >data);
+   case KFD_SMI_EVENTS_ENABLE:
+   /* subscribe events to the device */
+   return kfd_smi_event_enable(dev, args->events);
+   case KFD_SMI_EVENTS_DISABLE:
+   /* unsubscribe events */
+   return kfd_smi_event_disable(dev, args->events);
+   }
+
+   return -EINVAL;
+}
+
  bool kfd_dev_is_large_bar(struct kfd_dev *dev)
  {
struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
  
  	AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,

kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
  };
  
  #define AMDKFD_CORE_IOCTL_COUNT	ARRAY_SIZE(amdkfd_ioctls)

diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 

[PATCH v2] drm/amdkfd: Provide SMI events watch

2020-04-02 Thread Amber Lin
When the compute is malfunctioning or performance drops, the system admin
will use SMI (System Management Interface) tool to monitor/diagnostic what
went wrong. This patch provides an event watch interface for the user
space to register events they are interested. After the event is
registered, the user can use annoymous file descriptor's poll function
with wait-time specified to wait for the event to happen. Once the event
happens, the user can use read() to retrieve information related to the
event.

VM fault event is done in this patch.

v2: - remove UNREGISTER and add event ENABLE/DISABLE
- correct kfifo usage
- move event message API to kfd_ioctl.h

Signed-off-by: Amber Lin 
---
 drivers/gpu/drm/amd/amdkfd/Makefile  |   3 +-
 drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_chardev.c |  30 
 drivers/gpu/drm/amd/amdkfd/kfd_device.c  |   1 +
 drivers/gpu/drm/amd/amdkfd/kfd_int_process_v9.c  |   2 +
 drivers/gpu/drm/amd/amdkfd/kfd_priv.h|  12 ++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c  | 177 +++
 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h  |  31 
 include/uapi/linux/kfd_ioctl.h   |  30 +++-
 9 files changed, 286 insertions(+), 2 deletions(-)
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.c
 create mode 100644 drivers/gpu/drm/amd/amdkfd/kfd_smi_events.h

diff --git a/drivers/gpu/drm/amd/amdkfd/Makefile 
b/drivers/gpu/drm/amd/amdkfd/Makefile
index 6147462..cc98b4a 100644
--- a/drivers/gpu/drm/amd/amdkfd/Makefile
+++ b/drivers/gpu/drm/amd/amdkfd/Makefile
@@ -53,7 +53,8 @@ AMDKFD_FILES  := $(AMDKFD_PATH)/kfd_module.o \
$(AMDKFD_PATH)/kfd_int_process_v9.o \
$(AMDKFD_PATH)/kfd_dbgdev.o \
$(AMDKFD_PATH)/kfd_dbgmgr.o \
-   $(AMDKFD_PATH)/kfd_crat.o
+   $(AMDKFD_PATH)/kfd_crat.o \
+   $(AMDKFD_PATH)/kfd_smi_events.o
 
 ifneq ($(CONFIG_AMD_IOMMU_V2),)
 AMDKFD_FILES += $(AMDKFD_PATH)/kfd_iommu.o
diff --git a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c 
b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
index 9f59ba9..24b4717 100644
--- a/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
+++ b/drivers/gpu/drm/amd/amdkfd/cik_event_interrupt.c
@@ -24,6 +24,7 @@
 #include "kfd_events.h"
 #include "cik_int.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static bool cik_event_interrupt_isr(struct kfd_dev *dev,
const uint32_t *ih_ring_entry,
@@ -107,6 +108,7 @@ static void cik_event_interrupt_wq(struct kfd_dev *dev,
ihre->source_id == CIK_INTSRC_GFX_MEM_PROT_FAULT) {
struct kfd_vm_fault_info info;
 
+   kfd_smi_event_update_vmfault(dev, pasid);
kfd_process_vm_fault(dev->dqm, pasid);
 
memset(, 0, sizeof(info));
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index f8fa03a..591ac28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -39,6 +39,7 @@
 #include "kfd_device_queue_manager.h"
 #include "kfd_dbgmgr.h"
 #include "amdgpu_amdkfd.h"
+#include "kfd_smi_events.h"
 
 static long kfd_ioctl(struct file *, unsigned int, unsigned long);
 static int kfd_open(struct inode *, struct file *);
@@ -1243,6 +1244,32 @@ static int kfd_ioctl_acquire_vm(struct file *filep, 
struct kfd_process *p,
return ret;
 }
 
+/* Handle requests for watching SMI events */
+static int kfd_ioctl_smi_events(struct file *filep,
+   struct kfd_process *p, void *data)
+{
+   struct kfd_ioctl_smi_events_args *args = data;
+   struct kfd_dev *dev;
+
+   dev = kfd_device_by_id(args->gpu_id);
+   if (!dev)
+   return -EINVAL;
+
+   switch (args->op) {
+   case KFD_SMI_EVENTS_REGISTER:
+   /* register the device */
+   return kfd_smi_event_register(dev, >data);
+   case KFD_SMI_EVENTS_ENABLE:
+   /* subscribe events to the device */
+   return kfd_smi_event_enable(dev, args->events);
+   case KFD_SMI_EVENTS_DISABLE:
+   /* unsubscribe events */
+   return kfd_smi_event_disable(dev, args->events);
+   }
+
+   return -EINVAL;
+}
+
 bool kfd_dev_is_large_bar(struct kfd_dev *dev)
 {
struct kfd_local_mem_info mem_info;
@@ -1827,6 +1854,9 @@ static const struct amdkfd_ioctl_desc amdkfd_ioctls[] = {
 
AMDKFD_IOCTL_DEF(AMDKFD_IOC_ALLOC_QUEUE_GWS,
kfd_ioctl_alloc_queue_gws, 0),
+
+   AMDKFD_IOCTL_DEF(AMDKFD_IOC_SMI_EVENTS,
+   kfd_ioctl_smi_events, 0),
 };
 
 #define AMDKFD_CORE_IOCTL_COUNTARRAY_SIZE(amdkfd_ioctls)
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_device.c
index 0491ab2..6ac6f31 100644
--- 

Re: [PATCH] drm/amd/display: re-order asic declarations

2020-04-02 Thread Liu, Zhan
[AMD Official Use Only - Internal Distribution Only]

From: amd-gfx  on behalf of Shirish S 


Sent: Thursday, April 2, 2020 5:15 AM

To: Deucher, Alexander ; Wentland, Harry 
; Li, Sun peng (Leo) 

Cc: amd-gfx@lists.freedesktop.org ; S, Shirish 


Subject: [PATCH] drm/amd/display: re-order asic declarations




"1382d6409891 drm/amd/display: Fix RV2 Variant Detection"

introduces build error of:

"use of undeclared identifier 'RENOIR_A0'"



To fix the same, this patch re-orders the

ASIC declarations accordingly.



Signed-off-by: Shirish S 


Reviewed-by: Zhan Liu 




---

 drivers/gpu/drm/amd/display/include/dal_asic_id.h | 6 --

 1 file changed, 4 insertions(+), 2 deletions(-)



diff --git a/drivers/gpu/drm/amd/display/include/dal_asic_id.h 
b/drivers/gpu/drm/amd/display/include/dal_asic_id.h

index 8a87d0ed90ae..2359e88d6029 100644

--- a/drivers/gpu/drm/amd/display/include/dal_asic_id.h

+++ b/drivers/gpu/drm/amd/display/include/dal_asic_id.h

@@ -136,6 +136,7 @@

 #define RAVEN2_A0 0x81

 #define RAVEN1_F0 0xF0

 #define RAVEN_UNKNOWN 0xFF

+#define RENOIR_A0 0x91

 #ifndef ASICREV_IS_RAVEN

 #define ASICREV_IS_RAVEN(eChipRev) ((eChipRev >= RAVEN_A0) && eChipRev < 
RAVEN_UNKNOWN)

 #endif

@@ -171,8 +172,6 @@ enum {

 #define ASICREV_IS_NAVI10_P(eChipRev)(eChipRev < NV_NAVI12_P_A0)

 #define ASICREV_IS_NAVI12_P(eChipRev)((eChipRev >= NV_NAVI12_P_A0) && 
(eChipRev < NV_NAVI14_M_A0))

 #define ASICREV_IS_NAVI14_M(eChipRev)((eChipRev >= NV_NAVI14_M_A0) && 
(eChipRev < NV_UNKNOWN))

-#define RENOIR_A0 0x91

-#define DEVICE_ID_RENOIR_1636 0x1636   // Renoir

 #define ASICREV_IS_RENOIR(eChipRev) ((eChipRev >= RENOIR_A0) && (eChipRev < 
RAVEN1_F0))



 /*

@@ -183,6 +182,9 @@ enum {

 #define DEVICE_ID_TEMASH_9839 0x9839

 #define DEVICE_ID_TEMASH_983D 0x983D



+/* RENOIR */

+#define DEVICE_ID_RENOIR_1636 0x1636

+

 /* Asic Family IDs for different asic family. */

 #define FAMILY_CI 120 /* Sea Islands: Hawaii (P), Bonaire (M) */

 #define FAMILY_KV 125 /* Fusion => Kaveri: Spectre, Spooky; Kabini: Kalindi */

--

2.17.1



___

amd-gfx mailing list

amd-gfx@lists.freedesktop.org

https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Czhan.liu%40amd.com%7C5d6f323ad05349e58ef808d7d6e66ade%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637214158105886093sdata=tjvt8iuzXrd1rg0y%2FIzVNfSGpAFEJL59R04fcqi3UzE%3Dreserved=0

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset

2020-04-02 Thread Felix Kuehling
[+Monk]

This looks reasonable to me. However, you're effectively reverting this
commit by Monk:

a03eb637d2a5 drm/amdgpu: fix KIQ ring test fail in TDR of SRIOV

In hind-sight, Monk's commit was broken. Removing the call to pre_reset
has other consequences, such as breaking notifications about reset to
user mode, and probably invalidating some assumptions in kfd_post_reset.
Can you coordinate with Monk to work out why his change was needed, and
whether you'll need a different solution for the problem he was trying
to address?

In the meanwhile, this patch is

Acked-by: Felix Kuehling 


Am 2020-04-02 um 3:20 a.m. schrieb Jack Zhang:

> kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate
>
> Without this change, sriov tdr code path will never free those allocated
> memories and get memory leak.
>
> Signed-off-by: Jack Zhang 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> index 8faaa17..832daf7 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
> @@ -3847,6 +3847,8 @@ static int amdgpu_device_reset_sriov(struct 
> amdgpu_device *adev,
>  {
>   int r;
>  
> + amdgpu_amdkfd_pre_reset(adev);
> +
>   if (from_hypervisor)
>   r = amdgpu_virt_request_full_gpu(adev, true);
>   else
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection

2020-04-02 Thread Jann Horn
On Thu, Apr 2, 2020 at 11:36 AM Thomas Gleixner  wrote:
> Jann Horn  writes:
> > On Thu, Apr 2, 2020 at 9:34 AM Christian König  
> > wrote:
> >> Am 02.04.20 um 04:34 schrieb Jann Horn:
> >> > [x86 folks in CC so that they can chime in on the precise rules for
> >> > this stuff]
>
> They are pretty simple.
>
> Any code using FPU needs to be completely isolated from regular code
> either by using inline asm or by moving it to a different compilation
> unit. The invocations need fpu_begin/end() of course.
[...]
> We really need objtool support to validate that.
>
> Peter, now that we know how to do it (noinstr, clac/stac) we can emit
> annotations (see patch below) and validate that any FPU instruction is
> inside a safe region. Hmm?

One annoying aspect is that for the "move it to a different
compilation unit" method, objtool needs to know at compile time
(before linking) which functions are in FPU-enabled object files,
right? So we'd need to have some sort of function annotation that gets
plumbed from the function declaration in a header file through the
compiler into the ELF file, and then let objtool verify that calls to
FPU-enabled methods occur only when the FPU is available? (Ideally
something that covers indirect calls... but this would probably get
really complicated unless we can get the compiler to include that
annotation in its type checking.)
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection

2020-04-02 Thread Peter Zijlstra
On Thu, Apr 02, 2020 at 09:33:54AM +0200, Christian König wrote:
> Hi Jann,
> 
> Am 02.04.20 um 04:34 schrieb Jann Horn:
> > [x86 folks in CC so that they can chime in on the precise rules for this 
> > stuff]
> > 
> > Hi!
> > 
> > I noticed that several makefiles under drivers/gpu/drm/amd/display/dc/
> > turn on floating-point instructions in the compiler flags
> > (-mhard-float, -msse and -msse2) in order to make the "float" and
> > "double" types usable from C code without requiring helper functions.
> > 
> > However, as far as I know, code running in normal kernel context isn't
> > allowed to use floating-point registers without special protection
> > using helpers like kernel_fpu_begin() and kernel_fpu_end() (which also
> > require that the protected code never blocks). If you violate that
> > rule, that can lead to various issues - among other things, I think
> > the kernel will clobber userspace FPU register state, and I think the
> > kernel code can blow up if a context switch happens at the wrong time,
> > since in-kernel task switches don't preserve FPU state.
> > 
> > Is there some hidden trick I'm missing that makes it okay to use FPU
> > registers here?
> > 
> > I would try testing this, but unfortunately none of the AMD devices I
> > have here have the appropriate graphics hardware...
> 
> yes, using the floating point calculations in the display code has been a
> source of numerous problems and confusion in the past.
> 
> The calls to kernel_fpu_begin() and kernel_fpu_end() are hidden behind the
> DC_FP_START() and DC_FP_END() macros which are supposed to hide the
> architecture depend handling for x86 and PPC64.
> 
> This originated from the graphics block integrated into AMD CPU (where we
> knew which fp unit we had), but as far as I know is now also used for
> dedicated AMD GPUs as well.
> 
> I'm not really a fan of this either, but so far we weren't able to convince
> the hardware engineers to not use floating point calculations for the
> display stuff.

Might I complain that:

make O=allmodconfig-build drivers/gpu/drm/amd/display/dc/

does not in fact work?

Anyway, how do people feel about something like the below?

Masami, Boris, is there any semi-sane way we can have insn_is_fpu() ?
While digging through various opcode manuals is of course forever fun, I
do feel like it might not be the best way.

---
 arch/x86/include/asm/fpu/api.h  |  7 
 arch/x86/include/asm/fpu/internal.h | 11 ++
 arch/x86/kernel/fpu/init.c  |  5 +++
 tools/objtool/arch.h|  1 +
 tools/objtool/arch/x86/decode.c | 71 ++---
 tools/objtool/check.c   | 58 ++
 tools/objtool/check.h   |  3 +-
 7 files changed, 134 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h
index b774c52e5411..b9bca1cdc875 100644
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -12,6 +12,13 @@
 #define _ASM_X86_FPU_API_H
 #include 
 
+#define annotate_fpu() ({  \
+   asm volatile("%c0:\n\t" \
+".pushsection .discard.fpu_safe\n\t"   \
+".long %c0b - .\n\t"   \
+".popsection\n\t" : : "i" (__COUNTER__));  \
+})
+
 /*
  * Use kernel_fpu_begin/end() if you intend to use FPU in kernel context. It
  * disables preemption so be careful if you intend to use it for long periods
diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 44c48e34d799..bc436213a0d4 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -102,6 +102,11 @@ static inline void fpstate_init_fxstate(struct 
fxregs_state *fx)
 }
 extern void fpstate_sanitize_xstate(struct fpu *fpu);
 
+#define _ASM_ANNOTATE_FPU(at)  \
+".pushsection .discard.fpu_safe\n" \
+".long " #at " - .\n"  \
+".popsection\n"\
+
 #define user_insn(insn, output, input...)  \
 ({ \
int err;\
@@ -116,6 +121,7 @@ extern void fpstate_sanitize_xstate(struct fpu *fpu);
 "jmp  2b\n"\
 ".previous\n"  \
 _ASM_EXTABLE(1b, 3b)   \
+_ASM_ANNOTATE_FPU(1b)  \
 : [err] "=r" (err), output \
 : "0"(0), input);  \
  

Re: Possibility of RX570 responsible for spontaneous reboots (MCE) with Ryzen 3700x?

2020-04-02 Thread Clemens Eisserer
Hi Someguy,

Your findings sound very familiar, my machine is also rock-solid
running Windows-10 - most of the MCEs happened for me with low-load
situations but firefox playing youtube in background.
First I didn't care that much - but now having experienced corrupted
firefox profiles and lost spreadsheet work it starts to get annoying.

I've filed a kernel bug regarding this issue:
https://bugzilla.kernel.org/show_bug.cgi?id=206903
I would appreciate if you could report your findings there to give the
issue more data / weight.

Thanks, Clemens

Am Do., 2. Apr. 2020 um 15:11 Uhr schrieb someguy108 :
>
> Hello! I saw Clemens Eisserer email regarding MCE errors with his RX 570 and 
> 3700x, and I like to add to that list of MCE spontaneous reboots as well.
> This is my configuration:
> -Ryzen 3900x + Noctua D15
> -MSI X570 Unify (latest agesa as of writing)
> -DDR4 3200mhz 32GB kit
> -Sapphire Pulse 5700 XT
> -Corsair RMX 850 Watt
> -Arch Linux with kernel 5.5.13
> -Mesa 20.0.3
> -Early KMS enabled
>
> I've had this system up and running since November 2019 but initially with a 
> Nvidia 1060 and Windows 10. Everything was running smoothly. About a month 
> ago I switched back over to Linux after purchasing my 5700 XT as my initial 
> plan was to go back to Linux. Since returning I've experienced multiple 
> spontaneous MCE reboots. All happened while I was playing one particular 
> game, Warcraft 3 Reforged. The MCE event is the following:
>
> kernel: mce: [Hardware Error]: Machine check events logged
> kernel: mce: [Hardware Error]: CPU 1: Machine Check: 0 Bank 5: 
> bea00108
> kernel: mce: [Hardware Error]: TSC 0 ADDR 1ad66d6fe MISC d0120001 
> SYND 4d00 IPID 500b0
> kernel: mce: [Hardware Error]: PROCESSOR 2:870f10 TIME 1585120217 SOCKET 0 
> APIC 2 microcode 8701013
> kernel: #2 #3 #4 #5 #6 #7 #8 #9 #10 #11 #12 #13 #14 #15
> kernel: mce: [Hardware Error]: Machine check events logged
> kernel: mce: [Hardware Error]: CPU 15: Machine Check: 0 Bank 5: 
> bea00108
> kernel: mce: [Hardware Error]: TSC 0 ADDR 1c1196eb6 MISC d0120001 
> SYND 4d00 IPID 500b0
> kernel: mce: [Hardware Error]: PROCESSOR 2:870f10 TIME 1585120217 SOCKET 0 
> APIC 9 microcode 8701013
> kernel: #16 #17 #18 #19 #20 #21 #22 #23
>
> Initially I figured it could be ram so I performed the usual test with no 
> problems. Also tested with standard JEDEC as well and eventually received a 
> MCE during Warcraft 3 reforged. After consulting with a few friends I decided 
> to try a different power supply to no avail. I then bit the bullet and bought 
> a brand new 3900x. I also cleared CMOS before getting my new 3900x and after. 
> All cpu values are on auto with no PBO or manual overclocking. The only fancy 
> is the ram. Yesterday, after owning the new 3900x for three days, I had a MCE 
> while I was playing Warcraft 3 Reforged. I have tested other games but none 
> of them caused a MCE or any crashes / freezes for that matter. World of 
> Warcraft, The Outer Worlds, Stellaris, and Counter-Strike: Global Offensive.
>
> As the same with Clemens, using the same decoder he used, MCE-Ryzen-Decoder, 
> from github, it reports the MCE to be the following:
> Bank: Execution Unit (EX)
> Error: Watchdog Timeout error (WDT 0x0)
>
> One thing to note is I haven't received it during desktop usage. Only in 
> Warcraft 3. I do have desktop compositing in both Xfce and KDE disabled and 
> always have. Both of which used, tested, and received the MCE's during those 
> sessions. I have noticed a pattern with the MCE crashes with Warcraft 3. They 
> always happen during a GPU load drop off or increase transition. By that I 
> mean when exiting a match to return to the lobby, or loading a map and when 
> it switches from the loading screen to the match itself is when these MCE's 
> happen. The entire screen quickly turns black, everything is hard locked, and 
> then after about a minute or so the machine reboots on its own. It hasn't 
> happened yet while in a middle of a match session, sitting in the lobby or at 
> the main menu screen. Its consistently been during a transition. My theory is 
> that this could possibly be a GPU hang from switching from one power state to 
> another power state. With the GPU hanging, causes the CPU to stall, and thus 
> a 
 MCE. The GPU hanging could explain the quick solid black screen as well as all 
output is stopped. But I'm really just assuming here form my own observations 
from my limited understanding. Possible reason why this triggers in Warcraft is 
because the other games have few moments of switching power states heavily. The 
Outer Worlds, World of Warcraft, Stellaris, and Counter-Strike Global Offensive 
all keep a constant high load on the GPU and the match sessions are long.
>
> From what its worth, I've had no major issues in Windows 10. The only quirks 
> where initially a few TDR's that recovered from alt tabing out of most games 
> with Google 

Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection

2020-04-02 Thread Jann Horn
On Thu, Apr 2, 2020 at 9:34 AM Christian König  wrote:
> Am 02.04.20 um 04:34 schrieb Jann Horn:
> > [x86 folks in CC so that they can chime in on the precise rules for this 
> > stuff]
> > I noticed that several makefiles under drivers/gpu/drm/amd/display/dc/
> > turn on floating-point instructions in the compiler flags
> > (-mhard-float, -msse and -msse2) in order to make the "float" and
> > "double" types usable from C code without requiring helper functions.
> >
> > However, as far as I know, code running in normal kernel context isn't
> > allowed to use floating-point registers without special protection
> > using helpers like kernel_fpu_begin() and kernel_fpu_end() (which also
> > require that the protected code never blocks). If you violate that
> > rule, that can lead to various issues - among other things, I think
> > the kernel will clobber userspace FPU register state, and I think the
> > kernel code can blow up if a context switch happens at the wrong time,
> > since in-kernel task switches don't preserve FPU state.
> >
> > Is there some hidden trick I'm missing that makes it okay to use FPU
> > registers here?
> >
> > I would try testing this, but unfortunately none of the AMD devices I
> > have here have the appropriate graphics hardware...
>
> yes, using the floating point calculations in the display code has been
> a source of numerous problems and confusion in the past.
>
> The calls to kernel_fpu_begin() and kernel_fpu_end() are hidden behind
> the DC_FP_START() and DC_FP_END() macros which are supposed to hide the
> architecture depend handling for x86 and PPC64.

Hmm... but as far as I can tell, you're using those macros from inside
functions that are already compiled with the FPU on:

 - drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c uses the macros,
but is already compiled with calcs_ccflags
 - drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c uses the
macros, but is already compiled with "-mhard-float -msse -msse2"
 - drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c uses the
macros, but is already compiled with "-mhard-float -msse -msse2"

AFAIK as soon as you enter any function in any file compiled with FPU
instructions, you may encounter SSE instructions, e.g. via things like
compiler-generated memory-zeroing code - not just when you're actually
using doubles or floats.
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: Possibility of RX570 responsible for spontaneous reboots (MCE) with Ryzen 3700x?

2020-04-02 Thread someguy108
Hello! I saw Clemens Eisserer email regarding MCE errors with his RX 570
and 3700x, and I like to add to that list of MCE spontaneous reboots as
well.
This is my configuration:
-Ryzen 3900x + Noctua D15
-MSI X570 Unify (latest agesa as of writing)
-DDR4 3200mhz 32GB kit
-Sapphire Pulse 5700 XT
-Corsair RMX 850 Watt
-Arch Linux with kernel 5.5.13
-Mesa 20.0.3
-Early KMS enabled

I've had this system up and running since November 2019 but initially with
a Nvidia 1060 and Windows 10. Everything was running smoothly. About a
month ago I switched back over to Linux after purchasing my 5700 XT as my
initial plan was to go back to Linux. Since returning I've experienced
multiple spontaneous MCE reboots. All happened while I was playing one
particular game, Warcraft 3 Reforged. The MCE event is the following:

kernel: mce: [Hardware Error]: Machine check events logged
kernel: mce: [Hardware Error]: CPU 1: Machine Check: 0 Bank 5:
bea00108
kernel: mce: [Hardware Error]: TSC 0 ADDR 1ad66d6fe MISC
d0120001 SYND 4d00 IPID 500b0
kernel: mce: [Hardware Error]: PROCESSOR 2:870f10 TIME 1585120217 SOCKET 0
APIC 2 microcode 8701013
kernel: #2 #3 #4 #5 #6 #7 #8 #9 #10 #11 #12 #13 #14 #15
kernel: mce: [Hardware Error]: Machine check events logged
kernel: mce: [Hardware Error]: CPU 15: Machine Check: 0 Bank 5:
bea00108
kernel: mce: [Hardware Error]: TSC 0 ADDR 1c1196eb6 MISC
d0120001 SYND 4d00 IPID 500b0
kernel: mce: [Hardware Error]: PROCESSOR 2:870f10 TIME 1585120217 SOCKET 0
APIC 9 microcode 8701013
kernel: #16 #17 #18 #19 #20 #21 #22 #23

Initially I figured it could be ram so I performed the usual test with no
problems. Also tested with standard JEDEC as well and eventually received a
MCE during Warcraft 3 reforged. After consulting with a few friends I
decided to try a different power supply to no avail. I then bit the bullet
and bought a brand new 3900x. I also cleared CMOS before getting my new
3900x and after. All cpu values are on auto with no PBO or manual
overclocking. The only fancy is the ram. Yesterday, after owning the new
3900x for three days, I had a MCE while I was playing Warcraft 3 Reforged.
I have tested other games but none of them caused a MCE or any crashes /
freezes for that matter. World of Warcraft, The Outer Worlds, Stellaris,
and Counter-Strike: Global Offensive.

As the same with Clemens, using the same decoder he
used, MCE-Ryzen-Decoder, from github, it reports the MCE to be the
following:
Bank: Execution Unit (EX)
Error: Watchdog Timeout error (WDT 0x0)

One thing to note is I haven't received it during desktop usage. Only in
Warcraft 3. I do have desktop compositing in both Xfce and KDE disabled and
always have. Both of which used, tested, and received the MCE's during
those sessions. I have noticed a pattern with the MCE crashes with Warcraft
3. They always happen during a GPU load drop off or increase transition. By
that I mean when exiting a match to return to the lobby, or loading a map
and when it switches from the loading screen to the match itself is when
these MCE's happen. The entire screen quickly turns black, everything is
hard locked, and then after about a minute or so the machine reboots on its
own. It hasn't happened yet while in a middle of a match session, sitting
in the lobby or at the main menu screen. Its consistently been during a
transition. My theory is that this could possibly be a GPU hang from
switching from one power state to another power state. With the GPU
hanging, causes the CPU to stall, and thus a MCE. The GPU hanging could
explain the quick solid black screen as well as all output is stopped. But
I'm really just assuming here form my own observations from my limited
understanding. Possible reason why this triggers in Warcraft is because the
other games have few moments of switching power states heavily. The Outer
Worlds, World of Warcraft, Stellaris, and Counter-Strike Global Offensive
all keep a constant high load on the GPU and the match sessions are long.

>From what its worth, I've had no major issues in Windows 10. The only
quirks where initially a few TDR's that recovered from alt tabing out of
most games with Google Chrome running. Disabling hardware acceleration in
Chrome fixed those TDR's while alt-tabing out of games.

>From searching, the way I found this mailing list report, I've found quite
a few reports of people talking about receiving MCE's that isn't the
typical first generation MCE's reports from 2017 involving Ryzen.Where
those where fixed by disabling c-states, ram, and changing power supply
current from low to typical. These ones within the past year appear to all
have a AMD GPU in common. I did notice a few with Intel CPU's as well
paired up with a AMD GPU.

Any feedback would be greatly appreciated.
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection

2020-04-02 Thread Thomas Gleixner
Jann Horn  writes:
> On Thu, Apr 2, 2020 at 9:34 AM Christian König  
> wrote:
>> Am 02.04.20 um 04:34 schrieb Jann Horn:
>> > [x86 folks in CC so that they can chime in on the precise rules for
>> > this stuff]

They are pretty simple.

Any code using FPU needs to be completely isolated from regular code
either by using inline asm or by moving it to a different compilation
unit. The invocations need fpu_begin/end() of course.

>> > I noticed that several makefiles under drivers/gpu/drm/amd/display/dc/
>> > turn on floating-point instructions in the compiler flags
>> > (-mhard-float, -msse and -msse2) in order to make the "float" and
>> > "double" types usable from C code without requiring helper functions.
>> >
>> > However, as far as I know, code running in normal kernel context isn't
>> > allowed to use floating-point registers without special protection
>> > using helpers like kernel_fpu_begin() and kernel_fpu_end() (which also
>> > require that the protected code never blocks). If you violate that
>> > rule, that can lead to various issues - among other things, I think
>> > the kernel will clobber userspace FPU register state, and I think the
>> > kernel code can blow up if a context switch happens at the wrong time,
>> > since in-kernel task switches don't preserve FPU state.
>> >
>> > Is there some hidden trick I'm missing that makes it okay to use FPU
>> > registers here?
>> >
>> > I would try testing this, but unfortunately none of the AMD devices I
>> > have here have the appropriate graphics hardware...
>>
>> yes, using the floating point calculations in the display code has been
>> a source of numerous problems and confusion in the past.
>>
>> The calls to kernel_fpu_begin() and kernel_fpu_end() are hidden behind
>> the DC_FP_START() and DC_FP_END() macros which are supposed to hide the
>> architecture depend handling for x86 and PPC64.
>
> Hmm... but as far as I can tell, you're using those macros from inside
> functions that are already compiled with the FPU on:
>
>  - drivers/gpu/drm/amd/display/dc/calcs/dcn_calcs.c uses the macros,
> but is already compiled with calcs_ccflags
>  - drivers/gpu/drm/amd/display/dc/dcn20/dcn20_resource.c uses the
> macros, but is already compiled with "-mhard-float -msse -msse2"
>  - drivers/gpu/drm/amd/display/dc/dcn21/dcn21_resource.c uses the
> macros, but is already compiled with "-mhard-float -msse -msse2"
>
> AFAIK as soon as you enter any function in any file compiled with FPU
> instructions, you may encounter SSE instructions, e.g. via things like
> compiler-generated memory-zeroing code - not just when you're actually
> using doubles or floats.

That's correct and this will silently cause FPU state corruption ...

We really need objtool support to validate that.

Peter, now that we know how to do it (noinstr, clac/stac) we can emit
annotations (see patch below) and validate that any FPU instruction is
inside a safe region. Hmm?

Thanks,

tglx

8<---
--- a/arch/x86/include/asm/fpu/api.h
+++ b/arch/x86/include/asm/fpu/api.h
@@ -19,8 +19,27 @@
  * If you intend to use the FPU in softirq you need to check first with
  * irq_fpu_usable() if it is possible.
  */
-extern void kernel_fpu_begin(void);
-extern void kernel_fpu_end(void);
+extern void __kernel_fpu_begin(void);
+extern void __kernel_fpu_end(void);
+
+static inline void kernel_fpu_begin(void)
+{
+   asm volatile("%c0:\n\t"
+".pushsection .discard.fpu_begin\n\t"
+".long %c0b - .\n\t"
+".popsection\n\t" : : "i" (__COUNTER__));
+   __kernel_fpu_begin();
+}
+
+static inline void kernel_fpu_end(void)
+{
+   __kernel_fpu_end();
+   asm volatile("%c0:\n\t"
+".pushsection .discard.fpu_end\n\t"
+".long %c0b - .\n\t"
+".popsection\n\t" : : "i" (__COUNTER__));
+}
+
 extern bool irq_fpu_usable(void);
 extern void fpregs_mark_activate(void);
 
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -82,7 +82,7 @@ bool irq_fpu_usable(void)
 }
 EXPORT_SYMBOL(irq_fpu_usable);
 
-void kernel_fpu_begin(void)
+void __kernel_fpu_begin(void)
 {
preempt_disable();
 
@@ -102,16 +102,16 @@ void kernel_fpu_begin(void)
}
__cpu_invalidate_fpregs_state();
 }
-EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+EXPORT_SYMBOL_GPL(__kernel_fpu_begin);
 
-void kernel_fpu_end(void)
+void __kernel_fpu_end(void)
 {
WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));
 
this_cpu_write(in_kernel_fpu, false);
preempt_enable();
 }
-EXPORT_SYMBOL_GPL(kernel_fpu_end);
+EXPORT_SYMBOL_GPL(__kernel_fpu_end);
 
 /*
  * Save the FPU state (mark it for reload if necessary):



___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amd/powerplay: avoid using pm_en before it is initialized

2020-04-02 Thread Tiecheng Zhou
hwmgr->pm_en is initialized at hwmgr_hw_init.
during amdgpu_device_init, there is amdgpu_asic_reset that calls to
pp_get_asic_baco_capability, while hwmgr->pm_en has not yet been initialized.

so avoid using pm_en in pp_get_asic_baco_capability.

Signed-off-by: Tiecheng Zhou 
Signed-off-by: Yintian Tao 
---
 drivers/gpu/drm/amd/powerplay/amd_powerplay.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c 
b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
index 71b843f542d8..fdff3e1c5e95 100644
--- a/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
+++ b/drivers/gpu/drm/amd/powerplay/amd_powerplay.c
@@ -1455,7 +1455,8 @@ static int pp_get_asic_baco_state(void *handle, int 
*state)
if (!hwmgr)
return -EINVAL;
 
-   if (!hwmgr->pm_en || !hwmgr->hwmgr_func->get_asic_baco_state)
+   if (!(hwmgr->not_vf && amdgpu_dpm) ||
+   !hwmgr->hwmgr_func->get_asic_baco_state)
return 0;
 
mutex_lock(>smu_lock);
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amd/display: re-order asic declarations

2020-04-02 Thread Shirish S
"1382d6409891 drm/amd/display: Fix RV2 Variant Detection"
introduces build error of:
"use of undeclared identifier 'RENOIR_A0'"

To fix the same, this patch re-orders the
ASIC declarations accordingly.

Signed-off-by: Shirish S 
---
 drivers/gpu/drm/amd/display/include/dal_asic_id.h | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/include/dal_asic_id.h 
b/drivers/gpu/drm/amd/display/include/dal_asic_id.h
index 8a87d0ed90ae..2359e88d6029 100644
--- a/drivers/gpu/drm/amd/display/include/dal_asic_id.h
+++ b/drivers/gpu/drm/amd/display/include/dal_asic_id.h
@@ -136,6 +136,7 @@
 #define RAVEN2_A0 0x81
 #define RAVEN1_F0 0xF0
 #define RAVEN_UNKNOWN 0xFF
+#define RENOIR_A0 0x91
 #ifndef ASICREV_IS_RAVEN
 #define ASICREV_IS_RAVEN(eChipRev) ((eChipRev >= RAVEN_A0) && eChipRev < 
RAVEN_UNKNOWN)
 #endif
@@ -171,8 +172,6 @@ enum {
 #define ASICREV_IS_NAVI10_P(eChipRev)(eChipRev < NV_NAVI12_P_A0)
 #define ASICREV_IS_NAVI12_P(eChipRev)((eChipRev >= NV_NAVI12_P_A0) && 
(eChipRev < NV_NAVI14_M_A0))
 #define ASICREV_IS_NAVI14_M(eChipRev)((eChipRev >= NV_NAVI14_M_A0) && 
(eChipRev < NV_UNKNOWN))
-#define RENOIR_A0 0x91
-#define DEVICE_ID_RENOIR_1636 0x1636   // Renoir
 #define ASICREV_IS_RENOIR(eChipRev) ((eChipRev >= RENOIR_A0) && (eChipRev < 
RAVEN1_F0))
 
 /*
@@ -183,6 +182,9 @@ enum {
 #define DEVICE_ID_TEMASH_9839 0x9839
 #define DEVICE_ID_TEMASH_983D 0x983D
 
+/* RENOIR */
+#define DEVICE_ID_RENOIR_1636 0x1636
+
 /* Asic Family IDs for different asic family. */
 #define FAMILY_CI 120 /* Sea Islands: Hawaii (P), Bonaire (M) */
 #define FAMILY_KV 125 /* Fusion => Kaveri: Spectre, Spooky; Kabini: Kalindi */
-- 
2.17.1

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: AMD DC graphics display code enables -mhard-float, -msse, -msse2 without any visible FPU state protection

2020-04-02 Thread Christian König

Hi Jann,

Am 02.04.20 um 04:34 schrieb Jann Horn:

[x86 folks in CC so that they can chime in on the precise rules for this stuff]

Hi!

I noticed that several makefiles under drivers/gpu/drm/amd/display/dc/
turn on floating-point instructions in the compiler flags
(-mhard-float, -msse and -msse2) in order to make the "float" and
"double" types usable from C code without requiring helper functions.

However, as far as I know, code running in normal kernel context isn't
allowed to use floating-point registers without special protection
using helpers like kernel_fpu_begin() and kernel_fpu_end() (which also
require that the protected code never blocks). If you violate that
rule, that can lead to various issues - among other things, I think
the kernel will clobber userspace FPU register state, and I think the
kernel code can blow up if a context switch happens at the wrong time,
since in-kernel task switches don't preserve FPU state.

Is there some hidden trick I'm missing that makes it okay to use FPU
registers here?

I would try testing this, but unfortunately none of the AMD devices I
have here have the appropriate graphics hardware...


yes, using the floating point calculations in the display code has been 
a source of numerous problems and confusion in the past.


The calls to kernel_fpu_begin() and kernel_fpu_end() are hidden behind 
the DC_FP_START() and DC_FP_END() macros which are supposed to hide the 
architecture depend handling for x86 and PPC64.


This originated from the graphics block integrated into AMD CPU (where 
we knew which fp unit we had), but as far as I know is now also used for 
dedicated AMD GPUs as well.


I'm not really a fan of this either, but so far we weren't able to 
convince the hardware engineers to not use floating point calculations 
for the display stuff.


Regards,
Christian.
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset

2020-04-02 Thread Zhang, Jack (Jian)


-Original Message-
From: Jack Zhang  
Sent: Thursday, April 2, 2020 3:20 PM
To: amd-gfx@lists.freedesktop.org
Cc: Zhang, Jack (Jian) 
Subject: [PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset

kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate

Without this change, sriov tdr code path will never free those allocated 
memories and get memory leak.

Signed-off-by: Jack Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8faaa17..832daf7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3847,6 +3847,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,  {
int r;
 
+   amdgpu_amdkfd_pre_reset(adev);
+
if (from_hypervisor)
r = amdgpu_virt_request_full_gpu(adev, true);
else
--
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


[PATCH] drm/amdgpu/sriov add amdgpu_amdkfd_pre_reset in gpu reset

2020-04-02 Thread Jack Zhang
kfd_pre_reset will free mem_objs allocated by kfd_gtt_sa_allocate

Without this change, sriov tdr code path will never free those allocated
memories and get memory leak.

Signed-off-by: Jack Zhang 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 8faaa17..832daf7 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -3847,6 +3847,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device 
*adev,
 {
int r;
 
+   amdgpu_amdkfd_pre_reset(adev);
+
if (from_hypervisor)
r = amdgpu_virt_request_full_gpu(adev, true);
else
-- 
2.7.4

___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


RE: [PATCH] drm/amdgpu/powerplay: using the FCLK DPM table to set the MCLK

2020-04-02 Thread Dai, Yuxian (David)
[AMD Official Use Only - Internal Distribution Only]

On Wed, Apr 01, 2020 at 07:41:12PM +0800, Yuxian Dai wrote:
> 1.Using the FCLK DPM table to set the MCLK for DPM states consist of 
> three entities:
>  FCLK
>  UCLK
>  MEMCLK
> All these three clk change together, MEMCLK from FCLK, so use the fclk 
> frequency.
> 2.we should show the current working clock freqency from clock table 
> metric
> 
> Signed-off-by: Yuxian Dai 
> Reviewed-by: Alex Deucher 
> Reviewed-by: Huang Rui 
> Reviewed-by: Kevin Wang 
> ---

Next time, if you submit the V2 patch, you can generate it as below
command:

git format-patch --subject-prefix="PATCH v2"

And describe the changes from v1 -> v2 in the commit message. This will help 
everyone to understand your change.
>  I got it. 
Reviewed-by: Huang Rui 

>  drivers/gpu/drm/amd/powerplay/renoir_ppt.c | 6 ++  
> drivers/gpu/drm/amd/powerplay/renoir_ppt.h | 2 +-
>  2 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
> index 7bf52ecba01d..c6b39a7026a8 100644
> --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
> @@ -239,6 +239,7 @@ static int renoir_print_clk_levels(struct smu_context 
> *smu,
>   uint32_t cur_value = 0, value = 0, count = 0, min = 0, max = 0;
>   DpmClocks_t *clk_table = smu->smu_table.clocks_table;
>   SmuMetrics_t metrics;
> + bool cur_value_match_level = false;
>  
>   if (!clk_table || clk_type >= SMU_CLK_COUNT)
>   return -EINVAL;
> @@ -297,8 +298,13 @@ static int renoir_print_clk_levels(struct smu_context 
> *smu,
>   GET_DPM_CUR_FREQ(clk_table, clk_type, i, value);
>   size += sprintf(buf + size, "%d: %uMhz %s\n", i, value,
>   cur_value == value ? "*" : "");
> + if (cur_value == value)
> + cur_value_match_level = true;
>   }
>  
> + if (!cur_value_match_level)
> + size += sprintf(buf + size, "   %uMhz *\n", cur_value);
> +
>   return size;
>  }
>  
> diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h 
> b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
> index 2a390ddd37dd..89cd6da118a3 100644
> --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
> +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
> @@ -37,7 +37,7 @@ extern void renoir_set_ppt_funcs(struct smu_context *smu);
>   freq = table->SocClocks[dpm_level].Freq;\
>   break;  \
>   case SMU_MCLK:  \
> - freq = table->MemClocks[dpm_level].Freq;\
> + freq = table->FClocks[dpm_level].Freq;  \
>   break;  \
>   case SMU_DCEFCLK:   \
>   freq = table->DcfClocks[dpm_level].Freq;\
> --
> 2.17.1
> 
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flist
> s.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cra
> y.huang%40amd.com%7Cfa81baf94d2c409e4f0308d7d6319a8f%7C3dd8961fe4884e6
> 08e11a82d994e183d%7C0%7C0%7C637213381443101089sdata=LF2pLZj%2Fq0C
> wiMSfvDiofXFKuGVfgje7o4Iti%2FtoNj0%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx


Re: [PATCH] drm/amdgpu/powerplay: using the FCLK DPM table to set the MCLK

2020-04-02 Thread Huang Rui
On Wed, Apr 01, 2020 at 07:41:12PM +0800, Yuxian Dai wrote:
> 1.Using the FCLK DPM table to set the MCLK for DPM states consist of
> three entities:
>  FCLK
>  UCLK
>  MEMCLK
> All these three clk change together, MEMCLK from FCLK, so use the fclk
> frequency.
> 2.we should show the current working clock freqency from clock table metric
> 
> Signed-off-by: Yuxian Dai 
> Reviewed-by: Alex Deucher 
> Reviewed-by: Huang Rui 
> Reviewed-by: Kevin Wang 
> ---

Next time, if you submit the V2 patch, you can generate it as below
command:

git format-patch --subject-prefix="PATCH v2"

And describe the changes from v1 -> v2 in the commit message. This will
help everyone to understand your change.

Reviewed-by: Huang Rui 

>  drivers/gpu/drm/amd/powerplay/renoir_ppt.c | 6 ++
>  drivers/gpu/drm/amd/powerplay/renoir_ppt.h | 2 +-
>  2 files changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c 
> b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
> index 7bf52ecba01d..c6b39a7026a8 100644
> --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
> +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.c
> @@ -239,6 +239,7 @@ static int renoir_print_clk_levels(struct smu_context 
> *smu,
>   uint32_t cur_value = 0, value = 0, count = 0, min = 0, max = 0;
>   DpmClocks_t *clk_table = smu->smu_table.clocks_table;
>   SmuMetrics_t metrics;
> + bool cur_value_match_level = false;
>  
>   if (!clk_table || clk_type >= SMU_CLK_COUNT)
>   return -EINVAL;
> @@ -297,8 +298,13 @@ static int renoir_print_clk_levels(struct smu_context 
> *smu,
>   GET_DPM_CUR_FREQ(clk_table, clk_type, i, value);
>   size += sprintf(buf + size, "%d: %uMhz %s\n", i, value,
>   cur_value == value ? "*" : "");
> + if (cur_value == value)
> + cur_value_match_level = true;
>   }
>  
> + if (!cur_value_match_level)
> + size += sprintf(buf + size, "   %uMhz *\n", cur_value);
> +
>   return size;
>  }
>  
> diff --git a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h 
> b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
> index 2a390ddd37dd..89cd6da118a3 100644
> --- a/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
> +++ b/drivers/gpu/drm/amd/powerplay/renoir_ppt.h
> @@ -37,7 +37,7 @@ extern void renoir_set_ppt_funcs(struct smu_context *smu);
>   freq = table->SocClocks[dpm_level].Freq;\
>   break;  \
>   case SMU_MCLK:  \
> - freq = table->MemClocks[dpm_level].Freq;\
> + freq = table->FClocks[dpm_level].Freq;  \
>   break;  \
>   case SMU_DCEFCLK:   \
>   freq = table->DcfClocks[dpm_level].Freq;\
> -- 
> 2.17.1
> 
> ___
> amd-gfx mailing list
> amd-gfx@lists.freedesktop.org
> https://nam11.safelinks.protection.outlook.com/?url=https%3A%2F%2Flists.freedesktop.org%2Fmailman%2Flistinfo%2Famd-gfxdata=02%7C01%7Cray.huang%40amd.com%7Cfa81baf94d2c409e4f0308d7d6319a8f%7C3dd8961fe4884e608e11a82d994e183d%7C0%7C0%7C637213381443101089sdata=LF2pLZj%2Fq0CwiMSfvDiofXFKuGVfgje7o4Iti%2FtoNj0%3Dreserved=0
___
amd-gfx mailing list
amd-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx