[linux-next:master] BUILD REGRESSION bc63de6e6ba0b16652c5fb4b9c9916b9e7ca1f23

2023-12-08 Thread kernel test robot
tree/branch: 
https://git.kernel.org/pub/scm/linux/kernel/git/next/linux-next.git master
branch HEAD: bc63de6e6ba0b16652c5fb4b9c9916b9e7ca1f23  Add linux-next specific 
files for 20231208

Error/Warning reports:

https://lore.kernel.org/oe-kbuild-all/202312081716.luuhsns4-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202312081904.nkusjjo0-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202312081955.g3stkpfj-...@intel.com
https://lore.kernel.org/oe-kbuild-all/202312090117.dlmjtqst-...@intel.com

Error/Warning: (recently discovered and may have been fixed)

WARNING: modpost: vmlinux: section mismatch in reference: 
at91_poweroff_probe+0x8c (section: .text) -> at91_wakeup_status (section: 
.init.text)
WARNING: modpost: vmlinux: section mismatch in reference: at91_shdwc_probe+0xd8 
(section: .text) -> at91_wakeup_status (section: .init.text)
arch/mips/mm/cache.c:209:(.text+0x690): undefined reference to `r3k_cache_init'
arch/powerpc/platforms/44x/warp.c:109:15: error: variable 'warp_gpio_leds' has 
initializer but incomplete type
arch/powerpc/platforms/44x/warp.c:109:31: error: storage size of 
'warp_gpio_leds' isn't known
arch/powerpc/platforms/44x/warp.c:110:10: error: 'struct platform_device' has 
no member named 'name'
arch/powerpc/platforms/44x/warp.c:110:19: warning: excess elements in struct 
initializer
arch/powerpc/platforms/44x/warp.c:111:10: error: 'struct platform_device' has 
no member named 'id'
arch/powerpc/platforms/44x/warp.c:112:10: error: 'struct platform_device' has 
no member named 'dev'
arch/powerpc/platforms/44x/warp.c:112:19: error: extra brace group at end of 
initializer
arch/powerpc/platforms/44x/warp.c:197:25: error: implicit declaration of 
function 'platform_device_register'; did you mean 'of_device_register'? 
[-Werror=implicit-function-declaration]
drivers/usb/host/uhci-grlib.c:152:31: error: implicit declaration of function 
'platform_get_drvdata'; did you mean 'pci_get_drvdata'? 
[-Werror=implicit-function-declaration]
drivers/usb/host/uhci-grlib.c:152:31: warning: initialization of 'struct 
usb_hcd *' from 'int' makes pointer from integer without a cast 
[-Wint-conversion]
drivers/usb/host/uhci-grlib.c:184:15: error: variable 'uhci_grlib_driver' has 
initializer but incomplete type
drivers/usb/host/uhci-grlib.c:184:31: error: storage size of 
'uhci_grlib_driver' isn't known
drivers/usb/host/uhci-grlib.c:185:10: error: 'struct platform_driver' has no 
member named 'probe'
drivers/usb/host/uhci-grlib.c:185:27: warning: excess elements in struct 
initializer
drivers/usb/host/uhci-grlib.c:186:10: error: 'struct platform_driver' has no 
member named 'remove_new'
drivers/usb/host/uhci-grlib.c:187:10: error: 'struct platform_driver' has no 
member named 'shutdown'
drivers/usb/host/uhci-grlib.c:188:10: error: 'struct platform_driver' has no 
member named 'driver'
drivers/usb/host/uhci-grlib.c:188:19: error: extra brace group at end of 
initializer
drivers/usb/host/uhci-grlib.c:92:36: error: invalid use of undefined type 
'struct platform_device'
drivers/usb/host/uhci-hcd.c:885:18: error: implicit declaration of function 
'platform_driver_register' [-Werror=implicit-function-declaration]
drivers/usb/host/uhci-hcd.c:902:9: error: implicit declaration of function 
'platform_driver_unregister'; did you mean 'driver_unregister'? 
[-Werror=implicit-function-declaration]
uffd-common.c:636:28: warning: unused variable 'uffdio_move' [-Wunused-variable]
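
[Note: the modpost section-mismatch warnings above follow a common pattern -- a
.text function keeps a reference to an .init.text symbol that is discarded after
boot. The snippet below is a minimal illustration of that pattern and the usual
fix; it is hypothetical and not the actual at91 code.]

#include <linux/init.h>
#include <linux/platform_device.h>

/* Hypothetical example only -- names mirror the warning, not the real driver. */
static int __init example_wakeup_status(struct platform_device *pdev)
{
	return 0;			/* lives in .init.text, freed after boot */
}

static int example_poweroff_probe(struct platform_device *pdev)
{
	/* .text -> .init.text reference: this is what modpost flags, because
	 * probe can still run after .init.text has been released. */
	return example_wakeup_status(pdev);
}

/* Usual fix: drop the __init annotation from the callee (or mark the caller
 * __init if it provably only runs at boot time). */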

Error/Warning ids grouped by kconfigs:

gcc_recent_errors
|-- alpha-randconfig-r083-20230821
|   `-- 
WARNING:modpost:missing-MODULE_DESCRIPTION()-in-drivers-char-agp-alpha-agp.o
|-- arc-randconfig-r132-20231208
|   `-- 
lib-zstd-compress-zstd_fast.c:sparse:sparse:Using-plain-integer-as-NULL-pointer
|-- arm-allyesconfig
|   `-- qcom_stats.c:(.text):undefined-reference-to-__aeabi_uldivmod
|-- arm-randconfig-r004-20221210
|   `-- 
WARNING:modpost:drivers-pcmcia-omap_cf:section-mismatch-in-reference:omap_cf_driver-(section:.data)-omap_cf_remove-(section:.exit.text)
|-- arm-randconfig-r112-20231208
|   `-- 
lib-zstd-compress-zstd_fast.c:sparse:sparse:Using-plain-integer-as-NULL-pointer
|-- i386-buildonly-randconfig-002-20230825
|   `-- 
include-linux-compiler_types.h:error:call-to-__compiletime_assert_NNN-declared-with-attribute-error:BUILD_BUG_ON-failed:sizeof(-vcpup)-SMP_CACHE_BYTES
|-- i386-randconfig-r133-20231208
|   `-- 
lib-zstd-compress-zstd_fast.c:sparse:sparse:Using-plain-integer-as-NULL-pointer
|-- loongarch-randconfig-r123-20231208
|   `-- 
lib-zstd-compress-zstd_fast.c:sparse:sparse:Using-plain-integer-as-NULL-pointer
|-- mips-allyesconfig
|   `-- 
qcom_stats.c:(.text.qcom_ddr_stats_show):undefined-reference-to-__udivdi3
|-- mips-fuloong2e_defconfig
|   `-- 
WARNING:modpost:missing-MODULE_DESCRIPTION()-in-drivers-base-regmap-regmap-mmio.o
|-- mips-randconfig-r021-20230212
|   `-- arch-mips-mm-cache.c:(.text):undefined-reference-to-r3k_cache_init
|-- parisc-allmodconfig
|   `-- 
WARNING:modpost:missing-MODULE_DESCR

[PATCH 1/2] drm/amdgpu: increase hmm range get pages timeout

2023-12-08 Thread James Zhu
When an application tries to allocate all system memory and causes
memory to swap out, hmm_range_fault needs more time to validate the
remaining pages for allocation. To be safe, increase the timeout
budget to 1 second per 64MB of range.

Signed-off-by: James Zhu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
index 081267161d40..b24eb5821fd1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
@@ -190,8 +190,8 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier 
*notifier,
pr_debug("hmm range: start = 0x%lx, end = 0x%lx",
hmm_range->start, hmm_range->end);
 
-   /* Assuming 128MB takes maximum 1 second to fault page address */
-   timeout = max((hmm_range->end - hmm_range->start) >> 27, 1UL);
+   /* Assuming 64MB takes maximum 1 second to fault page address */
+   timeout = max((hmm_range->end - hmm_range->start) >> 26, 1UL);
timeout *= HMM_RANGE_DEFAULT_TIMEOUT;
timeout = jiffies + msecs_to_jiffies(timeout);
 
-- 
2.25.1
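
[Note: a small worked example of the new budget, assuming HMM_RANGE_DEFAULT_TIMEOUT
is 1000 ms as in include/linux/hmm.h; the helper name below is made up purely for
illustration.]

#include <linux/minmax.h>

/* Hypothetical helper, illustration only: millisecond budget for a range. */
static unsigned long example_hmm_timeout_ms(unsigned long start, unsigned long end)
{
	/* one second per 64 MiB of range, minimum one second */
	return max((end - start) >> 26, 1UL) * 1000;
}

/* e.g. a 256 MiB range: (256 MiB >> 26) = 4  ->  4000 ms budget;
 * with the previous >> 27 shift the same range only got 2000 ms. */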



[PATCH 2/2] drm/amdgpu: make an improvement on amdgpu_hmm_range_get_pages

2023-12-08 Thread James Zhu
There is no need to call schedule() after every hmm_range_fault()
chunk; replace the unconditional schedule() with a cond_resched()
on the fault-retry path instead.

Signed-off-by: James Zhu 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
index b24eb5821fd1..c77c4eceea46 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_hmm.c
@@ -199,6 +199,7 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier 
*notifier,
hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
r = hmm_range_fault(hmm_range);
if (unlikely(r)) {
+   cond_resched();
/*
 * FIXME: This timeout should encompass the retry from
 * mmu_interval_read_retry() as well.
@@ -212,7 +213,6 @@ int amdgpu_hmm_range_get_pages(struct mmu_interval_notifier 
*notifier,
break;
hmm_range->hmm_pfns += MAX_WALK_BYTE >> PAGE_SHIFT;
hmm_range->start = hmm_range->end;
-   schedule();
} while (hmm_range->end < end);
 
hmm_range->start = start;
-- 
2.25.1
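
[Note: schedule() unconditionally yields the CPU even when no other task is
runnable, while cond_resched() only reschedules when the scheduler has asked for
it. A simplified sketch of the loop shape after this patch -- not the verbatim
driver code:]

	do {
		hmm_range->notifier_seq = mmu_interval_read_begin(notifier);
		r = hmm_range_fault(hmm_range);
		if (unlikely(r)) {
			cond_resched();		/* yield only if needed */
			if (r == -EBUSY && !time_after(jiffies, timeout))
				continue;	/* retry the fault */
			goto out_free_pfns;
		}

		if (hmm_range->end == end)
			break;
		hmm_range->hmm_pfns += MAX_WALK_BYTE >> PAGE_SHIFT;
		hmm_range->start = hmm_range->end;
		/* the unconditional schedule() that used to sit here is gone */
	} while (hmm_range->end < end);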



[PATCH V2] drm/amdgpu/sdma5.2: add begin/end_use ring callbacks

2023-12-08 Thread Alex Deucher
Add begin/end_use ring callbacks to disallow GFXOFF when
SDMA work is submitted and allow it again afterward.

This should avoid corner cases where GFXOFF is erroneously
entered when SDMA is still active.  For now just allow/disallow
GFXOFF in the begin and end helpers until we root cause the
issue.  This should not impact power since SDMA usage is pretty
minimal and GFXOFF should not be active when SDMA is active
anyway; this just makes it explicit.

v2: move everything into sdma5.2 code.  No reason for this
to be generic at this point.

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2220
Reviewed-by: Mario Limonciello  (v1)
Tested-by: Mario Limonciello  (v1)
Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 2e35f3571774..89f7955739f7 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -1643,6 +1643,20 @@ static void sdma_v5_2_get_clockgating_state(void 
*handle, u64 *flags)
*flags |= AMD_CG_SUPPORT_SDMA_LS;
 }
 
+static void sdma_v5_2_ring_begin_use(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+
+   amdgpu_gfx_off_ctrl(adev, false);
+}
+
+static void sdma_v5_2_ring_end_use(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+
+   amdgpu_gfx_off_ctrl(adev, true);
+}
+
 const struct amd_ip_funcs sdma_v5_2_ip_funcs = {
.name = "sdma_v5_2",
.early_init = sdma_v5_2_early_init,
@@ -1690,6 +1704,8 @@ static const struct amdgpu_ring_funcs 
sdma_v5_2_ring_funcs = {
.test_ib = sdma_v5_2_ring_test_ib,
.insert_nop = sdma_v5_2_ring_insert_nop,
.pad_ib = sdma_v5_2_ring_pad_ib,
+   .begin_use = sdma_v5_2_ring_begin_use,
+   .end_use = sdma_v5_2_ring_end_use,
.emit_wreg = sdma_v5_2_ring_emit_wreg,
.emit_reg_wait = sdma_v5_2_ring_emit_reg_wait,
.emit_reg_write_reg_wait = sdma_v5_2_ring_emit_reg_write_reg_wait,
-- 
2.42.0
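
[Note: for readers unfamiliar with these hooks, the ring layer is expected to
bracket each submission with them, roughly as sketched below. The call sites are
assumed from the common amdgpu_ring_alloc()/amdgpu_ring_commit() pattern and are
not quoted from this patch.]

/* Sketch only: how the new callbacks would be driven per submission. */
int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned int ndw)
{
	/* ... reserve ndw dwords on the ring (elided) ... */
	if (ring->funcs->begin_use)
		ring->funcs->begin_use(ring);	/* sdma_v5_2_ring_begin_use(): GFXOFF disallowed */
	return 0;
}

void amdgpu_ring_commit(struct amdgpu_ring *ring)
{
	/* ... pad the ring and bump the write pointer (elided) ... */
	if (ring->funcs->end_use)
		ring->funcs->end_use(ring);	/* sdma_v5_2_ring_end_use(): GFXOFF allowed again */
}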



Re: [PATCH 1/3] drm/amdgpu/sdma: add begin_use/end_use helpers

2023-12-08 Thread Alex Deucher
On Fri, Dec 8, 2023 at 5:19 PM Alex Deucher  wrote:
>
> Add helper functions to disallow GFXOFF while SDMA has work.
> This should avoid corner cases where GFXOFF is erroneously
> entered when SDMA is still active.  For now just allow/disallow
> GFXOFF in the begin and end helpers until we root cause the
> issue.  This should not impact power since SDMA usage is pretty
> minimal and GFXOFF should not be active when SDMA is active
> anyway; this just makes it explicit.
>
> Signed-off-by: Alex Deucher 

Since sdma5.2 is the only version that currently needs this, I think we
can just squash this all into sdma5.2.  There may even be 5.2.x
variants that don't need this.  Better to keep it all in that code.
Will send out a v2.

Alex

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 14 ++
>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +++
>  2 files changed, 17 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> index 1d9d187de6ee..d4b08d03970c 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
> @@ -326,3 +326,17 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
>
> return 0;
>  }
> +
> +void amdgpu_sdma_ring_begin_use(struct amdgpu_ring *ring)
> +{
> +   struct amdgpu_device *adev = ring->adev;
> +
> +   amdgpu_gfx_off_ctrl(adev, false);
> +}
> +
> +void amdgpu_sdma_ring_end_use(struct amdgpu_ring *ring)
> +{
> +   struct amdgpu_device *adev = ring->adev;
> +
> +   amdgpu_gfx_off_ctrl(adev, true);
> +}
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> index 173a2a308078..b52d16829204 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
> @@ -171,4 +171,7 @@ void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device 
> *adev,
>  bool duplicate);
>  int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
>
> +void amdgpu_sdma_ring_begin_use(struct amdgpu_ring *ring);
> +void amdgpu_sdma_ring_end_use(struct amdgpu_ring *ring);
> +
>  #endif
> --
> 2.42.0
>


RE: [PATCH 2/3] drm/amdgpu/sdma5.0: add begin/end_use ring callbacks

2023-12-08 Thread Deucher, Alexander
[Public]

> -Original Message-
> From: Deucher, Alexander 
> Sent: Friday, December 8, 2023 5:19 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Deucher, Alexander 
> Subject: [PATCH 2/3] drm/amdgpu/sdma5.0: add begin/end_use ring
> callbacks
>
> Add begin/end_use ring callbacks to disallow GFXOFF when SDMA work is
> submitted and allow it again afterward.
>
> Signed-off-by: Alex Deucher 

This one can probably be dropped.  It's only needed if anyone on navi1x is 
experiencing a similar issue.

Alex

> ---
>  drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index 5c1bb6d07a76..1a68cd2de522 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1790,6 +1790,8 @@ static const struct amdgpu_ring_funcs
> sdma_v5_0_ring_funcs = {
>   .test_ib = sdma_v5_0_ring_test_ib,
>   .insert_nop = sdma_v5_0_ring_insert_nop,
>   .pad_ib = sdma_v5_0_ring_pad_ib,
> + .begin_use = amdgpu_sdma_ring_begin_use,
> + .end_use = amdgpu_sdma_ring_end_use,
>   .emit_wreg = sdma_v5_0_ring_emit_wreg,
>   .emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
>   .emit_reg_write_reg_wait =
> sdma_v5_0_ring_emit_reg_write_reg_wait,
> --
> 2.42.0



Re: [PATCH 1/3] drm/amdgpu/sdma: add begin_use/end_use helpers

2023-12-08 Thread Mario Limonciello

On 12/8/2023 16:19, Alex Deucher wrote:

Add helper functions to disallow GFXOFF while SDMA has work.
This should avoid corner cases where GFXOFF is erroneously
entered when SDMA is still active.  For now just allow/disallow
GFXOFF in the begin and end helpers until we root cause the
issue.  This should not impact power since SDMA usage is pretty 
minimal and GFXOFF should not be active when SDMA is active 
anyway; this just makes it explicit.

Signed-off-by: Alex Deucher 
---


For the series:

Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2220
Reviewed-by: Mario Limonciello 
Tested-by: Mario Limonciello 


  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 14 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +++
  2 files changed, 17 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 1d9d187de6ee..d4b08d03970c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -326,3 +326,17 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
  
  	return 0;

  }
+
+void amdgpu_sdma_ring_begin_use(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+
+   amdgpu_gfx_off_ctrl(adev, false);
+}
+
+void amdgpu_sdma_ring_end_use(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+
+   amdgpu_gfx_off_ctrl(adev, true);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 173a2a308078..b52d16829204 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -171,4 +171,7 @@ void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device 
*adev,
  bool duplicate);
  int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
  
+void amdgpu_sdma_ring_begin_use(struct amdgpu_ring *ring);

+void amdgpu_sdma_ring_end_use(struct amdgpu_ring *ring);
+
  #endif




[PATCH 3/3] drm/amdgpu/sdma5.2: add begin/end_use ring callbacks

2023-12-08 Thread Alex Deucher
Add begin/end_use ring callbacks to disallow GFXOFF when
SDMA work is submitted and allow it again afterward.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 2e35f3571774..988238f20315 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -1690,6 +1690,8 @@ static const struct amdgpu_ring_funcs 
sdma_v5_2_ring_funcs = {
.test_ib = sdma_v5_2_ring_test_ib,
.insert_nop = sdma_v5_2_ring_insert_nop,
.pad_ib = sdma_v5_2_ring_pad_ib,
+   .begin_use = amdgpu_sdma_ring_begin_use,
+   .end_use = amdgpu_sdma_ring_end_use,
.emit_wreg = sdma_v5_2_ring_emit_wreg,
.emit_reg_wait = sdma_v5_2_ring_emit_reg_wait,
.emit_reg_write_reg_wait = sdma_v5_2_ring_emit_reg_write_reg_wait,
-- 
2.42.0



[PATCH 2/3] drm/amdgpu/sdma5.0: add begin/end_use ring callbacks

2023-12-08 Thread Alex Deucher
Add begin/end_use ring callbacks to disallow GFXOFF when
SDMA work is submitted and allow it again afterward.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c 
b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index 5c1bb6d07a76..1a68cd2de522 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -1790,6 +1790,8 @@ static const struct amdgpu_ring_funcs 
sdma_v5_0_ring_funcs = {
.test_ib = sdma_v5_0_ring_test_ib,
.insert_nop = sdma_v5_0_ring_insert_nop,
.pad_ib = sdma_v5_0_ring_pad_ib,
+   .begin_use = amdgpu_sdma_ring_begin_use,
+   .end_use = amdgpu_sdma_ring_end_use,
.emit_wreg = sdma_v5_0_ring_emit_wreg,
.emit_reg_wait = sdma_v5_0_ring_emit_reg_wait,
.emit_reg_write_reg_wait = sdma_v5_0_ring_emit_reg_write_reg_wait,
-- 
2.42.0



[PATCH 1/3] drm/amdgpu/sdma: add begin_use/end_use helpers

2023-12-08 Thread Alex Deucher
Add helper functions to disallow GFXOFF while SDMA has work.
This should avoid corner cases where GFXOFF is erroneously
entered when SDMA is still active.  For now just allow/disallow
GFXOFF in the begin and end helpers until we root cause the
issue.  This should not impact power since SDMA usage is pretty
minimal and GFXOFF should not be active when SDMA is active
anyway; this just makes it explicit.

Signed-off-by: Alex Deucher 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c | 14 ++
 drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  3 +++
 2 files changed, 17 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
index 1d9d187de6ee..d4b08d03970c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.c
@@ -326,3 +326,17 @@ int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev)
 
return 0;
 }
+
+void amdgpu_sdma_ring_begin_use(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+
+   amdgpu_gfx_off_ctrl(adev, false);
+}
+
+void amdgpu_sdma_ring_end_use(struct amdgpu_ring *ring)
+{
+   struct amdgpu_device *adev = ring->adev;
+
+   amdgpu_gfx_off_ctrl(adev, true);
+}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
index 173a2a308078..b52d16829204 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
@@ -171,4 +171,7 @@ void amdgpu_sdma_destroy_inst_ctx(struct amdgpu_device 
*adev,
 bool duplicate);
 int amdgpu_sdma_ras_sw_init(struct amdgpu_device *adev);
 
+void amdgpu_sdma_ring_begin_use(struct amdgpu_ring *ring);
+void amdgpu_sdma_ring_end_use(struct amdgpu_ring *ring);
+
 #endif
-- 
2.42.0



RE: [PATCH] drm/amdgpu: xgmi_fill_topology_info

2023-12-08 Thread Kim, Jonathan
[Public]

> -Original Message-
> From: Chander, Vignesh 
> Sent: Thursday, December 7, 2023 7:42 PM
> To: amd-gfx@lists.freedesktop.org
> Cc: Lazar, Lijo ; Luo, Zhigang
> ; Kim, Jonathan ;
> Chander, Vignesh 
> Subject: [PATCH] drm/amdgpu: xgmi_fill_topology_info
>
> 1. Use the mirrored topology info to fill links for VF.
> The new solution is required to simplify and optimize host driver logic.
> Only use the new solution for VFs that support full duplex and
> extended_peer_link_info; otherwise the info would be incomplete.
>
> 2. Avoid calling extended_link_info on VF as it's not supported.
>
> Signed-off-by: Vignesh Chander 

Reviewed-by: Jonathan Kim 

> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  |  4 +-
>  drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 58
> 
>  2 files changed, 52 insertions(+), 10 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> index a21045d018f2..1bf975b8d083 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
> @@ -1433,8 +1433,8 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
>get_extended_data) ||
>   amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
>   IP_VERSION(13, 0, 6);
> - bool ta_port_num_support = psp->xgmi_context.xgmi_ta_caps &
> -   EXTEND_PEER_LINK_INFO_CMD_FLAG;
> + bool ta_port_num_support = amdgpu_sriov_vf(psp->adev) ? 0 :
> +   psp->xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG;
>
>   /* popluate the shared output buffer rather than the cmd input buffer
>* with node_ids as the input for GET_PEER_LINKS command execution.
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> index 44d8c1a11e1b..dd82d73daed6 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
> @@ -823,6 +823,28 @@ static int amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_inf
>   return 0;
>  }
>
> +void amdgpu_xgmi_fill_topology_info(struct amdgpu_device *adev,
> + struct amdgpu_device *peer_adev)
> +{
> + struct psp_xgmi_topology_info *top_info = &adev->psp.xgmi_context.top_info;
> + struct psp_xgmi_topology_info *peer_info = &peer_adev->psp.xgmi_context.top_info;
> +
> + for (int i = 0; i < peer_info->num_nodes; i++) {
> + if (peer_info->nodes[i].node_id == adev->gmc.xgmi.node_id) {
> + for (int j = 0; j < top_info->num_nodes; j++) {
> + if (top_info->nodes[j].node_id == peer_adev->gmc.xgmi.node_id) {
> + peer_info->nodes[i].num_hops = top_info->nodes[j].num_hops;
> + peer_info->nodes[i].is_sharing_enabled =
> + top_info->nodes[j].is_sharing_enabled;
> + peer_info->nodes[i].num_links =
> + top_info->nodes[j].num_links;
> + return;
> + }
> + }
> + }
> + }
> +}
> +
>  int amdgpu_xgmi_add_device(struct amdgpu_device *adev)
>  {
>   struct psp_xgmi_topology_info *top_info;
> @@ -897,18 +919,38 @@ int amdgpu_xgmi_add_device(struct
> amdgpu_device *adev)
>   goto exit_unlock;
>   }
>
> - /* get latest topology info for each device from psp */
> - list_for_each_entry(tmp_adev, >device_list,
> gmc.xgmi.head) {
> - ret = psp_xgmi_get_topology_info(_adev->psp,
> count,
> - _adev-
> >psp.xgmi_context.top_info, false);
> + if (amdgpu_sriov_vf(adev) &&
> + psp->xgmi_context.xgmi_ta_caps &
> EXTEND_PEER_LINK_INFO_CMD_FLAG) {
> + /* only get topology for VF being init if it can
> support full duplex */
> + ret = psp_xgmi_get_topology_info(>psp,
> count,
> + 
> >psp.xgmi_context.top_info, false);
>   if (ret) {
> - dev_err(tmp_adev->dev,
> + dev_err(adev->dev,
>   "XGMI: Get topology failure on
> device %llx, hive %llx, ret %d",
> - tmp_adev->gmc.xgmi.node_id,
> - tmp_adev->gmc.xgmi.hive_id, ret);
> - /* To do : continue with some node failed or
> disable the whole hive */
> + adev->gmc.xgmi.node_id,
> + adev->gmc.xgmi.hive_id, ret);
> +   

Re: [PATCH 1/2] drm/amdgpu: fix tear down order in amdgpu_vm_pt_free

2023-12-08 Thread Alex Deucher
On Fri, Dec 8, 2023 at 7:55 AM Christian König
 wrote:
>
> When freeing PD/PT with shadows it can happen that the shadow
> destruction races with detaching the PD/PT from the VM causing a NULL
> pointer dereference in the invalidation code.
>
> Fix this by detaching the the PD/PT from the VM first and then
> freeinguthe shadow instead.

typo.  Should read:
freeing the shadow.

With that fixed, the series is:
Reviewed-by: Alex Deucher 


>
> Signed-off-by: Christian König 
> Fixes: https://gitlab.freedesktop.org/drm/amd/-/issues/2867
> Cc: 
> ---
>  drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> index a2287bb25223..a160265ddc07 100644
> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
> @@ -642,13 +642,14 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base 
> *entry)
>
> if (!entry->bo)
> return;
> +
> +   entry->bo->vm_bo = NULL;
> shadow = amdgpu_bo_shadowed(entry->bo);
> if (shadow) {
> ttm_bo_set_bulk_move(&shadow->tbo, NULL);
> amdgpu_bo_unref(&shadow);
> }
> ttm_bo_set_bulk_move(&entry->bo->tbo, NULL);
> -   entry->bo->vm_bo = NULL;
>
> spin_lock(&entry->vm->status_lock);
> list_del(&entry->vm_status);
> --
> 2.34.1
>


[pull] amdgpu, amdkfd, radeon drm-next-6.8

2023-12-08 Thread Alex Deucher
Hi Dave, Sima,

More updates for 6.8.

The following changes since commit 5edfd7d94b0310b74136b666551f1d23711ed445:

  Merge tag 'amd-drm-next-6.8-2023-12-01' of 
https://gitlab.freedesktop.org/agd5f/linux into drm-next (2023-12-05 12:11:41 
+1000)

are available in the Git repository at:

  https://gitlab.freedesktop.org/agd5f/linux.git 
tags/amd-drm-next-6.8-2023-12-08

for you to fetch changes up to 47c4533543af4759b7668a06c1a2ce06cdc71173:

  drm/amdgpu: Enable event log on MES 11 (2023-12-07 17:43:28 -0500)


amd-drm-next-6.8-2023-12-08:

amdgpu:
- SR-IOV fixes
- DCN 3.5 updates
- Backlight fixes
- MST fixes
- DMCUB fixes
- DPIA fixes
- Display powergating updates
- Enable writeback connectors
- Misc code cleanups
- Add more register state debugging for aquavanjaram
- Suspend fix
- Clockgating fixes
- SMU 14 updates
- PSR fixes
- MES logging updates
- Misc fixes

amdkfd:
- SVM fix

radeon:
- Fix potential memory leaks in error paths


Alex Deucher (2):
  drm/amd/display: Increase frame warning limit with KASAN or KCSAN in dml
  drm/amdgpu: fix buffer funcs setting order on suspend

Alex Hung (12):
  drm/amd/display: Avoid virtual stream encoder if not explicitly requested
  drm/amd/display: Initialize writeback connector
  drm/amd/display: Check writeback connectors in 
create_validate_stream_for_sink
  drm/amd/display: Hande writeback request from userspace
  drm/amd/display: Add writeback enable/disable in dc
  drm/amd/display: Fix writeback_info never got updated
  drm/amd/display: Validate hw_points_num before using it
  drm/amd/display: Fix writeback_info is not removed
  drm/amd/display: Add writeback enable field (wb_enabled)
  drm/amd/display: Setup for mmhubbub3_warmup_mcif with big buffer
  drm/amd/display: Add new set_fc_enable to struct dwbc_funcs
  drm/amd/display: Disable DWB frame capture to emulate oneshot

Alvin Lee (2):
  drm/amd/display: Optimize fast validation cases
  drm/amd/display: Use channel_width = 2 for vram table 3.0

Aric Cyr (1):
  drm/amd/display: 3.2.263

Aurabindo Pillai (1):
  drm/amd: Add a DC debug mask for DML2

Bokun Zhang (2):
  drm/amd/amdgpu: Move vcn4 fw_shared init to a single function
  drm/amd/amdgpu: SRIOV full reset issue with VCN

Charlene Liu (4):
  drm/amd/display: initialize all the dpm level's stutter latency
  drm/amd/display: insert drv-pmfw log + rollback to new context
  drm/amd/display: revert removing otg toggle w/a back when no active 
display
  drm/amd/display: keep domain24 power on if eDP not exist

Chris Park (1):
  drm/amd/display: Update BIOS FW info table revision

Daniel Miess (1):
  drm/amd/display: Add missing dcn35 RCO registers

Dennis Chan (1):
  drm/amd/display: Fix Replay Desync Error IRQ handler

Dillon Varone (1):
  drm/amd/display: Add dml2 copy functions

Dmitrii Galantsev (1):
  drm/amd/pm: fix pp_*clk_od typo

George Shen (1):
  drm/amd/display: Skip DPIA-specific DP LL automation flag for non-DPIA 
links

Hamza Mahfooz (1):
  drm/amd/display: fix hw rotated modes when PSR-SU is enabled

Harry Wentland (7):
  drm/amd/display: Skip entire amdgpu_dm build if !CONFIG_DRM_AMD_DC
  drm/amd/display: Create one virtual connector in DC
  drm/amd/display: Skip writeback connector when we get amdgpu_dm_connector
  drm/amd/display: Return drm_connector from 
find_first_crtc_matching_connector
  drm/amd/display: Use drm_connector in create_stream_for_sink
  drm/amd/display: Create amdgpu_dm_wb_connector
  drm/amd/display: Create fake sink and stream for writeback connector

Hawking Zhang (1):
  drm/amdgpu: Update fw version for boot time error query

Ilya Bakoulin (1):
  drm/amd/display: Fix MST PBN/X.Y value calculations

Ivan Lipski (1):
  drm/amd/display: Add monitor patch for specific eDP

Jiadong Zhu (1):
  drm/amdgpu: disable MCBP by default

Johnson Chen (1):
  drm/amd/display: Fix null pointer

Josip Pavic (1):
  drm/amd/display: Increase scratch buffer size

Krunoslav Kovac (1):
  drm/amd/display: Change dither policy for 10bpc to round

Lewis Huang (1):
  drm/amd/display: Pass pwrseq inst for backlight and ABM

Li Ma (1):
  drm/amd/swsmu: update smu v14_0_0 driver if version and metrics table

Lijo Lazar (6):
  drm/amdgpu: Read aquavanjaram WAFL register state
  drm/amdgpu: Read aquavanjaram USR register state
  drm/amdgpu: Restrict extended wait to PSP v13.0.6
  drm/amdgpu: Add NULL checks for function pointers
  drm/amdgpu: Update HDP 4.4.2 clock gating flags
  drm/amdgpu: Avoid querying DRM MGCG status

Mario Limonciello (1):
  drm/amd/display: Restore guard against default backlight value < 1 nit

Michael Strauss (1):
  drm/amd/display: Only enumerate top local sink as DP2 

Re: [PATCH 2/2] drm/amdgpu: Enable clear page functionality

2023-12-08 Thread Alex Deucher
On Fri, Dec 8, 2023 at 5:07 AM Christian König  wrote:
>
> Am 07.12.23 um 16:11 schrieb Arunpravin Paneer Selvam:
> > Add clear page support in vram memory region.
>
> The first patch looks good, but this here needs quite some work.
>
> >
> > Signed-off-by: Arunpravin Paneer Selvam 
> > ---
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 13 +++--
> >   .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 ++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 50 +++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  4 ++
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 14 +-
> >   drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  |  5 ++
> >   6 files changed, 105 insertions(+), 6 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> > index cef920a93924..bc4ea87f8b5e 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
> > @@ -39,6 +39,7 @@
> >   #include "amdgpu.h"
> >   #include "amdgpu_trace.h"
> >   #include "amdgpu_amdkfd.h"
> > +#include "amdgpu_vram_mgr.h"
> >
> >   /**
> >* DOC: amdgpu_object
> > @@ -629,15 +630,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
> >
> >   if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED &&
> >   bo->tbo.resource->mem_type == TTM_PL_VRAM) {
> > - struct dma_fence *fence;
> > + struct dma_fence *fence = NULL;
> >
> > - r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);
> > + r = amdgpu_clear_buffer(bo, bo->tbo.base.resv, &fence, true);
> >   if (unlikely(r))
> >   goto fail_unreserve;
> >
> > - dma_resv_add_fence(bo->tbo.base.resv, fence,
> > -DMA_RESV_USAGE_KERNEL);
> > - dma_fence_put(fence);
> > + if (fence) {
> > + dma_resv_add_fence(bo->tbo.base.resv, fence,
> > +DMA_RESV_USAGE_KERNEL);
> > + dma_fence_put(fence);
> > + }
> >   }
> >   if (!bp->resv)
> >   amdgpu_bo_unreserve(bo);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
> > index 381101d2bf05..50fcd86e1033 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
> > @@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct 
> > amdgpu_res_cursor *cur, uint64_t size)
> >   }
> >   }
> >
> > +/**
> > + * amdgpu_res_cleared - check if blocks are cleared
> > + *
> > + * @cur: the cursor to extract the block
> > + *
> > + * Check if the @cur block is cleared
> > + */
> > +static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
> > +{
> > + struct drm_buddy_block *block;
> > +
> > + switch (cur->mem_type) {
> > + case TTM_PL_VRAM:
> > + block = cur->node;
> > +
> > + if (!amdgpu_vram_mgr_is_cleared(block))
> > + return false;
> > + break;
> > + default:
> > + return false;
> > + }
> > +
> > + return true;
> > +}
> > +
> >   #endif
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
> > b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > index 05991c5c8ddb..6d7514e8f40c 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
> > @@ -,6 +,56 @@ static int amdgpu_ttm_fill_mem(struct amdgpu_ring 
> > *ring, uint32_t src_data,
> >   return 0;
> >   }
> >
> > +int amdgpu_clear_buffer(struct amdgpu_bo *bo,
> > + struct dma_resv *resv,
> > + struct dma_fence **fence,
> > + bool delayed)
>
> Drop the delayed parameter, that doesn't make any sense here.
>
> And as Alex said please use an amdgpu_ttm_ prefix for the function name.
>
> > +{
> > + struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
> > + struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
> > + struct amdgpu_res_cursor cursor;
> > + struct dma_fence *f = NULL;
> > + u64 addr;
> > + int r;
> > +
> > + if (!adev->mman.buffer_funcs_enabled)
> > + return -EINVAL;
> > +
> > + amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor);
> > +
> > + mutex_lock(&adev->mman.gtt_window_lock);
> > + while (cursor.remaining) {
> > + struct dma_fence *next = NULL;
> > + u64 size;
> > +
> > + /* Never clear more than 256MiB at once to avoid timeouts */
> > + size = min(cursor.size, 256ULL << 20);
> > +
> > + if (!amdgpu_res_cleared(&cursor)) {
>
> This needs to come before the min(cursor.size) directly above. I
> suggest a handling like this:
>
> if (amdgpu_res_cleared(&cursor)) {
> amdgpu_res_next(&cursor, cursor.size);
> continue;
> }
>
> 
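
[Note: putting Christian's suggestion together, the start of the loop would
roughly look like the sketch below -- an illustration only, with the '&cursor'
arguments that the mail archive stripped written out by hand.]

	while (cursor.remaining) {
		struct dma_fence *next = NULL;
		u64 size;

		/* skip blocks that were already cleared when they were freed */
		if (amdgpu_res_cleared(&cursor)) {
			amdgpu_res_next(&cursor, cursor.size);
			continue;
		}

		/* Never clear more than 256MiB at once to avoid timeouts */
		size = min(cursor.size, 256ULL << 20);

		/* ... emit the clear for `size` bytes, chain `next` into `f`,
		 * then amdgpu_res_next(&cursor, size); ... */
	}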

Re: [PATCH 1/2] drm/amdgpu/debugfs: fix error code when smc register accessors are NULL

2023-12-08 Thread Alex Deucher
On Fri, Dec 8, 2023 at 8:24 AM Christian König
 wrote:
>
> The second patch never made it into my inbox, but the first one is
> Reviewed-by: Christian König .

Thanks.  Patch 2:
https://patchwork.freedesktop.org/patch/569132/

Alex

>
> Christian.
>
> Am 07.12.23 um 18:39 schrieb Alex Deucher:
> > Ping on this series?
> >
> > Alex
> >
> > On Mon, Nov 27, 2023 at 5:52 PM Alex Deucher  
> > wrote:
> >> Should be -EOPNOTSUPP.
> >>
> >> Fixes: 5104fdf50d32 ("drm/amdgpu: Fix a null pointer access when the 
> >> smc_rreg pointer is NULL")
> >> Signed-off-by: Alex Deucher 
> >> ---
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 4 ++--
> >>   1 file changed, 2 insertions(+), 2 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> >> index 0e61ebdb3f3e..8d4a3ff65c18 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
> >> @@ -755,7 +755,7 @@ static ssize_t amdgpu_debugfs_regs_smc_read(struct 
> >> file *f, char __user *buf,
> >>  int r;
> >>
> >>  if (!adev->smc_rreg)
> >> -   return -EPERM;
> >> +   return -EOPNOTSUPP;
> >>
> >>  if (size & 0x3 || *pos & 0x3)
> >>  return -EINVAL;
> >> @@ -814,7 +814,7 @@ static ssize_t amdgpu_debugfs_regs_smc_write(struct 
> >> file *f, const char __user *
> >>  int r;
> >>
> >>  if (!adev->smc_wreg)
> >> -   return -EPERM;
> >> +   return -EOPNOTSUPP;
> >>
> >>  if (size & 0x3 || *pos & 0x3)
> >>  return -EINVAL;
> >> --
> >> 2.42.0
> >>
>


Re: [PATCH linux-next] drm/amd/display: replace kzalloc and memcpy with kmemdup

2023-12-08 Thread Christophe JAILLET

On 08/12/2023 at 03:44, yang.gua...@zte.com.cn wrote:

From: Yang Guang 

Convert kzalloc/memcpy operations to memdup makes for
cleaner code and avoids memcpy() failures


Hi,

Usually, function names are written with () in the commit description
(i.e. kzalloc()/memcpy()).


memdup should be kmemdup().

Finally the proposed change does not avoid memcpy() failures. Should it 
fail (what does it mean in this context?), kmemdup() would behave 
exactly the same.




Signed-off-by: Chen Haonan 
---
  drivers/gpu/drm/amd/display/dc/core/dc.c | 10 --
  1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c 
b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 76b47f178127..867e1a0fdef6 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -2264,12 +2264,10 @@ struct dc_state *dc_copy_state(struct dc_state *src_ctx)

  #ifdef CONFIG_DRM_AMD_DC_FP
if (new_ctx->bw_ctx.dml2) {
-   dml2 = kzalloc(sizeof(struct dml2_context), GFP_KERNEL);
-   if (!dml2)
-   return NULL;
-
-   memcpy(dml2, src_ctx->bw_ctx.dml2, sizeof(struct dml2_context));
-   new_ctx->bw_ctx.dml2 = dml2;
+   dml2 = kmemdup(src_ctx->bw_ctx.dml2, sizeof(struct dml2_context), GFP_KERNEL);


sizeof(struct dml2_context) could be sizeof(*dml2) to be less verbose.

CJ


+   if (!dml2)
+   return NULL;
+   new_ctx->bw_ctx.dml2 = dml2;
}
  #endif
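
[Note: with both suggestions folded in, the hunk would read roughly as below --
a sketch assuming the surrounding context in dc_copy_state() is unchanged.]

#ifdef CONFIG_DRM_AMD_DC_FP
	if (new_ctx->bw_ctx.dml2) {
		/* duplicate the DML2 context in one call instead of kzalloc+memcpy */
		dml2 = kmemdup(src_ctx->bw_ctx.dml2, sizeof(*dml2), GFP_KERNEL);
		if (!dml2)
			return NULL;
		new_ctx->bw_ctx.dml2 = dml2;
	}
#endif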





Re: [PATCH] drm/amdgpu: Enable tunneling on high-priority compute queues

2023-12-08 Thread Alex Deucher
On Fri, Dec 8, 2023 at 12:27 PM Joshua Ashton  wrote:
>
> FWIW, we are shipping this right now in SteamOS Preview channel
> (probably going to Stable soon) and it seems to be working as expected
> and fixing issues there in instances where we need to composite and the
> compositor work we are forced to do would take longer than the compositor
> redzone to vblank.
>
> Previously in high gfx workloads like Cyberpunk using 100% of the GPU,
> we would consistently miss the deadline as composition could take
> anywhere from 2-6ms fairly randomly.
>
> Now it seems the time for the compositor's work to complete is pretty
> consistent and well in-time in gpuvis for every frame.

I was mostly just trying to look up the information to verify that it
was set up correctly, but I guess Marek already did and provided you
with that info, so it's probably fine as is.

>
> The only times we are not meeting deadline now is when there is an
> application using very little GPU and finishes incredibly quick, and the
> compositor is doing significantly more work (eg. FSR from 800p -> 4K or
> whatever), but that's a separate problem that can likely be solved by
> inlining some of the composition work with the client's dmabuf work if
> it has focus to avoid those clock bubbles.
>
> I heard some musings about dmabuf deadline kernel work recently, but not
> sure if any of that is applicable to AMD.

I think something like a workload hint would be more useful.  We did a
few patch sets to allow userspace to provide a hint to the kernel
about the workload type so the kernel could adjust the power
management heuristics accordingly, but there were concerns that the
UMDs would have to maintain application lists to select which
heuristic worked best for each application.  Maybe it would be better
to provide a general classification?  E.g., if the GL or vulkan app
uses these extensions, it's probably a compute type application vs
something more graphics-y.  The usual trade-off between power and
performance.  In general, just letting the firmware pick the clock
based on perf counters generally seems to work the best.  Maybe a
general workload hint set by the compositor based on the content type
it's displaying would be a better option (video vs gaming vs desktop)?

The deadline stuff doesn't really align well with what we can do with
our firmware and seems ripe for abuse.  Apps can just ask for high
clocks all the time which is great for performance, but not great for
power.  Plus there is not much room for anything other than max clocks
since you don't know how big the workload is or which clocks are the
limiting factor.

Alex

>
> - Joshie ✨
>
> On 12/8/23 15:33, Marek Olšák wrote:
> > On Fri, Dec 8, 2023 at 9:57 AM Christian König wrote:
> >
> > Am 08.12.23 um 12:43 schrieb Friedrich Vock:
> >  > On 08.12.23 10:51, Christian König wrote:
> >  >> Well longer story short Alex and I have been digging up the
> >  >> documentation for this and as far as we can tell this isn't correct.
> >  > Huh. I initially talked to Marek about this, adding him in Cc.
> >
> > Yeah, from the userspace side all you need to do is to set the bit as
> > far as I can tell.
> >
> >  >>
> >  >> You need to do quite a bit more before you can turn on this feature.
> >  >> What userspace side do you refer to?
> >  > I was referring to the Mesa merge request I made
> > (https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462).
> >  > If/When you have more details about what else needs to be done, feel
> >  > free to let me know.
> >
> > For example from the hardware specification explicitly states that the
> > kernel driver should make sure that only one app/queue is using this at
> > the same time. That might work for now since we should only have a
> > single compute priority queue, but we are not 100% sure yet.
> >
> >
> > This is incorrect. While the hw documentation says it's considered
> > "unexpected programming", it also says that the hardware algorithm
> > handles it correctly and it describes what happens in this case:
> > Tunneled waves from different queues are treated as equal.
> >
> > Marek
>


RE: [PATCH] drm/amdgpu: xgmi_fill_topology_info

2023-12-08 Thread Luo, Zhigang
[AMD Official Use Only - General]

Reviewed-by: Zhigang Luo 

-Original Message-
From: Chander, Vignesh 
Sent: Thursday, December 7, 2023 7:42 PM
To: amd-gfx@lists.freedesktop.org
Cc: Lazar, Lijo ; Luo, Zhigang ; Kim, 
Jonathan ; Chander, Vignesh 
Subject: [PATCH] drm/amdgpu: xgmi_fill_topology_info

1. Use the mirrored topology info to fill links for VF.
The new solution is required to simplify and optimize host driver logic.
Only use the new solution for VFs that support full duplex and 
extended_peer_link_info; otherwise the info would be incomplete.

2. Avoid calling extended_link_info on VF as it's not supported.

Signed-off-by: Vignesh Chander 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c  |  4 +-  
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 58 
 2 files changed, 52 insertions(+), 10 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
index a21045d018f2..1bf975b8d083 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c
@@ -1433,8 +1433,8 @@ int psp_xgmi_get_topology_info(struct psp_context *psp,
 get_extended_data) ||
amdgpu_ip_version(psp->adev, MP0_HWIP, 0) ==
IP_VERSION(13, 0, 6);
-   bool ta_port_num_support = psp->xgmi_context.xgmi_ta_caps &
-   EXTEND_PEER_LINK_INFO_CMD_FLAG;
+   bool ta_port_num_support = amdgpu_sriov_vf(psp->adev) ? 0 :
+   psp->xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG;

	/* popluate the shared output buffer rather than the cmd input buffer
	 * with node_ids as the input for GET_PEER_LINKS command execution.
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index 44d8c1a11e1b..dd82d73daed6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -823,6 +823,28 @@ static int 
amdgpu_xgmi_initialize_hive_get_data_partition(struct amdgpu_hive_inf
return 0;
 }

+void amdgpu_xgmi_fill_topology_info(struct amdgpu_device *adev,
+   struct amdgpu_device *peer_adev)
+{
+   struct psp_xgmi_topology_info *top_info = &adev->psp.xgmi_context.top_info;
+   struct psp_xgmi_topology_info *peer_info = &peer_adev->psp.xgmi_context.top_info;
+
+   for (int i = 0; i < peer_info->num_nodes; i++) {
+   if (peer_info->nodes[i].node_id == adev->gmc.xgmi.node_id) {
+   for (int j = 0; j < top_info->num_nodes; j++) {
+   if (top_info->nodes[j].node_id == 
peer_adev->gmc.xgmi.node_id) {
+   peer_info->nodes[i].num_hops = 
top_info->nodes[j].num_hops;
+   peer_info->nodes[i].is_sharing_enabled =
+   
top_info->nodes[j].is_sharing_enabled;
+   peer_info->nodes[i].num_links =
+   
top_info->nodes[j].num_links;
+   return;
+   }
+   }
+   }
+   }
+}
+
 int amdgpu_xgmi_add_device(struct amdgpu_device *adev)  {
struct psp_xgmi_topology_info *top_info; @@ -897,18 +919,38 @@ int 
amdgpu_xgmi_add_device(struct amdgpu_device *adev)
goto exit_unlock;
}

-   /* get latest topology info for each device from psp */
-   list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
-   ret = psp_xgmi_get_topology_info(&tmp_adev->psp, count,
-   &tmp_adev->psp.xgmi_context.top_info, false);
+   if (amdgpu_sriov_vf(adev) &&
+   psp->xgmi_context.xgmi_ta_caps & EXTEND_PEER_LINK_INFO_CMD_FLAG) {
+   /* only get topology for VF being init if it can support full duplex */
+   ret = psp_xgmi_get_topology_info(&adev->psp, count,
+   &adev->psp.xgmi_context.top_info, false);
if (ret) {
-   dev_err(tmp_adev->dev,
+   dev_err(adev->dev,
"XGMI: Get topology failure on device 
%llx, hive %llx, ret %d",
-   tmp_adev->gmc.xgmi.node_id,
-   tmp_adev->gmc.xgmi.hive_id, ret);
-   /* To do : continue with some node failed or 
disable the whole hive */
+   adev->gmc.xgmi.node_id,
+   adev->gmc.xgmi.hive_id, ret);
+   /* To do: continue with some node failed or 

Re: [PATCH] drm/amdgpu: Enable tunneling on high-priority compute queues

2023-12-08 Thread Joshua Ashton
FWIW, we are shipping this right now in SteamOS Preview channel 
(probably going to Stable soon) and it seems to be working as expected 
and fixing issues there in instances where we need to composite and the 
compositor work we are forced to do would take longer than the compositor 
redzone to vblank.


Previously in high gfx workloads like Cyberpunk using 100% of the GPU, 
we would consistently miss the deadline as composition could take 
anywhere from 2-6ms fairly randomly.


Now it seems the time for the compositor's work to complete is pretty 
consistent and well in-time in gpuvis for every frame.


The only times we are not meeting deadline now is when there is an 
application using very little GPU and finishes incredibly quick, and the 
compositor is doing significantly more work (eg. FSR from 800p -> 4K or 
whatever), but that's a separate problem that can likely be solved by 
inlining some of the composition work with the client's dmabuf work if 
it has focus to avoid those clock bubbles.


I heard some musings about dmabuf deadline kernel work recently, but not 
sure if any of that is applicable to AMD.


- Joshie ✨

On 12/8/23 15:33, Marek Olšák wrote:
On Fri, Dec 8, 2023 at 9:57 AM Christian König wrote:


Am 08.12.23 um 12:43 schrieb Friedrich Vock:
 > On 08.12.23 10:51, Christian König wrote:
 >> Well longer story short Alex and I have been digging up the
 >> documentation for this and as far as we can tell this isn't correct.
 > Huh. I initially talked to Marek about this, adding him in Cc.

Yeah, from the userspace side all you need to do is to set the bit as
far as I can tell.

 >>
 >> You need to do quite a bit more before you can turn on this feature.
 >> What userspace side do you refer to?
 > I was referring to the Mesa merge request I made
(https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462).
 > If/When you have more details about what else needs to be done, feel
 > free to let me know.

For example from the hardware specification explicitly states that the
kernel driver should make sure that only one app/queue is using this at
the same time. That might work for now since we should only have a
single compute priority queue, but we are not 100% sure yet.


This is incorrect. While the hw documentation says it's considered 
"unexpected programming", it also says that the hardware algorithm 
handles it correctly and it describes what happens in this case: 
Tunneled waves from different queues are treated as equal.


Marek




Re: [PATCH] drm/amdgpu: Enable tunneling on high-priority compute queues

2023-12-08 Thread Marek Olšák
On Fri, Dec 8, 2023 at 9:57 AM Christian König 
wrote:

> Am 08.12.23 um 12:43 schrieb Friedrich Vock:
> > On 08.12.23 10:51, Christian König wrote:
> >> Well longer story short Alex and I have been digging up the
> >> documentation for this and as far as we can tell this isn't correct.
> > Huh. I initially talked to Marek about this, adding him in Cc.
>
> Yeah, from the userspace side all you need to do is to set the bit as
> far as I can tell.
>
> >>
> >> You need to do quite a bit more before you can turn on this feature.
> >> What userspace side do you refer to?
> > I was referring to the Mesa merge request I made
> > (https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462).
> > If/When you have more details about what else needs to be done, feel
> > free to let me know.
>
> For example from the hardware specification explicitly states that the
> kernel driver should make sure that only one app/queue is using this at
> the same time. That might work for now since we should only have a
> single compute priority queue, but we are not 100% sure yet.
>

This is incorrect. While the hw documentation says it's considered
"unexpected programming", it also says that the hardware algorithm handles
it correctly and it describes what happens in this case: Tunneled waves
from different queues are treated as equal.

Marek


Re: [PATCH] drm/amdgpu: Enable tunneling on high-priority compute queues

2023-12-08 Thread Olsak, Marek
[AMD Official Use Only - General]

Christian, firmware has nothing to do with it and doesn't control it. That was 
a wrong group of people to ping. It's only implemented in the SPI and tested by 
the SPI team and PAL team.

Marek

From: Koenig, Christian 
Sent: December 8, 2023 09:38
To: Friedrich Vock ; amd-gfx@lists.freedesktop.org 

Cc: Deucher, Alexander ; Olsak, Marek 

Subject: Re: [PATCH] drm/amdgpu: Enable tunneling on high-priority compute 
queues

Am 08.12.23 um 12:43 schrieb Friedrich Vock:
> On 08.12.23 10:51, Christian König wrote:
>> Well longer story short Alex and I have been digging up the
>> documentation for this and as far as we can tell this isn't correct.
> Huh. I initially talked to Marek about this, adding him in Cc.

Yeah, from the userspace side all you need to do is to set the bit as
far as I can tell.

>>
>> You need to do quite a bit more before you can turn on this feature.
>> What userspace side do you refer to?
> I was referring to the Mesa merge request I made
> (https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462).
> If/When you have more details about what else needs to be done, feel
> free to let me know.

For example from the hardware specification explicitly states that the
kernel driver should make sure that only one app/queue is using this at
the same time. That might work for now since we should only have a
single compute priority queue, but we are not 100% sure yet.

Apart from that the hardware documentation only says that it's a nice to
have feature and when we pinged firmware engineers to get more
information they didn't know the feature immediately either.

That is usually a strong indicator that stuff was implemented in the
hardware, but not fully completed and tested by the firmware team and
validation team.

Alex and I need to confirm that this feature actually works the way it
should and that it's validated/stable/ready for production use.

Regards,
Christian.

> I'm happy to expand this to add the rest of what's needed as well.
>
> Thanks,
> Friedrich
>
>>
>> Regards,
>> Christian.
>>
>> Am 08.12.23 um 09:19 schrieb Friedrich Vock:
>>> Friendly ping on this one.
>>> Userspace side got merged, so would be great to land this patch too :)
>>>
>>> On 02.12.23 01:17, Friedrich Vock wrote:
 This improves latency if the GPU is already busy with other work.
 This is useful for VR compositors that submit highly latency-sensitive
 compositing work on high-priority compute queues while the GPU is busy
 rendering the next frame.

 Userspace merge request:
 https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462

 Signed-off-by: Friedrich Vock 
 ---
   drivers/gpu/drm/amd/amdgpu/amdgpu.h  |  1 +
   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 10 ++
   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  3 ++-
   drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c   |  3 ++-
   4 files changed, 11 insertions(+), 6 deletions(-)

 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
 b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
 index 9505dc8f9d69..4b923a156c4e 100644
 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
 @@ -790,6 +790,7 @@ struct amdgpu_mqd_prop {
   uint64_t eop_gpu_addr;
   uint32_t hqd_pipe_priority;
   uint32_t hqd_queue_priority;
 +bool allow_tunneling;
   bool hqd_active;
   };

 diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
 b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
 index 231d49132a56..4d98e8879be8 100644
 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
 +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
 @@ -620,6 +620,10 @@ static void amdgpu_ring_to_mqd_prop(struct
 amdgpu_ring *ring,
   struct amdgpu_mqd_prop *prop)
   {
   struct amdgpu_device *adev = ring->adev;
 +bool is_high_prio_compute = ring->funcs->type ==
 AMDGPU_RING_TYPE_COMPUTE &&
 + amdgpu_gfx_is_high_priority_compute_queue(adev, ring);
 +bool is_high_prio_gfx = ring->funcs->type ==
 AMDGPU_RING_TYPE_GFX &&
 + amdgpu_gfx_is_high_priority_graphics_queue(adev, ring);

   memset(prop, 0, sizeof(*prop));

 @@ -637,10 +641,8 @@ static void amdgpu_ring_to_mqd_prop(struct
 amdgpu_ring *ring,
*/
   prop->hqd_active = ring->funcs->type == AMDGPU_RING_TYPE_KIQ;

 -if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&
 - amdgpu_gfx_is_high_priority_compute_queue(adev, ring)) ||
 -(ring->funcs->type == AMDGPU_RING_TYPE_GFX &&
 - amdgpu_gfx_is_high_priority_graphics_queue(adev, ring))) {
 +prop->allow_tunneling = is_high_prio_compute;
 +if (is_high_prio_compute || is_high_prio_gfx) {
   prop->hqd_pipe_priority = AMDGPU_GFX_PIPE_PRIO_HIGH;
   prop->hqd_queue_priority =

Re: [PATCH] drm/amdgpu: Enable tunneling on high-priority compute queues

2023-12-08 Thread Christian König

Am 08.12.23 um 12:43 schrieb Friedrich Vock:

On 08.12.23 10:51, Christian König wrote:

Well longer story short Alex and I have been digging up the
documentation for this and as far as we can tell this isn't correct.

Huh. I initially talked to Marek about this, adding him in Cc.


Yeah, from the userspace side all you need to do is to set the bit as 
far as I can tell.




You need to do quite a bit more before you can turn on this feature.
What userspace side do you refer to?

I was referring to the Mesa merge request I made
(https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462).
If/When you have more details about what else needs to be done, feel
free to let me know.


For example from the hardware specification explicitly states that the 
kernel driver should make sure that only one app/queue is using this at 
the same time. That might work for now since we should only have a 
single compute priority queue, but we are not 100% sure yet.


Apart from that the hardware documentation only says that it's a nice to 
have feature and when we pinged firmware engineers to get more 
information they didn't know the feature immediately either.


That is usually a strong indicator that stuff was implemented in the 
hardware, but not fully completed and tested by the firmware team and 
validation team.


Alex and I need to confirm that this feature actually works the way it 
should and that it's validated/stable/ready for production use.


Regards,
Christian.


I'm happy to expand this to add the rest of what's needed as well.

Thanks,
Friedrich



Regards,
Christian.

Am 08.12.23 um 09:19 schrieb Friedrich Vock:

Friendly ping on this one.
Userspace side got merged, so would be great to land this patch too :)

On 02.12.23 01:17, Friedrich Vock wrote:

This improves latency if the GPU is already busy with other work.
This is useful for VR compositors that submit highly latency-sensitive
compositing work on high-priority compute queues while the GPU is busy
rendering the next frame.

Userspace merge request:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462

Signed-off-by: Friedrich Vock 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h  |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 10 ++
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  3 ++-
  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c   |  3 ++-
  4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9505dc8f9d69..4b923a156c4e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -790,6 +790,7 @@ struct amdgpu_mqd_prop {
  uint64_t eop_gpu_addr;
  uint32_t hqd_pipe_priority;
  uint32_t hqd_queue_priority;
+    bool allow_tunneling;
  bool hqd_active;
  };

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 231d49132a56..4d98e8879be8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -620,6 +620,10 @@ static void amdgpu_ring_to_mqd_prop(struct
amdgpu_ring *ring,
  struct amdgpu_mqd_prop *prop)
  {
  struct amdgpu_device *adev = ring->adev;
+    bool is_high_prio_compute = ring->funcs->type ==
AMDGPU_RING_TYPE_COMPUTE &&
+ amdgpu_gfx_is_high_priority_compute_queue(adev, ring);
+    bool is_high_prio_gfx = ring->funcs->type ==
AMDGPU_RING_TYPE_GFX &&
+ amdgpu_gfx_is_high_priority_graphics_queue(adev, ring);

  memset(prop, 0, sizeof(*prop));

@@ -637,10 +641,8 @@ static void amdgpu_ring_to_mqd_prop(struct
amdgpu_ring *ring,
   */
  prop->hqd_active = ring->funcs->type == AMDGPU_RING_TYPE_KIQ;

-    if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&
- amdgpu_gfx_is_high_priority_compute_queue(adev, ring)) ||
-    (ring->funcs->type == AMDGPU_RING_TYPE_GFX &&
- amdgpu_gfx_is_high_priority_graphics_queue(adev, ring))) {
+    prop->allow_tunneling = is_high_prio_compute;
+    if (is_high_prio_compute || is_high_prio_gfx) {
  prop->hqd_pipe_priority = AMDGPU_GFX_PIPE_PRIO_HIGH;
  prop->hqd_queue_priority = 
AMDGPU_GFX_QUEUE_PRIORITY_MAXIMUM;

  }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index c8a3bf01743f..73f6d7e72c73 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -6593,7 +6593,8 @@ static int gfx_v10_0_compute_mqd_init(struct
amdgpu_device *adev, void *m,
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, ENDIAN_SWAP, 1);
  #endif
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
-    tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
+    tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH,
+    prop->allow_tunneling);
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
  

Re: [PATCH] drm/amdgpu: Enable tunneling on high-priority compute queues

2023-12-08 Thread Marek Olšák
It's correct according to our documentation.

Reviewed-by: Marek Olšák 

Marek

On Fri, Dec 8, 2023 at 5:47 AM Christian König 
wrote:

> Well, long story short: Alex and I have been digging up the
> documentation for this, and as far as we can tell this isn't correct.
>
> You need to do quite a bit more before you can turn on this feature.
> What userspace side do you refer to?
>
> Regards,
> Christian.
>
> On 08.12.23 at 09:19, Friedrich Vock wrote:
> > Friendly ping on this one.
> > Userspace side got merged, so would be great to land this patch too :)
> >
> > On 02.12.23 01:17, Friedrich Vock wrote:
> >> This improves latency if the GPU is already busy with other work.
> >> This is useful for VR compositors that submit highly latency-sensitive
> >> compositing work on high-priority compute queues while the GPU is busy
> >> rendering the next frame.
> >>
> >> Userspace merge request:
> >> https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462
> >>
> >> Signed-off-by: Friedrich Vock 
> >> ---
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu.h  |  1 +
> >>   drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 10 ++
> >>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  3 ++-
> >>   drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c   |  3 ++-
> >>   4 files changed, 11 insertions(+), 6 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> index 9505dc8f9d69..4b923a156c4e 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
> >> @@ -790,6 +790,7 @@ struct amdgpu_mqd_prop {
> >>   uint64_t eop_gpu_addr;
> >>   uint32_t hqd_pipe_priority;
> >>   uint32_t hqd_queue_priority;
> >> +bool allow_tunneling;
> >>   bool hqd_active;
> >>   };
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> >> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> >> index 231d49132a56..4d98e8879be8 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
> >> @@ -620,6 +620,10 @@ static void amdgpu_ring_to_mqd_prop(struct
> >> amdgpu_ring *ring,
> >>   struct amdgpu_mqd_prop *prop)
> >>   {
> >>   struct amdgpu_device *adev = ring->adev;
> >> +bool is_high_prio_compute = ring->funcs->type ==
> >> AMDGPU_RING_TYPE_COMPUTE &&
> >> + amdgpu_gfx_is_high_priority_compute_queue(adev, ring);
> >> +bool is_high_prio_gfx = ring->funcs->type ==
> >> AMDGPU_RING_TYPE_GFX &&
> >> + amdgpu_gfx_is_high_priority_graphics_queue(adev, ring);
> >>
> >>   memset(prop, 0, sizeof(*prop));
> >>
> >> @@ -637,10 +641,8 @@ static void amdgpu_ring_to_mqd_prop(struct
> >> amdgpu_ring *ring,
> >>*/
> >>   prop->hqd_active = ring->funcs->type == AMDGPU_RING_TYPE_KIQ;
> >>
> >> -if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&
> >> - amdgpu_gfx_is_high_priority_compute_queue(adev, ring)) ||
> >> -(ring->funcs->type == AMDGPU_RING_TYPE_GFX &&
> >> - amdgpu_gfx_is_high_priority_graphics_queue(adev, ring))) {
> >> +prop->allow_tunneling = is_high_prio_compute;
> >> +if (is_high_prio_compute || is_high_prio_gfx) {
> >>   prop->hqd_pipe_priority = AMDGPU_GFX_PIPE_PRIO_HIGH;
> >>   prop->hqd_queue_priority = AMDGPU_GFX_QUEUE_PRIORITY_MAXIMUM;
> >>   }
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> >> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> >> index c8a3bf01743f..73f6d7e72c73 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> >> @@ -6593,7 +6593,8 @@ static int gfx_v10_0_compute_mqd_init(struct
> >> amdgpu_device *adev, void *m,
> >>   tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, ENDIAN_SWAP, 1);
> >>   #endif
> >>   tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
> >> -tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
> >> +tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH,
> >> +prop->allow_tunneling);
> >>   tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
> >>   tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
> >>   mqd->cp_hqd_pq_control = tmp;
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> >> b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> >> index c659ef0f47ce..bdcf96df69e6 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
> >> @@ -3847,7 +3847,8 @@ static int gfx_v11_0_compute_mqd_init(struct
> >> amdgpu_device *adev, void *m,
> >>   tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, RPTR_BLOCK_SIZE,
> >>   (order_base_2(AMDGPU_GPU_PAGE_SIZE / 4) - 1));
> >>   tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
> >> -tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
> >> +tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH,
> >> +prop->allow_tunneling);
> >>   

[PATCH] drm/amd/display: Fix memory leak in dm_set_writeback()

2023-12-08 Thread Harshit Mogalapalli
'wb_info' needs to be freed on the error paths, otherwise its memory is leaked.

Smatch pointed this out.

Fixes: c81e13b929df ("drm/amd/display: Hande writeback request from userspace")
Signed-off-by: Harshit Mogalapalli 
---
This is based on static analysis and only compile tested
---
 drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
index afdcc43ea06c..333995f70239 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c
@@ -8871,12 +8871,14 @@ static void dm_set_writeback(struct 
amdgpu_display_manager *dm,
acrtc = to_amdgpu_crtc(wb_conn->encoder.crtc);
if (!acrtc) {
DRM_ERROR("no amdgpu_crtc found\n");
+   kfree(wb_info);
return;
}
 
afb = to_amdgpu_framebuffer(new_con_state->writeback_job->fb);
if (!afb) {
DRM_ERROR("No amdgpu_framebuffer found\n");
+   kfree(wb_info);
return;
}
 
-- 
2.39.3



Re: [PATCH 1/2] drm/amdgpu/debugfs: fix error code when smc register accessors are NULL

2023-12-08 Thread Christian König
The second patch never made it into my inbox, but the first one is 
Reviewed-by: Christian König .


Christian.

On 07.12.23 at 18:39, Alex Deucher wrote:

Ping on this series?

Alex

On Mon, Nov 27, 2023 at 5:52 PM Alex Deucher  wrote:

Should be -EOPNOTSUPP.

Fixes: 5104fdf50d32 ("drm/amdgpu: Fix a null pointer access when the smc_rreg 
pointer is NULL")
Signed-off-by: Alex Deucher 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
index 0e61ebdb3f3e..8d4a3ff65c18 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c
@@ -755,7 +755,7 @@ static ssize_t amdgpu_debugfs_regs_smc_read(struct file *f, 
char __user *buf,
 int r;

 if (!adev->smc_rreg)
-   return -EPERM;
+   return -EOPNOTSUPP;

 if (size & 0x3 || *pos & 0x3)
 return -EINVAL;
@@ -814,7 +814,7 @@ static ssize_t amdgpu_debugfs_regs_smc_write(struct file 
*f, const char __user *
 int r;

 if (!adev->smc_wreg)
-   return -EPERM;
+   return -EOPNOTSUPP;

 if (size & 0x3 || *pos & 0x3)
 return -EINVAL;
--
2.42.0





[PATCH 2/2] drm/amdgpu: warn when there are still mappings when a BO is destroyed v2

2023-12-08 Thread Christian König
This can only happen when there is a reference counting bug.

v2: fix typo

Signed-off-by: Christian König 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_object.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index cef920a93924..75d86abe15eb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -1343,6 +1343,8 @@ void amdgpu_bo_release_notify(struct ttm_buffer_object 
*bo)
 
abo = ttm_to_amdgpu_bo(bo);
 
+   WARN_ON(abo->vm_bo);
+
if (abo->kfd_bo)
amdgpu_amdkfd_release_notify(abo);
 
-- 
2.34.1



[PATCH 1/2] drm/amdgpu: fix tear down order in amdgpu_vm_pt_free

2023-12-08 Thread Christian König
When freeing PD/PT with shadows it can happen that the shadow
destruction races with detaching the PD/PT from the VM causing a NULL
pointer dereference in the invalidation code.

Fix this by detaching the PD/PT from the VM first and then
freeing the shadow instead.

Signed-off-by: Christian König 
Fixes: https://gitlab.freedesktop.org/drm/amd/-/issues/2867
Cc: 
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
index a2287bb25223..a160265ddc07 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm_pt.c
@@ -642,13 +642,14 @@ static void amdgpu_vm_pt_free(struct amdgpu_vm_bo_base 
*entry)
 
if (!entry->bo)
return;
+
+   entry->bo->vm_bo = NULL;
shadow = amdgpu_bo_shadowed(entry->bo);
if (shadow) {
	ttm_bo_set_bulk_move(&shadow->tbo, NULL);
	amdgpu_bo_unref(&shadow);
}
	ttm_bo_set_bulk_move(&entry->bo->tbo, NULL);
-   entry->bo->vm_bo = NULL;
 
	spin_lock(&entry->vm->status_lock);
	list_del(&entry->vm_status);
-- 
2.34.1



Re: [PATCH] drm/amdgpu: Enable tunneling on high-priority compute queues

2023-12-08 Thread Friedrich Vock

On 08.12.23 10:51, Christian König wrote:

Well, long story short: Alex and I have been digging up the
documentation for this, and as far as we can tell this isn't correct.

Huh. I initially talked to Marek about this, adding him in Cc.


You need to do quite a bit more before you can turn on this feature.
What userspace side do you refer to?

I was referring to the Mesa merge request I made
(https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462).
If/When you have more details about what else needs to be done, feel
free to let me know.
I'm happy to expand this to add the rest of what's needed as well.

Thanks,
Friedrich



Regards,
Christian.

On 08.12.23 at 09:19, Friedrich Vock wrote:

Friendly ping on this one.
Userspace side got merged, so would be great to land this patch too :)

On 02.12.23 01:17, Friedrich Vock wrote:

This improves latency if the GPU is already busy with other work.
This is useful for VR compositors that submit highly latency-sensitive
compositing work on high-priority compute queues while the GPU is busy
rendering the next frame.

Userspace merge request:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462

Signed-off-by: Friedrich Vock 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h  |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 10 ++
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  3 ++-
  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c   |  3 ++-
  4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9505dc8f9d69..4b923a156c4e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -790,6 +790,7 @@ struct amdgpu_mqd_prop {
  uint64_t eop_gpu_addr;
  uint32_t hqd_pipe_priority;
  uint32_t hqd_queue_priority;
+    bool allow_tunneling;
  bool hqd_active;
  };

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 231d49132a56..4d98e8879be8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -620,6 +620,10 @@ static void amdgpu_ring_to_mqd_prop(struct
amdgpu_ring *ring,
  struct amdgpu_mqd_prop *prop)
  {
  struct amdgpu_device *adev = ring->adev;
+    bool is_high_prio_compute = ring->funcs->type ==
AMDGPU_RING_TYPE_COMPUTE &&
+ amdgpu_gfx_is_high_priority_compute_queue(adev, ring);
+    bool is_high_prio_gfx = ring->funcs->type ==
AMDGPU_RING_TYPE_GFX &&
+ amdgpu_gfx_is_high_priority_graphics_queue(adev, ring);

  memset(prop, 0, sizeof(*prop));

@@ -637,10 +641,8 @@ static void amdgpu_ring_to_mqd_prop(struct
amdgpu_ring *ring,
   */
  prop->hqd_active = ring->funcs->type == AMDGPU_RING_TYPE_KIQ;

-    if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&
- amdgpu_gfx_is_high_priority_compute_queue(adev, ring)) ||
-    (ring->funcs->type == AMDGPU_RING_TYPE_GFX &&
- amdgpu_gfx_is_high_priority_graphics_queue(adev, ring))) {
+    prop->allow_tunneling = is_high_prio_compute;
+    if (is_high_prio_compute || is_high_prio_gfx) {
  prop->hqd_pipe_priority = AMDGPU_GFX_PIPE_PRIO_HIGH;
  prop->hqd_queue_priority = AMDGPU_GFX_QUEUE_PRIORITY_MAXIMUM;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index c8a3bf01743f..73f6d7e72c73 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -6593,7 +6593,8 @@ static int gfx_v10_0_compute_mqd_init(struct
amdgpu_device *adev, void *m,
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, ENDIAN_SWAP, 1);
  #endif
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
-    tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
+    tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH,
+    prop->allow_tunneling);
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
  mqd->cp_hqd_pq_control = tmp;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index c659ef0f47ce..bdcf96df69e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -3847,7 +3847,8 @@ static int gfx_v11_0_compute_mqd_init(struct
amdgpu_device *adev, void *m,
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, RPTR_BLOCK_SIZE,
  (order_base_2(AMDGPU_GPU_PAGE_SIZE / 4) - 1));
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
-    tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
+    tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH,
+    prop->allow_tunneling);
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
  mqd->cp_hqd_pq_control = tmp;
--
2.43.0





Re: [PATCH] drm/amdkfd: Fix sparse __rcu annotation warnings

2023-12-08 Thread Christian König

On 07.12.23 at 20:14, Felix Kuehling wrote:


On 2023-12-05 17:20, Felix Kuehling wrote:

Properly mark kfd_process->ef as __rcu and consistently access it with
rcu_dereference_protected.

Reported-by: kernel test robot 
Closes: 
https://lore.kernel.org/oe-kbuild-all/202312052245.yfpbsgnh-...@intel.com/

Signed-off-by: Felix Kuehling 


ping.

Christian, would you review this patch, please?


Looks a bit suspicious, especially the rcu_dereference_protected() use.

What is the static checker complaining about in the first place?

Regards,
Christian.



Thanks,
  Felix




---
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h   | 2 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 4 ++--
  drivers/gpu/drm/amd/amdkfd/kfd_priv.h    | 2 +-
  drivers/gpu/drm/amd/amdkfd/kfd_process.c | 6 --
  4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h

index f2e920734c98..20cb266dcedd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
@@ -314,7 +314,7 @@ void 
amdgpu_amdkfd_gpuvm_unmap_gtt_bo_from_kernel(struct kgd_mem *mem);
  int amdgpu_amdkfd_map_gtt_bo_to_gart(struct amdgpu_device *adev, 
struct amdgpu_bo *bo);

    int amdgpu_amdkfd_gpuvm_restore_process_bos(void *process_info,
-    struct dma_fence **ef);
+    struct dma_fence __rcu **ef);
  int amdgpu_amdkfd_gpuvm_get_vm_fault_info(struct amdgpu_device *adev,
    struct kfd_vm_fault_info *info);
  int amdgpu_amdkfd_gpuvm_import_dmabuf_fd(struct amdgpu_device 
*adev, int fd,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c

index 7d91f99acb59..8ba6f6c8363d 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -2806,7 +2806,7 @@ static void 
amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)

  put_task_struct(usertask);
  }
  -static void replace_eviction_fence(struct dma_fence **ef,
+static void replace_eviction_fence(struct dma_fence __rcu **ef,
 struct dma_fence *new_ef)
  {
  struct dma_fence *old_ef = rcu_replace_pointer(*ef, new_ef, true
@@ -2841,7 +2841,7 @@ static void replace_eviction_fence(struct 
dma_fence **ef,

   * 7.  Add fence to all PD and PT BOs.
   * 8.  Unreserve all BOs
   */
-int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct 
dma_fence **ef)
+int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct 
dma_fence __rcu **ef)

  {
  struct amdkfd_process_info *process_info = info;
  struct amdgpu_vm *peer_vm;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h 
b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h

index 45366b4ca976..5a24097a9f28 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -917,7 +917,7 @@ struct kfd_process {
   * fence will be triggered during eviction and new one will be 
created

   * during restore
   */
-    struct dma_fence *ef;
+    struct dma_fence __rcu *ef;
    /* Work items for evicting and restoring BOs */
  struct delayed_work eviction_work;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c 
b/drivers/gpu/drm/amd/amdkfd/kfd_process.c

index 71df51fcc1b0..14b11d61f8dd 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -1110,6 +1110,8 @@ static void kfd_process_wq_release(struct 
work_struct *work)

  {
  struct kfd_process *p = container_of(work, struct kfd_process,
   release_work);
+    struct dma_fence *ef = rcu_dereference_protected(p->ef,
+    kref_read(&p->ref) == 0);
    kfd_process_dequeue_from_all_devices(p);
  pqm_uninit(>pqm);
@@ -1118,7 +1120,7 @@ static void kfd_process_wq_release(struct 
work_struct *work)

   * destroyed. This allows any BOs to be freed without
   * triggering pointless evictions or waiting for fences.
   */
-    dma_fence_signal(p->ef);
+    dma_fence_signal(ef);
    kfd_process_remove_sysfs(p);
  @@ -1127,7 +1129,7 @@ static void kfd_process_wq_release(struct 
work_struct *work)

  svm_range_list_fini(p);
    kfd_process_destroy_pdds(p);
-    dma_fence_put(p->ef);
+    dma_fence_put(ef);
    kfd_event_free_process(p);




Re: [PATCH 2/2] drm/amdgpu: Enable clear page functionality

2023-12-08 Thread Christian König

On 07.12.23 at 16:11, Arunpravin Paneer Selvam wrote:

Add clear page support in vram memory region.


The first patch looks good, but this here needs quite some work.



Signed-off-by: Arunpravin Paneer Selvam 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu_object.c| 13 +++--
  .../gpu/drm/amd/amdgpu/amdgpu_res_cursor.h| 25 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c   | 50 +++
  drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.h   |  4 ++
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.c  | 14 +-
  drivers/gpu/drm/amd/amdgpu/amdgpu_vram_mgr.h  |  5 ++
  6 files changed, 105 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
index cef920a93924..bc4ea87f8b5e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_object.c
@@ -39,6 +39,7 @@
  #include "amdgpu.h"
  #include "amdgpu_trace.h"
  #include "amdgpu_amdkfd.h"
+#include "amdgpu_vram_mgr.h"
  
  /**

   * DOC: amdgpu_object
@@ -629,15 +630,17 @@ int amdgpu_bo_create(struct amdgpu_device *adev,
  
  	if (bp->flags & AMDGPU_GEM_CREATE_VRAM_CLEARED &&

bo->tbo.resource->mem_type == TTM_PL_VRAM) {
-   struct dma_fence *fence;
+   struct dma_fence *fence = NULL;
  
-		r = amdgpu_fill_buffer(bo, 0, bo->tbo.base.resv, &fence, true);

+   r = amdgpu_clear_buffer(bo, bo->tbo.base.resv, &fence, true);
if (unlikely(r))
goto fail_unreserve;
  
-		dma_resv_add_fence(bo->tbo.base.resv, fence,

-  DMA_RESV_USAGE_KERNEL);
-   dma_fence_put(fence);
+   if (fence) {
+   dma_resv_add_fence(bo->tbo.base.resv, fence,
+  DMA_RESV_USAGE_KERNEL);
+   dma_fence_put(fence);
+   }
}
if (!bp->resv)
amdgpu_bo_unreserve(bo);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
index 381101d2bf05..50fcd86e1033 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_res_cursor.h
@@ -164,4 +164,29 @@ static inline void amdgpu_res_next(struct 
amdgpu_res_cursor *cur, uint64_t size)
}
  }
  
+/**

+ * amdgpu_res_cleared - check if blocks are cleared
+ *
+ * @cur: the cursor to extract the block
+ *
+ * Check if the @cur block is cleared
+ */
+static inline bool amdgpu_res_cleared(struct amdgpu_res_cursor *cur)
+{
+   struct drm_buddy_block *block;
+
+   switch (cur->mem_type) {
+   case TTM_PL_VRAM:
+   block = cur->node;
+
+   if (!amdgpu_vram_mgr_is_cleared(block))
+   return false;
+   break;
+   default:
+   return false;
+   }
+
+   return true;
+}
+
  #endif
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
index 05991c5c8ddb..6d7514e8f40c 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
@@ -,6 +,56 @@ static int amdgpu_ttm_fill_mem(struct amdgpu_ring *ring, 
uint32_t src_data,
return 0;
  }
  
+int amdgpu_clear_buffer(struct amdgpu_bo *bo,

+   struct dma_resv *resv,
+   struct dma_fence **fence,
+   bool delayed)


Drop the delayed parameter; it doesn't make any sense here.

And as Alex said please use an amdgpu_ttm_ prefix for the function name.


+{
+   struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
+   struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
+   struct amdgpu_res_cursor cursor;
+   struct dma_fence *f = NULL;
+   u64 addr;
+   int r;
+
+   if (!adev->mman.buffer_funcs_enabled)
+   return -EINVAL;
+
+   amdgpu_res_first(bo->tbo.resource, 0, amdgpu_bo_size(bo), &cursor);
+
+   mutex_lock(&adev->mman.gtt_window_lock);
+   while (cursor.remaining) {
+   struct dma_fence *next = NULL;
+   u64 size;
+
+   /* Never clear more than 256MiB at once to avoid timeouts */
+   size = min(cursor.size, 256ULL << 20);
+
+   if (!amdgpu_res_cleared(&cursor)) {


This needs to come before the min(cursor.size) directly above. I 
suggest handling it like this:


if (amdgpu_res_cleared(&cursor)) {
	amdgpu_res_next(&cursor, cursor.size);
	continue;
}

size = min(


+   r = amdgpu_ttm_map_buffer(&bo->tbo, bo->tbo.resource, &cursor,
+ 1, ring, false, &size, &addr);
+   if (r)
+   goto err;
+
+   r = amdgpu_ttm_fill_mem(ring, 0, addr, size, resv,
+   &next, true, delayed);
+   if (r)
+   goto err;
+   

Re: [PATCH] drm/amdgpu: Enable tunneling on high-priority compute queues

2023-12-08 Thread Christian König
Well, long story short: Alex and I have been digging up the 
documentation for this, and as far as we can tell this isn't correct.


You need to do quite a bit more before you can turn on this feature. 
What userspace side do you refer to?


Regards,
Christian.

On 08.12.23 at 09:19, Friedrich Vock wrote:

Friendly ping on this one.
Userspace side got merged, so would be great to land this patch too :)

On 02.12.23 01:17, Friedrich Vock wrote:

This improves latency if the GPU is already busy with other work.
This is useful for VR compositors that submit highly latency-sensitive
compositing work on high-priority compute queues while the GPU is busy
rendering the next frame.

Userspace merge request:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462

Signed-off-by: Friedrich Vock 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h  |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 10 ++
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  3 ++-
  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c   |  3 ++-
  4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h

index 9505dc8f9d69..4b923a156c4e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -790,6 +790,7 @@ struct amdgpu_mqd_prop {
  uint64_t eop_gpu_addr;
  uint32_t hqd_pipe_priority;
  uint32_t hqd_queue_priority;
+    bool allow_tunneling;
  bool hqd_active;
  };

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c

index 231d49132a56..4d98e8879be8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -620,6 +620,10 @@ static void amdgpu_ring_to_mqd_prop(struct 
amdgpu_ring *ring,

  struct amdgpu_mqd_prop *prop)
  {
  struct amdgpu_device *adev = ring->adev;
+    bool is_high_prio_compute = ring->funcs->type == 
AMDGPU_RING_TYPE_COMPUTE &&

+ amdgpu_gfx_is_high_priority_compute_queue(adev, ring);
+    bool is_high_prio_gfx = ring->funcs->type == 
AMDGPU_RING_TYPE_GFX &&

+ amdgpu_gfx_is_high_priority_graphics_queue(adev, ring);

  memset(prop, 0, sizeof(*prop));

@@ -637,10 +641,8 @@ static void amdgpu_ring_to_mqd_prop(struct 
amdgpu_ring *ring,

   */
  prop->hqd_active = ring->funcs->type == AMDGPU_RING_TYPE_KIQ;

-    if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&
- amdgpu_gfx_is_high_priority_compute_queue(adev, ring)) ||
-    (ring->funcs->type == AMDGPU_RING_TYPE_GFX &&
- amdgpu_gfx_is_high_priority_graphics_queue(adev, ring))) {
+    prop->allow_tunneling = is_high_prio_compute;
+    if (is_high_prio_compute || is_high_prio_gfx) {
  prop->hqd_pipe_priority = AMDGPU_GFX_PIPE_PRIO_HIGH;
  prop->hqd_queue_priority = AMDGPU_GFX_QUEUE_PRIORITY_MAXIMUM;
  }
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c

index c8a3bf01743f..73f6d7e72c73 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -6593,7 +6593,8 @@ static int gfx_v10_0_compute_mqd_init(struct 
amdgpu_device *adev, void *m,

  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, ENDIAN_SWAP, 1);
  #endif
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
-    tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
+    tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH,
+    prop->allow_tunneling);
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
  mqd->cp_hqd_pq_control = tmp;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c

index c659ef0f47ce..bdcf96df69e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -3847,7 +3847,8 @@ static int gfx_v11_0_compute_mqd_init(struct 
amdgpu_device *adev, void *m,

  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, RPTR_BLOCK_SIZE,
  (order_base_2(AMDGPU_GPU_PAGE_SIZE / 4) - 1));
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
-    tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
+    tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH,
+    prop->allow_tunneling);
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
  tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
  mqd->cp_hqd_pq_control = tmp;
--
2.43.0





[RFC PATCH 01/12] arch: Add ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-08 Thread Samuel Holland
Several architectures provide an API to enable the FPU and run
floating-point SIMD code in kernel space. However, the function names,
header locations, and semantics are inconsistent across architectures,
and FPU support may be gated behind other Kconfig options.

Provide a standard way for architectures to declare that kernel space
FPU support is available. Architectures selecting this option must
implement what is currently the most common API (kernel_fpu_begin() and
kernel_fpu_end(), plus a new function kernel_fpu_available()) and
provide the appropriate CFLAGS for compiling floating-point C code.
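
For reference, a minimal caller of the proposed API could look like the
sketch below (the helper name do_float_work() is made up; only the three
kernel_fpu_*() functions and the <asm/fpu.h> header are part of the
proposal):

	#include <asm/fpu.h>

	static void do_float_work(void)
	{
		if (!kernel_fpu_available())
			return; /* fall back to integer-only code */

		kernel_fpu_begin();
		/* floating-point code goes here; it is not assumed to be reentrant */
		kernel_fpu_end();
	}

The file containing such code would then be compiled with $(CC_FLAGS_FPU)
added to (and $(CC_FLAGS_NO_FPU) removed from) its CFLAGS.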

Suggested-by: Christoph Hellwig 
Signed-off-by: Samuel Holland 
---

 Makefile | 4 
 arch/Kconfig | 9 +
 2 files changed, 13 insertions(+)

diff --git a/Makefile b/Makefile
index 511b5616aa41..e65c186cf2c9 100644
--- a/Makefile
+++ b/Makefile
@@ -969,6 +969,10 @@ KBUILD_CFLAGS  += $(CC_FLAGS_CFI)
 export CC_FLAGS_CFI
 endif
 
+# Architectures can define flags to add/remove for floating-point support
+export CC_FLAGS_FPU
+export CC_FLAGS_NO_FPU
+
 ifneq ($(CONFIG_FUNCTION_ALIGNMENT),0)
 KBUILD_CFLAGS += -falign-functions=$(CONFIG_FUNCTION_ALIGNMENT)
 endif
diff --git a/arch/Kconfig b/arch/Kconfig
index f4b210ab0612..6df834e18e9c 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -1478,6 +1478,15 @@ config ARCH_HAS_NONLEAF_PMD_YOUNG
  address translations. Page table walkers that clear the accessed bit
  may use this capability to reduce their search space.
 
+config ARCH_HAS_KERNEL_FPU_SUPPORT
+   bool
+   help
+ An architecture should select this option if it supports running
+ floating-point code in kernel space. It must export the functions
+ kernel_fpu_available(), kernel_fpu_begin(), and kernel_fpu_end() from
+ , and define CC_FLAGS_FPU and/or CC_FLAGS_NO_FPU as
+ necessary in its Makefile.
+
 source "kernel/gcov/Kconfig"
 
 source "scripts/gcc-plugins/Kconfig"
-- 
2.42.0



Re: [PATCH 3/3] drm/amd/display: Support DRM_AMD_DC_FP on RISC-V

2023-12-08 Thread Samuel Holland
Hi Nathan,

On 2023-11-29 6:42 PM, Nathan Chancellor wrote:
> On Thu, Nov 23, 2023 at 02:23:01PM +, Conor Dooley wrote:
>> On Tue, Nov 21, 2023 at 07:05:15PM -0800, Samuel Holland wrote:
>>> RISC-V uses kernel_fpu_begin()/kernel_fpu_end() like several other
>>> architectures. Enabling hardware FP requires overriding the ISA string
>>> for the relevant compilation units.
>>
>> Ah yes, bringing the joy of frame-larger-than warnings to RISC-V:
>> ../drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_32.c:58:13:
>>  warning: stack frame size (2416) exceeds limit (2048) in 
>> 'DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndPerformanceCalculation'
>>  [-Wframe-larger-than]
> 
> :(
> 
>> Nathan, have you given up on these being sorted out?
> 
> Does your configuration have KASAN (I don't think RISC-V supports
> KCSAN)? It is possible that dml/dcn32 needs something similar to commit
> 6740ec97bcdb ("drm/amd/display: Increase frame warning limit with KASAN
> or KCSAN in dml2")?
> 
> I am not really interested in playing whack-a-mole with these warnings
> like I have done in the past for the reasons I outlined here:
> 
> https://lore.kernel.org/20231019205117.GA839902@dev-arch.thelio-3990X/

I also see one of these with clang 17 even with KASAN disabled:

drivers/gpu/drm/amd/amdgpu/../display/dc/dml/dcn32/display_mode_vba_32.c:37:6:
warning: stack frame size (2208) exceeds limit (2048) in 'dml32_recalculate'
[-Wframe-larger-than]
void dml32_recalculate(struct display_mode_lib *mode_lib)

 ^
1532/2208 (69.38%) spills, 676/2208 (30.62%) variables

So I'm in favor of just raising the limit for these files for clang, like you
suggested in the linked thread.
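
For illustration, a clang-only bump along the lines of the dml2 commit
might look like the fragment below (the file name and the 2200 value are
only placeholders, not tested numbers):

  # hypothetical sketch for the dml Makefile
  ifdef CONFIG_CC_IS_CLANG
  frame_warn_flag := -Wframe-larger-than=2200
  endif
  CFLAGS_display_mode_vba_32.o += $(frame_warn_flag)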

Regards,
Samuel



[RFC PATCH 12/12] selftests/fpu: Allow building on other architectures

2023-12-08 Thread Samuel Holland
Now that ARCH_HAS_KERNEL_FPU_SUPPORT provides a common way to compile
and run floating-point code, this test is no longer x86-specific.

Signed-off-by: Samuel Holland 
---

 lib/Kconfig.debug   |  2 +-
 lib/Makefile| 25 ++---
 lib/test_fpu_glue.c |  5 -
 3 files changed, 7 insertions(+), 25 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index cc7d53d9dc01..bbab0b054e09 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -2933,7 +2933,7 @@ config TEST_FREE_PAGES
 
 config TEST_FPU
tristate "Test floating point operations in kernel space"
-   depends on X86 && !KCOV_INSTRUMENT_ALL
+   depends on ARCH_HAS_KERNEL_FPU_SUPPORT && !KCOV_INSTRUMENT_ALL
help
  Enable this option to add /sys/kernel/debug/selftest_helpers/test_fpu
  which will trigger a sequence of floating point operations. This is 
used
diff --git a/lib/Makefile b/lib/Makefile
index e7cbd54944a2..b9f28558c9bd 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -109,31 +109,10 @@ CFLAGS_test_fprobe.o += $(CC_FLAGS_FTRACE)
 obj-$(CONFIG_FPROBE_SANITY_TEST) += test_fprobe.o
 obj-$(CONFIG_TEST_OBJPOOL) += test_objpool.o
 
-#
-# CFLAGS for compiling floating point code inside the kernel. x86/Makefile 
turns
-# off the generation of FPU/SSE* instructions for kernel proper but FPU_FLAGS
-# get appended last to CFLAGS and thus override those previous compiler 
options.
-#
-FPU_CFLAGS := -msse -msse2
-ifdef CONFIG_CC_IS_GCC
-# Stack alignment mismatch, proceed with caution.
-# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
-# (8B stack alignment).
-# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
-#
-# The "-msse" in the first argument is there so that the
-# -mpreferred-stack-boundary=3 build error:
-#
-#  -mpreferred-stack-boundary=3 is not between 4 and 12
-#
-# can be triggered. Otherwise gcc doesn't complain.
-FPU_CFLAGS += -mhard-float
-FPU_CFLAGS += $(call cc-option,-msse 
-mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4)
-endif
-
 obj-$(CONFIG_TEST_FPU) += test_fpu.o
 test_fpu-y := test_fpu_glue.o test_fpu_impl.o
-CFLAGS_test_fpu_impl.o += $(FPU_CFLAGS)
+CFLAGS_test_fpu_impl.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_test_fpu_impl.o += $(CC_FLAGS_NO_FPU)
 
 obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/
 
diff --git a/lib/test_fpu_glue.c b/lib/test_fpu_glue.c
index 2761b51117b0..2e0b4027a5e3 100644
--- a/lib/test_fpu_glue.c
+++ b/lib/test_fpu_glue.c
@@ -17,7 +17,7 @@
 #include 
 #include 
 #include 
-#include <asm/fpu/api.h>
+#include <asm/fpu.h>
 
 int test_fpu(void);
 
@@ -38,6 +38,9 @@ static struct dentry *selftest_dir;
 
 static int __init test_fpu_init(void)
 {
+   if (!kernel_fpu_available())
+   return -EINVAL;
+
selftest_dir = debugfs_create_dir("selftest_helpers", NULL);
if (!selftest_dir)
return -ENOMEM;
-- 
2.42.0



[RFC PATCH 06/12] LoongArch: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-08 Thread Samuel Holland
LoongArch already provides kernel_fpu_begin() and kernel_fpu_end() in
asm/fpu.h, so it only needs to add kernel_fpu_available() and export
the CFLAGS adjustments.

Signed-off-by: Samuel Holland 
---

 arch/loongarch/Kconfig   | 1 +
 arch/loongarch/Makefile  | 5 -
 arch/loongarch/include/asm/fpu.h | 1 +
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig
index ee123820a476..65d4475565b8 100644
--- a/arch/loongarch/Kconfig
+++ b/arch/loongarch/Kconfig
@@ -15,6 +15,7 @@ config LOONGARCH
select ARCH_HAS_CPU_FINALIZE_INIT
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_KCOV
+   select ARCH_HAS_KERNEL_FPU_SUPPORT if CPU_HAS_FPU
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PTE_SPECIAL
diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile
index 204b94b2e6aa..f5c4f7e921db 100644
--- a/arch/loongarch/Makefile
+++ b/arch/loongarch/Makefile
@@ -25,6 +25,9 @@ endif
 32bit-emul = elf32loongarch
 64bit-emul = elf64loongarch
 
+CC_FLAGS_FPU   := -mfpu=64
+CC_FLAGS_NO_FPU:= -msoft-float
+
 ifdef CONFIG_DYNAMIC_FTRACE
 KBUILD_CPPFLAGS += -DCC_USING_PATCHABLE_FUNCTION_ENTRY
 CC_FLAGS_FTRACE := -fpatchable-function-entry=2
@@ -46,7 +49,7 @@ ld-emul   = $(64bit-emul)
 cflags-y   += -mabi=lp64s
 endif
 
-cflags-y   += -pipe -msoft-float
+cflags-y   += -pipe $(CC_FLAGS_NO_FPU)
 LDFLAGS_vmlinux+= -static -n -nostdlib
 
 # When the assembler supports explicit relocation hint, we must use it.
diff --git a/arch/loongarch/include/asm/fpu.h b/arch/loongarch/include/asm/fpu.h
index c2d8962fda00..3177674228f8 100644
--- a/arch/loongarch/include/asm/fpu.h
+++ b/arch/loongarch/include/asm/fpu.h
@@ -21,6 +21,7 @@
 
 struct sigcontext;
 
+#define kernel_fpu_available() cpu_has_fpu
 extern void kernel_fpu_begin(void);
 extern void kernel_fpu_end(void);
 
-- 
2.42.0



[RFC PATCH 07/12] powerpc: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-08 Thread Samuel Holland
PowerPC provides an equivalent to the common kernel-mode FPU API, but in
a different header and using different function names. The PowerPC API
also requires a non-preemptible context. Add a wrapper header, and
export the CFLAGS adjustments.

Signed-off-by: Samuel Holland 
---

 arch/powerpc/Kconfig   |  1 +
 arch/powerpc/Makefile  |  5 -
 arch/powerpc/include/asm/fpu.h | 28 
 3 files changed, 33 insertions(+), 1 deletion(-)
 create mode 100644 arch/powerpc/include/asm/fpu.h

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 6f105ee4f3cf..e96cb5b7c571 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -137,6 +137,7 @@ config PPC
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_HUGEPD  if HUGETLB_PAGE
select ARCH_HAS_KCOV
+   select ARCH_HAS_KERNEL_FPU_SUPPORT  if PPC_FPU
select ARCH_HAS_MEMBARRIER_CALLBACKS
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_MEMREMAP_COMPAT_ALIGN   if PPC_64S_HASH_MMU
diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile
index f19dbaa1d541..2d5f21baf6ff 100644
--- a/arch/powerpc/Makefile
+++ b/arch/powerpc/Makefile
@@ -142,6 +142,9 @@ CFLAGS-$(CONFIG_PPC32)  += $(call cc-option, 
$(MULTIPLEWORD))
 
 CFLAGS-$(CONFIG_PPC32) += $(call cc-option,-mno-readonly-in-sdata)
 
+CC_FLAGS_FPU   := $(call cc-option,-mhard-float)
+CC_FLAGS_NO_FPU+= $(call cc-option,-msoft-float)
+
 ifdef CONFIG_FUNCTION_TRACER
 ifdef CONFIG_ARCH_USING_PATCHABLE_FUNCTION_ENTRY
 KBUILD_CPPFLAGS+= -DCC_USING_PATCHABLE_FUNCTION_ENTRY
@@ -163,7 +166,7 @@ asinstr := $(call as-instr,lis 
9$(comma)foo@high,-DHAVE_AS_ATHIGH=1)
 
 KBUILD_CPPFLAGS+= -I $(srctree)/arch/$(ARCH) $(asinstr)
 KBUILD_AFLAGS  += $(AFLAGS-y)
-KBUILD_CFLAGS  += $(call cc-option,-msoft-float)
+KBUILD_CFLAGS  += $(CC_FLAGS_NO_FPU)
 KBUILD_CFLAGS  += $(CFLAGS-y)
 CPP= $(CC) -E $(KBUILD_CFLAGS)
 
diff --git a/arch/powerpc/include/asm/fpu.h b/arch/powerpc/include/asm/fpu.h
new file mode 100644
index ..ca584e4bc40f
--- /dev/null
+++ b/arch/powerpc/include/asm/fpu.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_POWERPC_FPU_H
+#define _ASM_POWERPC_FPU_H
+
+#include <linux/preempt.h>
+
+#include <asm/cpu_has_feature.h>
+#include <asm/switch_to.h>
+
+#define kernel_fpu_available() (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE))
+
+static inline void kernel_fpu_begin(void)
+{
+   preempt_disable();
+   enable_kernel_fp();
+}
+
+static inline void kernel_fpu_end(void)
+{
+   disable_kernel_fp();
+   preempt_enable();
+}
+
+#endif /* ! _ASM_POWERPC_FPU_H */
-- 
2.42.0



Re: [PATCH] drm/amdgpu: drop the long-double-128 powerpc check/hack

2023-12-08 Thread Michael Ellerman
Christophe Leroy  writes:
> On 31/03/2023 at 12:53, Michael Ellerman wrote:
>> "Daniel Kolesa"  writes:
>>> Commit c653c591789b ("drm/amdgpu: Re-enable DCN for 64-bit powerpc")
>>> introduced this check as a workaround for the driver not building
>>> with toolchains that default to 64-bit long double.
>> ...
>>> In mainline, this work is now fully done, so this check is fully
>>> redundant and does not do anything except preventing AMDGPU DC
>>> from being built on systems such as those using musl libc. The
>>> last piece of work to enable this was commit c92b7fe0d92a
>>> ("drm/amd/display: move remaining FPU code to dml folder")
>>> and this has since been backported to 6.1 stable (in 6.1.7).
>>>
>>> Relevant issue: https://gitlab.freedesktop.org/drm/amd/-/issues/2288
>> 
>> I looked to pick this up for 6.3 but was still seeing build errors with
>> some compilers. I assumed that was due to some fixes coming in
>> linux-next that I didn't have.
>> 
>> But applying the patch on v6.3-rc4 I still see build errors. This is
>> building allyesconfig with the kernel.org GCC 12.2.0 / binutils 2.39
>> toolchain:
>> 
>>powerpc64le-linux-gnu-ld: 
>> drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.o uses hard float, 
>> arch/powerpc/lib/test_emulate_step.o uses soft float
>>powerpc64le-linux-gnu-ld: failed to merge target specific data of file 
>> drivers/gpu/drm/amd/display/dc/dml/display_mode_lib.o
>> 
>> etc.
>> 
>> All the conflicts are between test_emulate_step.o and some file in 
>> drivers/gpu/drm/amd/display/dc/dml.
>> 
>> So even with all the hard-float code isolated in the dml folder, we
>> still hit build errors, because allyesconfig wants to link those
>> hard-float using objects with soft-float objects from elsewhere in the
>> kernel.
>> 
>> It seems like the only workable fix is to force the kernel build to use
>> 128-bit long double. I'll send a patch doing that.
>> 
>
> Commit 78f0929884d4 ("powerpc/64: Always build with 128-bit long 
> double") I guess ?

Yes.

> Let's drop this patch from patchwork then.

Thanks.

cheers


[RFC PATCH 04/12] arm64: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-08 Thread Samuel Holland
arm64 provides an equivalent to the common kernel-mode FPU API, but in a
different header and using different function names. Add a wrapper
header, and export CFLAGS adjustments as found in lib/raid6/Makefile.

Signed-off-by: Samuel Holland 
---

 arch/arm64/Kconfig   |  1 +
 arch/arm64/Makefile  |  9 -
 arch/arm64/include/asm/fpu.h | 17 +
 3 files changed, 26 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/include/asm/fpu.h

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 7b071a00425d..485ac389ac11 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -30,6 +30,7 @@ config ARM64
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+   select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON
select ARCH_HAS_KEEPINITRD
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index 9a2d3723cd0f..4a65f24c7998 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -36,7 +36,14 @@ ifeq ($(CONFIG_BROKEN_GAS_INST),y)
 $(warning Detected assembler with broken .inst; disassembly will be unreliable)
 endif
 
-KBUILD_CFLAGS  += -mgeneral-regs-only  \
+# The GCC option -ffreestanding is required in order to compile code containing
+# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
+CC_FLAGS_FPU   := -ffreestanding
+# Enable <arm_neon.h>
+CC_FLAGS_FPU   += -isystem $(shell $(CC) -print-file-name=include)
+CC_FLAGS_NO_FPU:= -mgeneral-regs-only
+
+KBUILD_CFLAGS  += $(CC_FLAGS_NO_FPU) \
   $(compat_vdso) $(cc_has_k_constraint)
 KBUILD_CFLAGS  += $(call cc-disable-warning, psabi)
 KBUILD_AFLAGS  += $(compat_vdso)
diff --git a/arch/arm64/include/asm/fpu.h b/arch/arm64/include/asm/fpu.h
new file mode 100644
index ..664c0a192ab1
--- /dev/null
+++ b/arch/arm64/include/asm/fpu.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * linux/arch/arm64/include/asm/fpu.h
+ *
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef __ASM_FPU_H
+#define __ASM_FPU_H
+
+#include <asm/neon.h>
+
+#define kernel_fpu_available() cpu_has_neon()
+#define kernel_fpu_begin() kernel_neon_begin()
+#define kernel_fpu_end()   kernel_neon_end()
+
+#endif /* ! __ASM_FPU_H */
-- 
2.42.0



[RFC PATCH 11/12] selftests/fpu: Move FP code to a separate translation unit

2023-12-08 Thread Samuel Holland
This ensures no compiler-generated floating-point code can appear
outside kernel_fpu_{begin,end}() sections, and some architectures
enforce this separation.

Signed-off-by: Samuel Holland 
---

 lib/Makefile|  3 ++-
 lib/{test_fpu.c => test_fpu_glue.c} | 32 +-
 lib/test_fpu_impl.c | 35 +
 3 files changed, 38 insertions(+), 32 deletions(-)
 rename lib/{test_fpu.c => test_fpu_glue.c} (71%)
 create mode 100644 lib/test_fpu_impl.c

diff --git a/lib/Makefile b/lib/Makefile
index 6b09731d8e61..e7cbd54944a2 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -132,7 +132,8 @@ FPU_CFLAGS += $(call cc-option,-msse 
-mpreferred-stack-boundary=3,-mpreferred-st
 endif
 
 obj-$(CONFIG_TEST_FPU) += test_fpu.o
-CFLAGS_test_fpu.o += $(FPU_CFLAGS)
+test_fpu-y := test_fpu_glue.o test_fpu_impl.o
+CFLAGS_test_fpu_impl.o += $(FPU_CFLAGS)
 
 obj-$(CONFIG_TEST_LIVEPATCH) += livepatch/
 
diff --git a/lib/test_fpu.c b/lib/test_fpu_glue.c
similarity index 71%
rename from lib/test_fpu.c
rename to lib/test_fpu_glue.c
index e82db19fed84..2761b51117b0 100644
--- a/lib/test_fpu.c
+++ b/lib/test_fpu_glue.c
@@ -19,37 +19,7 @@
 #include 
 #include 
 
-static int test_fpu(void)
-{
-   /*
-* This sequence of operations tests that rounding mode is
-* to nearest and that denormal numbers are supported.
-* Volatile variables are used to avoid compiler optimizing
-* the calculations away.
-*/
-   volatile double a, b, c, d, e, f, g;
-
-   a = 4.0;
-   b = 1e-15;
-   c = 1e-310;
-
-   /* Sets precision flag */
-   d = a + b;
-
-   /* Result depends on rounding mode */
-   e = a + b / 2;
-
-   /* Denormal and very large values */
-   f = b / c;
-
-   /* Depends on denormal support */
-   g = a + c * f;
-
-   if (d > a && e > a && g > a)
-   return 0;
-   else
-   return -EINVAL;
-}
+int test_fpu(void);
 
 static int test_fpu_get(void *data, u64 *val)
 {
diff --git a/lib/test_fpu_impl.c b/lib/test_fpu_impl.c
new file mode 100644
index ..2ff01980bc22
--- /dev/null
+++ b/lib/test_fpu_impl.c
@@ -0,0 +1,35 @@
+// SPDX-License-Identifier: GPL-2.0+
+
+#include 
+
+int test_fpu(void)
+{
+   /*
+* This sequence of operations tests that rounding mode is
+* to nearest and that denormal numbers are supported.
+* Volatile variables are used to avoid compiler optimizing
+* the calculations away.
+*/
+   volatile double a, b, c, d, e, f, g;
+
+   a = 4.0;
+   b = 1e-15;
+   c = 1e-310;
+
+   /* Sets precision flag */
+   d = a + b;
+
+   /* Result depends on rounding mode */
+   e = a + b / 2;
+
+   /* Denormal and very large values */
+   f = b / c;
+
+   /* Depends on denormal support */
+   g = a + c * f;
+
+   if (d > a && e > a && g > a)
+   return 0;
+   else
+   return -EINVAL;
+}
-- 
2.42.0



[PATCH linux-next] drm/amd/display: replace kzalloc and memcpy with kmemdup

2023-12-08 Thread yang.guang5
From: Yang Guang 

Converting the kzalloc()/memcpy() pair to kmemdup() makes for
cleaner code and avoids a separate memcpy() call.

Signed-off-by: Chen Haonan 
---
 drivers/gpu/drm/amd/display/dc/core/dc.c | 10 --
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c 
b/drivers/gpu/drm/amd/display/dc/core/dc.c
index 76b47f178127..867e1a0fdef6 100644
--- a/drivers/gpu/drm/amd/display/dc/core/dc.c
+++ b/drivers/gpu/drm/amd/display/dc/core/dc.c
@@ -2264,12 +2264,10 @@ struct dc_state *dc_copy_state(struct dc_state *src_ctx)

 #ifdef CONFIG_DRM_AMD_DC_FP
if (new_ctx->bw_ctx.dml2) {
-   dml2 = kzalloc(sizeof(struct dml2_context), GFP_KERNEL);
-   if (!dml2)
-   return NULL;
-
-   memcpy(dml2, src_ctx->bw_ctx.dml2, sizeof(struct dml2_context));
-   new_ctx->bw_ctx.dml2 = dml2;
+   dml2 = kmemdup(src_ctx->bw_ctx.dml2, sizeof(struct 
dml2_context), GFP_KERNEL);
+   if (!dml2)
+   return NULL;
+   new_ctx->bw_ctx.dml2 = dml2;
}
 #endif

-- 
2.25.1


[RFC PATCH 05/12] lib/raid6: Use CC_FLAGS_FPU for NEON CFLAGS

2023-12-08 Thread Samuel Holland
Now that CC_FLAGS_FPU is exported and can be used anywhere in the source
tree, use it instead of duplicating the flags here.

Signed-off-by: Samuel Holland 
---

 lib/raid6/Makefile | 31 ---
 1 file changed, 8 insertions(+), 23 deletions(-)

diff --git a/lib/raid6/Makefile b/lib/raid6/Makefile
index 1c5420ff254e..309fea97efc6 100644
--- a/lib/raid6/Makefile
+++ b/lib/raid6/Makefile
@@ -33,25 +33,6 @@ CFLAGS_REMOVE_vpermxor8.o += -msoft-float
 endif
 endif
 
-# The GCC option -ffreestanding is required in order to compile code containing
-# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
-ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
-NEON_FLAGS := -ffreestanding
-# Enable <arm_neon.h>
-NEON_FLAGS += -isystem $(shell $(CC) -print-file-name=include)
-ifeq ($(ARCH),arm)
-NEON_FLAGS += -march=armv7-a -mfloat-abi=softfp -mfpu=neon
-endif
-CFLAGS_recov_neon_inner.o += $(NEON_FLAGS)
-ifeq ($(ARCH),arm64)
-CFLAGS_REMOVE_recov_neon_inner.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon1.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon2.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon4.o += -mgeneral-regs-only
-CFLAGS_REMOVE_neon8.o += -mgeneral-regs-only
-endif
-endif
-
 quiet_cmd_unroll = UNROLL  $@
   cmd_unroll = $(AWK) -v N=$* -f $(srctree)/$(src)/unroll.awk < $< > $@
 
@@ -75,10 +56,14 @@ targets += vpermxor1.c vpermxor2.c vpermxor4.c vpermxor8.c
 $(obj)/vpermxor%.c: $(src)/vpermxor.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
 
-CFLAGS_neon1.o += $(NEON_FLAGS)
-CFLAGS_neon2.o += $(NEON_FLAGS)
-CFLAGS_neon4.o += $(NEON_FLAGS)
-CFLAGS_neon8.o += $(NEON_FLAGS)
+CFLAGS_neon1.o += $(CC_FLAGS_FPU)
+CFLAGS_neon2.o += $(CC_FLAGS_FPU)
+CFLAGS_neon4.o += $(CC_FLAGS_FPU)
+CFLAGS_neon8.o += $(CC_FLAGS_FPU)
+CFLAGS_REMOVE_neon1.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon2.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon4.o += $(CC_FLAGS_NO_FPU)
+CFLAGS_REMOVE_neon8.o += $(CC_FLAGS_NO_FPU)
 targets += neon1.c neon2.c neon4.c neon8.c
 $(obj)/neon%.c: $(src)/neon.uc $(src)/unroll.awk FORCE
$(call if_changed,unroll)
-- 
2.42.0



[RFC PATCH 00/12] Unified cross-architecture kernel-mode FPU API

2023-12-08 Thread Samuel Holland
This series supersedes my earlier RISC-V specific series[1].

This series unifies the kernel-mode FPU API across several architectures
by wrapping the existing functions (where needed) in consistently-named
functions placed in a consistent header location, with mostly the same
semantics: they can be called from preemptible or non-preemptible task
context, and are not assumed to be reentrant. Architectures are also
expected to provide CFLAGS adjustments for compiling FPU-dependent code.
For the moment, SIMD/vector units are out of scope for this common API.

This allows us to remove the ifdeffery and duplicated Makefile logic at
each FPU user. It then implements the common API on RISC-V, and converts
a couple of users to the new API: the AMDGPU DRM driver, and the FPU
self test.
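
As a rough sketch of the result (the object name fp_code.o is just a
placeholder), a converted user's Makefile fragment reduces to:

  CFLAGS_fp_code.o += $(CC_FLAGS_FPU)
  CFLAGS_REMOVE_fp_code.o += $(CC_FLAGS_NO_FPU)

and the C file itself only needs #include <asm/fpu.h> plus the
kernel_fpu_begin()/kernel_fpu_end() bracketing around its FP sections.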

The underlying goal of this series is to allow using newer AMD GPUs
(e.g. Navi) on RISC-V boards such as SiFive's HiFive Unmatched. Those
GPUs need CONFIG_DRM_AMD_DC_FP to initialize, which requires kernel-mode
FPU support.

[1]: 
https://lore.kernel.org/linux-riscv/20231122030621.3759313-1-samuel.holl...@sifive.com/


Samuel Holland (12):
  arch: Add ARCH_HAS_KERNEL_FPU_SUPPORT
  ARM: Implement ARCH_HAS_KERNEL_FPU_SUPPORT
  ARM: crypto: Use CC_FLAGS_FPU for NEON CFLAGS
  arm64: Implement ARCH_HAS_KERNEL_FPU_SUPPORT
  lib/raid6: Use CC_FLAGS_FPU for NEON CFLAGS
  LoongArch: Implement ARCH_HAS_KERNEL_FPU_SUPPORT
  powerpc: Implement ARCH_HAS_KERNEL_FPU_SUPPORT
  x86: Implement ARCH_HAS_KERNEL_FPU_SUPPORT
  riscv: Add support for kernel-mode FPU
  drm/amd/display: Use ARCH_HAS_KERNEL_FPU_SUPPORT
  selftests/fpu: Move FP code to a separate translation unit
  selftests/fpu: Allow building on other architectures

 Makefile  |  4 ++
 arch/Kconfig  |  9 +
 arch/arm/Kconfig  |  1 +
 arch/arm/Makefile |  7 
 arch/arm/include/asm/fpu.h| 17 +
 arch/arm/lib/Makefile |  3 +-
 arch/arm64/Kconfig|  1 +
 arch/arm64/Makefile   |  9 -
 arch/arm64/include/asm/fpu.h  | 17 +
 arch/loongarch/Kconfig|  1 +
 arch/loongarch/Makefile   |  5 ++-
 arch/loongarch/include/asm/fpu.h  |  1 +
 arch/powerpc/Kconfig  |  1 +
 arch/powerpc/Makefile |  5 ++-
 arch/powerpc/include/asm/fpu.h| 28 ++
 arch/riscv/Kconfig|  1 +
 arch/riscv/Makefile   |  3 ++
 arch/riscv/include/asm/fpu.h  | 26 +
 arch/riscv/kernel/Makefile|  1 +
 arch/riscv/kernel/kernel_mode_fpu.c   | 28 ++
 arch/x86/Kconfig  |  1 +
 arch/x86/Makefile | 20 ++
 arch/x86/include/asm/fpu.h| 13 +++
 drivers/gpu/drm/amd/display/Kconfig   |  2 +-
 .../gpu/drm/amd/display/amdgpu_dm/dc_fpu.c| 33 +
 drivers/gpu/drm/amd/display/dc/dml/Makefile   | 36 +-
 drivers/gpu/drm/amd/display/dc/dml2/Makefile  | 36 +-
 lib/Kconfig.debug |  2 +-
 lib/Makefile  | 26 ++---
 lib/raid6/Makefile| 31 
 lib/{test_fpu.c => test_fpu_glue.c}   | 37 +++
 lib/test_fpu_impl.c   | 35 ++
 32 files changed, 255 insertions(+), 185 deletions(-)
 create mode 100644 arch/arm/include/asm/fpu.h
 create mode 100644 arch/arm64/include/asm/fpu.h
 create mode 100644 arch/powerpc/include/asm/fpu.h
 create mode 100644 arch/riscv/include/asm/fpu.h
 create mode 100644 arch/riscv/kernel/kernel_mode_fpu.c
 create mode 100644 arch/x86/include/asm/fpu.h
 rename lib/{test_fpu.c => test_fpu_glue.c} (71%)
 create mode 100644 lib/test_fpu_impl.c

-- 
2.42.0



[RFC PATCH 02/12] ARM: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-08 Thread Samuel Holland
ARM provides an equivalent to the common kernel-mode FPU API, but in a
different header and using different function names. Add a wrapper
header, and export CFLAGS adjustments as found in lib/raid6/Makefile.

Signed-off-by: Samuel Holland 
---

 arch/arm/Kconfig   |  1 +
 arch/arm/Makefile  |  7 +++
 arch/arm/include/asm/fpu.h | 17 +
 3 files changed, 25 insertions(+)
 create mode 100644 arch/arm/include/asm/fpu.h

diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f8567e95f98b..92e21a4a2903 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -14,6 +14,7 @@ config ARM
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_KEEPINITRD
select ARCH_HAS_KCOV
+   select ARCH_HAS_KERNEL_FPU_SUPPORT if KERNEL_MODE_NEON
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PTE_SPECIAL if ARM_LPAE
diff --git a/arch/arm/Makefile b/arch/arm/Makefile
index 5ba42f69f8ce..1dd860dba5f5 100644
--- a/arch/arm/Makefile
+++ b/arch/arm/Makefile
@@ -130,6 +130,13 @@ endif
 # Accept old syntax despite ".syntax unified"
 AFLAGS_NOWARN  :=$(call 
as-option,-Wa$(comma)-mno-warn-deprecated,-Wa$(comma)-W)
 
+# The GCC option -ffreestanding is required in order to compile code containing
+# ARM/NEON intrinsics in a non C99-compliant environment (such as the kernel)
+CC_FLAGS_FPU   := -ffreestanding
+# Enable <arm_neon.h>
+CC_FLAGS_FPU   += -isystem $(shell $(CC) -print-file-name=include)
+CC_FLAGS_FPU   += -march=armv7-a -mfloat-abi=softfp -mfpu=neon
+
 ifeq ($(CONFIG_THUMB2_KERNEL),y)
 CFLAGS_ISA :=-Wa,-mimplicit-it=always $(AFLAGS_NOWARN)
 AFLAGS_ISA :=$(CFLAGS_ISA) -Wa$(comma)-mthumb
diff --git a/arch/arm/include/asm/fpu.h b/arch/arm/include/asm/fpu.h
new file mode 100644
index ..d01ca06e700a
--- /dev/null
+++ b/arch/arm/include/asm/fpu.h
@@ -0,0 +1,17 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * linux/arch/arm/include/asm/fpu.h
+ *
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef __ASM_FPU_H
+#define __ASM_FPU_H
+
+#include <asm/neon.h>
+
+#define kernel_fpu_available() cpu_has_neon()
+#define kernel_fpu_begin() kernel_neon_begin()
+#define kernel_fpu_end()   kernel_neon_end()
+
+#endif /* ! __ASM_FPU_H */
-- 
2.42.0



Re: [PATCH 1/3] riscv: Add support for kernel-mode FPU

2023-12-08 Thread Samuel Holland
Hi Christoph,

On 2023-11-22 2:33 AM, Christoph Hellwig wrote:
> On Tue, Nov 21, 2023 at 07:05:13PM -0800, Samuel Holland wrote:
>> +static inline void kernel_fpu_begin(void)
>> +{
>> +preempt_disable();
>> +fstate_save(current, task_pt_regs(current));
>> +csr_set(CSR_SSTATUS, SR_FS);
>> +}
>> +
>> +static inline void kernel_fpu_end(void)
>> +{
>> +csr_clear(CSR_SSTATUS, SR_FS);
>> +fstate_restore(current, task_pt_regs(current));
>> +preempt_enable();
>> +}
> 
> Is there any critical reason to inline these two?  I'd much rather see
> them out of line and exported instead of the low-level helpers.

No, I will define them out of line in v2.

Regards,
Samuel



[RFC PATCH 10/12] drm/amd/display: Use ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-08 Thread Samuel Holland
Now that all previously-supported architectures select
ARCH_HAS_KERNEL_FPU_SUPPORT, this code can depend on that symbol instead
of the existing list of architectures. It can also take advantage of the
common kernel-mode FPU API and method of adjusting CFLAGS.
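
For reference, the consumer-side pattern after this change is roughly the
sketch below (a hypothetical caller for illustration only; the
kernel_fpu_available()/kernel_fpu_begin()/kernel_fpu_end() names and
<asm/fpu.h> come from this series, and the actual floating-point code still
has to live in objects built with CC_FLAGS_FPU):

#include <asm/fpu.h>

/* Hypothetical example mirroring what dc_fpu_begin()/dc_fpu_end() do. */
static void example_fp_section(void)
{
	if (!kernel_fpu_available())
		return;

	kernel_fpu_begin();
	/* Call into a translation unit compiled with CC_FLAGS_FPU here. */
	kernel_fpu_end();
}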

Signed-off-by: Samuel Holland 
---

 drivers/gpu/drm/amd/display/Kconfig   |  2 +-
 .../gpu/drm/amd/display/amdgpu_dm/dc_fpu.c| 33 +
 drivers/gpu/drm/amd/display/dc/dml/Makefile   | 36 ++-
 drivers/gpu/drm/amd/display/dc/dml2/Makefile  | 36 ++-
 4 files changed, 6 insertions(+), 101 deletions(-)

diff --git a/drivers/gpu/drm/amd/display/Kconfig 
b/drivers/gpu/drm/amd/display/Kconfig
index 901d1961b739..5fcd4f778dc3 100644
--- a/drivers/gpu/drm/amd/display/Kconfig
+++ b/drivers/gpu/drm/amd/display/Kconfig
@@ -8,7 +8,7 @@ config DRM_AMD_DC
depends on BROKEN || !CC_IS_CLANG || ARM64 || RISCV || SPARC64 || X86_64
select SND_HDA_COMPONENT if SND_HDA_CORE
# !CC_IS_CLANG: https://github.com/ClangBuiltLinux/linux/issues/1752
-   select DRM_AMD_DC_FP if (X86 || LOONGARCH || (PPC64 && ALTIVEC) || (ARM64 && KERNEL_MODE_NEON && !CC_IS_CLANG))
+   select DRM_AMD_DC_FP if ARCH_HAS_KERNEL_FPU_SUPPORT && (!ARM64 || !CC_IS_CLANG)
help
  Choose this option if you want to use the new display engine
  support for AMDGPU. This adds required support for Vega and
diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c 
b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
index 4ae4720535a5..b64f917174ca 100644
--- a/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
+++ b/drivers/gpu/drm/amd/display/amdgpu_dm/dc_fpu.c
@@ -26,16 +26,7 @@
 
 #include "dc_trace.h"
 
-#if defined(CONFIG_X86)
-#include 
-#elif defined(CONFIG_PPC64)
-#include 
-#include 
-#elif defined(CONFIG_ARM64)
-#include 
-#elif defined(CONFIG_LOONGARCH)
 #include 
-#endif
 
 /**
  * DOC: DC FPU manipulation overview
@@ -87,20 +78,9 @@ void dc_fpu_begin(const char *function_name, const int line)
WARN_ON_ONCE(!in_task());
preempt_disable();
depth = __this_cpu_inc_return(fpu_recursion_depth);
-
if (depth == 1) {
-#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
+   BUG_ON(!kernel_fpu_available());
kernel_fpu_begin();
-#elif defined(CONFIG_PPC64)
-   if (cpu_has_feature(CPU_FTR_VSX_COMP))
-   enable_kernel_vsx();
-   else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP))
-   enable_kernel_altivec();
-   else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE))
-   enable_kernel_fp();
-#elif defined(CONFIG_ARM64)
-   kernel_neon_begin();
-#endif
}
 
TRACE_DCN_FPU(true, function_name, line, depth);
@@ -122,18 +102,7 @@ void dc_fpu_end(const char *function_name, const int line)
 
depth = __this_cpu_dec_return(fpu_recursion_depth);
if (depth == 0) {
-#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
kernel_fpu_end();
-#elif defined(CONFIG_PPC64)
-   if (cpu_has_feature(CPU_FTR_VSX_COMP))
-   disable_kernel_vsx();
-   else if (cpu_has_feature(CPU_FTR_ALTIVEC_COMP))
-   disable_kernel_altivec();
-   else if (!cpu_has_feature(CPU_FTR_FPU_UNAVAILABLE))
-   disable_kernel_fp();
-#elif defined(CONFIG_ARM64)
-   kernel_neon_end();
-#endif
} else {
WARN_ON_ONCE(depth < 0);
}
diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile 
b/drivers/gpu/drm/amd/display/dc/dml/Makefile
index ea7d60f9a9b4..5aad0f572ba3 100644
--- a/drivers/gpu/drm/amd/display/dc/dml/Makefile
+++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile
@@ -25,40 +25,8 @@
 # It provides the general basic services required by other DAL
 # subcomponents.
 
-ifdef CONFIG_X86
-dml_ccflags-$(CONFIG_CC_IS_GCC) := -mhard-float
-dml_ccflags := $(dml_ccflags-y) -msse
-endif
-
-ifdef CONFIG_PPC64
-dml_ccflags := -mhard-float -maltivec
-endif
-
-ifdef CONFIG_ARM64
-dml_rcflags := -mgeneral-regs-only
-endif
-
-ifdef CONFIG_LOONGARCH
-dml_ccflags := -mfpu=64
-dml_rcflags := -msoft-float
-endif
-
-ifdef CONFIG_CC_IS_GCC
-ifneq ($(call gcc-min-version, 70100),y)
-IS_OLD_GCC = 1
-endif
-endif
-
-ifdef CONFIG_X86
-ifdef IS_OLD_GCC
-# Stack alignment mismatch, proceed with caution.
-# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
-# (8B stack alignment).
-dml_ccflags += -mpreferred-stack-boundary=4
-else
-dml_ccflags += -msse2
-endif
-endif
+dml_ccflags := $(CC_FLAGS_FPU)
+dml_rcflags := $(CC_FLAGS_NO_FPU)
 
 ifneq ($(CONFIG_FRAME_WARN),0)
 frame_warn_flag := -Wframe-larger-than=2048
diff --git a/drivers/gpu/drm/amd/display/dc/dml2/Makefile 
b/drivers/gpu/drm/amd/display/dc/dml2/Makefile
index acff3449b8d7..4f6c804a26ad 100644
--- a/drivers/gpu/drm/amd/display/dc/dml2/Makefile
+++ 

[RFC PATCH 08/12] x86: Implement ARCH_HAS_KERNEL_FPU_SUPPORT

2023-12-08 Thread Samuel Holland
x86 already provides kernel_fpu_begin() and kernel_fpu_end(), but in a
different header. Add a wrapper header, and export the CFLAGS
adjustments as found in lib/Makefile.

Signed-off-by: Samuel Holland 
---

 arch/x86/Kconfig   |  1 +
 arch/x86/Makefile  | 20 
 arch/x86/include/asm/fpu.h | 13 +
 3 files changed, 34 insertions(+)
 create mode 100644 arch/x86/include/asm/fpu.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 3762f41bb092..1fe7f2d8d017 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -81,6 +81,7 @@ config X86
select ARCH_HAS_FORTIFY_SOURCE
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_KCOVif X86_64
+   select ARCH_HAS_KERNEL_FPU_SUPPORT
select ARCH_HAS_MEM_ENCRYPT
select ARCH_HAS_MEMBARRIER_SYNC_CORE
select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 1a068de12a56..71576c8dbe79 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -70,6 +70,26 @@ export BITS
 KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx
 KBUILD_RUSTFLAGS += 
-Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2
 
+#
+# CFLAGS for compiling floating point code inside the kernel.
+#
+CC_FLAGS_FPU := -msse -msse2
+ifdef CONFIG_CC_IS_GCC
+# Stack alignment mismatch, proceed with caution.
+# GCC < 7.1 cannot compile code using `double` and -mpreferred-stack-boundary=3
+# (8B stack alignment).
+# See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
+#
+# The "-msse" in the first argument is there so that the
+# -mpreferred-stack-boundary=3 build error:
+#
+#  -mpreferred-stack-boundary=3 is not between 4 and 12
+#
+# can be triggered. Otherwise gcc doesn't complain.
+CC_FLAGS_FPU += -mhard-float
+CC_FLAGS_FPU += $(call cc-option,-msse -mpreferred-stack-boundary=3,-mpreferred-stack-boundary=4)
+endif
+
 ifeq ($(CONFIG_X86_KERNEL_IBT),y)
 #
 # Kernel IBT has S_CET.NOTRACK_EN=0, as such the compilers must not generate
diff --git a/arch/x86/include/asm/fpu.h b/arch/x86/include/asm/fpu.h
new file mode 100644
index ..b2743fe19339
--- /dev/null
+++ b/arch/x86/include/asm/fpu.h
@@ -0,0 +1,13 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_X86_FPU_H
+#define _ASM_X86_FPU_H
+
+#include 
+
+#define kernel_fpu_available() true
+
+#endif /* ! _ASM_X86_FPU_H */
-- 
2.42.0



[RFC PATCH 03/12] ARM: crypto: Use CC_FLAGS_FPU for NEON CFLAGS

2023-12-08 Thread Samuel Holland
Now that CC_FLAGS_FPU is exported and can be used anywhere in the source
tree, use it instead of duplicating the flags here.
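
For context, xor-neon.o is built from C that uses NEON via GCC vector
extensions/intrinsics, which is why it needs these flags at all. A minimal,
hypothetical example of that kind of code (not taken from the kernel sources)
looks like:

/* Must be compiled with CC_FLAGS_FPU so <arm_neon.h> and NEON codegen work. */
#include <arm_neon.h>

void xor_8_bytes(unsigned char *dst, const unsigned char *src)
{
	uint8x8_t a = vld1_u8(dst);
	uint8x8_t b = vld1_u8(src);

	vst1_u8(dst, veor_u8(a, b));
}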

Signed-off-by: Samuel Holland 
---

 arch/arm/lib/Makefile | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 650404be6768..0ca5aae1bcc3 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -40,8 +40,7 @@ $(obj)/csumpartialcopy.o: $(obj)/csumpartialcopygeneric.S
 $(obj)/csumpartialcopyuser.o:  $(obj)/csumpartialcopygeneric.S
 
 ifeq ($(CONFIG_KERNEL_MODE_NEON),y)
-  NEON_FLAGS   := -march=armv7-a -mfloat-abi=softfp -mfpu=neon
-  CFLAGS_xor-neon.o+= $(NEON_FLAGS)
+  CFLAGS_xor-neon.o+= $(CC_FLAGS_FPU)
   obj-$(CONFIG_XOR_BLOCKS) += xor-neon.o
 endif
 
-- 
2.42.0



Re: [PATCH 3/3] drm/amd/display: Support DRM_AMD_DC_FP on RISC-V

2023-12-08 Thread Samuel Holland
Hi Christoph,

On 2023-11-22 2:40 AM, Christoph Hellwig wrote:
>> -select DRM_AMD_DC_FP if (X86 || LOONGARCH || (PPC64 && ALTIVEC) || (ARM64 && KERNEL_MODE_NEON && !CC_IS_CLANG))
>> +select DRM_AMD_DC_FP if ARM64 && KERNEL_MODE_NEON && !CC_IS_CLANG
>> +select DRM_AMD_DC_FP if PPC64 && ALTIVEC
>> +select DRM_AMD_DC_FP if RISCV && FPU
>> +select DRM_AMD_DC_FP if LOONGARCH || X86
> 
> This really is a mess.  Can you add a ARCH_HAS_KERNEL_FPU_SUPPORT
> symbol that all architetures that have it select instead, and them
> make DRM_AMD_DC_FP depend on it?

Yes, I have done this for v2, which I will send shortly.

>> -#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
>> +#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) || defined(CONFIG_RISCV)
>>  kernel_fpu_begin();
>>  #elif defined(CONFIG_PPC64)
>>  if (cpu_has_feature(CPU_FTR_VSX_COMP))
>> @@ -122,7 +124,7 @@ void dc_fpu_end(const char *function_name, const int 
>> line)
>>  
>>  depth = __this_cpu_dec_return(fpu_recursion_depth);
>>  if (depth == 0) {
>> -#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH)
>> +#if defined(CONFIG_X86) || defined(CONFIG_LOONGARCH) || defined(CONFIG_RISCV)
>>  kernel_fpu_end();
>>  #elif defined(CONFIG_PPC64)
>>  if (cpu_has_feature(CPU_FTR_VSX_COMP))
> 
> And then this mess can go away.  We'll need to decide if we want to
> cover all the in-kernel vector support as part of it, which would
> seem reasonable to me, or have a separate generic kernel_vector_begin
> with it's own option.

I think we may want to keep vector separate for performance on architectures
with separate FP and vector register files. For now, I have limited my changes
to FPU support only, which means I have removed VSX/Altivec from here; the
AMDGPU code doesn't need Altivec anyway.

>> diff --git a/drivers/gpu/drm/amd/display/dc/dml/Makefile 
>> b/drivers/gpu/drm/amd/display/dc/dml/Makefile
>> index ea7d60f9a9b4..5c8f840ef323 100644
>> --- a/drivers/gpu/drm/amd/display/dc/dml/Makefile
>> +++ b/drivers/gpu/drm/amd/display/dc/dml/Makefile
>> @@ -43,6 +43,12 @@ dml_ccflags := -mfpu=64
>>  dml_rcflags := -msoft-float
>>  endif
>>  
>> +ifdef CONFIG_RISCV
>> +include $(srctree)/arch/riscv/Makefile.isa
>> +# Remove V from the ISA string, like in arch/riscv/Makefile, but keep F and D.
>> +dml_ccflags := -march=$(shell echo $(riscv-march-y) | sed -E 's/(rv32ima|rv64ima)([^v_]*)v?/\1\2/')
>> +endif
>> +
>>  ifdef CONFIG_CC_IS_GCC
>>  ifneq ($(call gcc-min-version, 70100),y)
>>  IS_OLD_GCC = 1
> 
> And this is again not really something we should be doing.
> Instead we need a generic way in Kconfig to enable FPU support
> for an object file or set of, that the arch support can hook
> into.

I've included this in v2 as well.

> Btw, I'm also really worried about folks using the FPU instructions
> outside the kernel_fpu_begin/end windows in general (not directly
> related to the RISC-V support).  Can we have objecttool checks
> for that similar to only allowing the unsafe uaccess in the
> uaccess begin/end pairs?

ARM partially enforces this at compile time: it disallows calling
kernel_neon_begin() inside a translation unit that has NEON enabled. That
doesn't prevent the programmer from calling an FPU-enabled function from outside
a begin/end section, but it does prevent the compiler from generating unexpected
FPU usage behind your back. I implemented this same functionality for RISC-V.
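
As a concrete illustration of the RISC-V version (see the header in patch
09/12 below; the file name here is hypothetical): in a translation unit built
with CC_FLAGS_FPU, __riscv_f is defined, so a direct call to
kernel_fpu_begin() fails at compile time instead of silently generating code:

/* fp_impl.c - built with CC_FLAGS_FPU, so __riscv_f is defined */
#include <asm/fpu.h>

void scale(double *x)
{
	*x *= 2.0;		/* floating-point code is allowed here */
	kernel_fpu_begin();	/* build error: the static_assert() fires */
}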

Actually tracking all possibly-FPU-tainted functions and their call sites is
probably possible, but a much larger task.

Regards,
Samuel



[RFC PATCH 09/12] riscv: Add support for kernel-mode FPU

2023-12-08 Thread Samuel Holland
This is motivated by the amdgpu DRM driver, which needs floating-point
code to support recent hardware. That code is not performance-critical,
so only provide a minimal non-preemptible implementation for now.

Use a trick similar to ARM's to force floating-point code into a
separate translation unit, so it is not possible for compiler-generated
floating-point code to appear outside a kernel_fpu_{begin,end}() section.

Signed-off-by: Samuel Holland 
---

 arch/riscv/Kconfig  |  1 +
 arch/riscv/Makefile |  3 +++
 arch/riscv/include/asm/fpu.h| 26 ++
 arch/riscv/kernel/Makefile  |  1 +
 arch/riscv/kernel/kernel_mode_fpu.c | 28 
 5 files changed, 59 insertions(+)
 create mode 100644 arch/riscv/include/asm/fpu.h
 create mode 100644 arch/riscv/kernel/kernel_mode_fpu.c

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 95a2a06acc6a..cf0967928e6d 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -27,6 +27,7 @@ config RISCV
select ARCH_HAS_GCOV_PROFILE_ALL
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_HAS_KCOV
+   select ARCH_HAS_KERNEL_FPU_SUPPORT if FPU
select ARCH_HAS_MMIOWB
select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE
select ARCH_HAS_PMEM_API
diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile
index a74be78678eb..2e719c369210 100644
--- a/arch/riscv/Makefile
+++ b/arch/riscv/Makefile
@@ -81,6 +81,9 @@ KBUILD_CFLAGS += -march=$(shell echo $(riscv-march-y) | sed 
-E 's/(rv32ima|rv64i
 
 KBUILD_AFLAGS += -march=$(riscv-march-y)
 
+# For C code built with floating-point support, exclude V but keep F and D.
+CC_FLAGS_FPU  := -march=$(shell echo $(riscv-march-y) | sed -E 's/(rv32ima|rv64ima)([^v_]*)v?/\1\2/')
+
 KBUILD_CFLAGS += -mno-save-restore
 KBUILD_CFLAGS += -DCONFIG_PAGE_OFFSET=$(CONFIG_PAGE_OFFSET)
 
diff --git a/arch/riscv/include/asm/fpu.h b/arch/riscv/include/asm/fpu.h
new file mode 100644
index ..8cd027acc015
--- /dev/null
+++ b/arch/riscv/include/asm/fpu.h
@@ -0,0 +1,26 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#ifndef _ASM_RISCV_FPU_H
+#define _ASM_RISCV_FPU_H
+
+#include 
+
+#define kernel_fpu_available() has_fpu()
+
+#ifdef __riscv_f
+
+#define kernel_fpu_begin() \
+   static_assert(false, "floating-point code must use a separate translation unit")
+#define kernel_fpu_end() kernel_fpu_begin()
+
+#else
+
+void kernel_fpu_begin(void);
+void kernel_fpu_end(void);
+
+#endif
+
+#endif /* ! _ASM_RISCV_FPU_H */
diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile
index fee22a3d1b53..662c483e338d 100644
--- a/arch/riscv/kernel/Makefile
+++ b/arch/riscv/kernel/Makefile
@@ -62,6 +62,7 @@ obj-$(CONFIG_MMU) += vdso.o vdso/
 
 obj-$(CONFIG_RISCV_MISALIGNED) += traps_misaligned.o
 obj-$(CONFIG_FPU)  += fpu.o
+obj-$(CONFIG_FPU)  += kernel_mode_fpu.o
 obj-$(CONFIG_RISCV_ISA_V)  += vector.o
 obj-$(CONFIG_SMP)  += smpboot.o
 obj-$(CONFIG_SMP)  += smp.o
diff --git a/arch/riscv/kernel/kernel_mode_fpu.c 
b/arch/riscv/kernel/kernel_mode_fpu.c
new file mode 100644
index ..9b2024cc056b
--- /dev/null
+++ b/arch/riscv/kernel/kernel_mode_fpu.c
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2023 SiFive
+ */
+
+#include 
+#include 
+
+#include 
+#include 
+#include 
+#include 
+
+void kernel_fpu_begin(void)
+{
+   preempt_disable();
+   fstate_save(current, task_pt_regs(current));
+   csr_set(CSR_SSTATUS, SR_FS);
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_begin);
+
+void kernel_fpu_end(void)
+{
+   csr_clear(CSR_SSTATUS, SR_FS);
+   fstate_restore(current, task_pt_regs(current));
+   preempt_enable();
+}
+EXPORT_SYMBOL_GPL(kernel_fpu_end);
-- 
2.42.0



Re: [PATCH] drm/amdgpu: Enable tunneling on high-priority compute queues

2023-12-08 Thread Friedrich Vock

Friendly ping on this one.
The userspace side got merged, so it would be great to land this patch too :)

On 02.12.23 01:17, Friedrich Vock wrote:

This reduces latency when the GPU is already busy with other work.
This is useful for VR compositors that submit highly latency-sensitive
compositing work on high-priority compute queues while the GPU is busy
rendering the next frame.

Userspace merge request:
https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26462
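
On the userspace side, reaching these high-priority compute queues boils down
to allocating an amdgpu context with high priority. A rough sketch follows
(hypothetical helper; it assumes the stock libdrm/amdgpu_drm.h context-priority
uAPI, sufficient privileges, and include paths that depend on the libdrm
install; the merge request above is the authoritative implementation):

#include <stdint.h>
#include <string.h>
#include <xf86drm.h>	/* drmIoctl(), from libdrm */
#include <amdgpu_drm.h>	/* AMDGPU_CTX_* definitions, from libdrm's include dir */

/* Allocate an amdgpu context whose work is scheduled at high priority. */
static int alloc_high_prio_ctx(int fd, uint32_t *ctx_id)
{
	union drm_amdgpu_ctx args;

	memset(&args, 0, sizeof(args));
	args.in.op = AMDGPU_CTX_OP_ALLOC_CTX;
	args.in.priority = AMDGPU_CTX_PRIORITY_HIGH;

	if (drmIoctl(fd, DRM_IOCTL_AMDGPU_CTX, &args))
		return -1;

	*ctx_id = args.out.alloc.ctx_id;
	return 0;
}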

Signed-off-by: Friedrich Vock 
---
  drivers/gpu/drm/amd/amdgpu/amdgpu.h  |  1 +
  drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 10 ++
  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c   |  3 ++-
  drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c   |  3 ++-
  4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h 
b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9505dc8f9d69..4b923a156c4e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -790,6 +790,7 @@ struct amdgpu_mqd_prop {
uint64_t eop_gpu_addr;
uint32_t hqd_pipe_priority;
uint32_t hqd_queue_priority;
+   bool allow_tunneling;
bool hqd_active;
  };

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c 
b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 231d49132a56..4d98e8879be8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -620,6 +620,10 @@ static void amdgpu_ring_to_mqd_prop(struct amdgpu_ring 
*ring,
struct amdgpu_mqd_prop *prop)
  {
struct amdgpu_device *adev = ring->adev;
+   bool is_high_prio_compute = ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&
+               amdgpu_gfx_is_high_priority_compute_queue(adev, ring);
+   bool is_high_prio_gfx = ring->funcs->type == AMDGPU_RING_TYPE_GFX &&
+               amdgpu_gfx_is_high_priority_graphics_queue(adev, ring);

memset(prop, 0, sizeof(*prop));

@@ -637,10 +641,8 @@ static void amdgpu_ring_to_mqd_prop(struct amdgpu_ring 
*ring,
 */
prop->hqd_active = ring->funcs->type == AMDGPU_RING_TYPE_KIQ;

-   if ((ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE &&
-amdgpu_gfx_is_high_priority_compute_queue(adev, ring)) ||
-   (ring->funcs->type == AMDGPU_RING_TYPE_GFX &&
-amdgpu_gfx_is_high_priority_graphics_queue(adev, ring))) {
+   prop->allow_tunneling = is_high_prio_compute;
+   if (is_high_prio_compute || is_high_prio_gfx) {
prop->hqd_pipe_priority = AMDGPU_GFX_PIPE_PRIO_HIGH;
prop->hqd_queue_priority = AMDGPU_GFX_QUEUE_PRIORITY_MAXIMUM;
}
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
index c8a3bf01743f..73f6d7e72c73 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
@@ -6593,7 +6593,8 @@ static int gfx_v10_0_compute_mqd_init(struct 
amdgpu_device *adev, void *m,
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, ENDIAN_SWAP, 1);
  #endif
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
-   tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
+   tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH,
+   prop->allow_tunneling);
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
mqd->cp_hqd_pq_control = tmp;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c 
b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index c659ef0f47ce..bdcf96df69e6 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -3847,7 +3847,8 @@ static int gfx_v11_0_compute_mqd_init(struct 
amdgpu_device *adev, void *m,
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, RPTR_BLOCK_SIZE,
(order_base_2(AMDGPU_GPU_PAGE_SIZE / 4) - 1));
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
-   tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH, 0);
+   tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, TUNNEL_DISPATCH,
+   prop->allow_tunneling);
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
mqd->cp_hqd_pq_control = tmp;
--
2.43.0