Trap temporary GPRs are not currently saved/restored on ASICs
without scalar store instructions. They contain data useful to a
user-mode debugger.

Use vector store instructions to save TTMPs on these ASICs.

Signed-off-by: Jay Cornwall <[email protected]>
Cc: Laurent Morichetti <[email protected]>
---
 .../gpu/drm/amd/amdkfd/cwsr_trap_handler.h    | 119 ++++++++++--------
 .../amd/amdkfd/cwsr_trap_handler_gfx10.asm    |  46 ++++++-
 2 files changed, 114 insertions(+), 51 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
index 9c903c38dd74..d674f6d798f6 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler.h
@@ -1534,7 +1534,7 @@ static const uint32_t cwsr_trap_arcturus_hex[] = {
 };
 
 static const uint32_t cwsr_trap_gfx10_hex[] = {
-       0xbf820001, 0xbf8201cb,
+       0xbf820001, 0xbf8201f5,
        0xb0804004, 0xb978f802,
        0x8a788678, 0xb96ef801,
        0x876eff6e, 0x00000800,
@@ -1563,6 +1563,11 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
        0xbf900004, 0xbf8cc07f,
        0x877aff7f, 0x04000000,
        0x8f7a857a, 0x886d7a6d,
+       0xbefa037e, 0x877bff7f,
+       0x0000ffff, 0xbefe03c1,
+       0xbeff03c1, 0xdc5f8000,
+       0x007a0000, 0x7e000280,
+       0xbefe037a, 0xbeff037b,
        0xb97b02dc, 0x8f7b997b,
        0xb97a2a05, 0x807a817a,
        0xbf0d997b, 0xbf850002,
@@ -1570,58 +1575,74 @@ static const uint32_t cwsr_trap_gfx10_hex[] = {
        0x8f7a8a7a, 0x877bff7f,
        0x0000ffff, 0x807aff7a,
        0x00000200, 0x807a7e7a,
-       0x827b807b, 0xbef4037e,
-       0x8775ff7f, 0x0000ffff,
-       0x8875ff75, 0x00040000,
-       0xbef60380, 0xbef703ff,
-       0x10807fac, 0xbef1037c,
-       0xbef00380, 0xb97302dc,
-       0x8f739973, 0xbefe03c1,
-       0x907c9973, 0x877c817c,
-       0xbf06817c, 0xbf850002,
-       0xbeff0380, 0xbf820002,
-       0xbeff03c1, 0xbf82000b,
+       0x827b807b, 0xd7610000,
+       0x00010870, 0xd7610000,
+       0x00010a71, 0xd7610000,
+       0x00010c72, 0xd7610000,
+       0x00010e73, 0xd7610000,
+       0x00011074, 0xd7610000,
+       0x00011275, 0xd7610000,
+       0x00011476, 0xd7610000,
+       0x00011677, 0xd7610000,
+       0x00011a79, 0xd7610000,
+       0x00011c7e, 0xd7610000,
+       0x00011e7f, 0xbefe03ff,
+       0x00003fff, 0xbeff0380,
+       0xdc5f8040, 0x007a0000,
+       0xd760007a, 0x00011d00,
+       0xd760007b, 0x00011f00,
+       0xbefe037a, 0xbeff037b,
+       0xbef4037e, 0x8775ff7f,
+       0x0000ffff, 0x8875ff75,
+       0x00040000, 0xbef60380,
+       0xbef703ff, 0x10807fac,
+       0xbef1037c, 0xbef00380,
+       0xb97302dc, 0x8f739973,
+       0xbefe03c1, 0x907c9973,
+       0x877c817c, 0xbf06817c,
+       0xbf850002, 0xbeff0380,
+       0xbf820002, 0xbeff03c1,
+       0xbf820009, 0xbef603ff,
+       0x01000000, 0xe0704080,
+       0x705d0100, 0xe0704100,
+       0x705d0200, 0xe0704180,
+       0x705d0300, 0xbf820008,
        0xbef603ff, 0x01000000,
-       0xe0704000, 0x705d0000,
-       0xe0704080, 0x705d0100,
-       0xe0704100, 0x705d0200,
-       0xe0704180, 0x705d0300,
-       0xbf82000a, 0xbef603ff,
-       0x01000000, 0xe0704000,
-       0x705d0000, 0xe0704100,
-       0x705d0100, 0xe0704200,
-       0x705d0200, 0xe0704300,
-       0x705d0300, 0xb9702a05,
-       0x80708170, 0xbf0d9973,
-       0xbf850002, 0x8f708970,
-       0xbf820001, 0x8f708a70,
-       0xb97a1e06, 0x8f7a8a7a,
-       0x80707a70, 0x8070ff70,
-       0x00000200, 0xbef603ff,
-       0x01000000, 0x7e000280,
-       0x7e020280, 0x7e040280,
-       0xbefc0380, 0xd7610002,
-       0x0000f871, 0x807c817c,
-       0xd7610002, 0x0000f86c,
-       0x807c817c, 0x8a7aff6d,
-       0x80000000, 0xd7610002,
-       0x0000f87a, 0x807c817c,
-       0xd7610002, 0x0000f86e,
+       0xe0704100, 0x705d0100,
+       0xe0704200, 0x705d0200,
+       0xe0704300, 0x705d0300,
+       0xb9702a05, 0x80708170,
+       0xbf0d9973, 0xbf850002,
+       0x8f708970, 0xbf820001,
+       0x8f708a70, 0xb97a1e06,
+       0x8f7a8a7a, 0x80707a70,
+       0x8070ff70, 0x00000200,
+       0xbef603ff, 0x01000000,
+       0x7e000280, 0x7e020280,
+       0x7e040280, 0xbefc0380,
+       0xd7610002, 0x0000f871,
        0x807c817c, 0xd7610002,
-       0x0000f86f, 0x807c817c,
-       0xd7610002, 0x0000f878,
-       0x807c817c, 0xb97af803,
+       0x0000f86c, 0x807c817c,
+       0x8a7aff6d, 0x80000000,
        0xd7610002, 0x0000f87a,
        0x807c817c, 0xd7610002,
-       0x0000f87b, 0x807c817c,
-       0xb971f801, 0xd7610002,
-       0x0000f871, 0x807c817c,
-       0xb971f814, 0xd7610002,
-       0x0000f871, 0x807c817c,
-       0xb971f815, 0xd7610002,
-       0x0000f871, 0x807c817c,
-       0xbeff0380, 0xe0704000,
-       0x705d0200, 0xb9702a05,
+       0x0000f86e, 0x807c817c,
+       0xd7610002, 0x0000f86f,
+       0x807c817c, 0xd7610002,
+       0x0000f878, 0x807c817c,
+       0xb97af803, 0xd7610002,
+       0x0000f87a, 0x807c817c,
+       0xd7610002, 0x0000f87b,
+       0x807c817c, 0xb971f801,
+       0xd7610002, 0x0000f871,
+       0x807c817c, 0xb971f814,
+       0xd7610002, 0x0000f871,
+       0x807c817c, 0xb971f815,
+       0xd7610002, 0x0000f871,
+       0x807c817c, 0xbefe03ff,
+       0x0000ffff, 0xbeff0380,
+       0xe0704000, 0x705d0200,
+       0xbefe03c1, 0xb9702a05,
        0x80708170, 0xbf0d9973,
        0xbf850002, 0x8f708970,
        0xbf820001, 0x8f708a70,
diff --git a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
index 06947a8767c6..fbe3992d1c2c 100644
--- a/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
+++ b/drivers/gpu/drm/amd/amdkfd/cwsr_trap_handler_gfx10.asm
@@ -253,6 +253,20 @@ L_SLEEP:
        s_lshl_b32      s_save_tmp, s_save_tmp, (S_SAVE_PC_HI_FIRST_WAVE_SHIFT 
- S_SAVE_SPI_INIT_FIRST_WAVE_SHIFT)
        s_or_b32        s_save_pc_hi, s_save_pc_hi, s_save_tmp
 
+#if NO_SQC_STORE
+       // Trap temporaries must be saved via VGPR but all VGPRs are in use.
+       // There is no ttmp space to hold the resource constant for VGPR save.
+       // Save v0 by itself since it requires only two SGPRs.
+       s_mov_b32       s_save_ttmps_lo, exec_lo
+       s_and_b32       s_save_ttmps_hi, exec_hi, 0xFFFF
+       s_mov_b32       exec_lo, 0xFFFFFFFF
+       s_mov_b32       exec_hi, 0xFFFFFFFF
+       global_store_dword_addtid       v0, [s_save_ttmps_lo, s_save_ttmps_hi] 
slc:1 glc:1
+       v_mov_b32       v0, 0x0
+       s_mov_b32       exec_lo, s_save_ttmps_lo
+       s_mov_b32       exec_hi, s_save_ttmps_hi
+#endif
+
        // Save trap temporaries 4-11, 13 initialized by SPI debug dispatch 
logic
        // ttmp SR memory offset : size(VGPR)+size(SGPR)+0x40
        get_wave_size(s_save_ttmps_hi)
@@ -262,7 +276,27 @@ L_SLEEP:
        s_add_u32       s_save_ttmps_lo, s_save_ttmps_lo, s_save_spi_init_lo
        s_addc_u32      s_save_ttmps_hi, s_save_ttmps_hi, 0x0
 
-#if ASIC_TARGET_NAVI1X
+#if NO_SQC_STORE
+       v_writelane_b32 v0, ttmp4, 0x4
+       v_writelane_b32 v0, ttmp5, 0x5
+       v_writelane_b32 v0, ttmp6, 0x6
+       v_writelane_b32 v0, ttmp7, 0x7
+       v_writelane_b32 v0, ttmp8, 0x8
+       v_writelane_b32 v0, ttmp9, 0x9
+       v_writelane_b32 v0, ttmp10, 0xA
+       v_writelane_b32 v0, ttmp11, 0xB
+       v_writelane_b32 v0, ttmp13, 0xD
+       v_writelane_b32 v0, exec_lo, 0xE
+       v_writelane_b32 v0, exec_hi, 0xF
+
+       s_mov_b32       exec_lo, 0x3FFF
+       s_mov_b32       exec_hi, 0x0
+       global_store_dword_addtid       v0, [s_save_ttmps_lo, s_save_ttmps_hi] 
inst_offset:0x40 slc:1 glc:1
+       v_readlane_b32  ttmp14, v0, 0xE
+       v_readlane_b32  ttmp15, v0, 0xF
+       s_mov_b32       exec_lo, ttmp14
+       s_mov_b32       exec_hi, ttmp15
+#else
        s_store_dwordx4 [ttmp4, ttmp5, ttmp6, ttmp7], [s_save_ttmps_lo, 
s_save_ttmps_hi], 0x50 glc:1
        s_store_dwordx4 [ttmp8, ttmp9, ttmp10, ttmp11], [s_save_ttmps_lo, 
s_save_ttmps_hi], 0x60 glc:1
        s_store_dword   ttmp13, [s_save_ttmps_lo, s_save_ttmps_hi], 0x74 glc:1
@@ -303,7 +337,9 @@ L_SAVE_4VGPR_WAVE32:
 
        // VGPR Allocated in 4-GPR granularity
 
+#if !NO_SQC_STORE
        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset 
slc:1 glc:1
+#endif
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset 
slc:1 glc:1 offset:128
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset 
slc:1 glc:1 offset:128*2
        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset 
slc:1 glc:1 offset:128*3
@@ -314,7 +350,9 @@ L_SAVE_4VGPR_WAVE64:
 
        // VGPR Allocated in 4-GPR granularity
 
+#if !NO_SQC_STORE
        buffer_store_dword      v0, v0, s_save_buf_rsrc0, s_save_mem_offset 
slc:1 glc:1
+#endif
        buffer_store_dword      v1, v0, s_save_buf_rsrc0, s_save_mem_offset 
slc:1 glc:1 offset:256
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset 
slc:1 glc:1 offset:256*2
        buffer_store_dword      v3, v0, s_save_buf_rsrc0, s_save_mem_offset 
slc:1 glc:1 offset:256*3
@@ -361,9 +399,13 @@ L_SAVE_HWREG:
        write_hwreg_to_mem(s_save_m0, s_save_buf_rsrc0, s_save_mem_offset)
 
 #if NO_SQC_STORE
-       // Write HWREG/SGPRs with 32 VGPR lanes, wave32 is common case.
+       // Write HWREGs with 16 VGPR lanes. TTMPs occupy space after this.
+       s_mov_b32       exec_lo, 0xFFFF
        s_mov_b32       exec_hi, 0x0
        buffer_store_dword      v2, v0, s_save_buf_rsrc0, s_save_mem_offset 
slc:1 glc:1
+
+       // Write SGPRs with 32 VGPR lanes. This works in wave32 and wave64 mode.
+       s_mov_b32       exec_lo, 0xFFFFFFFF
 #endif
 
        /* save SGPRs */
-- 
2.17.1

_______________________________________________
amd-gfx mailing list
[email protected]
https://lists.freedesktop.org/mailman/listinfo/amd-gfx

Reply via email to