https://github.com/easyonaadit created https://github.com/llvm/llvm-project/pull/183701
While spilling unaligned tuples, rather than breaking the spill into 32-bit accesses, spill the first register as a single 32-bit spill, and spill the remainder of the tuple as an aligned tuple. Some additional bookkeeping is required in the spilling loop to manage the state. References: https://github.com/llvm/llvm-project/pull/177317 >From a1cad40d31d5505818fb77327ed80c62a11feffe Mon Sep 17 00:00:00 2001 From: Aaditya <[email protected]> Date: Fri, 27 Feb 2026 14:09:17 +0530 Subject: [PATCH] [AMDGPU] Multi dword spilling for unaligned tuples While spilling unaligned tuples, rather than breaking the spill into 32-bit accesses, spill the first register as a single 32-bit spill, and spill the remainder of the tuple as an aligned tuple. Some additional bookkeeping is required in the spilling loop to manage the state. --- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 54 ++++++++++++++++++----- llvm/test/CodeGen/AMDGPU/vgpr-spill.mir | 34 ++++---------- 2 files changed, 52 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 29d6937b1f5ff..752abe9ad9a4e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1537,9 +1537,13 @@ void SIRegisterInfo::buildSpillLoadStore( const unsigned RegWidth = AMDGPU::getRegBitWidth(*RC) / 8; // On targets with register tuple alignment requirements, - // for unaligned tuples, break the spill into 32-bit pieces. - // TODO: Optimize misaligned spills by using larger aligned chunks instead of - // 32-bit splits. + // for unaligned tuples, spill the first sub-reg as a 32-bit spill, + // and spill the rest as a regular aligned tuple. 
+ // eg: SPILL_V224 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + // will be spilt as: + // SPILL_SCRATCH_DWORD $vgpr1 + // SPILL_SCRATCH_DWORDx4 $vgpr2_vgpr3_vgpr4_vgpr5 + // SPILL_SCRATCH_DWORDx2 $vgpr6_vgpr7 bool IsRegMisaligned = false; if (!IsBlock && RegWidth > 4) { unsigned SpillOpcode = @@ -1560,14 +1564,21 @@ void SIRegisterInfo::buildSpillLoadStore( // Always use 4 byte operations for AGPRs because we need to scavenge // a temporary VGPR. // If we're using a block operation, the element should be the whole block. - // For misaligned registers, use 4-byte elements to avoid alignment errors. - unsigned EltSize = IsBlock ? RegWidth - : (IsFlat && !IsAGPR && !IsRegMisaligned) - ? std::min(RegWidth, 16u) - : 4u; + unsigned EltSize = IsBlock ? RegWidth + : (IsFlat && !IsAGPR) ? std::min(RegWidth, 16u) + : 4u; unsigned NumSubRegs = RegWidth / EltSize; unsigned Size = NumSubRegs * EltSize; unsigned RemSize = RegWidth - Size; + // For unaligned tuples, the first sub-reg is spilt as a single 32-bit spill, + // and will count as an additional reg, so the last chunk will have one less + // register. In some cases, the last chunk could be completely eliminated, + // eg: SPILL_V160 $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5 will be spilt as: + // SPILL_SCRATCH_DWORD $vgpr1 + // SPILL_SCRATCH_DWORDx4 $vgpr2_vgpr3_vgpr4_vgpr5 + unsigned LastChunk = ((RemSize / 4) + 3) % 4; + if (IsRegMisaligned && LastChunk) + NumSubRegs += 1; unsigned NumRemSubRegs = RemSize ? 1 : 0; int64_t Offset = InstOffset + MFI.getObjectOffset(Index); int64_t MaterializedOffset = Offset; @@ -1731,10 +1742,31 @@ void SIRegisterInfo::buildSpillLoadStore( for (unsigned i = 0, e = NumSubRegs + NumRemSubRegs, RegOffset = 0; i != e; ++i, RegOffset += EltSize) { - if (i == NumSubRegs) { - EltSize = RemSize; + unsigned SavedEltSize = EltSize; + if (i == 0 && IsRegMisaligned) { + // For misaligned register tuples, spill only the first sub-reg in the + // first iteration. 
+ EltSize = 4u; LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); } + if (i == 1 && IsRegMisaligned) { + // The first sub-reg was split in the previous iteration. + RegOffset = 4u; + if (RegWidth <= 16) + EltSize = RegWidth - 4u; + LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); + } + if (IsRegMisaligned) { + if (i == (e - 1)) { + EltSize = LastChunk * 4; + LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); + } + } else { + if (i == NumSubRegs) { + EltSize = RemSize; + LoadStoreOp = getFlatScratchSpillOpcode(TII, LoadStoreOp, EltSize); + } + } Desc = &TII->get(LoadStoreOp); if (!IsFlat && UseVGPROffset) { @@ -1968,6 +2000,8 @@ void SIRegisterInfo::buildSpillLoadStore( // scavenged. if (!IsStore && TII->isBlockLoadStore(LoadStoreOp)) addImplicitUsesForBlockCSRLoad(MIB, ValueReg); + if (i == 0 && IsRegMisaligned) + EltSize = SavedEltSize; } if (ScratchOffsetRegDelta != 0) { diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir index 82111165c0127..038c0ca05fd50 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill.mir @@ -290,9 +290,7 @@ body: | ; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit killed 
$vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5) + ; GFX942-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5) ; ; GFX1200-LABEL: name: spill_v128_kill_unaligned ; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4 @@ -303,9 +301,7 @@ body: | ; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1250-NEXT: {{ $}} ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5) + ; GFX1250-NEXT: SCRATCH_STORE_DWORDX3_SADDR killed $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit killed $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5) SI_SPILL_V128_SAVE killed $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5) ... 
@@ -334,9 +330,7 @@ body: | ; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5) + ; GFX942-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5) ; ; GFX1200-LABEL: name: spill_v128_unaligned ; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4 @@ -347,9 +341,7 @@ body: | ; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4 ; GFX1250-NEXT: {{ $}} ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit 
$flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5) + ; GFX1250-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr2_vgpr3_vgpr4, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 4, align 4, addrspace 5) SI_SPILL_V128_SAVE $vgpr1_vgpr2_vgpr3_vgpr4, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5) ... @@ -421,13 +413,8 @@ body: | ; GFX942: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; GFX942-NEXT: {{ $}} ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 16, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr6, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 20, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr7, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 24, addrspace 5) - ; GFX942-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 28, 
0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 28, addrspace 5) + ; GFX942-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0 + 4, align 4, addrspace 5) + ; GFX942-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr6_vgpr7_vgpr8, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 20, align 4, addrspace 5) ; ; GFX1200-LABEL: name: spill_v256_unaligned ; GFX1200: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 @@ -439,12 +426,7 @@ body: | ; GFX1250: liveins: $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 ; GFX1250-NEXT: {{ $}} ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr32, 0, 0, implicit $exec, implicit $flat_scr, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 4, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr32, 8, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 8, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr32, 12, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 12, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr32, 16, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 16, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr6, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) 
into %stack.0 + 20, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr7, $sgpr32, 24, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s32) into %stack.0 + 24, addrspace 5) - ; GFX1250-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr8, $sgpr32, 28, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s32) into %stack.0 + 28, addrspace 5) + ; GFX1250-NEXT: SCRATCH_STORE_DWORDX4_SADDR $vgpr2_vgpr3_vgpr4_vgpr5, $sgpr32, 4, 0, implicit $exec, implicit $flat_scr :: ("amdgpu-thread-private" store (s128) into %stack.0 + 4, align 4, addrspace 5) + ; GFX1250-NEXT: SCRATCH_STORE_DWORDX3_SADDR $vgpr6_vgpr7_vgpr8, $sgpr32, 20, 0, implicit $exec, implicit $flat_scr, implicit $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 :: ("amdgpu-thread-private" store (s96) into %stack.0 + 20, align 4, addrspace 5) SI_SPILL_V256_SAVE $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, addrspace 5) ... _______________________________________________ llvm-branch-commits mailing list [email protected] https://lists.llvm.org/cgi-bin/mailman/listinfo/llvm-branch-commits
