================ @@ -139,21 +134,20 @@ define void @foo4() { ; PTX32-EMPTY: ; PTX32-NEXT: // %bb.0: ; PTX32-NEXT: mov.b32 %SPL, __local_depot3; -; PTX32-NEXT: cvta.local.u32 %SP, %SPL; -; PTX32-NEXT: add.u32 %r1, %SP, 0; -; PTX32-NEXT: add.u32 %r2, %SPL, 0; -; PTX32-NEXT: add.u32 %r3, %SP, 4; -; PTX32-NEXT: add.u32 %r4, %SPL, 4; -; PTX32-NEXT: st.local.b32 [%r2], 0; -; PTX32-NEXT: st.local.b32 [%r4], 0; +; PTX32-NEXT: add.u32 %r1, %SPL, 0; +; PTX32-NEXT: cvta.local.u32 %r2, %r1; +; PTX32-NEXT: add.u32 %r3, %SPL, 4; +; PTX32-NEXT: cvta.local.u32 %r4, %r3; +; PTX32-NEXT: st.local.b32 [%SPL], 0; +; PTX32-NEXT: st.local.b32 [%SPL+4], 0; ---------------- thetheodor wrote:
I think the issue stems from the following: 1. Previously the IR looked like this: ``` define void @foo4() { %A = alloca i32, align 4 %1 = addrspacecast ptr %A to ptr addrspace(5) %B = alloca i32, align 4 %2 = addrspacecast ptr %B to ptr addrspace(5) store i32 0, ptr addrspace(5) %1, align 4 store i32 0, ptr addrspace(5) %2, align 4 call void @bar(ptr %A) call void @bar(ptr %B) ret void } ``` The function call arguments were generic pointers and they were lowered like so: ``` bb.0 (%ir-block.0): %0:b64 = LEA_ADDRi64 %stack.0.A, 0 <- generic address %1:b64 = cvta_to_local_64 %0:b64 %2:b64 = LEA_ADDRi64 %stack.1.B, 0 <- generic address %3:b64 = cvta_to_local_64 %2:b64 ... ST_i64 %0:b64, 0, 0, 101, 64, &param0, 0 :: (store (s64), addrspace 101) ... ST_i64 %2:b64, 0, 0, 101, 64, &param0, 0 :: (store (s64), addrspace 101) ``` The two `cvta_to_local` are eventually optimized away: ``` $vrframelocal64 = MOV_DEPOT_ADDR_64 3 $vrframe64 = cvta_local_64 $vrframelocal64 %0:b64 = LEA_ADDRi64 $vrframe64, 0 %1:b64 = cvta_to_local_64 %0:b64 %2:b64 = LEA_ADDRi64 $vrframe64, 4 %3:b64 = cvta_to_local_64 %2:b64 ``` -> ``` $vrframelocal64 = MOV_DEPOT_ADDR_64 3 $vrframe64 = cvta_local_64 $vrframelocal64 %0:b64 = LEA_ADDRi64 $vrframe64, 0 %1:b64 = LEA_ADDRi64 $vrframelocal64, 0 %2:b64 = LEA_ADDRi64 $vrframe64, 4 %3:b64 = LEA_ADDRi64 $vrframelocal64, 4 ``` 2. The current IR looks like this: ``` define void @foo4() { %A1 = alloca i32, align 4, addrspace(5) %1 = addrspacecast ptr addrspace(5) %A1 to ptr %B2 = alloca i32, align 4, addrspace(5) %2 = addrspacecast ptr addrspace(5) %B2 to ptr store i32 0, ptr addrspace(5) %A1, align 4 store i32 0, ptr addrspace(5) %B2, align 4 call void @bar(ptr %1) call void @bar(ptr %2) ret void } ``` i.e., the arguments are now the address space cast results (not the alloca pointers as previously). This is lowered to: ``` bb.0 (%ir-block.0): %0:b64 = LEA_ADDRi64 %stack.0.A1, 0
%1:b64 = cvta_local_64 killed %0:b64 <- param0 is a cast result %2:b64 = LEA_ADDRi64 %stack.1.B2, 0 %3:b64 = cvta_local_64 killed %2:b64 <- param1 is a cast result ... ST_i64 killed %1:b64, 0, 0, 101, 64, &param0, 0 :: (store (s64), addrspace 101) ... ST_i64 killed %3:b64, 0, 0, 101, 64, &param0, 0 :: (store (s64), addrspace 101) ``` The prologue ends up being: ``` $vrframelocal64 = MOV_DEPOT_ADDR_64 3 %0:b64 = LEA_ADDRi64 $vrframelocal64, 0 %1:b64 = cvta_local_64 %0:b64 %2:b64 = LEA_ADDRi64 $vrframelocal64, 4 %3:b64 = cvta_local_64 %2:b64 ``` and is never optimized. I am wondering if a simple peephole optimization is missing. https://github.com/llvm/llvm-project/pull/154814 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits