https://github.com/hekota created 
https://github.com/llvm/llvm-project/pull/169144

When an individual element of a vector is updated via indexing into the vector, 
it needs to be handled as a store operation on that one vector element.

Clang treats vectors as one unit, so a vector element needs to be updated, the 
whole vector is loaded, the element is modified, and then the whole vector is 
stored. In HLSL vector elements are handled individually. We need to avoid this 
load/modify/store sequence to prevent overwriting other vector elements that 
might be getting updated in parallel.

Fixes #160208
Fixes #167729

>From 7ffd077dc7132f1f9b10cee36fcc61d7b24a7dd3 Mon Sep 17 00:00:00 2001
From: Helena Kotas <[email protected]>
Date: Fri, 21 Nov 2025 19:00:47 -0800
Subject: [PATCH] [HLSL] Update indexed vector elements individually

Storing to individual elements of a vector through vector indexing
needs to be handled as separate stores. We need to avoid the load/modify/store
of the whole vector to prevent overwriting other vector elements that might be
getting updated in parallel.

Fixes #160208
Fixes #167729
---
 clang/lib/CodeGen/CGExpr.cpp                  | 24 +++++++++++
 clang/test/CodeGenHLSL/BoolVector.hlsl        | 15 +++----
 .../builtins/VectorElementStore.hlsl          | 41 +++++++++++++++++++
 clang/test/CodeGenHLSL/builtins/lit.hlsl      |  6 ++-
 4 files changed, 75 insertions(+), 11 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/builtins/VectorElementStore.hlsl

diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
index f2451b16e78be..763ce6369fc51 100644
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -2575,6 +2575,30 @@ void CodeGenFunction::EmitStoreThroughLValue(RValue Src, 
LValue Dst,
                                              bool isInit) {
   if (!Dst.isSimple()) {
     if (Dst.isVectorElt()) {
+      if (getLangOpts().HLSL) {
+        // In HLSL, storing to individual elements of a vector through
+        // VectorElt LValue needs to be handled as separate store. We need to
+        // avoid the load/modify/store sequence to prevent overwriting other
+        // elements that might be getting updated in parallel.
+        Address DstAddr = Dst.getVectorAddress();
+        llvm::Type *DestAddrTy = DstAddr.getElementType();
+        llvm::Type *ElemTy = DestAddrTy->getScalarType();
+        CharUnits ElemAlign = CharUnits::fromQuantity(
+            CGM.getDataLayout().getPrefTypeAlign(ElemTy));
+
+        llvm::Value *Val = Src.getScalarVal();
+        if (Val->getType()->getPrimitiveSizeInBits() <
+            ElemTy->getScalarSizeInBits())
+          Val = Builder.CreateZExt(Val, ElemTy->getScalarType());
+
+        llvm::Value *Idx = Dst.getVectorIdx();
+        llvm::Value *Zero = llvm::ConstantInt::get(Int32Ty, 0);
+        Address DstElemAddr =
+            Builder.CreateGEP(DstAddr, {Zero, Idx}, DestAddrTy, ElemAlign);
+        Builder.CreateStore(Val, DstElemAddr, Dst.isVolatileQualified());
+        return;
+      }
+
       // Read/modify/write the vector, inserting the new element.
       llvm::Value *Vec = Builder.CreateLoad(Dst.getVectorAddress(),
                                             Dst.isVolatileQualified());
diff --git a/clang/test/CodeGenHLSL/BoolVector.hlsl 
b/clang/test/CodeGenHLSL/BoolVector.hlsl
index d5054a5a92b5d..6be90e8f51ce2 100644
--- a/clang/test/CodeGenHLSL/BoolVector.hlsl
+++ b/clang/test/CodeGenHLSL/BoolVector.hlsl
@@ -69,9 +69,8 @@ bool fn4() {
 // CHECK-LABEL: define hidden void {{.*}}fn5{{.*}}
 // CHECK: [[Arr:%.*]] = alloca <2 x i32>, align 8
 // CHECK-NEXT: store <2 x i32> splat (i32 1), ptr [[Arr]], align 8
-// CHECK-NEXT: [[L:%.*]] = load <2 x i32>, ptr [[Arr]], align 8
-// CHECK-NEXT: [[V:%.*]] = insertelement <2 x i32> [[L]], i32 0, i32 1
-// CHECK-NEXT: store <2 x i32> [[V]], ptr [[Arr]], align 8
+// CHECK-NEXT: [[Ptr:%.*]] = getelementptr <2 x i32>, ptr [[Arr]]
+// CHECK-NEXT: store i32 0, ptr [[Ptr]], align 4
 // CHECK-NEXT: ret void
 void fn5() {
   bool2 Arr = {true,true};
@@ -86,10 +85,9 @@ void fn5() {
 // CHECK-NEXT: [[Y:%.*]] = load i32, ptr [[V]], align 4
 // CHECK-NEXT: [[LV:%.*]] = trunc i32 [[Y]] to i1
 // CHECK-NEXT: [[BV:%.*]] = getelementptr inbounds nuw %struct.S, ptr [[S]], 
i32 0, i32 0
-// CHECK-NEXT: [[X:%.*]] = load <2 x i32>, ptr [[BV]], align 1
 // CHECK-NEXT: [[Z:%.*]] = zext i1 [[LV]] to i32
-// CHECK-NEXT: [[VI:%.*]] = insertelement <2 x i32> [[X]], i32 [[Z]], i32 1
-// CHECK-NEXT: store <2 x i32> [[VI]], ptr [[BV]], align 1
+// CHECK-NEXT: [[Ptr:%.*]] = getelementptr <2 x i32>, ptr [[BV]], i32 0, i32 1
+// CHECK-NEXT: store i32 [[Z]], ptr [[Ptr]], align 4
 // CHECK-NEXT: ret void
 void fn6() {
   bool V = false;
@@ -101,9 +99,8 @@ void fn6() {
 // CHECK: [[Arr:%.*]] = alloca [2 x <2 x i32>], align 8
 // CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[Arr]], ptr align 
8 {{.*}}, i32 16, i1 false)
 // CHECK-NEXT: [[Idx:%.*]] = getelementptr inbounds [2 x <2 x i32>], ptr 
[[Arr]], i32 0, i32 0
-// CHECK-NEXT: [[X:%.*]] = load <2 x i32>, ptr [[Idx]], align 8
-// CHECK-NEXT: [[VI:%.*]] = insertelement <2 x i32> [[X]], i32 0, i32 1
-// CHECK-NEXT: store <2 x i32> [[VI]], ptr [[Idx]], align 8
+// CHECK-NEXT: %[[Ptr:.*]] = getelementptr <2 x i32>, ptr [[Idx]], i32 0, i32 1
+// CHECK-NEXT: store i32 0, ptr %[[Ptr]], align 4
 // CHECK-NEXT: ret void
 void fn7() {
   bool2 Arr[2] = {{true,true}, {false,false}};
diff --git a/clang/test/CodeGenHLSL/builtins/VectorElementStore.hlsl 
b/clang/test/CodeGenHLSL/builtins/VectorElementStore.hlsl
new file mode 100644
index 0000000000000..e0c3aa54aaeba
--- /dev/null
+++ b/clang/test/CodeGenHLSL/builtins/VectorElementStore.hlsl
@@ -0,0 +1,41 @@
+// RUN: %clang_cc1 -finclude-default-header -emit-llvm -disable-llvm-passes  \
+// RUN:   -triple dxil-pc-shadermodel6.3-library %s -o - | FileCheck %s
+
+// Test groupshared vector element store for uint.
+// CHECK-LABEL: test_uint4
+// CHECK: [[VAL:%.*]] = load i32, ptr %Val.addr, align 4
+// CHECK: [[IDX:%.*]] = load i32, ptr %Idx.addr, align 4
+// CHECK: [[PTR:%.*]] = getelementptr <4 x i32>, ptr addrspace(3) @SMem, i32 
0, i32 [[IDX]]
+// CHECK: store i32 [[VAL]], ptr addrspace(3) [[PTR]], align 4
+// CHECK-: ret void
+groupshared uint4 SMem;
+void test_uint4(uint Idx, uint Val) {
+  SMem[Idx] = Val;
+}
+
+// Test local vector element store for bool.
+// CHECK: [[COND1:%.*]] = load i32, ptr addrspace(3) @Cond, align 4
+// CHECK: [[COND2:%.*]] = trunc i32 [[COND1]] to i1
+// CHECK: [[IDX:%.*]] = load i32, ptr %Idx.addr, align 4
+// CHECK: [[COND3:%.*]] = zext i1 [[COND2]] to i32
+// CHECK: [[PTR:%.*]] = getelementptr <3 x i32>, ptr %Val, i32 0, i32 [[IDX]]
+// CHECK: store i32 [[COND3]], ptr [[PTR]], align 4
+// CHECK: ret
+groupshared bool Cond;
+bool3 test_bool(uint Idx) {
+  bool3 Val = { false, false, false};
+  Val[Idx] = Cond;
+  return Val;
+}
+
+// Test resource vector element store for float.
+// CHECK: [[VAL:%.*]] = load float, ptr %Val.addr, align 4
+// CHECK: [[RES_PTR:%.*]] = call {{.*}} ptr 
@_ZN4hlsl18RWStructuredBufferIDv4_fEixEj(ptr {{.*}} @_ZL3Buf, i32 noundef 0)
+// CHECK: [[IDX:%.*]] = load i32, ptr %Idx.addr, align 4
+// CHECK: [[PTR:%.*]] = getelementptr <4 x float>, ptr [[RES_PTR]], i32 0, i32 
[[IDX]]
+// CHECK: store float [[VAL]], ptr [[PTR]], align 4
+// CHECK: ret void
+RWStructuredBuffer<float4> Buf : register(u0);
+void test_float(uint Idx, float Val) {
+  Buf[0][Idx] = Val;
+}
diff --git a/clang/test/CodeGenHLSL/builtins/lit.hlsl 
b/clang/test/CodeGenHLSL/builtins/lit.hlsl
index b7979960de9f6..364c2e8794ea2 100644
--- a/clang/test/CodeGenHLSL/builtins/lit.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/lit.hlsl
@@ -11,7 +11,8 @@
 // CHECK: %mul.i = fmul reassoc nnan ninf nsz arcp afn half [[LOG]], %{{.*}}
 // CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn half 
@llvm.exp.f16(half %mul.i)
 // CHECK: %hlsl.select7.i = select reassoc nnan ninf nsz arcp afn i1 %{{.*}}, 
half 0xH0000, half %{{.*}}
-// CHECK: %vecins.i = insertelement <4 x half> %{{.*}}, half %hlsl.select7.i, 
i32 2
+// CHECK: [[PTR:%.*]] = getelementptr <4 x half>, ptr %Result.i, i32 0, i32 2
+// CHECK: store half %hlsl.select7.i, ptr [[PTR]], align 2
 // CHECK: ret <4 x half> %{{.*}}
 half4 test_lit_half(half NDotL, half NDotH, half M) { return lit(NDotL, NDotH, 
M); }
 
@@ -26,6 +27,7 @@ half4 test_lit_half(half NDotL, half NDotH, half M) { return 
lit(NDotL, NDotH, M
 // CHECK: %mul.i = fmul reassoc nnan ninf nsz arcp afn float [[LOG]], %{{.*}}
 // CHECK: [[EXP:%.*]] = call reassoc nnan ninf nsz arcp afn float 
@llvm.exp.f32(float %mul.i)
 // CHECK: %hlsl.select7.i = select reassoc nnan ninf nsz arcp afn i1 %{{.*}}, 
float 0.000000e+00, float %{{.*}}
-// CHECK: %vecins.i = insertelement <4 x float> %{{.*}}, float 
%hlsl.select7.i, i32 2
+// CHECK: [[PTR:%.*]] = getelementptr <4 x float>, ptr %Result.i, i32 0, i32 2
+// CHECK: store float %hlsl.select7.i, ptr [[PTR]], align 4
 // CHECK: ret <4 x float> %{{.*}}
 float4 test_lit_float(float NDotL, float NDotH, float M) { return lit(NDotL, 
NDotH, M); }

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

Reply via email to