[clang] [HLSL][Matrix] Make HLSLElementwiseCast respect matrix memory layout (PR #184429)

Deric C. via cfe-commits Mon, 09 Mar 2026 10:33:26 -0700

https://github.com/Icohedron updated 
https://github.com/llvm/llvm-project/pull/184429


>From 9e181cf78fe2c8a9589af6ec735fd6deaa47f7aa Mon Sep 17 00:00:00 2001
From: Deric Cheung <[email protected]>
Date: Mon, 2 Mar 2026 10:06:14 -0800
Subject: [PATCH 1/3] Add missing const to getFlattenedIndex function

---
 clang/include/clang/AST/TypeBase.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/include/clang/AST/TypeBase.h 
b/clang/include/clang/AST/TypeBase.h
index dc4442bfeb795..ec7845c3b3adb 100644
--- a/clang/include/clang/AST/TypeBase.h
+++ b/clang/include/clang/AST/TypeBase.h
@@ -4435,7 +4435,7 @@ class ConstantMatrixType final : public MatrixType {
   /// row-major order flattened index. Otherwise, returns the column-major 
order
   /// flattened index.
   unsigned getFlattenedIndex(unsigned Row, unsigned Column,
-                             bool IsRowMajor = false) {
+                             bool IsRowMajor = false) const {
     return IsRowMajor ? getRowMajorFlattenedIndex(Row, Column)
                       : getColumnMajorFlattenedIndex(Row, Column);
   }

>From c62f3b212cf0e4fee66cc23e6fd3f4262db41401 Mon Sep 17 00:00:00 2001
From: Deric Cheung <[email protected]>
Date: Mon, 2 Mar 2026 14:21:12 -0800
Subject: [PATCH 2/3] Make HLSLElementwiseCast respect matrix memory layout

Assisted-by: claude-opus-4.6
---
 clang/lib/CodeGen/CGExprScalar.cpp            | 30 ++++++----
 .../BasicFeatures/MatrixElementTypeCast.hlsl  | 60 ++++++++++++-------
 2 files changed, 55 insertions(+), 35 deletions(-)

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp 
b/clang/lib/CodeGen/CGExprScalar.cpp
index 06eadb6c07507..f8c5f76b4d027 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2539,7 +2539,7 @@ static Value *EmitHLSLElementwiseCast(CodeGenFunction 
&CGF, LValue SrcVal,
                                       QualType DestTy, SourceLocation Loc) {
   SmallVector<LValue, 16> LoadList;
   CGF.FlattenAccessAndTypeLValue(SrcVal, LoadList);
-  // Dest is either a vector or a builtin?
+  // Dest is either a vector, constant matrix, or a builtin
   // if its a vector create a temp alloca to store into and return that
   if (auto *VecTy = DestTy->getAs<VectorType>()) {
     assert(LoadList.size() >= VecTy->getNumElements() &&
@@ -2564,20 +2564,24 @@ static Value *EmitHLSLElementwiseCast(CodeGenFunction 
&CGF, LValue SrcVal,
            "Flattened type on RHS must have the same number or more elements "
            "than vector on LHS.");
 
+    bool IsRowMajor = CGF.getLangOpts().getDefaultMatrixMemoryLayout() ==
+                      LangOptions::MatrixMemoryLayout::MatrixRowMajor;
+
     llvm::Value *V = CGF.Builder.CreateLoad(
         CGF.CreateIRTempWithoutCast(DestTy, "flatcast.tmp"));
-    // V is an allocated temporary to build the truncated matrix into.
-    for (unsigned I = 0, E = MatTy->getNumElementsFlattened(); I < E; I++) {
-      unsigned ColMajorIndex =
-          (I % MatTy->getNumRows()) * MatTy->getNumColumns() +
-          (I / MatTy->getNumRows());
-      RValue RVal = CGF.EmitLoadOfLValue(LoadList[ColMajorIndex], Loc);
-      assert(RVal.isScalar() &&
-             "All flattened source values should be scalars.");
-      llvm::Value *Cast = CGF.EmitScalarConversion(
-          RVal.getScalarVal(), LoadList[ColMajorIndex].getType(),
-          MatTy->getElementType(), Loc);
-      V = CGF.Builder.CreateInsertElement(V, Cast, I);
+    // V is an allocated temporary for constructing the matrix.
+    for (unsigned Row = 0, RE = MatTy->getNumRows(); Row < RE; Row++) {
+      for (unsigned Col = 0, CE = MatTy->getNumColumns(); Col < CE; Col++) {
+        unsigned LoadIdx = MatTy->getRowMajorFlattenedIndex(Row, Col);
+        RValue RVal = CGF.EmitLoadOfLValue(LoadList[LoadIdx], Loc);
+        assert(RVal.isScalar() &&
+               "All flattened source values should be scalars.");
+        llvm::Value *Cast = CGF.EmitScalarConversion(
+            RVal.getScalarVal(), LoadList[LoadIdx].getType(),
+            MatTy->getElementType(), Loc);
+        unsigned MatrixIdx = MatTy->getFlattenedIndex(Row, Col, IsRowMajor);
+        V = CGF.Builder.CreateInsertElement(V, Cast, MatrixIdx);
+      }
     }
     return V;
   }
diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl 
b/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
index b46d1efec1e1a..f21d1ddb7386e 100644
--- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
+++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixElementTypeCast.hlsl
@@ -1,12 +1,14 @@
-// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 6
-// RUN: %clang_cc1 -finclude-default-header -triple 
dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes 
-fnative-half-type -fnative-int16-type -o - %s | FileCheck %s
+// RUN: %clang_cc1 -finclude-default-header -triple 
dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes 
-fnative-half-type -fnative-int16-type -fmatrix-memory-layout=row-major -o - %s 
| FileCheck %s --check-prefixes=CHECK,ROW-CHECK
+// RUN: %clang_cc1 -finclude-default-header -triple 
dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes 
-fnative-half-type -fnative-int16-type -fmatrix-memory-layout=column-major -o - 
%s | FileCheck %s --check-prefixes=CHECK,COL-CHECK
 
 
 // CHECK-LABEL: define hidden noundef <6 x i32> 
@_Z22elementwise_type_cast0u11matrix_typeILm3ELm2EfE(
 // CHECK-SAME: <6 x float> noundef nofpclass(nan inf) [[F32:%.*]]) 
#[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca [2 x <3 x float>], align 4
-// CHECK-NEXT:    [[I32:%.*]] = alloca [2 x <3 x i32>], align 4
+// ROW-CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca [3 x <2 x float>], align 4
+// ROW-CHECK-NEXT:    [[I32:%.*]] = alloca [3 x <2 x i32>], align 4
+// COL-CHECK-NEXT:    [[F32_ADDR:%.*]] = alloca [2 x <3 x float>], align 4
+// COL-CHECK-NEXT:    [[I32:%.*]] = alloca [2 x <3 x i32>], align 4
 // CHECK-NEXT:    store <6 x float> [[F32]], ptr [[F32_ADDR]], align 4
 // CHECK-NEXT:    [[TMP0:%.*]] = load <6 x float>, ptr [[F32_ADDR]], align 4
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi <6 x float> [[TMP0]] to <6 x i32>
@@ -22,8 +24,10 @@ int3x2 elementwise_type_cast0(float3x2 f32) {
 // CHECK-LABEL: define hidden noundef <6 x i32> 
@_Z22elementwise_type_cast1u11matrix_typeILm3ELm2EsE(
 // CHECK-SAME: <6 x i16> noundef [[I16_32:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[I16_32_ADDR:%.*]] = alloca [2 x <3 x i16>], align 2
-// CHECK-NEXT:    [[I32:%.*]] = alloca [2 x <3 x i32>], align 4
+// ROW-CHECK-NEXT:    [[I16_32_ADDR:%.*]] = alloca [3 x <2 x i16>], align 2
+// ROW-CHECK-NEXT:    [[I32:%.*]] = alloca [3 x <2 x i32>], align 4
+// COL-CHECK-NEXT:    [[I16_32_ADDR:%.*]] = alloca [2 x <3 x i16>], align 2
+// COL-CHECK-NEXT:    [[I32:%.*]] = alloca [2 x <3 x i32>], align 4
 // CHECK-NEXT:    store <6 x i16> [[I16_32]], ptr [[I16_32_ADDR]], align 2
 // CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i16>, ptr [[I16_32_ADDR]], align 2
 // CHECK-NEXT:    [[CONV:%.*]] = sext <6 x i16> [[TMP0]] to <6 x i32>
@@ -39,8 +43,10 @@ int3x2 elementwise_type_cast1(int16_t3x2 i16_32) {
 // CHECK-LABEL: define hidden noundef <6 x i32> 
@_Z22elementwise_type_cast2u11matrix_typeILm3ELm2ElE(
 // CHECK-SAME: <6 x i64> noundef [[I64_32:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[I64_32_ADDR:%.*]] = alloca [2 x <3 x i64>], align 8
-// CHECK-NEXT:    [[I32:%.*]] = alloca [2 x <3 x i32>], align 4
+// ROW-CHECK-NEXT:    [[I64_32_ADDR:%.*]] = alloca [3 x <2 x i64>], align 8
+// ROW-CHECK-NEXT:    [[I32:%.*]] = alloca [3 x <2 x i32>], align 4
+// COL-CHECK-NEXT:    [[I64_32_ADDR:%.*]] = alloca [2 x <3 x i64>], align 8
+// COL-CHECK-NEXT:    [[I32:%.*]] = alloca [2 x <3 x i32>], align 4
 // CHECK-NEXT:    store <6 x i64> [[I64_32]], ptr [[I64_32_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load <6 x i64>, ptr [[I64_32_ADDR]], align 8
 // CHECK-NEXT:    [[CONV:%.*]] = trunc <6 x i64> [[TMP0]] to <6 x i32>
@@ -56,8 +62,10 @@ int3x2 elementwise_type_cast2(int64_t3x2 i64_32) {
 // CHECK-LABEL: define hidden noundef <6 x i16> 
@_Z22elementwise_type_cast3u11matrix_typeILm2ELm3EDhE(
 // CHECK-SAME: <6 x half> noundef nofpclass(nan inf) [[H23:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[H23_ADDR:%.*]] = alloca [3 x <2 x half>], align 2
-// CHECK-NEXT:    [[I23:%.*]] = alloca [3 x <2 x i16>], align 2
+// ROW-CHECK-NEXT:    [[H23_ADDR:%.*]] = alloca [2 x <3 x half>], align 2
+// ROW-CHECK-NEXT:    [[I23:%.*]] = alloca [2 x <3 x i16>], align 2
+// COL-CHECK-NEXT:    [[H23_ADDR:%.*]] = alloca [3 x <2 x half>], align 2
+// COL-CHECK-NEXT:    [[I23:%.*]] = alloca [3 x <2 x i16>], align 2
 // CHECK-NEXT:    store <6 x half> [[H23]], ptr [[H23_ADDR]], align 2
 // CHECK-NEXT:    [[TMP0:%.*]] = load <6 x half>, ptr [[H23_ADDR]], align 2
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi <6 x half> [[TMP0]] to <6 x i16>
@@ -73,8 +81,10 @@ int16_t2x3 elementwise_type_cast3(half2x3 h23) {
 // CHECK-LABEL: define hidden noundef <6 x i32> 
@_Z22elementwise_type_cast4u11matrix_typeILm3ELm2EdE(
 // CHECK-SAME: <6 x double> noundef nofpclass(nan inf) [[D32:%.*]]) #[[ATTR0]] 
{
 // CHECK-NEXT:  [[ENTRY:.*:]]
-// CHECK-NEXT:    [[D32_ADDR:%.*]] = alloca [2 x <3 x double>], align 8
-// CHECK-NEXT:    [[I32:%.*]] = alloca [2 x <3 x i32>], align 4
+// ROW-CHECK-NEXT:    [[D32_ADDR:%.*]] = alloca [3 x <2 x double>], align 8
+// ROW-CHECK-NEXT:    [[I32:%.*]] = alloca [3 x <2 x i32>], align 4
+// COL-CHECK-NEXT:    [[D32_ADDR:%.*]] = alloca [2 x <3 x double>], align 8
+// COL-CHECK-NEXT:    [[I32:%.*]] = alloca [2 x <3 x i32>], align 4
 // CHECK-NEXT:    store <6 x double> [[D32]], ptr [[D32_ADDR]], align 8
 // CHECK-NEXT:    [[TMP0:%.*]] = load <6 x double>, ptr [[D32_ADDR]], align 8
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi <6 x double> [[TMP0]] to <6 x i32>
@@ -91,7 +101,8 @@ int3x2 elementwise_type_cast4(double3x2 d32) {
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[A:%.*]] = alloca [2 x [1 x i32]], align 4
-// CHECK-NEXT:    [[B:%.*]] = alloca [1 x <2 x i32>], align 4
+// ROW-CHECK-NEXT:    [[B:%.*]] = alloca [2 x <1 x i32>], align 4
+// COL-CHECK-NEXT:    [[B:%.*]] = alloca [1 x <2 x i32>], align 4
 // CHECK-NEXT:    [[AGG_TEMP:%.*]] = alloca [2 x [1 x i32]], align 4
 // CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <2 x i32>, align 4
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 4 [[A]], ptr 
align 4 @__const._Z5call2v.A, i32 8, i1 false)
@@ -120,7 +131,8 @@ struct S {
 // CHECK-SAME: ) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[S:%.*]] = alloca [[STRUCT_S:%.*]], align 1
-// CHECK-NEXT:    [[A:%.*]] = alloca [1 x <2 x i32>], align 4
+// ROW-CHECK-NEXT:    [[A:%.*]] = alloca [2 x <1 x i32>], align 4
+// COL-CHECK-NEXT:    [[A:%.*]] = alloca [1 x <2 x i32>], align 4
 // CHECK-NEXT:    [[AGG_TEMP:%.*]] = alloca [[STRUCT_S]], align 1
 // CHECK-NEXT:    [[FLATCAST_TMP:%.*]] = alloca <2 x i32>, align 4
 // CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[S]], ptr 
align 1 @__const._Z5call3v.s, i32 8, i1 false)
@@ -168,14 +180,16 @@ struct Derived : BFields {
 // CHECK-NEXT:    [[TMP1:%.*]] = load double, ptr [[GEP1]], align 8
 // CHECK-NEXT:    [[CONV:%.*]] = fptosi double [[TMP1]] to i32
 // CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP0]], i32 
[[CONV]], i64 0
-// CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[GEP2]], align 4
-// CHECK-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP3]] to i32
-// CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP2]], i32 
[[CONV4]], i64 1
 // CHECK-NEXT:    [[BF_LOAD:%.*]] = load i24, ptr [[E]], align 1
 // CHECK-NEXT:    [[BF_SHL:%.*]] = shl i24 [[BF_LOAD]], 9
 // CHECK-NEXT:    [[BF_ASHR:%.*]] = ashr i24 [[BF_SHL]], 9
 // CHECK-NEXT:    [[BF_CAST:%.*]] = sext i24 [[BF_ASHR]] to i32
-// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 
[[BF_CAST]], i64 2
+// ROW-CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 
[[BF_CAST]], i64 1
+// COL-CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 
[[BF_CAST]], i64 2
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[GEP2]], align 4
+// CHECK-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP4]] to i32
+// ROW-CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 
[[CONV4]], i64 2
+// COL-CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP3]], i32 
[[CONV4]], i64 1
 // CHECK-NEXT:    [[TMP6:%.*]] = load i32, ptr [[GEP3]], align 4
 // CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 
[[TMP6]], i64 3
 // CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[A]], align 4
@@ -201,11 +215,13 @@ void call4(Derived D) {
 // CHECK-NEXT:    [[VECEXT:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 // CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[TMP1]], float 
[[VECEXT]], i64 0
 // CHECK-NEXT:    [[TMP4:%.*]] = load <4 x float>, ptr [[VECTOR_GEP]], align 16
-// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 2
-// CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float 
[[VECEXT1]], i64 1
+// CHECK-NEXT:    [[VECEXT1:%.*]] = extractelement <4 x float> [[TMP4]], i32 1
+// ROW-CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float 
[[VECEXT1]], i64 1
+// COL-CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x float> [[TMP3]], float 
[[VECEXT1]], i64 2
 // CHECK-NEXT:    [[TMP6:%.*]] = load <4 x float>, ptr [[VECTOR_GEP]], align 16
-// CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 1
-// CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float 
[[VECEXT2]], i64 2
+// CHECK-NEXT:    [[VECEXT2:%.*]] = extractelement <4 x float> [[TMP6]], i32 2
+// ROW-CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float 
[[VECEXT2]], i64 2
+// COL-CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[TMP5]], float 
[[VECEXT2]], i64 1
 // CHECK-NEXT:    [[TMP8:%.*]] = load <4 x float>, ptr [[VECTOR_GEP]], align 16
 // CHECK-NEXT:    [[VECEXT3:%.*]] = extractelement <4 x float> [[TMP8]], i32 3
 // CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP7]], float 
[[VECEXT3]], i64 3

>From 15394569816d1934b3c5f862279f20b068aacba8 Mon Sep 17 00:00:00 2001
From: Deric Cheung <[email protected]>
Date: Tue, 3 Mar 2026 13:14:10 -0800
Subject: [PATCH 3/3] Add comment about LoadList being row-major when
 interpreted as a matrix

---
 clang/lib/CodeGen/CGExprScalar.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/clang/lib/CodeGen/CGExprScalar.cpp 
b/clang/lib/CodeGen/CGExprScalar.cpp
index f8c5f76b4d027..a712e038d93a5 100644
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -2572,6 +2572,8 @@ static Value *EmitHLSLElementwiseCast(CodeGenFunction 
&CGF, LValue SrcVal,
     // V is an allocated temporary for constructing the matrix.
     for (unsigned Row = 0, RE = MatTy->getNumRows(); Row < RE; Row++) {
       for (unsigned Col = 0, CE = MatTy->getNumColumns(); Col < CE; Col++) {
+        // When interpreted as a matrix, \p LoadList is *always* row-major 
order
+        // regardless of the default matrix memory layout.
         unsigned LoadIdx = MatTy->getRowMajorFlattenedIndex(Row, Col);
         RValue RVal = CGF.EmitLoadOfLValue(LoadList[LoadIdx], Loc);
         assert(RVal.isScalar() &&

_______________________________________________
cfe-commits mailing list
[email protected]
https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits

[clang] [HLSL][Matrix] Make HLSLElementwiseCast respect matrix memory layout (PR #184429)

Reply via email to