Author: Deric C. Date: 2026-03-13T09:10:26-07:00 New Revision: fc4fed4d982cef5dddc7568a422d2be430e941d2
URL: https://github.com/llvm/llvm-project/commit/fc4fed4d982cef5dddc7568a422d2be430e941d2 DIFF: https://github.com/llvm/llvm-project/commit/fc4fed4d982cef5dddc7568a422d2be430e941d2.diff LOG: [HLSL] Codegen column-major matrix initializer lists without a vector shuffle (#186228) Fixes #185518 The SPIR-V backend does not handle the lowering of `shufflevector` instructions on vectors with more than 4 elements. This PR changes the codegen of matrix init lists to directly emit vectors with elements in column-major order when the default matrix memory layout is column-major, as opposed to emitting them in linear/row-major order followed by a vector shuffle. While an alternative fix could be to change the default depth of [`canEvaluateShuffled`](https://github.com/llvm/llvm-project/blob/main/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp#L1865-L1866) to 16 in `InstCombineVectorOps.cpp` to eliminate the vector shuffle for vectors of up to 16 elements in size (to handle 4x4 matrices), this change would have broader impacts than just HLSL, which does not seem necessary for the scope of this issue (which concerns only matrix initializer list codegen). Another alternative fix would be to extend the `shufflevector` lowering in the SPIR-V backend to support vectors of more than 4 elements. However, again, this goes beyond the scope of just matrix initializer list codegen, which is so far the only case where a vector shuffle of a vector with more than 4 elements has appeared. 
Assisted-by: claude-opus-4.6 Added: Modified: clang/lib/CodeGen/CGExprScalar.cpp clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl clang/test/CodeGenHLSL/BasicFeatures/MatrixInitializerListOrder.hlsl clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl clang/test/CodeGenHLSL/BoolMatrix.hlsl Removed: ################################################################################ diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 100982fefc3b0..91f0e4f4ceaa4 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -2322,6 +2322,14 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { unsigned ResElts = cast<llvm::FixedVectorType>(VType)->getNumElements(); + // For column-major matrix types, we insert elements directly at their + // column-major positions rather than inserting sequentially and shuffling. + const ConstantMatrixType *ColMajorMT = nullptr; + if (const auto *MT = E->getType()->getAs<ConstantMatrixType>(); + MT && CGF.getLangOpts().getDefaultMatrixMemoryLayout() == + LangOptions::MatrixMemoryLayout::MatrixColMajor) + ColMajorMT = MT; + // Loop over initializers collecting the Value for each, and remembering // whether the source was swizzle (ExtVectorElementExpr). This will allow // us to fold the shuffle for the swizzle into the shuffle for the vector @@ -2376,7 +2384,11 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { } } } - V = Builder.CreateInsertElement(V, Init, Builder.getInt32(CurIdx), + unsigned InsertIdx = + ColMajorMT + ? 
ColMajorMT->mapRowMajorToColumnMajorFlattenedIndex(CurIdx) + : CurIdx; + V = Builder.CreateInsertElement(V, Init, Builder.getInt32(InsertIdx), "vecinit"); VIsPoisonShuffle = false; ++CurIdx; @@ -2446,24 +2458,14 @@ Value *ScalarExprEmitter::VisitInitListExpr(InitListExpr *E) { // Emit remaining default initializers for (/* Do not initialize i*/; CurIdx < ResElts; ++CurIdx) { - Value *Idx = Builder.getInt32(CurIdx); + unsigned InsertIdx = + ColMajorMT ? ColMajorMT->mapRowMajorToColumnMajorFlattenedIndex(CurIdx) + : CurIdx; + Value *Idx = Builder.getInt32(InsertIdx); llvm::Value *Init = llvm::Constant::getNullValue(EltTy); V = Builder.CreateInsertElement(V, Init, Idx, "vecinit"); } - // Matrix initializer lists are in row-major order but the memory layout for - // codegen is determined by the -fmatrix-memory-layout flag (default: - // column-major). When the memory layout is column-major, we need to shuffle - // the elements from row-major to column-major order. - if (const auto *MT = E->getType()->getAs<ConstantMatrixType>(); - MT && CGF.getLangOpts().getDefaultMatrixMemoryLayout() == - LangOptions::MatrixMemoryLayout::MatrixColMajor) { - SmallVector<int, 16> Mask; - for (unsigned I = 0, N = MT->getNumElementsFlattened(); I < N; ++I) - Mask.push_back(MT->mapColumnMajorToRowMajorFlattenedIndex(I)); - V = Builder.CreateShuffleVector(V, Mask, "matrix.rowmajor2colmajor"); - } - return V; } diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl index 9263e46a08822..944f6dd1d46cd 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixConstructor.hlsl @@ -33,17 +33,16 @@ RWStructuredBuffer<float> In; // CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[CALL]], align 4 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[TMP0]], i32 0 // CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[CALL1]], align 4 -// 
CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT]], float [[TMP1]], i32 1 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT]], float [[TMP1]], i32 3 // CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[CALL2]], align 4 -// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[TMP2]], i32 2 +// CHECK-NEXT: [[VECINIT7:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[TMP2]], i32 1 // CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[CALL3]], align 4 -// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT7]], float [[TMP3]], i32 3 +// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT7]], float [[TMP3]], i32 4 // CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[CALL4]], align 4 -// CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[TMP4]], i32 4 +// CHECK-NEXT: [[VECINIT9:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[TMP4]], i32 2 // CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[CALL5]], align 4 // CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT9]], float [[TMP5]], i32 5 -// CHECK-NEXT: [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <6 x float> [[VECINIT10]], <6 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> -// CHECK-NEXT: ret <6 x float> [[MATRIX_ROWMAJOR2COLMAJOR]] +// CHECK-NEXT: ret <6 x float> [[VECINIT10]] // float3x2 case2() { // vec[0] = Call @@ -70,21 +69,20 @@ float3x2 case2() { // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <6 x float> poison, float [[VECEXT]], i32 0 // CHECK-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 // CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <3 x float> [[TMP1]], i64 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT]], float [[VECEXT1]], i32 1 +// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT]], float [[VECEXT1]], i32 3 // CHECK-NEXT: [[TMP2:%.*]] = load <3 x float>, ptr [[A_ADDR]], align 16 // 
CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <3 x float> [[TMP2]], i64 2 -// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[VECEXT3]], i32 2 +// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[VECEXT3]], i32 1 // CHECK-NEXT: [[TMP3:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 // CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <3 x float> [[TMP3]], i64 0 -// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[VECEXT5]], i32 3 +// CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[VECEXT5]], i32 4 // CHECK-NEXT: [[TMP4:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 // CHECK-NEXT: [[VECEXT7:%.*]] = extractelement <3 x float> [[TMP4]], i64 1 -// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[VECEXT7]], i32 4 +// CHECK-NEXT: [[VECINIT8:%.*]] = insertelement <6 x float> [[VECINIT6]], float [[VECEXT7]], i32 2 // CHECK-NEXT: [[TMP5:%.*]] = load <3 x float>, ptr [[B_ADDR]], align 16 // CHECK-NEXT: [[VECEXT9:%.*]] = extractelement <3 x float> [[TMP5]], i64 2 // CHECK-NEXT: [[VECINIT10:%.*]] = insertelement <6 x float> [[VECINIT8]], float [[VECEXT9]], i32 5 -// CHECK-NEXT: [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <6 x float> [[VECINIT10]], <6 x float> poison, <6 x i32> <i32 0, i32 2, i32 4, i32 1, i32 3, i32 5> -// CHECK-NEXT: ret <6 x float> [[MATRIX_ROWMAJOR2COLMAJOR]] +// CHECK-NEXT: ret <6 x float> [[VECINIT10]] // float3x2 case3(float3 a, float3 b) { // vec[0] = A[0] diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixInitializerListOrder.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixInitializerListOrder.hlsl index e7a8080c02406..da4a522ea28ea 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixInitializerListOrder.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixInitializerListOrder.hlsl @@ -35,8 +35,7 @@ export float test_row1_col0() { return M[1][0]; } -// Verify the shuffle is emitted for 
non-constant init lists when the memory -// layout is column-major, and not emitted when it is row-major. +// Verify that elements are inserted at the correct positions according to the default matrix memory layout. export float2x3 test_dynamic(float a, float b, float c, float d, float e, float f) { @@ -44,17 +43,18 @@ export float2x3 test_dynamic(float a, float b, float c, // CHECK: [[A:%.*]] = load float, ptr %a.addr // CHECK: [[VECINIT0:%.*]] = insertelement <6 x float> poison, float [[A]], i32 0 // CHECK: [[B:%.*]] = load float, ptr %b.addr -// CHECK: [[VECINIT1:%.*]] = insertelement <6 x float> [[VECINIT0]], float [[B]], i32 1 +// COL-CHECK: [[VECINIT1:%.*]] = insertelement <6 x float> [[VECINIT0]], float [[B]], i32 2 +// ROW-CHECK: [[VECINIT1:%.*]] = insertelement <6 x float> [[VECINIT0]], float [[B]], i32 1 // CHECK: [[C:%.*]] = load float, ptr %c.addr -// CHECK: [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT1]], float [[C]], i32 2 +// COL-CHECK: [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT1]], float [[C]], i32 4 +// ROW-CHECK: [[VECINIT2:%.*]] = insertelement <6 x float> [[VECINIT1]], float [[C]], i32 2 // CHECK: [[D:%.*]] = load float, ptr %d.addr -// CHECK: [[VECINIT3:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[D]], i32 3 +// COL-CHECK: [[VECINIT3:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[D]], i32 1 +// ROW-CHECK: [[VECINIT3:%.*]] = insertelement <6 x float> [[VECINIT2]], float [[D]], i32 3 // CHECK: [[E:%.*]] = load float, ptr %e.addr -// CHECK: [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT3]], float [[E]], i32 4 +// COL-CHECK: [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT3]], float [[E]], i32 3 +// ROW-CHECK: [[VECINIT4:%.*]] = insertelement <6 x float> [[VECINIT3]], float [[E]], i32 4 // CHECK: [[F:%.*]] = load float, ptr %f.addr // CHECK: [[VECINIT5:%.*]] = insertelement <6 x float> [[VECINIT4]], float [[F]], i32 5 -// COL-CHECK: shufflevector <6 x float> [[VECINIT5]], <6 x float> poison, 
<6 x i32> <i32 0, i32 3, i32 1, i32 4, i32 2, i32 5> -// ROW-CHECK-NOT: shufflevector -// ROW-CHECK: store <6 x float> [[VECINIT5]], ptr return (float2x3){a, b, c, d, e, f}; } diff --git a/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl index a515b91da01c2..6df9f3351c63f 100644 --- a/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl +++ b/clang/test/CodeGenHLSL/BasicFeatures/MatrixToAndFromVectorConstructors.hlsl @@ -41,16 +41,16 @@ float4 fn(float2x2 m) { // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[VECEXT]], i32 0 // CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16 // CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <4 x i32> [[TMP1]], i64 1 -// CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VECEXT1]], i32 1 +// COL-CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VECEXT1]], i32 2 +// ROW-CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[VECEXT1]], i32 1 // CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16 // CHECK-NEXT: [[VECEXT3:%.*]] = extractelement <4 x i32> [[TMP2]], i64 2 -// CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <4 x i32> [[VECINIT2]], i32 [[VECEXT3]], i32 2 +// COL-CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <4 x i32> [[VECINIT2]], i32 [[VECEXT3]], i32 1 +// ROW-CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <4 x i32> [[VECINIT2]], i32 [[VECEXT3]], i32 2 // CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[V_ADDR]], align 16 // CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <4 x i32> [[TMP3]], i64 3 // CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <4 x i32> [[VECINIT4]], i32 [[VECEXT5]], i32 3 -// COL-CHECK-NEXT: [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <4 x i32> [[VECINIT6]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -// COL-CHECK-NEXT: store <4 x i32> 
[[MATRIX_ROWMAJOR2COLMAJOR]], ptr [[M]], align 4 -// ROW-CHECK-NEXT: store <4 x i32> [[VECINIT6]], ptr [[M]], align 4 +// CHECK-NEXT: store <4 x i32> [[VECINIT6]], ptr [[M]], align 4 // CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[M]], align 4 // CHECK-NEXT: ret <4 x i32> [[TMP4]] // @@ -70,9 +70,7 @@ int2x2 fn(int4 v) { // CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[V_ADDR]], align 8 // CHECK-NEXT: [[VECEXT1:%.*]] = extractelement <2 x i32> [[TMP1]], i64 1 // CHECK-NEXT: [[VECINIT2:%.*]] = insertelement <2 x i32> [[VECINIT]], i32 [[VECEXT1]], i32 1 -// COL-CHECK-NEXT: [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <2 x i32> [[VECINIT2]], <2 x i32> poison, <2 x i32> <i32 0, i32 1> -// COL-CHECK-NEXT: ret <2 x i32> [[MATRIX_ROWMAJOR2COLMAJOR]] -// ROW-CHECK-NEXT: ret <2 x i32> [[VECINIT2]] +// CHECK-NEXT: ret <2 x i32> [[VECINIT2]] // int1x2 fn1(int2 v) { return v; @@ -96,9 +94,7 @@ int1x2 fn1(int2 v) { // CHECK-NEXT: [[LOADEDV4:%.*]] = trunc <3 x i32> [[TMP3]] to <3 x i1> // CHECK-NEXT: [[VECEXT5:%.*]] = extractelement <3 x i1> [[LOADEDV4]], i64 2 // CHECK-NEXT: [[VECINIT6:%.*]] = insertelement <3 x i1> [[VECINIT3]], i1 [[VECEXT5]], i32 2 -// COL-CHECK-NEXT: [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <3 x i1> [[VECINIT6]], <3 x i1> poison, <3 x i32> <i32 0, i32 1, i32 2> -// COL-CHECK-NEXT: ret <3 x i1> [[MATRIX_ROWMAJOR2COLMAJOR]] -// ROW-CHECK-NEXT: ret <3 x i1> [[VECINIT6]] +// CHECK-NEXT: ret <3 x i1> [[VECINIT6]] // bool3x1 fn2(bool3 b) { return b; diff --git a/clang/test/CodeGenHLSL/BoolMatrix.hlsl b/clang/test/CodeGenHLSL/BoolMatrix.hlsl index c61d82635d513..e9841ae6c9a90 100644 --- a/clang/test/CodeGenHLSL/BoolMatrix.hlsl +++ b/clang/test/CodeGenHLSL/BoolMatrix.hlsl @@ -35,13 +35,12 @@ bool fn1() { // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4 // CHECK-NEXT: [[LOADEDV:%.*]] = trunc i32 [[TMP0]] to i1 // CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i1> poison, i1 [[LOADEDV]], i32 0 -// CHECK-NEXT: [[VECINIT1:%.*]] = 
insertelement <4 x i1> [[VECINIT]], i1 true, i32 1 +// CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i1> [[VECINIT]], i1 true, i32 2 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[V_ADDR]], align 4 // CHECK-NEXT: [[LOADEDV2:%.*]] = trunc i32 [[TMP1]] to i1 -// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i1> [[VECINIT1]], i1 [[LOADEDV2]], i32 2 +// CHECK-NEXT: [[VECINIT3:%.*]] = insertelement <4 x i1> [[VECINIT1]], i1 [[LOADEDV2]], i32 1 // CHECK-NEXT: [[VECINIT4:%.*]] = insertelement <4 x i1> [[VECINIT3]], i1 false, i32 3 -// CHECK-NEXT: [[MATRIX_ROWMAJOR2COLMAJOR:%.*]] = shufflevector <4 x i1> [[VECINIT4]], <4 x i1> poison, <4 x i32> <i32 0, i32 2, i32 1, i32 3> -// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[MATRIX_ROWMAJOR2COLMAJOR]] to <4 x i32> +// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i1> [[VECINIT4]] to <4 x i32> // CHECK-NEXT: store <4 x i32> [[TMP2]], ptr [[A]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[A]], align 4 // CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[RETVAL]], align 4 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
