llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT--> @llvm/pr-subscribers-clang-codegen Author: Deric C. (Icohedron) <details> <summary>Changes</summary> Fixes #<!-- -->184906 The SPIRV and DXIL backends assume matrices are provided in column-major order when lowering matrix transpose and matrix multiplication intrinsics. To support row-major order matrices from Clang/HLSL, we therefore need to convert row-major order matrices into column-major order matrices before applying matrix transpose and multiplication. A conversion from column-major order back to row-major order is also required for correctness after a matrix transpose or matrix multiply. This PR adds helper functions to the MatrixBuilder to convert an NxM row-/column-major order matrix into an NxM column-/row-major order matrix by applying a matrix transpose. The transformations take advantage of the fact that a row-major order matrix of NxM dimensions `rNxM` interpreted in column-major order is equivalent to its transpose in column-major order. Example: Let `r3x2 = [ 0, 1, 2, 3, 4, 5 ]`. The 3x2 matrix is visualized as
```
0 1
2 3
4 5
```
When `[ 0, 1, 2, 3, 4, 5 ]` is interpreted as a 2x3 column-major order matrix, it is visualized as:
```
0 2 4
1 3 5
```
which is equal to the transpose of `r3x2` but in column-major order. These matrix memory layout transformations are inserted before and after the matrix multiply and transpose intrinsics when lowering HLSL mul and transpose. While this method of supporting row-major order matrices is not performant, it is correct and will suffice for now until benchmarks are created and performance becomes a primary concern. 
Assisted-by: GitHub Copilot (powered by Claude Opus 4.6) --- Full diff: https://github.com/llvm/llvm-project/pull/186898.diff 4 Files Affected: - (modified) clang/lib/CodeGen/CGHLSLBuiltins.cpp (+39-7) - (modified) clang/test/CodeGenHLSL/builtins/mul.hlsl (+30-12) - (modified) clang/test/CodeGenHLSL/builtins/transpose.hlsl (+42-22) - (modified) llvm/include/llvm/IR/MatrixBuilder.h (+16) ``````````diff diff --git a/clang/lib/CodeGen/CGHLSLBuiltins.cpp b/clang/lib/CodeGen/CGHLSLBuiltins.cpp index 80c590437309d..a891864e6d964 100644 --- a/clang/lib/CodeGen/CGHLSLBuiltins.cpp +++ b/clang/lib/CodeGen/CGHLSLBuiltins.cpp @@ -1122,32 +1122,64 @@ Value *CodeGenFunction::EmitHLSLBuiltinExpr(unsigned BuiltinID, bool IsMat0 = QTy0->isConstantMatrixType(); bool IsMat1 = QTy1->isConstantMatrixType(); + // The matrix multiply intrinsic only operates on column-major order + // matrices. Therefore matrix memory layout transforms must be inserted + // before and after matrix multiply intrinsics. + bool IsRowMajor = getLangOpts().getDefaultMatrixMemoryLayout() == + LangOptions::MatrixMemoryLayout::MatrixRowMajor; + llvm::MatrixBuilder MB(Builder); if (IsVec0 && IsMat1) { unsigned N = QTy0->castAs<VectorType>()->getNumElements(); auto *MatTy = QTy1->castAs<ConstantMatrixType>(); - unsigned M = MatTy->getNumColumns(); - return MB.CreateMatrixMultiply(Op0, Op1, 1, N, M, "hlsl.mul"); + unsigned Rows = MatTy->getNumRows(); + unsigned Cols = MatTy->getNumColumns(); + if (IsRowMajor) + Op1 = MB.CreateRowMajorToColumnMajorTransform(Op1, Rows, Cols); + return MB.CreateMatrixMultiply(Op0, Op1, 1, N, Cols, "hlsl.mul"); } if (IsMat0 && IsVec1) { auto *MatTy = QTy0->castAs<ConstantMatrixType>(); unsigned Rows = MatTy->getNumRows(); unsigned Cols = MatTy->getNumColumns(); + if (IsRowMajor) + Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows, Cols); return MB.CreateMatrixMultiply(Op0, Op1, Rows, Cols, 1, "hlsl.mul"); } assert(IsMat0 && IsMat1); auto *MatTy0 = 
QTy0->castAs<ConstantMatrixType>(); auto *MatTy1 = QTy1->castAs<ConstantMatrixType>(); - return MB.CreateMatrixMultiply(Op0, Op1, MatTy0->getNumRows(), - MatTy0->getNumColumns(), - MatTy1->getNumColumns(), "hlsl.mul"); + unsigned Rows0 = MatTy0->getNumRows(); + unsigned Rows1 = MatTy1->getNumRows(); + unsigned Cols0 = MatTy0->getNumColumns(); + unsigned Cols1 = MatTy1->getNumColumns(); + if (IsRowMajor) { + Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows0, Cols0); + Op1 = MB.CreateRowMajorToColumnMajorTransform(Op1, Rows1, Cols1); + } + Value *Result = + MB.CreateMatrixMultiply(Op0, Op1, Rows0, Cols0, Cols1, "hlsl.mul"); + if (IsRowMajor) + Result = MB.CreateColumnMajorToRowMajorTransform(Result, Rows0, Cols1); + return Result; } case Builtin::BI__builtin_hlsl_transpose: { Value *Op0 = EmitScalarExpr(E->getArg(0)); auto *MatTy = E->getArg(0)->getType()->castAs<ConstantMatrixType>(); + unsigned Rows = MatTy->getNumRows(); + unsigned Cols = MatTy->getNumColumns(); llvm::MatrixBuilder MB(Builder); - return MB.CreateMatrixTranspose(Op0, MatTy->getNumRows(), - MatTy->getNumColumns()); + // The matrix transpose intrinsic only operates on column-major order + // matrices. Therefore matrix memory layout transforms must be inserted + // before and after matrix transpose intrinsics. 
+ bool IsRowMajor = getLangOpts().getDefaultMatrixMemoryLayout() == + LangOptions::MatrixMemoryLayout::MatrixRowMajor; + if (IsRowMajor) + Op0 = MB.CreateRowMajorToColumnMajorTransform(Op0, Rows, Cols); + Value *Result = MB.CreateMatrixTranspose(Op0, Rows, Cols); + if (IsRowMajor) + Result = MB.CreateColumnMajorToRowMajorTransform(Result, Cols, Rows); + return Result; } case Builtin::BI__builtin_hlsl_elementwise_rcp: { Value *Op0 = EmitScalarExpr(E->getArg(0)); diff --git a/clang/test/CodeGenHLSL/builtins/mul.hlsl b/clang/test/CodeGenHLSL/builtins/mul.hlsl index 17749e527af65..f9151225d3846 100644 --- a/clang/test/CodeGenHLSL/builtins/mul.hlsl +++ b/clang/test/CodeGenHLSL/builtins/mul.hlsl @@ -1,5 +1,7 @@ -// RUN: %clang_cc1 -finclude-default-header -O1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,DXIL -// RUN: %clang_cc1 -finclude-default-header -O1 -triple spirv-unknown-vulkan1.3-library -fnative-half-type -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,SPIRV +// RUN: %clang_cc1 -finclude-default-header -O1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR,DXIL +// RUN: %clang_cc1 -finclude-default-header -O1 -triple spirv-unknown-vulkan1.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR,SPIRV +// RUN: %clang_cc1 -finclude-default-header -O1 -triple dxil-pc-shadermodel6.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR,DXIL +// RUN: %clang_cc1 -finclude-default-header -O1 -triple spirv-unknown-vulkan1.3-library -fnative-half-type -emit-llvm -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR,SPIRV // -- Case 1: scalar * scalar -> scalar -- @@ -74,7 +76,8 @@ export double 
test_vec_vec_muld(double3 a, double3 b) { return mul(a, b); } // -- Case 6: vector * matrix -> vector -- // CHECK-LABEL: test_vec_mat_mul -// CHECK: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %m, i32 1, i32 2, i32 3) +// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2) +// CHECK: %hlsl.mul = {{.*}} call {{.*}} <3 x float> @llvm.matrix.multiply.v3f32.v2f32.v6f32(<2 x float> %v, <6 x float> %{{.*}}, i32 1, i32 2, i32 3) // CHECK: ret <3 x float> %hlsl.mul export float3 test_vec_mat_mul(float2 v, float2x3 m) { return mul(v, m); } @@ -90,22 +93,31 @@ export float2x3 test_mat_scalar_mul(float2x3 a, float b) { return mul(a, b); } // -- Case 8: matrix * vector -> vector -- // CHECK-LABEL: test_mat_vec_mul -// CHECK: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %m, <3 x float> %v, i32 2, i32 3, i32 1) +// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2) +// CHECK: %hlsl.mul = {{.*}} call {{.*}} <2 x float> @llvm.matrix.multiply.v2f32.v6f32.v3f32(<6 x float> %{{.*}}, <3 x float> %v, i32 2, i32 3, i32 1) // CHECK: ret <2 x float> %hlsl.mul export float2 test_mat_vec_mul(float2x3 m, float3 v) { return mul(m, v); } // -- Case 9: matrix * matrix -> matrix -- // CHECK-LABEL: test_mat_mat_mul -// CHECK: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %a, <12 x float> %b, i32 2, i32 3, i32 4) -// CHECK: ret <8 x float> %hlsl.mul +// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %{{.*}}, i32 3, i32 2) +// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <12 x float> @llvm.matrix.transpose.v12f32(<12 x float> %{{.*}}, i32 4, i32 3) +// CHECK: %hlsl.mul = {{.*}} call {{.*}} <8 x float> @llvm.matrix.multiply.v8f32.v6f32.v12f32(<6 x float> %{{.*}}, <12 x float> 
%{{.*}}, i32 2, i32 3, i32 4) +// COLMAJOR: ret <8 x float> %hlsl.mul +// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call {{.*}} <8 x float> @llvm.matrix.transpose.v8f32(<8 x float> %hlsl.mul, i32 2, i32 4) +// ROWMAJOR: ret <8 x float> %[[TRANSPOSE_RES]] export float2x4 test_mat_mat_mul(float2x3 a, float3x4 b) { return mul(a, b); } // -- Integer matrix multiply -- // CHECK-LABEL: test_mat_mat_muli -// CHECK: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %a, <12 x i32> %b, i32 2, i32 3, i32 4) -// CHECK: ret <8 x i32> %hlsl.mul +// ROWMAJOR: {{.*}} = {{.*}} call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %{{.*}}, i32 3, i32 2) +// ROWMAJOR: {{.*}} = {{.*}} call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> %{{.*}}, i32 4, i32 3) +// CHECK: %hlsl.mul = {{.*}} call <8 x i32> @llvm.matrix.multiply.v8i32.v6i32.v12i32(<6 x i32> %{{.*}}, <12 x i32> %{{.*}}, i32 2, i32 3, i32 4) +// COLMAJOR: ret <8 x i32> %hlsl.mul +// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call <8 x i32> @llvm.matrix.transpose.v8i32(<8 x i32> %hlsl.mul, i32 2, i32 4) +// ROWMAJOR: ret <8 x i32> %[[TRANSPOSE_RES]] export int2x4 test_mat_mat_muli(int2x3 a, int3x4 b) { return mul(a, b); } // -- Half-type overloads (native half) -- @@ -150,16 +162,22 @@ export half test_vec_vec_mulh(half3 a, half3 b) { return mul(a, b); } export half2x3 test_mat_scalar_mulh(half2x3 a, half b) { return mul(a, b); } // CHECK-LABEL: test_vec_mat_mulh -// CHECK: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %m, i32 1, i32 2, i32 3) +// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2) +// CHECK: %hlsl.mul = {{.*}}call {{.*}} <3 x half> @llvm.matrix.multiply.v3f16.v2f16.v6f16(<2 x half> %v, <6 x half> %{{.*}}, i32 1, i32 2, i32 3) // CHECK: ret <3 x half> %hlsl.mul export half3 test_vec_mat_mulh(half2 v, half2x3 m) { return mul(v, m); } // CHECK-LABEL: 
test_mat_vec_mulh -// CHECK: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %m, <3 x half> %v, i32 2, i32 3, i32 1) +// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2) +// CHECK: %hlsl.mul = {{.*}}call {{.*}} <2 x half> @llvm.matrix.multiply.v2f16.v6f16.v3f16(<6 x half> %{{.*}}, <3 x half> %v, i32 2, i32 3, i32 1) // CHECK: ret <2 x half> %hlsl.mul export half2 test_mat_vec_mulh(half2x3 m, half3 v) { return mul(m, v); } // CHECK-LABEL: test_mat_mat_mulh -// CHECK: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %a, <12 x half> %b, i32 2, i32 3, i32 4) -// CHECK: ret <8 x half> %hlsl.mul +// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <6 x half> @llvm.matrix.transpose.v6f16(<6 x half> %{{.*}}, i32 3, i32 2) +// ROWMAJOR: {{.*}} = {{.*}} call {{.*}} <12 x half> @llvm.matrix.transpose.v12f16(<12 x half> %{{.*}}, i32 4, i32 3) +// CHECK: %hlsl.mul = {{.*}}call {{.*}} <8 x half> @llvm.matrix.multiply.v8f16.v6f16.v12f16(<6 x half> %{{.*}}, <12 x half> %{{.*}}, i32 2, i32 3, i32 4) +// COLMAJOR: ret <8 x half> %hlsl.mul +// ROWMAJOR: %[[TRANSPOSE_RES:.*]] = {{.*}} call {{.*}} <8 x half> @llvm.matrix.transpose.v8f16(<8 x half> %hlsl.mul, i32 2, i32 4) +// ROWMAJOR: ret <8 x half> %[[TRANSPOSE_RES]] export half2x4 test_mat_mat_mulh(half2x3 a, half3x4 b) { return mul(a, b); } diff --git a/clang/test/CodeGenHLSL/builtins/transpose.hlsl b/clang/test/CodeGenHLSL/builtins/transpose.hlsl index 9018e3913269d..b0bb99c70fda2 100644 --- a/clang/test/CodeGenHLSL/builtins/transpose.hlsl +++ b/clang/test/CodeGenHLSL/builtins/transpose.hlsl @@ -1,42 +1,62 @@ -// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan1.3-library -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s +// RUN: 
%clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR +// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan1.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=column-major -o - %s | FileCheck %s --check-prefixes=CHECK,COLMAJOR +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR +// RUN: %clang_cc1 -finclude-default-header -triple spirv-unknown-vulkan1.3-library -emit-llvm -disable-llvm-passes -fmatrix-memory-layout=row-major -o - %s | FileCheck %s --check-prefixes=CHECK,ROWMAJOR // CHECK-LABEL: define {{.*}}test_transpose_bool2x3 -// CHECK: [[A_ADDR:%.*]] = alloca [3 x <2 x i32>], align 4 -// CHECK: [[A_EXT:%.*]] = zext <6 x i1> %{{.*}} to <6 x i32> -// CHECK: store <6 x i32> [[A_EXT]], ptr [[A_ADDR]], align 4 -// CHECK: [[A:%.*]] = load <6 x i32>, ptr [[A_ADDR]], align 4 -// CHECK: [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 2, i32 3) +// COLMAJOR: [[A_ADDR:%.*]] = alloca [3 x <2 x i32>], align 4 +// ROWMAJOR: [[A_ADDR:%.*]] = alloca [2 x <3 x i32>], align 4 +// CHECK: [[A_EXT:%.*]] = zext <6 x i1> %{{.*}} to <6 x i32> +// CHECK: store <6 x i32> [[A_EXT]], ptr [[A_ADDR]], align 4 +// CHECK: [[A:%.*]] = load <6 x i32>, ptr [[A_ADDR]], align 4 +// COLMAJOR: [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 2, i32 3) +// ROWMAJOR: [[LAYOUT:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[A]], i32 3, i32 2) +// ROWMAJOR: [[TRANS:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[LAYOUT]], i32 2, i32 3) +// ROWMAJOR: {{.*}} = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[TRANS]], i32 3, i32 2) bool3x2 test_transpose_bool2x3(bool2x3 a) { 
return transpose(a); } // CHECK-LABEL: define {{.*}}test_transpose_int4x3 -// CHECK: [[A_ADDR:%.*]] = alloca [3 x <4 x i32>], align 4 -// CHECK: store <12 x i32> %{{.*}}, ptr [[A_ADDR]], align 4 -// CHECK: [[A:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4 -// CHECK: [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 4, i32 3) -// CHECK: ret <12 x i32> [[TRANS]] +// COLMAJOR: [[A_ADDR:%.*]] = alloca [3 x <4 x i32>], align 4 +// ROWMAJOR: [[A_ADDR:%.*]] = alloca [4 x <3 x i32>], align 4 +// CHECK: store <12 x i32> %{{.*}}, ptr [[A_ADDR]], align 4 +// CHECK: [[A:%.*]] = load <12 x i32>, ptr [[A_ADDR]], align 4 +// COLMAJOR: [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 4, i32 3) +// COLMAJOR: ret <12 x i32> [[TRANS]] +// ROWMAJOR: [[LAYOUT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[A]], i32 3, i32 4) +// ROWMAJOR: [[TRANS:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[LAYOUT]], i32 4, i32 3) +// ROWMAJOR: [[RESULT:%.*]] = call <12 x i32> @llvm.matrix.transpose.v12i32(<12 x i32> [[TRANS]], i32 3, i32 4) +// ROWMAJOR: ret <12 x i32> [[RESULT]] int3x4 test_transpose_int4x3(int4x3 a) { return transpose(a); } // CHECK-LABEL: define {{.*}}test_transpose_float4x4 -// CHECK: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4 -// CHECK: store <16 x float> %{{.*}}, ptr [[A_ADDR]], align 4 -// CHECK: [[A:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4 -// CHECK: [[TRANS:%.*]] = call {{.*}}<16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4) -// CHECK: ret <16 x float> [[TRANS]] +// CHECK: [[A_ADDR:%.*]] = alloca [4 x <4 x float>], align 4 +// CHECK: store <16 x float> %{{.*}}, ptr [[A_ADDR]], align 4 +// CHECK: [[A:%.*]] = load <16 x float>, ptr [[A_ADDR]], align 4 +// COLMAJOR: [[TRANS:%.*]] = call {{.*}}<16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4) +// COLMAJOR: ret <16 x float> [[TRANS]] +// ROWMAJOR: 
[[LAYOUT:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[A]], i32 4, i32 4) +// ROWMAJOR: [[TRANS:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[LAYOUT]], i32 4, i32 4) +// ROWMAJOR: [[RESULT:%.*]] = call {{.*}} <16 x float> @llvm.matrix.transpose.v16f32(<16 x float> [[TRANS]], i32 4, i32 4) +// ROWMAJOR: ret <16 x float> [[RESULT]] float4x4 test_transpose_float4x4(float4x4 a) { return transpose(a); } // CHECK-LABEL: define {{.*}}test_transpose_double1x4 -// CHECK: [[A_ADDR:%.*]] = alloca [4 x <1 x double>], align 8 -// CHECK: store <4 x double> %{{.*}}, ptr [[A_ADDR]], align 8 -// CHECK: [[A:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 8 -// CHECK: [[TRANS:%.*]] = call {{.*}}<4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 1, i32 4) -// CHECK: ret <4 x double> [[TRANS]] +// COLMAJOR: [[A_ADDR:%.*]] = alloca [4 x <1 x double>], align 8 +// ROWMAJOR: [[A_ADDR:%.*]] = alloca [1 x <4 x double>], align 8 +// CHECK: store <4 x double> %{{.*}}, ptr [[A_ADDR]], align 8 +// CHECK: [[A:%.*]] = load <4 x double>, ptr [[A_ADDR]], align 8 +// COLMAJOR: [[TRANS:%.*]] = call {{.*}}<4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 1, i32 4) +// COLMAJOR: ret <4 x double> [[TRANS]] +// ROWMAJOR: [[LAYOUT:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[A]], i32 4, i32 1) +// ROWMAJOR: [[TRANS:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[LAYOUT]], i32 1, i32 4) +// ROWMAJOR: [[RESULT:%.*]] = call {{.*}} <4 x double> @llvm.matrix.transpose.v4f64(<4 x double> [[TRANS]], i32 4, i32 1) +// ROWMAJOR: ret <4 x double> [[RESULT]] double4x1 test_transpose_double1x4(double1x4 a) { return transpose(a); } diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h index 5c891d644bfd2..b161e79a1cb6b 100644 --- a/llvm/include/llvm/IR/MatrixBuilder.h +++ b/llvm/include/llvm/IR/MatrixBuilder.h @@ -141,6 +141,22 
@@ class MatrixBuilder { return B.CreateCall(TheFn->getFunctionType(), TheFn, Ops, Name); } + /// Create a column-major matrix from a row-major matrix with the given + /// logical dimensions by transposing it. + CallInst *CreateRowMajorToColumnMajorTransform(Value *Matrix, unsigned Rows, + unsigned Columns, + const Twine &Name = "") { + return CreateMatrixTranspose(Matrix, Columns, Rows, Name); + } + + /// Create a row-major matrix from a column-major matrix with the given + /// logical dimensions by transposing it. + CallInst *CreateColumnMajorToRowMajorTransform(Value *Matrix, unsigned Rows, + unsigned Columns, + const Twine &Name = "") { + return CreateMatrixTranspose(Matrix, Rows, Columns, Name); + } + /// Insert a single element \p NewVal into \p Matrix at indices (\p RowIdx, \p /// ColumnIdx). Value *CreateMatrixInsert(Value *Matrix, Value *NewVal, Value *RowIdx, `````````` </details> https://github.com/llvm/llvm-project/pull/186898 _______________________________________________ cfe-commits mailing list cfe-commits@lists.llvm.org https://lists.llvm.org/cgi-bin/mailman/listinfo/cfe-commits
