(systemds) branch main updated: [SYSTEMDS-3168] Matrix Multiplication Transposed Kernels

mboehm7 Sun, 29 Mar 2026 04:12:15 -0700

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git



The following commit(s) were added to refs/heads/main by this push:
     new ea00f88b60 [SYSTEMDS-3168] Matrix Multiplication Transposed Kernels
ea00f88b60 is described below

commit ea00f88b60c2f14207c4dcf537533373f24b0516
Author: Elman Jahangiri <[email protected]>
AuthorDate: Sun Mar 29 12:45:35 2026 +0200

    [SYSTEMDS-3168] Matrix Multiplication Transposed Kernels
    
    Optimize dense matrix mult for transposed inputs
    
    This introduces specialized kernels for dense matrix multiplication
    involving transposed inputs (t(A)%*%B, A%*%t(B), t(A)%*%t(B)).
    Previously, these operations required an explicit intermediate transpose
    step, which caused unnecessary runtime.
    
    The new kernels perform the operations in-place or using
    tiled-transposition,
    avoiding the full allocation cost.
    
    Performance benchmarks on 100x100 dense matrices show significant
    speedups especially for t(A)%*%B
    and t(A)%*%t(B) and can be tested with higher dimensions.
    
    Closes #2425.
---
 .../java/org/apache/sysds/hops/OptimizerUtils.java |  10 +-
 .../apache/sysds/hops/rewrite/ProgramRewriter.java |  36 ++---
 .../sysds/runtime/matrix/data/LibMatrixMult.java   | 161 +++++++++++++++++++++
 .../MatrixMultTransposedPerformanceTest.java       | 107 ++++++++++++++
 .../matrixmult/MatrixMultTransposedTest.java       |  91 ++++++++++++
 5 files changed, 382 insertions(+), 23 deletions(-)

diff --git a/src/main/java/org/apache/sysds/hops/OptimizerUtils.java 
b/src/main/java/org/apache/sysds/hops/OptimizerUtils.java
index f2d26be535..a02a550350 100644
--- a/src/main/java/org/apache/sysds/hops/OptimizerUtils.java
+++ b/src/main/java/org/apache/sysds/hops/OptimizerUtils.java
@@ -335,12 +335,12 @@ public class OptimizerUtils
 
        public static boolean AUTO_GPU_CACHE_EVICTION = true;
 
-    /**
-     * Boolean specifying if relational algebra rewrites are allowed (e.g. 
Selection Pushdowns).
-     */
-    public static boolean ALLOW_RA_REWRITES = false;
+       /**
+        * Boolean specifying if relational algebra rewrites are allowed (e.g. 
Selection Pushdowns).
+        */
+       public static boolean ALLOW_RA_REWRITES = false;
 
-    //////////////////////
+       //////////////////////
        // Optimizer levels //
        //////////////////////
 
diff --git a/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java 
b/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java
index 927433c8c7..49c85191d2 100644
--- a/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java
+++ b/src/main/java/org/apache/sysds/hops/rewrite/ProgramRewriter.java
@@ -103,27 +103,27 @@ public class ProgramRewriter{
                                _sbRuleSet.add(  new 
RewriteMergeBlockSequence()                 ); //dependency: remove branches, 
remove for-loops
                        if(OptimizerUtils.ALLOW_COMPRESSION_REWRITE)
                                _sbRuleSet.add(      new 
RewriteCompressedReblock()              ); // Compression Rewrite
-                       if( OptimizerUtils.ALLOW_SPLIT_HOP_DAGS )
-                               _sbRuleSet.add(  new 
RewriteSplitDagUnknownCSVRead()             ); //dependency: reblock, merge 
blocks
-                       if( OptimizerUtils.ALLOW_SPLIT_HOP_DAGS && 
-                               
ConfigurationManager.getCompilerConfigFlag(ConfigType.ALLOW_INDIVIDUAL_SB_SPECIFIC_OPS)
 )
-                               _sbRuleSet.add(  new 
RewriteSplitDagDataDependentOperators()     ); //dependency: merge blocks
-                       if( OptimizerUtils.ALLOW_AUTO_VECTORIZATION )
+                       if( OptimizerUtils.ALLOW_SPLIT_HOP_DAGS )
+                               _sbRuleSet.add(  new 
RewriteSplitDagUnknownCSVRead()             ); //dependency: reblock, merge 
blocks
+                       if( OptimizerUtils.ALLOW_SPLIT_HOP_DAGS && 
+                               
ConfigurationManager.getCompilerConfigFlag(ConfigType.ALLOW_INDIVIDUAL_SB_SPECIFIC_OPS)
 )
+                               _sbRuleSet.add(  new 
RewriteSplitDagDataDependentOperators()     ); //dependency: merge blocks
+                       if( OptimizerUtils.ALLOW_AUTO_VECTORIZATION )
                                _sbRuleSet.add(  new 
RewriteForLoopVectorization()               ); //dependency: reblock (reblockop)
-                       _sbRuleSet.add( new 
RewriteInjectSparkLoopCheckpointing(true)        ); //dependency: reblock 
(blocksizes)
-                       if( OptimizerUtils.ALLOW_CODE_MOTION )
-                               _sbRuleSet.add(  new 
RewriteHoistLoopInvariantOperations()       ); //dependency: vectorize, but 
before inplace
-                       if( OptimizerUtils.ALLOW_LOOP_UPDATE_IN_PLACE )
-                               _sbRuleSet.add(  new 
RewriteMarkLoopVariablesUpdateInPlace()     );
-                       if( LineageCacheConfig.getCompAssRW() )
-                               _sbRuleSet.add(  new MarkForLineageReuse()      
                 );
-            if( OptimizerUtils.ALLOW_RA_REWRITES )
-                _sbRuleSet.add(  new RewriteRaPushdown()                       
  );
-                       _sbRuleSet.add(      new 
RewriteRemoveTransformEncodeMeta()          );
-                       _dagRuleSet.add( new RewriteNonScalarPrint()            
             );
+                       _sbRuleSet.add( new 
RewriteInjectSparkLoopCheckpointing(true)        ); //dependency: reblock 
(blocksizes)
+                       if( OptimizerUtils.ALLOW_CODE_MOTION )
+                               _sbRuleSet.add(  new 
RewriteHoistLoopInvariantOperations()       ); //dependency: vectorize, but 
before inplace
+                       if( OptimizerUtils.ALLOW_LOOP_UPDATE_IN_PLACE )
+                               _sbRuleSet.add(  new 
RewriteMarkLoopVariablesUpdateInPlace()     );
+                       if( LineageCacheConfig.getCompAssRW() )
+                               _sbRuleSet.add(  new MarkForLineageReuse()      
                 );
+                       if( OptimizerUtils.ALLOW_RA_REWRITES )
+                               _sbRuleSet.add(  new RewriteRaPushdown()        
                 );
+                       _sbRuleSet.add(      new 
RewriteRemoveTransformEncodeMeta()          );
+                       _dagRuleSet.add( new RewriteNonScalarPrint()            
             );
                        if( OptimizerUtils.ALLOW_JOIN_REORDERING_REWRITE )
                                _sbRuleSet.add( new RewriteJoinReordering() );
-               }
+               }
                
                // DYNAMIC REWRITES (which do require size information)
                if( dynamicRewrites )
diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index b638c7771b..9763e4ea57 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -45,6 +45,7 @@ import org.apache.sysds.lops.WeightedSquaredLoss.WeightsType;
 import org.apache.sysds.lops.WeightedUnaryMM.WUMMType;
 import org.apache.sysds.runtime.DMLRuntimeException;
 import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.DenseBlockFP64;
 import org.apache.sysds.runtime.data.DenseBlockFP64DEDUP;
 import org.apache.sysds.runtime.data.DenseBlockFactory;
 import org.apache.sysds.runtime.data.SparseBlock;
@@ -1026,6 +1027,166 @@ public class LibMatrixMult
        // optimized matrix mult implementation //
        //////////////////////////////////////////
 
+       public static void matrixMultDenseDenseMM(DenseBlock a, DenseBlock b, 
DenseBlock c, boolean transA, boolean transB, int n, int cd, int rl, int ru, 
int cl, int cu) {
+               // C = A %*% B
+               if (!transA && !transB)
+                       matrixMultDenseDenseMM(a, b, c, n, cd, rl, ru, cl, cu);
+               // C = t(A) %*% B
+               else if (transA && !transB)
+                       multDenseDenseTransA(a, b, c, n, cd, rl, ru, cl, cu);
+               // C = A %*% t(B)
+               else if (!transA && transB)
+                       multDenseDenseTransB(a, b, c, n, cd, rl, ru, cl, cu);
+               // C = t(A) %*% t(B)
+               else if (transA && transB)
+                       multDenseDenseTransATransB(a, b, c, n, cd, rl, ru, cl, 
cu);
+       }
+
+       private static void multDenseDenseTransA(DenseBlock a, DenseBlock b, 
DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) {
+               // process matrices in small blocks for caching
+               final int blocksizeI = 32;
+               final int blocksizeK = 24;
+               final int blocksizeJ = 1024;
+
+               // iterate over block of C rows
+               for (int bi = rl; bi < ru; bi += blocksizeI) {
+                       int bimin = Math.min(ru, bi + blocksizeI);
+
+                       // iterate over blocks of common dimension k
+                       for (int bk = 0; bk < cd; bk += blocksizeK) {
+                               int bkmin = Math.min(cd, bk + blocksizeK);
+
+                               // iterate over blocks of C columns
+                               for (int bj = cl; bj < cu; bj += blocksizeJ) {
+                                       int bjmin = Math.min(cu, bj + 
blocksizeJ);
+                                       int lenJ = bjmin - bj;
+
+                                       // if B is a single contiguous array, 
we skip checks inside the loop
+                                       if (b.isContiguous()) {
+                                               double[] bvals = b.values(0);
+
+                                               int k = bk;
+                                               // process 4 rows of A at the 
same time
+                                               for (; k < bkmin - 3; k += 4) {
+                                                       int bix0 = b.pos(k, bj);
+                                                       int bix1 = b.pos(k+1, 
bj);
+                                                       int bix2 = b.pos(k+2, 
bj);
+                                                       int bix3 = b.pos(k+3, 
bj);
+
+                                                       for (int i = bi; i < 
bimin; i++) {
+                                                               // grab 4 
values from A
+                                                               double val0 = 
a.values(k)[a.pos(k) + i];
+                                                               double val1 = 
a.values(k+1)[a.pos(k+1) + i];
+                                                               double val2 = 
a.values(k+2)[a.pos(k+2) + i];
+                                                               double val3 = 
a.values(k+3)[a.pos(k+3) + i];
+
+                                                               double[] cvals 
= c.values(i);
+                                                               int cix = 
c.pos(i, bj);
+
+                                                               
vectMultiplyAdd4(val0, val1, val2, val3,
+                                                                       bvals, 
cvals,
+                                                                       bix0, 
bix1, bix2, bix3, cix, lenJ);
+                                                       }
+                                               }
+                                               // for the remaining rows
+                                               for (; k < bkmin; k++) {
+                                                       int bix = b.pos(k, bj);
+                                                       for (int i = bi; i < 
bimin; i++) {
+                                                               double val = 
a.values(k)[a.pos(k) + i];
+                                                               if (val != 0) {
+                                                                       
vectMultiplyAdd(val, bvals, c.values(i), bix, c.pos(i, bj), lenJ);
+                                                               }
+                                                       }
+                                               }
+                                       } else {
+                                               for (int k = bk; k < bkmin; 
k++) {
+                                                       for (int i = bi; i < 
bimin; i++) {
+                                                               double val = 
a.values(k)[a.pos(k) + i];
+                                                               if (val != 0) {
+                                                                       
vectMultiplyAdd(val, b.values(k), c.values(i),
+                                                                               
b.pos(k, bj), c.pos(i, bj), lenJ);
+                                                               }
+                                                       }
+                                               }
+                                       }
+                               }
+                       }
+               }
+       }
+
+       private static void multDenseDenseTransB(DenseBlock a, DenseBlock b, 
DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) {
+               // copy small blocks of B into buffer bufB
+               final int blocksizeK = 24;
+               double[] bufB = new double[blocksizeK * (cu - cl)];
+
+               for (int bk = 0; bk < cd; bk += blocksizeK) {
+                       int bkmin = Math.min(cd, bk + blocksizeK);
+                       int bklen = bkmin - bk;
+
+                       // put B into buffer while transposing
+                       for (int j = cl; j < cu; j++) {
+                               double[] bvals = b.values(j);
+                               int bpos = b.pos(j);
+
+                               for (int k = 0; k < bklen; k++) {
+                                       bufB[k * (cu-cl) + (j-cl)] = bvals[bpos 
+ bk + k];
+                               }
+                       }
+
+                       // perform matrix multiplication with buffer
+                       for (int i = rl; i < ru; i++) {
+                               double[] avals = a.values(i);
+                               int apos = a.pos(i);
+                               double[] cvals = c.values(i);
+                               int cix = c.pos(i, cl);
+
+                               for (int k = 0; k < bklen; k++) {
+                                       double val = avals[apos + bk + k];
+                                       if (val != 0) {
+                                               int bufIx = k * (cu-cl);
+                                               vectMultiplyAdd(val, bufB, 
cvals, bufIx, cix, cu - cl);
+                                       }
+                               }
+                       }
+               }
+       }
+
+       private static void multDenseDenseTransATransB(DenseBlock a, DenseBlock 
b, DenseBlock c, int n, int cd, int rl, int ru, int cl, int cu) {
+               // transpose B into temp Block B
+               // use C = t(A) * B from above as helper method
+
+               // allocate Block for transposing B
+               int tB_rows = cd;
+               int tB_cols = cu - cl;
+
+               // allocate new denseBlock for result matrix
+               DenseBlock tB_block = new DenseBlockFP64(new int[] {tB_rows, 
tB_cols});
+               double[] tB = tB_block.values(0);
+
+               // perform transpose from B to tB_block
+               final int BLOCK = 128;
+               for (int bi = cl; bi < cu; bi += BLOCK) {
+                       int bimin = Math.min(cu, bi + BLOCK);
+                       for (int bk = 0; bk < cd; bk += BLOCK) {
+                               int bkmin = Math.min(cd, bk + BLOCK);
+
+                               for (int j = bi; j < bimin; j++) {
+                                       double[] b_vals = b.values(j);
+                                       int b_pos = b.pos(j);
+
+                                       int tB_col_idx = (j - cl);
+
+                                       for (int k = bk; k < bkmin; k++) {
+                                               tB[k * tB_cols + tB_col_idx] = 
b_vals[b_pos + k];
+                                       }
+                               }
+                       }
+               }
+               // reuse our existing method
+               multDenseDenseTransA(a, tB_block, c, n, cd, rl, ru, 0, tB_cols);
+       }
+
+
        private static void matrixMultDenseDense(MatrixBlock m1, MatrixBlock 
m2, MatrixBlock ret, boolean tm2, boolean pm2, int rl, int ru, int cl, int cu) {
                DenseBlock a = m1.getDenseBlock();
                DenseBlock b = m2.getDenseBlock();
diff --git 
a/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java
 
b/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java
new file mode 100644
index 0000000000..18d62251a0
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedPerformanceTest.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.matrixmult;
+
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.junit.Test;
+
+public class MatrixMultTransposedPerformanceTest {
+       // could be adjusted, as it takes a lot of runtime with higher 
dimensions
+       private final int m = 200;
+       private final int n = 200;
+       private final int k = 200;
+
+       @Test
+       public void testPerf_1_NoTransA_TransB() {
+               System.out.println("Case: C = A %*% t(B)");
+               runTest(false, true);
+               System.out.println();
+       }
+
+       @Test
+       public void testPerf_2_TransA_NoTransB() {
+               System.out.println("Case: C = t(A) %*% B");
+               runTest(true, false);
+               System.out.println();
+       }
+
+       @Test
+       public void testPerf_3_TransA_TransB() {
+               System.out.println("Case: C = t(A) %*% t(B)");
+               runTest(true, true);
+       }
+
+       private void runTest(boolean tA, boolean tB) {
+               int REP = 100;
+
+               // setup Dimensions
+               int rowsA = tA ? k : m;
+               int colsA = tA ? m : k;
+               int rowsB = tB ? n : k;
+               int colsB = tB ? k : n;
+
+               // generate random matrices
+               MatrixBlock A = MatrixBlock.randOperations(rowsA, colsA, 1.0, 
-1, 1, "uniform", 7);
+               MatrixBlock B = MatrixBlock.randOperations(rowsB, colsB, 1.0, 
-1, 1, "uniform", 3);
+               MatrixBlock C = new MatrixBlock(m, n, false);
+               C.allocateDenseBlock();
+
+               for(int i=0; i<50; i++) {
+                       runOldMethod(A, B, tA, tB);
+                       runNewKernel(A, B, C, tA, tB);
+               }
+
+               // Measure Old Method
+               long startTimeOld = System.nanoTime();
+               for(int i = 0; i < REP; i++) {
+                       runOldMethod(A, B, tA, tB);
+               }
+               double avgTimeOld = (System.nanoTime() - startTimeOld) / 1e6 / 
REP;
+
+               // Measure New Kernel
+               double startTimeNew = System.nanoTime();
+               for(int i = 0; i < REP; i++) {
+                       runNewKernel(A, B, C, tA, tB);
+               }
+               double avgTimeNew = (System.nanoTime() - startTimeNew) / 1e6 / 
REP;
+
+               // print results comparison
+               System.out.printf("Old Method: %.3f ms | New Kernel: %.3f 
ms%n", avgTimeOld, avgTimeNew);
+       }
+
+       private void runNewKernel(MatrixBlock A, MatrixBlock B, MatrixBlock C, 
boolean tA, boolean tB) {
+               C.reset();
+               LibMatrixMult.matrixMultDenseDenseMM(A.getDenseBlock(), 
B.getDenseBlock(), C.getDenseBlock(), tA, tB, m, k, 0, m, 0, n);
+       }
+
+       private void runOldMethod(MatrixBlock A, MatrixBlock B, boolean tA, 
boolean tB) {
+               // do transpose if needed
+               MatrixBlock A_in = tA ? LibMatrixReorg.transpose(A) : A;
+               MatrixBlock B_in = tB ? LibMatrixReorg.transpose(B) : B;
+
+               MatrixBlock C = new MatrixBlock(m, n, false);
+               C.allocateDenseBlock();
+
+               LibMatrixMult.matrixMultDenseDenseMM(A_in.getDenseBlock(), 
B_in.getDenseBlock(), C.getDenseBlock(), false,
+                       false, m, k, 0, m, 0, n);
+       }
+}
diff --git 
a/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedTest.java
 
b/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedTest.java
new file mode 100644
index 0000000000..6c18df009f
--- /dev/null
+++ 
b/src/test/java/org/apache/sysds/test/component/matrixmult/MatrixMultTransposedTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.component.matrixmult;
+
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.matrix.data.LibMatrixMult;
+import org.apache.sysds.runtime.matrix.data.LibMatrixReorg;
+import org.apache.sysds.runtime.matrix.data.MatrixBlock;
+import org.apache.sysds.test.TestUtils;
+import org.junit.Test;
+
+import java.util.Random;
+
+public class MatrixMultTransposedTest {
+
+    // run multiple random scenarios
+    @Test
+    public void testCase_noTransA_TransB() {
+        for(int i=0; i<10; i++) {
+            runTest(false, true);
+        }
+    }
+
+    @Test
+    public void testCase_TransA_NoTransB() {
+        for(int i=0; i<10; i++) {
+            runTest(true, false);
+        }
+    }
+
+    @Test
+    public void testCase_TransA_TransB() {
+        for(int i=0; i<10; i++) {
+            runTest(true, true);
+        }
+    }
+
+    private void runTest(boolean tA, boolean tB) {
+        Random rand = new Random();
+
+        // generate random dimensions between 1 and 300
+        int m = rand.nextInt(300) + 1;
+        int n = rand.nextInt(300) + 1;
+        int k = rand.nextInt(300) + 1;
+
+
+        int rowsA = tA ? k : m;
+        int colsA = tA ? m : k;
+        int rowsB = tB ? n : k;
+        int colsB = tB ? k : n;
+
+        MatrixBlock ma = MatrixBlock.randOperations(rowsA, colsA, 1.0, -1, 1, 
"uniform", 7);
+        MatrixBlock mb = MatrixBlock.randOperations(rowsB, colsB, 1.0, -1, 1, 
"uniform", 3);
+
+        MatrixBlock mc = new MatrixBlock(m, n, false);
+        mc.allocateDenseBlock();
+
+        DenseBlock a = ma.getDenseBlock();
+        DenseBlock b = mb.getDenseBlock();
+        DenseBlock c = mc.getDenseBlock();
+
+        LibMatrixMult.matrixMultDenseDenseMM(a, b, c, tA, tB, n, k, 0, m, 0, 
n);
+
+        mc.recomputeNonZeros();
+
+        // calc true result with existing methods
+        MatrixBlock ma_in = tA ? LibMatrixReorg.transpose(ma) : ma;
+        MatrixBlock mb_in = tB ? LibMatrixReorg.transpose(mb) : mb;
+        MatrixBlock expected = LibMatrixMult.matrixMult(ma_in, mb_in);
+
+        // compare results
+        TestUtils.compareMatrices(expected, mc, 1e-8);
+    }
+}

(systemds) branch main updated: [SYSTEMDS-3168] Matrix Multiplication Transposed Kernels

Reply via email to