This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new c940502658 [SYSTEMDS-3773] Fix two matmult kernels w/ parallelization over rhs
c940502658 is described below
commit c94050265827deab65200c1b2ba4bad60f6d6f96
Author: Matthias Boehm <[email protected]>
AuthorDate: Sun Sep 22 15:50:25 2024 +0200
[SYSTEMDS-3773] Fix two matmult kernels w/ parallelization over rhs
The matmult kernel library parallelizes by default over rows in the
left-hand-side matrix, but for specific size regimes, switches to a
parallelization over the rows or columns of the right-hand-side. The
recently added full-coverage tests found two bugs, which this patch fixes:
a) dense-dense matrix-vector multiplication w/ large vectors
   --> extended the implementation to support the other parallelization
b) sparse-dense vector-vector dot product
   --> disabled parallelization for this specific case so that the
       existing kernel without binary searches is used
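
For illustration, below is a minimal standalone sketch of the two partitioning
strategies described above. It is not the SystemDS kernel code; the class
MatVecParSketch and its multiply method are invented for this example. By
default, each task owns a disjoint range of output rows. With pm2
("parallelization over rhs"), each task instead covers a slice of the common
dimension and therefore produces partial dot products for all output rows,
which must be aggregated afterwards; this is the case the tall-RHS kernel in
(a) now handles via the pm2 flag.

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;
    import java.util.concurrent.Future;

    public class MatVecParSketch {
        // y = A * x with k threads; pm2 selects parallelization over the
        // common dimension (rows of the rhs vector) instead of output rows
        public static double[] multiply(double[][] A, double[] x, int k, boolean pm2)
            throws Exception
        {
            int m = A.length, cd = x.length;
            double[] y = new double[m];
            ExecutorService pool = Executors.newFixedThreadPool(k);
            try {
                int num = pm2 ? cd : m;                  // partitioned dimension
                int blk = (int) Math.ceil((double) num / k);
                List<Future<double[]>> tasks = new ArrayList<>();
                for (int lb = 0; lb < num; lb += blk) {
                    final int rl = lb, ru = Math.min(lb + blk, num);
                    tasks.add(pool.submit(() -> {
                        // pm2: all output rows, dot-product slice [rl,ru)
                        // default: output rows [rl,ru), full dot product
                        int rl2 = pm2 ? 0 : rl, ru2 = pm2 ? m : ru;
                        int cl = pm2 ? rl : 0, cu = pm2 ? ru : cd;
                        double[] part = new double[m];
                        for (int i = rl2; i < ru2; i++)
                            for (int j = cl; j < cu; j++)
                                part[i] += A[i][j] * x[j];
                        return part;
                    }));
                }
                for (Future<double[]> f : tasks) {       // aggregate partial results
                    double[] part = f.get();
                    for (int i = 0; i < m; i++)
                        y[i] += part[i];
                }
            }
            finally {
                pool.shutdown();
            }
            return y;
        }
    }

The actual kernel additionally applies cache blocking over the vector (the
blocksizeI/blocksizeK loops in the hunk below).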
---
.../sysds/runtime/matrix/data/LibMatrixMult.java | 21 +++++++++++++--------
.../component/matrix/MatrixMultiplyKernelTest.java | 21 +++++++++++++--------
2 files changed, 26 insertions(+), 16 deletions(-)
diff --git a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 328b43d6ca..c4eddd90fa 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -279,7 +279,7 @@ public class LibMatrixMult
boolean pm2r = !ultraSparse && !sparse && checkParMatrixMultRightInputRows(m1, m2, k);
boolean pm2c = !ultraSparse && checkParMatrixMultRightInputCols(m1, m2, k, pm2r);
int num = pm2r ? m2.rlen : pm2c ? m2.clen : m1.rlen;
-
+
// core multi-threaded matrix mult computation
// (currently: always parallelization over number of rows)
final ExecutorService pool = CommonThreadPool.get(k);
@@ -1052,7 +1052,7 @@ public class LibMatrixMult
matrixMultDenseDenseMVShortRHS(a, b, c, cd, rl, ru);
}
else if( n==1 ) { //MATRIX-VECTOR (tall rhs)
- matrixMultDenseDenseMVTallRHS(a, b, c, cd, rl, ru);
+ matrixMultDenseDenseMVTallRHS(a, b, c, pm2, cd, rl, ru);
}
else if( pm2 && m==1 ) { //VECTOR-MATRIX
matrixMultDenseDenseVM(a, b, c, n, cd, rl, ru);
@@ -1075,15 +1075,20 @@ public class LibMatrixMult
cvals[i] = dotProduct(a.values(i), bvals, a.pos(i), 0, cd);
}
- private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, DenseBlock b, DenseBlock c, int cd, int rl, int ru) {
+ private static void matrixMultDenseDenseMVTallRHS(DenseBlock a, DenseBlock b, DenseBlock c, boolean pm2, int cd, int rl, int ru) {
final int blocksizeI = 32;
final int blocksizeK = 2*1024; //16KB vector blocks (L1)
double[] bvals = b.valuesAt(0);
double[] cvals = c.valuesAt(0);
- for( int bi=rl; bi<ru; bi+=blocksizeI ) {
- int bimin = Math.min(bi+blocksizeI, ru);
- for( int bk=0; bk<cd; bk+=blocksizeK ) {
- int bkmin = Math.min(bk+blocksizeK, cd);
+ // setup bounds according to parallelization strategy
+ // (default: rows in lhs, pm2: rows in rhs)
+ int cl = pm2 ? rl : 0, cu = pm2 ? ru : cd;
+ int rl2 = pm2 ? 0 : rl, ru2 = pm2 ? a.numRows() : ru;
+ // matrix-vector multiplication with cache blocking of vector
+ for( int bi=rl2; bi<ru2; bi+=blocksizeI ) {
+ int bimin = Math.min(bi+blocksizeI, ru2);
+ for( int bk=cl; bk<cu; bk+=blocksizeK ) {
+ int bkmin = Math.min(bk+blocksizeK, cu);
for( int i=bi; i<bimin; i++)
cvals[i] += dotProduct(a.values(i), bvals, a.pos(i,bk), bk, bkmin-bk);
}
@@ -4349,7 +4354,7 @@ public class LibMatrixMult
private static boolean checkParMatrixMultRightInputRows( MatrixBlock m1, MatrixBlock m2, int k ) {
//parallelize over rows in rhs matrix if number of rows in lhs/output is very small
double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
- return (m1.rlen==1 && !(m1.isUltraSparse()||m2.isUltraSparse()))
+ return (m1.rlen==1 && !(m1.sparse && m2.clen==1) && !(m1.isUltraSparse()||m2.isUltraSparse()))
|| (m1.rlen<=16 && m2.rlen > m1.rlen && (!m1.sparse | m2.clen > 1)
&& ( !m1.isUltraSparse() && !(m1.sparse & m2.sparse) ) //dense-dense / sparse-dense / dense-sparse
&& (long)k * 8 * m1.rlen * m2.clen < Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem) );
diff --git a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyKernelTest.java b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyKernelTest.java
index 9975213762..1d51e6fd8e 100644
--- a/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyKernelTest.java
+++ b/src/test/java/org/apache/sysds/test/component/matrix/MatrixMultiplyKernelTest.java
@@ -51,10 +51,15 @@ public class MatrixMultiplyKernelTest {
testMatrixMultiply(MIN_PAR, 16, 1, 1, 1);
}
-// @Test //FIXME
-// public void testDenseDenseMatrixLargeVector() {
-// testMatrixMultiply(16, MIN_PAR, 1, 1, 1);
-// }
+ @Test //parallelization over rows in lhs
+ public void testDenseDenseMatrixLargeVector() {
+ testMatrixMultiply(4000, 3000, 1, 1, 1);
+ }
+
+ @Test //parallelization over rows in rhs
+ public void testDenseDenseMatrixLargeVectorPm2() {
+ testMatrixMultiply(16, MIN_PAR, 1, 1, 1);
+ }
@Test
public void testDenseDenseVectorMatrix() {
@@ -90,10 +95,10 @@ public class MatrixMultiplyKernelTest {
// sparse-dense kernels
-// @Test FIXME
-// public void testSparseDenseDotProduct() {
-// testMatrixMultiply(1, MIN_PAR, 1, 0.1, 1);
-// }
+ @Test
+ public void testSparseDenseDotProduct() {
+ testMatrixMultiply(1, MIN_PAR, 1, 0.1, 1);
+ }
@Test
public void testSparseDenseMatrixSmallVector() {