This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 07486b8  [SYSTEMDS-3042] Performance matrix multiplication w/ short lhs
07486b8 is described below

commit 07486b84f725ce697d99340c3511d4bf7745ebd1
Author: Matthias Boehm <[email protected]>
AuthorDate: Sun Jun 27 15:51:14 2021 +0200

    [SYSTEMDS-3042] Performance matrix multiplication w/ short lhs
    
    While vector-matrix multiplication was already well-optimized, there was
    unnecessary inefficiency with short left hand sides (e.g., left hand
    sides with 2<x<=16 rows) where parallelization switched to rows in the
    lhs and thus did not fully exploit the available degree of parallelism
    and did not utilize the available memory bandwidth wisely (redundant
    scans). Instead, we now do a more careful selection when to parallelize
    over the common dimension.
    
    On a machine with 112 vcores and 750GB mem, the following script shows
    performance improvements of ~10x for several important sizes:
    
    X = rand(rows=1e6, cols=1e3); #8GB
    for(i in seq(1,16)) {
      y = rand(rows=i, cols=1e6)
      t1 = time();
      for(j in 1:100)
        q = y %*% X;
      print("matrix multiplication ["+i+"]: "+(time()-t1)/1e11)
    }
    print(sum(q))
    
    Results [in seconds]:
    matrix multiplication [1]: 0.12487190379
    matrix multiplication [2]: 0.12739698981
    matrix multiplication [3]: 1.14 -> 0.12981205165
    matrix multiplication [4]: 1.19 -> 0.13536354042
    matrix multiplication [5]: 1.21 -> 0.14075080461
    matrix multiplication [6]: 1.30 -> 0.14592031624
    matrix multiplication [7]: 1.25 -> 0.15245149887
    matrix multiplication [8]: 1.23 -> 0.1576228154
    matrix multiplication [9]: 1.27 -> 0.16084557115
    matrix multiplication [10]: 1.23 -> 0.16499888784
    matrix multiplication [11]: 1.23 -> 0.15556115622
    matrix multiplication [12]: 1.23 -> 0.13199431345
    matrix multiplication [13]: 1.25 -> 0.1411010925
    matrix multiplication [14]: 1.36 -> 0.14866302508
    matrix multiplication [15]: 1.24 -> 0.16179848953
    matrix multiplication [16]: 1.28 -> 0.17091530956
---
 src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
index 3ff1bb9..dd88ec9 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixMult.java
@@ -3880,10 +3880,11 @@ public class LibMatrixMult
        
        private static boolean checkParMatrixMultRightInputRows( MatrixBlock 
m1, MatrixBlock m2, int k ) {
                //parallelize over rows in rhs matrix if number of rows in 
lhs/output is very small
+               double jvmMem = InfrastructureAnalyzer.getLocalMaxMemory();
                return (m1.rlen==1 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 && 
!(m1.isUltraSparse()||m2.isUltraSparse()))
                        || (m1.rlen<=16 && LOW_LEVEL_OPTIMIZATION && m2.clen>1 
&& m2.rlen > m1.rlen 
                           && ( !m1.isUltraSparse() && !m2.sparse ) 
//dense-dense / sparse/dense
-                          && (long)k * 8 * m1.rlen * m2.clen < 
MEM_OVERHEAD_THRESHOLD ); 
+                          && (long)k * 8 * m1.rlen * m2.clen < 
Math.max(MEM_OVERHEAD_THRESHOLD,0.01*jvmMem) );
        }
 
        private static boolean checkParMatrixMultRightInputCols( MatrixBlock 
m1, MatrixBlock m2, int k, boolean pm2r ) {

Reply via email to