This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 265329c  [SYSTEMDS-2266] Fix native BLAS dgemm/dsyrk for large dense 
blocks >16GB
265329c is described below

commit 265329cf240b728ace5cc5f93426d24de397c20f
Author: Matthias Boehm <[email protected]>
AuthorDate: Sun Jun 6 17:45:47 2021 +0200

    [SYSTEMDS-2266] Fix native BLAS dgemm/dsyrk for large dense blocks >16GB
    
    This patch adds the missing support for native BLAS matrix
    multiplications (DGEMM, and DSYRK aka TSMM) for large dense blocks that
    are internally composed of multiple arrays, each with up to MAX_INT
    (2^31-1) cells. Due to issues with concurrent multi-threaded native
    BLAS operations (e.g., w/ Intel MKL) we sequentially execute these
    independent block operations.
---
 .../sysds/runtime/matrix/data/LibMatrixNative.java | 50 +++++++++++++++++++---
 1 file changed, 43 insertions(+), 7 deletions(-)

diff --git 
a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java 
b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
index 9a55111..5c92253 100644
--- a/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysds/runtime/matrix/data/LibMatrixNative.java
@@ -30,6 +30,8 @@ import org.apache.sysds.api.DMLScript;
 import org.apache.sysds.conf.ConfigurationManager;
 import org.apache.sysds.conf.DMLConfig;
 import org.apache.sysds.hops.OptimizerUtils;
+import org.apache.sysds.runtime.data.DenseBlock;
+import org.apache.sysds.runtime.data.DenseBlockFactory;
 import org.apache.sysds.utils.NativeHelper;
 import org.apache.sysds.utils.Statistics;
 
@@ -81,7 +83,8 @@ public class LibMatrixNative
                
                boolean isValidForNative = !isMatMultMemoryBound(m1.rlen, 
m1.clen, m2.clen) 
                        && !m1.isInSparseFormat() && !m2.isInSparseFormat()
-                       && m1.getDenseBlock().isContiguous() && 
m2.getDenseBlock().isContiguous() //contiguous but not allocated
+                       && (m1.getDenseBlock().isContiguous() || 
!isSinglePrecision())
+                       && m2.getDenseBlock().isContiguous() //contiguous but 
not allocated
                        && 8L * ret.getLength() < Integer.MAX_VALUE;
 
                if( NativeHelper.isNativeLibraryLoaded() && isValidForNative ) 
@@ -89,7 +92,7 @@ public class LibMatrixNative
                        ret.sparse = false;
                        ret.allocateDenseBlock();
                        long start = DMLScript.STATISTICS ? System.nanoTime() : 
0;
-                       long nnz;
+                       long nnz = 0;
                        if( isSinglePrecision() ) {
                                FloatBuffer fin1 = 
toFloatBuffer(m1.getDenseBlockValues(), inBuff, true);
                                FloatBuffer fin2 = 
toFloatBuffer(m2.getDenseBlockValues(), filterBuff, true);
@@ -99,8 +102,23 @@ public class LibMatrixNative
                                fromFloatBuffer(outBuff.get(), 
ret.getDenseBlockValues());
                        }
                        else {
-                               nnz = 
NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
-                                       ret.getDenseBlockValues(), 
m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k);
+                               DenseBlock a = m1.getDenseBlock();
+                               if( a.isContiguous() ) {
+                                       nnz = 
NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(),
+                                               ret.getDenseBlockValues(), 
m1.rlen, m1.clen, m2.clen, k);
+                               }
+                               else {
+                                       //sequential processing of individual 
blocks to 
+                                       //avoid segmentation faults with 
concurrent multi-threaded BLAS calls
+                                       for(int bix = 0; bix < a.numBlocks(); 
bix++) {
+                                               double[] tmp = new 
double[a.blockSize(bix)*m2.clen];
+                                               nnz += 
NativeHelper.dmmdd(a.valuesAt(bix), m2.getDenseBlockValues(),
+                                                       tmp, a.blockSize(bix), 
m1.clen, m2.clen, k);
+                                               int rl = bix * a.blockSize();
+                                               ret.getDenseBlock().set(rl, 
rl+a.blockSize(bix), 0, m2.clen,
+                                                       
DenseBlockFactory.createDenseBlock(tmp, new int[]{a.blockSize(bix),m2.clen}));
+                                       }
+                               }
                        }
                        
                        if(nnz > -1) {
@@ -134,14 +152,32 @@ public class LibMatrixNative
                
                if( NativeHelper.isNativeLibraryLoaded() && (ret.clen > 1 || 
ret.getLength()==1)
                        && !LibMatrixMult.isOuterProductTSMM(m1.rlen, m1.clen, 
leftTrans)
-                       && (!m1.sparse && m1.getDenseBlock().isContiguous() ) )
+                       && !m1.sparse && (m1.getDenseBlock().isContiguous() | 
leftTrans) )
                {
                        ret.sparse = false;
                        ret.allocateDenseBlock();
                        long start = DMLScript.STATISTICS ? System.nanoTime() : 
0;
                        
-                       long nnz = NativeHelper.tsmm(m1.getDenseBlockValues(),
-                               ret.getDenseBlockValues(), m1.rlen, m1.clen, 
leftTrans, k);
+                       DenseBlock a = m1.getDenseBlock();
+                       double[] cvals = ret.getDenseBlockValues();
+                       long nnz = 0;
+                       if( a.isContiguous() ) {
+                               nnz = NativeHelper.tsmm(a.valuesAt(0),
+                                       cvals, m1.rlen, m1.clen, leftTrans, k);
+                       }
+                       else { //large blocks (but only leftTrans)
+                               //sequential processing of individual blocks to 
+                               //avoid segmentation faults with concurrent 
multi-threaded BLAS calls
+                               IntStream.range(0, a.numBlocks()).forEach(bix 
-> {
+                                       double[] tmp = new 
double[m1.clen*m1.clen];
+                                       NativeHelper.tsmm(a.valuesAt(bix),
+                                               tmp, a.blockSize(bix), m1.clen, 
leftTrans, k);
+                                       LibMatrixMult.vectAdd(tmp, cvals, 0, 0, 
m1.clen*m1.clen);
+                               });
+                               nnz = ret.recomputeNonZeros();
+                       }
+                       //TODO flip upper triangular matrix down for consistent
+                       //representations with the default java implementation?
                        
                        if(nnz > -1) {
                                if(DMLScript.STATISTICS) {

Reply via email to