[SYSTEMML-2260] New native tsmm matrix mult and its integration This patch fixes the existing native tsmm implementation to properly use a BLAS dsyrk instead of BLAS dgemm to perform the computation only for the upper triangle of the output matrix. Subsequently, we use the existing cache-conscious primitive to copy the upper to the lower triangle.
Furthermore, this patch also integrates the native tsmm implementation, which was not used at all so far. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/2b8161db Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/2b8161db Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/2b8161db Branch: refs/heads/master Commit: 2b8161db97b55b612e70280d45fcec53421dc813 Parents: fde708f Author: Matthias Boehm <[email protected]> Authored: Thu Apr 19 22:22:07 2018 -0700 Committer: Matthias Boehm <[email protected]> Committed: Thu Apr 19 22:22:07 2018 -0700 ---------------------------------------------------------------------- .../cpp/lib/libsystemml_mkl-Linux-x86_64.so | Bin 32168 -> 32208 bytes .../lib/libsystemml_openblas-Linux-x86_64.so | Bin 31240 -> 31288 bytes src/main/cpp/libmatrixmult.cpp | 10 +++---- src/main/cpp/libmatrixmult.h | 2 +- src/main/cpp/systemml.cpp | 4 +-- .../runtime/matrix/data/LibMatrixNative.java | 27 +++++++++++++++++++ .../sysml/runtime/matrix/data/MatrixBlock.java | 4 ++- .../org/apache/sysml/utils/NativeHelper.java | 2 +- 8 files changed, 38 insertions(+), 11 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so ---------------------------------------------------------------------- diff --git a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so index faaf5f4..adc3bbe 100755 Binary files a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so differ http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so ---------------------------------------------------------------------- diff --git a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so 
b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so index 16b0b5d..0b39eaa 100755 Binary files a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so differ http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/cpp/libmatrixmult.cpp ---------------------------------------------------------------------- diff --git a/src/main/cpp/libmatrixmult.cpp b/src/main/cpp/libmatrixmult.cpp index 3c669b6..773a85a 100644 --- a/src/main/cpp/libmatrixmult.cpp +++ b/src/main/cpp/libmatrixmult.cpp @@ -51,11 +51,9 @@ void smatmult(float* m1Ptr, float* m2Ptr, float* retPtr, int m, int k, int n, in cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1, m1Ptr, k, m2Ptr, n, 0, retPtr, n); } -void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTranspose, int numThreads) { - int m = isLeftTranspose ? m1clen : m1rlen; - int n = isLeftTranspose ? m1clen : m1rlen; - int k = isLeftTranspose ? m1rlen : m1clen; - +void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTrans, int numThreads) { + int n = isLeftTrans ? m1clen : m1rlen; + int k = isLeftTrans ? m1rlen : m1clen; setNumThreadsForBLAS(numThreads); - cblas_dgemm(CblasRowMajor, isLeftTranspose ? CblasTrans : CblasNoTrans, isLeftTranspose ? CblasNoTrans : CblasTrans, m, n, k, 1, m1Ptr, k, m1Ptr, n, 0, retPtr, n); + cblas_dsyrk(CblasRowMajor, CblasUpper, isLeftTrans ? 
CblasTrans : CblasNoTrans, n, k, 1, m1Ptr, n, 0, retPtr, n); } http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/cpp/libmatrixmult.h ---------------------------------------------------------------------- diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/libmatrixmult.h index b6ea1c4..1c7fcd9 100644 --- a/src/main/cpp/libmatrixmult.h +++ b/src/main/cpp/libmatrixmult.h @@ -56,6 +56,6 @@ void setNumThreadsForBLAS(int numThreads); void dmatmult(double* m1Ptr, double* m2Ptr, double* retPtr, int m, int k, int n, int numThreads); void smatmult(float* m1Ptr, float* m2Ptr, float* retPtr, int m, int k, int n, int numThreads); -void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTranspose, int numThreads); +void tsmm(double* m1Ptr, double* retPtr, int m1rlen, int m1clen, bool isLeftTrans, int numThreads); #endif http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/cpp/systemml.cpp ---------------------------------------------------------------------- diff --git a/src/main/cpp/systemml.cpp b/src/main/cpp/systemml.cpp index b404cc9..fae0c1e 100644 --- a/src/main/cpp/systemml.cpp +++ b/src/main/cpp/systemml.cpp @@ -109,13 +109,13 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_smmdd( } JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_tsmm - (JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean isLeftTranspose, jint numThreads) { + (JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean leftTrans, jint numThreads) { double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(m1Ptr == NULL || retPtr == NULL) return (jboolean) false; - tsmm(m1Ptr, retPtr, (int) m1rlen, (int) m1clen, (bool) isLeftTranspose, (int) numThreads); + tsmm(m1Ptr, retPtr, (int)m1rlen, (int)m1clen, (bool)leftTrans, (int)numThreads); RELEASE_INPUT_ARRAY(env, m1, 
m1Ptr, numThreads); RELEASE_ARRAY(env, ret, retPtr, numThreads); http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java index eade43f..cf4501f 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java @@ -118,6 +118,33 @@ public class LibMatrixNative LibMatrixMult.matrixMult(m1, m2, ret, k); } + public static void tsmm(MatrixBlock m1, MatrixBlock ret, boolean leftTrans, int k) { + if( m1.isEmptyBlock(false) ) + return; + if( NativeHelper.isNativeLibraryLoaded() && ret.clen > 1 + && (!m1.sparse && m1.getDenseBlock().isContiguous() ) ) { + ret.sparse = false; + ret.allocateDenseBlock(); + if( NativeHelper.tsmm(m1.getDenseBlockValues(), + ret.getDenseBlockValues(), m1.rlen, m1.clen, leftTrans, k) ) + { + long nnz = (ret.clen==1) ? ret.recomputeNonZeros() : + LibMatrixMult.copyUpperToLowerTriangle(ret); + ret.setNonZeros(nnz); + ret.examSparsity(); + return; + } + else { + Statistics.incrementNativeFailuresCounter(); + //fallback to default java implementation + } + } + if( k > 1 ) + LibMatrixMult.matrixMultTransposeSelf(m1, ret, leftTrans, k); + else + LibMatrixMult.matrixMultTransposeSelf(m1, ret, leftTrans); + } + /** * This method performs convolution (i.e. 
cross-correlation) operation on input * http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java index f306a7a..97f883b 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java @@ -3434,7 +3434,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab out.reset(dim, dim, false); //compute matrix mult - if( k > 1 ) + if( NativeHelper.isNativeLibraryLoaded() ) + LibMatrixNative.tsmm(this, out, leftTranspose, k); + else if( k > 1 ) LibMatrixMult.matrixMultTransposeSelf(this, out, leftTranspose, k); else LibMatrixMult.matrixMultTransposeSelf(this, out, leftTranspose); http://git-wip-us.apache.org/repos/asf/systemml/blob/2b8161db/src/main/java/org/apache/sysml/utils/NativeHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java index 86d849f..1a02e00 100644 --- a/src/main/java/org/apache/sysml/utils/NativeHelper.java +++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java @@ -322,7 +322,7 @@ public class NativeHelper { //single-precision matrix multiply dense-dense public static native boolean smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen, int numThreads); //transpose-self matrix multiply - private static native boolean tsmm(double [] m1, double [] ret, int m1rlen, int m1clen, boolean isLeftTranspose, int numThreads); + public static native boolean tsmm(double[] m1, double[] ret, int m1rlen, int m1clen, boolean leftTrans, int numThreads); // 
---------------------------------------------------------------------------------------------------------------- // LibMatrixDNN operations:
