Repository: systemml Updated Branches: refs/heads/master 525381d51 -> c95019fd9
[SYSTEMML-2106] New single-precision native matrix multiply This patch extends - similar to native conv2d/conv2d_bias_add operations - also the native matrix multiply for optional single-precision use. This also includes cleanups of mkl imports and nnz maintenance in double and single-precision conv2d operations. Furthermore, this patch includes rebuilt shared libraries for both mkl and openblas. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/c95019fd Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/c95019fd Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/c95019fd Branch: refs/heads/master Commit: c95019fd99076b4b8b7e6c5cfec85fd9949b2512 Parents: 525381d Author: Matthias Boehm <[email protected]> Authored: Fri Feb 2 20:04:53 2018 -0800 Committer: Matthias Boehm <[email protected]> Committed: Fri Feb 2 22:10:33 2018 -0800 ---------------------------------------------------------------------- .../cpp/lib/libsystemml_mkl-Linux-x86_64.so | Bin 32048 -> 32104 bytes .../lib/libsystemml_openblas-Linux-x86_64.so | Bin 31288 -> 36192 bytes src/main/cpp/libmatrixdnn.cpp | 14 +++--- src/main/cpp/libmatrixmult.cpp | 2 + src/main/cpp/libmatrixmult.h | 5 +- src/main/cpp/systemml.cpp | 22 +++++++-- src/main/cpp/systemml.h | 12 +++-- .../runtime/matrix/data/LibMatrixDNNHelper.java | 2 +- .../runtime/matrix/data/LibMatrixNative.java | 48 ++++++++++++------- .../org/apache/sysml/utils/NativeHelper.java | 7 ++- 10 files changed, 79 insertions(+), 33 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so ---------------------------------------------------------------------- diff --git a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so index 9f08870..db96497 100755 Binary files 
a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so differ http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so ---------------------------------------------------------------------- diff --git a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so index d6c9477..2fdcddf 100755 Binary files a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so differ http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/libmatrixdnn.cpp ---------------------------------------------------------------------- diff --git a/src/main/cpp/libmatrixdnn.cpp b/src/main/cpp/libmatrixdnn.cpp index 85efdfc..717cb26 100644 --- a/src/main/cpp/libmatrixdnn.cpp +++ b/src/main/cpp/libmatrixdnn.cpp @@ -406,8 +406,8 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do // Step 3: Destroy the description of the operation dnnDelete_F64(pConvolution); + return computeNNZ<double>(retPtr, N*KPQ); #else - // ------------------------------------------------------------------------------------ // First step: Avoids oversubscription and other openmp/internal blas threading issues setNumThreadsForBLAS(1); @@ -418,8 +418,9 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do // Allocate temporary data structures used in parallel for int numOpenMPThreads = MIN(numThreads, N); double* loweredMatArrays = new double[numIm2ColElem*numOpenMPThreads]; + int nnz = 0; -#pragma omp parallel for num_threads(numOpenMPThreads) +#pragma omp parallel for reduction(+: nnz) num_threads(numOpenMPThreads) for (int n = 0; n < N; n++) { int threadID = omp_get_thread_num(); double* loweredMat = loweredMatArrays + numIm2ColElem*threadID; @@ -436,12 +437,13 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, 
double* filterPtr, do double* outputArr = retPtr + n*KPQ; if( addBias ) biasAdd<double>(biasPtr, outputArr, K, PQ); - } // end omp parallel for + + // Step 4: thread-local nnz maintenance + nnz += computeNNZ<double>(retPtr + n*KPQ, KPQ); + } delete [] loweredMatArrays; - // ------------------------------------------------------------------------------------ + return nnz; #endif - - return computeNNZ<double>(retPtr, N*KPQ); } int sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/libmatrixmult.cpp ---------------------------------------------------------------------- diff --git a/src/main/cpp/libmatrixmult.cpp b/src/main/cpp/libmatrixmult.cpp index 6edbc67..3c669b6 100644 --- a/src/main/cpp/libmatrixmult.cpp +++ b/src/main/cpp/libmatrixmult.cpp @@ -25,6 +25,8 @@ #ifdef USE_OPEN_BLAS #include <cblas.h> +#else + #include <mkl_service.h> #endif int SYSML_CURRENT_NUM_THREADS = -1; http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/libmatrixmult.h ---------------------------------------------------------------------- diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/libmatrixmult.h index ca357c9..b6ea1c4 100644 --- a/src/main/cpp/libmatrixmult.h +++ b/src/main/cpp/libmatrixmult.h @@ -39,12 +39,11 @@ //#endif // Since we call cblas_dgemm in openmp for loop, -// we call "extension" APIs for setting number of threads of the given API. -// For example: for OpenBLAS we use openblas_set_num_threads and -// for MKL we use mkl_set_num_threads. This avoids performance degradation due to overprovisioning. +// we call "extension" APIs for setting the number of threads. 
#ifdef USE_INTEL_MKL #include <mkl.h> #include <mkl_service.h> + extern "C" void mkl_set_num_threads(int numThreads); #else #include <cblas.h> extern "C" void openblas_set_num_threads(int numThreads); http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/systemml.cpp ---------------------------------------------------------------------- diff --git a/src/main/cpp/systemml.cpp b/src/main/cpp/systemml.cpp index 35a0074..b404cc9 100644 --- a/src/main/cpp/systemml.cpp +++ b/src/main/cpp/systemml.cpp @@ -75,14 +75,15 @@ JNIEXPORT void JNICALL Java_org_apache_sysml_utils_NativeHelper_setMaxNumThreads maxThreads = (int) jmaxThreads; } -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDenseDense( +JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_dmmdd( JNIEnv* env, jclass cls, jdoubleArray m1, jdoubleArray m2, jdoubleArray ret, - jint m1rlen, jint m1clen, jint m2clen, jint numThreads) { + jint m1rlen, jint m1clen, jint m2clen, jint numThreads) +{ double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads); double* m2Ptr = GET_DOUBLE_ARRAY(env, m2, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL) - return (jboolean) false; + return (jboolean) false; dmatmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads); @@ -92,6 +93,21 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDe return (jboolean) true; } +JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_smmdd( + JNIEnv* env, jclass cls, jobject m1, jobject m2, jobject ret, + jint m1rlen, jint m1clen, jint m2clen, jint numThreads) +{ + float* m1Ptr = (float*) env->GetDirectBufferAddress(m1); + float* m2Ptr = (float*) env->GetDirectBufferAddress(m2); + float* retPtr = (float*) env->GetDirectBufferAddress(ret); + if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL) + return (jboolean) false; + + smatmult(m1Ptr, m2Ptr, 
retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads); + + return (jboolean) true; +} + JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_tsmm (JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean isLeftTranspose, jint numThreads) { double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads); http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/systemml.h ---------------------------------------------------------------------- diff --git a/src/main/cpp/systemml.h b/src/main/cpp/systemml.h index 71155fa..52a3663 100644 --- a/src/main/cpp/systemml.h +++ b/src/main/cpp/systemml.h @@ -28,14 +28,20 @@ extern "C" { #endif /* * Class: org_apache_sysml_utils_NativeHelper - * Method: matrixMultDenseDense - * Signature: ([D[D[DIIII)Z + * Method: dmmdd */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDenseDense +JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_dmmdd (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint); /* * Class: org_apache_sysml_utils_NativeHelper + * Method: smmdd + */ +JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_smmdd + (JNIEnv *, jclass, jobject, jobject, jobject, jint, jint, jint, jint); + +/* + * Class: org_apache_sysml_utils_NativeHelper * Method: tsmm * Signature: ([D[DIIZI)Z */ http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java index f81e929..32a0eaa 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java @@ -68,7 +68,7 @@ public 
class LibMatrixDNNHelper ret.sparse = false; if(ret.getDenseBlock() == null) ret.allocateDenseBlock(); - NativeHelper.matrixMultDenseDense(m1.getDenseBlockValues(), m2.getDenseBlockValues(), + NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(), ret.getDenseBlockValues(), m1.rlen, m1.clen, m2.clen, 1); } http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java index dfb8abd..9e3a6ee 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java @@ -34,7 +34,10 @@ import org.apache.sysml.utils.Statistics; public class LibMatrixNative { - /** ThreadLocal reuse of direct buffers for inputs/outputs (extended on demand).*/ + // ThreadLocal reuse of direct buffers for inputs/outputs (extended on demand). + // note: since we anyway have to convert from double to float, we use + // preallocated direct buffers (with thread-local reuse and resizing on demand) + // to ensure there are no additional copies created by the transfer over jni private static ThreadLocal<FloatBuffer> inBuff = new ThreadLocal<FloatBuffer>(); private static ThreadLocal<FloatBuffer> biasBuff = new ThreadLocal<FloatBuffer>(); private static ThreadLocal<FloatBuffer> filterBuff = new ThreadLocal<FloatBuffer>(); @@ -65,32 +68,45 @@ public class LibMatrixNative k = k <= 0 ? 
NativeHelper.getMaxNumThreads() : k; // check inputs / outputs - if (m1.isEmptyBlock() || m2.isEmptyBlock()) { + if (m1.isEmptyBlock(false) || m2.isEmptyBlock(false)){ ret.setNonZeros(0); if(examSparsity) ret.examSparsity(); // turn empty dense into sparse return; } - if (NativeHelper.isNativeLibraryLoaded() && - !isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) { + + if (NativeHelper.isNativeLibraryLoaded() + && !isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) + && !m1.isInSparseFormat() && !m2.isInSparseFormat()) + { ret.sparse = false; ret.allocateDenseBlock(); long start = DMLScript.STATISTICS ? System.nanoTime() : 0; - if (NativeHelper.matrixMultDenseDense(m1.getDenseBlockValues(), m2.getDenseBlockValues(), - ret.getDenseBlockValues(), m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k)) { + boolean rccode = false; + if( isSinglePrecision() ) { + FloatBuffer fin1 = toFloatBuffer(m1.getDenseBlockValues(), inBuff, true); + FloatBuffer fin2 = toFloatBuffer(m2.getDenseBlockValues(), filterBuff, true); + FloatBuffer fout = toFloatBuffer(ret.getDenseBlockValues(), outBuff, false); + rccode = NativeHelper.smmdd(fin1, fin2, fout, + m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k); + fromFloatBuffer(outBuff.get(), ret.getDenseBlockValues()); + } + else { + rccode = NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(), + ret.getDenseBlockValues(), m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k); + } + if (rccode) { if(DMLScript.STATISTICS) { Statistics.nativeLibMatrixMultTime += System.nanoTime() - start; Statistics.numNativeLibMatrixMultCalls.increment(); } ret.recomputeNonZeros(); - // post-processing (nnz maintained in parallel) if(examSparsity) ret.examSparsity(); return; - } else { - // Else fall back to Java - Statistics.incrementNativeFailuresCounter(); } + //else record failure and fallback to java + Statistics.incrementNativeFailuresCounter(); } if (k == 1) 
LibMatrixMult.matrixMult(m1, m2, ret, examSparsity); @@ -135,14 +151,9 @@ public class LibMatrixNative else { if(params.bias.isInSparseFormat()) params.bias.sparseToDense(); // Bias matrix is usually extremely small - boolean singlePrecision = ConfigurationManager.getDMLConfig() - .getTextValue(DMLConfig.FLOATING_POINT_PRECISION).equals("single"); long start = DMLScript.STATISTICS ? System.nanoTime() : 0; int nnz = -1; - if( singlePrecision ) { - //note: since we anyway have to convert from double to float, we use - //preallocated direct buffers (with thread-local reuse and resizing on demand) - //to ensure there are no additional copies created by the transfer over jni + if( isSinglePrecision() ) { FloatBuffer finput = toFloatBuffer(input.getDenseBlockValues(), inBuff, true); FloatBuffer fbias = toFloatBuffer(params.bias.getDenseBlockValues(), biasBuff, true); FloatBuffer ffilter = toFloatBuffer(filter.getDenseBlockValues(), filterBuff, true); @@ -260,6 +271,11 @@ public class LibMatrixNative LibMatrixDNN.conv2dBackwardData(filter, dout, outputBlock, params); } + private static boolean isSinglePrecision() { + return ConfigurationManager.getDMLConfig() + .getTextValue(DMLConfig.FLOATING_POINT_PRECISION).equals("single"); + } + private static FloatBuffer toFloatBuffer(double[] input, ThreadLocal<FloatBuffer> buff, boolean copy) { //maintain thread-local buffer (resized on demand) FloatBuffer ret = buff.get(); http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/java/org/apache/sysml/utils/NativeHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java index 6ec990d..25e9847 100644 --- a/src/main/java/org/apache/sysml/utils/NativeHelper.java +++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java @@ -324,7 +324,12 @@ public class NativeHelper { } // TODO: Add pmm, wsloss, mmchain, etc. 
- public static native boolean matrixMultDenseDense(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads); + + //double-precision matrix multiply dense-dense + public static native boolean dmmdd(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads); + //single-precision matrix multiply dense-dense + public static native boolean smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen, int numThreads); + //transpose-self matrix multiply private static native boolean tsmm(double [] m1, double [] ret, int m1rlen, int m1clen, boolean isLeftTranspose, int numThreads); // ----------------------------------------------------------------------------------------------------------------
