Repository: systemml Updated Branches: refs/heads/master 525381d51 -> c95019fd9
[SYSTEMML-2106] New single-precision native matrix multiply This patch extends - similar to native conv2d/conv2d_bias_add operations - also the native matrix multiply for optional single-precision use. This also includes cleanups of mkl imports and nnz maintenance in double and single-precision conv2d operations. Furthermore, this patch includes rebuilt shared libraries for both mkl and openblas. Project: http://git-wip-us.apache.org/repos/asf/systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/c95019fd Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/c95019fd Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/c95019fd Branch: refs/heads/master Commit: c95019fd99076b4b8b7e6c5cfec85fd9949b2512 Parents: 525381d Author: Matthias Boehm <[email protected]> Authored: Fri Feb 2 20:04:53 2018 -0800 Committer: Matthias Boehm <[email protected]> Committed: Fri Feb 2 22:10:33 2018 -0800 ---------------------------------------------------------------------- .../cpp/lib/libsystemml_mkl-Linux-x86_64.so | Bin 32048 -> 32104 bytes .../lib/libsystemml_openblas-Linux-x86_64.so | Bin 31288 -> 36192 bytes src/main/cpp/libmatrixdnn.cpp | 14 +++--- src/main/cpp/libmatrixmult.cpp | 2 + src/main/cpp/libmatrixmult.h | 5 +- src/main/cpp/systemml.cpp | 22 +++++++-- src/main/cpp/systemml.h | 12 +++-- .../runtime/matrix/data/LibMatrixDNNHelper.java | 2 +- .../runtime/matrix/data/LibMatrixNative.java | 48 ++++++++++++------- .../org/apache/sysml/utils/NativeHelper.java | 7 ++- 10 files changed, 79 insertions(+), 33 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so ---------------------------------------------------------------------- diff --git a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so index 9f08870..db96497 100755 Binary files 
a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so differ http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so ---------------------------------------------------------------------- diff --git a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so index d6c9477..2fdcddf 100755 Binary files a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so differ http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/libmatrixdnn.cpp ---------------------------------------------------------------------- diff --git a/src/main/cpp/libmatrixdnn.cpp b/src/main/cpp/libmatrixdnn.cpp index 85efdfc..717cb26 100644 --- a/src/main/cpp/libmatrixdnn.cpp +++ b/src/main/cpp/libmatrixdnn.cpp @@ -406,8 +406,8 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do // Step 3: Destroy the description of the operation dnnDelete_F64(pConvolution); + return computeNNZ<double>(retPtr, N*KPQ); #else - // ------------------------------------------------------------------------------------ // First step: Avoids oversubscription and other openmp/internal blas threading issues setNumThreadsForBLAS(1); @@ -418,8 +418,9 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do // Allocate temporary data structures used in parallel for int numOpenMPThreads = MIN(numThreads, N); double* loweredMatArrays = new double[numIm2ColElem*numOpenMPThreads]; + int nnz = 0; -#pragma omp parallel for num_threads(numOpenMPThreads) +#pragma omp parallel for reduction(+: nnz) num_threads(numOpenMPThreads) for (int n = 0; n < N; n++) { int threadID = omp_get_thread_num(); double* loweredMat = loweredMatArrays + numIm2ColElem*threadID; @@ -436,12 +437,13 @@ int dconv2dBiasAddDense(double* inputPtr, double* biasPtr, 
double* filterPtr, do double* outputArr = retPtr + n*KPQ; if( addBias ) biasAdd<double>(biasPtr, outputArr, K, PQ); - } // end omp parallel for + + // Step 4: thread-local nnz maintenance + nnz += computeNNZ<double>(retPtr + n*KPQ, KPQ); + } delete [] loweredMatArrays; - // ------------------------------------------------------------------------------------ + return nnz; #endif - - return computeNNZ<double>(retPtr, N*KPQ); } int sconv2dBiasAddDense(float* inputPtr, float* biasPtr, float* filterPtr, float* retPtr, http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/libmatrixmult.cpp ---------------------------------------------------------------------- diff --git a/src/main/cpp/libmatrixmult.cpp b/src/main/cpp/libmatrixmult.cpp index 6edbc67..3c669b6 100644 --- a/src/main/cpp/libmatrixmult.cpp +++ b/src/main/cpp/libmatrixmult.cpp @@ -25,6 +25,8 @@ #ifdef USE_OPEN_BLAS #include <cblas.h> +#else + #include <mkl_service.h> #endif int SYSML_CURRENT_NUM_THREADS = -1; http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/libmatrixmult.h ---------------------------------------------------------------------- diff --git a/src/main/cpp/libmatrixmult.h b/src/main/cpp/libmatrixmult.h index ca357c9..b6ea1c4 100644 --- a/src/main/cpp/libmatrixmult.h +++ b/src/main/cpp/libmatrixmult.h @@ -39,12 +39,11 @@ //#endif // Since we call cblas_dgemm in openmp for loop, -// we call "extension" APIs for setting number of threads of the given API. -// For example: for OpenBLAS we use openblas_set_num_threads and -// for MKL we use mkl_set_num_threads. This avoids performance degradation due to overprovisioning. +// we call "extension" APIs for setting the number of threads. 
#ifdef USE_INTEL_MKL #include <mkl.h> #include <mkl_service.h> + extern "C" void mkl_set_num_threads(int numThreads); #else #include <cblas.h> extern "C" void openblas_set_num_threads(int numThreads); http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/systemml.cpp ---------------------------------------------------------------------- diff --git a/src/main/cpp/systemml.cpp b/src/main/cpp/systemml.cpp index 35a0074..b404cc9 100644 --- a/src/main/cpp/systemml.cpp +++ b/src/main/cpp/systemml.cpp @@ -75,14 +75,15 @@ JNIEXPORT void JNICALL Java_org_apache_sysml_utils_NativeHelper_setMaxNumThreads maxThreads = (int) jmaxThreads; } -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDenseDense( +JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_dmmdd( JNIEnv* env, jclass cls, jdoubleArray m1, jdoubleArray m2, jdoubleArray ret, - jint m1rlen, jint m1clen, jint m2clen, jint numThreads) { + jint m1rlen, jint m1clen, jint m2clen, jint numThreads) +{ double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads); double* m2Ptr = GET_DOUBLE_ARRAY(env, m2, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL) - return (jboolean) false; + return (jboolean) false; dmatmult(m1Ptr, m2Ptr, retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads); @@ -92,6 +93,21 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDe return (jboolean) true; } +JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_smmdd( + JNIEnv* env, jclass cls, jobject m1, jobject m2, jobject ret, + jint m1rlen, jint m1clen, jint m2clen, jint numThreads) +{ + float* m1Ptr = (float*) env->GetDirectBufferAddress(m1); + float* m2Ptr = (float*) env->GetDirectBufferAddress(m2); + float* retPtr = (float*) env->GetDirectBufferAddress(ret); + if(m1Ptr == NULL || m2Ptr == NULL || retPtr == NULL) + return (jboolean) false; + + smatmult(m1Ptr, m2Ptr, 
retPtr, (int)m1rlen, (int)m1clen, (int)m2clen, (int)numThreads); + + return (jboolean) true; +} + JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_tsmm (JNIEnv * env, jclass cls, jdoubleArray m1, jdoubleArray ret, jint m1rlen, jint m1clen, jboolean isLeftTranspose, jint numThreads) { double* m1Ptr = GET_DOUBLE_ARRAY(env, m1, numThreads); http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/cpp/systemml.h ---------------------------------------------------------------------- diff --git a/src/main/cpp/systemml.h b/src/main/cpp/systemml.h index 71155fa..52a3663 100644 --- a/src/main/cpp/systemml.h +++ b/src/main/cpp/systemml.h @@ -28,14 +28,20 @@ extern "C" { #endif /* * Class: org_apache_sysml_utils_NativeHelper - * Method: matrixMultDenseDense - * Signature: ([D[D[DIIII)Z + * Method: dmmdd */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_matrixMultDenseDense +JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_dmmdd (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint); /* * Class: org_apache_sysml_utils_NativeHelper + * Method: smmdd + */ +JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_smmdd + (JNIEnv *, jclass, jobject, jobject, jobject, jint, jint, jint, jint); + +/* + * Class: org_apache_sysml_utils_NativeHelper * Method: tsmm * Signature: ([D[DIIZI)Z */ http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java index f81e929..32a0eaa 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNNHelper.java @@ -68,7 +68,7 @@ public 
class LibMatrixDNNHelper ret.sparse = false; if(ret.getDenseBlock() == null) ret.allocateDenseBlock(); - NativeHelper.matrixMultDenseDense(m1.getDenseBlockValues(), m2.getDenseBlockValues(), + NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(), ret.getDenseBlockValues(), m1.rlen, m1.clen, m2.clen, 1); } http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java index dfb8abd..9e3a6ee 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java @@ -34,7 +34,10 @@ import org.apache.sysml.utils.Statistics; public class LibMatrixNative { - /** ThreadLocal reuse of direct buffers for inputs/outputs (extended on demand).*/ + // ThreadLocal reuse of direct buffers for inputs/outputs (extended on demand). + // note: since we anyway have to convert from double to float, we use + // preallocated direct buffers (with thread-local reuse and resizing on demand) + // to ensure there are no additional copies created by the transfer over jni private static ThreadLocal<FloatBuffer> inBuff = new ThreadLocal<FloatBuffer>(); private static ThreadLocal<FloatBuffer> biasBuff = new ThreadLocal<FloatBuffer>(); private static ThreadLocal<FloatBuffer> filterBuff = new ThreadLocal<FloatBuffer>(); @@ -65,32 +68,45 @@ public class LibMatrixNative k = k <= 0 ? 
NativeHelper.getMaxNumThreads() : k; // check inputs / outputs - if (m1.isEmptyBlock() || m2.isEmptyBlock()) { + if (m1.isEmptyBlock(false) || m2.isEmptyBlock(false)){ ret.setNonZeros(0); if(examSparsity) ret.examSparsity(); // turn empty dense into sparse return; } - if (NativeHelper.isNativeLibraryLoaded() && - !isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) { + + if (NativeHelper.isNativeLibraryLoaded() + && !isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) + && !m1.isInSparseFormat() && !m2.isInSparseFormat()) + { ret.sparse = false; ret.allocateDenseBlock(); long start = DMLScript.STATISTICS ? System.nanoTime() : 0; - if (NativeHelper.matrixMultDenseDense(m1.getDenseBlockValues(), m2.getDenseBlockValues(), - ret.getDenseBlockValues(), m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k)) { + boolean rccode = false; + if( isSinglePrecision() ) { + FloatBuffer fin1 = toFloatBuffer(m1.getDenseBlockValues(), inBuff, true); + FloatBuffer fin2 = toFloatBuffer(m2.getDenseBlockValues(), filterBuff, true); + FloatBuffer fout = toFloatBuffer(ret.getDenseBlockValues(), outBuff, false); + rccode = NativeHelper.smmdd(fin1, fin2, fout, + m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k); + fromFloatBuffer(outBuff.get(), ret.getDenseBlockValues()); + } + else { + rccode = NativeHelper.dmmdd(m1.getDenseBlockValues(), m2.getDenseBlockValues(), + ret.getDenseBlockValues(), m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k); + } + if (rccode) { if(DMLScript.STATISTICS) { Statistics.nativeLibMatrixMultTime += System.nanoTime() - start; Statistics.numNativeLibMatrixMultCalls.increment(); } ret.recomputeNonZeros(); - // post-processing (nnz maintained in parallel) if(examSparsity) ret.examSparsity(); return; - } else { - // Else fall back to Java - Statistics.incrementNativeFailuresCounter(); } + //else record failure and fallback to java + Statistics.incrementNativeFailuresCounter(); } if (k == 1) 
LibMatrixMult.matrixMult(m1, m2, ret, examSparsity); @@ -135,14 +151,9 @@ public class LibMatrixNative else { if(params.bias.isInSparseFormat()) params.bias.sparseToDense(); // Bias matrix is usually extremely small - boolean singlePrecision = ConfigurationManager.getDMLConfig() - .getTextValue(DMLConfig.FLOATING_POINT_PRECISION).equals("single"); long start = DMLScript.STATISTICS ? System.nanoTime() : 0; int nnz = -1; - if( singlePrecision ) { - //note: since we anyway have to convert from double to float, we use - //preallocated direct buffers (with thread-local reuse and resizing on demand) - //to ensure there are no additional copies created by the transfer over jni + if( isSinglePrecision() ) { FloatBuffer finput = toFloatBuffer(input.getDenseBlockValues(), inBuff, true); FloatBuffer fbias = toFloatBuffer(params.bias.getDenseBlockValues(), biasBuff, true); FloatBuffer ffilter = toFloatBuffer(filter.getDenseBlockValues(), filterBuff, true); @@ -260,6 +271,11 @@ public class LibMatrixNative LibMatrixDNN.conv2dBackwardData(filter, dout, outputBlock, params); } + private static boolean isSinglePrecision() { + return ConfigurationManager.getDMLConfig() + .getTextValue(DMLConfig.FLOATING_POINT_PRECISION).equals("single"); + } + private static FloatBuffer toFloatBuffer(double[] input, ThreadLocal<FloatBuffer> buff, boolean copy) { //maintain thread-local buffer (resized on demand) FloatBuffer ret = buff.get(); http://git-wip-us.apache.org/repos/asf/systemml/blob/c95019fd/src/main/java/org/apache/sysml/utils/NativeHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java index 6ec990d..25e9847 100644 --- a/src/main/java/org/apache/sysml/utils/NativeHelper.java +++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java @@ -324,7 +324,12 @@ public class NativeHelper { } // TODO: Add pmm, wsloss, mmchain, etc. 
- public static native boolean matrixMultDenseDense(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads); + + //double-precision matrix multiply dense-dense + public static native boolean dmmdd(double [] m1, double [] m2, double [] ret, int m1rlen, int m1clen, int m2clen, int numThreads); + //single-precision matrix multiply dense-dense + public static native boolean smmdd(FloatBuffer m1, FloatBuffer m2, FloatBuffer ret, int m1rlen, int m1clen, int m2clen, int numThreads); + //transpose-self matrix multiply private static native boolean tsmm(double [] m1, double [] ret, int m1rlen, int m1clen, boolean isLeftTranspose, int numThreads); // ----------------------------------------------------------------------------------------------------------------
