Repository: incubator-systemml
Updated Branches:
  refs/heads/master b8de68b74 -> 5489c665d


[SYSTEMML-540] [MINOR] Added additional performance counters for native 
invocation and improved SGDNesterov

Closes #485.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/5489c665
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/5489c665
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/5489c665

Branch: refs/heads/master
Commit: 5489c665dcb434d88c165492557ef14ad284b69f
Parents: b8de68b
Author: Niketan Pansare <[email protected]>
Authored: Mon May 8 20:55:24 2017 -0700
Committer: Niketan Pansare <[email protected]>
Committed: Mon May 8 20:56:11 2017 -0700

----------------------------------------------------------------------
 .../cpp/lib/libsystemml_mkl-Linux-x86_64.so     | Bin 27408 -> 31824 bytes
 .../lib/libsystemml_openblas-Linux-x86_64.so    | Bin 27416 -> 27520 bytes
 src/main/cpp/libmatrixdnn.cpp                   |  54 ++++++++++------
 src/main/cpp/libmatrixdnn.h                     |   6 +-
 src/main/cpp/systemml.cpp                       |  32 +++++-----
 src/main/cpp/systemml.h                         |  29 ++++-----
 .../sysml/runtime/matrix/data/LibMatrixDNN.java |  24 ++++---
 .../runtime/matrix/data/LibMatrixNative.java    |  63 +++++++++++++------
 .../sysml/runtime/matrix/data/MatrixBlock.java  |  14 +++++
 .../apache/sysml/udf/lib/SGDNesterovUpdate.java |  13 ++--
 .../org/apache/sysml/utils/NativeHelper.java    |   9 +--
 .../java/org/apache/sysml/utils/Statistics.java |  45 +++++++++++--
 .../org/apache/sysml/api/dl/Caffe2DML.scala     |   2 +-
 .../org/apache/sysml/api/dl/CaffeSolver.scala   |   9 ++-
 14 files changed, 199 insertions(+), 101 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so 
b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so
index 0a6427a..a740930 100755
Binary files a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so and 
b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
----------------------------------------------------------------------
diff --git a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so 
b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so
index ffdcd5a..dfd1ecb 100755
Binary files a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so and 
b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so differ

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/libmatrixdnn.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixdnn.cpp b/src/main/cpp/libmatrixdnn.cpp
index a521804..ed02042 100644
--- a/src/main/cpp/libmatrixdnn.cpp
+++ b/src/main/cpp/libmatrixdnn.cpp
@@ -27,6 +27,14 @@
 #include <cstring>
 #include "omp.h"
 
+int computeNNZ(double* arr, int limit) {
+  int nnz = 0;
+  #pragma omp parallel for reduction(+: nnz)
+  for(int i=0; i<limit; i++)
+    nnz += (arr[i]!=0) ? 1 : 0;
+  return nnz;
+}
+
 void rotate180(double* inputArray, double* outputArray, int N, int C, int H, 
int W,
             int K, int R, int S, int stride_h, int stride_w, int pad_h,
             int pad_w, int P, int Q) {
@@ -124,7 +132,7 @@ void im2col(double* inputArray, double* outputArray, int N, 
int C, int H, int W,
 } 
 
 
-void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* 
retPtr, int N, int C, int H, int W, int K, int R, int S,
+int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* 
retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int 
numThreads) {
   // First step: Avoids oversubscription and other openmp/internal blas 
threading issues
   setNumThreadsForBLAS(1);
@@ -150,7 +158,8 @@ void conv2dBackwardFilterDense(double* inputPtr, double* 
doutPtr, double* retPtr
 
 #pragma omp parallel for num_threads(numOpenMPThreads)
   for (int n = 0; n < N; n++) {
-       double* loweredMat = loweredMatArrays + 
numIm2ColElem*omp_get_thread_num();
+    int threadID = omp_get_thread_num();
+       double* loweredMat = loweredMatArrays + numIm2ColElem*threadID;
 
     // Step 1: Perform im2col
     im2col(inputPtr + n * CHW, loweredMat, 1, C, H, W, K,
@@ -158,18 +167,22 @@ void conv2dBackwardFilterDense(double* inputPtr, double* 
doutPtr, double* retPtr
            P, Q);
            
     // Step 2: Rotate dout
-    double* rotatedDoutPtr = rotatedDoutPtrArrays + 
numRotatedElem*omp_get_thread_num();
+    double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*threadID;
     rotate180(doutPtr + n * KPQ, rotatedDoutPtr, 1, C, H, W, K,
            R, S, stride_h, stride_w, pad_h, pad_w,
            P, Q);
     
-    // Multiply to get CRS X K
-    double* temp1 = temp + numTempElem*omp_get_thread_num();
-    // Step 3: loweredMat (CRS X PQ) %*% rotated_dout (PQ X K) 
-    matmult(loweredMat, rotatedDoutPtr, temp1, C * R * S, P * Q, K, 1);
-              
+    // Multiply to get tmp1 = CRS X K
+    double* temp1 = temp + numTempElem*threadID;
+    // Step 3: temp1 = alpha * (loweredMat (CRS X PQ) %*% rotated_dout (PQ X 
K)) + beta*temp1
+    int m1rlen = C * R * S; int m1clen = P * Q; int m2clen = K;
+    double* m1Ptr = loweredMat; double* m2Ptr = rotatedDoutPtr; double alpha = 
1; double beta = 1;
+    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m1rlen, m2clen, 
m1clen, alpha, m1Ptr, m1clen, m2Ptr, m2clen, beta, temp1, m2clen);
   } // end omp parallel for
   
+  delete [] loweredMatArrays;
+  delete [] rotatedDoutPtrArrays;
+  
   // Inplace transpose addition
   int numRow = CRS;
   for(int t = 0; t < numOpenMPThreads; t++) {
@@ -184,11 +197,10 @@ void conv2dBackwardFilterDense(double* inputPtr, double* 
doutPtr, double* retPtr
   }
   
   delete [] temp;
-  delete [] loweredMatArrays;
-  delete [] rotatedDoutPtrArrays;
+  return computeNNZ(retPtr, K*CRS);
 }
 
-void conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* 
retPtr, int N, int C, int H, int W, int K, int R, int S,
+int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* 
retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int 
numThreads) {
    // First step: Avoids oversubscription and other openmp/internal blas 
threading issues
   setNumThreadsForBLAS(1);
@@ -207,27 +219,28 @@ void conv2dBackwardDataDense(double* filterPtr, double* 
doutPtr, double* retPtr,
 
 #pragma omp parallel for num_threads(numOpenMPThreads)
   for (int n = 0; n < N; n++) {
+    int threadID = omp_get_thread_num();
     // Step 1: Rotate dout
-    double* rotatedDoutPtr = rotatedDoutPtrArrays + 
numRotatedElem*omp_get_thread_num();
+    double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*threadID;
     rotate180(doutPtr + n * KPQ, rotatedDoutPtr, 1, C, H, W, K,
            R, S, stride_h, stride_w, pad_h, pad_w,
            P, Q);
 
     // Step 2: t(rotatedDout (PQ X K) %*% filter (K X CRS))
-    double* col2imInput = col2imInputArrays + 
numCol2ImElem*omp_get_thread_num();
+    double* col2imInput = col2imInputArrays + numCol2ImElem*threadID;
     matmult(rotatedDoutPtr, filterPtr, col2imInput,
             PQ, K, CRS, 1);
 
     // Step 3: Perform col2im
-    col2im(col2imInput, retPtr + n * CHW, 1, C, H, W, K,
+    double* outputArr = retPtr + n * CHW;
+    col2im(col2imInput, outputArr, 1, C, H, W, K,
            R, S, stride_h, stride_w, pad_h, pad_w,
            P, Q);
-
   } // end omp parallel for
   
   delete [] rotatedDoutPtrArrays;
   delete [] col2imInputArrays;
-    
+  return computeNNZ(retPtr, N*CHW);
 }
 
 void conv2dSparse(int apos, int alen, int* aix, double* avals, double* 
filterPtr, double* retPtr, int N, int C, int H, int W, 
@@ -290,7 +303,8 @@ void conv2dBackwardFilterSparseDense(int apos, int alen, 
int* aix, double* avals
        delete [] temp1;
 }
 
-void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, 
double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+
+int conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, 
double* retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool 
addBias, int numThreads) {
   // First step:  Avoids oversubscription and other openmp/internal blas 
threading issues
   setNumThreadsForBLAS(1);
@@ -306,7 +320,8 @@ void conv2dBiasAddDense(double* inputPtr, double* biasPtr, 
double* filterPtr, do
   
 #pragma omp parallel for num_threads(numOpenMPThreads)
   for (int n = 0; n < N; n++) {
-    double* loweredMat = loweredMatArrays + numIm2ColElem*omp_get_thread_num();
+    int threadID = omp_get_thread_num();
+    double* loweredMat = loweredMatArrays + numIm2ColElem*threadID;
 
     // Step 1: Perform im2col
     im2col(inputPtr + n * CHW, loweredMat, 1, C, H, W, K,
@@ -318,8 +333,8 @@ void conv2dBiasAddDense(double* inputPtr, double* biasPtr, 
double* filterPtr, do
             C * R * S, P * Q, 1);
     
     // Step 3: Add bias
+    double* outputArr = retPtr + n*KPQ;
     if(addBias) {
-           double* outputArr = retPtr + n*KPQ;
            int index = 0;
                for(int k = 0; k < K; k++) {
                        for(int pq = 0; pq < PQ; pq++, index++) {
@@ -330,4 +345,5 @@ void conv2dBiasAddDense(double* inputPtr, double* biasPtr, 
double* filterPtr, do
   } // end omp parallel for
   
   delete [] loweredMatArrays;
+  return computeNNZ(retPtr, N*KPQ);
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/libmatrixdnn.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/libmatrixdnn.h b/src/main/cpp/libmatrixdnn.h
index bf6c113..86e7b24 100644
--- a/src/main/cpp/libmatrixdnn.h
+++ b/src/main/cpp/libmatrixdnn.h
@@ -20,13 +20,13 @@
 #ifndef _libmatrixdnn_h
 #define _libmatrixdnn_h
 
-void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* 
retPtr, int N, int C, int H, int W, int K, int R, int S,
+int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* 
retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int 
numThreads);
 
-void conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* 
retPtr, int N, int C, int H, int W, int K, int R, int S,
+int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* 
retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int 
numThreads);
     
-void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, 
double* retPtr, int N, int C, int H, int W, int K, int R, int S,
+int conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, 
double* retPtr, int N, int C, int H, int W, int K, int R, int S,
     int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool 
addBias, int numThreads);
     
 void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filter, 
double* ret, int N, int C, int H, int W, 

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/systemml.cpp
----------------------------------------------------------------------
diff --git a/src/main/cpp/systemml.cpp b/src/main/cpp/systemml.cpp
index 41ce0bc..34ae248 100644
--- a/src/main/cpp/systemml.cpp
+++ b/src/main/cpp/systemml.cpp
@@ -144,7 +144,7 @@ JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwa
   return (jboolean) true;
 }
 
-JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dDense(
+JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense(
        JNIEnv* env, jclass, jdoubleArray input, jdoubleArray filter,
     jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
     jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint 
numThreads) {
@@ -152,18 +152,18 @@ JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dDense(
   double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(inputPtr == NULL || filterPtr == NULL || retPtr == NULL)
-       return (jboolean) false;
+       return (jint) -1;
   
-  conv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, (int) N, (int) C, (int) 
H, (int) W, (int) K, (int) R, (int) S,
+  int nnz = conv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, (int) N, (int) 
C, (int) H, (int) W, (int) K, (int) R, (int) S,
     (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) 
Q, false, (int) numThreads);
     
   RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads);
   RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads);
   RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); 
-  return (jboolean) true;
+  return (jint) nnz;
 }
 
-JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense(
+JNIEXPORT jint JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense(
        JNIEnv* env, jclass, jdoubleArray input, jdoubleArray bias, 
jdoubleArray filter,
     jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
     jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint 
numThreads) {
@@ -173,19 +173,19 @@ JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAd
   double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(inputPtr == NULL || biasPtr == NULL || filterPtr == NULL || retPtr == 
NULL)
-       return (jboolean) false;
+       return (jint) -1;
   
-  conv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, 
(int) H, (int) W, (int) K, (int) R, (int) S,
+  int nnz = conv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, 
(int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
     (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) 
Q, true, (int) numThreads);
     
   RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads);
   RELEASE_INPUT_DOUBLE_ARRAY(env, bias, biasPtr, numThreads);
   RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads);
   RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); 
-  return (jboolean) true;
+  return (jint) nnz;
 }
 
-JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense(
+JNIEXPORT jint JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense(
        JNIEnv* env, jclass, jdoubleArray filter, jdoubleArray dout,
     jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
     jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint 
numThreads) {
@@ -194,18 +194,18 @@ JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwa
   double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(doutPtr == NULL || filterPtr == NULL || retPtr == NULL)
-       return (jboolean) false;
+       return (jint) -1;
   
-  conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) 
H, (int) W, (int) K, (int) R, (int) S,
+  int nnz = conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) 
C, (int) H, (int) W, (int) K, (int) R, (int) S,
     (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) 
Q, (int) numThreads);
   
   RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads);
   RELEASE_INPUT_DOUBLE_ARRAY(env, dout, doutPtr, numThreads);
   RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads);
-  return (jboolean) true;
+  return (jint) nnz;
 }
 
-JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense(
+JNIEXPORT jint JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense(
        JNIEnv* env, jclass, jdoubleArray input, jdoubleArray dout,
     jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S,
     jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint 
numThreads) {
@@ -213,13 +213,13 @@ JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwa
   double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads);
   double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads);
   if(doutPtr == NULL || inputPtr == NULL || retPtr == NULL)
-       return (jboolean) false;
+       return (jint) -1;
   
-  conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int) N, (int) C, (int) 
H, (int) W, (int) K, (int) R, (int) S,
+  int nnz = conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int) N, 
(int) C, (int) H, (int) W, (int) K, (int) R, (int) S,
     (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) 
Q, (int) numThreads);
   
   RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads);
   RELEASE_INPUT_DOUBLE_ARRAY(env, dout, doutPtr, numThreads);
   RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads);
-  return (jboolean) true;
+  return (jint) nnz;
 }

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/systemml.h
----------------------------------------------------------------------
diff --git a/src/main/cpp/systemml.h b/src/main/cpp/systemml.h
index ac36495..f6f5cd2 100644
--- a/src/main/cpp/systemml.h
+++ b/src/main/cpp/systemml.h
@@ -45,49 +45,49 @@ JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_tsmm
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
  * Method:    conv2dDense
- * Signature: ([D[D[DIIIIIIIIIIIIII)Z
+ * Signature: ([D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense
+JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, 
jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
  * Method:    conv2dBiasAddDense
- * Signature: ([D[D[D[DIIIIIIIIIIIIII)Z
+ * Signature: ([D[D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense
+JNIEXPORT jint JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jdoubleArray, 
jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, 
jint);
 
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
- * Method:    conv2dBackwardDataDense
- * Signature: ([D[D[DIIIIIIIIIIIIII)Z
+ * Method:    conv2dBackwardFilterDense
+ * Signature: ([D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense
+JNIEXPORT jint JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, 
jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
- * Method:    conv2dBackwardFilterDense
- * Signature: ([D[D[DIIIIIIIIIIIIII)Z
+ * Method:    conv2dBackwardDataDense
+ * Signature: ([D[D[DIIIIIIIIIIIIII)I
  */
-JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense
+JNIEXPORT jint JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense
   (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, 
jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint);
 
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
- * Method:    conv2dSparse
+ * Method:    conv2dBackwardFilterSparseDense
  * Signature: (II[I[D[D[DIIIIIIIIIIIIII)Z
  */
-JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dSparse
+JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterSparseDense
   (JNIEnv *, jclass, jint, jint, jintArray, jdoubleArray, jdoubleArray, 
jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, 
jint, jint, jint);
 
 /*
  * Class:     org_apache_sysml_utils_NativeHelper
- * Method:    conv2dBackwardFilterSparse
+ * Method:    conv2dSparse
  * Signature: (II[I[D[D[DIIIIIIIIIIIIII)Z
  */
-JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterSparseDense
+JNIEXPORT jboolean JNICALL 
Java_org_apache_sysml_utils_NativeHelper_conv2dSparse
   (JNIEnv *, jclass, jint, jint, jintArray, jdoubleArray, jdoubleArray, 
jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, 
jint, jint, jint);
 
 /*
@@ -103,4 +103,5 @@ JNIEXPORT void JNICALL 
Java_org_apache_sysml_utils_NativeHelper_setMaxNumThreads
 #endif
 #endif
 
+
  
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
index e4d3ba2..ab82697 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java
@@ -84,7 +84,7 @@ public class LibMatrixDNN {
        private static AtomicLong loopedConvBwdDataCol2ImTime = new 
AtomicLong(0);
        
        public static void appendStatistics(StringBuilder sb) {
-               if(DMLScript.STATISTICS && DISPLAY_STATISTICS && 
(conv2dDenseCount.get() != 0 || conv2dSparseCount.get() != 0)) {
+               if(DMLScript.STATISTICS && DISPLAY_STATISTICS) {
                        sb.append("LibMatrixDNN dense count 
(conv/bwdF/bwdD/im2col/maxBwd):\t" 
                                        + conv2dDenseCount.get() + "/"
                                        + conv2dBwdFilterDenseCount.get() + "/"
@@ -97,15 +97,13 @@ public class LibMatrixDNN {
                                        + conv2dBwdDataSparseCount.get() + "/"
                                        + im2colSparseCount.get() + "/"
                                        + maxPoolBwdSparseCount.get() + ".\n");
-                       if(loopedConvMatMultTime.get() != 0 || 
loopedConvIm2ColTime.get() != 0) {
-                               sb.append("LibMatrixDNN conv(im2col/matmult), 
bwdF (im2col/matmult), bwdD (col2im/matmult) time:\t" +
-                                               String.format("%.3f", 
loopedConvIm2ColTime.get()*1e-9) + "/" +
-                                               String.format("%.3f", 
loopedConvMatMultTime.get()*1e-9) + "/" + 
-                                               String.format("%.3f", 
loopedConvBwdFilterIm2ColTime.get()*1e-9) + "/" +
-                                               String.format("%.3f", 
loopedConvBwdFilterMatMultTime.get()*1e-9) + "/" +
-                                               String.format("%.3f", 
loopedConvBwdDataCol2ImTime.get()*1e-9) + "/" +
-                                               String.format("%.3f", 
loopedConvBwdDataMatMultTime.get()*1e-9) + " sec.\n");
-                       }
+                       sb.append("LibMatrixDNN conv(im2col/matmult), bwdF 
(im2col/matmult), bwdD (col2im/matmult) time:\t" +
+                                       String.format("%.3f", 
loopedConvIm2ColTime.get()*1e-9) + "/" +
+                                       String.format("%.3f", 
loopedConvMatMultTime.get()*1e-9) + "/" + 
+                                       String.format("%.3f", 
loopedConvBwdFilterIm2ColTime.get()*1e-9) + "/" +
+                                       String.format("%.3f", 
loopedConvBwdFilterMatMultTime.get()*1e-9) + "/" +
+                                       String.format("%.3f", 
loopedConvBwdDataCol2ImTime.get()*1e-9) + "/" +
+                                       String.format("%.3f", 
loopedConvBwdDataMatMultTime.get()*1e-9) + " sec.\n");
                }
        }
        public static void resetStatistics() {
@@ -158,7 +156,7 @@ public class LibMatrixDNN {
                        params.bias.sparseToDense(); // Since bias is extremely 
small array
                
                if(isEligibleForConv2dSparse(params))
-                       Statistics.numNativeLibMatrixDNNCalls.increment();
+                       Statistics.numNativeSparseConv2dCalls.increment();
                
                runConvTask(TaskType.LoopedIm2ColConv2d, params);
                
@@ -179,7 +177,7 @@ public class LibMatrixDNN {
                checkInputsConv2dBackwardData(filter, dout, outputBlock, 
params);
                
                if(isEligibleForConv2dBackwardDataDense(params))
-                       Statistics.numNativeLibMatrixDNNCalls.increment();
+                       
Statistics.numNativeSparseConv2dBwdDataCalls.increment();
                
                runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params);
                
@@ -200,7 +198,7 @@ public class LibMatrixDNN {
                checkInputsConv2dBackwardFilter(input, dout, outputBlock, 
params);
                
                if(isEligibleForConv2dBackwardFilterSparseDense(params))
-                       Statistics.numNativeLibMatrixDNNCalls.increment();
+                       
Statistics.numNativeSparseConv2dBwdFilterCalls.increment();
                
                runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params);
                

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
index 4b12596..524218d 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java
@@ -18,6 +18,7 @@
  */
 package org.apache.sysml.runtime.matrix.data;
 
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.hops.OptimizerUtils;
 import org.apache.sysml.runtime.DMLRuntimeException;
 import org.apache.sysml.utils.NativeHelper;
@@ -60,9 +61,13 @@ public class LibMatrixNative {
                                !isMatMultMemoryBound(m1.rlen, m1.clen, 
m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) {
                        ret.sparse = false;
                        ret.allocateDenseBlock();
+                       long start = DMLScript.STATISTICS ? System.nanoTime() : 
0;
                        if (NativeHelper.matrixMultDenseDense(m1.denseBlock, 
m2.denseBlock, 
                                        ret.denseBlock, m1.getNumRows(), 
m1.getNumColumns(), m2.getNumColumns(), k)) {
-                               
Statistics.numNativeLibMatrixMultCalls.increment();
+                               if(DMLScript.STATISTICS) {
+                                       Statistics.nativeLibMatrixMultTime += 
System.nanoTime() - start;
+                                       
Statistics.numNativeLibMatrixMultCalls.increment();
+                               }
                                ret.recomputeNonZeros();
                                // post-processing (nnz maintained in parallel)
                                if(examSparsity)
@@ -94,12 +99,17 @@ public class LibMatrixNative {
                if(NativeHelper.isNativeLibraryLoaded() && 
!input.isInSparseFormat() && !filter.isInSparseFormat()) {
                        setNumThreads(params);
                        if(params.bias == null) {
-                               if(NativeHelper.conv2dDense(input.denseBlock, 
filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, 
params.W, 
+                               long start = DMLScript.STATISTICS ? 
System.nanoTime() : 0;
+                               int nnz = 
NativeHelper.conv2dDense(input.denseBlock, filter.denseBlock, 
outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
                                                params.K, params.R, params.S, 
params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
-                                               params.P, params.Q, 
params.numThreads)) {
-                                       
Statistics.numNativeLibMatrixDNNCalls.increment();
+                                               params.P, params.Q, 
params.numThreads);
+                               if(nnz != -1) {
+                                       if(DMLScript.STATISTICS) {
+                                               Statistics.nativeConv2dTime += 
System.nanoTime() - start;
+                                               
Statistics.numNativeConv2dCalls.increment();
+                                       }
                                        // post-processing: maintain nnz
-                                       outputBlock.recomputeNonZeros();
+                                       outputBlock.setNonZeros(nnz);
                                        return;
                                }
                                else {
@@ -110,13 +120,18 @@ public class LibMatrixNative {
                        else {
                                if(params.bias.isInSparseFormat())
                                        params.bias.sparseToDense(); // Bias 
matrix is usually extremely small
-                               
if(NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, 
filter.denseBlock, outputBlock.denseBlock, 
+                               long start = DMLScript.STATISTICS ? 
System.nanoTime() : 0;
+                               int nnz = 
NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, 
filter.denseBlock, outputBlock.denseBlock, 
                                                params.N, params.C, params.H, 
params.W, 
                                                params.K, params.R, params.S, 
params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
-                                               params.P, params.Q, 
params.numThreads)) {
-                                       
Statistics.numNativeLibMatrixDNNCalls.increment();
+                                               params.P, params.Q, 
params.numThreads);
+                               if(nnz != -1) {
+                                       if(DMLScript.STATISTICS) {
+                                               Statistics.nativeConv2dTime += 
System.nanoTime() - start;
+                                               
Statistics.numNativeConv2dCalls.increment();
+                                       }
                                        // post-processing: maintain nnz
-                                       outputBlock.recomputeNonZeros();
+                                       outputBlock.setNonZeros(nnz);
                                        return;
                                }
                                else {
@@ -150,12 +165,17 @@ public class LibMatrixNative {
                params.numThreads = params.numThreads <= 0 ? 
NativeHelper.getMaxNumThreads() : params.numThreads;
                if(NativeHelper.isNativeLibraryLoaded() && 
!dout.isInSparseFormat() && !input.isInSparseFormat()) {
                        setNumThreads(params);
-                       
if(NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, 
outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
-                                               params.K, params.R, params.S, 
params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
-                                               params.P, params.Q, 
params.numThreads)) {
-                               
Statistics.numNativeLibMatrixDNNCalls.increment();
+                       long start = DMLScript.STATISTICS ? System.nanoTime() : 
0;
+                       int nnz = 
NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, 
outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+                                       params.K, params.R, params.S, 
params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+                                       params.P, params.Q, params.numThreads);
+                       if(nnz != -1) {
+                               if(DMLScript.STATISTICS) {
+                                       Statistics.nativeConv2dBwdFilterTime += 
System.nanoTime() - start;
+                                       
Statistics.numNativeConv2dBwdFilterCalls.increment();
+                               }
                                // post-processing: maintain nnz
-                               outputBlock.recomputeNonZeros();
+                               outputBlock.setNonZeros(nnz);
                                return;
                        }
                        else {
@@ -181,12 +201,17 @@ public class LibMatrixNative {
                params.numThreads = params.numThreads <= 0 ? 
NativeHelper.getMaxNumThreads() : params.numThreads;
                if(NativeHelper.isNativeLibraryLoaded() && 
!dout.isInSparseFormat() && !filter.isInSparseFormat()) {
                        setNumThreads(params);
-                       
if(NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, 
outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
-                                               params.K, params.R, params.S, 
params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
-                                               params.P, params.Q, 
params.numThreads)) {
-                               
Statistics.numNativeLibMatrixDNNCalls.increment();
+                       long start = DMLScript.STATISTICS ? System.nanoTime() : 
0;
+                       int nnz = 
NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, 
outputBlock.denseBlock, params.N, params.C, params.H, params.W, 
+                                       params.K, params.R, params.S, 
params.stride_h, params.stride_w, params.pad_h, params.pad_w, 
+                                       params.P, params.Q, params.numThreads);
+                       if(nnz != -1) {
+                               if(DMLScript.STATISTICS) {
+                                       Statistics.nativeConv2dBwdDataTime += 
System.nanoTime() - start;
+                                       
Statistics.numNativeConv2dBwdDataCalls.increment();
+                               }
                                // post-processing: maintain nnz
-                               outputBlock.recomputeNonZeros();
+                               outputBlock.setNonZeros(nnz);
                                return;
                        }
                        else {

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
----------------------------------------------------------------------
diff --git 
a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java 
b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
index ff6a007..233350a 100644
--- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
+++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java
@@ -34,6 +34,7 @@ import java.util.stream.LongStream;
 
 import org.apache.commons.math3.random.Well1024a;
 import org.apache.hadoop.io.DataInputBuffer;
+import org.apache.sysml.api.DMLScript;
 import org.apache.sysml.conf.ConfigurationManager;
 import org.apache.sysml.hops.Hop.OpOp2;
 import org.apache.sysml.hops.OptimizerUtils;
@@ -86,6 +87,7 @@ import 
org.apache.sysml.runtime.util.FastBufferedDataOutputStream;
 import org.apache.sysml.runtime.util.IndexRange;
 import org.apache.sysml.runtime.util.UtilFunctions;
 import org.apache.sysml.utils.NativeHelper;
+import org.apache.sysml.utils.Statistics;
 
 
 
@@ -104,6 +106,8 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
        //basic header (int rlen, int clen, byte type)
        public static final int HEADER_SIZE = 9;
        
+       private static final boolean DISPLAY_STATISTICS = false; // Developer 
flag to measure performance overhead of various functions in this class
+       
        public enum BlockType{
                EMPTY_BLOCK,  
                ULTRA_SPARSE_BLOCK, //ultra sparse representation, in-mem same 
as sparse
@@ -336,6 +340,7 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                        allocateDenseBlock();
        }
        
+       @SuppressWarnings("unused")
        public void allocateDenseBlock(boolean clearNNZ) 
                        throws RuntimeException 
        {
@@ -350,7 +355,9 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                
                //allocate block if non-existing or too small (guaranteed to be 
0-initialized),
                if(denseBlock == null || denseBlock.length < limit) {
+                       long start = DISPLAY_STATISTICS && DMLScript.STATISTICS 
? System.nanoTime() : 0;
                        denseBlock = new double[(int)limit];
+                       Statistics.allocateDoubleArrTime += DISPLAY_STATISTICS 
&& DMLScript.STATISTICS ? (System.nanoTime() - start) : 0;
                }
                
                //clear nnz if necessary
@@ -986,9 +993,11 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
         * 
         * @throws DMLRuntimeException if DMLRuntimeException occurs
         */
+       @SuppressWarnings("unused")
        public void examSparsity() 
                throws DMLRuntimeException
        {
+               long start = DISPLAY_STATISTICS && DMLScript.STATISTICS ? 
System.nanoTime() : 0;
                //determine target representation
                boolean sparseDst = evalSparseFormatInMemory(); 
                                
@@ -1002,6 +1011,8 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                        sparseToDense();
                else if( !sparse && sparseDst )
                        denseToSparse();
+               
+               Statistics.examSparsityTime += DISPLAY_STATISTICS && 
DMLScript.STATISTICS ? (System.nanoTime() - start) : 0;
        }
        
        /**
@@ -1141,6 +1152,7 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
         * of the entire matrix block.
         * 
         */
+       @SuppressWarnings("unused")
        public void recomputeNonZeros()
        {
                if( sparse && sparseBlock!=null ) //SPARSE (max long)
@@ -1150,12 +1162,14 @@ public class MatrixBlock extends MatrixValue implements 
CacheBlock, Externalizab
                }
                else if( !sparse && denseBlock!=null ) //DENSE (max int)
                {
+                       long start = DISPLAY_STATISTICS && DMLScript.STATISTICS 
? System.nanoTime() : 0;
                        double[] a = denseBlock;
                        final int limit=rlen*clen;
                        int nnz = 0;
                        for(int i=0; i<limit; i++)
                                nnz += (a[i]!=0) ? 1 : 0;
                        nonZeros = nnz;
+                       Statistics.recomputeNNZTime += DISPLAY_STATISTICS && 
DMLScript.STATISTICS ? (System.nanoTime() - start) : 0;
                }
        }
        

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java 
b/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java
index 9c027d3..a88f230 100644
--- a/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java
+++ b/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java
@@ -39,7 +39,7 @@ import org.apache.sysml.udf.Matrix.ValueType;
  * Assumption: the input batch fits in CP (which is also the assumption of 
most deep learning systems).
  * 
  * Usage:
- * update_nesterov = externalFunction(matrix[double] X, matrix[double] dX, 
double lr, double mu, matrix[double] v) return (matrix[double] X, 
matrix[double] v) implemented in 
(classname="org.apache.sysml.udf.lib.SGDNesterovUpdate",exectype="mem");
+ * update_nesterov = externalFunction(matrix[double] X, matrix[double] dX, 
double lr, double mu, matrix[double] v, double lambda) return (matrix[double] 
X, matrix[double] v) implemented in 
(classname="org.apache.sysml.udf.lib.SGDNesterovUpdate",exectype="mem");
 * [X, v] = update_nesterov(X, dX, lr, mu, v, lambda);
  * 
  * 
@@ -81,16 +81,19 @@ public class SGDNesterovUpdate extends PackageFunction {
                        double mu = 
Double.parseDouble(((Scalar)getFunctionInput(3)).getValue());
                        MatrixBlock v = ((Matrix) 
getFunctionInput(4)).getMatrixObject().acquireRead();
                        
-                       // v = mu * v - lr * dX
+                       double lambda = 
Double.parseDouble(((Scalar)getFunctionInput(5)).getValue());
+                       
+                       // v = mu * v - lr * dX - lr*lambda*X
                        updatedV = new Matrix( "tmp_" + rand.nextLong(), 
v.getNumRows(), v.getNumColumns(), ValueType.Double );
                        MatrixBlock updatedVMB = 
allocateDenseMatrixBlock(updatedV);
                        double [] updatedVData = updatedVMB.getDenseBlock();
-                       if(isDense(v) && isDense(dX)) {
+                       if(isDense(v) && isDense(dX) && isDense(X)) {
                                double [] vArr = v.getDenseBlock();
                                double [] dXArr = dX.getDenseBlock();
+                               double [] XArr = X.getDenseBlock();
                                int nnz = 0;
                                for(int i = 0; i < updatedVData.length; i++) {
-                                       updatedVData[i] = mu*vArr[i] - 
lr*dXArr[i];
+                                       updatedVData[i] = mu*vArr[i] - 
lr*dXArr[i] - lr*lambda*XArr[i];
                                        nnz += (updatedVData[i]!=0) ? 1 : 0;
                                }
                                updatedVMB.setNonZeros(nnz); 
@@ -98,8 +101,10 @@ public class SGDNesterovUpdate extends PackageFunction {
                        else {
                                multiplyByConstant(v, mu, updatedVData);
                                multiplyByConstant(dX, -lr, updatedVData);
+                               multiplyByConstant(X, -lr*lambda, updatedVData);
                                updatedVMB.recomputeNonZeros();
                        }
+                       
                        updatedV.setMatrixDoubleArray(updatedVMB, 
OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo);
                        
                        // X = X - mu * v_prev + (1 + mu) * v

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/utils/NativeHelper.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java 
b/src/main/java/org/apache/sysml/utils/NativeHelper.java
index 129824b..fe5e085 100644
--- a/src/main/java/org/apache/sysml/utils/NativeHelper.java
+++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java
@@ -254,17 +254,18 @@ public class NativeHelper {
        // K = number of filters, R = filter height, S = filter width
        // TODO: case not handled: sparse filters (which will only be executed 
in Java). Since filters are relatively smaller, this is a low priority.
        
+       // Returns -1 if failures or returns number of nonzeros
        // Called by ConvolutionCPInstruction if both input and filter are dense
-       public static native boolean conv2dDense(double [] input, double [] 
filter, double [] ret, int N, int C, int H, int W, 
+       public static native int conv2dDense(double [] input, double [] filter, 
double [] ret, int N, int C, int H, int W, 
                        int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int P, int Q, int numThreads);
-       public static native boolean conv2dBiasAddDense(double [] input, double 
[] bias, double [] filter, double [] ret, int N, int C, int H, int W, 
+       public static native int conv2dBiasAddDense(double [] input, double [] 
bias, double [] filter, double [] ret, int N, int C, int H, int W, 
                        int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int P, int Q, int numThreads);
        // Called by ConvolutionCPInstruction if both input and filter are dense
-       public static native boolean conv2dBackwardFilterDense(double [] input, 
double [] dout, double [] ret, int N, int C, int H, int W, 
+       public static native int conv2dBackwardFilterDense(double [] input, 
double [] dout, double [] ret, int N, int C, int H, int W, 
                        int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int P, int Q, int numThreads);
        // If both filter and dout are dense, then called by 
ConvolutionCPInstruction
        // Else, called by LibMatrixDNN's thread if filter is dense. dout[n] is 
converted to dense if sparse.
-       public static native boolean conv2dBackwardDataDense(double [] filter, 
double [] dout, double [] ret, int N, int C, int H, int W, 
+       public static native int conv2dBackwardDataDense(double [] filter, 
double [] dout, double [] ret, int N, int C, int H, int W, 
                        int K, int R, int S, int stride_h, int stride_w, int 
pad_h, int pad_w, int P, int Q, int numThreads);
        
        // Currently only supported with numThreads = 1 and sparse input

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/utils/Statistics.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java 
b/src/main/java/org/apache/sysml/utils/Statistics.java
index 97888cb..49633d8 100644
--- a/src/main/java/org/apache/sysml/utils/Statistics.java
+++ b/src/main/java/org/apache/sysml/utils/Statistics.java
@@ -114,7 +114,21 @@ public class Statistics
        
        private static LongAdder numNativeFailures = new LongAdder();
        public static LongAdder numNativeLibMatrixMultCalls = new LongAdder();
-       public static LongAdder numNativeLibMatrixDNNCalls = new LongAdder();
+       public static LongAdder numNativeConv2dCalls = new LongAdder();
+       public static LongAdder numNativeConv2dBwdDataCalls = new LongAdder();
+       public static LongAdder numNativeConv2dBwdFilterCalls = new LongAdder();
+       public static LongAdder numNativeSparseConv2dCalls = new LongAdder();
+       public static LongAdder numNativeSparseConv2dBwdFilterCalls = new 
LongAdder();
+       public static LongAdder numNativeSparseConv2dBwdDataCalls = new 
LongAdder();
+       public static long nativeLibMatrixMultTime = 0;
+       public static long nativeConv2dTime = 0;
+       public static long nativeConv2dBwdDataTime = 0;
+       public static long nativeConv2dBwdFilterTime = 0;
+       
+       public static long recomputeNNZTime = 0;
+       public static long examSparsityTime = 0;
+       public static long allocateDoubleArrTime = 0;
+       
        public static void incrementNativeFailuresCounter() {
                numNativeFailures.increment();
                // This is very rare and it is unclear whether it is possible at all. Our 
initial experiments never encountered this case.
@@ -378,8 +392,17 @@ public class Statistics
 
                GPUStatistics.reset();
                numNativeLibMatrixMultCalls.reset();
-               numNativeLibMatrixDNNCalls.reset();
+               numNativeSparseConv2dCalls.reset();
+               numNativeSparseConv2dBwdDataCalls.reset();
+               numNativeSparseConv2dBwdFilterCalls.reset();
+               numNativeConv2dCalls.reset();
+               numNativeConv2dBwdDataCalls.reset();
+               numNativeConv2dBwdFilterCalls.reset();
                numNativeFailures.reset();
+               nativeLibMatrixMultTime = 0;
+               nativeConv2dTime = 0;
+               nativeConv2dBwdFilterTime = 0;
+               nativeConv2dBwdDataTime = 0;
                LibMatrixDNN.resetStatistics();
        }
 
@@ -635,11 +658,23 @@ public class Statistics
                //show extended caching/compilation statistics
                if( DMLScript.STATISTICS ) 
                {
-                       if(NativeHelper.blasType != null && 
(numNativeLibMatrixMultCalls.longValue() > 0 || 
-                                       numNativeLibMatrixDNNCalls.longValue() 
> 0)) {
+                       if(NativeHelper.blasType != null) {
                                String blas = NativeHelper.blasType != null ? 
NativeHelper.blasType : ""; 
-                               sb.append("Native " + blas + " calls 
(LibMatrixMult/LibMatrixDNN):\t" + numNativeLibMatrixMultCalls.longValue()  + 
"/" + numNativeLibMatrixDNNCalls.longValue() + ".\n");
+                               sb.append("Native " + blas + " calls (dense 
mult/conv/bwdF/bwdD):\t" + numNativeLibMatrixMultCalls.longValue()  + "/" + 
+                                               
numNativeConv2dCalls.longValue() + "/" + 
numNativeConv2dBwdFilterCalls.longValue()
+                                               + "/" + 
numNativeConv2dBwdDataCalls.longValue() + ".\n");
+                               sb.append("Native " + blas + " calls (sparse 
conv/bwdF/bwdD):\t" +  
+                                               
numNativeSparseConv2dCalls.longValue() + "/" + 
numNativeSparseConv2dBwdFilterCalls.longValue()
+                                               + "/" + 
numNativeSparseConv2dBwdDataCalls.longValue() + ".\n");
+                               sb.append("Native " + blas + " times (dense 
mult/conv/bwdF/bwdD):\t" + String.format("%.3f", nativeLibMatrixMultTime*1e-9) 
+ "/" +
+                                               String.format("%.3f", 
nativeConv2dTime*1e-9) + "/" + String.format("%.3f", 
nativeConv2dBwdFilterTime*1e-9) + "/" + 
+                                               String.format("%.3f", 
nativeConv2dBwdDataTime*1e-9) + ".\n");
+                       }
+                       if(recomputeNNZTime != 0 || examSparsityTime != 0 || 
allocateDoubleArrTime != 0) {
+                               sb.append("MatrixBlock times 
(recomputeNNZ/examSparsity/allocateDoubleArr):\t" + String.format("%.3f", 
recomputeNNZTime*1e-9) + "/" +
+                                       String.format("%.3f", 
examSparsityTime*1e-9) + "/" + String.format("%.3f", 
allocateDoubleArrTime*1e-9)  + ".\n");
                        }
+                       
                        sb.append("Cache hits (Mem, WB, FS, HDFS):\t" + 
CacheStatistics.displayHits() + ".\n");
                        sb.append("Cache writes (WB, FS, HDFS):\t" + 
CacheStatistics.displayWrites() + ".\n");
                        sb.append("Cache times (ACQr/m, RLS, EXP):\t" + 
CacheStatistics.displayTime() + " sec.\n");

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala 
b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
index 377ebf3..f7f85c3 100644
--- a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
+++ b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala
@@ -291,7 +291,7 @@ class Caffe2DML(val sc: SparkContext, val 
solverParam:Caffe.SolverParameter,
          appendVisualizationHeaders(dmlScript, numTabs)
          
          if(Caffe2DML.USE_NESTEROV_UDF) {
-           tabDMLScript(dmlScript, numTabs).append("update_nesterov = 
externalFunction(matrix[double] X, matrix[double] dX, double lr, double mu, 
matrix[double] v) return (matrix[double] X, matrix[double] v) implemented in 
(classname=\"org.apache.sysml.udf.lib.SGDNesterovUpdate\",exectype=\"mem\");  
\n")
+           tabDMLScript(dmlScript, numTabs).append("update_nesterov = 
externalFunction(matrix[double] X, matrix[double] dX, double lr, double mu, 
matrix[double] v, double lambda) return (matrix[double] X, matrix[double] v) 
implemented in 
(classname=\"org.apache.sysml.udf.lib.SGDNesterovUpdate\",exectype=\"mem\");  
\n")
          }
          
          // Read and convert to one-hot encoding

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala
----------------------------------------------------------------------
diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala 
b/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala
index ae3d21d..0620e44 100644
--- a/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala
+++ b/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala
@@ -144,12 +144,15 @@ class AdaGrad(lambda:Double=5e-04, epsilon:Double=1e-6) 
extends CaffeSolver {
  */
 class Nesterov(lambda:Double=5e-04, momentum:Double=0.9) extends CaffeSolver {
   def update(dmlScript:StringBuilder, layer:CaffeLayer):Unit = {
-    l2reg_update(lambda, dmlScript, layer)
     val fn = if(Caffe2DML.USE_NESTEROV_UDF) "update_nesterov" else 
"sgd_nesterov::update"
+    val lastParameter = if(Caffe2DML.USE_NESTEROV_UDF) (", " + lambda) else ""
+    if(!Caffe2DML.USE_NESTEROV_UDF) {
+      l2reg_update(lambda, dmlScript, layer)
+    }
     if(layer.shouldUpdateWeight) dmlScript.append("\t").append("["+ 
commaSep(layer.weight, layer.weight+"_v") + "] " + 
-        "= " + fn + "(" + commaSep(layer.weight, layer.dWeight, 
getWeightLr(layer), momentum.toString, layer.weight+"_v") + ")\n")
+        "= " + fn + "(" + commaSep(layer.weight, layer.dWeight, 
getWeightLr(layer), momentum.toString, layer.weight+"_v") + lastParameter + 
")\n")
     if(layer.shouldUpdateBias) dmlScript.append("\t").append("["+ 
commaSep(layer.bias, layer.bias+"_v") + "] " + 
-        "= " + fn + "(" + commaSep(layer.bias, layer.dBias, getBiasLr(layer), 
momentum.toString, layer.bias+"_v") + ")\n")
+        "= " + fn + "(" + commaSep(layer.bias, layer.dBias, getBiasLr(layer), 
momentum.toString, layer.bias+"_v") + lastParameter + ")\n")
   }
   def init(dmlScript:StringBuilder, layer:CaffeLayer):Unit = {
     if(layer.shouldUpdateWeight) dmlScript.append(layer.weight+"_v = 
sgd_nesterov::init(" + layer.weight + ")\n")


Reply via email to