Repository: incubator-systemml
Updated Branches:
  refs/heads/master b8de68b74 -> 5489c665d
[SYSTEMML-540] [MINOR] Added additional performance counters for native invocation and improved SGDNesterov Closes #485. Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/5489c665 Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/5489c665 Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/5489c665 Branch: refs/heads/master Commit: 5489c665dcb434d88c165492557ef14ad284b69f Parents: b8de68b Author: Niketan Pansare <[email protected]> Authored: Mon May 8 20:55:24 2017 -0700 Committer: Niketan Pansare <[email protected]> Committed: Mon May 8 20:56:11 2017 -0700 ---------------------------------------------------------------------- .../cpp/lib/libsystemml_mkl-Linux-x86_64.so | Bin 27408 -> 31824 bytes .../lib/libsystemml_openblas-Linux-x86_64.so | Bin 27416 -> 27520 bytes src/main/cpp/libmatrixdnn.cpp | 54 ++++++++++------ src/main/cpp/libmatrixdnn.h | 6 +- src/main/cpp/systemml.cpp | 32 +++++----- src/main/cpp/systemml.h | 29 ++++----- .../sysml/runtime/matrix/data/LibMatrixDNN.java | 24 ++++--- .../runtime/matrix/data/LibMatrixNative.java | 63 +++++++++++++------ .../sysml/runtime/matrix/data/MatrixBlock.java | 14 +++++ .../apache/sysml/udf/lib/SGDNesterovUpdate.java | 13 ++-- .../org/apache/sysml/utils/NativeHelper.java | 9 +-- .../java/org/apache/sysml/utils/Statistics.java | 45 +++++++++++-- .../org/apache/sysml/api/dl/Caffe2DML.scala | 2 +- .../org/apache/sysml/api/dl/CaffeSolver.scala | 9 ++- 14 files changed, 199 insertions(+), 101 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so ---------------------------------------------------------------------- diff --git a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so index 0a6427a..a740930 100755 Binary files a/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_mkl-Linux-x86_64.so differ http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so ---------------------------------------------------------------------- diff --git a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so index ffdcd5a..dfd1ecb 100755 Binary files a/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so and b/src/main/cpp/lib/libsystemml_openblas-Linux-x86_64.so differ http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/libmatrixdnn.cpp ---------------------------------------------------------------------- diff --git a/src/main/cpp/libmatrixdnn.cpp b/src/main/cpp/libmatrixdnn.cpp index a521804..ed02042 100644 --- a/src/main/cpp/libmatrixdnn.cpp +++ b/src/main/cpp/libmatrixdnn.cpp @@ -27,6 +27,14 @@ #include <cstring> #include "omp.h" +int computeNNZ(double* arr, int limit) { + int nnz = 0; + #pragma omp parallel for reduction(+: nnz) + for(int i=0; i<limit; i++) + nnz += (arr[i]!=0) ? 
1 : 0; + return nnz; +} + void rotate180(double* inputArray, double* outputArray, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q) { @@ -124,7 +132,7 @@ void im2col(double* inputArray, double* outputArray, int N, int C, int H, int W, } -void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, +int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) { // First step: Avoids oversubscription and other openmp/internal blas threading issues setNumThreadsForBLAS(1); @@ -150,7 +158,8 @@ void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr #pragma omp parallel for num_threads(numOpenMPThreads) for (int n = 0; n < N; n++) { - double* loweredMat = loweredMatArrays + numIm2ColElem*omp_get_thread_num(); + int threadID = omp_get_thread_num(); + double* loweredMat = loweredMatArrays + numIm2ColElem*threadID; // Step 1: Perform im2col im2col(inputPtr + n * CHW, loweredMat, 1, C, H, W, K, @@ -158,18 +167,22 @@ void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr P, Q); // Step 2: Rotate dout - double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*omp_get_thread_num(); + double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*threadID; rotate180(doutPtr + n * KPQ, rotatedDoutPtr, 1, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, P, Q); - // Multiply to get CRS X K - double* temp1 = temp + numTempElem*omp_get_thread_num(); - // Step 3: loweredMat (CRS X PQ) %*% rotated_dout (PQ X K) - matmult(loweredMat, rotatedDoutPtr, temp1, C * R * S, P * Q, K, 1); - + // Multiply to get tmp1 = CRS X K + double* temp1 = temp + numTempElem*threadID; + // Step 3: temp1 = alpha * (loweredMat (CRS X PQ) %*% rotated_dout (PQ X K)) + beta*temp1 + int m1rlen = C * R * S; int m1clen = P * Q; int m2clen = K; + double* m1Ptr = loweredMat; double* m2Ptr = rotatedDoutPtr; double alpha = 1; double beta = 1; + cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, m1rlen, m2clen, m1clen, alpha, m1Ptr, m1clen, m2Ptr, m2clen, beta, temp1, m2clen); } // end omp parallel for + delete [] loweredMatArrays; + delete [] rotatedDoutPtrArrays; + // Inplace transpose addition int numRow = CRS; for(int t = 0; t < numOpenMPThreads; t++) { @@ -184,11 +197,10 @@ void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr } delete [] temp; - delete [] loweredMatArrays; - delete [] rotatedDoutPtrArrays; + return computeNNZ(retPtr, K*CRS); } -void conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, +int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads) { // First step: Avoids oversubscription and other openmp/internal blas threading issues setNumThreadsForBLAS(1); @@ -207,27 +219,28 @@ void conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, #pragma omp parallel for num_threads(numOpenMPThreads) for (int n = 0; n < N; n++) { + int threadID = omp_get_thread_num(); // Step 1: Rotate dout - double* rotatedDoutPtr = rotatedDoutPtrArrays + numRotatedElem*omp_get_thread_num(); + double* rotatedDoutPtr = rotatedDoutPtrArrays + 
numRotatedElem*threadID; rotate180(doutPtr + n * KPQ, rotatedDoutPtr, 1, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, P, Q); // Step 2: t(rotatedDout (PQ X K) %*% filter (K X CRS)) - double* col2imInput = col2imInputArrays + numCol2ImElem*omp_get_thread_num(); + double* col2imInput = col2imInputArrays + numCol2ImElem*threadID; matmult(rotatedDoutPtr, filterPtr, col2imInput, PQ, K, CRS, 1); // Step 3: Perform col2im - col2im(col2imInput, retPtr + n * CHW, 1, C, H, W, K, + double* outputArr = retPtr + n * CHW; + col2im(col2imInput, outputArr, 1, C, H, W, K, R, S, stride_h, stride_w, pad_h, pad_w, P, Q); - } // end omp parallel for delete [] rotatedDoutPtrArrays; delete [] col2imInputArrays; - + return computeNNZ(retPtr, N*CHW); } void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filterPtr, double* retPtr, int N, int C, int H, int W, @@ -290,7 +303,8 @@ void conv2dBackwardFilterSparseDense(int apos, int alen, int* aix, double* avals delete [] temp1; } -void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, + +int conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads) { // First step: Avoids oversubscription and other openmp/internal blas threading issues setNumThreadsForBLAS(1); @@ -306,7 +320,8 @@ void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do #pragma omp parallel for num_threads(numOpenMPThreads) for (int n = 0; n < N; n++) { - double* loweredMat = loweredMatArrays + numIm2ColElem*omp_get_thread_num(); + int threadID = omp_get_thread_num(); + double* loweredMat = loweredMatArrays + numIm2ColElem*threadID; // Step 1: Perform im2col im2col(inputPtr + n * CHW, loweredMat, 1, C, H, W, K, @@ -318,8 +333,8 @@ void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do C * R * S, P * Q, 1); // Step 3: Add bias + double* outputArr = retPtr + n*KPQ; if(addBias) { - double* outputArr = retPtr + n*KPQ; int index = 0; for(int k = 0; k < K; k++) { for(int pq = 0; pq < PQ; pq++, index++) { @@ -330,4 +345,5 @@ void conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, do } // end omp parallel for delete [] loweredMatArrays; + return computeNNZ(retPtr, N*KPQ); } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/libmatrixdnn.h ---------------------------------------------------------------------- diff --git a/src/main/cpp/libmatrixdnn.h b/src/main/cpp/libmatrixdnn.h index bf6c113..86e7b24 100644 --- a/src/main/cpp/libmatrixdnn.h +++ b/src/main/cpp/libmatrixdnn.h @@ -20,13 +20,13 @@ #ifndef _libmatrixdnn_h #define _libmatrixdnn_h -void conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, +int conv2dBackwardFilterDense(double* inputPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); -void conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, +int conv2dBackwardDataDense(double* filterPtr, double* doutPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); -void 
conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, +int conv2dBiasAddDense(double* inputPtr, double* biasPtr, double* filterPtr, double* retPtr, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, bool addBias, int numThreads); void conv2dSparse(int apos, int alen, int* aix, double* avals, double* filter, double* ret, int N, int C, int H, int W, http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/systemml.cpp ---------------------------------------------------------------------- diff --git a/src/main/cpp/systemml.cpp b/src/main/cpp/systemml.cpp index 41ce0bc..34ae248 100644 --- a/src/main/cpp/systemml.cpp +++ b/src/main/cpp/systemml.cpp @@ -144,7 +144,7 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwa return (jboolean) true; } -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense( +JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense( JNIEnv* env, jclass, jdoubleArray input, jdoubleArray filter, jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { @@ -152,18 +152,18 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense( double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(inputPtr == NULL || filterPtr == NULL || retPtr == NULL) - return (jboolean) false; + return (jint) -1; - conv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, + int nnz = conv2dBiasAddDense(inputPtr, 0, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, false, (int) numThreads); RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads); RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads); RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); - return (jboolean) true; + return (jint) nnz; } -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense( +JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense( JNIEnv* env, jclass, jdoubleArray input, jdoubleArray bias, jdoubleArray filter, jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { @@ -173,19 +173,19 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAd double* filterPtr = GET_DOUBLE_ARRAY(env, filter, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(inputPtr == NULL || biasPtr == NULL || filterPtr == NULL || retPtr == NULL) - return (jboolean) false; + return (jint) -1; - conv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, + int nnz = conv2dBiasAddDense(inputPtr, biasPtr, filterPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, true, (int) numThreads); RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads); RELEASE_INPUT_DOUBLE_ARRAY(env, bias, biasPtr, numThreads); RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads); 
RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); - return (jboolean) true; + return (jint) nnz; } -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense( +JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense( JNIEnv* env, jclass, jdoubleArray filter, jdoubleArray dout, jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { @@ -194,18 +194,18 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwa double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(doutPtr == NULL || filterPtr == NULL || retPtr == NULL) - return (jboolean) false; + return (jint) -1; - conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, + int nnz = conv2dBackwardDataDense(filterPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads); RELEASE_INPUT_DOUBLE_ARRAY(env, filter, filterPtr, numThreads); RELEASE_INPUT_DOUBLE_ARRAY(env, dout, doutPtr, numThreads); RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); - return (jboolean) true; + return (jint) nnz; } -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense( +JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense( JNIEnv* env, jclass, jdoubleArray input, jdoubleArray dout, jdoubleArray ret, jint N, jint C, jint H, jint W, jint K, jint R, jint S, jint stride_h, jint stride_w, jint pad_h, jint pad_w, jint P, jint Q, jint numThreads) { @@ -213,13 +213,13 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwa double* doutPtr = GET_DOUBLE_ARRAY(env, dout, numThreads); double* retPtr = GET_DOUBLE_ARRAY(env, ret, numThreads); if(doutPtr == NULL || inputPtr == NULL || retPtr == NULL) - return (jboolean) false; + return (jint) -1; - conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, + int nnz = conv2dBackwardFilterDense(inputPtr, doutPtr, retPtr, (int) N, (int) C, (int) H, (int) W, (int) K, (int) R, (int) S, (int) stride_h, (int) stride_w, (int) pad_h, (int) pad_w, (int) P, (int) Q, (int) numThreads); RELEASE_INPUT_DOUBLE_ARRAY(env, input, inputPtr, numThreads); RELEASE_INPUT_DOUBLE_ARRAY(env, dout, doutPtr, numThreads); RELEASE_DOUBLE_ARRAY(env, ret, retPtr, numThreads); - return (jboolean) true; + return (jint) nnz; } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/cpp/systemml.h ---------------------------------------------------------------------- diff --git a/src/main/cpp/systemml.h b/src/main/cpp/systemml.h index ac36495..f6f5cd2 100644 --- a/src/main/cpp/systemml.h +++ b/src/main/cpp/systemml.h @@ -45,49 +45,49 @@ JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_tsmm /* * Class: org_apache_sysml_utils_NativeHelper * Method: conv2dDense - * Signature: ([D[D[DIIIIIIIIIIIIII)Z + * Signature: ([D[D[DIIIIIIIIIIIIII)I */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense +JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dDense (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, 
jint, jint); /* * Class: org_apache_sysml_utils_NativeHelper * Method: conv2dBiasAddDense - * Signature: ([D[D[D[DIIIIIIIIIIIIII)Z + * Signature: ([D[D[D[DIIIIIIIIIIIIII)I */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense +JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBiasAddDense (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint); /* * Class: org_apache_sysml_utils_NativeHelper - * Method: conv2dBackwardDataDense - * Signature: ([D[D[DIIIIIIIIIIIIII)Z + * Method: conv2dBackwardFilterDense + * Signature: ([D[D[DIIIIIIIIIIIIII)I */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense +JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint); /* * Class: org_apache_sysml_utils_NativeHelper - * Method: conv2dBackwardFilterDense - * Signature: ([D[D[DIIIIIIIIIIIIII)Z + * Method: conv2dBackwardDataDense + * Signature: ([D[D[DIIIIIIIIIIIIII)I */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterDense +JNIEXPORT jint JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardDataDense (JNIEnv *, jclass, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint); /* * Class: org_apache_sysml_utils_NativeHelper - * Method: conv2dSparse + * Method: conv2dBackwardFilterSparseDense * Signature: (II[I[D[D[DIIIIIIIIIIIIII)Z */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dSparse +JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterSparseDense (JNIEnv *, jclass, jint, jint, jintArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint); /* * Class: org_apache_sysml_utils_NativeHelper - * Method: conv2dBackwardFilterSparse + * Method: conv2dSparse * Signature: (II[I[D[D[DIIIIIIIIIIIIII)Z */ -JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dBackwardFilterSparseDense +JNIEXPORT jboolean JNICALL Java_org_apache_sysml_utils_NativeHelper_conv2dSparse (JNIEnv *, jclass, jint, jint, jintArray, jdoubleArray, jdoubleArray, jdoubleArray, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint, jint); /* @@ -103,4 +103,5 @@ JNIEXPORT void JNICALL Java_org_apache_sysml_utils_NativeHelper_setMaxNumThreads #endif #endif + \ No newline at end of file http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java index e4d3ba2..ab82697 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixDNN.java @@ -84,7 +84,7 @@ public class LibMatrixDNN { private static AtomicLong loopedConvBwdDataCol2ImTime = new AtomicLong(0); public static void appendStatistics(StringBuilder sb) { - if(DMLScript.STATISTICS && DISPLAY_STATISTICS && (conv2dDenseCount.get() != 0 || 
conv2dSparseCount.get() != 0)) { + if(DMLScript.STATISTICS && DISPLAY_STATISTICS) { sb.append("LibMatrixDNN dense count (conv/bwdF/bwdD/im2col/maxBwd):\t" + conv2dDenseCount.get() + "/" + conv2dBwdFilterDenseCount.get() + "/" @@ -97,15 +97,13 @@ public class LibMatrixDNN { + conv2dBwdDataSparseCount.get() + "/" + im2colSparseCount.get() + "/" + maxPoolBwdSparseCount.get() + ".\n"); - if(loopedConvMatMultTime.get() != 0 || loopedConvIm2ColTime.get() != 0) { - sb.append("LibMatrixDNN conv(im2col/matmult), bwdF (im2col/matmult), bwdD (col2im/matmult) time:\t" + - String.format("%.3f", loopedConvIm2ColTime.get()*1e-9) + "/" + - String.format("%.3f", loopedConvMatMultTime.get()*1e-9) + "/" + - String.format("%.3f", loopedConvBwdFilterIm2ColTime.get()*1e-9) + "/" + - String.format("%.3f", loopedConvBwdFilterMatMultTime.get()*1e-9) + "/" + - String.format("%.3f", loopedConvBwdDataCol2ImTime.get()*1e-9) + "/" + - String.format("%.3f", loopedConvBwdDataMatMultTime.get()*1e-9) + " sec.\n"); - } + sb.append("LibMatrixDNN conv(im2col/matmult), bwdF (im2col/matmult), bwdD (col2im/matmult) time:\t" + + String.format("%.3f", loopedConvIm2ColTime.get()*1e-9) + "/" + + String.format("%.3f", loopedConvMatMultTime.get()*1e-9) + "/" + + String.format("%.3f", loopedConvBwdFilterIm2ColTime.get()*1e-9) + "/" + + String.format("%.3f", loopedConvBwdFilterMatMultTime.get()*1e-9) + "/" + + String.format("%.3f", loopedConvBwdDataCol2ImTime.get()*1e-9) + "/" + + String.format("%.3f", loopedConvBwdDataMatMultTime.get()*1e-9) + " sec.\n"); } } public static void resetStatistics() { @@ -158,7 +156,7 @@ public class LibMatrixDNN { params.bias.sparseToDense(); // Since bias is extremely small array if(isEligibleForConv2dSparse(params)) - Statistics.numNativeLibMatrixDNNCalls.increment(); + Statistics.numNativeSparseConv2dCalls.increment(); runConvTask(TaskType.LoopedIm2ColConv2d, params); @@ -179,7 +177,7 @@ public class LibMatrixDNN { checkInputsConv2dBackwardData(filter, dout, outputBlock, params); if(isEligibleForConv2dBackwardDataDense(params)) - Statistics.numNativeLibMatrixDNNCalls.increment(); + Statistics.numNativeSparseConv2dBwdDataCalls.increment(); runConvTask(TaskType.LoopedIm2ColConv2dBwdData, params); @@ -200,7 +198,7 @@ public class LibMatrixDNN { checkInputsConv2dBackwardFilter(input, dout, outputBlock, params); if(isEligibleForConv2dBackwardFilterSparseDense(params)) - Statistics.numNativeLibMatrixDNNCalls.increment(); + Statistics.numNativeSparseConv2dBwdFilterCalls.increment(); runConvTask(TaskType.LoopedIm2ColConv2dBwdFilter, params); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java index 4b12596..524218d 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/LibMatrixNative.java @@ -18,6 +18,7 @@ */ package org.apache.sysml.runtime.matrix.data; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.hops.OptimizerUtils; import org.apache.sysml.runtime.DMLRuntimeException; import org.apache.sysml.utils.NativeHelper; @@ -60,9 +61,13 @@ public class LibMatrixNative { !isMatMultMemoryBound(m1.rlen, m1.clen, m2.clen) && !m1.isInSparseFormat() && !m2.isInSparseFormat()) { ret.sparse = false; 
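// Note on the instrumentation added below: the start timestamp is taken only
// when DMLScript.STATISTICS is enabled, and the elapsed time is folded into
// Statistics.nativeLibMatrixMultTime only after a successful native call, so
// the non-statistics path pays no timing overhead.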
ret.allocateDenseBlock(); + long start = DMLScript.STATISTICS ? System.nanoTime() : 0; if (NativeHelper.matrixMultDenseDense(m1.denseBlock, m2.denseBlock, ret.denseBlock, m1.getNumRows(), m1.getNumColumns(), m2.getNumColumns(), k)) { - Statistics.numNativeLibMatrixMultCalls.increment(); + if(DMLScript.STATISTICS) { + Statistics.nativeLibMatrixMultTime += System.nanoTime() - start; + Statistics.numNativeLibMatrixMultCalls.increment(); + } ret.recomputeNonZeros(); // post-processing (nnz maintained in parallel) if(examSparsity) @@ -94,12 +99,17 @@ public class LibMatrixNative { if(NativeHelper.isNativeLibraryLoaded() && !input.isInSparseFormat() && !filter.isInSparseFormat()) { setNumThreads(params); if(params.bias == null) { - if(NativeHelper.conv2dDense(input.denseBlock, filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, + long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + int nnz = NativeHelper.conv2dDense(input.denseBlock, filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, - params.P, params.Q, params.numThreads)) { - Statistics.numNativeLibMatrixDNNCalls.increment(); + params.P, params.Q, params.numThreads); + if(nnz != -1) { + if(DMLScript.STATISTICS) { + Statistics.nativeConv2dTime += System.nanoTime() - start; + Statistics.numNativeConv2dCalls.increment(); + } // post-processing: maintain nnz - outputBlock.recomputeNonZeros(); + outputBlock.setNonZeros(nnz); return; } else { @@ -110,13 +120,18 @@ public class LibMatrixNative { else { if(params.bias.isInSparseFormat()) params.bias.sparseToDense(); // Bias matrix is usually extremely small - if(NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, filter.denseBlock, outputBlock.denseBlock, + long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + int nnz = NativeHelper.conv2dBiasAddDense(input.denseBlock, params.bias.denseBlock, filter.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, - params.P, params.Q, params.numThreads)) { - Statistics.numNativeLibMatrixDNNCalls.increment(); + params.P, params.Q, params.numThreads); + if(nnz != -1) { + if(DMLScript.STATISTICS) { + Statistics.nativeConv2dTime += System.nanoTime() - start; + Statistics.numNativeConv2dCalls.increment(); + } // post-processing: maintain nnz - outputBlock.recomputeNonZeros(); + outputBlock.setNonZeros(nnz); return; } else { @@ -150,12 +165,17 @@ public class LibMatrixNative { params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads; if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !input.isInSparseFormat()) { setNumThreads(params); - if(NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, - params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, - params.P, params.Q, params.numThreads)) { - Statistics.numNativeLibMatrixDNNCalls.increment(); + long start = DMLScript.STATISTICS ? 
System.nanoTime() : 0; + int nnz = NativeHelper.conv2dBackwardFilterDense(input.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, + params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, + params.P, params.Q, params.numThreads); + if(nnz != -1) { + if(DMLScript.STATISTICS) { + Statistics.nativeConv2dBwdFilterTime += System.nanoTime() - start; + Statistics.numNativeConv2dBwdFilterCalls.increment(); + } // post-processing: maintain nnz - outputBlock.recomputeNonZeros(); + outputBlock.setNonZeros(nnz); return; } else { @@ -181,12 +201,17 @@ public class LibMatrixNative { params.numThreads = params.numThreads <= 0 ? NativeHelper.getMaxNumThreads() : params.numThreads; if(NativeHelper.isNativeLibraryLoaded() && !dout.isInSparseFormat() && !filter.isInSparseFormat()) { setNumThreads(params); - if(NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, - params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, - params.P, params.Q, params.numThreads)) { - Statistics.numNativeLibMatrixDNNCalls.increment(); + long start = DMLScript.STATISTICS ? System.nanoTime() : 0; + int nnz = NativeHelper.conv2dBackwardDataDense(filter.denseBlock, dout.denseBlock, outputBlock.denseBlock, params.N, params.C, params.H, params.W, + params.K, params.R, params.S, params.stride_h, params.stride_w, params.pad_h, params.pad_w, + params.P, params.Q, params.numThreads); + if(nnz != -1) { + if(DMLScript.STATISTICS) { + Statistics.nativeConv2dBwdDataTime += System.nanoTime() - start; + Statistics.numNativeConv2dBwdDataCalls.increment(); + } // post-processing: maintain nnz - outputBlock.recomputeNonZeros(); + outputBlock.setNonZeros(nnz); return; } else { http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java index ff6a007..233350a 100644 --- a/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java +++ b/src/main/java/org/apache/sysml/runtime/matrix/data/MatrixBlock.java @@ -34,6 +34,7 @@ import java.util.stream.LongStream; import org.apache.commons.math3.random.Well1024a; import org.apache.hadoop.io.DataInputBuffer; +import org.apache.sysml.api.DMLScript; import org.apache.sysml.conf.ConfigurationManager; import org.apache.sysml.hops.Hop.OpOp2; import org.apache.sysml.hops.OptimizerUtils; @@ -86,6 +87,7 @@ import org.apache.sysml.runtime.util.FastBufferedDataOutputStream; import org.apache.sysml.runtime.util.IndexRange; import org.apache.sysml.runtime.util.UtilFunctions; import org.apache.sysml.utils.NativeHelper; +import org.apache.sysml.utils.Statistics; @@ -104,6 +106,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab //basic header (int rlen, int clen, byte type) public static final int HEADER_SIZE = 9; + private static final boolean DISPLAY_STATISTICS = false; // Developer flag to measure performance overhead of various functions in this class + public enum BlockType{ EMPTY_BLOCK, ULTRA_SPARSE_BLOCK, //ultra sparse representation, in-mem same as sparse @@ -336,6 +340,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab allocateDenseBlock(); } + 
@SuppressWarnings("unused") public void allocateDenseBlock(boolean clearNNZ) throws RuntimeException { @@ -350,7 +355,9 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab //allocate block if non-existing or too small (guaranteed to be 0-initialized), if(denseBlock == null || denseBlock.length < limit) { + long start = DISPLAY_STATISTICS && DMLScript.STATISTICS ? System.nanoTime() : 0; denseBlock = new double[(int)limit]; + Statistics.allocateDoubleArrTime += DISPLAY_STATISTICS && DMLScript.STATISTICS ? (System.nanoTime() - start) : 0; } //clear nnz if necessary @@ -986,9 +993,11 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab * * @throws DMLRuntimeException if DMLRuntimeException occurs */ + @SuppressWarnings("unused") public void examSparsity() throws DMLRuntimeException { + long start = DISPLAY_STATISTICS && DMLScript.STATISTICS ? System.nanoTime() : 0; //determine target representation boolean sparseDst = evalSparseFormatInMemory(); @@ -1002,6 +1011,8 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab sparseToDense(); else if( !sparse && sparseDst ) denseToSparse(); + + Statistics.examSparsityTime += DISPLAY_STATISTICS && DMLScript.STATISTICS ? (System.nanoTime() - start) : 0; } /** @@ -1141,6 +1152,7 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab * of the entire matrix block. * */ + @SuppressWarnings("unused") public void recomputeNonZeros() { if( sparse && sparseBlock!=null ) //SPARSE (max long) @@ -1150,12 +1162,14 @@ public class MatrixBlock extends MatrixValue implements CacheBlock, Externalizab } else if( !sparse && denseBlock!=null ) //DENSE (max int) { + long start = DISPLAY_STATISTICS && DMLScript.STATISTICS ? System.nanoTime() : 0; double[] a = denseBlock; final int limit=rlen*clen; int nnz = 0; for(int i=0; i<limit; i++) nnz += (a[i]!=0) ? 1 : 0; nonZeros = nnz; + Statistics.recomputeNNZTime += DISPLAY_STATISTICS && DMLScript.STATISTICS ? (System.nanoTime() - start) : 0; } } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java b/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java index 9c027d3..a88f230 100644 --- a/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java +++ b/src/main/java/org/apache/sysml/udf/lib/SGDNesterovUpdate.java @@ -39,7 +39,7 @@ import org.apache.sysml.udf.Matrix.ValueType; * Assumption: the input batch fits in CP (which is also the assumption of most deep learning systems). 
* * Usage: - * update_nesterov = externalFunction(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v) return (matrix[double] X, matrix[double] v) implemented in (classname="org.apache.sysml.udf.lib.SGDNesterovUpdate",exectype="mem"); + * update_nesterov = externalFunction(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v, double lambda) return (matrix[double] X, matrix[double] v) implemented in (classname="org.apache.sysml.udf.lib.SGDNesterovUpdate",exectype="mem"); * [X, v] = update_nesterov(X, dX, lr, mu, v); * * @@ -81,16 +81,19 @@ public class SGDNesterovUpdate extends PackageFunction { double mu = Double.parseDouble(((Scalar)getFunctionInput(3)).getValue()); MatrixBlock v = ((Matrix) getFunctionInput(4)).getMatrixObject().acquireRead(); - // v = mu * v - lr * dX + double lambda = Double.parseDouble(((Scalar)getFunctionInput(5)).getValue()); + + // v = mu * v - lr * dX - lr*lambda*X updatedV = new Matrix( "tmp_" + rand.nextLong(), v.getNumRows(), v.getNumColumns(), ValueType.Double ); MatrixBlock updatedVMB = allocateDenseMatrixBlock(updatedV); double [] updatedVData = updatedVMB.getDenseBlock(); - if(isDense(v) && isDense(dX)) { + if(isDense(v) && isDense(dX) && isDense(X)) { double [] vArr = v.getDenseBlock(); double [] dXArr = dX.getDenseBlock(); + double [] XArr = X.getDenseBlock(); int nnz = 0; for(int i = 0; i < updatedVData.length; i++) { - updatedVData[i] = mu*vArr[i] - lr*dXArr[i]; + updatedVData[i] = mu*vArr[i] - lr*dXArr[i] - lr*lambda*XArr[i]; nnz += (updatedVData[i]!=0) ? 1 : 0; } updatedVMB.setNonZeros(nnz); @@ -98,8 +101,10 @@ public class SGDNesterovUpdate extends PackageFunction { else { multiplyByConstant(v, mu, updatedVData); multiplyByConstant(dX, -lr, updatedVData); + multiplyByConstant(X, -lr*lambda, updatedVData); updatedVMB.recomputeNonZeros(); } + updatedV.setMatrixDoubleArray(updatedVMB, OutputInfo.BinaryBlockOutputInfo, InputInfo.BinaryBlockInputInfo); // X = X - mu * v_prev + (1 + mu) * v http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/utils/NativeHelper.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/NativeHelper.java b/src/main/java/org/apache/sysml/utils/NativeHelper.java index 129824b..fe5e085 100644 --- a/src/main/java/org/apache/sysml/utils/NativeHelper.java +++ b/src/main/java/org/apache/sysml/utils/NativeHelper.java @@ -254,17 +254,18 @@ public class NativeHelper { // K = number of filters, R = filter height, S = filter width // TODO: case not handled: sparse filters (which will only be executed in Java). Since filters are relatively smaller, this is a low priority. 
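// The conv2d entry points below previously returned a boolean success flag;
// they now return an int so that the nonzero count computed on the native side
// (see computeNNZ in libmatrixdnn.cpp) comes back with the same JNI call,
// letting the Java caller run MatrixBlock.setNonZeros(nnz) instead of a full
// recomputeNonZeros() scan over the dense output.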
+ // Returns -1 if failures or returns number of nonzeros // Called by ConvolutionCPInstruction if both input and filter are dense - public static native boolean conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, + public static native int conv2dDense(double [] input, double [] filter, double [] ret, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); - public static native boolean conv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N, int C, int H, int W, + public static native int conv2dBiasAddDense(double [] input, double [] bias, double [] filter, double [] ret, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); // Called by ConvolutionCPInstruction if both input and filter are dense - public static native boolean conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C, int H, int W, + public static native int conv2dBackwardFilterDense(double [] input, double [] dout, double [] ret, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); // If both filter and dout are dense, then called by ConvolutionCPInstruction // Else, called by LibMatrixDNN's thread if filter is dense. dout[n] is converted to dense if sparse. - public static native boolean conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C, int H, int W, + public static native int conv2dBackwardDataDense(double [] filter, double [] dout, double [] ret, int N, int C, int H, int W, int K, int R, int S, int stride_h, int stride_w, int pad_h, int pad_w, int P, int Q, int numThreads); // Currently only supported with numThreads = 1 and sparse input http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/java/org/apache/sysml/utils/Statistics.java ---------------------------------------------------------------------- diff --git a/src/main/java/org/apache/sysml/utils/Statistics.java b/src/main/java/org/apache/sysml/utils/Statistics.java index 97888cb..49633d8 100644 --- a/src/main/java/org/apache/sysml/utils/Statistics.java +++ b/src/main/java/org/apache/sysml/utils/Statistics.java @@ -114,7 +114,21 @@ public class Statistics private static LongAdder numNativeFailures = new LongAdder(); public static LongAdder numNativeLibMatrixMultCalls = new LongAdder(); - public static LongAdder numNativeLibMatrixDNNCalls = new LongAdder(); + public static LongAdder numNativeConv2dCalls = new LongAdder(); + public static LongAdder numNativeConv2dBwdDataCalls = new LongAdder(); + public static LongAdder numNativeConv2dBwdFilterCalls = new LongAdder(); + public static LongAdder numNativeSparseConv2dCalls = new LongAdder(); + public static LongAdder numNativeSparseConv2dBwdFilterCalls = new LongAdder(); + public static LongAdder numNativeSparseConv2dBwdDataCalls = new LongAdder(); + public static long nativeLibMatrixMultTime = 0; + public static long nativeConv2dTime = 0; + public static long nativeConv2dBwdDataTime = 0; + public static long nativeConv2dBwdFilterTime = 0; + + public static long recomputeNNZTime = 0; + public static long examSparsityTime = 0; + public static long allocateDoubleArrTime = 0; + public static void incrementNativeFailuresCounter() { numNativeFailures.increment(); // This is very rare and am not sure it is possible at all. 
Our initial experiments never encountered this case. @@ -378,8 +392,17 @@ public class Statistics GPUStatistics.reset(); numNativeLibMatrixMultCalls.reset(); - numNativeLibMatrixDNNCalls.reset(); + numNativeSparseConv2dCalls.reset(); + numNativeSparseConv2dBwdDataCalls.reset(); + numNativeSparseConv2dBwdFilterCalls.reset(); + numNativeConv2dCalls.reset(); + numNativeConv2dBwdDataCalls.reset(); + numNativeConv2dBwdFilterCalls.reset(); numNativeFailures.reset(); + nativeLibMatrixMultTime = 0; + nativeConv2dTime = 0; + nativeConv2dBwdFilterTime = 0; + nativeConv2dBwdDataTime = 0; LibMatrixDNN.resetStatistics(); } @@ -635,11 +658,23 @@ public class Statistics //show extended caching/compilation statistics if( DMLScript.STATISTICS ) { - if(NativeHelper.blasType != null && (numNativeLibMatrixMultCalls.longValue() > 0 || - numNativeLibMatrixDNNCalls.longValue() > 0)) { + if(NativeHelper.blasType != null) { String blas = NativeHelper.blasType != null ? NativeHelper.blasType : ""; - sb.append("Native " + blas + " calls (LibMatrixMult/LibMatrixDNN):\t" + numNativeLibMatrixMultCalls.longValue() + "/" + numNativeLibMatrixDNNCalls.longValue() + ".\n"); + sb.append("Native " + blas + " calls (dense mult/conv/bwdF/bwdD):\t" + numNativeLibMatrixMultCalls.longValue() + "/" + + numNativeConv2dCalls.longValue() + "/" + numNativeConv2dBwdFilterCalls.longValue() + + "/" + numNativeConv2dBwdDataCalls.longValue() + ".\n"); + sb.append("Native " + blas + " calls (sparse conv/bwdF/bwdD):\t" + + numNativeSparseConv2dCalls.longValue() + "/" + numNativeSparseConv2dBwdFilterCalls.longValue() + + "/" + numNativeSparseConv2dBwdDataCalls.longValue() + ".\n"); + sb.append("Native " + blas + " times (dense mult/conv/bwdF/bwdD):\t" + String.format("%.3f", nativeLibMatrixMultTime*1e-9) + "/" + + String.format("%.3f", nativeConv2dTime*1e-9) + "/" + String.format("%.3f", nativeConv2dBwdFilterTime*1e-9) + "/" + + String.format("%.3f", nativeConv2dBwdDataTime*1e-9) + ".\n"); + } + if(recomputeNNZTime != 0 || examSparsityTime != 0 || allocateDoubleArrTime != 0) { + sb.append("MatrixBlock times (recomputeNNZ/examSparsity/allocateDoubleArr):\t" + String.format("%.3f", recomputeNNZTime*1e-9) + "/" + + String.format("%.3f", examSparsityTime*1e-9) + "/" + String.format("%.3f", allocateDoubleArrTime*1e-9) + ".\n"); } + sb.append("Cache hits (Mem, WB, FS, HDFS):\t" + CacheStatistics.displayHits() + ".\n"); sb.append("Cache writes (WB, FS, HDFS):\t" + CacheStatistics.displayWrites() + ".\n"); sb.append("Cache times (ACQr/m, RLS, EXP):\t" + CacheStatistics.displayTime() + " sec.\n"); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala index 377ebf3..f7f85c3 100644 --- a/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala +++ b/src/main/scala/org/apache/sysml/api/dl/Caffe2DML.scala @@ -291,7 +291,7 @@ class Caffe2DML(val sc: SparkContext, val solverParam:Caffe.SolverParameter, appendVisualizationHeaders(dmlScript, numTabs) if(Caffe2DML.USE_NESTEROV_UDF) { - tabDMLScript(dmlScript, numTabs).append("update_nesterov = externalFunction(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v) return (matrix[double] X, matrix[double] v) implemented in (classname=\"org.apache.sysml.udf.lib.SGDNesterovUpdate\",exectype=\"mem\"); \n") + tabDMLScript(dmlScript, 
numTabs).append("update_nesterov = externalFunction(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v, double lambda) return (matrix[double] X, matrix[double] v) implemented in (classname=\"org.apache.sysml.udf.lib.SGDNesterovUpdate\",exectype=\"mem\"); \n") } // Read and convert to one-hote encoding http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/5489c665/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala ---------------------------------------------------------------------- diff --git a/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala b/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala index ae3d21d..0620e44 100644 --- a/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala +++ b/src/main/scala/org/apache/sysml/api/dl/CaffeSolver.scala @@ -144,12 +144,15 @@ class AdaGrad(lambda:Double=5e-04, epsilon:Double=1e-6) extends CaffeSolver { */ class Nesterov(lambda:Double=5e-04, momentum:Double=0.9) extends CaffeSolver { def update(dmlScript:StringBuilder, layer:CaffeLayer):Unit = { - l2reg_update(lambda, dmlScript, layer) val fn = if(Caffe2DML.USE_NESTEROV_UDF) "update_nesterov" else "sgd_nesterov::update" + val lastParameter = if(Caffe2DML.USE_NESTEROV_UDF) (", " + lambda) else "" + if(!Caffe2DML.USE_NESTEROV_UDF) { + l2reg_update(lambda, dmlScript, layer) + } if(layer.shouldUpdateWeight) dmlScript.append("\t").append("["+ commaSep(layer.weight, layer.weight+"_v") + "] " + - "= " + fn + "(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer), momentum.toString, layer.weight+"_v") + ")\n") + "= " + fn + "(" + commaSep(layer.weight, layer.dWeight, getWeightLr(layer), momentum.toString, layer.weight+"_v") + lastParameter + ")\n") if(layer.shouldUpdateBias) dmlScript.append("\t").append("["+ commaSep(layer.bias, layer.bias+"_v") + "] " + - "= " + fn + "(" + commaSep(layer.bias, layer.dBias, getBiasLr(layer), momentum.toString, layer.bias+"_v") + ")\n") + "= " + fn + "(" + commaSep(layer.bias, layer.dBias, getBiasLr(layer), momentum.toString, layer.bias+"_v") + lastParameter + ")\n") } def init(dmlScript:StringBuilder, layer:CaffeLayer):Unit = { if(layer.shouldUpdateWeight) dmlScript.append(layer.weight+"_v = sgd_nesterov::init(" + layer.weight + ")\n")
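For reference, the caller-side pattern these changes introduce in LibMatrixNative can be restated as a self-contained Java sketch. The class and member names below (NativeCallStatsSketch, STATISTICS, conv2dDenseStub) are illustrative stand-ins for DMLScript.STATISTICS, the new Statistics counters, and the JNI entry points; only the control flow mirrors the patch: time the call under the statistics flag, treat -1 as a native failure, and hand the returned nonzero count straight to setNonZeros instead of rescanning the output.

import java.util.concurrent.atomic.LongAdder;

public class NativeCallStatsSketch {
    // Stand-ins for DMLScript.STATISTICS and the new Statistics fields:
    static final boolean STATISTICS = true;
    static final LongAdder numNativeConv2dCalls = new LongAdder();
    static long nativeConv2dTime = 0;

    // Plain-Java stand-in for the native kernel: fills 'ret' elsewhere and
    // reports its nonzero count, mirroring computeNNZ() in libmatrixdnn.cpp
    // (-1 would signal failure).
    static int conv2dDenseStub(double[] ret) {
        int nnz = 0;
        for (double v : ret)
            nnz += (v != 0) ? 1 : 0;
        return nnz;
    }

    public static void main(String[] args) {
        double[] ret = {0.0, 1.5, 0.0, -2.0, 3.0};
        long start = STATISTICS ? System.nanoTime() : 0;
        int nnz = conv2dDenseStub(ret);
        if (nnz != -1) { // success: nnz came back with the call itself
            if (STATISTICS) {
                nativeConv2dTime += System.nanoTime() - start;
                numNativeConv2dCalls.increment();
            }
            // the real code calls outputBlock.setNonZeros(nnz); no recompute pass
            System.out.println("nnz = " + nnz
                + ", calls = " + numNativeConv2dCalls.longValue());
        }
        // else: fall back to the Java implementation, as LibMatrixNative does
    }
}

Computing nnz on the native side is cheap because libmatrixdnn.cpp counts it with an OpenMP parallel-for reduction over the output buffer, so the Java side avoids a second sequential pass. Relatedly, conv2dBackwardFilterDense now accumulates each thread's partial products directly through cblas_dgemm with beta = 1 and reduces the per-thread buffers with a transpose-add, replacing the previous matmult helper at that step.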

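Finally, the SGDNesterovUpdate/CaffeSolver changes fold L2 regularization into the UDF itself: when Caffe2DML.USE_NESTEROV_UDF is set, the generated DML no longer emits a separate l2reg_update call and instead passes lambda as an extra argument, so the decay term -lr*lambda*X is fused into the velocity update. A minimal self-contained sketch of the resulting arithmetic follows; the input values are made up for illustration, but the two update formulas are taken directly from the patch.

public class NesterovUpdateSketch {
    public static void main(String[] args) {
        double lr = 0.01, mu = 0.9, lambda = 5e-4;
        double[] X  = {1.0, -2.0, 0.5};   // parameters
        double[] dX = {0.2,  0.1, -0.3};  // gradients
        double[] v  = {0.0,  0.0,  0.0};  // velocity

        for (int i = 0; i < X.length; i++) {
            double vPrev = v[i];
            // v = mu*v - lr*dX - lr*lambda*X  (weight decay fused into velocity)
            v[i] = mu * v[i] - lr * dX[i] - lr * lambda * X[i];
            // X = X - mu*v_prev + (1 + mu)*v  (Nesterov momentum update)
            X[i] = X[i] - mu * vPrev + (1 + mu) * v[i];
        }
        System.out.printf("X = [%f, %f, %f]%n", X[0], X[1], X[2]);
    }
}

Fusing the decay term means the UDF path no longer needs the separate l2reg_update statement that the non-UDF DML path still uses.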