[SYSTEMML-1453] Update Conv & Max Pooling layer names to include "2D"

This renames `conv*.dml` and `max_pool*.dml` to `conv2d*.dml` and
`max_pool2d*.dml` to allow for 1D and 3D variants in the future.
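
For callers of these layers, only the `source` statements and namespace
prefixes change; the function signatures themselves are untouched. A minimal
before/after sketch in DML (variable names are illustrative):

    # Before:
    #   source("nn/layers/conv_builtin.dml") as conv
    #   source("nn/layers/max_pool_builtin.dml") as max_pool
    # After:
    source("nn/layers/conv2d_builtin.dml") as conv2d
    source("nn/layers/max_pool2d_builtin.dml") as max_pool2d

    [W, b] = conv2d::init(F, C, Hf, Wf)
    [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf,
                                        stride, stride, pad, pad)
    [outp, Houtp, Woutp] = max_pool2d::forward(out, F, Hout, Wout, Hf=2, Wf=2,
                                               strideh=2, stridew=2)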

Closes #447.


Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/07039caa
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/07039caa
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/07039caa

Branch: refs/heads/master
Commit: 07039caa9629dd3a26aa66c9ec860cf7f7917724
Parents: 5c59e03
Author: Mike Dusenberry <[email protected]>
Authored: Fri Mar 31 18:39:11 2017 -0700
Committer: Mike Dusenberry <[email protected]>
Committed: Fri Mar 31 18:39:11 2017 -0700

----------------------------------------------------------------------
 projects/breast_cancer/convnet.dml              | 101 +++++----
 .../SystemML-NN/examples/mnist_lenet.dml        |  56 ++---
 scripts/staging/SystemML-NN/nn/layers/conv.dml  | 194 -----------------
 .../staging/SystemML-NN/nn/layers/conv2d.dml    | 194 +++++++++++++++++
 .../SystemML-NN/nn/layers/conv2d_builtin.dml    | 160 ++++++++++++++
 .../SystemML-NN/nn/layers/conv_builtin.dml      | 155 -------------
 .../staging/SystemML-NN/nn/layers/max_pool.dml  | 159 --------------
 .../SystemML-NN/nn/layers/max_pool2d.dml        | 159 ++++++++++++++
 .../nn/layers/max_pool2d_builtin.dml            | 103 +++++++++
 .../SystemML-NN/nn/layers/max_pool_builtin.dml  | 103 ---------
 .../SystemML-NN/nn/test/conv2d_simple.dml       | 215 +++++++++++++++++++
 .../staging/SystemML-NN/nn/test/conv_simple.dml | 215 -------------------
 .../staging/SystemML-NN/nn/test/grad_check.dml  | 170 +++++++--------
 .../SystemML-NN/nn/test/max_pool2d_simple.dml   | 172 +++++++++++++++
 .../SystemML-NN/nn/test/max_pool_simple.dml     | 172 ---------------
 .../staging/SystemML-NN/nn/test/run_tests.dml   |  16 +-
 scripts/staging/SystemML-NN/nn/test/test.dml    | 115 +++++-----
 17 files changed, 1248 insertions(+), 1211 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/projects/breast_cancer/convnet.dml
----------------------------------------------------------------------
diff --git a/projects/breast_cancer/convnet.dml b/projects/breast_cancer/convnet.dml
index 5f115a2..85c7dd8 100644
--- a/projects/breast_cancer/convnet.dml
+++ b/projects/breast_cancer/convnet.dml
@@ -7,9 +7,9 @@
 # to you under the Apache License, Version 2.0 (the
 # "License"); you may not use this file except in compliance
 # with the License.  You may obtain a copy of the License at
-# 
+#
 #   http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing,
 # software distributed under the License is distributed on an
 # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -24,11 +24,11 @@
  */
 # Imports
 source("nn/layers/affine.dml") as affine
-source("nn/layers/conv_builtin.dml") as conv
+source("nn/layers/conv2d_builtin.dml") as conv2d
 source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
 source("nn/layers/dropout.dml") as dropout
 source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/max_pool_builtin.dml") as max_pool
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
 source("nn/layers/relu.dml") as relu
 source("nn/layers/softmax.dml") as softmax
 #source("nn/optim/adam.dml") as adam
@@ -96,9 +96,9 @@ train = function(matrix[double] X, matrix[double] Y,
   F3 = 32  # num conv filters in conv3
   N1 = 512  # num nodes in affine1
   # Note: affine2 has K nodes, which is equal to the number of target dimensions (num classes)
-  [Wc1, bc1] = conv::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
-  [Wc2, bc2] = conv::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
-  [Wc3, bc3] = conv::init(F3, F2, Hf, Wf)  # inputs: (N, F2*(Hin/2^2)*(Win/2^2))
+  [Wc1, bc1] = conv2d::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
+  [Wc2, bc2] = conv2d::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
+  [Wc3, bc3] = conv2d::init(F3, F2, Hf, Wf)  # inputs: (N, F2*(Hin/2^2)*(Win/2^2))
   [Wa1, ba1] = affine::init(F3*(Hin/2^3)*(Win/2^3), N1)  # inputs: (N, F3*(Hin/2^3)*(Win/2^3))
   [Wa2, ba2] = affine::init(N1, K)  # inputs: (N, N1)
   Wa2 = Wa2 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu
@@ -145,17 +145,23 @@ train = function(matrix[double] X, matrix[double] Y,
 
       # Compute forward pass
       ## conv layer 1: conv1 -> relu1 -> pool1
-      [outc1, Houtc1, Woutc1] = conv::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
+                                                stride, stride, pad, pad)
       outc1r = relu::forward(outc1)
-      [outc1p, Houtc1p, Woutc1p] = max_pool::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2)
+      [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                       strideh=2, stridew=2)
       ## conv layer 2: conv2 -> relu2 -> pool2
-      [outc2, Houtc2, Woutc2] = conv::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
+      [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+                                                stride, stride, pad, pad)
       outc2r = relu::forward(outc2)
-      [outc2p, Houtc2p, Woutc2p] = max_pool::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2)
+      [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                       strideh=2, stridew=2)
       ## conv layer 3: conv3 -> relu3 -> pool3
-      [outc3, Houtc3, Woutc3] = conv::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
+      [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+                                                stride, stride, pad, pad)
       outc3r = relu::forward(outc3)
-      [outc3p, Houtc3p, Woutc3p] = max_pool::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2)
+      [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+                                                       strideh=2, stridew=2)
       ## affine layer 1:  affine1 -> relu1 -> dropout1
       outa1 = affine::forward(outc3p, Wa1, ba1)
       outa1r = relu::forward(outa1)
@@ -176,17 +182,23 @@ train = function(matrix[double] X, matrix[double] Y,
       douta1 = relu::backward(douta1r, outa1)
       [doutc3p, dWa1, dba1] = affine::backward(douta1, outc3p, Wa1, ba1)
       ## conv layer 3: conv3 -> relu3 -> pool3
-      doutc3r = max_pool::backward(doutc3p, Houtc3p, Woutc3p, outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2)
+      doutc3r = max_pool2d::backward(doutc3p, Houtc3p, Woutc3p, outc3r, F3, Houtc3, Woutc3,
+                                     Hf=2, Wf=2, strideh=2, stridew=2)
       doutc3 = relu::backward(doutc3r, outc3)
-      [doutc2p, dWc3, dbc3] = conv::backward(doutc3, Houtc3, Woutc3, outc2p, Wc3, bc2, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
+      [doutc2p, dWc3, dbc3] = conv2d::backward(doutc3, Houtc3, Woutc3, outc2p, Wc3, bc2, F2,
+                                               Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
       ## conv layer 2: conv2 -> relu2 -> pool2
-      doutc2r = max_pool::backward(doutc2p, Houtc2p, Woutc2p, outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2)
+      doutc2r = max_pool2d::backward(doutc2p, Houtc2p, Woutc2p, outc2r, F2, Houtc2, Woutc2,
+                                     Hf=2, Wf=2, strideh=2, stridew=2)
       doutc2 = relu::backward(doutc2r, outc2)
-      [doutc1p, dWc2, dbc2] = conv::backward(doutc2, Houtc2, Woutc2, outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
+      [doutc1p, dWc2, dbc2] = conv2d::backward(doutc2, Houtc2, Woutc2, outc1p, Wc2, bc2, F1,
+                                               Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
       ## conv layer 1: conv1 -> relu1 -> pool1
-      doutc1r = max_pool::backward(doutc1p, Houtc1p, Woutc1p, outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2)
+      doutc1r = max_pool2d::backward(doutc1p, Houtc1p, Woutc1p, outc1r, F1, Houtc1, Woutc1,
+                                     Hf=2, Wf=2, strideh=2, stridew=2)
       doutc1 = relu::backward(doutc1r, outc1)
-      [dX_batch, dWc1, dbc1] = conv::backward(doutc1, Houtc1, Woutc1, X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      [dX_batch, dWc1, dbc1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, Wc1, bc1, C,
+                                                Hin, Win, Hf, Wf, stride, stride, pad, pad)
 
       # Compute regularization backward pass
       dWc1_reg = l2_reg::backward(Wc1, lambda)
@@ -222,7 +234,7 @@ train = function(matrix[double] X, matrix[double] Y,
       #[ba1, mba1, vba1] = adam::update(ba1, dba1, lr, beta1, beta2, eps, t, mba1, vba1)
       #[Wa2, mWa2, vWa2] = adam::update(Wa2, dWa2, lr, beta1, beta2, eps, t, mWa2, vWa2)
       #[ba2, mba2, vba2] = adam::update(ba2, dba2, lr, beta1, beta2, eps, t, mba2, vba2)
-        
+
       # Compute loss & accuracy for training & validation data every `log_interval` iterations.
       if (i %% log_interval == 0) {
         # Compute training loss & accuracy
@@ -348,7 +360,8 @@ predict = function(matrix[double] X, int C, int Hin, int Win,
   N = nrow(X)
 
   # Network:
-  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3 -> affine1 -> relu1 -> affine2 -> softmax
+  # conv1 -> relu1 -> pool1 -> conv2 -> relu2 -> pool2 -> conv3 -> relu3 -> pool3
+  #  -> affine1 -> relu1 -> affine2 -> softmax
   Hf = 3  # filter height
   Wf = 3  # filter width
   stride = 1
@@ -365,17 +378,23 @@ predict = function(matrix[double] X, int C, int Hin, int Win,
   # so that it can be efficiently used for parallel predictions.
   ## Compute forward pass
   ### conv layer 1: conv1 -> relu1 -> pool1
-  #[outc1, Houtc1, Woutc1] = conv::forward(X, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  #[outc1, Houtc1, Woutc1] = conv2d::forward(X, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride,
+  #                                          pad, pad)
   #outc1r = relu::forward(outc1)
-  #[outc1p, Houtc1p, Woutc1p] = max_pool::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2)
+  #[outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+  #                                                 strideh=2, stridew=2)
   ### conv layer 2: conv2 -> relu2 -> pool2
-  #[outc2, Houtc2, Woutc2] = conv::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
+  #[outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+  #                                          stride, stride, pad, pad)
   #outc2r = relu::forward(outc2)
-  #[outc2p, Houtc2p, Woutc2p] = max_pool::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2)
+  #[outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+  #                                                 strideh=2, stridew=2)
   ### conv layer 3: conv3 -> relu3 -> pool3
-  #[outc3, Houtc3, Woutc3] = conv::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
+  #[outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+  #                                          stride, stride, pad, pad)
   #outc3r = relu::forward(outc3)
-  #[outc3p, Houtc3p, Woutc3p] = max_pool::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2)
+  #[outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+  #                                                 strideh=2, stridew=2)
   ### affine layer 1:  affine1 -> relu1 -> dropout
   #outa1 = affine::forward(outc3p, Wa1, ba1)
   #outa1r = relu::forward(outa1)
@@ -398,17 +417,23 @@ predict = function(matrix[double] X, int C, int Hin, int Win,
 
     # Compute forward pass
     ## conv layer 1: conv1 -> relu1 -> pool1
-    [outc1, Houtc1, Woutc1] = conv::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+    [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, Wc1, bc1, C, Hin, Win, Hf, Wf,
+                                              stride, stride, pad, pad)
     outc1r = relu::forward(outc1)
-    [outc1p, Houtc1p, Woutc1p] = max_pool::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2, strideh=2, stridew=2)
+    [outc1p, Houtc1p, Woutc1p] = max_pool2d::forward(outc1r, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                     strideh=2, stridew=2)
     ## conv layer 2: conv2 -> relu2 -> pool2
-    [outc2, Houtc2, Woutc2] = conv::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf, stride, stride, pad, pad)
+    [outc2, Houtc2, Woutc2] = conv2d::forward(outc1p, Wc2, bc2, F1, Houtc1p, Woutc1p, Hf, Wf,
+                                              stride, stride, pad, pad)
     outc2r = relu::forward(outc2)
-    [outc2p, Houtc2p, Woutc2p] = max_pool::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2, strideh=2, stridew=2)
+    [outc2p, Houtc2p, Woutc2p] = max_pool2d::forward(outc2r, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                     strideh=2, stridew=2)
     ## conv layer 3: conv3 -> relu3 -> pool3
-    [outc3, Houtc3, Woutc3] = conv::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf, stride, stride, pad, pad)
+    [outc3, Houtc3, Woutc3] = conv2d::forward(outc2p, Wc3, bc3, F2, Houtc2p, Woutc2p, Hf, Wf,
+                                              stride, stride, pad, pad)
     outc3r = relu::forward(outc3)
-    [outc3p, Houtc3p, Woutc3p] = max_pool::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2, strideh=2, stridew=2)
+    [outc3p, Houtc3p, Woutc3p] = max_pool2d::forward(outc3r, F3, Houtc3, Woutc3, Hf=2, Wf=2,
+                                                     strideh=2, stridew=2)
     ## affine layer 1:  affine1 -> relu1 -> dropout
     outa1 = affine::forward(outc3p, Wa1, ba1)
     outa1r = relu::forward(outa1)
@@ -433,7 +458,7 @@ eval = function(matrix[double] probs, matrix[double] Y)
    *
    * Inputs:
    *  - probs: Class probabilities, of shape (N, K).
-   *  - Y: Target matrix, of shape (N, 
+   *  - Y: Target matrix, of shape (N,
    *
    * Outputs:
    *  - loss: Scalar loss, of shape (1).
@@ -448,7 +473,7 @@ eval = function(matrix[double] probs, matrix[double] Y)
 generate_dummy_data = function()
     return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) {
   /*
-   * Generate a dummy dataset similar to the MNIST dataset.
+   * Generate a dummy dataset similar to the breast cancer dataset.
    *
    * Outputs:
    *  - X: Input data matrix, of shape (N, D).
@@ -459,9 +484,9 @@ generate_dummy_data = function()
    */
   # Generate dummy input data
   N = 1024  # num examples
-  C = 1  # num input channels
-  Hin = 64  # input height
-  Win = 64  # input width
+  C = 3  # num input channels
+  Hin = 256  # input height
+  Win = 256  # input width
   K = 3  # num target classes
   X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
   classes = round(rand(rows=N, cols=1, min=1, max=K, pdf="uniform"))
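
As a quick arithmetic check on the new dummy-data dimensions above (a sketch;
the function body continues past this hunk):

    [X, Y, C, Hin, Win] = generate_dummy_data()
    # X: (N, C*Hin*Win) = (1024, 3*256*256) = (1024, 196608)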

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml b/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
index f991487..e2895b8 100644
--- a/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
+++ b/scripts/staging/SystemML-NN/examples/mnist_lenet.dml
@@ -24,11 +24,11 @@
  */
 # Imports
 source("nn/layers/affine.dml") as affine
-source("nn/layers/conv_builtin.dml") as conv
+source("nn/layers/conv2d_builtin.dml") as conv2d
 source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
 source("nn/layers/dropout.dml") as dropout
 source("nn/layers/l2_reg.dml") as l2_reg
-source("nn/layers/max_pool_builtin.dml") as max_pool
+source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
 source("nn/layers/relu.dml") as relu
 source("nn/layers/softmax.dml") as softmax
 source("nn/optim/sgd_nesterov.dml") as sgd_nesterov
@@ -81,8 +81,8 @@ train = function(matrix[double] X, matrix[double] y,
   N3 = 512  # num nodes in affine3
   # Note: affine4 has K nodes, which is equal to the number of target dimensions (num classes)
 
-  [W1, b1] = conv::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
-  [W2, b2] = conv::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
+  [W1, b1] = conv2d::init(F1, C, Hf, Wf)  # inputs: (N, C*Hin*Win)
+  [W2, b2] = conv2d::init(F2, F1, Hf, Wf)  # inputs: (N, F1*(Hin/2)*(Win/2))
   [W3, b3] = affine::init(F2*(Hin/2/2)*(Win/2/2), N3)  # inputs: (N, F2*(Hin/2/2)*(Win/2/2))
   [W4, b4] = affine::init(N3, K)  # inputs: (N, N3)
   W4 = W4 / sqrt(2)  # different initialization, since being fed into softmax, instead of relu
@@ -114,17 +114,17 @@ train = function(matrix[double] X, matrix[double] y,
 
       # Compute forward pass
       ## layer 1: conv1 -> relu1 -> pool1
-      [outc1, Houtc1, Woutc1] = conv::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
-                                              pad, pad)
+      [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+                                                pad, pad)
       outr1 = relu::forward(outc1)
-      [outp1, Houtp1, Woutp1] = max_pool::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                                  strideh=2, stridew=2, pad=0, pad=0)
+      [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                    strideh=2, stridew=2, pad=0, pad=0)
       ## layer 2: conv2 -> relu2 -> pool2
-      [outc2, Houtc2, Woutc2] = conv::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
-                                              stride, stride, pad, pad)
+      [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+                                                stride, stride, pad, pad)
       outr2 = relu::forward(outc2)
-      [outp2, Houtp2, Woutp2] = max_pool::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                                  strideh=2, stridew=2, pad=0, pad=0)
+      [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                    strideh=2, stridew=2, pad=0, pad=0)
       ## layer 3:  affine3 -> relu3 -> dropout
       outa3 = affine::forward(outp2, W3, b3)
       outr3 = relu::forward(outa3)
@@ -165,17 +165,17 @@ train = function(matrix[double] X, matrix[double] y,
       douta3 = relu::backward(doutr3, outa3)
       [doutp2, dW3, db3] = affine::backward(douta3, outp2, W3, b3)
       ## layer 2: conv2 -> relu2 -> pool2
-      doutr2 = max_pool::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                  strideh=2, stridew=2, pad=0, pad=0)
+      doutr2 = max_pool2d::backward(doutp2, Houtp2, Woutp2, outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                    strideh=2, stridew=2, pad=0, pad=0)
       doutc2 = relu::backward(doutr2, outc2)
-      [doutp1, dW2, db2] = conv::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
-                                          Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
+      [doutp1, dW2, db2] = conv2d::backward(doutc2, Houtc2, Woutc2, outp1, W2, b2, F1,
+                                            Houtp1, Woutp1, Hf, Wf, stride, stride, pad, pad)
       ## layer 1: conv1 -> relu1 -> pool1
-      doutr1 = max_pool::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                  strideh=2, stridew=2, pad=0, pad=0)
+      doutr1 = max_pool2d::backward(doutp1, Houtp1, Woutp1, outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                    strideh=2, stridew=2, pad=0, pad=0)
       doutc1 = relu::backward(doutr1, outc1)
-      [dX_batch, dW1, db1] = conv::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
-                                            Hf, Wf, stride, stride, pad, pad)
+      [dX_batch, dW1, db1] = conv2d::backward(doutc1, Houtc1, Woutc1, X_batch, W1, b1, C, Hin, Win,
+                                              Hf, Wf, stride, stride, pad, pad)
 
       # Compute regularization backward pass
       dW1_reg = l2_reg::backward(W1, lambda)
@@ -260,17 +260,17 @@ predict = function(matrix[double] X, int C, int Hin, int Win,
 
     # Compute forward pass
     ## layer 1: conv1 -> relu1 -> pool1
-    [outc1, Houtc1, Woutc1] = conv::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
-                                            pad, pad)
+    [outc1, Houtc1, Woutc1] = conv2d::forward(X_batch, W1, b1, C, Hin, Win, Hf, Wf, stride, stride,
+                                              pad, pad)
     outr1 = relu::forward(outc1)
-    [outp1, Houtp1, Woutp1] = max_pool::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
-                                                strideh=2, stridew=2, pad=0, pad=0)
+    [outp1, Houtp1, Woutp1] = max_pool2d::forward(outr1, F1, Houtc1, Woutc1, Hf=2, Wf=2,
+                                                  strideh=2, stridew=2, pad=0, pad=0)
     ## layer 2: conv2 -> relu2 -> pool2
-    [outc2, Houtc2, Woutc2] = conv::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
-                                            stride, stride, pad, pad)
+    [outc2, Houtc2, Woutc2] = conv2d::forward(outp1, W2, b2, F1, Houtp1, Woutp1, Hf, Wf,
+                                              stride, stride, pad, pad)
     outr2 = relu::forward(outc2)
-    [outp2, Houtp2, Woutp2] = max_pool::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
-                                                strideh=2, stridew=2, pad=0, pad=0)
+    [outp2, Houtp2, Woutp2] = max_pool2d::forward(outr2, F2, Houtc2, Woutc2, Hf=2, Wf=2,
+                                                  strideh=2, stridew=2, pad=0, pad=0)
     ## layer 3:  affine3 -> relu3
     outa3 = affine::forward(outp2, W3, b3)
     outr3 = relu::forward(outa3)

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/conv.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv.dml b/scripts/staging/SystemML-NN/nn/layers/conv.dml
deleted file mode 100644
index 435b3cf..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv.dml
+++ /dev/null
@@ -1,194 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * This implementation uses `im2col` internally for each image to
-   * extract local image regions (patches) into columns, and then
-   * performs a matrix multiplication with the filters to compute the
-   * output maps.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
-  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=F*Hout*Wout)
-
-  # Convolution - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-
-    # Pad image
-    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
-
-    # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
-    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
-    # Convolve patches with filters
-    outn = W %*% Xn_padded_cols + b  # shape (F, Hout*Wout)
-    out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout)  # reshape
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * This implementation uses `im2col` and `col2im` internally.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *  - padw: Padding for left and right sides.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-
-  # Create gradient volumes
-  # Note: Create convenience gradient volumes for dW and db that will
-  # allow for one gradient to be stored per example, allowing for
-  # parallel computation at the expense of memory.  We will reduce at
-  # the end.
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-  dWN = matrix(0, rows=N, cols=F*C*Hf*Wf)  # dW = matrix(0, rows=F, cols=C*Hf*Wf)
-  dbN = matrix(0, rows=N, cols=F)  # db = matrix(0, rows=F, cols=1)
-
-  # Partial derivatives for convolution - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
-
-    # Compute dW
-    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
-    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-    # dW = dW + doutn %*% t(Xn_padded_cols)
-    dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
-
-    # Compute db
-    # db = db + rowSums(doutn)
-    dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
-
-    # Compute dX
-    dXn_padded_cols = t(W) %*% doutn  # shape (C*Hf*Wf, Hout*Wout)
-    dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
-                              strideh, stridew, "add")
-    dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
-    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)  # reshape
-  }
-
-  # Reduce convenience gradient volumes with one gradient per example
-  # into single gradients for W and b.
-  dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
-  db = matrix(colSums(dbN), rows=F, cols=1)
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
new file mode 100644
index 0000000..435b3cf
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/conv2d.dml
@@ -0,0 +1,194 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+                   int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial convolutional layer with
+   * F filters.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * This implementation uses `im2col` internally for each image to
+   * extract local image regions (patches) into columns, and then
+   * performs a matrix multiplication with the filters to compute the
+   * output maps.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
+   *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=F*Hout*Wout)
+
+  # Convolution - im2col implementation
+  parfor (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
+
+    # Pad image
+    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
+
+    # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout)
+    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+
+    # Convolve patches with filters
+    outn = W %*% Xn_padded_cols + b  # shape (F, Hout*Wout)
+    out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout)  # reshape
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+                    matrix[double] X, matrix[double] W, matrix[double] b,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a 2D spatial convolutional layer
+   * with F filters.
+   *
+   * This implementation uses `im2col` and `col2im` internally.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
+   */
+  N = nrow(X)
+  F = nrow(W)
+
+  # Create gradient volumes
+  # Note: Create convenience gradient volumes for dW and db that will
+  # allow for one gradient to be stored per example, allowing for
+  # parallel computation at the expense of memory.  We will reduce at
+  # the end.
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+  dWN = matrix(0, rows=N, cols=F*C*Hf*Wf)  # dW = matrix(0, rows=F, cols=C*Hf*Wf)
+  dbN = matrix(0, rows=N, cols=F)  # db = matrix(0, rows=F, cols=1)
+
+  # Partial derivatives for convolution - im2col implementation
+  parfor (n in 1:N) {  # all examples
+    doutn = matrix(dout[n,], rows=F, cols=Hout*Wout)
+
+    # Compute dW
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
+    Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0)  # shape (C, (Hin+2*padh)*(Win+2*padw))
+    Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+    # dW = dW + doutn %*% t(Xn_padded_cols)
+    dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf)
+
+    # Compute db
+    # db = db + rowSums(doutn)
+    dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F)
+
+    # Compute dX
+    dXn_padded_cols = t(W) %*% doutn  # shape (C*Hf*Wf, Hout*Wout)
+    dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf,
+                              strideh, stridew, "add")
+    dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw)
+    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)  # reshape
+  }
+
+  # Reduce convenience gradient volumes with one gradient per example
+  # into single gradients for W and b.
+  dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf)
+  db = matrix(colSums(dbN), rows=F, cols=1)
+}
+
+init = function(int F, int C, int Hf, int Wf)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
+   *
+   * Inputs:
+   *  - F: Number of filters.
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   */
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1)
+}
+
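
The output-shape formulas documented in `conv2d::forward` above can be
sanity-checked with concrete numbers (illustrative values, not part of the
commit):

    Hin = 64
    Hf = 3
    strideh = 1
    padh = (Hf - 1) / 2  # = 1, the "same" padding rule for stride 1
    Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)  # (64 + 2 - 3)/1 + 1 = 64
    print("Hout = " + Hout)  # spatial height preserved, as documented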

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
new file mode 100644
index 0000000..29021cf
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/conv2d_builtin.dml
@@ -0,0 +1,160 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+                   int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial convolutional layer with
+   * F filters.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * This implementation uses a built-in operator for higher
+   * performance.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
+   *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Convolution - built-in implementation
+  out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
+               stride=[strideh,stridew], padding=[padh,padw])
+
+  # Add bias term to each output filter
+  out = bias_add(out, b)
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+                    matrix[double] X, matrix[double] W, matrix[double] b,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a 2D spatial convolutional layer
+   * with F filters.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      For same output height as input, set `padh = (Hf - 1) / 2`,
+   *      assuming `strideh = 1`.
+   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
+   *      preserves the spatial dimensions of the input.
+   *  - padw: Padding for left and right sides.
+   *      For same output width as input, set `padw = (Wf - 1) / 2`,
+   *      assuming `stridew = 1`.
+   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
+   *      preserves the spatial dimensions of the input.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
+   */
+  N = nrow(X)
+  F = nrow(W)
+
+  # Partial derivatives for convolution - built-in implementation
+  dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
+                              input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+  dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
+                            input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
+
+  # Partial derivatives for bias vector
+  db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
+}
+
+init = function(int F, int C, int Hf, int Wf)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
+   *
+   * Inputs:
+   *  - F: Number of filters.
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   */
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1)
+}
+
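
A minimal end-to-end sketch of the renamed module (toy sizes; all names below
are illustrative, not part of the commit):

    source("nn/layers/conv2d_builtin.dml") as conv2d

    N = 4    # examples
    C = 3    # input channels
    Hin = 8  # input height
    Win = 8  # input width
    F = 2    # filters
    Hf = 3   # filter height
    Wf = 3   # filter width
    X = rand(rows=N, cols=C*Hin*Win)
    [W, b] = conv2d::init(F, C, Hf, Wf)
    [out, Hout, Wout] = conv2d::forward(X, W, b, C, Hin, Win, Hf, Wf,
                                        1, 1, 1, 1)  # stride 1, pad 1
    dout = rand(rows=N, cols=F*Hout*Wout)  # stand-in for an upstream gradient
    [dX, dW, db] = conv2d::backward(dout, Hout, Wout, X, W, b, C, Hin, Win,
                                    Hf, Wf, 1, 1, 1, 1)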

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
deleted file mode 100644
index c2b809e..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml
+++ /dev/null
@@ -1,155 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * 2D Convolutional layer.
- */
-
-forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
-                   int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial convolutional layer with
-   * F filters.  The input data has N examples, each represented as a 3D
-   * volume unrolled into a single vector.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  F = nrow(W)
-  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
-  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
-
-  # Convolution - built-in implementation
-  out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf],
-               stride=[strideh,stridew], padding=[padh,padw])
-
-  # Add bias term to each output filter
-  out = bias_add(out, b)
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout,
-                    matrix[double] X, matrix[double] W, matrix[double] b,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
-  /*
-   * Computes the backward pass for a 2D spatial convolutional layer
-   * with F filters.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, F*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      For same output height as input, set `padh = (Hf - 1) / 2`,
-   *      assuming `strideh = 1`.
-   *      More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2`
-   *      preserves the spatial dimensions of the input.
-   *  - padw: Padding for left and right sides.
-   *      For same output width as input, set `padw = (Wf - 1) / 2`,
-   *      assuming `stridew = 1`.
-   *      More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2`
-   *      preserves the spatial dimensions of the input.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
-   *  - db: Gradient wrt `b`, of shape (F, 1).
-   */
-  N = nrow(X)
-  F = nrow(W)
-
-  # Partial derivatives for convolution - built-in implementation
-  dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw],
-                              input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
-  dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw],
-                            input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf])
-
-  # Partial derivatives for bias vector
-  db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout))
-}
-
-init = function(int F, int C, int Hf, int Wf)
-    return (matrix[double] W, matrix[double] b) {
-  /*
-   * Initialize the parameters of this layer.
-   *
-   * Note: This is just a convenience function, and parameters
-   * may be initialized manually if needed.
-   *
-   * We use the heuristic by He et al., which limits the magnification
-   * of inputs/gradients during forward/backward passes by scaling
-   * unit-Gaussian weights by a factor of sqrt(2/n), under the
-   * assumption of relu neurons.
-   *  - http://arxiv.org/abs/1502.01852
-   *
-   * Inputs:
-   *  - F: Number of filters.
-   *  - C: Number of input channels (dimensionality of depth).
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *
-   * Outputs:
-   *  - W: Weights, of shape (F, C*Hf*Wf).
-   *  - b: Biases, of shape (F, 1).
-   */
-  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
-  b = matrix(0, rows=F, cols=1)
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
deleted file mode 100644
index a12877f..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml
+++ /dev/null
@@ -1,159 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max pooling layer.
- */
-source("nn/util.dml") as util
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * This implementation uses `im2col` internally for each image to
-   * extract local image regions (patches) of each channel slice into
-   * columns, and then performs max pooling over the patches to compute
-   * the output maps.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
-  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
-  pad_value = -1/0  # in max pooling we pad with -infinity
-
-  # Create output volume
-  out = matrix(0, rows=N, cols=C*Hout*Wout)
-
-  # Max pooling - im2col implementation
-  parfor (n in 1:N) {  # all examples
-    img = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
-
-    if (padh > 0 | padw > 0) {
-      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
-      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
-    }
-
-    img_maxes = matrix(0, rows=C, cols=Hout*Wout)  # zeros
-    parfor (c in 1:C) {  # all channels
-      # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
-      img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
-
-      # Max pooling on patches
-      img_maxes[c,] = colMaxs(img_slice_cols)
-    }
-
-    out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
-  }
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Input data matrix, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   */
-  N = nrow(X)
-  pad_value = -1/0  # in max pooling we pad with -infinity
-
-  # Create gradient volume
-  dX = matrix(0, rows=N, cols=C*Hin*Win)
-
-  # Gradient of max pooling
-  parfor (n in 1:N, check=0) {  # all examples
-    img = matrix(X[n,], rows=C, cols=Hin*Win)
-    if (padh > 0 | padw > 0) {
-      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
-      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
-    }
-
-    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
-    parfor (c in 1:C, check=0) {  # all channels
-      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
-      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
-      for (hout in 1:Hout, check=0) {  # all output rows
-        hin = (hout-1)*strideh + 1
-        for (wout in 1:Wout) {  # all output columns
-          win = (wout-1)*stridew + 1
-          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
-          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
-          # gradient passes through only for the max value(s) in this patch
-          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
-          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
-                                                   + dimg_slice_patch
-        }
-      }
-      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
-    }
-
-    if (padh > 0 | padw > 0) {
-      # Unpad image gradient
-      dimg = util::unpad_image(dimg, Hin, Win, padh, padw)  # shape (C, (Hin+2*padh)*(Win+2*padw))
-    }
-    dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
-  }
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
new file mode 100644
index 0000000..229b7b9
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool2d.dml
@@ -0,0 +1,159 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Max Pooling layer.
+ */
+source("nn/util.dml") as util
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * This implementation uses `im2col` internally for each image to
+   * extract local image regions (patches) of each channel slice into
+   * columns, and then performs max pooling over the patches to compute
+   * the output maps.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+  pad_value = -1/0  # in max pooling we pad with -infinity
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=C*Hout*Wout)
+
+  # Max pooling - im2col implementation
+  parfor (n in 1:N) {  # all examples
+    img = matrix(X[n,], rows=C, cols=Hin*Win)  # reshape
+
+    if (padh > 0 | padw > 0) {
+      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
+      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
+    }
+
+    img_maxes = matrix(0, rows=C, cols=Hout*Wout)  # zeros
+    parfor (c in 1:C) {  # all channels
+      # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout)
+      img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew)
+
+      # Max pooling on patches
+      img_maxes[c,] = colMaxs(img_slice_cols)
+    }
+
+    out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout)
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+  pad_value = -1/0  # in max pooling we pad with -infinity
+
+  # Create gradient volume
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+
+  # Gradient of max pooling
+  parfor (n in 1:N, check=0) {  # all examples
+    img = matrix(X[n,], rows=C, cols=Hin*Win)
+    if (padh > 0 | padw > 0) {
+      # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw))
+      img = util::pad_image(img, Hin, Win, padh, padw, pad_value)
+    }
+
+    dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    parfor (c in 1:C, check=0) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+      for (hout in 1:Hout, check=0) {  # all output rows
+        hin = (hout-1)*strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          win = (wout-1)*stridew + 1
+          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+          max_val_ind = img_slice_patch == max(img_slice_patch)  # max value indicator matrix
+          # gradient passes through only for the max value(s) in this patch
+          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1]
+                                                   + dimg_slice_patch
+        }
+      }
+      dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+    }
+
+    if (padh > 0 | padw > 0) {
+      # Unpad image gradient
+      dimg = util::unpad_image(dimg, Hin, Win, padh, padw)  # shape (C, Hin*Win)
+    }
+    dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
+  }
+}
+
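For reference, a minimal usage sketch of the renamed layer (hypothetical,
not part of this commit; batch size and image dimensions are arbitrary).
A 2x2 pool with stride 2 halves each spatial dimension:

  source("nn/layers/max_pool2d.dml") as max_pool2d

  # Batch of 4 RGB images of size 8x8, each unrolled into a row of X
  N = 4
  C = 3
  Hin = 8
  Win = 8
  X = rand(rows=N, cols=C*Hin*Win)
  # 2x2 max pooling, stride 2, no padding
  [out, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
  # Hout = Wout = (8 - 2)/2 + 1 = 4, so out has shape (4, 3*4*4)
  dout = rand(rows=N, cols=C*Hout*Wout)
  dX = max_pool2d::backward(dout, Hout, Wout, X, C, Hin, Win, 2, 2, 2, 2, 0, 0)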

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
new file mode 100644
index 0000000..be4e195
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/layers/max_pool2d_builtin.dml
@@ -0,0 +1,103 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Max Pooling layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * This implementation uses a built-in operator for higher
+   * performance.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Max pooling - built-in implementation
+  out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+                 stride=[strideh,stridew], padding=[padh,padw])
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *      A typical value is 0.
+   *  - padw: Padding for left and right sides.
+   *      A typical value is 0.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+
+  # Gradient of max pooling
+  dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
+                         stride=[strideh,stridew], padding=[padh,padw])
+}
+
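Because max_pool2d_builtin.dml keeps the exact function signatures of
max_pool2d.dml, the two modules are drop-in replacements for each other.
A quick equivalence check in the spirit of nn/test/test.dml (a hypothetical
sketch with arbitrary dimensions, not part of this commit):

  source("nn/layers/max_pool2d.dml") as max_pool2d
  source("nn/layers/max_pool2d_builtin.dml") as max_pool2d_builtin

  # One-channel 4x4 inputs, 2x2 pooling with stride 2, no padding
  X = rand(rows=2, cols=1*4*4)
  [out1, Hout1, Wout1] = max_pool2d::forward(X, 1, 4, 4, 2, 2, 2, 2, 0, 0)
  [out2, Hout2, Wout2] = max_pool2d_builtin::forward(X, 1, 4, 4, 2, 2, 2, 2, 0, 0)
  # The two implementations should agree to machine precision
  print("max abs difference: " + max(abs(out1 - out2)))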

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
deleted file mode 100644
index f1cb863..0000000
--- a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml
+++ /dev/null
@@ -1,103 +0,0 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-/*
- * Max pooling layer.
- */
-
-forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
-                   int strideh, int stridew, int padh, int padw)
-    return (matrix[double] out, int Hout, int Wout) {
-  /*
-   * Computes the forward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * This implementation uses `im2col` internally for each image to
-   * extract local image regions (patches) of each channel slice into
-   * columns, and then performs max pooling over the patches to compute
-   * the output maps.
-   *
-   * Inputs:
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - out: Outputs, of shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   */
-  N = nrow(X)
-  Hout = as.integer((Hin-Hf)/strideh + 1)
-  Wout = as.integer((Win-Wf)/stridew + 1)
-
-  # Max pooling - built-in implementation
-  out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
-                 stride=[strideh,stridew], padding=[padh,padw])
-}
-
-backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X,
-                    int C, int Hin, int Win, int Hf, int Wf,
-                    int strideh, int stridew, int padh, int padw)
-    return (matrix[double] dX) {
-  /*
-   * Computes the backward pass for a 2D spatial max pooling layer.
-   * The input data has N examples, each represented as a 3D volume
-   * unrolled into a single vector.
-   *
-   * Inputs:
-   *  - dout: Gradient wrt `out` from upstream, of
-   *      shape (N, C*Hout*Wout).
-   *  - Hout: Output height.
-   *  - Wout: Output width.
-   *  - X: Inputs, of shape (N, C*Hin*Win).
-   *  - C: Number of input channels (dimensionality of input depth).
-   *  - Hin: Input height.
-   *  - Win: Input width.
-   *  - Hf: Filter height.
-   *  - Wf: Filter width.
-   *  - strideh: Stride over height.
-   *  - stridew: Stride over width.
-   *  - padh: Padding for top and bottom sides.
-   *      A typical value is 0.
-   *  - padw: Padding for left and right sides.
-   *      A typical value is 0.
-   *
-   * Outputs:
-   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
-   */
-  N = nrow(X)
-
-  # Gradient of max pooling
-  dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf],
-                         stride=[strideh,stridew], padding=[padh,padw])
-}
-

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/07039caa/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
new file mode 100644
index 0000000..efd99c3
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml
@@ -0,0 +1,215 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+                   int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial convolutional layer with
+   * F filters.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=F*Hout*Wout)
+
+  # Convolution - Simple reference implementation
+  parfor (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+    # Pad image
+    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    # Convolve image with filters
+    parfor (f in 1:F, check=0) {  # all filters
+      parfor (hout in 1:Hout, check=0) {  # all output rows
+        h0 = (hout-1)*strideh + 1
+        parfor (wout in 1:Wout, check=0) {  # all output columns
+          w0 = (wout-1)*stridew + 1
+          # Create a patch of the input example corresponding spatially to the filter sizes
+          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
+          parfor (c in 1:C, check=0) {
+            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
+            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1,
+                                         cols=Hf*Wf)  # reshape
+          }
+          out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] =
+              W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,]
+        }
+      }
+    }
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+                    matrix[double] X, matrix[double] W, matrix[double] b,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a 2D spatial convolutional layer
+   * with F filters.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt `b`, of shape (F, 1).
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1)
+  Wout = as.integer((Win + 2*padw - Wf)/stridew + 1)
+
+  # Create gradient volumes
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+  dW = matrix(0, rows=F, cols=C*Hf*Wf)
+  db = matrix(0, rows=F, cols=1)
+
+  # Partial derivatives for convolution - Simple reference implementation
+  for (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+    # Pad image
+    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    for (f in 1:F) {  # all filters
+      for (hout in 1:Hout) {  # all output rows
+        h0 = (hout-1) * strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          w0 = (wout-1) * stridew + 1
+          # Create a patch of the input example corresponding spatially to the filter sizes
+          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
+          dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout],
+                                    rows=C, cols=Hf*Wf)  # reshape
+          for (c in 1:C) {
+            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
+            Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf],
+                                         rows=1, cols=Hf*Wf)  # reshape
+            dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+            dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] = matrix(dXn_padded_patch[c,],
+                                                              rows=Hf, cols=Wf)  # reshape
+            dXn_padded[c,] = dXn_padded[c,] + matrix(dXn_padded_slice,
+                                                     rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+          }
+          dW[f,] = dW[f,]
+                   + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf)
+                   * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
+          db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
+        }
+      }
+    }
+    # Unpad derivs on input
+    dXn = matrix(0, rows=C, cols=Hin*Win)
+    parfor (c in 1:C, check=0) {
+      dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+      dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+    }
+    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
+  }
+}
+
+init = function(int F, int C, int Hf, int Wf)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   *
+   * We use the heuristic by He et al., which limits the magnification
+   * of inputs/gradients during forward/backward passes by scaling
+   * unit-Gaussian weights by a factor of sqrt(2/n), under the
+   * assumption of relu neurons.
+   *  - http://arxiv.org/abs/1502.01852
+   *
+   * Inputs:
+   *  - F: Number of filters.
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *
+   * Outputs:
+   *  - W: Weights, of shape (F, C*Hf*Wf).
+   *  - b: Biases, of shape (F, 1).
+   */
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1)
+}
+
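Putting the pieces of the reference implementation together, a hypothetical
end-to-end sketch (not part of this commit) that He-initializes a conv2d
layer and runs one forward/backward pass. With 3x3 filters, stride 1, and
padding 1, the output keeps the input's spatial size, since
(5 + 2*1 - 3)/1 + 1 = 5:

  source("nn/test/conv2d_simple.dml") as conv2d_simple

  # Batch of 2 RGB images of size 5x5, and 4 filters of size 3x3
  N = 2
  C = 3
  Hin = 5
  Win = 5
  F = 4
  X = rand(rows=N, cols=C*Hin*Win)
  [W, b] = conv2d_simple::init(F, C, 3, 3)
  [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, 3, 3,
                                             1, 1, 1, 1)
  # Gradients for a dummy upstream signal
  dout = rand(rows=N, cols=F*Hout*Wout)
  [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b,
                                         C, Hin, Win, 3, 3, 1, 1, 1, 1)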
