Repository: incubator-systemml
Updated Branches:
  refs/heads/master 9dc42b2fc -> 781d24d86


http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/781d24d8/scripts/staging/SystemML-NN/nn/test/conv_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/conv_simple.dml b/scripts/staging/SystemML-NN/nn/test/conv_simple.dml
new file mode 100644
index 0000000..f065668
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/test/conv_simple.dml
@@ -0,0 +1,211 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * 2D Convolutional layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+forward = function(matrix[double] X, matrix[double] W, matrix[double] b,
+                   int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew, int padh, int padw)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial convolutional layer with
+   * F filters.  The input data has N examples, each represented as a 3D
+   * volume unrolled into a single vector.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
+   *  - b: Biases vector, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
+  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  
+  # Create output volume
+  out = matrix(0, rows=N, cols=F*Hout*Wout)
+
+  # Convolution - Simple reference implementation
+  parfor (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+    # Pad image
+    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c, ] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    # Convolve image with filters
+    parfor (f in 1:F, check=0) {  # all filters
+      parfor (hout in 1:Hout, check=0) {  # all output rows
+        h0 = (hout-1) * strideh + 1
+        parfor (wout in 1:Wout, check=0) {  # all output columns
+          w0 = (wout-1) * stridew + 1
+          # Create a patch of the input example corresponding spatially to the filter sizes
+          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
+          parfor (c in 1:C, check=0) {
+            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
+            Xn_padded_patch[c,] = 
+              matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1, cols=Hf*Wf)  # reshape
+          }
+          out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] = 
+            W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,]
+        }
+      }
+    }
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout,
+                    matrix[double] X, matrix[double] W, matrix[double] b,
+                    int C, int Hin, int Win, int Hf, int Wf,
+                    int strideh, int stridew, int padh, int padw)
+    return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
+  /*
+   * Computes the backward pass for a 2D spatial convolutional layer
+   * with F filters.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - dout: Derivatives from upstream, of shape (N, F*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Previous input data matrix, of shape (N, C*Hin*Win).
+   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
+   *  - b: Biases vector, of shape (F, 1).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
+   *  - dW: Gradient wrt W, of shape (F, C*Hf*Wf).
+   *  - db: Gradient wrt b, of shape (F, 1).
+   */
+  N = nrow(X)
+  F = nrow(W)
+  Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1)
+  Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1)
+  
+  # Create gradient volumes
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+  dW = matrix(0, rows=F, cols=C*Hf*Wf)
+  db = matrix(0, rows=F, cols=1)
+
+  # Partial derivatives for convolution - Simple reference implementation
+  for (n in 1:N) {  # all examples
+    Xn = matrix(X[n,], rows=C, cols=Hin*Win)
+    # Pad image
+    Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
+    parfor (c in 1:C) {
+      Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+      Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)
+      Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice
+      Xn_padded[c, ] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+    }
+    dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))
+    for (f in 1:F) {  # all filters
+      for (hout in 1:Hout) {  # all output rows
+        h0 = (hout-1) * strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          w0 = (wout-1) * stridew + 1
+          # Create a patch of the input example corresponding spatially to the filter sizes
+          Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
+          dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout],
+                                    rows=C, cols=Hf*Wf)  # reshape
+          for (c in 1:C) {
+            Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw)  # reshape
+            Xn_padded_patch[c,] = 
+              matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1, cols=Hf*Wf)  # reshape
+            dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+            dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] =
+              matrix(dXn_padded_patch[c,], rows=Hf, cols=Wf)  # reshape
+            dXn_padded[c,] = dXn_padded[c,] +
+              matrix(dXn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))
+          }
+          dW[f,] = dW[f,] + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf) *
+            dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
+          db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout]
+        }
+      }
+    }
+    # Unpad derivs on input
+    dXn = matrix(0, rows=C, cols=Hin*Win)
+    parfor (c in 1:C, check=0) {
+      dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+      dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+      dXn[c, ] = matrix(dXn_slice, rows=1, cols=Hin*Win)
+    }
+    dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win)
+  }
+}
+
+init = function(int F, int C, int Hf, int Wf)
+    return (matrix[double] W, matrix[double] b) {
+  /*
+   * Initialize the parameters of this layer.
+   * 
+   * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852],
+   * which limits the magnification of inputs/gradients during
+   * forward/backward passes by scaling unit-Gaussian weights by a
+   * factor of sqrt(2/n), under the assumption of relu neurons.
+   *
+   * Inputs:
+   *  - F: Number of filters.
+   *  - C: Number of input channels (dimensionality of depth).
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *
+   * Outputs:
+   *  - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf).
+   *  - b: Biases vector, of shape (F, 1).
+   */
+  W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf))
+  b = matrix(0, rows=F, cols=1) 
+}
+
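
For reference, a minimal usage sketch of the reference layer above, assuming the script is run from the SystemML-NN root so the source() path resolves. The output shape follows the docstring, Hout = (Hin + 2*padh - Hf)/strideh + 1, so a 5x5 input with 3x3 filters, stride 1, and padding 1 stays 5x5; init() scales unit-Gaussian weights by sqrt(2/(C*Hf*Wf)), roughly 0.27 for the 3x3x3 filters used here.

    source("nn/test/conv_simple.dml") as conv_simple
    N = 2    # num examples
    C = 3    # num channels
    Hin = 5  # input height
    Win = 5  # input width
    F = 4    # num filters
    Hf = 3   # filter height
    Wf = 3   # filter width
    X = rand(rows=N, cols=C*Hin*Win)
    [W, b] = conv_simple::init(F, C, Hf, Wf)
    [out, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, 1, 1, 1, 1)
    print("Hout = " + Hout + ", Wout = " + Wout)  # (5 + 2*1 - 3)/1 + 1 = 5 for both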

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/781d24d8/scripts/staging/SystemML-NN/nn/test/grad_check.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/grad_check.dml b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
new file mode 100644
index 0000000..af985a3
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/test/grad_check.dml
@@ -0,0 +1,1139 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Gradient checks for various architectures.
+ */
+source("nn/layers/affine.dml") as affine
+source("nn/layers/conv.dml") as conv
+source("nn/layers/conv_builtin.dml") as conv_builtin
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+source("nn/layers/dropout.dml") as dropout
+source("nn/layers/l1_loss.dml") as l1_loss
+source("nn/layers/l1_reg.dml") as l1_reg
+source("nn/layers/l2_loss.dml") as l2_loss
+source("nn/layers/l2_reg.dml") as l2_reg
+source("nn/layers/log_loss.dml") as log_loss
+source("nn/layers/max_pool.dml") as max_pool
+source("nn/layers/max_pool_builtin.dml") as max_pool_builtin
+source("nn/layers/relu.dml") as relu
+source("nn/layers/sigmoid.dml") as sigmoid
+source("nn/layers/softmax.dml") as softmax
+source("nn/layers/tanh.dml") as tanh
+source("nn/test/conv_simple.dml") as conv_simple
+source("nn/test/max_pool_simple.dml") as max_pool_simple
+source("nn/util.dml") as util
+
+check_rel_error = function(double dw_a, double dw_n, double lossph, double lossmh)
+    return (double rel_error) {
+  /*
+   * Check and report any issues with the relative error measure between
+   * the analytical and numerical partial derivatives.
+   *
+   *  - Issues an "ERROR" statement for relative errors > 1e-2, 
+   *  indicating that the gradient is likely incorrect.
+   *  - Issues a "WARNING" statement for relative errors < 1e-2
+   *  but > 1e-4, indicating that the gradient may be incorrect.
+   *
+   * Inputs:
+   *  - dw_a: Analytical partial derivative wrt w.
+   *  - dw_n: Numerical partial derivative wrt w.
+   *  - lossph: Loss evaluated with w set to w+h.
+   *  - lossmh: Loss evaluated with w set to w-h.
+   *
+   * Outputs:
+   *  - rel_error: Relative error measure between the two derivatives.
+   */
+  # Compute relative error
+  rel_error = util::compute_rel_error(dw_a, dw_n)
+  
+  # Evaluate relative error
+  if (rel_error > 1e-2) {
+      print("ERROR: Relative error " + rel_error + " > 1e-2 with " + dw_a +
+            " analytical vs " + dw_n + " numerical, with lossph " + lossph +
+            " and lossmh " + lossmh)
+  }
+  else if (rel_error > 1e-4 & rel_error <= 1e-2) {
+      print("WARNING: Relative error " + rel_error + " <= 1e-2 & > 1e-4 with " + dw_a +
+            " analytical vs " + dw_n + " numerical, with lossph " + lossph +
+            " and lossmh " + lossmh)
+  }
+}
+
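
Every check below follows the same recipe: perturb one entry by h = 1e-5, evaluate the loss at w+h and w-h, and take the centered difference (lossph - lossmh) / (2h) as the numerical gradient, which is then compared to the analytical gradient (the comparison itself is delegated to util::compute_rel_error, which is not shown in this hunk). A self-contained sketch of the idea on f(w) = w^2, whose analytical gradient is 2w, using the common |a - n| / max(|a|, |n|) form of relative error for illustration:

    w = 3.0
    h = 1e-5
    dw_a = 2 * w                         # analytical gradient: 6.0
    lossph = (w + h)^2
    lossmh = (w - h)^2
    dw_n = (lossph - lossmh) / (2 * h)   # numerical gradient: ~6.0
    rel_error = abs(dw_a - dw_n) / max(abs(dw_a), abs(dw_n))
    print("rel_error: " + rel_error)     # far below the 1e-4 warning threshold
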
+affine = function() {
+  /*
+   * Gradient check for the affine layer.
+   */
+  print("Grad checking the affine layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 100 # num features
+  M = 10 # num neurons
+  X = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=M)
+  [W, b] = affine::init(D, M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = affine::forward(X, W, b)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] = affine::backward(dout, X, W, b)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = affine::forward(X, W, b)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = affine::forward(X, W, b)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      outmh = affine::forward(X, W, b)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      outph = affine::forward(X, W, b)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      outmh = affine::forward(X, W, b)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      outph = affine::forward(X, W, b)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+conv = function() {
+  /*
+   * Gradient check for the convolutional layer using `im2col`.
+   */
+  print("Grad checking the `im2col` convolutional layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=F*Hin*Win)
+
+  # Create layers
+  [W, b] = conv::init(F, C, Hf, Wf)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] =
+    conv::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+conv_builtin = function() {
+  /*
+   * Gradient check for the convolutional layer using built-in functions.
+   */
+  print("Grad checking the built-in convolutional layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=F*Hin*Win)
+
+  # Create layers
+  [W, b] = conv_builtin::init(F, C, Hf, Wf)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] =
+    conv_builtin::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, Hout, Wout] = conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+conv_simple = function() {
+  /*
+   * Gradient check for the simple reference convolutional layer.
+   */
+  print("Grad checking the simple reference convolutional layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=F*Hin*Win)
+
+  # Create layers
+  [W, b] = conv_simple::init(F, C, Hf, Wf)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  dout = l2_loss::backward(out, y)
+  [dX, dW, db] =
+    conv_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W.")
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      [outmh, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      W[i,j] = old + h
+      [outph, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      W[i,j] = old  # reset
+      dW_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b.")
+  for (i in 1:nrow(b)) {
+    for (j in 1:ncol(b)) {
+      # Compute numerical derivative
+      old = as.scalar(b[i,j])
+      b[i,j] = old - h
+      [outmh, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossmh = l2_loss::forward(outmh, y)
+      b[i,j] = old + h
+      [outph, Hout, Wout] = conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+      lossph = l2_loss::forward(outph, y)
+      b[i,j] = old  # reset
+      db_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(db[i,j]), db_num, lossph, lossmh)
+    }
+  }
+}
+
+cross_entropy_loss = function() {
+  /*
+   * Gradient check for the cross-entropy loss function.
+   */
+  print("Grad checking the cross-entropy loss function.")
+
+  # Generate data
+  N = 3 # num examples
+  K = 10 # num targets
+  pred = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+  pred = pred / rowSums(pred)  # normalized probs
+  y = rand(rows=N, cols=K, min=0, max=1, pdf="uniform")
+  y = y / rowSums(y)  # normalized probs
+
+  # Compute analytical gradient
+  dpred = cross_entropy_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = cross_entropy_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = cross_entropy_loss::forward(pred, y)
+      pred[i,j] = old  # reset W[i,j]
+      dpred_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+dropout = function() {
+  /*
+   * Gradient check for the (inverted) dropout layer.
+   */
+  print("Grad checking the (inverted) dropout layer with L2 loss.")
+
+  # Generate data
+  N = 3  # num examples
+  M = 100  # num neurons
+  p = 0.5  # probability of dropping neuron output
+  seed = as.integer(floor(as.scalar(rand(rows=1, cols=1, min=1, max=100000))))  # random seed
+  X = rand(rows=N, cols=M)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, mask] = dropout::forward(X, p, seed)
+  dout = l2_loss::backward(out, y)
+  dX = dropout::backward(dout, X, p, mask)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, mask] = dropout::forward(X, p, seed)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, mask] = dropout::forward(X, p, seed)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+l1_loss = function() {
+  /*
+   * Gradient check for the L1 loss function.
+   */
+  print("Grad checking the L1 loss function.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 2 # num targets
+  pred = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D)
+
+  # Compute analytical gradient
+  dpred = l1_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = l1_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = l1_loss::forward(pred, y)
+      pred[i,j] = old  # reset W[i,j]
+      dpred_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+l1_reg = function() {
+  /*
+   * Gradient check for the L1 regularization function.
+   */
+  print("Grad checking the L1 regularization function.")
+
+  # Generate data
+  D = 5 # num features
+  M = 3 # num neurons
+  lambda = 0.01
+  W = rand(rows=D, cols=M)
+
+  # Compute analytical gradient
+  dW = l1_reg::backward(W, lambda)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      reg_lossmh = l1_reg::forward(W, lambda)
+      W[i,j] = old + h
+      reg_lossph = l1_reg::forward(W, lambda)
+      W[i,j] = old  # reset W[i,j]
+      dW_num = (reg_lossph - reg_lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, reg_lossph, reg_lossmh)
+    }
+  }
+}
+
+l2_loss = function() {
+  /*
+   * Gradient check for the L2 loss function.
+   */
+  print("Grad checking the L2 loss function.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 2 # num targets
+  pred = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D)
+
+  # Compute analytical gradient
+  dpred = l2_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = l2_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = l2_loss::forward(pred, y)
+      pred[i,j] = old  # reset W[i,j]
+      dpred_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+l2_reg = function() {
+  /*
+   * Gradient check for the L2 regularization function.
+   */
+  print("Grad checking the L2 regularization function.")
+
+  # Generate data
+  D = 5 # num features
+  M = 3 # num neurons
+  lambda = 0.01
+  W = rand(rows=D, cols=M)
+
+  # Compute analytical gradient
+  dW = l2_reg::backward(W, lambda)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(W)) {
+    for (j in 1:ncol(W)) {
+      # Compute numerical derivative
+      old = as.scalar(W[i,j])
+      W[i,j] = old - h
+      reg_lossmh = l2_reg::forward(W, lambda)
+      W[i,j] = old + h
+      reg_lossph = l2_reg::forward(W, lambda)
+      W[i,j] = old  # reset W[i,j]
+      dW_num = (reg_lossph - reg_lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dW[i,j]), dW_num, reg_lossph, reg_lossmh)
+    }
+  }
+}
+
+log_loss = function() {
+  /*
+   * Gradient check for the log loss function.
+   */
+  print("Grad checking the log loss function.")
+
+  # Generate data
+  N = 20 # num examples
+  D = 1 # num targets
+  pred = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
+  y = round(rand(rows=N, cols=D, min=0, max=1, pdf="uniform"))
+
+  # Compute analytical gradient
+  dpred = log_loss::backward(pred, y)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(pred)) {
+    for (j in 1:ncol(pred)) {
+      # Compute numerical derivative
+      old = as.scalar(pred[i,j])
+      pred[i,j] = old - h
+      lossmh = log_loss::forward(pred, y)
+      pred[i,j] = old + h
+      lossph = log_loss::forward(pred, y)
+      pred[i,j] = old  # reset W[i,j]
+      dpred_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dpred[i,j]), dpred_num, lossph, lossmh)
+    }
+  }
+}
+
+max_pool = function() {
+  /*
+   * Gradient check for the max pooling layer.
+   */
+  print("Grad checking the max pooling layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  Hf = 2  # pool filter height
+  Wf = 2  # pool filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=C*2*2)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+  dout = l2_loss::backward(out, y)
+  dX = max_pool::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+max_pool_builtin = function() {
+  /*
+   * Gradient check for the max pooling layer.
+   */
+  print("Grad checking the built-in max pooling layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  Hf = 2  # pool filter height
+  Wf = 2  # pool filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=C*2*2)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+  dout = l2_loss::backward(out, y)
+  dX = max_pool_builtin::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+max_pool_simple = function() {
+  /*
+   * Gradient check for the simple reference max pooling layer.
+   */
+  print("Grad checking the simple reference max pooling layer with L2 loss.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  Hf = 2  # pool filter height
+  Wf = 2  # pool filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win)
+  y = rand(rows=N, cols=C*2*2)
+
+  # Compute analytical gradients of loss wrt parameters
+  [out, Hout, Wout] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+  dout = l2_loss::backward(out, y)
+  dX = max_pool_simple::backward(dout, Hout, Wout, X, C, Hin, Win, Hf, Wf, stride, stride)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      [outmh, Hout, Wout] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      [outph, Hout, Wout] = max_pool_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+relu = function() {
+  /*
+   * Gradient check for the ReLU nonlinearity layer.
+   *
+   * NOTE: This could result in a false-negative in which the test
+   * fails due to a kink being crossed in the nonlinearity.  This
+   * occurs when the tests, f(x-h) and f(x+h), end up on opposite
+   * sides of the zero threshold of max(0, fx).  For now, just run
+   * the tests again.  In the future, we can explicitly check for
+   * this and rerun the test automatically.
+   */
+  print("Grad checking the ReLU nonlinearity layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  M = 10 # num neurons
+  X = rand(rows=N, cols=M, min=-5, max=5)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = relu::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = relu::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = relu::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = relu::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+sigmoid = function() {
+  /*
+   * Gradient check for the sigmoid nonlinearity layer.
+   */
+  print("Grad checking the sigmoid nonlinearity layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  M = 10 # num neurons
+  X = rand(rows=N, cols=M)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = sigmoid::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = sigmoid::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = sigmoid::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = sigmoid::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+softmax = function() {
+  /*
+   * Gradient check for the softmax layer.
+   */
+  print("Grad checking the softmax layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  D = 10 # num classes
+  X = rand(rows=N, cols=D)
+  y = rand(rows=N, cols=D, min=0, max=1, pdf="uniform")
+  y = y / rowSums(y)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = softmax::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = softmax::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = softmax::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = softmax::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+tanh = function() {
+  /*
+   * Gradient check for the hyperbolic tangent (tanh) nonlinearity layer.
+   */
+  print("Grad checking the tanh nonlinearity layer with L2 loss.")
+
+  # Generate data
+  N = 3 # num examples
+  M = 10 # num neurons
+  X = rand(rows=N, cols=M)
+  y = rand(rows=N, cols=M)
+
+  # Compute analytical gradients of loss wrt parameters
+  out = tanh::forward(X)
+  dout = l2_loss::backward(out, y)
+  dX = tanh::backward(dout, X)
+
+  # Grad check
+  h = 1e-5
+  for (i in 1:nrow(X)) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old = as.scalar(X[i,j])
+      X[i,j] = old - h
+      outmh = tanh::forward(X)
+      lossmh = l2_loss::forward(outmh, y)
+      X[i,j] = old + h
+      outph = tanh::forward(X)
+      lossph = l2_loss::forward(outph, y)
+      X[i,j] = old  # reset
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+}
+
+two_layer_affine_l2_net = function() {
+  /*
+   * Gradient check for a two-layer, fully-connected, feed-forward
+   * network with ReLU nonlinearity and L2 loss.
+   *
+   * NOTE: This could result in a false-negative in which the test
+   * fails due to a kink being crossed in the ReLU nonlinearity.  This
+   * occurs when the tests, f(x-h) and f(x+h), end up on opposite
+   * sides of the zero threshold of max(0, fx).  For now, just run
+   * the tests again.  In the future, we can explicitly check for
+   * this and rerun the test automatically.
+   */
+  print("Grad checking a two-layer, fully-connected, feed-forward network with a ReLU " +
+        "nonlinearity, and an L2 loss function.")
+
+  # Generate input data
+  N = 1000 # num examples
+  D = 100 # num features
+  yD = 5 # num targets
+  X = rand(rows=N, cols=D, pdf="normal") * 0.0001
+  y = rand(rows=N, cols=yD)
+
+  # Create 2-layer, fully-connected network
+  M = 10 # number of hidden neurons
+  [W1, b1] = affine::init(D, M)
+  [W2, b2] = affine::init(M, yD)
+
+  # Optimize for short "burn-in" time to move to characteristic
+  # mode of operation and unmask any real issues.
+  print(" - Burn-in:")
+  lr = 0.0001
+  decay = 0.99
+  for(i in 1:5) {
+    # Compute forward and backward passes of net
+    [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
+    print("   - L2 loss: " + loss)
+
+    # Optimize with basic SGD
+    W1 = W1 - lr * dW1
+    b1 = b1 - lr * db1
+    W2 = W2 - lr * dW2
+    b2 = b2 - lr * db2
+    lr = lr * decay
+  }
+
+  # Compute analytical gradients
+  [pred, loss, dX, dW1, db1, dW2, db2] = two_layer_affine_l2_net_run(X, y, W1, b1, W2, b2)
+  
+  # Grad check
+  h = 1e-5
+  print(" - Grad checking X.")
+  for (i in 1:2) {
+    for (j in 1:ncol(X)) {
+      # Compute numerical derivative
+      old_w = as.scalar(X[i,j])
+      X[i,j] = old_w - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      X[i,j] = old_w + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      X[i,j] = old_w  # reset W[i,j]
+      dX_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dX[i,j]), dX_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W1.")
+  for (i in 1:nrow(W1)) {
+    for (j in 1:ncol(W1)) {
+      # Compute numerical derivative
+      old_w = as.scalar(W1[i,j])
+      W1[i,j] = old_w - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W1[i,j] = old_w + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W1[i,j] = old_w  # reset W[i,j]
+      dWij_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dW1[i,j]), dWij_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking W2.")
+  for (i in 1:nrow(W2)) {
+    for (j in 1:ncol(W2)) {
+      # Compute numerical derivative
+      old_w = as.scalar(W2[i,j])
+      W2[i,j] = old_w - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W2[i,j] = old_w + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      W2[i,j] = old_w  # reset W[i,j]
+      dWij_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(dW2[i,j]), dWij_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b1.")
+  for (i in 1:nrow(b1)) {
+    for (j in 1:ncol(b1)) {
+      # Compute numerical derivative
+      old_b = as.scalar(b1[i,j])
+      b1[i,j] = old_b - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b1[i,j] = old_b + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b1[i,j] = old_b  # reset b[1,j]
+      dbij_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(db1[i,j]), dbij_num, lossph, lossmh)
+    }
+  }
+
+  print(" - Grad checking b2.")
+  for (i in 1:nrow(b2)) {
+    for (j in 1:ncol(b2)) {
+      # Compute numerical derivative
+      old_b = as.scalar(b2[i,j])
+      b2[i,j] = old_b - h
+      [lossmh, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b2[i,j] = old_b + h
+      [lossph, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+      b2[i,j] = old_b  # reset b[1,j]
+      dbij_num = (lossph - lossmh) / (2 * h) # numerical derivative
+
+      # Check error
+      rel_error = check_rel_error(as.scalar(db2[i,j]), dbij_num, lossph, lossmh)
+    }
+  }
+}
+
+/*
+ * Test network with forward/backward functions.
+ */
+two_layer_affine_l2_net_run = function(matrix[double] X, matrix[double] y,
+                                       matrix[double] W1, matrix[double] b1,
+                                       matrix[double] W2, matrix[double] b2)
+    return (matrix[double] pred, double loss,
+            matrix[double] dX,
+            matrix[double] dW1, matrix[double] db1,
+            matrix[double] dW2, matrix[double] db2) {
+  # Compute forward pass
+  [loss, pred, aout, hout] = two_layer_affine_l2_net_forward(X, y, W1, b1, W2, b2)
+
+  # Compute backward pass
+  [dX, dpred, daout, dhout, dW1, db1, dW2, db2] =
+    two_layer_affine_l2_net_backward(X, y, pred, aout, hout, W1, b1, W2, b2)
+}
+
+two_layer_affine_l2_net_forward = function(matrix[double] X, matrix[double] y,
+                                           matrix[double] W1, matrix[double] b1,
+                                           matrix[double] W2, matrix[double] b2)
+    return (double loss, matrix[double] pred, matrix[double] aout, matrix[double] hout) {
+  # Compute forward pass
+  hout = affine::forward(X, W1, b1)
+  aout = relu::forward(hout)
+  pred = affine::forward(aout, W2, b2)
+
+  # Compute loss
+  loss = l2_loss::forward(pred, y)
+}
+
+two_layer_affine_l2_net_backward = function(matrix[double] X, matrix[double] y, matrix[double] pred,
+                                            matrix[double] aout, matrix[double] hout,
+                                            matrix[double] W1, matrix[double] b1,
+                                            matrix[double] W2, matrix[double] b2)
+    return (matrix[double] dX, matrix[double] dpred,
+            matrix[double] daout, matrix[double] dhout,
+            matrix[double] dW1, matrix[double] db1, matrix[double] dW2, matrix[double] db2) {
+  # Compute backward pass
+  dpred = l2_loss::backward(pred, y)
+  [daout, dW2, db2] = affine::backward(dpred, aout, W2, b2)
+  dhout = relu::backward(daout, hout)
+  [dX, dW1, db1] = affine::backward(dhout, X, W1, b1)
+}
+
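
Each check above is a stand-alone, zero-argument function, so a driver script can source this file and invoke the checks individually, for example along these lines (illustrative only; the actual entry point added in this commit is tests.dml, shown further below):

    source("nn/test/grad_check.dml") as grad_check
    grad_check::affine()
    grad_check::conv_simple()
    grad_check::max_pool_simple()
    grad_check::two_layer_affine_l2_net()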

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/781d24d8/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml b/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml
new file mode 100644
index 0000000..2f90779
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/test/max_pool_simple.dml
@@ -0,0 +1,130 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Max pooling layer.
+ *
+ * This implementation is intended to be a simple, reference version.
+ */
+forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf,
+                   int strideh, int stridew)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * This implementation is intended to be a simple, reference version.
+   *
+   * Inputs:
+   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+  N = nrow(X)
+  Hout = as.integer((Hin - Hf) / strideh + 1)
+  Wout = as.integer((Win - Wf) / stridew + 1)
+
+  # Create output volume
+  out = matrix(0, rows=N, cols=C*Hout*Wout)  
+
+  # Max pooling
+  parfor (n in 1:N, check=0) {  # all examples
+    img = matrix(X[n,], rows=C, cols=Hin*Win)
+    parfor (c in 1:C, check=0) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin, cols=Win)
+      parfor (hout in 1:Hout, check=0) {  # all output rows
+        hin = (hout-1) * strideh + 1
+        parfor (wout in 1:Wout, check=0) {  # all output columns
+          win = (wout-1) * stridew + 1
+          out[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] =
+            max(img_slice[hin:hin+Hf-1, win:win+Wf-1])
+        }
+      }
+    }
+  }
+}
+
+backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, int C,
+                    int Hin, int Win, int Hf, int Wf, int strideh, int stridew)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a 2D spatial max pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Derivatives from upstream, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   *  - X: Input data matrix, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt X, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+  
+  # Create gradient volume
+  dX = matrix(0, rows=N, cols=C*Hin*Win)
+  
+  # Gradient of max pooling
+  parfor (n in 1:N, check=0) {  # all examples
+    img = matrix(X[n,], rows=C, cols=Hin*Win)
+    dimg = matrix(0, rows=C, cols=Hin*Win)
+    parfor (c in 1:C, check=0) {  # all channels
+      img_slice = matrix(img[c,], rows=Hin, cols=Win)
+      dimg_slice = matrix(0, rows=Hin, cols=Win)
+      for (hout in 1:Hout, check=0) {  # all output rows
+        hin = (hout-1) * strideh + 1
+        for (wout in 1:Wout) {  # all output columns
+          win = (wout-1) * stridew + 1
+          img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1]
+          max_val = max(img_slice_patch)
+          max_val_ind = ppred(img_slice_patch, max_val, "==")  # max value indicator
+          # gradient passes through only for the max value in this patch
+          dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout]
+          dimg_slice[hin:hin+Hf-1, win:win+Wf-1] =
+            dimg_slice[hin:hin+Hf-1, win:win+Wf-1] + dimg_slice_patch
+        }
+      }
+      dimg[c,] = matrix(dimg_slice, rows=1, cols=Hin*Win)
+    }
+    dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win)
+  }
+}
+
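
A small worked example of the forward pass above (illustrative sketch): pooling a single-channel 4x4 input with a 2x2 window and stride 2 keeps the maximum of each non-overlapping 2x2 block, so row-major values 1..16 reduce to 6, 8, 14, 16, which is also the target used by the max_pool correctness check in test.dml below.

    source("nn/test/max_pool_simple.dml") as max_pool_simple
    C = 1
    Hin = 4
    Win = 4
    X = matrix(seq(1,16,1), rows=1, cols=C*Hin*Win)  # one example, values 1..16 row-major
    [out, Hout, Wout] = max_pool_simple::forward(X, C, Hin, Win, 2, 2, 2, 2)
    # Hout = Wout = 2; out should be [6, 8, 14, 16]
    print("out: " + as.scalar(out[1,1]) + " " + as.scalar(out[1,2]) + " " + as.scalar(out[1,3]) + " " + as.scalar(out[1,4]))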

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/781d24d8/scripts/staging/SystemML-NN/nn/test/test.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/test.dml b/scripts/staging/SystemML-NN/nn/test/test.dml
new file mode 100644
index 0000000..58ee3e1
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/test/test.dml
@@ -0,0 +1,192 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Various tests, not including gradient checks.
+ */
+source("nn/layers/conv.dml") as conv
+source("nn/layers/conv_builtin.dml") as conv_builtin
+source("nn/layers/max_pool.dml") as max_pool
+source("nn/layers/max_pool_builtin.dml") as max_pool_builtin
+source("nn/test/conv_simple.dml") as conv_simple
+source("nn/test/max_pool_simple.dml") as max_pool_simple
+source("nn/util.dml") as util
+
+conv = function() {
+  /*
+   * Test for the `conv` functions.
+   */
+  print("Testing the conv functions.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 3  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  F = 2  # num filters
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 1
+  pad = 1
+  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+
+  # Create layer
+  [W, b] = conv::init(F, C, Hf, Wf)
+
+  # Forward
+  [out, Hout, Wout] = conv::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_simple, Hout_simple, Wout_simple] =
+    conv_simple::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+  [out_builtin, Hout_builtin, Wout_builtin] =
+    conv_builtin::forward(X, W, b, C, Hin, Win, Hf, Wf, stride, stride, pad, pad)
+
+  # Equivalency check
+  out = matrix(out, rows=1, cols=N*F*Hout*Wout)
+  out_simple = matrix(out_simple, rows=1, cols=N*F*Hout*Wout)
+  out_builtin = matrix(out_builtin, rows=1, cols=N*F*Hout*Wout)
+  for (i in 1:length(out)) {
+    rel_error = util::check_rel_error(as.scalar(out[1,i]), as.scalar(out_simple[1,i]), 1e-10, 1e-12)
+    rel_error = util::check_rel_error(as.scalar(out[1,i]), as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
+  }
+}
+
+im2col = function() {
+  /*
+   * Test for the `im2col` and `col2im` functions.
+   */
+  print("Testing the im2col and col2im functions.")
+
+  # Generate data
+  C = 3  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  Hf = 3  # filter height
+  Wf = 3  # filter width
+  stride = 2
+  pad = (Hin * stride - Hin + Hf - stride) / 2
+  Hout = as.integer((Hin + 2 * pad - Hf) / stride + 1)
+  Wout = as.integer((Win + 2 * pad - Wf) / stride + 1)
+  x = rand(rows=C, cols=Hin*Win)
+
+  # pad
+  x_pad = util::pad_image(x, Hin, Win, pad, pad)
+
+  # im2col
+  x_cols = util::im2col(x_pad, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride)
+
+  # col2im
+  x_pad2 = util::col2im(x_cols, C, Hin+2*pad, Win+2*pad, Hf, Wf, stride, stride, "none")
+
+  # Equivalency check
+  equivalent = util::all_equal(x_pad, x_pad2)
+  if (!equivalent)
+    print("ERROR: im2col and then col2im does not yield the original image.")
+}
+
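
The pad value used in the test above appears to be chosen so that the im2col/col2im round trip preserves the spatial size: with pad = (Hin*stride - Hin + Hf - stride)/2, the output size formula gives Hout = Hin. For the values in this test (Hin = 5, Hf = 3, stride = 2), that is pad = (10 - 5 + 3 - 2)/2 = 3 and Hout = (5 + 2*3 - 3)/2 + 1 = 5. A quick sketch of the arithmetic:

    Hin = 5
    Hf = 3
    stride = 2
    pad = (Hin * stride - Hin + Hf - stride) / 2          # 3
    Hout = as.integer((Hin + 2 * pad - Hf) / stride + 1)  # 5, i.e. equal to Hin
    print("pad = " + pad + ", Hout = " + Hout)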
+padding = function() {
+  /*
+   * Test for the `pad_image` and `unpad_image` functions.
+   */
+  print("Testing the padding and unpadding functions.")
+
+  # Generate data
+  C = 3  # num channels
+  Hin = 5  # input height
+  Win = 5  # input width
+  pad = 3  # padding
+  x = rand(rows=C, cols=Hin*Win)
+
+  # Pad image
+  x_pad = util::pad_image(x, Hin, Win, pad, pad)
+  
+  # Check for padded rows & columns
+  for (c in 1:C) {
+    x_pad_slice = matrix(x_pad[c,], rows=Hin+2*pad, cols=Win+2*pad)
+    for (i in 1:pad) {
+      rowsum = sum(x_pad_slice[i,])
+      colsum = sum(x_pad_slice[,i])
+      if (rowsum != 0)
+        print("ERROR: Padding was not applied to row " + i + ".")
+      if (colsum != 0)
+        print("ERROR: Padding was not applied to column " + i + ".")
+    }
+  }
+
+  # Unpad image
+  x1 = util::unpad_image(x_pad, Hin, Win, pad, pad)
+
+  # Equivalency check
+  equivalent = util::all_equal(x, x1)
+  if (!equivalent)
+    print("ERROR: Padding and then unpadding does not yield the original 
image.")
+}
+
+max_pool = function() {
+  /*
+   * Test for the `max_pool` functions.
+   */
+  print("Testing the max pool functions.")
+
+  # Generate data
+  N = 2  # num examples
+  C = 3  # num channels
+  Hin = 8  # input height
+  Win = 8  # input width
+  Hf = 2  # filter height
+  Wf = 2  # filter width
+  stride = 2
+  X = rand(rows=N, cols=C*Hin*Win, pdf="normal")
+
+  # Forward
+  [out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+  [out_simple, Hout_simple, Wout_simple] =
+    max_pool_simple::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+  [out_builtin, Hout_builtin, Wout_builtin] =
+    max_pool_builtin::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+
+  # Equivalency check
+  out = matrix(out, rows=1, cols=N*C*Hout*Wout)
+  out_simple = matrix(out_simple, rows=1, cols=N*C*Hout*Wout)
+  out_builtin = matrix(out_builtin, rows=1, cols=N*C*Hout*Wout)
+  for (i in 1:length(out)) {
+    rel_error = util::check_rel_error(as.scalar(out[1,i]),
+                                      as.scalar(out_simple[1,i]), 1e-10, 1e-12)
+    rel_error = util::check_rel_error(as.scalar(out[1,i]),
+                                      as.scalar(out_builtin[1,i]), 1e-10, 1e-12)
+  }
+
+  # ---
+  # Check for correct behavior
+  # Generate data
+  C = 2  # num channels
+  Hin = 4  # input height
+  Win = 4  # input width
+  X = matrix(seq(1,16,1), rows=Hin, cols=Win)
+  X = matrix(rbind(X, t(X)), rows=1, cols=C*Hin*Win)
+  X = rbind(X, X)  # N=2
+
+  # Forward
+  [out, Hout, Wout] = max_pool::forward(X, C, Hin, Win, Hf, Wf, stride, stride)
+
+  # Equivalency check
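+  # For a 4x4 grid of 1..16 filled row-wise, 2x2 max pooling with stride 2
+  # selects {6, 8, 14, 16}; the transposed second channel yields {6, 14, 8, 16}.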
+  target = matrix("6 8 14 16 6 14 8 16", rows=1, cols=C*Hout*Wout)
+  target = rbind(target, target)  # N=2
+  tmp = util::check_all_equal(out, target)
+}
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/781d24d8/scripts/staging/SystemML-NN/nn/test/tests.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/test/tests.dml b/scripts/staging/SystemML-NN/nn/test/tests.dml
new file mode 100644
index 0000000..1b91967
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/test/tests.dml
@@ -0,0 +1,72 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Script to run tests.
+ */
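+
+# A rough sketch of how this suite is typically launched (the exact jar name
+# and path may vary by setup), using SystemML's command-line runner from the
+# SystemML-NN base directory:
+#
+#   spark-submit SystemML.jar -f nn/test/tests.dml
+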
+source("nn/test/grad_check.dml") as grad_check
+source("nn/test/test.dml") as test
+
+print("")
+print("Starting grad checks.")
+print("---")
+
+tmp = grad_check::cross_entropy_loss()
+tmp = grad_check::l1_loss()
+tmp = grad_check::l2_loss()
+tmp = grad_check::log_loss()
+tmp = grad_check::affine()
+tmp = grad_check::conv_simple()
+tmp = grad_check::conv()
+tmp = grad_check::conv_builtin()
+tmp = grad_check::dropout()
+tmp = grad_check::l1_reg()
+tmp = grad_check::l2_reg()
+tmp = grad_check::max_pool_simple()
+tmp = grad_check::max_pool()
+tmp = grad_check::max_pool_builtin()
+tmp = grad_check::relu()
+tmp = grad_check::sigmoid()
+tmp = grad_check::softmax()
+tmp = grad_check::tanh()
+tmp = grad_check::two_layer_affine_l2_net()
+
+print("---")
+print("Grad checks complete -- look for any ERRORs or WARNINGs.")
+print("If any tests involving ReLUs failed, try a few times " +
+      "to ensure that they were not false negatives due to " +
+      "kinks being crossed.")
+print("")
+
+print("")
+print("Starting other tests.")
+print("---")
+
+tmp = test::im2col()
+tmp = test::padding()
+tmp = test::conv()
+tmp = test::max_pool()
+
+print("---")
+print("Other tests complete -- look for any ERRORs or WARNINGs.")
+print("")
+print("")
+

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/781d24d8/scripts/staging/SystemML-NN/nn/util.dml
----------------------------------------------------------------------
diff --git a/scripts/staging/SystemML-NN/nn/util.dml b/scripts/staging/SystemML-NN/nn/util.dml
new file mode 100644
index 0000000..213363b
--- /dev/null
+++ b/scripts/staging/SystemML-NN/nn/util.dml
@@ -0,0 +1,266 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Utility functions.
+ */
+all_equal = function(matrix[double] X1, matrix[double] X2)
+    return(boolean equivalent) {
+  /*
+   * Determine if two matrices are equivalent.
+   *
+   * Inputs:
+   *  - X1: Input matrix, of shape (any, any).
+   *  - X2: Input matrix, of same shape as X1.
+   *
+   * Outputs:
+   *  - equivalent: Whether or not the two matrices are equivalent.
+   */
+  equivalent = as.logical(prod(ppred(X1, X2, "==")))
+}
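+
+# A minimal usage sketch (hypothetical values, from a script that sources this
+# file as `util`):
+#
+#   A = matrix(1, rows=2, cols=2)
+#   B = matrix(1, rows=2, cols=2)
+#   same = util::all_equal(A, B)  # TRUE, since every cell matches exactly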
+
+check_all_equal = function(matrix[double] X1, matrix[double] X2)
+    return(boolean equivalent) {
+  /*
+   * Check if two matrices are equivalent, and report any issues.
+   *
+   *  - Issues an "ERROR" statement if elements of the two matrices
+   *  are not equal.
+   *
+   * Inputs:
+   *  - X1: Input matrix, of shape (any, any).
+   *  - X2: Input matrix, of same shape as X1.
+   *
+   * Outputs:
+   *  - equivalent: Whether or not the two matrices are equivalent.
+   */
+  # Determine if matrices are equivalent
+  equivalent = all_equal(X1, X2)
+
+  # Report an error if the matrices are not equivalent
+  if (!equivalent) {
+      print("ERROR: The two matrices are not equivalent.")
+  }
+}
+
+compute_rel_error = function(double x1, double x2) return (double rel_error) {
+  /*
+   * Relative error measure between two values.
+   *
+   * Uses smoothing to avoid divide-by-zero errors.
+   *
+   * Inputs:
+   *  - x1: First value.
+   *  - x2: Second value.
+   *
+   * Outputs:
+   *  - rel_error: Relative error measure between the two values.
+   */
+  rel_error = abs(x1 - x2) / max(1e-8, abs(x1) + abs(x2))
+}
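+
+# A small worked sketch (hypothetical values): for x1 = 1e-12 and x2 = 0, a
+# plain ratio would divide by ~1e-12, but the max(1e-8, .) floor keeps the
+# result bounded:
+#
+#   rel_error = abs(1e-12 - 0) / max(1e-8, 1e-12 + 0)  # = 1e-12 / 1e-8 = 1e-4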
+
+check_rel_error = function(double x1, double x2, double thresh_error, double thresh_warn)
+    return (double rel_error) {
+  /*
+   * Check and report any issues with the relative error measure between
+   * two values.
+   *
+   *  - Issues an "ERROR" statement for relative errors > thresh_error,
+   *  indicating that the implementation is likely incorrect.
+   *  - Issues a "WARNING" statement for relative errors < thresh_error
+   *  but > thresh_warn, indicating that the implementation may be incorrect.
+   *
+   * Inputs:
+   *  - x1: First value.
+   *  - x2: Second value.
+   *  - thresh_error: Error threshold.
+   *  - thresh_warn: Warning threshold.
+   *
+   * Outputs:
+   *  - rel_error: Relative error measure between the two values.
+   */
+  # Compute relative error
+  rel_error = compute_rel_error(x1, x2)
+
+  # Evaluate relative error
+  if (rel_error > thresh_error) {
+      print("ERROR: Relative error " + rel_error + " > " + thresh_error + " 
with " + x1 +
+            " vs " + x2 + ".")
+  }
+  else if (rel_error > thresh_warn & rel_error < thresh_error) {
+      print("WARNING: Relative error " + rel_error + " > " + thresh_warn + " 
with " + x1 +
+            " vs " + x2 + ".")
+  }
+}
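+
+# A minimal usage sketch (hypothetical values and thresholds, from a script
+# that sources this file as `util`):
+#
+#   rel_error = util::check_rel_error(1.0001, 1.0, 1e-2, 1e-4)
+#   # rel_error ~ 5e-5 here: below both thresholds, so nothing is printed.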
+
+im2col = function(matrix[double] img, int Hin, int Win, int Hf, int Wf, int strideh, int stridew)
+    return (matrix[double] img_cols) {
+  /*
+   * Rearrange local image regions (patches) into columns.
+   *
+   * Assumes image has already been padded as necessary.
+   *
+   * Inputs:
+   *  - img: Input image, of shape (C, Hin*Win), where C is the number
+   *      of input channels (depth).
+   *  - Hin: Input height, including padding.
+   *  - Win: Input width, including padding.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *
+   * Outputs:
+   *  - img_cols: Local spatial regions (patches) of the image stretched
+   *      out into columns, of shape (C*Hf*Wf, Hout*Wout).
+   */
+  C = nrow(img)
+  Hout = as.integer((Hin - Hf) / strideh + 1)
+  Wout = as.integer((Win - Wf) / stridew + 1)
+
+  img_cols = matrix(0, rows=C*Hf*Wf, cols=Hout*Wout)  # zeros
+  parfor (hout in 1:Hout, check=0) {  # all output rows
+    hin = (hout-1) * strideh + 1
+    parfor (wout in 1:Wout, check=0) {  # all output columns
+      win = (wout-1) * stridew + 1
+      # Extract a local patch of the input image corresponding spatially to the filter sizes.
+      img_patch = matrix(0, rows=C, cols=Hf*Wf)  # zeros
+      parfor (c in 1:C) {  # all channels
+        img_slice = matrix(img[c,], rows=Hin, cols=Win)  # reshape
+        img_patch[c,] = matrix(img_slice[hin:hin+Hf-1, win:win+Wf-1], rows=1, cols=Hf*Wf)
+      }
+      img_cols[,(hout-1)*Wout + wout] = matrix(img_patch, rows=C*Hf*Wf, cols=1)  # reshape
+    }
+  }
+}
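+
+# A shape sketch (hypothetical sizes): for a (3, 7*7) padded image `img`,
+# 3x3 filters, and stride 1, Hout = Wout = (7-3)/1 + 1 = 5, so the result has
+# shape (3*3*3, 5*5) = (27, 25) -- one column per filter position:
+#
+#   x_cols = util::im2col(img, 7, 7, 3, 3, 1, 1)  # shape (27, 25)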
+
+col2im = function(matrix[double] img_cols, int C, int Hin, int Win, int Hf, int Wf,
+                  int strideh, int stridew, string reduction)
+    return (matrix[double] img) {
+  /*
+   * Create an image from columns of local image regions (patches).
+   *
+   * The reduction strategy determines how to deal with overlapping
+   * patches.  If it is set to "add", any overlapping patches will be
+   * added together when creating the image.  This is useful when
+   * computing gradients on the original image given gradients on the
+   * patches.  Otherwise, if "none" is provided, any overlapping
+   * patches will just override previous ones when creating the image.
+   * This is useful when recreating an image from the output of
+   * `im2col`.
+   *
+   * Assumes original image was already padded as necessary.
+   *
+   * Inputs:
+   *  - img_cols: Local spatial regions (patches) of the image stretched
+   *      out into columns, of shape (C*Hf*Wf, Hout*Wout).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height, including padding.
+   *  - Win: Input width, including padding.
+   *  - Hf: Filter height.
+   *  - Wf: Filter width.
+   *  - strideh: Stride over height.
+   *  - stridew: Stride over width.
+   *  - reduction: The reduction strategy to use for overlapping
+   *      patches.  Valid options are "add" and "none".
+   *
+   * Outputs:
+   *  - img: Input image, of shape (C, Hin*Win).
+   */
+  Hout = as.integer((Hin - Hf) / strideh + 1)
+  Wout = as.integer((Win - Wf) / stridew + 1)
+
+  img = matrix(0, rows=C, cols=Hin*Win)  # zeros
+  for (hout in 1:Hout) {  # all output rows
+    hin = (hout-1) * strideh + 1
+    for (wout in 1:Wout) {  # all output columns
+      win = (wout-1) * stridew + 1
+      # Extract the local patch for this output position from img_cols, shaped to the filter sizes.
+      img_patch = matrix(img_cols[,(hout-1)*Wout + wout], rows=C, cols=Hf*Wf)  # reshape
+      parfor (c in 1:C) {  # all channels
+        img_patch_slice = matrix(img_patch[c,], rows=Hf, cols=Wf)  # reshape
+        if (reduction == "add") {
+          img_slice = matrix(0, rows=Hin, cols=Win)
+          img_slice[hin:hin+Hf-1, win:win+Wf-1] = img_patch_slice
+          img[c,] = img[c,] + matrix(img_slice, rows=1, cols=Hin*Win)
+        } else {
+          img_slice = matrix(img[c,], rows=Hin, cols=Win)
+          img_slice[hin:hin+Hf-1, win:win+Wf-1] = img_patch_slice
+          img[c,] = matrix(img_slice, rows=1, cols=Hin*Win)
+        }
+      }
+    }
+  }
+}
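+
+# A minimal usage sketch (hypothetical shapes, continuing the im2col example
+# above): "none" recreates an image from im2col output, while "add" sums
+# overlapping patches, e.g. for hypothetical patch gradients `dx_cols`:
+#
+#   img2 = util::col2im(x_cols, 3, 7, 7, 3, 3, 1, 1, "none")   # recreate image
+#   dimg = util::col2im(dx_cols, 3, 7, 7, 3, 3, 1, 1, "add")   # sum overlaps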
+
+pad_image = function(matrix[double] img, int Hin, int Win, int padh, int padw)
+    return (matrix[double] img_padded) {
+  /*
+   * Pads an image along the height and width dimensions with zeros.
+   *
+   * Inputs:
+   *  - img: Input image, of shape (C, Hin*Win), where C is the number
+   *      of input channels (depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - img_padded: The input image padded along the height and width
+   *      dimensions, of shape (C, (Hin+2*padh)*(Win+2*padw)).
+   */
+  C = nrow(img)
+  img_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw))  # zeros
+  parfor (c in 1:C) {
+    img_slice = matrix(img[c,], rows=Hin, cols=Win)  # depth slice C reshaped
+    img_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw)
+    img_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = img_slice
+    img_padded[c,] = matrix(img_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw))  # reshape
+  }
+}
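+
+# A minimal usage sketch (hypothetical sizes): padding a (C, 5*5) image `img`
+# by 1 on each side yields a (C, 7*7) image with a zero border:
+#
+#   img_padded = util::pad_image(img, 5, 5, 1, 1)  # shape (C, 49)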
+
+unpad_image = function(matrix[double] img_padded, int Hin, int Win, int padh, int padw)
+    return (matrix[double] img) {
+  /*
+   * Unpads an image along the height and width dimensions.
+   *
+   * Inputs:
+   *  - img_padded: The input image padded along the height and width
+   *      dimensions, of shape (C, (Hin+2*padh)*(Win+2*padw)).
+   *  - Hin: Input height of unpadded image.
+   *  - Win: Input width of unpadded image.
+   *  - padh: Padding for top and bottom sides.
+   *  - padw: Padding for left and right sides.
+   *
+   * Outputs:
+   *  - img: Input image, of shape (C, Hin*Win), where C is the number
+   *      of input channels (depth).
+   */
+  C = nrow(img_padded)
+  img = matrix(0, rows=C, cols=Hin*Win)
+  parfor (c in 1:C) {
+    img_padded_slice = matrix(img_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw))
+    img_slice = img_padded_slice[padh+1:padh+Hin, padw+1:padw+Win]
+    img[c,] = matrix(img_slice, rows=1, cols=Hin*Win)
+  }
+}
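+
+# A minimal round-trip sketch (hypothetical sizes): unpadding with the same
+# padding amounts recovers the original (C, Hin*Win) image exactly:
+#
+#   img_padded = util::pad_image(img, 5, 5, 1, 1)
+#   img2 = util::unpad_image(img_padded, 5, 5, 1, 1)  # equal to img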
+
