[SYSTEMML-1452] General code cleanup of SystemML-NN

This commit performs a general code & documentation cleanup across the library.
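As a minimal illustration (not part of the patch itself), the conventions this cleanup applies to each layer, such as docstrings that list every input and output with its shape, and gradients phrased as "Gradient wrt `X`", look roughly like the following pared-down DML sketch of the `affine` layer:

    forward = function(matrix[double] X, matrix[double] W, matrix[double] b)
        return (matrix[double] out) {
      /*
       * Computes the forward pass for an affine (fully-connected) layer.
       *
       * Inputs:
       *  - X: Inputs, of shape (N, D).
       *  - W: Weights, of shape (D, M).
       *  - b: Biases, of shape (1, M).
       *
       * Outputs:
       *  - out: Outputs, of shape (N, M).
       */
      out = X %*% W + b
    }

    backward = function(matrix[double] dout, matrix[double] X,
                        matrix[double] W, matrix[double] b)
        return (matrix[double] dX, matrix[double] dW, matrix[double] db) {
      /*
       * Computes the backward pass for an affine (fully-connected) layer.
       *
       * Inputs:
       *  - dout: Gradient wrt `out` from upstream, of shape (N, M).
       *  - X: Inputs, of shape (N, D).
       *  - W: Weights, of shape (D, M).
       *  - b: Biases, of shape (1, M).
       *
       * Outputs:
       *  - dX: Gradient wrt `X`, of shape (N, D).
       *  - dW: Gradient wrt `W`, of shape (D, M).
       *  - db: Gradient wrt `b`, of shape (1, M).
       */
      dX = dout %*% t(W)
      dW = t(X) %*% dout
      db = colSums(dout)
    }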
Closes #447.

Project: http://git-wip-us.apache.org/repos/asf/incubator-systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-systemml/commit/16b1cbd7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-systemml/tree/16b1cbd7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-systemml/diff/16b1cbd7

Branch: refs/heads/master
Commit: 16b1cbd72601afbed0b19c1d4125a898fd324b1c
Parents: 2e48d95
Author: Mike Dusenberry <[email protected]>
Authored: Fri Mar 31 18:38:15 2017 -0700
Committer: Mike Dusenberry <[email protected]>
Committed: Fri Mar 31 18:38:16 2017 -0700

----------------------------------------------------------------------
 projects/breast_cancer/hyperparam_tuning.dml | 8 +-
 projects/breast_cancer/softmax_clf.dml | 16 +--
 .../staging/SystemML-NN/nn/layers/affine.dml | 36 ++++---
 .../SystemML-NN/nn/layers/batch_norm.dml | 17 +--
 scripts/staging/SystemML-NN/nn/layers/conv.dml | 50 ++++-----
 .../SystemML-NN/nn/layers/conv_builtin.dml | 63 ++++++-----
 .../nn/layers/cross_entropy_loss.dml | 29 +++--
 .../staging/SystemML-NN/nn/layers/dropout.dml | 23 ++--
 .../staging/SystemML-NN/nn/layers/l1_loss.dml | 29 +++--
 .../staging/SystemML-NN/nn/layers/l1_reg.dml | 15 +--
 .../staging/SystemML-NN/nn/layers/l2_loss.dml | 29 +++--
 .../staging/SystemML-NN/nn/layers/l2_reg.dml | 15 +--
 .../staging/SystemML-NN/nn/layers/log_loss.dml | 40 ++++---
 scripts/staging/SystemML-NN/nn/layers/lstm.dml | 65 ++++++------
 .../staging/SystemML-NN/nn/layers/max_pool.dml | 15 +--
 .../SystemML-NN/nn/layers/max_pool_builtin.dml | 14 +--
 scripts/staging/SystemML-NN/nn/layers/relu.dml | 22 ++--
 scripts/staging/SystemML-NN/nn/layers/rnn.dml | 43 ++++----
 .../staging/SystemML-NN/nn/layers/sigmoid.dml | 30 ++++--
 .../staging/SystemML-NN/nn/layers/softmax.dml | 29 ++---
 .../nn/layers/spatial_batch_norm.dml | 12 +--
 scripts/staging/SystemML-NN/nn/layers/tanh.dml | 28 ++---
 .../staging/SystemML-NN/nn/optim/adagrad.dml | 22 ++--
 scripts/staging/SystemML-NN/nn/optim/adam.dml | 38 +++----
 .../staging/SystemML-NN/nn/optim/rmsprop.dml | 24 +++--
 scripts/staging/SystemML-NN/nn/optim/sgd.dml | 12 ++-
 .../SystemML-NN/nn/optim/sgd_momentum.dml | 24 +++--
 .../SystemML-NN/nn/optim/sgd_nesterov.dml | 23 ++--
 .../staging/SystemML-NN/nn/test/conv_simple.dml | 51 ++++-----
 .../staging/SystemML-NN/nn/test/grad_check.dml | 106 ++++++++++---------
 .../SystemML-NN/nn/test/max_pool_simple.dml | 18 ++--
 scripts/staging/SystemML-NN/nn/util.dml | 46 ++++----
 32 files changed, 549 insertions(+), 443 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/projects/breast_cancer/hyperparam_tuning.dml ---------------------------------------------------------------------- diff --git a/projects/breast_cancer/hyperparam_tuning.dml b/projects/breast_cancer/hyperparam_tuning.dml index 464c659..4f054c3 100644 --- a/projects/breast_cancer/hyperparam_tuning.dml +++ b/projects/breast_cancer/hyperparam_tuning.dml @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. 
You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -66,7 +66,9 @@ parfor(j in 1:10000) { log_interval = 10 # Train - [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs, log_interval, dir) + [Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2] = + clf::train(X, Y, X_val, Y_val, C, Hin, Win, lr, mu, decay, lambda, batch_size, epochs, + log_interval, dir) # Eval #probs = clf::predict(X, C, Hin, Win, Wc1, bc1, Wc2, bc2, Wc3, bc3, Wa1, ba1, Wa2, ba2) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/projects/breast_cancer/softmax_clf.dml ---------------------------------------------------------------------- diff --git a/projects/breast_cancer/softmax_clf.dml b/projects/breast_cancer/softmax_clf.dml index e106a36..35fd545 100644 --- a/projects/breast_cancer/softmax_clf.dml +++ b/projects/breast_cancer/softmax_clf.dml @@ -7,9 +7,9 @@ # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -79,7 +79,7 @@ train = function(matrix[double] X, matrix[double] Y, accuracy_val = mean(rowIndexMax(probs_val) == rowIndexMax(Y_val)) # Output results print("Start: Val Loss: " + loss_val + ", Val Accuracy: " + accuracy_val) - + # Optimize print("Starting optimization") iters = ceil(N / batch_size) @@ -152,7 +152,7 @@ predict = function(matrix[double] X, matrix[double] W, matrix[double] b) */ N = nrow(X) # num examples K = ncol(W) # num classes - + # Compute forward pass ## affine & softmax: out = affine::forward(X, W, b) @@ -185,7 +185,7 @@ eval = function(matrix[double] probs, matrix[double] Y) generate_dummy_data = function() return (matrix[double] X, matrix[double] Y, int C, int Hin, int Win) { /* - * Generate a dummy dataset similar to the MNIST dataset. + * Generate a dummy dataset similar to the breast cancer dataset. * * Outputs: * - X: Input data matrix, of shape (N, D). @@ -196,9 +196,9 @@ generate_dummy_data = function() */ # Generate dummy input data N = 1024 # num examples - C = 1 # num input channels - Hin = 28 # input height - Win = 28 # input width + C = 3 # num input channels + Hin = 256 # input height + Win = 256 # input width T = 10 # num targets X = rand(rows=N, cols=C*Hin*Win, pdf="normal") classes = round(rand(rows=N, cols=1, min=1, max=T, pdf="uniform")) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/affine.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/affine.dml b/scripts/staging/SystemML-NN/nn/layers/affine.dml index 6a4c210..f9f8559 100644 --- a/scripts/staging/SystemML-NN/nn/layers/affine.dml +++ b/scripts/staging/SystemML-NN/nn/layers/affine.dml @@ -22,6 +22,7 @@ /* * Fully-connected (affine) layer. 
*/ + forward = function(matrix[double] X, matrix[double] W, matrix[double] b) return (matrix[double] out) { /* @@ -29,9 +30,9 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b) * M neurons. The input data has N examples, each with D features. * * Inputs: - * - X: Input data matrix, of shape (N, D). - * - W: Weights (parameters) matrix, of shape (D, M). - * - b: Biases vector, of shape (1, M). + * - X: Inputs, of shape (N, D). + * - W: Weights, of shape (D, M). + * - b: Biases, of shape (1, M). * * Outputs: * - out: Outputs, of shape (N, M). @@ -47,15 +48,15 @@ backward = function(matrix[double] dout, matrix[double] X, * with M neurons. * * Inputs: - * - dout: Derivatives from upstream, of shape (N, M). - * - X: Previous input data matrix, of shape (N, D). - * - W: Weights (parameters) matrix, of shape (D, M). - * - b: Biases vector, of shape (1, M). + * - dout: Gradient wrt `out` from upstream, of shape (N, M). + * - X: Inputs, of shape (N, D). + * - W: Weights, of shape (D, M). + * - b: Biases, of shape (1, M). * * Outputs: - * - dX: Gradient wrt X, of shape (N, D). - * - dW: Gradient wrt W, of shape (D, M). - * - db: Gradient wrt b, of shape (1, M). + * - dX: Gradient wrt `X`, of shape (N, D). + * - dW: Gradient wrt `W`, of shape (D, M). + * - db: Gradient wrt `b`, of shape (1, M). */ dX = dout %*% t(W) dW = t(X) %*% dout @@ -70,18 +71,19 @@ init = function(int D, int M) * Note: This is just a convenience function, and parameters * may be initialized manually if needed. * - * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852], - * which limits the magnification of inputs/gradients during - * forward/backward passes by scaling unit-Gaussian weights by a - * factor of sqrt(2/n), under the assumption of relu neurons. + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 * * Inputs: - * - D: Dimensionality of the input features. + * - D: Dimensionality of the input features (number of features). * - M: Number of neurons in this layer. * * Outputs: - * - W: Weight matrix, of shape (D, M). - * - b: Biases vector, of shape (1, M). + * - W: Weights, of shape (D, M). + * - b: Biases, of shape (1, M). */ W = rand(rows=D, cols=M, pdf="normal") * sqrt(2.0/D) b = matrix(0, rows=1, cols=M) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml index d332e8c..82240f7 100644 --- a/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml +++ b/scripts/staging/SystemML-NN/nn/layers/batch_norm.dml @@ -22,6 +22,7 @@ /* * Batch normalization layer. */ + forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta, string mode, matrix[double] ema_mean, matrix[double] ema_var, double mu, double epsilon) @@ -36,7 +37,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta, * introduces learnable parameters (gamma, beta) to control the * amount of normalization. 
* - * y = ((x-mean) / sqrt(var+eps)) * gamma + beta + * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta` * * This implementation maintains exponential moving averages of the * mean and variance during training for use during testing. @@ -47,7 +48,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta, * - https://arxiv.org/abs/1502.03167 * * Inputs: - * - X: Input data matrix, of shape (N, D). + * - X: Inputs, of shape (N, D). * - gamma: Scale parameters, of shape (1, D). * - beta: Shift parameters, of shape (1, D). * - mode: 'train' or 'test' to indicate if the model is currently @@ -118,7 +119,7 @@ backward = function(matrix[double] dout, matrix[double] out, * Computes the backward pass for a batch normalization layer. * * Inputs: - * - dout: Derivatives from upstream, of shape (N, D). + * - dout: Gradient wrt `out` from upstream, of shape (N, D). * - out: Outputs from the forward pass, of shape (N, D). * - ema_mean_upd: Updated exponential moving average of the mean * from the forward pass, of shape (1, D). @@ -133,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] out, * - cache_norm: Cache of the normalized inputs from the forward * pass, of shape (N, D). Note: This is used for performance * during training. - * - X: Input data matrix to the forward pass, of shape (N, D). + * - X: Inputs, of shape (N, D). * - gamma: Scale parameters, of shape (1, D). * - beta: Shift parameters, of shape (1, D). * - mode: 'train' or 'test' to indicate if the model is currently @@ -151,9 +152,9 @@ backward = function(matrix[double] dout, matrix[double] out, * Typical values are in the range of [1e-5, 1e-3]. * * Outputs: - * - dX: Gradient wrt X, of shape (N, D). - * - dgamma: Gradient wrt W, of shape (1, D). - * - dbeta: Gradient wrt b, of shape (1, D). + * - dX: Gradient wrt `X`, of shape (N, D). + * - dgamma: Gradient wrt `W`, of shape (1, D). + * - dbeta: Gradient wrt `b`, of shape (1, D). * */ N = nrow(X) @@ -190,7 +191,7 @@ init = function(int D) * may be initialized manually if needed. * * Inputs: - * - D: Dimensionality of the input features. + * - D: Dimensionality of the input features (number of features). * * Outputs: * - gamma: Scale parameters, of shape (1, D). http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/conv.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/conv.dml b/scripts/staging/SystemML-NN/nn/layers/conv.dml index cc60a46..435b3cf 100644 --- a/scripts/staging/SystemML-NN/nn/layers/conv.dml +++ b/scripts/staging/SystemML-NN/nn/layers/conv.dml @@ -39,9 +39,9 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, * output maps. * * Inputs: - * - X: Input data matrix, of shape (N, C*Hin*Win). - * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf). - * - b: Biases vector, of shape (F, 1). + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). * - C: Number of input channels (dimensionality of input depth). * - Hin: Input height. * - Win: Input width. @@ -50,14 +50,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, * - strideh: Stride over height. * - stridew: Stride over width. * - padh: Padding for top and bottom sides. - * For same output height as input, set padh = (Hf - 1) / 2, - * assuming strideh = 1. 
- * More generally, padh = (Hin*(strideh-1) + Hf - strideh) / 2 + * For same output height as input, set `padh = (Hf - 1) / 2`, + * assuming `strideh = 1`. + * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` * preserves the spatial dimensions of the input. * - padw: Padding for left and right sides. - * For same output width as input, set padw = (Wf - 1) / 2, - * assuming stridew = 1. - * More generally, padw = (Win*(stridew-1) + Wf - stridew) / 2 + * For same output width as input, set `padw = (Wf - 1) / 2`, + * assuming `stridew = 1`. + * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` * preserves the spatial dimensions of the input. * * Outputs: @@ -67,8 +67,8 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, */ N = nrow(X) F = nrow(W) - Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1) - Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1) + Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1) + Wout = as.integer((Win + 2*padw - Wf)/stridew + 1) # Create output volume out = matrix(0, rows=N, cols=F*Hout*Wout) @@ -101,12 +101,13 @@ backward = function(matrix[double] dout, int Hout, int Wout, * This implementation uses `im2col` and `col2im` internally. * * Inputs: - * - dout: Derivatives from upstream, of shape (N, F*Hout*Wout). + * - dout: Gradient wrt `out` from upstream, of + * shape (N, F*Hout*Wout). * - Hout: Output height. * - Wout: Output width. - * - X: Previous input data matrix, of shape (N, C*Hin*Win). - * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf). - * - b: Biases vector, of shape (F, 1). + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). * - C: Number of input channels (dimensionality of input depth). * - Hin: Input height. * - Win: Input width. @@ -118,9 +119,9 @@ backward = function(matrix[double] dout, int Hout, int Wout, * - padw: Padding for left and right sides. * * Outputs: - * - dX: Gradient wrt X, of shape (N, C*Hin*Win). - * - dW: Gradient wrt W, of shape (F, C*Hf*Wf). - * - db: Gradient wrt b, of shape (F, 1). + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf). + * - db: Gradient wrt `b`, of shape (F, 1). */ N = nrow(X) F = nrow(W) @@ -171,10 +172,11 @@ init = function(int F, int C, int Hf, int Wf) * Note: This is just a convenience function, and parameters * may be initialized manually if needed. * - * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852], - * which limits the magnification of inputs/gradients during - * forward/backward passes by scaling unit-Gaussian weights by a - * factor of sqrt(2/n), under the assumption of relu neurons. + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 * * Inputs: * - F: Number of filters. @@ -183,8 +185,8 @@ init = function(int F, int C, int Hf, int Wf) * - Wf: Filter width. * * Outputs: - * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf). - * - b: Biases vector, of shape (F, 1). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). 
*/ W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf)) b = matrix(0, rows=F, cols=1) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml index 44df74a..c2b809e 100644 --- a/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml +++ b/scripts/staging/SystemML-NN/nn/layers/conv_builtin.dml @@ -22,6 +22,7 @@ /* * 2D Convolutional layer. */ + forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int C, int Hin, int Win, int Hf, int Wf, int strideh, int stridew, int padh, int padw) @@ -32,10 +33,10 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, * volume unrolled into a single vector. * * Inputs: - * - X: Input data matrix, of shape (N, C*Hin*Win). - * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf). - * - b: Biases vector, of shape (F, 1). - * - C: Number of input channels (dimensionality of input depth). + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of depth). * - Hin: Input height. * - Win: Input width. * - Hf: Filter height. @@ -43,14 +44,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, * - strideh: Stride over height. * - stridew: Stride over width. * - padh: Padding for top and bottom sides. - * For same output height as input, set padh = (Hf - 1) / 2, - * assuming strideh = 1. - * More generally, padh = (Hin*(strideh-1) + Hf - strideh) / 2 + * For same output height as input, set `padh = (Hf - 1) / 2`, + * assuming `strideh = 1`. + * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` * preserves the spatial dimensions of the input. * - padw: Padding for left and right sides. - * For same output width as input, set padw = (Wf - 1) / 2, - * assuming stridew = 1. - * More generally, padw = (Win*(stridew-1) + Wf - stridew) / 2 + * For same output width as input, set `padw = (Wf - 1) / 2`, + * assuming `stridew = 1`. + * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` * preserves the spatial dimensions of the input. * * Outputs: @@ -60,8 +61,8 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, */ N = nrow(X) F = nrow(W) - Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1) - Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1) + Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1) + Wout = as.integer((Win + 2*padw - Wf)/stridew + 1) # Convolution - built-in implementation out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf], @@ -81,13 +82,14 @@ backward = function(matrix[double] dout, int Hout, int Wout, * with F filters. * * Inputs: - * - dout: Derivatives from upstream, of shape (N, F*Hout*Wout). + * - dout: Gradient wrt `out` from upstream, of + * shape (N, F*Hout*Wout). * - Hout: Output height. * - Wout: Output width. - * - X: Previous input data matrix, of shape (N, C*Hin*Win). - * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf). - * - b: Biases vector, of shape (F, 1). - * - C: Number of input channels (dimensionality of input depth). + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of depth). 
* - Hin: Input height. * - Win: Input width. * - Hf: Filter height. @@ -95,12 +97,20 @@ backward = function(matrix[double] dout, int Hout, int Wout, * - strideh: Stride over height. * - stridew: Stride over width. * - padh: Padding for top and bottom sides. + * For same output height as input, set `padh = (Hf - 1) / 2`, + * assuming `strideh = 1`. + * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` + * preserves the spatial dimensions of the input. * - padw: Padding for left and right sides. + * For same output width as input, set `padw = (Wf - 1) / 2`, + * assuming `stridew = 1`. + * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` + * preserves the spatial dimensions of the input. * * Outputs: - * - dX: Gradient wrt X, of shape (N, C*Hin*Win). - * - dW: Gradient wrt W, of shape (F, C*Hf*Wf). - * - db: Gradient wrt b, of shape (F, 1). + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf). + * - db: Gradient wrt `b`, of shape (F, 1). */ N = nrow(X) F = nrow(W) @@ -123,10 +133,11 @@ init = function(int F, int C, int Hf, int Wf) * Note: This is just a convenience function, and parameters * may be initialized manually if needed. * - * We use the heuristic by He et al. [http://arxiv.org/abs/1502.01852], - * which limits the magnification of inputs/gradients during - * forward/backward passes by scaling unit-Gaussian weights by a - * factor of sqrt(2/n), under the assumption of relu neurons. + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 * * Inputs: * - F: Number of filters. @@ -135,8 +146,8 @@ init = function(int F, int C, int Hf, int Wf) * - Wf: Filter width. * * Outputs: - * - W: Weights (parameters) matrix, of shape (F, C*Hf*Wf). - * - b: Biases vector, of shape (F, 1). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). */ W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf)) b = matrix(0, rows=F, cols=1) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml index f9cd507..55552e1 100644 --- a/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml +++ b/scripts/staging/SystemML-NN/nn/layers/cross_entropy_loss.dml @@ -21,11 +21,8 @@ /* * Cross-entropy loss function. - * - * L_i = -y_i^T * log(pred_i), where y_i and pred_i are K-dimensional - * vectors of class probs. - * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples. */ + forward = function(matrix[double] pred, matrix[double] y) return (double loss) { /* @@ -33,16 +30,26 @@ forward = function(matrix[double] pred, matrix[double] y) * inputs consist of N examples, each with K dimensions corresponding * to normalized probabilities of K classes. * + * ``` + * L_i = -y_i^T * log(pred_i) + * L = (1/N) sum(L_i) for i=1 to N + * ``` + * + * In these equations, `L` is the total loss, `L_i` is the loss for + * example `i`, `y_i` is the K-dimensional vector of target class + * probabilities, `pred_i` is K-dimensional vector of predicted + * class probabilities, and `N` is the number of examples. 
+ * * This can be interpreted as the negative log-likelihood assuming * a Bernoulli distribution generalized to K dimensions, or a - * Multinomial with 1 observation. + * Multinomial with one observation. * * Inputs: - * - pred: Prediction matrix, of shape (N, K). - * - y: Target matrix, of shape (N, K). + * - pred: Predictions, of shape (N, K). + * - y: Targets, of shape (N, K). * * Outputs: - * - loss: Scalar loss, of shape (1). + * - loss: Average loss. */ N = nrow(y) eps = 1e-10 # numerical stability to avoid log(0) @@ -58,11 +65,11 @@ backward = function(matrix[double] pred, matrix[double] y) * to normalized probabilities of K classes. * * Inputs: - * - pred: Prediction matrix, of shape (N, K). - * - y: Target matrix, of shape (N, K). + * - pred: Predictions, of shape (N, K). + * - y: Targets, of shape (N, K). * * Outputs: - * - dpred: Gradient wrt pred, of shape (N, K). + * - dpred: Gradient wrt `pred`, of shape (N, K). */ N = nrow(y) eps = 1e-10 # numerical stability to avoid divide-by-zero http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/dropout.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/dropout.dml b/scripts/staging/SystemML-NN/nn/layers/dropout.dml index 2b1bd1d..b348642 100644 --- a/scripts/staging/SystemML-NN/nn/layers/dropout.dml +++ b/scripts/staging/SystemML-NN/nn/layers/dropout.dml @@ -22,6 +22,7 @@ /* * Dropout layer. */ + forward = function(matrix[double] X, double p, int seed) return (matrix[double] out, matrix[double] mask) { /* @@ -32,14 +33,13 @@ forward = function(matrix[double] X, double p, int seed) * the outputs of neurons) at test time. * * Inputs: - * - X: Input data matrix, of shape (any, any). + * - X: Inputs, of shape (any, any). * - p: Probability of keeping a neuron output. - * - seed: [Optional: -1] Random number generator seed. Setting this - * allows for deterministic evaluation. Set to -1 for a random - * seed. + * - seed: [Optional: -1] Random number generator seed to allow for + * deterministic evaluation. Set to -1 for a random seed. * * Outputs: - * - out: Ouptuts, of same shape as X. + * - out: Outputs, of same shape as `X`. * - mask: Dropout mask used to compute the output. */ # Normally, we might use something like @@ -48,8 +48,7 @@ forward = function(matrix[double] X, double p, int seed) # the `rand` function that allows use to create a mask directly. if (seed == -1) { mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p) - } - else { + } else { mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed) } out = X * mask / p @@ -64,13 +63,13 @@ backward = function(matrix[double] dout, matrix[double] X, double p, matrix[doub * maintain the expected values at test time. * * Inputs: - * - dout: Derivatives from upstream, of same shape as X. - * - X: Previous input data matrix, of shape (any, any). - * - p: Previous probability of keeping a neuron output. - * - mask: Previous dropout mask used to compute the output. + * - dout: Gradient wrt `out`, of same shape as `X`. + * - X: Inputs, of shape (any, any). + * - p: Probability of keeping a neuron output. + * - mask: Dropout mask used to compute the output. * * Outputs: - * - dX: Gradient wrt X, of same shape as X. + * - dX: Gradient wrt `X`, of same shape as `X`. 
*/ dX = mask / p * dout } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml index 7d6c821..24b15e2 100644 --- a/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml +++ b/scripts/staging/SystemML-NN/nn/layers/l1_loss.dml @@ -21,28 +21,35 @@ /* * L1 loss function. - * - * L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j. - * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples. */ + forward = function(matrix[double] pred, matrix[double] y) return (double loss) { /* * Computes the forward pass for an L1 loss function. The inputs * consist of N examples, each with M dimensions to predict. * + * ``` + * L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j. + * L = (1/N) sum(L_i) for i=1 to N + * ``` + * + * In these equations, `L` is the total loss, `L_i` is the loss for + * example `i`, `y_i` is the scalar target, `pred_i` is the scalar + * prediction, and `N` is the number of examples. + * * This can be interpreted as the negative log-likelihood assuming * a Laplace distribution. * * Inputs: - * - pred: Prediction matrix, of shape (N, M). - * - y: Target matrix, of shape (N, M). + * - pred: Predictions, of shape (N, M). + * - y: Targets, of shape (N, M). * * Outputs: - * - loss: Scalar loss, of shape (1). + * - loss: Average loss. */ N = nrow(y) - losses = rowSums(abs(pred - y)) + losses = rowSums(abs(pred-y)) loss = sum(losses) / N } @@ -53,13 +60,13 @@ backward = function(matrix[double] pred, matrix[double] y) * consist of N examples, each with M dimensions to predict. * * Inputs: - * - pred: Prediction matrix, of shape (N, M). - * - y: Target matrix, of shape (N, M). + * - pred: Predictions, of shape (N, M). + * - y: Targets, of shape (N, M). * * Outputs: - * - dpred: Gradient wrt pred, of shape (N, M). + * - dpred: Gradient wrt `pred`, of shape (N, M). */ N = nrow(y) - dpred = sign(pred - y) / N + dpred = sign(pred-y) / N } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml index b2175ab..f643274 100644 --- a/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml +++ b/scripts/staging/SystemML-NN/nn/layers/l1_reg.dml @@ -22,31 +22,34 @@ /* * L1 regularizataion. */ -forward = function(matrix[double] X, double lambda) return (double reg_loss) { + +forward = function(matrix[double] X, double lambda) + return (double reg_loss) { /* * Computes the forward pass for an L1 regularization function. * * Inputs: - * - X: Parameters, of shape (any, any). + * - X: Inputs, of shape (any, any). * - lambda: Regularization strength. * A typical value is 0.01. * * Outputs: - * - reg_loss: Scalar L1 regularization loss, of shape (1). + * - reg_loss: Total regularization loss. */ reg_loss = lambda * sum(abs(X)) } -backward = function(matrix[double] X, double lambda) return (matrix[double] dX) { +backward = function(matrix[double] X, double lambda) + return (matrix[double] dX) { /* * Computes the backward pass for an L1 regularization function. * * Inputs: - * - X: Parameters, of shape (any, any). + * - X: Inputs, of shape (any, any). * - lambda: Regularization strength. 
* * Outputs: - * - dX: Gradient wrt X, of same shape as X. + * - dX: Gradient wrt `X`, of same shape as `X`. */ dX = lambda * sign(X) } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml index 9f27cc2..df8bc1c 100644 --- a/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml +++ b/scripts/staging/SystemML-NN/nn/layers/l2_loss.dml @@ -21,28 +21,35 @@ /* * L2 loss function. - * - * L_i = (1/2) 2norm(pred_i - y_i)^2 - * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples. */ + forward = function(matrix[double] pred, matrix[double] y) return (double loss) { /* * Computes the forward pass for an L2 loss function. The inputs * consist of N examples, each with M dimensions to predict. * + * ``` + * L_i = (1/2) norm(pred_i - y_i)^2 + * L = (1/N) sum(L_i) for i=1 to N + * ``` + * + * In these equations, `L` is the total loss, `L_i` is the loss for + * example `i`, `y_i` is the scalar target, `pred_i` is the scalar + * prediction, and `N` is the number of examples. + * * This can be interpreted as the negative log-likelihood assuming * a Gaussian distribution. * * Inputs: - * - pred: Prediction matrix, of shape (N, M). - * - y: Target matrix, of shape (N, M). + * - pred: Predictions, of shape (N, M). + * - y: Targets, of shape (N, M). * * Outputs: - * - loss: Scalar loss, of shape (1). + * - loss: Average loss. */ N = nrow(y) - losses = 0.5 * rowSums((pred - y)^2) + losses = 0.5 * rowSums((pred-y)^2) loss = sum(losses) / N } @@ -53,13 +60,13 @@ backward = function(matrix[double] pred, matrix[double] y) * consist of N examples, each with M dimensions to predict. * * Inputs: - * - pred: Prediction matrix, of shape (N, M). - * - y: Target matrix, of shape (N, M). + * - pred: Predictions, of shape (N, M). + * - y: Targets, of shape (N, M). * * Outputs: - * - dpred: Gradient wrt pred, of shape (N, M). + * - dpred: Gradient wrt `pred`, of shape (N, M). */ N = nrow(y) - dpred = (pred - y) / N + dpred = (pred-y) / N } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml index 44f2a54..5074c06 100644 --- a/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml +++ b/scripts/staging/SystemML-NN/nn/layers/l2_reg.dml @@ -22,31 +22,34 @@ /* * L2 regularizataion. */ -forward = function(matrix[double] X, double lambda) return (double reg_loss) { + +forward = function(matrix[double] X, double lambda) + return (double reg_loss) { /* * Computes the forward pass for an L2 regularization function. * * Inputs: - * - X: Parameters, of shape (any, any). + * - X: Inputs, of shape (any, any). * - lambda: Regularization strength. * A typical value is 0.01. * * Outputs: - * - reg_loss: Scalar l2 regularization loss, of shape (1). + * - reg_loss: Total regularization loss. */ reg_loss = 0.5 * lambda * sum(X^2) } -backward = function(matrix[double] X, double lambda) return (matrix[double] dX) { +backward = function(matrix[double] X, double lambda) + return (matrix[double] dX) { /* * Computes the backward pass for an L2 regularization function. * * Inputs: - * - X: Parameters, of shape (any, any). 
+ * - X: Inputs, of shape (any, any). * - lambda: Regularization strength. * * Outputs: - * - dX: Gradient wrt X, of same shape as X. + * - dX: Gradient wrt `X`, of same shape as `X`. */ dX = lambda * X } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/log_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml index ad5e561..7dd85d3 100644 --- a/scripts/staging/SystemML-NN/nn/layers/log_loss.dml +++ b/scripts/staging/SystemML-NN/nn/layers/log_loss.dml @@ -21,30 +21,37 @@ /* * Log loss function. - * - * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i), where y_i is a - * binary target, and pred_i is a probability of y=1. - * L = (1/N) sum(L_i) for i=1 to N, where N is the number of examples. */ + forward = function(matrix[double] pred, matrix[double] y) return (double loss) { /* * Computes the forward pass for a log loss function. * + * ``` + * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i) + * L = (1/N) sum(L_i) for i=1 to N + * ``` + * + * In these equations, `L` is the total loss, `L_i` is the loss for + * example `i`, `y_i` is the binary target, `pred_i` is probability + * of the true class (i.e. `y=1`), and `N` is the number of examples. + * * This can be interpreted as the negative log-likelihood assuming * a Bernoulli distribution. * * Inputs: - * - pred: Prediction matrix, of shape (N, 1). Predictions should - * be probabilities that y=1. - * - y: Target matrix, of shape (N, 1). Targets should be binary - * in the set {0,1}. + * - pred: Predictions, of shape (N, 1). + * Predictions should be probabilities of the true + * class (i.e. probability of `y=1`). + * - y: Targets, of shape (N, 1). + * Targets should be binary in the set {0, 1}. * * Outputs: - * - loss: Scalar loss, of shape (1). + * - loss: Average loss. */ N = nrow(y) - losses = -y * log(pred) - (1-y) * log(1-pred) + losses = -y*log(pred) - (1-y)*log(1-pred) loss = sum(losses) / N } @@ -54,15 +61,16 @@ backward = function(matrix[double] pred, matrix[double] y) * Computes the backward pass for a log loss function. * * Inputs: - * - pred: Prediction matrix, of shape (N, 1). Predictions should - * be probabilities that y=1. - * - y: Target matrix, of shape (N, 1). Targets should be binary - * in the set {0,1}. + * - pred: Predictions, of shape (N, 1). + * Predictions should be probabilities of the true + * class (i.e. probability of `y=1`). + * - y: Targets, of shape (N, 1). + * Targets should be binary in the set {0, 1}. * * Outputs: - * - dpred: Gradient wrt pred, of shape (N, 1). + * - dpred: Gradient wrt `pred`, of shape (N, 1). */ N = nrow(y) - dpred = (1/N) * (pred-y) / (pred * (1-pred)) + dpred = (1/N) * (pred-y) / (pred*(1-pred)) } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/lstm.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/lstm.dml b/scripts/staging/SystemML-NN/nn/layers/lstm.dml index 0dd9f4c..44f2ef2 100644 --- a/scripts/staging/SystemML-NN/nn/layers/lstm.dml +++ b/scripts/staging/SystemML-NN/nn/layers/lstm.dml @@ -44,16 +44,16 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, * - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf * * Inputs: - * - X: Input data matrix, of shape (N, T*D). 
- * - W: Weights (parameters) matrix, of shape (D+M, 4M). - * - b: Biases vector, of shape (1, 4M). + * - X: Inputs, of shape (N, T*D). + * - W: Weights, of shape (D+M, 4M). + * - b: Biases, of shape (1, 4M). * - T: Length of example sequences (number of timesteps). - * - D: Dimensionality of the input features. + * - D: Dimensionality of the input features (number of features). * - return_sequences: Whether to return `out` at all timesteps, * or just for the final timestep. - * - out0: Output matrix at previous timestep, of shape (N, M). + * - out0: Outputs from previous timestep, of shape (N, M). * Note: This is *optional* and could just be an empty matrix. - * - c0: Initial cell state matrix, of shape (N, M). + * - c0: Initial cell state, of shape (N, M). * Note: This is *optional* and could just be an empty matrix. * * Outputs: @@ -123,23 +123,27 @@ backward = function(matrix[double] dout, matrix[double] dc, * Computes the backward pass for an LSTM layer with M neurons. * * Inputs: - * - dout: Gradient on output from upstream. If `given_sequences` - * is True, contains gradients on outputs for all timesteps, - * of shape (N, T*M). Else, contains gradient on output for - * the final timestep, of shape (N, M). - * - dc: Gradient on final (current) cell state from later in time, - * of shape (N, M). - * - X: Input data matrix, of shape (N, T*D). - * - W: Weights (parameters) matrix, of shape (D+M, 4M). - * - b: Biases vector, of shape (1, 4M). + * - dout: Gradient wrt `out`. If `given_sequences` is `True`, + * contains gradients on outputs for all timesteps, of + * shape (N, T*M). Else, contains the gradient on the output + * for the final timestep, of shape (N, M). + * - dc: Gradient wrt `c` (from later in time), of shape (N, M). + * This would come from later in time if the cell state was used + * downstream as the initial cell state for another LSTM layer. + * Typically, this would be used when a sequence was cut at + * timestep `T` and then continued in the next batch. If `c` + * was not used downstream, then `dc` would be an empty matrix. + * - X: Inputs, of shape (N, T*D). + * - W: Weights, of shape (D+M, 4M). + * - b: Biases, of shape (1, 4M). * - T: Length of example sequences (number of timesteps). * - D: Dimensionality of the input features. * - given_sequences: Whether `dout` is for all timesteps, * or just for the final timestep. This is based on whether * `return_sequences` was true in the forward pass. - * - out0: Output matrix at previous timestep, of shape (N, M). + * - out0: Outputs from previous timestep, of shape (N, M). * Note: This is *optional* and could just be an empty matrix. - * - c0: Initial cell state matrix, of shape (N, M). + * - c0: Initial cell state, of shape (N, M). * Note: This is *optional* and could just be an empty matrix. * - cache_out: Cache of outputs, of shape (T, N*M). * Note: This is used for performance during training. @@ -149,11 +153,11 @@ backward = function(matrix[double] dout, matrix[double] dc, * Note: This is used for performance during training. * * Outputs: - * - dX: Gradient wrt X, of shape (N, T*D). - * - dW: Gradient wrt W, of shape (D+M, 4M). - * - db: Gradient wrt b, of shape (1, 4M). - * - dout0: Gradient wrt out0, of shape (N, M). - * - dc0: Gradient wrt c0, of shape (N, M). + * - dX: Gradient wrt `X`, of shape (N, T*D). + * - dW: Gradient wrt `W`, of shape (D+M, 4M). + * - db: Gradient wrt `b`, of shape (1, 4M). + * - dout0: Gradient wrt `out0`, of shape (N, M). + * - dc0: Gradient wrt `c0`, of shape (N, M). 
*/ N = nrow(X) M = as.integer(ncol(W)/4) @@ -190,7 +194,7 @@ backward = function(matrix[double] dout, matrix[double] dc, g = ifog[,3*M+1:4*M] # g gate, shape (N, M) tmp = tanh::backward(dout_t, ct) - dct = dct + o * tmp # shape (N, M) + dct = dct + o*tmp # shape (N, M) tmp = tanh::forward(ct) do = tmp * dout_t # output gate, shape (N, M) df = c_prev * dct # forget gate, shape (N, M) @@ -201,7 +205,7 @@ backward = function(matrix[double] dout, matrix[double] dc, di_raw = i * (1-i) * di df_raw = f * (1-f) * df do_raw = o * (1-o) * do - dg_raw = (1 - g^2) * dg + dg_raw = (1-g^2) * dg difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw))) # shape (N, 4M) dW = dW + t(input) %*% difog_raw # shape (D+M, 4M) @@ -217,7 +221,7 @@ backward = function(matrix[double] dout, matrix[double] dc, dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M) dct = dc_prev # shape (N, M) } - t = t-1 + t = t - 1 } } @@ -232,17 +236,18 @@ init = function(int N, int D, int M) * We use the Glorot uniform heuristic which limits the magnification * of inputs/gradients during forward/backward passes by scaling * uniform weights by a factor of sqrt(6/(fan_in + fan_out)). + * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf * * Inputs: * - N: Number of examples in batch. - * - D: Dimensionality of the input features. + * - D: Dimensionality of the input features (number of features). * - M: Number of neurons in this layer. * * Outputs: - * - W: Weights (parameters) matrix, of shape (D+M, 4M). - * - b: Biases vector, of shape (1, 4M). - * - out0: Dummy output matrix at previous timestep, of shape (N, M). - * - c0: Initial empty cell state matrix, of shape (N, M). + * - W: Weights, of shape (D+M, 4M). + * - b: Biases, of shape (1, 4M). + * - out0: Empty previous timestep output matrix, of shape (N, M). + * - c0: Empty initial cell state matrix, of shape (N, M). */ fan_in = D+M fan_out = 4*M http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/max_pool.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool.dml index 22e1747..a12877f 100644 --- a/scripts/staging/SystemML-NN/nn/layers/max_pool.dml +++ b/scripts/staging/SystemML-NN/nn/layers/max_pool.dml @@ -38,7 +38,7 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, * the output maps. * * Inputs: - * - X: Input data matrix, of shape (N, C*Hin*Win). + * - X: Inputs, of shape (N, C*Hin*Win). * - C: Number of input channels (dimensionality of input depth). * - Hin: Input height. * - Win: Input width. @@ -57,8 +57,8 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, * - Wout: Output width. */ N = nrow(X) - Hout = as.integer((Hin + 2 * padh - Hf) / strideh + 1) - Wout = as.integer((Win + 2 * padw - Wf) / stridew + 1) + Hout = as.integer((Hin + 2*padh - Hf)/strideh + 1) + Wout = as.integer((Win + 2*padw - Wf)/stridew + 1) pad_value = -1/0 # in max pooling we pad with -infinity # Create output volume @@ -96,7 +96,8 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, * unrolled into a single vector. * * Inputs: - * - dout: Derivatives from upstream, of shape (N, C*Hout*Wout). + * - dout: Gradient wrt `out` from upstream, of + * shape (N, C*Hout*Wout). * - Hout: Output height. * - Wout: Output width. * - X: Input data matrix, of shape (N, C*Hin*Win). 
@@ -113,7 +114,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, * A typical value is 0. * * Outputs: - * - dX: Gradient wrt X, of shape (N, C*Hin*Win). + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). */ N = nrow(X) pad_value = -1/0 # in max pooling we pad with -infinity @@ -134,9 +135,9 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw) dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw) for (hout in 1:Hout, check=0) { # all output rows - hin = (hout-1) * strideh + 1 + hin = (hout-1)*strideh + 1 for (wout in 1:Wout) { # all output columns - win = (wout-1) * stridew + 1 + win = (wout-1)*stridew + 1 img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1] max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix # gradient passes through only for the max value(s) in this patch http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml b/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml index ae2b4a1..f1cb863 100644 --- a/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml +++ b/scripts/staging/SystemML-NN/nn/layers/max_pool_builtin.dml @@ -22,6 +22,7 @@ /* * Max pooling layer. */ + forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, int strideh, int stridew, int padh, int padw) return (matrix[double] out, int Hout, int Wout) { @@ -36,7 +37,7 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, * the output maps. * * Inputs: - * - X: Input data matrix, of shape (N, C*Hin*Win). + * - X: Inputs, of shape (N, C*Hin*Win). * - C: Number of input channels (dimensionality of input depth). * - Hin: Input height. * - Win: Input width. @@ -55,8 +56,8 @@ forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, * - Wout: Output width. */ N = nrow(X) - Hout = as.integer((Hin - Hf) / strideh + 1) - Wout = as.integer((Win - Wf) / stridew + 1) + Hout = as.integer((Hin-Hf)/strideh + 1) + Wout = as.integer((Win-Wf)/stridew + 1) # Max pooling - built-in implementation out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf], @@ -73,10 +74,11 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, * unrolled into a single vector. * * Inputs: - * - dout: Derivatives from upstream, of shape (N, C*Hout*Wout). + * - dout: Gradient wrt `out` from upstream, of + * shape (N, C*Hout*Wout). * - Hout: Output height. * - Wout: Output width. - * - X: Input data matrix, of shape (N, C*Hin*Win). + * - X: Inputs, of shape (N, C*Hin*Win). * - C: Number of input channels (dimensionality of input depth). * - Hin: Input height. * - Win: Input width. @@ -90,7 +92,7 @@ backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, * A typical value is 0. * * Outputs: - * - dX: Gradient wrt X, of shape (N, C*Hin*Win). + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). 
*/ N = nrow(X) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/relu.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/relu.dml b/scripts/staging/SystemML-NN/nn/layers/relu.dml index a5c5230..6a4c15c 100644 --- a/scripts/staging/SystemML-NN/nn/layers/relu.dml +++ b/scripts/staging/SystemML-NN/nn/layers/relu.dml @@ -22,33 +22,37 @@ /* * Rectified Linear Unit (ReLU) nonlinearity layer. */ -forward = function(matrix[double] X) return (matrix[double] out) { + +forward = function(matrix[double] X) + return (matrix[double] out) { /* * Computes the forward pass for a ReLU nonlinearity layer. * - * Performs an element-wise evaluation of f(input) = max(0, input). + * Performs an element-wise evaluation of `f(input) = max(0, input)`. * * Inputs: - * - X: Input data matrix, of shape (any, any). + * - X: Inputs, of shape (any, any). * * Outputs: - * - out: Ouptuts, of same shape as X. + * - out: Outputs, of same shape as `X`. */ - out = max(0.0, X) + out = max(X, 0) } -backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) { +backward = function(matrix[double] dout, matrix[double] X) + return (matrix[double] dX) { /* * Computes the backward pass for a ReLU nonlinearity layer. * - * Essentially performs a pass-through of the upstream gradient for cells > 0. + * Essentially performs a pass-through of the upstream gradient + * for cells > 0. * * Inputs: - * - dout: Derivatives from upstream, of same shape as X. + * - dout: Gradient wrt `out` from upstream, of same shape as `X`. * - X: Previous input data matrix, of shape (any, any). * * Outputs: - * - dX: Gradient wrt X, of same shape as X. + * - dX: Gradient wrt `X`, of same shape as `X`. */ dX = (X > 0) * dout } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/rnn.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/rnn.dml b/scripts/staging/SystemML-NN/nn/layers/rnn.dml index cd3eefe..cdceab8 100644 --- a/scripts/staging/SystemML-NN/nn/layers/rnn.dml +++ b/scripts/staging/SystemML-NN/nn/layers/rnn.dml @@ -35,14 +35,14 @@ forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, * in as an additional input at the current timestep. * * Inputs: - * - X: Input data matrix, of shape (N, T*D). - * - W: Weights (parameters) matrix, of shape (D+M, M). - * - b: Biases vector, of shape (1, M). + * - X: Inputs, of shape (N, T*D). + * - W: Weights, of shape (D+M, M). + * - b: Biases, of shape (1, M). * - T: Length of example sequences (number of timesteps). - * - D: Dimensionality of the input features. + * - D: Dimensionality of the input features (number of features). * - return_sequences: Whether to return `out` at all timesteps, * or just for the final timestep. - * - out0: Output matrix at previous timestep, of shape (N, M). + * - out0: Output matrix from previous timestep, of shape (N, M). * Note: This is *optional* and could just be an empty matrix. * * Outputs: @@ -88,28 +88,28 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat * Computes the backward pass for a simple RNN layer with M neurons. * * Inputs: - * - dout: Gradient on output from upstream. If `given_sequences` + * - dout: Gradient wrt `out` from upstream. 
If `given_sequences` * is True, contains gradients on outputs for all timesteps, * of shape (N, T*M). Else, contains gradient on output for * the final timestep, of shape (N, M). - * - X: Input data matrix, of shape (N, T*D). - * - W: Weights (parameters) matrix, of shape (D+M, M). - * - b: Biases vector, of shape (1, M). + * - X: Inputs, of shape (N, T*D). + * - W: Weights, of shape (D+M, M). + * - b: Biases, of shape (1, M). * - T: Length of example sequences (number of timesteps). - * - D: Dimensionality of the input features. + * - D: Dimensionality of the input features (number of features). * - given_sequences: Whether `dout` is for all timesteps, * or just for the final timestep. This is based on whether * `return_sequences` was true in the forward pass. - * - out0: Output matrix at previous timestep, of shape (N, M). + * - out0: Output matrix from previous timestep, of shape (N, M). * Note: This is *optional* and could just be an empty matrix. * - cache_out: Cache of outputs, of shape (T, N*M). * Note: This is used for performance during training. * * Outputs: - * - dX: Gradient wrt X, of shape (N, T*D). - * - dW: Gradient wrt W, of shape (D+M, 4M). - * - db: Gradient wrt b, of shape (1, 4M). - * - dout0: Gradient wrt out0, of shape (N, M). + * - dX: Gradient wrt `X`, of shape (N, T*D). + * - dW: Gradient wrt `W`, of shape (D+M, 4M). + * - db: Gradient wrt `b`, of shape (1, 4M). + * - dout0: Gradient wrt `out0`, of shape (N, M). */ N = nrow(X) M = ncol(W) @@ -134,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M) } input = cbind(X_t, out_prev) # shape (N, D+M) - dout_t_raw = (1 - out_t^2) * dout_t # into tanh, shape (N, M) + dout_t_raw = (1-out_t^2) * dout_t # into tanh, shape (N, M) dW = dW + t(input) %*% dout_t_raw # shape (D+M, M) db = db + colSums(dout_t_raw) # shape (1, M) dinput = dout_t_raw %*% t(W) # shape (N, D+M) @@ -146,7 +146,7 @@ backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, mat else { dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M) } - t = t-1 + t = t - 1 } } @@ -161,16 +161,17 @@ init = function(int N, int D, int M) * We use the Glorot uniform heuristic which limits the magnification * of inputs/gradients during forward/backward passes by scaling * uniform weights by a factor of sqrt(6/(fan_in + fan_out)). + * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf * * Inputs: * - N: Number of examples in batch. - * - D: Dimensionality of the input features. + * - D: Dimensionality of the input features (number of features). * - M: Number of neurons in this layer. * * Outputs: - * - W: Weights (parameters) matrix, of shape (D+M, M). - * - b: Biases vector, of shape (1, M). - * - out0: Dummy output matrix at previous timestep, of shape (N, M). + * - W: Weights, of shape (D+M, M). + * - b: Biases, of shape (1, M). + * - out0: Empty previous timestep output matrix, of shape (N, M). 
*/ fan_in = D+M fan_out = M http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml index a7066f2..185befb 100644 --- a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml +++ b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml @@ -22,33 +22,41 @@ /* * Sigmoid nonlinearity layer. */ -forward = function(matrix[double] X) return (matrix[double] out) { + +forward = function(matrix[double] X) + return (matrix[double] out) { /* * Computes the forward pass for a sigmoid nonlinearity layer. * - * sigmoid(x) = 1 / (1 + e^-x) + * `sigmoid(x) = 1 / (1 + e^-x)` + * + * If `X` contains a single feature column, the output of a sigmoid + * layer can be interpreted as a predicted probability of a true + * class when paired with a log loss function in a binary + * classification problem. * * Inputs: - * - X: Input data matrix, of shape (any, any). + * - X: Inputs, of shape (any, any). * * Outputs: - * - out: Ouptuts, of same shape as X. + * - out: Outputs, of same shape as `X`. */ - out = 1 / (1 + exp(-X)) + out = 1 / (1+exp(-X)) } -backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) { +backward = function(matrix[double] dout, matrix[double] X) + return (matrix[double] dX) { /* * Computes the backward pass for a sigmoid nonlinearity layer. * * Inputs: - * - dout: Derivatives from upstream, of same shape as X. - * - X: Previous input data matrix, of shape (any, any). + * - dout: Gradient wrt `out` from upstream, of same shape as `X`. + * - X: Inputs, of shape (any, any). * * Outputs: - * - dX: Gradient wrt X, of same shape as X. + * - dX: Gradient wrt `X`, of same shape as `X`. */ - out = 1 / (1 + exp(-X)) - dX = out * (1 - out) * dout + out = 1 / (1+exp(-X)) + dX = out * (1-out) * dout } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/softmax.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/softmax.dml b/scripts/staging/SystemML-NN/nn/layers/softmax.dml index 854e8a8..1751838 100644 --- a/scripts/staging/SystemML-NN/nn/layers/softmax.dml +++ b/scripts/staging/SystemML-NN/nn/layers/softmax.dml @@ -22,7 +22,9 @@ /* * Softmax classifier layer. */ -forward = function(matrix[double] scores) return (matrix[double] probs) { + +forward = function(matrix[double] scores) + return (matrix[double] probs) { /* * Computes the forward pass for a softmax classifier. The inputs * are interpreted as unnormalized, log-probabilities for each of @@ -32,10 +34,10 @@ forward = function(matrix[double] scores) return (matrix[double] probs) { * This can be interpreted as a generalization of the sigmoid * function to multiple classes. * - * probs_ij = e^scores_ij / sum(e^scores_i) + * `probs_ij = e^scores_ij / sum(e^scores_i)` * * Inputs: - * - scores: Input data matrix, of shape (N, D). + * - scores: Inputs, of shape (N, D). * * Outputs: * - probs: Outputs, of shape (N, D). @@ -56,20 +58,23 @@ backward = function(matrix[double] dprobs, matrix[double] scores) /* * Computes the backward pass for a softmax classifier. 
* - * Note that dscores_ij has multiple sources: + * Note that dscores_ij has multiple source branches: * - * dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij) - * dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j + * ``` + * dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij) + * dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j * - * dloss/dscores_ij = dloss/dprobs_ij * dprobs_ij/dscores_ij + - * sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij) + * dloss/dscores_ij = + * (dloss/dprobs_ij * dprobs_ij/dscores_ij) + * + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij) + * ``` * * Inputs: - * - dprobs: Derivatives from upstream, of shape (N, D). - * - scores: Previous input data matrix, of shape (N, D). + * - dprobs: Gradient wrt `probs` from upstream, of shape (N, D). + * - scores: Inputs, of shape (N, D). * * Outputs: - * - dscores: Gradient wrt scores, of shape (N, D). + * - dscores: Gradient wrt `scores`, of shape (N, D). */ scores = scores - rowMaxs(scores) # numerical stability unnorm_probs = exp(scores) # unnormalized probabilities @@ -77,6 +82,6 @@ backward = function(matrix[double] dprobs, matrix[double] scores) # After some cancellation: # dscores = dprobs*probs - probs*rowSums(dprobs*probs) dtemp = dprobs * probs - dscores = dtemp - probs * rowSums(dtemp) + dscores = dtemp - probs*rowSums(dtemp) } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml index 53ca989..0185a2c 100644 --- a/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml +++ b/scripts/staging/SystemML-NN/nn/layers/spatial_batch_norm.dml @@ -39,7 +39,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta, * introduces learnable parameters (gamma, beta) to control the * amount of normalization. * - * y = ((x-mean) / sqrt(var+eps)) * gamma + beta + * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta` * * This implementation maintains exponential moving averages of the * mean and variance during training for use during testing. @@ -50,7 +50,7 @@ forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta, * - https://arxiv.org/abs/1502.03167 * * Inputs: - * - X: Input data matrix, of shape (N, C*Hin*Win). + * - X: Inputs, of shape (N, C*Hin*Win). * - gamma: Scale parameters, of shape (C, 1). * - beta: Shift parameters, of shape (C, 1). * - C: Number of input channels (dimensionality of input depth). @@ -134,7 +134,7 @@ backward = function(matrix[double] dout, matrix[double] out, * Computes the backward pass for a spatial batch normalization layer. * * Inputs: - * - dout: Derivatives from upstream, of shape (N, C*Hin*Win). + * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win). * - out: Outputs from the forward pass, of shape (N, C*Hin*Win). * - ema_mean_upd: Updated exponential moving average of the mean * from the forward pass, of shape (C, 1). @@ -171,9 +171,9 @@ backward = function(matrix[double] dout, matrix[double] out, * Typical values are in the range of [1e-5, 1e-3]. * * Outputs: - * - dX: Gradient wrt X, of shape (N, C*Hin*Win). - * - dgamma: Gradient wrt W, of shape (C, 1). - * - dbeta: Gradient wrt b, of shape (C, 1). + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + * - dgamma: Gradient wrt `W`, of shape (C, 1). 
+ * - dbeta: Gradient wrt `b`, of shape (C, 1). * */ N = nrow(X) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/layers/tanh.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/tanh.dml b/scripts/staging/SystemML-NN/nn/layers/tanh.dml index 9308a7c..589a574 100644 --- a/scripts/staging/SystemML-NN/nn/layers/tanh.dml +++ b/scripts/staging/SystemML-NN/nn/layers/tanh.dml @@ -24,38 +24,42 @@ */ source("nn/layers/sigmoid.dml") as sigmoid -forward = function(matrix[double] X) return (matrix[double] out) { +forward = function(matrix[double] X) + return (matrix[double] out) { /* * Computes the forward pass for a tanh nonlinearity layer. * - * tanh(x) = (e^x - e^-x) / (e^x + e^-x) - * = 2 * sigmoid(2x) - 1 + * ``` + * tanh(x) = (e^x - e^-x) / (e^x + e^-x) + * = 2 * sigmoid(2x) - 1 + * ``` * * Inputs: - * - X: Input data matrix, of shape (any, any). + * - X: Inputs, of shape (any, any). * * Outputs: - * - out: Ouptuts, of same shape as X. + * - out: Outputs, of same shape as `X`. */ # out = (exp(X) - exp(-X)) / (exp(X) + exp(-X)) # Simplification of the above formulation to use the sigmoid function: sigma2X = sigmoid::forward(2*X) - out = 2 * sigma2X - 1 + out = 2*sigma2X - 1 } -backward = function(matrix[double] dout, matrix[double] X) return (matrix[double] dX) { +backward = function(matrix[double] dout, matrix[double] X) + return (matrix[double] dX) { /* * Computes the backward pass for a tanh nonlinearity layer. * * Inputs: - * - dout: Derivatives from upstream, of same shape as X. - * - X: Previous input data matrix, of shape (any, any). + * - dout: Gradient wrt `out` from upstream, of same shape as `X`. + * - X: Inputs, of shape (any, any). * * Outputs: - * - dX: Gradient wrt X, of same shape as X. + * - dX: Gradient wrt `X`, of same shape as `X`. */ sigma2X = sigmoid::forward(2*X) - out = 2 * sigma2X - 1 - dX = (1 - out^2) * dout + out = 2*sigma2X - 1 + dX = (1-out^2) * dout } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/adagrad.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml index 688109b..20b26c4 100644 --- a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml +++ b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml @@ -22,6 +22,7 @@ /* * Adagrad optimizer. */ + update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon, matrix[double] cache) return (matrix[double] X, matrix[double] cache) { @@ -39,24 +40,25 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon * * Inputs: * - X: Parameters to update, of shape (any, any). - * - dX: Gradient of X wrt to a loss function being optimized, of - * same shape as X. + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. * - lr: Learning rate. * - epsilon: Smoothing term to avoid divide by zero errors. * Typical values are in the range of [1e-8, 1e-4]. * - cache: State that maintains per-parameter sum of squared - * gradients, of same shape as X. + * gradients, of same shape as `X`. * * Outputs: - * - X: Updated parameters X, of same shape as input X. - * - v: Updated velocity of the parameters X, of same shape as - * input v. + * - X: Updated parameters `X`, of same shape as input `X`. 
+ * - cache: State that maintains per-parameter sum of squared + * gradients, of same shape as `X`. */ cache = cache + dX^2 - X = X - lr * dX / (sqrt(cache) + epsilon) + X = X - (lr * dX / (sqrt(cache)+epsilon)) } -init = function(matrix[double] X) return (matrix[double] cache) { +init = function(matrix[double] X) + return (matrix[double] cache) { /* * Initialize the state for this optimizer. * @@ -65,10 +67,10 @@ init = function(matrix[double] X) return (matrix[double] cache) { * * Inputs: * - X: Parameters to update, of shape (any, any). - * + * * Outputs: * - cache: State that maintains per-parameter sum of squared - * gradients, of same shape as X. + * gradients, of same shape as `X`. */ cache = matrix(0, rows=nrow(X), cols=ncol(X)) } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/adam.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/adam.dml b/scripts/staging/SystemML-NN/nn/optim/adam.dml index a25f74d..0607fa5 100644 --- a/scripts/staging/SystemML-NN/nn/optim/adam.dml +++ b/scripts/staging/SystemML-NN/nn/optim/adam.dml @@ -22,6 +22,7 @@ /* * Adam optimizer. */ + update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2, double epsilon, int t, matrix[double] m, matrix[double] v) return (matrix[double] X, matrix[double] m, matrix[double] v) { @@ -34,8 +35,8 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, * * Inputs: * - X: Parameters to update, of shape (any, any). - * - dX: Gradient of X wrt to a loss function being optimized, of - * same shape as X. + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. * - lr: Learning rate. Recommended value is 0.001. * - beta1: Exponential decay rate for the 1st moment estimates. * Recommended value is 0.9. @@ -46,32 +47,33 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, * - t: Timestep, starting at 0. * - m: State containing the 1st moment (mean) estimate by * maintaining exponential moving averages of the gradients, of - * same shape as X. + * same shape as `X`. * - v: State containing the 2nd raw moment (uncentered variance) * estimate by maintaining exponential moving averages of the - * squared gradients, of same shape as X. + * squared gradients, of same shape as `X`. * * Outputs: - * - X: Updated parameters X, of same shape as input X. + * - X: Updated parameters `X`, of same shape as input `X`. * - m: Updated state containing the 1st moment (mean) estimate by * maintaining exponential moving averages of the gradients, of - * same shape as X. + * same shape as `X`. * - v: Updated state containing the 2nd raw moment (uncentered * variance) estimate by maintaining exponential moving averages - * of the squared gradients, of same shape as X. + * of the squared gradients, of same shape as `X`. 
*/ t = t + 1 - m = beta1 * m + (1 - beta1) * dX # update biased 1st moment estimate - v = beta2 * v + (1 - beta2) * dX^2 # update biased 2nd raw moment estimate - #m = m / (1 - beta1^t) # compute bias-corrected 1st moment estimate - #v = v / (1 - beta2^t) # compute bias-corrected 2nd raw moment estimate - #X = X - lr * m / (sqrt(v) + epsilon) # param update + m = beta1*m + (1-beta1)*dX # update biased 1st moment estimate + v = beta2*v + (1-beta2)*dX^2 # update biased 2nd raw moment estimate + # m = m / (1-beta1^t) # compute bias-corrected 1st moment estimate + # v = v / (1-beta2^t) # compute bias-corrected 2nd raw moment estimate + # X = X - (lr * m / (sqrt(v)+epsilon)) # param update # Simplified for computational efficiency: - lr = lr * sqrt(1 - beta2^t) / (1 - beta1^t) - X = X - lr * m / (sqrt(v) + epsilon) + lr = lr * sqrt(1-beta2^t) / (1-beta1^t) + X = X - (lr * m / (sqrt(v)+epsilon)) } -init = function(matrix[double] X) return (matrix[double] m, matrix[double] v) { +init = function(matrix[double] X) + return (matrix[double] m, matrix[double] v) { /* * Initialize the state for this optimizer. * @@ -80,14 +82,14 @@ init = function(matrix[double] X) return (matrix[double] m, matrix[double] v) { * * Inputs: * - X: Parameters to update, of shape (any, any). - * + * * Outputs: * - m: Initial state containing the 1st moment (mean) estimate by * maintaining exponential moving averages of the gradients, of - * same shape as X. + * same shape as `X`. * - v: Initial state containing the 2nd raw moment (uncentered * variance) estimate by maintaining exponential moving averages - * of the squared gradients, of same shape as X. + * of the squared gradients, of same shape as `X`. */ m = matrix(0, rows=nrow(X), cols=ncol(X)) v = matrix(0, rows=nrow(X), cols=ncol(X)) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml index e256000..80c75a0 100644 --- a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml +++ b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml @@ -22,6 +22,7 @@ /* * RMSprop optimizer. */ + update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate, double epsilon, matrix[double] cache) return (matrix[double] X, matrix[double] cache) { @@ -39,26 +40,27 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double decay_r * * Inputs: * - X: Parameters to update, of shape (any, any). - * - dX: Gradient of X wrt to a loss function being optimized, of - * same shape as X. + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. * - lr: Learning rate. * - decay_rate: Term controlling the rate of the moving average. * Typical values are in the range of [0.9, 0.999]. * - epsilon: Smoothing term to avoid divide by zero errors. * Typical values are in the range of [1e-8, 1e-4]. * - cache: State that maintains the moving average of the squared - * gradients, of same shape as X. + * gradients, of same shape as `X`. * * Outputs: - * - X: Updated parameters X, of same shape as input X. - * - v: Updated velocity of the parameters X, of same shape as - * input v. + * - X: Updated parameters `X`, of same shape as input `X`. + * - cache: Updated state that maintains the moving average of the + * squared gradients, of same shape as `X`. 
*/ - cache = decay_rate * cache + (1 - decay_rate) * dX^2 - X = X - lr * dX / (sqrt(cache) + epsilon) + cache = decay_rate*cache + (1-decay_rate)*dX^2 + X = X - (lr * dX / (sqrt(cache)+epsilon)) } -init = function(matrix[double] X) return (matrix[double] cache) { +init = function(matrix[double] X) + return (matrix[double] cache) { /* * Initialize the state for this optimizer. * @@ -67,10 +69,10 @@ init = function(matrix[double] X) return (matrix[double] cache) { * * Inputs: * - X: Parameters to update, of shape (any, any). - * + * * Outputs: * - cache: State that maintains the moving average of the squared - * gradients, of same shape as X. + * gradients, of same shape as `X`. */ cache = matrix(0, rows=nrow(X), cols=ncol(X)) } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd.dml b/scripts/staging/SystemML-NN/nn/optim/sgd.dml index 554569a..a3fc744 100644 --- a/scripts/staging/SystemML-NN/nn/optim/sgd.dml +++ b/scripts/staging/SystemML-NN/nn/optim/sgd.dml @@ -22,19 +22,21 @@ /* * Stochastic Gradient Descent (SGD) optimizer. */ -update = function(matrix[double] X, matrix[double] dX, double lr) return (matrix[double] X) { + +update = function(matrix[double] X, matrix[double] dX, double lr) + return (matrix[double] X) { /* * Performs a vanilla SGD update. * * Inputs: * - X: Parameters to update, of shape (any, any). - * - dX: Gradient of X wrt to a loss function being optimized, of - * same shape as X. + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. * - lr: Learning rate. * * Outputs: - * - X: Updated parameters X, of same shape as input X. + * - X: Updated parameters `X`, of same shape as input `X`. */ - X = X - lr * dX + X = X - lr*dX } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml index c2a441b..2cb9890 100644 --- a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml +++ b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml @@ -22,6 +22,7 @@ /* * Stochastic Gradient Descent with momentum (SGD-momentum) optimizer. */ + update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v) return (matrix[double] X, matrix[double] v) { /* @@ -33,25 +34,26 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat * * Inputs: * - X: Parameters to update, of shape (any, any). - * - dX: Gradient of X wrt to a loss function being optimized, of - * same shape as X. + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. * - lr: Learning rate. * - mu: Momentum value. * Typical values are in the range of [0.5, 0.99], usually * started at the lower end and annealed towards the higher end. - * - v: State maintaining the velocity of the parameters X, of same - * shape as X. + * - v: State maintaining the velocity of the parameters `X`, of same + * shape as `X`. * * Outputs: - * - X: Updated parameters X, of same shape as input X. - * - v: Updated velocity of the parameters X, of same shape as - * input v. + * - X: Updated parameters `X`, of same shape as input `X`. 
+ * - v: Updated velocity of the parameters `X`, of same shape as + * input `X`. */ - v = mu * v - lr * dX # update velocity + v = mu*v - lr*dX # update velocity X = X + v # update position } -init = function(matrix[double] X) return (matrix[double] v) { +init = function(matrix[double] X) + return (matrix[double] v) { /* * Initialize the state for this optimizer. * @@ -60,9 +62,9 @@ init = function(matrix[double] X) return (matrix[double] v) { * * Inputs: * - X: Parameters to update, of shape (any, any). - * + * * Outputs: - * - v: Initial velocity of the parameters X. + * - v: Initial velocity of the parameters `X`. */ v = matrix(0, rows=nrow(X), cols=ncol(X)) } http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/16b1cbd7/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml index 56c6ab0..fee6585 100644 --- a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml +++ b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml @@ -22,6 +22,7 @@ /* * Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer. */ + update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v) return (matrix[double] X, matrix[double] v) { /* @@ -36,19 +37,20 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat * store the parameters in their position after momentum. * * Reference: - * - Advances in optimizing Recurrent Networks, Bengio et al., section 3.5. + * - Advances in optimizing Recurrent Networks, Bengio et al., + * section 3.5. * - http://arxiv.org/abs/1212.0901 * * Inputs: * - X: Parameters to update, of shape (any, any). - * - dX: Gradient of X wrt to a loss function being optimized, of - * same shape as X. + * - dX: Gradient wrt `X` of a loss function being optimized, of + * same shape as `X`. * - lr: Learning rate. * - mu: Momentum value. * Typical values are in the range of [0.5, 0.99], usually * started at the lower end and annealed towards the higher end. - * - v: State maintaining the velocity of the parameters X, of same - * shape as X. + * - v: State maintaining the velocity of the parameters `X`, of same + * shape as `X`. * * Outputs: * - X: Updated parameters X, of same shape as input X. @@ -56,11 +58,12 @@ update = function(matrix[double] X, matrix[double] dX, double lr, double mu, mat * input v. */ v_prev = v - v = mu * v - lr * dX # update velocity - X = X - mu * v_prev + (1 + mu) * v # update position, including momentum + v = mu*v - lr*dX # update velocity + X = X - mu*v_prev + (1+mu)*v # update position, including momentum } -init = function(matrix[double] X) return (matrix[double] v) { +init = function(matrix[double] X) + return (matrix[double] v) { /* * Initialize the state for this optimizer. * @@ -69,9 +72,9 @@ init = function(matrix[double] X) return (matrix[double] v) { * * Inputs: * - X: Parameters to update, of shape (any, any). - * + * * Outputs: - * - v: Initial velocity of the parameters X. + * - v: Initial velocity of the parameters `X`. */ v = matrix(0, rows=nrow(X), cols=ncol(X)) }
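Illustrative usage sketch (not part of the commit): the optimizer hunks above standardize on an `init`/`update` pair, e.g. `sgd_nesterov::init(X)` returning a zero velocity state and `sgd_nesterov::update(X, dX, lr, mu, v)` returning the updated parameters and velocity. A minimal DML fragment exercising that API, with `W`, `dW`, `lr`, and `mu` as hypothetical placeholder names:

    source("nn/optim/sgd_nesterov.dml") as sgd_nesterov

    # Hypothetical parameters and gradient; in a real script these would come
    # from a layer's init function and its backward pass, respectively.
    W = rand(rows=100, cols=10, pdf="normal")
    dW = rand(rows=100, cols=10, pdf="normal")

    lr = 0.01   # learning rate
    mu = 0.9    # momentum value

    vW = sgd_nesterov::init(W)                          # zero-initialized velocity state
    [W, vW] = sgd_nesterov::update(W, dW, lr, mu, vW)   # one SGD-Nesterov update step

The other optimizers in nn/optim follow the same pattern (adagrad and rmsprop carry a `cache` state, adam carries `m` and `v` plus a timestep `t`), so swapping optimizers only changes the state variables threaded through `update`.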

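Clarifying note on the nn/optim/adam.dml hunk above (my own annotation, not part of the commit): the "simplified for computational efficiency" branch folds the bias corrections of the moment estimates into the step size. With \(\hat m_t = m_t/(1-\beta_1^t)\) and \(\hat v_t = v_t/(1-\beta_2^t)\),

\[
\frac{lr \cdot \hat m_t}{\sqrt{\hat v_t} + \epsilon}
= \frac{lr\,\sqrt{1-\beta_2^t}}{1-\beta_1^t} \cdot \frac{m_t}{\sqrt{v_t} + \epsilon\sqrt{1-\beta_2^t}}
\approx \underbrace{lr \cdot \frac{\sqrt{1-\beta_2^t}}{1-\beta_1^t}}_{\text{folded into } lr} \cdot \frac{m_t}{\sqrt{v_t} + \epsilon},
\]

so the commented-out explicit bias-correction path and the folded-learning-rate path agree up to \(\epsilon\) being rescaled by \(\sqrt{1-\beta_2^t}\), which is negligible for the recommended \(\epsilon \approx 10^{-8}\).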