http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/rnn.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/rnn.dml b/scripts/staging/SystemML-NN/nn/layers/rnn.dml deleted file mode 100644 index 3c6faae..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/rnn.dml +++ /dev/null @@ -1,183 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Simple (Vanilla) RNN layer. - */ -source("nn/layers/tanh.dml") as tanh - -forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D, - boolean return_sequences, matrix[double] out0) - return (matrix[double] out, matrix[double] cache_out) { - /* - * Computes the forward pass for a simple RNN layer with M neurons. - * The input data has N sequences of T examples, each with D features. - * - * In a simple RNN, the output of the previous timestep is fed back - * in as an additional input at the current timestep. - * - * Inputs: - * - X: Inputs, of shape (N, T*D). - * - W: Weights, of shape (D+M, M). - * - b: Biases, of shape (1, M). - * - T: Length of example sequences (number of timesteps). - * - D: Dimensionality of the input features (number of features). - * - return_sequences: Whether to return `out` at all timesteps, - * or just for the final timestep. - * - out0: Output matrix from previous timestep, of shape (N, M). - * Note: This is *optional* and could just be an empty matrix. - * - * Outputs: - * - out: If `return_sequences` is True, outputs for all timesteps, - * of shape (N, T*M). Else, outputs for the final timestep, of - * shape (N, M). - * - cache_out: Cache of outputs, of shape (T, N*M). - * Note: This is used for performance during training. 
- */ - N = nrow(X) - M = ncol(W) - out_prev = out0 - if (return_sequences) { - out = matrix(0, rows=N, cols=T*M) - } - else { - out = matrix(0, rows=N, cols=M) - } - # caches to be used during the backward pass for performance - cache_out = matrix(0, rows=T, cols=N*M) - - for (t in 1:T) { # each timestep - X_t = X[,(t-1)*D+1:t*D] # shape (N, D) - input = cbind(X_t, out_prev) # shape (N, D+M) - out_t = tanh::forward(input %*% W + b) # shape (N, M) - # store - if (return_sequences) { - out[,(t-1)*M+1:t*M] = out_t - } - else { - out = out_t - } - out_prev = out_t - cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape - } -} - -backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, matrix[double] b, - int T, int D, boolean given_sequences, matrix[double] out0, - matrix[double] cache_out) - return (matrix[double] dX, matrix[double] dW, matrix[double] db, matrix[double] dout0) { - /* - * Computes the backward pass for a simple RNN layer with M neurons. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream. If `given_sequences` - * is True, contains gradients on outputs for all timesteps, - * of shape (N, T*M). Else, contains gradient on output for - * the final timestep, of shape (N, M). - * - X: Inputs, of shape (N, T*D). - * - W: Weights, of shape (D+M, M). - * - b: Biases, of shape (1, M). - * - T: Length of example sequences (number of timesteps). - * - D: Dimensionality of the input features (number of features). - * - given_sequences: Whether `dout` is for all timesteps, - * or just for the final timestep. This is based on whether - * `return_sequences` was true in the forward pass. - * - out0: Output matrix from previous timestep, of shape (N, M). - * Note: This is *optional* and could just be an empty matrix. - * - cache_out: Cache of outputs, of shape (T, N*M). - * Note: This is used for performance during training. - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, T*D). - * - dW: Gradient wrt `W`, of shape (D+M, M). - * - db: Gradient wrt `b`, of shape (1, M). - * - dout0: Gradient wrt `out0`, of shape (N, M). - */ - N = nrow(X) - M = ncol(W) - dX = matrix(0, rows=N, cols=T*D) - dW = matrix(0, rows=D+M, cols=M) - db = matrix(0, rows=1, cols=M) - dout0 = matrix(0, rows=N, cols=M) - if (!given_sequences) { - # only given dout for output at final timestep, so prepend empty douts for all other timesteps - dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout) # shape (N, T*M) - } - - t = T - for (iter in 1:T) { # each timestep in reverse order - X_t = X[,(t-1)*D+1:t*D] # shape (N, D) - dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M) - out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M) - if (t == 1) { - out_prev = out0 # shape (N, M) - } - else { - out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M) - } - input = cbind(X_t, out_prev) # shape (N, D+M) - dout_t_raw = (1-out_t^2) * dout_t # into tanh, shape (N, M) - dW = dW + t(input) %*% dout_t_raw # shape (D+M, M) - db = db + colSums(dout_t_raw) # shape (1, M) - dinput = dout_t_raw %*% t(W) # shape (N, D+M) - dX[,(t-1)*D+1:t*D] = dinput[,1:D] - dout_prev = dinput[,D+1:D+M] # shape (N, M) - if (t == 1) { - dout0 = dout_prev # shape (N, M) - } - else { - dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M) - } - t = t - 1 - } -} - -init = function(int N, int D, int M) - return (matrix[double] W, matrix[double] b, matrix[double] out0) { - /* - * Initialize the parameters of this layer.
- * - * Note: This is just a convenience function, and parameters - * may be initialized manually if needed. - * - * We use the Glorot uniform heuristic which limits the magnification - * of inputs/gradients during forward/backward passes by scaling - * uniform weights by a factor of sqrt(6/(fan_in + fan_out)). - * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf - * - * Inputs: - * - N: Number of examples in batch. - * - D: Dimensionality of the input features (number of features). - * - M: Number of neurons in this layer. - * - * Outputs: - * - W: Weights, of shape (D+M, M). - * - b: Biases, of shape (1, M). - * - out0: Empty previous timestep output matrix, of shape (N, M). - */ - fan_in = D+M - fan_out = M - scale = sqrt(6/(fan_in+fan_out)) - W = rand(rows=D+M, cols=M, min=-scale, max=scale, pdf="uniform") - b = matrix(0, rows=1, cols=M) - out0 = matrix(0, rows=N, cols=M) -} -
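For orientation on the layer removed above, the following is a minimal, hypothetical DML sketch of how `rnn::forward` and `rnn::backward` were typically wired together from a driver script. The dimensions (N, T, D, M), the random input, and the all-ones upstream gradient are illustrative assumptions, not part of the deleted file.

    source("nn/layers/rnn.dml") as rnn

    N = 8    # examples
    T = 5    # timesteps per example
    D = 10   # input features per timestep
    M = 4    # neurons

    X = rand(rows=N, cols=T*D)           # toy input sequences
    [W, b, out0] = rnn::init(N, D, M)    # Glorot-uniform weights, zero biases

    # Forward pass, keeping the outputs of all timesteps.
    [out, cache_out] = rnn::forward(X, W, b, T, D, TRUE, out0)

    # Backward pass with a placeholder upstream gradient of shape (N, T*M).
    dout = matrix(1, rows=N, cols=T*M)
    [dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, TRUE, out0, cache_out)
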
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml b/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml deleted file mode 100644 index 7e162a3..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/scale_shift1d.dml +++ /dev/null @@ -1,95 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * 1D Scale & Shift layer. - */ - -forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta) - return (matrix[double] out) { - /* - * Computes the forward pass for a 1D scale & shift layer. The input - * data has N examples, each with D features. - * - * A 1D scale & shift layer introduces learnable parameters - * (gamma, beta) to scale and shift the input on a per-feature basis. - * - * `y = x*gamma + beta` - * - * Inputs: - * - X: Inputs, of shape (N, D). - * - gamma: Scale parameters, of shape (1, D). - * - beta: Shift parameters, of shape (1, D). - * - * Outputs: - * - out: Outputs, of shape (N, D). - */ - # Scale and shift - out = X*gamma + beta # shape (N, D) -} - -backward = function(matrix[double] dout, matrix[double] out, - matrix[double] X, matrix[double] gamma, matrix[double] beta) - return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) { - /* - * Computes the backward pass for a 1D scale & shift layer. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of shape (N, D). - * - out: Outputs from the forward pass, of shape (N, D). - * - X: Inputs, of shape (N, D). - * - gamma: Scale parameters, of shape (1, D). - * - beta: Shift parameters, of shape (1, D). - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, D). - * - dgamma: Gradient wrt `gamma`, of shape (1, D). - * - dbeta: Gradient wrt `beta`, of shape (1, D). - * - */ - # Compute gradients during training - dgamma = colSums(dout*X) # shape (1, D) - dbeta = colSums(dout) # shape (1, D) - dX = dout * gamma # shape (N, D) -} - -init = function(int D) - return (matrix[double] gamma, matrix[double] beta) { - /* - * Initialize the parameters of this layer. - * - * By default, we initialize to an identity function, with a scale - * filler of `1`, and a shift filler of `0`. - * - * Note: This is just a convenience function, and parameters - * may be initialized manually if needed. - * - * Inputs: - * - D: Dimensionality of the input features (number of features). - * - * Outputs: - * - gamma: Scale parameters, of shape (1, D). - * - beta: Shift parameters, of shape (1, D).
- */ - gamma = matrix(1, rows=1, cols=D) - beta = matrix(0, rows=1, cols=D) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml b/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml deleted file mode 100644 index 79c884a..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/scale_shift2d.dml +++ /dev/null @@ -1,107 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * 2D Scale & Shift layer. - */ -source("nn/util.dml") as util - -forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta, - int C, int Hin, int Win) - return (matrix[double] out) { - /* - * Computes the forward pass for a 2D scale & shift layer. The input - * data has N examples, each represented as a 3D volume unrolled into - * a single vector. - * - * A 2D scale & shift layer introduces learnable parameters - * (gamma, beta) to scale and shift the input on a per-channel basis. - * - * `y = x*gamma + beta` - * - * Inputs: - * - X: Inputs, of shape (N, C*Hin*Win). - * - gamma: Scale parameters, of shape (C, 1). - * - beta: Shift parameters, of shape (C, 1). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - * Outputs: - * - out: Outputs, of shape (N, C*Hin*Win). - */ - # Scale and shift - scaled = bias_multiply(X, gamma) # shape (N, C*Hin*Win) - out = bias_add(scaled, beta) # shape (N, C*Hin*Win) -} - -backward = function(matrix[double] dout, matrix[double] out, - matrix[double] X, matrix[double] gamma, matrix[double] beta, - int C, int Hin, int Win) - return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) { - /* - * Computes the backward pass for a 2D scale & shift layer. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win). - * - out: Outputs from the forward pass, of shape (N, C*Hin*Win). - * - X: Input data matrix to the forward pass, of - * shape (N, C*Hin*Win). - * - gamma: Scale parameters, of shape (C, 1). - * - beta: Shift parameters, of shape (C, 1). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). - * - dgamma: Gradient wrt `gamma`, of shape (C, 1). - * - dbeta: Gradient wrt `beta`, of shape (C, 1).
- * - */ - # Compute gradients during training - dgamma = util::channel_sums(dout*X, C, Hin, Win) # shape (C, 1) - dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1) - dX = bias_multiply(dout, gamma) # shape (N, C*Hin*Win) -} - -init = function(int C) - return (matrix[double] gamma, matrix[double] beta) { - /* - * Initialize the parameters of this layer. - * - * By default, we initialize to an identity function, with a scale - * filler of `1`, and a shift filler of `0`. - * - * Note: This is just a convenience function, and parameters - * may be initialized manually if needed. - * - * Inputs: - * - C: Number of input channels (dimensionality of input depth). - * - * Outputs: - * - gamma: Scale parameters, of shape (C, 1). - * - beta: Shift parameters, of shape (C, 1). - */ - gamma = matrix(1, rows=C, cols=1) - beta = matrix(0, rows=C, cols=1) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml b/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml deleted file mode 100644 index 2d85adc..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/sigmoid.dml +++ /dev/null @@ -1,62 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Sigmoid nonlinearity layer. - */ - -forward = function(matrix[double] X) - return (matrix[double] out) { - /* - * Computes the forward pass for a sigmoid nonlinearity layer. - * - * `sigmoid(x) = 1 / (1 + e^-x)` - * - * If `X` contains a single feature column, the output of a sigmoid - * layer can be interpreted as a predicted probability of a true - * class when paired with a log loss function in a binary - * classification problem. - * - * Inputs: - * - X: Inputs, of shape (any, any). - * - * Outputs: - * - out: Outputs, of same shape as `X`. - */ - out = 1 / (1+exp(-X)) -} - -backward = function(matrix[double] dout, matrix[double] X) - return (matrix[double] dX) { - /* - * Computes the backward pass for a sigmoid nonlinearity layer. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of same shape as `X`. - * - X: Inputs, of shape (any, any). - * - * Outputs: - * - dX: Gradient wrt `X`, of same shape as `X`. 
- */ - out = 1 / (1+exp(-X)) - dX = out * (1-out) * dout -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/softmax.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/softmax.dml b/scripts/staging/SystemML-NN/nn/layers/softmax.dml deleted file mode 100644 index 68a7bc7..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/softmax.dml +++ /dev/null @@ -1,87 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Softmax classifier layer. - */ - -forward = function(matrix[double] scores) - return (matrix[double] probs) { - /* - * Computes the forward pass for a softmax classifier. The inputs - * are interpreted as unnormalized, log-probabilities for each of - * N examples, and the softmax function transforms them to normalized - * probabilities. - * - * This can be interpreted as a generalization of the sigmoid - * function to multiple classes. - * - * `probs_ij = e^scores_ij / sum(e^scores_i)` - * - * Inputs: - * - scores: Inputs, of shape (N, D). - * - * Outputs: - * - probs: Outputs, of shape (N, D). - */ - # For numerical stability, we subtract the max score of an example from all scores for that - # example. This is equivalent to the original formulation: - # e^scores_i / sum(e^scores_i) == C*e^scores_i / C*sum(e^scores_i) - # == e^(scores_i+log(C)) / sum(e^(scores_i+log(C)) - # set log(C) = -max(scores_i): - # == e^(scores_i-max(scores_i)) / sum(e^(scores_i-max(scores_i)) - scores = scores - rowMaxs(scores) # numerical stability - unnorm_probs = exp(scores) # unnormalized probabilities - probs = unnorm_probs / rowSums(unnorm_probs) # normalized probabilities -} - -backward = function(matrix[double] dprobs, matrix[double] scores) - return (matrix[double] dscores) { - /* - * Computes the backward pass for a softmax classifier. - * - * Note that dscores_ij has multiple source branches: - * - * ``` - * dprobs_ij/dscores_ij = probs_ij * (1 - probs_ij) - * dprobs_ik/dscores_ij = -probs_ik * probs_ij, for all k != j - * - * dloss/dscores_ij = - * (dloss/dprobs_ij * dprobs_ij/dscores_ij) - * + sum_{k!=j}(dloss/dprobs_ik * dprobs_ik/dscores_ij) - * ``` - * - * Inputs: - * - dprobs: Gradient wrt `probs` from upstream, of shape (N, D). - * - scores: Inputs, of shape (N, D). - * - * Outputs: - * - dscores: Gradient wrt `scores`, of shape (N, D). 
- */ - scores = scores - rowMaxs(scores) # numerical stability - unnorm_probs = exp(scores) # unnormalized probabilities - probs = unnorm_probs / rowSums(unnorm_probs) # normalized probabilities - # After some cancellation: - # dscores = dprobs*probs - probs*rowSums(dprobs*probs) - dtemp = dprobs * probs - dscores = dtemp - probs*rowSums(dtemp) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/layers/tanh.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/layers/tanh.dml b/scripts/staging/SystemML-NN/nn/layers/tanh.dml deleted file mode 100644 index d849d70..0000000 --- a/scripts/staging/SystemML-NN/nn/layers/tanh.dml +++ /dev/null @@ -1,65 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Tanh nonlinearity layer. - */ -source("nn/layers/sigmoid.dml") as sigmoid - -forward = function(matrix[double] X) - return (matrix[double] out) { - /* - * Computes the forward pass for a tanh nonlinearity layer. - * - * ``` - * tanh(x) = (e^x - e^-x) / (e^x + e^-x) - * = 2 * sigmoid(2x) - 1 - * ``` - * - * Inputs: - * - X: Inputs, of shape (any, any). - * - * Outputs: - * - out: Outputs, of same shape as `X`. - */ - # out = (exp(X) - exp(-X)) / (exp(X) + exp(-X)) - # Simplification of the above formulation to use the sigmoid function: - sigma2X = sigmoid::forward(2*X) - out = 2*sigma2X - 1 -} - -backward = function(matrix[double] dout, matrix[double] X) - return (matrix[double] dX) { - /* - * Computes the backward pass for a tanh nonlinearity layer. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of same shape as `X`. - * - X: Inputs, of shape (any, any). - * - * Outputs: - * - dX: Gradient wrt `X`, of same shape as `X`. - */ - sigma2X = sigmoid::forward(2*X) - out = 2*sigma2X - 1 - dX = (1-out^2) * dout -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/adagrad.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml b/scripts/staging/SystemML-NN/nn/optim/adagrad.dml deleted file mode 100644 index 85b1c41..0000000 --- a/scripts/staging/SystemML-NN/nn/optim/adagrad.dml +++ /dev/null @@ -1,77 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Adagrad optimizer. - */ - -update = function(matrix[double] X, matrix[double] dX, double lr, double epsilon, - matrix[double] cache) - return (matrix[double] X, matrix[double] cache) { - /* - * Performs an Adagrad update. - * - * This is an adaptive learning rate optimizer that maintains the - * sum of squared gradients to automatically adjust the effective - * learning rate. - * - * Reference: - * - Adaptive Subgradient Methods for Online Learning and Stochastic - * Optimization, Duchi et al. - * - http://jmlr.org/papers/v12/duchi11a.html - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - dX: Gradient wrt `X` of a loss function being optimized, of - * same shape as `X`. - * - lr: Learning rate. - * - epsilon: Smoothing term to avoid divide by zero errors. - * Typical values are in the range of [1e-8, 1e-4]. - * - cache: State that maintains per-parameter sum of squared - * gradients, of same shape as `X`. - * - * Outputs: - * - X: Updated parameters `X`, of same shape as input `X`. - * - cache: State that maintains per-parameter sum of squared - * gradients, of same shape as `X`. - */ - cache = cache + dX^2 - X = X - (lr * dX / (sqrt(cache)+epsilon)) -} - -init = function(matrix[double] X) - return (matrix[double] cache) { - /* - * Initialize the state for this optimizer. - * - * Note: This is just a convenience function, and state - * may be initialized manually if needed. - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - * Outputs: - * - cache: State that maintains per-parameter sum of squared - * gradients, of same shape as `X`. - */ - cache = matrix(0, rows=nrow(X), cols=ncol(X)) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/adam.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/adam.dml b/scripts/staging/SystemML-NN/nn/optim/adam.dml deleted file mode 100644 index 4b6fa2a..0000000 --- a/scripts/staging/SystemML-NN/nn/optim/adam.dml +++ /dev/null @@ -1,97 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. 
See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Adam optimizer. - */ - -update = function(matrix[double] X, matrix[double] dX, double lr, double beta1, double beta2, - double epsilon, int t, matrix[double] m, matrix[double] v) - return (matrix[double] X, matrix[double] m, matrix[double] v) { - /* - * Performs an Adam update. - * - * Reference: - * - Adam: A Method for Stochastic Optimization, Kingma, Ba. - * - http://arxiv.org/abs/1412.6980 - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - dX: Gradient wrt `X` of a loss function being optimized, of - * same shape as `X`. - * - lr: Learning rate. Recommended value is 0.001. - * - beta1: Exponential decay rate for the 1st moment estimates. - * Recommended value is 0.9. - * - beta2: Exponential decay rate for the 2nd moment estimates. - * Recommended value is 0.999. - * - epsilon: Smoothing term to avoid divide by zero errors. - * Recommended value is 1e-8. - * - t: Timestep, starting at 0. - * - m: State containing the 1st moment (mean) estimate by - * maintaining exponential moving averages of the gradients, of - * same shape as `X`. - * - v: State containing the 2nd raw moment (uncentered variance) - * estimate by maintaining exponential moving averages of the - * squared gradients, of same shape as `X`. - * - * Outputs: - * - X: Updated parameters `X`, of same shape as input `X`. - * - m: Updated state containing the 1st moment (mean) estimate by - * maintaining exponential moving averages of the gradients, of - * same shape as `X`. - * - v: Updated state containing the 2nd raw moment (uncentered - * variance) estimate by maintaining exponential moving averages - * of the squared gradients, of same shape as `X`. - */ - t = t + 1 - m = beta1*m + (1-beta1)*dX # update biased 1st moment estimate - v = beta2*v + (1-beta2)*dX^2 # update biased 2nd raw moment estimate - # m = m / (1-beta1^t) # compute bias-corrected 1st moment estimate - # v = v / (1-beta2^t) # compute bias-corrected 2nd raw moment estimate - # X = X - (lr * m / (sqrt(v)+epsilon)) # param update - # Simplified for computational efficiency: - lr = lr * sqrt(1-beta2^t) / (1-beta1^t) - X = X - (lr * m / (sqrt(v)+epsilon)) -} - -init = function(matrix[double] X) - return (matrix[double] m, matrix[double] v) { - /* - * Initialize the state for this optimizer. - * - * Note: This is just a convenience function, and state - * may be initialized manually if needed. - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - * Outputs: - * - m: Initial state containing the 1st moment (mean) estimate by - * maintaining exponential moving averages of the gradients, of - * same shape as `X`. - * - v: Initial state containing the 2nd raw moment (uncentered - * variance) estimate by maintaining exponential moving averages - * of the squared gradients, of same shape as `X`. 
- */ - m = matrix(0, rows=nrow(X), cols=ncol(X)) - v = matrix(0, rows=nrow(X), cols=ncol(X)) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml b/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml deleted file mode 100644 index 1feccaf..0000000 --- a/scripts/staging/SystemML-NN/nn/optim/rmsprop.dml +++ /dev/null @@ -1,79 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * RMSprop optimizer. - */ - -update = function(matrix[double] X, matrix[double] dX, double lr, double decay_rate, - double epsilon, matrix[double] cache) - return (matrix[double] X, matrix[double] cache) { - /* - * Performs an RMSprop update. - * - * This is an adaptive learning rate optimizer that can be viewed - * as an adjustment of the Adagrad method to use a moving average - * of the sum of squared gradients in order to improve convergence. - * - * Reference: - * - Neural Networks for Machine Learning, Lecture 6a, Hinton, - * slide 29. - * - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - dX: Gradient wrt `X` of a loss function being optimized, of - * same shape as `X`. - * - lr: Learning rate. - * - decay_rate: Term controlling the rate of the moving average. - * Typical values are in the range of [0.9, 0.999]. - * - epsilon: Smoothing term to avoid divide by zero errors. - * Typical values are in the range of [1e-8, 1e-4]. - * - cache: State that maintains the moving average of the squared - * gradients, of same shape as `X`. - * - * Outputs: - * - X: Updated parameters `X`, of same shape as input `X`. - * - cache: Updated state that maintains the moving average of the - * squared gradients, of same shape as `X`. - */ - cache = decay_rate*cache + (1-decay_rate)*dX^2 - X = X - (lr * dX / (sqrt(cache)+epsilon)) -} - -init = function(matrix[double] X) - return (matrix[double] cache) { - /* - * Initialize the state for this optimizer. - * - * Note: This is just a convenience function, and state - * may be initialized manually if needed. - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - * Outputs: - * - cache: State that maintains the moving average of the squared - * gradients, of same shape as `X`. 
- */ - cache = matrix(0, rows=nrow(X), cols=ncol(X)) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd.dml b/scripts/staging/SystemML-NN/nn/optim/sgd.dml deleted file mode 100644 index 3ba7eba..0000000 --- a/scripts/staging/SystemML-NN/nn/optim/sgd.dml +++ /dev/null @@ -1,42 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Stochastic Gradient Descent (SGD) optimizer. - */ - -update = function(matrix[double] X, matrix[double] dX, double lr) - return (matrix[double] X) { - /* - * Performs a vanilla SGD update. - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - dX: Gradient wrt `X` of a loss function being optimized, of - * same shape as `X`. - * - lr: Learning rate. - * - * Outputs: - * - X: Updated parameters `X`, of same shape as input `X`. - */ - X = X - lr*dX -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml deleted file mode 100644 index 85922da..0000000 --- a/scripts/staging/SystemML-NN/nn/optim/sgd_momentum.dml +++ /dev/null @@ -1,71 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Stochastic Gradient Descent with momentum (SGD-momentum) optimizer. 
- */ - -update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v) - return (matrix[double] X, matrix[double] v) { - /* - * Performs an SGD update with momentum. - * - * In SGD with momentum, we assume that the parameters have a velocity - * that continues with some momentum, and that is influenced by the - * gradient. - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - dX: Gradient wrt `X` of a loss function being optimized, of - * same shape as `X`. - * - lr: Learning rate. - * - mu: Momentum value. - * Typical values are in the range of [0.5, 0.99], usually - * started at the lower end and annealed towards the higher end. - * - v: State maintaining the velocity of the parameters `X`, of same - * shape as `X`. - * - * Outputs: - * - X: Updated parameters `X`, of same shape as input `X`. - * - v: Updated velocity of the parameters `X`, of same shape as - * input `X`. - */ - v = mu*v - lr*dX # update velocity - X = X + v # update position -} - -init = function(matrix[double] X) - return (matrix[double] v) { - /* - * Initialize the state for this optimizer. - * - * Note: This is just a convenience function, and state - * may be initialized manually if needed. - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - * Outputs: - * - v: Initial velocity of the parameters `X`. - */ - v = matrix(0, rows=nrow(X), cols=ncol(X)) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml b/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml deleted file mode 100644 index 3b62c6e..0000000 --- a/scripts/staging/SystemML-NN/nn/optim/sgd_nesterov.dml +++ /dev/null @@ -1,81 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * Stochastic Gradient Descent with Nesterov momentum (SGD-Nesterov) optimizer. - */ - -update = function(matrix[double] X, matrix[double] dX, double lr, double mu, matrix[double] v) - return (matrix[double] X, matrix[double] v) { - /* - * Performs an SGD update with Nesterov momentum. - * - * As with regular SGD with momentum, in SGD with Nesterov momentum, - * we assume that the parameters have a velocity that continues - * with some momentum, and that is influenced by the gradient. - * In this view specifically, we perform the position update from the - * position that the momentum is about to carry the parameters to, - * rather than from the previous position. 
Additionally, we always - * store the parameters in their position after momentum. - * - * Reference: - * - Advances in optimizing Recurrent Networks, Bengio et al., - * section 3.5. - * - http://arxiv.org/abs/1212.0901 - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - dX: Gradient wrt `X` of a loss function being optimized, of - * same shape as `X`. - * - lr: Learning rate. - * - mu: Momentum value. - * Typical values are in the range of [0.5, 0.99], usually - * started at the lower end and annealed towards the higher end. - * - v: State maintaining the velocity of the parameters `X`, of same - * shape as `X`. - * - * Outputs: - * - X: Updated parameters X, of same shape as input X. - * - v: Updated velocity of the parameters X, of same shape as - * input v. - */ - v_prev = v - v = mu*v - lr*dX # update velocity - X = X - mu*v_prev + (1+mu)*v # update position, including momentum -} - -init = function(matrix[double] X) - return (matrix[double] v) { - /* - * Initialize the state for this optimizer. - * - * Note: This is just a convenience function, and state - * may be initialized manually if needed. - * - * Inputs: - * - X: Parameters to update, of shape (any, any). - * - * Outputs: - * - v: Initial velocity of the parameters `X`. - */ - v = matrix(0, rows=nrow(X), cols=ncol(X)) -} - http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/README.md ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/test/README.md b/scripts/staging/SystemML-NN/nn/test/README.md deleted file mode 100644 index b714d50..0000000 --- a/scripts/staging/SystemML-NN/nn/test/README.md +++ /dev/null @@ -1,32 +0,0 @@ -<!-- -{% comment %} -Licensed to the Apache Software Foundation (ASF) under one or more -contributor license agreements. See the NOTICE file distributed with -this work for additional information regarding copyright ownership. -The ASF licenses this file to you under the Apache License, Version 2.0 -(the "License"); you may not use this file except in compliance with -the License. You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -{% endcomment %} ---> - -# SystemML-NN Tests - -#### This folder contains tests for the *SystemML-NN* (`nn`) deep learning library. - ---- -## Tests -#### All layers are tested for correct derivatives ("gradient-checking"), and many layers also have correctness tests against simpler reference implementations. -* `grad_check.dml` - Contains gradient-checks for all layers as individual DML functions. -* `test.dml` - Contains correctness tests for several of the more complicated layers by checking against simple reference implementations, such as `conv_simple.dml`. All tests are formulated as individual DML functions. -* `tests.dml` - A DML script that runs all of the tests in `grad_check.dml` and `test.dml`. - -## Execution -* `spark-submit SystemML.jar -f nn/test/tests.dml` from the base of the project. 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml ---------------------------------------------------------------------- diff --git a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml b/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml deleted file mode 100644 index 9f126d0..0000000 --- a/scripts/staging/SystemML-NN/nn/test/conv2d_simple.dml +++ /dev/null @@ -1,213 +0,0 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -/* - * 2D Convolutional layer. - * - * This implementation is intended to be a simple, reference version. - */ - -forward = function(matrix[double] X, matrix[double] W, matrix[double] b, - int C, int Hin, int Win, int Hf, int Wf, - int strideh, int stridew, int padh, int padw) - return (matrix[double] out, int Hout, int Wout) { - /* - * Computes the forward pass for a 2D spatial convolutional layer with - * F filters. The input data has N examples, each represented as a 3D - * volume unrolled into a single vector. - * - * This implementation is intended to be a simple, reference version. - * - * Inputs: - * - X: Inputs, of shape (N, C*Hin*Win). - * - W: Weights, of shape (F, C*Hf*Wf). - * - b: Biases, of shape (F, 1). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - Hf: Filter height. - * - Wf: Filter width. - * - strideh: Stride over height. - * - stridew: Stride over width. - * - padh: Padding for top and bottom sides. - * - padw: Padding for left and right sides. - * - * Outputs: - * - out: Outputs, of shape (N, F*Hout*Wout). - * - Hout: Output height. - * - Wout: Output width. 
- */ - N = nrow(X) - F = nrow(W) - Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) - Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) - - # Create output volume - out = matrix(0, rows=N, cols=F*Hout*Wout) - - # Convolution - Simple reference implementation - parfor (n in 1:N) { # all examples - Xn = matrix(X[n,], rows=C, cols=Hin*Win) - # Pad image - Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros - parfor (c in 1:C) { - Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped - Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) - Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice - Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape - } - # Convolve image with filters - parfor (f in 1:F, check=0) { # all filters - parfor (hout in 1:Hout, check=0) { # all output rows - h0 = (hout-1)*strideh + 1 - parfor (wout in 1:Wout, check=0) { # all output columns - w0 = (wout-1)*stridew + 1 - # Create a patch of the input example corresponding spatially to the filter sizes - Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf) # zeros - parfor (c in 1:C, check=0) { - Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) # reshape - Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], rows=1, - cols=Hf*Wf) # reshape - } - out[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] = - W[f,] %*% matrix(Xn_padded_patch, rows=C*Hf*Wf, cols=1) + b[f,] - } - } - } - } -} - -backward = function(matrix[double] dout, int Hout, int Wout, - matrix[double] X, matrix[double] W, matrix[double] b, - int C, int Hin, int Win, int Hf, int Wf, - int strideh, int stridew, int padh, int padw) - return (matrix[double] dX, matrix[double] dW, matrix[double] db) { - /* - * Computes the backward pass for a 2D spatial convolutional layer - * with F filters. - * - * This implementation is intended to be a simple, reference version. - * - * Inputs: - * - dout: Gradient wrt `out` from upstream, of - * shape (N, F*Hout*Wout). - * - Hout: Output height. - * - Wout: Output width. - * - X: Inputs, of shape (N, C*Hin*Win). - * - W: Weights, of shape (F, C*Hf*Wf). - * - b: Biases, of shape (F, 1). - * - C: Number of input channels (dimensionality of input depth). - * - Hin: Input height. - * - Win: Input width. - * - Hf: Filter height. - * - Wf: Filter width. - * - strideh: Stride over height. - * - stridew: Stride over width. - * - padh: Padding for top and bottom sides. - * - padw: Padding for left and right sides. - * - * Outputs: - * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). - * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf). - * - db: Gradient wrt `b`, of shape (F, 1). 
- */ - N = nrow(X) - F = nrow(W) - - # Create gradient volumes - dX = matrix(0, rows=N, cols=C*Hin*Win) - dW = matrix(0, rows=F, cols=C*Hf*Wf) - db = matrix(0, rows=F, cols=1) - - # Partial derivatives for convolution - Simple reference implementation - for (n in 1:N) { # all examples - Xn = matrix(X[n,], rows=C, cols=Hin*Win) - # Pad image - Xn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) # zeros - parfor (c in 1:C) { - Xn_slice = matrix(Xn[c,], rows=Hin, cols=Win) # depth slice C reshaped - Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) - Xn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] = Xn_slice - Xn_padded[c,] = matrix(Xn_padded_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) # reshape - } - dXn_padded = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) - for (f in 1:F) { # all filters - for (hout in 1:Hout) { # all output rows - h0 = (hout-1) * strideh + 1 - for (wout in 1:Wout) { # all output columns - w0 = (wout-1) * stridew + 1 - # Create a patch of the input example corresponding spatially to the filter sizes - Xn_padded_patch = matrix(0, rows=C, cols=Hf*Wf) # zeros - dXn_padded_patch = matrix(W[f,] * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout], - rows=C, cols=Hf*Wf) # reshape - for (c in 1:C) { - Xn_padded_slice = matrix(Xn_padded[c,], rows=Hin+2*padh, cols=Win+2*padw) # reshape - Xn_padded_patch[c,] = matrix(Xn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf], - rows=1, cols=Hf*Wf) # reshape - dXn_padded_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw) - dXn_padded_slice[h0:h0-1+Hf, w0:w0-1+Wf] = matrix(dXn_padded_patch[c,], - rows=Hf, cols=Wf) # reshape - dXn_padded[c,] = dXn_padded[c,] + matrix(dXn_padded_slice, - rows=1, cols=(Hin+2*padh)*(Win+2*padw)) - } - dW[f,] = dW[f,] - + matrix(Xn_padded_patch, rows=1, cols=C*Hf*Wf) - * dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] - db[f,] = db[f,] + dout[n, (f-1)*Hout*Wout + (hout-1)*Wout + wout] - } - } - } - # Unpad derivs on input - dXn = matrix(0, rows=C, cols=Hin*Win) - parfor (c in 1:C, check=0) { - dXn_padded_slice = matrix(dXn_padded[c,], rows=(Hin+2*padh), cols=(Win+2*padw)) - dXn_slice = dXn_padded_slice[padh+1:padh+Hin, padw+1:padw+Win] - dXn[c,] = matrix(dXn_slice, rows=1, cols=Hin*Win) - } - dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win) - } -} - -init = function(int F, int C, int Hf, int Wf) - return (matrix[double] W, matrix[double] b) { - /* - * Initialize the parameters of this layer. - * - * We use the heuristic by He et al., which limits the magnification - * of inputs/gradients during forward/backward passes by scaling - * unit-Gaussian weights by a factor of sqrt(2/n), under the - * assumption of relu neurons. - * - http://arxiv.org/abs/1502.01852 - * - * Inputs: - * - F: Number of filters. - * - C: Number of input channels (dimensionality of depth). - * - Hf: Filter height. - * - Wf: Filter width. - * - * Outputs: - * - W: Weights, of shape (F, C*Hf*Wf). - * - b: Biases, of shape (F, 1). - */ - W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf)) - b = matrix(0, rows=F, cols=1) -} -
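As with the layers above, the following is a minimal, hypothetical sketch of exercising the deleted reference convolution; the toy dimensions, the stride/pad choices, and the all-ones upstream gradient are illustrative assumptions only.

    source("nn/test/conv2d_simple.dml") as conv2d_simple

    N = 2      # examples
    C = 3      # input channels
    Hin = 8    # input height
    Win = 8    # input width
    F = 4      # filters
    Hf = 3     # filter height
    Wf = 3     # filter width
    stride = 1
    pad = 1    # "same" padding for a 3x3 filter with stride 1

    X = rand(rows=N, cols=C*Hin*Win)
    [W, b] = conv2d_simple::init(F, C, Hf, Wf)

    [out, Hout, Wout] = conv2d_simple::forward(X, W, b, C, Hin, Win, Hf, Wf,
                                               stride, stride, pad, pad)

    dout = matrix(1, rows=N, cols=F*Hout*Wout)   # placeholder upstream gradient
    [dX, dW, db] = conv2d_simple::backward(dout, Hout, Wout, X, W, b, C, Hin, Win,
                                           Hf, Wf, stride, stride, pad, pad)
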
