http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/batch_norm2d.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/batch_norm2d.dml b/scripts/nn/layers/batch_norm2d.dml new file mode 100644 index 0000000..49c6746 --- /dev/null +++ b/scripts/nn/layers/batch_norm2d.dml @@ -0,0 +1,238 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * 2D (Spatial) Batch Normalization layer. + */ +source("nn/util.dml") as util + +forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta, + int C, int Hin, int Win, string mode, + matrix[double] ema_mean, matrix[double] ema_var, + double mu, double epsilon) + return (matrix[double] out, matrix[double] ema_mean_upd, matrix[double] ema_var_upd, + matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm) { + /* + * Computes the forward pass for a 2D (spatial) batch normalization + * layer. The input data has N examples, each represented as a 3D + * volume unrolled into a single vector. + * + * A spatial batch normalization layer uses the per-channel sample + * mean and per-channel uncorrected sample variance during training + * to normalize each channel of the input data. Additionally, it + * introduces learnable parameters (gamma, beta) to control the + * amount of normalization. + * + * `y = ((x-mean) / sqrt(var+eps)) * gamma + beta` + * + * This implementation maintains exponential moving averages of the + * mean and variance during training for use during testing. + * + * Reference: + * - Batch Normalization: Accelerating Deep Network Training by + * Reducing Internal Covariate Shift, S. Ioffe & C. Szegedy, 2015 + * - https://arxiv.org/abs/1502.03167 + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - gamma: Scale parameters, of shape (C, 1). + * - beta: Shift parameters, of shape (C, 1). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - mode: 'train' or 'test' to indicate if the model is currently + * being trained or tested. During training, the current batch + * mean and variance will be used to normalize the inputs, while + * during testing, the exponential average of the mean and + * variance over all previous batches will be used. + * - ema_mean: Exponential moving average of the mean, of + * shape (C, 1). + * - ema_var: Exponential moving average of the variance, of + * shape (C, 1). + * - mu: Momentum value for moving averages. + * Typical values are in the range of [0.9, 0.999]. + * - epsilon: Smoothing term to avoid divide by zero errors. 
+ * Typical values are in the range of [1e-5, 1e-3]. + * + * Outputs: + * - out: Outputs, of shape (N, C*Hin*Win). + * - ema_mean_upd: Updated exponential moving average of the mean, + * of shape (C, 1). + * - ema_var_upd: Updated exponential moving average of the variance, + * of shape (C, 1). + * - cache_mean: Cache of the batch mean, of shape (C, 1). + * Note: This is used for performance during training. + * - cache_var: Cache of the batch variance, of shape (C, 1). + * Note: This is used for performance during training. + * - cache_norm: Cache of the normalized inputs, of + * shape (C, N*Hin*Win). Note: This is used for performance + * during training. + */ + N = nrow(X) + + if (mode == 'train') { + # Compute channel-wise mean and variance + # Since we don't have tensors, we will compute the means and variances in a piece-wise fashion. + # - mean of total group is mean of subgroup means + # - variance is the mean of the subgroup variances + the variance of the subgroup means + subgrp_means = matrix(colMeans(X), rows=C, cols=Hin*Win) + subgrp_vars = matrix(colVars(X) * ((N-1)/N), rows=C, cols=Hin*Win) # uncorrected variances + mean = rowMeans(subgrp_means) # shape (C, 1) + var = rowMeans(subgrp_vars) + rowVars(subgrp_means)*(((Hin*Win)-1)/(Hin*Win)) # shape (C, 1) + # Update moving averages + ema_mean_upd = mu*ema_mean + (1-mu)*mean + ema_var_upd = mu*ema_var + (1-mu)*var + } + else { + # Use moving averages of mean and variance during testing + mean = ema_mean + var = ema_var + ema_mean_upd = ema_mean + ema_var_upd = ema_var + } + + # Normalize, shift, and scale + # norm = (X-mean)*(var+epsilon)^(-1/2) + # = (X-mean) / sqrt(var+epsilon) + centered = bias_add(X, -mean) # shape (N, C*Hin*Win) + norm = bias_multiply(centered, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win) + # out = norm*gamma + beta + scaled = bias_multiply(norm, gamma) # shape (N, C*Hin*Win) + out = bias_add(scaled, beta) # shape (N, C*Hin*Win) + + # Save variable for backward pass + cache_mean = mean + cache_var = var + cache_norm = norm +} + +backward = function(matrix[double] dout, matrix[double] out, + matrix[double] ema_mean_upd, matrix[double] ema_var_upd, + matrix[double] cache_mean, matrix[double] cache_var, matrix[double] cache_norm, + matrix[double] X, matrix[double] gamma, matrix[double] beta, + int C, int Hin, int Win, string mode, + matrix[double] ema_mean, matrix[double] ema_var, + double mu, double epsilon) + return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) { + /* + * Computes the backward pass for a 2D (spatial) batch normalization + * layer. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win). + * - out: Outputs from the forward pass, of shape (N, C*Hin*Win). + * - ema_mean_upd: Updated exponential moving average of the mean + * from the forward pass, of shape (C, 1). + * - ema_var_upd: Updated exponential moving average of the variance + * from the forward pass, of shape (C, 1). + * - cache_mean: Cache of the batch mean from the forward pass, of + * shape (C, 1). Note: This is used for performance during + * training. + * - cache_var: Cache of the batch variance from the forward pass, + * of shape (C, 1). Note: This is used for performance during + * training. + * - cache_norm: Cache of the normalized inputs from the forward + * pass, of shape (C, N*Hin*Win). Note: This is used for + * performance during training. + * - X: Input data matrix to the forward pass, of + * shape (N, C*Hin*Win). + * - gamma: Scale parameters, of shape (C, 1). 
+ * - beta: Shift parameters, of shape (C, 1). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - mode: 'train' or 'test' to indicate if the model is currently + * being trained or tested. During training, the current batch + * mean and variance will be used to normalize the inputs, while + * during testing, the exponential average of the mean and + * variance over all previous batches will be used. + * - ema_mean: Exponential moving average of the mean, of + * shape (C, 1). + * - ema_var: Exponential moving average of the variance, of + * shape (C, 1). + * - mu: Momentum value for moving averages. + * Typical values are in the range of [0.9, 0.999]. + * - epsilon: Smoothing term to avoid divide by zero errors. + * Typical values are in the range of [1e-5, 1e-3]. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + * - dgamma: Gradient wrt `gamma`, of shape (C, 1). + * - dbeta: Gradient wrt `beta`, of shape (C, 1). + * + */ + N = nrow(X) + mean = cache_mean + var = cache_var + norm = cache_norm + centered = bias_add(X, -mean) # shape (N, C*Hin*Win) + + if (mode == 'train') { + # Compute gradients during training + dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1) + dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1) + dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win) + dvar = util::channel_sums((-1/2) * bias_multiply(centered, (var+epsilon)^(-3/2)) * dnorm, + C, Hin, Win) # shape (C, 1) + dmean_norm_branch = util::channel_sums(bias_multiply(dnorm, -1/sqrt(var+epsilon)), C, Hin, Win) + dmean_var_branch = util::channel_sums((-2/(N*Hin*Win)) * centered, C, Hin, Win) + dmean_var_branch = dmean_var_branch * dvar # we can't use a function within an expression yet + dmean = dmean_norm_branch + dmean_var_branch # shape (C, 1) + dX_norm_branch = bias_multiply(dnorm, 1/sqrt(var+epsilon)) + dX_mean_branch = (1/(N*Hin*Win)) * bias_add(matrix(0, rows=1, cols=C*Hin*Win), dmean) + dX_var_branch = (2/(N*Hin*Win)) * bias_multiply(centered, dvar) + dX = dX_norm_branch + dX_mean_branch + dX_var_branch # shape (N, C*Hin*Win) + } + else { + # Compute gradients during testing + dgamma = util::channel_sums(dout*norm, C, Hin, Win) # shape (C, 1) + dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1) + dnorm = bias_multiply(dout, gamma) # shape (N, C*Hin*Win) + dX = bias_multiply(dnorm, 1/sqrt(var+epsilon)) # shape (N, C*Hin*Win) + } +} + +init = function(int C) + return (matrix[double] gamma, matrix[double] beta, + matrix[double] ema_mean, matrix[double] ema_var) { + /* + * Initialize the parameters of this layer. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * Inputs: + * - C: Number of input channels (dimensionality of input depth). + * + * Outputs: + * - gamma: Scale parameters, of shape (C, 1). + * - beta: Shift parameters, of shape (C, 1). + * - ema_mean: Exponential moving average of the mean, of + * shape (C, 1). + * - ema_var: Exponential moving average of the variance, of + * shape (C, 1). + */ + gamma = matrix(1, rows=C, cols=1) + beta = matrix(0, rows=C, cols=1) + ema_mean = matrix(0, rows=C, cols=1) + ema_var = matrix(1, rows=C, cols=1) +} +
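As a quick illustration of the calling convention shared by the layers in this commit, the following minimal sketch shows how the batch_norm2d layer above could be wired into a training script through its init/forward/backward functions. The batch size, volume dimensions, momentum, epsilon, and learning rate used below are placeholder values chosen only for the example; they are not prescribed by the layer itself.

# Minimal usage sketch for nn/layers/batch_norm2d.dml (placeholder sizes and hyperparameters).
source("nn/layers/batch_norm2d.dml") as batch_norm2d

N = 4           # batch size (placeholder)
C = 3           # number of channels (placeholder)
Hin = 8         # input height (placeholder)
Win = 8         # input width (placeholder)
mu = 0.9        # momentum for the moving averages (placeholder)
epsilon = 1e-5  # numerical-stability term (placeholder)
X = rand(rows=N, cols=C*Hin*Win)  # dummy inputs, one unrolled (C, Hin, Win) volume per row

# Initialize gamma/beta and the exponential moving averages.
[gamma, beta, ema_mean, ema_var] = batch_norm2d::init(C)

# Forward pass in 'train' mode: normalizes with the batch statistics and returns
# updated moving averages plus the caches needed by the backward pass.
[out, ema_mean_upd, ema_var_upd, cache_mean, cache_var, cache_norm] = batch_norm2d::forward(X, gamma, beta, C, Hin, Win,
                                                                                            "train", ema_mean, ema_var,
                                                                                            mu, epsilon)

# Backward pass: `dout` would normally come from the layers above; a dummy gradient is used here.
dout = rand(rows=N, cols=C*Hin*Win)
[dX, dgamma, dbeta] = batch_norm2d::backward(dout, out, ema_mean_upd, ema_var_upd,
                                             cache_mean, cache_var, cache_norm,
                                             X, gamma, beta, C, Hin, Win, "train",
                                             ema_mean, ema_var, mu, epsilon)

# Plain SGD step on the learnable parameters, then roll the moving averages forward.
lr = 0.01  # learning rate (placeholder)
gamma = gamma - lr*dgamma
beta = beta - lr*dbeta
ema_mean = ema_mean_upd
ema_var = ema_var_upd

The other layers in this commit (conv2d, max_pool2d, lstm, the loss and regularization functions) follow the same forward/backward (and, where applicable, init) pattern and compose in the same way.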
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/conv2d.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/conv2d.dml b/scripts/nn/layers/conv2d.dml new file mode 100644 index 0000000..9d03568 --- /dev/null +++ b/scripts/nn/layers/conv2d.dml @@ -0,0 +1,194 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * 2D Convolutional layer. + */ +source("nn/util.dml") as util + +forward = function(matrix[double] X, matrix[double] W, matrix[double] b, + int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] out, int Hout, int Wout) { + /* + * Computes the forward pass for a 2D spatial convolutional layer with + * F filters. The input data has N examples, each represented as a 3D + * volume unrolled into a single vector. + * + * This implementation uses `im2col` internally for each image to + * extract local image regions (patches) into columns, and then + * performs a matrix multiplication with the filters to compute the + * output maps. + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * For same output height as input, set `padh = (Hf - 1) / 2`, + * assuming `strideh = 1`. + * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` + * preserves the spatial dimensions of the input. + * - padw: Padding for left and right sides. + * For same output width as input, set `padw = (Wf - 1) / 2`, + * assuming `stridew = 1`. + * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` + * preserves the spatial dimensions of the input. + * + * Outputs: + * - out: Outputs, of shape (N, F*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. 
+ */ + N = nrow(X) + F = nrow(W) + Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) + Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) + + # Create output volume + out = matrix(0, rows=N, cols=F*Hout*Wout) + + # Convolution - im2col implementation + parfor (n in 1:N) { # all examples + Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape + + # Pad image + Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw)) + + # Extract local image patches into columns with im2col, of shape (C*Hf*Wf, Hout*Wout) + Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew) + + # Convolve patches with filters + outn = W %*% Xn_padded_cols + b # shape (F, Hout*Wout) + out[n,] = matrix(outn, rows=1, cols=F*Hout*Wout) # reshape + } +} + +backward = function(matrix[double] dout, int Hout, int Wout, + matrix[double] X, matrix[double] W, matrix[double] b, + int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] dX, matrix[double] dW, matrix[double] db) { + /* + * Computes the backward pass for a 2D spatial convolutional layer + * with F filters. + * + * This implementation uses `im2col` and `col2im` internally. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of + * shape (N, F*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * - padw: Padding for left and right sides. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf). + * - db: Gradient wrt `b`, of shape (F, 1). + */ + N = nrow(X) + F = nrow(W) + + # Create gradient volumes + # Note: Create convenience gradient volumes for dW and db that will + # allow for one gradient to be stored per example, allowing for + # parallel computation at the expense of memory. We will reduce at + # the end. + dX = matrix(0, rows=N, cols=C*Hin*Win) + dWN = matrix(0, rows=N, cols=F*C*Hf*Wf) # dW = matrix(0, rows=F, cols=C*Hf*Wf) + dbN = matrix(0, rows=N, cols=F) # db = matrix(0, rows=F, cols=1) + + # Partial derivatives for convolution - im2col implementation + parfor (n in 1:N) { # all examples + doutn = matrix(dout[n,], rows=F, cols=Hout*Wout) + + # Compute dW + Xn = matrix(X[n,], rows=C, cols=Hin*Win) # reshape + Xn_padded = util::pad_image(Xn, Hin, Win, padh, padw, 0) # shape (C, (Hin+2*padh)*(Win+2*padw)) + Xn_padded_cols = util::im2col(Xn_padded, Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew) + # dW = dW + doutn %*% t(Xn_padded_cols) + dWN[n,] = matrix(doutn %*% t(Xn_padded_cols), rows=1, cols=F*C*Hf*Wf) + + # Compute db + # db = db + rowSums(doutn) + dbN[n,] = matrix(rowSums(doutn), rows=1, cols=F) + + # Compute dX + dXn_padded_cols = t(W) %*% doutn # shape (C*Hf*Wf, Hout*Wout) + dXn_padded = util::col2im(dXn_padded_cols, C, Hin+2*padh, Win+2*padw, Hf, Wf, + strideh, stridew, "add") + dXn = util::unpad_image(dXn_padded, Hin, Win, padh, padw) + dX[n,] = matrix(dXn, rows=1, cols=C*Hin*Win) # reshape + } + + # Reduce convenience gradient volumes with one gradient per example + # into single gradients for W and b. 
+ dW = matrix(colSums(dWN), rows=F, cols=C*Hf*Wf) + db = matrix(colSums(dbN), rows=F, cols=1) +} + +init = function(int F, int C, int Hf, int Wf) + return (matrix[double] W, matrix[double] b) { + /* + * Initialize the parameters of this layer. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 + * + * Inputs: + * - F: Number of filters. + * - C: Number of input channels (dimensionality of depth). + * - Hf: Filter height. + * - Wf: Filter width. + * + * Outputs: + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + */ + W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf)) + b = matrix(0, rows=F, cols=1) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/conv2d_builtin.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/conv2d_builtin.dml b/scripts/nn/layers/conv2d_builtin.dml new file mode 100644 index 0000000..bda7a9c --- /dev/null +++ b/scripts/nn/layers/conv2d_builtin.dml @@ -0,0 +1,160 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * 2D Convolutional layer. + * + * This implementation uses a built-in operator for higher performance. + */ + +forward = function(matrix[double] X, matrix[double] W, matrix[double] b, + int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] out, int Hout, int Wout) { + /* + * Computes the forward pass for a 2D spatial convolutional layer with + * F filters. The input data has N examples, each represented as a 3D + * volume unrolled into a single vector. + * + * This implementation uses a built-in operator for higher + * performance. + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * For same output height as input, set `padh = (Hf - 1) / 2`, + * assuming `strideh = 1`. + * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` + * preserves the spatial dimensions of the input. 
+ * - padw: Padding for left and right sides. + * For same output width as input, set `padw = (Wf - 1) / 2`, + * assuming `stridew = 1`. + * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` + * preserves the spatial dimensions of the input. + * + * Outputs: + * - out: Outputs, of shape (N, F*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + */ + N = nrow(X) + F = nrow(W) + Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) + Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) + + # Convolution - built-in implementation + out = conv2d(X, W, input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf], + stride=[strideh,stridew], padding=[padh,padw]) + + # Add bias term to each output filter + out = bias_add(out, b) +} + +backward = function(matrix[double] dout, int Hout, int Wout, + matrix[double] X, matrix[double] W, matrix[double] b, + int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] dX, matrix[double] dW, matrix[double] db) { + /* + * Computes the backward pass for a 2D spatial convolutional layer + * with F filters. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of + * shape (N, F*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + * - X: Inputs, of shape (N, C*Hin*Win). + * - W: Weights, of shape (F, C*Hf*Wf). + * - b: Biases, of shape (F, 1). + * - C: Number of input channels (dimensionality of depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * For same output height as input, set `padh = (Hf - 1) / 2`, + * assuming `strideh = 1`. + * More generally, `padh = (Hin*(strideh-1) + Hf - strideh) / 2` + * preserves the spatial dimensions of the input. + * - padw: Padding for left and right sides. + * For same output width as input, set `padw = (Wf - 1) / 2`, + * assuming `stridew = 1`. + * More generally, `padw = (Win*(stridew-1) + Wf - stridew) / 2` + * preserves the spatial dimensions of the input. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + * - dW: Gradient wrt `W`, of shape (F, C*Hf*Wf). + * - db: Gradient wrt `b`, of shape (F, 1). + */ + N = nrow(X) + F = nrow(W) + + # Partial derivatives for convolution - built-in implementation + dW = conv2d_backward_filter(X, dout, stride=[strideh,stridew], padding=[padh,padw], + input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf]) + dX = conv2d_backward_data(W, dout, stride=[strideh, stridew], padding=[padh,padw], + input_shape=[N,C,Hin,Win], filter_shape=[F,C,Hf,Wf]) + + # Partial derivatives for bias vector + db = rowSums(matrix(colSums(dout), rows=F, cols=Hout*Wout)) +} + +init = function(int F, int C, int Hf, int Wf) + return (matrix[double] W, matrix[double] b) { + /* + * Initialize the parameters of this layer. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * We use the heuristic by He et al., which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * unit-Gaussian weights by a factor of sqrt(2/n), under the + * assumption of relu neurons. + * - http://arxiv.org/abs/1502.01852 + * + * Inputs: + * - F: Number of filters. + * - C: Number of input channels (dimensionality of depth). + * - Hf: Filter height. + * - Wf: Filter width. + * + * Outputs: + * - W: Weights, of shape (F, C*Hf*Wf). 
+ * - b: Biases, of shape (F, 1). + */ + W = rand(rows=F, cols=C*Hf*Wf, pdf="normal") * sqrt(2.0/(C*Hf*Wf)) + b = matrix(0, rows=F, cols=1) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/cross_entropy_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/cross_entropy_loss.dml b/scripts/nn/layers/cross_entropy_loss.dml new file mode 100644 index 0000000..63db502 --- /dev/null +++ b/scripts/nn/layers/cross_entropy_loss.dml @@ -0,0 +1,78 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Cross-Entropy loss function. + */ + +forward = function(matrix[double] pred, matrix[double] y) + return (double loss) { + /* + * Computes the forward pass for a cross-entropy loss function. The + * inputs consist of N examples, each with K dimensions corresponding + * to normalized probabilities of K classes. + * + * ``` + * L_i = -y_i^T * log(pred_i) + * L = (1/N) sum(L_i) for i=1 to N + * ``` + * + * In these equations, `L` is the total loss, `L_i` is the loss for + * example `i`, `y_i` is the K-dimensional vector of target class + * probabilities, `pred_i` is K-dimensional vector of predicted + * class probabilities, and `N` is the number of examples. + * + * This can be interpreted as the negative log-likelihood assuming + * a Bernoulli distribution generalized to K dimensions, or a + * Multinomial with one observation. + * + * Inputs: + * - pred: Predictions, of shape (N, K). + * - y: Targets, of shape (N, K). + * + * Outputs: + * - loss: Average loss. + */ + N = nrow(y) + eps = 1e-10 # numerical stability to avoid log(0) + losses = rowSums(-y * log(pred+eps)) + loss = sum(losses) / N +} + +backward = function(matrix[double] pred, matrix[double] y) + return (matrix[double] dpred) { + /* + * Computes the backward pass of a cross-entropy loss function. The + * inputs consist of N examples, each with K dimensions corresponding + * to normalized probabilities of K classes. + * + * Inputs: + * - pred: Predictions, of shape (N, K). + * - y: Targets, of shape (N, K). + * + * Outputs: + * - dpred: Gradient wrt `pred`, of shape (N, K). 
+ */ + N = nrow(y) + eps = 1e-10 # numerical stability to avoid divide-by-zero + dpred = (1/N) * -y * (1/(pred+eps)) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/dropout.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/dropout.dml b/scripts/nn/layers/dropout.dml new file mode 100644 index 0000000..a36878b --- /dev/null +++ b/scripts/nn/layers/dropout.dml @@ -0,0 +1,76 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Dropout layer. + */ + +forward = function(matrix[double] X, double p, int seed) + return (matrix[double] out, matrix[double] mask) { + /* + * Computes the forward pass for an inverted dropout layer. + * + * Drops the inputs element-wise with a probability p, and divides + * by p to maintain the expected values of those inputs (which are + * the outputs of neurons) at test time. + * + * Inputs: + * - X: Inputs, of shape (any, any). + * - p: Probability of keeping a neuron output. + * - seed: [Optional: -1] Random number generator seed to allow for + * deterministic evaluation. Set to -1 for a random seed. + * + * Outputs: + * - out: Outputs, of same shape as `X`. + * - mask: Dropout mask used to compute the output. + */ + # Normally, we might use something like + # `mask = rand(rows=nrow(X), cols=ncol(X), min=0, max=1, seed=seed) <= p` + # to create a dropout mask. Fortunately, SystemML has a `sparsity` parameter on + # the `rand` function that allows use to create a mask directly. + if (seed == -1) { + mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p) + } else { + mask = rand(rows=nrow(X), cols=ncol(X), min=1, max=1, sparsity=p, seed=seed) + } + out = X * mask / p +} + +backward = function(matrix[double] dout, matrix[double] X, double p, matrix[double] mask) + return (matrix[double] dX) { + /* + * Computes the backward pass for an inverted dropout layer. + * + * Applies the mask to the upstream gradient, and divides by p to + * maintain the expected values at test time. + * + * Inputs: + * - dout: Gradient wrt `out`, of same shape as `X`. + * - X: Inputs, of shape (any, any). + * - p: Probability of keeping a neuron output. + * - mask: Dropout mask used to compute the output. + * + * Outputs: + * - dX: Gradient wrt `X`, of same shape as `X`. 
+ */ + dX = mask / p * dout +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l1_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/l1_loss.dml b/scripts/nn/layers/l1_loss.dml new file mode 100644 index 0000000..b74566d --- /dev/null +++ b/scripts/nn/layers/l1_loss.dml @@ -0,0 +1,72 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * L1 loss function. + */ + +forward = function(matrix[double] pred, matrix[double] y) + return (double loss) { + /* + * Computes the forward pass for an L1 loss function. The inputs + * consist of N examples, each with M dimensions to predict. + * + * ``` + * L_i = sum_j(abs((pred_i)_j - (y_i)_j)) for all j. + * L = (1/N) sum(L_i) for i=1 to N + * ``` + * + * In these equations, `L` is the total loss, `L_i` is the loss for + * example `i`, `y_i` is the scalar target, `pred_i` is the scalar + * prediction, and `N` is the number of examples. + * + * This can be interpreted as the negative log-likelihood assuming + * a Laplace distribution. + * + * Inputs: + * - pred: Predictions, of shape (N, M). + * - y: Targets, of shape (N, M). + * + * Outputs: + * - loss: Average loss. + */ + N = nrow(y) + losses = rowSums(abs(pred-y)) + loss = sum(losses) / N +} + +backward = function(matrix[double] pred, matrix[double] y) + return (matrix[double] dpred) { + /* + * Computes the backward pass for an L1 loss function. The inputs + * consist of N examples, each with M dimensions to predict. + * + * Inputs: + * - pred: Predictions, of shape (N, M). + * - y: Targets, of shape (N, M). + * + * Outputs: + * - dpred: Gradient wrt `pred`, of shape (N, M). + */ + N = nrow(y) + dpred = sign(pred-y) / N +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l1_reg.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/l1_reg.dml b/scripts/nn/layers/l1_reg.dml new file mode 100644 index 0000000..2b81c0b --- /dev/null +++ b/scripts/nn/layers/l1_reg.dml @@ -0,0 +1,56 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * L1 regularization. + */ + +forward = function(matrix[double] X, double lambda) + return (double reg_loss) { + /* + * Computes the forward pass for an L1 regularization function. + * + * Inputs: + * - X: Inputs, of shape (any, any). + * - lambda: Regularization strength. + * A typical value is 0.01. + * + * Outputs: + * - reg_loss: Total regularization loss. + */ + reg_loss = lambda * sum(abs(X)) +} + +backward = function(matrix[double] X, double lambda) + return (matrix[double] dX) { + /* + * Computes the backward pass for an L1 regularization function. + * + * Inputs: + * - X: Inputs, of shape (any, any). + * - lambda: Regularization strength. + * + * Outputs: + * - dX: Gradient wrt `X`, of same shape as `X`. + */ + dX = lambda * sign(X) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l2_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/l2_loss.dml b/scripts/nn/layers/l2_loss.dml new file mode 100644 index 0000000..0482f25 --- /dev/null +++ b/scripts/nn/layers/l2_loss.dml @@ -0,0 +1,72 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * L2 loss function. + */ + +forward = function(matrix[double] pred, matrix[double] y) + return (double loss) { + /* + * Computes the forward pass for an L2 loss function. The inputs + * consist of N examples, each with M dimensions to predict. + * + * ``` + * L_i = (1/2) norm(pred_i - y_i)^2 + * L = (1/N) sum(L_i) for i=1 to N + * ``` + * + * In these equations, `L` is the total loss, `L_i` is the loss for + * example `i`, `y_i` is the scalar target, `pred_i` is the scalar + * prediction, and `N` is the number of examples. + * + * This can be interpreted as the negative log-likelihood assuming + * a Gaussian distribution. + * + * Inputs: + * - pred: Predictions, of shape (N, M). + * - y: Targets, of shape (N, M). + * + * Outputs: + * - loss: Average loss. + */ + N = nrow(y) + losses = 0.5 * rowSums((pred-y)^2) + loss = sum(losses) / N +} + +backward = function(matrix[double] pred, matrix[double] y) + return (matrix[double] dpred) { + /* + * Computes the backward pass for an L2 loss function. 
The inputs + * consist of N examples, each with M dimensions to predict. + * + * Inputs: + * - pred: Predictions, of shape (N, M). + * - y: Targets, of shape (N, M). + * + * Outputs: + * - dpred: Gradient wrt `pred`, of shape (N, M). + */ + N = nrow(y) + dpred = (pred-y) / N +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/l2_reg.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/l2_reg.dml b/scripts/nn/layers/l2_reg.dml new file mode 100644 index 0000000..7255efe --- /dev/null +++ b/scripts/nn/layers/l2_reg.dml @@ -0,0 +1,56 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * L2 regularization. + */ + +forward = function(matrix[double] X, double lambda) + return (double reg_loss) { + /* + * Computes the forward pass for an L2 regularization function. + * + * Inputs: + * - X: Inputs, of shape (any, any). + * - lambda: Regularization strength. + * A typical value is 0.01. + * + * Outputs: + * - reg_loss: Total regularization loss. + */ + reg_loss = 0.5 * lambda * sum(X^2) +} + +backward = function(matrix[double] X, double lambda) + return (matrix[double] dX) { + /* + * Computes the backward pass for an L2 regularization function. + * + * Inputs: + * - X: Inputs, of shape (any, any). + * - lambda: Regularization strength. + * + * Outputs: + * - dX: Gradient wrt `X`, of same shape as `X`. + */ + dX = lambda * X +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/log_loss.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/log_loss.dml b/scripts/nn/layers/log_loss.dml new file mode 100644 index 0000000..15914f7 --- /dev/null +++ b/scripts/nn/layers/log_loss.dml @@ -0,0 +1,76 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +/* + * Log loss function. + */ + +forward = function(matrix[double] pred, matrix[double] y) + return (double loss) { + /* + * Computes the forward pass for a log loss function. + * + * ``` + * L_i = -y_i*log(pred_i) - (1-y_i)*log(1-pred_i) + * L = (1/N) sum(L_i) for i=1 to N + * ``` + * + * In these equations, `L` is the total loss, `L_i` is the loss for + * example `i`, `y_i` is the binary target, `pred_i` is probability + * of the true class (i.e. `y=1`), and `N` is the number of examples. + * + * This can be interpreted as the negative log-likelihood assuming + * a Bernoulli distribution. + * + * Inputs: + * - pred: Predictions, of shape (N, 1). + * Predictions should be probabilities of the true + * class (i.e. probability of `y=1`). + * - y: Targets, of shape (N, 1). + * Targets should be binary in the set {0, 1}. + * + * Outputs: + * - loss: Average loss. + */ + N = nrow(y) + losses = -y*log(pred) - (1-y)*log(1-pred) + loss = sum(losses) / N +} + +backward = function(matrix[double] pred, matrix[double] y) + return (matrix[double] dpred) { + /* + * Computes the backward pass for a log loss function. + * + * Inputs: + * - pred: Predictions, of shape (N, 1). + * Predictions should be probabilities of the true + * class (i.e. probability of `y=1`). + * - y: Targets, of shape (N, 1). + * Targets should be binary in the set {0, 1}. + * + * Outputs: + * - dpred: Gradient wrt `pred`, of shape (N, 1). + */ + N = nrow(y) + dpred = (1/N) * (pred-y) / (pred*(1-pred)) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/lstm.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/lstm.dml b/scripts/nn/layers/lstm.dml new file mode 100644 index 0000000..a75add4 --- /dev/null +++ b/scripts/nn/layers/lstm.dml @@ -0,0 +1,260 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * LSTM layer. + */ +source("nn/layers/sigmoid.dml") as sigmoid +source("nn/layers/tanh.dml") as tanh + +forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D, + boolean return_sequences, matrix[double] out0, matrix[double] c0) + return (matrix[double] out, matrix[double] c, + matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) { + /* + * Computes the forward pass for an LSTM layer with M neurons. + * The input data has N sequences of T examples, each with D features. 
+ * + * In an LSTM, an internal cell state is maintained, additive + * interactions operate over the cell state at each timestep, and + * some amount of this cell state is exposed as output at each + * timestep. Additionally, the output of the previous timestep is fed + * back in as an additional input at the current timestep. + * + * Reference: + * - Long Short-Term Memory, Hochreiter, 1997 + * - http://deeplearning.cs.cmu.edu/pdfs/Hochreiter97_lstm.pdf + * + * Inputs: + * - X: Inputs, of shape (N, T*D). + * - W: Weights, of shape (D+M, 4M). + * - b: Biases, of shape (1, 4M). + * - T: Length of example sequences (number of timesteps). + * - D: Dimensionality of the input features (number of features). + * - return_sequences: Whether to return `out` at all timesteps, + * or just for the final timestep. + * - out0: Outputs from previous timestep, of shape (N, M). + * Note: This is *optional* and could just be an empty matrix. + * - c0: Initial cell state, of shape (N, M). + * Note: This is *optional* and could just be an empty matrix. + * + * Outputs: + * - out: If `return_sequences` is True, outputs for all timesteps, + * of shape (N, T*M). Else, outputs for the final timestep, of + * shape (N, M). + * - c: Cell state for final timestep, of shape (N, M). + * - cache_out: Cache of outputs, of shape (T, N*M). + * Note: This is used for performance during training. + * - cache_c: Cache of cell state, of shape (T, N*M). + * Note: This is used for performance during training. + * - cache_ifog: Cache of intermediate values, of shape (T, N*4M). + * Note: This is used for performance during training. + */ + N = nrow(X) + M = as.integer(ncol(W)/4) + out_prev = out0 + c_prev = c0 + c = c_prev + if (return_sequences) { + out = matrix(0, rows=N, cols=T*M) + } + else { + out = matrix(0, rows=N, cols=M) + } + # caches to be used during the backward pass for performance + cache_out = matrix(0, rows=T, cols=N*M) + cache_c = matrix(0, rows=T, cols=N*M) + cache_ifog = matrix(0, rows=T, cols=N*4*M) + + for (t in 1:T) { # each timestep + X_t = X[,(t-1)*D+1:t*D] # shape (N, D) + input = cbind(X_t, out_prev) # shape (N, D+M) + ifog = input %*% W + b # input, forget, output, and g gates; shape (N, 4M) + tmp = sigmoid::forward(ifog[,1:3*M]) # i,f,o gates squashed with sigmoid + ifog[,1:3*M] = tmp + tmp = tanh::forward(ifog[,3*M+1:4*M]) # g gate squashed with tanh + ifog[,3*M+1:4*M] = tmp + # c_t = f*prev_c + i*g + c = ifog[,M+1:2*M]*c_prev + ifog[,1:M]*ifog[,3*M+1:4*M] # shape (N, M) + # out_t = o*tanh(c) + tmp = tanh::forward(c) + out_t = ifog[,2*M+1:3*M] * tmp # shape (N, M) + + # store + if (return_sequences) { + out[,(t-1)*M+1:t*M] = out_t + } + else { + out = out_t + } + out_prev = out_t + c_prev = c + cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape + cache_c[t,] = matrix(c, rows=1, cols=N*M) # reshape + cache_ifog[t,] = matrix(ifog, rows=1, cols=N*4*M) # reshape + } +} + +backward = function(matrix[double] dout, matrix[double] dc, + matrix[double] X, matrix[double] W, matrix[double] b, int T, int D, + boolean given_sequences, matrix[double] out0, matrix[double] c0, + matrix[double] cache_out, matrix[double] cache_c, matrix[double] cache_ifog) + return (matrix[double] dX, matrix[double] dW, matrix[double] db, + matrix[double] dout0, matrix[double] dc0) { + /* + * Computes the backward pass for an LSTM layer with M neurons. + * + * Inputs: + * - dout: Gradient wrt `out`. If `given_sequences` is `True`, + * contains gradients on outputs for all timesteps, of + * shape (N, T*M). 
Else, contains the gradient on the output + * for the final timestep, of shape (N, M). + * - dc: Gradient wrt `c` (from later in time), of shape (N, M). + * This would come from later in time if the cell state was used + * downstream as the initial cell state for another LSTM layer. + * Typically, this would be used when a sequence was cut at + * timestep `T` and then continued in the next batch. If `c` + * was not used downstream, then `dc` would be an empty matrix. + * - X: Inputs, of shape (N, T*D). + * - W: Weights, of shape (D+M, 4M). + * - b: Biases, of shape (1, 4M). + * - T: Length of example sequences (number of timesteps). + * - D: Dimensionality of the input features. + * - given_sequences: Whether `dout` is for all timesteps, + * or just for the final timestep. This is based on whether + * `return_sequences` was true in the forward pass. + * - out0: Outputs from previous timestep, of shape (N, M). + * Note: This is *optional* and could just be an empty matrix. + * - c0: Initial cell state, of shape (N, M). + * Note: This is *optional* and could just be an empty matrix. + * - cache_out: Cache of outputs, of shape (T, N*M). + * Note: This is used for performance during training. + * - cache_c: Cache of cell state, of shape (T, N*M). + * Note: This is used for performance during training. + * - cache_ifog: Cache of intermediate values, of shape (T, N*4*M). + * Note: This is used for performance during training. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, T*D). + * - dW: Gradient wrt `W`, of shape (D+M, 4M). + * - db: Gradient wrt `b`, of shape (1, 4M). + * - dout0: Gradient wrt `out0`, of shape (N, M). + * - dc0: Gradient wrt `c0`, of shape (N, M). + */ + N = nrow(X) + M = as.integer(ncol(W)/4) + dX = matrix(0, rows=N, cols=T*D) + dW = matrix(0, rows=D+M, cols=4*M) + db = matrix(0, rows=1, cols=4*M) + dout0 = matrix(0, rows=N, cols=M) + dc0 = matrix(0, rows=N, cols=M) + dct = dc + if (!given_sequences) { + # only given dout for output at final timestep, so prepend empty douts for all other timesteps + dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout) # shape (N, T*M) + } + + t = T + for (iter in 1:T) { # each timestep in reverse order + X_t = X[,(t-1)*D+1:t*D] # shape (N, D) + dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M) + out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M) + ct = matrix(cache_c[t,], rows=N, cols=M) # shape (N, M) + if (t == 1) { + out_prev = out0 # shape (N, M) + c_prev = c0 # shape (N, M) + } + else { + out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M) + c_prev = matrix(cache_c[t-1,], rows=N, cols=M) # shape (N, M) + } + input = cbind(X_t, out_prev) # shape (N, D+M) + ifog = matrix(cache_ifog[t,], rows=N, cols=4*M) + i = ifog[,1:M] # input gate, shape (N, M) + f = ifog[,M+1:2*M] # forget gate, shape (N, M) + o = ifog[,2*M+1:3*M] # output gate, shape (N, M) + g = ifog[,3*M+1:4*M] # g gate, shape (N, M) + + tmp = tanh::backward(dout_t, ct) + dct = dct + o*tmp # shape (N, M) + tmp = tanh::forward(ct) + do = tmp * dout_t # output gate, shape (N, M) + df = c_prev * dct # forget gate, shape (N, M) + dc_prev = f * dct # shape (N, M) + di = g * dct # input gate, shape (N, M) + dg = i * dct # g gate, shape (N, M) + + di_raw = i * (1-i) * di + df_raw = f * (1-f) * df + do_raw = o * (1-o) * do + dg_raw = (1-g^2) * dg + difog_raw = cbind(di_raw, cbind(df_raw, cbind(do_raw, dg_raw))) # shape (N, 4M) + + dW = dW + t(input) %*% difog_raw # shape (D+M, 4M) + db = db + colSums(difog_raw) # shape (1, 4M) + dinput = difog_raw %*% t(W) #
shape (N, D+M) + dX[,(t-1)*D+1:t*D] = dinput[,1:D] + dout_prev = dinput[,D+1:D+M] # shape (N, M) + if (t == 1) { + dout0 = dout_prev # shape (N, M) + dc0 = dc_prev # shape (N, M) + } + else { + dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M) + dct = dc_prev # shape (N, M) + } + t = t - 1 + } +} + +init = function(int N, int D, int M) + return (matrix[double] W, matrix[double] b, matrix[double] out0, matrix[double] c0) { + /* + * Initialize the parameters of this layer. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * We use the Glorot uniform heuristic which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * uniform weights by a factor of sqrt(6/(fan_in + fan_out)). + * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf + * + * Inputs: + * - N: Number of examples in batch. + * - D: Dimensionality of the input features (number of features). + * - M: Number of neurons in this layer. + * + * Outputs: + * - W: Weights, of shape (D+M, 4M). + * - b: Biases, of shape (1, 4M). + * - out0: Empty previous timestep output matrix, of shape (N, M). + * - c0: Empty initial cell state matrix, of shape (N, M). + */ + fan_in = D+M + fan_out = 4*M + scale = sqrt(6/(fan_in+fan_out)) + W = rand(rows=D+M, cols=4*M, min=-scale, max=scale, pdf="uniform") + b = matrix(0, rows=1, cols=4*M) + out0 = matrix(0, rows=N, cols=M) + c0 = matrix(0, rows=N, cols=M) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/max_pool2d.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/max_pool2d.dml b/scripts/nn/layers/max_pool2d.dml new file mode 100644 index 0000000..fba1a4c --- /dev/null +++ b/scripts/nn/layers/max_pool2d.dml @@ -0,0 +1,159 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Max Pooling layer. + */ +source("nn/util.dml") as util + +forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] out, int Hout, int Wout) { + /* + * Computes the forward pass for a 2D spatial max pooling layer. + * The input data has N examples, each represented as a 3D volume + * unrolled into a single vector. + * + * This implementation uses `im2col` internally for each image to + * extract local image regions (patches) of each channel slice into + * columns, and then performs max pooling over the patches to compute + * the output maps. + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). 
+ * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * A typical value is 0. + * - padw: Padding for left and right sides. + * A typical value is 0. + * + * Outputs: + * - out: Outputs, of shape (N, C*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + */ + N = nrow(X) + Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) + Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) + pad_value = -1/0 # in max pooling we pad with -infinity + + # Create output volume + out = matrix(0, rows=N, cols=C*Hout*Wout) + + # Max pooling - im2col implementation + parfor (n in 1:N) { # all examples + img = matrix(X[n,], rows=C, cols=Hin*Win) # reshape + + if (padh > 0 | padw > 0) { + # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw)) + img = util::pad_image(img, Hin, Win, padh, padw, pad_value) + } + + img_maxes = matrix(0, rows=C, cols=Hout*Wout) # zeros + parfor (c in 1:C) { # all channels + # Extract local image slice patches into columns with im2col, of shape (Hf*Wf, Hout*Wout) + img_slice_cols = util::im2col(img[c,], Hin+2*padh, Win+2*padw, Hf, Wf, strideh, stridew) + + # Max pooling on patches + img_maxes[c,] = colMaxs(img_slice_cols) + } + + out[n,] = matrix(img_maxes, rows=1, cols=C*Hout*Wout) + } +} + +backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, + int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] dX) { + /* + * Computes the backward pass for a 2D spatial max pooling layer. + * The input data has N examples, each represented as a 3D volume + * unrolled into a single vector. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of + * shape (N, C*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + * - X: Input data matrix, of shape (N, C*Hin*Win). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * A typical value is 0. + * - padw: Padding for left and right sides. + * A typical value is 0. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). 
+ */ + N = nrow(X) + pad_value = -1/0 # in max pooling we pad with -infinity + + # Create gradient volume + dX = matrix(0, rows=N, cols=C*Hin*Win) + + # Gradient of max pooling + parfor (n in 1:N, check=0) { # all examples + img = matrix(X[n,], rows=C, cols=Hin*Win) + if (padh > 0 | padw > 0) { + # Pad image to shape (C, (Hin+2*padh)*(Win+2*padw)) + img = util::pad_image(img, Hin, Win, padh, padw, pad_value) + } + + dimg = matrix(0, rows=C, cols=(Hin+2*padh)*(Win+2*padw)) + parfor (c in 1:C, check=0) { # all channels + img_slice = matrix(img[c,], rows=Hin+2*padh, cols=Win+2*padw) + dimg_slice = matrix(0, rows=Hin+2*padh, cols=Win+2*padw) + for (hout in 1:Hout, check=0) { # all output rows + hin = (hout-1)*strideh + 1 + for (wout in 1:Wout) { # all output columns + win = (wout-1)*stridew + 1 + img_slice_patch = img_slice[hin:hin+Hf-1, win:win+Wf-1] + max_val_ind = img_slice_patch == max(img_slice_patch) # max value indicator matrix + # gradient passes through only for the max value(s) in this patch + dimg_slice_patch = max_val_ind * dout[n, (c-1)*Hout*Wout + (hout-1)*Wout + wout] + dimg_slice[hin:hin+Hf-1, win:win+Wf-1] = dimg_slice[hin:hin+Hf-1, win:win+Wf-1] + + dimg_slice_patch + } + } + dimg[c,] = matrix(dimg_slice, rows=1, cols=(Hin+2*padh)*(Win+2*padw)) + } + + if (padh > 0 | padw > 0) { + # Unpad image gradient + dimg = util::unpad_image(dimg, Hin, Win, padh, padw) # shape (C, Hin*Win) + } + dX[n,] = matrix(dimg, rows=1, cols=C*Hin*Win) + } +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/max_pool2d_builtin.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/max_pool2d_builtin.dml b/scripts/nn/layers/max_pool2d_builtin.dml new file mode 100644 index 0000000..880f818 --- /dev/null +++ b/scripts/nn/layers/max_pool2d_builtin.dml @@ -0,0 +1,103 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * 2D Max Pooling layer. + * + * This implementation uses a built-in operator for higher performance. + */ + +forward = function(matrix[double] X, int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] out, int Hout, int Wout) { + /* + * Computes the forward pass for a 2D spatial max pooling layer. + * The input data has N examples, each represented as a 3D volume + * unrolled into a single vector. + * + * This implementation uses a built-in operator for higher + * performance. + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width.
+ * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * A typical value is 0. + * - padw: Padding for left and right sides. + * A typical value is 0. + * + * Outputs: + * - out: Outputs, of shape (N, C*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + */ + N = nrow(X) + Hout = as.integer(floor((Hin + 2*padh - Hf)/strideh + 1)) + Wout = as.integer(floor((Win + 2*padw - Wf)/stridew + 1)) + + # Max pooling - built-in implementation + out = max_pool(X, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf], + stride=[strideh,stridew], padding=[padh,padw]) +} + +backward = function(matrix[double] dout, int Hout, int Wout, matrix[double] X, + int C, int Hin, int Win, int Hf, int Wf, + int strideh, int stridew, int padh, int padw) + return (matrix[double] dX) { + /* + * Computes the backward pass for a 2D spatial max pooling layer. + * The input data has N examples, each represented as a 3D volume + * unrolled into a single vector. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of + * shape (N, C*Hout*Wout). + * - Hout: Output height. + * - Wout: Output width. + * - X: Inputs, of shape (N, C*Hin*Win). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * - Hf: Filter height. + * - Wf: Filter width. + * - strideh: Stride over height. + * - stridew: Stride over width. + * - padh: Padding for top and bottom sides. + * A typical value is 0. + * - padw: Padding for left and right sides. + * A typical value is 0. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + */ + N = nrow(X) + + # Gradient of max pooling + dX = max_pool_backward(X, dout, input_shape=[N,C,Hin,Win], pool_size=[Hf,Wf], + stride=[strideh,stridew], padding=[padh,padw]) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/relu.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/relu.dml b/scripts/nn/layers/relu.dml new file mode 100644 index 0000000..93a6e90 --- /dev/null +++ b/scripts/nn/layers/relu.dml @@ -0,0 +1,59 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Rectified Linear Unit (ReLU) nonlinearity layer. + */ + +forward = function(matrix[double] X) + return (matrix[double] out) { + /* + * Computes the forward pass for a ReLU nonlinearity layer. + * + * Performs an element-wise evaluation of `f(input) = max(0, input)`. + * + * Inputs: + * - X: Inputs, of shape (any, any). + * + * Outputs: + * - out: Outputs, of same shape as `X`. 
+ */ + out = max(X, 0) +} + +backward = function(matrix[double] dout, matrix[double] X) + return (matrix[double] dX) { + /* + * Computes the backward pass for a ReLU nonlinearity layer. + * + * Essentially performs a pass-through of the upstream gradient + * for cells > 0. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of same shape as `X`. + * - X: Previous input data matrix, of shape (any, any). + * + * Outputs: + * - dX: Gradient wrt `X`, of same shape as `X`. + */ + dX = (X > 0) * dout +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/rnn.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/rnn.dml b/scripts/nn/layers/rnn.dml new file mode 100644 index 0000000..3c6faae --- /dev/null +++ b/scripts/nn/layers/rnn.dml @@ -0,0 +1,183 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Simple (Vanilla) RNN layer. + */ +source("nn/layers/tanh.dml") as tanh + +forward = function(matrix[double] X, matrix[double] W, matrix[double] b, int T, int D, + boolean return_sequences, matrix[double] out0) + return (matrix[double] out, matrix[double] cache_out) { + /* + * Computes the forward pass for a simple RNN layer with M neurons. + * The input data has N sequences of T examples, each with D features. + * + * In a simple RNN, the output of the previous timestep is fed back + * in as an additional input at the current timestep. + * + * Inputs: + * - X: Inputs, of shape (N, T*D). + * - W: Weights, of shape (D+M, M). + * - b: Biases, of shape (1, M). + * - T: Length of example sequences (number of timesteps). + * - D: Dimensionality of the input features (number of features). + * - return_sequences: Whether to return `out` at all timesteps, + * or just for the final timestep. + * - out0: Output matrix from previous timestep, of shape (N, M). + * Note: This is *optional* and could just be an empty matrix. + * + * Outputs: + * - out: If `return_sequences` is True, outputs for all timesteps, + * of shape (N, T*M). Else, outputs for the final timestep, of + * shape (N, M). + * - cache_out: Cache of outputs, of shape (T, N*M). + * Note: This is used for performance during training. 
+ */ + N = nrow(X) + M = ncol(W) + out_prev = out0 + if (return_sequences) { + out = matrix(0, rows=N, cols=T*M) + } + else { + out = matrix(0, rows=N, cols=M) + } + # caches to be used during the backward pass for performance + cache_out = matrix(0, rows=T, cols=N*M) + + for (t in 1:T) { # each timestep + X_t = X[,(t-1)*D+1:t*D] # shape (N, D) + input = cbind(X_t, out_prev) # shape (N, D+M) + out_t = tanh::forward(input %*% W + b) # shape (N, M) + # store + if (return_sequences) { + out[,(t-1)*M+1:t*M] = out_t + } + else { + out = out_t + } + out_prev = out_t + cache_out[t,] = matrix(out_t, rows=1, cols=N*M) # reshape + } +} + +backward = function(matrix[double] dout, matrix[double] X, matrix[double] W, matrix[double] b, + int T, int D, boolean given_sequences, matrix[double] out0, + matrix[double] cache_out) + return (matrix[double] dX, matrix[double] dW, matrix[double] db, matrix[double] dout0) { + /* + * Computes the backward pass for a simple RNN layer with M neurons. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream. If `given_sequences` + * is True, contains gradients on outputs for all timesteps, + * of shape (N, T*M). Else, contains gradient on output for + * the final timestep, of shape (N, M). + * - X: Inputs, of shape (N, T*D). + * - W: Weights, of shape (D+M, M). + * - b: Biases, of shape (1, M). + * - T: Length of example sequences (number of timesteps). + * - D: Dimensionality of the input features (number of features). + * - given_sequences: Whether `dout` is for all timesteps, + * or just for the final timestep. This is based on whether + * `return_sequences` was true in the forward pass. + * - out0: Output matrix from previous timestep, of shape (N, M). + * Note: This is *optional* and could just be an empty matrix. + * - cache_out: Cache of outputs, of shape (T, N*M). + * Note: This is used for performance during training. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, T*D). + * - dW: Gradient wrt `W`, of shape (D+M, M). + * - db: Gradient wrt `b`, of shape (1, M). + * - dout0: Gradient wrt `out0`, of shape (N, M). + */ + N = nrow(X) + M = ncol(W) + dX = matrix(0, rows=N, cols=T*D) + dW = matrix(0, rows=D+M, cols=M) + db = matrix(0, rows=1, cols=M) + dout0 = matrix(0, rows=N, cols=M) + if (!given_sequences) { + # only given dout for output at final timestep, so prepend empty douts for all other timesteps + dout = cbind(matrix(0, rows=N, cols=(T-1)*M), dout) # shape (N, T*M) + } + + t = T + for (iter in 1:T) { # each timestep in reverse order + X_t = X[,(t-1)*D+1:t*D] # shape (N, D) + dout_t = dout[,(t-1)*M+1:t*M] # shape (N, M) + out_t = matrix(cache_out[t,], rows=N, cols=M) # shape (N, M) + if (t == 1) { + out_prev = out0 # shape (N, M) + } + else { + out_prev = matrix(cache_out[t-1,], rows=N, cols=M) # shape (N, M) + } + input = cbind(X_t, out_prev) # shape (N, D+M) + dout_t_raw = (1-out_t^2) * dout_t # into tanh, shape (N, M) + dW = dW + t(input) %*% dout_t_raw # shape (D+M, M) + db = db + colSums(dout_t_raw) # shape (1, M) + dinput = dout_t_raw %*% t(W) # shape (N, D+M) + dX[,(t-1)*D+1:t*D] = dinput[,1:D] + dout_prev = dinput[,D+1:D+M] # shape (N, M) + if (t == 1) { + dout0 = dout_prev # shape (N, M) + } + else { + dout[,(t-2)*M+1:(t-1)*M] = dout[,(t-2)*M+1:(t-1)*M] + dout_prev # shape (N, M) + } + t = t - 1 + } +} + +init = function(int N, int D, int M) + return (matrix[double] W, matrix[double] b, matrix[double] out0) { + /* + * Initialize the parameters of this layer.
+ * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * We use the Glorot uniform heuristic which limits the magnification + * of inputs/gradients during forward/backward passes by scaling + * uniform weights by a factor of sqrt(6/(fan_in + fan_out)). + * - http://jmlr.org/proceedings/papers/v9/glorot10a/glorot10a.pdf + * + * Inputs: + * - N: Number of examples in batch. + * - D: Dimensionality of the input features (number of features). + * - M: Number of neurons in this layer. + * + * Outputs: + * - W: Weights, of shape (D+M, M). + * - b: Biases, of shape (1, M). + * - out0: Empty previous timestep output matrix, of shape (N, M). + */ + fan_in = D+M + fan_out = M + scale = sqrt(6/(fan_in+fan_out)) + W = rand(rows=D+M, cols=M, min=-scale, max=scale, pdf="uniform") + b = matrix(0, rows=1, cols=M) + out0 = matrix(0, rows=N, cols=M) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/scale_shift1d.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/scale_shift1d.dml b/scripts/nn/layers/scale_shift1d.dml new file mode 100644 index 0000000..7e162a3 --- /dev/null +++ b/scripts/nn/layers/scale_shift1d.dml @@ -0,0 +1,95 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * 1D Scale & Shift layer. + */ + +forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta) + return (matrix[double] out) { + /* + * Computes the forward pass for a 1D scale & shift layer. The input + * data has N examples, each with D features. + * + * A 1D scale & shift layer introduces learnable parameters + * (gamma, beta) to scale and shift the input on a per-feature basis. + * + * `y = x*gamma + beta` + * + * Inputs: + * - X: Inputs, of shape (N, D). + * - gamma: Scale parameters, of shape (1, D). + * - beta: Shift parameters, of shape (1, D). + * + * Outputs: + * - out: Outputs, of shape (N, D). + */ + # Scale and shift + out = X*gamma + beta # shape (N, D) +} + +backward = function(matrix[double] dout, matrix[double] out, + matrix[double] X, matrix[double] gamma, matrix[double] beta) + return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) { + /* + * Computes the backward pass for a 1D scale & shift layer. + * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of shape (N, D). + * - out: Outputs from the forward pass, of shape (N, D). + * - X: Inputs, of shape (N, D). + * - gamma: Scale parameters, of shape (1, D). + * - beta: Shift parameters, of shape (1, D). + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, D). 
+ * - dgamma: Gradient wrt `gamma`, of shape (1, D). + * - dbeta: Gradient wrt `beta`, of shape (1, D). + * + */ + # Compute gradients during training + dgamma = colSums(dout*X) # shape (1, D) + dbeta = colSums(dout) # shape (1, D) + dX = dout * gamma # shape (N, D) +} + +init = function(int D) + return (matrix[double] gamma, matrix[double] beta) { + /* + * Initialize the parameters of this layer. + * + * By default, we initialize to an identity function, with a scale + * filler of `1`, and a shift filler of `0`. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * Inputs: + * - D: Dimensionality of the input features (number of features). + * + * Outputs: + * - gamma: Scale parameters, of shape (1, D). + * - beta: Shift parameters, of shape (1, D). + */ + gamma = matrix(1, rows=1, cols=D) + beta = matrix(0, rows=1, cols=D) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/scale_shift2d.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/scale_shift2d.dml b/scripts/nn/layers/scale_shift2d.dml new file mode 100644 index 0000000..79c884a --- /dev/null +++ b/scripts/nn/layers/scale_shift2d.dml @@ -0,0 +1,107 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * 2D Scale & Shift layer. + */ +source("nn/util.dml") as util + +forward = function(matrix[double] X, matrix[double] gamma, matrix[double] beta, + int C, int Hin, int Win) + return (matrix[double] out) { + /* + * Computes the forward pass for a 2D scale & shift layer. The input + * data has N examples, each represented as a 3D volume unrolled into + * a single vector. + * + * A 2D scale & shift layer introduces learnable parameters + * (gamma, beta) to scale and shift the input on a per-channel basis. + * + * `y = x*gamma + beta` + * + * Inputs: + * - X: Inputs, of shape (N, C*Hin*Win). + * - gamma: Scale parameters, of shape (C, 1). + * - beta: Shift parameters, of shape (C, 1). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * + * Outputs: + * - out: Outputs, of shape (N, C*Hin*Win). + */ + # Scale and shift + scaled = bias_multiply(X, gamma) # shape (N, C*Hin*Win) + out = bias_add(scaled, beta) # shape (N, C*Hin*Win) +} + +backward = function(matrix[double] dout, matrix[double] out, + matrix[double] X, matrix[double] gamma, matrix[double] beta, + int C, int Hin, int Win) + return (matrix[double] dX, matrix[double] dgamma, matrix[double] dbeta) { + /* + * Computes the backward pass for a 2D scale & shift layer.
+ * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of shape (N, C*Hin*Win). + * - out: Outputs from the forward pass, of shape (N, C*Hin*Win). + * - X: Input data matrix to the forward pass, of + * shape (N, C*Hin*Win). + * - gamma: Scale parameters, of shape (C, 1). + * - beta: Shift parameters, of shape (C, 1). + * - C: Number of input channels (dimensionality of input depth). + * - Hin: Input height. + * - Win: Input width. + * + * Outputs: + * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win). + * - dgamma: Gradient wrt `gamma`, of shape (C, 1). + * - dbeta: Gradient wrt `beta`, of shape (C, 1). + * + */ + # Compute gradients during training + dgamma = util::channel_sums(dout*X, C, Hin, Win) # shape (C, 1) + dbeta = util::channel_sums(dout, C, Hin, Win) # shape (C, 1) + dX = bias_multiply(dout, gamma) # shape (N, C*Hin*Win) +} + +init = function(int C) + return (matrix[double] gamma, matrix[double] beta) { + /* + * Initialize the parameters of this layer. + * + * By default, we initialize to an identity function, with a scale + * filler of `1`, and a shift filler of `0`. + * + * Note: This is just a convenience function, and parameters + * may be initialized manually if needed. + * + * Inputs: + * - C: Number of input channels (dimensionality of input depth). + * + * Outputs: + * - gamma: Scale parameters, of shape (C, 1). + * - beta: Shift parameters, of shape (C, 1). + */ + gamma = matrix(1, rows=C, cols=1) + beta = matrix(0, rows=C, cols=1) +} + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/43c321d1/scripts/nn/layers/sigmoid.dml ---------------------------------------------------------------------- diff --git a/scripts/nn/layers/sigmoid.dml b/scripts/nn/layers/sigmoid.dml new file mode 100644 index 0000000..2d85adc --- /dev/null +++ b/scripts/nn/layers/sigmoid.dml @@ -0,0 +1,62 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +/* + * Sigmoid nonlinearity layer. + */ + +forward = function(matrix[double] X) + return (matrix[double] out) { + /* + * Computes the forward pass for a sigmoid nonlinearity layer. + * + * `sigmoid(x) = 1 / (1 + e^-x)` + * + * If `X` contains a single feature column, the output of a sigmoid + * layer can be interpreted as a predicted probability of a true + * class when paired with a log loss function in a binary + * classification problem. + * + * Inputs: + * - X: Inputs, of shape (any, any). + * + * Outputs: + * - out: Outputs, of same shape as `X`. + */ + out = 1 / (1+exp(-X)) +} + +backward = function(matrix[double] dout, matrix[double] X) + return (matrix[double] dX) { + /* + * Computes the backward pass for a sigmoid nonlinearity layer.
+ * + * Inputs: + * - dout: Gradient wrt `out` from upstream, of same shape as `X`. + * - X: Inputs, of shape (any, any). + * + * Outputs: + * - dX: Gradient wrt `X`, of same shape as `X`. + */ + out = 1 / (1+exp(-X)) + dX = out * (1-out) * dout +} +
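For reference, a minimal usage sketch in DML showing how the pooling, scale & shift, and nonlinearity layers above compose; the batch size, channel/spatial dimensions, random input `X`, and pooling hyperparameters below are illustrative assumptions, not values taken from this commit.

# Illustrative composition of the layers defined above:
# 2x2/stride-2 max pooling -> per-channel scale & shift -> ReLU -> sigmoid.
source("nn/layers/max_pool2d_builtin.dml") as max_pool2d
source("nn/layers/scale_shift2d.dml") as scale_shift2d
source("nn/layers/relu.dml") as relu
source("nn/layers/sigmoid.dml") as sigmoid

N = 4    # batch size (assumed)
C = 3    # channels (assumed)
Hin = 8  # input height (assumed)
Win = 8  # input width (assumed)
X = rand(rows=N, cols=C*Hin*Win, min=-1, max=1, pdf="uniform")

[pooled, Hout, Wout] = max_pool2d::forward(X, C, Hin, Win, 2, 2, 2, 2, 0, 0)
[gamma, beta] = scale_shift2d::init(C)  # identity initialization
scaled = scale_shift2d::forward(pooled, gamma, beta, C, Hout, Wout)
hidden = relu::forward(scaled)    # shape (N, C*Hout*Wout)
probs = sigmoid::forward(hidden)  # shape (N, C*Hout*Wout)
print("output columns: " + ncol(probs))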

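Similarly, a minimal sketch for the simple RNN layer above: initialize parameters with `init`, run a forward pass over all timesteps, and backpropagate an upstream gradient; the sizes and the all-ones gradient are assumptions for illustration only.

# Illustrative forward/backward round trip for the simple RNN layer.
source("nn/layers/rnn.dml") as rnn

N = 2  # examples (assumed)
T = 5  # timesteps (assumed)
D = 3  # input features per timestep (assumed)
M = 4  # neurons (assumed)

X = rand(rows=N, cols=T*D, min=-1, max=1, pdf="uniform")
[W, b, out0] = rnn::init(N, D, M)
[out, cache_out] = rnn::forward(X, W, b, T, D, TRUE, out0)  # out: shape (N, T*M)
dout = matrix(1, rows=N, cols=T*M)  # dummy upstream gradient
[dX, dW, db, dout0] = rnn::backward(dout, X, W, b, T, D, TRUE, out0, cache_out)
print("dW dims: " + nrow(dW) + " x " + ncol(dW))  # expect (D+M) x M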