This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 26a2b35 [SYSTEMDS-3043] New EfficientNet builtin function, incl new layers
26a2b35 is described below
commit 26a2b3542452fefe959819c47cf007c12a777235
Author: Bene <[email protected]>
AuthorDate: Sun Jun 27 16:44:06 2021 +0200
[SYSTEMDS-3043] New EfficientNet builtin function, incl new layers
Implemented a minified version of EfficientNet-B0 with an additional
mini test script on the MNIST data set.
This includes a few new layer implementations:
SiLU nonlinearity layer
Adaptive global average pooling layer
Inverted residual mobile layer (MBConv)
We only use a single MBConv layer instead of the full 16.
The stem and top parts are, however, identical to EfficientNet-B0.
AMLS project SS2021.
Closes #1326.
---
scripts/nn/examples/Example-EfficientNet.dml | 76 ++++++
scripts/nn/examples/efficientNet.dml | 337 +++++++++++++++++++++++++
scripts/nn/layers/global_avg_pool2d.dml | 89 +++++++
scripts/nn/layers/mbconv.dml | 352 +++++++++++++++++++++++++++
scripts/nn/layers/silu.dml | 57 +++++
5 files changed, 911 insertions(+)
diff --git a/scripts/nn/examples/Example-EfficientNet.dml b/scripts/nn/examples/Example-EfficientNet.dml
new file mode 100644
index 0000000..e2fd5e2
--- /dev/null
+++ b/scripts/nn/examples/Example-EfficientNet.dml
@@ -0,0 +1,76 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/*
+* The MNIST Data can be downloaded as follows:
+* mkdir -p data/mnist/
+* cd data/mnist/
+* curl -O https://pjreddie.com/media/files/mnist_train.csv
+* curl -O https://pjreddie.com/media/files/mnist_test.csv
+*/
+
+# TODO add tests in functions/builtin, applications/nn
+
+# This script trains a minified version of the EfficientNet-B0 model
+# with a single MBConv layer. The model heavily overfits on the simple
+# MNIST dataset, since the architecture was originally developed for ImageNet
+# and its layer sizes are far larger than needed for MNIST.
+# Therefore we train for only a single epoch and print the resulting accuracy.
+# Import required methods
+source("nn/examples/efficientNet.dml") as eff
+
+# Read training data
+data = read("data/mnist/mnist_test.csv", format="csv")
+N = nrow(data)
+
+# Extract images and labels
+images = data[,2:ncol(data)]
+labels = data[,1]
+
+# Scale images to [0,1], and one-hot encode the labels
+images = images / 255.0
+labels = table(seq(1, N), labels+1, N, 10)
+
+model = eff::initNetwork(1, 10, -1)
+
+# Train
+epochs = 1
+batch_size = 256
+model = eff::netTrain(model, images, 1, 28, 28, labels, epochs, batch_size, 0.025, 0.9, TRUE)
+
+
+# Also predict in batches, since otherwise we can run into memory issues.
+# This could be unnecessary on more powerful machines :)
+iters = ceil(N / batch_size)
+accuracy = 0.0
+for(i in 1:iters) {
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
+ X_batch = images[beg:end,]
+ y_batch = labels[beg:end,]
+
+ pred = eff::netPredict(X_batch, model, 1, 28, 28)
+ partial_acc = mean(rowIndexMax(pred) == rowIndexMax(y_batch))
+ accuracy = accuracy + partial_acc
+}
+
+print("Total Accuracy: " + (accuracy / iters))
diff --git a/scripts/nn/examples/efficientNet.dml b/scripts/nn/examples/efficientNet.dml
new file mode 100644
index 0000000..012eb33
--- /dev/null
+++ b/scripts/nn/examples/efficientNet.dml
@@ -0,0 +1,337 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# TODO move to builtin functions (needs fix for imports in builtin functions)
+# TODO scale up to real EfficientNet-B0
+
+# Trains a partial EfficientNet-B0 model.
+# This script trains the stem and top parts of EfficientNet-B0.
+# The original EfficientNet-B0 has the following layers:
+#----------------------------------------------------------------
+# Layers Dimension Filters Nr Repeats
+#----------------------------------------------------------------
+# 1. Conv3x3 224x224 32 1
+# 2. MBConv1, k3x3 112x112 16 1
+# 3. MBConv6, k3x3 56x 56 24 2
+# 4. MBConv6, k5x5 28x 28 40 2
+# 5. MBConv6, k3x3 14x 14 80 3
+# 6. MBConv6, k5x5 14x 14 112 3
+# 7. MBConv6, k5x5 7x 7 192 4
+# 8. MBConv6, k3x3 7x 7 320 1
+# 9. Conv1x1 & Pooling & FC 7x 7 1280 1
+#----------------------------------------------------------------
+# In this partial implementation we implement layers 1 and 2 and the prediction layer 9.
+# The init method is purely for convenience; there is no problem with a manual
+# initialization of weights and biases. To extend the current implementation to a
+# full EfficientNet-B0, only the intermediate MBConv blocks need to be added;
+# both the stem and the top part are already complete, as is the first MBConv layer.
+# The number after MBConv is the corresponding expansion factor and is followed by
+# the kernel size; stride and padding can be calculated from the dimensions.
+# If a layer is repeated, the skip connection is activated, otherwise not.
+#----------------------------------------------------------------
+
+source("nn/layers/batch_norm2d.dml") as batchnorm
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/conv2d_depthwise.dml") as depthwise
+source("nn/layers/global_avg_pool2d.dml") as global_avg_pool
+source("nn/layers/silu.dml") as silu
+source("nn/layers/upsample2d.dml") as upsample
+source("nn/layers/mbconv.dml") as mbconv
+source("nn/layers/affine.dml") as affine
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd.dml") as sgd
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+
+initNetwork = function(int InputChannels, int NumberOutputClasses, int seed)
+ return(list[unknown] model)
+{
+ /*
+ * Convenience function for initialization of all required weights and biases.
+ *
+ * Inputs:
+ * - InputChannels: Number of Input Channels for the model (Cin)
+ * - NumberOutputClasses: Number of classes for the network
+ * - seed: seed for the random generation of the weights
+ *
+ * Outputs:
+ * - model: A list containing the total of 36 matrices needed for the computation of the Mini EfficientNet
+ */
+
+ # Layer 1
+ [CW_stem, Cb_stem] = conv2d::init(32, InputChannels, 3, 3, seed)
+ seed = ifelse(seed==-1, -1, seed + 1);
+ [Gamma_stem, Beta_stem, EmaMean_stem, EmaVar_stem] = batchnorm::init(32)
+
+ # Layer 2
+ [mb_parameters] = mbconv::init(32, 16, 3, 3, 1, 0.25, seed)
+ seed = ifelse(seed==-1, -1, seed + 1);
+
+ # Layer 9
+ [CW_top, Cb_top] = conv2d::init(1280, 16, 1, 1, seed)
+ seed = ifelse(seed==-1, -1, seed + 1);
+ [Gamma_top, Beta_top, EmaMean_top, EmaVar_top] = batchnorm::init(1280)
+ [DW_top, Db_top] = affine::init(1280, NumberOutputClasses, seed)
+
+ model = list(CW_stem, Cb_stem, Gamma_stem, Beta_stem, EmaMean_stem, EmaVar_stem,
+ as.matrix(mb_parameters[1]),
+ as.matrix(mb_parameters[2]),
+ as.matrix(mb_parameters[3]),
+ as.matrix(mb_parameters[4]),
+ as.matrix(mb_parameters[5]),
+ as.matrix(mb_parameters[6]),
+ as.matrix(mb_parameters[7]),
+ as.matrix(mb_parameters[8]),
+ as.matrix(mb_parameters[9]),
+ as.matrix(mb_parameters[10]),
+ as.matrix(mb_parameters[11]),
+ as.matrix(mb_parameters[12]),
+ as.matrix(mb_parameters[13]),
+ as.matrix(mb_parameters[14]),
+ as.matrix(mb_parameters[15]),
+ as.matrix(mb_parameters[16]),
+ as.matrix(mb_parameters[17]),
+ as.matrix(mb_parameters[18]),
+ as.matrix(mb_parameters[19]),
+ as.matrix(mb_parameters[20]),
+ as.matrix(mb_parameters[21]),
+ as.matrix(mb_parameters[22]),
+ CW_top, Cb_top, Gamma_top, Beta_top, EmaMean_top, EmaVar_top, DW_top, Db_top)
+}
+
+
+netPredict = function(matrix[double] X, list[unknown] model, int Cin, int Hin, int Win)
+ return(matrix[double] pred)
+{
+ /*
+ * This function generates the prediction of the model for an input X.
+ *
+ * Inputs:
+ * - X: Input features of format (N, Cin * Hin * Win)
+ * - model: the list of length 36 containing the matrices generated from the initNetwork function
+ * - Cin: Number of input channels (dimensionality of depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ *
+ * Outputs:
+ * - pred: The output of the final softmax layer of the Mini Efficient-Net
+ */
+ CW_stem = as.matrix(model[1])
+ Cb_stem = as.matrix(model[2])
+ Gamma_stem = as.matrix(model[3])
+ Beta_stem = as.matrix(model[4])
+ EmaMean_stem = as.matrix(model[5])
+ EmaVar_stem = as.matrix(model[6])
+ MBConv_params = model[7:28]
+ CW_top = as.matrix(model[29])
+ Cb_top = as.matrix(model[30])
+ Gamma_top = as.matrix(model[31])
+ Beta_top = as.matrix(model[32])
+ EmaMean_top = as.matrix(model[33])
+ EmaVar_top = as.matrix(model[34])
+ DW_top = as.matrix(model[35])
+ Db_top = as.matrix(model[36])
+
+ padh = (Hin + 1) %% 2
+ padw = (Win + 1) %% 2
+
+ [stem_out, stem_h, stem_w] = conv2d::forward(X, CW_stem, Cb_stem, Cin, Hin, Win, 3, 3, 2, 2, padh, padw)
+ [bn_stem_out, update_EmaMean_stem, update_EmaVar_stem, cache_EmaMean_stem, cache_EmaVar_stem] = batchnorm::forward(
+   stem_out, Gamma_stem, Beta_stem, 32, stem_h, stem_w, "train", EmaMean_stem, EmaVar_stem, 0.9, 1e-5)
+ silu_out = silu::forward(bn_stem_out)
+
+ [mbconv_out, intermediate_mbconv, mbconvbatchnorm_updates, mbconv_h, mbconv_w] = mbconv::forward(
+   silu_out, MBConv_params, 32, 16, stem_h, stem_w, 3, 3, 2, 2, padh, padw, FALSE, 1, "train", 0.25)
+
+ [top_out, outh, outw] = conv2d::forward(mbconv_out, CW_top, Cb_top, 16, mbconv_h, mbconv_w, 1, 1, 1, 1, 0, 0)
+ [bntop_out, update_EmaMean_top, update_EmaVar_top, cache_EmaMean_top, cache_EmaVar_top] = batchnorm::forward(
+   top_out, Gamma_top, Beta_top, 1280, outh, outw, "train", EmaMean_top, EmaVar_top, 0.9, 1e-5)
+ silu_out2 = silu::forward(bntop_out)
+ [pool_out, None, None] = global_avg_pool::forward(silu_out2, 1280, outh, outw)
+ dense_out = affine::forward(pool_out, DW_top, Db_top)
+ pred = softmax::forward(dense_out)
+}
+
+netTrain = function(list[unknown] model, matrix[double] X, int Cin, int Hin, int Win,
+  matrix[double] Y, int epochs, int batch_size, double learning_rate, double lr_decay, boolean verbose)
+ return(list[unknown] trained_model)
+{
+ /*
+ * This function trains the given model with an SGD optimizer with the given batch_size
+ * for a number of epochs.
+ *
+ * Inputs:
+ * - model: the list of length 36 containing the matrices generated from the initNetwork function
+ * - X: Input features of format (N, Cin * Hin * Win)
+ * - Cin: Number of input channels (dimensionality of depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - Y: The true labels used for learning in a one-hot encoding (N, NumberOutputClasses)
+ * - epochs: Number of epochs to train for
+ * - batch_size: Size of batch used for a single update step
+ * - learning_rate: Learning rate used for the SGD update steps
+ * - lr_decay: The learning rate is multiplied with lr_decay after each epoch.
+ * - verbose: Whether the accuracy and the cross-entropy loss should be printed after each update step
+ *
+ * Outputs:
+ * - trained_model: The new list of the updated 36 matrices
+ */
+ CW_stem = as.matrix(model[1])
+ Cb_stem = as.matrix(model[2])
+ Gamma_stem = as.matrix(model[3])
+ Beta_stem = as.matrix(model[4])
+ EmaMean_stem = as.matrix(model[5])
+ EmaVar_stem = as.matrix(model[6])
+ MBConv_params = model[7:28]
+ CW_top = as.matrix(model[29])
+ Cb_top = as.matrix(model[30])
+ Gamma_top = as.matrix(model[31])
+ Beta_top = as.matrix(model[32])
+ EmaMean_top = as.matrix(model[33])
+ EmaVar_top = as.matrix(model[34])
+ DW_top = as.matrix(model[35])
+ Db_top = as.matrix(model[36])
+
+ padh = (Hin + 1) %% 2
+ padw = (Win + 1) %% 2
+
+ N = nrow(X)
+ lr = learning_rate
+
+ # Optimize
+ iters = ceil(N / batch_size)
+ for (e in 1:epochs) {
+ for(i in 1:iters) {
+ # Get next batch
+ beg = ((i-1) * batch_size) %% N + 1
+ end = min(N, beg + batch_size - 1)
+ X_batch = X[beg:end,]
+ y_batch = Y[beg:end,]
+
+ # Compute forward pass
+ [stem_out, stem_h, stem_w] = conv2d::forward(X_batch, CW_stem, Cb_stem, Cin, Hin, Win, 3, 3, 2, 2, padh, padw)
+ [bn_stem_out, update_EmaMean_stem, update_EmaVar_stem, cache_EmaMean_stem, cache_EmaVar_stem] = batchnorm::forward(stem_out, Gamma_stem, Beta_stem, 32, stem_h, stem_w, "train", EmaMean_stem, EmaVar_stem, 0.9, 1e-5)
+ silu_out = silu::forward(bn_stem_out)
+
+ [mbconv_out, intermediate_mbconv, mbconvbatchnorm_updates, mbconv_h, mbconv_w] = mbconv::forward(silu_out, MBConv_params, 32, 16, stem_h, stem_w, 3, 3, 2, 2, padh, padw, FALSE, 1, "train", 0.25)
+
+ [top_out, outh, outw] = conv2d::forward(mbconv_out, CW_top, Cb_top, 16, mbconv_h, mbconv_w, 1, 1, 1, 1, 0, 0)
+ [bntop_out, update_EmaMean_top, update_EmaVar_top, cache_EmaMean_top, cache_EmaVar_top] = batchnorm::forward(top_out, Gamma_top, Beta_top, 1280, outh, outw, "train", EmaMean_top, EmaVar_top, 0.9, 1e-5)
+ silu_out2 = silu::forward(bntop_out)
+ [pool_out, None, None] = global_avg_pool::forward(silu_out2, 1280, outh, outw)
+ dense_out = affine::forward(pool_out, DW_top, Db_top)
+ pred = softmax::forward(dense_out)
+
+ # Compute loss & accuracy for training
+ loss = cross_entropy_loss::forward(pred, y_batch)
+ if(verbose) {
+ accuracy = mean(rowIndexMax(pred) == rowIndexMax(y_batch))
+ print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ",
Train Accuracy: " + accuracy)
+ }
+
+ # Compute backward pass
+ ## loss:
+ dprobs = cross_entropy_loss::backward(pred, y_batch)
+
+ ## TOP
+ d_softmax = softmax::backward(dprobs, dense_out)
+ [d_dense_back, dDenseW_top, dDenseb_top] = affine::backward(d_softmax, pool_out, DW_top, Db_top)
+ d_pool_back = global_avg_pool::backward(d_dense_back, silu_out2, 1280, outh, outw)
+ d_silu2_back = silu::backward(d_pool_back, bntop_out)
+ [d_bntop_back, dGamma_top, dBeta_top] = batchnorm::backward(d_silu2_back, cache_EmaMean_top, cache_EmaVar_top, top_out, Gamma_top, 1280, outh, outw, 1e-5)
+ [dtop_back, d_ConvW_top, d_Convb_top] = conv2d::backward(d_bntop_back, outh, outw, mbconv_out, CW_top, Cb_top, 16, mbconv_h, mbconv_w, 1, 1, 1, 1, 0, 0)
+
+ # MBCONV
+ [d_mbconv_back, mbconv_gradients] = mbconv::backward(dtop_back, silu_out, MBConv_params, intermediate_mbconv, mbconvbatchnorm_updates, 32, 16, stem_h, stem_w, 3, 3, 2, 2, padh, padw, FALSE, 1, "train", 0.25)
+
+ ## STEM
+ d_silu_back = silu::backward(d_mbconv_back, bn_stem_out)
+ [d_bn_stem_back, dGamma_stem, dBeta_stem] = batchnorm::backward(d_silu_back, cache_EmaMean_stem, cache_EmaVar_stem, stem_out, Gamma_stem, 32, stem_h, stem_w, 1e-5)
+ [dconv_back, dW_stem, db_stem] = conv2d::backward(d_bn_stem_back, stem_h, stem_w, X_batch, CW_stem, Cb_stem, Cin, Hin, Win, 3, 3, 2, 2, padh, padw)
+
+ # Optimize with SGD
+ # Update Stem
+ CW_stem = sgd::update(CW_stem, dW_stem, lr)
+ Cb_stem = sgd::update(Cb_stem, db_stem, lr)
+ Gamma_stem = sgd::update(Gamma_stem, dGamma_stem, lr)
+ Beta_stem = sgd::update(Beta_stem, dBeta_stem, lr)
+ EmaMean_stem = update_EmaMean_stem
+ EmaVar_stem = update_EmaVar_stem
+
+ # Update MBConv
+ update_depth_W = sgd::update(as.matrix(MBConv_params[7]), as.matrix(mbconv_gradients[11]), lr)
+ update_depth_b = sgd::update(as.matrix(MBConv_params[8]), as.matrix(mbconv_gradients[12]), lr)
+ update_gamma_depth = sgd::update(as.matrix(MBConv_params[9]), as.matrix(mbconv_gradients[9]), lr)
+ update_beta_depth = sgd::update(as.matrix(MBConv_params[10]), as.matrix(mbconv_gradients[10]), lr)
+ update_ema_mean_depth = as.matrix(mbconvbatchnorm_updates[5])
+ update_ema_var_depth = as.matrix(mbconvbatchnorm_updates[6])
+ update_squeeze_W = sgd::update(as.matrix(MBConv_params[13]), as.matrix(mbconv_gradients[7]), lr)
+ update_squeeze_b = sgd::update(as.matrix(MBConv_params[14]), as.matrix(mbconv_gradients[8]), lr)
+ update_excite_W = sgd::update(as.matrix(MBConv_params[15]), as.matrix(mbconv_gradients[5]), lr)
+ update_excite_b = sgd::update(as.matrix(MBConv_params[16]), as.matrix(mbconv_gradients[6]), lr)
+ update_out_W = sgd::update(as.matrix(MBConv_params[17]), as.matrix(mbconv_gradients[3]), lr)
+ update_out_b = sgd::update(as.matrix(MBConv_params[18]), as.matrix(mbconv_gradients[4]), lr)
+ update_out_gamma = sgd::update(as.matrix(MBConv_params[19]), as.matrix(mbconv_gradients[1]), lr)
+ update_out_beta = sgd::update(as.matrix(MBConv_params[20]), as.matrix(mbconv_gradients[2]), lr)
+ update_ema_mean_out = as.matrix(mbconvbatchnorm_updates[9])
+ update_ema_var_out = as.matrix(mbconvbatchnorm_updates[10])
+
+ MBConv_params = list(
+ as.matrix(model[7]), as.matrix(model[8]),
+ as.matrix(model[9]), as.matrix(model[10]),
+ as.matrix(model[11]), as.matrix(model[12]),
+ update_depth_W, update_depth_b,
+ update_gamma_depth, update_beta_depth,
+ update_ema_mean_depth, update_ema_var_depth,
+ update_squeeze_W, update_squeeze_b,
+ update_excite_W, update_excite_b,
+ update_out_W, update_out_b,
+ update_out_gamma, update_out_beta,
+ update_ema_mean_out, update_ema_var_out)
+
+ # Update Top
+ CW_top = sgd::update(CW_top, d_ConvW_top, lr)
+ Cb_top = sgd::update(Cb_top, d_Convb_top, lr)
+ Gamma_top = sgd::update(Gamma_top, dGamma_top, lr)
+ Beta_top = sgd::update(Beta_top, dBeta_top, lr)
+ EmaMean_top = update_EmaMean_top
+ EmaVar_top = update_EmaVar_top
+ DW_top = sgd::update(DW_top, dDenseW_top, lr)
+ Db_top = sgd::update(Db_top, dDenseb_top, lr)
+ }
+ # Decay learning rate
+ lr = lr * lr_decay
+ }
+
+ # Pack everything into model format
+ trained_model = list(CW_stem, Cb_stem, Gamma_stem, Beta_stem, EmaMean_stem, EmaVar_stem,
+ as.matrix(MBConv_params[1]), as.matrix(MBConv_params[2]),
+ as.matrix(MBConv_params[3]), as.matrix(MBConv_params[4]),
+ as.matrix(MBConv_params[5]), as.matrix(MBConv_params[6]),
+ as.matrix(MBConv_params[7]), as.matrix(MBConv_params[8]),
+ as.matrix(MBConv_params[9]), as.matrix(MBConv_params[10]),
+ as.matrix(MBConv_params[11]), as.matrix(MBConv_params[12]),
+ as.matrix(MBConv_params[13]), as.matrix(MBConv_params[14]),
+ as.matrix(MBConv_params[15]), as.matrix(MBConv_params[16]),
+ as.matrix(MBConv_params[17]), as.matrix(MBConv_params[18]),
+ as.matrix(MBConv_params[19]), as.matrix(MBConv_params[20]),
+ as.matrix(MBConv_params[21]), as.matrix(MBConv_params[22]),
+ CW_top, Cb_top, Gamma_top, Beta_top, EmaMean_top, EmaVar_top, DW_top, Db_top)
+}
diff --git a/scripts/nn/layers/global_avg_pool2d.dml b/scripts/nn/layers/global_avg_pool2d.dml
new file mode 100644
index 0000000..1a31f62
--- /dev/null
+++ b/scripts/nn/layers/global_avg_pool2d.dml
@@ -0,0 +1,89 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Global Average Pooling 2D layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
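+
+/*
+ * Note (for reference): global average pooling is average pooling with a pool size
+ * equal to the full spatial extent of the input, so forward() below calls the
+ * avg_pool builtin with pool_size=[Hin,Win], yielding Hout = Wout = 1. For an input
+ * X of shape (N, C*Hin*Win), the output therefore has shape (N, C).
+ */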
+
+forward = function(matrix[double] X, int C, int Hin, int Win)
+ return (matrix[double] out, int Hout, int Wout) {
+ /*
+ * Computes the forward pass for a 2D Global average pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector. The output dimensions Hout and Wout are always 1.
+ *
+ * This implementation uses a built-in operator for higher
+ * performance.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ *
+ * Outputs:
+ * - out: Outputs, of shape (N, C*Hout*Wout).
+ * - Hout: Output height.
+ * - Wout: Output width.
+ */
+
+ # Global average pooling - built-in implementation
+ N = nrow(X)
+ Hout = 1
+ Wout = 1
+ out = avg_pool(X, input_shape=[N,C,Hin,Win],
+ pool_size=[Hin,Win], stride=[1,1], padding=[0, 0])
+}
+
+backward = function(matrix[double] dout, matrix[double] X, int C, int Hin, int Win)
+ return (matrix[double] dX)
+{
+ /*
+ * Computes the backward pass for a 2D global average pooling layer.
+ * The input data has N examples, each represented as a 3D volume
+ * unrolled into a single vector.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of
+ * shape (N, C*Hout*Wout).
+ * - X: Inputs, of shape (N, C*Hin*Win).
+ * - C: Number of input channels (dimensionality of input depth).
+ * - Hin: Input height.
+ * - Win: Input width.
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+ */
+ N = nrow(X)
+ # Gradient of average pooling
+ dX = avg_pool_backward(X, dout, input_shape=[N,C,Hin,Win],
+ pool_size=[Hin,Win], stride=[1, 1], padding=[0, 0])
+}
diff --git a/scripts/nn/layers/mbconv.dml b/scripts/nn/layers/mbconv.dml
new file mode 100644
index 0000000..ebede05
--- /dev/null
+++ b/scripts/nn/layers/mbconv.dml
@@ -0,0 +1,352 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Implementation of an MBConv layer (inverted residual layer)
+ *
+ * 1.) Expansion phase (1x1 convolution) & BN
+ * 2.) Depthwise convolution
+ * 3.) BatchNorm
+ * 4.) SiLU activation
+ * 5.) Global average pooling
+ * 6.) Squeeze and excitation phase
+ * 7.) Output phase (1x1 convolution)
+ * 8.) BatchNorm
+ * 9.) Optional skip-add layer
+ */
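+
+/*
+ * Rough shape sketch (illustrative only; exact spatial sizes depend on stride and padding):
+ *   X:                          (N, Fin*Hin*Win)
+ *   1.) expansion (1x1 conv):   (N, Fin*ExpansionFactor*Hin*Win)
+ *   2.) depthwise conv:         (N, Fin*ExpansionFactor*Hout*Wout)
+ *   5.)-6.) squeeze & excite:   per-channel scales of shape (N, Fin*ExpansionFactor),
+ *           upsampled and multiplied onto the depthwise output
+ *   7.) output (1x1 conv):      (N, Fout*Hout*Wout)
+ */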
+
+
+source("nn/layers/batch_norm2d.dml") as batchnorm
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/conv2d_depthwise.dml") as depthwise
+source("nn/layers/global_avg_pool2d.dml") as global_avg_pool
+source("nn/layers/silu.dml") as silu
+source("nn/layers/upsample2d.dml") as upsample
+
+
+forward = function(matrix[double] X, list[unknown] model, int Fin, int Fout, int Hin,
+  int Win, int filter_width, int filter_height, int strideh, int stridew, int padh, int padw,
+  boolean SkipConnection, int ExpansionFactor, string BNMode, double squeeze_factor)
+  return (matrix[double] layer_out, list[unknown] intermediate_outputs, list[unknown]
+    batchnorm_updates, int Hout, int Wout)
+{
+ /*
+ * Computes the forward pass for an MBConv layer.
+ *
+ * Inputs:
+ * - X: Previous input data matrix, of shape (N, Fin * Hin * Win).
+ * - model: list of all 22 matrices needed for a complete mbconv layer
+ * - Fin: Number of filters incoming to the MBConv Block.
+ * - Fout: Number of filters this MBconv Block produces.
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - filter_width: Width of the depthwise convolution filter
+ * - filter_height: Height of the depthwise convolution filter
+ * - strideh: stride of the depthwise convolution in height
+ * - stridew: stride of the depthwise convolution in width
+ * - padh: padding of the depthwise convolution in height
+ * - padw: padding of the depthwise convolution in width
+ * - SkipConnection: Whether the skip connection is used. For this to work, the filter
+ *     counts Fin and Fout must be equal, and the output dimensions must equal Hin and Win.
+ * - ExpansionFactor: Factor of expansion of the initial filters coming into this block
+ * - BNMode: BatchNorm mode used, must be either "train" or "test"
+ * - squeeze_factor: Factor for the squeeze and excitation layer. This factor should be between 0 and 1
+
+ * Outputs:
+ * - layer_out: Output of the MBConv Layer
+ * - intermediate_outputs: list of outputs of intermediate layers generated by the forward pass
+ * - batchnorm_updates: list of updates of batchnorm layers generated by the forward pass
+ * - Hout: Height of the output
+ * - Wout: Width of the output
+ */
+ # Unpack parameter list
+ W_expansion = as.matrix(model[1])
+ b_expansion = as.matrix(model[2])
+ Gamma_expansion = as.matrix(model[3])
+ Beta_expansion = as.matrix(model[4])
+ EmaMean_expansion = as.matrix(model[5])
+ EmaVar_expansion = as.matrix(model[6])
+
+ W_depth = as.matrix(model[7])
+ b_depth = as.matrix(model[8])
+ Gamma_depth = as.matrix(model[9])
+ Beta_depth = as.matrix(model[10])
+ EmaMean_depth = as.matrix(model[11])
+ EmaVar_depth = as.matrix(model[12])
+ W_squeeze = as.matrix(model[13])
+ b_squeeze = as.matrix(model[14])
+ W_excite = as.matrix(model[15])
+ b_excite = as.matrix(model[16])
+
+ W_out = as.matrix(model[17])
+ b_out = as.matrix(model[18])
+ Gamma_out = as.matrix(model[19])
+ Beta_out = as.matrix(model[20])
+ EmaMean_out = as.matrix(model[21])
+ EmaVar_out = as.matrix(model[22])
+
+ # Either produce expanded input or use identity
+ if (ExpansionFactor > 1) {
+ filter_expansion = Fin * ExpansionFactor
+ [out_expansion, dim_h_exp, dim_w_exp] = conv2d::forward(X, W_expansion, b_expansion, Fin, Hin, Win, 1, 1, 1, 1, 0, 0)
+ [out_bn_expansion, bn_ema_mean_expansion, bn_ema_var_expansion, cache_mean_expansion, cache_var_expansion] = batchnorm::forward(out_expansion, Gamma_expansion, Beta_expansion, filter_expansion, Hin, Win, BNMode, EmaMean_expansion, EmaVar_expansion, 0.9, 1e-5)
+ depthwise_in = silu::forward(out_bn_expansion)
+ }
+ else {
+ # dummy variables so that indexing remains constant
+ out_expansion = matrix(0, 0, 0)
+ out_bn_expansion = matrix(0, 0, 0)
+ bn_ema_mean_expansion = matrix(0, 0, 0)
+ bn_ema_var_expansion = matrix(0, 0, 0)
+ cache_mean_expansion = matrix(0, 0, 0)
+ cache_var_expansion = matrix(0, 0, 0)
+
+ filter_expansion = Fin
+ depthwise_in = X
+ }
+
+ [depth_out, depth_dim_h, depth_dim_w] = depthwise::forward(depthwise_in, W_depth, b_depth, Hin, Win, 1, filter_height, filter_width, strideh, stridew, padh, padw)
+ [depth_bn_out, depth_bn_mean, depth_bn_var, depth_cache_mean, depth_cache_var] =
+   batchnorm::forward(depth_out, Gamma_depth, Beta_depth, filter_expansion, depth_dim_h, depth_dim_w, "train", EmaMean_depth, EmaVar_depth, 0.9, 1e-5)
+ depth_act_out = silu::forward(depth_bn_out)
+
+
+ # Squeeze and excitation
+ squeeze_dim = round(filter_expansion * squeeze_factor)
+ [pooled_out, pool_h, pool_w] = global_avg_pool::forward(depth_act_out, filter_expansion, depth_dim_h, depth_dim_w)
+ [squeeze_out, dim_squeeze_h, dim_squeeze_w] = conv2d::forward(pooled_out, W_squeeze, b_squeeze, filter_expansion, pool_h, pool_w, 1, 1, 1, 1, 0, 0)
+ [expand_out, dim_squeeze_h, dim_squeeze_w] = conv2d::forward(squeeze_out, W_excite, b_excite, squeeze_dim, dim_squeeze_h, dim_squeeze_w, 1, 1, 1, 1, 0, 0)
+ upscaled_out = upsample::forward(expand_out, filter_expansion, dim_squeeze_h, dim_squeeze_w, depth_dim_h, depth_dim_w)
+ multiplied_out = depth_act_out * upscaled_out
+
+ # Output Layer
+ [conv_out, conv_dim_h, conv_dim_w] = conv2d::forward(multiplied_out, W_out, b_out, filter_expansion, depth_dim_h, depth_dim_w, 1, 1, 1, 1, 0, 0)
+ [conv_bn_out, conv_bn_mean, conv_bn_var, conv_cache_mean, conv_cache_var] =
+   batchnorm::forward(conv_out, Gamma_out, Beta_out, Fout, depth_dim_h, depth_dim_w, BNMode, EmaMean_out, EmaVar_out, 0.9, 1e-5)
+
+ if (SkipConnection)
+ layer_out = conv_bn_out + X
+ else
+ layer_out = conv_bn_out
+
+ Hout = conv_dim_h
+ Wout = conv_dim_w
+
+ intermediate_outputs = list(out_expansion, out_bn_expansion, depthwise_in,
+   depth_out, depth_bn_out, depth_act_out, pooled_out, squeeze_out, expand_out,
+   upscaled_out, multiplied_out, conv_out, conv_bn_out, layer_out)
+ batchnorm_updates = list(bn_ema_mean_expansion, bn_ema_var_expansion,
+   cache_mean_expansion, cache_var_expansion, depth_bn_mean, depth_bn_var, depth_cache_mean,
+   depth_cache_var, conv_bn_mean, conv_bn_var, conv_cache_mean, conv_cache_var)
+}
+
+backward = function(matrix[double] dout, matrix[double] X, list[unknown] model, list[unknown] intermediate_outputs,
+  list[unknown] batchnorm_updates, int Fin, int Fout, int Hin, int Win, int filter_width, int filter_height, int strideh,
+  int stridew, int padh, int padw, boolean SkipConnection, int ExpansionFactor, string BNMode, double squeeze_factor)
+ return (matrix[double] dX, list[unknown] gradients)
+{
+ /*
+ * Computes the backward pass for an MBConv layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Previous input data matrix, of shape (N, Fin * Hin * Win).
+ * - model: list of all 22 matrices needed for a complete mbconv layer
+ * - intermediate_outputs: list of outputs of intermediate layers generated by the forward pass
+ * - batchnorm_updates: list of updates of batchnorm layers generated by the forward pass
+ * - Fin: Number of filters incoming to the MBConv Block.
+ * - Fout: Number of filters this MBconv Block produces.
+ * - Hin: Input height.
+ * - Win: Input width.
+ * - filter_width: Width of the depthwise convolution filter
+ * - filter_height: Height of the depthwise convolution filter
+ * - strideh: stride of the depthwise convolution in height
+ * - stridew: stride of the depthwise convolution in width
+ * - padh: padding of the depthwise convolution in height
+ * - padw: padding of the depthwise convolution in width
+ * - SkipConnection: Whether the skip connection is used. For this to work, the filter
+ *     counts Fin and Fout must be equal, and the output dimensions must equal Hin and Win.
+ * - ExpansionFactor: Factor of expansion of the initial filters coming into this block
+ * - BNMode: BatchNorm mode used, must be either "train" or "test"
+ * - squeeze_factor: Factor for the squeeze and excitation layer. This factor should be between 0 and 1
+
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ * - gradients: list containing all the gradients for the parameter updates.
+ */
+
+ # Unpack parameter list
+ W_expansion = as.matrix(model[1])
+ b_expansion = as.matrix(model[2])
+ Gamma_expansion = as.matrix(model[3])
+ Beta_expansion = as.matrix(model[4])
+ EmaMean_expansion = as.matrix(model[5])
+ EmaVar_expansion = as.matrix(model[6])
+
+ W_depth = as.matrix(model[7])
+ b_depth = as.matrix(model[8])
+ Gamma_depth = as.matrix(model[9])
+ Beta_depth = as.matrix(model[10])
+ EmaMean_depth = as.matrix(model[11])
+ EmaVar_depth = as.matrix(model[12])
+ W_squeeze = as.matrix(model[13])
+ b_squeeze = as.matrix(model[14])
+ W_excite = as.matrix(model[15])
+ b_excite = as.matrix(model[16])
+
+ W_out = as.matrix(model[17])
+ b_out = as.matrix(model[18])
+ Gamma_out = as.matrix(model[19])
+ Beta_out = as.matrix(model[20])
+ EmaMean_out = as.matrix(model[21])
+ EmaVar_out = as.matrix(model[22])
+
+ # Unpack BN caches
+ cache_mean_expansion = as.matrix(batchnorm_updates[3])
+ cache_var_expansion = as.matrix(batchnorm_updates[4])
+ depth_cache_mean = as.matrix(batchnorm_updates[7])
+ depth_cache_var = as.matrix(batchnorm_updates[8])
+ conv_cache_mean = as.matrix(batchnorm_updates[11])
+ conv_cache_var = as.matrix(batchnorm_updates[12])
+
+ # Unpack Intermediate Outputs
+ out_expansion = as.matrix(intermediate_outputs[1])
+ out_bn_expansion = as.matrix(intermediate_outputs[2])
+ depthwise_in = as.matrix(intermediate_outputs[3])
+ depth_out = as.matrix(intermediate_outputs[4])
+ depth_bn_out = as.matrix(intermediate_outputs[5])
+ depth_act_out = as.matrix(intermediate_outputs[6])
+ pooled_out = as.matrix(intermediate_outputs[7])
+ squeeze_out = as.matrix(intermediate_outputs[8])
+ expand_out = as.matrix(intermediate_outputs[9])
+ upscaled_out = as.matrix(intermediate_outputs[10])
+ multiplied_out = as.matrix(intermediate_outputs[11])
+ conv_out = as.matrix(intermediate_outputs[12])
+ conv_bn_out = as.matrix(intermediate_outputs[13])
+
+ # Calculate Dimension of filters
+ if (ExpansionFactor > 1)
+ expansion_dim = Fin * ExpansionFactor
+ else
+ expansion_dim = Fin
+
+ squeeze_dim = round(expansion_dim * squeeze_factor)
+ depth_dim_h = as.integer(floor((Hin + 2*padh - filter_height)/strideh + 1))
+ depth_dim_w = as.integer(floor((Win + 2*padw - filter_width)/stridew + 1))
+
+ # Start Backpropagation
+ [dback_bn_out, dGamma_out, dBeta_out] = batchnorm::backward(dout, conv_cache_mean, conv_cache_var, conv_out, Gamma_out, Fout, depth_dim_h, depth_dim_w, 1e-5)
+ [dconv_out, dConv_w, dConv_b] = conv2d::backward(dback_bn_out, depth_dim_h, depth_dim_w, multiplied_out, W_out, b_out, expansion_dim, depth_dim_h, depth_dim_w, 1, 1, 1, 1, 0, 0)
+
+ # multiply backward part1
+ dsqueeze_back = dconv_out * depth_act_out
+ dupsample_back = upsample::backward(dsqueeze_back, expansion_dim, 1, 1, depth_dim_h, depth_dim_w)
+
+ [dexcite_back, dW_excite, db_excite] = conv2d::backward(dupsample_back, 1, 1, squeeze_out, W_excite, b_excite, squeeze_dim, 1, 1, 1, 1, 1, 1, 0, 0)
+ [dsqueeze_back, dW_squeeze, db_squeeze] = conv2d::backward(dexcite_back, 1, 1, pooled_out, W_squeeze, b_squeeze, expansion_dim, 1, 1, 1, 1, 1, 1, 0, 0)
+ dpool_back = global_avg_pool::backward(dsqueeze_back, depth_act_out, expansion_dim, depth_dim_h, depth_dim_w)
+
+ #multiply part 2
+ dmult = dconv_out * upscaled_out
+ dsilu_back = dmult + dpool_back
+
+ # Act-BN-CONV
+ dsilu_back2 = silu::backward(dsilu_back, depth_bn_out)
+ [dback_bn_depth, dGamma_depth, dBeta_depth] = batchnorm::backward(dsilu_back2, depth_cache_mean, depth_cache_var, depth_out, Gamma_depth, expansion_dim, depth_dim_h, depth_dim_w, 1e-5)
+ [dconv_depth_out, dW_depth, db_depth] = depthwise::backward(dback_bn_depth, depth_dim_h, depth_dim_w, depthwise_in, W_depth, b_depth, Hin, Win, 1, filter_height, filter_width, strideh, stridew, padh, padw)
+
+ if (ExpansionFactor > 1) {
+ dsilu_back3 = silu::backward(dconv_depth_out, out_bn_expansion)
+ [dback_bn_expansion, dGamma_expansion, dBeta_expansion] = batchnorm::backward(dsilu_back3, cache_mean_expansion, cache_var_expansion, out_expansion, Gamma_expansion, expansion_dim, Hin, Win, 1e-5)
+ [dconv_expansion, dW_expansion, db_expansion] = conv2d::backward(dback_bn_expansion, Hin, Win, X, W_expansion, b_expansion, Fin, Hin, Win, 1, 1, 1, 1, 0, 0)
+ dX = dconv_expansion
+ }
+ else {
+ dX = dconv_depth_out
+ }
+ if (SkipConnection)
+ dX = dX + dout
+
+ if (ExpansionFactor > 1) {
+ gradients = list(dGamma_out, dBeta_out, dConv_w, dConv_b, dW_excite,
+ db_excite, dW_squeeze, db_squeeze, dGamma_depth, dBeta_depth, dW_depth,
+ db_depth, dGamma_expansion, dBeta_expansion, dW_expansion, db_expansion)
+ }
+ else {
+ gradients = list(dGamma_out, dBeta_out, dConv_w, dConv_b, dW_excite, db_excite,
+   dW_squeeze, db_squeeze, dGamma_depth, dBeta_depth, dW_depth, db_depth)
+ }
+}
+
+init = function(int Fin, int Fout, int filter_width, int filter_height,
+ int ExpansionFactor, double SqueezeFactor, int seed = -1)
+ return (list[unknown] mbconv_params)
+{
+ /*
+ * Initialize the parameters of this MBConv layer.
+ *
+ * Note: This is just a convenience function, and parameters
+ * may be initialized manually if needed.
+ *
+ * Inputs:
+ * - Fin: Number of filters incoming to the MBConv Block.
+ * - Fout: Number of filters this MBconv Block produces.
+ * - filter_width: Width of the depthwise convolution filter
+ * - filter_height: Height of the depthwise convolution filter
+ * - ExpansionFactor: Factor of expansion of the initial filters coming into this block
+ * - SqueezeFactor: Factor for the squeeze and excitation layer. This factor should be between 0 and 1
+ * - seed: The seed to initialize the weights
+ *
+ * Outputs:
+ * - mbconv_params: list of all 22 matrices needed for a complete mbconv layer
+ */
+
+ # Expansion
+ if (ExpansionFactor > 1) {
+ expansion_dim = Fin * ExpansionFactor
+ [W_expansion, b_expansion] = conv2d::init(expansion_dim, Fin, 1, 1, seed)
+ [Gamma_expansion, Beta_expansion, EmaMean_expansion, EmaVar_expansion] = batchnorm::init(expansion_dim)
+ }
+ else {
+ # Dummy variables so that the model list indices remain the same
+ W_expansion = matrix(0, 0, 0)
+ b_expansion = matrix(0, 0, 0)
+ Gamma_expansion = matrix(0, 0, 0)
+ Beta_expansion = matrix(0, 0, 0)
+ EmaMean_expansion = matrix(0, 0, 0)
+ EmaVar_expansion = matrix(0, 0, 0)
+ expansion_dim = Fin
+ }
+
+ [W_depth, b_depth] = depthwise::init(expansion_dim, 1, filter_width, filter_height)
+ [Gamma_depth, Beta_depth, EmaMean_depth, EmaVar_depth] = batchnorm::init(expansion_dim)
+ squeeze_dim = round(expansion_dim * SqueezeFactor)
+ [W_squeeze, b_squeeze] = conv2d::init(squeeze_dim, expansion_dim, 1, 1, seed)
+ [W_excite, b_excite] = conv2d::init(expansion_dim, squeeze_dim, 1, 1, seed)
+
+ [W_out, b_out] = conv2d::init(Fout, expansion_dim, 1, 1, seed)
+ [Gamma_out, Beta_out, EmaMean_out, EmaVar_out] = batchnorm::init(Fout)
+
+ mbconv_params = list(W_expansion, b_expansion, Gamma_expansion,
+ Beta_expansion, EmaMean_expansion, EmaVar_expansion, W_depth, b_depth,
+ Gamma_depth, Beta_depth, EmaMean_depth, EmaVar_depth, W_squeeze, b_squeeze,
+ W_excite, b_excite, W_out, b_out, Gamma_out, Beta_out, EmaMean_out, EmaVar_out)
+}
diff --git a/scripts/nn/layers/silu.dml b/scripts/nn/layers/silu.dml
new file mode 100644
index 0000000..037e5ae
--- /dev/null
+++ b/scripts/nn/layers/silu.dml
@@ -0,0 +1,57 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * SILU nonlinearity layer.
+ */
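+
+/*
+ * For reference: silu(x) = x * sigmoid(x) = x / (1 + exp(-x)), with derivative
+ * d/dx silu(x) = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x)),
+ * which is what forward() and backward() below compute element-wise.
+ */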
+
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a SILU nonlinearity layer.
+ *
+ * Performs an element-wise evaluation of `f(x) = x * sigmoid(x)`.
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - out: Outputs, of same shape as `X`.
+ */
+ out = X / (1+exp(-X))
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a SILU nonlinearity layer.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Previous input data matrix, of shape (any, any).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+
+ sig = 1 / (1+exp(-X))
+ dX = (sig + X * sig * (1 - sig)) * dout
+}