This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 26a2b35  [SYSTEMDS-3043] New EfficientNet builtin function, incl new layers
26a2b35 is described below

commit 26a2b3542452fefe959819c47cf007c12a777235
Author: Bene <[email protected]>
AuthorDate: Sun Jun 27 16:44:06 2021 +0200

    [SYSTEMDS-3043] New EfficientNet builtin function, incl new layers
    
    Implemented a minified version of EfficientNet-B0 with an additional
    mini test script on the MNIST data set.
    This includes a few new layer implementations:
    
        SILU nonlinearity Layer
        Adaptive Global Avg Pooling Layer
        Inverted Residual Mobile Layer (MBConv)
    
    We only used a single MBConv layer instead of the total of 16.
    The stem and top parts are, however, identical to EfficientNet-B0.
    
    AMLS project SS2021.
    Closes #1326.
---
 scripts/nn/examples/Example-EfficientNet.dml |  76 ++++++
 scripts/nn/examples/efficientNet.dml         | 337 +++++++++++++++++++++++++
 scripts/nn/layers/global_avg_pool2d.dml      |  89 +++++++
 scripts/nn/layers/mbconv.dml                 | 352 +++++++++++++++++++++++++++
 scripts/nn/layers/silu.dml                   |  57 +++++
 5 files changed, 911 insertions(+)

diff --git a/scripts/nn/examples/Example-EfficientNet.dml b/scripts/nn/examples/Example-EfficientNet.dml
new file mode 100644
index 0000000..e2fd5e2
--- /dev/null
+++ b/scripts/nn/examples/Example-EfficientNet.dml
@@ -0,0 +1,76 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+/*
+* The MNIST Data can be downloaded as follows:
+* mkdir -p data/mnist/
+* cd data/mnist/
+* curl -O https://pjreddie.com/media/files/mnist_train.csv
+* curl -O https://pjreddie.com/media/files/mnist_test.csv
+*/
+
+# TODO add tests in functions/builtin, applications/nn
+
+# This script trains a minified version of the EfficientNet-B0 model
+# with a single MBConv layer. The model heavily overfits on a simple
+# MNIST dataset since the architecture was originally designed for ImageNet,
+# so the layer sizes and other factors are much larger than normal MNIST requires.
+# Therefore we only train for a single epoch and print the resulting accuracy.
+# Import required methods
+source("nn/examples/efficientNet.dml") as eff
+
+# Read data (the MNIST test split is used in this example)
+data = read("data/mnist/mnist_test.csv", format="csv")
+N = nrow(data)
+
+# Extract images and labels
+images = data[,2:ncol(data)]
+labels = data[,1]
+
+# Scale images to [0,1], and one-hot encode the labels
+images = images / 255.0
+labels = table(seq(1, N), labels+1, N, 10)
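+# table(seq(1,N), labels+1, N, 10) maps digit labels 0-9 to columns 1-10 of an N x 10 indicator matrix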
+
+model = eff::initNetwork(1, 10, -1)
+
+# Train
+epochs = 1
+batch_size = 256
+model = eff::netTrain(model, images, 1, 28, 28, labels, epochs, batch_size, 0.025, 0.9, TRUE)
+
+
+# Predict in batches as well, since a full forward pass over all images can run into memory issues
+# (may be unnecessary on more powerful machines)
+iters = ceil(N / batch_size)
+accuracy = 0.0
+for(i in 1:iters) {
+  beg = ((i-1) * batch_size) %% N + 1
+  end = min(N, beg + batch_size - 1)
+  X_batch = images[beg:end,]
+  y_batch = labels[beg:end,]
+
+  pred = eff::netPredict(X_batch, model, 1, 28, 28)
+  partial_acc = mean(rowIndexMax(pred) == rowIndexMax(y_batch))
+  accuracy = accuracy + partial_acc
+}
+
+print("Total Accuracy: " + (accuracy / iters))
diff --git a/scripts/nn/examples/efficientNet.dml b/scripts/nn/examples/efficientNet.dml
new file mode 100644
index 0000000..012eb33
--- /dev/null
+++ b/scripts/nn/examples/efficientNet.dml
@@ -0,0 +1,337 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+# TODO move to builtin functions (needs fix for imports in builtin functions)
+# TODO scale up to real EfficientNet-B0
+
+# Trains a partial EfficientNet-B0 model.
+# This script trains the stem and top parts of EfficientNet-B0.
+# The original EfficientNet-B0 has the following layers:
+#----------------------------------------------------------------
+#    Layers                    Dimension      Filters       Nr Repeats
+#----------------------------------------------------------------
+# 1. Conv3x3                    224x224         32          1
+# 2. MBConv1, k3x3              112x112         16          1
+# 3. MBConv6, k3x3               56x 56         24          2
+# 4. MBConv6, k5x5               28x 28         40          2
+# 5. MBConv6, k3x3               14x 14         80          3
+# 6. MBConv6, k5x5               14x 14         112         3
+# 7. MBConv6, k5x5                7x  7         192         4
+# 8. MBConv6, k3x3                7x  7         320         1
+# 9. Conv1x1 & Pooling & FC       7x  7         1280        1
+#----------------------------------------------------------------
+# In this partial implementation we implement layers 1 and 2 and the prediction layer 9.
+# This init method is purely for convenience; there is no problem with a manual initialization
+# of weights and biases. To extend the current implementation to a full EfficientNet-B0, only
+# the intermediate MBConv layers need to be added. Both the stem and top part are already
+# complete, as is the first MBConv layer.
+# The number after MBConv is the corresponding ExpansionFactor, followed by the kernel size;
+# stride and padding can be calculated from the dimensions. If the layer is repeated,
+# the skip connection is activated, otherwise not.
+#----------------------------------------------------------------
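+#
+# A minimal usage sketch, mirroring nn/examples/Example-EfficientNet.dml (MNIST settings,
+# Cin=1, 10 output classes); intended only as orientation:
+#   source("nn/examples/efficientNet.dml") as eff
+#   model = eff::initNetwork(1, 10, -1)
+#   model = eff::netTrain(model, images, 1, 28, 28, labels, 1, 256, 0.025, 0.9, TRUE)
+#   probs = eff::netPredict(images, model, 1, 28, 28)
+#----------------------------------------------------------------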
+
+source("nn/layers/batch_norm2d.dml") as batchnorm
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/conv2d_depthwise.dml") as depthwise
+source("nn/layers/global_avg_pool2d.dml") as global_avg_pool
+source("nn/layers/silu.dml") as silu
+source("nn/layers/upsample2d.dml") as upsample
+source("nn/layers/mbconv.dml") as mbconv
+source("nn/layers/affine.dml") as affine
+source("nn/layers/softmax.dml") as softmax
+source("nn/optim/sgd.dml") as sgd
+source("nn/layers/cross_entropy_loss.dml") as cross_entropy_loss
+
+initNetwork = function(int InputChannels, int NumberOutputClasses, int seed)
+    return(list[unknown] model)
+{
+  /*
+   * Convenience function for initialization of all required weights and biases.
+   *
+   * Inputs:
+   *  - InputChannels: Number of Input Channels for the model (Cin)
+   *  - NumberOutputClasses: Number of classes for the network
+   *  - seed: seed for the random generation of the weights
+   *
+   * Outputs:
+   *  - model: A list containing the total of 36 matrices needed for the computation of the
+   *           Mini EfficientNet
+   */
+
+  # Layer 1
+  [CW_stem, Cb_stem] = conv2d::init(32, InputChannels, 3, 3, seed)
+  seed = ifelse(seed==-1, -1, seed + 1);
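+  # A seed of -1 keeps fully random initialization; otherwise the seed is advanced so each layer gets distinct weights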
+  [Gamma_stem, Beta_stem, EmaMean_stem, EmaVar_stem] = batchnorm::init(32)
+
+  # Layer 2
+  [mb_parameters] = mbconv::init(32, 16, 3, 3, 1, 0.25, seed)
+  seed = ifelse(seed==-1, -1, seed + 1);
+
+  # Layer 9
+  [CW_top, Cb_top] = conv2d::init(1280, 16, 1, 1, seed)
+  seed = ifelse(seed==-1, -1, seed + 1);
+  [Gamma_top, Beta_top, EmaMean_top, EmaVar_top] = batchnorm::init(1280)
+  [DW_top, Db_top] = affine::init(1280, NumberOutputClasses, seed)
+
+  model = list(CW_stem, Cb_stem, Gamma_stem, Beta_stem, EmaMean_stem, EmaVar_stem,
+      as.matrix(mb_parameters[1]),
+      as.matrix(mb_parameters[2]),
+      as.matrix(mb_parameters[3]),
+      as.matrix(mb_parameters[4]),
+      as.matrix(mb_parameters[5]),
+      as.matrix(mb_parameters[6]),
+      as.matrix(mb_parameters[7]),
+      as.matrix(mb_parameters[8]),
+      as.matrix(mb_parameters[9]),
+      as.matrix(mb_parameters[10]),
+      as.matrix(mb_parameters[11]),
+      as.matrix(mb_parameters[12]),
+      as.matrix(mb_parameters[13]),
+      as.matrix(mb_parameters[14]),
+      as.matrix(mb_parameters[15]),
+      as.matrix(mb_parameters[16]),
+      as.matrix(mb_parameters[17]),
+      as.matrix(mb_parameters[18]),
+      as.matrix(mb_parameters[19]),
+      as.matrix(mb_parameters[20]),
+      as.matrix(mb_parameters[21]),
+      as.matrix(mb_parameters[22]),
+      CW_top, Cb_top, Gamma_top, Beta_top, EmaMean_top, EmaVar_top, DW_top, Db_top)
+}
+
+
+netPredict = function(matrix[double] X, list[unknown] model, int Cin, int Hin, int Win)
+    return(matrix[double] pred)
+{
+  /*
+   * This function generates the prediction of the model for an input X
+   *
+   * Inputs:
+   *  - X: Input features of format (N, Cin * Hin * Win)
+   *  - model: the list of length 36 containing the matrices generated from the initNetwork function
+   *  - Cin: Number of input channels (dimensionality of depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *
+   * Outputs:
+   *  - pred: The output of the final softmax layer of the Mini Efficient-Net
+   */
+  CW_stem = as.matrix(model[1])
+  Cb_stem = as.matrix(model[2])
+  Gamma_stem = as.matrix(model[3])
+  Beta_stem = as.matrix(model[4])
+  EmaMean_stem = as.matrix(model[5])
+  EmaVar_stem = as.matrix(model[6])
+  MBConv_params = model[7:28]
+  CW_top = as.matrix(model[29])
+  Cb_top = as.matrix(model[30])
+  Gamma_top = as.matrix(model[31])
+  Beta_top = as.matrix(model[32])
+  EmaMean_top = as.matrix(model[33])
+  EmaVar_top = as.matrix(model[34])
+  DW_top = as.matrix(model[35])
+  Db_top = as.matrix(model[36])
+
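+  # Padding heuristic (assumed intent): pad by 1 for even inputs and 0 for odd ones, so the
+  # stride-2 3x3 stem convolution halves the spatial dimensions (rounding down for odd sizes)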
+  padh = (Hin + 1) %% 2
+  padw = (Win + 1) %% 2
+
+  [stem_out, stem_h, stem_w] = conv2d::forward(X, CW_stem, Cb_stem, Cin, Hin, Win, 3, 3, 2, 2, padh, padw)
+  [bn_stem_out, update_EmaMean_stem, update_EmaVar_stem, cache_EmaMean_stem, cache_EmaVar_stem] = batchnorm::forward(
+    stem_out, Gamma_stem, Beta_stem, 32, stem_h, stem_w, "train", EmaMean_stem, EmaVar_stem, 0.9, 1e-5)
+  silu_out = silu::forward(bn_stem_out)
+
+  [mbconv_out, intermediate_mbconv, mbconvbatchnorm_updates, mbconv_h, mbconv_w] = mbconv::forward(
+    silu_out, MBConv_params, 32, 16, stem_h, stem_w, 3, 3, 2, 2, padh, padw, FALSE, 1, "train", 0.25)
+
+  [top_out, outh, outw] = conv2d::forward(mbconv_out, CW_top, Cb_top, 16, mbconv_h, mbconv_w, 1, 1, 1, 1, 0, 0)
+  [bntop_out, update_EmaMean_top, update_EmaVar_top, cache_EmaMean_top, cache_EmaVar_top] = batchnorm::forward(
+    top_out, Gamma_top, Beta_top, 1280, outh, outw, "train", EmaMean_top, EmaVar_top, 0.9, 1e-5)
+  silu_out2 = silu::forward(bntop_out)
+  [pool_out, None, None] = global_avg_pool::forward(silu_out2, 1280, outh, outw)
+  dense_out = affine::forward(pool_out, DW_top, Db_top)
+  pred = softmax::forward(dense_out)
+}
+
+netTrain = function(list[unknown] model, matrix[double] X, int Cin, int Hin, int Win,
+  matrix[double] Y, int epochs, int batch_size, double learning_rate, double lr_decay, boolean verbose)
+  return(list[unknown] trained_model)
+{
+  /*
+   * This function trains the given model with an SGD optimizer, using the given
+   * batch_size, for a number of epochs.
+   *
+   * Inputs:
+   *  - model: the list of length 36 containing the matrices generated from the initNetwork function
+   *  - X: Input features of format (N, Cin * Hin * Win)
+   *  - Cin: Number of input channels (dimensionality of depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - Y: The true labels used for learning, one-hot encoded with shape (N, NumberOutputClasses)
+   *  - epochs: Number of epochs to train for
+   *  - batch_size: Size of batch used for a single update step
+   *  - learning_rate: Learning rate used for each SGD update step
+   *  - lr_decay: The learning rate is multiplied with lr_decay after each epoch.
+   *  - verbose: Whether the accuracy and the cross-entropy loss should be printed after each update step
+   *
+   * Outputs:
+   *  - trained_model: The new list of the updated 36 matrices
+   */
+  CW_stem = as.matrix(model[1])
+  Cb_stem = as.matrix(model[2])
+  Gamma_stem = as.matrix(model[3])
+  Beta_stem = as.matrix(model[4])
+  EmaMean_stem = as.matrix(model[5])
+  EmaVar_stem = as.matrix(model[6])
+  MBConv_params = model[7:28]
+  CW_top = as.matrix(model[29])
+  Cb_top = as.matrix(model[30])
+  Gamma_top = as.matrix(model[31])
+  Beta_top = as.matrix(model[32])
+  EmaMean_top = as.matrix(model[33])
+  EmaVar_top = as.matrix(model[34])
+  DW_top = as.matrix(model[35])
+  Db_top = as.matrix(model[36])
+
+  padh = (Hin + 1) %% 2
+  padw = (Win + 1) %% 2
+
+  N = nrow(X)
+  lr = learning_rate
+
+  # Optimize
+  iters = ceil(N / batch_size)
+  for (e in 1:epochs) {
+    for(i in 1:iters) {
+      # Get next batch
+      beg = ((i-1) * batch_size) %% N + 1
+      end = min(N, beg + batch_size - 1)
+      X_batch = X[beg:end,]
+      y_batch = Y[beg:end,]
+
+      # Compute forward pass
+      [stem_out, stem_h, stem_w] = conv2d::forward(X_batch, CW_stem, Cb_stem, Cin, Hin, Win, 3, 3, 2, 2, padh, padw)
+      [bn_stem_out, update_EmaMean_stem, update_EmaVar_stem, cache_EmaMean_stem, cache_EmaVar_stem] = batchnorm::forward(stem_out, Gamma_stem, Beta_stem, 32, stem_h, stem_w, "train", EmaMean_stem, EmaVar_stem, 0.9, 1e-5)
+      silu_out = silu::forward(bn_stem_out)
+
+      [mbconv_out, intermediate_mbconv, mbconvbatchnorm_updates, mbconv_h, mbconv_w] = mbconv::forward(silu_out, MBConv_params, 32, 16, stem_h, stem_w, 3, 3, 2, 2, padh, padw, FALSE, 1, "train", 0.25)
+
+      [top_out, outh, outw] = conv2d::forward(mbconv_out, CW_top, Cb_top, 16, mbconv_h, mbconv_w, 1, 1, 1, 1, 0, 0)
+      [bntop_out, update_EmaMean_top, update_EmaVar_top, cache_EmaMean_top, cache_EmaVar_top] = batchnorm::forward(top_out, Gamma_top, Beta_top, 1280, outh, outw, "train", EmaMean_top, EmaVar_top, 0.9, 1e-5)
+      silu_out2 = silu::forward(bntop_out)
+      [pool_out, None, None] = global_avg_pool::forward(silu_out2, 1280, outh, outw)
+      dense_out = affine::forward(pool_out, DW_top, Db_top)
+      pred = softmax::forward(dense_out)
+
+      # Compute loss & accuracy for training
+      loss = cross_entropy_loss::forward(pred, y_batch)
+      if(verbose) {
+        accuracy = mean(rowIndexMax(pred) == rowIndexMax(y_batch))
+        print("Epoch: " + e + ", Iter: " + i + ", Train Loss: " + loss + ", 
Train Accuracy: " + accuracy)
+      }
+
+      # Compute backward pass
+      ## loss:
+      dprobs = cross_entropy_loss::backward(pred, y_batch)
+
+      ## TOP
+      d_softmax = softmax::backward(dprobs, dense_out)
+      [d_dense_back, dDenseW_top, dDenseb_top] = affine::backward(d_softmax, pool_out, DW_top, Db_top)
+      d_pool_back = global_avg_pool::backward(d_dense_back, silu_out2, 1280, outh, outw)
+      d_silu2_back = silu::backward(d_pool_back, bntop_out)
+      [d_bntop_back, dGamma_top, dBeta_top] = batchnorm::backward(d_silu2_back, cache_EmaMean_top, cache_EmaVar_top, top_out, Gamma_top, 1280, outh, outw, 1e-5)
+      [dtop_back, d_ConvW_top, d_Convb_top] = conv2d::backward(d_bntop_back, outh, outw, mbconv_out, CW_top, Cb_top, 16, mbconv_h, mbconv_w, 1, 1, 1, 1, 0, 0)
+
+      ## MBCONV
+      [d_mbconv_back, mbconv_gradients] = mbconv::backward(dtop_back, silu_out, MBConv_params, intermediate_mbconv, mbconvbatchnorm_updates, 32, 16, stem_h, stem_w, 3, 3, 2, 2, padh, padw, FALSE, 1, "train", 0.25)
+
+      ## STEM
+      d_silu_back = silu::backward(d_mbconv_back, bn_stem_out)
+      [d_bn_stem_back, dGamma_stem, dBeta_stem] = batchnorm::backward(d_silu_back, cache_EmaMean_stem, cache_EmaVar_stem, stem_out, Gamma_stem, 32, stem_h, stem_w, 1e-5)
+      [dconv_back, dW_stem, db_stem] = conv2d::backward(d_bn_stem_back, stem_h, stem_w, X_batch, CW_stem, Cb_stem, Cin, Hin, Win, 3, 3, 2, 2, padh, padw)
+
+      # Optimize with SGD
+      # Update Stem
+      CW_stem = sgd::update(CW_stem, dW_stem, lr)
+      Cb_stem = sgd::update(Cb_stem, db_stem, lr)
+      Gamma_stem = sgd::update(Gamma_stem, dGamma_stem, lr)
+      Beta_stem = sgd::update(Beta_stem, dBeta_stem, lr)
+      EmaMean_stem = update_EmaMean_stem
+      EmaVar_stem = update_EmaVar_stem
+
+      # Update MBConv
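+      # Note: mbconv::backward returns its gradients ordered from the output phase back to the
+      # depthwise/expansion phase, hence the index mapping below differs from the parameter list order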
+      update_depth_W = sgd::update(as.matrix(MBConv_params[7]), as.matrix(mbconv_gradients[11]), lr)
+      update_depth_b = sgd::update(as.matrix(MBConv_params[8]), as.matrix(mbconv_gradients[12]), lr)
+      update_gamma_depth = sgd::update(as.matrix(MBConv_params[9]), as.matrix(mbconv_gradients[9]), lr)
+      update_beta_depth = sgd::update(as.matrix(MBConv_params[10]), as.matrix(mbconv_gradients[10]), lr)
+      update_ema_mean_depth = as.matrix(mbconvbatchnorm_updates[5])
+      update_ema_var_depth = as.matrix(mbconvbatchnorm_updates[6])
+      update_squeeze_W = sgd::update(as.matrix(MBConv_params[13]), as.matrix(mbconv_gradients[7]), lr)
+      update_squeeze_b = sgd::update(as.matrix(MBConv_params[14]), as.matrix(mbconv_gradients[8]), lr)
+      update_excite_W = sgd::update(as.matrix(MBConv_params[15]), as.matrix(mbconv_gradients[5]), lr)
+      update_excite_b = sgd::update(as.matrix(MBConv_params[16]), as.matrix(mbconv_gradients[6]), lr)
+      update_out_W = sgd::update(as.matrix(MBConv_params[17]), as.matrix(mbconv_gradients[3]), lr)
+      update_out_b = sgd::update(as.matrix(MBConv_params[18]), as.matrix(mbconv_gradients[4]), lr)
+      update_out_gamma = sgd::update(as.matrix(MBConv_params[19]), as.matrix(mbconv_gradients[1]), lr)
+      update_out_beta = sgd::update(as.matrix(MBConv_params[20]), as.matrix(mbconv_gradients[2]), lr)
+      update_ema_mean_out = as.matrix(mbconvbatchnorm_updates[9])
+      update_ema_var_out = as.matrix(mbconvbatchnorm_updates[10])
+
+      MBConv_params = list(
+        as.matrix(model[7]), as.matrix(model[8]),
+        as.matrix(model[9]), as.matrix(model[10]),
+        as.matrix(model[11]), as.matrix(model[12]),
+        update_depth_W, update_depth_b,
+        update_gamma_depth, update_beta_depth,
+        update_ema_mean_depth, update_ema_var_depth,
+        update_squeeze_W, update_squeeze_b,
+        update_excite_W, update_excite_b,
+        update_out_W, update_out_b,
+        update_out_gamma, update_out_beta,
+        update_ema_mean_out, update_ema_var_out)
+
+      # Update Top
+      CW_top = sgd::update(CW_top, d_ConvW_top, lr)
+      Cb_top = sgd::update(Cb_top, d_Convb_top, lr)
+      Gamma_top = sgd::update(Gamma_top, dGamma_top, lr)
+      Beta_top = sgd::update(Beta_top, dBeta_top, lr)
+      EmaMean_top = update_EmaMean_top
+      EmaVar_top = update_EmaVar_top
+      DW_top = sgd::update(DW_top, dDenseW_top, lr)
+      Db_top = sgd::update(Db_top, dDenseb_top, lr)
+    }
+    # Decay learning rate
+    lr = lr * lr_decay
+  }
+
+  # Pack everything into model format
+  trained_model = list(CW_stem, Cb_stem, Gamma_stem, Beta_stem, EmaMean_stem, EmaVar_stem,
+    as.matrix(MBConv_params[1]), as.matrix(MBConv_params[2]),
+    as.matrix(MBConv_params[3]), as.matrix(MBConv_params[4]),
+    as.matrix(MBConv_params[5]), as.matrix(MBConv_params[6]),
+    as.matrix(MBConv_params[7]), as.matrix(MBConv_params[8]),
+    as.matrix(MBConv_params[9]), as.matrix(MBConv_params[10]),
+    as.matrix(MBConv_params[11]), as.matrix(MBConv_params[12]),
+    as.matrix(MBConv_params[13]), as.matrix(MBConv_params[14]),
+    as.matrix(MBConv_params[15]), as.matrix(MBConv_params[16]),
+    as.matrix(MBConv_params[17]), as.matrix(MBConv_params[18]),
+    as.matrix(MBConv_params[19]), as.matrix(MBConv_params[20]),
+    as.matrix(MBConv_params[21]), as.matrix(MBConv_params[22]),
+    CW_top, Cb_top, Gamma_top, Beta_top, EmaMean_top, EmaVar_top, DW_top, Db_top)
+}
diff --git a/scripts/nn/layers/global_avg_pool2d.dml b/scripts/nn/layers/global_avg_pool2d.dml
new file mode 100644
index 0000000..1a31f62
--- /dev/null
+++ b/scripts/nn/layers/global_avg_pool2d.dml
@@ -0,0 +1,89 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Global Average Pooling 2D layer.
+ *
+ * This implementation uses a built-in operator for higher performance.
+ */
+
+forward = function(matrix[double] X, int C, int Hin, int Win)
+    return (matrix[double] out, int Hout, int Wout) {
+  /*
+   * Computes the forward pass for a 2D Global average pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector. The output dimension for Hout and Wout is always 1.
+   *
+   * This implementation uses a built-in operator for higher
+   * performance.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *
+   * Outputs:
+   *  - out: Outputs, of shape (N, C*Hout*Wout).
+   *  - Hout: Output height.
+   *  - Wout: Output width.
+   */
+
+  # Average pooling over the full Hin x Win window - built-in implementation
+  N = nrow(X)
+  Hout = 1
+  Wout = 1
+  out = avg_pool(X, input_shape=[N,C,Hin,Win],
+    pool_size=[Hin,Win], stride=[1,1], padding=[0, 0])
+}
+
+backward = function(matrix[double] dout, matrix[double] X, int C, int Hin, int Win)
+  return (matrix[double] dX)
+{
+  /*
+   * Computes the backward pass for a 2D global average pooling layer.
+   * The input data has N examples, each represented as a 3D volume
+   * unrolled into a single vector.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of
+   *      shape (N, C*Hout*Wout).
+   *  - X: Inputs, of shape (N, C*Hin*Win).
+   *  - C: Number of input channels (dimensionality of input depth).
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of shape (N, C*Hin*Win).
+   */
+  N = nrow(X)
+  # Gradient of average pooling
+  dX = avg_pool_backward(X, dout, input_shape=[N,C,Hin,Win],
+    pool_size=[Hin,Win], stride=[1, 1], padding=[0, 0])
+}
diff --git a/scripts/nn/layers/mbconv.dml b/scripts/nn/layers/mbconv.dml
new file mode 100644
index 0000000..ebede05
--- /dev/null
+++ b/scripts/nn/layers/mbconv.dml
@@ -0,0 +1,352 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ *  Implementation of an MBConv layer (inverted residual layer)
+ *
+ *  1.) Expansion Phase (1x1 Convolution) & BN
+ *  2.) Depthwise Convolution
+ *  3.) BatchNorm
+ *  4.) SILU Activation
+ *  5.) Global Avg Pooling
+ *  6.) Squeeze and Excitation phase
+ *  7.) Output Phase (1x1 Convolution)
+ *  8.) BatchNorm
+ *  9.) Optional Skip Add layer
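+ *
+ *  The expansion phase (1) is only applied when ExpansionFactor > 1, and the skip add (9)
+ *  only when SkipConnection is TRUE, which requires matching input and output shapes.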
+ */
+
+
+source("nn/layers/batch_norm2d.dml") as batchnorm
+source("nn/layers/conv2d_builtin.dml") as conv2d
+source("nn/layers/conv2d_depthwise.dml") as depthwise
+source("nn/layers/global_avg_pool2d.dml") as global_avg_pool
+source("nn/layers/silu.dml") as silu
+source("nn/layers/upsample2d.dml") as upsample
+
+
+forward = function(matrix[double] X, list[unknown] model, int Fin, int Fout, int Hin,
+  int Win, int filter_width, int filter_height, int strideh, int stridew, int padh, int padw,
+  boolean SkipConnection, int ExpansionFactor, string BNMode, double squeeze_factor)
+  return (matrix[double] layer_out, list[unknown] intermediate_outputs, list[unknown]
+    batchnorm_updates, int Hout, int Wout)
+{
+  /*
+   * Computes the forward pass for an MBConv layer.
+   *
+   * Inputs:
+   *  - X: Previous input data matrix, of shape (N, Fin * Hin * Win).
+   *  - model: list of all 22 matrices needed for a complete mbconv layer
+   *  - Fin: Number of filters incoming to the MBConv Block.
+   *  - Fout: Number of filters this MBconv Block produces.
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - filter_width: Width of the depthwise convolution filter
+   *  - filter_height: Height of the depthwise convolution filter
+   *  - strideh: stride of the depthwise convolution in height
+   *  - stridew: stride of the depthwise convolution in width
+   *  - padh: padding of the depthwise convolution in height
+   *  - padw: padding of the depthwise convolution in width
+   *  - SkipConnection: Whether the skip connection is used or not. For this to work, the filter counts Fin and Fout
+   *                    as well as the output dimensions and Hin and Win must be the same.
+   *  - ExpansionFactor: Factor of expansion of the initial Filters coming into this block
+   *  - BNMode: BatchNorm mode used; must be either "train" or "test"
+   *  - squeeze_factor: Factor for the squeeze and excitation layer. This factor should be between 0 and 1
+
+   * Outputs:
+   *  - layer_out: Output of the MBConv Layer
+   *  - intermediate_outputs: list of outputs of intermediate layers generated by the forward pass
+   *  - batchnorm_updates: list of updates of batchnorm layers generated by the forward pass
+   *  - Hout: Height of the output
+   *  - Wout: Width of the output
+   */
+  # Unpack parameter list
+  W_expansion = as.matrix(model[1])
+  b_expansion = as.matrix(model[2])
+  Gamma_expansion = as.matrix(model[3])
+  Beta_expansion = as.matrix(model[4])
+  EmaMean_expansion = as.matrix(model[5])
+  EmaVar_expansion = as.matrix(model[6])
+
+  W_depth = as.matrix(model[7])
+  b_depth = as.matrix(model[8])
+  Gamma_depth = as.matrix(model[9])
+  Beta_depth = as.matrix(model[10])
+  EmaMean_depth = as.matrix(model[11])
+  EmaVar_depth = as.matrix(model[12])
+  W_squeeze = as.matrix(model[13])
+  b_squeeze = as.matrix(model[14])
+  W_excite = as.matrix(model[15])
+  b_excite = as.matrix(model[16])
+
+  W_out = as.matrix(model[17])
+  b_out = as.matrix(model[18])
+  Gamma_out = as.matrix(model[19])
+  Beta_out = as.matrix(model[20])
+  EmaMean_out = as.matrix(model[21])
+  EmaVar_out = as.matrix(model[22])
+
+  # Either produce expanded input or use identity
+  if (ExpansionFactor > 1) {
+    filter_expansion = Fin * ExpansionFactor
+    [out_expansion, dim_h_exp, dim_w_exp] = conv2d::forward(X, W_expansion, b_expansion, Fin, Hin, Win, 1, 1, 1, 1, 0, 0)
+    [out_bn_expansion, bn_ema_mean_expansion, bn_ema_var_expansion, cache_mean_expansion, cache_var_expansion] = batchnorm::forward(out_expansion, Gamma_expansion, Beta_expansion, filter_expansion, Hin, Win, BNMode, EmaMean_expansion, EmaVar_expansion, 0.9, 1e-5)
+    depthwise_in = silu::forward(out_bn_expansion)
+  }
+  else {
+    # dummy variables so that indexing remains constant
+    out_expansion = matrix(0, 0, 0)
+    out_bn_expansion = matrix(0, 0, 0)
+    bn_ema_mean_expansion = matrix(0, 0, 0)
+    bn_ema_var_expansion = matrix(0, 0, 0)
+    cache_mean_expansion = matrix(0, 0, 0)
+    cache_var_expansion = matrix(0, 0, 0)
+
+    filter_expansion = Fin
+    depthwise_in = X
+  }
+
+  [depth_out, depth_dim_h, depth_dim_w] = depthwise::forward(depthwise_in, W_depth, b_depth, Hin, Win, 1, filter_height, filter_width, strideh, stridew, padh, padw)
+  [depth_bn_out, depth_bn_mean, depth_bn_var, depth_cache_mean, depth_cache_var] =
+    batchnorm::forward(depth_out, Gamma_depth, Beta_depth, filter_expansion, depth_dim_h, depth_dim_w, BNMode, EmaMean_depth, EmaVar_depth, 0.9, 1e-5)
+  depth_act_out = silu::forward(depth_bn_out)
+
+
+  # Squeeze and Excitation
+  squeeze_dim = round(filter_expansion * squeeze_factor)
+  [pooled_out, pool_h, pool_w] = global_avg_pool::forward(depth_act_out, filter_expansion, depth_dim_h, depth_dim_w)
+  [squeeze_out, dim_squeeze_h, dim_squeeze_w] = conv2d::forward(pooled_out, W_squeeze, b_squeeze, filter_expansion, pool_h, pool_w, 1, 1, 1, 1, 0, 0)
+  [expand_out, dim_squeeze_h, dim_squeeze_w] = conv2d::forward(squeeze_out, W_excite, b_excite, squeeze_dim, dim_squeeze_h, dim_squeeze_w, 1, 1, 1, 1, 0, 0)
+  upscaled_out = upsample::forward(expand_out, filter_expansion, dim_squeeze_h, dim_squeeze_w, depth_dim_h, depth_dim_w)
+  multiplied_out = depth_act_out * upscaled_out
+
+  # Output Layer
+  [conv_out, conv_dim_h, conv_dim_w] = conv2d::forward(multiplied_out, W_out, b_out, filter_expansion, depth_dim_h, depth_dim_w, 1, 1, 1, 1, 0, 0)
+  [conv_bn_out, conv_bn_mean, conv_bn_var, conv_cache_mean, conv_cache_var] =
+    batchnorm::forward(conv_out, Gamma_out, Beta_out, Fout, depth_dim_h, depth_dim_w, BNMode, EmaMean_out, EmaVar_out, 0.9, 1e-5)
+
+  if (SkipConnection)
+    layer_out = conv_bn_out + X
+  else
+    layer_out = conv_bn_out
+
+  Hout = conv_dim_h
+  Wout = conv_dim_w
+
+  intermediate_outputs = list(out_expansion, out_bn_expansion, depthwise_in,
+    depth_out, depth_bn_out, depth_act_out, pooled_out, squeeze_out, expand_out,
+    upscaled_out, multiplied_out, conv_out, conv_bn_out, layer_out)
+  batchnorm_updates = list(bn_ema_mean_expansion, bn_ema_var_expansion,
+    cache_mean_expansion, cache_var_expansion, depth_bn_mean, depth_bn_var, depth_cache_mean,
+    depth_cache_var, conv_bn_mean, conv_bn_var, conv_cache_mean, conv_cache_var)
+}
+
+backward = function(matrix[double] dout, matrix[double] X, list[unknown] model, list[unknown] intermediate_outputs,
+  list[unknown] batchnorm_updates, int Fin, int Fout, int Hin, int Win, int filter_width, int filter_height, int strideh,
+  int stridew, int padh, int padw, boolean SkipConnection, int ExpansionFactor, string BNMode, double squeeze_factor)
+  return (matrix[double] dX, list[unknown] gradients)
+{
+  /*
+   * Computes the backward pass for a MBConv layer.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+   *  - X: Previous input data matrix, of shape (N, Fin * Hin * Win).
+   *  - model: list of all 22 matrices needed for a complete mbconv layer
+   *  - intermediate_outputs: list of outputs of intermediate layers generated by the forward pass
+   *  - batchnorm_updates: list of updates of batchnorm layers generated by the forward pass
+   *  - Fin: Number of filters incoming to the MBConv Block.
+   *  - Fout: Number of filters this MBconv Block produces.
+   *  - Hin: Input height.
+   *  - Win: Input width.
+   *  - filter_width: Width of the depthwise convolution filter
+   *  - filter_height: Height of the depthwise convolution filter
+   *  - strideh: stride of the depthwise convolution in height
+   *  - stridew: stride of the depthwise convolution in width
+   *  - padh: padding of the depthwise convolution in height
+   *  - padw: padding of the depthwise convolution in width
+   *  - SkipConnection: Whether the skip connection is used or not. For this to work, the filter counts Fin and Fout
+   *                    as well as the output dimensions and Hin and Win must be the same.
+   *  - ExpansionFactor: Factor of expansion of the initial Filters coming into this block
+   *  - BNMode: BatchNorm mode used; must be either "train" or "test"
+   *  - squeeze_factor: Factor for the squeeze and excitation layer. This factor should be between 0 and 1
+
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
+   *  - gradients: list containing all the gradients for the parameter updates.
+   */
+
+  # Unpack parameter list
+  W_expansion = as.matrix(model[1])
+  b_expansion = as.matrix(model[2])
+  Gamma_expansion = as.matrix(model[3])
+  Beta_expansion = as.matrix(model[4])
+  EmaMean_expansion = as.matrix(model[5])
+  EmaVar_expansion = as.matrix(model[6])
+
+  W_depth = as.matrix(model[7])
+  b_depth = as.matrix(model[8])
+  Gamma_depth = as.matrix(model[9])
+  Beta_depth = as.matrix(model[10])
+  EmaMean_depth = as.matrix(model[11])
+  EmaVar_depth = as.matrix(model[12])
+  W_squeeze = as.matrix(model[13])
+  b_squeeze = as.matrix(model[14])
+  W_excite = as.matrix(model[15])
+  b_excite = as.matrix(model[16])
+
+  W_out = as.matrix(model[17])
+  b_out = as.matrix(model[18])
+  Gamma_out = as.matrix(model[19])
+  Beta_out = as.matrix(model[20])
+  EmaMean_out = as.matrix(model[21])
+  EmaVar_out = as.matrix(model[22])
+
+  # Unpack BN caches
+  cache_mean_expansion = as.matrix(batchnorm_updates[3])
+  cache_var_expansion = as.matrix(batchnorm_updates[4])
+  depth_cache_mean = as.matrix(batchnorm_updates[7])
+  depth_cache_var = as.matrix(batchnorm_updates[8])
+  conv_cache_mean = as.matrix(batchnorm_updates[11])
+  conv_cache_var = as.matrix(batchnorm_updates[12])
+
+  # Unpack Intermediate Outputs
+  out_expansion = as.matrix(intermediate_outputs[1])
+  out_bn_expansion = as.matrix(intermediate_outputs[2])
+  depthwise_in = as.matrix(intermediate_outputs[3])
+  depth_out = as.matrix(intermediate_outputs[4])
+  depth_bn_out = as.matrix(intermediate_outputs[5])
+  depth_act_out = as.matrix(intermediate_outputs[6])
+  pooled_out = as.matrix(intermediate_outputs[7])
+  squeeze_out = as.matrix(intermediate_outputs[8])
+  expand_out = as.matrix(intermediate_outputs[9])
+  upscaled_out = as.matrix(intermediate_outputs[10])
+  multiplied_out = as.matrix(intermediate_outputs[11])
+  conv_out = as.matrix(intermediate_outputs[12])
+  conv_bn_out = as.matrix(intermediate_outputs[13])
+
+  # Calculate Dimension of filters
+  if (ExpansionFactor > 1)
+    expansion_dim = Fin * ExpansionFactor
+  else
+    expansion_dim = Fin
+
+  squeeze_dim = round(expansion_dim * squeeze_factor)
+  depth_dim_h = as.integer(floor((Hin + 2*padh - filter_height)/strideh + 1))
+  depth_dim_w = as.integer(floor((Win + 2*padw - filter_width)/stridew + 1))
+
+  # Start Backpropagation
+  [dback_bn_out, dGamma_out, dBeta_out] = batchnorm::backward(dout, conv_cache_mean, conv_cache_var, conv_out, Gamma_out, Fout, depth_dim_h, depth_dim_w, 1e-5)
+  [dconv_out, dConv_w, dConv_b] = conv2d::backward(dback_bn_out, depth_dim_h, depth_dim_w, multiplied_out, W_out, b_out, expansion_dim, depth_dim_h, depth_dim_w, 1, 1, 1, 1, 0, 0)
+
+  # Multiply backward part 1
+  dsqueeze_back = dconv_out * depth_act_out
+  dupsample_back = upsample::backward(dsqueeze_back, expansion_dim, 1, 1, depth_dim_h, depth_dim_w)
+
+  [dexcite_back, dW_excite, db_excite] = conv2d::backward(dupsample_back, 1, 1, squeeze_out, W_excite, b_excite, squeeze_dim, 1, 1, 1, 1, 1, 1, 0, 0)
+  [dsqueeze_back, dW_squeeze, db_squeeze] = conv2d::backward(dexcite_back, 1, 1, pooled_out, W_squeeze, b_squeeze, expansion_dim, 1, 1, 1, 1, 1, 1, 0, 0)
+  dpool_back = global_avg_pool::backward(dsqueeze_back, depth_act_out, expansion_dim, depth_dim_h, depth_dim_w)
+
+  # Multiply backward part 2
+  dmult = dconv_out * upscaled_out
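+  # The elementwise product depth_act_out * upscaled_out routes the incoming gradient along two paths:
+  # dmult is the direct term wrt depth_act_out, dpool_back the contribution coming back through the SE branch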
+  dsilu_back = dmult + dpool_back
+
+  # Act-BN-CONV
+  dsilu_back2 = silu::backward(dsilu_back, depth_bn_out)
+  [dback_bn_depth, dGamma_depth, dBeta_depth] = batchnorm::backward(dsilu_back2, depth_cache_mean, depth_cache_var, depth_out, Gamma_depth, expansion_dim, depth_dim_h, depth_dim_w, 1e-5)
+  [dconv_depth_out, dW_depth, db_depth] = depthwise::backward(dback_bn_depth, depth_dim_h, depth_dim_w, depthwise_in, W_depth, b_depth, Hin, Win, 1, filter_height, filter_width, strideh, stridew, padh, padw)
+
+  if (ExpansionFactor > 1) {
+    dsilu_back3 = silu::backward(dconv_depth_out, out_bn_expansion)
+    [dback_bn_expansion, dGamma_expansion, dBeta_expansion] = batchnorm::backward(dsilu_back3, cache_mean_expansion, cache_var_expansion, out_expansion, Gamma_expansion, expansion_dim, Hin, Win, 1e-5)
+    [dconv_expansion, dW_expansion, db_expansion] = conv2d::backward(dback_bn_expansion, Hin, Win, X, W_expansion, b_expansion, Fin, Hin, Win, 1, 1, 1, 1, 0, 0)
+    dX = dconv_expansion
+  }
+  else {
+    dX = dconv_depth_out
+  }
+  if (SkipConnection)
+    dX = dX + dout
+
+  if (ExpansionFactor > 1) {
+    gradients = list(dGamma_out, dBeta_out, dConv_w, dConv_b, dW_excite,
+      db_excite, dW_squeeze, db_squeeze, dGamma_depth, dBeta_depth, dW_depth,
+      db_depth, dGamma_expansion, dBeta_expansion, dW_expansion, db_expansion)
+  }
+  else {
+    gradients = list(dGamma_out, dBeta_out, dConv_w, dConv_b, dW_excite, db_excite,
+      dW_squeeze, db_squeeze, dGamma_depth, dBeta_depth, dW_depth, db_depth)
+  }
+}
+
+init = function(int Fin, int Fout, int filter_width, int filter_height,
+  int ExpansionFactor, double SqueezeFactor, int seed = -1)
+  return (list[unknown] mbconv_params) 
+{
+  /*
+   * Initialize the parameters of this MBConv layer.
+   *
+   * Note: This is just a convenience function, and parameters
+   * may be initialized manually if needed.
+   *
+   * Inputs:
+   *  - Fin: Number of filters incoming to the MBConv Block.
+   *  - Fout: Number of filters this MBconv Block produces.
+   *  - filter_width: Width of the depthwise convolution filter
+   *  - filter_height: Height of the depthwise convolution filter
+   *  - ExpansionFactor: Factor of expansion of the initial Filters coming into this block
+   *  - SqueezeFactor: Factor for the squeeze and excitation layer. This factor should be between 0 and 1
+   *  - seed: The seed to initialize the weights
+   *
+   * Outputs:
+   *  - mbconv_params: list of all 22 matrices needed for a complete mbconv layer
+   */
+
+  # Expansion
+  if (ExpansionFactor > 1) {
+    expansion_dim = Fin * ExpansionFactor
+    [W_expansion, b_expansion] = conv2d::init(expansion_dim, Fin, 1, 1, seed)
+    [Gamma_expansion, Beta_expansion, EmaMean_expansion, EmaVar_expansion] = batchnorm::init(expansion_dim)
+  }
+  else {
+    # Dummy variables so that the model list indices remain the same
+    W_expansion = matrix(0, 0, 0)
+    b_expansion = matrix(0, 0, 0)
+    Gamma_expansion = matrix(0, 0, 0)
+    Beta_expansion = matrix(0, 0, 0)
+    EmaMean_expansion = matrix(0, 0, 0)
+    EmaVar_expansion = matrix(0, 0, 0)
+    expansion_dim = Fin
+  }
+
+  [W_depth, b_depth] = depthwise::init(expansion_dim, 1, filter_width, filter_height)
+  [Gamma_depth, Beta_depth, EmaMean_depth, EmaVar_depth] = batchnorm::init(expansion_dim)
+  squeeze_dim = round(expansion_dim * SqueezeFactor)
+  [W_squeeze, b_squeeze] = conv2d::init(squeeze_dim, expansion_dim, 1, 1, seed)
+  [W_excite, b_excite] = conv2d::init(expansion_dim, squeeze_dim, 1, 1, seed)
+
+  [W_out, b_out] = conv2d::init(Fout, expansion_dim, 1, 1, seed)
+  [Gamma_out, Beta_out, EmaMean_out, EmaVar_out] = batchnorm::init(Fout)
+
+  mbconv_params = list(W_expansion, b_expansion, Gamma_expansion,
+    Beta_expansion, EmaMean_expansion, EmaVar_expansion, W_depth, b_depth,
+    Gamma_depth, Beta_depth, EmaMean_depth, EmaVar_depth, W_squeeze, b_squeeze,
+    W_excite, b_excite, W_out, b_out, Gamma_out, Beta_out, EmaMean_out, EmaVar_out)
+}
diff --git a/scripts/nn/layers/silu.dml b/scripts/nn/layers/silu.dml
new file mode 100644
index 0000000..037e5ae
--- /dev/null
+++ b/scripts/nn/layers/silu.dml
@@ -0,0 +1,57 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * SILU nonlinearity layer.
+ */
+
+forward = function(matrix[double] X)
+    return (matrix[double] out) {
+  /*
+   * Computes the forward pass for a SILU nonlinearity layer.
+   *
+   * Performs an element-wise evaluation of `f(x) = x * sigmoid(x)`.
+   *
+   * Inputs:
+   *  - X: Inputs, of shape (any, any).
+   *
+   * Outputs:
+   *  - out: Outputs, of same shape as `X`.
+   */
+  out = X / (1+exp(-X))
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+    return (matrix[double] dX) {
+  /*
+   * Computes the backward pass for a SILU nonlinearity layer.
+   *
+   * Inputs:
+   *  - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+   *  - X: Previous input data matrix, of shape (any, any).
+   *
+   * Outputs:
+   *  - dX: Gradient wrt `X`, of same shape as `X`.
+   */
+
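+  # Derivative of SiLU: d/dx [x * sigmoid(x)] = sigmoid(x) + x * sigmoid(x) * (1 - sigmoid(x))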
+  sig = 1 / (1+exp(-X))
+  dX = (sig + X * sig * (1 - sig)) * dout
+}
