This is an automated email from the ASF dual-hosted git repository.
arnabp20 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/main by this push:
new f8522a7d5c [SYSTEMDS-3821] Add GELU Activation Function (Approximation)
f8522a7d5c is described below
commit f8522a7d5cf4c4839873bc3dddd2330dcc64f763
Author: MaximilianSchreff <[email protected]>
AuthorDate: Thu Jan 23 11:28:01 2025 +0100
[SYSTEMDS-3821] Add GELU Activation Function (Approximation)
This patch introduces the Gaussian Error Linear Unit (GELU) activation
function to SystemDS as a built-in operation. The implementation uses the
widely adopted approximate formulation (https://arxiv.org/abs/1606.08415).
This patch is part of a series of commits to support popular Transformer
architectures in SystemDS. GELU is one of the most commonly used
activation functions in models like BERT and GPT.
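For illustration, a minimal DML usage sketch of the new layer (the input
and the expected value mirror the component test added in this patch;
the exact script layout here is only an assumption):

    source("nn/layers/gelu.dml") as gelu

    # 2x2 input and an all-ones upstream gradient
    X = matrix("1. -0.5
                0.  2.", rows=2, cols=2)
    dout = matrix(1, rows=2, cols=2)

    out = gelu::forward(X)        # element-wise GELU via the tanh approximation
    dX = gelu::backward(dout, X)  # gradient wrt X (product rule on x * CDF(x))

    print(toString(out))          # out[1,1] is close to GELU(1) ~ 0.841192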
Closes #2177
---
scripts/nn/layers/gelu.dml | 70 ++++++++++++++++++++++
.../test/applications/nn/NNComponentTest.java | 5 ++
.../scripts/applications/nn/component/gelu.dml | 66 ++++++++++++++++++++
3 files changed, 141 insertions(+)
diff --git a/scripts/nn/layers/gelu.dml b/scripts/nn/layers/gelu.dml
new file mode 100644
index 0000000000..23c1d407be
--- /dev/null
+++ b/scripts/nn/layers/gelu.dml
@@ -0,0 +1,70 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+/*
+ * Gaussian Error Linear Unit (GELU) nonlinearity layer.
+ */
+
+source("nn/layers/tanh.dml") as tanh
+
+forward = function(matrix[double] X)
+ return (matrix[double] out) {
+ /*
+ * Computes the forward pass for a GELU nonlinearity layer, via
+ * its tanh approximation.
+ *
+ * Performs an element-wise evaluation of
+ * `GELU(x) = x * CDF(x)`.
+ * where CDF is the cumulative distribution function of the
+ * standard normal distribution:
+ * `CDF(x) = 0.5 * (1 + erf(x/sqrt(2)))`
+ * This implementation uses the tanh approximation:
+ * `CDF(x) =~ 0.5 * (1 + tanh(sqrt(2/pi) * (x + 0.044715x^3)))`
+ *
+ * Inputs:
+ * - X: Inputs, of shape (any, any).
+ *
+ * Outputs:
+ * - out: Outputs, of same shape as `X`.
+ */
+ cdf = 0.5 * (1 + tanh(sqrt(2 / pi) * (X + 0.044715 * X^3)))
+ out = cdf * X
+}
+
+backward = function(matrix[double] dout, matrix[double] X)
+ return (matrix[double] dX) {
+ /*
+ * Computes the backward pass for a GELU nonlinearity layer, via
+ * its tanh approximation.
+ *
+ * Inputs:
+ * - dout: Gradient wrt `out` from upstream, of same shape as `X`.
+ * - X: Previous input data matrix, of shape (any, any).
+ *
+ * Outputs:
+ * - dX: Gradient wrt `X`, of same shape as `X`.
+ */
+ a = sqrt(2 / pi)
+ b = 0.044715
+ T = tanh(a * (X + b * X^3))
+ dT = 1 - T^2
+ dX = dout * (0.5 * (1 + T) + 0.5 * X * dT * a * (1 + 3 * b * X^2))
+}
diff --git a/src/test/java/org/apache/sysds/test/applications/nn/NNComponentTest.java b/src/test/java/org/apache/sysds/test/applications/nn/NNComponentTest.java
index 3b002871d7..a9922cf35f 100644
--- a/src/test/java/org/apache/sysds/test/applications/nn/NNComponentTest.java
+++ b/src/test/java/org/apache/sysds/test/applications/nn/NNComponentTest.java
@@ -124,6 +124,11 @@ public class NNComponentTest extends TestFolder {
run("resnet_bottleneck.dml");
}
+ @Test
+ public void gelu() {
+ run("gelu.dml");
+ }
+
@Override
protected void run(String name) {
super.run("component/" + name);
diff --git a/src/test/scripts/applications/nn/component/gelu.dml b/src/test/scripts/applications/nn/component/gelu.dml
new file mode 100644
index 0000000000..3d7ea83345
--- /dev/null
+++ b/src/test/scripts/applications/nn/component/gelu.dml
@@ -0,0 +1,66 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+source("nn/layers/gelu.dml") as gelu
+source("src/test/scripts/applications/nn/util.dml") as test_util
+
+gelu_test1 = function() {
+ print("Testing GELU, test 1")
+
+ X = matrix("1. -0.5
+ 0. 2.", rows=2, cols=2)
+ dout = matrix("1 1
+ 1 1", rows=2, cols=2)
+ out_expected = matrix("0.841192 -0.154286
+ 0. 1.9545977", rows=2, cols=2)
+ gradient_expected = matrix("1.0829641 0.13263011
+ 0.5 1.0860993", rows=2, cols=2)
+
+ out = gelu::forward(X)
+
+ test_util::check_all_close(out, out_expected, 0.00001)
+
+ gradient = gelu::backward(dout, X)
+ test_util::check_all_close(gradient, gradient_expected, 0.00001)
+}
+
+gelu_test2 = function() {
+ print("Testing GELU, test 2")
+
+ X = matrix("0.5 -1.5
+ 1. -2.", rows=2, cols=2)
+ dout = matrix("1 1
+ 1 1", rows=2, cols=2)
+ out_expected = matrix("0.345714 -0.10042843
+ 0.841192 -0.04540229", rows=2, cols=2)
+ gradient_expected = matrix("0.8673699 -0.1277108
+ 1.0829641 -0.08609922", rows=2, cols=2)
+
+ out = gelu::forward(X)
+
+ test_util::check_all_close(out, out_expected, 0.00001)
+
+ gradient = gelu::backward(dout, X)
+ test_util::check_all_close(gradient, gradient_expected, 0.00001)
+}
+
+gelu_test1()
+gelu_test2()
\ No newline at end of file