This is an automated email from the ASF dual-hosted git repository.
aaronmarkham pushed a commit to branch asf-site
in repository https://gitbox.apache.org/repos/asf/incubator-mxnet-site.git
The following commit(s) were added to refs/heads/asf-site by this push:
new 7dfc5f9 Publish triggered by CI
7dfc5f9 is described below
commit 7dfc5f920e00134ef6059f9af727722c98988148
Author: mxnet-ci <[email protected]>
AuthorDate: Mon Nov 9 18:44:14 2020 +0000
Publish triggered by CI
---
api/python/docs/_modules/mxnet/gluon/block.html | 44 +++++++++----
.../docs/_modules/mxnet/gluon/nn/activations.html | 33 +++++++++-
api/python/docs/_sources/api/gluon/nn/index.rst | 2 +
.../gluon/blocks/activations/activations.ipynb | 43 +++++++++---
api/python/docs/api/gluon/nn/index.html | 73 +++++++++++++++++++--
api/python/docs/genindex.html | 4 ++
api/python/docs/objects.inv | Bin 90871 -> 90907 bytes
api/python/docs/searchindex.js | 2 +-
.../gluon/blocks/activations/activations.html | 42 ++++++++----
.../gluon/blocks/activations/activations.html.bak | 42 ++++++++----
date.txt | 1 -
feed.xml | 2 +-
.../python/docs/_modules/mxnet/gluon/block.html | 44 +++++++++----
.../docs/_modules/mxnet/gluon/nn/activations.html | 33 +++++++++-
.../python/docs/_sources/api/gluon/nn/index.rst | 2 +
.../gluon/blocks/activations/activations.ipynb | 43 +++++++++---
.../master/api/python/docs/api/gluon/nn/index.html | 73 +++++++++++++++++++--
versions/master/api/python/docs/genindex.html | 4 ++
versions/master/api/python/docs/objects.inv | Bin 90871 -> 90907 bytes
versions/master/api/python/docs/searchindex.js | 2 +-
.../gluon/blocks/activations/activations.html | 42 ++++++++----
.../gluon/blocks/activations/activations.html.bak | 42 ++++++++----
versions/master/feed.xml | 2 +-
23 files changed, 466 insertions(+), 109 deletions(-)
diff --git a/api/python/docs/_modules/mxnet/gluon/block.html
b/api/python/docs/_modules/mxnet/gluon/block.html
index baa4093..7856e40 100644
--- a/api/python/docs/_modules/mxnet/gluon/block.html
+++ b/api/python/docs/_modules/mxnet/gluon/block.html
@@ -2368,14 +2368,35 @@ Edit on Github
<span class="n">arg_dict</span><span class="p">,</span> <span
class="n">aux_dict</span> <span class="o">=</span> <span
class="nb">dict</span><span class="p">(),</span> <span
class="nb">dict</span><span class="p">()</span>
<span class="k">if</span> <span class="bp">self</span><span
class="o">.</span><span class="n">_backend</span><span class="p">:</span>
- <span class="n">ctx</span> <span class="o">=</span> <span
class="n">args</span><span class="p">[</span><span class="mi">0</span><span
class="p">]</span><span class="o">.</span><span class="n">context</span>
+ <span class="c1"># set context for inputs</span>
+ <span class="n">_</span><span class="p">,</span> <span
class="n">_</span><span class="p">,</span> <span class="n">ctx_set</span><span
class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span
class="n">_gather_type_ctx_info</span><span class="p">(</span><span
class="nb">list</span><span class="p">(</span><span class="n">args</span><span
class="p">))</span>
+ <span class="n">ctx</span> <span class="o">=</span> <span
class="n">ctx_set</span><span class="o">.</span><span class="n">pop</span><span
class="p">()</span> <span class="k">if</span> <span class="nb">len</span><span
class="p">(</span><span class="n">ctx_set</span><span class="p">)</span> <span
class="o">></span> <span class="mi">0</span> <span class="k">else</span>
<span class="kc">None</span>
<span class="c1"># get list of params in the order of
out.list_arguments</span>
- <span class="n">arg_dict</span><span class="o">.</span><span
class="n">update</span><span class="p">({</span><span
class="n">name</span><span class="p">:</span><span class="n">args</span><span
class="p">[</span><span class="n">data_names</span><span
class="p">[</span><span class="n">name</span><span class="p">]]</span> <span
class="k">if</span> <span class="n">name</span> <span class="ow">in</span>
<span class="n">data_names</span><span class="o">.</span><span class="n">keys<
[...]
- <span class="k">for</span> <span
class="n">name</span> <span class="ow">in</span> <span
class="n">out</span><span class="o">.</span><span
class="n">list_arguments</span><span class="p">()})</span>
- <span class="n">aux_dict</span><span class="o">.</span><span
class="n">update</span><span class="p">({</span><span
class="n">name</span><span class="p">:</span><span class="n">args</span><span
class="p">[</span><span class="n">data_names</span><span
class="p">[</span><span class="n">name</span><span class="p">]]</span> <span
class="k">if</span> <span class="n">name</span> <span class="ow">in</span>
<span class="n">data_names</span><span class="o">.</span><span class="n">keys<
[...]
- <span class="k">for</span> <span
class="n">name</span> <span class="ow">in</span> <span
class="n">out</span><span class="o">.</span><span
class="n">list_auxiliary_states</span><span class="p">()})</span>
- <span class="c1"># Partition the graph.</span>
- <span class="n">out</span> <span class="o">=</span> <span
class="n">out</span><span class="o">.</span><span
class="n">optimize_for</span><span class="p">(</span><span
class="bp">self</span><span class="o">.</span><span
class="n">_backend</span><span class="p">,</span> <span
class="n">arg_dict</span><span class="p">,</span> <span
class="n">aux_dict</span><span class="p">,</span> <span
class="n">ctx</span><span class="p">,</span> <span class="o">**</span><span
class="bp">self</ [...]
+ <span class="n">input_shapes</span> <span class="o">=</span> <span
class="nb">dict</span><span class="p">()</span>
+ <span class="k">for</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">out</span><span class="o">.</span><span
class="n">list_arguments</span><span class="p">():</span>
+ <span class="k">if</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">data_names</span><span
class="o">.</span><span class="n">keys</span><span class="p">()</span> <span
class="ow">and</span> <span class="n">data_names</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o"><</span> <span class="nb">len</span><span class="p">(</span><span
class="n">args</span><span class="p">):</span>
+ <span class="k">if</span> <span
class="nb">isinstance</span><span class="p">(</span><span
class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]],</span> <span
class="n">NDArray</span><span class="p">):</span>
+ <span class="n">arg_dict</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span>
+ <span class="k">elif</span> <span class="p">(</span><span
class="nb">isinstance</span><span class="p">(</span><span
class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]],</span> <span
class="n">symbol</span><span class="o">.</span><span
class="n">Symbol</span><span class="p">)</span> <span class="ow">and</span>
+ <span class="s1">'__shape__'</span> <span
class="ow">in</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span><span class="o">.</span><span
class="n">list_attr</span><span class="p">()):</span>
+ <span class="n">shape_str</span> <span
class="o">=</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span><span class="o">.</span><span
class="n">list_attr</span><span class="p">()[</span><span
class="s1">'__shape__'</span><span class="p">]</span>
+ <span class="n">input_shapes</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span
class="nb">map</span><span class="p">(</span><span class="nb">int</span><span
class="p">,</span> <span class="n">shape_str</span><span
class="o">.</span><span class="n">strip</span><span class="p">(</span><span
class="s1">'()'</span><span class="p">)</span><span cl [...]
+ <span class="k">elif</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">params</span><span class="p">:</span>
+ <span class="n">arg_dict</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="n">params</span><span class="p">[</span><span
class="n">name</span><span class="p">]</span><span class="o">.</span><span
class="n">data</span><span class="p">()</span>
+
+ <span class="k">for</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">out</span><span class="o">.</span><span
class="n">list_auxiliary_states</span><span class="p">():</span>
+ <span class="k">if</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">data_names</span><span
class="o">.</span><span class="n">keys</span><span class="p">()</span> <span
class="ow">and</span> <span class="n">data_names</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o"><</span> <span class="nb">len</span><span class="p">(</span><span
class="n">args</span><span class="p">):</span>
+ <span class="k">if</span> <span
class="nb">isinstance</span><span class="p">(</span><span
class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]],</span> <span
class="n">NDArray</span><span class="p">):</span>
+ <span class="n">aux_dict</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span>
+ <span class="k">elif</span> <span class="p">(</span><span
class="nb">isinstance</span><span class="p">(</span><span
class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]],</span> <span
class="n">symbol</span><span class="o">.</span><span
class="n">Symbol</span><span class="p">)</span> <span class="ow">and</span>
+ <span class="s1">'__shape__'</span> <span
class="ow">in</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span><span class="o">.</span><span
class="n">list_attr</span><span class="p">()):</span>
+ <span class="n">shape_str</span> <span
class="o">=</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span><span class="o">.</span><span
class="n">list_attr</span><span class="p">()[</span><span
class="s1">'__shape__'</span><span class="p">]</span>
+ <span class="n">input_shapes</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span
class="nb">map</span><span class="p">(</span><span class="nb">int</span><span
class="p">,</span> <span class="n">shape_str</span><span
class="o">.</span><span class="n">strip</span><span class="p">(</span><span
class="s1">'()'</span><span class="p">)</span><span cl [...]
+ <span class="k">elif</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">params</span><span class="p">:</span>
+ <span class="n">aux_dict</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="n">params</span><span class="p">[</span><span
class="n">name</span><span class="p">]</span><span class="o">.</span><span
class="n">data</span><span class="p">()</span>
+
+ <span class="c1"># Partition the graph</span>
+ <span class="n">out</span> <span class="o">=</span> <span
class="n">out</span><span class="o">.</span><span
class="n">optimize_for</span><span class="p">(</span><span
class="bp">self</span><span class="o">.</span><span
class="n">_backend</span><span class="p">,</span> <span
class="n">arg_dict</span><span class="p">,</span> <span
class="n">aux_dict</span><span class="p">,</span> <span
class="n">ctx</span><span class="p">,</span> <span
class="n">input_shapes</span><span class=" [...]
<span class="c1"># convert to numpy symbol if needed</span>
<span class="k">if</span> <span class="n">_mx_npx</span><span
class="o">.</span><span class="n">is_np_array</span><span class="p">():</span>
@@ -2418,7 +2439,7 @@ Edit on Github
<span class="n">param</span> <span class="o">=</span>
<span class="n">Parameter</span><span class="p">(</span><span
class="n">name</span><span class="p">,</span> <span class="n">dtype</span><span
class="o">=</span><span class="n">param_data</span><span
class="o">.</span><span class="n">dtype</span><span class="p">)</span>
<span class="n">param</span><span class="o">.</span><span
class="n">_var_name</span> <span class="o">=</span> <span class="n">name</span>
<span class="n">serialization_name</span> <span
class="o">=</span> <span class="n">name</span> <span class="c1">#
HybridBlock.export</span>
- <span class="n">param</span><span class="o">.</span><span
class="n">_load_init</span><span class="p">(</span><span
class="n">param_data</span><span class="p">,</span> <span
class="n">args</span><span class="p">[</span><span class="mi">0</span><span
class="p">]</span><span class="o">.</span><span class="n">context</span><span
class="p">)</span>
+ <span class="n">param</span><span class="o">.</span><span
class="n">_load_init</span><span class="p">(</span><span
class="n">param_data</span><span class="p">,</span> <span
class="n">param_data</span><span class="o">.</span><span
class="n">context</span><span class="p">)</span>
<span class="n">triple</span> <span class="o">=</span> <span
class="p">(</span><span class="kc">False</span><span class="p">,</span> <span
class="n">serialization_name</span><span class="p">,</span> <span
class="n">param</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span
class="n">_cached_op_args</span><span class="o">.</span><span
class="n">append</span><span class="p">(</span><span
class="n">triple</span><span class="p">)</span>
@@ -2520,14 +2541,11 @@ Edit on Github
<span class="c1"># do part of forward API call</span>
<span class="n">has_symbol</span><span class="p">,</span> <span
class="n">has_ndarray</span><span class="p">,</span> <span
class="n">ctx_set</span><span class="p">,</span> <span class="n">_</span> <span
class="o">=</span> <span class="n">_gather_type_ctx_info</span><span
class="p">([</span><span class="n">x</span><span class="p">]</span> <span
class="o">+</span> <span class="nb">list</span><span class="p">(</span><span
class="n">args</span><span class="p">))</span>
- <span class="k">if</span> <span class="n">has_symbol</span><span
class="p">:</span>
- <span class="k">raise</span> <span
class="ne">ValueError</span><span class="p">(</span><span
class="s1">'Inputs must be NDArrays for the optimize_for API'</span>
- <span class="s1">' Please check the type of
the args.</span><span class="se">\n</span><span class="s1">'</span><span
class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span
class="n">has_symbol</span> <span class="ow">and</span> <span
class="ow">not</span> <span class="n">has_ndarray</span><span class="p">:</span>
- <span class="k">raise</span> <span
class="ne">ValueError</span><span class="p">(</span><span class="s1">'In
HybridBlock, there must be one NDArray as input.'</span>
+ <span class="k">raise</span> <span
class="ne">ValueError</span><span class="p">(</span><span class="s1">'In
HybridBlock, there must be one NDArray or one Symbol in the input.'</span>
<span class="s1">' Please check the type of
the args.</span><span class="se">\n</span><span class="s1">'</span><span
class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span
class="p">(</span><span class="n">ctx_set</span><span class="p">)</span> <span
class="o">></span> <span class="mi">1</span><span class="p">:</span>
- <span class="k">raise</span> <span
class="ne">ValueError</span><span class="p">(</span><span class="s1">'Find
multiple contexts in the input, '</span>
+ <span class="k">raise</span> <span
class="ne">ValueError</span><span class="p">(</span><span class="s1">'Found
multiple contexts in the input, '</span>
<span class="s1">'After hybridized, the
HybridBlock only supports one input '</span>
<span class="s1">'context. You can print the
ele.ctx in the '</span>
<span class="s1">'input arguments to inspect
their contexts. '</span>
diff --git a/api/python/docs/_modules/mxnet/gluon/nn/activations.html
b/api/python/docs/_modules/mxnet/gluon/nn/activations.html
index eac566d..a5ef9ef 100644
--- a/api/python/docs/_modules/mxnet/gluon/nn/activations.html
+++ b/api/python/docs/_modules/mxnet/gluon/nn/activations.html
@@ -1356,7 +1356,7 @@ Edit on Github
<span class="c1"># coding: utf-8</span>
<span class="c1"># pylint: disable= arguments-differ</span>
<span class="sd">"""Basic neural network
layers."""</span>
-<span class="n">__all__</span> <span class="o">=</span> <span
class="p">[</span><span class="s1">'Activation'</span><span
class="p">,</span> <span class="s1">'LeakyReLU'</span><span
class="p">,</span> <span class="s1">'PReLU'</span><span
class="p">,</span> <span class="s1">'ELU'</span><span
class="p">,</span> <span class="s1">'SELU'</span><span
class="p">,</span> <span class="s1">'Swish'</span><span
class="p">,</span> <span class="s1">' [...]
+<span class="n">__all__</span> <span class="o">=</span> <span
class="p">[</span><span class="s1">'Activation'</span><span
class="p">,</span> <span class="s1">'LeakyReLU'</span><span
class="p">,</span> <span class="s1">'PReLU'</span><span
class="p">,</span> <span class="s1">'ELU'</span><span
class="p">,</span> <span class="s1">'SELU'</span><span
class="p">,</span> <span class="s1">'Swish'</span><span
class="p">,</span> <span class="s1">' [...]
<span class="kn">from</span> <span class="nn">...</span> <span
class="kn">import</span> <span class="n">initializer</span>
<span class="kn">from</span> <span class="nn">..block</span> <span
class="kn">import</span> <span class="n">HybridBlock</span>
@@ -1553,7 +1553,7 @@ Edit on Github
<div class="viewcode-block" id="Swish"><a class="viewcode-back"
href="../../../../api/gluon/nn/index.html#mxnet.gluon.nn.Swish">[docs]</a><span
class="k">class</span> <span class="nc">Swish</span><span
class="p">(</span><span class="n">HybridBlock</span><span class="p">):</span>
<span class="sa">r</span><span class="sd">"""</span>
-<span class="sd"> Swish Activation function</span>
+<span class="sd"> Swish Activation function (SiLU with a
hyperparameter)</span>
<span class="sd"> https://arxiv.org/pdf/1710.05941.pdf</span>
<span class="sd"> Parameters</span>
@@ -1578,6 +1578,35 @@ Edit on Github
<span class="k">return</span> <span class="n">x</span> <span
class="o">*</span> <span class="n">F</span><span class="o">.</span><span
class="n">npx</span><span class="o">.</span><span class="n">sigmoid</span><span
class="p">(</span><span class="bp">self</span><span class="o">.</span><span
class="n">_beta</span> <span class="o">*</span> <span class="n">x</span><span
class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">x</span> <span
class="o">*</span> <span class="n">F</span><span class="o">.</span><span
class="n">sigmoid</span><span class="p">(</span><span
class="bp">self</span><span class="o">.</span><span class="n">_beta</span>
<span class="o">*</span> <span class="n">x</span><span class="p">,</span> <span
class="n">name</span><span class="o">=</span><span
class="s1">'fwd'</span><span class="p">)</span></div></div>
+
+
+<div class="viewcode-block" id="SiLU"><a class="viewcode-back"
href="../../../../api/gluon/nn/index.html#mxnet.gluon.nn.SiLU">[docs]</a><span
class="k">class</span> <span class="nc">SiLU</span><span
class="p">(</span><span class="n">HybridBlock</span><span class="p">):</span>
+ <span class="sa">r</span><span class="sd">"""</span>
+<span class="sd"> Sigmoid Linear Units</span>
+<span class="sd"> Originally proposed "Gaussian Error Linear Units
(GELUs)", Hendrycks et al, 2016</span>
+<span class="sd"> https://arxiv.org/abs/1606.08415</span>
+
+<span class="sd"> Parameters</span>
+<span class="sd"> ----------</span>
+<span class="sd"> beta : float</span>
+<span class="sd"> silu(x) = x * sigmoid(x)</span>
+
+
+<span class="sd"> Inputs:</span>
+<span class="sd"> - **data**: input tensor with arbitrary shape.</span>
+
+<span class="sd"> Outputs:</span>
+<span class="sd"> - **out**: output tensor with the same shape as
`data`.</span>
+<span class="sd"> """</span>
+
+ <span class="k">def</span> <span class="fm">__init__</span><span
class="p">(</span><span class="bp">self</span><span class="p">,</span> <span
class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
+ <span class="nb">super</span><span class="p">(</span><span
class="n">SiLU</span><span class="p">,</span> <span class="bp">self</span><span
class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span
class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span
class="p">)</span>
+
+<div class="viewcode-block" id="SiLU.hybrid_forward"><a class="viewcode-back"
href="../../../../api/gluon/nn/index.html#mxnet.gluon.nn.SiLU.hybrid_forward">[docs]</a>
<span class="k">def</span> <span class="nf">hybrid_forward</span><span
class="p">(</span><span class="bp">self</span><span class="p">,</span> <span
class="n">F</span><span class="p">,</span> <span class="n">x</span><span
class="p">):</span>
+ <span class="k">if</span> <span class="n">is_np_array</span><span
class="p">():</span>
+ <span class="k">return</span> <span class="n">x</span> <span
class="o">*</span> <span class="n">F</span><span class="o">.</span><span
class="n">npx</span><span class="o">.</span><span class="n">sigmoid</span><span
class="p">(</span><span class="n">x</span><span class="p">)</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="k">return</span> <span class="n">x</span> <span
class="o">*</span> <span class="n">F</span><span class="o">.</span><span
class="n">sigmoid</span><span class="p">(</span><span class="n">x</span><span
class="p">,</span> <span class="n">name</span><span class="o">=</span><span
class="s1">'fwd'</span><span class="p">)</span></div></div>
</pre></div>
<hr class="feedback-hr-top" />
diff --git a/api/python/docs/_sources/api/gluon/nn/index.rst
b/api/python/docs/_sources/api/gluon/nn/index.rst
index 0094647..7aeacd8 100644
--- a/api/python/docs/_sources/api/gluon/nn/index.rst
+++ b/api/python/docs/_sources/api/gluon/nn/index.rst
@@ -147,6 +147,8 @@ Advanced Activation Layers
nn.ELU
nn.SELU
nn.Swish
+ nn.SiLU
+ nn.GELU
API Reference
-------------
diff --git
a/api/python/docs/_sources/tutorials/packages/gluon/blocks/activations/activations.ipynb
b/api/python/docs/_sources/tutorials/packages/gluon/blocks/activations/activations.ipynb
index fbdaec2..57f9196 100644
---
a/api/python/docs/_sources/tutorials/packages/gluon/blocks/activations/activations.ipynb
+++
b/api/python/docs/_sources/tutorials/packages/gluon/blocks/activations/activations.ipynb
@@ -25,7 +25,7 @@
"\n",
"Deep neural networks are a way to express a nonlinear function with lots
of parameters from input data to outputs. The nonlinearities that allow neural
networks to capture complex patterns in data are referred to as activation
functions. Over the course of the development of neural networks, several
nonlinear activation functions have been introduced to make gradient-based deep
learning tractable. \n",
"\n",
- "If you are looking to answer the question, 'which activation function
should I use for my neural network model?', you should probably go with *ReLU*.
Unless you're trying to implement something like a gating mechanism, like in
LSTMs or GRU cells, then you should opt for sigmoid and/or tanh in those cells.
However, if you have a working model architecture and you're trying to improve
its performance by swapping out activation functions or treating the activation
function as a hyperpa [...]
+ "If you are looking to answer the question, 'which activation function
should I use for my neural network model?', you should probably go with *ReLU*.
Unless you're trying to implement something like a gating mechanism, like in
LSTMs or GRU cells, then you should opt for sigmoid and/or tanh in those cells.
However, if you have a working model architecture and you're trying to improve
its performance by swapping out activation functions or treating the activation
function as a hyperpa [...]
"\n",
"## Visualizing Activations\n",
"In order to compare the various activation functions and to understand
the nuances of their differences we have a snippet of code to plot the
activation functions (used in the forward pass) and their gradients (used in
the backward pass)."
@@ -324,14 +324,14 @@
"\n",
"\n",
"\n",
- "### Swish\n",
- "Swish is an activation function that attempts to address the shortcomings
of ReLU by combining ideas from ReLU and sigmoid. Swish was discovered by
searching the space of activation functions using a combination of exhaustive
and reinforcement learning-based search and was introduced in the paper by
[Ramchandran et al](https://arxiv.org/pdf/1710.05941.pdf).\n",
+ "### SiLU\n",
+ "The SiLU is an activation function that attempts to address the
shortcomings of ReLU by combining ideas from ReLU and sigmoid. The SiLU serves
as a smooth approximation to the ReLU and was originally introduced in
[Hendrycks et al](https://arxiv.org/abs/1606.08415).\n",
"\n",
- "The swish function is given as \n",
+ "The silu function is given as \n",
"\n",
- "$$ swish(x) = x\\cdot\\sigma(\\beta x)$$\n",
+ "$$ silu(x) = x\\cdot\\sigma(x)$$\n",
"\n",
- "where $\\sigma$ is the sigmoid activation function $\\sigma(x) =
\\frac{1}{1 + e^{-x}}$ described above and $\\beta$ is a hyperparameter set to
1 by default in MXNet."
+ "where $\\sigma$ is the sigmoid activation function $\\sigma(x) =
\\frac{1}{1 + e^{-x}}$ described above."
]
},
{
@@ -340,16 +340,41 @@
"metadata": {},
"outputs": [],
"source": [
- "visualize_activation(mx.gluon.nn.Swish())"
+ "visualize_activation(mx.gluon.nn.SiLU())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
+ "\n",
"\n",
+ "### GELU\n",
+ "The GELU is a smooth approximation to the ReLU and was introduced in
[Hendrycks et al](https://arxiv.org/abs/1606.08415). It is a common activation
function in architectures such as Transformers, BERT, and GPT.\n",
"\n",
+ "The gelu function is given as \n",
+ "\n",
+ "$$ gelu(x) = x\\cdot\\Phi(x),$$\n",
+ "\n",
+ "whereas the ReLU can be written as $x\cdot\mathbf{1}(x>0)$, so $\Phi(x)$
serves as a smooth approximation to the ReLU's indicator function.\n",
+ "\n",
+ "Note $\\Phi(x) = \\frac{1}{\\sqrt{2 \\pi}}
\\exp\\left\\{-\\frac{x^2}{2}\\right\\}$ is the standard normal cumulative
distribution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "visualize_activation(mx.gluon.nn.GELU())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
"\n",
"## Summary\n",
"\n",
@@ -358,7 +383,7 @@
"* Sigmoids like the logistic (sigmoid) function and tanh where the first
kinds of activation functions used in neural networks. They have since fallen
out of use because of their tendency to saturate and have vanishing
gradients.\n",
"* Rectifiers like ReLU do not saturate like the Sigmoids and so address
the vanishing gradient problem making them the de facto activation functions.
ReLU however is still plagued by the dying ReLU problem.\n",
"* LeakyReLU and PReLU are two similar approaches to improve ReLU and
address the dying ReLU by introducing a parameter $\\alpha$ (learned in PReLU)
that leaks to the gradient of negative inputs\n",
- "* MXNet also implements custom state-of-the-art activations like ELU,
SELU and Swish.\n",
+ "* MXNet also implements custom state-of-the-art activations like ELU,
SELU, SiLU, and GELU.\n",
"\n",
"\n",
"\n",
diff --git a/api/python/docs/api/gluon/nn/index.html
b/api/python/docs/api/gluon/nn/index.html
index 106f97b..24069ae 100644
--- a/api/python/docs/api/gluon/nn/index.html
+++ b/api/python/docs/api/gluon/nn/index.html
@@ -1583,7 +1583,13 @@ two modules:</p>
<td><p>Scaled Exponential Linear Unit (SELU)</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.Swish" title="mxnet.gluon.nn.Swish"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">nn.Swish</span></code></a></p></td>
-<td><p>Swish Activation function</p></td>
+<td><p>Swish Activation function (SiLU with a hyperparameter)</p></td>
+</tr>
+<tr class="row-even"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SiLU" title="mxnet.gluon.nn.SiLU"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">nn.SiLU</span></code></a></p></td>
+<td><p>Sigmoid Linear Units</p></td>
+</tr>
+<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.GELU" title="mxnet.gluon.nn.GELU"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">nn.GELU</span></code></a></p></td>
+<td><p>Gaussian Error Linear Unit (GELU)</p></td>
</tr>
</tbody>
</table>
@@ -1742,13 +1748,16 @@ two modules:</p>
<tr class="row-even"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.Sequential" title="mxnet.gluon.nn.Sequential"><code
class="xref py py-obj docutils literal notranslate"><span
class="pre">Sequential</span></code></a>()</p></td>
<td><p>Stacks Blocks sequentially.</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.Swish" title="mxnet.gluon.nn.Swish"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">Swish</span></code></a>([beta])</p></td>
-<td><p>Swish Activation function</p></td>
+<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SiLU" title="mxnet.gluon.nn.SiLU"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">SiLU</span></code></a>(**kwargs)</p></td>
+<td><p>Sigmoid Linear Units</p></td>
+</tr>
+<tr class="row-even"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.Swish" title="mxnet.gluon.nn.Swish"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">Swish</span></code></a>([beta])</p></td>
+<td><p>Swish Activation function (SiLU with a hyperparameter)</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SymbolBlock" title="mxnet.gluon.nn.SymbolBlock"><code
class="xref py py-obj docutils literal notranslate"><span
class="pre">SymbolBlock</span></code></a>(outputs, inputs[, params])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SymbolBlock" title="mxnet.gluon.nn.SymbolBlock"><code
class="xref py py-obj docutils literal notranslate"><span
class="pre">SymbolBlock</span></code></a>(outputs, inputs[, params])</p></td>
<td><p>Construct block from symbol.</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SyncBatchNorm" title="mxnet.gluon.nn.SyncBatchNorm"><code
class="xref py py-obj docutils literal notranslate"><span
class="pre">SyncBatchNorm</span></code></a>([in_channels, num_devices,
…])</p></td>
+<tr class="row-even"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SyncBatchNorm" title="mxnet.gluon.nn.SyncBatchNorm"><code
class="xref py py-obj docutils literal notranslate"><span
class="pre">SyncBatchNorm</span></code></a>([in_channels, num_devices,
…])</p></td>
<td><p>Cross-GPU Synchronized Batch normalization (SyncBN)</p></td>
</tr>
</tbody>
@@ -4918,11 +4927,63 @@ non-hybrid children.</p>
</dd></dl>
<dl class="class">
+<dt id="mxnet.gluon.nn.SiLU">
+<em class="property">class </em><code class="sig-name
descname">SiLU</code><span class="sig-paren">(</span><em
class="sig-param">**kwargs</em><span class="sig-paren">)</span><a
class="reference internal"
href="../../../_modules/mxnet/gluon/nn/activations.html#SiLU"><span
class="viewcode-link">[source]</span></a><a class="headerlink"
href="#mxnet.gluon.nn.SiLU" title="Permalink to this definition">¶</a></dt>
+<dd><p>Bases: <code class="xref py py-class docutils literal
notranslate"><span class="pre">mxnet.gluon.block.HybridBlock</span></code></p>
+<dl class="simple">
+<dt>Sigmoid Linear Units</dt><dd><p>Originally proposed in “Gaussian Error Linear
Units (GELUs)”, Hendrycks et al, 2016
+<a class="reference external"
href="https://arxiv.org/abs/1606.08415">https://arxiv.org/abs/1606.08415</a></p>
+</dd>
+</dl>
+<p><strong>Methods</strong></p>
+<table class="longtable docutils align-default">
+<colgroup>
+<col style="width: 10%" />
+<col style="width: 90%" />
+</colgroup>
+<tbody>
+<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SiLU.hybrid_forward"
title="mxnet.gluon.nn.SiLU.hybrid_forward"><code class="xref py py-obj docutils
literal notranslate"><span class="pre">hybrid_forward</span></code></a>(F,
x)</p></td>
+<td><p>Overrides to construct symbolic graph for this
<cite>Block</cite>.</p></td>
+</tr>
+</tbody>
+</table>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><p><strong>**kwargs</strong> – silu(x) = x
* sigmoid(x)</p>
+</dd>
+</dl>
+<dl class="simple">
+<dt>Inputs:</dt><dd><ul class="simple">
+<li><p><strong>data</strong>: input tensor with arbitrary shape.</p></li>
+</ul>
+</dd>
+<dt>Outputs:</dt><dd><ul class="simple">
+<li><p><strong>out</strong>: output tensor with the same shape as
<cite>data</cite>.</p></li>
+</ul>
+</dd>
+</dl>
+<dl class="method">
+<dt id="mxnet.gluon.nn.SiLU.hybrid_forward">
+<code class="sig-name descname">hybrid_forward</code><span
class="sig-paren">(</span><em class="sig-param">F</em>, <em
class="sig-param">x</em><span class="sig-paren">)</span><a class="reference
internal"
href="../../../_modules/mxnet/gluon/nn/activations.html#SiLU.hybrid_forward"><span
class="viewcode-link">[source]</span></a><a class="headerlink"
href="#mxnet.gluon.nn.SiLU.hybrid_forward" title="Permalink to this
definition">¶</a></dt>
+<dd><p>Overrides to construct symbolic graph for this <cite>Block</cite>.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>x</strong> (<a class="reference internal"
href="../../legacy/symbol/symbol.html#mxnet.symbol.Symbol"
title="mxnet.symbol.Symbol"><em>Symbol</em></a><em> or </em><a class="reference
internal" href="../../legacy/ndarray/ndarray.html#mxnet.ndarray.NDArray"
title="mxnet.ndarray.NDArray"><em>NDArray</em></a>) – The first input
tensor.</p></li>
+<li><p><strong>*args</strong> (<em>list of Symbol</em><em> or </em><em>list of
NDArray</em>) – Additional input tensors.</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="class">
<dt id="mxnet.gluon.nn.Swish">
<em class="property">class </em><code class="sig-name
descname">Swish</code><span class="sig-paren">(</span><em
class="sig-param">beta=1.0</em>, <em class="sig-param">**kwargs</em><span
class="sig-paren">)</span><a class="reference internal"
href="../../../_modules/mxnet/gluon/nn/activations.html#Swish"><span
class="viewcode-link">[source]</span></a><a class="headerlink"
href="#mxnet.gluon.nn.Swish" title="Permalink to this definition">¶</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal
notranslate"><span class="pre">mxnet.gluon.block.HybridBlock</span></code></p>
<dl class="simple">
-<dt>Swish Activation function</dt><dd><p><a class="reference external"
href="https://arxiv.org/pdf/1710.05941.pdf">https://arxiv.org/pdf/1710.05941.pdf</a></p>
+<dt>Swish Activation function (SiLU with a hyperparameter)</dt><dd><p><a
class="reference external"
href="https://arxiv.org/pdf/1710.05941.pdf">https://arxiv.org/pdf/1710.05941.pdf</a></p>
</dd>
</dl>
<p><strong>Methods</strong></p>
diff --git a/api/python/docs/genindex.html b/api/python/docs/genindex.html
index 790f518..36e8330 100644
--- a/api/python/docs/genindex.html
+++ b/api/python/docs/genindex.html
@@ -4544,6 +4544,8 @@ Edit on Github
</li>
<li><a
href="api/gluon/loss/index.html#mxnet.gluon.loss.SigmoidBinaryCrossEntropyLoss.hybrid_forward">(SigmoidBinaryCrossEntropyLoss
method)</a>
</li>
+ <li><a
href="api/gluon/nn/index.html#mxnet.gluon.nn.SiLU.hybrid_forward">(SiLU
method)</a>
+</li>
<li><a
href="api/gluon/loss/index.html#mxnet.gluon.loss.SoftmaxCrossEntropyLoss.hybrid_forward">(SoftmaxCrossEntropyLoss
method)</a>
</li>
<li><a
href="api/gluon/loss/index.html#mxnet.gluon.loss.SquaredHingeLoss.hybrid_forward">(SquaredHingeLoss
method)</a>
@@ -7897,6 +7899,8 @@ Edit on Github
<li><a
href="api/legacy/symbol/op/index.html#mxnet.symbol.op.signum_update">(in module
mxnet.symbol.op)</a>
</li>
</ul></li>
+ <li><a href="api/gluon/nn/index.html#mxnet.gluon.nn.SiLU">SiLU (class in
mxnet.gluon.nn)</a>
+</li>
<li><a
href="api/gluon/data/index.html#mxnet.gluon.data.SimpleDataset">SimpleDataset
(class in mxnet.gluon.data)</a>
</li>
<li><a href="api/legacy/ndarray/ndarray.html#mxnet.ndarray.sin">sin()
(in module mxnet.ndarray)</a>
diff --git a/api/python/docs/objects.inv b/api/python/docs/objects.inv
index be88475..a4f66a4 100644
Binary files a/api/python/docs/objects.inv and b/api/python/docs/objects.inv
differ
diff --git a/api/python/docs/searchindex.js b/api/python/docs/searchindex.js
index a65f605..3dd6047 100644
--- a/api/python/docs/searchindex.js
+++ b/api/python/docs/searchindex.js
@@ -1 +1 @@
-Search.setIndex({docnames:["api/autograd/index","api/context/index","api/contrib/index","api/contrib/io/index","api/contrib/ndarray/index","api/contrib/onnx/index","api/contrib/symbol/index","api/contrib/tensorboard/index","api/contrib/tensorrt/index","api/contrib/text/index","api/engine/index","api/executor/index","api/gluon/block","api/gluon/constant","api/gluon/contrib/index","api/gluon/data/index","api/gluon/data/vision/datasets/index","api/gluon/data/vision/index","api/gluon/data/vi
[...]
\ No newline at end of file
+Search.setIndex({docnames:["api/autograd/index","api/context/index","api/contrib/index","api/contrib/io/index","api/contrib/ndarray/index","api/contrib/onnx/index","api/contrib/symbol/index","api/contrib/tensorboard/index","api/contrib/tensorrt/index","api/contrib/text/index","api/engine/index","api/executor/index","api/gluon/block","api/gluon/constant","api/gluon/contrib/index","api/gluon/data/index","api/gluon/data/vision/datasets/index","api/gluon/data/vision/index","api/gluon/data/vi
[...]
\ No newline at end of file
diff --git
a/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html
b/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html
index ce95976..43d90b3 100644
---
a/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html
+++
b/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html
@@ -1544,7 +1544,7 @@ div.rendered_html tbody tr:hover {
<h1>Activation Blocks<a class="headerlink" href="#Activation-Blocks"
title="Permalink to this headline">¶</a></h1>
<p>Deep neural networks are a way to express a nonlinear function with lots of
parameters from input data to outputs. The nonlinearities that allow neural
networks to capture complex patterns in data are referred to as activation
functions. Over the course of the development of neural networks, several
nonlinear activation functions have been introduced to make gradient-based deep
learning tractable.</p>
<p>If you are looking to answer the question, ‘which activation function
should I use for my neural network model?’, you should probably go with
<em>ReLU</em>. Unless you’re trying to implement something like a gating
mechanism, like in LSTMs or GRU cells, then you should opt for sigmoid and/or
tanh in those cells. However, if you have a working model architecture and
you’re trying to improve its performance by swapping out activation functions
or treating the activation function as a hy [...]
-you may want to try hand-designed activations like SELU or a function
discovered by reinforcement learning and exhaustive search like Swish. This
guide describes these activation functions and others implemented in MXNet in
detail.</p>
+you may want to try hand-designed activations like SELU, SiLU, or GELU. This
guide describes these activation functions and others implemented in MXNet in
detail.</p>
<div class="section" id="Visualizing-Activations">
<h2>Visualizing Activations<a class="headerlink"
href="#Visualizing-Activations" title="Permalink to this headline">¶</a></h2>
<p>In order to compare the various activation functions and to understand the
nuances of their differences we have a snippet of code to plot the activation
functions (used in the forward pass) and their gradients (used in the backward
pass).</p>
@@ -1764,23 +1764,42 @@ addressed by ensuring that the tuning the learning rate
to ensure that it’s no
</div>
<p><img alt="selu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/selu.png" /></p>
</div>
-<div class="section" id="Swish">
-<h3>Swish<a class="headerlink" href="#Swish" title="Permalink to this
headline">¶</a></h3>
-<p>Swish is an activation function that attempts to address the shortcomings
of ReLU by combining ideas from ReLU and sigmoid. Swish was discovered by
searching the space of activation functions using a combination of exhaustive
and reinforcement learning-based search and was introduced in the paper by <a
class="reference external"
href="https://arxiv.org/pdf/1710.05941.pdf">Ramchandran et al</a>.</p>
-<p>The swish function is given as</p>
+<div class="section" id="SiLU">
+<h3>SiLU<a class="headerlink" href="#SiLU" title="Permalink to this
headline">¶</a></h3>
+<p>The SiLU is an activation function that attempts to address the
shortcomings of ReLU by combining ideas from ReLU and sigmoid. The SiLU serves
as a smooth approximation to the ReLU and was originally introduced in <a
class="reference external" href="https://arxiv.org/abs/1606.08415">Hendrycks et
al</a>.</p>
+<p>The silu function is given as</p>
<div class="math notranslate nohighlight">
-\[swish(x) = x\cdot\sigma(\beta x)\]</div>
-<p>where <span class="math notranslate nohighlight">\(\sigma\)</span> is the
sigmoid activation function <span class="math notranslate
nohighlight">\(\sigma(x) = \frac{1}{1 + e^{-x}}\)</span> described above and
<span class="math notranslate nohighlight">\(\beta\)</span> is a hyperparameter
set to 1 by default in MXNet.</p>
+\[silu(x) = x\cdot\sigma(x)\]</div>
+<p>where <span class="math notranslate nohighlight">\(\sigma\)</span> is the
sigmoid activation function <span class="math notranslate
nohighlight">\(\sigma(x) = \frac{1}{1 + e^{-x}}\)</span> described above.</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div
class="highlight"><pre><span></span>[ ]:
</pre></div>
</div>
<div class="input_area highlight-python notranslate"><div
class="highlight"><pre>
-<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">Swish</span><span class="p">())</span>
+<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">SiLU</span><span class="p">())</span>
</pre></div>
</div>
</div>
-<p><img alt="swish activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/swish.png" /></p>
+<p><img alt="silu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/silu.png" /></p>
+</div>
+<div class="section" id="GELU">
+<h3>GELU<a class="headerlink" href="#GELU" title="Permalink to this
headline">¶</a></h3>
+<p>The GELU is a smooth approximation to the ReLU and was introduced in <a
class="reference external" href="https://arxiv.org/abs/1606.08415">Hendrycks et
al</a>. It is a common activation function in architectures such as
Transformers, BERT, and GPT.</p>
+<p>The gelu function is given as</p>
+<div class="math notranslate nohighlight">
+\[gelu(x) = x\cdot\Phi(x),\]</div>
+<p>whereas the ReLU can be written as <span class="math notranslate
nohighlight">\(x\cdot\mathbf{1}(x>0)\)</span>, so <span class="math
notranslate nohighlight">\(\Phi(x)\)</span> serves as a smooth approximation to
the ReLU’s indicator function.</p>
+<p>Note <span class="math notranslate nohighlight">\(\Phi(x) =
\int_{-\infty}^{x} \frac{1}{\sqrt{2 \pi}} \exp\left\{-\frac{t^2}{2}\right\} \mathrm{d}t\)</span> is the
standard normal cumulative distribution function.</p>
+<div class="nbinput nblast docutils container">
+<div class="prompt highlight-none notranslate"><div
class="highlight"><pre><span></span>[ ]:
+</pre></div>
+</div>
+<div class="input_area highlight-python notranslate"><div
class="highlight"><pre>
+<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">GELU</span><span class="p">())</span>
+</pre></div>
+</div>
+</div>
+<p><img alt="gelu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/gelu.png" /></p>
</div>
</div>
<div class="section" id="Summary">
@@ -1791,7 +1810,7 @@ addressed by ensuring that the tuning the learning rate
to ensure that it’s no
<li><p>Sigmoids like the logistic (sigmoid) function and tanh where the first
kinds of activation functions used in neural networks. They have since fallen
out of use because of their tendency to saturate and have vanishing
gradients.</p></li>
<li><p>Rectifiers like ReLU do not saturate like the Sigmoids and so address
the vanishing gradient problem making them the de facto activation functions.
ReLU however is still plagued by the dying ReLU problem.</p></li>
<li><p>LeakyReLU and PReLU are two similar approaches to improve ReLU and
address the dying ReLU by introducing a parameter <span class="math notranslate
nohighlight">\(\alpha\)</span> (learned in PReLU) that leaks to the gradient of
negative inputs</p></li>
-<li><p>MXNet also implements custom state-of-the-art activations like ELU,
SELU and Swish.</p></li>
+<li><p>MXNet also implements custom state-of-the-art activations like ELU,
SELU, SiLU, and GELU.</p></li>
</ul>
</div>
<div class="section" id="Next-Steps">
@@ -1836,7 +1855,8 @@ guide</a> to learn how to implement your own custom
activation layer.</p>
<li><a class="reference internal" href="#PReLU">PReLU</a></li>
<li><a class="reference internal" href="#ELU">ELU</a></li>
<li><a class="reference internal" href="#SELU">SELU</a></li>
-<li><a class="reference internal" href="#Swish">Swish</a></li>
+<li><a class="reference internal" href="#SiLU">SiLU</a></li>
+<li><a class="reference internal" href="#GELU">GELU</a></li>
</ul>
</li>
<li><a class="reference internal" href="#Summary">Summary</a></li>
diff --git
a/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html.bak
b/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html.bak
index 8427e78..a674337 100644
---
a/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html.bak
+++
b/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html.bak
@@ -1544,7 +1544,7 @@ div.rendered_html tbody tr:hover {
<h1>Activation Blocks<a class="headerlink" href="#Activation-Blocks"
title="Permalink to this headline">¶</a></h1>
<p>Deep neural networks are a way to express a nonlinear function with lots of
parameters from input data to outputs. The nonlinearities that allow neural
networks to capture complex patterns in data are referred to as activation
functions. Over the course of the development of neural networks, several
nonlinear activation functions have been introduced to make gradient-based deep
learning tractable.</p>
<p>If you are looking to answer the question, ‘which activation function
should I use for my neural network model?’, you should probably go with
<em>ReLU</em>. Unless you’re trying to implement something like a gating
mechanism, like in LSTMs or GRU cells, then you should opt for sigmoid and/or
tanh in those cells. However, if you have a working model architecture and
you’re trying to improve its performance by swapping out activation functions
or treating the activation function as a hy [...]
-you may want to try hand-designed activations like SELU or a function
discovered by reinforcement learning and exhaustive search like Swish. This
guide describes these activation functions and others implemented in MXNet in
detail.</p>
+you may want to try hand-designed activations like SELU, SiLU, or GELU. This
guide describes these activation functions and others implemented in MXNet in
detail.</p>
<div class="section" id="Visualizing-Activations">
<h2>Visualizing Activations<a class="headerlink"
href="#Visualizing-Activations" title="Permalink to this headline">¶</a></h2>
<p>In order to compare the various activation functions and to understand the
nuances of their differences we have a snippet of code to plot the activation
functions (used in the forward pass) and their gradients (used in the backward
pass).</p>
@@ -1764,23 +1764,42 @@ addressed by ensuring that the tuning the learning rate
to ensure that it’s no
</div>
<p><img alt="selu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/selu.png" /></p>
</div>
-<div class="section" id="Swish">
-<h3>Swish<a class="headerlink" href="#Swish" title="Permalink to this
headline">¶</a></h3>
-<p>Swish is an activation function that attempts to address the shortcomings
of ReLU by combining ideas from ReLU and sigmoid. Swish was discovered by
searching the space of activation functions using a combination of exhaustive
and reinforcement learning-based search and was introduced in the paper by <a
class="reference external"
href="https://arxiv.org/pdf/1710.05941.pdf">Ramchandran et al</a>.</p>
-<p>The swish function is given as</p>
+<div class="section" id="SiLU">
+<h3>SiLU<a class="headerlink" href="#SiLU" title="Permalink to this
headline">¶</a></h3>
+<p>The SiLU is an activation function that attempts to address the
shortcomings of ReLU by combining ideas from ReLU and sigmoid. The SiLU serves
as a smooth approximation to the ReLU and was originally introduced in <a
class="reference external" href="https://arxiv.org/abs/1606.08415">Hendrycks et
al</a>.</p>
+<p>The silu function is given as</p>
<div class="math notranslate nohighlight">
-\[swish(x) = x\cdot\sigma(\beta x)\]</div>
-<p>where <span class="math notranslate nohighlight">\(\sigma\)</span> is the
sigmoid activation function <span class="math notranslate
nohighlight">\(\sigma(x) = \frac{1}{1 + e^{-x}}\)</span> described above and
<span class="math notranslate nohighlight">\(\beta\)</span> is a hyperparameter
set to 1 by default in MXNet.</p>
+\[silu(x) = x\cdot\sigma(x)\]</div>
+<p>where <span class="math notranslate nohighlight">\(\sigma\)</span> is the
sigmoid activation function <span class="math notranslate
nohighlight">\(\sigma(x) = \frac{1}{1 + e^{-x}}\)</span> described above.</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div
class="highlight"><pre><span></span>[ ]:
</pre></div>
</div>
<div class="input_area highlight-python notranslate"><div
class="highlight"><pre>
-<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">Swish</span><span class="p">())</span>
+<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">SiLU</span><span class="p">())</span>
</pre></div>
</div>
</div>
-<p><img alt="swish activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/swish.png" /></p>
+<p><img alt="silu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/silu.png" /></p>
+</div>
+<div class="section" id="GELU">
+<h3>GELU<a class="headerlink" href="#GELU" title="Permalink to this
headline">¶</a></h3>
+<p>The GELU is a smooth approximation to the ReLU and was introduced in <a
class="reference external" href="https://arxiv.org/abs/1606.08415">Hendrycks et
al</a>. It is a common activation function in architectures such as
Transformers, BERT, and GPT.</p>
+<p>The gelu function is given as</p>
+<div class="math notranslate nohighlight">
+\[gelu(x) = x\cdot\Phi(x),\]</div>
+<p>whereas the ReLU can be written as <span class="math notranslate
nohighlight">\(x\cdot\mathbf{1}(x>0)\)</span>, so <span class="math
notranslate nohighlight">\(\Phi(x)\)</span> serves as a smooth approximation to
the ReLU’s indicator function.</p>
+<p>Note <span class="math notranslate nohighlight">\(\Phi(x) =
\int_{-\infty}^{x} \frac{1}{\sqrt{2 \pi}} \exp\left\{-\frac{t^2}{2}\right\} \mathrm{d}t\)</span> is the
standard normal cumulative distribution function.</p>
+<div class="nbinput nblast docutils container">
+<div class="prompt highlight-none notranslate"><div
class="highlight"><pre><span></span>[ ]:
+</pre></div>
+</div>
+<div class="input_area highlight-python notranslate"><div
class="highlight"><pre>
+<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">GELU</span><span class="p">())</span>
+</pre></div>
+</div>
+</div>
+<p><img alt="gelu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/gelu.png" /></p>
</div>
</div>
<div class="section" id="Summary">
@@ -1791,7 +1810,7 @@ addressed by ensuring that the tuning the learning rate
to ensure that it’s no
<li><p>Sigmoids like the logistic (sigmoid) function and tanh where the first
kinds of activation functions used in neural networks. They have since fallen
out of use because of their tendency to saturate and have vanishing
gradients.</p></li>
<li><p>Rectifiers like ReLU do not saturate like the Sigmoids and so address
the vanishing gradient problem making them the de facto activation functions.
ReLU however is still plagued by the dying ReLU problem.</p></li>
<li><p>LeakyReLU and PReLU are two similar approaches to improve ReLU and
address the dying ReLU by introducing a parameter <span class="math notranslate
nohighlight">\(\alpha\)</span> (learned in PReLU) that leaks to the gradient of
negative inputs</p></li>
-<li><p>MXNet also implements custom state-of-the-art activations like ELU,
SELU and Swish.</p></li>
+<li><p>MXNet also implements custom state-of-the-art activations like ELU,
SELU, SiLU, and GELU.</p></li>
</ul>
</div>
<div class="section" id="Next-Steps">
@@ -1836,7 +1855,8 @@ guide</a> to learn how to implement your own custom
activation layer.</p>
<li><a class="reference internal" href="#PReLU">PReLU</a></li>
<li><a class="reference internal" href="#ELU">ELU</a></li>
<li><a class="reference internal" href="#SELU">SELU</a></li>
-<li><a class="reference internal" href="#Swish">Swish</a></li>
+<li><a class="reference internal" href="#SiLU">SiLU</a></li>
+<li><a class="reference internal" href="#GELU">GELU</a></li>
</ul>
</li>
<li><a class="reference internal" href="#Summary">Summary</a></li>
diff --git a/date.txt b/date.txt
deleted file mode 100644
index b906a8b..0000000
--- a/date.txt
+++ /dev/null
@@ -1 +0,0 @@
-Mon Nov 9 12:43:55 UTC 2020
diff --git a/feed.xml b/feed.xml
index 3fc6f78..486d677 100644
--- a/feed.xml
+++ b/feed.xml
@@ -1 +1 @@
-<?xml version="1.0" encoding="utf-8"?><feed
xmlns="http://www.w3.org/2005/Atom" ><generator uri="https://jekyllrb.com/"
version="4.0.0">Jekyll</generator><link
href="https://mxnet.apache.org/versions/master/feed.xml" rel="self"
type="application/atom+xml" /><link
href="https://mxnet.apache.org/versions/master/" rel="alternate"
type="text/html"
/><updated>2020-11-09T12:32:51+00:00</updated><id>https://mxnet.apache.org/versions/master/feed.xml</id><title
type="html">Apache MXNet</title><su [...]
\ No newline at end of file
+<?xml version="1.0" encoding="utf-8"?><feed
xmlns="http://www.w3.org/2005/Atom" ><generator uri="https://jekyllrb.com/"
version="4.0.0">Jekyll</generator><link
href="https://mxnet.apache.org/versions/master/feed.xml" rel="self"
type="application/atom+xml" /><link
href="https://mxnet.apache.org/versions/master/" rel="alternate"
type="text/html"
/><updated>2020-11-09T18:33:17+00:00</updated><id>https://mxnet.apache.org/versions/master/feed.xml</id><title
type="html">Apache MXNet</title><su [...]
\ No newline at end of file
diff --git a/versions/master/api/python/docs/_modules/mxnet/gluon/block.html
b/versions/master/api/python/docs/_modules/mxnet/gluon/block.html
index baa4093..7856e40 100644
--- a/versions/master/api/python/docs/_modules/mxnet/gluon/block.html
+++ b/versions/master/api/python/docs/_modules/mxnet/gluon/block.html
@@ -2368,14 +2368,35 @@ Edit on Github
<span class="n">arg_dict</span><span class="p">,</span> <span
class="n">aux_dict</span> <span class="o">=</span> <span
class="nb">dict</span><span class="p">(),</span> <span
class="nb">dict</span><span class="p">()</span>
<span class="k">if</span> <span class="bp">self</span><span
class="o">.</span><span class="n">_backend</span><span class="p">:</span>
- <span class="n">ctx</span> <span class="o">=</span> <span
class="n">args</span><span class="p">[</span><span class="mi">0</span><span
class="p">]</span><span class="o">.</span><span class="n">context</span>
+ <span class="c1"># set context for inputs</span>
+ <span class="n">_</span><span class="p">,</span> <span
class="n">_</span><span class="p">,</span> <span class="n">ctx_set</span><span
class="p">,</span> <span class="n">_</span> <span class="o">=</span> <span
class="n">_gather_type_ctx_info</span><span class="p">(</span><span
class="nb">list</span><span class="p">(</span><span class="n">args</span><span
class="p">))</span>
+ <span class="n">ctx</span> <span class="o">=</span> <span
class="n">ctx_set</span><span class="o">.</span><span class="n">pop</span><span
class="p">()</span> <span class="k">if</span> <span class="nb">len</span><span
class="p">(</span><span class="n">ctx_set</span><span class="p">)</span> <span
class="o">></span> <span class="mi">0</span> <span class="k">else</span>
<span class="kc">None</span>
<span class="c1"># get list of params in the order of
out.list_arguments</span>
- <span class="n">arg_dict</span><span class="o">.</span><span
class="n">update</span><span class="p">({</span><span
class="n">name</span><span class="p">:</span><span class="n">args</span><span
class="p">[</span><span class="n">data_names</span><span
class="p">[</span><span class="n">name</span><span class="p">]]</span> <span
class="k">if</span> <span class="n">name</span> <span class="ow">in</span>
<span class="n">data_names</span><span class="o">.</span><span class="n">keys<
[...]
- <span class="k">for</span> <span
class="n">name</span> <span class="ow">in</span> <span
class="n">out</span><span class="o">.</span><span
class="n">list_arguments</span><span class="p">()})</span>
- <span class="n">aux_dict</span><span class="o">.</span><span
class="n">update</span><span class="p">({</span><span
class="n">name</span><span class="p">:</span><span class="n">args</span><span
class="p">[</span><span class="n">data_names</span><span
class="p">[</span><span class="n">name</span><span class="p">]]</span> <span
class="k">if</span> <span class="n">name</span> <span class="ow">in</span>
<span class="n">data_names</span><span class="o">.</span><span class="n">keys<
[...]
- <span class="k">for</span> <span
class="n">name</span> <span class="ow">in</span> <span
class="n">out</span><span class="o">.</span><span
class="n">list_auxiliary_states</span><span class="p">()})</span>
- <span class="c1"># Partition the graph.</span>
- <span class="n">out</span> <span class="o">=</span> <span
class="n">out</span><span class="o">.</span><span
class="n">optimize_for</span><span class="p">(</span><span
class="bp">self</span><span class="o">.</span><span
class="n">_backend</span><span class="p">,</span> <span
class="n">arg_dict</span><span class="p">,</span> <span
class="n">aux_dict</span><span class="p">,</span> <span
class="n">ctx</span><span class="p">,</span> <span class="o">**</span><span
class="bp">self</ [...]
+ <span class="n">input_shapes</span> <span class="o">=</span> <span
class="nb">dict</span><span class="p">()</span>
+ <span class="k">for</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">out</span><span class="o">.</span><span
class="n">list_arguments</span><span class="p">():</span>
+ <span class="k">if</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">data_names</span><span
class="o">.</span><span class="n">keys</span><span class="p">()</span> <span
class="ow">and</span> <span class="n">data_names</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o"><</span> <span class="nb">len</span><span class="p">(</span><span
class="n">args</span><span class="p">):</span>
+ <span class="k">if</span> <span
class="nb">isinstance</span><span class="p">(</span><span
class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]],</span> <span
class="n">NDArray</span><span class="p">):</span>
+ <span class="n">arg_dict</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span>
+ <span class="k">elif</span> <span class="p">(</span><span
class="nb">isinstance</span><span class="p">(</span><span
class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]],</span> <span
class="n">symbol</span><span class="o">.</span><span
class="n">Symbol</span><span class="p">)</span> <span class="ow">and</span>
+ <span class="s1">'__shape__'</span> <span
class="ow">in</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span><span class="o">.</span><span
class="n">list_attr</span><span class="p">()):</span>
+ <span class="n">shape_str</span> <span
class="o">=</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span><span class="o">.</span><span
class="n">list_attr</span><span class="p">()[</span><span
class="s1">'__shape__'</span><span class="p">]</span>
+ <span class="n">input_shapes</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span
class="nb">map</span><span class="p">(</span><span class="nb">int</span><span
class="p">,</span> <span class="n">shape_str</span><span
class="o">.</span><span class="n">strip</span><span class="p">(</span><span
class="s1">'()'</span><span class="p">)</span><span cl [...]
+ <span class="k">elif</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">params</span><span class="p">:</span>
+ <span class="n">arg_dict</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="n">params</span><span class="p">[</span><span
class="n">name</span><span class="p">]</span><span class="o">.</span><span
class="n">data</span><span class="p">()</span>
+
+ <span class="k">for</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">out</span><span class="o">.</span><span
class="n">list_auxiliary_states</span><span class="p">():</span>
+ <span class="k">if</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">data_names</span><span
class="o">.</span><span class="n">keys</span><span class="p">()</span> <span
class="ow">and</span> <span class="n">data_names</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o"><</span> <span class="nb">len</span><span class="p">(</span><span
class="n">args</span><span class="p">):</span>
+ <span class="k">if</span> <span
class="nb">isinstance</span><span class="p">(</span><span
class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]],</span> <span
class="n">NDArray</span><span class="p">):</span>
+ <span class="n">aux_dict</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span>
+ <span class="k">elif</span> <span class="p">(</span><span
class="nb">isinstance</span><span class="p">(</span><span
class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]],</span> <span
class="n">symbol</span><span class="o">.</span><span
class="n">Symbol</span><span class="p">)</span> <span class="ow">and</span>
+ <span class="s1">'__shape__'</span> <span
class="ow">in</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span><span class="o">.</span><span
class="n">list_attr</span><span class="p">()):</span>
+ <span class="n">shape_str</span> <span
class="o">=</span> <span class="n">args</span><span class="p">[</span><span
class="n">data_names</span><span class="p">[</span><span
class="n">name</span><span class="p">]]</span><span class="o">.</span><span
class="n">list_attr</span><span class="p">()[</span><span
class="s1">'__shape__'</span><span class="p">]</span>
+ <span class="n">input_shapes</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="nb">tuple</span><span class="p">(</span><span
class="nb">map</span><span class="p">(</span><span class="nb">int</span><span
class="p">,</span> <span class="n">shape_str</span><span
class="o">.</span><span class="n">strip</span><span class="p">(</span><span
class="s1">'()'</span><span class="p">)</span><span cl [...]
+ <span class="k">elif</span> <span class="n">name</span> <span
class="ow">in</span> <span class="n">params</span><span class="p">:</span>
+ <span class="n">aux_dict</span><span
class="p">[</span><span class="n">name</span><span class="p">]</span> <span
class="o">=</span> <span class="n">params</span><span class="p">[</span><span
class="n">name</span><span class="p">]</span><span class="o">.</span><span
class="n">data</span><span class="p">()</span>
+
+ <span class="c1"># Partition the graph</span>
+ <span class="n">out</span> <span class="o">=</span> <span
class="n">out</span><span class="o">.</span><span
class="n">optimize_for</span><span class="p">(</span><span
class="bp">self</span><span class="o">.</span><span
class="n">_backend</span><span class="p">,</span> <span
class="n">arg_dict</span><span class="p">,</span> <span
class="n">aux_dict</span><span class="p">,</span> <span
class="n">ctx</span><span class="p">,</span> <span
class="n">input_shapes</span><span class=" [...]
<span class="c1"># convert to numpy symbol if needed</span>
<span class="k">if</span> <span class="n">_mx_npx</span><span
class="o">.</span><span class="n">is_np_array</span><span class="p">():</span>
@@ -2418,7 +2439,7 @@ Edit on Github
<span class="n">param</span> <span class="o">=</span>
<span class="n">Parameter</span><span class="p">(</span><span
class="n">name</span><span class="p">,</span> <span class="n">dtype</span><span
class="o">=</span><span class="n">param_data</span><span
class="o">.</span><span class="n">dtype</span><span class="p">)</span>
<span class="n">param</span><span class="o">.</span><span
class="n">_var_name</span> <span class="o">=</span> <span class="n">name</span>
<span class="n">serialization_name</span> <span
class="o">=</span> <span class="n">name</span> <span class="c1">#
HybridBlock.export</span>
- <span class="n">param</span><span class="o">.</span><span
class="n">_load_init</span><span class="p">(</span><span
class="n">param_data</span><span class="p">,</span> <span
class="n">args</span><span class="p">[</span><span class="mi">0</span><span
class="p">]</span><span class="o">.</span><span class="n">context</span><span
class="p">)</span>
+ <span class="n">param</span><span class="o">.</span><span
class="n">_load_init</span><span class="p">(</span><span
class="n">param_data</span><span class="p">,</span> <span
class="n">param_data</span><span class="o">.</span><span
class="n">context</span><span class="p">)</span>
<span class="n">triple</span> <span class="o">=</span> <span
class="p">(</span><span class="kc">False</span><span class="p">,</span> <span
class="n">serialization_name</span><span class="p">,</span> <span
class="n">param</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span
class="n">_cached_op_args</span><span class="o">.</span><span
class="n">append</span><span class="p">(</span><span
class="n">triple</span><span class="p">)</span>
@@ -2520,14 +2541,11 @@ Edit on Github
<span class="c1"># do part of forward API call</span>
<span class="n">has_symbol</span><span class="p">,</span> <span
class="n">has_ndarray</span><span class="p">,</span> <span
class="n">ctx_set</span><span class="p">,</span> <span class="n">_</span> <span
class="o">=</span> <span class="n">_gather_type_ctx_info</span><span
class="p">([</span><span class="n">x</span><span class="p">]</span> <span
class="o">+</span> <span class="nb">list</span><span class="p">(</span><span
class="n">args</span><span class="p">))</span>
- <span class="k">if</span> <span class="n">has_symbol</span><span
class="p">:</span>
- <span class="k">raise</span> <span
class="ne">ValueError</span><span class="p">(</span><span
class="s1">'Inputs must be NDArrays for the optimize_for API'</span>
- <span class="s1">' Please check the type of
the args.</span><span class="se">\n</span><span class="s1">'</span><span
class="p">)</span>
<span class="k">if</span> <span class="ow">not</span> <span
class="n">has_symbol</span> <span class="ow">and</span> <span
class="ow">not</span> <span class="n">has_ndarray</span><span class="p">:</span>
- <span class="k">raise</span> <span
class="ne">ValueError</span><span class="p">(</span><span class="s1">'In
HybridBlock, there must be one NDArray as input.'</span>
+ <span class="k">raise</span> <span
class="ne">ValueError</span><span class="p">(</span><span class="s1">'In
HybridBlock, there must be one NDArray or one Symbol in the input.'</span>
<span class="s1">' Please check the type of
the args.</span><span class="se">\n</span><span class="s1">'</span><span
class="p">)</span>
<span class="k">if</span> <span class="nb">len</span><span
class="p">(</span><span class="n">ctx_set</span><span class="p">)</span> <span
class="o">></span> <span class="mi">1</span><span class="p">:</span>
- <span class="k">raise</span> <span
class="ne">ValueError</span><span class="p">(</span><span class="s1">'Find
multiple contexts in the input, '</span>
+ <span class="k">raise</span> <span
class="ne">ValueError</span><span class="p">(</span><span class="s1">'Found
multiple contexts in the input, '</span>
<span class="s1">'After hybridized, the
HybridBlock only supports one input '</span>
<span class="s1">'context. You can print the
ele.ctx in the '</span>
<span class="s1">'input arguments to inspect
their contexts. '</span>
diff --git
a/versions/master/api/python/docs/_modules/mxnet/gluon/nn/activations.html
b/versions/master/api/python/docs/_modules/mxnet/gluon/nn/activations.html
index eac566d..a5ef9ef 100644
--- a/versions/master/api/python/docs/_modules/mxnet/gluon/nn/activations.html
+++ b/versions/master/api/python/docs/_modules/mxnet/gluon/nn/activations.html
@@ -1356,7 +1356,7 @@ Edit on Github
<span class="c1"># coding: utf-8</span>
<span class="c1"># pylint: disable= arguments-differ</span>
<span class="sd">"""Basic neural network
layers."""</span>
-<span class="n">__all__</span> <span class="o">=</span> <span
class="p">[</span><span class="s1">'Activation'</span><span
class="p">,</span> <span class="s1">'LeakyReLU'</span><span
class="p">,</span> <span class="s1">'PReLU'</span><span
class="p">,</span> <span class="s1">'ELU'</span><span
class="p">,</span> <span class="s1">'SELU'</span><span
class="p">,</span> <span class="s1">'Swish'</span><span
class="p">,</span> <span class="s1">' [...]
+<span class="n">__all__</span> <span class="o">=</span> <span
class="p">[</span><span class="s1">'Activation'</span><span
class="p">,</span> <span class="s1">'LeakyReLU'</span><span
class="p">,</span> <span class="s1">'PReLU'</span><span
class="p">,</span> <span class="s1">'ELU'</span><span
class="p">,</span> <span class="s1">'SELU'</span><span
class="p">,</span> <span class="s1">'Swish'</span><span
class="p">,</span> <span class="s1">' [...]
<span class="kn">from</span> <span class="nn">...</span> <span
class="kn">import</span> <span class="n">initializer</span>
<span class="kn">from</span> <span class="nn">..block</span> <span
class="kn">import</span> <span class="n">HybridBlock</span>
@@ -1553,7 +1553,7 @@ Edit on Github
<div class="viewcode-block" id="Swish"><a class="viewcode-back"
href="../../../../api/gluon/nn/index.html#mxnet.gluon.nn.Swish">[docs]</a><span
class="k">class</span> <span class="nc">Swish</span><span
class="p">(</span><span class="n">HybridBlock</span><span class="p">):</span>
<span class="sa">r</span><span class="sd">"""</span>
-<span class="sd"> Swish Activation function</span>
+<span class="sd"> Swish Activation function (SiLU with a
hyperparameter)</span>
<span class="sd"> https://arxiv.org/pdf/1710.05941.pdf</span>
<span class="sd"> Parameters</span>
@@ -1578,6 +1578,35 @@ Edit on Github
<span class="k">return</span> <span class="n">x</span> <span
class="o">*</span> <span class="n">F</span><span class="o">.</span><span
class="n">npx</span><span class="o">.</span><span class="n">sigmoid</span><span
class="p">(</span><span class="bp">self</span><span class="o">.</span><span
class="n">_beta</span> <span class="o">*</span> <span class="n">x</span><span
class="p">)</span>
<span class="k">else</span><span class="p">:</span>
<span class="k">return</span> <span class="n">x</span> <span
class="o">*</span> <span class="n">F</span><span class="o">.</span><span
class="n">sigmoid</span><span class="p">(</span><span
class="bp">self</span><span class="o">.</span><span class="n">_beta</span>
<span class="o">*</span> <span class="n">x</span><span class="p">,</span> <span
class="n">name</span><span class="o">=</span><span
class="s1">'fwd'</span><span class="p">)</span></div></div>
+
+
+<div class="viewcode-block" id="SiLU"><a class="viewcode-back"
href="../../../../api/gluon/nn/index.html#mxnet.gluon.nn.SiLU">[docs]</a><span
class="k">class</span> <span class="nc">SiLU</span><span
class="p">(</span><span class="n">HybridBlock</span><span class="p">):</span>
+ <span class="sa">r</span><span class="sd">"""</span>
+<span class="sd"> Sigmoid Linear Units</span>
+<span class="sd"> Originally proposed "Gaussian Error Linear Units
(GELUs)", Hendrycks et al, 2016</span>
+<span class="sd"> https://arxiv.org/abs/1606.08415</span>
+
+<span class="sd"> Parameters</span>
+<span class="sd"> ----------</span>
+<span class="sd"> beta : float</span>
+<span class="sd"> silu(x) = x * sigmoid(x)</span>
+
+
+<span class="sd"> Inputs:</span>
+<span class="sd"> - **data**: input tensor with arbitrary shape.</span>
+
+<span class="sd"> Outputs:</span>
+<span class="sd"> - **out**: output tensor with the same shape as
`data`.</span>
+<span class="sd"> """</span>
+
+ <span class="k">def</span> <span class="fm">__init__</span><span
class="p">(</span><span class="bp">self</span><span class="p">,</span> <span
class="o">**</span><span class="n">kwargs</span><span class="p">):</span>
+ <span class="nb">super</span><span class="p">(</span><span
class="n">SiLU</span><span class="p">,</span> <span class="bp">self</span><span
class="p">)</span><span class="o">.</span><span class="fm">__init__</span><span
class="p">(</span><span class="o">**</span><span class="n">kwargs</span><span
class="p">)</span>
+
+<div class="viewcode-block" id="SiLU.hybrid_forward"><a class="viewcode-back"
href="../../../../api/gluon/nn/index.html#mxnet.gluon.nn.SiLU.hybrid_forward">[docs]</a>
<span class="k">def</span> <span class="nf">hybrid_forward</span><span
class="p">(</span><span class="bp">self</span><span class="p">,</span> <span
class="n">F</span><span class="p">,</span> <span class="n">x</span><span
class="p">):</span>
+ <span class="k">if</span> <span class="n">is_np_array</span><span
class="p">():</span>
+ <span class="k">return</span> <span class="n">x</span> <span
class="o">*</span> <span class="n">F</span><span class="o">.</span><span
class="n">npx</span><span class="o">.</span><span class="n">sigmoid</span><span
class="p">(</span><span class="n">x</span><span class="p">)</span>
+ <span class="k">else</span><span class="p">:</span>
+ <span class="k">return</span> <span class="n">x</span> <span
class="o">*</span> <span class="n">F</span><span class="o">.</span><span
class="n">sigmoid</span><span class="p">(</span><span class="n">x</span><span
class="p">,</span> <span class="n">name</span><span class="o">=</span><span
class="s1">'fwd'</span><span class="p">)</span></div></div>
</pre></div>
<hr class="feedback-hr-top" />
diff --git a/versions/master/api/python/docs/_sources/api/gluon/nn/index.rst
b/versions/master/api/python/docs/_sources/api/gluon/nn/index.rst
index 0094647..7aeacd8 100644
--- a/versions/master/api/python/docs/_sources/api/gluon/nn/index.rst
+++ b/versions/master/api/python/docs/_sources/api/gluon/nn/index.rst
@@ -147,6 +147,8 @@ Advanced Activation Layers
nn.ELU
nn.SELU
nn.Swish
+ nn.SiLU
+ nn.GELU
API Reference
-------------
diff --git
a/versions/master/api/python/docs/_sources/tutorials/packages/gluon/blocks/activations/activations.ipynb
b/versions/master/api/python/docs/_sources/tutorials/packages/gluon/blocks/activations/activations.ipynb
index fbdaec2..57f9196 100644
---
a/versions/master/api/python/docs/_sources/tutorials/packages/gluon/blocks/activations/activations.ipynb
+++
b/versions/master/api/python/docs/_sources/tutorials/packages/gluon/blocks/activations/activations.ipynb
@@ -25,7 +25,7 @@
"\n",
"Deep neural networks are a way to express a nonlinear function with lots
of parameters from input data to outputs. The nonlinearities that allow neural
networks to capture complex patterns in data are referred to as activation
functions. Over the course of the development of neural networks, several
nonlinear activation functions have been introduced to make gradient-based deep
learning tractable. \n",
"\n",
- "If you are looking to answer the question, 'which activation function
should I use for my neural network model?', you should probably go with *ReLU*.
Unless you're trying to implement something like a gating mechanism, like in
LSTMs or GRU cells, then you should opt for sigmoid and/or tanh in those cells.
However, if you have a working model architecture and you're trying to improve
its performance by swapping out activation functions or treating the activation
function as a hyperpa [...]
+ "If you are looking to answer the question, 'which activation function
should I use for my neural network model?', you should probably go with *ReLU*.
Unless you're trying to implement something like a gating mechanism, like in
LSTMs or GRU cells, then you should opt for sigmoid and/or tanh in those cells.
However, if you have a working model architecture and you're trying to improve
its performance by swapping out activation functions or treating the activation
function as a hyperpa [...]
"\n",
"## Visualizing Activations\n",
"In order to compare the various activation functions and to understand
the nuances of their differences we have a snippet of code to plot the
activation functions (used in the forward pass) and their gradients (used in
the backward pass)."
@@ -324,14 +324,14 @@
"\n",
"\n",
"\n",
- "### Swish\n",
- "Swish is an activation function that attempts to address the shortcomings
of ReLU by combining ideas from ReLU and sigmoid. Swish was discovered by
searching the space of activation functions using a combination of exhaustive
and reinforcement learning-based search and was introduced in the paper by
[Ramchandran et al](https://arxiv.org/pdf/1710.05941.pdf).\n",
+ "### SiLU\n",
+ "The SiLU is an activation function that attempts to address the
shortcomings of ReLU by combining ideas from ReLU and sigmoid. The SiLU serves
as a smooth approximation to the ReLU and was originally introduced in
[Hendrycks et al](https://arxiv.org/abs/1606.08415).\n",
"\n",
- "The swish function is given as \n",
+ "The silu function is given as \n",
"\n",
- "$$ swish(x) = x\\cdot\\sigma(\\beta x)$$\n",
+ "$$ silu(x) = x\\cdot\\sigma(x)$$\n",
"\n",
- "where $\\sigma$ is the sigmoid activation function $\\sigma(x) =
\\frac{1}{1 + e^{-x}}$ described above and $\\beta$ is a hyperparameter set to
1 by default in MXNet."
+ "where $\\sigma$ is the sigmoid activation function $\\sigma(x) =
\\frac{1}{1 + e^{-x}}$ described above."
]
},
{
@@ -340,16 +340,41 @@
"metadata": {},
"outputs": [],
"source": [
- "visualize_activation(mx.gluon.nn.Swish())"
+ "visualize_activation(mx.gluon.nn.SiLU())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "\n",
+ "\n",
"\n",
+ "### GELU\n",
+ "The GELU is a smooth approximation to the ReLU and was introduced in
[Hendrycks et al](https://arxiv.org/abs/1606.08415). It is a common activation
function in architectures such as Transformers, BERT, and GPT.\n",
"\n",
+ "The gelu function is given as \n",
+ "\n",
+ "$$ gelu(x) = x\\cdot\\Phi(x),$$\n",
+ "\n",
+ "whereas the ReLU can be written as $x\\cdot\\mathbf{1}(x>0)$, so $Phi(x)$
serves as a smooth approximation to the ReLU's indicator function.\n",
+ "\n",
+ "Note $\\Phi(x) = \\frac{1}{\\sqrt{2 \\pi}}
\\exp\\left\\{-\\frac{x^2}{2}\\right\\}$ is the standard normal cumulative
distribution."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "visualize_activation(mx.gluon.nn.GELU())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
"\n",
"## Summary\n",
"\n",
@@ -358,7 +383,7 @@
"* Sigmoids like the logistic (sigmoid) function and tanh where the first
kinds of activation functions used in neural networks. They have since fallen
out of use because of their tendency to saturate and have vanishing
gradients.\n",
"* Rectifiers like ReLU do not saturate like the Sigmoids and so address
the vanishing gradient problem making them the de facto activation functions.
ReLU however is still plagued by the dying ReLU problem.\n",
"* LeakyReLU and PReLU are two similar approaches to improve ReLU and
address the dying ReLU by introducing a parameter $\\alpha$ (learned in PReLU)
that leaks to the gradient of negative inputs\n",
- "* MXNet also implements custom state-of-the-art activations like ELU,
SELU and Swish.\n",
+ "* MXNet also implements custom state-of-the-art activations like ELU,
SELU, SiLU, and GELU.\n",
"\n",
"\n",
"\n",
diff --git a/versions/master/api/python/docs/api/gluon/nn/index.html
b/versions/master/api/python/docs/api/gluon/nn/index.html
index 106f97b..24069ae 100644
--- a/versions/master/api/python/docs/api/gluon/nn/index.html
+++ b/versions/master/api/python/docs/api/gluon/nn/index.html
@@ -1583,7 +1583,13 @@ two modules:</p>
<td><p>Scaled Exponential Linear Unit (SELU)</p></td>
</tr>
<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.Swish" title="mxnet.gluon.nn.Swish"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">nn.Swish</span></code></a></p></td>
-<td><p>Swish Activation function</p></td>
+<td><p>Swish Activation function (SiLU with a hyperparameter)</p></td>
+</tr>
+<tr class="row-even"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SiLU" title="mxnet.gluon.nn.SiLU"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">nn.SiLU</span></code></a></p></td>
+<td><p>Sigmoid Linear Units</p></td>
+</tr>
+<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.GELU" title="mxnet.gluon.nn.GELU"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">nn.GELU</span></code></a></p></td>
+<td><p>Gaussian Exponential Linear Unit (GELU)</p></td>
</tr>
</tbody>
</table>
@@ -1742,13 +1748,16 @@ two modules:</p>
<tr class="row-even"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.Sequential" title="mxnet.gluon.nn.Sequential"><code
class="xref py py-obj docutils literal notranslate"><span
class="pre">Sequential</span></code></a>()</p></td>
<td><p>Stacks Blocks sequentially.</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.Swish" title="mxnet.gluon.nn.Swish"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">Swish</span></code></a>([beta])</p></td>
-<td><p>Swish Activation function</p></td>
+<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SiLU" title="mxnet.gluon.nn.SiLU"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">SiLU</span></code></a>(**kwargs)</p></td>
+<td><p>Sigmoid Linear Units</p></td>
+</tr>
+<tr class="row-even"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.Swish" title="mxnet.gluon.nn.Swish"><code class="xref py
py-obj docutils literal notranslate"><span
class="pre">Swish</span></code></a>([beta])</p></td>
+<td><p>Swish Activation function (SiLU with a hyperparameter)</p></td>
</tr>
-<tr class="row-even"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SymbolBlock" title="mxnet.gluon.nn.SymbolBlock"><code
class="xref py py-obj docutils literal notranslate"><span
class="pre">SymbolBlock</span></code></a>(outputs, inputs[, params])</p></td>
+<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SymbolBlock" title="mxnet.gluon.nn.SymbolBlock"><code
class="xref py py-obj docutils literal notranslate"><span
class="pre">SymbolBlock</span></code></a>(outputs, inputs[, params])</p></td>
<td><p>Construct block from symbol.</p></td>
</tr>
-<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SyncBatchNorm" title="mxnet.gluon.nn.SyncBatchNorm"><code
class="xref py py-obj docutils literal notranslate"><span
class="pre">SyncBatchNorm</span></code></a>([in_channels, num_devices,
…])</p></td>
+<tr class="row-even"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SyncBatchNorm" title="mxnet.gluon.nn.SyncBatchNorm"><code
class="xref py py-obj docutils literal notranslate"><span
class="pre">SyncBatchNorm</span></code></a>([in_channels, num_devices,
…])</p></td>
<td><p>Cross-GPU Synchronized Batch normalization (SyncBN)</p></td>
</tr>
</tbody>
@@ -4918,11 +4927,63 @@ non-hybrid children.</p>
</dd></dl>
<dl class="class">
+<dt id="mxnet.gluon.nn.SiLU">
+<em class="property">class </em><code class="sig-name
descname">SiLU</code><span class="sig-paren">(</span><em
class="sig-param">**kwargs</em><span class="sig-paren">)</span><a
class="reference internal"
href="../../../_modules/mxnet/gluon/nn/activations.html#SiLU"><span
class="viewcode-link">[source]</span></a><a class="headerlink"
href="#mxnet.gluon.nn.SiLU" title="Permalink to this definition">¶</a></dt>
+<dd><p>Bases: <code class="xref py py-class docutils literal
notranslate"><span class="pre">mxnet.gluon.block.HybridBlock</span></code></p>
+<dl class="simple">
+<dt>Sigmoid Linear Units</dt><dd><p>Originally proposed “Gaussian Error Linear
Units (GELUs)”, Hendrycks et al, 2016
+<a class="reference external"
href="https://arxiv.org/abs/1606.08415">https://arxiv.org/abs/1606.08415</a></p>
+</dd>
+</dl>
+<p><strong>Methods</strong></p>
+<table class="longtable docutils align-default">
+<colgroup>
+<col style="width: 10%" />
+<col style="width: 90%" />
+</colgroup>
+<tbody>
+<tr class="row-odd"><td><p><a class="reference internal"
href="#mxnet.gluon.nn.SiLU.hybrid_forward"
title="mxnet.gluon.nn.SiLU.hybrid_forward"><code class="xref py py-obj docutils
literal notranslate"><span class="pre">hybrid_forward</span></code></a>(F,
x)</p></td>
+<td><p>Overrides to construct symbolic graph for this
<cite>Block</cite>.</p></td>
+</tr>
+</tbody>
+</table>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><p><strong>beta</strong> (<em>float</em>) – silu(x) = x
* sigmoid(x)</p>
+</dd>
+</dl>
+<dl class="simple">
+<dt>Inputs:</dt><dd><ul class="simple">
+<li><p><strong>data</strong>: input tensor with arbitrary shape.</p></li>
+</ul>
+</dd>
+<dt>Outputs:</dt><dd><ul class="simple">
+<li><p><strong>out</strong>: output tensor with the same shape as
<cite>data</cite>.</p></li>
+</ul>
+</dd>
+</dl>
+<dl class="method">
+<dt id="mxnet.gluon.nn.SiLU.hybrid_forward">
+<code class="sig-name descname">hybrid_forward</code><span
class="sig-paren">(</span><em class="sig-param">F</em>, <em
class="sig-param">x</em><span class="sig-paren">)</span><a class="reference
internal"
href="../../../_modules/mxnet/gluon/nn/activations.html#SiLU.hybrid_forward"><span
class="viewcode-link">[source]</span></a><a class="headerlink"
href="#mxnet.gluon.nn.SiLU.hybrid_forward" title="Permalink to this
definition">¶</a></dt>
+<dd><p>Overrides to construct symbolic graph for this <cite>Block</cite>.</p>
+<dl class="field-list simple">
+<dt class="field-odd">Parameters</dt>
+<dd class="field-odd"><ul class="simple">
+<li><p><strong>x</strong> (<a class="reference internal"
href="../../legacy/symbol/symbol.html#mxnet.symbol.Symbol"
title="mxnet.symbol.Symbol"><em>Symbol</em></a><em> or </em><a class="reference
internal" href="../../legacy/ndarray/ndarray.html#mxnet.ndarray.NDArray"
title="mxnet.ndarray.NDArray"><em>NDArray</em></a>) – The first input
tensor.</p></li>
+<li><p><strong>*args</strong> (<em>list of Symbol</em><em> or </em><em>list of
NDArray</em>) – Additional input tensors.</p></li>
+</ul>
+</dd>
+</dl>
+</dd></dl>
+
+</dd></dl>
+
+<dl class="class">
<dt id="mxnet.gluon.nn.Swish">
<em class="property">class </em><code class="sig-name
descname">Swish</code><span class="sig-paren">(</span><em
class="sig-param">beta=1.0</em>, <em class="sig-param">**kwargs</em><span
class="sig-paren">)</span><a class="reference internal"
href="../../../_modules/mxnet/gluon/nn/activations.html#Swish"><span
class="viewcode-link">[source]</span></a><a class="headerlink"
href="#mxnet.gluon.nn.Swish" title="Permalink to this definition">¶</a></dt>
<dd><p>Bases: <code class="xref py py-class docutils literal
notranslate"><span class="pre">mxnet.gluon.block.HybridBlock</span></code></p>
<dl class="simple">
-<dt>Swish Activation function</dt><dd><p><a class="reference external"
href="https://arxiv.org/pdf/1710.05941.pdf">https://arxiv.org/pdf/1710.05941.pdf</a></p>
+<dt>Swish Activation function (SiLU with a hyperparameter)</dt><dd><p><a
class="reference external"
href="https://arxiv.org/pdf/1710.05941.pdf">https://arxiv.org/pdf/1710.05941.pdf</a></p>
</dd>
</dl>
<p><strong>Methods</strong></p>
diff --git a/versions/master/api/python/docs/genindex.html
b/versions/master/api/python/docs/genindex.html
index 790f518..36e8330 100644
--- a/versions/master/api/python/docs/genindex.html
+++ b/versions/master/api/python/docs/genindex.html
@@ -4544,6 +4544,8 @@ Edit on Github
</li>
<li><a
href="api/gluon/loss/index.html#mxnet.gluon.loss.SigmoidBinaryCrossEntropyLoss.hybrid_forward">(SigmoidBinaryCrossEntropyLoss
method)</a>
</li>
+ <li><a
href="api/gluon/nn/index.html#mxnet.gluon.nn.SiLU.hybrid_forward">(SiLU
method)</a>
+</li>
<li><a
href="api/gluon/loss/index.html#mxnet.gluon.loss.SoftmaxCrossEntropyLoss.hybrid_forward">(SoftmaxCrossEntropyLoss
method)</a>
</li>
<li><a
href="api/gluon/loss/index.html#mxnet.gluon.loss.SquaredHingeLoss.hybrid_forward">(SquaredHingeLoss
method)</a>
@@ -7897,6 +7899,8 @@ Edit on Github
<li><a
href="api/legacy/symbol/op/index.html#mxnet.symbol.op.signum_update">(in module
mxnet.symbol.op)</a>
</li>
</ul></li>
+ <li><a href="api/gluon/nn/index.html#mxnet.gluon.nn.SiLU">SiLU (class in
mxnet.gluon.nn)</a>
+</li>
<li><a
href="api/gluon/data/index.html#mxnet.gluon.data.SimpleDataset">SimpleDataset
(class in mxnet.gluon.data)</a>
</li>
<li><a href="api/legacy/ndarray/ndarray.html#mxnet.ndarray.sin">sin()
(in module mxnet.ndarray)</a>
diff --git a/versions/master/api/python/docs/objects.inv
b/versions/master/api/python/docs/objects.inv
index be88475..a4f66a4 100644
Binary files a/versions/master/api/python/docs/objects.inv and
b/versions/master/api/python/docs/objects.inv differ
diff --git a/versions/master/api/python/docs/searchindex.js
b/versions/master/api/python/docs/searchindex.js
index a65f605..3dd6047 100644
--- a/versions/master/api/python/docs/searchindex.js
+++ b/versions/master/api/python/docs/searchindex.js
@@ -1 +1 @@
-Search.setIndex({docnames:["api/autograd/index","api/context/index","api/contrib/index","api/contrib/io/index","api/contrib/ndarray/index","api/contrib/onnx/index","api/contrib/symbol/index","api/contrib/tensorboard/index","api/contrib/tensorrt/index","api/contrib/text/index","api/engine/index","api/executor/index","api/gluon/block","api/gluon/constant","api/gluon/contrib/index","api/gluon/data/index","api/gluon/data/vision/datasets/index","api/gluon/data/vision/index","api/gluon/data/vi
[...]
\ No newline at end of file
+Search.setIndex({docnames:["api/autograd/index","api/context/index","api/contrib/index","api/contrib/io/index","api/contrib/ndarray/index","api/contrib/onnx/index","api/contrib/symbol/index","api/contrib/tensorboard/index","api/contrib/tensorrt/index","api/contrib/text/index","api/engine/index","api/executor/index","api/gluon/block","api/gluon/constant","api/gluon/contrib/index","api/gluon/data/index","api/gluon/data/vision/datasets/index","api/gluon/data/vision/index","api/gluon/data/vi
[...]
\ No newline at end of file
diff --git
a/versions/master/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html
b/versions/master/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html
index ce95976..43d90b3 100644
---
a/versions/master/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html
+++
b/versions/master/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html
@@ -1544,7 +1544,7 @@ div.rendered_html tbody tr:hover {
<h1>Activation Blocks<a class="headerlink" href="#Activation-Blocks"
title="Permalink to this headline">¶</a></h1>
<p>Deep neural networks are a way to express a nonlinear function with lots of
parameters from input data to outputs. The nonlinearities that allow neural
networks to capture complex patterns in data are referred to as activation
functions. Over the course of the development of neural networks, several
nonlinear activation functions have been introduced to make gradient-based deep
learning tractable.</p>
<p>If you are looking to answer the question, ‘which activation function
should I use for my neural network model?’, you should probably go with
<em>ReLU</em>. Unless you’re trying to implement something like a gating
mechanism, like in LSTMs or GRU cells, then you should opt for sigmoid and/or
tanh in those cells. However, if you have a working model architecture and
you’re trying to improve its performance by swapping out activation functions
or treating the activation function as a hy [...]
-you may want to try hand-designed activations like SELU or a function
discovered by reinforcement learning and exhaustive search like Swish. This
guide describes these activation functions and others implemented in MXNet in
detail.</p>
+you may want to try hand-designed activations like SELU, SiLU, or GELU. This
guide describes these activation functions and others implemented in MXNet in
detail.</p>
<div class="section" id="Visualizing-Activations">
<h2>Visualizing Activations<a class="headerlink"
href="#Visualizing-Activations" title="Permalink to this headline">¶</a></h2>
<p>In order to compare the various activation functions and to understand the
nuances of their differences we have a snippet of code to plot the activation
functions (used in the forward pass) and their gradients (used in the backward
pass).</p>
@@ -1764,23 +1764,42 @@ addressed by ensuring that the tuning the learning rate
to ensure that it’s no
</div>
<p><img alt="selu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/selu.png" /></p>
</div>
-<div class="section" id="Swish">
-<h3>Swish<a class="headerlink" href="#Swish" title="Permalink to this
headline">¶</a></h3>
-<p>Swish is an activation function that attempts to address the shortcomings
of ReLU by combining ideas from ReLU and sigmoid. Swish was discovered by
searching the space of activation functions using a combination of exhaustive
and reinforcement learning-based search and was introduced in the paper by <a
class="reference external"
href="https://arxiv.org/pdf/1710.05941.pdf">Ramchandran et al</a>.</p>
-<p>The swish function is given as</p>
+<div class="section" id="SiLU">
+<h3>SiLU<a class="headerlink" href="#SiLU" title="Permalink to this
headline">¶</a></h3>
+<p>The SiLU is an activation function that attempts to address the
shortcomings of ReLU by combining ideas from ReLU and sigmoid. The SiLU serves
as a smooth approximation to the ReLU and was originally introduced in <a
class="reference external" href="https://arxiv.org/abs/1606.08415">Hendrycks et
al</a>.</p>
+<p>The silu function is given as</p>
<div class="math notranslate nohighlight">
-\[swish(x) = x\cdot\sigma(\beta x)\]</div>
-<p>where <span class="math notranslate nohighlight">\(\sigma\)</span> is the
sigmoid activation function <span class="math notranslate
nohighlight">\(\sigma(x) = \frac{1}{1 + e^{-x}}\)</span> described above and
<span class="math notranslate nohighlight">\(\beta\)</span> is a hyperparameter
set to 1 by default in MXNet.</p>
+\[silu(x) = x\cdot\sigma(x)\]</div>
+<p>where <span class="math notranslate nohighlight">\(\sigma\)</span> is the
sigmoid activation function <span class="math notranslate
nohighlight">\(\sigma(x) = \frac{1}{1 + e^{-x}}\)</span> described above.</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div
class="highlight"><pre><span></span>[ ]:
</pre></div>
</div>
<div class="input_area highlight-python notranslate"><div
class="highlight"><pre>
-<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">Swish</span><span class="p">())</span>
+<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">SiLU</span><span class="p">())</span>
</pre></div>
</div>
</div>
-<p><img alt="swish activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/swish.png" /></p>
+<p><img alt="silu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/silu.png" /></p>
+</div>
+<div class="section" id="GELU">
+<h3>GELU<a class="headerlink" href="#GELU" title="Permalink to this
headline">¶</a></h3>
+<p>The GELU is a smooth approximation to the ReLU and was introduced in <a
class="reference external" href="https://arxiv.org/abs/1606.08415">Hendrycks et
al</a>. It is a common activation function in architectures such as
Transformers, BERT, and GPT.</p>
+<p>The gelu function is given as</p>
+<div class="math notranslate nohighlight">
+\[gelu(x) = x\cdot\Phi(x),\]</div>
+<p>whereas the ReLU can be written as <span class="math notranslate
nohighlight">\(x\cdot\mathbf{1}(x>0)\)</span>, so <span class="math
notranslate nohighlight">\(\Phi(x)\)</span> serves as a smooth approximation to
the ReLU’s indicator function.</p>
+<p>Note <span class="math notranslate nohighlight">\(\Phi(x) =
\int_{-\infty}^{x} \frac{1}{\sqrt{2 \pi}} \exp\left\{-\frac{t^2}{2}\right\}
dt\)</span> is the standard normal cumulative distribution function.</p>
+<div class="nbinput nblast docutils container">
+<div class="prompt highlight-none notranslate"><div
class="highlight"><pre><span></span>[ ]:
+</pre></div>
+</div>
+<div class="input_area highlight-python notranslate"><div
class="highlight"><pre>
+<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">GELU</span><span class="p">())</span>
+</pre></div>
+</div>
+</div>
+<p><img alt="gelu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/gelu.png" /></p>
</div>
</div>
<div class="section" id="Summary">
@@ -1791,7 +1810,7 @@ addressed by ensuring that the tuning the learning rate
to ensure that it’s no
<li><p>Sigmoids like the logistic (sigmoid) function and tanh where the first
kinds of activation functions used in neural networks. They have since fallen
out of use because of their tendency to saturate and have vanishing
gradients.</p></li>
<li><p>Rectifiers like ReLU do not saturate like the Sigmoids and so address
the vanishing gradient problem making them the de facto activation functions.
ReLU however is still plagued by the dying ReLU problem.</p></li>
<li><p>LeakyReLU and PReLU are two similar approaches to improve ReLU and
address the dying ReLU by introducing a parameter <span class="math notranslate
nohighlight">\(\alpha\)</span> (learned in PReLU) that leaks to the gradient of
negative inputs</p></li>
-<li><p>MXNet also implements custom state-of-the-art activations like ELU,
SELU and Swish.</p></li>
+<li><p>MXNet also implements custom state-of-the-art activations like ELU,
SELU, SiLU, and GELU.</p></li>
</ul>
</div>
<div class="section" id="Next-Steps">
@@ -1836,7 +1855,8 @@ guide</a> to learn how to implement your own custom
activation layer.</p>
<li><a class="reference internal" href="#PReLU">PReLU</a></li>
<li><a class="reference internal" href="#ELU">ELU</a></li>
<li><a class="reference internal" href="#SELU">SELU</a></li>
-<li><a class="reference internal" href="#Swish">Swish</a></li>
+<li><a class="reference internal" href="#SiLU">SiLU</a></li>
+<li><a class="reference internal" href="#GELU">GELU</a></li>
</ul>
</li>
<li><a class="reference internal" href="#Summary">Summary</a></li>
diff --git
a/versions/master/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html.bak
b/versions/master/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html.bak
index 8427e78..a674337 100644
---
a/versions/master/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html.bak
+++
b/versions/master/api/python/docs/tutorials/packages/gluon/blocks/activations/activations.html.bak
@@ -1544,7 +1544,7 @@ div.rendered_html tbody tr:hover {
<h1>Activation Blocks<a class="headerlink" href="#Activation-Blocks"
title="Permalink to this headline">¶</a></h1>
<p>Deep neural networks are a way to express a nonlinear function with lots of
parameters from input data to outputs. The nonlinearities that allow neural
networks to capture complex patterns in data are referred to as activation
functions. Over the course of the development of neural networks, several
nonlinear activation functions have been introduced to make gradient-based deep
learning tractable.</p>
<p>If you are looking to answer the question, ‘which activation function
should I use for my neural network model?’, you should probably go with
<em>ReLU</em>. Unless you’re trying to implement something like a gating
mechanism, like in LSTMs or GRU cells, then you should opt for sigmoid and/or
tanh in those cells. However, if you have a working model architecture and
you’re trying to improve its performance by swapping out activation functions
or treating the activation function as a hy [...]
-you may want to try hand-designed activations like SELU or a function
discovered by reinforcement learning and exhaustive search like Swish. This
guide describes these activation functions and others implemented in MXNet in
detail.</p>
+you may want to try hand-designed activations like SELU, SiLU, or GELU. This
guide describes these activation functions and others implemented in MXNet in
detail.</p>
<div class="section" id="Visualizing-Activations">
<h2>Visualizing Activations<a class="headerlink"
href="#Visualizing-Activations" title="Permalink to this headline">¶</a></h2>
<p>In order to compare the various activation functions and to understand the
nuances of their differences we have a snippet of code to plot the activation
functions (used in the forward pass) and their gradients (used in the backward
pass).</p>
@@ -1764,23 +1764,42 @@ addressed by ensuring that the tuning the learning rate
to ensure that it’s no
</div>
<p><img alt="selu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/selu.png" /></p>
</div>
-<div class="section" id="Swish">
-<h3>Swish<a class="headerlink" href="#Swish" title="Permalink to this
headline">¶</a></h3>
-<p>Swish is an activation function that attempts to address the shortcomings
of ReLU by combining ideas from ReLU and sigmoid. Swish was discovered by
searching the space of activation functions using a combination of exhaustive
and reinforcement learning-based search and was introduced in the paper by <a
class="reference external"
href="https://arxiv.org/pdf/1710.05941.pdf">Ramchandran et al</a>.</p>
-<p>The swish function is given as</p>
+<div class="section" id="SiLU">
+<h3>SiLU<a class="headerlink" href="#SiLU" title="Permalink to this
headline">¶</a></h3>
+<p>The SiLU is an activation function that attempts to address the
shortcomings of ReLU by combining ideas from ReLU and sigmoid. The SiLU serves
as a smooth approximation to the ReLU and was originally introduced in <a
class="reference external" href="https://arxiv.org/abs/1606.08415">Hendrycks et
al</a>.</p>
+<p>The silu function is given as</p>
<div class="math notranslate nohighlight">
-\[swish(x) = x\cdot\sigma(\beta x)\]</div>
-<p>where <span class="math notranslate nohighlight">\(\sigma\)</span> is the
sigmoid activation function <span class="math notranslate
nohighlight">\(\sigma(x) = \frac{1}{1 + e^{-x}}\)</span> described above and
<span class="math notranslate nohighlight">\(\beta\)</span> is a hyperparameter
set to 1 by default in MXNet.</p>
+\[silu(x) = x\cdot\sigma(x)\]</div>
+<p>where <span class="math notranslate nohighlight">\(\sigma\)</span> is the
sigmoid activation function <span class="math notranslate
nohighlight">\(\sigma(x) = \frac{1}{1 + e^{-x}}\)</span> described above.</p>
<div class="nbinput nblast docutils container">
<div class="prompt highlight-none notranslate"><div
class="highlight"><pre><span></span>[ ]:
</pre></div>
</div>
<div class="input_area highlight-python notranslate"><div
class="highlight"><pre>
-<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">Swish</span><span class="p">())</span>
+<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">SiLU</span><span class="p">())</span>
</pre></div>
</div>
</div>
-<p><img alt="swish activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/swish.png" /></p>
+<p><img alt="silu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/silu.png" /></p>
+</div>
+<div class="section" id="GELU">
+<h3>GELU<a class="headerlink" href="#GELU" title="Permalink to this
headline">¶</a></h3>
+<p>The GELU is a smooth approximation to the ReLU and was introduced in <a
class="reference external" href="https://arxiv.org/abs/1606.08415">Hendrycks et
al</a>. It is a common activation function in architectures such as
Transformers, BERT, and GPT.</p>
+<p>The gelu function is given as</p>
+<div class="math notranslate nohighlight">
+\[gelu(x) = x\cdot\Phi(x),\]</div>
+<p>whereas the ReLU can be written as <span class="math notranslate
nohighlight">\(x\cdot\mathbf{1}(x>0)\)</span>, so <span class="math
notranslate nohighlight">\(\Phi(x)\)</span> serves as a smooth approximation to
the ReLU’s indicator function.</p>
+<p>Note <span class="math notranslate nohighlight">\(\Phi(x) =
\int_{-\infty}^{x} \frac{1}{\sqrt{2 \pi}} \exp\left\{-\frac{t^2}{2}\right\}
dt\)</span> is the standard normal cumulative distribution function.</p>
+<div class="nbinput nblast docutils container">
+<div class="prompt highlight-none notranslate"><div
class="highlight"><pre><span></span>[ ]:
+</pre></div>
+</div>
+<div class="input_area highlight-python notranslate"><div
class="highlight"><pre>
+<span></span><span class="n">visualize_activation</span><span
class="p">(</span><span class="n">mx</span><span class="o">.</span><span
class="n">gluon</span><span class="o">.</span><span class="n">nn</span><span
class="o">.</span><span class="n">GELU</span><span class="p">())</span>
+</pre></div>
+</div>
+</div>
+<p><img alt="gelu activation and gradient"
src="tutorials/packages/gluon/blocks/activations/images/gelu.png" /></p>
</div>
</div>
<div class="section" id="Summary">
@@ -1791,7 +1810,7 @@ addressed by ensuring that the tuning the learning rate
to ensure that it’s no
<li><p>Sigmoids like the logistic (sigmoid) function and tanh where the first
kinds of activation functions used in neural networks. They have since fallen
out of use because of their tendency to saturate and have vanishing
gradients.</p></li>
<li><p>Rectifiers like ReLU do not saturate like the Sigmoids and so address
the vanishing gradient problem making them the de facto activation functions.
ReLU however is still plagued by the dying ReLU problem.</p></li>
<li><p>LeakyReLU and PReLU are two similar approaches to improve ReLU and
address the dying ReLU by introducing a parameter <span class="math notranslate
nohighlight">\(\alpha\)</span> (learned in PReLU) that leaks to the gradient of
negative inputs</p></li>
-<li><p>MXNet also implements custom state-of-the-art activations like ELU,
SELU and Swish.</p></li>
+<li><p>MXNet also implements custom state-of-the-art activations like ELU,
SELU, SiLU, and GELU.</p></li>
</ul>
</div>
<div class="section" id="Next-Steps">
@@ -1836,7 +1855,8 @@ guide</a> to learn how to implement your own custom
activation layer.</p>
<li><a class="reference internal" href="#PReLU">PReLU</a></li>
<li><a class="reference internal" href="#ELU">ELU</a></li>
<li><a class="reference internal" href="#SELU">SELU</a></li>
-<li><a class="reference internal" href="#Swish">Swish</a></li>
+<li><a class="reference internal" href="#SiLU">SiLU</a></li>
+<li><a class="reference internal" href="#GELU">GELU</a></li>
</ul>
</li>
<li><a class="reference internal" href="#Summary">Summary</a></li>
diff --git a/versions/master/feed.xml b/versions/master/feed.xml
index 3fc6f78..486d677 100644
--- a/versions/master/feed.xml
+++ b/versions/master/feed.xml
@@ -1 +1 @@
-<?xml version="1.0" encoding="utf-8"?><feed
xmlns="http://www.w3.org/2005/Atom" ><generator uri="https://jekyllrb.com/"
version="4.0.0">Jekyll</generator><link
href="https://mxnet.apache.org/versions/master/feed.xml" rel="self"
type="application/atom+xml" /><link
href="https://mxnet.apache.org/versions/master/" rel="alternate"
type="text/html"
/><updated>2020-11-09T12:32:51+00:00</updated><id>https://mxnet.apache.org/versions/master/feed.xml</id><title
type="html">Apache MXNet</title><su [...]
\ No newline at end of file
+<?xml version="1.0" encoding="utf-8"?><feed
xmlns="http://www.w3.org/2005/Atom" ><generator uri="https://jekyllrb.com/"
version="4.0.0">Jekyll</generator><link
href="https://mxnet.apache.org/versions/master/feed.xml" rel="self"
type="application/atom+xml" /><link
href="https://mxnet.apache.org/versions/master/" rel="alternate"
type="text/html"
/><updated>2020-11-09T18:33:17+00:00</updated><id>https://mxnet.apache.org/versions/master/feed.xml</id><title
type="html">Apache MXNet</title><su [...]
\ No newline at end of file