anirudhacharya closed pull request #12748: [MXNET-1028] Adding CELU Activation function
URL: https://github.com/apache/incubator-mxnet/pull/12748
This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance:
diff --git a/docs/api/python/gluon/nn.md b/docs/api/python/gluon/nn.md
index 25c82f06668..3e2f786c0ef 100644
--- a/docs/api/python/gluon/nn.md
+++ b/docs/api/python/gluon/nn.md
@@ -79,6 +79,7 @@ This document lists the neural network blocks in Gluon:
PReLU
ELU
SELU
+ CELU
Swish
```
diff --git a/python/mxnet/gluon/nn/activations.py b/python/mxnet/gluon/nn/activations.py
index fa8eee9d298..4d0369317d2 100644
--- a/python/mxnet/gluon/nn/activations.py
+++ b/python/mxnet/gluon/nn/activations.py
@@ -18,7 +18,7 @@
# coding: utf-8
# pylint: disable= arguments-differ
"""Basic neural network layers."""
-__all__ = ['Activation', 'LeakyReLU', 'PReLU', 'ELU', 'SELU', 'Swish']
+__all__ = ['Activation', 'LeakyReLU', 'PReLU', 'ELU', 'SELU', 'CELU', 'Swish']
from ... import initializer
from ..block import HybridBlock
@@ -181,6 +181,39 @@ def hybrid_forward(self, F, x):
return F.LeakyReLU(x, act_type='selu', name='fwd')
+class CELU(HybridBlock):
+ r"""
+ Continuous Exponential Linear Unit. (CELU)
+ https://arxiv.org/abs/1704.07483
+
+ .. math::
+
+ f\left(x\right) = \left\{
+ \begin{array}{lr}
+ \alpha (\exp(x/\alpha) - 1) & : x \lt 0 \\
+ x & : x \geq 0 \\
+ \end{array}
+ \right.\\
+
+ Parameters
+ ----------
+ alpha : float
+ slope coefficient for the negative half axis.
+
+ Inputs:
+ - **data**: input tensor with arbitrary shape.
+
+ Outputs:
+ - **out**: output tensor with the same shape as `data`.
+ """
+ def __init__(self, alpha=0.25, **kwargs):
+ super(CELU, self).__init__(**kwargs)
+ self._alpha = alpha
+
+ def hybrid_forward(self, F, x):
+ return F.LeakyReLU(x, slope=self._alpha, act_type='celu', name='fwd')
+
+
class Swish(HybridBlock):
r"""
Swish Activation function
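For reference, the new Gluon block added above should behave roughly as in the sketch below; this is an illustrative check against the docstring's formula, not part of the diff (alpha=0.25 is the constructor default shown above):

    import numpy as np
    import mxnet as mx

    celu = mx.gluon.nn.CELU(alpha=0.25)      # parameter-free block, no initialize() needed
    x = mx.nd.array([-2.0, -0.5, 0.0, 1.5])
    y = celu(x).asnumpy()

    # docstring formula: f(x) = x for x >= 0, alpha * (exp(x/alpha) - 1) otherwise
    xa = x.asnumpy()
    ref = np.where(xa >= 0, xa, 0.25 * np.expm1(xa / 0.25))
    assert np.allclose(y, ref, atol=1e-6)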
diff --git a/src/operator/leaky_relu-inl.h b/src/operator/leaky_relu-inl.h
index fe2668959af..df0c7b694d4 100644
--- a/src/operator/leaky_relu-inl.h
+++ b/src/operator/leaky_relu-inl.h
@@ -47,7 +47,7 @@ namespace op {
namespace leakyrelu {
enum LeakyReLUOpInputs {kData, kGamma};
enum LeakyReLUOpOutputs {kOut, kMask};
-enum LeakyReLUOpType {kLeakyReLU, kPReLU, kRReLU, kELU, kSELU};
+enum LeakyReLUOpType {kLeakyReLU, kPReLU, kRReLU, kELU, kSELU, kCELU};
enum LeakyReLUOpResource {kRandom};
} // namespace leakyrelu
@@ -64,9 +64,10 @@ struct LeakyReLUParam : public dmlc::Parameter<LeakyReLUParam> {
.add_enum("prelu", leakyrelu::kPReLU)
.add_enum("elu", leakyrelu::kELU)
.add_enum("selu", leakyrelu::kSELU)
+ .add_enum("celu", leakyrelu::kCELU)
.describe("Activation function to be applied.");
DMLC_DECLARE_FIELD(slope).set_default(0.25f)
- .describe("Init slope for the activation. (For leaky and elu only)");
+ .describe("Init slope for the activation. (For leaky, elu and celu only)");
DMLC_DECLARE_FIELD(lower_bound).set_default(0.125f)
.describe("Lower bound of random slope. (For rrelu only)");
DMLC_DECLARE_FIELD(upper_bound).set_default(0.334f)
@@ -190,8 +191,16 @@ class LeakyReLUOp : public Operator {
});
break;
}
+ case leakyrelu::kCELU: {
+ MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kOut], Req, {
+ mxnet_op::Kernel<mxnet_op::op_with_req<mshadow_op::celu, Req>, xpu>::Launch(
+ s, out.size(0) * out.size(1) * out.size(2), out.dptr_, data.dptr_, DType(param_.slope));
+ });
+ break;
+ }
default:
- LOG(FATAL) << "Not implmented";
+ LOG(FATAL) << "Not implemented";
}
}
@@ -287,8 +296,17 @@ class LeakyReLUOp : public Operator {
});
break;
}
+ case leakyrelu::kCELU: {
+ MXNET_ASSIGN_REQ_SWITCH(req[leakyrelu::kData], Req, {
+ mxnet_op::Kernel<mxnet_op::op_with_req<
+ mxnet_op::backward_grad_tuned<mshadow_op::celu_grad>, Req>, xpu>::Launch(
+ s, gdata.size(0) * gdata.size(1) * gdata.size(2), gdata.dptr_, grad.dptr_,
+ output.dptr_, DType(param_.slope));
+ });
+ break;
+ }
default:
- LOG(FATAL) << "Not implmented";
+ LOG(FATAL) << "Not implemented";
}
}
diff --git a/src/operator/leaky_relu.cc b/src/operator/leaky_relu.cc
index 45f9511c908..a07500ac548 100644
--- a/src/operator/leaky_relu.cc
+++ b/src/operator/leaky_relu.cc
@@ -61,6 +61,8 @@ The following modified ReLU Activation functions are supported:
- *rrelu*: Randomized ReLU. same as *leaky* but the `slope` is uniformly and randomly chosen from
*[lower_bound, upper_bound)* for training, while fixed to be
*(lower_bound+upper_bound)/2* for inference.
+- *celu*: Continuous exponential linear unit. `y = x >= 0 ? x : slope * (exp(x/slope)-1)` as
+ described in https://arxiv.org/abs/1704.07483
)code" ADD_FILELINE)
.add_argument("data", "NDArray-or-Symbol", "Input data to activation function.")
diff --git a/src/operator/mshadow_op.h b/src/operator/mshadow_op.h
index 06a223dda39..60eb59f22d0 100644
--- a/src/operator/mshadow_op.h
+++ b/src/operator/mshadow_op.h
@@ -148,6 +148,11 @@ MXNET_BINARY_MATH_OP_NC(elu, a > DType(0) ? a :
MXNET_BINARY_MATH_OP_NC(elu_grad, a > DType(0) ? DType(1) : DType(b + a));
+MXNET_BINARY_MATH_OP_NC(celu, a >= DType(0) ? a :
+ DType(math::id(b) * math::expm1(a/b)));
+
+MXNET_BINARY_MATH_OP_NC(celu_grad, a >= DType(0) ? DType(1) : DType((b + a) / b));
+
MXNET_SIMPLE_UNARY_MATH_OP(tanh);
MXNET_UNARY_MATH_OP(tanh_grad, 1.0f - math::sqr(a));
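Note that celu_grad is written in terms of the forward output rather than the input: for x < 0 the forward value is y = b * (exp(x/b) - 1) with b the slope, so dy/dx = exp(x/b) = (y + b) / b, which is the (b + a) / b above with a bound to the output; for x >= 0 the derivative is 1. A quick finite-difference check of that identity (illustrative only, not part of the diff):

    import numpy as np

    slope, x, eps = 0.25, -0.8, 1e-6
    y = slope * np.expm1(x / slope)
    numeric = (slope * np.expm1((x + eps) / slope) - y) / eps
    analytic = (y + slope) / slope    # equals exp(x / slope)
    assert abs(numeric - analytic) < 1e-4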
diff --git a/src/operator/operator_tune.cc b/src/operator/operator_tune.cc
index cf5412f9824..975db225138 100644
--- a/src/operator/operator_tune.cc
+++ b/src/operator/operator_tune.cc
@@ -320,12 +320,14 @@ IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::power); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::rpower); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::xelu); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::elu); // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::celu); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_grad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::rpower_grad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::power_rgrad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::xelu_grad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::prelu_grad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::elu_grad); // NOLINT()
+IMPLEMENT_BINARY_WORKLOAD_BWD(mxnet::op::mshadow_op::celu_grad); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::maximum); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::minimum); // NOLINT()
IMPLEMENT_BINARY_WORKLOAD_FWD(mxnet::op::mshadow_op::hypot); // NOLINT()
diff --git a/tests/python/unittest/test_gluon.py b/tests/python/unittest/test_gluon.py
index 3049674821c..cd1fd230a46 100644
--- a/tests/python/unittest/test_gluon.py
+++ b/tests/python/unittest/test_gluon.py
@@ -1036,6 +1036,16 @@ def elu(x):
for test_point, ref_point in zip(elu_test(point_to_validate), elu(point_to_validate)):
assert test_point == ref_point
+ celu = mx.gluon.nn.CELU()
+ def celu_test(x):
+ def celu(x):
+ alpha = 0.25
+ return alpha * (mx.nd.exp(x / alpha) - 1) if x < 0 else x
+ return [celu(x_i) for x_i in x]
+
+ for test_point, ref_point in zip(celu_test(point_to_validate), celu(point_to_validate)):
+ assert test_point == ref_point
+
selu = mx.gluon.nn.SELU()
def selu_test(x):
def selu(x):
@@ -1043,7 +1053,7 @@ def selu(x):
return scale * x if x >= 0 else alpha * mx.nd.exp(x) - alpha
return [selu(x_i) for x_i in x]
- for test_point, ref_point in zip(selu(point_to_validate), selu(point_to_validate)):
+ for test_point, ref_point in zip(selu_test(point_to_validate), selu(point_to_validate)):
assert test_point == ref_point
prelu = mx.gluon.nn.PReLU()
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 80a83df45da..64c8fd1ce90 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -735,6 +735,8 @@ def fleaky_relu(x, act_type, slope=0.25):
out[neg_indices] = slope * np.expm1(out[neg_indices])
elif act_type == 'leaky':
out[neg_indices] = slope * out[neg_indices]
+ elif act_type == 'celu':
+ out[neg_indices] = slope * np.expm1(out[neg_indices] / slope)
return out
def fleaky_relu_grad(grad, x, y, act_type, slope=0.25):
neg_indices = x < 0
@@ -743,6 +745,8 @@ def fleaky_relu_grad(grad, x, y, act_type, slope=0.25):
out[neg_indices] = y[neg_indices] + slope
elif act_type == 'leaky':
out[neg_indices] = slope
+ elif act_type == 'celu':
+ out[neg_indices] = (y[neg_indices] + slope) / slope
return out * grad
for ndim in range(1, 4):
shape = rand_shape_nd(ndim)
@@ -754,7 +758,7 @@ def fleaky_relu_grad(grad, x, y, act_type, slope=0.25):
rtol = 1e-2
atol = 1e-3
xa[abs(xa) < eps] = 1.0
- for act_type in ['elu', 'leaky']:
+ for act_type in ['elu', 'leaky', 'celu']:
y = mx.symbol.LeakyReLU(data=x, slope=slp, act_type=act_type)
ya = fleaky_relu(xa, slope=slp, act_type=act_type)
ga = fleaky_relu_grad(np.ones(shape), xa, ya, slope=slp, act_type=act_type)
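Beyond the numeric operator tests above, the backward path could also be exercised end to end through the Gluon block with autograd; a sketch along these lines (not part of the diff, assumes the CELU block added in this PR):

    import numpy as np
    import mxnet as mx
    from mxnet import autograd

    alpha = 0.5
    celu = mx.gluon.nn.CELU(alpha=alpha)
    x = mx.nd.array([-1.0, 0.5])
    x.attach_grad()
    with autograd.record():
        y = celu(x)
    y.backward()

    # expected gradient: exp(x/alpha) for x < 0, 1 for x >= 0
    xa = x.asnumpy()
    expected = np.where(xa >= 0, 1.0, np.exp(xa / alpha))
    assert np.allclose(x.grad.asnumpy(), expected, atol=1e-5)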
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
With regards,
Apache Git Services