This is an automated email from the ASF dual-hosted git repository.
srk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new a5e883e846 [RUNTIME][CLML] Fix for Softmax op for 4D tensors (#16328)
a5e883e846 is described below
commit a5e883e8465e11221d3f22d6ef2f61a1bfa5d1f2
Author: krishnaraj36 <[email protected]>
AuthorDate: Thu Jan 18 12:38:57 2024 +0530
[RUNTIME][CLML] Fix for Softmax op for 4D tensors (#16328)
Fixed the softmax layer for 4D tensors to support the NCHW and NHWC
layout types.
Enabled the relevant test cases for the softmax layer.
---
python/tvm/relay/op/contrib/clml.py | 3 +-
src/runtime/contrib/clml/clml_runtime.cc | 62 ++++++++++++++++-----
tests/python/contrib/test_clml/test_ops.py | 86 ++++++++++++++++--------------
3 files changed, 98 insertions(+), 53 deletions(-)
diff --git a/python/tvm/relay/op/contrib/clml.py
b/python/tvm/relay/op/contrib/clml.py
index 14dd35a3cb..53b022c347 100644
--- a/python/tvm/relay/op/contrib/clml.py
+++ b/python/tvm/relay/op/contrib/clml.py
@@ -437,7 +437,8 @@ def clml_pattern_table():
def check_softmax_op(extract):
call = extract
- if len(call.args[0].checked_type.shape) > 2:
+ # supports 2D and 4D tensors
+ if len(call.args[0].checked_type.shape) not in [2, 4]:
return False
return True
diff --git a/src/runtime/contrib/clml/clml_runtime.cc
b/src/runtime/contrib/clml/clml_runtime.cc
index aa1e2b82b6..8e69cb8bd1 100644
--- a/src/runtime/contrib/clml/clml_runtime.cc
+++ b/src/runtime/contrib/clml/clml_runtime.cc
@@ -511,6 +511,7 @@ class CLMLRuntime : public JSONRuntimeBase {
/*!
* \brief Create an CLML tensor from JSON node entry. Lookup storage map
before creation.
+ * Update input placeholder for NHWC layout
*
* \param nid The node index of graph JSON.
* \param shape shape information of tensor
@@ -528,15 +529,22 @@ class CLMLRuntime : public JSONRuntimeBase {
uint32_t eid = EntryID(nid, 0);
node_data = data_entry_[eid]->data;
}
+
auto clml_tensor = MakeCLMLTensorFromJSONNode(node, layout, dtype,
node_data, shape);
+
this->layer_.storage_map.insert({nid, std::make_pair(clml_tensor,
node)});
if ("input" == node.GetOpType()) {
this->layer_.inputs.insert({nid, this->layer_.storage_map[nid].first});
// Input copy placeholder Tensor
- this->layer_.in_placeholder.insert(
- {nid, MakeCLMLTensorFromJSONNode(node, CL_TENSOR_LAYOUT_NCHW_QCOM,
dtype, node_data,
- shape)});
+ if (layout == CL_TENSOR_LAYOUT_OPTIMAL_QCOM) {
+ this->layer_.in_placeholder.insert(
+ {nid, MakeCLMLTensorFromJSONNode(node,
CL_TENSOR_LAYOUT_NCHW_QCOM, dtype, node_data,
+ shape)});
+ } else {
+ this->layer_.in_placeholder.insert(
+ {nid, MakeCLMLTensorFromJSONNode(node, layout, dtype, node_data,
shape)});
+ }
}
return clml_tensor;
@@ -559,6 +567,7 @@ class CLMLRuntime : public JSONRuntimeBase {
const auto& node = nodes_[nid];
if ("nn.dense" == node.GetOpName()) CreateDenseLayerTensor(&layer_,
node, nid);
if ("nn.batch_matmul" == node.GetOpName())
CreateBatchMatmulLayerTensor(&layer_, node, nid);
+ if ("nn.softmax" == node.GetOpName()) CreateSoftmaxLayerTensor(&layer_,
node, nid);
}
for (nid = 0; nid < nodes_.size(); ++nid) {
@@ -1092,6 +1101,37 @@ class CLMLRuntime : public JSONRuntimeBase {
return;
}
+ /*!
+ * \brief Create a Softmax layer Tensors with supported layout.
+ * \param layer The CLML layer to build. Containing inputs, outputs and the
CLML function.
+ * \param node The JSON representation of the operator.
+ * \param nid The node index of JSON graph node, which points to this
operator.
+ */
+
+ void CreateSoftmaxLayerTensor(CachedLayer* layer, const JSONGraphNode& node,
size_t nid) {
+ cl_ml_tensor_layout_qcom layout;
+ cl_int result = 0;
+ cl_ml_op_qcom op = nullptr;
+ DLDataType tvm_dtype = node.GetOpDataType()[0];
+ cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
+ auto out_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
+ int axis = std::stoi(node.GetAttr<std::vector<std::string>>("axis")[0]);
+ // enabling NHWC layout && NCHW layout for 4D, basis the axis value
+ if (out_dims.h >= 1 && out_dims.w >= 1) {
+ if (axis == 3 || axis == -1) {
+ layout = CL_TENSOR_LAYOUT_NHWC_QCOM;
+ } else {
+ layout = CL_TENSOR_LAYOUT_NCHW_QCOM;
+ }
+ } else { // default layout for 2D
+ layout = CL_TENSOR_LAYOUT_OPTIMAL_QCOM;
+ }
+ auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
layout, cl_dtype);
+
+ return;
+ }
+
/*!
* \brief Create a SoftMax layer.
*
@@ -1100,24 +1140,20 @@ class CLMLRuntime : public JSONRuntimeBase {
* \param nid The node index of JSON graph node, which points to this
operator.
*/
void CreateSoftMaxLayer(CachedLayer* layer, const JSONGraphNode& node,
size_t nid) {
+ cl_ml_tensor_layout_qcom layout;
+ cl_softmax_mode_qcom mode = CL_SOFTMAX_MODE_SPATIAL_QCOM;
cl_int result = 0;
cl_ml_op_qcom op = nullptr;
DLDataType tvm_dtype = node.GetOpDataType()[0];
cl_channel_type cl_dtype = MakeCLDataType(tvm_dtype);
cl_arithmetic_mode_qcom cl_arithmetic_mode = MakeCLArithMode(cl_dtype,
cl_dtype);
- auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
- CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
cl_dtype);
- auto out_dims = GetTensorDims(nodes_[node.GetInputs()[0].id_]);
- auto output = MakeCLMLTensorFromJSONEntry(nid, {out_dims.n, out_dims.c, 1,
1},
- CL_TENSOR_LAYOUT_OPTIMAL_QCOM,
cl_dtype);
-
- cl_ml_op_softmax_desc_qcom softmax_desc =
{CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM,
- CL_SOFTMAX_MODE_INSTANCE_QCOM,
cl_arithmetic_mode};
-
+ auto output = MakeCLMLTensorFromJSONEntry(nid, {}, layout, cl_dtype);
+ auto input = MakeCLMLTensorFromJSONEntry(node.GetInputs()[0].id_, {},
layout, cl_dtype);
+ cl_ml_op_softmax_desc_qcom softmax_desc =
{CL_SOFTMAX_ALGORITHM_ACCURATE_QCOM, mode,
+ cl_arithmetic_mode};
result = CLML_INTF->clCreateMLOpSoftmaxQCOM(CLML_CTX, nullptr,
&softmax_desc, input->tensor,
output->tensor, &op,
layer_.tuning_cache);
ICHECK(op && result == CL_SUCCESS) << "SoftMax Error:" << result;
-
layer->function.push_back(op);
return;
}
diff --git a/tests/python/contrib/test_clml/test_ops.py
b/tests/python/contrib/test_clml/test_ops.py
index 58365bf429..3d89994126 100644
--- a/tests/python/contrib/test_clml/test_ops.py
+++ b/tests/python/contrib/test_clml/test_ops.py
@@ -280,9 +280,9 @@ def test_conv2d(remote, dtype, target, trials,
executor_type):
has_activation=composite[2],
)
outputs = _build_and_run_network(remote, func, params, inputs, target,
executor_type)
- out_rtol = 1e-1 if dtype == "float16" else 1e-5
+ out_tol = 1e-1 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
)
args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype,
out_channels)
exp_codegen = _get_conv_expected_codegen(
@@ -373,9 +373,9 @@ def test_conv2d_transpose(remote, dtype, target, trials,
executor_type):
func = relay.Function([x, w], y)
mod = IRModule.from_expr(func)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-1 if dtype == "float16" else 1e-5
+ out_tol = 1e-1 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
)
args = (
dshape,
@@ -425,9 +425,9 @@ def test_batchnorm(remote, dtype, target, trials,
executor_type):
"a": input_arr,
}
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-3 if dtype == "float16" else 1e-5
+ out_tol = 1e-3 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
)
exp_codegen = [
{
@@ -485,9 +485,9 @@ def test_concat(remote, dtype, target, trials,
executor_type):
func = relay.concatenate((a, b), axis=1)
outputs = _build_and_run_network(remote, func, params, inputs, target,
executor_type)
- out_rtol = 1e-2 if dtype == "float16" else 1e-5
+ out_tol = 1e-2 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
)
exp_codegen = [
@@ -601,9 +601,9 @@ def test_pool(remote, dtype, target, trials, executor_type):
func = relay.nn.avg_pool2d(a, pool_size=pool_size, strides=stride,
padding=padding)
outputs = _build_and_run_network(remote, func, params, inputs, target,
executor_type)
- out_rtol = 1e-2 if dtype == "float16" else 1e-5
+ out_tol = 1e-2 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
)
args = (input_shape, pool_size, stride, padding, pooling_type, dtype)
exp_codegen = _get_pool_expected_codegen(*args)
@@ -690,9 +690,9 @@ def test_dense(remote, dtype, target, trials,
executor_type):
def _verify(out, params, inputs, exp_codegen):
mod = IRModule.from_expr(out)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-1 if dtype == "float16" else 1e-5
+ out_tol = 1e-1 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol,
atol=out_tol
)
verify_codegen(remote, mod, params, exp_codegen, target)
@@ -718,9 +718,9 @@ def test_binary_ops(remote, dtype, target, executor_type):
def _verify(out, params, inputs):
mod = IRModule.from_expr(out)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-2 if dtype == "float16" else 1e-5
+ out_tol = 1e-2 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol,
atol=out_tol
)
exp_codegen = [
{
@@ -776,9 +776,9 @@ def test_unary_ops(remote, dtype, target, executor_type):
def _verify(out, params, inputs):
mod = IRModule.from_expr(out)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-2 if dtype == "float16" else 1e-5
+ out_tol = 1e-2 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol,
atol=out_tol
)
exp_codegen = [
@@ -823,12 +823,11 @@ def test_depth_to_space(remote, dtype, target,
executor_type):
def _verify(out, params, inputs):
mod = IRModule.from_expr(out)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-2 if dtype == "float16" else 1e-5
+ out_tol = 1e-2 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol,
atol=out_tol
)
- # Check to make sure these ops are offloaded to CLML instead of TVM.
exp_codegen = [
{
"attrs": {
@@ -877,12 +876,11 @@ def test_resize_bilinear(remote, dtype, target,
executor_type):
def _verify(out, params, inputs):
mod = IRModule.from_expr(out)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-2 if dtype == "float16" else 1e-5
+ out_tol = 1e-2 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol,
atol=out_tol
)
- # Check to make sure these ops are offloaded to CLML instead of TVM.
exp_codegen = [
{
"attrs": {
@@ -944,12 +942,11 @@ def test_batch_matmul(remote, dtype, target,
executor_type, trials):
def _verify(out, params, inputs):
mod = IRModule.from_expr(out)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-1 if dtype == "float16" else 1e-5
+ out_tol = 1e-1 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol,
atol=out_tol
)
- # Check to make sure these ops are offloaded to CLML instead of TVM.
exp_codegen = [
{
"attrs": {
@@ -1026,20 +1023,30 @@ def test_softmax(remote, dtype, target, executor_type):
params = {}
return out, params, inputs, axis
- def _verify(out, params, inputs, axis):
+ def _verify(out, params, inputs, axis, out_tol):
mod = IRModule.from_expr(out)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-1 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].numpy(), rtol=out_tol,
atol=out_tol
)
args = (inputs, dtype, outputs[0].shape, axis)
exp_codegen = _get_softmax_exp_codegen(*args)
verify_codegen(remote, mod, params, exp_codegen, target)
- _verify(*(_get_model((1, 5), 1)))
- _verify(*(_get_model((1, 1000), 1)))
- _verify(*(_get_model((1, 3), 1)))
+ # 2D Tensor TEST CASES
+ _verify(*(_get_model((1, 5), 1)), 1e-3)
+ _verify(*(_get_model((1, 16), 1)), 1e-3)
+ _verify(*(_get_model((1, 1000), -1)), 1e-3)
+
+ # 4D Tensor TEST CASES layout = NCHW
+ _verify(*(_get_model((1, 100, 64, 100), 1)), 1e-3)
+ _verify(*(_get_model((1, 64, 64, 64), 1)), 1e-3)
+ _verify(*(_get_model((1, 5, 3, 4), 1)), 1e-3)
+
+ # 4D Tensor TEST CASES layout = NHWC
+ _verify(*(_get_model((1, 64, 100, 100), 3)), 1e-1)
+ _verify(*(_get_model((1, 100, 100, 100), 3)), 1e-1)
+ _verify(*(_get_model((1, 64, 5, 32), -1)), 1e-1)
@pytest.mark.parametrize("dtype", ["float32", "float16"])
@@ -1066,9 +1073,9 @@ def test_upsampling(remote, dtype, target, executor_type,
trials):
)
mod = IRModule.from_expr(func)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-2 if dtype == "float16" else 1e-5
+ out_tol = 1e-2 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol,
atol=out_tol
)
exp_codegen = [
{
@@ -1124,9 +1131,9 @@ def test_reshape(remote, dtype, target, executor_type,
trials):
params = {}
mod = IRModule.from_expr(out)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-3 if dtype == "float16" else 1e-5
+ out_tol = 1e-3 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol,
atol=out_tol
)
exp_codegen = [
{
@@ -1223,9 +1230,9 @@ def test_pool_global(remote, dtype, target,
executor_type, trials):
func = relay.nn.global_avg_pool2d(a)
mod = IRModule.from_expr(func)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-3 if dtype == "float16" else 1e-5
+ out_tol = 1e-3 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol, atol=out_tol
)
args = (input_shape, pooling_type, dtype, outputs[0].shape)
exp_codegen = _get_pool_global_expected_codegen(*args)
@@ -1241,6 +1248,7 @@ def test_batch_flatten(remote, dtype, target,
executor_type):
# Defined the test case with unary operator
# Single batch_flatten op is failing in native OpenCL
# Empty TVM mod in VM doesn't pick appropriate cross compiler
+ np.random.seed(0)
out = relay.nn.relu(a)
out = relay.nn.batch_flatten(out)
inputs = {"a": tvm.nd.array(np.random.uniform(-1, 1,
a_shape).astype(dtype))}
@@ -1250,9 +1258,9 @@ def test_batch_flatten(remote, dtype, target,
executor_type):
def _verify(out, params, inputs):
mod = IRModule.from_expr(out)
outputs = _build_and_run_network(remote, mod, params, inputs, target,
executor_type)
- out_rtol = 1e-3 if dtype == "float16" else 1e-5
+ out_tol = 1e-3 if dtype == "float16" else 1e-5
tvm.testing.assert_allclose(
- outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_rtol,
atol=out_rtol
+ outputs[0].asnumpy(), outputs[1].asnumpy(), rtol=out_tol,
atol=out_tol
)
exp_codegen = [
{