masahi commented on code in PR #11557:
URL: https://github.com/apache/tvm/pull/11557#discussion_r927042110


##########
gallery/how_to/work_with_relay/using_pipeline_executor.py:
##########
@@ -0,0 +1,250 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Using Pipeline Executor in Relay
+=================================
+**Author**: `Hua Jiang <https://https://github.com/huajsj>`_
+
+This is a short tutorial on how to use "Pipeline Executor" with Relay.
+"""
+import tvm
+from tvm import te
+import numpy as np
+from tvm.contrib import graph_executor as runtime
+from tvm.relay.op.contrib.cutlass import partition_for_cutlass
+from tvm import relay
+from tvm.relay import testing
+import tvm.testing
+from tvm.contrib.cutlass import (
+    has_cutlass,
+    num_cutlass_partitions,
+    finalize_modules,
+    finalize_modules_vm,
+)
+
+img_size = 8
+#######################################################################
+# Create a simple network, this network can be a pre-trained model too.
+# ---------------------------------------------------------------------
+# Let's create a very simple network for demonstration.
+# It consists of convolution, batch normalization, dense, and ReLU activation.
+def get_network():
+    out_channels = 16
+    batch_size = 1
+    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, 
img_size), "float16"))
+    dense_weight = relay.var(
+        "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), 
"float16")
+    )
+    weight = relay.var("weight")
+    second_weight = relay.var("second_weight")
+    bn_gamma = relay.var("bn_gamma")
+    bn_beta = relay.var("bn_beta")
+    bn_mmean = relay.var("bn_mean")
+    bn_mvar = relay.var("bn_var")
+    simple_net = relay.nn.conv2d(
+        data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, 
padding=(1, 1)
+    )
+    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, 
bn_mvar)[0]
+    simple_net = relay.nn.relu(simple_net)
+    simple_net = relay.nn.batch_flatten(simple_net)
+    simple_net = relay.nn.dense(simple_net, dense_weight)
+    simple_net = relay.Function(relay.analysis.free_vars(simple_net), 
simple_net)
+    data_shape = (batch_size, 3, img_size, img_size)
+    net, params = testing.create_workload(simple_net)
+    return net, params, data_shape
+
+
+net, params, data_shape = get_network()
+###########################################
+# Splitting the network into two subgraphs.
+# -----------------------------------------
+# This function called 'graph_split' from a unit test is just an example. User 
can create a customized logic
+# to split the graph.
+import inspect
+import os
+
+test_path = os.path.dirname(inspect.getfile(lambda: None))

Review Comment:
   I think you can simply use `__file__` here instead of `inspect`, and rename 
`test_path` to `tutorial_dir`. 



##########
gallery/how_to/work_with_relay/using_pipeline_executor.py:
##########
@@ -0,0 +1,250 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Using Pipeline Executor in Relay
+=================================
+**Author**: `Hua Jiang <https://https://github.com/huajsj>`_
+
+This is a short tutorial on how to use "Pipeline Executor" with Relay.
+"""
+import tvm
+from tvm import te
+import numpy as np
+from tvm.contrib import graph_executor as runtime
+from tvm.relay.op.contrib.cutlass import partition_for_cutlass
+from tvm import relay
+from tvm.relay import testing
+import tvm.testing
+from tvm.contrib.cutlass import (
+    has_cutlass,
+    num_cutlass_partitions,
+    finalize_modules,
+    finalize_modules_vm,
+)
+
+img_size = 8
+#######################################################################
+# Create a simple network, this network can be a pre-trained model too.
+# ---------------------------------------------------------------------
+# Let's create a very simple network for demonstration.
+# It consists of convolution, batch normalization, dense, and ReLU activation.
+def get_network():
+    out_channels = 16
+    batch_size = 1
+    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, 
img_size), "float16"))
+    dense_weight = relay.var(
+        "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), 
"float16")
+    )
+    weight = relay.var("weight")
+    second_weight = relay.var("second_weight")
+    bn_gamma = relay.var("bn_gamma")
+    bn_beta = relay.var("bn_beta")
+    bn_mmean = relay.var("bn_mean")
+    bn_mvar = relay.var("bn_var")
+    simple_net = relay.nn.conv2d(
+        data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, 
padding=(1, 1)
+    )
+    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, 
bn_mvar)[0]
+    simple_net = relay.nn.relu(simple_net)
+    simple_net = relay.nn.batch_flatten(simple_net)
+    simple_net = relay.nn.dense(simple_net, dense_weight)
+    simple_net = relay.Function(relay.analysis.free_vars(simple_net), 
simple_net)
+    data_shape = (batch_size, 3, img_size, img_size)
+    net, params = testing.create_workload(simple_net)
+    return net, params, data_shape
+
+
+net, params, data_shape = get_network()
+###########################################
+# Splitting the network into two subgraphs.
+# -----------------------------------------
+# This function called 'graph_split' from a unit test is just an example. User 
can create a customized logic
+# to split the graph.
+import inspect
+import os
+
+test_path = os.path.dirname(inspect.getfile(lambda: None))
+os.sys.path.append(os.path.join(test_path, "../../../tests/python/relay"))
+from test_pipeline_executor import graph_split
+
+###########################################
+# Splitting the network into two subgraphs.
+split_config = [{"op_name": "nn.relu", "op_index": 0}]
+subgraphs = graph_split(net["main"], split_config, params)
+###########################################################
+# The generated subgraphs should look something like below.
+
+"""
+#subgraphs[0])
+
+ def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) {
+  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), 
float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* 
ty=Tensor[(1, 16, img_size, img_size), float16] */;
+  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] 
*/, meta[relay.Constant][2] /* ty=Tensor[(16), float16]*/, 
meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] 
/* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1,16, img_size, img_size), 
float16], Tensor[(16), float16], Tensor[(16), float16]) */;
+  %2 = %1.0;
+  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */
+ }
+
+#subgraphs[1]
+
+ def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 
8), float16] */) {
+  %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;
+  nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, 
units=None) /* ty=Tensor[(1, 1), float16] */
+ }
+
+"""
+
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
+#########################################
+# Build the subgraph with cutlass target.
+# ---------------------------------------
+
+cutlass = tvm.target.Target(
+    {
+        "kind": "cutlass",
+        "sm": int(tvm.target.Target("cuda").arch.split("_")[1]),
+        "use_3xtf32": True,
+        "split_k_slices": [1],
+        "profile_all_alignments": False,
+        "find_first_valid": True,
+        "use_multiprocessing": True,
+        "use_fast_math": False,
+        "tmp_dir": "./tmp",
+    },
+    host=tvm.target.Target("llvm"),
+)
+
+
+def cutlass_build(mod, target, params=None, target_host=None, 
mod_name="default"):
+    target = [target, cutlass]
+    lib = relay.build_module.build(
+        mod, target=target, params=params, target_host=target_host, 
mod_name=mod_name
+    )
+    return lib
+
+
+###########################################################
+# Run the two subgraphs in pipeline with pipeline executor.
+# ---------------------------------------------------------
+# Set 'USE_PIPELINE_EXECUTOR' as ON, and set USE_CUTLASS' as ON  in cmake.
+from tvm.contrib import graph_executor, pipeline_executor, 
pipeline_executor_build
+
+#########################################
+# Create subgraph pipeline configuration.
+# Associate a subgraph module with a target.
+# Use CUTLASS BYOC to build the second subgraph module.
+mod0, mod1 = subgraphs[0], subgraphs[1]
+# Use cutlass as the codegen.
+mod1 = partition_for_cutlass(mod1)
+#################################################
+# Get the pipeline executor configuration object.
+pipe_config = pipeline_executor_build.PipelineConfig()
+###########################################################################
+# Set the compile target of the subgraph module.
+pipe_config[mod0].target = "llvm"
+pipe_config[mod0].dev = tvm.cpu(0)
+###############################################################################
+# Set the cpu affinity for control flow, for example using cpu 0 for control 
flow.
+pipe_config[mod1].cpu_affinity = "0"
+##############################################################
+# Set the compile target of the second subgraph module as cuda.
+pipe_config[mod1].target = "cuda"
+pipe_config[mod1].dev = tvm.device("cuda", 0)
+pipe_config[mod1].build_func = cutlass_build
+pipe_config[mod1].export_cc = "nvcc"
+#################################################################################
+# Set the cpu afinity for control flow, for example using cpu 1 for control 
flow.
+pipe_config[mod1].cpu_affinity = "1"
+pipe_config["input"]["data"].connect(pipe_config[mod0]["input"]["data"])
+pipe_config[mod0]["output"][0].connect(pipe_config[mod1]["input"]["data_n_0"])
+pipe_config[mod1]["output"]["0"].connect(pipe_config["output"][0])

Review Comment:
   Are these three lines related to affinity control? You should have another 
######## before them and explain what they do. 
   
   I have to say, this is not a good API. For example, where do the names "data" 
and "data_n_0" come from? What is `pipe_config[mod0]["output"][0]`? And why do 
you use "0" at L178? 
   
   



##########
gallery/how_to/work_with_relay/using_pipeline_executor.py:
##########
@@ -0,0 +1,250 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Using Pipeline Executor in Relay
+=================================
+**Author**: `Hua Jiang <https://https://github.com/huajsj>`_
+
+This is a short tutorial on how to use "Pipeline Executor" with Relay.
+"""
+import tvm
+from tvm import te
+import numpy as np
+from tvm.contrib import graph_executor as runtime
+from tvm.relay.op.contrib.cutlass import partition_for_cutlass
+from tvm import relay
+from tvm.relay import testing
+import tvm.testing
+from tvm.contrib.cutlass import (
+    has_cutlass,
+    num_cutlass_partitions,
+    finalize_modules,
+    finalize_modules_vm,
+)
+
+img_size = 8
+#######################################################################
+# Create a simple network, this network can be a pre-trained model too.
+# ---------------------------------------------------------------------
+# Let's create a very simple network for demonstration.
+# It consists of convolution, batch normalization, dense, and ReLU activation.
+def get_network():
+    out_channels = 16
+    batch_size = 1
+    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, 
img_size), "float16"))
+    dense_weight = relay.var(
+        "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), 
"float16")
+    )
+    weight = relay.var("weight")
+    second_weight = relay.var("second_weight")
+    bn_gamma = relay.var("bn_gamma")
+    bn_beta = relay.var("bn_beta")
+    bn_mmean = relay.var("bn_mean")
+    bn_mvar = relay.var("bn_var")
+    simple_net = relay.nn.conv2d(
+        data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, 
padding=(1, 1)
+    )
+    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, 
bn_mvar)[0]
+    simple_net = relay.nn.relu(simple_net)
+    simple_net = relay.nn.batch_flatten(simple_net)
+    simple_net = relay.nn.dense(simple_net, dense_weight)
+    simple_net = relay.Function(relay.analysis.free_vars(simple_net), 
simple_net)
+    data_shape = (batch_size, 3, img_size, img_size)
+    net, params = testing.create_workload(simple_net)
+    return net, params, data_shape
+
+
+net, params, data_shape = get_network()
+###########################################
+# Splitting the network into two subgraphs.
+# -----------------------------------------
+# This function called 'graph_split' from a unit test is just an example. User 
can create a customized logic
+# to split the graph.
+import inspect
+import os
+
+test_path = os.path.dirname(inspect.getfile(lambda: None))
+os.sys.path.append(os.path.join(test_path, "../../../tests/python/relay"))
+from test_pipeline_executor import graph_split
+
+###########################################
+# Splitting the network into two subgraphs.
+split_config = [{"op_name": "nn.relu", "op_index": 0}]
+subgraphs = graph_split(net["main"], split_config, params)
+###########################################################
+# The generated subgraphs should look something like below.
+
+"""
+#subgraphs[0])
+
+ def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) {
+  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), 
float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* 
ty=Tensor[(1, 16, img_size, img_size), float16] */;
+  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] 
*/, meta[relay.Constant][2] /* ty=Tensor[(16), float16]*/, 
meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] 
/* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1,16, img_size, img_size), 
float16], Tensor[(16), float16], Tensor[(16), float16]) */;
+  %2 = %1.0;
+  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */
+ }
+
+#subgraphs[1]
+
+ def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 
8), float16] */) {
+  %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;
+  nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, 
units=None) /* ty=Tensor[(1, 1), float16] */
+ }
+
+"""
+
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
+#########################################
+# Build the subgraph with cutlass target.
+# ---------------------------------------
+
+cutlass = tvm.target.Target(
+    {
+        "kind": "cutlass",
+        "sm": int(tvm.target.Target("cuda").arch.split("_")[1]),
+        "use_3xtf32": True,
+        "split_k_slices": [1],
+        "profile_all_alignments": False,
+        "find_first_valid": True,
+        "use_multiprocessing": True,
+        "use_fast_math": False,
+        "tmp_dir": "./tmp",
+    },
+    host=tvm.target.Target("llvm"),
+)
+
+
+def cutlass_build(mod, target, params=None, target_host=None, 
mod_name="default"):
+    target = [target, cutlass]
+    lib = relay.build_module.build(
+        mod, target=target, params=params, target_host=target_host, 
mod_name=mod_name
+    )
+    return lib
+
+
+###########################################################
+# Run the two subgraphs in pipeline with pipeline executor.
+# ---------------------------------------------------------
+# Set 'USE_PIPELINE_EXECUTOR' as ON, and set USE_CUTLASS' as ON  in cmake.
+from tvm.contrib import graph_executor, pipeline_executor, 
pipeline_executor_build
+
+#########################################
+# Create subgraph pipeline configuration.
+# Associate a subgraph module with a target.
+# Use CUTLASS BYOC to build the second subgraph module.
+mod0, mod1 = subgraphs[0], subgraphs[1]
+# Use cutlass as the codegen.
+mod1 = partition_for_cutlass(mod1)
+#################################################
+# Get the pipeline executor configuration object.
+pipe_config = pipeline_executor_build.PipelineConfig()
+###########################################################################
+# Set the compile target of the subgraph module.
+pipe_config[mod0].target = "llvm"
+pipe_config[mod0].dev = tvm.cpu(0)
+###############################################################################
+# Set the cpu affinity for control flow, for example using cpu 0 for control 
flow.
+pipe_config[mod1].cpu_affinity = "0"
+##############################################################
+# Set the compile target of the second subgraph module as cuda.
+pipe_config[mod1].target = "cuda"
+pipe_config[mod1].dev = tvm.device("cuda", 0)
+pipe_config[mod1].build_func = cutlass_build
+pipe_config[mod1].export_cc = "nvcc"
+#################################################################################
+# Set the cpu afinity for control flow, for example using cpu 1 for control 
flow.
+pipe_config[mod1].cpu_affinity = "1"

Review Comment:
   `pipe_config[mod1].cpu_affinity` is written twice, here and at L166.



##########
gallery/how_to/work_with_relay/using_pipeline_executor.py:
##########
@@ -0,0 +1,250 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Using Pipeline Executor in Relay
+=================================
+**Author**: `Hua Jiang <https://https://github.com/huajsj>`_
+
+This is a short tutorial on how to use "Pipeline Executor" with Relay.
+"""
+import tvm
+from tvm import te
+import numpy as np
+from tvm.contrib import graph_executor as runtime
+from tvm.relay.op.contrib.cutlass import partition_for_cutlass
+from tvm import relay
+from tvm.relay import testing
+import tvm.testing
+from tvm.contrib.cutlass import (
+    has_cutlass,
+    num_cutlass_partitions,
+    finalize_modules,
+    finalize_modules_vm,
+)
+
+img_size = 8
+#######################################################################
+# Create a simple network, this network can be a pre-trained model too.
+# ---------------------------------------------------------------------
+# Let's create a very simple network for demonstration.
+# It consists of convolution, batch normalization, dense, and ReLU activation.
+def get_network():
+    out_channels = 16
+    batch_size = 1
+    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, 
img_size), "float16"))
+    dense_weight = relay.var(
+        "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), 
"float16")
+    )
+    weight = relay.var("weight")
+    second_weight = relay.var("second_weight")
+    bn_gamma = relay.var("bn_gamma")
+    bn_beta = relay.var("bn_beta")
+    bn_mmean = relay.var("bn_mean")
+    bn_mvar = relay.var("bn_var")
+    simple_net = relay.nn.conv2d(
+        data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, 
padding=(1, 1)
+    )
+    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, 
bn_mvar)[0]
+    simple_net = relay.nn.relu(simple_net)
+    simple_net = relay.nn.batch_flatten(simple_net)
+    simple_net = relay.nn.dense(simple_net, dense_weight)
+    simple_net = relay.Function(relay.analysis.free_vars(simple_net), 
simple_net)
+    data_shape = (batch_size, 3, img_size, img_size)
+    net, params = testing.create_workload(simple_net)
+    return net, params, data_shape
+
+
+net, params, data_shape = get_network()
+###########################################
+# Splitting the network into two subgraphs.
+# -----------------------------------------
+# This function called 'graph_split' from a unit test is just an example. User 
can create a customized logic
+# to split the graph.
+import inspect
+import os
+
+test_path = os.path.dirname(inspect.getfile(lambda: None))
+os.sys.path.append(os.path.join(test_path, "../../../tests/python/relay"))
+from test_pipeline_executor import graph_split
+
+###########################################
+# Splitting the network into two subgraphs.
+split_config = [{"op_name": "nn.relu", "op_index": 0}]
+subgraphs = graph_split(net["main"], split_config, params)
+###########################################################
+# The generated subgraphs should look something like below.
+
+"""
+#subgraphs[0])
+
+ def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) {
+  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), 
float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* 
ty=Tensor[(1, 16, img_size, img_size), float16] */;
+  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] 
*/, meta[relay.Constant][2] /* ty=Tensor[(16), float16]*/, 
meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] 
/* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1,16, img_size, img_size), 
float16], Tensor[(16), float16], Tensor[(16), float16]) */;
+  %2 = %1.0;
+  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */
+ }
+
+#subgraphs[1]
+
+ def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 
8), float16] */) {
+  %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;
+  nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, 
units=None) /* ty=Tensor[(1, 1), float16] */
+ }
+
+"""
+
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
+#########################################
+# Build the subgraph with cutlass target.
+# ---------------------------------------
+
+cutlass = tvm.target.Target(
+    {
+        "kind": "cutlass",
+        "sm": int(tvm.target.Target("cuda").arch.split("_")[1]),
+        "use_3xtf32": True,
+        "split_k_slices": [1],
+        "profile_all_alignments": False,
+        "find_first_valid": True,
+        "use_multiprocessing": True,
+        "use_fast_math": False,
+        "tmp_dir": "./tmp",
+    },
+    host=tvm.target.Target("llvm"),
+)
+
+
+def cutlass_build(mod, target, params=None, target_host=None, 
mod_name="default"):
+    target = [target, cutlass]
+    lib = relay.build_module.build(
+        mod, target=target, params=params, target_host=target_host, 
mod_name=mod_name
+    )
+    return lib
+
+
+###########################################################
+# Run the two subgraphs in pipeline with pipeline executor.
+# ---------------------------------------------------------
+# Set 'USE_PIPELINE_EXECUTOR' as ON, and set USE_CUTLASS' as ON  in cmake.
+from tvm.contrib import graph_executor, pipeline_executor, 
pipeline_executor_build
+
+#########################################
+# Create subgraph pipeline configuration.
+# Associate a subgraph module with a target.
+# Use CUTLASS BYOC to build the second subgraph module.
+mod0, mod1 = subgraphs[0], subgraphs[1]
+# Use cutlass as the codegen.
+mod1 = partition_for_cutlass(mod1)
+#################################################
+# Get the pipeline executor configuration object.
+pipe_config = pipeline_executor_build.PipelineConfig()
+###########################################################################
+# Set the compile target of the subgraph module.
+pipe_config[mod0].target = "llvm"
+pipe_config[mod0].dev = tvm.cpu(0)
+###############################################################################
+# Set the cpu affinity for control flow, for example using cpu 0 for control 
flow.
+pipe_config[mod1].cpu_affinity = "0"
+##############################################################
+# Set the compile target of the second subgraph module as cuda.
+pipe_config[mod1].target = "cuda"
+pipe_config[mod1].dev = tvm.device("cuda", 0)
+pipe_config[mod1].build_func = cutlass_build
+pipe_config[mod1].export_cc = "nvcc"
+#################################################################################
+# Set the cpu afinity for control flow, for example using cpu 1 for control 
flow.

Review Comment:
   typo: `afinity`



##########
gallery/how_to/work_with_relay/using_pipeline_executor.py:
##########
@@ -0,0 +1,251 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Using Pipeline Executor in Relay
+=================================
+**Author**: `Hua Jiang <https://https://github.com/huajsj>`_
+
+This is a short tutorial on how to use "Pipeline Executor" with Relay.
+"""
+import tvm
+from tvm import te
+import numpy as np
+from tvm.contrib import graph_executor as runtime
+from tvm.relay.op.contrib.cutlass import partition_for_cutlass
+from tvm import relay
+from tvm.relay import testing
+import tvm.testing
+from tvm.contrib.cutlass import (
+    has_cutlass,
+    num_cutlass_partitions,
+    finalize_modules,
+    finalize_modules_vm,
+)
+
+img_size = 8
+#######################################################################
+# Create a simple network, this network can be a pre-trained model too.
+# ---------------------------------------------------------------------
+# Let's create a very simple network for demonstration.
+# It consists of convolution, batch normalization, dense, and ReLU activation.
+def get_network():
+    out_channels = 16
+    batch_size = 1
+    data = relay.var("data", relay.TensorType((batch_size, 3, img_size, 
img_size), "float16"))
+    dense_weight = relay.var(
+        "dweight", relay.TensorType((batch_size, 16 * img_size * img_size), 
"float16")
+    )
+    weight = relay.var("weight")
+    second_weight = relay.var("second_weight")
+    bn_gamma = relay.var("bn_gamma")
+    bn_beta = relay.var("bn_beta")
+    bn_mmean = relay.var("bn_mean")
+    bn_mvar = relay.var("bn_var")
+    simple_net = relay.nn.conv2d(
+        data=data, weight=weight, kernel_size=(3, 3), channels=out_channels, 
padding=(1, 1)
+    )
+    simple_net = relay.nn.batch_norm(simple_net, bn_gamma, bn_beta, bn_mmean, 
bn_mvar)[0]
+    simple_net = relay.nn.relu(simple_net)
+    simple_net = relay.nn.batch_flatten(simple_net)
+    simple_net = relay.nn.dense(simple_net, dense_weight)
+    simple_net = relay.Function(relay.analysis.free_vars(simple_net), 
simple_net)
+    data_shape = (batch_size, 3, img_size, img_size)
+    net, params = testing.create_workload(simple_net)
+    return net, params, data_shape
+
+
+net, params, data_shape = get_network()
+###########################################
+# Splitting the network into two subgraphs.
+# -----------------------------------------
+# It is an example that the graph splitting function comes from a unit test. 
User can create  a
+# customized function to split the graph.
+import inspect
+import os
+
+test_path = os.path.dirname(inspect.getfile(lambda: None))
+os.sys.path.append(os.path.join(test_path, "../../../tests/python/relay"))
+from test_pipeline_executor import graph_split
+
+###########################################
+# Splitting the network into two subgraphs.
+split_config = [{"op_name": "nn.relu", "op_index": 0}]
+subgraphs = graph_split(net["main"], split_config, params)
+###########################################################
+# The generated subgraphs should look something like below.
+
+"""
+#subgraphs[0])
+
+ def @main(%data: Tensor[(1, 3, img_size, img_size), float16]) {
+  %0 = nn.conv2d(%data, meta[relay.Constant][0] /* ty=Tensor[(16, 3, 3, 3), 
float16] */, padding=[1, 1, 1, 1], channels=16, kernel_size=[3, 3]) /* 
ty=Tensor[(1, 16, img_size, img_size), float16] */;
+  %1 = nn.batch_norm(%0, meta[relay.Constant][1] /* ty=Tensor[(16), float16] 
*/, meta[relay.Constant][2] /* ty=Tensor[(16), float16]*/, 
meta[relay.Constant][3] /* ty=Tensor[(16), float16] */, meta[relay.Constant][4] 
/* ty=Tensor[(16), float16] */) /* ty=(Tensor[(1,16, img_size, img_size), 
float16], Tensor[(16), float16], Tensor[(16), float16]) */;
+  %2 = %1.0;
+  nn.relu(%2) /* ty=Tensor[(1, 16, img_size, img_size), float16] */
+ }
+
+#subgraphs[1]
+
+ def @main(%data_n_0: Tensor[(1, 16, 8, 8), float16] /* ty=Tensor[(1, 16, 8, 
8), float16] */) {
+  %0 = nn.batch_flatten(%data_n_0) /* ty=Tensor[(1, 1024), float16] */;
+  nn.dense(%0, meta[relay.Constant][0] /* ty=Tensor[(1, 1024), float16] */, 
units=None) /* ty=Tensor[(1, 1), float16] */
+ }
+
+"""
+
+# sphinx_gallery_start_ignore
+from tvm import testing
+
+testing.utils.install_request_hook(depth=3)
+# sphinx_gallery_end_ignore
+
+#########################################
+# Build the subgraph with cutlass target.
+# ---------------------------------------
+
+cutlass = tvm.target.Target(
+    {
+        "kind": "cutlass",
+        "sm": int(tvm.target.Target("cuda").arch.split("_")[1]),
+        "use_3xtf32": True,
+        "split_k_slices": [1],
+        "profile_all_alignments": False,
+        "find_first_valid": True,
+        "use_multiprocessing": True,
+        "use_fast_math": False,
+        "tmp_dir": "./tmp",
+    },
+    host=tvm.target.Target("llvm"),
+)
+
+
+def cutlass_build(mod, target, params=None, target_host=None, 
mod_name="default"):
+    target = [target, cutlass]
+    lib = relay.build_module.build(
+        mod, target=target, params=params, target_host=target_host, 
mod_name=mod_name
+    )
+    return lib
+
+
+###########################################################
+# Run the two subgraphs in pipeline with pipeline executor.
+# ---------------------------------------------------------
+# Set 'USE_PIPELINE_EXECUTOR' as ON, and set USE_CUTLASS' as ON  in cmake.
+from tvm.contrib import graph_executor, pipeline_executor, 
pipeline_executor_build
+
+#########################################
+# Create subgraph pipeline configuration.
+# Associate a subgraph module with a target.
+# Use CUTLASS BYOC to build the second subgraph module.
+mod0, mod1 = subgraphs[0], subgraphs[1]
+# Use cutlass as the codegen.
+mod1 = partition_for_cutlass(mod1)
+#################################################
+# Get the pipeline executor configuration object.
+pipe_config = pipeline_executor_build.PipelineConfig()
+###########################################################################
+# Set the compile target of the subgraph module.
+pipe_config[mod0].target = "llvm"
+pipe_config[mod0].dev = tvm.cpu(0)
+###############################################################################
+# Set the cpu afinity for control flow, for example using cpu 0 for control 
flow.

Review Comment:
   "control flow" usually means if/else or a for loop, in TVM or in general. How 
about "host operations"?
   
   This also doesn't sound like something most users should be concerned about. 
I suggest removing the affinity settings from the tutorial and setting the 
default affinity inside some runtime function. If users do need to control 
affinity, please summarize what you said above and add it to the tutorial in 
correct English. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to