lhutton1 commented on code in PR #11591:
URL: https://github.com/apache/tvm/pull/11591#discussion_r896765760
##########
src/tir/contrib/ethosu/passes.cc:
##########
@@ -121,16 +128,16 @@ tvm::transform::Pass HoistAllocates() {
TVM_REGISTER_GLOBAL("tir.contrib.ethos-u.HoistAllocates").set_body_typed(HoistAllocates);
/*!
- * \brief Reorders copy and compute nodes in such a way that independent DMA
copies,
+ * \brief Reorders copy and compute nodes in such a way that independent DMA
copies
* and computes happen in parallel.
- * Copies to buffers with local scope are not reordered, indeed they copy LUT
- * into the SHRAM which already happens in parallel with copying weights into
+ * Copies to buffers with local scope are not reordered since they copy LUT
+ * into the SHRAM and that already happens in parallel with copying weights
into
* the weights encoder.
Review Comment:
Nit: worth copying the new text into the python pass declaration here as well?
##########
python/tvm/relay/backend/contrib/ethosu/tir/passes.py:
##########
@@ -916,14 +916,20 @@ def HoistAllocates() -> tvm.IRModule:
return _ffi_api.HoistAllocates()
-def CopyComputeReordering(max_copy_movements: Optional[int] = None) ->
tvm.IRModule:
+def CopyComputeReordering(
+ max_copy_movements: Optional[int] = None, reorder_by_cycles:
Optional[bool] = None
+) -> tvm.IRModule:
"""
- Reorders copy and compute nodes in such a way that independent DMA copies,
+ Reorders copy and compute nodes in such a way that independent DMA copies
and computes happen in parallel.
- Copies to buffers with local scope are not reordered, indeed they copy LUT
- into the SHRAM which already happens in parallel with copying weights into
+ Copies to buffers with local scope are not reordered since they copy LUT
+ into the SHRAM and that already happens in parallel with copying weights
into
the weights encoder.
+ If reorder_by_cycles is set, we use the cycle hint to decide the
reordering. If it is not set,
Review Comment:
Nit: s/cycle hint/compute_cycles_hint
##########
tests/python/contrib/test_ethosu/test_copy_compute_reordering.py:
##########
@@ -468,5 +468,288 @@ def main() -> None:
tvm.ir.assert_structural_equal(test_mod, reference_mod, True)
+def test_reordering_based_on_cycles():
+ # fmt: off
+ @tvm.script.ir_module
+ class ModuleBefore:
+ @T.prim_func
+ def main(placeholder: T.Buffer[(256,), "int8"], placeholder_encoded:
T.Buffer[(288,), "uint8"], placeholder_encoded_2: T.Buffer[(128,), "uint8"],
placeholder_encoded_4: T.Buffer[(288,), "uint8"], placeholder_encoded_6:
T.Buffer[(128,), "uint8"], placeholder_encoded_8: T.Buffer[(144,), "uint8"],
ethosu_write: T.Buffer[(572,), "int8"]) -> None:
+ # function attr dict
+ T.func_attr({"from_legacy_te_schedule": True, "global_symbol":
"main", "tir.noalias": True})
+ ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_3 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_4 = T.var("int32")
+ nn = T.var("int32")
+ nn_1 = T.var("int32")
+ nn_2 = T.var("int32")
+ nn_3 = T.var("int32")
+ nn_4 = T.var("int32")
+ nn_5 = T.var("int32")
+ nn_6 = T.var("int32")
+ nn_7 = T.var("int32")
+ nn_8 = T.var("int32")
+ nn_9 = T.var("int32")
+ T.preflattened_buffer(placeholder, [1, 8, 8, 4], dtype="int8",
data=placeholder.data)
+ T.preflattened_buffer(placeholder_encoded, [4, 3, 3, 4],
dtype="int8")
+ T.preflattened_buffer(placeholder_encoded_2, [4, 3, 3, 1],
dtype="int8")
+ T.preflattened_buffer(placeholder_encoded_4, [4, 3, 3, 4],
dtype="int8")
+ T.preflattened_buffer(placeholder_encoded_6, [4, 3, 3, 1],
dtype="int8")
+ T.preflattened_buffer(placeholder_encoded_8, [4, 1, 3, 4],
dtype="int8")
+ T.preflattened_buffer(ethosu_write, [1, 13, 11, 4], dtype="int8",
data=ethosu_write.data)
+ # body
+ placeholder_d_d_global = T.allocate([288], "uint8", "global")
+ ethosu_write_2 = T.allocate([256], "int8", "global")
+ placeholder_d_d_global_2 = T.allocate([128], "uint8", "global")
+ ethosu_write_3 = T.allocate([256], "int8", "global")
+ placeholder_d_d_global_4 = T.allocate([288], "uint8", "global")
+ ethosu_write_4 = T.allocate([256], "int8", "global")
+ ethosu_write_5 = T.allocate([256], "int8", "global")
+ ethosu_write_6 = T.allocate([324], "int8", "global")
+ placeholder_d_global = T.allocate([128], "uint8", "global")
+ ethosu_write_7 = T.allocate([324], "int8", "global")
+ ethosu_write_8 = T.allocate([484], "int8", "global")
+ ethosu_write_9 = T.allocate([484], "int8", "global")
+ ethosu_write_10 = T.allocate([484], "int8", "global")
+ placeholder_global = T.allocate([144], "uint8", "global")
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None,
"DataPar", ""), "pragma_compute_cycles_hint", 2304):
+ T.evaluate(T.call_extern("ethosu_copy",
placeholder_encoded[0], 288, placeholder_d_d_global[0], dtype="handle"))
+ with T.attr(T.iter_var(nn, None, "DataPar", ""),
"pragma_compute_cycles_hint", 320):
+ T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8,
0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8", 8,
8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 4,
1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global[0], 240, T.int8(-1), T.int8(-1),
12, placeholder_d_d_global[240], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1,
"NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None,
"DataPar", ""), "pragma_compute_cycles_hint", 576):
+ T.evaluate(T.call_extern("ethosu_copy",
placeholder_encoded_2[0], 128, placeholder_d_d_global_2[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_1, None, "DataPar", ""),
"pragma_compute_cycles_hint", 320):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8,
8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.59999999999999998), 11,
"NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_3[0], 0, 0, 0,
T.float32(0.26000000000000001), 15, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1,
placeholder_d_d_global_2[0], 80, 13, placeholder_d_d_global_2[80], 48, 1, 1, 1,
1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None,
"DataPar", ""), "pragma_compute_cycles_hint", 2304):
+ T.evaluate(T.call_extern("ethosu_copy",
placeholder_encoded_4[0], 288, placeholder_d_d_global_4[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_2, None, "DataPar", ""),
"pragma_compute_cycles_hint", 320):
+ T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8,
0, 8, ethosu_write_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8",
8, 8, 4, 8, 0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32,
4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global_4[0], 240, T.int8(-1),
T.int8(-1), 12, placeholder_d_d_global_4[240], 48, T.int8(-1), T.int8(-1), 1,
1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
+ with T.attr(T.iter_var(nn_3, None, "DataPar", ""),
"pragma_compute_cycles_hint", 192):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8,
0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 8,
8, 4, 8, 0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1,
"MAX", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8,
dtype="handle"))
+ with T.attr(T.iter_var(nn_4, None, "DataPar", ""),
"pragma_compute_cycles_hint", 300):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8,
0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 9,
9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1,
"AVG", 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 10, 10, 8,
dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_3, None,
"DataPar", ""), "pragma_compute_cycles_hint", 576):
+ T.evaluate(T.call_extern("ethosu_copy",
placeholder_encoded_6[0], 128, placeholder_d_global[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_5, None, "DataPar", ""),
"pragma_compute_cycles_hint", 500):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 9,
9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(0.59999999999999998), 11,
"NHWC", 36, 4, 1, "int8", 9, 9, 4, 9, 0, 9, ethosu_write_7[0], 0, 0, 0,
T.float32(0.26000000000000001), 15, "NHWC", 36, 4, 1, 3, 3, 1, 1, 1, 1,
placeholder_d_global[0], 80, 13, placeholder_d_global[80], 48, 1, 1, 1, 1,
"NONE", 0, 0, "TFL", "NONE", 10, 10, 8, dtype="handle"))
+ with T.attr(T.iter_var(nn_6, None, "DataPar", ""),
"pragma_compute_cycles_hint", 432):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 9, 9, 4, 9,
0, 9, ethosu_write_7[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1, "int8",
11, 11, 4, 11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44,
4, 1, "MAX", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12,
8, dtype="handle"))
+ with T.attr(T.iter_var(nn_7, None, "DataPar", ""),
"pragma_compute_cycles_hint", 432):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 11, 11, 4,
11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1,
"int8", 11, 11, 4, 11, 0, 11, ethosu_write_9[0], 0, 0, 0, T.float32(1), 0,
"NHWC", 44, 4, 1, "AVG", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL",
"NONE", 12, 12, 8, dtype="handle"))
+ with T.attr(T.iter_var(nn_8, None, "DataPar", ""),
"pragma_compute_cycles_hint", 432):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 11, 11, 4,
11, 0, 11, ethosu_write_9[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44, 4, 1,
"int8", 11, 11, 4, 11, 0, 11, ethosu_write_10[0], 0, 0, 0, T.float32(1), 0,
"NHWC", 44, 4, 1, "AVG", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL",
"NONE", 12, 12, 8, dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_4, None,
"DataPar", ""), "pragma_compute_cycles_hint", 768):
+ T.evaluate(T.call_extern("ethosu_copy",
placeholder_encoded_8[0], 144, placeholder_global[0], dtype="handle"))
+ T.attr(T.iter_var(nn_9, None, "DataPar", ""),
"pragma_compute_cycles_hint", 504)
+ T.evaluate(T.call_extern("ethosu_conv2d", "int8", 11, 11, 4, 11,
0, 11, ethosu_write_10[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 44, 4, 1,
"int8", 13, 11, 4, 13, 0, 11, ethosu_write[0], 0, 0, 0, T.float32(0.25), 14,
"NHWC", 44, 4, 1, 3, 1, 1, 1, 1, 1, placeholder_global[0], 96, T.int8(-1),
T.int8(-1), 12, placeholder_global[96], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1,
"NONE", 0, 0, "TFL", "NONE", 14, 12, 8, dtype="handle"))
+
+
+
+ @tvm.script.ir_module
+ class ModuleAfter:
+ @T.prim_func
+ def main(placeholder: T.Buffer[(256,), "int8"], placeholder_encoded:
T.Buffer[(288,), "uint8"], placeholder_encoded_2: T.Buffer[(128,), "uint8"],
placeholder_encoded_4: T.Buffer[(288,), "uint8"], placeholder_encoded_6:
T.Buffer[(128,), "uint8"], placeholder_encoded_8: T.Buffer[(144,), "uint8"],
ethosu_write: T.Buffer[(572,), "int8"]) -> None:
+ # function attr dict
+ T.func_attr({"from_legacy_te_schedule": True, "global_symbol":
"main", "tir.noalias": True})
+ ax0_ax1_fused_ax2_fused_ax3_fused = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_1 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_2 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_3 = T.var("int32")
+ ax0_ax1_fused_ax2_fused_ax3_fused_4 = T.var("int32")
+ nn = T.var("int32")
+ nn_1 = T.var("int32")
+ nn_2 = T.var("int32")
+ nn_3 = T.var("int32")
+ nn_4 = T.var("int32")
+ nn_5 = T.var("int32")
+ nn_6 = T.var("int32")
+ nn_7 = T.var("int32")
+ nn_8 = T.var("int32")
+ nn_9 = T.var("int32")
+ T.preflattened_buffer(placeholder, [1, 8, 8, 4], dtype="int8",
data=placeholder.data)
+ T.preflattened_buffer(placeholder_encoded, [4, 3, 3, 4],
dtype="int8", data=placeholder_encoded.data)
+ T.preflattened_buffer(placeholder_encoded_2, [4, 3, 3, 1],
dtype="int8", data=placeholder_encoded_2.data)
+ T.preflattened_buffer(placeholder_encoded_4, [4, 3, 3, 4],
dtype="int8", data=placeholder_encoded_4.data)
+ T.preflattened_buffer(placeholder_encoded_6, [4, 3, 3, 1],
dtype="int8", data=placeholder_encoded_6.data)
+ T.preflattened_buffer(placeholder_encoded_8, [4, 1, 3, 4],
dtype="int8", data=placeholder_encoded_8.data)
+ T.preflattened_buffer(ethosu_write, [1, 13, 11, 4], dtype="int8",
data=ethosu_write.data)
+ # body
+ placeholder_d_d_global = T.allocate([288], "uint8", "global")
+ ethosu_write_2 = T.allocate([256], "int8", "global")
+ placeholder_d_d_global_2 = T.allocate([128], "uint8", "global")
+ ethosu_write_3 = T.allocate([256], "int8", "global")
+ placeholder_d_d_global_4 = T.allocate([288], "uint8", "global")
+ ethosu_write_4 = T.allocate([256], "int8", "global")
+ ethosu_write_5 = T.allocate([256], "int8", "global")
+ ethosu_write_6 = T.allocate([324], "int8", "global")
+ placeholder_d_global = T.allocate([128], "uint8", "global")
+ ethosu_write_7 = T.allocate([324], "int8", "global")
+ ethosu_write_8 = T.allocate([484], "int8", "global")
+ ethosu_write_9 = T.allocate([484], "int8", "global")
+ ethosu_write_10 = T.allocate([484], "int8", "global")
+ placeholder_global = T.allocate([144], "uint8", "global")
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused, None,
"DataPar", ""), "pragma_compute_cycles_hint", 2304):
+ T.evaluate(T.call_extern("ethosu_copy",
placeholder_encoded[0], 288, placeholder_d_d_global[0], dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_1, None,
"DataPar", ""), "pragma_compute_cycles_hint", 576):
+ T.evaluate(T.call_extern("ethosu_copy",
placeholder_encoded_2[0], 128, placeholder_d_d_global_2[0], dtype="handle"))
+ with T.attr(T.iter_var(nn, None, "DataPar", ""),
"pragma_compute_cycles_hint", 320):
+ T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8,
0, 8, placeholder[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8", 8,
8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32, 4,
1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global[0], 240, T.int8(-1), T.int8(-1),
12, placeholder_d_d_global[240], 48, T.int8(-1), T.int8(-1), 1, 1, 1, 1,
"NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_2, None,
"DataPar", ""), "pragma_compute_cycles_hint", 2304):
+ T.evaluate(T.call_extern("ethosu_copy",
placeholder_encoded_4[0], 288, placeholder_d_d_global_4[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_1, None, "DataPar", ""),
"pragma_compute_cycles_hint", 320):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 8,
8, 4, 8, 0, 8, ethosu_write_2[0], 0, 0, 0, T.float32(0.59999999999999998), 11,
"NHWC", 32, 4, 1, "int8", 8, 8, 4, 8, 0, 8, ethosu_write_3[0], 0, 0, 0,
T.float32(0.26000000000000001), 15, "NHWC", 32, 4, 1, 3, 3, 1, 1, 1, 1,
placeholder_d_d_global_2[0], 80, 13, placeholder_d_d_global_2[80], 48, 1, 1, 1,
1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_3, None,
"DataPar", ""), "pragma_compute_cycles_hint", 576):
+ T.evaluate(T.call_extern("ethosu_copy",
placeholder_encoded_6[0], 128, placeholder_d_global[0], dtype="handle"))
+ with T.attr(T.iter_var(nn_2, None, "DataPar", ""),
"pragma_compute_cycles_hint", 320):
+ T.evaluate(T.call_extern("ethosu_conv2d", "int8", 8, 8, 4, 8,
0, 8, ethosu_write_3[0], 0, 0, 0, T.float32(0.5), 10, "NHWC", 32, 4, 1, "int8",
8, 8, 4, 8, 0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(0.25), 14, "NHWC", 32,
4, 1, 3, 3, 1, 1, 1, 1, placeholder_d_d_global_4[0], 240, T.int8(-1),
T.int8(-1), 12, placeholder_d_d_global_4[240], 48, T.int8(-1), T.int8(-1), 1,
1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8, dtype="handle"))
+ with T.attr(T.iter_var(nn_3, None, "DataPar", ""),
"pragma_compute_cycles_hint", 192):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8,
0, 8, ethosu_write_4[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 8,
8, 4, 8, 0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1,
"MAX", 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 8, 8, 8,
dtype="handle"))
+ with T.attr(T.iter_var(nn_4, None, "DataPar", ""),
"pragma_compute_cycles_hint", 300):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 8, 8, 4, 8,
0, 8, ethosu_write_5[0], 0, 0, 0, T.float32(1), 0, "NHWC", 32, 4, 1, "int8", 9,
9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1,
"AVG", 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 10, 10, 8,
dtype="handle"))
+ with T.attr(T.iter_var(nn_5, None, "DataPar", ""),
"pragma_compute_cycles_hint", 500):
+ T.evaluate(T.call_extern("ethosu_depthwise_conv2d", "int8", 9,
9, 4, 9, 0, 9, ethosu_write_6[0], 0, 0, 0, T.float32(0.59999999999999998), 11,
"NHWC", 36, 4, 1, "int8", 9, 9, 4, 9, 0, 9, ethosu_write_7[0], 0, 0, 0,
T.float32(0.26000000000000001), 15, "NHWC", 36, 4, 1, 3, 3, 1, 1, 1, 1,
placeholder_d_global[0], 80, 13, placeholder_d_global[80], 48, 1, 1, 1, 1,
"NONE", 0, 0, "TFL", "NONE", 10, 10, 8, dtype="handle"))
+ with T.attr(T.iter_var(nn_6, None, "DataPar", ""),
"pragma_compute_cycles_hint", 432):
+ T.evaluate(T.call_extern("ethosu_pooling", "int8", 9, 9, 4, 9,
0, 9, ethosu_write_7[0], 0, 0, 0, T.float32(1), 0, "NHWC", 36, 4, 1, "int8",
11, 11, 4, 11, 0, 11, ethosu_write_8[0], 0, 0, 0, T.float32(1), 0, "NHWC", 44,
4, 1, "MAX", 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, "NONE", 0, 0, "TFL", "NONE", 12, 12,
8, dtype="handle"))
+ with T.attr(T.iter_var(ax0_ax1_fused_ax2_fused_ax3_fused_4, None,
"DataPar", ""), "pragma_compute_cycles_hint", 768):
+ T.evaluate(T.call_extern("ethosu_copy",
placeholder_encoded_8[0], 144, placeholder_global[0], dtype="handle"))
Review Comment:
Curious, why did this copy not get moved up further so that it is hidden by
the pooling operation with compute_cycles_hint=500?
##########
python/tvm/relay/backend/contrib/ethosu/tir/passes.py:
##########
@@ -916,14 +916,20 @@ def HoistAllocates() -> tvm.IRModule:
return _ffi_api.HoistAllocates()
-def CopyComputeReordering(max_copy_movements: Optional[int] = None) ->
tvm.IRModule:
+def CopyComputeReordering(
+ max_copy_movements: Optional[int] = None, reorder_by_cycles:
Optional[bool] = None
+) -> tvm.IRModule:
"""
- Reorders copy and compute nodes in such a way that independent DMA copies,
+ Reorders copy and compute nodes in such a way that independent DMA copies
and computes happen in parallel.
- Copies to buffers with local scope are not reordered, indeed they copy LUT
- into the SHRAM which already happens in parallel with copying weights into
+ Copies to buffers with local scope are not reordered since they copy LUT
+ into the SHRAM and that already happens in parallel with copying weights
into
the weights encoder.
+ If reorder_by_cycles is set, we use the cycle hint to decide the
reordering. If it is not set,
+ we move the copies up by a fixed number of movements, either by
max_copy_movements if it is
+ specified, or by default value of 1.
+
Review Comment:
It's probably worth going into a bit more detail about how the algorithm
that does the reordering based on the compute cycles hint works. Does it
produce an optimal ordering based on the hints?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]