vinx13 commented on code in PR #12895:
URL: https://github.com/apache/tvm/pull/12895#discussion_r984973110
##########
tests/python/unittest/test_meta_schedule_space_generator.py:
##########
@@ -104,4 +108,5 @@ class TestPySpaceGenerator(PySpaceGenerator):
if __name__ == "__main__":
- tvm.testing.main()
+ test_meta_schedule_design_space_generator_NIE()
+ # tvm.testing.main()
Review Comment:
Keep `tvm.testing.main()` here — calling a single test directly looks like a leftover from local debugging.
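For example, restoring the original entry point (taken from the removed line above):
```suggestion
    tvm.testing.main()
```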
##########
src/meta_schedule/schedule_rule/schedule_rule.cc:
##########
@@ -51,6 +51,125 @@ ScheduleRule ScheduleRule::PyScheduleRule(
return ScheduleRule(n);
}
+Array<ScheduleRule> ScheduleRule::DefaultLLVM() {
+ return {
+ ScheduleRule::AutoInline(
+ /*into_producer=*/false,
+ /*into_consumer=*/true,
+ /*inline_const_tensor=*/true,
+ /*disallow_if_then_else=*/true,
+ /*require_injective=*/true,
+ /*require_ordered=*/true,
+ /*disallow_op=*/Array<String>{"tir.exp"}),
+ ScheduleRule::AddRFactor(
+ /*max_jobs_per_core=*/16,
+ /*max_innermost_factor=*/Integer(64)),
+ ScheduleRule::MultiLevelTiling(
+ /*structure=*/"SSRSRS",
+ /*tile_binds=*/NullOpt,
+ /*max_innermost_factor=*/Integer(64),
+ /*vector_load_lens=*/NullOpt,
+ /*reuse_read=*/NullOpt,
+ /*reuse_write=*/
+ Map<String, ObjectRef>{{"req", String("may")},
+ {"levels", Array<Integer>{1, 2}},
+ {"scope", String("global")}}),
+ ScheduleRule::ParallelizeVectorizeUnroll(
+ /*max_jobs_per_core=*/16,
+ /*max_vectorize_extent=*/64,
+ /*unroll_max_steps=*/Array<Integer>{0, 16, 64, 512},
+ /*unroll_explicit=*/true),
+ ScheduleRule::RandomComputeLocation(),
+ };
+}
+
+Array<ScheduleRule> ScheduleRule::DefaultCUDA() {
+ return {
+ ScheduleRule::MultiLevelTiling(
+ /*structure=*/"SSSRRSRS",
+          /*tile_binds=*/Array<String>{"blockIdx.x", "vthread.x", "threadIdx.x"},
+ /*max_innermost_factor=*/Integer(64),
+ /*vector_load_lens=*/Array<Integer>{1, 2, 3, 4, 8, 16},
+ /*reuse_read=*/
+ Map<String, ObjectRef>{{"req", String("must")},
+ {"levels", Array<Integer>{4}}, //
+ {"scope", String("shared")}},
+ /*reuse_write=*/
+ Map<String, ObjectRef>{{"req", String("must")},
+ {"levels", Array<Integer>{3}}, //
+ {"scope", String("local")}}),
+ ScheduleRule::AutoInline(
+ /*into_producer=*/true,
+ /*into_consumer=*/true,
+ /*inline_const_tensor=*/true,
+ /*disallow_if_then_else=*/false,
+ /*require_injective=*/false,
+ /*require_ordered=*/false,
+ /*disallow_op=*/Array<String>{}),
+ ScheduleRule::CrossThreadReduction(
+ /*thread_extents=*/Array<Integer>{4, 8, 16, 32, 64, 128, 256, 512}),
+ ScheduleRule::ParallelizeVectorizeUnroll(
+ /*max_jobs_per_core=*/-1,
+ /*max_vectorize_extent=*/-1,
+ /*unroll_max_steps=*/Array<Integer>{0, 16, 64, 512, 1024},
+ /*unroll_explicit=*/true),
+ ScheduleRule::AutoBind(
+ /*max_threadblocks=*/256,
+ /*thread_extents*/ Array<Integer>{32, 64, 128, 256, 512, 1024}),
+ };
+}
+
+Array<ScheduleRule> ScheduleRule::DefaultCUDATensorCore() {
+ Array<Map<String, String>> intrin_groups = {
+ {
+ {"init", "wmma_fill_16x16x16_f16"},
+ {"load_a", "wmma_load_16x16x16_f16_a"},
+ {"load_b", "wmma_load_16x16x16_f16_b"},
+ {"compute", "wmma_sync_16x16x16_f16f16f16"},
+ {"store", "wmma_store_16x16x16_f16_shared"},
+ },
+ {
+ {"init", "wmma_fill_16x16x16_f16"},
+ {"load_a", "wmma_load_16x16x16_f16_a"},
+ {"load_b", "wmma_load_16x16x16_f16_b_trans"},
+ {"compute", "wmma_sync_16x16x16_f16f16f16_trans"},
+ {"store", "wmma_store_16x16x16_f16_shared"},
+ },
+ {
+ {"init", "wmma_fill_16x16x16_s32"},
+ {"load_a", "wmma_load_16x16x16_s8_a"},
+ {"load_b", "wmma_load_16x16x16_s8_b"},
+ {"compute", "wmma_sync_16x16x16_s8s8s32"},
+ {"store", "wmma_store_16x16x16_s32_shared"},
+ },
+ {
+ {"init", "wmma_fill_16x16x16_s32"},
+ {"load_a", "wmma_load_16x16x16_s8_a"},
+ {"load_b", "wmma_load_16x16x16_s8_b_trans"},
+ {"compute", "wmma_sync_16x16x16_s8s8s32_trans"},
+ {"store", "wmma_store_16x16x16_s32_shared"},
+ },
+ };
+ Array<ScheduleRule> results{ScheduleRule::MultiLevelTilingTensorCore(
+ /*intrin_groups=*/intrin_groups,
+ /*structure=*/"SSSRRSRS",
+ /*tile_binds=*/Array<String>{"blockIdx.x", "vthread.x", "threadIdx.x"},
+ /*max_innermost_factor=*/Integer(64),
Review Comment:
```suggestion
/*max_innermost_factor=*/Integer(4),
```
##########
src/meta_schedule/task_scheduler/task_scheduler.cc:
##########
@@ -21,83 +21,225 @@
namespace tvm {
namespace meta_schedule {
-void TaskSchedulerNode::InitializeTask(int task_id) {
+TaskRecord::TaskRecord(TuneContext ctx, double task_weight) {
+ ObjectPtr<TaskRecordNode> n = runtime::make_object<TaskRecordNode>();
+ n->ctx = ctx;
+ n->task_weight = task_weight;
+ n->flop = 1.0;
auto _ = Profiler::TimedScope("InitializeTask");
- TuneContext task = this->tasks[task_id];
- TVM_PY_LOG(INFO, this->logging_func)
- << "Initializing Task #" << task_id << ": " << task->task_name;
- TVM_PY_LOG(INFO, task->logging_func)
- << "Initializing Task #" << task_id << ": " << task->task_name;
-  CHECK(task->mod.defined()) << "ValueError: Require `context.mod`, but it is not defined";
- CHECK(task->space_generator.defined())
+  CHECK(ctx->mod.defined()) << "ValueError: Require `context.mod`, but it is not defined";
+ CHECK(ctx->space_generator.defined())
<< "ValueError: Require `context.space_generator`, but it is not
defined";
- CHECK(task->search_strategy.defined())
+ CHECK(ctx->search_strategy.defined())
<< "ValueError: Require `context.search_strategy`, but it is not
defined";
- TVM_PY_LOG(INFO, task->logging_func) << "\n" << tir::AsTVMScript(task->mod);
- task->Initialize();
- Array<tir::Schedule> design_spaces =
- task->space_generator.value()->GenerateDesignSpace(task->mod.value());
- TVM_PY_LOG(INFO, task->logging_func)
- << "Total " << design_spaces.size() << " design space(s) generated";
- for (int i = 0, n = design_spaces.size(); i < n; ++i) {
- tir::Schedule sch = design_spaces[i];
- tir::Trace trace = sch->trace().value();
- trace = trace->Simplified(true);
- TVM_PY_LOG(INFO, task->logging_func) << "Design space #" << i << ":\n"
-                                         << tir::AsTVMScript(sch->mod()) << "\n"
-                                         << Concat(trace->AsPython(false), "\n");
+ TVM_PY_LOG(INFO, ctx->logger) << "\n" << tir::AsTVMScript(ctx->mod);
+ ctx->Initialize();
+ n->flop = std::max(1.0, tir::EstimateTIRFlops(ctx->mod.value()));
+ this->data_ = std::move(n);
+}
+
+void SendToBuilder(TaskRecordNode* self, const Builder& builder) {
+ auto _ = Profiler::TimedScope("SendToBuilder");
+ Array<MeasureCandidate> candidates = self->measure_candidates.value();
+ Target target = self->ctx->target.value();
+ Array<BuilderInput> inputs;
+ inputs.reserve(candidates.size());
+ for (const MeasureCandidate& candidate : candidates) {
+ inputs.push_back(BuilderInput(candidate->sch->mod(), target));
}
-  task->search_strategy.value()->PreTuning(design_spaces, database, cost_model);
+ self->builder_results = builder->Build(inputs);
}
-void TaskSchedulerNode::Tune() {
- int n_tasks = this->tasks.size();
- for (int task_id = 0; task_id < n_tasks; ++task_id) {
- InitializeTask(task_id);
+void SendToRunner(TaskRecordNode* self, const Runner& runner) {
+ auto _ = Profiler::TimedScope("SendToRunner");
+ Array<MeasureCandidate> candidates = self->measure_candidates.value();
+ Array<BuilderResult> builder_results = self->builder_results.value();
+ Target target = self->ctx->target.value();
+ ICHECK_EQ(candidates.size(), builder_results.size());
+ int n = candidates.size();
+ int n_build_errors = 0;
+ Array<RunnerInput> inputs;
+ inputs.reserve(n);
+ for (int i = 0; i < n; ++i) {
+ const MeasureCandidate& candidate = candidates[i];
+ const BuilderResult& builder_result = builder_results[i];
+ if (builder_result->error_msg.defined()) {
+ ++n_build_errors;
+ continue;
+ }
+    inputs.push_back(RunnerInput(/*artifact_path=*/builder_result->artifact_path.value(),
+ /*device_type=*/target->kind->name,
+ /*args_info=*/candidate->args_info));
+ }
+ Array<RunnerFuture> futures = runner->Run(inputs);
+ if (n_build_errors == 0) {
+ self->runner_futures = futures;
+ return;
+ }
+ Array<RunnerFuture> results;
+ results.reserve(n);
+ for (int i = 0, j = 0; i < n; ++i) {
+ const BuilderResult& builder_result = builder_results[i];
+ if (builder_result->error_msg.defined()) {
+ results.push_back(RunnerFuture(
+ /*f_done=*/[]() -> bool { return true; },
+ /*f_result=*/
+ [msg = builder_result->error_msg]() -> RunnerResult {
+ return RunnerResult(NullOpt, msg);
+ }));
+ } else {
+ results.push_back(futures[j++]);
+ }
+ }
+ self->runner_futures = results;
+}
+
+void TaskCleanUp(TaskRecordNode* self, int task_id, const Array<RunnerResult>& results) {
+ ICHECK_EQ(self->builder_results.value().size(), results.size());
+ ICHECK_EQ(self->runner_futures.value().size(), results.size());
+ int n = results.size();
+ std::string name = self->ctx->task_name.value();
+ const PackedFunc& logger = self->ctx->logger;
+ for (int i = 0; i < n; ++i) {
+ const BuilderResult& builder_result = self->builder_results.value()[i];
+ const MeasureCandidate& candidate = self->measure_candidates.value()[i];
+ const RunnerResult& runner_result = results[i];
+ Optional<String> error_msg = NullOpt;
+ int trials = self->latency_ms.size() + 1;
+ double run_ms = 1e9;
+ if ((error_msg = builder_result->error_msg)) {
+ ++self->build_error_count;
+ } else if ((error_msg = runner_result->error_msg)) {
+ ++self->run_error_count;
+ } else {
+ run_ms = GetRunMsMedian(runner_result);
+ }
+ self->latency_ms.push_back(run_ms);
+ if (error_msg) {
Review Comment:
I'm wondering how frequently these errors occur during tuning. Should we print every error, or only report a task's errors once its error count passes some threshold?
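If we gate on a count, the logic could be as simple as this rough sketch (written in Python for brevity; the names and the threshold value are hypothetical, not part of this PR):
```python
from typing import Dict

# Hypothetical cutoff: print the first few error messages per task verbatim,
# then only keep a running count so the log doesn't get flooded.
ERROR_PRINT_THRESHOLD = 5


def report_error(task_name: str, error_msg: str, error_counts: Dict[str, int]) -> None:
    count = error_counts.get(task_name, 0) + 1
    error_counts[task_name] = count
    if count <= ERROR_PRINT_THRESHOLD:
        print(f"[{task_name}] error #{count}: {error_msg}")
    else:
        print(f"[{task_name}] {count} errors so far; message suppressed")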
##########
python/tvm/meta_schedule/relay_integration.py:
##########
@@ -15,28 +15,82 @@
# specific language governing permissions and limitations
# under the License.
"""MetaSchedule-Relay integration"""
-from typing import Any, Dict, List, Optional
+from contextlib import contextmanager
+from typing import Dict, List, Optional, Tuple, Union
+# isort: off
+from typing_extensions import Literal
+
+# isort: on
import numpy as np # type: ignore
from tvm import nd
from tvm._ffi import get_global_func
from tvm.ir import IRModule, transform
from tvm.runtime import NDArray
from tvm.target import Target
+from .builder import Builder
+from .cost_model import CostModel
+from .database import Database
from .extracted_task import ExtractedTask
-from .utils import autotvm_silencer
+from .logging import get_loggers_from_work_dir
+from .measure_callback import MeasureCallback
+from .profiler import Profiler
+from .runner import Runner
+from .search_strategy import SearchStrategy
+from .space_generator import SpaceGenerator
+from .task_scheduler import TaskScheduler
+from .tune import tune_tasks
+from .tune_context import TuneContext
+from .utils import fork_seed
+
+_extract_task = get_global_func( # pylint: disable=invalid-name
+ "relay.backend.MetaScheduleExtractTask",
+ allow_missing=False,
+)
+
+
+@contextmanager
+def _autotvm_silencer():
+ """A context manager that silences autotvm warnings."""
+ from tvm import autotvm # pylint: disable=import-outside-toplevel
+
+ silent = autotvm.GLOBAL_SCOPE.silent
+ autotvm.GLOBAL_SCOPE.silent = True
+ try:
+ yield
+ finally:
+ autotvm.GLOBAL_SCOPE.silent = silent
+
+
+def _normalize_params(
+ mod: IRModule,
+ target: Union[Target, str],
+ params: Optional[Dict[str, NDArray]],
+) -> Tuple[IRModule, Target, Dict[str, NDArray]]:
+ from tvm.relay import Function # pylint: disable=import-outside-toplevel
+
+ if isinstance(mod, Function):
+ mod = IRModule.from_expr(mod)
+ if not isinstance(target, Target):
+ target = Target(target)
+ if params is None:
+ params = {}
+ relay_params = {}
+ for name, param in params.items():
+ if isinstance(param, np.ndarray):
+ param = nd.array(param)
+ relay_params[name] = param
+
+ return mod, target, relay_params
def extract_task_from_relay(
mod: IRModule,
- target: Target,
- params: Optional[Dict[str, NDArray]] = None,
+ target: Union[Target, str],
+ params: Optional[Dict[str, NDArray]],
*,
- opt_level: int = 3,
- pass_config: Optional[Dict[str, Any]] = None,
Review Comment:
Do we have an alternative? A possible use case is tuning and then compiling a Relay module under a different pass config or opt level.
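If the new implementation picks up the enclosing `PassContext` (an assumption on my part — please confirm), a caller could still do something like:
```python
import tvm
from tvm.meta_schedule.relay_integration import extract_task_from_relay
from tvm.relay.testing import resnet

mod, params = resnet.get_workload(num_layers=18)
# Assumption: the ambient PassContext takes the place of the removed
# opt_level / pass_config arguments during task extraction.
with tvm.transform.PassContext(
    opt_level=2,
    config={"relay.backend.use_meta_schedule": True},
):
    tasks = extract_task_from_relay(mod, "llvm", params)
```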
##########
python/tvm/meta_schedule/task_scheduler/task_scheduler.py:
##########
@@ -101,15 +90,43 @@ def join_running_task(self, task_id: int) -> List[RunnerResult]:
"""
        return _ffi_api.TaskSchedulerJoinRunningTask(self, task_id)  # type: ignore  # pylint: disable=no-member
- def initialize_task(self, task_id: int) -> None:
- """Initialize modules of the given task.
+ def tune(
+ self,
+ tasks: List[TuneContext],
+ task_weights: List[float],
+ max_trials_global: int,
+ max_trials_per_task: int,
+ num_trials_per_iter: int,
+ builder: Builder,
+ runner: Runner,
+ measure_callbacks: List[MeasureCallback],
+ database: Optional[Database],
+ cost_model: Optional[CostModel],
+ ) -> None:
+ """Auto-tuning."""
Review Comment:
Please document the parameters.
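For instance, something along these lines — the descriptions are my guesses from the parameter names, so adjust as needed:
```python
"""Auto-tuning.

Parameters
----------
tasks : List[TuneContext]
    The tuning contexts of the tasks to tune.
task_weights : List[float]
    The weight of each task when allocating trials.
max_trials_global : int
    The maximum number of trials across all tasks.
max_trials_per_task : int
    The maximum number of trials for any single task.
num_trials_per_iter : int
    The number of trials per scheduling iteration.
builder : Builder
    The builder that compiles the measure candidates.
runner : Runner
    The runner that measures the built artifacts.
measure_callbacks : List[MeasureCallback]
    The callbacks invoked after each round of measurement.
database : Optional[Database]
    The database to store tuning records, if any.
cost_model : Optional[CostModel]
    The cost model used to rank candidates, if any.
"""
```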
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]