MasterJH5574 commented on a change in pull request #8467: URL: https://github.com/apache/tvm/pull/8467#discussion_r670074474
########## File path: src/tir/schedule/primitive/fuse_split.cc ########## @@ -0,0 +1,483 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" +namespace tvm { +namespace tir { + +/*! \brief Append a new predicate to the each children of type BlockRealize (not recursively) */ +class PredicateUpdater : public StmtMutator { + public: + /*! + * \brief Constructor + * \param predicate The predicate to be apppend to BlockRealizeNode + */ + explicit PredicateUpdater(const PrimExpr& predicate, arith::Analyzer* ana) + : predicate_(predicate) { + if (!ana->CanProve(predicate)) { + add_predicate_ = true; + } + } + + private: + // For each direct child of type BlockRealizeNode, append the predicate + Stmt VisitStmt_(const BlockRealizeNode* realize) final { + // We do not recursively do this + if (add_predicate_) { + ObjectPtr<BlockRealizeNode> n = CopyOnWrite(realize); + n->predicate = n->predicate && predicate_; + return BlockRealize(n); + } else { + return GetRef<BlockRealize>(realize); + } + } + + /*! \brief The predicate to be added */ + const PrimExpr& predicate_; + /*! \brief whether to add predicate */ + bool add_predicate_; +}; +/*! \brief Substitute vars and collect the reuse mapping of opaque blocks */ +class IRSubstituteAndCollectOpaqueBlock : public StmtExprMutator { + public: + explicit IRSubstituteAndCollectOpaqueBlock(std::function<Optional<PrimExpr>(const Var&)> vmap, + Map<Block, Block>* opaque_blocks) + : vmap_(vmap), opaque_blocks_(opaque_blocks) {} + + private: + PrimExpr VisitExpr_(const VarNode* op) final { + Var var = GetRef<Var>(op); + Optional<PrimExpr> ret = vmap_(var); + if (ret.defined()) { + return ret.value(); + } else { + return std::move(var); + } + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + Stmt res = StmtMutator::VisitStmt_(op); + if (op->block->iter_vars.empty()) { + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + opaque_blocks_->Set(op->block, realize->block); + } + return res; + } + + /*! \brief The substitute function */ + std::function<Optional<PrimExpr>(const Var&)> vmap_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SubstituteAndCollectOpaqueBlock(Stmt stmt, Map<Block, Block>* opaque_blocks, + std::function<Optional<PrimExpr>(const Var&)> vmap) { + return IRSubstituteAndCollectOpaqueBlock(vmap, opaque_blocks)(std::move(stmt)); +} + +/*! \brief Simplify the binding of block realize and update the opaque block reuse mapping*/ +class BlockRealizeRewriter : public StmtExprMutator { + public: + explicit BlockRealizeRewriter( + const std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual>& loop_map, + Map<Block, Block>* opaque_blocks) + : opaque_blocks_(opaque_blocks) { + loop_map_.insert(loop_map.begin(), loop_map.end()); + } + + private: + Stmt VisitStmt_(const ForNode* op) final { + loop_map_[op->loop_var] = Range::FromMinExtent(op->min, op->extent); + Stmt res = StmtMutator::VisitStmt_(op); + loop_map_.erase(op->loop_var); + return res; + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + // skip opaque block and update mapping + if (op->iter_values.empty()) { + Stmt res = StmtMutator::VisitStmt_(op); + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + for (const std::pair<Block, Block>& entry : *opaque_blocks_) { + if (entry.second.same_as(op->block)) { + opaque_blocks_->Set(entry.first, realize->block); + break; + } + } + return res; + } + auto v = arith::IterMapSimplify(op->iter_values, loop_map_, op->predicate, false); + if (v.same_as(op->iter_values)) { + return GetRef<Stmt>(op); + } else { + auto n = CopyOnWrite(op); + n->iter_values = std::move(v); + return Stmt(n); + } + } + /*! \brief The range of loops */ + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SimplifyBindings(const Stmt& stmt, const Array<StmtSRef>& loops, + Map<Block, Block>* opaque_blocks) { + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map; + for (const StmtSRef& sref : loops) { + const auto* loop = sref->StmtAs<ForNode>(); + loop_map[loop->loop_var] = Range::FromMinExtent(loop->min, loop->extent); + } + BlockRealizeRewriter rewriter(loop_map, opaque_blocks); + return rewriter(stmt); +} + +class NotLoopError : public ScheduleError { + public: + explicit NotLoopError(IRModule mod, String type) : mod_(mod), type_(type) {} + + String FastErrorString() const final { + return "ScheduleError: this primitive only operates on a " + "loop"; + } + + String DetailRenderTemplate() const final { + return "this primitive only operates on a loop, but the StmtSref passed in points to" + "type: {0} "; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {type_}; } + + IRModule mod_; + String type_; +}; + +class HasAnnotationError : public ScheduleError { + public: + explicit HasAnnotationError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has annotation"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has annotation"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class HasThreadBindingError : public ScheduleError { + public: + explicit HasThreadBindingError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has thread binding"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has thread binding"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class OuterNotInnerParent : public ScheduleError { + public: + explicit OuterNotInnerParent(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the outer loop is not the parent of the inner loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the outer loop {0} is not the parent of the inner " + "loop {1}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class NotOnlyChildError : public ScheduleError { + public: + explicit NotOnlyChildError(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the inner loop is not the only child of outer loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the inner loop {1} is not the only child of outer " + "loop {0}."; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class LoopNotStartWithZeroError : public ScheduleError { + public: + explicit LoopNotStartWithZeroError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: the primitive only supports loop starting with 0"; + } + + String DetailRenderTemplate() const final { + return "The loop {0} does not start with 0, which is not supported"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class NotSingleInferFactorError : public ScheduleError { + public: + explicit NotSingleInferFactorError(IRModule mod) : mod_(mod) {} + + String FastErrorString() const final { + return "ScheduleError: only one factor can be specified as -1 or none"; + } + + String DetailRenderTemplate() const final { + return "Only one factor can be specified as -1 or none"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {}; } + + IRModule mod_; +}; + +class WrongFactorProductError : public ScheduleError { + public: + explicit WrongFactorProductError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The product of factors is not larger than or equal to the extent of " + "loop"; + } + + String DetailRenderTemplate() const final { + return "The product of factors is not larger than or equal to the extent of loop {0}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, + const Array<PrimExpr>& factors) { + // Invariance + // - The total repeat number has not changed for each direct child block with updating predicate. + // - The execution order has not changed. (The block executes with the same args and the same + // order with before. + // Step 1. Check correctness + GetScopeRootAndCheckStagePipeline(self, loop_sref); + const auto* loop = loop_sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, loop_sref->stmt->GetTypeKey()); + } + if (!loop->annotations.empty()) { + throw HasAnnotationError(self->mod, GetRef<For>(loop)); + } + if (loop->thread_binding.defined()) { + throw HasThreadBindingError(self->mod, GetRef<For>(loop)); + } + // Currently, loops starting with 0 is not supported + arith::Analyzer analyzer; Review comment: ```suggestion // Currently, loops not starting with 0 are not supported arith::Analyzer analyzer; ``` ########## File path: src/tir/schedule/primitive/fuse_split.cc ########## @@ -0,0 +1,483 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" +namespace tvm { +namespace tir { + +/*! \brief Append a new predicate to the each children of type BlockRealize (not recursively) */ +class PredicateUpdater : public StmtMutator { + public: + /*! + * \brief Constructor + * \param predicate The predicate to be apppend to BlockRealizeNode + */ + explicit PredicateUpdater(const PrimExpr& predicate, arith::Analyzer* ana) + : predicate_(predicate) { + if (!ana->CanProve(predicate)) { + add_predicate_ = true; + } + } + + private: + // For each direct child of type BlockRealizeNode, append the predicate + Stmt VisitStmt_(const BlockRealizeNode* realize) final { + // We do not recursively do this + if (add_predicate_) { + ObjectPtr<BlockRealizeNode> n = CopyOnWrite(realize); + n->predicate = n->predicate && predicate_; + return BlockRealize(n); + } else { + return GetRef<BlockRealize>(realize); + } + } + + /*! \brief The predicate to be added */ + const PrimExpr& predicate_; + /*! \brief whether to add predicate */ + bool add_predicate_; +}; +/*! \brief Substitute vars and collect the reuse mapping of opaque blocks */ +class IRSubstituteAndCollectOpaqueBlock : public StmtExprMutator { + public: + explicit IRSubstituteAndCollectOpaqueBlock(std::function<Optional<PrimExpr>(const Var&)> vmap, + Map<Block, Block>* opaque_blocks) + : vmap_(vmap), opaque_blocks_(opaque_blocks) {} + + private: + PrimExpr VisitExpr_(const VarNode* op) final { + Var var = GetRef<Var>(op); + Optional<PrimExpr> ret = vmap_(var); + if (ret.defined()) { + return ret.value(); + } else { + return std::move(var); + } + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + Stmt res = StmtMutator::VisitStmt_(op); + if (op->block->iter_vars.empty()) { + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + opaque_blocks_->Set(op->block, realize->block); + } + return res; + } + + /*! \brief The substitute function */ + std::function<Optional<PrimExpr>(const Var&)> vmap_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SubstituteAndCollectOpaqueBlock(Stmt stmt, Map<Block, Block>* opaque_blocks, + std::function<Optional<PrimExpr>(const Var&)> vmap) { + return IRSubstituteAndCollectOpaqueBlock(vmap, opaque_blocks)(std::move(stmt)); +} + +/*! \brief Simplify the binding of block realize and update the opaque block reuse mapping*/ +class BlockRealizeRewriter : public StmtExprMutator { + public: + explicit BlockRealizeRewriter( + const std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual>& loop_map, + Map<Block, Block>* opaque_blocks) + : opaque_blocks_(opaque_blocks) { + loop_map_.insert(loop_map.begin(), loop_map.end()); + } + + private: + Stmt VisitStmt_(const ForNode* op) final { + loop_map_[op->loop_var] = Range::FromMinExtent(op->min, op->extent); + Stmt res = StmtMutator::VisitStmt_(op); + loop_map_.erase(op->loop_var); + return res; + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + // skip opaque block and update mapping + if (op->iter_values.empty()) { + Stmt res = StmtMutator::VisitStmt_(op); + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + for (const std::pair<Block, Block>& entry : *opaque_blocks_) { + if (entry.second.same_as(op->block)) { + opaque_blocks_->Set(entry.first, realize->block); + break; + } + } + return res; + } + auto v = arith::IterMapSimplify(op->iter_values, loop_map_, op->predicate, false); + if (v.same_as(op->iter_values)) { + return GetRef<Stmt>(op); + } else { + auto n = CopyOnWrite(op); + n->iter_values = std::move(v); + return Stmt(n); + } + } + /*! \brief The range of loops */ + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SimplifyBindings(const Stmt& stmt, const Array<StmtSRef>& loops, + Map<Block, Block>* opaque_blocks) { + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map; + for (const StmtSRef& sref : loops) { + const auto* loop = sref->StmtAs<ForNode>(); + loop_map[loop->loop_var] = Range::FromMinExtent(loop->min, loop->extent); + } + BlockRealizeRewriter rewriter(loop_map, opaque_blocks); + return rewriter(stmt); +} + +class NotLoopError : public ScheduleError { + public: + explicit NotLoopError(IRModule mod, String type) : mod_(mod), type_(type) {} + + String FastErrorString() const final { + return "ScheduleError: this primitive only operates on a " + "loop"; + } + + String DetailRenderTemplate() const final { + return "this primitive only operates on a loop, but the StmtSref passed in points to" + "type: {0} "; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {type_}; } + + IRModule mod_; + String type_; +}; + +class HasAnnotationError : public ScheduleError { + public: + explicit HasAnnotationError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has annotation"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has annotation"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class HasThreadBindingError : public ScheduleError { + public: + explicit HasThreadBindingError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has thread binding"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has thread binding"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class OuterNotInnerParent : public ScheduleError { + public: + explicit OuterNotInnerParent(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the outer loop is not the parent of the inner loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the outer loop {0} is not the parent of the inner " + "loop {1}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class NotOnlyChildError : public ScheduleError { + public: + explicit NotOnlyChildError(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the inner loop is not the only child of outer loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the inner loop {1} is not the only child of outer " + "loop {0}."; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class LoopNotStartWithZeroError : public ScheduleError { + public: + explicit LoopNotStartWithZeroError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: the primitive only supports loop starting with 0"; + } + + String DetailRenderTemplate() const final { + return "The loop {0} does not start with 0, which is not supported"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class NotSingleInferFactorError : public ScheduleError { + public: + explicit NotSingleInferFactorError(IRModule mod) : mod_(mod) {} + + String FastErrorString() const final { + return "ScheduleError: only one factor can be specified as -1 or none"; + } + + String DetailRenderTemplate() const final { + return "Only one factor can be specified as -1 or none"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {}; } + + IRModule mod_; +}; + +class WrongFactorProductError : public ScheduleError { + public: + explicit WrongFactorProductError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The product of factors is not larger than or equal to the extent of " + "loop"; + } + + String DetailRenderTemplate() const final { + return "The product of factors is not larger than or equal to the extent of loop {0}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, + const Array<PrimExpr>& factors) { + // Invariance + // - The total repeat number has not changed for each direct child block with updating predicate. + // - The execution order has not changed. (The block executes with the same args and the same + // order with before. + // Step 1. Check correctness + GetScopeRootAndCheckStagePipeline(self, loop_sref); + const auto* loop = loop_sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, loop_sref->stmt->GetTypeKey()); + } + if (!loop->annotations.empty()) { + throw HasAnnotationError(self->mod, GetRef<For>(loop)); + } + if (loop->thread_binding.defined()) { + throw HasThreadBindingError(self->mod, GetRef<For>(loop)); + } + // Currently, loops starting with 0 is not supported + arith::Analyzer analyzer; + if (!analyzer.CanProve(loop->min == 0)) { + throw LoopNotStartWithZeroError(self->mod, GetRef<For>(loop)); + } + PrimExpr tot_length = 1; + int infer_index = -1; + for (size_t i = 0; i < factors.size(); i++) { + if (!analyzer.CanProve(factors[i] == -1)) { + tot_length *= factors[i]; + } else { + if (infer_index != -1) { + throw NotSingleInferFactorError(self->mod); + } else { + infer_index = i; + } + } + } + // Step 2. infer factors if needed + Array<PrimExpr> inferred_factors(factors); + if (infer_index != -1) { + inferred_factors.Set(infer_index, + analyzer.Simplify(floordiv(loop->extent + tot_length - 1, tot_length))); + } else { + if (!analyzer.CanProve(tot_length >= loop->extent)) { + throw WrongFactorProductError(self->mod, GetRef<For>(loop)); + } + } + // Step 3. Replace all occurrence of the original loop var with new variables + std::vector<Var> new_loop_vars; + new_loop_vars.reserve(inferred_factors.size()); + for (size_t i = 0; i < inferred_factors.size(); i++) { + new_loop_vars.push_back(loop->loop_var.copy_with_suffix("_" + std::to_string(i))); + } + PrimExpr substitute_value = 0; + for (size_t i = 0; i < inferred_factors.size(); i++) { + substitute_value *= inferred_factors[i]; + substitute_value += new_loop_vars[i]; + } + Map<Block, Block> opaque_block_reuse; + auto substitute_function = [&](const Var& v) -> Optional<PrimExpr> { Review comment: Use a more conventional naming. ```suggestion auto f_substitute = [&](const Var& v) -> Optional<PrimExpr> { ``` ########## File path: src/tir/schedule/primitive/fuse_split.cc ########## @@ -0,0 +1,483 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" +namespace tvm { +namespace tir { + +/*! \brief Append a new predicate to the each children of type BlockRealize (not recursively) */ +class PredicateUpdater : public StmtMutator { + public: + /*! + * \brief Constructor + * \param predicate The predicate to be apppend to BlockRealizeNode + */ + explicit PredicateUpdater(const PrimExpr& predicate, arith::Analyzer* ana) + : predicate_(predicate) { + if (!ana->CanProve(predicate)) { + add_predicate_ = true; + } + } + + private: + // For each direct child of type BlockRealizeNode, append the predicate + Stmt VisitStmt_(const BlockRealizeNode* realize) final { + // We do not recursively do this + if (add_predicate_) { + ObjectPtr<BlockRealizeNode> n = CopyOnWrite(realize); + n->predicate = n->predicate && predicate_; + return BlockRealize(n); + } else { + return GetRef<BlockRealize>(realize); + } + } + + /*! \brief The predicate to be added */ + const PrimExpr& predicate_; + /*! \brief whether to add predicate */ + bool add_predicate_; +}; +/*! \brief Substitute vars and collect the reuse mapping of opaque blocks */ +class IRSubstituteAndCollectOpaqueBlock : public StmtExprMutator { + public: + explicit IRSubstituteAndCollectOpaqueBlock(std::function<Optional<PrimExpr>(const Var&)> vmap, + Map<Block, Block>* opaque_blocks) + : vmap_(vmap), opaque_blocks_(opaque_blocks) {} + + private: + PrimExpr VisitExpr_(const VarNode* op) final { + Var var = GetRef<Var>(op); + Optional<PrimExpr> ret = vmap_(var); + if (ret.defined()) { + return ret.value(); + } else { + return std::move(var); + } + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + Stmt res = StmtMutator::VisitStmt_(op); + if (op->block->iter_vars.empty()) { + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + opaque_blocks_->Set(op->block, realize->block); + } + return res; + } + + /*! \brief The substitute function */ + std::function<Optional<PrimExpr>(const Var&)> vmap_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SubstituteAndCollectOpaqueBlock(Stmt stmt, Map<Block, Block>* opaque_blocks, + std::function<Optional<PrimExpr>(const Var&)> vmap) { + return IRSubstituteAndCollectOpaqueBlock(vmap, opaque_blocks)(std::move(stmt)); +} + +/*! \brief Simplify the binding of block realize and update the opaque block reuse mapping*/ +class BlockRealizeRewriter : public StmtExprMutator { + public: + explicit BlockRealizeRewriter( + const std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual>& loop_map, + Map<Block, Block>* opaque_blocks) + : opaque_blocks_(opaque_blocks) { + loop_map_.insert(loop_map.begin(), loop_map.end()); + } + + private: + Stmt VisitStmt_(const ForNode* op) final { + loop_map_[op->loop_var] = Range::FromMinExtent(op->min, op->extent); + Stmt res = StmtMutator::VisitStmt_(op); + loop_map_.erase(op->loop_var); + return res; + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + // skip opaque block and update mapping + if (op->iter_values.empty()) { + Stmt res = StmtMutator::VisitStmt_(op); + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + for (const std::pair<Block, Block>& entry : *opaque_blocks_) { + if (entry.second.same_as(op->block)) { + opaque_blocks_->Set(entry.first, realize->block); + break; + } + } + return res; + } + auto v = arith::IterMapSimplify(op->iter_values, loop_map_, op->predicate, false); + if (v.same_as(op->iter_values)) { + return GetRef<Stmt>(op); + } else { + auto n = CopyOnWrite(op); + n->iter_values = std::move(v); + return Stmt(n); + } + } + /*! \brief The range of loops */ + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SimplifyBindings(const Stmt& stmt, const Array<StmtSRef>& loops, + Map<Block, Block>* opaque_blocks) { + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map; + for (const StmtSRef& sref : loops) { + const auto* loop = sref->StmtAs<ForNode>(); + loop_map[loop->loop_var] = Range::FromMinExtent(loop->min, loop->extent); + } + BlockRealizeRewriter rewriter(loop_map, opaque_blocks); + return rewriter(stmt); +} + +class NotLoopError : public ScheduleError { + public: + explicit NotLoopError(IRModule mod, String type) : mod_(mod), type_(type) {} + + String FastErrorString() const final { + return "ScheduleError: this primitive only operates on a " + "loop"; + } + + String DetailRenderTemplate() const final { + return "this primitive only operates on a loop, but the StmtSref passed in points to" + "type: {0} "; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {type_}; } + + IRModule mod_; + String type_; +}; + +class HasAnnotationError : public ScheduleError { + public: + explicit HasAnnotationError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has annotation"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has annotation"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class HasThreadBindingError : public ScheduleError { + public: + explicit HasThreadBindingError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has thread binding"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has thread binding"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class OuterNotInnerParent : public ScheduleError { + public: + explicit OuterNotInnerParent(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the outer loop is not the parent of the inner loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the outer loop {0} is not the parent of the inner " + "loop {1}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class NotOnlyChildError : public ScheduleError { + public: + explicit NotOnlyChildError(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the inner loop is not the only child of outer loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the inner loop {1} is not the only child of outer " + "loop {0}."; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class LoopNotStartWithZeroError : public ScheduleError { + public: + explicit LoopNotStartWithZeroError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: the primitive only supports loop starting with 0"; + } + + String DetailRenderTemplate() const final { + return "The loop {0} does not start with 0, which is not supported"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class NotSingleInferFactorError : public ScheduleError { + public: + explicit NotSingleInferFactorError(IRModule mod) : mod_(mod) {} + + String FastErrorString() const final { + return "ScheduleError: only one factor can be specified as -1 or none"; + } + + String DetailRenderTemplate() const final { + return "Only one factor can be specified as -1 or none"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {}; } + + IRModule mod_; +}; + +class WrongFactorProductError : public ScheduleError { + public: + explicit WrongFactorProductError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The product of factors is not larger than or equal to the extent of " + "loop"; + } + + String DetailRenderTemplate() const final { + return "The product of factors is not larger than or equal to the extent of loop {0}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, + const Array<PrimExpr>& factors) { + // Invariance + // - The total repeat number has not changed for each direct child block with updating predicate. + // - The execution order has not changed. (The block executes with the same args and the same + // order with before. + // Step 1. Check correctness + GetScopeRootAndCheckStagePipeline(self, loop_sref); + const auto* loop = loop_sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, loop_sref->stmt->GetTypeKey()); + } + if (!loop->annotations.empty()) { + throw HasAnnotationError(self->mod, GetRef<For>(loop)); + } + if (loop->thread_binding.defined()) { + throw HasThreadBindingError(self->mod, GetRef<For>(loop)); + } + // Currently, loops starting with 0 is not supported + arith::Analyzer analyzer; + if (!analyzer.CanProve(loop->min == 0)) { + throw LoopNotStartWithZeroError(self->mod, GetRef<For>(loop)); + } + PrimExpr tot_length = 1; + int infer_index = -1; + for (size_t i = 0; i < factors.size(); i++) { + if (!analyzer.CanProve(factors[i] == -1)) { + tot_length *= factors[i]; + } else { + if (infer_index != -1) { + throw NotSingleInferFactorError(self->mod); + } else { + infer_index = i; + } + } Review comment: ```suggestion } else if (infer_index != -1) { throw NotSingleInferFactorError(self->mod); } else { infer_index = i; } ``` ########## File path: tests/python/unittest/test_tir_schedule_split_fuse.py ########## @@ -0,0 +1,469 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=missing-function-docstring,missing-module-docstring +import pytest +import tvm +from tvm import tir +from tvm.script import ty + +# pylint: disable=no-member,invalid-name,unused-variable + + [email protected] +def elementwise(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128, 128)) + B = tir.match_buffer(b, (128, 128, 128)) + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_symbolic(a: ty.handle, b: ty.handle, n: ty.int32) -> None: + A = tir.match_buffer(a, (128, 128, n)) + B = tir.match_buffer(b, (128, 128, n)) + for i, j, k in tir.grid(128, 128, n): + with tir.block([128, 128, n], "B") as [vi, vj, vk]: + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_symbolic_fused(a: ty.handle, b: ty.handle, n: ty.int32) -> None: + A = tir.match_buffer(a, (128, 128, n)) + B = tir.match_buffer(b, (128, 128, n)) + for i_j_k_fused in tir.serial(0, (n * 16384)): + with tir.block([128, 128, n], "B") as [vi, vj, vk]: + tir.bind(vi, tir.floordiv(i_j_k_fused, (n * 128))) + tir.bind(vj, tir.floormod(tir.floordiv(i_j_k_fused, n), 128)) + tir.bind(vk, tir.floormod(i_j_k_fused, n)) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_symbolic_split(a: ty.handle, b: ty.handle, n: ty.int32) -> None: + A = tir.match_buffer(a, (128, 128, n)) + B = tir.match_buffer(b, (128, 128, n)) + for i, j, k0, k1 in tir.grid(128, 128, 10, tir.floordiv((n + 9), 10)): + with tir.block([128, 128, n], "B") as [vi, vj, vk]: + tir.where((((k0 * tir.floordiv((n + 9), 10)) + k1) < n)) + tir.bind(vi, i) + tir.bind(vj, j) + tir.bind(vk, ((k0 * tir.floordiv((n + 9), 10)) + k1)) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_with_seq(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128, 128)) + B = tir.match_buffer(b, (128, 128, 128)) + C = tir.alloc_buffer((128, 128, 128)) + for i, j in tir.grid(128, 128): + for k in tir.serial(0, 128): + with tir.block([128, 128, 128], "C") as [vi, vj, vk]: + C[vi, vj, vk] = A[vi, vj, vk] * 2.0 + for k in tir.serial(0, 128): + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + B[vi, vj, vk] = C[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_with_anno(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128, 128)) + B = tir.match_buffer(b, (128, 128, 128)) + for i, j in tir.grid(128, 128): + for k in tir.serial(0, 128, annotations={"useless_annotation": True}): + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + tir.bind(vi, i) + tir.bind(vj, j) + tir.bind(vk, k) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_with_thread_binding(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128, 128)) + B = tir.match_buffer(b, (128, 128, 128)) + for i, j in tir.grid(128, 128): + for k in tir.thread_binding(0, 128, thread="threadIdx.x"): + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + tir.bind(vi, i) + tir.bind(vj, j) + tir.bind(vk, k) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_with_starting_point(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128, 128)) + B = tir.match_buffer(b, (128, 128, 128)) + for i, j in tir.grid(128, 128): + for k in tir.serial(10, 128): + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + tir.bind(vi, i) + tir.bind(vj, j) + tir.bind(vk, k) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_with_opaque_block(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128, 128)) + B = tir.match_buffer(b, (128, 128, 128)) + for i, j, k in tir.grid(128, 128, 128): + with tir.block([], "opaque"): + tir.reads([A[i, j, k]]) + tir.writes([B[i, j, k]]) + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + tir.bind(vi, i) + tir.bind(vj, j) + tir.bind(vk, k) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_fused(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (128, 128, 128)) + B = tir.match_buffer(b, (128, 128, 128)) + for fused in tir.serial(0, 2097152): + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + tir.bind(vi, tir.floordiv(fused, 16384)) + tir.bind(vj, tir.floormod(tir.floordiv(fused, 128), 128)) + tir.bind(vk, tir.floormod(fused, 128)) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_split_case0(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128, 128]) + B = tir.match_buffer(b, [128, 128, 128]) + for i1, i2, i3, j1, j2, k1, k2 in tir.grid(2, 1, 64, 4, 32, 16, 8): + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + tir.bind(vi, ((i1 * 64) + i3)) + tir.bind(vj, ((j1 * 32) + j2)) + tir.bind(vk, ((k1 * 8) + k2)) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_split_case1(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, [128, 128, 128]) + B = tir.match_buffer(b, [128, 128, 128]) + for i1, i2, i3, j1, j2, j3, k1, k2, k3 in tir.grid(2, 1, 64, 2, 1, 64, 2, 1, 64): + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + tir.bind(vi, i1 * 64 + i3) + tir.bind(vj, j1 * 64 + j3) + tir.bind(vk, k1 * 64 + k3) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_split_with_predicate(a: ty.handle, b: ty.handle) -> None: + B = tir.match_buffer(b, [128, 128, 128]) + A = tir.match_buffer(a, [128, 128, 128]) + for i0, i1, i2, j0, j1, k0, k1 in tir.grid(1000, 2, 3, 1, 129, 3, 43): + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + tir.where( + ( + ((((((i0 * 2) + i1) * 3) + i2) < 128) and (((j0 * 129) + j1) < 128)) + and (((k0 * 43) + k1) < 128) + ) + ) + tir.bind(vi, (((i0 * 6) + (i1 * 3)) + i2)) + tir.bind(vj, j1) + tir.bind(vk, ((k0 * 43) + k1)) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_fuse_with_opaque_block(a: ty.handle, b: ty.handle) -> None: + B = tir.match_buffer(b, [128, 128, 128]) + A = tir.match_buffer(a, [128, 128, 128]) + for i_j_k_fused in tir.serial(0, 2097152): + with tir.block([], "opaque"): + tir.reads( + [ + A[ + tir.floormod(tir.floordiv(tir.floordiv(i_j_k_fused, 128), 128), 128), + tir.floormod(tir.floordiv(i_j_k_fused, 128), 128), + tir.floormod(i_j_k_fused, 128), + ] + ] + ) + tir.writes( + [ + B[ + tir.floormod(tir.floordiv(tir.floordiv(i_j_k_fused, 128), 128), 128), + tir.floormod(tir.floordiv(i_j_k_fused, 128), 128), + tir.floormod(i_j_k_fused, 128), + ] + ] + ) + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + tir.bind(vi, tir.floordiv(i_j_k_fused, 16384)) + tir.bind(vj, tir.floormod(tir.floordiv(i_j_k_fused, 128), 128)) + tir.bind(vk, tir.floormod(i_j_k_fused, 128)) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def elementwise_split_with_opaque_block(a: ty.handle, b: ty.handle) -> None: + B = tir.match_buffer(b, [128, 128, 128]) + A = tir.match_buffer(a, [128, 128, 128]) + + for i0, i1, j, k in tir.grid(8, 16, 128, 128): + with tir.block([], "opaque"): + tir.reads([A[i0 * 16 + i1, j, k]]) + tir.writes([B[i0 * 16 + i1, j, k]]) + with tir.block([128, 128, 128], "B") as [vi, vj, vk]: + tir.bind(vi, i0 * 16 + i1) + tir.bind(vj, j) + tir.bind(vk, k) + tir.reads([A[vi, vj, vk]]) + tir.writes([B[vi, vj, vk]]) + B[vi, vj, vk] = A[vi, vj, vk] * 2.0 + + [email protected] +def opaque_access(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, [16, 16], "float32") + B = tir.match_buffer(b, [16, 16], "float32") + with tir.block([16, 16], "A") as [vi, vj]: + tir.reads([]) + tir.writes([A[0:16, 0:16]]) + tir.store(A.data, vi * 16 + vj, 1) + with tir.block([16, 16], "B") as [vi, vj]: + tir.reads([]) + tir.writes([B[0:16, 0:16]]) + tir.evaluate(tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, vi * 16 + vj, dtype="handle")) + + [email protected] +def opaque_access_fused(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, [16, 16]) + B = tir.match_buffer(b, [16, 16]) + for i_j_fused in tir.serial(0, 256): + with tir.block([16, 16], "A") as [vi, vj]: + tir.bind(vi, tir.floordiv(i_j_fused, 16)) + tir.bind(vj, tir.floormod(i_j_fused, 16)) + tir.reads([]) + tir.writes([A[0:16, 0:16]]) + tir.store(A.data, ((vi * 16) + vj), 1, 1) + for i_j_fused in tir.serial(0, 256): + with tir.block([16, 16], "B") as [vi, vj]: + tir.bind(vi, tir.floordiv(i_j_fused, 16)) + tir.bind(vj, tir.floormod(i_j_fused, 16)) + tir.reads([]) + tir.writes([B[0:16, 0:16]]) + tir.evaluate( + tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, ((vi * 16) + vj), dtype="handle") + ) + + [email protected] +def opaque_access_split(a: ty.handle, b: ty.handle) -> None: + A = tir.match_buffer(a, (16, 16)) + B = tir.match_buffer(b, (16, 16)) + for i, j0, j1 in tir.grid(16, 4, 4): + with tir.block([16, 16], "A") as [vi, vj]: + tir.bind(vi, i) + tir.bind(vj, ((j0 * 4) + j1)) + tir.reads([]) + tir.writes([A[0:16, 0:16]]) + tir.store(A.data, ((vi * 16) + vj), 1, 1) + for i, j0, j1 in tir.grid(16, 4, 4): + with tir.block([16, 16], "B") as [vi, vj]: + tir.bind(vi, i) + tir.bind(vj, ((j0 * 4) + j1)) + tir.reads([]) + tir.writes([B[0:16, 0:16]]) + tir.evaluate( + tir.tvm_fill_fragment(B.data, 16, 16, 16, 0, ((vi * 16) + vj), dtype="handle") + ) + + +# pylint: enable=no-member,invalid-name,unused-variable + + +def test_fuse(): + sch = tir.Schedule(elementwise, debug_mode=True) + block_b = sch.get_block("B") + i, j, k = sch.get_loops(block_b) + sch.fuse(i, j, k) + assert sch.state._get_cached_flags(sch.get_sref(block_b)).stage_pipeline + tvm.ir.assert_structural_equal(elementwise_fused, sch.mod["main"]) + + +def test_split(): + sch = tir.Schedule(elementwise, debug_mode=True) + block_b = sch.get_block("B") + i, j, k = sch.get_loops(block_b) + sch.split(i, factors=[2, 1, 64]) + sch.split(j, factors=[4, 32]) + sch.split(k, factors=[16, 8]) + assert sch.state._get_cached_flags(sch.get_sref(block_b)).stage_pipeline + tvm.ir.assert_structural_equal(elementwise_split_case0, sch.mod["main"]) + + +def test_split_with_inferred_factor(): + sch = tir.Schedule(elementwise, debug_mode=True) + block_b = sch.get_block("B") + i, j, k = sch.get_loops(block_b) + sch.split(i, factors=[None, 1, 64]) + sch.split(j, factors=[2, None, 64]) + sch.split(k, factors=[2, 1, -1]) + tvm.ir.assert_structural_equal(elementwise_split_case1, sch.mod["main"]) + + +def test_split_with_predicate(): + sch = tir.Schedule(elementwise, debug_mode=True) + block_b = sch.get_block("B") + i, j, k = sch.get_loops(block_b) + sch.split(i, factors=[1000, 2, 3]) + sch.split(j, factors=[None, 129]) + sch.split(k, factors=[3, None]) + assert sch.state._get_cached_flags(sch.get_sref(block_b)).stage_pipeline + tvm.ir.assert_structural_equal(elementwise_split_with_predicate, sch.mod["main"]) + + +def test_fuse_fail_not_only_child(): + sch = tir.Schedule(elementwise_with_seq, debug_mode=True) + block_b = sch.get_block("B") + i, j, k = sch.get_loops(block_b) + with pytest.raises(tvm.tir.ScheduleError): + sch.fuse(j, k) + + +def test_fuse_split_fail_with_annotation(): + sch = tir.Schedule(elementwise_with_anno, debug_mode=True) + block_b = sch.get_block("B") + i, j, k = sch.get_loops(block_b) + with pytest.raises(tvm.tir.ScheduleError): + sch.fuse(j, k) + with pytest.raises(tvm.tir.ScheduleError): + sch.split(k, factors=[None, 10]) + + +def test_fuse_split_fail_not_start_with_zero(): + sch = tir.Schedule(elementwise_with_anno, debug_mode=True) + block_b = sch.get_block("B") + i, j, k = sch.get_loops(block_b) + with pytest.raises(tvm.tir.ScheduleError): + sch.fuse(j, k) + with pytest.raises(tvm.tir.ScheduleError): + sch.split(k, factors=[None, 10]) + + +def test_fuse_with_opaque_block(): + sch = tir.Schedule(elementwise_with_opaque_block, debug_mode=True) + block_opaque = sch.get_block("opaque") + i, j, k = sch.get_loops(block_opaque) + sch.fuse(i, j, k) + tvm.ir.assert_structural_equal(elementwise_fuse_with_opaque_block, sch.mod["main"]) + + +def test_fuse_with_opaque_access(): + sch = tir.Schedule(opaque_access, debug_mode=True) + block_a = sch.get_block("A") + i, j = sch.get_loops(block_a) + sch.fuse(i, j) + block_b = sch.get_block("B") + i, j = sch.get_loops(block_b) + sch.fuse(i, j) + tvm.ir.assert_structural_equal(opaque_access_fused, sch.mod["main"]) + + +def test_split_with_opaque_block(): + sch = tir.Schedule(elementwise_with_opaque_block, debug_mode=True) + block_opaque = sch.get_block("opaque") + i, j, k = sch.get_loops(block_opaque) + sch.split(i, factors=[None, 16]) + tvm.ir.assert_structural_equal(elementwise_split_with_opaque_block, sch.mod["main"]) + + +def test_split_with_opaque_access(): + sch = tir.Schedule(opaque_access, debug_mode=True) + block_a = sch.get_block("A") + i, j = sch.get_loops(block_a) + sch.split(j, factors=[None, 4]) + block_b = sch.get_block("B") + i, j = sch.get_loops(block_b) + sch.split(j, factors=[None, 4]) + tvm.ir.assert_structural_equal(opaque_access_split, sch.mod["main"]) + + +def test_fuse_split_fail_with_thread_binding(): + sch = tir.Schedule(elementwise_with_thread_binding, debug_mode=True) + block_b = sch.get_block("B") + i, j, k = sch.get_loops(block_b) + with pytest.raises(tvm.tir.ScheduleError): + sch.fuse(j, k) + with pytest.raises(tvm.tir.ScheduleError): + sch.split(k, factors=[None, 10]) + + +def test_fuse_symbolic(): + sch = tir.Schedule(elementwise_symbolic, debug_mode=True) + block_b = sch.get_block("B") + i, j, k = sch.get_loops(block_b) + sch.fuse(i, j, k) + tvm.ir.assert_structural_equal(elementwise_symbolic_fused, sch.mod["main"]) + + +def test_split_symbolic(): + sch = tir.Schedule(elementwise_symbolic, debug_mode=True) + block_b = sch.get_block("B") + i, j, k = sch.get_loops(block_b) + sch.split(k, factors=[10, None]) + tvm.ir.assert_structural_equal(elementwise_symbolic_split, sch.mod["main"]) + + +if __name__ == "__main__": + test_fuse() + test_fuse_with_opaque_block() + test_fuse_with_opaque_access() + test_fuse_symbolic() + test_split() + test_split_with_inferred_factor() + test_split_with_opaque_block() + test_split_with_opaque_access() + test_split_with_predicate() + test_split_symbolic() + test_fuse_fail_not_only_child() + test_fuse_split_fail_with_annotation() + test_fuse_split_fail_not_start_with_zero() + test_fuse_split_fail_with_thread_binding() Review comment: It would be better if you could reorder the tests above according to this order. ########## File path: src/tir/schedule/primitive/fuse_split.cc ########## @@ -0,0 +1,483 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" +namespace tvm { +namespace tir { + +/*! \brief Append a new predicate to the each children of type BlockRealize (not recursively) */ +class PredicateUpdater : public StmtMutator { + public: + /*! + * \brief Constructor + * \param predicate The predicate to be apppend to BlockRealizeNode + */ + explicit PredicateUpdater(const PrimExpr& predicate, arith::Analyzer* ana) + : predicate_(predicate) { + if (!ana->CanProve(predicate)) { + add_predicate_ = true; + } + } + + private: + // For each direct child of type BlockRealizeNode, append the predicate + Stmt VisitStmt_(const BlockRealizeNode* realize) final { + // We do not recursively do this + if (add_predicate_) { + ObjectPtr<BlockRealizeNode> n = CopyOnWrite(realize); + n->predicate = n->predicate && predicate_; + return BlockRealize(n); + } else { + return GetRef<BlockRealize>(realize); + } + } + + /*! \brief The predicate to be added */ + const PrimExpr& predicate_; + /*! \brief whether to add predicate */ + bool add_predicate_; +}; +/*! \brief Substitute vars and collect the reuse mapping of opaque blocks */ +class IRSubstituteAndCollectOpaqueBlock : public StmtExprMutator { + public: + explicit IRSubstituteAndCollectOpaqueBlock(std::function<Optional<PrimExpr>(const Var&)> vmap, + Map<Block, Block>* opaque_blocks) + : vmap_(vmap), opaque_blocks_(opaque_blocks) {} + + private: + PrimExpr VisitExpr_(const VarNode* op) final { + Var var = GetRef<Var>(op); + Optional<PrimExpr> ret = vmap_(var); + if (ret.defined()) { + return ret.value(); + } else { + return std::move(var); + } + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + Stmt res = StmtMutator::VisitStmt_(op); + if (op->block->iter_vars.empty()) { + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + opaque_blocks_->Set(op->block, realize->block); + } + return res; + } + + /*! \brief The substitute function */ + std::function<Optional<PrimExpr>(const Var&)> vmap_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SubstituteAndCollectOpaqueBlock(Stmt stmt, Map<Block, Block>* opaque_blocks, + std::function<Optional<PrimExpr>(const Var&)> vmap) { + return IRSubstituteAndCollectOpaqueBlock(vmap, opaque_blocks)(std::move(stmt)); +} + +/*! \brief Simplify the binding of block realize and update the opaque block reuse mapping*/ +class BlockRealizeRewriter : public StmtExprMutator { + public: + explicit BlockRealizeRewriter( + const std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual>& loop_map, + Map<Block, Block>* opaque_blocks) + : opaque_blocks_(opaque_blocks) { + loop_map_.insert(loop_map.begin(), loop_map.end()); + } + + private: + Stmt VisitStmt_(const ForNode* op) final { + loop_map_[op->loop_var] = Range::FromMinExtent(op->min, op->extent); + Stmt res = StmtMutator::VisitStmt_(op); + loop_map_.erase(op->loop_var); + return res; + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + // skip opaque block and update mapping + if (op->iter_values.empty()) { + Stmt res = StmtMutator::VisitStmt_(op); + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + for (const std::pair<Block, Block>& entry : *opaque_blocks_) { + if (entry.second.same_as(op->block)) { + opaque_blocks_->Set(entry.first, realize->block); + break; + } + } + return res; + } + auto v = arith::IterMapSimplify(op->iter_values, loop_map_, op->predicate, false); + if (v.same_as(op->iter_values)) { + return GetRef<Stmt>(op); + } else { + auto n = CopyOnWrite(op); + n->iter_values = std::move(v); + return Stmt(n); + } + } + /*! \brief The range of loops */ + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SimplifyBindings(const Stmt& stmt, const Array<StmtSRef>& loops, + Map<Block, Block>* opaque_blocks) { + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map; + for (const StmtSRef& sref : loops) { + const auto* loop = sref->StmtAs<ForNode>(); + loop_map[loop->loop_var] = Range::FromMinExtent(loop->min, loop->extent); + } + BlockRealizeRewriter rewriter(loop_map, opaque_blocks); + return rewriter(stmt); +} + +class NotLoopError : public ScheduleError { + public: + explicit NotLoopError(IRModule mod, String type) : mod_(mod), type_(type) {} + + String FastErrorString() const final { + return "ScheduleError: this primitive only operates on a " + "loop"; + } + + String DetailRenderTemplate() const final { + return "this primitive only operates on a loop, but the StmtSref passed in points to" + "type: {0} "; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {type_}; } + + IRModule mod_; + String type_; +}; + +class HasAnnotationError : public ScheduleError { + public: + explicit HasAnnotationError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has annotation"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has annotation"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class HasThreadBindingError : public ScheduleError { + public: + explicit HasThreadBindingError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has thread binding"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has thread binding"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class OuterNotInnerParent : public ScheduleError { + public: + explicit OuterNotInnerParent(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the outer loop is not the parent of the inner loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the outer loop {0} is not the parent of the inner " + "loop {1}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class NotOnlyChildError : public ScheduleError { + public: + explicit NotOnlyChildError(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the inner loop is not the only child of outer loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the inner loop {1} is not the only child of outer " + "loop {0}."; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class LoopNotStartWithZeroError : public ScheduleError { + public: + explicit LoopNotStartWithZeroError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: the primitive only supports loop starting with 0"; + } + + String DetailRenderTemplate() const final { + return "The loop {0} does not start with 0, which is not supported"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class NotSingleInferFactorError : public ScheduleError { + public: + explicit NotSingleInferFactorError(IRModule mod) : mod_(mod) {} + + String FastErrorString() const final { + return "ScheduleError: only one factor can be specified as -1 or none"; + } + + String DetailRenderTemplate() const final { + return "Only one factor can be specified as -1 or none"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {}; } + + IRModule mod_; +}; + +class WrongFactorProductError : public ScheduleError { + public: + explicit WrongFactorProductError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The product of factors is not larger than or equal to the extent of " + "loop"; + } + + String DetailRenderTemplate() const final { + return "The product of factors is not larger than or equal to the extent of loop {0}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, + const Array<PrimExpr>& factors) { + // Invariance + // - The total repeat number has not changed for each direct child block with updating predicate. + // - The execution order has not changed. (The block executes with the same args and the same + // order with before. + // Step 1. Check correctness + GetScopeRootAndCheckStagePipeline(self, loop_sref); + const auto* loop = loop_sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, loop_sref->stmt->GetTypeKey()); + } + if (!loop->annotations.empty()) { + throw HasAnnotationError(self->mod, GetRef<For>(loop)); + } + if (loop->thread_binding.defined()) { + throw HasThreadBindingError(self->mod, GetRef<For>(loop)); + } + // Currently, loops starting with 0 is not supported + arith::Analyzer analyzer; + if (!analyzer.CanProve(loop->min == 0)) { + throw LoopNotStartWithZeroError(self->mod, GetRef<For>(loop)); + } + PrimExpr tot_length = 1; + int infer_index = -1; + for (size_t i = 0; i < factors.size(); i++) { Review comment: `++i` is more preferred. ```suggestion for (size_t i = 0; i < factors.size(); ++i) { ``` ########## File path: src/tir/schedule/primitive/fuse_split.cc ########## @@ -0,0 +1,483 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" +namespace tvm { +namespace tir { + +/*! \brief Append a new predicate to the each children of type BlockRealize (not recursively) */ +class PredicateUpdater : public StmtMutator { + public: + /*! + * \brief Constructor + * \param predicate The predicate to be apppend to BlockRealizeNode + */ + explicit PredicateUpdater(const PrimExpr& predicate, arith::Analyzer* ana) + : predicate_(predicate) { + if (!ana->CanProve(predicate)) { + add_predicate_ = true; + } + } + + private: + // For each direct child of type BlockRealizeNode, append the predicate + Stmt VisitStmt_(const BlockRealizeNode* realize) final { + // We do not recursively do this + if (add_predicate_) { + ObjectPtr<BlockRealizeNode> n = CopyOnWrite(realize); + n->predicate = n->predicate && predicate_; + return BlockRealize(n); + } else { + return GetRef<BlockRealize>(realize); + } + } + + /*! \brief The predicate to be added */ + const PrimExpr& predicate_; + /*! \brief whether to add predicate */ + bool add_predicate_; +}; +/*! \brief Substitute vars and collect the reuse mapping of opaque blocks */ +class IRSubstituteAndCollectOpaqueBlock : public StmtExprMutator { + public: + explicit IRSubstituteAndCollectOpaqueBlock(std::function<Optional<PrimExpr>(const Var&)> vmap, + Map<Block, Block>* opaque_blocks) + : vmap_(vmap), opaque_blocks_(opaque_blocks) {} + + private: + PrimExpr VisitExpr_(const VarNode* op) final { + Var var = GetRef<Var>(op); + Optional<PrimExpr> ret = vmap_(var); + if (ret.defined()) { + return ret.value(); + } else { + return std::move(var); + } + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + Stmt res = StmtMutator::VisitStmt_(op); + if (op->block->iter_vars.empty()) { + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + opaque_blocks_->Set(op->block, realize->block); + } + return res; + } + + /*! \brief The substitute function */ + std::function<Optional<PrimExpr>(const Var&)> vmap_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SubstituteAndCollectOpaqueBlock(Stmt stmt, Map<Block, Block>* opaque_blocks, + std::function<Optional<PrimExpr>(const Var&)> vmap) { + return IRSubstituteAndCollectOpaqueBlock(vmap, opaque_blocks)(std::move(stmt)); +} + +/*! \brief Simplify the binding of block realize and update the opaque block reuse mapping*/ +class BlockRealizeRewriter : public StmtExprMutator { + public: + explicit BlockRealizeRewriter( + const std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual>& loop_map, + Map<Block, Block>* opaque_blocks) + : opaque_blocks_(opaque_blocks) { + loop_map_.insert(loop_map.begin(), loop_map.end()); + } + + private: + Stmt VisitStmt_(const ForNode* op) final { + loop_map_[op->loop_var] = Range::FromMinExtent(op->min, op->extent); + Stmt res = StmtMutator::VisitStmt_(op); + loop_map_.erase(op->loop_var); + return res; + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + // skip opaque block and update mapping + if (op->iter_values.empty()) { + Stmt res = StmtMutator::VisitStmt_(op); + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + for (const std::pair<Block, Block>& entry : *opaque_blocks_) { + if (entry.second.same_as(op->block)) { + opaque_blocks_->Set(entry.first, realize->block); + break; + } + } + return res; + } + auto v = arith::IterMapSimplify(op->iter_values, loop_map_, op->predicate, false); + if (v.same_as(op->iter_values)) { + return GetRef<Stmt>(op); + } else { + auto n = CopyOnWrite(op); + n->iter_values = std::move(v); + return Stmt(n); + } + } + /*! \brief The range of loops */ + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SimplifyBindings(const Stmt& stmt, const Array<StmtSRef>& loops, + Map<Block, Block>* opaque_blocks) { + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map; + for (const StmtSRef& sref : loops) { + const auto* loop = sref->StmtAs<ForNode>(); + loop_map[loop->loop_var] = Range::FromMinExtent(loop->min, loop->extent); + } + BlockRealizeRewriter rewriter(loop_map, opaque_blocks); + return rewriter(stmt); +} + +class NotLoopError : public ScheduleError { + public: + explicit NotLoopError(IRModule mod, String type) : mod_(mod), type_(type) {} + + String FastErrorString() const final { + return "ScheduleError: this primitive only operates on a " + "loop"; + } + + String DetailRenderTemplate() const final { + return "this primitive only operates on a loop, but the StmtSref passed in points to" + "type: {0} "; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {type_}; } + + IRModule mod_; + String type_; +}; + +class HasAnnotationError : public ScheduleError { + public: + explicit HasAnnotationError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has annotation"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has annotation"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class HasThreadBindingError : public ScheduleError { + public: + explicit HasThreadBindingError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has thread binding"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has thread binding"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class OuterNotInnerParent : public ScheduleError { + public: + explicit OuterNotInnerParent(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the outer loop is not the parent of the inner loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the outer loop {0} is not the parent of the inner " + "loop {1}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class NotOnlyChildError : public ScheduleError { + public: + explicit NotOnlyChildError(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the inner loop is not the only child of outer loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the inner loop {1} is not the only child of outer " + "loop {0}."; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class LoopNotStartWithZeroError : public ScheduleError { + public: + explicit LoopNotStartWithZeroError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: the primitive only supports loop starting with 0"; + } + + String DetailRenderTemplate() const final { + return "The loop {0} does not start with 0, which is not supported"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class NotSingleInferFactorError : public ScheduleError { + public: + explicit NotSingleInferFactorError(IRModule mod) : mod_(mod) {} + + String FastErrorString() const final { + return "ScheduleError: only one factor can be specified as -1 or none"; + } + + String DetailRenderTemplate() const final { + return "Only one factor can be specified as -1 or none"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {}; } + + IRModule mod_; +}; + +class WrongFactorProductError : public ScheduleError { + public: + explicit WrongFactorProductError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The product of factors is not larger than or equal to the extent of " + "loop"; + } + + String DetailRenderTemplate() const final { + return "The product of factors is not larger than or equal to the extent of loop {0}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, + const Array<PrimExpr>& factors) { + // Invariance + // - The total repeat number has not changed for each direct child block with updating predicate. + // - The execution order has not changed. (The block executes with the same args and the same + // order with before. + // Step 1. Check correctness + GetScopeRootAndCheckStagePipeline(self, loop_sref); + const auto* loop = loop_sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, loop_sref->stmt->GetTypeKey()); + } + if (!loop->annotations.empty()) { + throw HasAnnotationError(self->mod, GetRef<For>(loop)); + } + if (loop->thread_binding.defined()) { + throw HasThreadBindingError(self->mod, GetRef<For>(loop)); + } + // Currently, loops starting with 0 is not supported + arith::Analyzer analyzer; + if (!analyzer.CanProve(loop->min == 0)) { + throw LoopNotStartWithZeroError(self->mod, GetRef<For>(loop)); + } + PrimExpr tot_length = 1; + int infer_index = -1; + for (size_t i = 0; i < factors.size(); i++) { + if (!analyzer.CanProve(factors[i] == -1)) { + tot_length *= factors[i]; + } else { + if (infer_index != -1) { + throw NotSingleInferFactorError(self->mod); + } else { + infer_index = i; + } + } + } + // Step 2. infer factors if needed + Array<PrimExpr> inferred_factors(factors); + if (infer_index != -1) { + inferred_factors.Set(infer_index, + analyzer.Simplify(floordiv(loop->extent + tot_length - 1, tot_length))); + } else { + if (!analyzer.CanProve(tot_length >= loop->extent)) { + throw WrongFactorProductError(self->mod, GetRef<For>(loop)); + } + } + // Step 3. Replace all occurrence of the original loop var with new variables Review comment: ```suggestion // Step 3. Replace all occurrences of the original loop var with new variables ``` ########## File path: src/tir/schedule/primitive/fuse_split.cc ########## @@ -0,0 +1,483 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" +namespace tvm { +namespace tir { + +/*! \brief Append a new predicate to the each children of type BlockRealize (not recursively) */ +class PredicateUpdater : public StmtMutator { + public: + /*! + * \brief Constructor + * \param predicate The predicate to be apppend to BlockRealizeNode + */ + explicit PredicateUpdater(const PrimExpr& predicate, arith::Analyzer* ana) + : predicate_(predicate) { + if (!ana->CanProve(predicate)) { + add_predicate_ = true; + } + } + + private: + // For each direct child of type BlockRealizeNode, append the predicate + Stmt VisitStmt_(const BlockRealizeNode* realize) final { + // We do not recursively do this + if (add_predicate_) { + ObjectPtr<BlockRealizeNode> n = CopyOnWrite(realize); + n->predicate = n->predicate && predicate_; + return BlockRealize(n); + } else { + return GetRef<BlockRealize>(realize); + } + } + + /*! \brief The predicate to be added */ + const PrimExpr& predicate_; + /*! \brief whether to add predicate */ + bool add_predicate_; +}; +/*! \brief Substitute vars and collect the reuse mapping of opaque blocks */ +class IRSubstituteAndCollectOpaqueBlock : public StmtExprMutator { + public: + explicit IRSubstituteAndCollectOpaqueBlock(std::function<Optional<PrimExpr>(const Var&)> vmap, + Map<Block, Block>* opaque_blocks) + : vmap_(vmap), opaque_blocks_(opaque_blocks) {} + + private: + PrimExpr VisitExpr_(const VarNode* op) final { + Var var = GetRef<Var>(op); + Optional<PrimExpr> ret = vmap_(var); + if (ret.defined()) { + return ret.value(); + } else { + return std::move(var); + } + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + Stmt res = StmtMutator::VisitStmt_(op); + if (op->block->iter_vars.empty()) { + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + opaque_blocks_->Set(op->block, realize->block); + } + return res; + } + + /*! \brief The substitute function */ + std::function<Optional<PrimExpr>(const Var&)> vmap_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SubstituteAndCollectOpaqueBlock(Stmt stmt, Map<Block, Block>* opaque_blocks, + std::function<Optional<PrimExpr>(const Var&)> vmap) { + return IRSubstituteAndCollectOpaqueBlock(vmap, opaque_blocks)(std::move(stmt)); +} + +/*! \brief Simplify the binding of block realize and update the opaque block reuse mapping*/ +class BlockRealizeRewriter : public StmtExprMutator { + public: + explicit BlockRealizeRewriter( + const std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual>& loop_map, + Map<Block, Block>* opaque_blocks) + : opaque_blocks_(opaque_blocks) { + loop_map_.insert(loop_map.begin(), loop_map.end()); + } + + private: + Stmt VisitStmt_(const ForNode* op) final { + loop_map_[op->loop_var] = Range::FromMinExtent(op->min, op->extent); + Stmt res = StmtMutator::VisitStmt_(op); + loop_map_.erase(op->loop_var); + return res; + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + // skip opaque block and update mapping + if (op->iter_values.empty()) { + Stmt res = StmtMutator::VisitStmt_(op); + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + for (const std::pair<Block, Block>& entry : *opaque_blocks_) { + if (entry.second.same_as(op->block)) { + opaque_blocks_->Set(entry.first, realize->block); + break; + } + } + return res; + } + auto v = arith::IterMapSimplify(op->iter_values, loop_map_, op->predicate, false); + if (v.same_as(op->iter_values)) { + return GetRef<Stmt>(op); + } else { + auto n = CopyOnWrite(op); + n->iter_values = std::move(v); + return Stmt(n); + } + } + /*! \brief The range of loops */ + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SimplifyBindings(const Stmt& stmt, const Array<StmtSRef>& loops, + Map<Block, Block>* opaque_blocks) { + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map; + for (const StmtSRef& sref : loops) { + const auto* loop = sref->StmtAs<ForNode>(); + loop_map[loop->loop_var] = Range::FromMinExtent(loop->min, loop->extent); + } + BlockRealizeRewriter rewriter(loop_map, opaque_blocks); + return rewriter(stmt); +} + +class NotLoopError : public ScheduleError { + public: + explicit NotLoopError(IRModule mod, String type) : mod_(mod), type_(type) {} + + String FastErrorString() const final { + return "ScheduleError: this primitive only operates on a " + "loop"; + } + + String DetailRenderTemplate() const final { + return "this primitive only operates on a loop, but the StmtSref passed in points to" + "type: {0} "; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {type_}; } + + IRModule mod_; + String type_; +}; + +class HasAnnotationError : public ScheduleError { + public: + explicit HasAnnotationError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has annotation"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has annotation"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class HasThreadBindingError : public ScheduleError { + public: + explicit HasThreadBindingError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has thread binding"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has thread binding"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class OuterNotInnerParent : public ScheduleError { + public: + explicit OuterNotInnerParent(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the outer loop is not the parent of the inner loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the outer loop {0} is not the parent of the inner " + "loop {1}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class NotOnlyChildError : public ScheduleError { + public: + explicit NotOnlyChildError(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the inner loop is not the only child of outer loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the inner loop {1} is not the only child of outer " + "loop {0}."; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class LoopNotStartWithZeroError : public ScheduleError { + public: + explicit LoopNotStartWithZeroError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: the primitive only supports loop starting with 0"; + } + + String DetailRenderTemplate() const final { + return "The loop {0} does not start with 0, which is not supported"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class NotSingleInferFactorError : public ScheduleError { + public: + explicit NotSingleInferFactorError(IRModule mod) : mod_(mod) {} + + String FastErrorString() const final { + return "ScheduleError: only one factor can be specified as -1 or none"; + } + + String DetailRenderTemplate() const final { + return "Only one factor can be specified as -1 or none"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {}; } + + IRModule mod_; +}; + +class WrongFactorProductError : public ScheduleError { + public: + explicit WrongFactorProductError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The product of factors is not larger than or equal to the extent of " + "loop"; + } + + String DetailRenderTemplate() const final { + return "The product of factors is not larger than or equal to the extent of loop {0}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, + const Array<PrimExpr>& factors) { + // Invariance + // - The total repeat number has not changed for each direct child block with updating predicate. + // - The execution order has not changed. (The block executes with the same args and the same + // order with before. + // Step 1. Check correctness + GetScopeRootAndCheckStagePipeline(self, loop_sref); + const auto* loop = loop_sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, loop_sref->stmt->GetTypeKey()); + } Review comment: This suggestion also works for line 417-419. ########## File path: src/tir/schedule/primitive/fuse_split.cc ########## @@ -0,0 +1,483 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" +namespace tvm { +namespace tir { + +/*! \brief Append a new predicate to the each children of type BlockRealize (not recursively) */ +class PredicateUpdater : public StmtMutator { + public: + /*! + * \brief Constructor + * \param predicate The predicate to be apppend to BlockRealizeNode + */ + explicit PredicateUpdater(const PrimExpr& predicate, arith::Analyzer* ana) + : predicate_(predicate) { + if (!ana->CanProve(predicate)) { + add_predicate_ = true; + } + } + + private: + // For each direct child of type BlockRealizeNode, append the predicate + Stmt VisitStmt_(const BlockRealizeNode* realize) final { + // We do not recursively do this + if (add_predicate_) { + ObjectPtr<BlockRealizeNode> n = CopyOnWrite(realize); + n->predicate = n->predicate && predicate_; + return BlockRealize(n); + } else { + return GetRef<BlockRealize>(realize); + } + } + + /*! \brief The predicate to be added */ + const PrimExpr& predicate_; + /*! \brief whether to add predicate */ + bool add_predicate_; +}; +/*! \brief Substitute vars and collect the reuse mapping of opaque blocks */ +class IRSubstituteAndCollectOpaqueBlock : public StmtExprMutator { + public: + explicit IRSubstituteAndCollectOpaqueBlock(std::function<Optional<PrimExpr>(const Var&)> vmap, + Map<Block, Block>* opaque_blocks) + : vmap_(vmap), opaque_blocks_(opaque_blocks) {} + + private: + PrimExpr VisitExpr_(const VarNode* op) final { + Var var = GetRef<Var>(op); + Optional<PrimExpr> ret = vmap_(var); + if (ret.defined()) { + return ret.value(); + } else { + return std::move(var); + } + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + Stmt res = StmtMutator::VisitStmt_(op); + if (op->block->iter_vars.empty()) { + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + opaque_blocks_->Set(op->block, realize->block); + } + return res; + } + + /*! \brief The substitute function */ + std::function<Optional<PrimExpr>(const Var&)> vmap_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SubstituteAndCollectOpaqueBlock(Stmt stmt, Map<Block, Block>* opaque_blocks, + std::function<Optional<PrimExpr>(const Var&)> vmap) { + return IRSubstituteAndCollectOpaqueBlock(vmap, opaque_blocks)(std::move(stmt)); +} + +/*! \brief Simplify the binding of block realize and update the opaque block reuse mapping*/ +class BlockRealizeRewriter : public StmtExprMutator { + public: + explicit BlockRealizeRewriter( + const std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual>& loop_map, + Map<Block, Block>* opaque_blocks) + : opaque_blocks_(opaque_blocks) { + loop_map_.insert(loop_map.begin(), loop_map.end()); + } + + private: + Stmt VisitStmt_(const ForNode* op) final { + loop_map_[op->loop_var] = Range::FromMinExtent(op->min, op->extent); + Stmt res = StmtMutator::VisitStmt_(op); + loop_map_.erase(op->loop_var); + return res; + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + // skip opaque block and update mapping + if (op->iter_values.empty()) { + Stmt res = StmtMutator::VisitStmt_(op); + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + for (const std::pair<Block, Block>& entry : *opaque_blocks_) { + if (entry.second.same_as(op->block)) { + opaque_blocks_->Set(entry.first, realize->block); + break; + } + } + return res; + } + auto v = arith::IterMapSimplify(op->iter_values, loop_map_, op->predicate, false); + if (v.same_as(op->iter_values)) { + return GetRef<Stmt>(op); + } else { + auto n = CopyOnWrite(op); + n->iter_values = std::move(v); + return Stmt(n); + } + } + /*! \brief The range of loops */ + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SimplifyBindings(const Stmt& stmt, const Array<StmtSRef>& loops, + Map<Block, Block>* opaque_blocks) { + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map; + for (const StmtSRef& sref : loops) { + const auto* loop = sref->StmtAs<ForNode>(); + loop_map[loop->loop_var] = Range::FromMinExtent(loop->min, loop->extent); + } + BlockRealizeRewriter rewriter(loop_map, opaque_blocks); + return rewriter(stmt); +} + +class NotLoopError : public ScheduleError { + public: + explicit NotLoopError(IRModule mod, String type) : mod_(mod), type_(type) {} + + String FastErrorString() const final { + return "ScheduleError: this primitive only operates on a " + "loop"; + } + + String DetailRenderTemplate() const final { + return "this primitive only operates on a loop, but the StmtSref passed in points to" + "type: {0} "; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {type_}; } + + IRModule mod_; + String type_; +}; + +class HasAnnotationError : public ScheduleError { + public: + explicit HasAnnotationError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has annotation"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has annotation"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class HasThreadBindingError : public ScheduleError { + public: + explicit HasThreadBindingError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has thread binding"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has thread binding"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class OuterNotInnerParent : public ScheduleError { + public: + explicit OuterNotInnerParent(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the outer loop is not the parent of the inner loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the outer loop {0} is not the parent of the inner " + "loop {1}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class NotOnlyChildError : public ScheduleError { + public: + explicit NotOnlyChildError(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the inner loop is not the only child of outer loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the inner loop {1} is not the only child of outer " + "loop {0}."; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class LoopNotStartWithZeroError : public ScheduleError { + public: + explicit LoopNotStartWithZeroError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: the primitive only supports loop starting with 0"; + } + + String DetailRenderTemplate() const final { + return "The loop {0} does not start with 0, which is not supported"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class NotSingleInferFactorError : public ScheduleError { + public: + explicit NotSingleInferFactorError(IRModule mod) : mod_(mod) {} + + String FastErrorString() const final { + return "ScheduleError: only one factor can be specified as -1 or none"; + } + + String DetailRenderTemplate() const final { + return "Only one factor can be specified as -1 or none"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {}; } + + IRModule mod_; +}; + +class WrongFactorProductError : public ScheduleError { + public: + explicit WrongFactorProductError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The product of factors is not larger than or equal to the extent of " + "loop"; + } + + String DetailRenderTemplate() const final { + return "The product of factors is not larger than or equal to the extent of loop {0}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, + const Array<PrimExpr>& factors) { + // Invariance + // - The total repeat number has not changed for each direct child block with updating predicate. + // - The execution order has not changed. (The block executes with the same args and the same + // order with before. + // Step 1. Check correctness + GetScopeRootAndCheckStagePipeline(self, loop_sref); + const auto* loop = loop_sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, loop_sref->stmt->GetTypeKey()); + } + if (!loop->annotations.empty()) { + throw HasAnnotationError(self->mod, GetRef<For>(loop)); + } + if (loop->thread_binding.defined()) { + throw HasThreadBindingError(self->mod, GetRef<For>(loop)); + } + // Currently, loops starting with 0 is not supported + arith::Analyzer analyzer; + if (!analyzer.CanProve(loop->min == 0)) { + throw LoopNotStartWithZeroError(self->mod, GetRef<For>(loop)); + } + PrimExpr tot_length = 1; + int infer_index = -1; + for (size_t i = 0; i < factors.size(); i++) { + if (!analyzer.CanProve(factors[i] == -1)) { + tot_length *= factors[i]; + } else { + if (infer_index != -1) { + throw NotSingleInferFactorError(self->mod); + } else { + infer_index = i; + } + } + } + // Step 2. infer factors if needed + Array<PrimExpr> inferred_factors(factors); + if (infer_index != -1) { + inferred_factors.Set(infer_index, + analyzer.Simplify(floordiv(loop->extent + tot_length - 1, tot_length))); + } else { + if (!analyzer.CanProve(tot_length >= loop->extent)) { + throw WrongFactorProductError(self->mod, GetRef<For>(loop)); + } + } + // Step 3. Replace all occurrence of the original loop var with new variables + std::vector<Var> new_loop_vars; + new_loop_vars.reserve(inferred_factors.size()); + for (size_t i = 0; i < inferred_factors.size(); i++) { + new_loop_vars.push_back(loop->loop_var.copy_with_suffix("_" + std::to_string(i))); + } + PrimExpr substitute_value = 0; + for (size_t i = 0; i < inferred_factors.size(); i++) { + substitute_value *= inferred_factors[i]; + substitute_value += new_loop_vars[i]; + } + Map<Block, Block> opaque_block_reuse; + auto substitute_function = [&](const Var& v) -> Optional<PrimExpr> { + if (v.same_as(loop->loop_var)) { + return substitute_value; + } else { + return NullOpt; + } + }; + Stmt new_loop_body = + SubstituteAndCollectOpaqueBlock(loop->body, &opaque_block_reuse, substitute_function); + for (size_t i = 0; i < inferred_factors.size(); i++) { + analyzer.Bind(new_loop_vars[i], Range::FromMinExtent(0, inferred_factors[i])); + } + // Step 4. Update predicate to guard the loop + PrimExpr predicate = substitute_value < loop->extent; + new_loop_body = PredicateUpdater(predicate, &analyzer)(new_loop_body); + // Step 5. Generate tnested loops to replace the original loop and simplify the binding + Stmt outer_stmt = new_loop_body; + for (int i = inferred_factors.size() - 1; i >= 0; i--) { + outer_stmt = For(new_loop_vars[i], 0, inferred_factors[i], loop->kind, outer_stmt); + } + + outer_stmt = + Downcast<For>(SimplifyBindings(outer_stmt, GetLoops(loop_sref), &opaque_block_reuse)); + self->Replace(loop_sref, outer_stmt, opaque_block_reuse); + Array<StmtSRef> result_srefs; + result_srefs.reserve(inferred_factors.size()); + for (size_t i = 0; i < inferred_factors.size(); i++) { + result_srefs.push_back(self->stmt2ref.at(outer_stmt.get())); + const ForNode* outer_loop = outer_stmt.as<ForNode>(); + ICHECK(outer_loop); + outer_stmt = outer_loop->body; + } + return result_srefs; +} + +StmtSRef Fuse(ScheduleState self, const Array<StmtSRef>& loop_srefs) { + // Invariance + // - The total repeat number has not changed for each direct child block. + // - The execution order has not changed. (The block executes with the same + // args and the same order with before.) + std::vector<const ForNode*> loops; + loops.reserve(loop_srefs.size()); + StmtSRef outer_sref{nullptr}; + const ForNode* outer_loop = nullptr; + arith::Analyzer analyzer; + // Step 1. check correctness + GetScopeRootAndCheckStagePipeline(self, loop_srefs[0]); + for (const StmtSRef& sref : loop_srefs) { + const auto* loop = sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, sref->stmt->GetTypeKey()); + } + if (!loop->annotations.empty()) { + throw HasAnnotationError(self->mod, GetRef<For>(loop)); + } + if (loop->thread_binding.defined()) { + throw HasThreadBindingError(self->mod, GetRef<For>(loop)); + } + if (outer_sref.defined()) { + if (sref->parent != outer_sref.get()) { + throw OuterNotInnerParent(self->mod, GetRef<For>(outer_loop), GetRef<For>(loop)); + } + Array<Stmt> outer_children = GetChildren(GetRef<Stmt>(outer_loop)); + if (outer_children.size() != 1 || outer_children[0].get() != loop) { + throw NotOnlyChildError(self->mod, GetRef<For>(outer_loop), GetRef<For>(loop)); + } + } + outer_sref = sref; + outer_loop = loop; + if (!analyzer.CanProve(loop->min == 0)) { + throw LoopNotStartWithZeroError(self->mod, GetRef<For>(loop)); + } + loops.push_back(loop); + } + // Step 2. Create fused loop var and replace the original loop vars + std::string suffix; + for (size_t i = 1; i < loops.size(); i++) { + suffix += "_" + loops[i]->loop_var->name_hint; + } + suffix += "_fused"; + Var fused_var = loops[0]->loop_var.copy_with_suffix(suffix); + Array<PrimExpr> substitute_value; + substitute_value.resize(loops.size()); Review comment: ```suggestion substitute_value.reserve(loops.size()); ``` ########## File path: src/tir/schedule/primitive/fuse_split.cc ########## @@ -0,0 +1,483 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" +namespace tvm { +namespace tir { + +/*! \brief Append a new predicate to the each children of type BlockRealize (not recursively) */ +class PredicateUpdater : public StmtMutator { + public: + /*! + * \brief Constructor + * \param predicate The predicate to be apppend to BlockRealizeNode + */ + explicit PredicateUpdater(const PrimExpr& predicate, arith::Analyzer* ana) + : predicate_(predicate) { + if (!ana->CanProve(predicate)) { + add_predicate_ = true; + } + } + + private: + // For each direct child of type BlockRealizeNode, append the predicate + Stmt VisitStmt_(const BlockRealizeNode* realize) final { + // We do not recursively do this + if (add_predicate_) { + ObjectPtr<BlockRealizeNode> n = CopyOnWrite(realize); + n->predicate = n->predicate && predicate_; + return BlockRealize(n); + } else { + return GetRef<BlockRealize>(realize); + } + } + + /*! \brief The predicate to be added */ + const PrimExpr& predicate_; + /*! \brief whether to add predicate */ + bool add_predicate_; +}; +/*! \brief Substitute vars and collect the reuse mapping of opaque blocks */ +class IRSubstituteAndCollectOpaqueBlock : public StmtExprMutator { + public: + explicit IRSubstituteAndCollectOpaqueBlock(std::function<Optional<PrimExpr>(const Var&)> vmap, + Map<Block, Block>* opaque_blocks) + : vmap_(vmap), opaque_blocks_(opaque_blocks) {} + + private: + PrimExpr VisitExpr_(const VarNode* op) final { + Var var = GetRef<Var>(op); + Optional<PrimExpr> ret = vmap_(var); + if (ret.defined()) { + return ret.value(); + } else { + return std::move(var); + } + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + Stmt res = StmtMutator::VisitStmt_(op); + if (op->block->iter_vars.empty()) { + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + opaque_blocks_->Set(op->block, realize->block); + } + return res; + } + + /*! \brief The substitute function */ + std::function<Optional<PrimExpr>(const Var&)> vmap_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SubstituteAndCollectOpaqueBlock(Stmt stmt, Map<Block, Block>* opaque_blocks, + std::function<Optional<PrimExpr>(const Var&)> vmap) { + return IRSubstituteAndCollectOpaqueBlock(vmap, opaque_blocks)(std::move(stmt)); +} + +/*! \brief Simplify the binding of block realize and update the opaque block reuse mapping*/ +class BlockRealizeRewriter : public StmtExprMutator { + public: + explicit BlockRealizeRewriter( + const std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual>& loop_map, + Map<Block, Block>* opaque_blocks) + : opaque_blocks_(opaque_blocks) { + loop_map_.insert(loop_map.begin(), loop_map.end()); + } + + private: + Stmt VisitStmt_(const ForNode* op) final { + loop_map_[op->loop_var] = Range::FromMinExtent(op->min, op->extent); + Stmt res = StmtMutator::VisitStmt_(op); + loop_map_.erase(op->loop_var); + return res; + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + // skip opaque block and update mapping + if (op->iter_values.empty()) { + Stmt res = StmtMutator::VisitStmt_(op); + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + for (const std::pair<Block, Block>& entry : *opaque_blocks_) { + if (entry.second.same_as(op->block)) { + opaque_blocks_->Set(entry.first, realize->block); + break; + } + } + return res; + } + auto v = arith::IterMapSimplify(op->iter_values, loop_map_, op->predicate, false); + if (v.same_as(op->iter_values)) { + return GetRef<Stmt>(op); + } else { + auto n = CopyOnWrite(op); + n->iter_values = std::move(v); + return Stmt(n); + } + } + /*! \brief The range of loops */ + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SimplifyBindings(const Stmt& stmt, const Array<StmtSRef>& loops, + Map<Block, Block>* opaque_blocks) { + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map; + for (const StmtSRef& sref : loops) { + const auto* loop = sref->StmtAs<ForNode>(); + loop_map[loop->loop_var] = Range::FromMinExtent(loop->min, loop->extent); + } + BlockRealizeRewriter rewriter(loop_map, opaque_blocks); + return rewriter(stmt); +} + +class NotLoopError : public ScheduleError { + public: + explicit NotLoopError(IRModule mod, String type) : mod_(mod), type_(type) {} + + String FastErrorString() const final { + return "ScheduleError: this primitive only operates on a " + "loop"; + } + + String DetailRenderTemplate() const final { + return "this primitive only operates on a loop, but the StmtSref passed in points to" + "type: {0} "; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {type_}; } + + IRModule mod_; + String type_; +}; + +class HasAnnotationError : public ScheduleError { + public: + explicit HasAnnotationError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has annotation"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has annotation"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class HasThreadBindingError : public ScheduleError { + public: + explicit HasThreadBindingError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has thread binding"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has thread binding"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class OuterNotInnerParent : public ScheduleError { + public: + explicit OuterNotInnerParent(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the outer loop is not the parent of the inner loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the outer loop {0} is not the parent of the inner " + "loop {1}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class NotOnlyChildError : public ScheduleError { + public: + explicit NotOnlyChildError(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the inner loop is not the only child of outer loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the inner loop {1} is not the only child of outer " + "loop {0}."; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class LoopNotStartWithZeroError : public ScheduleError { + public: + explicit LoopNotStartWithZeroError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: the primitive only supports loop starting with 0"; + } + + String DetailRenderTemplate() const final { + return "The loop {0} does not start with 0, which is not supported"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class NotSingleInferFactorError : public ScheduleError { + public: + explicit NotSingleInferFactorError(IRModule mod) : mod_(mod) {} + + String FastErrorString() const final { + return "ScheduleError: only one factor can be specified as -1 or none"; + } + + String DetailRenderTemplate() const final { + return "Only one factor can be specified as -1 or none"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {}; } + + IRModule mod_; +}; + +class WrongFactorProductError : public ScheduleError { + public: + explicit WrongFactorProductError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The product of factors is not larger than or equal to the extent of " + "loop"; + } + + String DetailRenderTemplate() const final { + return "The product of factors is not larger than or equal to the extent of loop {0}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, + const Array<PrimExpr>& factors) { + // Invariance + // - The total repeat number has not changed for each direct child block with updating predicate. + // - The execution order has not changed. (The block executes with the same args and the same + // order with before. + // Step 1. Check correctness + GetScopeRootAndCheckStagePipeline(self, loop_sref); + const auto* loop = loop_sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, loop_sref->stmt->GetTypeKey()); + } + if (!loop->annotations.empty()) { + throw HasAnnotationError(self->mod, GetRef<For>(loop)); + } + if (loop->thread_binding.defined()) { + throw HasThreadBindingError(self->mod, GetRef<For>(loop)); + } + // Currently, loops starting with 0 is not supported + arith::Analyzer analyzer; + if (!analyzer.CanProve(loop->min == 0)) { + throw LoopNotStartWithZeroError(self->mod, GetRef<For>(loop)); + } + PrimExpr tot_length = 1; + int infer_index = -1; + for (size_t i = 0; i < factors.size(); i++) { + if (!analyzer.CanProve(factors[i] == -1)) { + tot_length *= factors[i]; + } else { + if (infer_index != -1) { + throw NotSingleInferFactorError(self->mod); + } else { + infer_index = i; + } + } + } + // Step 2. infer factors if needed + Array<PrimExpr> inferred_factors(factors); + if (infer_index != -1) { + inferred_factors.Set(infer_index, + analyzer.Simplify(floordiv(loop->extent + tot_length - 1, tot_length))); + } else { + if (!analyzer.CanProve(tot_length >= loop->extent)) { + throw WrongFactorProductError(self->mod, GetRef<For>(loop)); + } + } + // Step 3. Replace all occurrence of the original loop var with new variables + std::vector<Var> new_loop_vars; + new_loop_vars.reserve(inferred_factors.size()); + for (size_t i = 0; i < inferred_factors.size(); i++) { + new_loop_vars.push_back(loop->loop_var.copy_with_suffix("_" + std::to_string(i))); + } + PrimExpr substitute_value = 0; + for (size_t i = 0; i < inferred_factors.size(); i++) { + substitute_value *= inferred_factors[i]; + substitute_value += new_loop_vars[i]; + } + Map<Block, Block> opaque_block_reuse; + auto substitute_function = [&](const Var& v) -> Optional<PrimExpr> { + if (v.same_as(loop->loop_var)) { + return substitute_value; + } else { + return NullOpt; + } + }; + Stmt new_loop_body = + SubstituteAndCollectOpaqueBlock(loop->body, &opaque_block_reuse, substitute_function); + for (size_t i = 0; i < inferred_factors.size(); i++) { + analyzer.Bind(new_loop_vars[i], Range::FromMinExtent(0, inferred_factors[i])); + } + // Step 4. Update predicate to guard the loop + PrimExpr predicate = substitute_value < loop->extent; + new_loop_body = PredicateUpdater(predicate, &analyzer)(new_loop_body); + // Step 5. Generate tnested loops to replace the original loop and simplify the binding + Stmt outer_stmt = new_loop_body; + for (int i = inferred_factors.size() - 1; i >= 0; i--) { + outer_stmt = For(new_loop_vars[i], 0, inferred_factors[i], loop->kind, outer_stmt); + } + + outer_stmt = + Downcast<For>(SimplifyBindings(outer_stmt, GetLoops(loop_sref), &opaque_block_reuse)); + self->Replace(loop_sref, outer_stmt, opaque_block_reuse); + Array<StmtSRef> result_srefs; + result_srefs.reserve(inferred_factors.size()); + for (size_t i = 0; i < inferred_factors.size(); i++) { + result_srefs.push_back(self->stmt2ref.at(outer_stmt.get())); + const ForNode* outer_loop = outer_stmt.as<ForNode>(); + ICHECK(outer_loop); + outer_stmt = outer_loop->body; + } + return result_srefs; +} + +StmtSRef Fuse(ScheduleState self, const Array<StmtSRef>& loop_srefs) { + // Invariance + // - The total repeat number has not changed for each direct child block. + // - The execution order has not changed. (The block executes with the same + // args and the same order with before.) Review comment: ```suggestion // Invariance // - The total repeat number has not changed for each direct child block. // - The execution order has not changed. (The block executes with the same // args and the same order with before.) ``` ########## File path: src/tir/schedule/primitive/fuse_split.cc ########## @@ -0,0 +1,483 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include "../utils.h" +namespace tvm { +namespace tir { + +/*! \brief Append a new predicate to the each children of type BlockRealize (not recursively) */ +class PredicateUpdater : public StmtMutator { + public: + /*! + * \brief Constructor + * \param predicate The predicate to be apppend to BlockRealizeNode + */ + explicit PredicateUpdater(const PrimExpr& predicate, arith::Analyzer* ana) + : predicate_(predicate) { + if (!ana->CanProve(predicate)) { + add_predicate_ = true; + } + } + + private: + // For each direct child of type BlockRealizeNode, append the predicate + Stmt VisitStmt_(const BlockRealizeNode* realize) final { + // We do not recursively do this + if (add_predicate_) { + ObjectPtr<BlockRealizeNode> n = CopyOnWrite(realize); + n->predicate = n->predicate && predicate_; + return BlockRealize(n); + } else { + return GetRef<BlockRealize>(realize); + } + } + + /*! \brief The predicate to be added */ + const PrimExpr& predicate_; + /*! \brief whether to add predicate */ + bool add_predicate_; +}; +/*! \brief Substitute vars and collect the reuse mapping of opaque blocks */ +class IRSubstituteAndCollectOpaqueBlock : public StmtExprMutator { + public: + explicit IRSubstituteAndCollectOpaqueBlock(std::function<Optional<PrimExpr>(const Var&)> vmap, + Map<Block, Block>* opaque_blocks) + : vmap_(vmap), opaque_blocks_(opaque_blocks) {} + + private: + PrimExpr VisitExpr_(const VarNode* op) final { + Var var = GetRef<Var>(op); + Optional<PrimExpr> ret = vmap_(var); + if (ret.defined()) { + return ret.value(); + } else { + return std::move(var); + } + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + Stmt res = StmtMutator::VisitStmt_(op); + if (op->block->iter_vars.empty()) { + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + opaque_blocks_->Set(op->block, realize->block); + } + return res; + } + + /*! \brief The substitute function */ + std::function<Optional<PrimExpr>(const Var&)> vmap_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SubstituteAndCollectOpaqueBlock(Stmt stmt, Map<Block, Block>* opaque_blocks, + std::function<Optional<PrimExpr>(const Var&)> vmap) { + return IRSubstituteAndCollectOpaqueBlock(vmap, opaque_blocks)(std::move(stmt)); +} + +/*! \brief Simplify the binding of block realize and update the opaque block reuse mapping*/ +class BlockRealizeRewriter : public StmtExprMutator { + public: + explicit BlockRealizeRewriter( + const std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual>& loop_map, + Map<Block, Block>* opaque_blocks) + : opaque_blocks_(opaque_blocks) { + loop_map_.insert(loop_map.begin(), loop_map.end()); + } + + private: + Stmt VisitStmt_(const ForNode* op) final { + loop_map_[op->loop_var] = Range::FromMinExtent(op->min, op->extent); + Stmt res = StmtMutator::VisitStmt_(op); + loop_map_.erase(op->loop_var); + return res; + } + + Stmt VisitStmt_(const BlockRealizeNode* op) final { + // skip opaque block and update mapping + if (op->iter_values.empty()) { + Stmt res = StmtMutator::VisitStmt_(op); + const BlockRealizeNode* realize = res.as<BlockRealizeNode>(); + for (const std::pair<Block, Block>& entry : *opaque_blocks_) { + if (entry.second.same_as(op->block)) { + opaque_blocks_->Set(entry.first, realize->block); + break; + } + } + return res; + } + auto v = arith::IterMapSimplify(op->iter_values, loop_map_, op->predicate, false); + if (v.same_as(op->iter_values)) { + return GetRef<Stmt>(op); + } else { + auto n = CopyOnWrite(op); + n->iter_values = std::move(v); + return Stmt(n); + } + } + /*! \brief The range of loops */ + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map_; + /*! \brief The reuse mapping */ + Map<Block, Block>* opaque_blocks_; +}; + +Stmt SimplifyBindings(const Stmt& stmt, const Array<StmtSRef>& loops, + Map<Block, Block>* opaque_blocks) { + std::unordered_map<Var, Range, ObjectPtrHash, ObjectPtrEqual> loop_map; + for (const StmtSRef& sref : loops) { + const auto* loop = sref->StmtAs<ForNode>(); + loop_map[loop->loop_var] = Range::FromMinExtent(loop->min, loop->extent); + } + BlockRealizeRewriter rewriter(loop_map, opaque_blocks); + return rewriter(stmt); +} + +class NotLoopError : public ScheduleError { + public: + explicit NotLoopError(IRModule mod, String type) : mod_(mod), type_(type) {} + + String FastErrorString() const final { + return "ScheduleError: this primitive only operates on a " + "loop"; + } + + String DetailRenderTemplate() const final { + return "this primitive only operates on a loop, but the StmtSref passed in points to" + "type: {0} "; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {type_}; } + + IRModule mod_; + String type_; +}; + +class HasAnnotationError : public ScheduleError { + public: + explicit HasAnnotationError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has annotation"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has annotation"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class HasThreadBindingError : public ScheduleError { + public: + explicit HasThreadBindingError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The primitive can't be applied because the loop has thread binding"; + } + + String DetailRenderTemplate() const final { + return "The primitive can't be applied because the loop {0} has thread binding"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class OuterNotInnerParent : public ScheduleError { + public: + explicit OuterNotInnerParent(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the outer loop is not the parent of the inner loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the outer loop {0} is not the parent of the inner " + "loop {1}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class NotOnlyChildError : public ScheduleError { + public: + explicit NotOnlyChildError(IRModule mod, For outer, For inner) + : mod_(mod), outer_(outer), inner_(inner) {} + + String FastErrorString() const final { + return "ScheduleError: the inner loop is not the only child of outer loop"; + } + + String DetailRenderTemplate() const final { + return "The loops can't be fused because the inner loop {1} is not the only child of outer " + "loop {0}."; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {outer_, inner_}; } + + IRModule mod_; + For outer_; + For inner_; +}; + +class LoopNotStartWithZeroError : public ScheduleError { + public: + explicit LoopNotStartWithZeroError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: the primitive only supports loop starting with 0"; + } + + String DetailRenderTemplate() const final { + return "The loop {0} does not start with 0, which is not supported"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +class NotSingleInferFactorError : public ScheduleError { + public: + explicit NotSingleInferFactorError(IRModule mod) : mod_(mod) {} + + String FastErrorString() const final { + return "ScheduleError: only one factor can be specified as -1 or none"; + } + + String DetailRenderTemplate() const final { + return "Only one factor can be specified as -1 or none"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {}; } + + IRModule mod_; +}; + +class WrongFactorProductError : public ScheduleError { + public: + explicit WrongFactorProductError(IRModule mod, For loop) : mod_(mod), loop_(loop) {} + + String FastErrorString() const final { + return "ScheduleError: The product of factors is not larger than or equal to the extent of " + "loop"; + } + + String DetailRenderTemplate() const final { + return "The product of factors is not larger than or equal to the extent of loop {0}"; + } + + IRModule mod() const final { return mod_; } + Array<ObjectRef> LocationsOfInterest() const final { return {loop_}; } + + IRModule mod_; + For loop_; +}; + +Array<StmtSRef> Split(ScheduleState self, const StmtSRef& loop_sref, + const Array<PrimExpr>& factors) { + // Invariance + // - The total repeat number has not changed for each direct child block with updating predicate. + // - The execution order has not changed. (The block executes with the same args and the same + // order with before. + // Step 1. Check correctness + GetScopeRootAndCheckStagePipeline(self, loop_sref); + const auto* loop = loop_sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, loop_sref->stmt->GetTypeKey()); + } + if (!loop->annotations.empty()) { + throw HasAnnotationError(self->mod, GetRef<For>(loop)); + } + if (loop->thread_binding.defined()) { + throw HasThreadBindingError(self->mod, GetRef<For>(loop)); + } + // Currently, loops starting with 0 is not supported + arith::Analyzer analyzer; + if (!analyzer.CanProve(loop->min == 0)) { + throw LoopNotStartWithZeroError(self->mod, GetRef<For>(loop)); + } + PrimExpr tot_length = 1; + int infer_index = -1; + for (size_t i = 0; i < factors.size(); i++) { + if (!analyzer.CanProve(factors[i] == -1)) { + tot_length *= factors[i]; + } else { + if (infer_index != -1) { + throw NotSingleInferFactorError(self->mod); + } else { + infer_index = i; + } + } + } + // Step 2. infer factors if needed + Array<PrimExpr> inferred_factors(factors); + if (infer_index != -1) { + inferred_factors.Set(infer_index, + analyzer.Simplify(floordiv(loop->extent + tot_length - 1, tot_length))); + } else { + if (!analyzer.CanProve(tot_length >= loop->extent)) { + throw WrongFactorProductError(self->mod, GetRef<For>(loop)); + } + } + // Step 3. Replace all occurrence of the original loop var with new variables + std::vector<Var> new_loop_vars; + new_loop_vars.reserve(inferred_factors.size()); + for (size_t i = 0; i < inferred_factors.size(); i++) { + new_loop_vars.push_back(loop->loop_var.copy_with_suffix("_" + std::to_string(i))); + } + PrimExpr substitute_value = 0; + for (size_t i = 0; i < inferred_factors.size(); i++) { + substitute_value *= inferred_factors[i]; + substitute_value += new_loop_vars[i]; + } + Map<Block, Block> opaque_block_reuse; + auto substitute_function = [&](const Var& v) -> Optional<PrimExpr> { + if (v.same_as(loop->loop_var)) { + return substitute_value; + } else { + return NullOpt; + } + }; + Stmt new_loop_body = + SubstituteAndCollectOpaqueBlock(loop->body, &opaque_block_reuse, substitute_function); + for (size_t i = 0; i < inferred_factors.size(); i++) { + analyzer.Bind(new_loop_vars[i], Range::FromMinExtent(0, inferred_factors[i])); + } + // Step 4. Update predicate to guard the loop + PrimExpr predicate = substitute_value < loop->extent; + new_loop_body = PredicateUpdater(predicate, &analyzer)(new_loop_body); + // Step 5. Generate tnested loops to replace the original loop and simplify the binding + Stmt outer_stmt = new_loop_body; + for (int i = inferred_factors.size() - 1; i >= 0; i--) { + outer_stmt = For(new_loop_vars[i], 0, inferred_factors[i], loop->kind, outer_stmt); + } + + outer_stmt = + Downcast<For>(SimplifyBindings(outer_stmt, GetLoops(loop_sref), &opaque_block_reuse)); + self->Replace(loop_sref, outer_stmt, opaque_block_reuse); + Array<StmtSRef> result_srefs; + result_srefs.reserve(inferred_factors.size()); + for (size_t i = 0; i < inferred_factors.size(); i++) { + result_srefs.push_back(self->stmt2ref.at(outer_stmt.get())); + const ForNode* outer_loop = outer_stmt.as<ForNode>(); + ICHECK(outer_loop); + outer_stmt = outer_loop->body; + } + return result_srefs; +} + +StmtSRef Fuse(ScheduleState self, const Array<StmtSRef>& loop_srefs) { + // Invariance + // - The total repeat number has not changed for each direct child block. + // - The execution order has not changed. (The block executes with the same + // args and the same order with before.) + std::vector<const ForNode*> loops; + loops.reserve(loop_srefs.size()); + StmtSRef outer_sref{nullptr}; + const ForNode* outer_loop = nullptr; + arith::Analyzer analyzer; + // Step 1. check correctness + GetScopeRootAndCheckStagePipeline(self, loop_srefs[0]); + for (const StmtSRef& sref : loop_srefs) { + const auto* loop = sref->StmtAs<ForNode>(); + if (loop == nullptr) { + throw NotLoopError(self->mod, sref->stmt->GetTypeKey()); + } + if (!loop->annotations.empty()) { + throw HasAnnotationError(self->mod, GetRef<For>(loop)); + } + if (loop->thread_binding.defined()) { + throw HasThreadBindingError(self->mod, GetRef<For>(loop)); + } + if (outer_sref.defined()) { + if (sref->parent != outer_sref.get()) { + throw OuterNotInnerParent(self->mod, GetRef<For>(outer_loop), GetRef<For>(loop)); + } + Array<Stmt> outer_children = GetChildren(GetRef<Stmt>(outer_loop)); + if (outer_children.size() != 1 || outer_children[0].get() != loop) { + throw NotOnlyChildError(self->mod, GetRef<For>(outer_loop), GetRef<For>(loop)); + } + } + outer_sref = sref; + outer_loop = loop; + if (!analyzer.CanProve(loop->min == 0)) { + throw LoopNotStartWithZeroError(self->mod, GetRef<For>(loop)); + } + loops.push_back(loop); + } + // Step 2. Create fused loop var and replace the original loop vars + std::string suffix; + for (size_t i = 1; i < loops.size(); i++) { + suffix += "_" + loops[i]->loop_var->name_hint; + } + suffix += "_fused"; + Var fused_var = loops[0]->loop_var.copy_with_suffix(suffix); + Array<PrimExpr> substitute_value; + substitute_value.resize(loops.size()); + PrimExpr tot = fused_var; + for (int i = loops.size() - 1; i >= 0; i--) { + substitute_value.Set(i, floormod(tot, loops[i]->extent)); + tot = floordiv(tot, loops[i]->extent); + } + Stmt loop_body = loops.back()->body; + Map<Block, Block> opaque_block_reuse; + auto substitute_function = [&](const Var& v) -> Optional<PrimExpr> { Review comment: Ditto, `f_substitute`. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
