This is an automated email from the ASF dual-hosted git repository.
masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 6bad21e9fe [Texture] Add 2d memory support into static memory planner
(#11876)
6bad21e9fe is described below
commit 6bad21e9fe711e6994df238e8a3edc89073b894b
Author: Andrey Malyshev <[email protected]>
AuthorDate: Mon Jul 18 23:39:59 2022 +0300
[Texture] Add 2d memory support into static memory planner (#11876)
* [Texture] Add 2d memory support into static memory planner
Co-authored-by: Chris Sullivan <[email protected]>
* Add test verifying GraphPlanMemory work for 2d memory
Co-authored-by: Chris Sullivan <[email protected]>
---
python/tvm/relay/expr.py | 4 +
src/relay/backend/graph_plan_memory.cc | 351 ++++++++++++++++------
src/relay/backend/utils.cc | 8 +
tests/python/relay/test_backend_graph_executor.py | 95 ++++++
4 files changed, 363 insertions(+), 95 deletions(-)
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 811e205fb2..fefc285723 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -564,6 +564,10 @@ class StorageInfo(Node):
def storage_sizes(self):
return _ffi_api.StorageInfoStorageSizes(self)
+ @property
+ def virtual_devices(self):
+ return _ffi_api.StorageInfoVirtualDevices(self)
+
@tvm._ffi.register_object("relay.StaticMemoryPlan")
class StaticMemoryPlan(Node):
diff --git a/src/relay/backend/graph_plan_memory.cc
b/src/relay/backend/graph_plan_memory.cc
index 0019b22f1a..dab951b7e9 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -31,6 +31,7 @@
#include <tvm/runtime/container/array.h>
#include <tvm/tir/op.h>
+#include "../../runtime/texture.h"
#include "../../support/arena.h"
#include "../op/annotation/annotation.h"
#include "../op/call/call.h"
@@ -41,6 +42,10 @@
namespace tvm {
namespace relay {
+using TargetsMap = Map<Integer, Target>;
+using Texture2DShape = runtime::Texture2DShape<int64_t>;
+constexpr auto Is2DStorage = runtime::IsTextureStorage;
+
using backend::StaticMemoryPlan;
using backend::StorageInfo;
using IntegerArray = Array<Integer>;
@@ -151,12 +156,13 @@ class StorageAllocaBaseVisitor : public
transform::DeviceAwareExprVisitor {
*/
const std::vector<StorageToken*>& GetToken(const Expr& expr) {
this->VisitExpr(expr);
+ // See through on_device calls.
+ Expr real_expr = IgnoreOnDevice(expr);
+
// Functions don't require data storage, represented by the empty token
- if (expr->checked_type().as<FuncTypeNode>()) {
+ if (real_expr->checked_type().as<FuncTypeNode>()) {
return no_tokens_;
}
- // See through on_device calls.
- Expr real_expr = IgnoreOnDevice(expr);
this->VisitExpr(real_expr);
auto it = token_map_.find(real_expr.get());
ICHECK(it != token_map_.end()) << "Expression not found in storage map:"
<< std::endl
@@ -225,6 +231,7 @@ class StorageAllocaInit : protected
StorageAllocaBaseVisitor {
private:
// allocator
support::Arena* arena_;
+ Map<Expr, Array<String>> node_storage_map_;
};
/*! \brief Associate storage with every expression, reusing storage where
possible. */
@@ -272,7 +279,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
num_nodes++;
storage_ids.push_back(tok->storage_id);
virtual_devices.push_back(tok->virtual_device);
- sid_sizes_byte.push_back(GetMemorySize(tok));
+ sid_sizes_byte.push_back(allocator_.GetMemorySize(tok));
}
auto storage_info = backend::StorageInfo(std::move(storage_ids),
std::move(virtual_devices),
std::move(sid_sizes_byte));
@@ -301,10 +308,10 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
for (StorageToken* tok : it->second) {
ICHECK(tok->virtual_device == virtual_device);
if (can_realloc) {
- tokens.push_back(Request(tok));
+ tokens.push_back(allocator_.Request(tok));
} else {
// Allocate a new token,
- StorageToken* allocated_tok = Alloc(tok, GetMemorySize(tok));
+ StorageToken* allocated_tok = allocator_.Alloc(tok);
allocated_tok->virtual_device = tok->virtual_device;
// ensure it never get de-allocated.
allocated_tok->ref_counter += 1;
@@ -365,108 +372,260 @@ class StorageAllocator : public
StorageAllocaBaseVisitor {
// check if there is orphaned output that can be released immediately.
for (StorageToken* tok : token_map_.at(call_node)) {
- CheckForRelease(tok);
+ allocator_.CheckForRelease(tok);
}
for (StorageToken* tok : args) {
tok->ref_counter -= 1;
- CheckForRelease(tok);
+ allocator_.CheckForRelease(tok);
}
}
- /*!
- * \brief ceil(size/word_size) to get number of words.
- * \param size The original size.
- * \param word_size The element size.
- */
- static int64_t DivRoundUp(int64_t size, int64_t word_size) {
- return (size + word_size - 1) / word_size;
- }
- /*!
- * \brief Get the memory requirement.
- * \param prototype The prototype token.
- * \return The required memory size.
- *
- * TODO(mbs): Gf GetMemorySizeBytes in aot_executor_codegen.cc,
- * CalculateRelayExprSizeBytes in utils.cc
+ /**
+ * @brief Memory manager for flattened 1d memory (buffers)
*/
- static int64_t GetMemorySize(StorageToken* prototype) {
- TensorType ttype = prototype->ttype;
- ICHECK(ttype.defined());
- int64_t size = 1;
- for (IndexExpr dim : ttype->shape) {
- const int64_t* pval = tir::as_const_int(dim);
- ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape
" << ttype->shape;
- ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative
shape" << *pval;
- size *= pval[0];
+ class TokenAllocator1D {
+ public:
+ /*!
+ * \brief ceil(size/word_size) to get number of words.
+ * \param size The original size.
+ * \param word_size The element size.
+ */
+ static size_t DivRoundUp(size_t size, size_t word_size) {
+ return (size + word_size - 1) / word_size;
}
- size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
- return size;
- }
- /*!
- * \brief Request a storage token for a given prototype.
- * \param prototype. The prototype storage token.
- * \return The result token.
- */
- StorageToken* Request(StorageToken* prototype) {
- // calculate the size;
- size_t size = GetMemorySize(prototype);
- // search memory block in [size / match_range_, size * match_range_)
- if (match_range_ == 0) {
- return this->Alloc(prototype, size);
+
+ /*!
+ * \brief Get the memory requirement.
+ * \param prototype The prototype token.
+ * \return The required memory size.
+ *
+ * TODO(mbs): Cf GetMemorySizeBytes in aot_executor_codegen.cc,
+ * CalculateRelayExprSizeBytes in utils.cc
+ */
+ size_t GetMemorySize(StorageToken* prototype) {
+ TensorType ttype = prototype->ttype;
+ ICHECK(ttype.defined());
+ size_t size = 1;
+ for (IndexExpr dim : ttype->shape) {
+ const int64_t* pval = tir::as_const_int(dim);
+ ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor
shape " << ttype->shape;
+ ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with
negative shape" << *pval;
+ size *= static_cast<size_t>(pval[0]);
+ }
+ size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
+ return size;
}
- auto begin = free_.lower_bound(size / match_range_);
- auto mid = free_.lower_bound(size);
- auto end = free_.upper_bound(size * match_range_);
- // search for memory blocks larger than requested
- for (auto it = mid; it != end; ++it) {
- StorageToken* tok = it->second;
- if (!tok->is_compatible(*prototype)) continue;
- ICHECK_EQ(tok->ref_counter, 0);
- // Use exect matching strategy
- tok->max_bytes = std::max(size, tok->max_bytes);
- tok->ref_counter = prototype->ref_counter;
- // find a exact match, erase from map and return
- free_.erase(it);
- return tok;
+ /*!
+ * \brief Request a storage token for a given prototype.
+ * \param prototype. The prototype storage token.
+ * \return The result token.
+ */
+ StorageToken* Request(StorageToken* prototype) {
+ // calculate the size;
+ size_t size = GetMemorySize(prototype);
+ // search memory block in [size / match_range_, size * match_range_)
+ if (match_range_ == 0) {
+ return nullptr;
+ }
+ auto begin = free_.lower_bound(size / match_range_);
+ auto mid = free_.lower_bound(size);
+ auto end = free_.upper_bound(size * match_range_);
+ // search for memory blocks larger than requested
+ for (auto it = mid; it != end; ++it) {
+ StorageToken* tok = it->second;
+ if (!tok->is_compatible(*prototype)) continue;
+ ICHECK_EQ(tok->ref_counter, 0);
+ // Use exact matching strategy
+ tok->max_bytes = std::max(size, tok->max_bytes);
+ tok->ref_counter = prototype->ref_counter;
+ // found an exact match, erase from map and return
+ free_.erase(it);
+ return tok;
+ }
+ // then search for memory blocks smaller than requested space
+ for (auto it = mid; it != begin;) {
+ --it;
+ StorageToken* tok = it->second;
+ if (!tok->is_compatible(*prototype)) continue;
+ ICHECK_EQ(tok->ref_counter, 0);
+ // Use exact matching strategy
+ tok->max_bytes = std::max(size, tok->max_bytes);
+ tok->ref_counter = prototype->ref_counter;
+ // erase from map and return
+ free_.erase(it);
+ return tok;
+ }
+ return nullptr;
}
- // then search for memory blocks smaller than requested space
- for (auto it = mid; it != begin;) {
- --it;
- StorageToken* tok = it->second;
- if (!tok->is_compatible(*prototype)) continue;
- ICHECK_EQ(tok->ref_counter, 0);
- // Use exect matching strategy
- tok->max_bytes = std::max(size, tok->max_bytes);
- tok->ref_counter = prototype->ref_counter;
- // erase from map and return
- free_.erase(it);
- return tok;
+ /*!
+ * \brief Allocate a storage token by consuming prototype
+ * \param prototype The prototype token.
+ * \param size The size of memory being requested.
+ */
+ StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
+ size_t size = GetMemorySize(prototype);
+ prototype->max_bytes = size;
+ prototype->storage_id = storage_id;
+ data_.push_back(prototype);
+ return prototype;
}
- // cannot find anything return a new one.
- return this->Alloc(prototype, size);
- }
- /*!
- * \brief Allocate a storage token by consuming prototype
- * \param prototype The prototype token.
- * \param size The size of memory being requested.
- */
- StorageToken* Alloc(StorageToken* prototype, size_t size) {
- prototype->max_bytes = size;
- prototype->storage_id = static_cast<int64_t>(data_.size());
- data_.push_back(prototype);
- return prototype;
- }
- /*!
- * \brief Check if we can release token.
- * \param tok The token to be released.
+ /*!
+ * \brief Check if we can release token.
+ * \param tok The token to be released.
+ */
+ void CheckForRelease(StorageToken* tok) {
+ ICHECK_GE(tok->storage_id, 0);
+ ICHECK_GE(tok->ref_counter, 0);
+ if (tok->ref_counter == 0) {
+ free_.insert({tok->max_bytes, tok});
+ }
+ }
+
+ private:
+ // scale used for rough match
+ const size_t match_range_{16};
+ // free list of storage entry
+ std::multimap<size_t, StorageToken*> free_;
+ // all the storage resources available
+ std::vector<StorageToken*> data_;
+ };
+
+ /**
+ * @brief Memory manager for 2d memory (textures)
*/
- void CheckForRelease(StorageToken* tok) {
- ICHECK_GE(tok->storage_id, 0);
- ICHECK_GE(tok->ref_counter, 0);
- if (tok->ref_counter == 0) {
- free_.insert({tok->max_bytes, tok});
+ class TokenAllocator2D {
+ public:
+ /*!
+ * \brief Request a storage token for a given prototype.
+ * \param prototype. The prototype storage token.
+ * \return The result token.
+ */
+ StorageToken* Request(StorageToken* prototype) {
+ auto shape = GetSize2D(prototype);
+ int64_t requested_size = shape.height * shape.width;
+ int64_t min_added_size = std::numeric_limits<int64_t>::max();
+ int64_t min_wasted_size = std::numeric_limits<int64_t>::max();
+ int64_t best_storage_id = -1;
+ MemBlock best_mem, new_mem;
+ for (int64_t free_id : free_list_) {
+ MemBlock& cached = blocks_[free_id];
+ // Can only reuse texture 2d blocks of the same type
+ if (cached.token_->ttype->dtype != prototype->ttype->dtype) {
+ continue;
+ }
+ int64_t cached_size = cached.x_ * cached.y_;
+ new_mem.x_ = std::max(cached.x_, shape.width);
+ new_mem.y_ = std::max(cached.y_, shape.height);
+ int64_t expanded_size = new_mem.x_ * new_mem.y_;
+ int64_t added_size = expanded_size - cached_size;
+ int64_t wasted_size = expanded_size - requested_size;
+ // Prioritize minimization of added size first, then minimize
+ // wasted size among blocks which would not require expansion
+ if ((min_added_size > 0 && added_size < min_added_size) ||
+ (min_added_size == 0 && wasted_size < min_wasted_size)) {
+ min_added_size = added_size;
+ min_wasted_size = wasted_size;
+ best_storage_id = free_id;
+ best_mem = new_mem;
+ }
+ }
+
+ if (min_added_size <= requested_size) {
+ best_mem.token_ = blocks_[best_storage_id].token_;
+ // Reset the reference counter of the now live token
+ best_mem.token_->ref_counter = prototype->ref_counter;
+ blocks_[best_storage_id] = best_mem;
+ free_list_.erase(best_storage_id);
+ return best_mem.token_;
+ }
+ return nullptr;
}
- }
+ /*!
+ * \brief Allocate a storage token by consuming prototype
+ * \param prototype The prototype token.
+ * \param size The size of memory being requested.
+ */
+ StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
+ auto shape = GetSize2D(prototype);
+ MemBlock block;
+ block.x_ = shape.width;
+ block.y_ = shape.height;
+ prototype->storage_id = storage_id;
+ block.token_ = prototype;
+ blocks_[prototype->storage_id] = block;
+ return prototype;
+ }
+ /*!
+ * \brief Check if we can release token.
+ * \param tok The token to be released.
+ */
+ void CheckForRelease(StorageToken* tok) {
+ ICHECK_GE(tok->storage_id, 0);
+ ICHECK_GE(tok->ref_counter, 0);
+ if (tok->ref_counter == 0) {
+ free_list_.insert(tok->storage_id);
+ }
+ }
+ /*!
+ * \brief Get the texture 2d size requirement
+ * \param prototype The prototype token.
+ * \return The required texture 2d memory size in (width, height, channel).
+ */
+ Texture2DShape GetSize2D(StorageToken* prototype) {
+ TensorType ttype = prototype->ttype;
+ ICHECK(ttype.defined());
+ size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(),
+
prototype->virtual_device->memory_scope);
+ struct Shape {
+ const Array<PrimExpr>& shape;
+ int64_t operator[](size_t i) const { return
*tir::as_const_int(shape[i]); }
+ };
+ return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape},
ttype->shape.size(),
+ axis);
+ }
+
+ private:
+ struct MemBlock {
+ StorageToken* token_;
+ int64_t x_;
+ int64_t y_;
+ };
+
+ std::unordered_map<int64_t, MemBlock> blocks_;
+ std::unordered_set<int64_t> free_list_;
+ };
+
+ class TokenAllocator {
+ public:
+ StorageToken* Alloc(StorageToken* proto) {
+ return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++)
+ : token_1d_.Alloc(proto, storage_ids_++);
+ }
+ StorageToken* Request(StorageToken* proto) {
+ StorageToken* token =
+ Is2DStorage(proto) ? token_2d_.Request(proto) :
token_1d_.Request(proto);
+ return token ? token : this->Alloc(proto);
+ }
+ void CheckForRelease(StorageToken* tok) {
+ return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) :
token_1d_.CheckForRelease(tok);
+ }
+
+ size_t GetMemorySize(StorageToken* tok) {
+ // TODO(amalyshe): figure out who requires sizes and for what
+ // size in case of texture is not enough - we can return any value if it is
+ // assumed to be used for memory allocation, or we can return the real size
+ // if it is just for information
+ return Is2DStorage(tok) ? 0 : token_1d_.GetMemorySize(tok);
+ }
+ static bool Is2DStorage(StorageToken* tok) {
+ return relay::Is2DStorage(tok->virtual_device->memory_scope);
+ }
+
+ private:
+ int64_t storage_ids_{0};
+ TokenAllocator1D token_1d_;
+ TokenAllocator2D token_2d_;
+ };
private:
// allocator
@@ -479,6 +638,8 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
std::vector<StorageToken*> data_;
/*! \brief internal prototype token map */
std::unordered_map<const ExprNode*, std::vector<StorageToken*>> prototype_;
+ /*! \brief token allocator for optimizing 1d and 2d token alloc requests */
+ TokenAllocator allocator_;
};
StaticMemoryPlan GraphPlanMemory(const Function& func) { return
StorageAllocator().Plan(func); }
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc
index fe8127d60d..340986770e 100644
--- a/src/relay/backend/utils.cc
+++ b/src/relay/backend/utils.cc
@@ -114,6 +114,14 @@
TVM_REGISTER_GLOBAL("relay.ir.StorageInfoStorageSizes").set_body_typed([](Storag
return storage_sizes_in_bytes;
});
+TVM_REGISTER_GLOBAL("relay.ir.StorageInfoVirtualDevices").set_body_typed([](StorageInfo
si) {
+ Array<VirtualDevice> virtual_devices;
+ for (auto id : si->virtual_devices) {
+ virtual_devices.push_back(id);
+ }
+ return virtual_devices;
+});
+
TVM_REGISTER_NODE_TYPE(StaticMemoryPlanNode);
StaticMemoryPlan::StaticMemoryPlan(Map<Expr, StorageInfo>
expr_to_storage_info) {
diff --git a/tests/python/relay/test_backend_graph_executor.py
b/tests/python/relay/test_backend_graph_executor.py
index b797e4ce9d..0522c0db10 100644
--- a/tests/python/relay/test_backend_graph_executor.py
+++ b/tests/python/relay/test_backend_graph_executor.py
@@ -184,6 +184,101 @@ def test_plan_memory():
)
+def test_plan_2d_memory():
+ """Verify that GraphPlanMemory manages 2d memory referred to as
+ global.texture* memory scopes in json file."""
+ global_virtual_device = tvm.target.VirtualDevice(memory_scope="global")
+ texture_virtual_device =
tvm.target.VirtualDevice(memory_scope="global.texture")
+ metatable = {
+ "VirtualDevice": [
+ global_virtual_device,
+ texture_virtual_device,
+ ]
+ }
+
+ mod = tvm.parser.parse(
+ """
+ #[version = "0.0.5"]
+ def @main(%data1: Tensor[(1, 32, 40, 40), float32],
+ %data2: Tensor[(1, 32, 40, 40), float32]) {
+ %0 = fn (%a, Primitive=1) {
+ layout_transform(%a, src_layout="NCHW", dst_layout="NCHW4c")
+ };
+ %1 = %0(%data1);
+ %3 = %0(%data2);
+ %5 = fn (%a {virtual_device=meta[VirtualDevice][0]}, // global
+ %b {virtual_device=meta[VirtualDevice][0]}, // global
+ virtual_device=meta[VirtualDevice][1], // texture
+ Primitive=1) {
+ add(%a, %b)
+ };
+ %6 = %5(%1, %3);
+ %7 = fn (%a {virtual_device=meta[VirtualDevice][1]}, // texture
+ %b {virtual_device=meta[VirtualDevice][0]}, // global
+ virtual_device=meta[VirtualDevice][1], // texture
+ Primitive=1) {
+ add(%a, %b)
+ };
+ %8 = %7(%6, %3);
+ %9 = fn (%a {virtual_device=meta[VirtualDevice][1]}, // texture
+ %b {virtual_device=meta[VirtualDevice][1]}, // texture
+ virtual_device=meta[VirtualDevice][1], // texture
+ Primitive=1) {
+ add(%a, %b)
+ };
+ %10 = %9(%8, %6);
+ %11 = fn (%a,
+ virtual_device=meta[VirtualDevice][0], // global
+ Primitive=1) {
+ layout_transform(%a, src_layout="NCHW4c", dst_layout="NCHW")
+ };
+ %11(%10)
+ }
+ """,
+ "from_string",
+ None,
+ metatable,
+ )
+
+ GPU_DEVICE = tvm.device("cuda")
+ HOST_TARGET = tvm.target.Target("llvm")
+ GPU_TARGET = tvm.target.Target("cuda").with_host(HOST_TARGET)
+ GPU = tvm.target.VirtualDevice(GPU_DEVICE, GPU_TARGET) # device_type=2
+ CTXT = tvm.transform.PassContext(config={"relay.fallback_device_type":
GPU.device_type_int})
+ config = tvm.target.make_compilation_config(CTXT, GPU_TARGET)
+ mod = relay.transform.InferType()(mod)
+ # PlanDevices should succeed.
+ mod = relay.transform.PlanDevices(config)(mod)
+
+ func = mod["main"]
+ memory_plan = relay.backend._backend.GraphPlanMemory(func)
+ virtual_devices = {}
+
+ # We do not have execution-order information; the only ordering we can rely on
+ # in this place is storage_id
+ # for above graph we know that
+ # We have
+ # - 8 manageable storages for above graph
+ # - 5 of them are buffers
+ # - 3 of them are textures (2d storages)
+ # - 1 of the buffers will be reused; since we have storage-id mapped data, we
will have the 4th
+ # storage id reused and hidden in the virtual_devices map
+ # - no textures are reused so far
+ for k, v in memory_plan.expr_to_storage_info.items():
+ virtual_devices[v.storage_ids[0]] = v.virtual_devices[0].memory_scope
+
+ # Check the scopes according to the above expectations
+ assert (
+ virtual_devices[0] == "global"
+ and virtual_devices[1] == "global"
+ and virtual_devices[2] == "global"
+ and virtual_devices[3] == "global"
+ and virtual_devices[4] == "global.texture"
+ and virtual_devices[5] == "global.texture"
+ and virtual_devices[6] == "global.texture"
+ )
+
+
def test_reshape_nop():
# test that reshape can be turned into nop
x = relay.var("x", shape=(10, 4))