This is an automated email from the ASF dual-hosted git repository.
masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 6bad21e9fe [Texture] Add 2d memory support into static memory planner
(#11876)
6bad21e9fe is described below
commit 6bad21e9fe711e6994df238e8a3edc89073b894b
Author: Andrey Malyshev <[email protected]>
AuthorDate: Mon Jul 18 23:39:59 2022 +0300
[Texture] Add 2d memory support into static memory planner (#11876)
* [Texture] Add 2d memory support into static memory planner
Co-authored-by: Chris Sullivan <[email protected]>
* Add test verifying GraphPlanMemory work for 2d memory
Co-authored-by: Chris Sullivan <[email protected]>
---
python/tvm/relay/expr.py | 4 +
src/relay/backend/graph_plan_memory.cc | 351 ++++++++++++++++------
src/relay/backend/utils.cc | 8 +
tests/python/relay/test_backend_graph_executor.py | 95 ++++++
4 files changed, 363 insertions(+), 95 deletions(-)
diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py
index 811e205fb2..fefc285723 100644
--- a/python/tvm/relay/expr.py
+++ b/python/tvm/relay/expr.py
@@ -564,6 +564,10 @@ class StorageInfo(Node):
def storage_sizes(self):
return _ffi_api.StorageInfoStorageSizes(self)
+ @property
+ def virtual_devices(self):
+ return _ffi_api.StorageInfoVirtualDevices(self)
+
@tvm._ffi.register_object("relay.StaticMemoryPlan")
class StaticMemoryPlan(Node):
diff --git a/src/relay/backend/graph_plan_memory.cc
b/src/relay/backend/graph_plan_memory.cc
index 0019b22f1a..dab951b7e9 100644
--- a/src/relay/backend/graph_plan_memory.cc
+++ b/src/relay/backend/graph_plan_memory.cc
@@ -31,6 +31,7 @@
#include <tvm/runtime/container/array.h>
#include <tvm/tir/op.h>
+#include "../../runtime/texture.h"
#include "../../support/arena.h"
#include "../op/annotation/annotation.h"
#include "../op/call/call.h"
@@ -41,6 +42,10 @@
namespace tvm {
namespace relay {
+using TargetsMap = Map<Integer, Target>;
+using Texture2DShape = runtime::Texture2DShape<int64_t>;
+constexpr auto Is2DStorage = runtime::IsTextureStorage;
+
using backend::StaticMemoryPlan;
using backend::StorageInfo;
using IntegerArray = Array<Integer>;
@@ -151,12 +156,13 @@ class StorageAllocaBaseVisitor : public
transform::DeviceAwareExprVisitor {
*/
const std::vector<StorageToken*>& GetToken(const Expr& expr) {
this->VisitExpr(expr);
+ // See through on_device calls.
+ Expr real_expr = IgnoreOnDevice(expr);
+
// Functions don't require data storage, represented by the empty token
- if (expr->checked_type().as<FuncTypeNode>()) {
+ if (real_expr->checked_type().as<FuncTypeNode>()) {
return no_tokens_;
}
- // See through on_device calls.
- Expr real_expr = IgnoreOnDevice(expr);
this->VisitExpr(real_expr);
auto it = token_map_.find(real_expr.get());
ICHECK(it != token_map_.end()) << "Expression not found in storage map:"
<< std::endl
@@ -225,6 +231,7 @@ class StorageAllocaInit : protected
StorageAllocaBaseVisitor {
private:
// allocator
support::Arena* arena_;
+ Map<Expr, Array<String>> node_storage_map_;
};
/*! \brief Associate storage with every expression, reusing storage where
possible. */
@@ -272,7 +279,7 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
num_nodes++;
storage_ids.push_back(tok->storage_id);
virtual_devices.push_back(tok->virtual_device);
- sid_sizes_byte.push_back(GetMemorySize(tok));
+ sid_sizes_byte.push_back(allocator_.GetMemorySize(tok));
}
auto storage_info = backend::StorageInfo(std::move(storage_ids),
std::move(virtual_devices),
std::move(sid_sizes_byte));
@@ -301,10 +308,10 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
for (StorageToken* tok : it->second) {
ICHECK(tok->virtual_device == virtual_device);
if (can_realloc) {
- tokens.push_back(Request(tok));
+ tokens.push_back(allocator_.Request(tok));
} else {
// Allocate a new token,
- StorageToken* allocated_tok = Alloc(tok, GetMemorySize(tok));
+ StorageToken* allocated_tok = allocator_.Alloc(tok);
allocated_tok->virtual_device = tok->virtual_device;
// ensure it never get de-allocated.
allocated_tok->ref_counter += 1;
@@ -365,108 +372,260 @@ class StorageAllocator : public
StorageAllocaBaseVisitor {
// check if there is orphaned output that can be released immediately.
for (StorageToken* tok : token_map_.at(call_node)) {
- CheckForRelease(tok);
+ allocator_.CheckForRelease(tok);
}
for (StorageToken* tok : args) {
tok->ref_counter -= 1;
- CheckForRelease(tok);
+ allocator_.CheckForRelease(tok);
}
}
- /*!
- * \brief ceil(size/word_size) to get number of words.
- * \param size The original size.
- * \param word_size The element size.
- */
- static int64_t DivRoundUp(int64_t size, int64_t word_size) {
- return (size + word_size - 1) / word_size;
- }
- /*!
- * \brief Get the memory requirement.
- * \param prototype The prototype token.
- * \return The required memory size.
- *
- * TODO(mbs): Gf GetMemorySizeBytes in aot_executor_codegen.cc,
- * CalculateRelayExprSizeBytes in utils.cc
+ /**
+ * @brief Memory manager for flattened 1d memory (buffers)
*/
- static int64_t GetMemorySize(StorageToken* prototype) {
- TensorType ttype = prototype->ttype;
- ICHECK(ttype.defined());
- int64_t size = 1;
- for (IndexExpr dim : ttype->shape) {
- const int64_t* pval = tir::as_const_int(dim);
- ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor shape
" << ttype->shape;
- ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with negative
shape" << *pval;
- size *= pval[0];
+ class TokenAllocator1D {
+ public:
+ /*!
+ * \brief ceil(size/word_size) to get number of words.
+ * \param size The original size.
+ * \param word_size The element size.
+ */
+ static size_t DivRoundUp(size_t size, size_t word_size) {
+ return (size + word_size - 1) / word_size;
}
- size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
- return size;
- }
- /*!
- * \brief Request a storage token for a given prototype.
- * \param prototype. The prototype storage token.
- * \return The result token.
- */
- StorageToken* Request(StorageToken* prototype) {
- // calculate the size;
- size_t size = GetMemorySize(prototype);
- // search memory block in [size / match_range_, size * match_range_)
- if (match_range_ == 0) {
- return this->Alloc(prototype, size);
+
+ /*!
+ * \brief Get the memory requirement.
+ * \param prototype The prototype token.
+ * \return The required memory size.
+ *
+ * TODO(mbs): Cf GetMemorySizeBytes in aot_executor_codegen.cc,
+ * CalculateRelayExprSizeBytes in utils.cc
+ */
+ size_t GetMemorySize(StorageToken* prototype) {
+ TensorType ttype = prototype->ttype;
+ ICHECK(ttype.defined());
+ size_t size = 1;
+ for (IndexExpr dim : ttype->shape) {
+ const int64_t* pval = tir::as_const_int(dim);
+ ICHECK(pval != nullptr) << "Cannot allocate memory symbolic tensor
shape " << ttype->shape;
+ ICHECK_GE(*pval, 0) << "Cannot allocate memory for tensor with
negative shape" << *pval;
+ size *= static_cast<size_t>(pval[0]);
+ }
+ size *= DivRoundUp(ttype->dtype.bits() * ttype->dtype.lanes(), 8);
+ return size;
}
- auto begin = free_.lower_bound(size / match_range_);
- auto mid = free_.lower_bound(size);
- auto end = free_.upper_bound(size * match_range_);
- // search for memory blocks larger than requested
- for (auto it = mid; it != end; ++it) {
- StorageToken* tok = it->second;
- if (!tok->is_compatible(*prototype)) continue;
- ICHECK_EQ(tok->ref_counter, 0);
- // Use exect matching strategy
- tok->max_bytes = std::max(size, tok->max_bytes);
- tok->ref_counter = prototype->ref_counter;
- // find a exact match, erase from map and return
- free_.erase(it);
- return tok;
+ /*!
+ * \brief Request a storage token for a given prototype.
+ * \param prototype. The prototype storage token.
+ * \return The result token.
+ */
+ StorageToken* Request(StorageToken* prototype) {
+ // calculate the size;
+ size_t size = GetMemorySize(prototype);
+ // search memory block in [size / match_range_, size * match_range_)
+ if (match_range_ == 0) {
+ return nullptr;
+ }
+ auto begin = free_.lower_bound(size / match_range_);
+ auto mid = free_.lower_bound(size);
+ auto end = free_.upper_bound(size * match_range_);
+ // search for memory blocks larger than requested
+ for (auto it = mid; it != end; ++it) {
+ StorageToken* tok = it->second;
+ if (!tok->is_compatible(*prototype)) continue;
+ ICHECK_EQ(tok->ref_counter, 0);
+ // Use exact matching strategy
+ tok->max_bytes = std::max(size, tok->max_bytes);
+ tok->ref_counter = prototype->ref_counter;
+ // found an exact match, erase from map and return
+ free_.erase(it);
+ return tok;
+ }
+ // then search for memory blocks smaller than requested space
+ for (auto it = mid; it != begin;) {
+ --it;
+ StorageToken* tok = it->second;
+ if (!tok->is_compatible(*prototype)) continue;
+ ICHECK_EQ(tok->ref_counter, 0);
+ // Use exact matching strategy
+ tok->max_bytes = std::max(size, tok->max_bytes);
+ tok->ref_counter = prototype->ref_counter;
+ // erase from map and return
+ free_.erase(it);
+ return tok;
+ }
+ return nullptr;
}
- // then search for memory blocks smaller than requested space
- for (auto it = mid; it != begin;) {
- --it;
- StorageToken* tok = it->second;
- if (!tok->is_compatible(*prototype)) continue;
- ICHECK_EQ(tok->ref_counter, 0);
- // Use exect matching strategy
- tok->max_bytes = std::max(size, tok->max_bytes);
- tok->ref_counter = prototype->ref_counter;
- // erase from map and return
- free_.erase(it);
- return tok;
+ /*!
+ * \brief Allocate a storage token by consuming prototype
+ * \param prototype The prototype token.
+ * \param size The size of memory being requested.
+ */
+ StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
+ size_t size = GetMemorySize(prototype);
+ prototype->max_bytes = size;
+ prototype->storage_id = storage_id;
+ data_.push_back(prototype);
+ return prototype;
}
- // cannot find anything return a new one.
- return this->Alloc(prototype, size);
- }
- /*!
- * \brief Allocate a storage token by consuming prototype
- * \param prototype The prototype token.
- * \param size The size of memory being requested.
- */
- StorageToken* Alloc(StorageToken* prototype, size_t size) {
- prototype->max_bytes = size;
- prototype->storage_id = static_cast<int64_t>(data_.size());
- data_.push_back(prototype);
- return prototype;
- }
- /*!
- * \brief Check if we can release token.
- * \param tok The token to be released.
+ /*!
+ * \brief Check if we can release token.
+ * \param tok The token to be released.
+ */
+ void CheckForRelease(StorageToken* tok) {
+ ICHECK_GE(tok->storage_id, 0);
+ ICHECK_GE(tok->ref_counter, 0);
+ if (tok->ref_counter == 0) {
+ free_.insert({tok->max_bytes, tok});
+ }
+ }
+
+ private:
+ // scale used for rough match
+ const size_t match_range_{16};
+ // free list of storage entry
+ std::multimap<size_t, StorageToken*> free_;
+ // all the storage resources available
+ std::vector<StorageToken*> data_;
+ };
+
+ /**
+ * @brief Memory manager for 2d memory (textures)
*/
- void CheckForRelease(StorageToken* tok) {
- ICHECK_GE(tok->storage_id, 0);
- ICHECK_GE(tok->ref_counter, 0);
- if (tok->ref_counter == 0) {
- free_.insert({tok->max_bytes, tok});
+ class TokenAllocator2D {
+ public:
+ /*!
+ * \brief Request a storage token for a given prototype.
+ * \param prototype. The prototype storage token.
+ * \return The result token.
+ */
+ StorageToken* Request(StorageToken* prototype) {
+ auto shape = GetSize2D(prototype);
+ int64_t requested_size = shape.height * shape.width;
+ int64_t min_added_size = std::numeric_limits<int64_t>::max();
+ int64_t min_wasted_size = std::numeric_limits<int64_t>::max();
+ int64_t best_storage_id = -1;
+ MemBlock best_mem, new_mem;
+ for (int64_t free_id : free_list_) {
+ MemBlock& cached = blocks_[free_id];
+ // Can only reuse texture 2d blocks of the same type
+ if (cached.token_->ttype->dtype != prototype->ttype->dtype) {
+ continue;
+ }
+ int64_t cached_size = cached.x_ * cached.y_;
+ new_mem.x_ = std::max(cached.x_, shape.width);
+ new_mem.y_ = std::max(cached.y_, shape.height);
+ int64_t expanded_size = new_mem.x_ * new_mem.y_;
+ int64_t added_size = expanded_size - cached_size;
+ int64_t wasted_size = expanded_size - requested_size;
+ // Prioritize minimization of added size first, then minimize
+ // wasted size among blocks which would not require expansion
+ if ((min_added_size > 0 && added_size < min_added_size) ||
+ (min_added_size == 0 && wasted_size < min_wasted_size)) {
+ min_added_size = added_size;
+ min_wasted_size = wasted_size;
+ best_storage_id = free_id;
+ best_mem = new_mem;
+ }
+ }
+
+ if (min_added_size <= requested_size) {
+ best_mem.token_ = blocks_[best_storage_id].token_;
+ // Reset the reference counter of the now live token
+ best_mem.token_->ref_counter = prototype->ref_counter;
+ blocks_[best_storage_id] = best_mem;
+ free_list_.erase(best_storage_id);
+ return best_mem.token_;
+ }
+ return nullptr;
}
- }
+ /*!
+ * \brief Allocate a storage token by consuming prototype
+ * \param prototype The prototype token.
+ * \param size The size of memory being requested.
+ */
+ StorageToken* Alloc(StorageToken* prototype, int64_t storage_id) {
+ auto shape = GetSize2D(prototype);
+ MemBlock block;
+ block.x_ = shape.width;
+ block.y_ = shape.height;
+ prototype->storage_id = storage_id;
+ block.token_ = prototype;
+ blocks_[prototype->storage_id] = block;
+ return prototype;
+ }
+ /*!
+ * \brief Check if we can release token.
+ * \param tok The token to be released.
+ */
+ void CheckForRelease(StorageToken* tok) {
+ ICHECK_GE(tok->storage_id, 0);
+ ICHECK_GE(tok->ref_counter, 0);
+ if (tok->ref_counter == 0) {
+ free_list_.insert(tok->storage_id);
+ }
+ }
+ /*!
+ * \brief Get the texture 2d size requirement
+ * \param prototype The prototype token.
+ * \return The required texture 2d memory size in (width, height, channel).
+ */
+ Texture2DShape GetSize2D(StorageToken* prototype) {
+ TensorType ttype = prototype->ttype;
+ ICHECK(ttype.defined());
+ size_t axis = runtime::DefaultTextureLayoutSeparator(ttype->shape.size(),
+
prototype->virtual_device->memory_scope);
+ struct Shape {
+ const Array<PrimExpr>& shape;
+ int64_t operator[](size_t i) const { return
*tir::as_const_int(shape[i]); }
+ };
+ return runtime::ApplyTexture2DFlattening<int64_t>(Shape{ttype->shape},
ttype->shape.size(),
+ axis);
+ }
+
+ private:
+ struct MemBlock {
+ StorageToken* token_;
+ int64_t x_;
+ int64_t y_;
+ };
+
+ std::unordered_map<int64_t, MemBlock> blocks_;
+ std::unordered_set<int64_t> free_list_;
+ };
+
+ class TokenAllocator {
+ public:
+ StorageToken* Alloc(StorageToken* proto) {
+ return Is2DStorage(proto) ? token_2d_.Alloc(proto, storage_ids_++)
+ : token_1d_.Alloc(proto, storage_ids_++);
+ }
+ StorageToken* Request(StorageToken* proto) {
+ StorageToken* token =
+ Is2DStorage(proto) ? token_2d_.Request(proto) :
token_1d_.Request(proto);
+ return token ? token : this->Alloc(proto);
+ }
+ void CheckForRelease(StorageToken* tok) {
+ return Is2DStorage(tok) ? token_2d_.CheckForRelease(tok) :
token_1d_.CheckForRelease(tok);
+ }
+
+ size_t GetMemorySize(StorageToken* tok) {
+ // TODO(amalyshe): figure out who requires sizes and for what
+ // size in case of texture is not enough - we can return any value if it is
+ // assumed to be used for memory allocation, or we can return the real size
+ // if it is just for information
+ return Is2DStorage(tok) ? 0 : token_1d_.GetMemorySize(tok);
+ }
+ static bool Is2DStorage(StorageToken* tok) {
+ return relay::Is2DStorage(tok->virtual_device->memory_scope);
+ }
+
+ private:
+ int64_t storage_ids_{0};
+ TokenAllocator1D token_1d_;
+ TokenAllocator2D token_2d_;
+ };
private:
// allocator
@@ -479,6 +638,8 @@ class StorageAllocator : public StorageAllocaBaseVisitor {
std::vector<StorageToken*> data_;
/*! \brief internal prototype token map */
std::unordered_map<const ExprNode*, std::vector<StorageToken*>> prototype_;
+ /*! \brief token allocator for optimizing 1d and 2d token alloc requests */
+ TokenAllocator allocator_;
};
StaticMemoryPlan GraphPlanMemory(const Function& func) { return
StorageAllocator().Plan(func); }
diff --git a/src/relay/backend/utils.cc b/src/relay/backend/utils.cc
index fe8127d60d..340986770e 100644
--- a/src/relay/backend/utils.cc
+++ b/src/relay/backend/utils.cc
@@ -114,6 +114,14 @@
TVM_REGISTER_GLOBAL("relay.ir.StorageInfoStorageSizes").set_body_typed([](Storag
return storage_sizes_in_bytes;
});
+TVM_REGISTER_GLOBAL("relay.ir.StorageInfoVirtualDevices").set_body_typed([](StorageInfo
si) {
+ Array<VirtualDevice> virtual_devices;
+ for (auto id : si->virtual_devices) {
+ virtual_devices.push_back(id);
+ }
+ return virtual_devices;
+});
+
TVM_REGISTER_NODE_TYPE(StaticMemoryPlanNode);
StaticMemoryPlan::StaticMemoryPlan(Map<Expr, StorageInfo>
expr_to_storage_info) {
diff --git a/tests/python/relay/test_backend_graph_executor.py
b/tests/python/relay/test_backend_graph_executor.py
index b797e4ce9d..0522c0db10 100644
--- a/tests/python/relay/test_backend_graph_executor.py
+++ b/tests/python/relay/test_backend_graph_executor.py
@@ -184,6 +184,101 @@ def test_plan_memory():
)
+def test_plan_2d_memory():
+ """Verify that GraphPlanMemory manages 2d memory referred to as
+ global.texture* memory scopes in json file."""
+ global_virtual_device = tvm.target.VirtualDevice(memory_scope="global")
+ texture_virtual_device =
tvm.target.VirtualDevice(memory_scope="global.texture")
+ metatable = {
+ "VirtualDevice": [
+ global_virtual_device,
+ texture_virtual_device,
+ ]
+ }
+
+ mod = tvm.parser.parse(
+ """
+ #[version = "0.0.5"]
+ def @main(%data1: Tensor[(1, 32, 40, 40), float32],
+ %data2: Tensor[(1, 32, 40, 40), float32]) {
+ %0 = fn (%a, Primitive=1) {
+ layout_transform(%a, src_layout="NCHW", dst_layout="NCHW4c")
+ };
+ %1 = %0(%data1);
+ %3 = %0(%data2);
+ %5 = fn (%a {virtual_device=meta[VirtualDevice][0]}, // global
+ %b {virtual_device=meta[VirtualDevice][0]}, // global
+ virtual_device=meta[VirtualDevice][1], // texture
+ Primitive=1) {
+ add(%a, %b)
+ };
+ %6 = %5(%1, %3);
+ %7 = fn (%a {virtual_device=meta[VirtualDevice][1]}, // texture
+ %b {virtual_device=meta[VirtualDevice][0]}, // global
+ virtual_device=meta[VirtualDevice][1], // texture
+ Primitive=1) {
+ add(%a, %b)
+ };
+ %8 = %7(%6, %3);
+ %9 = fn (%a {virtual_device=meta[VirtualDevice][1]}, // texture
+ %b {virtual_device=meta[VirtualDevice][1]}, // texture
+ virtual_device=meta[VirtualDevice][1], // texture
+ Primitive=1) {
+ add(%a, %b)
+ };
+ %10 = %9(%8, %6);
+ %11 = fn (%a,
+ virtual_device=meta[VirtualDevice][0], // global
+ Primitive=1) {
+ layout_transform(%a, src_layout="NCHW4c", dst_layout="NCHW")
+ };
+ %11(%10)
+ }
+ """,
+ "from_string",
+ None,
+ metatable,
+ )
+
+ GPU_DEVICE = tvm.device("cuda")
+ HOST_TARGET = tvm.target.Target("llvm")
+ GPU_TARGET = tvm.target.Target("cuda").with_host(HOST_TARGET)
+ GPU = tvm.target.VirtualDevice(GPU_DEVICE, GPU_TARGET) # device_type=2
+ CTXT = tvm.transform.PassContext(config={"relay.fallback_device_type":
GPU.device_type_int})
+ config = tvm.target.make_compilation_config(CTXT, GPU_TARGET)
+ mod = relay.transform.InferType()(mod)
+ # PlanDevices should succeed.
+ mod = relay.transform.PlanDevices(config)(mod)
+
+ func = mod["main"]
+ memory_plan = relay.backend._backend.GraphPlanMemory(func)
+ virtual_devices = {}
+
+ # We do not have execution-order information; the only ordering we can rely on
+ # in this place is storage_id
+ # for above graph we know that
+ # We have
+ # - 8 manageable storages for above graph
+ # - 5 of them are buffers
+ # - 3 of them are textures (2d storages)
+ # - 1 of the buffers will be reused; since we have storage-id mapped data, we
will have the 4th
+ # storage id reused and hidden in the virtual_devices map
+ # - no textures are reused so far
+ for k, v in memory_plan.expr_to_storage_info.items():
+ virtual_devices[v.storage_ids[0]] = v.virtual_devices[0].memory_scope
+
+ # Check the scopes according to the above expectations
+ assert (
+ virtual_devices[0] == "global"
+ and virtual_devices[1] == "global"
+ and virtual_devices[2] == "global"
+ and virtual_devices[3] == "global"
+ and virtual_devices[4] == "global.texture"
+ and virtual_devices[5] == "global.texture"
+ and virtual_devices[6] == "global.texture"
+ )
+
+
def test_reshape_nop():
# test that reshape can be turned into nop
x = relay.var("x", shape=(10, 4))