[GitHub] [tvm] echuraev commented on a diff in pull request #15058: [OpenCL][Texture] Improved texture memory planning

via GitHub Tue, 13 Jun 2023 00:33:39 -0700


echuraev commented on code in PR #15058:
URL: https://github.com/apache/tvm/pull/15058#discussion_r1227581867



##########
src/runtime/graph_executor/graph_executor.cc:
##########
@@ -424,26 +423,36 @@ void GraphExecutor::SetupStorage() {
       }
       size_t bits = t.bits * t.lanes;
       ICHECK(bits % 8U == 0U || bits == 1U || bits == 4U);
-      int64_t bytes = ((bits + 7U) / 8U) * size;
-      pool_entry[sid].shape[0] = std::max(pool_entry[sid].shape[0], bytes);
-      pool_entry[sid].dtype = DLDataType{kDLFloat, 32, 1};
+      int64_t alloc_size = ((bits + 7U) / 8U) * size;
+
+      if (pool_entry[sid].alloc_size < alloc_size) {

Review Comment:
   Is it possible for now that fields `dtype`, `shape`, `alloc_size` and 
`scope` won't be initialized? In this case, will we have problems with access 
to these fields below in loop by `pool_entry`?
   Can we add such test for `SetupStorage`?



##########
src/runtime/cuda/cuda_device_api.cc:
##########
@@ -105,6 +105,8 @@ class CUDADeviceAPI final : public DeviceAPI {
       }
       case kDriverVersion:
         return;
+      default:

Review Comment:
   Probably it is better to handle `kImagePitchAlignment` explicitly?
   ```suggestion
         case kImagePitchAlignment:
   ```



##########
src/runtime/opencl/memory_pool.cc:
##########
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file memory_pool.h
+ * \brief OpenCL Workspace pool utility.
+ */
+#include "memory_pool.h"
+
+#include <memory>
+
+#include "opencl_common.h"
+
+#define ALIGN_UP(num, align) (((num) + ((align)-1)) & ~((align)-1))
+
+namespace tvm {
+namespace runtime {
+namespace cl {
+
+void* Pool::Alloc(Device dev, DeviceAPI* device, size_t nbytes) {
+  Entry e;
+  DLDataType type;
+  type.code = kDLUInt;
+  type.bits = 8;
+  type.lanes = 1;
+  if (free_list_.size() == 2) {

Review Comment:
   Why do you have such specific logic for `size = 2`?



##########
src/runtime/opencl/memory_pool.cc:
##########
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file memory_pool.h
+ * \brief OpenCL Workspace pool utility.
+ */
+#include "memory_pool.h"
+
+#include <memory>
+
+#include "opencl_common.h"
+
+#define ALIGN_UP(num, align) (((num) + ((align)-1)) & ~((align)-1))
+
+namespace tvm {
+namespace runtime {
+namespace cl {
+
+void* Pool::Alloc(Device dev, DeviceAPI* device, size_t nbytes) {
+  Entry e;
+  DLDataType type;
+  type.code = kDLUInt;
+  type.bits = 8;
+  type.lanes = 1;
+  if (free_list_.size() == 2) {
+    e = free_list_.back();
+    free_list_.pop_back();
+    if (e.size < nbytes) {
+      // resize the page
+      OpenCLWorkspace::Global()->FreeCLBuffer(dev, e.data);
+      e.data = OpenCLWorkspace::Global()->AllocCLBuffer(dev, nbytes, 
kTempAllocaAlignment, type);
+      e.size = nbytes;
+    }
+  } else if (free_list_.size() == 1) {
+    e.data = OpenCLWorkspace::Global()->AllocCLBuffer(dev, nbytes, 
kTempAllocaAlignment, type);
+    e.size = nbytes;
+  } else {
+    if (free_list_.back().size >= nbytes) {
+      // find smallest fit
+      auto it = free_list_.end() - 2;
+      for (; it->size >= nbytes; --it) {
+      }
+      e = *(it + 1);
+      free_list_.erase(it + 1);
+    } else {
+      // resize the page
+      e = free_list_.back();
+      free_list_.pop_back();
+      OpenCLWorkspace::Global()->FreeCLBuffer(dev, e.data);
+      e.data = OpenCLWorkspace::Global()->AllocCLBuffer(dev, nbytes, 
kTempAllocaAlignment, type);
+      e.size = nbytes;
+    }
+  }
+  e.dev = dev;
+  allocated_.push_back(e);
+  return e.data;
+}
+
+void Pool::Free(void* data) {
+  Entry e;
+  if (allocated_.back().data == data) {
+    // quick path, last allocated.
+    e = allocated_.back();
+    allocated_.pop_back();
+  } else {
+    int index = static_cast<int>(allocated_.size()) - 2;
+    for (; index > 0 && allocated_[index].data != data; --index) {
+    }
+    ICHECK_GT(index, 0) << "trying to free things that has not been allocated";
+    e = allocated_[index];
+    allocated_.erase(allocated_.begin() + index);
+  }
+  if (free_list_.back().size < e.size) {
+    free_list_.push_back(e);
+  } else if (free_list_.size() == 2) {
+    free_list_.push_back(free_list_.back());
+    free_list_[1] = e;
+  } else {
+    size_t i = free_list_.size() - 1;
+    free_list_.resize(free_list_.size() + 1);
+    for (; e.size < free_list_[i].size; --i) {
+      free_list_[i + 1] = free_list_[i];
+    }
+    free_list_[i + 1] = e;
+  }
+}
+
+void Pool::Release(Device dev, DeviceAPI* device) {
+  for (size_t i = 1; i < free_list_.size(); ++i) {
+    OpenCLWorkspace::Global()->FreeCLBuffer(dev, free_list_[i].data);
+  }
+  free_list_.clear();
+}
+
+MemoryPool::MemoryPool(DLDeviceType device_type, DeviceAPI* device)
+    : device_type_(device_type), device_(device) {}
+
+MemoryPool::~MemoryPool() {
+  for (size_t i = 0; i < array_.size(); ++i) {
+    if (array_[i] != nullptr) {
+      Device dev;
+      dev.device_type = device_type_;
+      dev.device_id = static_cast<int>(i);
+      array_[i]->Release(dev, device_);
+      delete array_[i];
+    }
+  }
+}
+
+void MemoryPool::InitDevicePool(int device_id) {
+  if (static_cast<size_t>(device_id) >= array_.size()) {
+    array_.resize(device_id + 1, nullptr);
+  }
+  if (array_[device_id] == nullptr) {
+    array_[device_id] = new Pool();
+  }
+}
+
+void* MemoryPool::AllocMemory(Device dev, size_t size, DLDataType type_hint) {
+  InitDevicePool(dev.device_id);
+  return array_[dev.device_id]->Alloc(dev, device_, size);
+}
+
+size_t GetRowPitch(Device dev, size_t width, DLDataType type_hint) {
+  cl_uint row_align = 
OpenCLWorkspace::Global()->GetImageAlignment(dev.device_id);
+  size_t pixel_size = (type_hint.bits * type_hint.lanes + 7) / 8;
+  return ALIGN_UP(width * pixel_size * 4, row_align);  // CL_RGBA = 4
+}
+
+void* MemoryPool::AllocMemory(Device dev, size_t width, size_t height, 
DLDataType type_hint,
+                              Optional<String> mem_scope) {
+  InitDevicePool(dev.device_id);
+
+  // Texture allocation given width and height
+  size_t row_pitch = GetRowPitch(dev, width, type_hint);
+  size_t mem_size = row_pitch * height;
+
+  // Alloc back buffer from pool
+  void* back_buffer = nullptr;
+  if (OpenCLWorkspace::Global()->IsBufferToImageSupported(dev.device_id)) {
+    back_buffer = array_[dev.device_id]->Alloc(dev, device_, mem_size);
+  }
+
+  if (!mem_scope.defined()) {
+    mem_scope = Optional<String>("global.texture");
+  }
+  return OpenCLWorkspace::Global()->AllocCLImage(dev, back_buffer, width, 
height, row_pitch,

Review Comment:
   Is it ok if `back_buffer` is `nullptr`?



##########
src/runtime/metal/metal_device_api.mm:
##########
@@ -81,6 +81,8 @@
         return;
       case kDriverVersion:
         return;
+      default:

Review Comment:
   Probably it is better to handle `kImagePitchAlignment` explicitly?



##########
src/runtime/opencl/memory_pool.cc:
##########
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file memory_pool.h
+ * \brief OpenCL Workspace pool utility.
+ */
+#include "memory_pool.h"
+
+#include <memory>
+
+#include "opencl_common.h"
+
+#define ALIGN_UP(num, align) (((num) + ((align)-1)) & ~((align)-1))
+
+namespace tvm {
+namespace runtime {
+namespace cl {
+
+void* Pool::Alloc(Device dev, DeviceAPI* device, size_t nbytes) {

Review Comment:
   In previous allocation algorithm it was a logic which prevent packing too 
small texture to bigger texture, because it leads to significant performance 
degradation. Don't you think that the same problems can be appeared in this 
algorithm?



##########
tests/cpp-runtime/opencl/opencl_texture_pool_test.cc:
##########
@@ -26,126 +26,271 @@
 using namespace tvm::runtime;
 using namespace tvm::runtime::cl;
 
-// PoolWrapper is necessary because in class Pool2D we don't have an access to
-// its protected members. In this class we add new methods which allow us to
-// get and check internal state of class Pool
-class PoolWrapper : public Pool2D {
+// MemoryPoolWrapper and PoolWrapper is necessary because in classes MemoryPoo 
and Pool
+// we don't have an access to its protected members.
+// Using the wrapper we build MemoryPool with external Pool object and access
+// its protected members.
+
+class PoolWrapper : public Pool {
+ public:
+  inline size_t FreeListSize() const { return free_list_.size() - 1; }
+  inline size_t AllocatedListSize() const { return allocated_.size() - 1; }
+};
+
+class MemoryPoolWrapper : public MemoryPool {
  public:
-  inline size_t FreeListSize() const { return free_list_.size(); }
-  inline size_t AllocatedListSize() const { return allocated_.size(); }
-  inline std::pair<size_t, size_t> FreeListItemSize(size_t idx) const {
-    return std::make_pair(free_list_[idx].x, free_list_[idx].y);
+  MemoryPoolWrapper(tvm::Device& dev, DeviceAPI* device) : 
MemoryPool(dev.device_type, device) {
+    auto device_id = dev.device_id;
+    if (static_cast<size_t>(device_id) >= array_.size()) {
+      array_.resize(device_id + 1, nullptr);
+      pool_array_.resize(device_id + 1, nullptr);
+    }
+    if (array_[device_id] == nullptr) {
+      pool_array_[device_id] = new PoolWrapper();
+      array_[device_id] = pool_array_[device_id];
+    }
   }
-  inline std::pair<size_t, size_t> AllocatedListItemSize(size_t idx) const {
-    return std::make_pair(allocated_[idx].x, allocated_[idx].y);
+  inline size_t FreeListSize(int device_id) const { return 
pool_array_[device_id]->FreeListSize(); }
+  inline size_t AllocatedListSize(int device_id) const {
+    return pool_array_[device_id]->AllocatedListSize();
   }
+
+ private:
+  std::vector<PoolWrapper*> pool_array_;
 };
 
-TEST(OpenCLTexturePool, textures_reallocation_optimal_size) {
+void FreeDataSpace(OpenCLWorkspace* workspace, void* ptr, tvm::Device& dev,
+                   MemoryPoolWrapper& pool) {
+  cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
+  if (desc->host_ptr) {
+    clEnqueueUnmapMemObject(workspace->GetQueue(dev), desc->buffer,
+                            reinterpret_cast<void*>(desc->host_ptr), 0, 
nullptr, nullptr);
+  }
+  if (desc->back_buffer) {
+    OPENCL_CALL(clReleaseMemObject(desc->buffer));
+    pool.FreeMemory(dev, desc->back_buffer);
+    delete desc;
+  } else {
+    pool.FreeMemory(dev, desc);
+  }
+}
+
+TEST(OpenCLTexturePool, buffer_allocation_and_free) {
   OpenCLWorkspace* workspace = OpenCLWorkspace::Global();
   OpenCLThreadEntry* t = workspace->GetThreadEntry();
-  PoolWrapper pool;
-  EXPECT_EQ(pool.AllocatedListSize(), 0);
-  EXPECT_EQ(pool.FreeListSize(), 0);
-
+  MemoryPoolWrapper pool(t->device, workspace);
   DLDataType type{kDLFloat, 16, 1};
-  void* data1 = pool.Alloc(t->device, workspace, 1024, 768, type);
-  EXPECT_EQ(pool.AllocatedListSize(), 1);
-  EXPECT_EQ(pool.FreeListSize(), 0);
-  auto item = pool.AllocatedListItemSize(0);
-  EXPECT_EQ(item.first, 1024);
-  EXPECT_EQ(item.second, 768);
-
-  pool.Alloc(t->device, workspace, 64, 12455, type);
-  EXPECT_EQ(pool.AllocatedListSize(), 2);
-  EXPECT_EQ(pool.FreeListSize(), 0);
-  item = pool.AllocatedListItemSize(1);
-  EXPECT_EQ(item.first, 64);
-  EXPECT_EQ(item.second, 12455);
-
-  pool.Free(data1);
-  EXPECT_EQ(pool.AllocatedListSize(), 1);
-  EXPECT_EQ(pool.FreeListSize(), 1);
-  item = pool.AllocatedListItemSize(0);
-  EXPECT_EQ(item.first, 64);
-  EXPECT_EQ(item.second, 12455);
-  item = pool.FreeListItemSize(0);
-  EXPECT_EQ(item.first, 1024);
-  EXPECT_EQ(item.second, 768);
-
-  pool.Alloc(t->device, workspace, 768, 1024, type);
-  EXPECT_EQ(pool.AllocatedListSize(), 2);
-  EXPECT_EQ(pool.FreeListSize(), 0);
-  item = pool.AllocatedListItemSize(0);
-  EXPECT_EQ(item.first, 64);
-  EXPECT_EQ(item.second, 12455);
-  item = pool.AllocatedListItemSize(1);
-  EXPECT_EQ(item.first, 1024);
-  EXPECT_EQ(item.second, 1024);
+
+  // Allocate two buffers and check list sizes at each stage
+  auto did = t->device.device_id;
+  void* buf_data1 = pool.AllocMemory(t->device, 1024, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+  void* buf_data2 = pool.AllocMemory(t->device, 2048, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 2);
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+  FreeDataSpace(workspace, buf_data1, t->device, pool);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);
+  EXPECT_EQ(pool.FreeListSize(did), 1);
+  FreeDataSpace(workspace, buf_data2, t->device, pool);
+  EXPECT_EQ(pool.AllocatedListSize(did), 0);
+  EXPECT_EQ(pool.FreeListSize(did), 2);
 }
 
-TEST(OpenCLTexturePool, avoid_reusing_too_big_textures) {
+TEST(OpenCLTexturePool, buffer_allocations_and_reuse) {
   OpenCLWorkspace* workspace = OpenCLWorkspace::Global();
   OpenCLThreadEntry* t = workspace->GetThreadEntry();
-  PoolWrapper pool;
-  EXPECT_EQ(pool.AllocatedListSize(), 0);
-  EXPECT_EQ(pool.FreeListSize(), 0);
+  MemoryPoolWrapper pool(t->device, workspace);
+  DLDataType type{kDLFloat, 16, 1};
+
+  // Request buffer of 1024
+  auto did = t->device.device_id;
+  void* buf_data1 = pool.AllocMemory(t->device, 1024, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+  // Request buffer of 2048
+  void* buf_data2 = pool.AllocMemory(t->device, 2048, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 2);
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+  // Free buffer 1 (1024)
+  FreeDataSpace(workspace, buf_data1, t->device, pool);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);
+  EXPECT_EQ(pool.FreeListSize(did), 1);
 
+  // Request buffer of 768
+  void* buf_data3 = pool.AllocMemory(t->device, 768, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 2);  // Should have reused the 1024 
slot
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+
+  // Free buffer 2 (2048)
+  FreeDataSpace(workspace, buf_data2, t->device, pool);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);

Review Comment:
   I would suggest checking the size of the allocated buffers.



##########
tests/cpp-runtime/opencl/texture_copy_test.cc:
##########
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <dmlc/logging.h>
+#include <gtest/gtest.h>
+#include <tvm/runtime/registry.h>
+
+#include <cmath>
+#include <random>
+
+#include "../src/runtime/opencl/opencl_common.h"
+
+TEST(TextureCopy, HostDeviceRT) {
+  using namespace tvm;
+  bool enabled = tvm::runtime::RuntimeEnabled("opencl");
+  if (!enabled) {
+    LOG(INFO) << "Skip texture copy test because opencl runtime is 
disabled.\n";
+    return;
+  }
+
+  std::vector<int64_t> shape{16, 16, 4};
+  auto cpu_arr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 
0});
+  auto cpu_arr1 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 
0});
+  String mem_scope = "global.texture";
+  auto opencl_txarr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, 
{kDLOpenCL, 0}, mem_scope);
+
+  size_t size = 1;
+  for (size_t i = 0; i < shape.size(); ++i) {
+    size *= static_cast<size_t>(shape[i]);
+  }
+
+  std::random_device dev;
+  std::mt19937 mt(dev());
+  std::uniform_real_distribution<> random(-10.0, 10.0);
+
+  // Random initialize host ndarray
+  for (size_t i = 0; i < size; i++) {
+    static_cast<float*>(cpu_arr0->data)[i] = random(mt);
+  }
+
+  // Do a roundtrip from host storage to opencl texture storage and back
+  cpu_arr0.CopyTo(opencl_txarr0);
+  opencl_txarr0.CopyTo(cpu_arr1);
+  for (size_t i = 0; i < size; ++i) {
+    ICHECK_LT(
+        std::fabs(static_cast<float*>(cpu_arr1->data)[i] - 
static_cast<float*>(cpu_arr0->data)[i]),
+        1e-5);
+  }
+}
+
+TEST(TextureCopy, ViewBufferAsBuffer) {
+  using namespace tvm;
+  bool enabled = tvm::runtime::RuntimeEnabled("opencl");
+  if (!enabled) {
+    LOG(INFO) << "Skip texture copy test because opencl runtime is 
disabled.\n";
+    return;
+  }

Review Comment:
   You can unify these tests by using test class and changing `TEST` to 
`TEST_F`. In this case you can do this check it `SetUp` method.



##########
tests/cpp-runtime/opencl/opencl_texture_pool_test.cc:
##########
@@ -26,126 +26,271 @@
 using namespace tvm::runtime;
 using namespace tvm::runtime::cl;
 
-// PoolWrapper is necessary because in class Pool2D we don't have an access to
-// its protected members. In this class we add new methods which allow us to
-// get and check internal state of class Pool
-class PoolWrapper : public Pool2D {
+// MemoryPoolWrapper and PoolWrapper is necessary because in classes MemoryPoo 
and Pool
+// we don't have an access to its protected members.
+// Using the wrapper we build MemoryPool with external Pool object and access
+// its protected members.
+
+class PoolWrapper : public Pool {
+ public:
+  inline size_t FreeListSize() const { return free_list_.size() - 1; }
+  inline size_t AllocatedListSize() const { return allocated_.size() - 1; }
+};
+
+class MemoryPoolWrapper : public MemoryPool {
  public:
-  inline size_t FreeListSize() const { return free_list_.size(); }
-  inline size_t AllocatedListSize() const { return allocated_.size(); }
-  inline std::pair<size_t, size_t> FreeListItemSize(size_t idx) const {
-    return std::make_pair(free_list_[idx].x, free_list_[idx].y);
+  MemoryPoolWrapper(tvm::Device& dev, DeviceAPI* device) : 
MemoryPool(dev.device_type, device) {
+    auto device_id = dev.device_id;
+    if (static_cast<size_t>(device_id) >= array_.size()) {
+      array_.resize(device_id + 1, nullptr);
+      pool_array_.resize(device_id + 1, nullptr);
+    }
+    if (array_[device_id] == nullptr) {
+      pool_array_[device_id] = new PoolWrapper();
+      array_[device_id] = pool_array_[device_id];
+    }
   }
-  inline std::pair<size_t, size_t> AllocatedListItemSize(size_t idx) const {
-    return std::make_pair(allocated_[idx].x, allocated_[idx].y);
+  inline size_t FreeListSize(int device_id) const { return 
pool_array_[device_id]->FreeListSize(); }
+  inline size_t AllocatedListSize(int device_id) const {
+    return pool_array_[device_id]->AllocatedListSize();
   }
+
+ private:
+  std::vector<PoolWrapper*> pool_array_;
 };
 
-TEST(OpenCLTexturePool, textures_reallocation_optimal_size) {
+void FreeDataSpace(OpenCLWorkspace* workspace, void* ptr, tvm::Device& dev,
+                   MemoryPoolWrapper& pool) {
+  cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
+  if (desc->host_ptr) {
+    clEnqueueUnmapMemObject(workspace->GetQueue(dev), desc->buffer,
+                            reinterpret_cast<void*>(desc->host_ptr), 0, 
nullptr, nullptr);
+  }
+  if (desc->back_buffer) {
+    OPENCL_CALL(clReleaseMemObject(desc->buffer));
+    pool.FreeMemory(dev, desc->back_buffer);
+    delete desc;
+  } else {
+    pool.FreeMemory(dev, desc);
+  }
+}
+
+TEST(OpenCLTexturePool, buffer_allocation_and_free) {
   OpenCLWorkspace* workspace = OpenCLWorkspace::Global();
   OpenCLThreadEntry* t = workspace->GetThreadEntry();
-  PoolWrapper pool;
-  EXPECT_EQ(pool.AllocatedListSize(), 0);
-  EXPECT_EQ(pool.FreeListSize(), 0);
-
+  MemoryPoolWrapper pool(t->device, workspace);
   DLDataType type{kDLFloat, 16, 1};
-  void* data1 = pool.Alloc(t->device, workspace, 1024, 768, type);
-  EXPECT_EQ(pool.AllocatedListSize(), 1);
-  EXPECT_EQ(pool.FreeListSize(), 0);
-  auto item = pool.AllocatedListItemSize(0);
-  EXPECT_EQ(item.first, 1024);
-  EXPECT_EQ(item.second, 768);
-
-  pool.Alloc(t->device, workspace, 64, 12455, type);
-  EXPECT_EQ(pool.AllocatedListSize(), 2);
-  EXPECT_EQ(pool.FreeListSize(), 0);
-  item = pool.AllocatedListItemSize(1);
-  EXPECT_EQ(item.first, 64);
-  EXPECT_EQ(item.second, 12455);
-
-  pool.Free(data1);
-  EXPECT_EQ(pool.AllocatedListSize(), 1);
-  EXPECT_EQ(pool.FreeListSize(), 1);
-  item = pool.AllocatedListItemSize(0);
-  EXPECT_EQ(item.first, 64);
-  EXPECT_EQ(item.second, 12455);
-  item = pool.FreeListItemSize(0);
-  EXPECT_EQ(item.first, 1024);
-  EXPECT_EQ(item.second, 768);
-
-  pool.Alloc(t->device, workspace, 768, 1024, type);
-  EXPECT_EQ(pool.AllocatedListSize(), 2);
-  EXPECT_EQ(pool.FreeListSize(), 0);
-  item = pool.AllocatedListItemSize(0);
-  EXPECT_EQ(item.first, 64);
-  EXPECT_EQ(item.second, 12455);
-  item = pool.AllocatedListItemSize(1);
-  EXPECT_EQ(item.first, 1024);
-  EXPECT_EQ(item.second, 1024);
+
+  // Allocate two buffers and check list sizes at each stage
+  auto did = t->device.device_id;
+  void* buf_data1 = pool.AllocMemory(t->device, 1024, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+  void* buf_data2 = pool.AllocMemory(t->device, 2048, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 2);
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+  FreeDataSpace(workspace, buf_data1, t->device, pool);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);
+  EXPECT_EQ(pool.FreeListSize(did), 1);
+  FreeDataSpace(workspace, buf_data2, t->device, pool);
+  EXPECT_EQ(pool.AllocatedListSize(did), 0);
+  EXPECT_EQ(pool.FreeListSize(did), 2);
 }
 
-TEST(OpenCLTexturePool, avoid_reusing_too_big_textures) {
+TEST(OpenCLTexturePool, buffer_allocations_and_reuse) {
   OpenCLWorkspace* workspace = OpenCLWorkspace::Global();
   OpenCLThreadEntry* t = workspace->GetThreadEntry();
-  PoolWrapper pool;
-  EXPECT_EQ(pool.AllocatedListSize(), 0);
-  EXPECT_EQ(pool.FreeListSize(), 0);
+  MemoryPoolWrapper pool(t->device, workspace);
+  DLDataType type{kDLFloat, 16, 1};
+
+  // Request buffer of 1024
+  auto did = t->device.device_id;
+  void* buf_data1 = pool.AllocMemory(t->device, 1024, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+  // Request buffer of 2048
+  void* buf_data2 = pool.AllocMemory(t->device, 2048, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 2);
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+  // Free buffer 1 (1024)
+  FreeDataSpace(workspace, buf_data1, t->device, pool);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);
+  EXPECT_EQ(pool.FreeListSize(did), 1);
 
+  // Request buffer of 768
+  void* buf_data3 = pool.AllocMemory(t->device, 768, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 2);  // Should have reused the 1024 
slot
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+
+  // Free buffer 2 (2048)
+  FreeDataSpace(workspace, buf_data2, t->device, pool);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);
+  EXPECT_EQ(pool.FreeListSize(did), 1);
+
+  // Request new buffer of 3076
+  void* buf_data4 = pool.AllocMemory(t->device, 3076, type);
+  EXPECT_EQ(pool.AllocatedListSize(did), 2);  // Should go for realloc
+  EXPECT_EQ(pool.FreeListSize(did), 0);
+
+  // Free buffer 3 (768)
+  FreeDataSpace(workspace, buf_data3, t->device, pool);
+  EXPECT_EQ(pool.AllocatedListSize(did), 1);
+  EXPECT_EQ(pool.FreeListSize(did), 1);
+
+  // Free buffer 4 (3076)
+  FreeDataSpace(workspace, buf_data4, t->device, pool);
+  EXPECT_EQ(pool.AllocatedListSize(did), 0);
+  EXPECT_EQ(pool.FreeListSize(did), 2);
+}
+
+TEST(OpenCLTexturePool, reuse_buffers_as_texture_back_buffers) {
+  OpenCLWorkspace* workspace = OpenCLWorkspace::Global();
+  OpenCLThreadEntry* t = workspace->GetThreadEntry();
+  MemoryPoolWrapper pool(t->device, workspace);
   DLDataType type{kDLFloat, 16, 1};
-  void* data1 = pool.Alloc(t->device, workspace, 12455, 64, type);
-  EXPECT_EQ(pool.AllocatedListSize(), 1);
-  EXPECT_EQ(pool.FreeListSize(), 0);
-  auto item = pool.AllocatedListItemSize(0);
-  EXPECT_EQ(item.first, 12455);
-  EXPECT_EQ(item.second, 64);
-
-  pool.Free(data1);
-  EXPECT_EQ(pool.AllocatedListSize(), 0);
-  EXPECT_EQ(pool.FreeListSize(), 1);
-  item = pool.FreeListItemSize(0);
-  EXPECT_EQ(item.first, 12455);
-  EXPECT_EQ(item.second, 64);
-
-  pool.Alloc(t->device, workspace, 1024, 768, type);
-  EXPECT_EQ(pool.AllocatedListSize(), 1);
-  EXPECT_EQ(pool.FreeListSize(), 1);
-  item = pool.FreeListItemSize(0);
-  EXPECT_EQ(item.first, 12455);
-  EXPECT_EQ(item.second, 64);
-  item = pool.AllocatedListItemSize(0);
-  EXPECT_EQ(item.first, 1024);
-  EXPECT_EQ(item.second, 768);
+  auto did = t->device.device_id;
+
+  if (!workspace->IsBufferToImageSupported(did)) {
+    return;

Review Comment:
   ```suggestion
       GTEST_SKIP() << "Skipping this test, because buffer to image is 
unsupported on this device";
   ```



##########
tests/cpp-runtime/opencl/texture_copy_test.cc:
##########
@@ -0,0 +1,318 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <dmlc/logging.h>
+#include <gtest/gtest.h>
+#include <tvm/runtime/registry.h>
+
+#include <cmath>
+#include <random>
+
+#include "../src/runtime/opencl/opencl_common.h"
+
+TEST(TextureCopy, HostDeviceRT) {
+  using namespace tvm;
+  bool enabled = tvm::runtime::RuntimeEnabled("opencl");
+  if (!enabled) {
+    LOG(INFO) << "Skip texture copy test because opencl runtime is 
disabled.\n";
+    return;

Review Comment:
   ```suggestion
       GTEST_SKIP() << "Skip texture copy test because opencl runtime is 
disabled.\n";
   ```



##########
src/runtime/opencl/memory_pool.cc:
##########
@@ -0,0 +1,186 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file memory_pool.h
+ * \brief OpenCL Workspace pool utility.
+ */
+#include "memory_pool.h"
+
+#include <memory>
+
+#include "opencl_common.h"
+
+#define ALIGN_UP(num, align) (((num) + ((align)-1)) & ~((align)-1))
+
+namespace tvm {
+namespace runtime {
+namespace cl {
+
+void* Pool::Alloc(Device dev, DeviceAPI* device, size_t nbytes) {
+  Entry e;
+  DLDataType type;
+  type.code = kDLUInt;
+  type.bits = 8;
+  type.lanes = 1;
+  if (free_list_.size() == 2) {
+    e = free_list_.back();
+    free_list_.pop_back();
+    if (e.size < nbytes) {
+      // resize the page
+      OpenCLWorkspace::Global()->FreeCLBuffer(dev, e.data);
+      e.data = OpenCLWorkspace::Global()->AllocCLBuffer(dev, nbytes, 
kTempAllocaAlignment, type);
+      e.size = nbytes;
+    }
+  } else if (free_list_.size() == 1) {
+    e.data = OpenCLWorkspace::Global()->AllocCLBuffer(dev, nbytes, 
kTempAllocaAlignment, type);
+    e.size = nbytes;
+  } else {
+    if (free_list_.back().size >= nbytes) {
+      // find smallest fit
+      auto it = free_list_.end() - 2;
+      for (; it->size >= nbytes; --it) {
+      }
+      e = *(it + 1);
+      free_list_.erase(it + 1);
+    } else {
+      // resize the page
+      e = free_list_.back();
+      free_list_.pop_back();
+      OpenCLWorkspace::Global()->FreeCLBuffer(dev, e.data);
+      e.data = OpenCLWorkspace::Global()->AllocCLBuffer(dev, nbytes, 
kTempAllocaAlignment, type);
+      e.size = nbytes;
+    }
+  }
+  e.dev = dev;
+  allocated_.push_back(e);
+  return e.data;
+}
+
+void Pool::Free(void* data) {
+  Entry e;
+  if (allocated_.back().data == data) {
+    // quick path, last allocated.
+    e = allocated_.back();
+    allocated_.pop_back();
+  } else {
+    int index = static_cast<int>(allocated_.size()) - 2;
+    for (; index > 0 && allocated_[index].data != data; --index) {
+    }
+    ICHECK_GT(index, 0) << "trying to free things that has not been allocated";
+    e = allocated_[index];
+    allocated_.erase(allocated_.begin() + index);
+  }
+  if (free_list_.back().size < e.size) {
+    free_list_.push_back(e);
+  } else if (free_list_.size() == 2) {
+    free_list_.push_back(free_list_.back());
+    free_list_[1] = e;
+  } else {
+    size_t i = free_list_.size() - 1;
+    free_list_.resize(free_list_.size() + 1);
+    for (; e.size < free_list_[i].size; --i) {
+      free_list_[i + 1] = free_list_[i];
+    }
+    free_list_[i + 1] = e;
+  }
+}
+
+void Pool::Release(Device dev, DeviceAPI* device) {
+  for (size_t i = 1; i < free_list_.size(); ++i) {
+    OpenCLWorkspace::Global()->FreeCLBuffer(dev, free_list_[i].data);
+  }
+  free_list_.clear();
+}
+
+MemoryPool::MemoryPool(DLDeviceType device_type, DeviceAPI* device)
+    : device_type_(device_type), device_(device) {}
+
+MemoryPool::~MemoryPool() {
+  for (size_t i = 0; i < array_.size(); ++i) {
+    if (array_[i] != nullptr) {
+      Device dev;
+      dev.device_type = device_type_;
+      dev.device_id = static_cast<int>(i);
+      array_[i]->Release(dev, device_);
+      delete array_[i];
+    }
+  }
+}
+
+void MemoryPool::InitDevicePool(int device_id) {
+  if (static_cast<size_t>(device_id) >= array_.size()) {
+    array_.resize(device_id + 1, nullptr);
+  }
+  if (array_[device_id] == nullptr) {
+    array_[device_id] = new Pool();
+  }
+}
+
+void* MemoryPool::AllocMemory(Device dev, size_t size, DLDataType type_hint) {
+  InitDevicePool(dev.device_id);
+  return array_[dev.device_id]->Alloc(dev, device_, size);
+}
+
+size_t GetRowPitch(Device dev, size_t width, DLDataType type_hint) {
+  cl_uint row_align = 
OpenCLWorkspace::Global()->GetImageAlignment(dev.device_id);
+  size_t pixel_size = (type_hint.bits * type_hint.lanes + 7) / 8;
+  return ALIGN_UP(width * pixel_size * 4, row_align);  // CL_RGBA = 4
+}
+
+void* MemoryPool::AllocMemory(Device dev, size_t width, size_t height, 
DLDataType type_hint,
+                              Optional<String> mem_scope) {
+  InitDevicePool(dev.device_id);
+
+  // Texture allocation given width and height
+  size_t row_pitch = GetRowPitch(dev, width, type_hint);
+  size_t mem_size = row_pitch * height;
+
+  // Alloc back buffer from pool
+  void* back_buffer = nullptr;
+  if (OpenCLWorkspace::Global()->IsBufferToImageSupported(dev.device_id)) {
+    back_buffer = array_[dev.device_id]->Alloc(dev, device_, mem_size);
+  }
+
+  if (!mem_scope.defined()) {
+    mem_scope = Optional<String>("global.texture");

Review Comment:
   I'm a bit confused because usually if `mem_scope` isn't defined then buffer 
memory scope is used. But here if `mem_scope` is not defined then you use 
texture memory scope.



##########
tests/cpp/relay/backend/graph_plan_token_alloc.cc:
##########
@@ -24,23 +24,24 @@
 namespace tvm {
 namespace relay {
 
-// TokenAllocator2d is necessary because in class TokenAllocator2D we don't
+// TokenAllocatorMixed is necessary because in class TokenAllocatorMixed we 
don't
 // have an access to its protected members. In this class we add new methods
-// which allow us to get and check internal state of class TokenAllocator2D
-class TokenAllocator2DWrapper : public TokenAllocator2D {
+// which allow us to get and check internal state of class TokenAllocatorMixed
+class TokenAllocatorMixedWrapper : public TokenAllocatorMixed {
  public:
-  inline size_t FreeListSize() const { return free_list_.size(); }
-  inline size_t BlockMapSize() const { return blocks_.size(); }
+  inline size_t FreeListSize() const { return free_.size(); }
+  inline size_t AllocListSize() const { return data_.size(); }
 };
 
-TEST(Token2DAlloc, OneToken) {
-  TokenAllocator2DWrapper alloc;
+TEST(TokenMixedAlloc, OneToken) {
+  TokenAllocatorMixedWrapper alloc;
   int storage_ids = 0;
-  EXPECT_EQ(alloc.BlockMapSize(), 0);
+  EXPECT_EQ(alloc.AllocListSize(), 0);
   EXPECT_EQ(alloc.FreeListSize(), 0);
 
   TensorType tt1({1, 22, 20, 20, 4}, DataType(kDLFloat, 32, 1));
-  VirtualDevice vd1(kDLOpenCL, 0, {}, MemoryScope("global.texture-nhwc"));
+  VirtualDevice vd1(kDLOpenCL, 0, Target("opencl -device=adreno"),

Review Comment:
   I'd like to see the same number of tests for `opencl` target (w/o adreno). 
In this case we can check that token allocator works for all scenarios. You can 
do that by using [parameterized 
tests](http://google.github.io/googletest/advanced.html#how-to-write-value-parameterized-tests).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

[GitHub] [tvm] echuraev commented on a diff in pull request #15058: [OpenCL][Texture] Improved texture memory planning

Reply via email to