This is an automated email from the ASF dual-hosted git repository.

ziheng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git


The following commit(s) were added to refs/heads/main by this push:
     new 010d11b  [Texture support][Part 0] Device API and runtime support 
(#7711)
010d11b is described below

commit 010d11ba843ebf315bade46ffd0fa763655652b3
Author: Chris Sullivan <[email protected]>
AuthorDate: Fri Jun 4 22:06:29 2021 -0700

    [Texture support][Part 0] Device API and runtime support (#7711)
    
    * Add TVMBackendAllocTexture and support in OpenCL device API.
    
    * Add runtime optimized caching allocator.
    This should be replaced with AOT memory planning
    when the relay/tir/compile engine refactor lands.
    
    * Few bug fixes for runtime texture allocator.
    
    * Add OpenCL device api support for image2d<float16> textures.
    
    * Update OpenCL DeviceAPI to support Image2D data space
    allocations and copying to/from host/image2d directly.
    Allocation employs a lowering convention to 2d images
    for activations and weights.
    
    * Fix to follow OpenCL spec. for indexing.
    
    * Rename texture_pool.h -> texture.h
    
    * Move Nd to 2d lowering convention code into runtime texture
    utilities that can be shared by codegen and the runtime.
    
    * Update texture lowering utilities
    
    * Add TODO comment about pitch support
    
    * Remove FreeTexture
    
    * Fix ICHECK comment
    
    * Partial cherry pick from @ZihengJiang
    [email protected]:ZihengJiang/tvm.git:52822c5bd
    [RUNTIME] OpenCL texture memory.
    
    * Remove runtime and device texture APIs.
    
    * Add OpenCL packed functions for texture workspace (de)allocations.
    
    * Add OpenCLBuffer structure to track
    memory layout through OpenCL Device API.
    
    * Rebase: TVMContext -> Device
    
    * Implement DLTensor* overload of CopyDataToFrom in OpenCL DeviceAPI.
    
    * Implement OpenCL CopyDataFromTo(DLTensor*...)
    overload and tensor shapes to calculate image extent
    when copying data directly to or from the texture cache.
    
    * Update format (cpp-lint)
    
    * Update format (clang)
    
    * Buffer descriptor name change and formatting.
    
    * Add texture pool documentation.
    
    * Update runtime to use new global.texture scope.
    
    * Move texture_pool.cc into opencl impl.
    
    * Add test coverage for copying in and out
    of storage allocs of texture scope.
    
    * Documented APIs and structures, renamed buffer descriptor layout tags.
    
    Co-authored-by: ZihengJiang <[email protected]>
---
 src/runtime/opencl/opencl_common.h      |  64 +++++++++-
 src/runtime/opencl/opencl_device_api.cc | 215 ++++++++++++++++++++++++++++----
 src/runtime/opencl/opencl_module.cc     |   3 +-
 src/runtime/opencl/texture_pool.cc      | 166 ++++++++++++++++++++++++
 src/runtime/texture.h                   | 144 +++++++++++++++++++++
 tests/cpp/texture_copy_test.cc          | 142 +++++++++++++++++++++
 6 files changed, 703 insertions(+), 31 deletions(-)

diff --git a/src/runtime/opencl/opencl_common.h 
b/src/runtime/opencl/opencl_common.h
index ad2040a..c31576f 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -66,6 +66,7 @@
 #include "../file_utils.h"
 #include "../meta_data.h"
 #include "../pack_args.h"
+#include "../texture.h"
 #include "../thread_storage_scope.h"
 #include "../workspace_pool.h"
 
@@ -174,6 +175,29 @@ inline const char* CLGetErrorString(cl_int error) {
   }
 }
 
+inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) {
+  DataType dtype(data_type);
+  if (dtype == DataType::Float(32)) {
+    return CL_FLOAT;
+  } else if (dtype == DataType::Float(16)) {
+    return CL_HALF_FLOAT;
+  } else if (dtype == DataType::Int(8)) {
+    return CL_SIGNED_INT8;
+  } else if (dtype == DataType::Int(16)) {
+    return CL_SIGNED_INT16;
+  } else if (dtype == DataType::Int(32)) {
+    return CL_SIGNED_INT32;
+  } else if (dtype == DataType::UInt(8)) {
+    return CL_UNSIGNED_INT8;
+  } else if (dtype == DataType::UInt(16)) {
+    return CL_UNSIGNED_INT16;
+  } else if (dtype == DataType::UInt(32)) {
+    return CL_UNSIGNED_INT32;
+  }
+  LOG(FATAL) << "data type is not supported in OpenCL runtime yet: " << dtype;
+  return CL_FLOAT;
+}
+
 /*!
  * \brief Protected OpenCL call
  * \param func Expression to call.
@@ -243,11 +267,18 @@ class OpenCLWorkspace : public DeviceAPI {
   void SetDevice(Device dev) final;
   void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final;
   void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType 
type_hint) final;
+  void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType 
dtype,
+                       Optional<String> mem_scope = NullOpt) final;
   void FreeDataSpace(Device dev, void* ptr) final;
   void StreamSync(Device dev, TVMStreamHandle stream) final;
   void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final;
   void FreeWorkspace(Device dev, void* data) final;
 
+  // Texture (image2d_t) alloca APIs
+  cl_mem AllocTexture(Device dev, size_t width, size_t height, DLDataType 
type_hint);
+  void* AllocTextureWorkspace(Device dev, size_t width, size_t height, 
DLDataType type_hint);
+  void FreeTextureWorkspace(Device dev, void* data);
+
   /*!
    * \brief Get the thread local ThreadEntry
    */
@@ -256,10 +287,7 @@ class OpenCLWorkspace : public DeviceAPI {
   // get the global workspace
   static OpenCLWorkspace* Global();
 
- protected:
-  void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t 
to_offset, size_t size,
-                      Device dev_from, Device dev_to, DLDataType type_hint,
-                      TVMStreamHandle stream) final;
+  void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) 
final;
 };
 
 /*! \brief Thread local workspace */
@@ -278,9 +306,11 @@ class OpenCLThreadEntry {
   std::vector<KTEntry> kernel_table;
   /*! \brief workspace pool */
   WorkspacePool pool;
+  /*! \brief texture pool */
+  TexturePool texture_pool;
   // constructor
   OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api)
-      : pool(device_type, device_api) {
+      : pool(device_type, device_api), texture_pool(device_type, device_api) {
     device.device_id = 0;
     device.device_type = device_type;
   }
@@ -289,6 +319,29 @@ class OpenCLThreadEntry {
   // get the global workspace
   static OpenCLThreadEntry* ThreadLocal();
 };
+
+/*! \brief OpenCL runtime buffer structure with tracked memory layout */
+struct BufferDescriptor {
+  enum class MemoryLayout {
+    /*! \brief One dimensional buffer in row-major layout*/
+    kBuffer1D,
+    /*! \brief Two dimensional texture w/ width = axis[-1]
+     *          e.g. image2d[height=NCH, width=W]
+     */
+    kImage2DActivation,
+    /*! \brief Two dimensional texture w/ height = axis[0]
+     *         e.g. image2d[height=O, width=IHW]
+     */
+    kImage2DWeight,
+  };
+  BufferDescriptor() = default;
+  explicit BufferDescriptor(Optional<String> scope) : 
layout(MemoryLayoutFromScope(scope)) {}
+  static MemoryLayout MemoryLayoutFromScope(Optional<String> mem_scope);
+  static String ScopeFromMemoryLayout(MemoryLayout mem_scope);
+
+  cl_mem buffer{nullptr};
+  MemoryLayout layout{MemoryLayout::kBuffer1D};
+};
 }  // namespace cl
 
 // Module to support thread-safe multi-device execution.
@@ -349,7 +402,6 @@ class OpenCLModuleNode : public ModuleNode {
   // parsed kernel data
   std::unordered_map<std::string, std::string> parsed_kernels_;
 };
-
 }  // namespace runtime
 }  // namespace tvm
 #endif  // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
diff --git a/src/runtime/opencl/opencl_device_api.cc 
b/src/runtime/opencl/opencl_device_api.cc
index e9f092c..26eddb4 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -32,6 +32,63 @@ namespace cl {
 std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name);
 std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name);
 
+struct ImageInfo {
+  size_t origin[3] = {};
+  size_t region[3] = {};
+  size_t row_pitch = 0;
+  size_t slice_pitch = 0;
+};
+
+/*!
+ * \brief Utility to apply a memory layout specific lowering convention
+ * to infer the physical shape from the provided DLTensor's logical shape.
+ * \param desc Descriptor which contains the buffer and layout tag.
+ * \param tensor The DLTensor used to infer the tensor's physical shape.
+ */
+ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor* 
tensor) {
+  ImageInfo info{};
+  ICHECK(tensor->dtype.lanes == 1) << "Image dtype has lanes: " << 
tensor->dtype.lanes;
+
+  info.origin[0] = info.origin[1] = info.origin[2] = 0;
+  info.row_pitch = 0;
+  info.slice_pitch = 0;
+
+  size_t axis = DefaultTextureLayoutSeparator(
+      tensor->ndim, cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout));
+  auto texture_shape = ApplyTexture2DFlattening<int64_t>(tensor->shape, 
tensor->ndim, axis);
+  info.region[0] = texture_shape.width;
+  info.region[1] = texture_shape.height;
+  info.region[2] = 1;
+  return info;
+}
+
+cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope(
+    Optional<String> mem_scope) {
+  if (!mem_scope.defined()) {
+    return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
+  } else if (mem_scope.value() == "global.texture") {
+    return cl::BufferDescriptor::MemoryLayout::kImage2DActivation;
+  } else if (mem_scope.value() == "global.texture-weight") {
+    return cl::BufferDescriptor::MemoryLayout::kImage2DWeight;
+  }
+  LOG(FATAL) << "No memory layout defined for memory of scope: " << 
mem_scope.value();
+  return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
+}
+
+String 
cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryLayout 
layout) {
+  switch (layout) {
+    case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
+      return "global";
+    case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
+      return "global.texture";
+    case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
+      return "global.texture-weight";
+  }
+  LOG(FATAL) << "No scope corresponding to the provided memory layout: "
+             << static_cast<int>(layout);
+  return "";
+}
+
 OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() { return 
OpenCLThreadEntry::ThreadLocal(); }
 
 OpenCLWorkspace* OpenCLWorkspace::Global() {
@@ -138,9 +195,30 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t 
size, size_t alignment,
   this->Init();
   ICHECK(context != nullptr) << "No OpenCL device";
   cl_int err_code;
-  cl_mem mptr = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, 
nullptr, &err_code);
+  cl::BufferDescriptor* desc = new cl::BufferDescriptor;
+  desc->buffer = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, 
nullptr, &err_code);
+  desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
   OPENCL_CHECK_ERROR(err_code);
-  return mptr;
+  return desc;
+}
+
+void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* 
shape, DLDataType dtype,
+                                      Optional<String> mem_scope) {
+  if (!mem_scope.defined() || mem_scope.value() == "global") {
+    return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
+  }
+  ICHECK(IsTextureStorage(std::string(mem_scope.value())))
+      << "Device does not support allocate data space with "
+      << "specified memory scope: " << mem_scope.value();
+
+  ICHECK(ndim > 2) << "Shape for texture allocation must be at least rank 3; "
+                   << "provided shape is rank " << ndim;
+
+  cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope);
+  size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value());
+  auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis);
+  desc->buffer = AllocTexture(dev, texture.width, texture.height, dtype);
+  return desc;
 }
 
 void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
@@ -148,31 +226,87 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void* 
ptr) {
   // for some OpenCL platforms.
   OPENCL_CALL(clFinish(this->GetQueue(dev)));
 
-  cl_mem mptr = static_cast<cl_mem>(ptr);
-  OPENCL_CALL(clReleaseMemObject(mptr));
+  cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
+  OPENCL_CALL(clReleaseMemObject(desc->buffer));
+  delete desc;
 }
 
-void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset, 
void* to,
-                                     size_t to_offset, size_t size, Device 
dev_from, Device dev_to,
-                                     DLDataType type_hint, TVMStreamHandle 
stream) {
+cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height,
+                                     DLDataType type_hint) {
   this->Init();
-  ICHECK(stream == nullptr);
-  if (IsOpenCLDevice(dev_from) && IsOpenCLDevice(dev_to)) {
-    OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(dev_to),
-                                    static_cast<cl_mem>((void*)from),  // 
NOLINT(*)
-                                    static_cast<cl_mem>(to), from_offset, 
to_offset, size, 0,
-                                    nullptr, nullptr));
-  } else if (IsOpenCLDevice(dev_from) && dev_to.device_type == kDLCPU) {
-    OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(dev_from),
-                                    static_cast<cl_mem>((void*)from),  // 
NOLINT(*)
-                                    CL_FALSE, from_offset, size, 
static_cast<char*>(to) + to_offset,
-                                    0, nullptr, nullptr));
-    OPENCL_CALL(clFinish(this->GetQueue(dev_from)));
-  } else if (dev_from.device_type == kDLCPU && IsOpenCLDevice(dev_to)) {
-    OPENCL_CALL(clEnqueueWriteBuffer(this->GetQueue(dev_to), 
static_cast<cl_mem>(to), CL_FALSE,
-                                     to_offset, size, static_cast<const 
char*>(from) + from_offset,
-                                     0, nullptr, nullptr));
-    OPENCL_CALL(clFinish(this->GetQueue(dev_to)));
+  ICHECK(context != nullptr) << "No OpenCL device";
+  cl_int err_code;
+  cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint);
+  cl_image_format format = {CL_RGBA, cl_type};
+  cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0, 
0, 0, 0};
+  cl_mem mptr =
+      clCreateImage(this->context, CL_MEM_READ_WRITE, &format, &descriptor, 
nullptr, &err_code);
+  OPENCL_CHECK_ERROR(err_code);
+  return mptr;
+}
+
+void* OpenCLWorkspace::AllocTextureWorkspace(Device dev, size_t width, size_t 
height,
+                                             DLDataType type_hint) {
+  return GetThreadEntry()->texture_pool.AllocTexture(dev, width, height, 
type_hint);
+}
+
+void OpenCLWorkspace::FreeTextureWorkspace(Device dev, void* ptr) {
+  GetThreadEntry()->texture_pool.FreeTexture(dev, ptr);
+}
+
+void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, 
TVMStreamHandle stream) {
+  size_t nbytes = GetDataSize(*from);
+  ICHECK_EQ(nbytes, GetDataSize(*to));
+  ICHECK(IsContiguous(*from) && IsContiguous(*to))
+      << "CopyDataFromTo only support contiguous array for now";
+
+  if (IsOpenCLDevice(from->device) && IsOpenCLDevice(to->device)) {
+    const auto* from_desc = static_cast<const 
cl::BufferDescriptor*>(from->data);
+    ICHECK(from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D)
+        << "Device to device copying is currently only implemented for OpenCL 
buffer storage";
+    auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
+    OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), 
from_desc->buffer, to_desc->buffer,
+                                    from->byte_offset, to->byte_offset, 
nbytes, 0, nullptr,
+                                    nullptr));
+  } else if (IsOpenCLDevice(from->device) && to->device.device_type == kDLCPU) 
{
+    const auto* from_desc = static_cast<const 
cl::BufferDescriptor*>(from->data);
+    switch (from_desc->layout) {
+      case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
+        OPENCL_CALL(clEnqueueReadBuffer(
+            this->GetQueue(from->device), from_desc->buffer, CL_FALSE, 
from->byte_offset, nbytes,
+            static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, 
nullptr));
+        break;
+      case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
+      case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
+        auto image_info = GetImageInfo(from_desc, from);
+        // TODO(csullivan): Support calculating row_pitch correctly in the 
case of reuse.
+        // Note that when utilizing texture pools for memory reuse, the 
allocated image
+        // size can be larger than the size to be read.
+        OPENCL_CALL(clEnqueueReadImage(
+            this->GetQueue(from->device), from_desc->buffer, CL_FALSE, 
image_info.origin,
+            image_info.region, image_info.row_pitch, image_info.slice_pitch,
+            static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, 
nullptr));
+        break;
+    }
+    OPENCL_CALL(clFinish(this->GetQueue(from->device)));
+  } else if (from->device.device_type == kDLCPU && IsOpenCLDevice(to->device)) 
{
+    auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
+    switch (to_desc->layout) {
+      case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
+        OPENCL_CALL(clEnqueueWriteBuffer(
+            this->GetQueue(to->device), to_desc->buffer, CL_FALSE, 
to->byte_offset, nbytes,
+            static_cast<const char*>(from->data) + from->byte_offset, 0, 
nullptr, nullptr));
+        break;
+      case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
+      case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
+        auto image_info = GetImageInfo(to_desc, to);
+        OPENCL_CALL(clEnqueueWriteImage(
+            this->GetQueue(to->device), to_desc->buffer, CL_FALSE, 
image_info.origin,
+            image_info.region, image_info.row_pitch, image_info.slice_pitch,
+            static_cast<const char*>(from->data) + from->byte_offset, 0, 
nullptr, nullptr));
+        break;
+    }
+    OPENCL_CALL(clFinish(this->GetQueue(to->device)));
   } else {
     LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
   }
@@ -291,6 +425,39 @@ void OpenCLWorkspace::Init(const std::string& type_key, 
const std::string& devic
   initialized_ = true;
 }
 
+TVM_REGISTER_GLOBAL("device_api.opencl.AllocTexture").set_body([](TVMArgs 
args, TVMRetValue* rv) {
+  int device_type = args[0];
+  int device_id = args[1];
+  int width = args[2];
+  int height = args[3];
+  int dtype_code_hint = args[4];
+  int dtype_bits_hint = args[5];
+  Device dev;
+  dev.device_type = static_cast<DLDeviceType>(device_type);
+  dev.device_id = device_id;
+
+  DLDataType type_hint;
+  type_hint.code = static_cast<decltype(type_hint.code)>(dtype_code_hint);
+  type_hint.bits = static_cast<decltype(type_hint.bits)>(dtype_bits_hint);
+  type_hint.lanes = 1;
+
+  OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
+  *rv = ptr->AllocTextureWorkspace(dev, static_cast<size_t>(width), 
static_cast<size_t>(height),
+                                   type_hint);
+});
+
+TVM_REGISTER_GLOBAL("device_api.opencl.FreeTexture").set_body([](TVMArgs args, 
TVMRetValue* rv) {
+  int device_type = args[0];
+  int device_id = args[1];
+  void* data = args[2];
+  OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
+  Device dev;
+  dev.device_type = static_cast<DLDeviceType>(device_type);
+  dev.device_id = device_id;
+  ptr->FreeTextureWorkspace(dev, data);
+  *rv = static_cast<int32_t>(0);
+});
+
 TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args, 
TVMRetValue* rv) {
   DeviceAPI* ptr = OpenCLWorkspace::Global();
   *rv = static_cast<void*>(ptr);
diff --git a/src/runtime/opencl/opencl_module.cc 
b/src/runtime/opencl/opencl_module.cc
index 40aa666..397f57b 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -64,7 +64,8 @@ class OpenCLWrappedFunc {
     }
     // setup arguments.
     for (cl_uint i = 0; i < arg_size_.size(); ++i) {
-      OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], void_args[i]));
+      auto* arg = static_cast<cl::BufferDescriptor*>(void_args[i]);
+      OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], arg->buffer));
     }
     cl_command_queue queue = w_->GetQueue(t->device);
     ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
diff --git a/src/runtime/opencl/texture_pool.cc 
b/src/runtime/opencl/texture_pool.cc
new file mode 100644
index 0000000..bf52894
--- /dev/null
+++ b/src/runtime/opencl/texture_pool.cc
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file texture_pool.cc
+ * \brief Texture pool utility.
+ */
+#include <limits>
+#include <memory>
+
+#include "../texture.h"
+
+namespace tvm {
+namespace runtime {
+
+class TexturePool::Pool {
+ public:
+  Pool() = default;
+  void* Alloc(Device dev, DeviceAPI* device, size_t width, size_t height, 
DLDataType type_hint) {
+    Entry e;
+    e.data = nullptr;
+    if (free_list_.size() != 0) {
+      int64_t req_size = height * width;
+      Entry new_mem;
+      int64_t min_added_size = std::numeric_limits<int64_t>::max();
+      int64_t min_wasted_size = std::numeric_limits<int64_t>::max();
+      std::vector<Entry>::iterator best_mem;
+      for (auto it = free_list_.begin(); it != free_list_.end(); ++it) {
+        if (it->type.code != type_hint.code) {
+          continue;
+        }
+        int64_t old_size = it->x * it->y;
+        new_mem.x = std::max(it->x, width);
+        new_mem.y = std::max(it->y, height);
+        int64_t new_size = new_mem.x * new_mem.y;
+        int64_t added_size = new_size - old_size;
+        int64_t wasted_size = new_size - req_size;
+        // Minimize added size first and wasted size thereafter
+        if ((min_added_size > 0 && added_size < min_added_size) ||
+            (min_added_size == 0 && wasted_size < min_wasted_size)) {
+          min_added_size = added_size;
+          min_wasted_size = wasted_size;
+          best_mem = it;
+        }
+      }
+
+      if (min_added_size == 0) {
+        // use existing block
+        e = *best_mem;
+        free_list_.erase(best_mem);
+      } else if (min_added_size <= req_size) {
+        // if added size is less or equal to
+        // what is needed by alloc, then grow entry
+        device->FreeDataSpace(dev, best_mem->data);
+        free_list_.erase(best_mem);
+        new_mem.type = type_hint;
+        std::vector<int64_t> shape{int64_t(new_mem.y), int64_t(new_mem.x), 4};
+        new_mem.data = device->AllocDataSpace(dev, shape.size(), shape.data(), 
new_mem.type,
+                                              
Optional<String>("global.texture"));
+        e = new_mem;
+      }
+    }
+
+    if (e.data == nullptr) {
+      // create new block
+      std::vector<int64_t> shape{int64_t(height), int64_t(width), 4};
+      e.data = device->AllocDataSpace(dev, shape.size(), shape.data(), 
type_hint,
+                                      Optional<String>("global.texture"));
+      e.x = width;
+      e.y = height;
+      e.type = type_hint;
+    }
+
+    allocated_.push_back(e);
+    return e.data;
+  }
+
+  void Free(void* data) {
+    Entry e;
+    if (allocated_.back().data == data) {
+      // quick path, last allocated.
+      e = allocated_.back();
+      allocated_.pop_back();
+    } else {
+      int index = static_cast<int>(allocated_.size()) - 2;
+      for (; index >= 0 && allocated_[index].data != data; --index) {
+      }
+      ICHECK_GE(index, 0) << "Attempt to free texture that has not been 
allocated";
+      e = allocated_[index];
+      allocated_.erase(allocated_.begin() + index);
+    }
+    free_list_.push_back(e);
+  }
+
+  // Release all resources immediately
+  void Release(Device dev, DeviceAPI* device) {
+    for (auto& e : allocated_) {
+      device->FreeDataSpace(dev, e.data);
+    }
+    for (auto& e : free_list_) {
+      device->FreeDataSpace(dev, e.data);
+    }
+    allocated_.clear();
+    free_list_.clear();
+  }
+
+ private:
+  struct Entry {
+    void* data;
+    size_t x;
+    size_t y;
+    DLDataType type;
+  };
+  std::vector<Entry> free_list_;
+  std::vector<Entry> allocated_;
+};
+
+TexturePool::TexturePool(DLDeviceType device_type, DeviceAPI* device)
+    : device_type_(device_type), device_(device) {}
+
+TexturePool::~TexturePool() {
+  for (size_t i = 0; i < array_.size(); ++i) {
+    if (array_[i] != nullptr) {
+      Device dev;
+      dev.device_type = device_type_;
+      dev.device_id = static_cast<int>(i);
+      array_[i]->Release(dev, device_);
+      delete array_[i];
+    }
+  }
+}
+
+void* TexturePool::AllocTexture(Device dev, size_t width, size_t height, 
DLDataType type_hint) {
+  if (static_cast<size_t>(dev.device_id) >= array_.size()) {
+    array_.resize(dev.device_id + 1, nullptr);
+  }
+  if (array_[dev.device_id] == nullptr) {
+    array_[dev.device_id] = new Pool();
+  }
+  return array_[dev.device_id]->Alloc(dev, device_, width, height, type_hint);
+}
+
+void TexturePool::FreeTexture(Device dev, void* ptr) {
+  ICHECK(static_cast<size_t>(dev.device_id) < array_.size() && 
array_[dev.device_id] != nullptr)
+      << "Attempt to free texture from null texture pool";
+  array_[dev.device_id]->Free(ptr);
+}
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/src/runtime/texture.h b/src/runtime/texture.h
new file mode 100644
index 0000000..83725c0
--- /dev/null
+++ b/src/runtime/texture.h
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file texture.h
+ * \brief Texture utilities
+ */
+#ifndef TVM_RUNTIME_TEXTURE_H_
+#define TVM_RUNTIME_TEXTURE_H_
+
+#include <tvm/runtime/device_api.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace tvm {
+namespace runtime {
+
+/*! \brief Structure to represent flattened texture shape */
+template <typename T>
+struct Texture2DShape {
+  T width;
+  T height;
+  T channel;
+};
+
+/*!
+ * \param shape_rank Rank N of the Nd-shape
+ * \param convention Storage scope convention to use for flattening
+ * \return The axis separator that defines the Nd shape partitioning in 2d
+ */
+inline size_t DefaultTextureLayoutSeparator(size_t shape_rank,
+                                            std::string convention = 
"global.texture") {
+  // Texture activation:
+  // e.g. [N,C,H,W,c] -> Texture2d[N*C*H, W, c]
+  // Texture weight:
+  // e.g. [O,I,H,W,c] -> Texture2d[O, I*H*W, c]
+  size_t separator = 0;
+  if (convention == "global.texture") {
+    separator = shape_rank - 2;
+  } else if (convention == "global.texture-weight") {
+    separator = 1;
+  } else {
+    LOG(FATAL) << "Encountered unknown texture lowering convention: " << 
convention;
+  }
+  return separator;
+}
+
+/*!
+ * \param shape Nd shape
+ * \param rank Number of dimensions N of the Nd shape
+ * \param axis The axis separator that splits the Nd axes into two sets
+ * \return Width and height of the 2d shape
+ */
+template <typename T, typename S>
+Texture2DShape<T> ApplyTexture2DFlattening(const S& shape, size_t rank, size_t 
axis) {
+  ICHECK(axis < rank)
+      << "Number of axes to flatten into rows must be less than shape rank for 
2d flattening";
+  Texture2DShape<T> texture{1, 1, shape[rank - 1]};
+  for (size_t i = 0; i < rank - 1; i++) {
+    if (i < axis) {
+      texture.height *= shape[i];
+    } else {
+      texture.width *= shape[i];
+    }
+  }
+  return texture;
+}
+
+inline bool IsTextureStorage(std::string scope) {
+  return scope.find("texture") != std::string::npos;
+}
+
+/*!
+ * \brief A two dimensional storage pool that recycles temporal workspace
+ * allocations for dynamically allocated texture. See AllocTexture docstring
+ * for approach to allocation and reuse.
+ */
+class TVM_DLL TexturePool {
+ public:
+  /*!
+   * \brief Create pool with specific device type and device.
+   * \param device_type The device type.
+   * \param device_api The device API.
+   */
+  TexturePool(DLDeviceType device_type, DeviceAPI* device_api);
+  /*! \brief destructor */
+  ~TexturePool();
+
+  /*!
+   * \brief Allocate a two dimensional temporal texture workspace on device
+   *
+   * \note Two dimensional texture workspaces will be grown and reused
+   * according to the following strategy:
+   *  - Choose the workspace which minimizes the amount of memory required to
+   *    grow the workspace to fit the request.
+   *  - If a set of workspaces exist that fit the current request without
+   *    expansion, choose the workspace of that set which most closely
+   *    matches the request size, minimizing wasted space.
+   *
+   * \param dev The context of allocation.
+   * \param width The width of the 2d texture to be allocated.
+   * \param height The height of the 2d texture to be allocated.
+   * \param type_hint The type of elements.
+   */
+  void* AllocTexture(Device dev, size_t width, size_t height, DLDataType 
type_hint);
+  /*!
+   * \brief Free temporal texture in backend execution.
+   *
+   * \param dev The context of allocation.
+   * \param ptr The pointer to be freed.
+   */
+  void FreeTexture(Device dev, void* ptr);
+
+ private:
+  class Pool;
+  /*! \brief pool of device local array */
+  std::vector<Pool*> array_;
+  /*! \brief device type this pool support */
+  DLDeviceType device_type_;
+  /*! \brief The device API */
+  DeviceAPI* device_;
+};
+
+}  // namespace runtime
+}  // namespace tvm
+#endif  // TVM_RUNTIME_TEXTURE_H_
diff --git a/tests/cpp/texture_copy_test.cc b/tests/cpp/texture_copy_test.cc
new file mode 100644
index 0000000..688bcab
--- /dev/null
+++ b/tests/cpp/texture_copy_test.cc
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <dmlc/logging.h>
+#include <gtest/gtest.h>
+#include <tvm/runtime/registry.h>
+
+#include <cmath>
+#include <random>
+
+TEST(TextureCopy, HostDeviceRT) {
+  using namespace tvm;
+  // Texture copies require the OpenCL runtime; skip when it is unavailable.
+  bool enabled = tvm::runtime::RuntimeEnabled("opencl");
+  if (!enabled) {
+    LOG(INFO) << "Skip texture copy test because opencl runtime is disabled.\n";
+    return;
+  }
+
+  const std::vector<int64_t> shape{16, 16, 4};
+  const DLDataType dtype{kDLFloat, 32, 1};
+  auto host_src = runtime::NDArray::Empty(shape, dtype, {kDLCPU, 0});
+  auto host_dst = runtime::NDArray::Empty(shape, dtype, {kDLCPU, 0});
+  String mem_scope = "global.texture";
+  auto device_tex = runtime::NDArray::Empty(shape, dtype, {kDLOpenCL, 0}, mem_scope);
+
+  // Total element count of the array.
+  size_t num_elems = 1;
+  for (int64_t dim : shape) {
+    num_elems *= static_cast<size_t>(dim);
+  }
+
+  // Fill the host source array with uniform random values.
+  std::random_device seed_source;
+  std::mt19937 rng(seed_source());
+  std::uniform_real_distribution<> dist(-10.0, 10.0);
+  float* src_data = static_cast<float*>(host_src->data);
+  for (size_t i = 0; i < num_elems; i++) {
+    src_data[i] = dist(rng);
+  }
+
+  // Round trip host -> opencl texture -> host, then compare elementwise.
+  host_src.CopyTo(device_tex);
+  device_tex.CopyTo(host_dst);
+  const float* dst_data = static_cast<float*>(host_dst->data);
+  for (size_t i = 0; i < num_elems; ++i) {
+    ICHECK_LT(std::fabs(dst_data[i] - src_data[i]), 1e-5);
+  }
+}
+
+TEST(TextureCopy, OverwritePoolSubview) {
+  using namespace tvm;
+  bool enabled = tvm::runtime::RuntimeEnabled("opencl");
+  if (!enabled) {
+    LOG(INFO) << "Skip texture copy test because opencl runtime is disabled.\n";
+    return;
+  }
+
+  // Subview shape is smaller than the backing pool shape, so writes through
+  // the view must touch only the corresponding region of the pool.
+  std::vector<int64_t> shape{16, 16, 4};
+  std::vector<int64_t> shape_pool{32, 32, 4};
+  auto cpu_arr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto cpu_pool0 = runtime::NDArray::Empty(shape_pool, {kDLFloat, 32, 1}, {kDLCPU, 0});
+  auto cpu_pool1 = runtime::NDArray::Empty(shape_pool, {kDLFloat, 32, 1}, {kDLCPU, 0});
+
+  String mem_scope = "global.texture";
+  auto opencl_txpool =
+      runtime::NDArray::Empty(shape_pool, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, mem_scope);
+  // View into the texture pool with the smaller shape.
+  auto opencl_txarr0 = opencl_txpool.CreateView(shape, {kDLFloat, 32, 1});
+
+  std::random_device dev;
+  std::mt19937 mt(dev());
+  std::uniform_real_distribution<> random(-10.0, 10.0);
+
+  // Element count of the pool storage.
+  size_t size_pool = 1;
+  for (size_t i = 0; i < shape_pool.size(); ++i) {
+    size_pool *= static_cast<size_t>(shape_pool[i]);
+  }
+
+  // Random initialize host pool storage
+  for (size_t i = 0; i < size_pool; i++) {
+    static_cast<float*>(cpu_pool0->data)[i] = random(mt);
+  }
+
+  // Initialize the host array with a constant so elements written through
+  // the subview are easy to distinguish from the random pool background.
+  for (int64_t h = 0; h < shape[0]; h++) {
+    for (int64_t w = 0; w < shape[1]; w++) {
+      for (int64_t rgba = 0; rgba < shape[2]; rgba++) {
+        static_cast<float*>(cpu_arr0->data)[shape[1] * shape[2] * h + shape[2] * w + rgba] = 1.1f;
+      }
+    }
+  }
+
+  // Copy to texture pool for initialization
+  cpu_pool0.CopyTo(opencl_txpool);
+  // Copy host data to subview into texture storage
+  cpu_arr0.CopyTo(opencl_txarr0);
+  // Copy modified pool back
+  opencl_txpool.CopyTo(cpu_pool1);
+
+  // Check that modifications to pool follow two dimensional
+  // strides according to the written texture shape.
+  for (int64_t h = 0; h < shape_pool[0]; h++) {
+    for (int64_t w = 0; w < shape_pool[1]; w++) {
+      for (int64_t rgba = 0; rgba < shape_pool[2]; rgba++) {
+        size_t i = shape_pool[1] * shape_pool[2] * h + shape_pool[2] * w + rgba;
+        if (h < shape[0] && w < shape[1] && rgba < shape[2]) {
+          // Inside the written subview: expect the value copied from cpu_arr0.
+          size_t j = shape[1] * shape[2] * h + shape[2] * w + rgba;
+          ICHECK_LT(std::fabs(static_cast<float*>(cpu_pool1->data)[i] -
+                              static_cast<float*>(cpu_arr0->data)[j]),
+                    1e-5);
+        } else {
+          // Outside the subview: the original pool contents must be intact.
+          ICHECK_LT(std::fabs(static_cast<float*>(cpu_pool1->data)[i] -
+                              static_cast<float*>(cpu_pool0->data)[i]),
+                    1e-5);
+        }
+      }
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  // NOTE(review): assigned after InitGoogleTest, so this overrides any
+  // --gtest_death_test_style passed on the command line; "threadsafe"
+  // re-executes death tests from scratch, which is safer in threaded code.
+  testing::FLAGS_gtest_death_test_style = "threadsafe";
+  return RUN_ALL_TESTS();
+}

Reply via email to