This is an automated email from the ASF dual-hosted git repository.
ziheng pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 010d11b [Texture support][Part 0] Device API and runtime support
(#7711)
010d11b is described below
commit 010d11ba843ebf315bade46ffd0fa763655652b3
Author: Chris Sullivan <[email protected]>
AuthorDate: Fri Jun 4 22:06:29 2021 -0700
[Texture support][Part 0] Device API and runtime support (#7711)
* Add TVMBackendAllocTexture and support in OpenCL device API.
* Add runtime optimized caching allocator.
This should be replaced with AOT memory planning
when the relay/tir/compile engine refactor lands.
* Few bug fixes for runtime texture allocator.
* Add OpenCL device api support for image2d<float16> textures.
* Update OpenCL DeviceAPI to support Image2D data space
allocations and copying to/from host/image2d directly.
Allocation employs a lowering convention to 2d images
for activations and weights.
* Fix to follow OpenCL spec. for indexing.
* Rename texture_pool.h -> texture.h
* Move Nd to 2d lowering convention code into runtime texture
utilities that can be shared by codegen and the runtime.
* Update texture lowering utilities
* Add TODO comment about pitch support
* Remove FreeTexture
* Fix ICHECK comment
* Partial cherry pick from @ZihengJiang
[email protected]:ZihengJiang/tvm.git:52822c5bd
[RUNTIME] OpenCL texture memory.
* Remove runtime and device texture APIs.
* Add OpenCL packed functions for texture workspace (de)allocations.
* Add OpenCLBuffer structure to track
memory layout through OpenCL Device API.
* Rebase: TVMContext -> Device
* Implement DLTensor* overload of CopyDataToFrom in OpenCL DeviceAPI.
* Implement OpenCL CopyDataFromTo(DLTensor*...)
overload and tensor shapes to calculate image extent
when copying data directly to or from texture cache.
* Update format (cpp-lint)
* Update format (clang)
* Buffer descriptor name change and formatting.
* Add texture pool documentation.
* Update runtime to use new global.texture scope.
* Move texture_pool.cc into opencl impl.
* Add test coverage for copying in and out
of storage allocs of texture scope.
* Documented APIs and structures, renamed buffer descriptor layout tags.
Co-authored-by: ZihengJiang <[email protected]>
---
src/runtime/opencl/opencl_common.h | 64 +++++++++-
src/runtime/opencl/opencl_device_api.cc | 215 ++++++++++++++++++++++++++++----
src/runtime/opencl/opencl_module.cc | 3 +-
src/runtime/opencl/texture_pool.cc | 166 ++++++++++++++++++++++++
src/runtime/texture.h | 144 +++++++++++++++++++++
tests/cpp/texture_copy_test.cc | 142 +++++++++++++++++++++
6 files changed, 703 insertions(+), 31 deletions(-)
diff --git a/src/runtime/opencl/opencl_common.h
b/src/runtime/opencl/opencl_common.h
index ad2040a..c31576f 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -66,6 +66,7 @@
#include "../file_utils.h"
#include "../meta_data.h"
#include "../pack_args.h"
+#include "../texture.h"
#include "../thread_storage_scope.h"
#include "../workspace_pool.h"
@@ -174,6 +175,29 @@ inline const char* CLGetErrorString(cl_int error) {
}
}
+inline cl_channel_type DTypeToOpenCLChannelType(DLDataType data_type) {
+ DataType dtype(data_type);
+ if (dtype == DataType::Float(32)) {
+ return CL_FLOAT;
+ } else if (dtype == DataType::Float(16)) {
+ return CL_HALF_FLOAT;
+ } else if (dtype == DataType::Int(8)) {
+ return CL_SIGNED_INT8;
+ } else if (dtype == DataType::Int(16)) {
+ return CL_SIGNED_INT16;
+ } else if (dtype == DataType::Int(32)) {
+ return CL_SIGNED_INT32;
+ } else if (dtype == DataType::UInt(8)) {
+ return CL_UNSIGNED_INT8;
+ } else if (dtype == DataType::UInt(16)) {
+ return CL_UNSIGNED_INT16;
+ } else if (dtype == DataType::UInt(32)) {
+ return CL_UNSIGNED_INT32;
+ }
+ LOG(FATAL) << "data type is not supported in OpenCL runtime yet: " << dtype;
+ return CL_FLOAT;
+}
+
/*!
* \brief Protected OpenCL call
* \param func Expression to call.
@@ -243,11 +267,18 @@ class OpenCLWorkspace : public DeviceAPI {
void SetDevice(Device dev) final;
void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final;
void* AllocDataSpace(Device dev, size_t size, size_t alignment, DLDataType
type_hint) final;
+ void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType
dtype,
+ Optional<String> mem_scope = NullOpt) final;
void FreeDataSpace(Device dev, void* ptr) final;
void StreamSync(Device dev, TVMStreamHandle stream) final;
void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final;
void FreeWorkspace(Device dev, void* data) final;
+ // Texture (image2d_t) alloca APIs
+ cl_mem AllocTexture(Device dev, size_t width, size_t height, DLDataType
type_hint);
+ void* AllocTextureWorkspace(Device dev, size_t width, size_t height,
DLDataType type_hint);
+ void FreeTextureWorkspace(Device dev, void* data);
+
/*!
* \brief Get the thread local ThreadEntry
*/
@@ -256,10 +287,7 @@ class OpenCLWorkspace : public DeviceAPI {
// get the global workspace
static OpenCLWorkspace* Global();
- protected:
- void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t
to_offset, size_t size,
- Device dev_from, Device dev_to, DLDataType type_hint,
- TVMStreamHandle stream) final;
+ void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream)
final;
};
/*! \brief Thread local workspace */
@@ -278,9 +306,11 @@ class OpenCLThreadEntry {
std::vector<KTEntry> kernel_table;
/*! \brief workspace pool */
WorkspacePool pool;
+ /*! \brief texture pool */
+ TexturePool texture_pool;
// constructor
OpenCLThreadEntry(DLDeviceType device_type, DeviceAPI* device_api)
- : pool(device_type, device_api) {
+ : pool(device_type, device_api), texture_pool(device_type, device_api) {
device.device_id = 0;
device.device_type = device_type;
}
@@ -289,6 +319,29 @@ class OpenCLThreadEntry {
// get the global workspace
static OpenCLThreadEntry* ThreadLocal();
};
+
/*! \brief OpenCL runtime buffer structure with tracked memory layout */
struct BufferDescriptor {
  enum class MemoryLayout {
    /*! \brief One dimensional buffer in row-major layout*/
    kBuffer1D,
    /*! \brief Two dimensional texture w/ width = axis[-1]
     * e.g. image2d[height=NCH, width=W]
     */
    kImage2DActivation,
    /*! \brief Two dimensional texture w/ height = axis[0]
     * e.g. image2d[height=O, width=IHW]
     */
    kImage2DWeight,
  };
  BufferDescriptor() = default;
  // Derive the layout tag directly from a storage scope string.
  explicit BufferDescriptor(Optional<String> scope) : layout(MemoryLayoutFromScope(scope)) {}
  /*! \brief Map a storage scope string (e.g. "global.texture") to a layout tag. */
  static MemoryLayout MemoryLayoutFromScope(Optional<String> mem_scope);
  /*! \brief Inverse of MemoryLayoutFromScope: layout tag back to scope string. */
  static String ScopeFromMemoryLayout(MemoryLayout mem_scope);

  /*! \brief The underlying cl_mem object (buffer or image2d, per layout). */
  cl_mem buffer{nullptr};
  /*! \brief Memory layout tag used to select copy/kernel-arg handling. */
  MemoryLayout layout{MemoryLayout::kBuffer1D};
};
} // namespace cl
// Module to support thread-safe multi-device execution.
@@ -349,7 +402,6 @@ class OpenCLModuleNode : public ModuleNode {
// parsed kernel data
std::unordered_map<std::string, std::string> parsed_kernels_;
};
-
} // namespace runtime
} // namespace tvm
#endif // TVM_RUNTIME_OPENCL_OPENCL_COMMON_H_
diff --git a/src/runtime/opencl/opencl_device_api.cc
b/src/runtime/opencl/opencl_device_api.cc
index e9f092c..26eddb4 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -32,6 +32,63 @@ namespace cl {
std::string GetPlatformInfo(cl_platform_id pid, cl_platform_info param_name);
std::string GetDeviceInfo(cl_device_id pid, cl_device_info param_name);
/*!
 * \brief Region parameters for an OpenCL image transfer, matching the
 * origin/region/pitch arguments of clEnqueueReadImage / clEnqueueWriteImage.
 */
struct ImageInfo {
  size_t origin[3] = {};  // (x, y, z) offset of the copied region
  size_t region[3] = {};  // (width, height, depth) extent of the region
  size_t row_pitch = 0;   // bytes per row; 0 = tightly packed (per OpenCL spec)
  size_t slice_pitch = 0;  // bytes per 2d slice; 0 = tightly packed
};
+
+/*!
+ * \brief Utility to apply a memory layout specific lowering convention
+ * to infer the physical shape from the provided DLTensor's logical shape.
+ * \param desc Descriptor which contains the buffer and layout tag.
+ * \param The DLTensor used to infer the tensors physical shape.
+ */
+ImageInfo GetImageInfo(const cl::BufferDescriptor* desc, const DLTensor*
tensor) {
+ ImageInfo info{};
+ ICHECK(tensor->dtype.lanes == 1) << "Image dtype has lanes: " <<
tensor->dtype.lanes;
+
+ info.origin[0] = info.origin[1] = info.origin[2] = 0;
+ info.row_pitch = 0;
+ info.slice_pitch = 0;
+
+ size_t axis = DefaultTextureLayoutSeparator(
+ tensor->ndim, cl::BufferDescriptor::ScopeFromMemoryLayout(desc->layout));
+ auto texture_shape = ApplyTexture2DFlattening<int64_t>(tensor->shape,
tensor->ndim, axis);
+ info.region[0] = texture_shape.width;
+ info.region[1] = texture_shape.height;
+ info.region[2] = 1;
+ return info;
+}
+
+cl::BufferDescriptor::MemoryLayout cl::BufferDescriptor::MemoryLayoutFromScope(
+ Optional<String> mem_scope) {
+ if (!mem_scope.defined()) {
+ return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
+ } else if (mem_scope.value() == "global.texture") {
+ return cl::BufferDescriptor::MemoryLayout::kImage2DActivation;
+ } else if (mem_scope.value() == "global.texture-weight") {
+ return cl::BufferDescriptor::MemoryLayout::kImage2DWeight;
+ }
+ LOG(FATAL) << "No memory layout defined for memory of scope: " <<
mem_scope.value();
+ return cl::BufferDescriptor::MemoryLayout::kBuffer1D;
+}
+
+String
cl::BufferDescriptor::ScopeFromMemoryLayout(cl::BufferDescriptor::MemoryLayout
layout) {
+ switch (layout) {
+ case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
+ return "global";
+ case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
+ return "global.texture";
+ case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
+ return "global.texture-weight";
+ }
+ LOG(FATAL) << "No scope corresponding to the provided memory layout: "
+ << static_cast<int>(layout);
+ return "";
+}
+
OpenCLThreadEntry* OpenCLWorkspace::GetThreadEntry() { return
OpenCLThreadEntry::ThreadLocal(); }
OpenCLWorkspace* OpenCLWorkspace::Global() {
@@ -138,9 +195,30 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t
size, size_t alignment,
this->Init();
ICHECK(context != nullptr) << "No OpenCL device";
cl_int err_code;
- cl_mem mptr = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size,
nullptr, &err_code);
+ cl::BufferDescriptor* desc = new cl::BufferDescriptor;
+ desc->buffer = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size,
nullptr, &err_code);
+ desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D;
OPENCL_CHECK_ERROR(err_code);
- return mptr;
+ return desc;
+}
+
/*!
 * \brief Allocate an Nd data space; texture scopes are lowered to a 2d image.
 *
 * The default/"global" scope defers to the base DeviceAPI buffer path. For
 * "global.texture*" scopes the Nd shape is flattened to a 2d extent and backed
 * by an OpenCL image2d, wrapped in a BufferDescriptor carrying the layout tag.
 */
void* OpenCLWorkspace::AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype,
                                      Optional<String> mem_scope) {
  if (!mem_scope.defined() || mem_scope.value() == "global") {
    return DeviceAPI::AllocDataSpace(dev, ndim, shape, dtype, mem_scope);
  }
  ICHECK(IsTextureStorage(std::string(mem_scope.value())))
      << "Device does not support allocate data space with "
      << "specified memory scope: " << mem_scope.value();

  // Texture lowering needs at least [outer axes..., innermost channel] = rank 3.
  ICHECK(ndim > 2) << "Shape for texture allocation must be at least rank 3; "
                   << "provided shape is rank " << ndim;

  cl::BufferDescriptor* desc = new cl::BufferDescriptor(mem_scope);
  size_t axis = DefaultTextureLayoutSeparator(ndim, mem_scope.value());
  auto texture = ApplyTexture2DFlattening<int64_t>(shape, ndim, axis);
  desc->buffer = AllocTexture(dev, texture.width, texture.height, dtype);
  return desc;
}
void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
@@ -148,31 +226,87 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void*
ptr) {
// for some OpenCL platforms.
OPENCL_CALL(clFinish(this->GetQueue(dev)));
- cl_mem mptr = static_cast<cl_mem>(ptr);
- OPENCL_CALL(clReleaseMemObject(mptr));
+ cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
+ OPENCL_CALL(clReleaseMemObject(desc->buffer));
+ delete desc;
}
-void OpenCLWorkspace::CopyDataFromTo(const void* from, size_t from_offset,
void* to,
- size_t to_offset, size_t size, Device
dev_from, Device dev_to,
- DLDataType type_hint, TVMStreamHandle
stream) {
+cl_mem OpenCLWorkspace::AllocTexture(Device dev, size_t width, size_t height,
+ DLDataType type_hint) {
this->Init();
- ICHECK(stream == nullptr);
- if (IsOpenCLDevice(dev_from) && IsOpenCLDevice(dev_to)) {
- OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(dev_to),
- static_cast<cl_mem>((void*)from), //
NOLINT(*)
- static_cast<cl_mem>(to), from_offset,
to_offset, size, 0,
- nullptr, nullptr));
- } else if (IsOpenCLDevice(dev_from) && dev_to.device_type == kDLCPU) {
- OPENCL_CALL(clEnqueueReadBuffer(this->GetQueue(dev_from),
- static_cast<cl_mem>((void*)from), //
NOLINT(*)
- CL_FALSE, from_offset, size,
static_cast<char*>(to) + to_offset,
- 0, nullptr, nullptr));
- OPENCL_CALL(clFinish(this->GetQueue(dev_from)));
- } else if (dev_from.device_type == kDLCPU && IsOpenCLDevice(dev_to)) {
- OPENCL_CALL(clEnqueueWriteBuffer(this->GetQueue(dev_to),
static_cast<cl_mem>(to), CL_FALSE,
- to_offset, size, static_cast<const
char*>(from) + from_offset,
- 0, nullptr, nullptr));
- OPENCL_CALL(clFinish(this->GetQueue(dev_to)));
+ ICHECK(context != nullptr) << "No OpenCL device";
+ cl_int err_code;
+ cl_channel_type cl_type = DTypeToOpenCLChannelType(type_hint);
+ cl_image_format format = {CL_RGBA, cl_type};
+ cl_image_desc descriptor = {CL_MEM_OBJECT_IMAGE2D, width, height, 0, 0, 0,
0, 0, 0};
+ cl_mem mptr =
+ clCreateImage(this->context, CL_MEM_READ_WRITE, &format, &descriptor,
nullptr, &err_code);
+ OPENCL_CHECK_ERROR(err_code);
+ return mptr;
+}
+
/*! \brief Allocate a pooled 2d texture workspace via the thread-local texture pool. */
void* OpenCLWorkspace::AllocTextureWorkspace(Device dev, size_t width, size_t height,
                                             DLDataType type_hint) {
  return GetThreadEntry()->texture_pool.AllocTexture(dev, width, height, type_hint);
}
+
/*! \brief Return a pooled texture workspace to the thread-local texture pool. */
void OpenCLWorkspace::FreeTextureWorkspace(Device dev, void* ptr) {
  GetThreadEntry()->texture_pool.FreeTexture(dev, ptr);
}
+
/*!
 * \brief Copy between host memory and OpenCL buffer/image storage.
 *
 * Dispatch is driven by the BufferDescriptor layout tag stored in the
 * DLTensor's data pointer on the OpenCL side:
 * - device<->device: only kBuffer1D is supported (clEnqueueCopyBuffer).
 * - device->host / host->device: buffer layouts use Read/WriteBuffer;
 *   image2d layouts compute origin/region via GetImageInfo and use
 *   Read/WriteImage.
 * Transfers touching the host are enqueued non-blocking (CL_FALSE) and then
 * made synchronous with clFinish on the same queue.
 */
void OpenCLWorkspace::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) {
  size_t nbytes = GetDataSize(*from);
  ICHECK_EQ(nbytes, GetDataSize(*to));
  ICHECK(IsContiguous(*from) && IsContiguous(*to))
      << "CopyDataFromTo only support contiguous array for now";

  if (IsOpenCLDevice(from->device) && IsOpenCLDevice(to->device)) {
    const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
    ICHECK(from_desc->layout == cl::BufferDescriptor::MemoryLayout::kBuffer1D)
        << "Device to device copying is currently only implemented for OpenCL buffer storage";
    auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
    OPENCL_CALL(clEnqueueCopyBuffer(this->GetQueue(to->device), from_desc->buffer, to_desc->buffer,
                                    from->byte_offset, to->byte_offset, nbytes, 0, nullptr,
                                    nullptr));
  } else if (IsOpenCLDevice(from->device) && to->device.device_type == kDLCPU) {
    const auto* from_desc = static_cast<const cl::BufferDescriptor*>(from->data);
    switch (from_desc->layout) {
      case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
        OPENCL_CALL(clEnqueueReadBuffer(
            this->GetQueue(from->device), from_desc->buffer, CL_FALSE, from->byte_offset, nbytes,
            static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
        break;
      case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
      case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
        auto image_info = GetImageInfo(from_desc, from);
        // TODO(csullivan): Support calculating row_pitch correctly in the case of reuse.
        // Note that when utilizing texture pools for memory reuse, the allocated image
        // size can be larger than the size to be read.
        OPENCL_CALL(clEnqueueReadImage(
            this->GetQueue(from->device), from_desc->buffer, CL_FALSE, image_info.origin,
            image_info.region, image_info.row_pitch, image_info.slice_pitch,
            static_cast<char*>(to->data) + to->byte_offset, 0, nullptr, nullptr));
        break;
    }
    // Block until the non-blocking read has landed in host memory.
    OPENCL_CALL(clFinish(this->GetQueue(from->device)));
  } else if (from->device.device_type == kDLCPU && IsOpenCLDevice(to->device)) {
    auto* to_desc = static_cast<cl::BufferDescriptor*>(to->data);
    switch (to_desc->layout) {
      case cl::BufferDescriptor::MemoryLayout::kBuffer1D:
        OPENCL_CALL(clEnqueueWriteBuffer(
            this->GetQueue(to->device), to_desc->buffer, CL_FALSE, to->byte_offset, nbytes,
            static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
        break;
      case cl::BufferDescriptor::MemoryLayout::kImage2DActivation:
      case cl::BufferDescriptor::MemoryLayout::kImage2DWeight:
        auto image_info = GetImageInfo(to_desc, to);
        OPENCL_CALL(clEnqueueWriteImage(
            this->GetQueue(to->device), to_desc->buffer, CL_FALSE, image_info.origin,
            image_info.region, image_info.row_pitch, image_info.slice_pitch,
            static_cast<const char*>(from->data) + from->byte_offset, 0, nullptr, nullptr));
        break;
    }
    // Ensure the host source buffer can be reused once we return.
    OPENCL_CALL(clFinish(this->GetQueue(to->device)));
  } else {
    LOG(FATAL) << "Expect copy from/to OpenCL or between OpenCL";
  }
@@ -291,6 +425,39 @@ void OpenCLWorkspace::Init(const std::string& type_key,
const std::string& devic
initialized_ = true;
}
// Packed function allocating a 2d texture workspace from the thread-local
// texture pool. Args: device_type, device_id, width, height, dtype code, dtype bits.
TVM_REGISTER_GLOBAL("device_api.opencl.AllocTexture").set_body([](TVMArgs args, TVMRetValue* rv) {
  int device_type = args[0];
  int device_id = args[1];
  int width = args[2];
  int height = args[3];
  int dtype_code_hint = args[4];
  int dtype_bits_hint = args[5];
  Device dev;
  dev.device_type = static_cast<DLDeviceType>(device_type);
  dev.device_id = device_id;

  // Only scalar element types are allocated; lanes fixed at 1.
  DLDataType type_hint;
  type_hint.code = static_cast<decltype(type_hint.code)>(dtype_code_hint);
  type_hint.bits = static_cast<decltype(type_hint.bits)>(dtype_bits_hint);
  type_hint.lanes = 1;

  OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
  *rv = ptr->AllocTextureWorkspace(dev, static_cast<size_t>(width), static_cast<size_t>(height),
                                   type_hint);
});
+
// Packed function releasing a texture workspace back to the thread-local pool.
// Args: device_type, device_id, data pointer. Returns 0 on success.
TVM_REGISTER_GLOBAL("device_api.opencl.FreeTexture").set_body([](TVMArgs args, TVMRetValue* rv) {
  int device_type = args[0];
  int device_id = args[1];
  void* data = args[2];
  OpenCLWorkspace* ptr = OpenCLWorkspace::Global();
  Device dev;
  dev.device_type = static_cast<DLDeviceType>(device_type);
  dev.device_id = device_id;
  ptr->FreeTextureWorkspace(dev, data);
  *rv = static_cast<int32_t>(0);
});
+
TVM_REGISTER_GLOBAL("device_api.opencl").set_body([](TVMArgs args,
TVMRetValue* rv) {
DeviceAPI* ptr = OpenCLWorkspace::Global();
*rv = static_cast<void*>(ptr);
diff --git a/src/runtime/opencl/opencl_module.cc
b/src/runtime/opencl/opencl_module.cc
index 40aa666..397f57b 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -64,7 +64,8 @@ class OpenCLWrappedFunc {
}
// setup arguments.
for (cl_uint i = 0; i < arg_size_.size(); ++i) {
- OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], void_args[i]));
+ auto* arg = static_cast<cl::BufferDescriptor*>(void_args[i]);
+ OPENCL_CALL(clSetKernelArg(kernel, i, arg_size_[i], arg->buffer));
}
cl_command_queue queue = w_->GetQueue(t->device);
ThreadWorkLoad wl = thread_axis_cfg_.Extract(args);
diff --git a/src/runtime/opencl/texture_pool.cc
b/src/runtime/opencl/texture_pool.cc
new file mode 100644
index 0000000..bf52894
--- /dev/null
+++ b/src/runtime/opencl/texture_pool.cc
@@ -0,0 +1,166 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file texture_pool.cc
+ * \brief Texture pool utility.
+ */
+#include <limits>
+#include <memory>
+
+#include "../texture.h"
+
+namespace tvm {
+namespace runtime {
+
+class TexturePool::Pool {
+ public:
+ Pool() = default;
+ void* Alloc(Device dev, DeviceAPI* device, size_t width, size_t height,
DLDataType type_hint) {
+ Entry e;
+ e.data = nullptr;
+ if (free_list_.size() != 0) {
+ int64_t req_size = height * width;
+ Entry new_mem;
+ int64_t min_added_size = std::numeric_limits<int64_t>::max();
+ int64_t min_wasted_size = std::numeric_limits<int64_t>::max();
+ std::vector<Entry>::iterator best_mem;
+ for (auto it = free_list_.begin(); it != free_list_.end(); ++it) {
+ if (it->type.code != type_hint.code) {
+ continue;
+ }
+ int64_t old_size = it->x * it->y;
+ new_mem.x = std::max(it->x, width);
+ new_mem.y = std::max(it->y, height);
+ int64_t new_size = new_mem.x * new_mem.y;
+ int64_t added_size = new_size - old_size;
+ int64_t wasted_size = new_size - req_size;
+ // Minimize added size first and wasted size thereafter
+ if ((min_added_size > 0 && added_size < min_added_size) ||
+ (min_added_size == 0 && wasted_size < min_wasted_size)) {
+ min_added_size = added_size;
+ min_wasted_size = wasted_size;
+ best_mem = it;
+ }
+ }
+
+ if (min_added_size == 0) {
+ // use existing block
+ e = *best_mem;
+ free_list_.erase(best_mem);
+ } else if (min_added_size <= req_size) {
+ // if added size is less or equal to
+ // what is needed by alloc, then grow entry
+ device->FreeDataSpace(dev, best_mem->data);
+ free_list_.erase(best_mem);
+ new_mem.type = type_hint;
+ std::vector<int64_t> shape{int64_t(new_mem.y), int64_t(new_mem.x), 4};
+ new_mem.data = device->AllocDataSpace(dev, shape.size(), shape.data(),
new_mem.type,
+
Optional<String>("global.texture"));
+ e = new_mem;
+ }
+ }
+
+ if (e.data == nullptr) {
+ // create new block
+ std::vector<int64_t> shape{int64_t(height), int64_t(width), 4};
+ e.data = device->AllocDataSpace(dev, shape.size(), shape.data(),
type_hint,
+ Optional<String>("global.texture"));
+ e.x = width;
+ e.y = height;
+ e.type = type_hint;
+ }
+
+ allocated_.push_back(e);
+ return e.data;
+ }
+
+ void Free(void* data) {
+ Entry e;
+ if (allocated_.back().data == data) {
+ // quick path, last allocated.
+ e = allocated_.back();
+ allocated_.pop_back();
+ } else {
+ int index = static_cast<int>(allocated_.size()) - 2;
+ for (; index >= 0 && allocated_[index].data != data; --index) {
+ }
+ ICHECK_GE(index, 0) << "Attempt to free texture that has not been
allocated";
+ e = allocated_[index];
+ allocated_.erase(allocated_.begin() + index);
+ }
+ free_list_.push_back(e);
+ }
+
+ // Release all resources immediately
+ void Release(Device dev, DeviceAPI* device) {
+ for (auto& e : allocated_) {
+ device->FreeDataSpace(dev, e.data);
+ }
+ for (auto& e : free_list_) {
+ device->FreeDataSpace(dev, e.data);
+ }
+ allocated_.clear();
+ free_list_.clear();
+ }
+
+ private:
+ struct Entry {
+ void* data;
+ size_t x;
+ size_t y;
+ DLDataType type;
+ };
+ std::vector<Entry> free_list_;
+ std::vector<Entry> allocated_;
+};
+
// Bind the pool to one device type; per-device-id Pools are created lazily.
TexturePool::TexturePool(DLDeviceType device_type, DeviceAPI* device)
    : device_type_(device_type), device_(device) {}
+
TexturePool::~TexturePool() {
  // Release and delete every per-device Pool that was lazily created.
  for (size_t i = 0; i < array_.size(); ++i) {
    if (array_[i] != nullptr) {
      Device dev;
      dev.device_type = device_type_;
      dev.device_id = static_cast<int>(i);  // array_ is indexed by device id
      array_[i]->Release(dev, device_);
      delete array_[i];
    }
  }
}
+
+void* TexturePool::AllocTexture(Device dev, size_t width, size_t height,
DLDataType type_hint) {
+ if (static_cast<size_t>(dev.device_id) >= array_.size()) {
+ array_.resize(dev.device_id + 1, nullptr);
+ }
+ if (array_[dev.device_id] == nullptr) {
+ array_[dev.device_id] = new Pool();
+ }
+ return array_[dev.device_id]->Alloc(dev, device_, width, height, type_hint);
+}
+
/*! \brief Return an allocation to its per-device pool's free list. */
void TexturePool::FreeTexture(Device dev, void* ptr) {
  ICHECK(static_cast<size_t>(dev.device_id) < array_.size() && array_[dev.device_id] != nullptr)
      << "Attempt to free texture from null texture pool";
  array_[dev.device_id]->Free(ptr);
}
+
+} // namespace runtime
+} // namespace tvm
diff --git a/src/runtime/texture.h b/src/runtime/texture.h
new file mode 100644
index 0000000..83725c0
--- /dev/null
+++ b/src/runtime/texture.h
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file texture.h
+ * \brief Texture utilities
+ */
+#ifndef TVM_RUNTIME_TEXTURE_H_
+#define TVM_RUNTIME_TEXTURE_H_
+
+#include <tvm/runtime/device_api.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace tvm {
+namespace runtime {
+
/*! \brief Structure to represent flattened texture shape */
template <typename T>
struct Texture2DShape {
  T width;    // product of the logical axes after the separator axis
  T height;   // product of the logical axes before the separator axis
  T channel;  // innermost logical axis, mapped to the texel channel
};
+
+/*!
+ * \param shape_rank Rank N of the Nd-shape
+ * \param convention Storage scope convention to use for flattening
+ * \return The axis separator that defines the Nd shape partitioning in 2d
+ */
+inline size_t DefaultTextureLayoutSeparator(size_t shape_rank,
+ std::string convention =
"global.texture") {
+ // Texture activation:
+ // e.g. [N,C,H,W,c] -> Texture2d[N*C*H, W, c]
+ // Texture weight:
+ // e.g. [O,I,H,W,c] -> Texture2d[O, I*H*W, c]
+ size_t separator = 0;
+ if (convention == "global.texture") {
+ separator = shape_rank - 2;
+ } else if (convention == "global.texture-weight") {
+ separator = 1;
+ } else {
+ LOG(FATAL) << "Encountered unknown texture lowering convention: " <<
convention;
+ }
+ return separator;
+}
+
+/*!
+ * \param shape Nd shape
+ * \param rank Number of dimensions N of the Nd shape
+ * \param axis The axis separator that splits the Nd axes into two sets
+ * \return Width and height of the 2d shape
+ */
+template <typename T, typename S>
+Texture2DShape<T> ApplyTexture2DFlattening(const S& shape, size_t rank, size_t
axis) {
+ ICHECK(axis < rank)
+ << "Number of axes to flatten into rows must be less than shape rank for
2d flattening";
+ Texture2DShape<T> texture{1, 1, shape[rank - 1]};
+ for (size_t i = 0; i < rank - 1; i++) {
+ if (i < axis) {
+ texture.height *= shape[i];
+ } else {
+ texture.width *= shape[i];
+ }
+ }
+ return texture;
+}
+
/*! \brief A scope is texture-backed when the token "texture" appears anywhere in it. */
inline bool IsTextureStorage(std::string scope) {
  const bool has_texture_token = scope.find("texture") != std::string::npos;
  return has_texture_token;
}
+
/*!
 * \brief A two dimensional storage pool that recycles temporal workspace
 * allocations for dynamically allocated texture. See AllocTexture docstring
 * for approach to allocation and reuse.
 */
class TVM_DLL TexturePool {
 public:
  /*!
   * \brief Create pool with specific device type and device.
   * \param device_type The device type.
   * \param device_api The device API.
   */
  TexturePool(DLDeviceType device_type, DeviceAPI* device_api);
  /*! \brief destructor; releases all pooled allocations */
  ~TexturePool();

  /*!
   * \brief Allocate a two dimensional temporal texture workspace on device
   *
   * \note Two dimensional texture workspaces will be grown and reused
   * according to the following strategy:
   * - Choose the workspace which minimizes the amount of memory required to
   *   grow the workspace to fit the request.
   * - If a set of workspaces exist that fit the current request without
   *   expansion, choose the workspace of that set which most closely
   *   matches the request size, minimizing wasted space.
   *
   * \param dev The context of allocation.
   * \param width The width of the 2d texture to be allocated.
   * \param height The height of the 2d texture to be allocated.
   * \param type_hint The type of elements.
   * \return Opaque handle identifying the allocation; pass to FreeTexture.
   */
  void* AllocTexture(Device dev, size_t width, size_t height, DLDataType type_hint);
  /*!
   * \brief Free temporal texture in backend execution.
   *
   * \param dev The context of allocation.
   * \param ptr The pointer to be freed.
   */
  void FreeTexture(Device dev, void* ptr);

 private:
  class Pool;
  /*! \brief pool of device local array, indexed by device id */
  std::vector<Pool*> array_;
  /*! \brief device type this pool support */
  DLDeviceType device_type_;
  /*! \brief The device API */
  DeviceAPI* device_;
};
+
+} // namespace runtime
+} // namespace tvm
+#endif // TVM_RUNTIME_TEXTURE_H_
diff --git a/tests/cpp/texture_copy_test.cc b/tests/cpp/texture_copy_test.cc
new file mode 100644
index 0000000..688bcab
--- /dev/null
+++ b/tests/cpp/texture_copy_test.cc
@@ -0,0 +1,142 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <dmlc/logging.h>
+#include <gtest/gtest.h>
+#include <tvm/runtime/registry.h>
+
+#include <cmath>
+#include <random>
+
// Roundtrip host -> texture-scoped OpenCL storage -> host; values must survive.
TEST(TextureCopy, HostDeviceRT) {
  using namespace tvm;
  // Texture support requires the OpenCL runtime; skip otherwise.
  bool enabled = tvm::runtime::RuntimeEnabled("opencl");
  if (!enabled) {
    LOG(INFO) << "Skip texture copy test because opencl runtime is disabled.\n";
    return;
  }

  std::vector<int64_t> shape{16, 16, 4};
  auto cpu_arr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0});
  auto cpu_arr1 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU, 0});
  String mem_scope = "global.texture";
  auto opencl_txarr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLOpenCL, 0}, mem_scope);

  // Total number of float elements in the array.
  size_t size = 1;
  for (size_t i = 0; i < shape.size(); ++i) {
    size *= static_cast<size_t>(shape[i]);
  }

  std::random_device dev;
  std::mt19937 mt(dev());
  std::uniform_real_distribution<> random(-10.0, 10.0);

  // Random initialize host ndarray
  for (size_t i = 0; i < size; i++) {
    static_cast<float*>(cpu_arr0->data)[i] = random(mt);
  }

  // Do a roundtrip from host storage to opencl texture storage and back
  cpu_arr0.CopyTo(opencl_txarr0);
  opencl_txarr0.CopyTo(cpu_arr1);
  for (size_t i = 0; i < size; ++i) {
    ICHECK_LT(
        std::fabs(static_cast<float*>(cpu_arr1->data)[i] - static_cast<float*>(cpu_arr0->data)[i]),
        1e-5);
  }
}
+
+TEST(TextureCopy, OverwritePoolSubview) {
+ using namespace tvm;
+ bool enabled = tvm::runtime::RuntimeEnabled("opencl");
+ if (!enabled) {
+ LOG(INFO) << "Skip texture copy test because opencl runtime is
disabled.\n";
+ return;
+ }
+
+ std::vector<int64_t> shape{16, 16, 4};
+ std::vector<int64_t> shape_pool{32, 32, 4};
+ auto cpu_arr0 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU,
0});
+ auto cpu_arr1 = runtime::NDArray::Empty(shape, {kDLFloat, 32, 1}, {kDLCPU,
0});
+ auto cpu_pool0 = runtime::NDArray::Empty(shape_pool, {kDLFloat, 32, 1},
{kDLCPU, 0});
+ auto cpu_pool1 = runtime::NDArray::Empty(shape_pool, {kDLFloat, 32, 1},
{kDLCPU, 0});
+
+ String mem_scope = "global.texture";
+ auto opencl_txpool =
+ runtime::NDArray::Empty(shape_pool, {kDLFloat, 32, 1}, {kDLOpenCL, 0},
mem_scope);
+ auto opencl_txarr0 = opencl_txpool.CreateView(shape, {kDLFloat, 32, 1});
+
+ std::random_device dev;
+ std::mt19937 mt(dev());
+ std::uniform_real_distribution<> random(-10.0, 10.0);
+
+ size_t size = 1;
+ size_t size_pool = 1;
+ for (size_t i = 0; i < shape_pool.size(); ++i) {
+ size *= static_cast<size_t>(shape[i]);
+ size_pool *= static_cast<size_t>(shape_pool[i]);
+ }
+
+ // Random initialize host pool storage
+ for (size_t i = 0; i < size_pool; i++) {
+ static_cast<float*>(cpu_pool0->data)[i] = random(mt);
+ }
+
+ // Random initialize host array
+ for (int64_t h = 0; h < shape[0]; h++) {
+ for (int64_t w = 0; w < shape[1]; w++) {
+ for (int64_t rgba = 0; rgba < shape[2]; rgba++) {
+ static_cast<float*>(cpu_arr0->data)[shape[1] * shape[2] * h + shape[2]
* w + rgba] = 1.1f;
+ }
+ }
+ }
+
+ // Copy to texture pool for initialization
+ cpu_pool0.CopyTo(opencl_txpool);
+ // Copy host data to subview into texture storage
+ cpu_arr0.CopyTo(opencl_txarr0);
+ // Copy modified pool back
+ opencl_txpool.CopyTo(cpu_pool1);
+
+ // Check that modifications to pool follow two dimensional
+ // strides according to the written texture shape.
+ for (int64_t h = 0; h < shape_pool[0]; h++) {
+ for (int64_t w = 0; w < shape_pool[1]; w++) {
+ for (int64_t rgba = 0; rgba < shape_pool[2]; rgba++) {
+ size_t i = shape_pool[1] * shape_pool[2] * h + shape_pool[2] * w +
rgba;
+ if (h < shape[0] && w < shape[1] && rgba < shape[2]) {
+ size_t j = shape[1] * shape[2] * h + shape[2] * w + rgba;
+ ICHECK_LT(std::fabs(static_cast<float*>(cpu_pool1->data)[i] -
+ static_cast<float*>(cpu_arr0->data)[j]),
+ 1e-5);
+ } else {
+ ICHECK_LT(std::fabs(static_cast<float*>(cpu_pool1->data)[i] -
+ static_cast<float*>(cpu_pool0->data)[i]),
+ 1e-5);
+ }
+ }
+ }
+ }
+}
+
// Test entry point: run all registered gtest cases.
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  // NOTE(review): threadsafe death-test style presumably chosen because the
  // runtime may spawn threads before a death test forks — confirm.
  testing::FLAGS_gtest_death_test_style = "threadsafe";
  return RUN_ALL_TESTS();
}