This is an automated email from the ASF dual-hosted git repository.
srk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 099ed94951 [OpenCL] Implement save/load pre-compiled programs (#13868)
099ed94951 is described below
commit 099ed949519f3b6ae182c31ce69496f18a1f60ad
Author: Egor Churaev <[email protected]>
AuthorDate: Fri Feb 3 05:28:35 2023 +0300
[OpenCL] Implement save/load pre-compiled programs (#13868)
* [OpenCL] Implement save/load pre-compiled programs
Using pre-compiled programs might significantly improve inference time
of the first run.
- Added methods `SupportPreCompiledPrograms` which reports if the module
supports using pre-compiled programs.
- Method `GetPreCompiledPrograms` returns string with bytes of
pre-compiled programs.
- Method `SetPreCompiledPrograms` allows user to pass pre-compiled
programs to the module.
* Fix lint
* Apply comment: PackedFunc is used
* Fix build
* Fix CI and rename functions
* Apply comments
---
apps/cpp_rtvm/README.md | 14 ++
apps/cpp_rtvm/main.cc | 9 +
apps/cpp_rtvm/tvm_runner.cc | 29 ++-
apps/cpp_rtvm/tvm_runner.h | 4 +
src/runtime/opencl/opencl_common.h | 2 +
src/runtime/opencl/opencl_device_api.cc | 4 +-
src/runtime/opencl/opencl_module.cc | 77 ++++++++
.../opencl/opencl_wrapper/opencl_wrapper.cc | 12 ++
tests/cpp-runtime/opencl/opencl_compile_to_bin.cc | 208 +++++++++++++++++++++
9 files changed, 356 insertions(+), 3 deletions(-)
diff --git a/apps/cpp_rtvm/README.md b/apps/cpp_rtvm/README.md
index e696153282..c60a7b0e12 100644
--- a/apps/cpp_rtvm/README.md
+++ b/apps/cpp_rtvm/README.md
@@ -352,3 +352,17 @@ python3 -m tvm.driver.tvmc compile --cross-compiler
${ANDROID_NDK_HOME}/toolchai
python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar --rpc-key
${TVM_RPC_KEY} --rpc-tracker {TVM_TRACKER_HOST}:{TVM_TRACKER_PORT} --print-time
```
+
+# Use pre-compiled OpenCL kernels
+Using pre-compiled programs might significantly improve inference time of the
+first run. E.g. for topology with ~300 kernels compilation time on Adreno was
+about 26 seconds. But after dumping compiled programs to binary files and reuse
+them on the next runs, the compilation time was significantly decreased (more
+than 1000 times) and starts to be around 25 ms.
+
+To use such functionality, the developer have to pass parameter
`--pre-compiled`
+to the `rtvm` and specify the file name where pre-compiled programs will be
+stored. If the pre-compiled file name was passed to the `rtvm` then After
method
+`Load`, method `UsePreCompiledProgram` is called. This method loads
pre-compiled
+programs if the file exists. In opposite case the file will be created and
+pre-compiled programs will be saved to this file.
diff --git a/apps/cpp_rtvm/main.cc b/apps/cpp_rtvm/main.cc
index 31019ee0c9..c38a5f62bd 100644
--- a/apps/cpp_rtvm/main.cc
+++ b/apps/cpp_rtvm/main.cc
@@ -54,6 +54,7 @@ static const string kUsage =
"--input - Numpy file for the model input (optional and we use
random of not given)\n"
"--output - Numpy file name to dump the model output as numpy\n"
"--dump-meta - Dump model meta information\n"
+ "--pre-compiled - The file name of a file where pre-compiled programs
should be stored"
"\n"
" Example\n"
" ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
@@ -66,12 +67,14 @@ static const string kUsage =
* \arg device The target device to use {llvm, cl, ...etc.}
* \arg input Numpy file for the model input
* \arg output Numpy file name to dump the model output as numpy
+ * \arg pre_compiled File name where pre-compiled programs should be stored
*/
struct ToolArgs {
string model;
string device;
string input;
string output;
+ string pre_compiled;
bool dump_meta = false;
};
@@ -84,6 +87,7 @@ void PrintArgs(const ToolArgs& args) {
LOG(INFO) << "Device = " << args.device;
LOG(INFO) << "Input = " << args.input;
LOG(INFO) << "Output = " << args.output;
+ LOG(INFO) << "Pre-compiled = " << args.pre_compiled;
LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
}
@@ -172,6 +176,8 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs&
args) {
if (!pmeta.empty()) {
args.dump_meta = true;
}
+
+ args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled=");
}
/*!
@@ -190,6 +196,9 @@ int ExecuteModel(ToolArgs& args) {
// Load the model
runner.Load();
+ if (!args.pre_compiled.empty()) {
+ runner.UsePreCompiledPrograms(args.pre_compiled);
+ }
// Query Model meta Information
TVMMetaInfo mInfo = runner.GetMetaInfo();
diff --git a/apps/cpp_rtvm/tvm_runner.cc b/apps/cpp_rtvm/tvm_runner.cc
index 74498e8170..2fd4f2281e 100644
--- a/apps/cpp_rtvm/tvm_runner.cc
+++ b/apps/cpp_rtvm/tvm_runner.cc
@@ -27,6 +27,7 @@
#include <cnpy.h>
#include <fstream>
+#include <iterator>
#include <streambuf>
#include <string>
@@ -67,7 +68,8 @@ int GetTVMDevice(std::string device) {
* \param path where the tfm compiler artifacts present.
* \param device the target device where we need to load the compiled model.
*/
-TVMRunner::TVMRunner(std::string path, std::string device) :
r_model_path(path), r_device(device) {
+TVMRunner::TVMRunner(std::string path, std::string device)
+ : r_model_path(path), r_device(device), r_run_was_called(false) {
LOG(INFO) << "TVMRunner Constructor:" << r_model_path << " Devices:" <<
r_device;
}
@@ -110,6 +112,30 @@ int TVMRunner::Load(void) {
return 0;
}
+/*!
+ * \brief Specify if the run programs should be dumped to binary and reused in
the next runs.
+ * \param file_name File name where pre-compiled programs should be stored.
+ */
+void TVMRunner::UsePreCompiledPrograms(std::string file_name) {
+ if (r_run_was_called) {
+ LOG(INFO) << "TVMRunner UsePreCompiledPrograms: should be called before
first run";
+ return;
+ }
+ auto f_get = r_mod_handle->GetFunction("opencl.GetPreCompiledPrograms",
true);
+ auto f_set = r_mod_handle->GetFunction("opencl.SetPreCompiledPrograms",
true);
+ if (f_get != nullptr && f_set != nullptr) {
+ std::ifstream ifs(file_name, std::ios::in | std::ios::binary);
+ if (ifs.fail()) {
+ auto bytes = String(f_get());
+ std::ofstream fs(file_name, std::ofstream::binary);
+ fs.write(bytes.c_str(), bytes.size());
+ } else {
+ std::string bytes((std::istreambuf_iterator<char>(ifs)),
std::istreambuf_iterator<char>());
+ f_set(String(bytes));
+ }
+ }
+}
+
/*!
* \brief Calculated the memory size for the NDArray.
* \param NDArray object.
@@ -242,6 +268,7 @@ int TVMRunner::GetOutput(std::string output_id, char*
raw_output) {
*/
int TVMRunner::Run(void) {
LOG(INFO) << "TVMRunner::Run";
+ r_run_was_called = true;
r_graph_handle.GetFunction("run")();
return 0;
diff --git a/apps/cpp_rtvm/tvm_runner.h b/apps/cpp_rtvm/tvm_runner.h
index 37ba53606e..926e009c4c 100644
--- a/apps/cpp_rtvm/tvm_runner.h
+++ b/apps/cpp_rtvm/tvm_runner.h
@@ -56,6 +56,8 @@ class TVMRunner {
/*! \brief Initiates graph runtime and with the compiled model */
int Load(void);
+ /*! \brief Specify if the run programs should be dumped to binary and reused
in the next runs */
+ void UsePreCompiledPrograms(std::string);
/*! \brief Executes one inference cycle */
int Run(void);
/*! \brief To set the inputs from given npz file */
@@ -86,6 +88,8 @@ class TVMRunner {
std::string r_device;
/*! \brief Holds meta information queried from graph runtime */
TVMMetaInfo mInfo;
+ /*! \brief Mark if the run method was called */
+ bool r_run_was_called;
};
} // namespace runtime
diff --git a/src/runtime/opencl/opencl_common.h
b/src/runtime/opencl/opencl_common.h
index c172a0f945..a295ea396c 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -438,6 +438,8 @@ class OpenCLModuleNode : public ModuleNode {
// install a new kernel to thread local entry
cl_kernel InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* t,
const std::string& func_name, const KTRefEntry& e);
+ void SetPreCompiledPrograms(const std::string& bytes);
+ std::string GetPreCompiledPrograms();
private:
// The workspace, need to keep reference to use it in destructor.
diff --git a/src/runtime/opencl/opencl_device_api.cc
b/src/runtime/opencl/opencl_device_api.cc
index aa31d80d6e..c53523267d 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -202,7 +202,7 @@ void*
OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device
cl_int err_code;
desc->host_ptr = reinterpret_cast<cl_uchar*>(
clEnqueueMapBuffer(this->GetQueue(dev), desc->buffer, CL_TRUE,
CL_MAP_WRITE, 0,
- sizeof(cl_uchar) * size, 0, NULL, NULL, &err_code));
+ sizeof(cl_uchar) * size, 0, nullptr, nullptr,
&err_code));
OPENCL_CHECK_ERROR(err_code);
#endif // OPENCL_ENABLE_HOST_PTR
return desc;
@@ -256,7 +256,7 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
if (desc->host_ptr) {
clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer,
- reinterpret_cast<void*>(desc->host_ptr), 0, NULL,
NULL);
+ reinterpret_cast<void*>(desc->host_ptr), 0,
nullptr, nullptr);
}
OPENCL_CALL(clReleaseMemObject(desc->buffer));
delete desc;
diff --git a/src/runtime/opencl/opencl_module.cc
b/src/runtime/opencl/opencl_module.cc
index 2fb157aac6..ad41a34dde 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -137,6 +137,15 @@ cl::OpenCLWorkspace*
OpenCLModuleNode::GetGlobalWorkspace() {
PackedFunc OpenCLModuleNode::GetFunction(const std::string& name,
const ObjectPtr<Object>&
sptr_to_self) {
ICHECK_EQ(sptr_to_self.get(), this);
+ if (name == "opencl.GetPreCompiledPrograms") {
+ return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+ *rv = this->GetPreCompiledPrograms();
+ });
+ } else if (name == "opencl.SetPreCompiledPrograms") {
+ return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+ this->SetPreCompiledPrograms(args[0]);
+ });
+ }
ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have
main";
auto it = fmap_.find(name);
if (it == fmap_.end()) return PackedFunc();
@@ -262,6 +271,74 @@ cl_kernel
OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre
return kernel;
}
+void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) {
+ std::string data = bytes;
+ dmlc::MemoryStringStream reader(&data);
+ dmlc::Stream* strm = &reader;
+ uint64_t kernels_num;
+ strm->Read(&kernels_num);
+ cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry();
+ int device_id = t->device.device_id;
+ for (size_t i = 0; i < kernels_num; ++i) {
+ std::string name;
+ std::vector<unsigned char> bin_vector;
+ strm->Read(&name);
+ strm->Read(&bin_vector);
+ if (programs_[name][device_id] == nullptr) {
+ cl_int err = 0;
+ cl_int binaryStatus;
+ size_t binarySize = bin_vector.size();
+ const unsigned char* programBinary = bin_vector.data();
+
+ cl_device_id dev = workspace_->devices[device_id];
+ programs_[name][device_id] = clCreateProgramWithBinary(
+ workspace_->context, 1, &dev, &binarySize, &programBinary,
&binaryStatus, &err);
+ OPENCL_CHECK_ERROR(err);
+ OPENCL_CHECK_ERROR(binaryStatus);
+
+ err = clBuildProgram(programs_[name][device_id], 0, nullptr, nullptr,
nullptr, nullptr);
+ if (err != CL_SUCCESS) {
+ size_t len;
+ std::string log;
+ clGetProgramBuildInfo(programs_[name][device_id], dev,
CL_PROGRAM_BUILD_LOG, 0, nullptr,
+ &len);
+ log.resize(len);
+ clGetProgramBuildInfo(programs_[name][device_id], dev,
CL_PROGRAM_BUILD_LOG, len, &log[0],
+ nullptr);
+ LOG(FATAL) << "OpenCL build error for device=" << dev << "\n" << log;
+ }
+ }
+ }
+}
+
+std::string OpenCLModuleNode::GetPreCompiledPrograms() {
+ std::string data;
+ dmlc::MemoryStringStream writer(&data);
+ dmlc::Stream* strm = &writer;
+ strm->Write(static_cast<uint64_t>(parsed_kernels_.size()));
+ for (auto& it : parsed_kernels_) {
+ std::string name = it.first;
+ cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry();
+ int device_id = t->device.device_id;
+ t->kernel_table.resize(workspace_->num_registered_kernels);
+ if (programs_[std::string(name)][device_id] == nullptr) {
+ InstallKernel(workspace_, t, name, kid_map_[name]);
+ }
+ size_t size;
+ clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARY_SIZES,
sizeof(size_t), &size,
+ nullptr);
+ ICHECK(size > 0) << "Size of binary is 0";
+ std::vector<unsigned char> bin_vector(size);
+ unsigned char* binary = bin_vector.data();
+ clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARIES,
sizeof(unsigned char*),
+ &binary, nullptr);
+
+ strm->Write(name);
+ strm->Write(bin_vector);
+ }
+ return data;
+}
+
Module OpenCLModuleCreate(std::string data, std::string fmt,
std::unordered_map<std::string, FunctionInfo> fmap,
std::string source) {
auto n = make_object<OpenCLModuleNode>(data, fmt, fmap, source);
diff --git a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
index 2c27689454..13b7d94706 100644
--- a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
+++ b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
@@ -137,6 +137,7 @@ using f_clCreateProgramWithBinary = cl_program
(*)(cl_context, cl_uint, const cl
using f_clReleaseProgram = cl_int (*)(cl_program);
using f_clBuildProgram = cl_int (*)(cl_program, cl_uint, const cl_device_id*,
const char*,
void (*pfn_notify)(cl_program program,
void* user_data), void*);
+using f_clGetProgramInfo = cl_int (*)(cl_program, cl_program_info, size_t,
void*, size_t*);
using f_clGetProgramBuildInfo = cl_int (*)(cl_program, cl_device_id,
cl_program_build_info, size_t,
void*, size_t*);
using f_clCreateKernel = cl_kernel (*)(cl_program, const char*, cl_int*);
@@ -347,6 +348,17 @@ cl_int clBuildProgram(cl_program program, cl_uint
num_devices, const cl_device_i
}
}
+cl_int clGetProgramInfo(cl_program program, cl_program_info param_name, size_t
param_value_size,
+ void* param_value, size_t* param_value_size_ret) {
+ auto& lib = LibOpenCLWrapper::getInstance();
+ auto func = (f_clGetProgramInfo)lib.getOpenCLFunction("clGetProgramInfo");
+ if (func) {
+ return func(program, param_name, param_value_size, param_value,
param_value_size_ret);
+ } else {
+ return CL_INVALID_PLATFORM;
+ }
+}
+
cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device,
cl_program_build_info param_name, size_t
param_value_size,
void* param_value, size_t* param_value_size_ret) {
diff --git a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc
b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc
new file mode 100644
index 0000000000..a1bdeb9c14
--- /dev/null
+++ b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <tvm/runtime/profiling.h>
+
+#include <chrono>
+#include <regex>
+
+#include "../src/runtime/opencl/opencl_common.h"
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::cl;
+
+namespace {
+// This kernel was generated by TVM for conv2d operation
+const std::string kernelTemplate = R"(
+// Function: kernel_name_placeholder0
+__kernel void kernel_name_placeholder0(__write_only image2d_t
pad_temp_texture, __read_only image2d_t placeholder0) {
+ const sampler_t image_sampler = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+ float4 _1 = read_imagef(placeholder0, image_sampler,
(int2)(((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 9) - 1),
((((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) / 81) * 7) +
((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 81) / 9)) - 1)));
+ (void)write_imagef(pad_temp_texture, (int2)((((((int)get_group_id(0)) * 32)
+ ((int)get_local_id(0))) % 9), (((((int)get_group_id(0)) * 32) +
((int)get_local_id(0))) / 9)), (((((9 <= (((((int)get_group_id(0)) * 32) +
((int)get_local_id(0))) % 81)) && ((((((int)get_group_id(0)) * 32) +
((int)get_local_id(0))) % 81) < 72)) && (1 <= (((((int)get_group_id(0)) * 32) +
((int)get_local_id(0))) % 9))) && ((((((int)get_group_id(0)) * 32) +
((int)get_local_id(0))) % 9) < 8)) ? _1 : ((float4)((fl [...]
+}
+
+// Function: kernel_name_placeholder1
+__kernel void kernel_name_placeholder1(__read_only image2d_t pad_temp_texture,
__read_only image2d_t placeholder1, __write_only image2d_t compute, __read_only
image2d_t placeholder2, __read_only image2d_t placeholder3) {
+ const sampler_t image_sampler = CLK_NORMALIZED_COORDS_FALSE |
CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+ float4 compute1[14];
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 0);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 28);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 4);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 32);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 8);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 36);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 12);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 40);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 16);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 44);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 20);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 48);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 24);
+ vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 52);
+ for (int rc_inner = 0; rc_inner < 128; ++rc_inner) {
+ for (int ry_inner = 0; ry_inner < 3; ++ry_inner) {
+ for (int rx_inner = 0; rx_inner < 3; ++rx_inner) {
+ for (int rc = 0; rc < 4; ++rc) {
+ float4 _1 = read_imagef(pad_temp_texture, image_sampler,
(int2)((((int)get_local_id(0)) + rx_inner), ((rc_inner * 9) + ry_inner)));
+ float4 _2 = read_imagef(placeholder1, image_sampler,
(int2)(((((rc_inner * 36) + (rc * 9)) + (ry_inner * 3)) + rx_inner),
((((int)get_group_id(2)) * 16) + ((int)get_local_id(2)))));
+ vstore4((vload4(0, (float*)compute1 + 0) + (((float*)&_1)[rc] *
_2)), 0, (float*)compute1 + 0);
+ float4 _3 = read_imagef(placeholder1, image_sampler,
(int2)(((((rc_inner * 36) + (rc * 9)) + (ry_inner * 3)) + rx_inner),
(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8)));
+ vstore4((vload4(0, (float*)compute1 + 28) + (((float*)&_1)[rc] *
_3)), 0, (float*)compute1 + 28);
+ float4 _4 = read_imagef(pad_temp_texture, image_sampler,
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 1)));
+ vstore4((vload4(0, (float*)compute1 + 4) + (((float*)&_4)[rc] *
_2)), 0, (float*)compute1 + 4);
+ vstore4((vload4(0, (float*)compute1 + 32) + (((float*)&_4)[rc] *
_3)), 0, (float*)compute1 + 32);
+ float4 _5 = read_imagef(pad_temp_texture, image_sampler,
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 2)));
+ vstore4((vload4(0, (float*)compute1 + 8) + (((float*)&_5)[rc] *
_2)), 0, (float*)compute1 + 8);
+ vstore4((vload4(0, (float*)compute1 + 36) + (((float*)&_5)[rc] *
_3)), 0, (float*)compute1 + 36);
+ float4 _6 = read_imagef(pad_temp_texture, image_sampler,
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 3)));
+ vstore4((vload4(0, (float*)compute1 + 12) + (((float*)&_6)[rc] *
_2)), 0, (float*)compute1 + 12);
+ vstore4((vload4(0, (float*)compute1 + 40) + (((float*)&_6)[rc] *
_3)), 0, (float*)compute1 + 40);
+ float4 _7 = read_imagef(pad_temp_texture, image_sampler,
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 4)));
+ vstore4((vload4(0, (float*)compute1 + 16) + (((float*)&_7)[rc] *
_2)), 0, (float*)compute1 + 16);
+ vstore4((vload4(0, (float*)compute1 + 44) + (((float*)&_7)[rc] *
_3)), 0, (float*)compute1 + 44);
+ float4 _8 = read_imagef(pad_temp_texture, image_sampler,
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 5)));
+ vstore4((vload4(0, (float*)compute1 + 20) + (((float*)&_8)[rc] *
_2)), 0, (float*)compute1 + 20);
+ vstore4((vload4(0, (float*)compute1 + 48) + (((float*)&_8)[rc] *
_3)), 0, (float*)compute1 + 48);
+ float4 _9 = read_imagef(pad_temp_texture, image_sampler,
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 6)));
+ vstore4((vload4(0, (float*)compute1 + 24) + (((float*)&_9)[rc] *
_2)), 0, (float*)compute1 + 24);
+ vstore4((vload4(0, (float*)compute1 + 52) + (((float*)&_9)[rc] *
_3)), 0, (float*)compute1 + 52);
+ }
+ }
+ }
+ }
+ float4 _10 = read_imagef(placeholder2, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ float4 _11 = read_imagef(placeholder3, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7))),
max(((vload4(0, (float*)compute1 + 0) * _10) + _11),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _12 = read_imagef(placeholder2, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ float4 _13 = read_imagef(placeholder3, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 56)),
max(((vload4(0, (float*)compute1 + 28) * _12) + _13),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _14 = read_imagef(placeholder2, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ float4 _15 = read_imagef(placeholder3, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 1)),
max(((vload4(0, (float*)compute1 + 4) * _14) + _15),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _16 = read_imagef(placeholder2, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ float4 _17 = read_imagef(placeholder3, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 57)),
max(((vload4(0, (float*)compute1 + 32) * _16) + _17),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _18 = read_imagef(placeholder2, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ float4 _19 = read_imagef(placeholder3, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 2)),
max(((vload4(0, (float*)compute1 + 8) * _18) + _19),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _20 = read_imagef(placeholder2, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ float4 _21 = read_imagef(placeholder3, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 58)),
max(((vload4(0, (float*)compute1 + 36) * _20) + _21),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _22 = read_imagef(placeholder2, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ float4 _23 = read_imagef(placeholder3, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 3)),
max(((vload4(0, (float*)compute1 + 12) * _22) + _23),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _24 = read_imagef(placeholder2, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ float4 _25 = read_imagef(placeholder3, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 59)),
max(((vload4(0, (float*)compute1 + 40) * _24) + _25),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _26 = read_imagef(placeholder2, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ float4 _27 = read_imagef(placeholder3, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 4)),
max(((vload4(0, (float*)compute1 + 16) * _26) + _27),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _28 = read_imagef(placeholder2, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ float4 _29 = read_imagef(placeholder3, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 60)),
max(((vload4(0, (float*)compute1 + 44) * _28) + _29),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _30 = read_imagef(placeholder2, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ float4 _31 = read_imagef(placeholder3, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 5)),
max(((vload4(0, (float*)compute1 + 20) * _30) + _31),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _32 = read_imagef(placeholder2, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ float4 _33 = read_imagef(placeholder3, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 61)),
max(((vload4(0, (float*)compute1 + 48) * _32) + _33),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _34 = read_imagef(placeholder2, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ float4 _35 = read_imagef(placeholder3, image_sampler,
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 6)),
max(((vload4(0, (float*)compute1 + 24) * _34) + _35),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+ float4 _36 = read_imagef(placeholder2, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ float4 _37 = read_imagef(placeholder3, image_sampler,
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+ (void)write_imagef(compute, (int2)(((int)get_local_id(0)),
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 62)),
max(((vload4(0, (float*)compute1 + 52) * _36) + _37),
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f,
(float)0.000000e+00f))));
+}
+
+ )";
+} // namespace
+
+using Timestamp = std::chrono::time_point<std::chrono::high_resolution_clock>;
+
+class OpenCLCompileBin : public ::testing::Test {
+ protected:
+ virtual void SetUp() override {
+ m_workspace = OpenCLWorkspace::Global();
+ OpenCLThreadEntry* t = m_workspace->GetThreadEntry();
+ t->kernel_table.resize(m_kernelsNum * 2);
+ m_kernelNames.resize(m_kernelsNum * 2);
+ m_dataSrc = "";
+ m_fmap.clear();
+ for (size_t i = 0; i < m_kernelsNum; ++i) {
+ std::string kernel_name = "generated_kernel_" + std::to_string(i) + "_";
+ std::string kernelSource =
+ std::regex_replace(kernelTemplate,
std::regex("kernel_name_placeholder"), kernel_name);
+ FunctionInfo fi1 = {kernel_name + "0"};
+ FunctionInfo fi2 = {kernel_name + "1"};
+ m_fmap[fi1.name] = fi1;
+ m_fmap[fi2.name] = fi2;
+ m_kernelNames[i * 2] = fi1.name;
+ m_kernelNames[i * 2 + 1] = fi2.name;
+ m_dataSrc += kernelSource;
+ }
+ }
+
+ protected:
+ const size_t m_kernelsNum = 100;
+ const std::string m_tmpDirName = "OpenCLCompileBin_dir";
+ OpenCLWorkspace* m_workspace;
+ std::string m_dataSrc;
+ std::unordered_map<std::string, FunctionInfo> m_fmap;
+ std::vector<std::string> m_kernelNames;
+};
+
+TEST_F(OpenCLCompileBin, SourceVsBinaryCompilationPerf) {
+ double compileFromSourceTimeMS, compileFromBinTimeMS;
+ std::string bytes;
+ {
+ OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string());
+ module.Init();
+ Timestamp comp_start = std::chrono::high_resolution_clock::now();
+ for (size_t i = 0; i < m_kernelNames.size(); ++i) {
+ OpenCLModuleNode::KTRefEntry e = {i, 1};
+ module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(),
m_kernelNames[i], e);
+ }
+ Timestamp comp_end = std::chrono::high_resolution_clock::now();
+ auto get_pre_compiled_f =
+ module.GetFunction("opencl.GetPreCompiledPrograms",
GetObjectPtr<Object>(&module));
+ bytes = String(get_pre_compiled_f());
+ std::chrono::duration duration =
+ std::chrono::duration_cast<std::chrono::nanoseconds>(comp_end -
comp_start);
+ compileFromSourceTimeMS = duration.count() * 1e-6;
+ std::cout << "Compile time from source: " << compileFromSourceTimeMS << "
ms." << std::endl;
+ }
+ {
+ OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string());
+ module.Init();
+ module.GetFunction("opencl.SetPreCompiledPrograms",
+ GetObjectPtr<Object>(&module))(String(bytes));
+ Timestamp comp_start = std::chrono::high_resolution_clock::now();
+ for (size_t i = 0; i < m_kernelNames.size(); ++i) {
+ OpenCLModuleNode::KTRefEntry e = {i, 1};
+ module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(),
m_kernelNames[i], e);
+ }
+ Timestamp comp_end = std::chrono::high_resolution_clock::now();
+ std::chrono::duration duration =
+ std::chrono::duration_cast<std::chrono::nanoseconds>(comp_end -
comp_start);
+ compileFromBinTimeMS = duration.count() * 1e-6;
+ std::cout << "Compile time from bin: " << compileFromBinTimeMS << " ms."
<< std::endl;
+ }
+ ASSERT_LT(compileFromBinTimeMS, compileFromSourceTimeMS);
+}