[tvm] branch main updated: [OpenCL] Implement save/load pre-compiled programs (#13868)

srk Thu, 02 Feb 2023 18:28:52 -0800

This is an automated email from the ASF dual-hosted git repository.

srk pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git



The following commit(s) were added to refs/heads/main by this push:
     new 099ed94951 [OpenCL] Implement save/load pre-compiled programs (#13868)
099ed94951 is described below

commit 099ed949519f3b6ae182c31ce69496f18a1f60ad
Author: Egor Churaev <[email protected]>
AuthorDate: Fri Feb 3 05:28:35 2023 +0300

    [OpenCL] Implement save/load pre-compiled programs (#13868)
    
    * [OpenCL] Implement save/load pre-compiled programs
    
    Using pre-compiled programs might significantly improve inference time
    of the first run.
    
    - Added methods `SupportPreCompiledPrograms` which reports if the module
      supports using pre-compiled programs.
    - Method `GetPreCompiledPrograms` returns string with bytes of
      pre-compiled programs.
    - Method `SetPreCompiledPrograms` allows user to pass pre-compiled
      programs to the module.
    
    * Fix lint
    
    * Apply comment: PackedFunc is used
    
    * Fix build
    
    * Fix CI and rename functions
    
    * Apply comments
---
 apps/cpp_rtvm/README.md                            |  14 ++
 apps/cpp_rtvm/main.cc                              |   9 +
 apps/cpp_rtvm/tvm_runner.cc                        |  29 ++-
 apps/cpp_rtvm/tvm_runner.h                         |   4 +
 src/runtime/opencl/opencl_common.h                 |   2 +
 src/runtime/opencl/opencl_device_api.cc            |   4 +-
 src/runtime/opencl/opencl_module.cc                |  77 ++++++++
 .../opencl/opencl_wrapper/opencl_wrapper.cc        |  12 ++
 tests/cpp-runtime/opencl/opencl_compile_to_bin.cc  | 208 +++++++++++++++++++++
 9 files changed, 356 insertions(+), 3 deletions(-)

diff --git a/apps/cpp_rtvm/README.md b/apps/cpp_rtvm/README.md
index e696153282..c60a7b0e12 100644
--- a/apps/cpp_rtvm/README.md
+++ b/apps/cpp_rtvm/README.md
@@ -352,3 +352,17 @@ python3 -m tvm.driver.tvmc compile --cross-compiler 
${ANDROID_NDK_HOME}/toolchai
 python3 -m tvm.driver.tvmc run --device="cl" keras-resnet50.tar --rpc-key 
${TVM_RPC_KEY} --rpc-tracker {TVM_TRACKER_HOST}:{TVM_TRACKER_PORT} --print-time
 
 ```
+
+# Use pre-compiled OpenCL kernels
+Using pre-compiled programs might significantly improve inference time of the
+first run. E.g. for topology with ~300 kernels compilation time on Adreno was
+about 26 seconds. But after dumping compiled programs to binary files and reuse
+them on the next runs, the compilation time was significantly decreased (more
+than 1000 times) and starts to be around 25 ms.
+
+To use such functionality, the developer have to pass parameter 
`--pre-compiled`
+to the `rtvm` and specify the file name where pre-compiled programs will be
+stored. If the pre-compiled file name was passed to the `rtvm` then After 
method
+`Load`, method `UsePreCompiledProgram` is called. This method loads 
pre-compiled
+programs if the file exists. In opposite case the file will be created and
+pre-compiled programs will be saved to this file.
diff --git a/apps/cpp_rtvm/main.cc b/apps/cpp_rtvm/main.cc
index 31019ee0c9..c38a5f62bd 100644
--- a/apps/cpp_rtvm/main.cc
+++ b/apps/cpp_rtvm/main.cc
@@ -54,6 +54,7 @@ static const string kUsage =
     "--input        - Numpy file for the model input (optional and we use 
random of not given)\n"
     "--output       - Numpy file name to dump the model output as numpy\n"
     "--dump-meta    - Dump model meta information\n"
+    "--pre-compiled - The file name of a file where pre-compiled programs 
should be stored"
     "\n"
     "  Example\n"
     "  ./rtvm --model=keras-resnet50 --device=\"opencl\" --dump-meta\n"
@@ -66,12 +67,14 @@ static const string kUsage =
  * \arg device The target device to use {llvm, cl, ...etc.}
  * \arg input Numpy file for the model input
  * \arg output Numpy file name to dump the model output as numpy
+ * \arg pre_compiled File name where pre-compiled programs should be stored
  */
 struct ToolArgs {
   string model;
   string device;
   string input;
   string output;
+  string pre_compiled;
   bool dump_meta = false;
 };
 
@@ -84,6 +87,7 @@ void PrintArgs(const ToolArgs& args) {
   LOG(INFO) << "Device        = " << args.device;
   LOG(INFO) << "Input         = " << args.input;
   LOG(INFO) << "Output        = " << args.output;
+  LOG(INFO) << "Pre-compiled  = " << args.pre_compiled;
   LOG(INFO) << "Dump Metadata = " << ((args.dump_meta) ? ("True") : ("False"));
 }
 
@@ -172,6 +176,8 @@ void ParseCmdArgs(int argc, char* argv[], struct ToolArgs& 
args) {
   if (!pmeta.empty()) {
     args.dump_meta = true;
   }
+
+  args.pre_compiled = GetCmdOption(argc, argv, "--pre-compiled=");
 }
 
 /*!
@@ -190,6 +196,9 @@ int ExecuteModel(ToolArgs& args) {
 
   // Load the model
   runner.Load();
+  if (!args.pre_compiled.empty()) {
+    runner.UsePreCompiledPrograms(args.pre_compiled);
+  }
 
   // Query Model meta Information
   TVMMetaInfo mInfo = runner.GetMetaInfo();
diff --git a/apps/cpp_rtvm/tvm_runner.cc b/apps/cpp_rtvm/tvm_runner.cc
index 74498e8170..2fd4f2281e 100644
--- a/apps/cpp_rtvm/tvm_runner.cc
+++ b/apps/cpp_rtvm/tvm_runner.cc
@@ -27,6 +27,7 @@
 #include <cnpy.h>
 
 #include <fstream>
+#include <iterator>
 #include <streambuf>
 #include <string>
 
@@ -67,7 +68,8 @@ int GetTVMDevice(std::string device) {
  * \param path where the tfm compiler artifacts present.
  * \param device the target device where we need to load the compiled model.
  */
-TVMRunner::TVMRunner(std::string path, std::string device) : 
r_model_path(path), r_device(device) {
+TVMRunner::TVMRunner(std::string path, std::string device)
+    : r_model_path(path), r_device(device), r_run_was_called(false) {
   LOG(INFO) << "TVMRunner Constructor:" << r_model_path << " Devices:" << 
r_device;
 }
 
@@ -110,6 +112,30 @@ int TVMRunner::Load(void) {
   return 0;
 }
 
+/*!
+ * \brief Specify if the run programs should be dumped to binary and reused in 
the next runs.
+ * \param file_name File name where pre-compiled programs should be stored.
+ */
+void TVMRunner::UsePreCompiledPrograms(std::string file_name) {
+  if (r_run_was_called) {
+    LOG(INFO) << "TVMRunner UsePreCompiledPrograms: should be called before 
first run";
+    return;
+  }
+  auto f_get = r_mod_handle->GetFunction("opencl.GetPreCompiledPrograms", 
true);
+  auto f_set = r_mod_handle->GetFunction("opencl.SetPreCompiledPrograms", 
true);
+  if (f_get != nullptr && f_set != nullptr) {
+    std::ifstream ifs(file_name, std::ios::in | std::ios::binary);
+    if (ifs.fail()) {
+      auto bytes = String(f_get());
+      std::ofstream fs(file_name, std::ofstream::binary);
+      fs.write(bytes.c_str(), bytes.size());
+    } else {
+      std::string bytes((std::istreambuf_iterator<char>(ifs)), 
std::istreambuf_iterator<char>());
+      f_set(String(bytes));
+    }
+  }
+}
+
 /*!
  * \brief Calculated the memory size for the NDArray.
  * \param NDArray object.
@@ -242,6 +268,7 @@ int TVMRunner::GetOutput(std::string output_id, char* 
raw_output) {
  */
 int TVMRunner::Run(void) {
   LOG(INFO) << "TVMRunner::Run";
+  r_run_was_called = true;
 
   r_graph_handle.GetFunction("run")();
   return 0;
diff --git a/apps/cpp_rtvm/tvm_runner.h b/apps/cpp_rtvm/tvm_runner.h
index 37ba53606e..926e009c4c 100644
--- a/apps/cpp_rtvm/tvm_runner.h
+++ b/apps/cpp_rtvm/tvm_runner.h
@@ -56,6 +56,8 @@ class TVMRunner {
 
   /*! \brief Initiates graph runtime and with the compiled model */
   int Load(void);
+  /*! \brief Specify if the run programs should be dumped to binary and reused 
in the next runs */
+  void UsePreCompiledPrograms(std::string);
   /*! \brief Executes one inference cycle */
   int Run(void);
   /*! \brief To set the inputs from given npz file */
@@ -86,6 +88,8 @@ class TVMRunner {
   std::string r_device;
   /*! \brief Holds meta information queried from graph runtime */
   TVMMetaInfo mInfo;
+  /*! \brief Mark if the run method was called */
+  bool r_run_was_called;
 };
 
 }  // namespace runtime
diff --git a/src/runtime/opencl/opencl_common.h 
b/src/runtime/opencl/opencl_common.h
index c172a0f945..a295ea396c 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -438,6 +438,8 @@ class OpenCLModuleNode : public ModuleNode {
   // install a new kernel to thread local entry
   cl_kernel InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThreadEntry* t,
                           const std::string& func_name, const KTRefEntry& e);
+  void SetPreCompiledPrograms(const std::string& bytes);
+  std::string GetPreCompiledPrograms();
 
  private:
   // The workspace, need to keep reference to use it in destructor.
diff --git a/src/runtime/opencl/opencl_device_api.cc 
b/src/runtime/opencl/opencl_device_api.cc
index aa31d80d6e..c53523267d 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -202,7 +202,7 @@ void* 
OpenCLWorkspace::CreateHostPtrIfEnabled(cl::BufferDescriptor* desc, Device
   cl_int err_code;
   desc->host_ptr = reinterpret_cast<cl_uchar*>(
       clEnqueueMapBuffer(this->GetQueue(dev), desc->buffer, CL_TRUE, 
CL_MAP_WRITE, 0,
-                         sizeof(cl_uchar) * size, 0, NULL, NULL, &err_code));
+                         sizeof(cl_uchar) * size, 0, nullptr, nullptr, 
&err_code));
   OPENCL_CHECK_ERROR(err_code);
 #endif  // OPENCL_ENABLE_HOST_PTR
   return desc;
@@ -256,7 +256,7 @@ void OpenCLWorkspace::FreeDataSpace(Device dev, void* ptr) {
   cl::BufferDescriptor* desc = static_cast<cl::BufferDescriptor*>(ptr);
   if (desc->host_ptr) {
     clEnqueueUnmapMemObject(this->GetQueue(dev), desc->buffer,
-                            reinterpret_cast<void*>(desc->host_ptr), 0, NULL, 
NULL);
+                            reinterpret_cast<void*>(desc->host_ptr), 0, 
nullptr, nullptr);
   }
   OPENCL_CALL(clReleaseMemObject(desc->buffer));
   delete desc;
diff --git a/src/runtime/opencl/opencl_module.cc 
b/src/runtime/opencl/opencl_module.cc
index 2fb157aac6..ad41a34dde 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -137,6 +137,15 @@ cl::OpenCLWorkspace* 
OpenCLModuleNode::GetGlobalWorkspace() {
 PackedFunc OpenCLModuleNode::GetFunction(const std::string& name,
                                          const ObjectPtr<Object>& 
sptr_to_self) {
   ICHECK_EQ(sptr_to_self.get(), this);
+  if (name == "opencl.GetPreCompiledPrograms") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      *rv = this->GetPreCompiledPrograms();
+    });
+  } else if (name == "opencl.SetPreCompiledPrograms") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+      this->SetPreCompiledPrograms(args[0]);
+    });
+  }
   ICHECK_NE(name, symbol::tvm_module_main) << "Device function do not have 
main";
   auto it = fmap_.find(name);
   if (it == fmap_.end()) return PackedFunc();
@@ -262,6 +271,74 @@ cl_kernel 
OpenCLModuleNode::InstallKernel(cl::OpenCLWorkspace* w, cl::OpenCLThre
   return kernel;
 }
 
+void OpenCLModuleNode::SetPreCompiledPrograms(const std::string& bytes) {
+  std::string data = bytes;
+  dmlc::MemoryStringStream reader(&data);
+  dmlc::Stream* strm = &reader;
+  uint64_t kernels_num;
+  strm->Read(&kernels_num);
+  cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry();
+  int device_id = t->device.device_id;
+  for (size_t i = 0; i < kernels_num; ++i) {
+    std::string name;
+    std::vector<unsigned char> bin_vector;
+    strm->Read(&name);
+    strm->Read(&bin_vector);
+    if (programs_[name][device_id] == nullptr) {
+      cl_int err = 0;
+      cl_int binaryStatus;
+      size_t binarySize = bin_vector.size();
+      const unsigned char* programBinary = bin_vector.data();
+
+      cl_device_id dev = workspace_->devices[device_id];
+      programs_[name][device_id] = clCreateProgramWithBinary(
+          workspace_->context, 1, &dev, &binarySize, &programBinary, 
&binaryStatus, &err);
+      OPENCL_CHECK_ERROR(err);
+      OPENCL_CHECK_ERROR(binaryStatus);
+
+      err = clBuildProgram(programs_[name][device_id], 0, nullptr, nullptr, 
nullptr, nullptr);
+      if (err != CL_SUCCESS) {
+        size_t len;
+        std::string log;
+        clGetProgramBuildInfo(programs_[name][device_id], dev, 
CL_PROGRAM_BUILD_LOG, 0, nullptr,
+                              &len);
+        log.resize(len);
+        clGetProgramBuildInfo(programs_[name][device_id], dev, 
CL_PROGRAM_BUILD_LOG, len, &log[0],
+                              nullptr);
+        LOG(FATAL) << "OpenCL build error for device=" << dev << "\n" << log;
+      }
+    }
+  }
+}
+
+std::string OpenCLModuleNode::GetPreCompiledPrograms() {
+  std::string data;
+  dmlc::MemoryStringStream writer(&data);
+  dmlc::Stream* strm = &writer;
+  strm->Write(static_cast<uint64_t>(parsed_kernels_.size()));
+  for (auto& it : parsed_kernels_) {
+    std::string name = it.first;
+    cl::OpenCLThreadEntry* t = workspace_->GetThreadEntry();
+    int device_id = t->device.device_id;
+    t->kernel_table.resize(workspace_->num_registered_kernels);
+    if (programs_[std::string(name)][device_id] == nullptr) {
+      InstallKernel(workspace_, t, name, kid_map_[name]);
+    }
+    size_t size;
+    clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARY_SIZES, 
sizeof(size_t), &size,
+                     nullptr);
+    ICHECK(size > 0) << "Size of binary is 0";
+    std::vector<unsigned char> bin_vector(size);
+    unsigned char* binary = bin_vector.data();
+    clGetProgramInfo(programs_[name][device_id], CL_PROGRAM_BINARIES, 
sizeof(unsigned char*),
+                     &binary, nullptr);
+
+    strm->Write(name);
+    strm->Write(bin_vector);
+  }
+  return data;
+}
+
 Module OpenCLModuleCreate(std::string data, std::string fmt,
                           std::unordered_map<std::string, FunctionInfo> fmap, 
std::string source) {
   auto n = make_object<OpenCLModuleNode>(data, fmt, fmap, source);
diff --git a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc 
b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
index 2c27689454..13b7d94706 100644
--- a/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
+++ b/src/runtime/opencl/opencl_wrapper/opencl_wrapper.cc
@@ -137,6 +137,7 @@ using f_clCreateProgramWithBinary = cl_program 
(*)(cl_context, cl_uint, const cl
 using f_clReleaseProgram = cl_int (*)(cl_program);
 using f_clBuildProgram = cl_int (*)(cl_program, cl_uint, const cl_device_id*, 
const char*,
                                     void (*pfn_notify)(cl_program program, 
void* user_data), void*);
+using f_clGetProgramInfo = cl_int (*)(cl_program, cl_program_info, size_t, 
void*, size_t*);
 using f_clGetProgramBuildInfo = cl_int (*)(cl_program, cl_device_id, 
cl_program_build_info, size_t,
                                            void*, size_t*);
 using f_clCreateKernel = cl_kernel (*)(cl_program, const char*, cl_int*);
@@ -347,6 +348,17 @@ cl_int clBuildProgram(cl_program program, cl_uint 
num_devices, const cl_device_i
   }
 }
 
+cl_int clGetProgramInfo(cl_program program, cl_program_info param_name, size_t 
param_value_size,
+                        void* param_value, size_t* param_value_size_ret) {
+  auto& lib = LibOpenCLWrapper::getInstance();
+  auto func = (f_clGetProgramInfo)lib.getOpenCLFunction("clGetProgramInfo");
+  if (func) {
+    return func(program, param_name, param_value_size, param_value, 
param_value_size_ret);
+  } else {
+    return CL_INVALID_PLATFORM;
+  }
+}
+
 cl_int clGetProgramBuildInfo(cl_program program, cl_device_id device,
                              cl_program_build_info param_name, size_t 
param_value_size,
                              void* param_value, size_t* param_value_size_ret) {
diff --git a/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc 
b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc
new file mode 100644
index 0000000000..a1bdeb9c14
--- /dev/null
+++ b/tests/cpp-runtime/opencl/opencl_compile_to_bin.cc
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <gtest/gtest.h>
+#include <tvm/runtime/profiling.h>
+
+#include <chrono>
+#include <regex>
+
+#include "../src/runtime/opencl/opencl_common.h"
+
+using namespace tvm::runtime;
+using namespace tvm::runtime::cl;
+
+namespace {
+// This kernel was generated by TVM for conv2d operation
+const std::string kernelTemplate = R"(
+// Function: kernel_name_placeholder0
+__kernel void kernel_name_placeholder0(__write_only image2d_t 
pad_temp_texture, __read_only image2d_t placeholder0) {
+  const sampler_t image_sampler = CLK_NORMALIZED_COORDS_FALSE | 
CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  float4 _1 = read_imagef(placeholder0, image_sampler, 
(int2)(((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 9) - 1), 
((((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) / 81) * 7) + 
((((((int)get_group_id(0)) * 32) + ((int)get_local_id(0))) % 81) / 9)) - 1)));
+  (void)write_imagef(pad_temp_texture, (int2)((((((int)get_group_id(0)) * 32) 
+ ((int)get_local_id(0))) % 9), (((((int)get_group_id(0)) * 32) + 
((int)get_local_id(0))) / 9)), (((((9 <= (((((int)get_group_id(0)) * 32) + 
((int)get_local_id(0))) % 81)) && ((((((int)get_group_id(0)) * 32) + 
((int)get_local_id(0))) % 81) < 72)) && (1 <= (((((int)get_group_id(0)) * 32) + 
((int)get_local_id(0))) % 9))) && ((((((int)get_group_id(0)) * 32) + 
((int)get_local_id(0))) % 9) < 8)) ? _1 : ((float4)((fl [...]
+}
+
+// Function: kernel_name_placeholder1
+__kernel void kernel_name_placeholder1(__read_only image2d_t pad_temp_texture, 
__read_only image2d_t placeholder1, __write_only image2d_t compute, __read_only 
image2d_t placeholder2, __read_only image2d_t placeholder3) {
+  const sampler_t image_sampler = CLK_NORMALIZED_COORDS_FALSE | 
CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
+  float4 compute1[14];
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 0);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 28);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 4);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 32);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 8);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 36);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 12);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 40);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 16);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 44);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 20);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 48);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 24);
+  vstore4(((float4)((float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f, (float)0.000000e+00f)), 0, (float*)compute1 + 52);
+  for (int rc_inner = 0; rc_inner < 128; ++rc_inner) {
+    for (int ry_inner = 0; ry_inner < 3; ++ry_inner) {
+      for (int rx_inner = 0; rx_inner < 3; ++rx_inner) {
+        for (int rc = 0; rc < 4; ++rc) {
+          float4 _1 = read_imagef(pad_temp_texture, image_sampler, 
(int2)((((int)get_local_id(0)) + rx_inner), ((rc_inner * 9) + ry_inner)));
+          float4 _2 = read_imagef(placeholder1, image_sampler, 
(int2)(((((rc_inner * 36) + (rc * 9)) + (ry_inner * 3)) + rx_inner), 
((((int)get_group_id(2)) * 16) + ((int)get_local_id(2)))));
+          vstore4((vload4(0, (float*)compute1 + 0) + (((float*)&_1)[rc] * 
_2)), 0, (float*)compute1 + 0);
+          float4 _3 = read_imagef(placeholder1, image_sampler, 
(int2)(((((rc_inner * 36) + (rc * 9)) + (ry_inner * 3)) + rx_inner), 
(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8)));
+          vstore4((vload4(0, (float*)compute1 + 28) + (((float*)&_1)[rc] * 
_3)), 0, (float*)compute1 + 28);
+          float4 _4 = read_imagef(pad_temp_texture, image_sampler, 
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 1)));
+          vstore4((vload4(0, (float*)compute1 + 4) + (((float*)&_4)[rc] * 
_2)), 0, (float*)compute1 + 4);
+          vstore4((vload4(0, (float*)compute1 + 32) + (((float*)&_4)[rc] * 
_3)), 0, (float*)compute1 + 32);
+          float4 _5 = read_imagef(pad_temp_texture, image_sampler, 
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 2)));
+          vstore4((vload4(0, (float*)compute1 + 8) + (((float*)&_5)[rc] * 
_2)), 0, (float*)compute1 + 8);
+          vstore4((vload4(0, (float*)compute1 + 36) + (((float*)&_5)[rc] * 
_3)), 0, (float*)compute1 + 36);
+          float4 _6 = read_imagef(pad_temp_texture, image_sampler, 
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 3)));
+          vstore4((vload4(0, (float*)compute1 + 12) + (((float*)&_6)[rc] * 
_2)), 0, (float*)compute1 + 12);
+          vstore4((vload4(0, (float*)compute1 + 40) + (((float*)&_6)[rc] * 
_3)), 0, (float*)compute1 + 40);
+          float4 _7 = read_imagef(pad_temp_texture, image_sampler, 
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 4)));
+          vstore4((vload4(0, (float*)compute1 + 16) + (((float*)&_7)[rc] * 
_2)), 0, (float*)compute1 + 16);
+          vstore4((vload4(0, (float*)compute1 + 44) + (((float*)&_7)[rc] * 
_3)), 0, (float*)compute1 + 44);
+          float4 _8 = read_imagef(pad_temp_texture, image_sampler, 
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 5)));
+          vstore4((vload4(0, (float*)compute1 + 20) + (((float*)&_8)[rc] * 
_2)), 0, (float*)compute1 + 20);
+          vstore4((vload4(0, (float*)compute1 + 48) + (((float*)&_8)[rc] * 
_3)), 0, (float*)compute1 + 48);
+          float4 _9 = read_imagef(pad_temp_texture, image_sampler, 
(int2)((((int)get_local_id(0)) + rx_inner), (((rc_inner * 9) + ry_inner) + 6)));
+          vstore4((vload4(0, (float*)compute1 + 24) + (((float*)&_9)[rc] * 
_2)), 0, (float*)compute1 + 24);
+          vstore4((vload4(0, (float*)compute1 + 52) + (((float*)&_9)[rc] * 
_3)), 0, (float*)compute1 + 52);
+        }
+      }
+    }
+  }
+  float4 _10 = read_imagef(placeholder2, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  float4 _11 = read_imagef(placeholder3, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7))), 
max(((vload4(0, (float*)compute1 + 0) * _10) + _11), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _12 = read_imagef(placeholder2, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  float4 _13 = read_imagef(placeholder3, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 56)), 
max(((vload4(0, (float*)compute1 + 28) * _12) + _13), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _14 = read_imagef(placeholder2, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  float4 _15 = read_imagef(placeholder3, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 1)), 
max(((vload4(0, (float*)compute1 + 4) * _14) + _15), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _16 = read_imagef(placeholder2, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  float4 _17 = read_imagef(placeholder3, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 57)), 
max(((vload4(0, (float*)compute1 + 32) * _16) + _17), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _18 = read_imagef(placeholder2, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  float4 _19 = read_imagef(placeholder3, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 2)), 
max(((vload4(0, (float*)compute1 + 8) * _18) + _19), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _20 = read_imagef(placeholder2, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  float4 _21 = read_imagef(placeholder3, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 58)), 
max(((vload4(0, (float*)compute1 + 36) * _20) + _21), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _22 = read_imagef(placeholder2, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  float4 _23 = read_imagef(placeholder3, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 3)), 
max(((vload4(0, (float*)compute1 + 12) * _22) + _23), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _24 = read_imagef(placeholder2, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  float4 _25 = read_imagef(placeholder3, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 59)), 
max(((vload4(0, (float*)compute1 + 40) * _24) + _25), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _26 = read_imagef(placeholder2, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  float4 _27 = read_imagef(placeholder3, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 4)), 
max(((vload4(0, (float*)compute1 + 16) * _26) + _27), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _28 = read_imagef(placeholder2, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  float4 _29 = read_imagef(placeholder3, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 60)), 
max(((vload4(0, (float*)compute1 + 44) * _28) + _29), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _30 = read_imagef(placeholder2, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  float4 _31 = read_imagef(placeholder3, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 5)), 
max(((vload4(0, (float*)compute1 + 20) * _30) + _31), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _32 = read_imagef(placeholder2, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  float4 _33 = read_imagef(placeholder3, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 61)), 
max(((vload4(0, (float*)compute1 + 48) * _32) + _33), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _34 = read_imagef(placeholder2, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  float4 _35 = read_imagef(placeholder3, image_sampler, 
(int2)(((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 6)), 
max(((vload4(0, (float*)compute1 + 24) * _34) + _35), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+  float4 _36 = read_imagef(placeholder2, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  float4 _37 = read_imagef(placeholder3, image_sampler, 
(int2)((((((int)get_group_id(2)) * 16) + ((int)get_local_id(2))) + 8), 0));
+  (void)write_imagef(compute, (int2)(((int)get_local_id(0)), 
(((((int)get_group_id(2)) * 112) + (((int)get_local_id(2)) * 7)) + 62)), 
max(((vload4(0, (float*)compute1 + 52) * _36) + _37), 
((float4)((float)0.000000e+00f, (float)0.000000e+00f, (float)0.000000e+00f, 
(float)0.000000e+00f))));
+}
+
+    )";
+}  // namespace
+
+using Timestamp = std::chrono::time_point<std::chrono::high_resolution_clock>;
+
+class OpenCLCompileBin : public ::testing::Test {
+ protected:
+  virtual void SetUp() override {
+    m_workspace = OpenCLWorkspace::Global();
+    OpenCLThreadEntry* t = m_workspace->GetThreadEntry();
+    t->kernel_table.resize(m_kernelsNum * 2);
+    m_kernelNames.resize(m_kernelsNum * 2);
+    m_dataSrc = "";
+    m_fmap.clear();
+    for (size_t i = 0; i < m_kernelsNum; ++i) {
+      std::string kernel_name = "generated_kernel_" + std::to_string(i) + "_";
+      std::string kernelSource =
+          std::regex_replace(kernelTemplate, 
std::regex("kernel_name_placeholder"), kernel_name);
+      FunctionInfo fi1 = {kernel_name + "0"};
+      FunctionInfo fi2 = {kernel_name + "1"};
+      m_fmap[fi1.name] = fi1;
+      m_fmap[fi2.name] = fi2;
+      m_kernelNames[i * 2] = fi1.name;
+      m_kernelNames[i * 2 + 1] = fi2.name;
+      m_dataSrc += kernelSource;
+    }
+  }
+
+ protected:
+  const size_t m_kernelsNum = 100;
+  const std::string m_tmpDirName = "OpenCLCompileBin_dir";
+  OpenCLWorkspace* m_workspace;
+  std::string m_dataSrc;
+  std::unordered_map<std::string, FunctionInfo> m_fmap;
+  std::vector<std::string> m_kernelNames;
+};
+
+TEST_F(OpenCLCompileBin, SourceVsBinaryCompilationPerf) {
+  double compileFromSourceTimeMS, compileFromBinTimeMS;
+  std::string bytes;
+  {
+    OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string());
+    module.Init();
+    Timestamp comp_start = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < m_kernelNames.size(); ++i) {
+      OpenCLModuleNode::KTRefEntry e = {i, 1};
+      module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), 
m_kernelNames[i], e);
+    }
+    Timestamp comp_end = std::chrono::high_resolution_clock::now();
+    auto get_pre_compiled_f =
+        module.GetFunction("opencl.GetPreCompiledPrograms", 
GetObjectPtr<Object>(&module));
+    bytes = String(get_pre_compiled_f());
+    std::chrono::duration duration =
+        std::chrono::duration_cast<std::chrono::nanoseconds>(comp_end - 
comp_start);
+    compileFromSourceTimeMS = duration.count() * 1e-6;
+    std::cout << "Compile time from source:  " << compileFromSourceTimeMS << " 
ms." << std::endl;
+  }
+  {
+    OpenCLModuleNode module(m_dataSrc, "cl", m_fmap, std::string());
+    module.Init();
+    module.GetFunction("opencl.SetPreCompiledPrograms",
+                       GetObjectPtr<Object>(&module))(String(bytes));
+    Timestamp comp_start = std::chrono::high_resolution_clock::now();
+    for (size_t i = 0; i < m_kernelNames.size(); ++i) {
+      OpenCLModuleNode::KTRefEntry e = {i, 1};
+      module.InstallKernel(m_workspace, m_workspace->GetThreadEntry(), 
m_kernelNames[i], e);
+    }
+    Timestamp comp_end = std::chrono::high_resolution_clock::now();
+    std::chrono::duration duration =
+        std::chrono::duration_cast<std::chrono::nanoseconds>(comp_end - 
comp_start);
+    compileFromBinTimeMS = duration.count() * 1e-6;
+    std::cout << "Compile time from bin:  " << compileFromBinTimeMS << " ms." 
<< std::endl;
+  }
+  ASSERT_LT(compileFromBinTimeMS, compileFromSourceTimeMS);
+}

[tvm] branch main updated: [OpenCL] Implement save/load pre-compiled programs (#13868)

Reply via email to