This is an automated email from the ASF dual-hosted git repository.
masahi pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 0f6abea1ca [OpenCL] Change of OpenCL profiling logic (#11180)
0f6abea1ca is described below
commit 0f6abea1cafc664af0cb97f348906ae5715a5f51
Author: Kirill Snezhko <[email protected]>
AuthorDate: Tue May 10 23:14:16 2022 +0300
[OpenCL] Change of OpenCL profiling logic (#11180)
* Enable profiling only when it is used explicitly
* Change logic of clCommandQueue create/destroy
* Update comments
* Linter fix
* Refactor queue create
* Move queue recreation logic to function
* Replace profiling flag by the queue info request
* Enhance readability
* Fix linter errors
---
src/runtime/opencl/opencl_common.h | 57 ++++++++++++++++++++++++++++-----
src/runtime/opencl/opencl_device_api.cc | 5 ---
src/runtime/opencl/opencl_module.cc | 19 +++++------
3 files changed, 59 insertions(+), 22 deletions(-)
diff --git a/src/runtime/opencl/opencl_common.h
b/src/runtime/opencl/opencl_common.h
index 18061a7aee..c2905b4327 100644
--- a/src/runtime/opencl/opencl_common.h
+++ b/src/runtime/opencl/opencl_common.h
@@ -274,6 +274,16 @@ class OpenCLWorkspace : public DeviceAPI {
<< "Invalid OpenCL device_id=" << dev.device_id;
return events[dev.device_id];
}
+ // is current clCommandQueue in profiling mode
+ bool IsProfiling(Device dev) {
+ cl_command_queue queue = GetQueue(dev);
+ cl_command_queue_properties prop;
+
+ OPENCL_CALL(clGetCommandQueueInfo(queue, CL_QUEUE_PROPERTIES,
+ sizeof(cl_command_queue_properties),
&prop, nullptr));
+
+ return prop & CL_QUEUE_PROFILING_ENABLE;
+ }
// override device API
void SetDevice(Device dev) final;
@@ -422,23 +432,32 @@ class OpenCLTimerNode : public TimerNode {
virtual void Start() {
cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).clear();
this->duration = 0;
+ // Very first call of Start() leads to the recreation of
+ // OpenCL command queue in profiling mode. This allows to run profile
after inference.
+ recreateCommandQueue();
}
// Timer stop
virtual void Stop() {
std::vector<cl_event> evt_queue =
cl::OpenCLWorkspace::Global()->GetEventQueue(dev_);
cl_ulong start, end;
- OPENCL_CALL(clWaitForEvents(1,
&(cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).back())));
- for (auto& kevt : evt_queue) {
- OPENCL_CALL(clGetEventProfilingInfo(kevt, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
- &start, nullptr));
- OPENCL_CALL(
- clGetEventProfilingInfo(kevt, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &end, nullptr));
- this->duration += (end - start);
+ if (cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).size() > 0) {
+ OPENCL_CALL(clWaitForEvents(1,
&(cl::OpenCLWorkspace::Global()->GetEventQueue(dev_).back())));
+ for (auto& kevt : evt_queue) {
+ OPENCL_CALL(clGetEventProfilingInfo(kevt, CL_PROFILING_COMMAND_START,
sizeof(cl_ulong),
+ &start, nullptr));
+ OPENCL_CALL(clGetEventProfilingInfo(kevt, CL_PROFILING_COMMAND_END,
sizeof(cl_ulong), &end,
+ nullptr));
+ this->duration += (end - start);
+ }
}
}
virtual int64_t SyncAndGetElapsedNanos() { return this->duration; }
// destructor
- virtual ~OpenCLTimerNode() {}
+ virtual ~OpenCLTimerNode() {
+ // Profiling session ends, recreate clCommandQueue in non-profiling mode
+ // This will disable collection of cl_events in case of executing
inference after profile
+ recreateCommandQueue();
+ }
// constructor
OpenCLTimerNode() {}
explicit OpenCLTimerNode(Device dev) : dev_(dev) {}
@@ -449,6 +468,28 @@ class OpenCLTimerNode : public TimerNode {
private:
int64_t duration;
Device dev_;
+
+ void recreateCommandQueue() {
+ cl_command_queue_properties prop;
+ if (!cl::OpenCLWorkspace::Global()->IsProfiling(dev_)) {
+ prop = CL_QUEUE_PROFILING_ENABLE;
+ } else {
+ prop = 0;
+ }
+
+ auto queue = cl::OpenCLWorkspace::Global()->GetQueue(dev_);
+
+ OPENCL_CALL(clFlush(queue));
+ OPENCL_CALL(clFinish(queue));
+ OPENCL_CALL(clReleaseCommandQueue(queue));
+
+ cl_int err_code;
+ cl_device_id did = cl::OpenCLWorkspace::Global()->devices[dev_.device_id];
+ auto profiling_queue =
+ clCreateCommandQueue(cl::OpenCLWorkspace::Global()->context, did,
prop, &err_code);
+ OPENCL_CHECK_ERROR(err_code);
+ cl::OpenCLWorkspace::Global()->queues[dev_.device_id] = profiling_queue;
+ }
};
} // namespace runtime
} // namespace tvm
diff --git a/src/runtime/opencl/opencl_device_api.cc
b/src/runtime/opencl/opencl_device_api.cc
index c352716042..80b95a6ebf 100644
--- a/src/runtime/opencl/opencl_device_api.cc
+++ b/src/runtime/opencl/opencl_device_api.cc
@@ -426,12 +426,7 @@ void OpenCLWorkspace::Init(const std::string& type_key,
const std::string& devic
ICHECK_EQ(this->queues.size(), 0U);
for (size_t i = 0; i < this->devices.size(); ++i) {
cl_device_id did = this->devices[i];
-#ifdef USE_PROFILER
- this->queues.push_back(
- clCreateCommandQueue(this->context, did, CL_QUEUE_PROFILING_ENABLE,
&err_code));
-#else
this->queues.push_back(clCreateCommandQueue(this->context, did, 0,
&err_code));
-#endif
OPENCL_CHECK_ERROR(err_code);
}
this->events.resize(this->devices.size());
diff --git a/src/runtime/opencl/opencl_module.cc
b/src/runtime/opencl/opencl_module.cc
index e08c6070bc..9ae80d59d5 100644
--- a/src/runtime/opencl/opencl_module.cc
+++ b/src/runtime/opencl/opencl_module.cc
@@ -79,15 +79,16 @@ class OpenCLWrappedFunc {
wl.work_size[i] *= wl.work_size[i + 3];
}
// launch kernel
-#ifdef USE_PROFILER
- w_->GetEventQueue(t->device).resize(w_->GetEventQueue(t->device).size() +
1);
- OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr,
wl.work_size,
- wl.work_size + 3, 0, nullptr,
-
&(w_->GetEventQueue(t->device).back())));
-#else
- OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr,
wl.work_size,
- wl.work_size + 3, 0, nullptr, nullptr));
-#endif
+
+ if (w_->IsProfiling(t->device)) {
+ w_->GetEventQueue(t->device).resize(w_->GetEventQueue(t->device).size()
+ 1);
+ OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr,
wl.work_size,
+ wl.work_size + 3, 0, nullptr,
+
&(w_->GetEventQueue(t->device).back())));
+ } else {
+ OPENCL_CALL(clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr,
wl.work_size,
+ wl.work_size + 3, 0, nullptr,
nullptr));
+ }
}
private: