This is an automated email from the ASF dual-hosted git repository.
tkonolige pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new 92da138bcb [Profiler] Allow user to flush L2 cache in `time_evalutor`
function for profiling CUDA kernels (#13726)
92da138bcb is described below
commit 92da138bcb18c76604763785d3177de3f9f9b296
Author: Zihao Ye <[email protected]>
AuthorDate: Wed Jan 11 02:17:24 2023 +0800
[Profiler] Allow user to flush L2 cache in `time_evalutor` function for
profiling CUDA kernels (#13726)
Currently, our default profiler (time_evaluator) does not flush the L2
cache per execution, this might lead to incorrect time measurement because the
input data last run might reside in L2 cache and reduce the data fetching time
in the next run. Both Triton and nvbench consider this effect thus reporting
more accurate measurements.
Solution: time_evalutor has an argument f_preproc where user can specify a
pre-processing function per execution of the kernel being evaluated. Currently,
TVM supports cache_flush_cpu_non_first_arg which flushes CPU cache. But similar
functionality for GPU is missing.
This PR completely borrows the design of nvbench's l2flush struct and allow
the user to specify "l2_cache_flush_cuda" as a preprocessing function which
flushes NVIDIA GPU's L2 cache. l2_cache_flush_cuda is not a default value so
existing program's behavior would not be influenced.
---
3rdparty/nvbench/l2_cache_flush.h | 74 ++++++++++++++++
LICENSE | 2 +-
LICENSE => licenses/LICENSE.l2_cache_flush.txt | 99 ++++++++--------------
src/runtime/cuda/l2_cache_flush.cc | 42 +++++++++
.../python/unittest/test_evaluator_with_preproc.py | 60 +++++++++++++
5 files changed, 212 insertions(+), 65 deletions(-)
diff --git a/3rdparty/nvbench/l2_cache_flush.h
b/3rdparty/nvbench/l2_cache_flush.h
new file mode 100644
index 0000000000..3d02115645
--- /dev/null
+++ b/3rdparty/nvbench/l2_cache_flush.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2021 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 with the LLVM exception
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
+ *
+ * You may obtain a copy of the License at
+ *
+ * http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * \file l2_cache_flush.h
+ * \brief Functions to flush L2 cache using CUDA's API, adopted from nvbench.
+ */
+#ifndef L2_CACHE_FLUSH_H_
+#define L2_CACHE_FLUSH_H_
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <dmlc/logging.h>
+
+namespace tvm {
+namespace runtime {
+
+#define CUDA_CALL(func) \
+ { \
+ cudaError_t e = (func); \
+ ICHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
+ << "CUDA: " << cudaGetErrorString(e); \
+ }
+
+class L2Flush {
+ public:
+ L2Flush() : initialized_(false), l2_size_(0), l2_buffer_(nullptr) {}
+
+ ~L2Flush() {
+ if (l2_size_ > 0) {
+ CUDA_CALL(cudaFree(l2_buffer_));
+ }
+ }
+
+ void Flush(cudaStream_t stream) {
+ if (!initialized_) {
+ // initialize l2_buffer_ and l2_size_
+ initialized_ = true;
+ int device_id;
+ CUDA_CALL(cudaGetDevice(&device_id));
+ CUDA_CALL(cudaDeviceGetAttribute(&l2_size_, cudaDevAttrL2CacheSize,
device_id));
+ if (l2_size_ > 0) {
+ CUDA_CALL(cudaMalloc(reinterpret_cast<void**>(&l2_buffer_), l2_size_));
+ }
+ }
+ if (l2_size_ > 0) {
+ CUDA_CALL(cudaMemsetAsync(l2_buffer_, 0, l2_size_, stream));
+ }
+ }
+
+ static L2Flush* ThreadLocal();
+
+ private:
+ bool initialized_ = false;
+ int l2_size_;
+ int* l2_buffer_;
+};
+
+} // namespace runtime
+} // namespace tvm
+
+#endif // L2_CACHE_FLUSH_H_
diff --git a/LICENSE b/LICENSE
index 6524d530de..fbc11be2de 100644
--- a/LICENSE
+++ b/LICENSE
@@ -212,6 +212,7 @@ Apache Software Foundation License 2.0
3rdparty/dlpack
3rdparty/dmlc-core
3rdparty/OpenCL-Headers
+3rdparty/nvbench (with LLVM exception)
BSD 2-clause License
@@ -234,7 +235,6 @@ MIT License
3rdparty/cma
3rdparty/compiler-rt/builtin_fp16.h
-
The Unlicense
-------------
diff --git a/LICENSE b/licenses/LICENSE.l2_cache_flush.txt
similarity index 82%
copy from LICENSE
copy to licenses/LICENSE.l2_cache_flush.txt
index 6524d530de..bd8b243dfa 100644
--- a/LICENSE
+++ b/licenses/LICENSE.l2_cache_flush.txt
@@ -2,9 +2,9 @@
Version 2.0, January 2004
http://www.apache.org/licenses/
- TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
- 1. Definitions.
+ 1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
@@ -63,14 +63,14 @@
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
- 2. Grant of Copyright License. Subject to the terms and conditions of
+ 2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
- 3. Grant of Patent License. Subject to the terms and conditions of
+ 3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
@@ -86,7 +86,7 @@
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
- 4. Redistribution. You may reproduce and distribute copies of the
+ 4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
@@ -127,7 +127,7 @@
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
- 5. Submission of Contributions. Unless You explicitly state otherwise,
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
@@ -135,12 +135,12 @@
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
- 6. Trademarks. This License does not grant permission to use the trade
+ 6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
- 7. Disclaimer of Warranty. Unless required by applicable law or
+ 7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
@@ -150,7 +150,7 @@
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
- 8. Limitation of Liability. In no event and under no legal theory,
+ 8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
@@ -162,7 +162,7 @@
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
- 9. Accepting Warranty or Additional Liability. While redistributing
+ 9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
@@ -173,12 +173,12 @@
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
- END OF TERMS AND CONDITIONS
+ END OF TERMS AND CONDITIONS
- APPENDIX: How to apply the Apache License to your work.
+ APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
- boilerplate notice, with the fields enclosed by brackets "{}"
+ boilerplate notice, with the fields enclosed by brackets "[]"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
@@ -186,62 +186,33 @@
same "printed page" as the copyright notice for easier
identification within third-party archives.
- Copyright {yyyy} {name of copyright owner}
+ Copyright [yyyy] [name of copyright owner]
- Licensed under the Apache License, Version 2.0 (the "License");
- you may not use this file except in compliance with the License.
- You may obtain a copy of the License at
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
-------------------------------------------------------------------------------------
-This product bundles various third-party components under other open source
licenses.
-This section summarizes those components and their licenses. See licenses/
-for text of these licenses.
+--- LLVM Exceptions to the Apache 2.0 License ----
-Apache Software Foundation License 2.0
---------------------------------------
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
-3rdparty/dlpack
-3rdparty/dmlc-core
-3rdparty/OpenCL-Headers
-
-
-BSD 2-clause License
---------------------
-
-3rdparty/picojson
-3rdparty/dmlc-core/include/dmlc/concurrentqueue.h
-
-
-BSD 2-clause License + zlib License
------------------------------------
-
-3rdparty/dmlc-core/include/dmlc/blockingconcurrentqueue.h
-
-
-MIT License
------------
-
-3rdparty/libcrc
-3rdparty/cma
-3rdparty/compiler-rt/builtin_fp16.h
-
-
-The Unlicense
--------------
-
-3rdparty/rang
-
-BSD 3-Clause "New" or "Revised" License
----------------------------------------
-
-3rdparty/cutlass
-3rdparty/libbacktrace
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
diff --git a/src/runtime/cuda/l2_cache_flush.cc
b/src/runtime/cuda/l2_cache_flush.cc
new file mode 100644
index 0000000000..6b2c466530
--- /dev/null
+++ b/src/runtime/cuda/l2_cache_flush.cc
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "../../../3rdparty/nvbench/l2_cache_flush.h"
+
+#include <dmlc/thread_local.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/registry.h>
+
+#include "cuda_common.h"
+
+namespace tvm {
+
+namespace runtime {
+
+typedef dmlc::ThreadLocalStore<L2Flush> L2FlushStore;
+
+L2Flush* L2Flush::ThreadLocal() { return L2FlushStore::Get(); }
+
+TVM_REGISTER_GLOBAL("l2_cache_flush_cuda").set_body([](TVMArgs args,
TVMRetValue* rv) {
+ ICHECK(L2Flush::ThreadLocal() != nullptr) << "L2Flush::ThreadLocal do not
exist.";
+ cudaStream_t stream = CUDAThreadEntry::ThreadLocal()->stream;
+ L2Flush::ThreadLocal()->Flush(stream);
+});
+
+} // namespace runtime
+} // namespace tvm
diff --git a/tests/python/unittest/test_evaluator_with_preproc.py
b/tests/python/unittest/test_evaluator_with_preproc.py
new file mode 100644
index 0000000000..fc6eec25b8
--- /dev/null
+++ b/tests/python/unittest/test_evaluator_with_preproc.py
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+from tvm import te
+from tvm.script import tir as T
+import tvm.testing
+import numpy as np
+import pytest
+
+
[email protected]_func
+def matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
+ A = T.match_buffer(a, [128, 128])
+ B = T.match_buffer(b, [128, 128])
+ C = T.match_buffer(c, [128, 128])
+ for i, j, k in T.grid(128, 128, 128):
+ with T.block("matmul"):
+ vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+ with T.init():
+ C[vi, vj] = 0.0
+ C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
[email protected]_cuda
[email protected]("f_preproc", ["", "l2_cache_flush_cuda"])
+def test_time_evalutor_with_preproc(f_preproc: str):
+ mod = tvm.IRModule.from_expr(matmul)
+ sch = tvm.tir.Schedule(mod)
+ blk = sch.get_block("matmul")
+ i, j, k = sch.get_loops(blk)
+ sch.bind(i, "blockIdx.x")
+ sch.bind(j, "threadIdx.x")
+ f = tvm.build(sch.mod["main"], target="cuda")
+ dev = tvm.cuda(0)
+ evaluator = f.time_evaluator(f.entry_name, dev, repeat=1000, number=1,
f_preproc=f_preproc)
+
+ a = tvm.nd.array(np.random.rand(128, 128).astype("float32"), device=dev)
+ b = tvm.nd.array(np.random.rand(128, 128).astype("float32"), device=dev)
+ c = tvm.nd.array(np.zeros((128, 128)).astype("float32"), device=dev)
+ args = [a, b, c]
+ print("Evaluator (f_preproc={}):\t{:.5f}ms".format(f_preproc,
evaluator(*args).mean * 1000))
+
+
+if __name__ == "__main__":
+ test_time_evalutor_with_preproc("l2_cache_flush_cuda")