[tvm] branch main updated: [Profiler] Allow user to flush L2 cache in `time_evalutor` function for profiling CUDA kernels (#13726)

tkonolige Tue, 10 Jan 2023 10:17:41 -0800

This is an automated email from the ASF dual-hosted git repository.

tkonolige pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git



The following commit(s) were added to refs/heads/main by this push:
     new 92da138bcb [Profiler] Allow user to flush L2 cache in `time_evalutor` 
function for profiling CUDA kernels (#13726)
92da138bcb is described below

commit 92da138bcb18c76604763785d3177de3f9f9b296
Author: Zihao Ye <[email protected]>
AuthorDate: Wed Jan 11 02:17:24 2023 +0800

    [Profiler] Allow user to flush L2 cache in `time_evalutor` function for 
profiling CUDA kernels (#13726)
    
    Currently, our default profiler (time_evaluator) does not flush the L2 
cache per execution, this might lead to incorrect time measurement because the 
input data last run might reside in L2 cache and reduce the data fetching time 
in the next run. Both Triton and nvbench consider this effect thus reporting 
more accurate measurements.
    
    Solution: time_evalutor has an argument f_preproc where user can specify a 
pre-processing function per execution of the kernel being evaluated. Currently, 
TVM supports cache_flush_cpu_non_first_arg which flushes CPU cache. But similar 
functionality for GPU is missing.
    
    This PR completely borrows the design of nvbench's l2flush struct and allow 
the user to specify "l2_cache_flush_cuda" as a preprocessing function which 
flushes NVIDIA GPU's L2 cache. l2_cache_flush_cuda is not a default value so 
existing program's behavior would not be influenced.
---
 3rdparty/nvbench/l2_cache_flush.h                  | 74 ++++++++++++++++
 LICENSE                                            |  2 +-
 LICENSE => licenses/LICENSE.l2_cache_flush.txt     | 99 ++++++++--------------
 src/runtime/cuda/l2_cache_flush.cc                 | 42 +++++++++
 .../python/unittest/test_evaluator_with_preproc.py | 60 +++++++++++++
 5 files changed, 212 insertions(+), 65 deletions(-)

diff --git a/3rdparty/nvbench/l2_cache_flush.h 
b/3rdparty/nvbench/l2_cache_flush.h
new file mode 100644
index 0000000000..3d02115645
--- /dev/null
+++ b/3rdparty/nvbench/l2_cache_flush.h
@@ -0,0 +1,74 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ * \file l2_cache_flush.h
+ * \brief Functions to flush L2 cache using CUDA's API, adopted from nvbench.
+ */
+#ifndef L2_CACHE_FLUSH_H_
+#define L2_CACHE_FLUSH_H_
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <dmlc/logging.h>
+
+namespace tvm {
+namespace runtime {
+
+#define CUDA_CALL(func)                                       \
+  {                                                           \
+    cudaError_t e = (func);                                   \
+    ICHECK(e == cudaSuccess || e == cudaErrorCudartUnloading) \
+        << "CUDA: " << cudaGetErrorString(e);                 \
+  }
+
+class L2Flush {
+ public:
+  L2Flush() : initialized_(false), l2_size_(0), l2_buffer_(nullptr) {}
+
+  ~L2Flush() {
+    if (l2_size_ > 0) {
+      CUDA_CALL(cudaFree(l2_buffer_));
+    }
+  }
+
+  void Flush(cudaStream_t stream) {
+    if (!initialized_) {
+      // initialize l2_buffer_ and l2_size_
+      initialized_ = true;
+      int device_id;
+      CUDA_CALL(cudaGetDevice(&device_id));
+      CUDA_CALL(cudaDeviceGetAttribute(&l2_size_, cudaDevAttrL2CacheSize, 
device_id));
+      if (l2_size_ > 0) {
+        CUDA_CALL(cudaMalloc(reinterpret_cast<void**>(&l2_buffer_), l2_size_));
+      }
+    }
+    if (l2_size_ > 0) {
+      CUDA_CALL(cudaMemsetAsync(l2_buffer_, 0, l2_size_, stream));
+    }
+  }
+
+  static L2Flush* ThreadLocal();
+
+ private:
+  bool initialized_ = false;
+  int l2_size_;
+  int* l2_buffer_;
+};
+
+}  // namespace runtime
+}  // namespace tvm
+
+#endif  // L2_CACHE_FLUSH_H_
diff --git a/LICENSE b/LICENSE
index 6524d530de..fbc11be2de 100644
--- a/LICENSE
+++ b/LICENSE
@@ -212,6 +212,7 @@ Apache Software Foundation License 2.0
 3rdparty/dlpack
 3rdparty/dmlc-core
 3rdparty/OpenCL-Headers
+3rdparty/nvbench (with LLVM exception)
 
 
 BSD 2-clause License
@@ -234,7 +235,6 @@ MIT License
 3rdparty/cma
 3rdparty/compiler-rt/builtin_fp16.h
 
-
 The Unlicense
 -------------
 
diff --git a/LICENSE b/licenses/LICENSE.l2_cache_flush.txt
similarity index 82%
copy from LICENSE
copy to licenses/LICENSE.l2_cache_flush.txt
index 6524d530de..bd8b243dfa 100644
--- a/LICENSE
+++ b/licenses/LICENSE.l2_cache_flush.txt
@@ -2,9 +2,9 @@
                            Version 2.0, January 2004
                         http://www.apache.org/licenses/
 
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
 
-   1. Definitions.
+    1. Definitions.
 
       "License" shall mean the terms and conditions for use, reproduction,
       and distribution as defined by Sections 1 through 9 of this document.
@@ -63,14 +63,14 @@
       on behalf of whom a Contribution has been received by Licensor and
       subsequently incorporated within the Work.
 
-   2. Grant of Copyright License. Subject to the terms and conditions of
+    2. Grant of Copyright License. Subject to the terms and conditions of
       this License, each Contributor hereby grants to You a perpetual,
       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
       copyright license to reproduce, prepare Derivative Works of,
       publicly display, publicly perform, sublicense, and distribute the
       Work and such Derivative Works in Source or Object form.
 
-   3. Grant of Patent License. Subject to the terms and conditions of
+    3. Grant of Patent License. Subject to the terms and conditions of
       this License, each Contributor hereby grants to You a perpetual,
       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
       (except as stated in this section) patent license to make, have made,
@@ -86,7 +86,7 @@
       granted to You under this License for that Work shall terminate
       as of the date such litigation is filed.
 
-   4. Redistribution. You may reproduce and distribute copies of the
+    4. Redistribution. You may reproduce and distribute copies of the
       Work or Derivative Works thereof in any medium, with or without
       modifications, and in Source or Object form, provided that You
       meet the following conditions:
@@ -127,7 +127,7 @@
       reproduction, and distribution of the Work otherwise complies with
       the conditions stated in this License.
 
-   5. Submission of Contributions. Unless You explicitly state otherwise,
+    5. Submission of Contributions. Unless You explicitly state otherwise,
       any Contribution intentionally submitted for inclusion in the Work
       by You to the Licensor shall be under the terms and conditions of
       this License, without any additional terms or conditions.
@@ -135,12 +135,12 @@
       the terms of any separate license agreement you may have executed
       with Licensor regarding such Contributions.
 
-   6. Trademarks. This License does not grant permission to use the trade
+    6. Trademarks. This License does not grant permission to use the trade
       names, trademarks, service marks, or product names of the Licensor,
       except as required for reasonable and customary use in describing the
       origin of the Work and reproducing the content of the NOTICE file.
 
-   7. Disclaimer of Warranty. Unless required by applicable law or
+    7. Disclaimer of Warranty. Unless required by applicable law or
       agreed to in writing, Licensor provides the Work (and each
       Contributor provides its Contributions) on an "AS IS" BASIS,
       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
@@ -150,7 +150,7 @@
       appropriateness of using or redistributing the Work and assume any
       risks associated with Your exercise of permissions under this License.
 
-   8. Limitation of Liability. In no event and under no legal theory,
+    8. Limitation of Liability. In no event and under no legal theory,
       whether in tort (including negligence), contract, or otherwise,
       unless required by applicable law (such as deliberate and grossly
       negligent acts) or agreed to in writing, shall any Contributor be
@@ -162,7 +162,7 @@
       other commercial damages or losses), even if such Contributor
       has been advised of the possibility of such damages.
 
-   9. Accepting Warranty or Additional Liability. While redistributing
+    9. Accepting Warranty or Additional Liability. While redistributing
       the Work or Derivative Works thereof, You may choose to offer,
       and charge a fee for, acceptance of support, warranty, indemnity,
       or other liability obligations and/or rights consistent with this
@@ -173,12 +173,12 @@
       incurred by, or claims asserted against, such Contributor by reason
       of your accepting any such warranty or additional liability.
 
-   END OF TERMS AND CONDITIONS
+    END OF TERMS AND CONDITIONS
 
-   APPENDIX: How to apply the Apache License to your work.
+    APPENDIX: How to apply the Apache License to your work.
 
       To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "{}"
+      boilerplate notice, with the fields enclosed by brackets "[]"
       replaced with your own identifying information. (Don't include
       the brackets!)  The text should be enclosed in the appropriate
       comment syntax for the file format. We also recommend that a
@@ -186,62 +186,33 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright {yyyy} {name of copyright owner}
+    Copyright [yyyy] [name of copyright owner]
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+    Licensed under the Apache License, Version 2.0 (the "License");
+    you may not use this file except in compliance with the License.
+    You may obtain a copy of the License at
 
        http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
 
-------------------------------------------------------------------------------------
-This product bundles various third-party components under other open source 
licenses.
-This section summarizes those components and their licenses. See licenses/
-for text of these licenses.
 
+--- LLVM Exceptions to the Apache 2.0 License ----
 
-Apache Software Foundation License 2.0
---------------------------------------
+As an exception, if, as a result of your compiling your source code, portions
+of this Software are embedded into an Object form of such source code, you
+may redistribute such embedded portions in such Object form without complying
+with the conditions of Sections 4(a), 4(b) and 4(d) of the License.
 
-3rdparty/dlpack
-3rdparty/dmlc-core
-3rdparty/OpenCL-Headers
-
-
-BSD 2-clause License
---------------------
-
-3rdparty/picojson
-3rdparty/dmlc-core/include/dmlc/concurrentqueue.h
-
-
-BSD 2-clause License + zlib License
------------------------------------
-
-3rdparty/dmlc-core/include/dmlc/blockingconcurrentqueue.h
-
-
-MIT License
------------
-
-3rdparty/libcrc
-3rdparty/cma
-3rdparty/compiler-rt/builtin_fp16.h
-
-
-The Unlicense
--------------
-
-3rdparty/rang
-
-BSD 3-Clause "New" or "Revised" License
----------------------------------------
-
-3rdparty/cutlass
-3rdparty/libbacktrace
+In addition, if you combine or link compiled forms of this Software with
+software that is licensed under the GPLv2 ("Combined Software") and if a
+court of competent jurisdiction determines that the patent provision (Section
+3), the indemnity provision (Section 9) or other Section of the License
+conflicts with the conditions of the GPLv2, you may retroactively and
+prospectively choose to deem waived or otherwise exclude such Section(s) of
+the License, but only in their entirety and only with respect to the Combined
+Software.
diff --git a/src/runtime/cuda/l2_cache_flush.cc 
b/src/runtime/cuda/l2_cache_flush.cc
new file mode 100644
index 0000000000..6b2c466530
--- /dev/null
+++ b/src/runtime/cuda/l2_cache_flush.cc
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include "../../../3rdparty/nvbench/l2_cache_flush.h"
+
+#include <dmlc/thread_local.h>
+#include <tvm/runtime/device_api.h>
+#include <tvm/runtime/registry.h>
+
+#include "cuda_common.h"
+
+namespace tvm {
+
+namespace runtime {
+
+typedef dmlc::ThreadLocalStore<L2Flush> L2FlushStore;
+
+L2Flush* L2Flush::ThreadLocal() { return L2FlushStore::Get(); }
+
+TVM_REGISTER_GLOBAL("l2_cache_flush_cuda").set_body([](TVMArgs args, 
TVMRetValue* rv) {
+  ICHECK(L2Flush::ThreadLocal() != nullptr) << "L2Flush::ThreadLocal do not 
exist.";
+  cudaStream_t stream = CUDAThreadEntry::ThreadLocal()->stream;
+  L2Flush::ThreadLocal()->Flush(stream);
+});
+
+}  // namespace runtime
+}  // namespace tvm
diff --git a/tests/python/unittest/test_evaluator_with_preproc.py 
b/tests/python/unittest/test_evaluator_with_preproc.py
new file mode 100644
index 0000000000..fc6eec25b8
--- /dev/null
+++ b/tests/python/unittest/test_evaluator_with_preproc.py
@@ -0,0 +1,60 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+import tvm
+from tvm import te
+from tvm.script import tir as T
+import tvm.testing
+import numpy as np
+import pytest
+
+
[email protected]_func
+def matmul(a: T.handle, b: T.handle, c: T.handle) -> None:
+    A = T.match_buffer(a, [128, 128])
+    B = T.match_buffer(b, [128, 128])
+    C = T.match_buffer(c, [128, 128])
+    for i, j, k in T.grid(128, 128, 128):
+        with T.block("matmul"):
+            vi, vj, vk = T.axis.remap("SSR", [i, j, k])
+            with T.init():
+                C[vi, vj] = 0.0
+            C[vi, vj] = C[vi, vj] + A[vi, vk] * B[vk, vj]
+
+
[email protected]_cuda
[email protected]("f_preproc", ["", "l2_cache_flush_cuda"])
+def test_time_evalutor_with_preproc(f_preproc: str):
+    mod = tvm.IRModule.from_expr(matmul)
+    sch = tvm.tir.Schedule(mod)
+    blk = sch.get_block("matmul")
+    i, j, k = sch.get_loops(blk)
+    sch.bind(i, "blockIdx.x")
+    sch.bind(j, "threadIdx.x")
+    f = tvm.build(sch.mod["main"], target="cuda")
+    dev = tvm.cuda(0)
+    evaluator = f.time_evaluator(f.entry_name, dev, repeat=1000, number=1, 
f_preproc=f_preproc)
+
+    a = tvm.nd.array(np.random.rand(128, 128).astype("float32"), device=dev)
+    b = tvm.nd.array(np.random.rand(128, 128).astype("float32"), device=dev)
+    c = tvm.nd.array(np.zeros((128, 128)).astype("float32"), device=dev)
+    args = [a, b, c]
+    print("Evaluator (f_preproc={}):\t{:.5f}ms".format(f_preproc, 
evaluator(*args).mean * 1000))
+
+
+if __name__ == "__main__":
+    test_time_evalutor_with_preproc("l2_cache_flush_cuda")

[tvm] branch main updated: [Profiler] Allow user to flush L2 cache in `time_evalutor` function for profiling CUDA kernels (#13726)

Reply via email to