This is an automated email from the ASF dual-hosted git repository.
junrushao pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tvm.git
The following commit(s) were added to refs/heads/main by this push:
new f8b320f523 [MetaSchedule][Runtime] Enhance Runner RandomFill (#11758)
f8b320f523 is described below
commit f8b320f523b24fd8ddb8cf7026e61bbb4f4ea348
Author: Junru Shao <[email protected]>
AuthorDate: Sat Jun 18 02:11:55 2022 -0700
[MetaSchedule][Runtime] Enhance Runner RandomFill (#11758)
---
CMakeLists.txt | 1 +
cmake/config.cmake | 3 +
cmake/modules/CUDA.cmake | 12 +++
cmake/modules/LibInfo.cmake | 1 +
cmake/utils/FindCUDA.cmake | 5 ++
docs/contribute/pull_request.rst | 1 +
python/tvm/auto_scheduler/testing/tune_onnx.py | 10 +--
python/tvm/auto_scheduler/testing/tune_relay.py | 10 +--
python/tvm/auto_scheduler/testing/tune_te.py | 10 +--
python/tvm/meta_schedule/runner/local_runner.py | 48 +++++-----
python/tvm/meta_schedule/runner/rpc_runner.py | 50 ++++++-----
python/tvm/meta_schedule/testing/tune_onnx.py | 8 +-
python/tvm/meta_schedule/testing/tune_relay.py | 8 +-
python/tvm/meta_schedule/testing/tune_te.py | 8 +-
src/runtime/contrib/curand/curand.cc | 104 ++++++++++++++++++++++
src/runtime/contrib/curand/helper_cuda_kernels.cu | 42 +++++++++
src/runtime/contrib/curand/helper_cuda_kernels.h | 41 +++++++++
src/runtime/contrib/random/mt_random_engine.cc | 103 ++++++++++++++++-----
src/runtime/contrib/random/random.cc | 15 ++++
src/support/libinfo.cc | 5 ++
20 files changed, 377 insertions(+), 108 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6931b40c66..31b0a90ef2 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -89,6 +89,7 @@ tvm_option(USE_CUDNN "Build with cuDNN" OFF)
tvm_option(USE_CUBLAS "Build with cuBLAS" OFF)
tvm_option(USE_CUTLASS "Build with CUTLASS" OFF)
tvm_option(USE_THRUST "Build with Thrust" OFF)
+tvm_option(USE_CURAND "Build with cuRAND" OFF)
tvm_option(USE_MIOPEN "Build with ROCM:MIOpen" OFF)
tvm_option(USE_ROCBLAS "Build with ROCM:RoCBLAS" OFF)
tvm_option(USE_SORT "Build with sort support" ON)
diff --git a/cmake/config.cmake b/cmake/config.cmake
index 212b565f25..b9a3aaef7d 100644
--- a/cmake/config.cmake
+++ b/cmake/config.cmake
@@ -296,6 +296,9 @@ set(USE_VTA_FPGA OFF)
# Whether use Thrust
set(USE_THRUST OFF)
+# Whether use cuRAND
+set(USE_CURAND OFF)
+
# Whether to build the TensorFlow TVMDSOOp module
set(USE_TF_TVMDSOOP OFF)
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 056ed18d44..bbbf6b89ba 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -69,6 +69,18 @@ if(USE_CUDA)
list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC})
endif(USE_THRUST)
+ if(USE_CURAND)
+ message(STATUS "Build with cuRAND support")
+ message(STATUS "${CUDA_CURAND_LIBRARY}")
+ cmake_minimum_required(VERSION 3.13) # to compile CUDA code
+ enable_language(CUDA)
+ tvm_file_glob(GLOB CONTRIB_CURAND_SRC_CC src/runtime/contrib/curand/*.cc)
+ tvm_file_glob(GLOB CONTRIB_CURAND_SRC_CU src/runtime/contrib/curand/*.cu)
+ list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CURAND_LIBRARY})
+ list(APPEND RUNTIME_SRCS ${CONTRIB_CURAND_SRC_CC})
+ list(APPEND RUNTIME_SRCS ${CONTRIB_CURAND_SRC_CU})
+ endif(USE_CURAND)
+
if(USE_GRAPH_EXECUTOR_CUDA_GRAPH)
if(NOT USE_GRAPH_EXECUTOR)
message(FATAL_ERROR "CUDA Graph is only supported by graph executor,
please set USE_GRAPH_EXECUTOR=ON")
diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake
index 06c42494a3..3b3d8a4bcc 100644
--- a/cmake/modules/LibInfo.cmake
+++ b/cmake/modules/LibInfo.cmake
@@ -111,6 +111,7 @@ function(add_lib_info src_file)
TVM_INFO_USE_TFLITE="${USE_TFLITE}"
TVM_INFO_USE_THREADS="${USE_THREADS}"
TVM_INFO_USE_THRUST="${USE_THRUST}"
+ TVM_INFO_USE_CURAND="${USE_CURAND}"
TVM_INFO_USE_VITIS_AI="${USE_VITIS_AI}"
TVM_INFO_USE_VULKAN="${USE_VULKAN}"
TVM_INFO_USE_CLML="${USE_CLML}"
diff --git a/cmake/utils/FindCUDA.cmake b/cmake/utils/FindCUDA.cmake
index 8f3f638309..607f1761ae 100644
--- a/cmake/utils/FindCUDA.cmake
+++ b/cmake/utils/FindCUDA.cmake
@@ -85,6 +85,10 @@ macro(find_cuda use_cuda use_cudnn)
PATHS ${CUDA_TOOLKIT_ROOT_DIR}
PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib
targets/x86_64-linux/lib/stubs lib64/stubs lib/x86_64-linux-gnu
NO_DEFAULT_PATH)
+ find_library(CUDA_CURAND_LIBRARY curand
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib64
+ ${CUDA_TOOLKIT_ROOT_DIR}/lib
+ NO_DEFAULT_PATH)
find_library(CUDA_CUBLAS_LIBRARY cublas
${CUDA_TOOLKIT_ROOT_DIR}/lib64
${CUDA_TOOLKIT_ROOT_DIR}/lib
@@ -134,6 +138,7 @@ macro(find_cuda use_cuda use_cudnn)
message(STATUS "Found CUDA_CUDNN_INCLUDE_DIRS=" ${CUDA_CUDNN_INCLUDE_DIRS})
message(STATUS "Found CUDA_CUDNN_LIBRARY=" ${CUDA_CUDNN_LIBRARY})
message(STATUS "Found CUDA_CUBLAS_LIBRARY=" ${CUDA_CUBLAS_LIBRARY})
+ message(STATUS "Found CUDA_CURAND_LIBRARY=" ${CUDA_CURAND_LIBRARY})
message(STATUS "Found CUDA_CUBLASLT_LIBRARY=" ${CUDA_CUBLASLT_LIBRARY})
endif(CUDA_FOUND)
endmacro(find_cuda)
diff --git a/docs/contribute/pull_request.rst b/docs/contribute/pull_request.rst
index 26989fb8e6..81852a2126 100644
--- a/docs/contribute/pull_request.rst
+++ b/docs/contribute/pull_request.rst
@@ -118,6 +118,7 @@ space. You can remove stale images that aren't used in the
presently checked-out
other worktrees using the following command:
.. code:: bash
+
docker/clear-stale-images.sh
Consult the ``--help`` for more options.
diff --git a/python/tvm/auto_scheduler/testing/tune_onnx.py
b/python/tvm/auto_scheduler/testing/tune_onnx.py
index 84ab1b48f8..5fbc875d1e 100644
--- a/python/tvm/auto_scheduler/testing/tune_onnx.py
+++ b/python/tvm/auto_scheduler/testing/tune_onnx.py
@@ -26,6 +26,7 @@ from tvm import auto_scheduler
from tvm import meta_schedule as ms
from tvm import relay
from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
+from tvm.meta_schedule.utils import cpu_count
from tvm.relay.frontend import from_onnx
from tvm.support import describe
@@ -73,11 +74,6 @@ def _parse_args():
type=str,
required=True,
)
- args.add_argument(
- "--rpc-workers",
- type=int,
- required=True,
- )
args.add_argument(
"--work-dir",
type=str,
@@ -100,7 +96,7 @@ def _parse_args():
)
args.add_argument(
"--cpu-flush",
- type=bool,
+ type=int,
required=True,
)
parsed = args.parse_args()
@@ -125,7 +121,7 @@ def main():
key=ARGS.rpc_key,
host=ARGS.rpc_host,
port=ARGS.rpc_port,
- n_parallel=ARGS.rpc_workers,
+ n_parallel=cpu_count(logical=True),
number=ARGS.number,
repeat=ARGS.repeat,
min_repeat_ms=ARGS.min_repeat_ms,
diff --git a/python/tvm/auto_scheduler/testing/tune_relay.py
b/python/tvm/auto_scheduler/testing/tune_relay.py
index 2bd7813999..58ea327ec5 100644
--- a/python/tvm/auto_scheduler/testing/tune_relay.py
+++ b/python/tvm/auto_scheduler/testing/tune_relay.py
@@ -26,6 +26,7 @@ from tvm import meta_schedule as ms
from tvm import relay
from tvm.meta_schedule.testing.custom_builder_runner import run_module_via_rpc
from tvm.meta_schedule.testing.relay_workload import get_network
+from tvm.meta_schedule.utils import cpu_count
from tvm.support import describe
@@ -66,11 +67,6 @@ def _parse_args():
type=str,
required=True,
)
- args.add_argument(
- "--rpc-workers",
- type=int,
- required=True,
- )
args.add_argument(
"--work-dir",
type=str,
@@ -98,7 +94,7 @@ def _parse_args():
)
args.add_argument(
"--cpu-flush",
- type=bool,
+ type=int,
required=True,
)
parsed = args.parse_args()
@@ -123,7 +119,7 @@ def main():
key=ARGS.rpc_key,
host=ARGS.rpc_host,
port=ARGS.rpc_port,
- n_parallel=ARGS.rpc_workers,
+ n_parallel=cpu_count(logical=True),
number=ARGS.number,
repeat=ARGS.repeat,
min_repeat_ms=ARGS.min_repeat_ms,
diff --git a/python/tvm/auto_scheduler/testing/tune_te.py
b/python/tvm/auto_scheduler/testing/tune_te.py
index 2eaddbbc08..4a6874a53d 100644
--- a/python/tvm/auto_scheduler/testing/tune_te.py
+++ b/python/tvm/auto_scheduler/testing/tune_te.py
@@ -21,6 +21,7 @@ import os
import tvm
from tvm import auto_scheduler
from tvm.meta_schedule.testing.te_workload import CONFIGS
+from tvm.meta_schedule.utils import cpu_count
from tvm.support import describe
@@ -56,11 +57,6 @@ def _parse_args():
type=str,
required=True,
)
- args.add_argument(
- "--rpc-workers",
- type=int,
- required=True,
- )
args.add_argument(
"--work-dir",
type=str,
@@ -83,7 +79,7 @@ def _parse_args():
)
args.add_argument(
"--cpu-flush",
- type=bool,
+ type=int,
required=True,
)
parsed = args.parse_args()
@@ -132,7 +128,7 @@ def main():
key=ARGS.rpc_key,
host=ARGS.rpc_host,
port=ARGS.rpc_port,
- n_parallel=ARGS.rpc_workers,
+ n_parallel=cpu_count(logical=True),
number=ARGS.number,
repeat=ARGS.repeat,
min_repeat_ms=ARGS.min_repeat_ms,
diff --git a/python/tvm/meta_schedule/runner/local_runner.py
b/python/tvm/meta_schedule/runner/local_runner.py
index d76fe0b840..2d3214f53b 100644
--- a/python/tvm/meta_schedule/runner/local_runner.py
+++ b/python/tvm/meta_schedule/runner/local_runner.py
@@ -23,17 +23,17 @@ import tvm
from ...contrib.popen_pool import PopenPoolExecutor
from ...runtime import Device, Module
+from ..profiler import Profiler
from ..utils import derived_object, get_global_func_with_default_on_worker
from .config import EvaluatorConfig
-from .runner import PyRunner, RunnerFuture, RunnerInput, RunnerResult,
PyRunnerFuture
+from .runner import PyRunner, PyRunnerFuture, RunnerFuture, RunnerInput,
RunnerResult
from .utils import (
- T_ARGUMENT_LIST,
T_ARG_INFO_JSON_OBJ_LIST,
+ T_ARGUMENT_LIST,
alloc_argument_common,
run_evaluator_common,
)
-
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
@@ -137,26 +137,29 @@ def _worker_func(
yield
finally:
# Final step. Always clean up
- f_cleanup()
+ with Profiler.timeit("LocalRunner/cleanup"):
+ f_cleanup()
with resource_handler():
# Step 1: create the local runtime module
- rt_mod = tvm.runtime.load_module(artifact_path)
- # Step 2: create the local device
- device = tvm.runtime.device(dev_type=device_type, dev_id=0)
- # Step 3: Allocate input arguments
- repeated_args: List[T_ARGUMENT_LIST] = f_alloc_argument(
- device,
- args_info,
- alloc_repeat,
- )
- # Step 4: Run time_evaluator
- costs: List[float] = f_run_evaluator(
- rt_mod,
- device,
- evaluator_config,
- repeated_args,
- )
+ with Profiler.timeit("LocalRunner/load_module"):
+ rt_mod = tvm.runtime.load_module(artifact_path)
+ # Step 2: Allocate input arguments
+ with Profiler.timeit("LocalRunner/alloc_argument"):
+ device = tvm.runtime.device(dev_type=device_type, dev_id=0)
+ repeated_args: List[T_ARGUMENT_LIST] = f_alloc_argument(
+ device,
+ args_info,
+ alloc_repeat,
+ )
+ # Step 3: Run time_evaluator
+ with Profiler.timeit("LocalRunner/run_evaluator"):
+ costs: List[float] = f_run_evaluator(
+ rt_mod,
+ device,
+ evaluator_config,
+ repeated_args,
+ )
return costs
@@ -313,9 +316,6 @@ class LocalRunner(PyRunner):
get_global_func_with_default_on_worker(name=f_alloc_argument,
default=None)
get_global_func_with_default_on_worker(name=f_run_evaluator,
default=None)
get_global_func_with_default_on_worker(name=f_cleanup,
default=None)
- get_global_func_with_default_on_worker(
- name="tvm.contrib.random.random_fill", default=None
- )
value = self.pool.submit(
_check,
@@ -348,7 +348,7 @@ def default_alloc_argument(
The allocation args
"""
f_random_fill = get_global_func_with_default_on_worker(
- name="tvm.contrib.random.random_fill", default=None
+ name="tvm.contrib.random.random_fill_for_measure", default=None
)
return alloc_argument_common(f_random_fill, device, args_info,
alloc_repeat)
diff --git a/python/tvm/meta_schedule/runner/rpc_runner.py
b/python/tvm/meta_schedule/runner/rpc_runner.py
index 9ff2489f8e..aa6f3daaac 100644
--- a/python/tvm/meta_schedule/runner/rpc_runner.py
+++ b/python/tvm/meta_schedule/runner/rpc_runner.py
@@ -25,6 +25,7 @@ from tvm.contrib.popen_pool import PopenPoolExecutor
from tvm.rpc import RPCSession
from tvm.runtime import Device, Module
+from ..profiler import Profiler
from ..utils import (
cpu_count,
derived_object,
@@ -243,7 +244,7 @@ class RPCRunner(PyRunner):
f_alloc_argument: Union[T_ALLOC_ARGUMENT, str, None] = None,
f_run_evaluator: Union[T_RUN_EVALUATOR, str, None] = None,
f_cleanup: Union[T_CLEANUP, str, None] = None,
- max_workers: Optional[int] = 1,
+ max_workers: Optional[int] = None,
initializer: Optional[Callable[[], None]] = None,
) -> None:
"""Constructor
@@ -284,7 +285,7 @@ class RPCRunner(PyRunner):
self.f_run_evaluator = f_run_evaluator
self.f_cleanup = f_cleanup
if max_workers is None:
- max_workers = cpu_count()
+ max_workers = cpu_count(logical=True)
logger.info("RPCRunner: max_workers = %d", max_workers)
self.pool = PopenPoolExecutor(
max_workers=max_workers,
@@ -378,31 +379,36 @@ def _worker_func(
yield
finally:
# Final step. Always clean up
- f_cleanup(session, remote_path)
+ with Profiler.timeit("RPCRunner/cleanup"):
+ f_cleanup(session, remote_path)
with resource_handler():
# Step 1. Create session
- session = f_create_session(rpc_config)
- device = session.device(dev_type=device_type, dev_id=0)
+ with Profiler.timeit("RPCRunner/create_session"):
+ session = f_create_session(rpc_config)
+ device = session.device(dev_type=device_type, dev_id=0)
# Step 2. Upload the module
- _, remote_path = osp.split(artifact_path)
- local_path: str = artifact_path
- rt_mod: Module = f_upload_module(session, local_path, remote_path)
+ with Profiler.timeit("RPCRunner/upload_module"):
+ _, remote_path = osp.split(artifact_path)
+ local_path: str = artifact_path
+ rt_mod: Module = f_upload_module(session, local_path, remote_path)
# Step 3: Allocate input arguments
- repeated_args: List[T_ARGUMENT_LIST] = f_alloc_argument(
- session,
- device,
- args_info,
- alloc_repeat,
- )
+ with Profiler.timeit("RPCRunner/alloc_argument"):
+ repeated_args: List[T_ARGUMENT_LIST] = f_alloc_argument(
+ session,
+ device,
+ args_info,
+ alloc_repeat,
+ )
# Step 4: Run time_evaluator
- costs: List[float] = f_run_evaluator(
- session,
- rt_mod,
- device,
- evaluator_config,
- repeated_args,
- )
+ with Profiler.timeit("LocalRunner/run_evaluator"):
+ costs: List[float] = f_run_evaluator(
+ session,
+ rt_mod,
+ device,
+ evaluator_config,
+ repeated_args,
+ )
return costs
@@ -474,7 +480,7 @@ def default_alloc_argument(
"""
f_random_fill = get_global_func_on_rpc_session(
session,
- "tvm.contrib.random.random_fill",
+ "tvm.contrib.random.random_fill_for_measure",
"Please make sure 'USE_RANDOM' is turned ON in the config.cmake on the
RPC server.",
)
diff --git a/python/tvm/meta_schedule/testing/tune_onnx.py
b/python/tvm/meta_schedule/testing/tune_onnx.py
index 1a51622b5c..88cb360c01 100644
--- a/python/tvm/meta_schedule/testing/tune_onnx.py
+++ b/python/tvm/meta_schedule/testing/tune_onnx.py
@@ -71,11 +71,6 @@ def _parse_args():
type=str,
required=True,
)
- args.add_argument(
- "--rpc-workers",
- type=int,
- required=True,
- )
args.add_argument(
"--work-dir",
type=str,
@@ -98,7 +93,7 @@ def _parse_args():
)
args.add_argument(
"--cpu-flush",
- type=bool,
+ type=int,
required=True,
)
parsed = args.parse_args()
@@ -140,7 +135,6 @@ def main():
enable_cpu_cache_flush=ARGS.cpu_flush,
),
alloc_repeat=1,
- max_workers=ARGS.rpc_workers,
)
with ms.Profiler() as profiler:
lib = ms.tune_relay(
diff --git a/python/tvm/meta_schedule/testing/tune_relay.py
b/python/tvm/meta_schedule/testing/tune_relay.py
index 6188e124fd..ce15c60c15 100644
--- a/python/tvm/meta_schedule/testing/tune_relay.py
+++ b/python/tvm/meta_schedule/testing/tune_relay.py
@@ -64,11 +64,6 @@ def _parse_args():
type=str,
required=True,
)
- args.add_argument(
- "--rpc-workers",
- type=int,
- required=True,
- )
args.add_argument(
"--work-dir",
type=str,
@@ -96,7 +91,7 @@ def _parse_args():
)
args.add_argument(
"--cpu-flush",
- type=bool,
+ type=int,
required=True,
)
parsed = args.parse_args()
@@ -141,7 +136,6 @@ def main():
enable_cpu_cache_flush=ARGS.cpu_flush,
),
alloc_repeat=1,
- max_workers=ARGS.rpc_workers,
)
with ms.Profiler() as profiler:
lib = ms.tune_relay(
diff --git a/python/tvm/meta_schedule/testing/tune_te.py
b/python/tvm/meta_schedule/testing/tune_te.py
index cbc310f999..8740d74424 100644
--- a/python/tvm/meta_schedule/testing/tune_te.py
+++ b/python/tvm/meta_schedule/testing/tune_te.py
@@ -59,11 +59,6 @@ def _parse_args():
type=str,
required=True,
)
- args.add_argument(
- "--rpc-workers",
- type=int,
- required=True,
- )
args.add_argument(
"--work-dir",
type=str,
@@ -86,7 +81,7 @@ def _parse_args():
)
args.add_argument(
"--cpu-flush",
- type=bool,
+ type=int,
required=True,
)
parsed = args.parse_args()
@@ -119,7 +114,6 @@ def main():
enable_cpu_cache_flush=ARGS.cpu_flush,
),
alloc_repeat=1,
- max_workers=ARGS.rpc_workers,
)
with ms.Profiler() as profiler:
sch: Optional[tir.Schedule] = ms.tune_tir(
diff --git a/src/runtime/contrib/curand/curand.cc
b/src/runtime/contrib/curand/curand.cc
new file mode 100644
index 0000000000..23282304f7
--- /dev/null
+++ b/src/runtime/contrib/curand/curand.cc
@@ -0,0 +1,104 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <curand.h>
+#include <tvm/runtime/c_runtime_api.h>
+#include <tvm/runtime/registry.h>
+
+#include "../../cuda/cuda_common.h"
+#include "./helper_cuda_kernels.h"
+
+namespace tvm {
+namespace runtime {
+namespace curand {
+
+#define TVM_CURAND_CALL(func) \
+ { \
+ curandStatus_t e = (func); \
+ ICHECK(e == CURAND_STATUS_SUCCESS) << "cuRAND error: " << e; \
+ }
+
+class CURandGenerator {
+ public:
+ CURandGenerator() { TVM_CURAND_CALL(curandCreateGenerator(&gen,
CURAND_RNG_PSEUDO_DEFAULT)); }
+ ~CURandGenerator() { TVM_CURAND_CALL(curandDestroyGenerator(gen)); }
+
+ void Generate32bit(void* ptr, int64_t n) {
+ TVM_CURAND_CALL(curandGenerateNormal(gen, static_cast<float*>(ptr), n,
0.0f, 5.0f));
+ cudaDeviceSynchronize();
+ }
+
+ void Generate64bit(void* ptr, int64_t n) {
+ TVM_CURAND_CALL(curandGenerateNormalDouble(gen, static_cast<double*>(ptr),
n, 0.0f, 5.0f));
+ }
+
+ curandGenerator_t gen;
+};
+
+DeviceAPI* GetCUDADeviceAPI() {
+ const PackedFunc* get_cuda_api = runtime::Registry::Get("device_api.cuda");
+ ICHECK(get_cuda_api) << "ValueError: TVM is not built with USE_CUDA=ON";
+ void* ret = (*get_cuda_api)();
+ runtime::DeviceAPI* cuda_api = static_cast<runtime::DeviceAPI*>(ret);
+ return cuda_api;
+}
+
+int64_t GetTensorSize(DLTensor* tensor) {
+ int64_t tensor_size = 1;
+ for (int i = 0; i < tensor->ndim; ++i) {
+ tensor_size *= tensor->shape[i];
+ }
+ return tensor_size;
+}
+
+struct DeferredFunc {
+ public:
+ explicit DeferredFunc(std::function<void()> func) : func_(func) {}
+ ~DeferredFunc() { func_(); }
+
+ private:
+ std::function<void()> func_;
+};
+
+void RandomFill(DLTensor* tensor) {
+ static DeviceAPI* cuda_api = GetCUDADeviceAPI();
+ CHECK(tensor->device.device_type == DLDeviceType::kDLCUDA)
+ << "ValueError: cuRAND only works on CUDA devices";
+ if (tensor->dtype.code == DLDataTypeCode::kDLFloat && tensor->dtype.bits ==
16) {
+ int64_t tensor_size = GetTensorSize(tensor);
+ void* data = cuda_api->AllocWorkspace(tensor->device, tensor_size *
sizeof(float));
+ {
+ DeferredFunc defer([data, tensor]() {
cuda_api->FreeWorkspace(tensor->device, data); });
+ CURandGenerator().Generate32bit(data, GetTensorSize(tensor));
+ ConvertFp32toFp16(/*src=*/data, /*dst=*/tensor->data,
/*num=*/tensor_size);
+ }
+ } else if (tensor->dtype.code == DLDataTypeCode::kDLFloat &&
tensor->dtype.bits == 32) {
+ CURandGenerator().Generate32bit(tensor->data, GetTensorSize(tensor));
+ } else if (tensor->dtype.code == DLDataTypeCode::kDLFloat &&
tensor->dtype.bits == 64) {
+ CURandGenerator().Generate64bit(tensor->data, GetTensorSize(tensor));
+ } else {
+ LOG(FATAL) << "ValueError: Unsupported dtype: " << tensor->dtype;
+ }
+ TVMSynchronize(tensor->device.device_type, tensor->device.device_type,
nullptr);
+}
+
+TVM_REGISTER_GLOBAL("runtime.contrib.curand.RandomFill").set_body_typed(RandomFill);
+
+} // namespace curand
+} // namespace runtime
+} // namespace tvm
diff --git a/src/runtime/contrib/curand/helper_cuda_kernels.cu
b/src/runtime/contrib/curand/helper_cuda_kernels.cu
new file mode 100644
index 0000000000..a08fc09441
--- /dev/null
+++ b/src/runtime/contrib/curand/helper_cuda_kernels.cu
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#include <cuda_fp16.h>
+
+#include "./helper_cuda_kernels.h"
+
+namespace tvm {
+namespace runtime {
+namespace curand {
+
+__global__ void KernelFp32ToFp16(const float* src, half* dst, int num) {
+ int idx = blockDim.x * blockIdx.x + threadIdx.x;
+ if (idx < num) {
+ dst[idx] = src[idx];
+ }
+}
+
+void ConvertFp32toFp16(const void* _src, void* _dst, int64_t num) {
+ const float* src = static_cast<const float*>(_src);
+ half* dst = static_cast<half*>(_dst);
+ KernelFp32ToFp16<<<(num + 255) / 256, 256>>>(src, dst, num);
+}
+
+} // namespace curand
+} // namespace runtime
+} // namespace tvm
diff --git a/src/runtime/contrib/curand/helper_cuda_kernels.h
b/src/runtime/contrib/curand/helper_cuda_kernels.h
new file mode 100644
index 0000000000..582162579a
--- /dev/null
+++ b/src/runtime/contrib/curand/helper_cuda_kernels.h
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef TVM_RUNTIME_CONTRIB_CURAND_HELPER_CUDA_KERNELS_H_
+#define TVM_RUNTIME_CONTRIB_CURAND_HELPER_CUDA_KERNELS_H_
+
+#include <curand.h>
+#include <tvm/runtime/registry.h>
+
+namespace tvm {
+namespace runtime {
+namespace curand {
+
+/*!
+ * \brief An auxiliary function to convert an FP32 array to FP16.
+ * \param src The source FP32 array.
+ * \param dst The destination FP16 array.
+ * \param num The number of elements in the array.
+ */
+void ConvertFp32toFp16(const void* src, void* dst, int64_t num);
+
+} // namespace curand
+} // namespace runtime
+} // namespace tvm
+
+#endif // TVM_RUNTIME_CONTRIB_CURAND_HELPER_CUDA_KERNELS_H_
diff --git a/src/runtime/contrib/random/mt_random_engine.cc
b/src/runtime/contrib/random/mt_random_engine.cc
index 161ae62220..ac52594360 100644
--- a/src/runtime/contrib/random/mt_random_engine.cc
+++ b/src/runtime/contrib/random/mt_random_engine.cc
@@ -21,13 +21,16 @@
* \file random/mt_random_engine.cc
* \brief mt19937 random engine
*/
+#include <tvm/runtime/c_backend_api.h>
#include <tvm/runtime/device_api.h>
#include <tvm/runtime/logging.h>
#include <tvm/runtime/ndarray.h>
+#include <tvm/runtime/threading_backend.h>
#include <algorithm>
#include <ctime>
#include <random>
+#include <thread>
#include "../3rdparty/compiler-rt/builtin_fp16.h"
@@ -116,52 +119,112 @@ class RandomEngine {
}
void RandomFill(DLTensor* data) {
- int64_t size = 1;
- for (int i = 0; i < data->ndim; ++i) {
- size *= data->shape[i];
+ if (data->device.device_type == kDLCPU) {
+ FillData(data);
+ } else {
+ runtime::NDArray local = runtime::NDArray::Empty(
+ std::vector<int64_t>{data->shape, data->shape + data->ndim},
data->dtype, {kDLCPU, 0});
+ DLTensor* tensor = const_cast<DLTensor*>(local.operator->());
+ FillData(tensor);
+ runtime::NDArray::CopyFromTo(tensor, data);
}
+ }
+ void RandomFillForMeasure(DLTensor* data) {
if (data->device.device_type == kDLCPU) {
- FillData(data, size);
+ FillDataForMeasure(data);
} else {
runtime::NDArray local = runtime::NDArray::Empty(
std::vector<int64_t>{data->shape, data->shape + data->ndim},
data->dtype, {kDLCPU, 0});
DLTensor* tensor = const_cast<DLTensor*>(local.operator->());
- FillData(tensor, size);
+ FillDataForMeasure(tensor);
runtime::NDArray::CopyFromTo(tensor, data);
}
}
private:
- void FillData(DLTensor* tensor, int64_t size) {
+ void FillDataImpl(void* data, int64_t st, int64_t ed, DLDataType dtype) {
// Make the value be 1.0 - 10.0, not (0.0 - 1.0) so that we could satisfy
// quantized dtype (uint8 / int8) data non-empty requirement
std::uniform_real_distribution<> dist(1.0, 10.0);
// Use float representation could make us work well on float / int type
too.
- if (tensor->dtype.bits == 1) {
- std::generate_n(static_cast<bool*>(tensor->data), size, [&]() { return
dist(rnd_engine_); });
- } else if (tensor->dtype.bits == 4) {
+ if (dtype.bits == 1) {
+ std::generate_n(static_cast<bool*>(data) + st, ed - st, [&]() { return
dist(rnd_engine_); });
+ } else if (dtype.bits == 4) {
// For uint4/int4 we pack two values into a single byte.
// Thus, to ensure both values are non-zero, we use a distribution of 17
- 30.
std::uniform_real_distribution<> packed_dist(17.0, 30.0);
- std::generate_n(reinterpret_cast<uint8_t*>(tensor->data), size,
+ std::generate_n(reinterpret_cast<uint8_t*>(data) + st, ed - st,
[&]() { return packed_dist(rnd_engine_); });
- } else if (tensor->dtype.bits == 8) {
- std::generate_n(static_cast<uint8_t*>(tensor->data), size,
+ } else if (dtype.bits == 8) {
+ std::generate_n(static_cast<uint8_t*>(data) + st, ed - st,
[&]() { return dist(rnd_engine_); });
- } else if (tensor->dtype.bits == 16) {
- std::generate_n(static_cast<uint16_t*>(tensor->data), size, [&]() {
+ } else if (dtype.bits == 16) {
+ std::generate_n(static_cast<uint16_t*>(data) + st, ed - st, [&]() {
return __truncXfYf2__<float, uint32_t, 23, uint16_t, uint16_t, 10>(
static_cast<float>(dist(rnd_engine_)));
});
- } else if (tensor->dtype.bits == 32) {
- std::generate_n(static_cast<float*>(tensor->data), size, [&]() { return
dist(rnd_engine_); });
- } else if (tensor->dtype.bits == 64) {
- std::generate_n(static_cast<double*>(tensor->data), size,
+ } else if (dtype.bits == 32) {
+ std::generate_n(static_cast<float*>(data) + st, ed - st, [&]() { return
dist(rnd_engine_); });
+ } else if (dtype.bits == 64) {
+ std::generate_n(static_cast<double*>(data) + st, ed - st,
[&]() { return dist(rnd_engine_); });
} else {
- LOG(FATAL) << "Doesn't support dtype code " << tensor->dtype.code << "
dtype bits "
- << tensor->dtype.bits;
+ LOG(FATAL) << "Doesn't support dtype code " << dtype.code << " dtype
bits " << dtype.bits;
+ }
+ }
+
+ void FillData(DLTensor* tensor) {
+ int64_t size = 1;
+ for (int i = 0; i < tensor->ndim; ++i) {
+ size *= tensor->shape[i];
+ }
+ DLDataType dtype = tensor->dtype;
+ if (dtype.bits == 1 || dtype.bits == 4 || dtype.bits == 8 || dtype.bits ==
16 ||
+ dtype.bits == 32 || dtype.bits == 64) {
+ FillDataImpl(tensor->data, 0, size, dtype);
+ } else {
+ LOG(FATAL) << "Doesn't support dtype code " << dtype.code << " dtype
bits " << dtype.bits;
+ }
+ }
+
+ void FillDataForMeasure(DLTensor* tensor) {
+ struct ParallelTask {
+ static int RunTask(int task_id, TVMParallelGroupEnv* penv, void* cdata) {
+ ParallelTask* task = static_cast<ParallelTask*>(cdata);
+ task->Run(task_id);
+ return 0;
+ }
+
+ void Run(int i) {
+ int64_t chunk_size = size / num_threads;
+ int64_t st = i * chunk_size;
+ int64_t ed = std::min(st + chunk_size, size);
+ self->FillDataImpl(data, st, ed, dtype);
+ }
+
+ RandomEngine* self;
+ void* data;
+ int num_threads;
+ int64_t size;
+ DLDataType dtype;
+ };
+
+ ParallelTask task;
+ task.self = this;
+ task.data = tensor->data;
+ DLDataType dtype = task.dtype = tensor->dtype;
+ int64_t& size = task.size = 1;
+ for (int i = 0; i < tensor->ndim; ++i) {
+ size *= tensor->shape[i];
+ }
+ if (dtype.bits == 1 || dtype.bits == 4 || dtype.bits == 8 || dtype.bits ==
16 ||
+ dtype.bits == 32 || dtype.bits == 64) {
+ int num_threads = task.num_threads =
runtime::threading::MaxConcurrency();
+ int res = TVMBackendParallelLaunch(ParallelTask::RunTask, &task,
num_threads);
+ ICHECK_EQ(res, 0) << "RandomFillForMeasure: TVMBackendParallelLaunch
failed";
+ } else {
+ LOG(FATAL) << "Doesn't support dtype code " << dtype.code << " dtype
bits " << dtype.bits;
}
}
diff --git a/src/runtime/contrib/random/random.cc
b/src/runtime/contrib/random/random.cc
index 2cb56b87fd..38c2de6555 100644
--- a/src/runtime/contrib/random/random.cc
+++ b/src/runtime/contrib/random/random.cc
@@ -24,6 +24,7 @@
#include <tvm/runtime/data_type.h>
#include <tvm/runtime/logging.h>
#include <tvm/runtime/registry.h>
+#include <tvm/runtime/threading_backend.h>
#include <algorithm>
@@ -123,5 +124,19 @@
TVM_REGISTER_GLOBAL("tvm.contrib.random.random_fill").set_body([](TVMArgs args,
entry->random_engine.RandomFill(out);
});
+TVM_REGISTER_GLOBAL("tvm.contrib.random.random_fill_for_measure")
+ .set_body([](TVMArgs args, TVMRetValue* ret) -> void {
+ static const PackedFunc* curand =
Registry::Get("runtime.contrib.curand.RandomFill");
+ DLTensor* out = args[0];
+ if (curand && out->device.device_type == DLDeviceType::kDLCUDA) {
+ if (out->dtype.code == DLDataTypeCode::kDLFloat) {
+ (*curand)(out);
+ return;
+ }
+ }
+ RandomThreadLocalEntry* entry = RandomThreadLocalEntry::ThreadLocal();
+ entry->random_engine.RandomFillForMeasure(out);
+ });
+
} // namespace contrib
} // namespace tvm
diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc
index be0cd9eb8f..6f0a6114f3 100644
--- a/src/support/libinfo.cc
+++ b/src/support/libinfo.cc
@@ -163,6 +163,10 @@
#define TVM_INFO_USE_THRUST "NOT-FOUND"
#endif
+#ifndef TVM_INFO_USE_CURAND
+#define TVM_INFO_USE_CURAND "NOT-FOUND"
+#endif
+
#ifndef TVM_INFO_USE_MIOPEN
#define TVM_INFO_USE_MIOPEN "NOT-FOUND"
#endif
@@ -308,6 +312,7 @@ TVM_DLL Map<String, String> GetLibInfo() {
{"USE_TFLITE", TVM_INFO_USE_TFLITE},
{"USE_THREADS", TVM_INFO_USE_THREADS},
{"USE_THRUST", TVM_INFO_USE_THRUST},
+ {"USE_CURAND", TVM_INFO_USE_CURAND},
{"USE_VITIS_AI", TVM_INFO_USE_VITIS_AI},
{"USE_VULKAN", TVM_INFO_USE_VULKAN},
{"USE_CLML", TVM_INFO_USE_CLML},