This is an automated email from the ASF dual-hosted git repository.
zhaowu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/incubator-tvm.git
The following commit(s) were added to refs/heads/master by this push:
new 29ae608 [RUNTIME] Hexagon driver for offloading kernels to simulator
(#5492)
29ae608 is described below
commit 29ae608baabb673fb0b2d3ced9321f0e1798f72e
Author: Krzysztof Parzyszek <[email protected]>
AuthorDate: Sun May 10 22:05:59 2020 -0500
[RUNTIME] Hexagon driver for offloading kernels to simulator (#5492)
* [RUNTIME] Hexagon driver for offloading kernels to simulator
* Add sim_dev as external project when building with Hexagon/sim support
* Change target CPU for sim_dev to v60
---
cmake/modules/Hexagon.cmake | 9 +
src/runtime/hexagon/sim/driver/CMakeLists.txt | 62 +++
src/runtime/hexagon/sim/driver/README.md | 38 ++
src/runtime/hexagon/sim/driver/fake_pthread.cc | 292 +++++++++++++
src/runtime/hexagon/sim/driver/pthread.h | 96 +++++
src/runtime/hexagon/sim/driver/sched.h | 31 ++
src/runtime/hexagon/sim/driver/sim_device.cc | 573 +++++++++++++++++++++++++
src/runtime/threading_backend.cc | 11 +
8 files changed, 1112 insertions(+)
diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index e70a964..30b4ccb 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.
+include(ExternalProject)
+
set(PICK_SIM "sim")
set(PICK_HW "target")
set(PICK_NONE "OFF")
@@ -77,6 +79,13 @@ if(USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}")
include_directories("${HEXAGON_TOOLCHAIN}/include/iss")
link_directories("${HEXAGON_TOOLCHAIN}/lib/iss")
list(APPEND TVM_RUNTIME_LINKER_LIBS "-lwrapper")
+  # Build the simulator-side driver (sim_dev) as a separate project, since it
+  # is compiled with the Hexagon toolchain compilers passed in CMAKE_ARGS.
+  ExternalProject_Add(sim_dev
+    SOURCE_DIR "${CMAKE_SOURCE_DIR}/src/runtime/hexagon/sim/driver"
+    CMAKE_ARGS
+      "-DCMAKE_C_COMPILER=${HEXAGON_TOOLCHAIN}/bin/hexagon-clang"
+      "-DCMAKE_CXX_COMPILER=${HEXAGON_TOOLCHAIN}/bin/hexagon-clang++"
+    # An empty command disables the install step without depending on a
+    # "true" executable being available on the build host.
+    INSTALL_COMMAND ""
+  )
elseif(USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}")
find_hexagon_sdk_root()
find_hexagon_toolchain()
diff --git a/src/runtime/hexagon/sim/driver/CMakeLists.txt
b/src/runtime/hexagon/sim/driver/CMakeLists.txt
new file mode 100644
index 0000000..8632b49
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/CMakeLists.txt
@@ -0,0 +1,62 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# cmake_minimum_required() has to come before project(), so that the policy
+# defaults are already in effect when the languages are enabled.
+cmake_minimum_required(VERSION 3.0.2)
+project(SIM_DEV C CXX)
+
+set(CMAKE_SYSTEM_NAME "Linux")
+
+# Optional per-build settings: included only if the file exists in the
+# build directory.
+if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/config.cmake")
+  include("${CMAKE_CURRENT_BINARY_DIR}/config.cmake")
+endif()
+
+set(EXTRA_CXX_FLAGS
+  "-O2"
+  "-Wno-format"
+  "-mhvx -mhvx-length=128b"
+  "-mv60"
+  "-stdlib=libc++"
+)
+
+set(EXTRA_LINK_FLAGS
+  "-stdlib=libc++"
+  "-G0"
+  "-Wl,--force-dynamic"
+  "-Wl,--export-dynamic"
+  "-Wl,--whole-archive"   # This should link entire libc, libc++ and libc++abi.
+  "-Wl,--defsym=HEAP_SIZE=0x40000000"
+)
+
+# CMAKE_CXX_FLAGS and CMAKE_EXE_LINKER_FLAGS are plain strings, so turn the
+# ;-separated lists above into space-separated strings.
+string(REGEX REPLACE ";" " " EXTRA_CXX_FLAGS_STR "${EXTRA_CXX_FLAGS}")
+string(REGEX REPLACE ";" " " EXTRA_LINK_FLAGS_STR "${EXTRA_LINK_FLAGS}")
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_FLAGS "${EXTRA_CXX_FLAGS_STR} ${CMAKE_CXX_FLAGS}")
+set(CMAKE_EXE_LINKER_FLAGS "${EXTRA_LINK_FLAGS_STR} ${CMAKE_EXE_LINKER_FLAGS}")
+
+# Set project properties.
+
+file(GLOB SOURCE_FILES "*.cc")
+add_executable(sim_dev ${SOURCE_FILES})
+target_include_directories(sim_dev
+  PUBLIC "."
+  PUBLIC ".."
+  PUBLIC "../../../../../include"
+  PUBLIC "../../../../../3rdparty/dlpack/include"
+)
+
+# Use the keyword signature; sim_dev is an executable, so PRIVATE is enough.
+target_link_libraries(sim_dev PRIVATE "-ldl")
diff --git a/src/runtime/hexagon/sim/driver/README.md
b/src/runtime/hexagon/sim/driver/README.md
new file mode 100644
index 0000000..3aee1a1
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/README.md
@@ -0,0 +1,38 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements. See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership. The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License. You may obtain a copy of the License at -->
+
+<!--- http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied. See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
+# Hexagon simulator driver
+
+The driver (`sim_dev` executable) is the process running on the Hexagon
simulator that handles the Hexagon-side communication with the TVM runtime
running on x86. The location of `sim_dev` should be added to `PATH` before
running any Python code that uses Hexagon. The `sim_dev` executable is not
intended to be run by users; it is automatically loaded by the simulator
control code (in `hexagon_device_sim.cc`).
+
+### Prerequisites
+
+1. Hexagon C/C++ toolchain (such as the one in Hexagon SDK version 3.5.0 or
later).
+
+Hexagon SDK is available at https://developer.qualcomm.com/software/hexagon-dsp-sdk.
+
+### Configuring
+
+Set
+```
+CMAKE_C_COMPILER=hexagon-clang
+CMAKE_CXX_COMPILER=hexagon-clang++
+```
+
+### Building
+
+There are no special options required for `make` (or the tool selected with
`cmake`). The location of the resulting binary `sim_dev` should be added to
`PATH`.
diff --git a/src/runtime/hexagon/sim/driver/fake_pthread.cc
b/src/runtime/hexagon/sim/driver/fake_pthread.cc
new file mode 100644
index 0000000..74090d0
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/fake_pthread.cc
@@ -0,0 +1,292 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <cassert>
+#include <cerrno>
+#include <csetjmp>
+#include <cstddef>
+#include <cstdlib>
+#include <map>
+#include <vector>
+
+#include "pthread.h"
+#include "sched.h"
+
+/*!
+ * Implementation of a subset of pthread API for single-threaded execution.
+ *
+ * The main idea is that the thread function ("start_routine" in the call
+ * to pthread_create) is executed immediately. When pthread_create returns,
+ * the thread function has already finished.
+ *
+ * Since the thread routine can itself call pthread_create, it is possible
+ * to have multiple threads existing at the same time, although only the
+ * last one is running.
+ *
+ * There are two main things that need to be taken care of:
+ * - thread-specific data, i.e. pthread_setspecific, pthread_getspecific,
+ * and the handling of thread keys,
+ * - handling of thread return values.
+ *
+ * Threads are identified by thread ids (of type pthread_t). The main process
+ * thread has the id of 0, the remaining threads have ids starting at 1 and
+ * incrementing by 1. For each thread there is some data (thread_info_t)
+ * associated with it, and stored in "thread_data" map. When a thread
+ * terminates, the corresponding entry from "thread_data" cannot be removed
+ * until the return value is claimed (pthread_join), unless it is explicitly
+ * discarded (pthread_detach). When a new thread is created, it gets the
+ * first available id for which there is no entry in "thread_data". This
+ * could be an id that was never allocated, or an id that was used, but
+ * has since been removed from the map.
+ * A thread can terminate through pthread_exit. This means that when the
+ * thread function calls pthread_exit, the execution should return to the
+ * pthread_create call that ran it. This is implemented via setjmp/longjmp
+ * (neither longjmp nor pthread_exit unwind the stack).
+ *
+ * Any mutexes or condition variables cannot block, or else it would cause
+ * a deadlock. Since there is only one thread running at a time, locking
+ * a mutex or waiting for a condition always succeeds (returns immediately).
+ */
+
+// One thread-specific data slot: the value currently stored under a key and
+// the destructor that was supplied when the key was created.
+struct key_entry_t {
+ // Both members are always given explicitly; there is no default constructor.
+ key_entry_t(void* v, void (*d)(void*)) : value(v), dtor(d) {}
+ // Value stored under the key (nullptr until something is stored).
+ void* value = nullptr;
+ // Destructor registered for the key; may be nullptr.
+ void (*dtor)(void*) = nullptr;
+};
+
+// Bookkeeping for a single emulated thread. Instances are the mapped values
+// of the "thread_data" map.
+struct thread_info_t {
+ thread_info_t() = default;
+ // Thread-specific data slots, indexed by key.
+ std::map<pthread_key_t, key_entry_t> keys;
+ // Jump buffer used with setjmp/longjmp to implement pthread_exit.
+ std::jmp_buf env;
+ // Value returned by the thread function, or passed to pthread_exit.
+ void* ret_value = nullptr;
+ // Set once the thread function is done.
+ bool finished = false;
+ // Set by pthread_detach: the return value will not be claimed.
+ bool detached = false;
+};
+
+static pthread_t main_thread_id = 0;
+
+static std::map<pthread_t, thread_info_t> thread_data = {
+ // Reserve the 0th entry.
+ {main_thread_id, {}}};
+
+static std::vector<pthread_t> running_threads = {main_thread_id};
+
+/*!
+ * Return the smallest key >= 1 that is not present in the map.
+ *
+ * Keys smaller than 1 (such as the reserved entry 0 for the main thread in
+ * "thread_data") are skipped, so that they do not stop the scan before it
+ * has looked at the entries that actually matter.
+ */
+template <typename K, typename V>
+K first_available_key(const std::map<K, V>& m) {
+  K key = 1;
+  // Start at the first entry whose key is >= 1, then walk the consecutive
+  // run of used keys. The first gap (or the end of the map) is the answer.
+  auto i = m.lower_bound(key), e = m.end();
+  for (; i != e && key == i->first; ++i, ++key) {
+  }
+  return key;
+}
+
+int pthread_cond_destroy(pthread_cond_t* cond) { return 0; }
+
+int pthread_cond_init(pthread_cond_t* __restrict cond,
+ const pthread_condattr_t* __restrict attr) {
+ return 0;
+}
+
+int pthread_cond_signal(pthread_cond_t* cond) { return 0; }
+
+int pthread_cond_broadcast(pthread_cond_t* cond) { return 0; }
+
+int pthread_cond_timedwait(pthread_cond_t* __restrict cond,
+ pthread_mutex_t* __restrict mutex,
+ const struct timespec* __restrict abstime) {
+ return 0;
+}
+
+int pthread_cond_wait(pthread_cond_t* __restrict cond,
+ pthread_mutex_t* __restrict mutex) {
+ return 0;
+}
+
+int pthread_mutexattr_init(pthread_mutexattr_t* attr) { return 0; }
+
+int pthread_mutexattr_destroy(pthread_mutexattr_t* attr) { return 0; }
+
+int pthread_mutexattr_settype(pthread_mutexattr_t* attr, int type) {
+ return 0;
+}
+
+int pthread_mutexattr_gettype(const pthread_mutexattr_t* __restrict attr,
+ int* __restrict type) {
+ *type = PTHREAD_MUTEX_NORMAL;
+ return 0;
+}
+
+int pthread_mutex_init(pthread_mutex_t* __restrict mutex,
+ const pthread_mutexattr_t* __restrict attr) {
+ return 0;
+}
+
+int pthread_mutex_destroy(pthread_mutex_t* mutex) { return 0; }
+
+int pthread_mutex_lock(pthread_mutex_t* mutex) { return 0; }
+
+int pthread_mutex_trylock(pthread_mutex_t* mutex) { return 0; }
+
+int pthread_mutex_unlock(pthread_mutex_t* mutex) { return 0; }
+
+// Run "init_routine" the first time this is called for a given control
+// variable; later calls with the same variable do nothing. Always returns 0.
+int pthread_once(pthread_once_t* once_control, void (*init_routine)(void)) {
+  static_assert(PTHREAD_ONCE_INIT != PTHREAD_ONCE_DONE,
+                "PTHREAD_ONCE_INIT must be different from PTHREAD_ONCE_DONE");
+  // Any state other than the initial one is treated as "already done".
+  if (*once_control != PTHREAD_ONCE_INIT) {
+    return 0;
+  }
+  init_routine();
+  *once_control = PTHREAD_ONCE_DONE;
+  return 0;
+}
+
+int pthread_equal(pthread_t t1, pthread_t t2) { return t1 == t2; }
+
+/*!
+ * Create a "thread" and run it to completion before returning.
+ *
+ * \param thread         Receives the id of the new thread.
+ * \param attr           Ignored.
+ * \param start_routine  Thread function, called immediately with "arg".
+ * \param arg            Argument passed to the thread function.
+ * \return Always 0.
+ */
+int pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+                   void* (*start_routine)(void*), void* arg) {
+  // Set up the id and the bookkeeping entry of the new thread before calling
+  // setjmp, so that the jump buffer can live in the new thread's own entry.
+  // That is the buffer pthread_exit jumps to: it looks up the entry of
+  // pthread_self(), which is the new thread while start_routine is running.
+  // None of these locals is modified after setjmp, so their values are
+  // still valid after a longjmp.
+  const pthread_t tid = first_available_key(thread_data);
+  *thread = tid;
+  running_threads.push_back(tid);
+  // References to std::map elements stay valid when other elements are
+  // inserted, e.g. by nested pthread_create calls.
+  thread_info_t& thr = thread_data[tid];
+  if (setjmp(thr.env) == 0) {
+    thr.ret_value = start_routine(arg);
+  }
+  // Execution gets here either because start_routine returned, or because
+  // it called pthread_exit, which jumped back to the setjmp above.
+  thr.finished = true;
+  running_threads.pop_back();
+
+  // Destroy all keys.
+  bool repeat = true;
+  size_t iter = 0;
+  while (repeat && iter++ < PTHREAD_DESTRUCTOR_ITERATIONS) {
+    repeat = false;
+    // Assume that destructors can create new keys (i.e. modify the map).
+    for (size_t k = 0; k != PTHREAD_KEYS_MAX; ++k) {
+      auto f = thr.keys.find(k);
+      if (f == thr.keys.end()) {
+        continue;
+      }
+      key_entry_t& key = f->second;
+      if (key.dtor == nullptr || key.value == nullptr) {
+        continue;
+      }
+      // Clear the stored value before calling the destructor, and pass the
+      // old value to it. Otherwise the same pointer would be handed to the
+      // destructor again on every iteration of the outer loop.
+      void (*dtor)(void*) = key.dtor;
+      void* value = key.value;
+      key.value = nullptr;
+      dtor(value);
+      // A destructor may have stored new values; check again.
+      repeat = true;
+    }
+  }
+
+  // Nobody is going to claim the return value of a detached thread.
+  if (thr.detached) {
+    thread_data.erase(tid);
+  }
+
+  return 0;
+}
+
+// Claim the return value of a finished thread and drop its entry.
+// Returns ESRCH for an unknown id, EDEADLK if the thread has not finished
+// yet, and 0 otherwise.
+int pthread_join(pthread_t thread, void** retval) {
+  const auto entry = thread_data.find(thread);
+  if (entry == thread_data.end()) return ESRCH;
+
+  const thread_info_t& info = entry->second;
+  if (!info.finished) return EDEADLK;
+
+  // The caller may pass a null pointer if it is not interested in the value.
+  if (retval) *retval = info.ret_value;
+  thread_data.erase(entry);
+  return 0;
+}
+
+// Mark a thread as detached, i.e. declare that its return value will never
+// be claimed with pthread_join. Returns ESRCH for an unknown id, 0 otherwise.
+int pthread_detach(pthread_t thread) {
+  auto f = thread_data.find(thread);
+  if (f == thread_data.end()) {
+    return ESRCH;
+  }
+  if (f->second.finished) {
+    // The thread is already done, so pthread_create will not get another
+    // chance to remove the entry. Release it here, otherwise it would stay
+    // in the map (and keep its id occupied) forever.
+    thread_data.erase(f);
+    return 0;
+  }
+  // Can discard the return value.
+  f->second.detached = true;
+  return 0;
+}
+
+// Terminate the calling thread with "retval" as its return value.
+// For a thread other than the main one: record the return value, mark the
+// thread as finished, and longjmp to the jump buffer stored in that thread's
+// entry. For the main thread: exit the process with status 0.
+void pthread_exit(void* retval) {
+ pthread_t sid = pthread_self();
+ if (sid != main_thread_id) {
+ thread_info_t& self = thread_data[sid];
+ self.ret_value = retval;
+ self.finished = true;
+ // NOTE(review): this relies on self.env having been filled in by a setjmp
+ // whose frame is still active -- confirm against pthread_create.
+ longjmp(self.env, 1);
+ }
+ exit(0); // Only executes for the main thread, plus silences
+ // the "should not return" warning.
+}
+
+// Allocate a new key in the key table of the calling thread and register
+// "destructor" for it. Returns EINVAL for a null "key" pointer, EAGAIN when
+// no key below PTHREAD_KEYS_MAX is available, and 0 on success.
+int pthread_key_create(pthread_key_t* key, void (*destructor)(void*)) {
+  if (!key) return EINVAL;
+
+  auto& slots = thread_data[pthread_self()].keys;
+  const pthread_key_t fresh = first_available_key(slots);
+  if (fresh >= PTHREAD_KEYS_MAX) return EAGAIN;
+
+  *key = fresh;
+  // A new key starts out with no value stored under it.
+  slots.emplace(fresh, key_entry_t{nullptr, destructor});
+  return 0;
+}
+
+// Remove a key from the key table of the calling thread. Returns EINVAL if
+// the key is not present there, 0 otherwise.
+int pthread_key_delete(pthread_key_t key) {
+  auto& slots = thread_data[pthread_self()].keys;
+  const auto hit = slots.find(key);
+  if (hit == slots.end()) return EINVAL;
+
+  // pthread_key_delete does not call key destructors.
+  slots.erase(hit);
+  return 0;
+}
+
+// Store "value" under "key" in the key table of the calling thread. Returns
+// EINVAL if the key is not present there, 0 otherwise.
+int pthread_setspecific(pthread_key_t key, const void* value) {
+  auto& slots = thread_data[pthread_self()].keys;
+  const auto hit = slots.find(key);
+  if (hit == slots.end()) return EINVAL;
+
+  // The table stores plain void*, so the const qualifier has to go.
+  hit->second.value = const_cast<void*>(value);
+  return 0;
+}
+
+// Return the value stored under "key" in the key table of the calling
+// thread, or nullptr if the key is not present there.
+void* pthread_getspecific(pthread_key_t key) {
+  auto& slots = thread_data[pthread_self()].keys;
+  const auto hit = slots.find(key);
+  return hit == slots.end() ? nullptr : hit->second.value;
+}
+
+pthread_t pthread_self(void) { return running_threads.back(); }
+
+int sched_yield(void) { return 0; }
+
+// Give nanosleep C linkage. The macro to test is __cplusplus; with the
+// misspelled name the declaration was never compiled.
+#ifdef __cplusplus
+extern "C" int nanosleep(const struct timespec* req, struct timespec* rem);
+#endif
+
+// Stub: returns immediately with 0, without sleeping and without touching
+// "rem".
+int nanosleep(const struct timespec* req, struct timespec* rem) { return 0; }
diff --git a/src/runtime/hexagon/sim/driver/pthread.h
b/src/runtime/hexagon/sim/driver/pthread.h
new file mode 100644
index 0000000..1748d61
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/pthread.h
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_
+#define TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_
+
+#define _PROVIDE_POSIX_TIME_DECLS 1
+#include <time.h>
+#undef _PROVIDE_POSIX_TIME_DECLS
+
+typedef int pthread_t;
+typedef int pthread_attr_t;
+typedef int pthread_cond_t;
+typedef int pthread_condattr_t;
+typedef int pthread_key_t;
+typedef int pthread_mutex_t;
+typedef int pthread_mutexattr_t;
+typedef int pthread_once_t;
+
+// Constants named after their pthread API counterparts. The first six
+// enumerators have no explicit value, so they get the consecutive values 0..5.
+enum {
+ PTHREAD_COND_INITIALIZER,
+ PTHREAD_MUTEX_DEFAULT,
+ PTHREAD_MUTEX_ERRORCHECK,
+ PTHREAD_MUTEX_INITIALIZER,
+ PTHREAD_MUTEX_NORMAL,
+ PTHREAD_MUTEX_RECURSIVE,
+ // The explicit 0 restarts the numbering, so PTHREAD_ONCE_DONE below is 1.
+ PTHREAD_ONCE_INIT = 0, // Must be same as in QuRT
+ PTHREAD_ONCE_DONE, // Non-standard
+};
+
+const size_t PTHREAD_KEYS_MAX = 128;
+const size_t PTHREAD_DESTRUCTOR_ITERATIONS = 4;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int pthread_cond_destroy(pthread_cond_t* cond);
+int pthread_cond_init(pthread_cond_t* __restrict cond,
+ const pthread_condattr_t* __restrict attr);
+int pthread_cond_signal(pthread_cond_t* cond);
+int pthread_cond_broadcast(pthread_cond_t* cond);
+int pthread_cond_timedwait(pthread_cond_t* __restrict cond,
+ pthread_mutex_t* __restrict mutex,
+ const struct timespec* __restrict abstime);
+int pthread_cond_wait(pthread_cond_t* __restrict cond,
+ pthread_mutex_t* __restrict mutex);
+
+int pthread_mutexattr_init(pthread_mutexattr_t* attr);
+int pthread_mutexattr_destroy(pthread_mutexattr_t* attr);
+int pthread_mutexattr_gettype(const pthread_mutexattr_t* __restrict attr,
+ int* __restrict type);
+int pthread_mutexattr_settype(pthread_mutexattr_t* attr, int type);
+
+int pthread_mutex_init(pthread_mutex_t* __restrict mutex,
+ const pthread_mutexattr_t* __restrict attr);
+int pthread_mutex_destroy(pthread_mutex_t* mutex);
+int pthread_mutex_lock(pthread_mutex_t* mutex);
+int pthread_mutex_trylock(pthread_mutex_t* mutex);
+int pthread_mutex_unlock(pthread_mutex_t* mutex);
+
+int pthread_once(pthread_once_t* once_control, void (*init_routine)(void));
+int pthread_equal(pthread_t t1, pthread_t t2);
+
+int pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+ void* (*start_routine)(void*), void* arg);
+int pthread_join(pthread_t thread, void** retval);
+int pthread_detach(pthread_t thread);
+void pthread_exit(void* retval) __attribute__((__noreturn__));
+
+int pthread_key_create(pthread_key_t* key, void (*destructor)(void*));
+int pthread_key_delete(pthread_key_t key);
+int pthread_setspecific(pthread_key_t key, const void* value);
+void* pthread_getspecific(pthread_key_t key);
+
+pthread_t pthread_self(void);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_
diff --git a/src/runtime/hexagon/sim/driver/sched.h
b/src/runtime/hexagon/sim/driver/sched.h
new file mode 100644
index 0000000..cc63630
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/sched.h
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_
+#define TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+int sched_yield(void);
+#ifdef __cplusplus
+}
+#endif
+
+#endif // TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_
diff --git a/src/runtime/hexagon/sim/driver/sim_device.cc
b/src/runtime/hexagon/sim/driver/sim_device.cc
new file mode 100644
index 0000000..23dc053
--- /dev/null
+++ b/src/runtime/hexagon/sim/driver/sim_device.cc
@@ -0,0 +1,573 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*
+ Required options:
+ -ldl -G0 For dlinit/dlopen/dlclose.
+ -Wl,--force-dynamic Make this a dynamic executable (with dynamic
+ symbol table).
+ -Wl,-E Export all defined symbols as dynamic.
+ -Wl,--whole-archive Link the entire contents of libc.
+ -mhvx -mhvx-length=128b Enable HVX.
+ -Wno-format Silence format warning (unsigned vs uint32_t).
+*/
+
+#include <assert.h>
+#include <dlfcn.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <algorithm>
+#include <iterator>
+#include <string>
+#include <vector>
+
+#include "hexagon_sim_proto.h"
+#include "pthread.h"
+#include "tvm/runtime/c_runtime_api.h"
+
+// Format the current local time as "[hh:mm:ss]".
+static std::string timeNow() {
+  const time_t now = time(NULL);
+  const tm* parts = localtime(&now);  // NOLINT(runtime/threadsafe_fn)
+
+  char stamp[11];  // "[hh:mm:ss]" plus the terminating NUL.
+  snprintf(stamp, sizeof(stamp), "[%02d:%02d:%02d]", parts->tm_hour,
+           parts->tm_min, parts->tm_sec);
+  return std::string(stamp);
+}
+
+#define LOG(FMT, ...) \
+ fprintf(stderr, "%s %s:%d: " FMT "\n", timeNow().c_str(), __FILE__, \
+ __LINE__, ##__VA_ARGS__)
+
+using HVX_Vector =
+ int __attribute__((__vector_size__(128))) __attribute__((aligned(128)));
+
+// Determine the HVX vector length in bytes at run time: returns 128 or 64.
+// NOTE(review): presumably the builtin copies the 32-bit argument into every
+// word of the vector, so that each byte that was actually written reads back
+// as 1 -- confirm against the HVX intrinsics documentation.
+static unsigned getVectorLength() {
+ HVX_Vector v = __builtin_HEXAGON_V6_lvsplatw_128B(0x01010101);
+ unsigned char* p = reinterpret_cast<unsigned char*>(&v);
+ // If the last byte of the 128-byte object holds 1, report 128.
+ if (p[127] == 1) return 128;
+ // Otherwise byte 63 is expected to hold 1, and 64 is reported.
+ assert(p[63] == 1);
+ return 64;
+}
+
+extern "C" {
+// Print vector functions. They can be used to help debug tensorized
+// code, via
+// ib.emit(tvm.call_extern('int32', 'V6_pv8', 'vector:', v))
+// ib.emit(tvm.call_extern('int32', 'V6_pv16', 'info:', v))
+// ib.emit(tvm.call_extern('int32', 'V6_pv32', 'value:', v))
+
+// The first argument is a string printed before the vector contents.
+int V6_pv8(const char* s, HVX_Vector v);
+int V6_pv16(const char* s, HVX_Vector v);
+int V6_pv32(const char* s, HVX_Vector v);
+}
+
+// Print "s", then the contents of "v" as two-digit hex bytes, to stderr.
+// Always returns 0.
+int V6_pv8(const char* s, HVX_Vector v) {
+  const unsigned count = getVectorLength();
+  const uint8_t* bytes = reinterpret_cast<const uint8_t*>(&v);
+  fprintf(stderr, "%s:", s);
+  for (unsigned idx = 0; idx < count; ++idx) {
+    fprintf(stderr, " %02x", bytes[idx]);
+  }
+  fputs("\n", stderr);
+  return 0;
+}
+
+// Print "s", then the contents of "v" as four-digit hex halfwords, to
+// stderr. Always returns 0.
+int V6_pv16(const char* s, HVX_Vector v) {
+  const unsigned count = getVectorLength() / sizeof(uint16_t);
+  const uint16_t* halves = reinterpret_cast<const uint16_t*>(&v);
+  fprintf(stderr, "%s:", s);
+  for (unsigned idx = 0; idx < count; ++idx) {
+    fprintf(stderr, " %04x", halves[idx]);
+  }
+  fputs("\n", stderr);
+  return 0;
+}
+
+// Print "s", then the contents of "v" as eight-digit hex words, to stderr.
+// Always returns 0.
+int V6_pv32(const char* s, HVX_Vector v) {
+  const unsigned count = getVectorLength() / sizeof(uint32_t);
+  const uint32_t* words = reinterpret_cast<const uint32_t*>(&v);
+  fprintf(stderr, "%s:", s);
+  for (unsigned idx = 0; idx < count; ++idx) {
+    fprintf(stderr, " %08x", words[idx]);
+  }
+  fputs("\n", stderr);
+  return 0;
+}
+
+extern "C" {
+// Function referenced from libc++.a, but not defined in libc.a.
+int clock_gettime(clockid_t clock_id, struct timespec* tp);
+// pthread_create is wrapped so that we can set a bigger stack size
+// for QuRT. Here this isn't needed, but we still need to implement
+// the wrapper.
+int __wrap_pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+ void* (*start_routine)(void*), void* arg);
+}
+
+int clock_gettime(clockid_t clock_id, struct timespec* tp) {
+  // Stub implementation: no clock is modelled here. Report time zero rather
+  // than returning success while leaving *tp uninitialized, so that callers
+  // at least read a well-defined value.
+  (void)clock_id;
+  if (tp != nullptr) {
+    tp->tv_sec = 0;
+    tp->tv_nsec = 0;
+  }
+  return 0;
+}
+
+// Wrapper for pthread_create: logs its own name, then forwards all arguments
+// unchanged to pthread_create and returns its result.
+int __wrap_pthread_create(pthread_t* thread, const pthread_attr_t* attr,
+ void* (*start_routine)(void*), void* arg) {
+ LOG("%s", __func__);
+ return pthread_create(thread, attr, start_routine, arg);
+}
+
+// FIXME(kparzysz-quic): query the cfg register to compute the VTCM base.
+// This works now.
+const unsigned int TCM_BASE = 0xD8000000;
+const unsigned int VTCM_BASE = TCM_BASE + 0x400000;
+
+// Memory allocator for the device side. It hands out regular (heap) memory
+// and VTCM memory, and keeps a record of every block it has handed out.
+class Allocator {
+ private:
+ // Record of one allocation: start address, size in bytes, and whether the
+ // block is VTCM memory.
+ struct Block {
+ Block(void* p, size_t s) : ptr_(p), size_(s), vtcm_(false) {}
+ Block(void* p, size_t s, bool v) : ptr_(p), size_(s), vtcm_(v) {}
+ // Blocks are ordered by their start address.
+ bool operator<(const Block& b) const {
+ return uintptr_t(ptr_) < uintptr_t(b.ptr_);
+ }
+ void* ptr_;
+ size_t size_;
+ bool vtcm_;
+ };
+
+ using vector_type = std::vector<Block>;
+ using iterator = vector_type::iterator;
+ // Live allocations. New blocks are inserted at their lower_bound position,
+ // which keeps the vector sorted by address.
+ vector_type allocations_;
+
+ // Address at which the search for the next VTCM block starts.
+ uintptr_t cur_vtcm = VTCM_BASE;
+
+ public:
+ // Allocate "size" bytes of regular memory aligned at "align".
+ void* alloc(unsigned size, size_t align);
+ // Allocate "size" bytes of VTCM memory aligned at "align".
+ void* vtcm_alloc(unsigned size, size_t align);
+ // Release a block previously returned by alloc or vtcm_alloc.
+ void free(void* p);
+};
+
+/*!
+ * Allocate regular (heap) memory and record the block.
+ *
+ * \param size   Requested size in bytes.
+ * \param align  Requested alignment in bytes.
+ * \return Pointer to the allocated memory, or nullptr on failure.
+ */
+void* Allocator::alloc(unsigned size, size_t align) {
+  // aligned_alloc requires the size to be an integral multiple of the
+  // alignment, so round the request up. A zero alignment is passed through
+  // unchanged and left for aligned_alloc to reject.
+  size_t bytes = size;
+  if (align != 0 && bytes % align != 0) {
+    bytes += align - bytes % align;
+  }
+  if (bytes < size) {
+    // The rounding wrapped around.
+    LOG("device: error allocating memory: size overflow");
+    return nullptr;
+  }
+
+  void* ptr = aligned_alloc(align, bytes);
+  if (ptr == nullptr) {
+    perror("device: error allocating memory:");
+    return ptr;
+  }
+
+  // Keep the list sorted by address, and check that the new block does not
+  // overlap its neighbors. Adjacent blocks may touch, hence "<=" on both
+  // sides.
+  Block b(ptr, bytes);
+  iterator i = std::lower_bound(allocations_.begin(), allocations_.end(), b);
+  iterator w = allocations_.insert(i, b);
+  if (w != allocations_.begin()) {
+    iterator pw = w - 1;
+    assert(uintptr_t(pw->ptr_) + pw->size_ <= uintptr_t(w->ptr_));
+  }
+  if (w + 1 != allocations_.end()) {
+    iterator nw = w + 1;
+    assert(uintptr_t(w->ptr_) + w->size_ <= uintptr_t(nw->ptr_));
+  }
+
+  LOG("device: allocated %u bytes aligned at %u: %p", size,
+      static_cast<unsigned>(align), ptr);
+  return ptr;
+}
+
+// For now, just allocate sequentially. This needs to be improved to use a
+// free list.
+/*!
+ * Allocate VTCM memory by bumping "cur_vtcm" and record the block.
+ *
+ * \param size   Requested size in bytes.
+ * \param align  Requested alignment in bytes; must be a power of two.
+ * \return Pointer to the allocated memory, or nullptr on failure. On
+ *         failure the allocator state is left unchanged.
+ */
+void* Allocator::vtcm_alloc(unsigned size, size_t align) {
+  // The rounding below only works for a non-zero power-of-two alignment.
+  if (align == 0 || (align & (align - 1)) != 0) {
+    LOG("device: error allocating vtcm memory: invalid alignment %u",
+        static_cast<unsigned>(align));
+    return nullptr;
+  }
+
+  const uintptr_t mask = uintptr_t(align) - 1;
+  const uintptr_t a = (cur_vtcm + mask) & ~mask;
+  const uintptr_t end = a + size;
+  if (a < cur_vtcm || end < a) {
+    // Rounding up, or adding the size, wrapped around the address space.
+    LOG("device: error allocating vtcm memory: out of address space");
+    return nullptr;
+  }
+
+  // Only commit the new position once the request is known to be valid.
+  cur_vtcm = end;
+  void* ptr = reinterpret_cast<void*>(a);
+
+  // Keep the list sorted by address, and check that the new block does not
+  // overlap its neighbors.
+  Block b(ptr, size, true);
+  iterator i = std::lower_bound(allocations_.begin(), allocations_.end(), b);
+  iterator w = allocations_.insert(i, b);
+  if (w != allocations_.begin()) {
+    iterator pw = w - 1;
+    assert(uintptr_t(pw->ptr_) + pw->size_ <= uintptr_t(w->ptr_));
+  }
+  if (w + 1 != allocations_.end()) {
+    iterator nw = w + 1;
+    assert(uintptr_t(w->ptr_) + w->size_ <= uintptr_t(nw->ptr_));
+  }
+
+  LOG("device: allocated vtcm %u bytes aligned at %u: %p", size,
+      static_cast<unsigned>(align), ptr);
+  return ptr;
+}
+
+// Release a block previously handed out by this allocator. The pointer must
+// be the start address of a recorded block (checked with assertions). Heap
+// blocks are returned to the system; VTCM blocks are only dropped from the
+// list.
+void Allocator::free(void* ptr) {
+  LOG("device: freeing %p", ptr);
+  // Blocks are ordered by address only, so the size of the probe is
+  // irrelevant.
+  const Block probe(ptr, 0);
+  iterator hit =
+      std::lower_bound(allocations_.begin(), allocations_.end(), probe);
+  assert(hit != allocations_.end());
+  assert(hit->ptr_ == ptr);
+  const bool heap_backed = !hit->vtcm_;
+  if (heap_backed) ::free(hit->ptr_);
+  allocations_.erase(hit);
+}
+
+// Log a one-line description of a call message: the function address,
+// followed by the scalar (register) arguments and the stack arguments.
+static void printMsgCall(const MsgCall& mc) {
+  auto to_dec_string = [](int v) {
+    // Sign, up to 10 digits, and the terminating NUL.
+    char tmp[12];
+    snprintf(tmp, sizeof(tmp), "%d", v);
+    return std::string(tmp);
+  };
+  auto to_hex_string = [](uint32_t v) {
+    // Up to 8 hex digits and the terminating NUL.
+    char tmp[9];
+    // Pass the type that "%lx" expects.
+    snprintf(tmp, sizeof(tmp), "%lx", static_cast<unsigned long>(v));  // NOLINT(runtime/int)
+    return std::string(tmp);
+  };
+  std::string str = "device: launching " + to_hex_string(mc.func_va) +
+                    " sc:" + to_dec_string(mc.scalar_num) + " {";
+  for (unsigned i = 0; i != mc.scalar_num; ++i) {
+    str += ' ' + to_hex_string(mc.data[i]);
+    if (i + 1 != mc.scalar_num) str += ',';
+  }
+  str += " }, st:" + to_dec_string(mc.stack_num) + " {";
+  // The stack values follow the scalar values in "data".
+  for (unsigned i = 0; i != mc.stack_num; ++i) {
+    str += ' ' + to_hex_string(mc.data[i + mc.scalar_num]);
+    if (i + 1 != mc.stack_num) str += ',';
+  }
+  str += " }";
+  LOG("%s", str.c_str());
+}
+
+static std::vector<MsgCall*> task_queue;
+
+struct Environment {
+ Allocator alloc;
+ void* dl_handle = nullptr;
+};
+
+extern "C" {
+volatile Message message_buffer;
+int dispatch(Environment* env) __attribute__((noinline));
+}
+
+static volatile unsigned char payload_buffer[4096];
+
+// Fill in all three fields (code, len, va) of the global message_buffer.
+static void setMsg(uint32_t code, uint32_t len, uint32_t va) {
+ message_buffer.code = code;
+ message_buffer.len = len;
+ message_buffer.va = va;
+}
+
+// Convert a 32-bit address value into a pointer.
+inline void* pointer(uint32_t v) {
+  const uintptr_t address = static_cast<uintptr_t>(v);
+  return reinterpret_cast<void*>(address);
+}
+
+// Convert a pointer into a 32-bit address value.
+inline uint32_t va(const volatile void* p) {
+  const uintptr_t address = reinterpret_cast<uintptr_t>(p);
+  return static_cast<uint32_t>(address);
+}
+
+__attribute__((naked)) uint32_t launcher(volatile MsgCall* mc, uint64_t* pcc) {
+ __asm__(
+ "// This function is intentionally written to be readable, \n"
+ "// rather than fast. \n"
+ "// r0 = value of 'volatile MsgCall *mc' \n"
+ "// r1 = address where to store the program cycle count \n"
+ "{ memd(r29+#-16) = r21:20 \n"
+ " allocframe(#24) } \n"
+ "{ memd(r29+#0) = r17:16 \n"
+ " memd(r29+#8) = r19:18 } \n"
+ "{ r17:16 = combine(r1,r0) \n"
+ " r18 = r29 \n"
+ " r1 = memw(r0+#4) // scalar_num \n"
+ " r2 = memw(r0+#8) } // stack_num \n"
+ "// If there are no stack values, skip the stack setup. \n"
+ "{ p0 = cmp.eq(r2,#0) \n"
+ " if (p0.new) jump:t .Llauncher1 } \n"
+
+ "// Allocate space on the stack. Let r2 = needed space \n"
+ "// rounded up to a multiple of 8. \n"
+ "{ loop0(.Llauncher0,r2) \n"
+ " r2 = asl(r2,#2) } \n"
+ "{ r2 = add(r2,#4) } \n"
+ "{ r2 = clrbit(r2,#2) } \n"
+ "{ r29 = sub(r29,r2) } \n"
+
+ "// Copy stack contents onto the stack. Stack contents start \n"
+ "// at r3 = r0 + offsetof(data) + scalar_num*4 \n"
+ "{ r3 = addasl(r0,r1,#2) \n"
+ " r4 = r29 } \n"
+ "{ r3 = add(r3,#12) } // offsetof(data) \n"
+ ".Llauncher0: \n"
+ "{ r5 = memw(r3++#4) \n"
+ " memw(r4++#4) = r5.new } :endloop0 \n"
+
+ "// Load registers. Some of the loaded data may actually be \n"
+ "// values from the stack part of 'data', but it's not an issue.\n"
+ ".Llauncher1: \n"
+ "{ r0 = memw(r16+#12) // mc + offsetof(data) \n"
+ " r1 = memw(r16+#16) } \n"
+ "{ r2 = memw(r16+#20) \n"
+ " r3 = memw(r16+#24) } \n"
+ "{ r4 = memw(r16+#28) \n"
+ " r5 = memw(r16+#32) } \n"
+
+ "// Call. \n"
+ "{ r6 = memw(r16+#0) \n"
+ " r21:20 = upcycle } \n"
+ "{ callr r6 } \n"
+
+ "// Restore stack pointer (free up r18), calculate cycle count. \n"
+ "{ r29 = r18 \n"
+ " r19:18 = upcycle } \n"
+ "{ r19:18 = sub(r19:18, r21:20) } \n"
+
+ "// Store pcount, restore non-volatile registers, and return. \n"
+ "{ memd(r17+#0) = r19:18 \n"
+ " r21:20 = memd(r29+#16) } \n"
+ "{ r19:18 = memd(r29+#8) \n"
+ " r17:16 = memd(r29+#0) } \n"
+ "{ dealloc_return } // implicit-use r1:0 \n");
+}
+
+// Handle the message currently held in message_buffer and answer it through
+// setMsg. kMsgReq is acknowledged with the size and address of payload_buffer;
+// all other known codes are executed and answered with kNone. Returns 0.
+int dispatch(Environment* env) {
+  uint32_t code = message_buffer.code;
+  // Special handling of MsgReq.
+  if (code == kMsgReq) {
+    assert(message_buffer.len <= sizeof(payload_buffer));
+    setMsg(kMsgAck, sizeof(payload_buffer), va(payload_buffer));
+    return 0;
+  }
+
+  switch (code) {
+    case kAlloc: {
+      LOG("device: {kAlloc, %lu, %lx}", message_buffer.len, message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgAlloc));
+      auto* ma = reinterpret_cast<volatile MsgAlloc*>(message_buffer.va);
+      void* p = env->alloc.alloc(ma->size, ma->align);
+      reinterpret_cast<volatile MsgPointer*>(payload_buffer)->va = va(p);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kFree: {
+      LOG("device: {kFree, %lu, %lx}", message_buffer.len, message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgPointer));
+      auto* mp = reinterpret_cast<volatile MsgPointer*>(message_buffer.va);
+      env->alloc.free(pointer(mp->va));
+      setMsg(kNone, 0u, 0u);
+      break;
+    }
+    case kAllocVtcm: {
+      LOG("device: {kAllocVtcm, %lu, %lx}", message_buffer.len,
+          message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgAlloc));
+      auto* ma = reinterpret_cast<volatile MsgAlloc*>(message_buffer.va);
+      void* p = env->alloc.vtcm_alloc(ma->size, ma->align);
+      reinterpret_cast<volatile MsgPointer*>(payload_buffer)->va = va(p);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kCopy: {
+      LOG("device: {kCopy, %lu, %lx}", message_buffer.len, message_buffer.va);
+      assert(message_buffer.len == sizeof(MsgCopy));
+      auto* mc = reinterpret_cast<volatile MsgCopy*>(message_buffer.va);
+      memcpy(pointer(mc->dst), pointer(mc->src), mc->len);
+      setMsg(kNone, 0u, 0u);
+      break;
+    }
+    case kLoad: {
+      if (env->dl_handle != nullptr) dlclose(env->dl_handle);
+      const char* name = static_cast<const char*>(pointer(message_buffer.va));
+      env->dl_handle = dlopen(name, RTLD_LAZY);
+      if (env->dl_handle == nullptr) LOG("dlopen: %s\n", dlerror());
+      assert(env->dl_handle != nullptr);
+      reinterpret_cast<volatile MsgPointer*>(payload_buffer)->va =
+          va(env->dl_handle);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kUnload: {
+      assert(env->dl_handle != nullptr);
+      assert(message_buffer.len == sizeof(MsgPointer));
+      auto* mp = reinterpret_cast<volatile MsgPointer*>(message_buffer.va);
+      assert(pointer(mp->va) == env->dl_handle);
+      dlclose(env->dl_handle);
+      env->dl_handle = nullptr;
+      setMsg(kNone, 0u, 0u);
+      break;
+    }
+    case kResolve: {
+      LOG("device: {kResolve, %lu, %lx}", message_buffer.len,
+          message_buffer.va);
+      assert(env->dl_handle != nullptr);
+      dlerror();  // Result discarded; presumably clears any pending error.
+      const char* name = static_cast<const char*>(pointer(message_buffer.va));
+      void* s = dlsym(env->dl_handle, name);
+      reinterpret_cast<volatile MsgPointer*>(payload_buffer)->va = va(s);
+      setMsg(kNone, sizeof(MsgPointer), va(payload_buffer));
+      break;
+    }
+    case kCall: {
+      LOG("device: {kCall, %lu, %lx}", message_buffer.len, message_buffer.va);
+      auto* mc = reinterpret_cast<MsgCall*>(message_buffer.va);
+      uint32_t size = 4 * (3 + mc->scalar_num + mc->stack_num);
+      MsgCall* t = static_cast<MsgCall*>(malloc(size));
+      if (t == nullptr) abort();  // Out of memory; never memcpy into null.
+      memcpy(t, mc, size);
+      task_queue.push_back(t);  // Executed later, when kFlush arrives.
+      *reinterpret_cast<volatile uint32_t*>(payload_buffer) = 0;  // Return 0.
+      setMsg(kNone, sizeof(uint32_t), va(payload_buffer));
+      break;
+    }
+    case kFlush: {
+      LOG("device: {kFlush}");
+      LOG("device: %zu tasks in the queue", task_queue.size());
+      // Run the queued tasks in order until one returns a non-zero value;
+      // the remaining ones are skipped. Every task buffer is freed and the
+      // queue is cleared either way.
+      uint32_t rv = 0;
+      uint64_t pcc = 0;  // Pcycle count; will be 0 under simulator (upcycle).
+      for (MsgCall* t : task_queue) {
+        if (rv == 0) {
+          printMsgCall(*t);
+          rv = launcher(t, &pcc);
+          LOG("device: execution took %llu pcycles",
+              static_cast<unsigned long long>(pcc));
+        }
+        free(t);
+      }
+      task_queue.clear();
+      *reinterpret_cast<volatile uint32_t*>(payload_buffer) = rv;
+      setMsg(kNone, sizeof(uint32_t), va(payload_buffer));
+      break;
+    }
+    default:
+      LOG("device: unknown code: %lu", message_buffer.code);
+      abort();
+  }
+  return 0;
+}
+
+// C-linkage declarations of the vector-unit acquire/release routines that
+// main() calls; their definitions are not part of this file section.
+extern "C" int acquire_vector_unit(int);
+extern "C" void release_vector_unit();
+
+// Split a colon-separated list of paths and append every non-empty entry
+// to *list. A backslash escapes the character that follows it, so "\:"
+// yields a literal colon inside an entry; the backslash itself is dropped.
+static void makePathList(const std::string& arg,
+                         std::vector<std::string>* list) {
+  const size_t end = arg.size();
+  size_t pos = 0;
+
+  while (pos < end) {
+    std::string entry;
+    bool escaped = false;
+    for (; pos != end; ++pos) {
+      const char ch = arg[pos];
+      if (!escaped && ch == '\\') {
+        escaped = true;  // Drop the backslash, keep whatever comes next.
+        continue;
+      }
+      if (!escaped && ch == ':') break;  // Unescaped separator.
+      escaped = false;
+      entry.push_back(ch);
+    }
+    // Empty entries (e.g. produced by "::") are not recorded.
+    if (!entry.empty()) list->push_back(entry);
+    ++pos;  // Step over the separator (or past the end of the input).
+  }
+}
+
+// Return "<dir>/<filename>" for the first directory in the colon-separated
+// `paths` for which access(..., X_OK) succeeds, or `filename` itself if none.
+static std::string findInPaths(const std::string& filename,
+                               const std::string& paths) {
+  std::vector<std::string> path_list;
+  makePathList(paths, &path_list);
+  for (const auto& p : path_list) {
+    std::string pf = p + '/' + filename;
+    if (access(pf.c_str(), X_OK) == 0) return pf;
+  }
+  // Not found: fall back to the bare filename so that a failed load is
+  // reported by the loader (dlerror) rather than silently here.
+  return filename;
+}
+
+// Marker symbol with C linkage: it is the presence of this function, not
+// the value it returns, that indicates that sim_dev is running.
+extern "C" int running_in_sim_dev_17bc90206f6cf5a7() { return 0; }
+
+// Driver entry point: load the TVM runtime (searching the -L directories),
+// alias device_api.hexagon to device_api.cpu, then serve dispatch() requests.
+int main(int argc, char* argv[]) {
+  int opt;
+  std::string ld_path;
+  while ((opt = getopt(argc, argv, "L:")) != -1) {
+    switch (opt) {
+      case 'L':
+        ld_path += ':' + std::string(optarg);
+        break;
+      case '?':
+        LOG("Usage %s: [-L path1[:path2...]]", argv[0]);
+        return 1;
+    }
+  }
+
+  std::string rt_path = findInPaths("libtvm_runtime.so", ld_path);
+  LOG("TVM runtime path: %s", rt_path.c_str());
+  Environment env;
+  acquire_vector_unit(0);
+
+  const char* builtin[] = {
+      "libgcc.so",    "libc.so",     "libc++.so",
+      "libc++abi.so", "libc++.so.1", "libc++abi.so.1"  // Alternative names.
+  };
+  dlinit(sizeof(builtin) / sizeof(builtin[0]), const_cast<char**>(builtin));
+  void* rt_handle = dlopen(rt_path.c_str(), RTLD_GLOBAL);
+  if (rt_handle == nullptr) {
+    LOG("error loading TVM runtime: %s", dlerror());
+    return 1;
+  }
+
+  // When running TVM runtime on Hexagon there is no longer a device
+  // for Hexagon, but standalone ops can still refer to it. All of
+  // required DeviceAPI's functionality is adequately implemented
+  // via the CPU device, so remap device_api.hexagon to device_api.cpu.
+  auto* get_global = reinterpret_cast<decltype(&TVMFuncGetGlobal)>(
+      dlsym(rt_handle, "TVMFuncGetGlobal"));
+  auto* register_global = reinterpret_cast<decltype(&TVMFuncRegisterGlobal)>(
+      dlsym(rt_handle, "TVMFuncRegisterGlobal"));
+  // Checked at run time because assert() is compiled out under NDEBUG.
+  if (get_global == nullptr || register_global == nullptr) {
+    LOG("error resolving TVMFuncGetGlobal/TVMFuncRegisterGlobal");
+    return 1;
+  }
+  TVMFunctionHandle cpu_api;
+  if (get_global("device_api.cpu", &cpu_api) != 0 ||
+      register_global("device_api.hexagon", cpu_api, true) != 0) {
+    LOG("error setting device_api.hexagon");
+    return 1;
+  }
+
+  while (!dispatch(&env)) continue;  // Until dispatch returns non-zero.
+  dlclose(rt_handle);
+  release_vector_unit();
+  return 0;
+}
diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc
index 9d14d3a..0a2a60c 100644
--- a/src/runtime/threading_backend.cc
+++ b/src/runtime/threading_backend.cc
@@ -255,6 +255,17 @@ int MaxConcurrency() {
max_concurrency = std::thread::hardware_concurrency();
#if defined(_M_X64) || defined(__x86_64__)
max_concurrency /= 2; // ignore hyper-threading
+#elif defined(__hexagon__)
+ // With unsigned PDs, getting the number of available hardware threads
+ // is not supported in earlier versions of QuRT. In such cases assume 4.
+ // If running on simulator, set max_concurrency to 1.
+ if (max_concurrency == 0) {
+ if (dlsym(RTLD_DEFAULT, "running_in_sim_dev_17bc90206f6cf5a7")) {
+ max_concurrency = 1;
+ } else {
+ max_concurrency = 4;
+ }
+ }
#endif
}
return std::max(max_concurrency, 1);