This is an automated email from the ASF dual-hosted git repository. gary pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/flink.git
The following commit(s) were added to refs/heads/master by this push: new 869ccd6 [FLINK-13345][tests] Dump jstack output for Flink JVMs 869ccd6 is described below commit 869ccd68ac442f72e017232a6e7b91948cadb4dd Author: Gary Yao <g...@apache.org> AuthorDate: Sun Jul 21 20:00:24 2019 +0200 [FLINK-13345][tests] Dump jstack output for Flink JVMs Dump the jstack output for all Flink JVMs at the end of each Jepsen test in the log aggregation phase. This can be helpful for debugging deadlocks. This closes #9194. --- flink-jepsen/README.md | 3 ++- flink-jepsen/src/jepsen/flink/db.clj | 7 ++++++- flink-jepsen/src/jepsen/flink/utils.clj | 30 ++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/flink-jepsen/README.md b/flink-jepsen/README.md index d329c83..5694b74 100644 --- a/flink-jepsen/README.md +++ b/flink-jepsen/README.md @@ -78,4 +78,5 @@ or depending on whether the test passed or not. If neither output is generated, the test did not finish properly due to problems of the environment, bugs in Jepsen or in the test suite, etc. -In addition, the test directories contain all relevant log files aggregated from all hosts. +In addition, the test directories contain all relevant log files, and the jstack output for all Flink JVMs +aggregated from the DB nodes. diff --git a/flink-jepsen/src/jepsen/flink/db.clj b/flink-jepsen/src/jepsen/flink/db.clj index 4aaa3f2..8ca200f 100644 --- a/flink-jepsen/src/jepsen/flink/db.clj +++ b/flink-jepsen/src/jepsen/flink/db.clj @@ -129,7 +129,12 @@ db/LogFiles (log-files [_ _ _] - (fu/find-files! log-dir)))] + (c/su + (fu/dump-jstack-by-pattern! log-dir + "TaskExecutor" + "TaskManager" + "ClusterEntrypoint") + (fu/find-files! log-dir))))] (combined-db [flink-base-db db]))) (defn- sorted-nodes diff --git a/flink-jepsen/src/jepsen/flink/utils.clj b/flink-jepsen/src/jepsen/flink/utils.clj index 5d16568..8f6f654 100644 --- a/flink-jepsen/src/jepsen/flink/utils.clj +++ b/flink-jepsen/src/jepsen/flink/utils.clj @@ -112,3 +112,33 @@ ;; Remove all symlinks in /etc/service except sshd. ;; This is only relevant when tests are run in Docker because there sshd is started using runit. (meh (c/exec :find (c/lit (str "/etc/service -mindepth 1 -maxdepth 1 -type l -not -name 'sshd' -delete")))))) + +;;; jstack + +(defn- includes-any? + [s substrs] + (some #(clojure.string/includes? s %) substrs)) + +(defn- jps! + ([] + (map #(clojure.string/split % #"\s") + (-> (c/exec :jps) + (clojure.string/trim) + (clojure.string/split #"\n")))) + + ([class-name-patterns] + (->> (jps!) + (filter #(= 2 (count %))) + (filter (fn [[_ class-name]] (includes-any? class-name class-name-patterns)))))) + +(defn- write-jstack! + [pid out-path] + (c/exec :jstack :-l pid :> out-path)) + +(defn dump-jstack-by-pattern! + "Dumps the output of jstack for all JVMs that match one of the specified patterns." + [out-dir & class-name-patterns] + (let [pid-class-names (jps! class-name-patterns)] + (doseq [[pid class-name] pid-class-names] + (let [out-path (str out-dir "/jstack_" pid "_" class-name)] + (write-jstack! pid out-path)))))