This is an automated email from the ASF dual-hosted git repository.

gary pushed a commit to branch release-1.9
in repository https://gitbox.apache.org/repos/asf/flink.git


The following commit(s) were added to refs/heads/release-1.9 by this push:
     new 6f27bb1  [FLINK-13345][tests] Dump jstack output for Flink JVMs
6f27bb1 is described below

commit 6f27bb1f655941f2ea9b25281d8925873e6a250f
Author: Gary Yao <[email protected]>
AuthorDate: Sun Jul 21 20:00:24 2019 +0200

    [FLINK-13345][tests] Dump jstack output for Flink JVMs
    
    Dump the jstack output for all Flink JVMs at the end of each Jepsen test in the
    log aggregation phase. This can be helpful for debugging deadlocks.
---
 flink-jepsen/README.md                  |  3 ++-
 flink-jepsen/src/jepsen/flink/db.clj    |  7 ++++++-
 flink-jepsen/src/jepsen/flink/utils.clj | 30 ++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/flink-jepsen/README.md b/flink-jepsen/README.md
index d329c83..5694b74 100644
--- a/flink-jepsen/README.md
+++ b/flink-jepsen/README.md
@@ -78,4 +78,5 @@ or
 depending on whether the test passed or not. If neither output is generated, the test did not finish
 properly due to problems of the environment, bugs in Jepsen or in the test suite, etc.
 
-In addition, the test directories contain all relevant log files aggregated from all hosts.
+In addition, the test directories contain all relevant log files, and the jstack output for all Flink JVMs
+aggregated from the DB nodes.
diff --git a/flink-jepsen/src/jepsen/flink/db.clj b/flink-jepsen/src/jepsen/flink/db.clj
index 4aaa3f2..8ca200f 100644
--- a/flink-jepsen/src/jepsen/flink/db.clj
+++ b/flink-jepsen/src/jepsen/flink/db.clj
@@ -129,7 +129,12 @@
 
                         db/LogFiles
                         (log-files [_ _ _]
-                          (fu/find-files! log-dir)))]
+                          (c/su
+                            (fu/dump-jstack-by-pattern! log-dir
+                                                        "TaskExecutor"
+                                                        "TaskManager"
+                                                        "ClusterEntrypoint")
+                            (fu/find-files! log-dir))))]
     (combined-db [flink-base-db db])))
 
 (defn- sorted-nodes
diff --git a/flink-jepsen/src/jepsen/flink/utils.clj b/flink-jepsen/src/jepsen/flink/utils.clj
index 5d16568..8f6f654 100644
--- a/flink-jepsen/src/jepsen/flink/utils.clj
+++ b/flink-jepsen/src/jepsen/flink/utils.clj
@@ -112,3 +112,33 @@
     ;; Remove all symlinks in /etc/service except sshd.
     ;; This is only relevant when tests are run in Docker because there sshd 
is started using runit.
     (meh (c/exec :find (c/lit (str "/etc/service -mindepth 1 -maxdepth 1 -type 
l -not -name 'sshd' -delete"))))))
+
+;;; jstack
+
+(defn- includes-any?
+  "Returns true if the string s contains at least one of the strings in
+  substrs, and nil otherwise (including when substrs is empty)."
+  [s substrs]
+  (some (fn [substr] (clojure.string/includes? s substr)) substrs))
+
+(defn- jps!
+  "Runs jps through c/exec and parses its output.
+
+  With no arguments, returns one vector of fields per output line, obtained by
+  splitting the line on single whitespace characters.
+
+  With class-name-patterns, returns only the lines that consist of exactly two
+  fields and whose second field contains at least one of the given patterns
+  as a substring."
+  ([]
+   (let [output (clojure.string/trim (c/exec :jps))
+         lines  (clojure.string/split output #"\n")]
+     (map (fn [line] (clojure.string/split line #"\s")) lines)))
+
+  ([class-name-patterns]
+   (filter (fn [[_ class-name :as fields]]
+             ;; The field-count check runs first so that lines without a
+             ;; second field never reach the substring match.
+             (and (= 2 (count fields))
+                  (includes-any? class-name class-name-patterns)))
+           (jps!))))
+
+(defn- write-jstack!
+  "Runs `jstack -l` for the JVM with the given pid through c/exec and redirects
+  the output to out-path.
+
+  Failures are swallowed: if jstack cannot be run for this pid, the exception
+  is returned instead of thrown."
+  [pid out-path]
+  ;; Best effort. The JVM may have exited between the jps listing and this
+  ;; call, or jstack may fail to attach. A single failed dump must not abort
+  ;; the surrounding log collection.
+  (meh (c/exec :jstack :-l pid :> out-path)))
+
+(defn dump-jstack-by-pattern!
+  "Writes the jstack output of every JVM whose jps entry matches one of
+  class-name-patterns to the file <out-dir>/jstack_<pid>_<class-name>.
+  Returns nil."
+  [out-dir & class-name-patterns]
+  (doseq [[pid class-name] (jps! class-name-patterns)
+          :let [out-path (str out-dir "/jstack_" pid "_" class-name)]]
+    (write-jstack! pid out-path)))

Reply via email to