This is an automated email from the ASF dual-hosted git repository. trohrmann pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/flink.git
commit c029d2af9de1346cf3372a55fff663cd9b61e3f1 Author: gyao <[email protected]> AuthorDate: Sun Aug 5 14:55:07 2018 +0200 [FLINK-10063][tests] Use runit to supervise mesos processes. --- flink-jepsen/docker/Dockerfile-db | 11 +- flink-jepsen/src/jepsen/flink/mesos.clj | 217 +++++++++++++++++++------------- 2 files changed, 138 insertions(+), 90 deletions(-) diff --git a/flink-jepsen/docker/Dockerfile-db b/flink-jepsen/docker/Dockerfile-db index 1555329..cb60efc 100644 --- a/flink-jepsen/docker/Dockerfile-db +++ b/flink-jepsen/docker/Dockerfile-db @@ -21,7 +21,7 @@ FROM debian:jessie RUN echo "deb http://http.debian.net/debian jessie-backports main" >> /etc/apt/sources.list && \ apt-get update && \ apt-get install -y -t jessie-backports openjdk-8-jdk && \ - apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget + apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog runit sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget RUN apt-get update && \ apt-get -y install openssh-server && \ @@ -35,5 +35,12 @@ RUN mkdir -p /root/.ssh/ && \ chmod 600 /root/.ssh/authorized_keys && \ cat /root/id_rsa.pub >> /root/.ssh/authorized_keys +COPY sshd-run /etc/sv/service/sshd/run +RUN chmod +x /etc/sv/service/sshd/run && \ + ln -sf /etc/sv/service/sshd /etc/service + EXPOSE 22 -CMD exec /usr/sbin/sshd -D + +# Start runit process supervisor which will bring up sshd. +# In our tests we can use runit to supervise more processes, e.g., Mesos. +CMD runsvdir -P /etc/service /dev/null > /dev/null diff --git a/flink-jepsen/src/jepsen/flink/mesos.clj b/flink-jepsen/src/jepsen/flink/mesos.clj index fd75991..a73f25f 100644 --- a/flink-jepsen/src/jepsen/flink/mesos.clj +++ b/flink-jepsen/src/jepsen/flink/mesos.clj @@ -24,11 +24,37 @@ [jepsen.os.debian :as debian] [jepsen.flink.zookeeper :refer [zookeeper-uri]])) +;;; runit process supervisor (http://smarden.org/runit/) +;;; +;;; We use runit to supervise Mesos processes because Mesos uses a "fail-fast" approach to +;;; error handling, e.g., the Mesos master will exit when it discovers it has been partitioned away +;;; from the Zookeeper quorum. + +(def runit-version "2.1.2-3") + +(defn create-supervised-service! + "Registers a service with the process supervisor and starts it." + [service-name cmd] + (let [service-dir (str "/etc/sv/" service-name) + run-script (str service-dir "/run")] + (c/su + (c/exec :mkdir :-p service-dir) + (c/exec :echo (clojure.string/join "\n" ["#!/bin/sh" + "exec 2>&1" + (str "exec " cmd)]) :> run-script) + (c/exec :chmod :+x run-script) + (c/exec :ln :-sf service-dir (str "/etc/service/" service-name))))) + +(defn stop-supervised-service! + "Stops a service and removes it from supervision." + [service-name] + (c/su + (c/exec :sv :down service-name) + (c/exec :rm :-f (str "/etc/service/" service-name)))) + ;;; Mesos (def master-count 1) -(def master-pidfile "/var/run/mesos/master.pid") -(def slave-pidfile "/var/run/mesos/slave.pid") (def master-dir "/var/lib/mesos/master") (def slave-dir "/var/lib/mesos/slave") (def log-dir "/var/log/mesos") @@ -40,115 +66,130 @@ (def marathon-bin "/usr/bin/marathon") (def zk-marathon-namespace "marathon") -(def marathon-pidfile "/var/run/mesos/marathon.pid") (def marathon-rest-port 8080) -(defn install! - [test node mesos-version marathon-version] - (c/su - (debian/add-repo! :mesosphere - "deb http://repos.mesosphere.com/debian jessie main" - "keyserver.ubuntu.com" - "E56151BF") - (debian/install {:mesos mesos-version - :marathon marathon-version}) - (c/exec :mkdir :-p "/var/run/mesos") - (c/exec :mkdir :-p master-dir) - (c/exec :mkdir :-p slave-dir))) - ;;; Mesos functions +(defn mesos-master-cmd + "Returns the command to run the mesos master." + [test node] + (clojure.string/join " " + ["env GLOG_v=1" + master-bin + (str "--hostname=" (name node)) + (str "--log_dir=" log-dir) + (str "--offer_timeout=30secs") + (str "--quorum=" (util/majority master-count)) + (str "--registry_fetch_timeout=120secs") + (str "--registry_store_timeout=5secs") + (str "--work_dir=" master-dir) + (str "--zk=" (zookeeper-uri test zk-namespace))])) + +(defn mesos-slave-cmd + "Returns the command to run the mesos agent." + [test node] + (clojure.string/join " " + ["env GLOG_v=1" + slave-bin + (str "--hostname=" (name node)) + (str "--log_dir=" log-dir) + (str "--master=" (zookeeper-uri test zk-namespace)) + (str "--recovery_timeout=30secs") + (str "--work_dir=" slave-dir)])) + +(defn create-mesos-master-supervised-service! + [test node] + (create-supervised-service! "mesos-master" + (mesos-master-cmd test node))) + +(defn create-mesos-slave-supervised-service! + [test node] + (create-supervised-service! "mesos-slave" + (mesos-slave-cmd test node))) + +(defn master-node? + "Returns a truthy value if the node should run the mesos master." + [test node] + (some #{node} (take master-count (sort (:nodes test))))) + (defn start-master! [test node] - (when (some #{node} (take master-count (sort (:nodes test)))) + (when (master-node? test node) (info node "Starting mesos master") (c/su - (c/exec :start-stop-daemon - :--background - :--chdir master-dir - :--exec "/usr/bin/env" - :--make-pidfile - :--no-close - :--oknodo - :--pidfile master-pidfile - :--start - :-- - "GLOG_v=1" - master-bin - (str "--hostname=" (name node)) - (str "--log_dir=" log-dir) - (str "--offer_timeout=30secs") - (str "--quorum=" (util/majority master-count)) - (str "--registry_fetch_timeout=120secs") - (str "--registry_store_timeout=5secs") - (str "--work_dir=" master-dir) - (str "--zk=" (zookeeper-uri test zk-namespace)) - :>> (str log-dir "/master.stdout") - (c/lit "2>&1"))))) + (create-mesos-master-supervised-service! test node)))) (defn start-slave! [test node] - (when-not (some #{node} (take master-count (sort (:nodes test)))) + (when-not (master-node? test node) (info node "Starting mesos slave") (c/su - (c/exec :start-stop-daemon :--start - :--background - :--chdir slave-dir - :--exec slave-bin - :--make-pidfile - :--no-close - :--pidfile slave-pidfile - :--oknodo - :-- - (str "--hostname=" (name node)) - (str "--log_dir=" log-dir) - (str "--master=" (zookeeper-uri test zk-namespace)) - (str "--recovery_timeout=30secs") - (str "--work_dir=" slave-dir) - :>> (str log-dir "/slave.stdout") - (c/lit "2>&1"))))) + (create-mesos-slave-supervised-service! test node)))) (defn stop-master! - [node] - (info node "Stopping mesos master") - (meh (cu/grepkill! :mesos-master)) - (meh (c/exec :rm :-rf master-pidfile)) - (meh (c/exec :rm :-rf - (c/lit (str log-dir "/*")) - (c/lit (str master-dir "/*"))))) + [test node] + (when (master-node? test node) + (info node "Stopping mesos master") + (stop-supervised-service! "mesos-master") + (meh (c/exec :rm :-rf + (c/lit (str log-dir "/*")) + (c/lit (str master-dir "/*")))))) (defn stop-slave! - [node] - (info node "Stopping mesos slave") - (meh (cu/grepkill! :mesos-slave)) - (meh (c/exec :rm :-rf slave-pidfile)) - (meh (c/exec :rm :-rf - (c/lit (str log-dir "/*")) - (c/lit (str slave-dir "/*"))))) + [test node] + (when-not (master-node? test node) + (info node "Stopping mesos slave") + (stop-supervised-service! "mesos-slave") + (meh (c/exec :rm :-rf + (c/lit (str log-dir "/*")) + (c/lit (str slave-dir "/*")))))) ;;; Marathon functions +(defn install! + [test node mesos-version marathon-version] + (c/su + (debian/add-repo! :mesosphere + "deb http://repos.mesosphere.com/debian jessie main" + "keyserver.ubuntu.com" + "E56151BF") + (debian/install {:mesos mesos-version + :marathon marathon-version + :runit runit-version}) + (c/exec :mkdir :-p "/var/run/mesos") + (c/exec :mkdir :-p master-dir) + (c/exec :mkdir :-p slave-dir))) + +(defn marathon-cmd + "Returns the command to run the marathon." + [test node] + (clojure.string/join " " + [marathon-bin + (str "--hostname " node) + (str "--master " (zookeeper-uri test zk-namespace)) + (str "--zk " (zookeeper-uri test zk-marathon-namespace)) + (str ">> " log-dir "/marathon.out")])) + +(defn create-marathon-supervised-service! + [test node] + (create-supervised-service! "marathon" + (marathon-cmd test node))) + +(defn marathon-node? + [test node] + (= node (first (sort (:nodes test))))) + (defn start-marathon! [test node] - (when (= node (first (sort (:nodes test)))) + (when (marathon-node? test node) (info "Start marathon") (c/su - (c/exec :start-stop-daemon :--start - :--background - :--exec marathon-bin - :--make-pidfile - :--no-close - :--pidfile marathon-pidfile - :-- - (c/lit (str "--hostname " node)) - (c/lit (str "--master " (zookeeper-uri test zk-namespace))) - (c/lit (str "--zk " (zookeeper-uri test zk-marathon-namespace))) - :>> (str log-dir "/marathon.stdout") - (c/lit "2>&1"))))) + (create-marathon-supervised-service! test node)))) (defn stop-marathon! - [] - (cu/grepkill! "marathon")) + [test node] + (when (marathon-node? test node) + (stop-supervised-service! "marathon"))) (defn marathon-base-url [test] @@ -163,9 +204,9 @@ (start-slave! test node) (start-marathon! test node)) (teardown! [this test node] - (stop-slave! node) - (stop-master! node) - (stop-marathon!)) + (stop-slave! test node) + (stop-master! test node) + (stop-marathon! test node)) db/LogFiles (log-files [_ test node] (if (cu/exists? log-dir) (cu/ls-full log-dir) []))))
