jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/346833 )
Change subject: Record job state and history in a file
......................................................................
Record job state and history in a file
Includes a script which will return Icinga-ready text and exit codes.
Change-Id: I893579da632fde8df2a1ea6c2e0e564c859dc950
---
A bin/check-jobs-icinga
M bin/run-job
M examples/process-control.example.yaml
M processcontrol/config.py
M processcontrol/crontab.py
A processcontrol/job_state.py
M processcontrol/runner.py
M setup.py
M tests/data/global_config/global_defaults.yaml
9 files changed, 150 insertions(+), 4 deletions(-)
Approvals:
jenkins-bot: Verified
Ejegg: Looks good to me, approved
diff --git a/bin/check-jobs-icinga b/bin/check-jobs-icinga
new file mode 100755
index 0000000..d2a3618
--- /dev/null
+++ b/bin/check-jobs-icinga
@@ -0,0 +1,53 @@
+#!/usr/bin/python3
+#
+# Report job status in a format that can be consumed by Icinga.
+
+import argparse
+import sys
+
+
+from processcontrol import job_spec
+from processcontrol import job_state
+
+
+def report_statuses():
+    """Print an Icinga status line for all jobs, then exit.
+
+    Prints "JOBS OK" and exits with code 0 when no job has the status
+    "failure".  Otherwise prints "FAILING JOBS: " followed by the
+    comma-separated failing job names and exits with code 2.
+    This function never returns; it always calls sys.exit().
+    """
+    # Be conservative about what is failure, for now.  Eventually, we
+    # should warn about invalid and unknown.
+    bad_jobs = [
+        job for job, status in load_statuses().items()
+        if status == "failure"
+    ]
+
+    if not bad_jobs:
+        print("JOBS OK")
+        sys.exit(0)
+
+    print("FAILING JOBS: {jobs}".format(jobs=", ".join(bad_jobs)))
+    sys.exit(2)
+
+
+def load_statuses():
+    """Fetch the last completion status of every configured job.
+
+    Returns a dict keyed by each job from job_spec.list().  The value is
+    the job's stored last_completion_status, "unknown" when that status
+    is None, or "invalid" when loading the job's state raised.
+    """
+    statuses = {}
+    for job in job_spec.list():
+        try:
+            state = job_state.load_state(job)
+        except Exception:
+            # One unreadable statefile must not abort the whole check.
+            # There is no state object to inspect for this job, so skip
+            # to the next one rather than falling through and reading a
+            # missing or stale `state`.
+            statuses[job] = "invalid"
+            continue
+
+        if state.last_completion_status is None:
+            statuses[job] = "unknown"
+        else:
+            statuses[job] = state.last_completion_status
+
+    return statuses
+
+
+if __name__ == "__main__":
+    # No options are defined; parsing still provides --help and rejects
+    # unexpected arguments.
+    parser = argparse.ArgumentParser(
+        description="Report the status of all `process-control` jobs in "
+                    "a format that can be consumed by Icinga.")
+    parser.parse_args()
+
+    report_statuses()
diff --git a/bin/run-job b/bin/run-job
index 851e606..2f7fa3f 100755
--- a/bin/run-job
+++ b/bin/run-job
@@ -6,6 +6,7 @@
from processcontrol import runner
from processcontrol import job_spec
+from processcontrol import job_state
def list_jobs():
@@ -25,6 +26,10 @@
if len(job.tags) > 0:
message += "\n tags: " + ", ".join(job.tags)
+
+ stored_state = job_state.load_state(job_slug)
+ message += "\n last status: " +
stored_state.last_completion_status
+
except AssertionError:
message = "{job} ***Invalid
configuration***".format(job=job_slug)
print(message)
diff --git a/examples/process-control.example.yaml b/examples/process-control.example.yaml
index 746af0a..2f43e63 100644
--- a/examples/process-control.example.yaml
+++ b/examples/process-control.example.yaml
@@ -77,6 +77,9 @@
#
# TODO: The deb install should create this directory and do something about
# permissions.
+# TODO: rename to `lock_directory`
#run_directory: /var/run/process-control
#
run_directory: /tmp
+
+state_directory: /var/cache/process-control
diff --git a/processcontrol/config.py b/processcontrol/config.py
index 9934806..806bf3d 100644
--- a/processcontrol/config.py
+++ b/processcontrol/config.py
@@ -63,8 +63,7 @@
raw value if it's already a list."""
value = self.get(path)
if hasattr(value, "encode"):
- # Is stringlike, so cast to a list and handle along with the plural
- # case below.
+ # Is stringlike, so cast to a list.
return [value]
# Otherwise, it's already a list.
diff --git a/processcontrol/crontab.py b/processcontrol/crontab.py
index 82266ca..e22ba0a 100644
--- a/processcontrol/crontab.py
+++ b/processcontrol/crontab.py
@@ -1,5 +1,3 @@
-from __future__ import print_function
-
from . import config
from . import job_spec
diff --git a/processcontrol/job_state.py b/processcontrol/job_state.py
new file mode 100644
index 0000000..f034d0a
--- /dev/null
+++ b/processcontrol/job_state.py
@@ -0,0 +1,80 @@
+import datetime
+import yaml
+
+
+from . import config
+
+
+def load_state(slug):
+    """Build a JobState for `slug`, call load() on it, and return it."""
+    loaded = JobState(slug)
+    loaded.load()
+    return loaded
+
+
+def statefile_path(slug):
+    """Return "<state_directory>/<slug>.yaml", where state_directory is
+    read from the global configuration."""
+    state_root = config.GlobalConfiguration().get("state_directory")
+    return "{root}/{job}.yaml".format(root=state_root, job=slug)
+
+
+class JobState(object):
+    """Manage a statefile for each job, with information about recent run
+    history.
+
+    Attributes:
+        slug: the job identifier passed to the constructor.
+        path: statefile location, as returned by statefile_path(slug).
+        history: list of {"status": ..., "time": ...} dicts, appended in
+            the order events are recorded.
+        last_completion_status: "unknown" initially; set to "success" or
+            "failure" by the record_* methods, or to whatever value the
+            statefile held after load().
+    """
+
+    # Number of most recent history entries kept when the state is written.
+    MAX_HISTORY = 20
+
+    def __init__(self, slug):
+        self.slug = slug
+        self.path = statefile_path(slug)
+        self.history = []
+        self.last_completion_status = "unknown"
+
+    def load(self):
+        """Read history and last completion status from the statefile.
+
+        An IOError while opening or reading the file leaves the current
+        values untouched.  Any other exception (for example a missing key
+        in the stored data) propagates to the caller.
+        """
+        try:
+            with open(self.path, "r") as f:
+                storage = yaml.safe_load(f)
+        except IOError:
+            # TODO: Might want to remove the file and stuff.
+            return
+
+        self.history = storage["history"]
+        self.last_completion_status = storage["last_completion_status"]
+
+    def write(self):
+        """Trim history to the newest MAX_HISTORY entries and save the
+        state to the statefile as YAML."""
+        # TODO: Ensure that we've called load() first, so we aren't overwriting
+        # history.
+        if len(self.history) > self.MAX_HISTORY:
+            self.history = self.history[-self.MAX_HISTORY:]
+
+        contents = {
+            "history": self.history,
+            "last_completion_status": self.last_completion_status,
+        }
+
+        with open(self.path, "w") as f:
+            yaml.dump(contents, stream=f)
+
+    def _record(self, status, when, completion_status=None):
+        """Append a history entry stamped with `when`, optionally update
+        last_completion_status, and write the statefile."""
+        self.history.append({
+            "status": status,
+            "time": when.isoformat(" "),
+        })
+        if completion_status is not None:
+            self.last_completion_status = completion_status
+        self.write()
+
+    def record_started(self, start_time):
+        """Record a "started" entry at `start_time` (a datetime) and save.
+
+        Does not change last_completion_status.
+        """
+        self._record("started", start_time)
+
+    # TODO: We want job duration, etc.
+    def record_success(self):
+        """Record a "completed" entry at the current UTC time, set
+        last_completion_status to "success", and save."""
+        self._record("completed", datetime.datetime.utcnow(), "success")
+
+    def record_failure(self):
+        """Record a "failed" entry at the current UTC time, set
+        last_completion_status to "failure", and save."""
+        self._record("failed", datetime.datetime.utcnow(), "failure")
diff --git a/processcontrol/runner.py b/processcontrol/runner.py
index d365d2c..0c39dab 100644
--- a/processcontrol/runner.py
+++ b/processcontrol/runner.py
@@ -6,6 +6,7 @@
import threading
from . import config
+from . import job_state
from . import lock
from . import mailer
from . import output_streamer
@@ -32,6 +33,8 @@
lock.begin(job_tag=self.job.slug)
self.start_time = datetime.datetime.utcnow()
+ job_history = job_state.load_state(self.job.slug)
+ job_history.record_started(self.start_time)
config.log.info("Running job {name}
({slug})".format(name=self.job.name, slug=self.job.slug))
@@ -47,9 +50,11 @@
return_code = self.run_command(command_line)
if return_code != 0:
self.fail_exitcode(return_code)
+ job_history.record_success()
except JobFailure as ex:
config.log.error(str(ex))
self.mailer.fail_mail(str(ex), logfile=self.logfile)
+ job_history.record_failure()
raise
finally:
if self.job.timeout > 0:
diff --git a/setup.py b/setup.py
index a6a07ab..7dc41b5 100755
--- a/setup.py
+++ b/setup.py
@@ -10,6 +10,7 @@
url='https://github.com/adamwight/process-control',
packages=['processcontrol'],
scripts=[
+ 'bin/check-jobs-icinga',
'bin/cron-generate',
'bin/run-job',
],
diff --git a/tests/data/global_config/global_defaults.yaml b/tests/data/global_config/global_defaults.yaml
index 00c6fd7..13ca90c 100644
--- a/tests/data/global_config/global_defaults.yaml
+++ b/tests/data/global_config/global_defaults.yaml
@@ -26,3 +26,5 @@
output_directory: /tmp
run_directory: /tmp
+
+state_directory: /tmp
--
To view, visit https://gerrit.wikimedia.org/r/346833
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I893579da632fde8df2a1ea6c2e0e564c859dc950
Gerrit-PatchSet: 12
Gerrit-Project: wikimedia/fundraising/process-control
Gerrit-Branch: master
Gerrit-Owner: Awight <[email protected]>
Gerrit-Reviewer: Cdentinger <[email protected]>
Gerrit-Reviewer: Ejegg <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits