ArielGlenn has uploaded a new change for review.
https://gerrit.wikimedia.org/r/236798
Change subject: crap salt cleanup scripts primarily for labs use
......................................................................
crap salt cleanup scripts primarily for labs use
Change-Id: I30a5492525da30e5f41af718358e9f2a0568125f
---
A salt-misc/gather-minion-info.sh
A salt-misc/parse-minion-output.py
A salt-misc/salt-fixups.py
3 files changed, 676 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/software
refs/changes/98/236798/1
diff --git a/salt-misc/gather-minion-info.sh b/salt-misc/gather-minion-info.sh
new file mode 100755
index 0000000..25c7b68
--- /dev/null
+++ b/salt-misc/gather-minion-info.sh
@@ -0,0 +1,28 @@
#!/bin/bash

# Collect salt and puppet diagnostics from a list of instances by
# ssh-ing through the bastion to each one as root.  Each host's
# output is preceded by a "doing <fqdn>" marker line, which is what
# parse-minion-output.py splits on.

# fixme set SSH here to what you use
SSH=""

BASTION="bastion-restricted.wmflabs.org"

if [ -z "$SSH" ]; then
    echo "edit this script to put your ssh command"
    exit 1
fi

if [ -z "$1" ]; then
    echo "Usage: $0 filename"
    echo "filename should have list of fqdn of instances to check, one per line"
    exit 1
fi

if [ ! -r "$1" ]; then
    echo "cannot read host list file: $1" >&2
    exit 1
fi

# filename is quoted so that paths containing spaces still work
hostlist=$(cat "$1")

# run_on_minion fqdn command
# run one command on the given minion, hopping through the bastion
run_on_minion() {
    # $SSH is deliberately unquoted so that it may carry options
    $SSH "$BASTION" ssh -l root -o 'StrictHostKeyChecking=no' "$1" "$2"
}

for minion in $hostlist; do
    echo "doing $minion"
    run_on_minion "$minion" "ls -lt /var/log/salt/minion"
    run_on_minion "$minion" "tail -20 /var/log/salt/minion"
    run_on_minion "$minion" "cat /etc/salt/minion"
    run_on_minion "$minion" 'ps axuww | grep salt | grep -v axuww'
    run_on_minion "$minion" "dpkg -l | grep salt"
    run_on_minion "$minion" "cat /etc/issue"
    run_on_minion "$minion" "ls -lt /var/log/puppet.log"
    run_on_minion "$minion" "tail -40 /var/log/puppet.log"
done
diff --git a/salt-misc/parse-minion-output.py b/salt-misc/parse-minion-output.py
new file mode 100644
index 0000000..da605a1
--- /dev/null
+++ b/salt-misc/parse-minion-output.py
@@ -0,0 +1,256 @@
+import os
+import sys
+
def get_hostinfo(lines):
    '''
    split up the pile of lines of text
    into separate lists of lines, one
    for each host

    a host section starts at a line beginning with "doing "
    and runs until the next such line or the end of the input;
    lines before the first marker belong to no host and are dropped

    returns a list of lists of lines, each inner list starting
    with its "doing " marker line
    '''
    hosts = []
    hostinfo = None
    for line in lines:
        if line.startswith("doing "):
            # register the section as soon as it opens, so that the
            # final host is not lost when the input runs out
            hostinfo = [line]
            hosts.append(hostinfo)
        elif hostinfo is not None:
            hostinfo.append(line)
    return hosts
+
def get_date(line):
    '''
    from a line of ls -lt, get the date

    the date is taken to be the sixth through eighth
    whitespace-separated fields, joined by single spaces

    returns "" if the line has fewer than eight fields and so
    cannot hold a date in that position
    '''
    fields = line.split()
    if len(fields) < 8:
        return ""
    return " ".join(fields[5:8])
+
def get_salt_version(line):
    '''
    pull the version out of one line of dpkg -l output

    the version is taken to be the first whitespace-separated
    field that begins with "20"; returns "" if there is none
    '''
    candidates = [word for word in line.split() if word.startswith("20")]
    if candidates:
        return candidates[0]
    return ""
+
def get_process(line):
    '''
    from one line of ps output, return everything from the ninth
    whitespace-separated field onward, joined by single spaces

    for a line such as
      root 361 0.0 0.9 69932 19768 ? Ss Aug19 0:00 /usr/bin/python /usr/bin/salt-minion
    that is "Aug19 0:00 /usr/bin/python /usr/bin/salt-minion"

    returns "" for an empty line or one with fewer than nine fields
    '''
    tail = line.split()[8:]
    return " ".join(tail)
+
def get_host_data(hostinfo):
    '''
    from list of lines for a host, dig
    out the info we want and return it
    in a dict

    keys of the returned dict:
      hostname, issue, salt_id, keysize, puppet_rundate,
      minion_logdate, salt_version, minion_errors:
          strings, "" when nothing was found
      masters, processes:
          lists, empty when nothing was found
    '''
    hostdata = {'processes': [], 'masters': []}
    for item in ['hostname', 'issue', 'salt_id', 'keysize', 'puppet_rundate',
                 'minion_logdate', 'salt_version', 'minion_errors']:
        hostdata[item] = ""

    # True while we are inside a list of masters of the form
    #   master:
    #     - labs-puppetmaster-eqiad.wikimedia.org
    #     - labs-puppetmaster-codfw.wikimedia.org
    want_master = False
    for line in hostinfo:
        if want_master:
            stripped = line.lstrip()
            if stripped.startswith('-'):
                hostdata['masters'].append(stripped[1:].lstrip())
                continue
            # the list ends at the first line that is not a list item;
            # that line is then examined like any other line below
            want_master = False

        if line.startswith(("Debian", "Ubuntu")):
            hostdata['issue'] = line
        elif line.startswith("doing"):
            hostdata['hostname'] = line[6:]
        elif line.startswith("id:"):
            hostdata['salt_id'] = line[4:]
        elif line.startswith("master:"):
            if line[8:]:
                hostdata['masters'].append(line[8:])
            else:
                want_master = True
        elif line.startswith("keysize:"):
            hostdata['keysize'] = line[9:]
        elif "/var/log/puppet.log" in line:
            hostdata['puppet_rundate'] = get_date(line)
        elif "/var/log/salt/minion" in line:
            hostdata['minion_logdate'] = get_date(line)
        elif "/usr/bin/salt-minion" in line:
            process = get_process(line)
            if process:
                hostdata['processes'].append(process)
        elif "salt-common" in line:
            hostdata['salt_version'] = get_salt_version(line)
        elif line.startswith("AuthenticationError"):
            hostdata['minion_errors'] = line
    return hostdata
+
def show(hostdata):
    '''
    given some extracted data about a host, display it

    prints one "key value" line for each non-empty entry of
    hostdata, in sorted key order, followed by a blank line
    '''
    for key in sorted(hostdata):
        if hostdata[key]:
            # a single pre-formatted argument prints the same text
            # from the python 2 statement and the python 3 function
            print("%s %s" % (key, hostdata[key]))
    print("")
+
def show_ec2id_salt_ids(summaries):
    '''
    given nicely formatted host info summaries,
    show all the hostnames where the salt id is i-000 something
    plus show the ids too, and the date of the last puppet run
    '''
    print("hosts with ec2id salt ids")
    for summary in summaries:
        if summary['salt_id'].startswith('i-000'):
            print("%s %s last puppet run: %s" % (
                summary['hostname'], summary['salt_id'],
                summary['puppet_rundate']))
    print("")
+
def show_oldstyle_salt_ids(summaries):
    '''
    given nicely formatted host info summaries,
    show all the hostnames where the salt id is the old-style
    hostname (three parts without the project name)
    plus show the ids too

    any non-empty salt id that does not have exactly four
    dot-separated parts is reported
    '''
    print("hosts with old style salt ids")
    for summary in summaries:
        salt_id = summary['salt_id']
        if salt_id and len(salt_id.split(".")) != 4:
            print("%s %s" % (summary['hostname'], salt_id))
    print("")
+
def show_salt_errors(summaries):
    '''
    given nicely formatted host info summaries,
    show all the hostnames where we had the authentication error
    for salt
    '''
    print("hosts with the authentication minion error")
    for summary in summaries:
        if summary['minion_errors']:
            print(summary['hostname'])
    print("")
+
def show_salt_bad_versions(summaries):
    '''
    given nicely formatted host info summaries,
    show all the hostnames where the salt version is not 2014.7.5
    plus show the versions and the os issue string

    hosts with no recorded salt version are skipped
    '''
    print("hosts with bad salt versions")
    for summary in summaries:
        version = summary['salt_version']
        if version and not version.startswith('2014.7.5'):
            print("%s %s %s" % (summary['hostname'], version,
                                summary['issue']))
    print("")
+
def show_salt_other_masters(summaries):
    '''
    given nicely formatted host info summaries,
    show all the hostnames where the salt master is not the
    standard labs master, plus show the master names

    a host is shown once, with its full list of masters, if at
    least one of its masters is not a standard one
    '''
    standard = ('labs-puppetmaster-eqiad.wikimedia.org',
                'labs-puppetmaster-codfw.wikimedia.org',
                'labcontrol2001.wikimedia.org')
    print("hosts with other salt masters")
    for summary in summaries:
        if any(master not in standard for master in summary['masters']):
            print("%s %s" % (summary['hostname'], summary['masters']))
    print("")
+
def show_salt_correct_masters(summaries):
    '''
    given nicely formatted host info summaries,
    show all the hostnames where the salt master is only
    standard labs masters

    hosts with no masters recorded at all are not shown
    '''
    standard = ('labs-puppetmaster-eqiad.wikimedia.org',
                'labs-puppetmaster-codfw.wikimedia.org',
                'labcontrol2001.wikimedia.org')
    print("hosts with correct salt master(s)")
    for summary in summaries:
        masters = summary['masters']
        if masters and all(master in standard for master in masters):
            print("%s %s" % (summary['hostname'], masters))
    print("")
+
def show_salt_no_processes(summaries):
    '''
    show all hosts where no salt minion is running
    '''
    print("hosts with no minion running")
    for summary in summaries:
        if not summary['processes']:
            print(summary['hostname'])
    print("")
+
def show_salt_too_many_processes(summaries):
    '''
    show all hosts where more than one salt minion is running
    plus the process information
    '''
    print("hosts with more than one minion running")
    for summary in summaries:
        if len(summary['processes']) > 1:
            print("%s %s" % (summary['hostname'], summary['processes']))
    print("")
+
def show_salt_no_keysize(summaries):
    '''
    show all hosts where keysize has not been set,
    plus the date of the last puppet run

    only hosts that have at least one master recorded are considered
    '''
    print("hosts with no keysize specified")
    for summary in summaries:
        if summary['masters'] and not summary['keysize']:
            print("%s last puppet run: %s" % (summary['hostname'],
                                              summary['puppet_rundate']))
    print("")
+
def main():
    '''
    read the gathered output for all hosts from stdin, display
    the data extracted for each host, then display the summary
    reports across all hosts
    '''
    lines = sys.stdin.read().splitlines()

    summaries = []
    for host in get_hostinfo(lines):
        results = get_host_data(host)
        show(results)
        summaries.append(results)

    # show_salt_correct_masters is deliberately left out of the run
    reports = [
        show_ec2id_salt_ids,
        show_salt_errors,
        show_salt_bad_versions,
        show_salt_other_masters,
        show_salt_no_processes,
        show_salt_too_many_processes,
        show_salt_no_keysize,
    ]
    for report in reports:
        report(summaries)


if __name__ == "__main__":
    main()
diff --git a/salt-misc/salt-fixups.py b/salt-misc/salt-fixups.py
new file mode 100644
index 0000000..1bdb819
--- /dev/null
+++ b/salt-misc/salt-fixups.py
@@ -0,0 +1,392 @@
+'''
+this script checks the status of salt on a box
+and does some fixups; used primarily in labs
+because things get out of hand there really fast
+'''
+
+import os
+import sys
+import getopt
+import time
+from subprocess import Popen, PIPE
+
# a minion that has been running for more than this many seconds
# (30 minutes) was not just spawned for a job now.
OLD_PROC = 30 * 60

# a log last written more than this many seconds (90 minutes)
# ago can be ignored
OLD_LOG = 90 * 60

# the salt minion log
LOGFILE = "/var/log/salt/minion"

# commands for starting the minion service
UPSTART = "/sbin/start"
SYSTEMCTL = "/bin/systemctl"

# installed salt-minion version must start with this
SALT_VERSION = "2014.7.5"

# the only salt masters a minion is expected to be configured with
MASTERS = [
    'labs-puppetmaster-eqiad.wikimedia.org',
    'labs-puppetmaster-codfw.wikimedia.org',
    'labcontrol2001.wikimedia.org',
]
+
def check_master():
    '''
    return True if this minion has at least one master configured
    and every configured master is listed in MASTERS,
    False otherwise
    '''
    configured = get_masters()
    return bool(configured) and all(name in MASTERS for name in configured)
+
def get_masters(path="/etc/salt/minion"):
    '''
    return the list of salt masters named in the minion config file

    handles both the single value form
      master: labs-puppetmaster-eqiad.wikimedia.org
    and the list form
      master:
        - labs-puppetmaster-eqiad.wikimedia.org
        - labs-puppetmaster-codfw.wikimedia.org
    where the list ends, and reading stops, at the first line
    that is not a list item

    path: config file to read, by default the system minion config

    raises IOError if the file cannot be opened or read
    '''
    # 'with' so that the file is closed even if reading fails
    with open(path, "r") as config:
        lines = config.read().splitlines()

    masters = []
    want_master = False
    for line in lines:
        if want_master:
            stripped = line.strip()
            if stripped.startswith('-'):
                masters.append(stripped[1:].strip())
                continue
            return masters
        if line.startswith("master:"):
            value = line[len("master:"):].strip()
            if value:
                masters.append(value)
            else:
                # nothing but whitespace after the colon: the
                # masters follow as list items on the next lines
                want_master = True
    return masters
+
def restart_salt():
    '''
    kill every running salt-minion, then start a fresh one;
    the start is attempted whether or not the kill got them all
    '''
    shoot_salt_processes()
    start_salt_process()
+
def check_salt_auth_error(logfile=None, max_age=None):
    '''
    see if the salt minion log ends with the
    dreaded Authentication Error, usually a
    sign that the minion is hung

    logfile: log to inspect; LOGFILE when None
    max_age: a log last modified more than this many seconds
             ago is ignored; OLD_LOG when None

    returns True only if the log exists, is recent enough and its
    last line starts with "AuthenticationError: "; a missing or
    unreadable log counts as no error
    '''
    if logfile is None:
        logfile = LOGFILE
    if max_age is None:
        max_age = OLD_LOG

    last_line = ""
    try:
        if time.time() - os.stat(logfile).st_mtime > max_age:
            # not that current a message, whatever is in the log
            return False
        with open(logfile, "r") as log:
            # walk the file a line at a time and keep only the final
            # line, rather than holding the whole log in memory
            for last_line in log:
                pass
    except (IOError, OSError):
        # covers the log being absent, or vanishing between the
        # stat and the open
        return False
    return last_line.startswith("AuthenticationError: ")
+
def apt_update(verbose):
    '''
    run apt-get update

    stderr output starting with W: (warnings) is not treated
    as an error

    verbose: if true, the command and its output are displayed

    returns the lines of output on success, None on error
    '''
    command = ["apt-get", "update"]
    return get_popen_output(command, ignore="W:", display=verbose)
+
def apt_install_dryrun(verbose):
    '''
    simulate (apt-get --simulate) the install of salt-common
    and salt-minion, keeping existing config files

    verbose: if true, the command and its output are displayed

    returns the lines of output on success, None on error
    '''
    command = ["apt-get", "-y", "--simulate"]
    command += ["-o", "DPkg::Options::=--force-confold"]
    command += ["-o", "Apt::Get::AllowUnauthenticated=true"]
    command += ["install", "salt-common", "salt-minion"]
    return get_popen_output(command, display=verbose)
+
def apt_install(verbose):
    '''
    apt-get install salt-common and salt-minion for real,
    keeping existing config files

    verbose: if true, the command and its output are displayed

    returns the lines of output on success, None on error
    '''
    command = ["apt-get", "-y", "--force-yes"]
    command += ["-o", "DPkg::Options::=--force-confold"]
    command += ["-o", "Apt::Get::AllowUnauthenticated=true"]
    command += ["install", "salt-common", "salt-minion"]
    return get_popen_output(command, display=verbose)
+
def fix_salt_version(verbose):
    '''
    kill all minions, then apt-get update, a simulated install and
    the real install of the minion, each step attempted only if the
    one before it produced output without error

    whatever the outcome of those steps, a minion is started
    afterwards if none is running

    verbose: passed along to the apt steps to display their output
    '''
    shoot_salt_processes()
    if apt_update(verbose) and apt_install_dryrun(verbose):
        apt_install(verbose)
    if not get_salt_processes():
        start_salt_process()
+
def get_popen_output(command, ignore=None, display=False, skipretcode=False):
    '''
    given a list with command and arguments,
    run it via Popen, returning lines of output
    on error, show errors and return None

    command:     list of the command and its arguments, run without a shell
    ignore:      if not None, anything on stderr is an error unless
                 it starts with this string
    display:     if true, show the command and its output on success
    skipretcode: if true, a non-zero exit code is not an error

    a command that cannot be started at all is also reported
    and gives None
    '''
    try:
        # text mode, so that output and errors are strings (not bytes)
        # under python 3 as well and can be compared with 'ignore'
        proc = Popen(command, stderr=PIPE, stdout=PIPE,
                     universal_newlines=True)
    except OSError as err:
        print("Command: %s" % (command,))
        print("Errors: %s" % (err,))
        return None

    output, error = proc.communicate()
    if not skipretcode and proc.returncode:
        print("Command: %s" % (command,))
        print("Errors: %s" % (error,))
        print("Output: %s" % (output,))
        return None

    # NOTE(review): when ignore is None, stderr output on its own is
    # never treated as a failure -- confirm that this is intended
    if error and ignore is not None and not error.startswith(ignore):
        print(error)
        return None

    if display:
        print("INFO: %s %s" % (command, output))
    return output.splitlines()
+
def get_salt_version():
    '''
    ask dpkg (dpkg -s) about salt-minion and return the text
    following "Version: " in its output

    returns None if dpkg gave no usable output or the output
    has no version line
    '''
    marker = "Version: "
    for entry in get_popen_output(["dpkg", "-s", "salt-minion"]) or []:
        if entry.startswith(marker):
            return entry[len(marker):]
    return None
+
def check_salt_version(version):
    '''
    return True if the given version string starts with
    SALT_VERSION, False if it does not or if it is None
    '''
    return version is not None and version.startswith(SALT_VERSION)
+
def get_salt_processes():
    '''
    return the lines printed by pgrep salt-minion, one pid
    per running salt-minion; the exit code of pgrep is
    disregarded, so no match gives an empty list
    '''
    command = ["pgrep", "salt-minion"]
    return get_popen_output(command, skipretcode=True)
+
def check_salt_processes(processes):
    '''
    if more than 1 such process that is
    not just now spawned to do a task,
    then more than one minion is running
    and processing requests, which we don't
    want

    processes: list of pids, as strings

    a process counts as not just now spawned if ps reports that it
    has been running for more than OLD_PROC seconds

    return True if processes are all good
    return False if there's extra processes
    '''
    if len(processes) <= 1:
        return True

    entries = get_popen_output(["ps", "-p", ",".join(processes), "-o",
                                "etimes="], skipretcode=True)
    if not entries:
        # ps told us nothing, so there is nothing to object to
        return True

    # one long-running process is the minion itself; only a second
    # one means that an extra minion is around
    long_running = [entry for entry in entries
                    if entry.strip() and int(entry) > OLD_PROC]
    return len(long_running) <= 1
+
def do_popen(command):
    '''
    run a command, as a list, no shell, display
    output if any

    returns False if the command could not be started or wrote
    anything to stderr, True otherwise; the exit code of the
    command is not looked at
    '''
    try:
        # text mode so that the output prints as plain text
        # under python 3 as well
        proc = Popen(command, stderr=PIPE, stdout=PIPE,
                     universal_newlines=True)
    except OSError as err:
        print("%s" % (err,))
        return False

    output, error = proc.communicate()
    if error:
        print(error)
        return False
    if output:
        print(output)
    return True
+
def do_upstart():
    '''
    start the salt-minion job with the upstart start command;
    returns what do_popen returns for it
    '''
    command = [UPSTART, "salt-minion"]
    return do_popen(command)
+
def do_systemctl():
    '''
    start salt-minion.service with systemctl;
    returns what do_popen returns for it
    '''
    command = [SYSTEMCTL, "start", "salt-minion.service"]
    return do_popen(command)
+
def start_salt_process():
    '''
    start a minion with upstart or systemctl
    accordingly

    the upstart command is used if it exists on this box,
    otherwise systemctl; if neither exists a message is
    displayed and nothing is started
    '''
    if os.path.exists(UPSTART):
        do_upstart()
        return
    if os.path.exists(SYSTEMCTL):
        do_systemctl()
        return
    print("failed to find startup command")
+
def shoot_salt_processes():
    '''
    shoot all salt-minion processes with pkill, then check
    a second later that none are left

    returns True if no salt-minion processes remain,
    False (after displaying the survivors) otherwise
    '''
    do_popen(["pkill", "salt-minion"])
    # give the processes a moment to go away before looking again
    time.sleep(1)
    survivors = get_salt_processes()
    if survivors is None or not len(survivors):
        return True
    print("hrm, still some processes around %s" % (survivors,))
    return False
+
def usage(message=None):
    '''
    display a helpful usage message with
    an optional introductory message first

    everything is written to stderr, after which
    the script exits with status 1
    '''
    if message is not None:
        sys.stderr.write(message)
        sys.stderr.write("\n")
    usage_message = """
Usage: salt-fixups.py --actions actionlist
           [--dryrun] [--verbose] [--help]

Options:

  --actions (-a):  comma-separated list of actions which may
                   be one or more of the following:

                   autherror -- check and restart if minion is stuck
                                with authentication error
                   version   -- check and fix if salt is wrong version
                   count     -- check and fix if there is not exactly
                                one minion running

  --dryrun  (-d):  display the commands that would be run to produce
                   the output but don't actually run them
  --verbose (-v):  display informational messages as this script runs
  --help    (-h):  display this message
"""
    sys.stderr.write(usage_message)
    sys.exit(1)
+
def do_version(dryrun, verbose):
    '''
    if salt version isn't what we want,
    apt-get install it

    dryrun:  only say what would be done
    verbose: say what is being done
    '''
    if check_salt_version(get_salt_version()):
        if dryrun or verbose:
            print("salt version is good")
        return

    if dryrun:
        print("would fix salt version (bad)")
        return
    if verbose:
        print("fixing salt version (bad)")
    fix_salt_version(verbose)
+
def do_count(dryrun, verbose):
    '''
    if there is not exactly one salt minion running
    and handling requests (excluding any recently forked
    process that may be doing a request right now),
    start or shoot/restart minion(s) as needed

    dryrun:  only say what would be done
    verbose: say what is being done
    '''
    salt_processes = get_salt_processes()

    if not salt_processes:
        if dryrun:
            print("would start a minion (none running)")
            return
        if verbose:
            print("starting minion (none running)")
        start_salt_process()
        return

    if check_salt_processes(salt_processes):
        if dryrun or verbose:
            print("salt minion count is good")
        return

    # more than one such process, and not just
    # now spawned to do a job
    if dryrun:
        print("would shoot minions and start one (too many)")
        return
    if verbose:
        print("shooting minions (too many)")
    shoot_salt_processes()
    if verbose:
        print("starting minion")
    start_salt_process()
+
def do_autherror(dryrun, verbose):
    '''
    check if the salt log ends with a notification of an authentication
    error, this is usually a sign that the minion is out to lunch
    if so, restart the minion, this usually clears it up

    dryrun:  only say what would be done
    verbose: say what is being done
    '''
    if not check_salt_auth_error():
        if dryrun or verbose:
            print("no minion autherror")
        return

    if dryrun:
        print("would restart minion (auth error)")
        return
    if verbose:
        print("restarting salt (auth error)")
    restart_salt()
+
def do_actions(actions, dryrun, verbose):
    '''
    handle user-requested actions

    actions: list of action names; the known ones are autherror,
             version and count, always run in that order whatever
             order they were given in
    dryrun:  only say what would be done
    verbose: say what is being done

    names that are not known actions are reported on stderr
    and otherwise ignored
    '''
    known = ("autherror", "version", "count")
    for action in actions:
        if action not in known:
            # a mistyped name would otherwise silently do nothing
            sys.stderr.write("ignoring unknown action: <%s>\n" % action)

    if "autherror" in actions:
        do_autherror(dryrun, verbose)

    if "version" in actions:
        do_version(dryrun, verbose)

    if "count" in actions:
        do_count(dryrun, verbose)
+
def main():
    '''
    parse the command line, then, provided that this minion is
    configured with the expected salt master(s), run the
    requested actions:

    make sure salt version is correct
    make sure we are running exactly one minion
    if the dreaded authentication error is
    hanging the minion, restart it

    with a wrong master nothing is done; this is only
    mentioned in dryrun mode
    '''
    actions = None
    dryrun = False
    verbose = False

    try:
        (options, remainder) = getopt.gnu_getopt(
            sys.argv[1:], "a:dvh",
            ["actions=", "help", "verbose", "dryrun"])
    except getopt.GetoptError as err:
        # usage() exits, so options is always set beyond this point
        usage("Unknown option specified: " + str(err))

    for (opt, val) in options:
        if opt in ("-a", "--actions"):
            actions = val.split(",")
        elif opt in ("-d", "--dryrun"):
            dryrun = True
        elif opt in ("-v", "--verbose"):
            verbose = True
        elif opt in ("-h", "--help"):
            usage('Help for this script\n')
        else:
            usage("Unknown option specified: <%s>" % opt)

    if remainder:
        usage("Unknown option(s) specified: <%s>" % remainder[0])

    if actions is None:
        usage("At least one action must be specified.")

    if not check_master():
        if dryrun:
            print("wrong master encountered, stopping")
        return

    if dryrun:
        print("master checks out, proceeding")
    do_actions(actions, dryrun, verbose)


if __name__ == "__main__":
    main()
--
To view, visit https://gerrit.wikimedia.org/r/236798
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I30a5492525da30e5f41af718358e9f2a0568125f
Gerrit-PatchSet: 1
Gerrit-Project: operations/software
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits