ArielGlenn has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/236798

Change subject: crap salt cleanup scripts primarily for labs use
......................................................................

crap salt cleanup scripts primarily for labs use

Change-Id: I30a5492525da30e5f41af718358e9f2a0568125f
---
A salt-misc/gather-minion-info.sh
A salt-misc/parse-minion-output.py
A salt-misc/salt-fixups.py
3 files changed, 676 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/software refs/changes/98/236798/1

diff --git a/salt-misc/gather-minion-info.sh b/salt-misc/gather-minion-info.sh
new file mode 100755
index 0000000..25c7b68
--- /dev/null
+++ b/salt-misc/gather-minion-info.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+
+# fixme set SSH here to what you use
+SSH=""
+
+if [ -z "$SSH" ]; then
+    echo "edit this script to put your ssh command"
+    exit 1
+fi
+
+if [ -z "$1" ]; then
+    echo "Usage: $0 filename"
+    echo "filename should have list of fqdn of instances to check, one per 
line"
+    exit 1
+fi
+hostlist=`cat $1`
+
+for minion in $hostlist; do
+    echo "doing $minion"
+        $SSH bastion-restricted.wmflabs.org ssh -l root -o 
'StrictHostKeyChecking=no' "$minion" "ls -lt /var/log/salt/minion"
+        $SSH bastion-restricted.wmflabs.org ssh -l root -o 
'StrictHostKeyChecking=no' "$minion" "tail -20 /var/log/salt/minion"
+        $SSH bastion-restricted.wmflabs.org ssh -l root -o 
'StrictHostKeyChecking=no' "$minion" "cat /etc/salt/minion"
+        $SSH bastion-restricted.wmflabs.org ssh -l root -o 
'StrictHostKeyChecking=no' "$minion" 'ps axuww | grep salt | grep -v axuww'
+        $SSH bastion-restricted.wmflabs.org ssh -l root -o 
'StrictHostKeyChecking=no' "$minion" "dpkg -l | grep salt"
+        $SSH bastion-restricted.wmflabs.org ssh -l root -o 
'StrictHostKeyChecking=no' "$minion" "cat /etc/issue"
+        $SSH bastion-restricted.wmflabs.org ssh -l root -o 
'StrictHostKeyChecking=no' "$minion" "ls -lt /var/log/puppet.log"
+        $SSH bastion-restricted.wmflabs.org ssh -l root -o 
'StrictHostKeyChecking=no' "$minion" "tail -40 /var/log/puppet.log"
+done
diff --git a/salt-misc/parse-minion-output.py b/salt-misc/parse-minion-output.py
new file mode 100644
index 0000000..da605a1
--- /dev/null
+++ b/salt-misc/parse-minion-output.py
@@ -0,0 +1,256 @@
+import os
+import sys
+
+def get_hostinfo(lines):
+    '''
+    split up the pile of lines of text
+    into separate lists of lines, one
+    for each host
+    '''
+    hosts = []
+    hostinfo = []
+    for line in lines:
+        if line.startswith("doing "):
+            if hostinfo:
+                hosts.append(hostinfo)
+            hostinfo = [line]
+        else:
+            hostinfo.append(line)
+    return hosts
+
+def get_date(line):
+    '''
+    from a line of ls -lt, get the date
+    '''
+    fields = line.split()
+    return " ".join([fields[index] for index in range(5,8)])
+
+def get_salt_version(line):
+    '''
+    from a dpkg-l line, get the version info
+    '''
+    fields = line.split()
+    for field in fields:
+        if field.startswith("20"):
+            return field
+    return ""
+
+def get_process(line):
+    '''
+    dig out the date and command from a ps line
+    '''
+    # root       361  0.0  0.9  69932 19768 ?        Ss   Aug19   0:00 
/usr/bin/python /usr/bin/salt-minion'
+    line.strip()
+    if not line:
+        return ""
+    fields = line.split()
+    if len(fields) < 8:
+        return ""
+    return " ".join(fields[8:])
+
+def get_host_data(hostinfo):
+    '''
+    from list of lines for a host, dig
+    out the info we want and return it
+    in a tuple
+    '''
+    hostdata = {}
+    hostdata['processes'] = []
+    hostdata['masters'] = []
+    for item in ['hostname', 'issue', 'salt_id', 'keysize', 'puppet_rundate',
+                 'minion_logdate', 'salt_version', 'minion_errors']:
+        hostdata[item] = ""
+
+    want_master = False
+    for line in hostinfo:
+        if line.startswith("Debian") or line.startswith("Ubuntu"):
+            hostdata['issue'] = line
+        elif line.startswith("doing"):
+            hostdata['hostname'] = line[6:]
+        elif line.startswith("id:"):
+            hostdata['salt_id'] = line[4:]
+        elif line.startswith("master:"):
+            if not line[8:]:
+                '''
+                master:
+                - labs-puppetmaster-eqiad.wikimedia.org
+                - labs-puppetmaster-codfw.wikimedia.org
+                and this terminates as soon as we see a line with non 
whitespace first char.
+                '''
+                want_master = True
+            else:
+                hostdata['masters'].append(line[8:])
+        elif line.startswith("keysize:"):
+            hostdata['keysize'] = line[9:]
+        elif "/var/log/puppet.log" in line:
+            hostdata['puppet_rundate'] = get_date(line)
+        elif "/var/log/salt/minion" in line:
+            hostdata['minion_logdate'] = get_date(line)
+        elif "/usr/bin/salt-minion" in line:
+            process = get_process(line)
+            if process:
+                hostdata['processes'].append(process)
+        elif "salt-common" in line:
+            hostdata['salt_version'] = get_salt_version(line)
+        elif line.startswith("AuthenticationError"):
+            hostdata['minion_errors'] = line
+        elif want_master == True:
+            stripped = line.lstrip()
+            if stripped and stripped[0] == '-':
+                master_name = stripped[1:].lstrip()
+                hostdata['masters'].append(master_name)
+            else:
+                want_master = False
+    return hostdata
+
+def show(hostdata):
+    '''
+    given some extracted data about a host, display it
+    '''
+    #yeah. it's cheap. so what.
+    keys = sorted(hostdata.keys())
+    for key in keys:
+        if hostdata[key]:
+            print key, hostdata[key]
+    print
+
+def show_ec2id_salt_ids(summaries):
+    '''
+    given nicely formatted host info summaries,
+    show all the hostnames where the salt id is i-000 something
+    plus show the ids too
+    '''
+    print "hosts with ec2id salt ids"
+    for summary in summaries:
+        if summary['salt_id'].startswith('i-000'):
+            print summary['hostname'], summary['salt_id'], "last puppet run:", 
summary['puppet_rundate']
+    print
+
+def show_oldstyle_salt_ids(summaries):
+    '''
+    given nicely formatted host info summaries,
+    show all the hostnames where the salt id is the old-style
+    hostname (three parts without the project name)
+    plus show the ids too
+    '''
+    print "hosts with old style salt ids"
+    for summary in summaries:
+        if summary['salt_id']:
+            if len(summary['salt_id'].split(".")) != 4:
+                print summary['hostname'], summary['salt_id']
+    print
+
+def show_salt_errors(summaries):
+    '''
+    given nicely formatted host info summaries,
+    show all the hostnames where we had the authentication error
+    for salt
+    '''
+    print "hosts with the authentication minion error"
+    for summary in summaries:
+        if summary['minion_errors']:
+            print summary['hostname']
+    print
+
+def show_salt_bad_versions(summaries):
+    '''
+    given nicely formatted host info summaries,
+    show all the hostnames where the salt version is not 2014.7.5
+    plus show the versions
+    '''
+    print "hosts with bad salt versions"
+    for summary in summaries:
+        if summary['salt_version']:
+            if not summary['salt_version'].startswith('2014.7.5'):
+                print summary['hostname'], summary['salt_version'], 
summary['issue']
+    print
+
+def show_salt_other_masters(summaries):
+    '''
+    given nicely formatted host info summaries,
+    show all the hostnames where the salt master is not the
+    standard labs master, plus show the master names
+    '''
+    print "hosts with other salt masters"
+    for summary in summaries:
+        if summary['masters']:
+            for master in summary['masters']:
+                if (not master == 'labs-puppetmaster-eqiad.wikimedia.org' and
+                    not master == 'labs-puppetmaster-codfw.wikimedia.org' and
+                        not master == 'labcontrol2001.wikimedia.org'):
+                    print summary['hostname'], summary['masters']
+                    break
+    print
+
+def show_salt_correct_masters(summaries):
+    '''
+    given nicely formatted host info summaries,
+    show all the hostnames where the salt master is only
+    standard labs masters
+    '''
+    print "hosts with correct salt master(s)"
+    for summary in summaries:
+        if summary['masters']:
+            problem = False
+            for master in summary['masters']:
+                if not (master == 'labs-puppetmaster-eqiad.wikimedia.org' or
+                        master == 'labs-puppetmaster-codfw.wikimedia.org' or
+                        master == 'labcontrol2001.wikimedia.org'):
+                    problem = True
+                    break
+            if not problem:
+                print summary['hostname'], summary['masters']
+    print
+
+def show_salt_no_processes(summaries):
+    '''
+    show all hosts where no salt minion is running
+    '''
+    print "hosts with no mininon running"
+    for summary in summaries:
+        if not summary['processes']:
+            print summary['hostname']
+    print
+
+def show_salt_too_many_processes(summaries):
+    '''
+    show all hosts where more than one salt minion is running
+    plus the process information
+    '''
+    print "hosts with more than one minion running"
+    for summary in summaries:
+        if len(summary['processes']) > 1:
+            print summary['hostname'], summary['processes']
+    print
+
+def show_salt_no_keysize(summaries):
+    '''
+    show all hosts where keysize has not been set
+    '''
+    print "hosts with no keysize specified"
+    for summary in summaries:
+        if summary['masters'] and not summary['keysize']:
+            print summary['hostname'], 'last puppet run:', 
summary['puppet_rundate']
+    print
+
+def main():
+    summaries = []
+    inputfp = sys.stdin
+    content = inputfp.read()
+    lines = content.splitlines()
+    hostinfo = get_hostinfo(lines)
+    for host in hostinfo:
+        results = get_host_data(host)
+        show(results)
+        summaries.append(results)
+    show_ec2id_salt_ids(summaries)
+    show_salt_errors(summaries)
+    show_salt_bad_versions(summaries)
+    show_salt_other_masters(summaries)
+#    show_salt_correct_masters(summaries)
+    show_salt_no_processes(summaries)
+    show_salt_too_many_processes(summaries)
+    show_salt_no_keysize(summaries)
+
+if __name__ == "__main__":
+  main()
diff --git a/salt-misc/salt-fixups.py b/salt-misc/salt-fixups.py
new file mode 100644
index 0000000..1bdb819
--- /dev/null
+++ b/salt-misc/salt-fixups.py
@@ -0,0 +1,392 @@
+'''
+this script checks the status of salt on a box
+and does some fixups; used primarily in labs
+because things get out of hand there really fast
+'''
+
+import os
+import sys
+import getopt
+import time
+from subprocess import Popen, PIPE
+
+# any minion older than 30 minutes was not just
+# spawned for a job now.
+# (seconds; compared with the elapsed time ps reports)
+OLD_PROC = 1800
+
+# logs older than 90 minutes can be ignored
+# (seconds; compared with the age of the log's mtime)
+OLD_LOG = 5400
+
+# the minion log checked for the authentication error
+LOGFILE = "/var/log/salt/minion"
+
+# whichever of these exists is used to start the minion,
+# with upstart tried first
+UPSTART = "/sbin/start"
+SYSTEMCTL = "/bin/systemctl"
+# the installed salt-minion version must start with this
+SALT_VERSION = "2014.7.5"
+# the only masters the minion config may name if this
+# script is to do anything to the box
+MASTERS = ['labs-puppetmaster-eqiad.wikimedia.org',
+           'labs-puppetmaster-codfw.wikimedia.org',
+           'labcontrol2001.wikimedia.org']
+
+def check_master():
+    masters = get_masters()
+    if not masters:
+        return False
+    for entry in masters:
+        if entry not in MASTERS:
+            return False
+    return True
+
+def get_masters():
+    contents = open("/etc/salt/minion", "r").read()
+    lines = contents.splitlines()
+    masters = []
+    want_master = False
+    for line in lines:
+        if line.startswith("master:"):
+            if not line[8:]:
+                '''
+                master:
+                - labs-puppetmaster-eqiad.wikimedia.org
+                - labs-puppetmaster-codfw.wikimedia.org
+                and this terminates as soon as we see a line with non 
whitespace first char.
+                '''
+                want_master = True
+            else:
+                masters.append(line[8:])
+        elif want_master == True:
+            stripped = line.lstrip()
+            if stripped and stripped[0] == '-':
+                master_name = stripped[1:].lstrip()
+                masters.append(master_name)
+            else:
+                return masters
+    return masters
+
+def restart_salt():
+    '''
+    shoot all minions and start one
+    '''
+    shoot_salt_processes()
+    start_salt_process()
+
+def check_salt_auth_error():
+    '''
+    see if the salt minion log ends with the
+    dreaded Authentication Error, usually a
+    sign that the minion is hung
+    '''
+    if os.path.exists(LOGFILE):
+        if time.time() - os.stat(LOGFILE).st_mtime > OLD_LOG:
+            # not that current a message, whatever is in the log
+            return False
+        # now check contents
+        contents = open(LOGFILE, "r").read()
+        if not contents:
+            return False
+        lines = contents.splitlines()
+        if lines[-1].startswith("AuthenticationError: "):
+            return True
+    return False
+
+def apt_update(verbose):
+    '''
+    apt-get update and display the results
+    returns True on success, False on error
+
+    ignores errors starting with W: (warnings)
+    '''
+    return get_popen_output(["apt-get", "update"], "W:", display=verbose)
+
+def apt_install_dryrun(verbose):
+    '''
+    apt-get install dryrun for minion and display the results
+    returns True on success, False on error
+    '''
+    return get_popen_output(["apt-get", "-y", "--simulate", "-o",
+                             "DPkg::Options::=--force-confold",
+                             "-o", "Apt::Get::AllowUnauthenticated=true",
+                             "install", "salt-common", "salt-minion"], 
display=verbose)
+
+def apt_install(verbose):
+    return get_popen_output(["apt-get", "-y", "--force-yes",
+                             "-o", "DPkg::Options::=--force-confold",
+                             "-o", "Apt::Get::AllowUnauthenticated=true",
+                             "install", "salt-common", "salt-minion"], 
display=verbose)
+
+def fix_salt_version(verbose):
+    '''
+    kill all minions, try apt-get update and install of minion,
+    start a minion afterwards if needed.
+    if any step fails, will not proceed
+    '''
+    shoot_salt_processes()
+    if apt_update(verbose):
+        if apt_install_dryrun(verbose):
+            apt_install(verbose)
+    salt_processes = get_salt_processes()
+    if not salt_processes:
+        start_salt_process()
+
+def get_popen_output(command, ignore=None, display=False, skipretcode=False):
+    '''
+    given a list with command and arguments,
+    run it via Popen, returning lines of output
+    on error, show errors and return None
+    '''
+    proc = Popen(command, stderr=PIPE, stdout=PIPE)
+    output, error = proc.communicate()
+    if not skipretcode and proc.returncode:
+        print "Command:", command
+        print "Errors:", error
+        print "Output:", output
+        return None
+
+    if error and ignore is not None and not error.startswith(ignore):
+        print error
+        return None
+
+    if display:
+        print "INFO:", command, output
+    return output.splitlines()
+
+def get_salt_version():
+    '''
+    get the installed version of salt-minion via dpkg
+    '''
+    entries = get_popen_output(["dpkg", "-s", "salt-minion"])
+    if not entries:
+        return None
+    for entry in entries:
+        if entry.startswith("Version: "):
+            return entry[9:]
+    return None
+
+def check_salt_version(version):
+    if version is None:
+        return False
+    if not version.startswith(SALT_VERSION):
+        return False
+    return True
+
+def get_salt_processes():
+    '''
+    return list of pids of salt-minions running
+    '''
+    return get_popen_output(["pgrep", "salt-minion"], skipretcode=True)
+
+def check_salt_processes(processes):
+    '''
+    if more than 1 such process that is
+    not just now spawned to do a task,
+    then more than one minion is running
+    and processing requests, which we don't
+    want
+
+    return True if processes are all good
+    return False if there's extra processes
+    '''
+    if len(processes) == 1:
+        return True
+
+    entries = get_popen_output(["ps", "-p", ",".join(processes), "-o", 
"etimes="], skipretcode=True)
+    for entry in entries:
+        if int(entry) > OLD_PROC:
+            return False
+    return True
+
+def do_popen(command):
+    '''
+    run a command, as a list, no shell, display
+    output if any
+    '''
+    proc = Popen(command, stderr=PIPE, stdout=PIPE)
+    output, error = proc.communicate()
+    if error:
+        print error
+        return False
+    if output:
+        print output
+    return True
+
+def do_upstart():
+    '''
+    start salt-minion via upstart
+    '''
+    return do_popen([UPSTART, "salt-minion"])
+
+def do_systemctl():
+    '''
+    start salt-minion via systemctl
+    '''
+    return do_popen([SYSTEMCTL, "start", "salt-minion.service"])
+
+def start_salt_process():
+    '''
+    start a minion with upstart or systemctl
+    accordingly
+    '''
+    if os.path.exists(UPSTART):
+        do_upstart()
+    elif os.path.exists(SYSTEMCTL):
+        do_systemctl()
+    else:
+        print "failed to find startup command"
+
+def shoot_salt_processes():
+    '''
+    shoot all salt-minion processes with prejudice
+    '''
+    do_popen(["pkill", "salt-minion"])
+    time.sleep(1)
+    salt_processes = get_salt_processes()
+    if salt_processes is not None and len(salt_processes):
+        print "hrm, still some processes around", salt_processes
+        return False
+    return True
+
+def usage(message=None):
+    '''
+    display a helpful usage message with
+    an optional introductory message first
+    '''
+    if message is not None:
+        sys.stderr.write(message)
+        sys.stderr.write("\n")
+    usage_message = """
+Usage: salt-fixups.py --actions actionlist
+    --dryrun --help
+
+Options:
+
+  --actions (-a):  comma-separated list of actions which may
+                   be one or more of the following:
+
+                   autherror -- check and restart if minion is stuck with 
authentication error
+                   version   -- check and fix if salt is wrong version
+                   count     -- check and fix if there is not exactly one 
minion running
+
+  --dryrun  (-d):  display the commands that would be run to produce the 
output but
+                   don't actually run them
+  --verbose (-v):  display informational messages as this script runs
+  --help    (-h):  display this message
+"""
+    sys.stderr.write(usage_message)
+    sys.exit(1)
+
+def do_version(dryrun, verbose):
+    '''
+    if salt version isn't what we want,
+    apt-get install it
+    '''
+    salt_version = get_salt_version()
+    if not check_salt_version(salt_version):
+        if dryrun:
+            print "would fix salt version (bad)"
+        else:
+            if verbose:
+                print "fixing salt version (bad)"
+            fix_salt_version(verbose)
+    elif dryrun or verbose:
+        print "salt version is good"
+
+def do_count(dryrun, verbose):
+    '''
+    if there is not exactly one salt minion running
+    and handling requests (excluding any recently forked
+    process that may be doing a request right now),
+    start or shoot/restart minion(s) as needed
+    '''
+    salt_processes = get_salt_processes()
+    if not salt_processes:
+        if dryrun:
+            print "would start a minion (none running)"
+        else:
+            if verbose:
+                print "starting minion (none running)"
+            start_salt_process()
+    elif not check_salt_processes(salt_processes):
+        # more than one such process, and not just
+        # now spawned to do a job
+        if dryrun:
+            print "would shoot minions and start one (too many)"
+        else:
+            if verbose:
+                print "shooting minions (too many)"
+            shoot_salt_processes()
+            if verbose:
+                print "starting minion"
+            start_salt_process()
+    elif dryrun or verbose:
+        print "salt minion count is good"
+
+def do_autherror(dryrun, verbose):
+    '''
+    check if the salt log ends with a notification of an authentication
+    error, this is usually a sign that the minion is out to lunch
+    if so, restart the minion, this usually clears it up
+    '''
+    if check_salt_auth_error():
+        if dryrun:
+            print "would restart minion (auth error)"
+        else:
+            if verbose:
+                print "restarting salt (auth error)"
+            restart_salt()
+    elif dryrun or verbose:
+        print "no minion autherror"
+
+def do_actions(actions, dryrun, verbose):
+    '''
+    handle user-requested actions
+    '''
+    if "autherror" in actions:
+        do_autherror(dryrun, verbose)
+
+    if "version" in actions:
+        do_version(dryrun, verbose)
+
+    if "count" in actions:
+        do_count(dryrun, verbose)
+
+def main():
+    '''
+    make sure salt version is correct
+    make sure we are running exactly one minion
+    if the dreaded authentication error is
+       hanging the minion, restart it
+    '''
+    actions = None
+    dryrun = False
+    verbose = False
+
+    try:
+        (options, remainder) = getopt.gnu_getopt(
+            sys.argv[1:], "a:dvh",
+            ["actions=", "help", "verbose", "dryrun"])
+    except getopt.GetoptError as err:
+        usage("Unknown option specified: " + str(err))
+    for (opt, val) in options:
+        if opt in ["-a", "--actions"]:
+            actions = val.split(",")
+        elif opt in ["-d", "--dryrun"]:
+            dryrun = True
+        elif opt in ["-v", "--verbose"]:
+            verbose = True
+        elif opt in ["-h", "--help"]:
+            usage('Help for this script\n')
+        else:
+            usage("Unknown option specified: <%s>" % opt)
+    if len(remainder) > 0:
+        usage("Unknown option(s) specified: <%s>" % remainder[0])
+
+    if actions is None:
+        usage("At least one action must be specified.")
+
+    if check_master():
+        if dryrun:
+            print "master checks out, proceeding"
+        do_actions(actions, dryrun, verbose)
+    elif dryrun:
+        print "wrong master encountered, stopping"
+
+if __name__ == "__main__":
+    main()

-- 
To view, visit https://gerrit.wikimedia.org/r/236798
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I30a5492525da30e5f41af718358e9f2a0568125f
Gerrit-PatchSet: 1
Gerrit-Project: operations/software
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to