ArielGlenn has submitted this change and it was merged.

Change subject: remove now unused auditor.py, move static methods into module
......................................................................


remove now unused auditor.py, move static methods into module

auditor.py has been replaced by the various local/remote audit files
per audit type, plus the tiny salt module file

Change-Id: Id89eb2c78d7fb82f71ab1284487d75c3ecfb6708
---
M dataretention/data_auditor.py
D dataretention/retention/auditor.py
M dataretention/retention/cli.py
A dataretention/retention/fileutils.py
M dataretention/retention/localfileaudit.py
M dataretention/retention/locallogaudit.py
M dataretention/retention/remotefileauditor.py
7 files changed, 261 insertions(+), 1,996 deletions(-)

Approvals:
  ArielGlenn: Verified; Looks good to me, approved



diff --git a/dataretention/data_auditor.py b/dataretention/data_auditor.py
index 966a19f..5031a6e 100644
--- a/dataretention/data_auditor.py
+++ b/dataretention/data_auditor.py
@@ -3,7 +3,7 @@
 sys.path.append('/srv/audits/retention/scripts/')
 
 from retention.cli import CommandLine
-from retention.auditor import HomesAuditor
+#from retention.auditor import HomesAuditor
 from retention.remotefileauditor import RemoteFilesAuditor
 from retention.remotelogauditor import RemoteLogsAuditor
 from retention.remotehomeauditor import RemoteHomesAuditor
diff --git a/dataretention/retention/auditor.py 
b/dataretention/retention/auditor.py
deleted file mode 100644
index 5f88468..0000000
--- a/dataretention/retention/auditor.py
+++ /dev/null
@@ -1,1717 +0,0 @@
-import os
-import sys
-import time
-import re
-import glob
-import json
-import socket
-import runpy
-import stat
-import locale
-import zlib
-import base64
-
-sys.path.append('/srv/audits/retention/scripts/')
-
-import retention.utils
-import retention.magic
-from retention.status import Status
-from retention.saltclientplus import LocalClientPlus
-from retention.rule import Rule, RuleStore
-from retention.config import Config
-from retention.fileinfo import FileInfo, LogInfo, LogUtils
-from retention.utils import JsonHelper
-from retention.runner import Runner
-
-global_keys = [key for key, value_unused in
-               sys.modules[__name__].__dict__.items()]
-
-def get_dirs_toexamine(host_report):
-    '''
-    given full report output from host (list of
-    json entries), return the list
-    of directories with at least one possibly old file
-    and the list of directories skipped due to too
-    many entries
-    '''
-    dirs_problem = set()
-    dirs_skipped = set()
-    lines = host_report.split("\n")
-    for json_entry in lines:
-        if json_entry == "":
-            continue
-
-        if json_entry.startswith("WARNING:"):
-            bad_dir = FilesAuditor.get_dirname_from_warning(json_entry)
-            if bad_dir is not None:
-                dirs_skipped.add(bad_dir)
-                continue
-
-        if (json_entry.startswith("WARNING:") or
-                json_entry.startswith("INFO:")):
-            print json_entry
-            continue
-
-        try:
-            entry = json.loads(json_entry,
-                               object_hook=JsonHelper.decode_dict)
-        except:
-            print "WARNING: failed to load json for", json_entry
-            continue
-        if 'empty' in entry:
-            empty = FileInfo.string_to_bool(entry['empty'])
-            if empty:
-                continue
-        if 'old' in entry:
-            old = FileInfo.string_to_bool(entry['old'])
-            if old is None or old:
-                if os.path.dirname(entry['path']) not in dirs_problem:
-                    dirs_problem.add(os.path.dirname(entry['path']))
-    return sorted(list(dirs_problem)), sorted(list(dirs_skipped))
-
-
-class FilesAuditor(object):
-    '''
-    audit files locally or across a set of remote hosts,
-    in a specified set of directories
-    '''
-    def __init__(self, hosts_expr, audit_type, prettyprint=False,
-                 show_content=False, dirsizes=False, summary_report=False,
-                 depth=2, to_check=None, ignore_also=None,
-                 timeout=60, maxfiles=None,
-                 store_filepath=None,
-                 verbose=False):
-        '''
-        hosts_expr:   list or grain-based or wildcard expr for hosts
-                      to be audited
-        audit_type:   type of audit e.g. 'logs', 'homes'
-        prettyprint:  nicely format the output display
-        show_content: show the first line or so from problematic files
-        dirsizes:     show only directories which have too many files to
-                      audit properly, don't report on files at all
-        summary_report: do a summary of results instead of detailed
-                        this means different things depending on the audit
-                        type
-        depth:        the auditor will give up if a directory has too many files
-                      in it (saves it from dying on someone's 25gb homedir).
-                      this option tells it how far down the tree to go from
-                      the top dir of the audit, before starting to count.
-                      e.g. do we count in /home/ariel or separately in
-                      /home/ariel/* or in /home/ariel/*/*, etc.
-        to_check:     comma-separated list of dirs (must end in '/') and/or
-                      files that will be checked; if this is None then
-                      all dirs/files will be checked
-        ignore_also:  comma-separated list of dirs (must end in '/') and/or
-                      files that will be skipped in addition to the ones
-                      in the config, rules, etc.
-        timeout:      salt timeout for running remote commands
-        maxfiles:     how many files in a directory tree is too many to audit
-                      (at which point we warn about that and move on)
-        store_filepath: full path to rule store (sqlite3 db)
-        verbose:      show informative messages during processing
-        '''
-
-        global rules
-
-        self.hosts_expr = hosts_expr
-        self.audit_type = audit_type
-        self.locations = audit_type + "_locations"
-        self.prettyprint = prettyprint
-        self.show_sample_content = show_content
-        self.dirsizes = dirsizes
-        self.show_summary = summary_report
-        self.depth = depth + 1  # actually count of path separators in dirname
-        self.to_check = to_check
-        self.set_up_to_check()
-
-        self.ignore_also = ignore_also
-        if self.ignore_also is not None:
-            self.ignore_also = self.ignore_also.split(',')
-        self.timeout = timeout
-        self.store_filepath = store_filepath
-        self.verbose = verbose
-
-        self.set_up_ignored()
-
-        # need this for locally running jobs
-        self.hostname = socket.getfqdn()
-
-        self.cutoff = Config.cf['cutoff']
-
-        if not retention.utils.running_locally(self.hosts_expr):
-            client = LocalClientPlus()
-            hosts, expr_type = Runner.get_hosts_expr_type(self.hosts_expr)
-            self.expanded_hosts = client.cmd_expandminions(
-                hosts, "test.ping", expr_form=expr_type)
-        else:
-            self.expanded_hosts = None
-
-        self.runner = Runner(hosts_expr,
-                             self.expanded_hosts,
-                             self.audit_type,
-                             self.generate_executor,
-                             self.show_sample_content,
-                             self.to_check,
-                             self.timeout,
-                             self.verbose)
-
-        if 'PerHostConfig' in global_keys:
-            self.perhost_rules_from_file = PerHostConfig.perhostcf
-        else:
-            self.perhost_rules_from_file = None
-        self.perhost_raw = None
-        if self.perhost_rules_from_file is None:
-            if not retention.utils.running_locally(self.hosts_expr):
-                if 
os.path.exists('/srv/audits/retention/scripts/audit_files_perhost_config.py'):
-                    try:
-                        self.perhost_rules_from_file = runpy.run_path(
-                            
'/srv/audits/retention/scripts/audit_files_perhost_config.py')['perhostcf']
-                        self.perhost_raw = open(
-                            
'/srv/audits/retention/scripts/audit_files_perhost_config.py').read()
-                    except:
-                        pass
-
-        if retention.utils.running_locally(self.hosts_expr):
-            self.set_up_perhost_rules()
-
-        if not retention.utils.running_locally(self.hosts_expr):
-            self.cdb = RuleStore(self.store_filepath)
-            self.cdb.store_db_init(self.expanded_hosts)
-            self.set_up_and_export_rule_store()
-        else:
-            self.cdb = None
-
-        self.show_ignored(Config.cf[self.locations])
-
-        self.today = time.time()
-        self.magic = retention.magic.magic_open(retention.magic.MAGIC_NONE)
-        self.magic.load()
-        self.summary = None
-        self.display_from_dict = FileInfo.display_from_dict
-        self.set_up_max_files(maxfiles)
-
-    def set_up_max_files(self, maxfiles):
-        '''
-        more than this many files in a subdir we won't process,
-        we'll just try to name top offenders
-
-        if we've been asked only to report dir trees that are
-        too large in this manner, we can set defaults much
-        higher, since we don't stat files, open them to guess
-        their filetype, etc; processing then goes much quicker
-        '''
-
-        if maxfiles is None:
-            if self.dirsizes:
-                self.MAX_FILES = 1000
-            else:
-                self.MAX_FILES = 100
-        else:
-            self.MAX_FILES = maxfiles
-
-    def set_up_and_export_rule_store(self):
-        hosts = self.cdb.store_db_list_all_hosts()
-        where_to_put = os.path.join(os.path.dirname(self.store_filepath),
-                                    "data_retention.d")
-        if not os.path.isdir(where_to_put):
-            os.makedirs(where_to_put, 0755)
-        for host in hosts:
-            nicepath = os.path.join(where_to_put, host + ".conf")
-            Rule.export_rules(self.cdb, nicepath, host)
-
-    def set_up_to_check(self):
-        '''
-        turn the to_check arg into lists of dirs and files to check
-        '''
-        if self.to_check is not None:
-            check_list = self.to_check.split(',')
-            self.filenames_to_check = [fname for fname in check_list
-                                       if not fname.startswith(os.sep)]
-            if not len(self.filenames_to_check):
-                self.filenames_to_check = None
-            self.dirs_to_check = [d.rstrip(os.path.sep) for d in check_list
-                                  if d.startswith(os.sep)]
-        else:
-            self.filenames_to_check = None
-            self.dirs_to_check = None
-
-    def set_up_perhost_rules(self):
-        self.perhost_rules_from_store = runpy.run_path(
-            '/srv/audits/retention/configs/%s_store.cf' % 
self.hostname)['rules']
-        self.perhost_rules_from_file = runpy.run_path(
-            '/srv/audits/retention/configs/allhosts_file.cf')['perhostcf']
-
-        if self.perhost_rules_from_store is not None:
-            self.add_perhost_rules_to_ignored()
-
-            if self.verbose:
-                print "INFO: rules received from remote: ",
-                print self.perhost_rules_from_store
-
-        if (self.perhost_rules_from_file is not None and
-            'ignored_dirs' in self.perhost_rules_from_file):
-            if '/' not in self.ignored['dirs']:
-                self.ignored['dirs']['/'] = []
-            if self.hostname in self.perhost_rules_from_file['ignored_dirs']:
-                for path in self.perhost_rules_from_file[
-                        'ignored_dirs'][self.hostname]:
-                    if path.startswith('/'):
-                        self.ignored['dirs']['/'].append(path)
-            if '*' in self.perhost_rules_from_file['ignored_dirs']:
-                for path in self.perhost_rules_from_file[
-                        'ignored_dirs'][self.hostname]:
-                    if path.startswith('/'):
-                        self.ignored['dirs']['/'].append(path)
-
-    def set_up_ignored(self):
-        '''
-        collect up initial list of files/dirs to skip during audit
-        '''
-        self.ignored = {}
-        self.ignored['files'] = Config.cf['ignored_files']
-        self.ignored['dirs'] = Config.cf['ignored_dirs']
-        self.ignored['prefixes'] = Config.cf['ignored_prefixes']
-        self.ignored['extensions'] = Config.cf['ignored_extensions']
-
-        if self.ignore_also is not None:
-            # silently skip paths that are not absolute
-            for path in self.ignore_also:
-                if path.startswith('/'):
-                    if path.endswith('/'):
-                        if '/' not in self.ignored['dirs']:
-                            self.ignored['dirs']['/'] = []
-                        self.ignored['dirs']['/'].append(path[:-1])
-                    else:
-                        if '/' not in self.ignored['files']:
-                            self.ignored['files']['/'] = []
-                        self.ignored['files']['/'].append(path)
-
-    def add_perhost_rules_to_ignored(self):
-        '''
-        add dirs/files to be skipped during audit based
-        on rules in the rule store db
-        '''
-        if '/' not in self.ignored['dirs']:
-            self.ignored['dirs']['/'] = []
-        if '/' not in self.ignored['files']:
-            self.ignored['files']['/'] = []
-        for host in self.perhost_rules_from_store:
-            if host == self.hostname:
-                for rule in self.perhost_rules_from_store[host]:
-                    path = os.path.join(rule['basedir'], rule['name'])
-                    if rule['status'] == 'good':
-                        if Rule.entrytype_to_text(rule['type']) == 'dir':
-                            if path not in self.ignored['dirs']['/']:
-                                self.ignored['dirs']['/'].append(path)
-                        elif Rule.entrytype_to_text(rule['type']) == 'file':
-                            if path not in self.ignored['files']['/']:
-                                self.ignored['files']['/'].append(path)
-                        else:
-                            # some other random type, don't care
-                            continue
-                break
-
-    def get_perhost_rules_as_json(self):
-        '''
-        this reads from the data_retention.d directory files for the minions
-        on which the audit will be run, converts each host's rules to json
-        strings, and returns a hash of rules where keys are the hostname and
-        values are the list of rules on that host
-        '''
-        where_to_get = os.path.join(os.path.dirname(self.store_filepath),
-                                    "data_retention.d")
-        if not os.path.isdir(where_to_get):
-            os.mkdir(where_to_get, 0755)
-        # really? or just read each file and be done with it?
-        # also I would like to check the syntax cause paranoid.
-        rules = {}
-        self.cdb = RuleStore(self.store_filepath)
-        self.cdb.store_db_init(self.expanded_hosts)
-        for host in self.expanded_hosts:
-            rules[host] = []
-            nicepath = os.path.join(where_to_get, host + ".conf")
-            if os.path.exists(nicepath):
-                dir_rules = None
-                try:
-                    text = open(nicepath)
-                    exec(text)
-                except:
-                    continue
-                if dir_rules is not None:
-                    for status in Status.status_cf:
-                        if status in dir_rules:
-                            for entry in dir_rules[status]:
-                                if entry[0] != os.path.sep:
-                                    print ("WARNING: relative path in rule,"
-                                           "skipping:", entry)
-                                    continue
-                                if entry[-1] == os.path.sep:
-                                    entry = entry[:-1]
-                                    entry_type = Rule.text_to_entrytype('dir')
-                                else:
-                                    entry_type = Rule.text_to_entrytype('file')
-                                rule = Rule.get_rule_as_json(
-                                    entry, entry_type, status)
-                                rules[host].append(rule)
-        return rules
-
-    def write_perhost_rules_normal_code(self, indent):
-        rules = self.get_perhost_rules_as_json()
-
-        for host in rules:
-            rulescode = "rules = {}\n\n"
-            rulescode += "rules['%s'] = [\n" % host
-            rulescode += (indent +
-                     (",\n%s" % (indent + indent)).join(rules[host]) + "\n")
-            rulescode += "]\n"
-
-            with open("/srv/salt/audits/retention/configs/%s_store.py" % host, 
"w+") as fp:
-                fp.write(rulescode)
-                fp.close()
-
-    def write_rules_for_minion(self):
-        indent = "    "
-        self.write_perhost_rules_normal_code(indent)
-        if self.perhost_raw is not None:
-            with open("/srv/salt/audits/retention/configs/allhosts_file.py", 
"w+") as fp:
-                fp.write(self.perhost_raw)
-                fp.close()
-
-    def generate_executor(self):
-        code = ("""
-def executor():
-    fa = FilesAuditor('localhost', '%s', False, %s, %s,
-                      False, %d, %s, %s, %d, %d, False)
-    fa.audit_hosts()
-""" %
-                (self.audit_type,
-                 self.show_sample_content,
-                 self.dirsizes,
-                 self.depth - 1,
-                 ('"%s"' % self.to_check
-                  if self.to_check is not None else "None"),
-                 ('"%s"' % ",".join(self.ignore_also)
-                  if self.ignore_also is not None else "None"),
-                 self.timeout,
-                 self.MAX_FILES))
-
-        self.write_rules_for_minion()
-
-        return code
-
-    def show_ignored(self, basedirs):
-        if self.verbose:
-            if not retention.utils.running_locally(self.hosts_expr):
-                sys.stderr.write(
-                    "INFO: The below does not include per-host rules\n")
-                sys.stderr.write(
-                    "INFO: or rules derived from the directory status 
entries.\n")
-
-            sys.stderr.write("INFO: Ignoring the following directories:\n")
-
-            for basedir in self.ignored['dirs']:
-                if basedir in basedirs or basedir == '*' or basedir == '/':
-                    sys.stderr.write(
-                        "INFO: " + ','.join(self.ignored['dirs'][basedir])
-                        + " in " + basedir + '\n')
-
-            sys.stderr.write("INFO: Ignoring the following files:\n")
-            for basedir in self.ignored['files']:
-                if basedir in basedirs or basedir == '*' or basedir == '/':
-                    sys.stderr.write(
-                        "INFO: " + ','.join(self.ignored['files'][basedir])
-                        + " in " + basedir + '\n')
-
-            sys.stderr.write(
-                "INFO: Ignoring files starting with the following:\n")
-            sys.stderr.write(
-                "INFO: " + ','.join(self.ignored['prefixes']) + '\n')
-
-            sys.stderr.write(
-                "INFO: Ignoring files ending with the following:\n")
-            for basedir in self.ignored['extensions']:
-                if basedir in basedirs or basedir == '*':
-                    sys.stderr.write("INFO: " + ','.join(
-                        self.ignored['extensions'][basedir])
-                        + " in " + basedir + '\n')
-
-    @staticmethod
-    def startswith(string_arg, list_arg):
-        '''
-        check if the string arg starts with any elt in
-        the list_arg
-        '''
-        for elt in list_arg:
-            if string_arg.startswith(elt):
-                return True
-        return False
-
-    def contains(self, string_arg, list_arg):
-        '''
-        check if the string arg contains any elt in
-        the list_arg
-        '''
-        for elt in list_arg:
-            if elt in string_arg:
-                return True
-        return False
-
-    @staticmethod
-    def endswith(string_arg, list_arg):
-        '''
-        check if the string arg ends with any elt in
-        the list_arg
-        '''
-        for elt in list_arg:
-            if string_arg.endswith(elt):
-                return True
-        return False
-
-    @staticmethod
-    def startswithpath(string_arg, list_arg):
-        '''
-        check if the string arg starts with any elt in
-        the list_arg and the next character, if any,
-        is the os dir separator
-        '''
-
-        for elt in list_arg:
-            if string_arg == elt or string_arg.startswith(elt + "/"):
-                return True
-        return False
-
-    @staticmethod
-    def subdir_check(dirname, directories):
-        '''
-        check if one of the directories listed is the
-        specified dirname or the dirname is somewhere in
-        a subtree of one of the listed directories,
-        returning True if so and False otherwise
-        '''
-
-        # fixme test this
-        # also see if this needs to replace dirtree_checkeverywhere or not
-        for dname in directories:
-            if dname == dirname or dirname.startswith(dname + "/"):
-                return True
-        return False
-
-    @staticmethod
-    def dirtree_check(dirname, directories):
-        '''
-        check if the dirname is either a directory at or above one of
-        the directories specified in the tree or vice versa, returning
-        True if so and False otherwise
-        '''
-
-        for dname in directories:
-            if dirname == dname or dirname.startswith(dname + "/"):
-                return True
-            if dname.startswith(dirname + "/"):
-                return True
-        return False
-
-    @staticmethod
-    def expand_ignored_dirs(basedir, ignored):
-        '''
-        find dirs to ignore relative to the specified
-        basedir, in Config entry.  Fall back to wildcard spec
-        if there is no entry for the basedir.  Dirs in
-        Config entry may have one * in the path, this
-        will be treated as a wildcard for the purposes
-        of checking directories against the entry.
-
-        args: absolute path of basedir being crawled
-              hash of ignored dirs, file, etc
-        returns: list of absolute paths of dirs to ignore,
-        plus separate list of absolute paths containing '*',
-        also to ignore, or the empty list if there are none
-        '''
-
-        dirs = []
-        wildcard_dirs = []
-
-        to_expand = []
-        if 'dirs' in ignored:
-            if '*' in ignored['dirs']:
-                to_expand.extend(ignored['dirs']['*'])
-
-            if '/' in ignored['dirs']:
-                to_expand.extend(ignored['dirs']['/'])
-
-            if basedir in ignored['dirs']:
-                to_expand.extend(ignored['dirs'][basedir])
-
-            for dname in to_expand:
-                if '*' in dname:
-                    wildcard_dirs.append(os.path.join(basedir, dname))
-                else:
-                    dirs.append(os.path.join(basedir, dname))
-
-        return dirs, wildcard_dirs
-
-    @staticmethod
-    def wildcard_matches(dirname, wildcard_dirs, exact=True):
-        '''given a list of absolute paths with exactly one '*'
-        in each entry, see if the passed dirname matches
-        any of the list entries'''
-        for dname in wildcard_dirs:
-            if len(dirname) + 1 < len(dname):
-                continue
-
-            left, right = dname.split('*', 1)
-            if dirname.startswith(left):
-                if dirname.endswith(right):
-                    return True
-                elif (not exact and
-                      dirname.rfind(right + "/", len(left)) != -1):
-                    return True
-                else:
-                    continue
-        return False
-
-    def normalize(self, fname):
-        '''
-        subclasses may want to do something different, see
-        LogsAuditor for an example
-        '''
-        return fname
-
-    @staticmethod
-    def file_is_ignored(fname, basedir, ignored):
-        '''
-        pass normalized name (abs path), basedir (location audited),
-        hash of ignored files, dirs, prefixes, extensions
-        get back True if the file is to be ignored and
-        False otherwise
-        '''
-
-        basename = os.path.basename(fname)
-
-        if 'prefixes' in ignored:
-            if FilesAuditor.startswith(basename, ignored['prefixes']):
-                return True
-
-        if 'extensions' in ignored:
-            if '*' in ignored['extensions']:
-                if FilesAuditor.endswith(basename, ignored['extensions']['*']):
-                    return True
-            if basedir in ignored['extensions']:
-                if FilesAuditor.endswith(
-                        basename, ignored['extensions'][basedir]):
-                    return True
-
-        if 'files' in ignored:
-            if basename in ignored['files']:
-                return True
-            if '*' in ignored['files']:
-                if FilesAuditor.endswith(basename, ignored['files']['*']):
-                    return True
-
-            if '/' in ignored['files']:
-                if fname in ignored['files']['/']:
-                    return True
-                if FilesAuditor.wildcard_matches(
-                        fname, [w for w in ignored['files']['/'] if '*' in w]):
-                    return True
-
-            if basedir in ignored['files']:
-                if FilesAuditor.endswith(basename, ignored['files'][basedir]):
-                    return True
-        return False
-
-    def file_is_wanted(self, fname, basedir):
-        '''
-        decide if we want to audit the specific file or not
-        (is it ignored, or in an ignored directory, or of a type
-        we skip)
-        args: fname - the abs path to the file / dir
-
-        returns True if wanted or False if not
-        '''
-        fname = self.normalize(fname)
-
-        if FilesAuditor.file_is_ignored(fname, basedir, self.ignored):
-            return False
-
-        if (self.filenames_to_check is not None and
-                fname not in self.filenames_to_check):
-            return False
-
-        return True
-
-    @staticmethod
-    def dir_is_ignored(dirname, ignored):
-        expanded_dirs, wildcard_dirs = FilesAuditor.expand_ignored_dirs(
-            os.path.dirname(dirname), ignored)
-        if dirname in expanded_dirs:
-            return True
-        if FilesAuditor.wildcard_matches(dirname, wildcard_dirs):
-            return True
-        return False
-
-    @staticmethod
-    def dir_is_wrong_type(dirname):
-        try:
-            dirstat = os.lstat(dirname)
-        except:
-            return True
-        if stat.S_ISLNK(dirstat.st_mode):
-            return True
-        if not stat.S_ISDIR(dirstat.st_mode):
-            return True
-        return False
-
-    def get_subdirs_to_do(self, dirname, dirname_depth, todo):
-
-        locale.setlocale(locale.LC_ALL, '')
-        if FilesAuditor.dir_is_ignored(dirname, self.ignored):
-            return todo
-        if FilesAuditor.dir_is_wrong_type(dirname):
-            return todo
-
-        if self.depth < dirname_depth:
-            return todo
-
-        if dirname_depth not in todo:
-            todo[dirname_depth] = []
-
-        if self.dirs_to_check is not None:
-            if FilesAuditor.subdir_check(dirname, self.dirs_to_check):
-                todo[dirname_depth].append(dirname)
-        else:
-            todo[dirname_depth].append(dirname)
-
-        if self.depth == dirname_depth:
-            # don't read below the depth level
-            return todo
-
-        dirs = [os.path.join(dirname, d)
-                for d in os.listdir(dirname)]
-        if self.dirs_to_check is not None:
-            dirs = [d for d in dirs if FilesAuditor.dirtree_check(
-                d, self.dirs_to_check)]
-
-        for dname in dirs:
-            todo = self.get_subdirs_to_do(dname, dirname_depth + 1, todo)
-        return todo
-
-    def get_dirs_to_do(self, dirname):
-        if (self.dirs_to_check is not None and
-                not FilesAuditor.dirtree_check(dirname, self.dirs_to_check)):
-            if self.verbose:
-                print 'WARNING: no dirs to do for', dirname
-            return {}
-
-        todo = {}
-        depth_of_dirname = dirname.count(os.path.sep)
-        todo = self.get_subdirs_to_do(dirname, depth_of_dirname, todo)
-        return todo
-
-    def process_files_from_path(self, location, base, files, count,
-                                results, checklink=True):
-        '''
-        arguments:
-            location: the location being checked
-            base: directory containing the files to be checked
-            files: files to be checked
-            count: number of files in result set so far for this location
-            results: the result set
-        '''
-
-        for fname, st in files:
-            path = os.path.join(base, fname)
-            if self.file_is_wanted(path, location):
-                count += 1
-                if count > self.MAX_FILES:
-                    if self.dirsizes:
-                        self.warn_dirsize(base)
-                    else:
-                        self.warn_too_many_files(base)
-                    return count
-                # for dirsizes option we don't collect or report files
-                if not self.dirsizes:
-                    results.append((path, st))
-        return count
-
-    def walk_nolinks(self, top):
-        '''replaces (and is stolen from) os.walk, checks for and skips
-        links, returns base, paths, files but it's guaranteed that
-        files really are regular files and base/paths are not symlinks
-        the files list is a list of filename, stat of that filename,
-        because we have to do the stat on it anyways to ensure it's a file
-        and not a dir, so the caller might as well get that info'''
-
-        try:
-            names = os.listdir(top)
-        except os.error, err:
-            return
-
-        dirs, files = [], []
-        for name in names:
-            try:
-                filestat = os.lstat(os.path.join(top, name))
-            except:
-                continue
-            if stat.S_ISLNK(filestat.st_mode):
-                continue
-            if stat.S_ISDIR(filestat.st_mode):
-                dirs.append(name)
-            elif stat.S_ISREG(filestat.st_mode):
-                files.append((name, filestat))
-            else:
-                continue
-
-        yield top, dirs, files
-
-        for name in dirs:
-            new_path = os.path.join(top, name)
-            for x in self.walk_nolinks(new_path):
-                yield x
-
-    def process_one_dir(self, location, subdirpath, depth, results):
-        '''
-        arguments:
-            location: the location being checked
-            subdirpath: the path to the subdirectory being checked
-            depth: the depth of the directory being checked (starting at 1)
-            results: the result set
-        '''
-        if self.dirs_to_check is not None:
-            if not FilesAuditor.dirtree_check(subdirpath, self.dirs_to_check):
-                return
-
-        if FilesAuditor.dir_is_ignored(subdirpath, self.ignored):
-            return True
-
-        count = 0
-
-        if self.verbose:
-            print "INFO: collecting files in", subdirpath
-        # doing a directory higher up in the tree than our depth cutoff,
-        # only do the files in it, because we have the full list of dirs
-        # up to our cutoff we do them one by one
-        if depth < self.depth:
-            filenames = os.listdir(subdirpath)
-            files = []
-            for fname in filenames:
-                try:
-                    filestat = os.stat(os.path.join(subdirpath, fname))
-                except:
-                    continue
-                if (not stat.S_ISLNK(filestat.st_mode) and
-                    stat.S_ISREG(filestat.st_mode)):
-                    files.append((fname, filestat))
-            self.process_files_from_path(location, subdirpath,
-                                         files, count, results)
-            return
-
-        # doing a directory at our cutoff depth, walk it,
-        # because anything below the depth
-        # cutoff won't be in our list
-        temp_results = []
-        for base, paths, files in self.walk_nolinks(subdirpath):
-            expanded_dirs, wildcard_dirs = FilesAuditor.expand_ignored_dirs(
-                base, self.ignored)
-            if self.dirs_to_check is not None:
-                paths[:] = [p for p in paths
-                            if FilesAuditor.dirtree_check(os.path.join(base, p),
-                                                          self.dirs_to_check)]
-            paths[:] = [p for p in paths if
-                        (not FilesAuditor.startswithpath(os.path.join(
-                            base, p), expanded_dirs) and
-                         not FilesAuditor.wildcard_matches(os.path.join(
-                             base, p), wildcard_dirs, exact=False))]
-            count = self.process_files_from_path(location, base, files,
-                                                 count, temp_results,
-                                                 checklink=False)
-            if count > self.MAX_FILES:
-                return
-
-        results.extend(temp_results)
-
-    def find_all_files(self):
-        results = []
-        for location in Config.cf[self.locations]:
-            dirs_to_do = self.get_dirs_to_do(location)
-            if self.verbose:
-                print "for location", location, "doing dirs", dirs_to_do
-            if location.count(os.path.sep) >= self.depth + 1:
-                # do the run at least once
-                upper_end = location.count(os.path.sep) + 1
-            else:
-                upper_end = self.depth + 1
-            for depth in range(location.count(os.path.sep), upper_end):
-                if depth in dirs_to_do:
-                    for dname in dirs_to_do[depth]:
-                        self.process_one_dir(location, dname, depth, results)
-        return results
-
-    @staticmethod
-    def get_open_files():
-        '''
-        scrounge /proc/nnn/fd and collect all open files
-        '''
-        open_files = set()
-        dirs = os.listdir("/proc")
-        for dname in dirs:
-            if not re.match('^[0-9]+$', dname):
-                continue
-            try:
-                links = os.listdir(os.path.join("/proc", dname, "fd"))
-            except:
-                # process may have gone away
-                continue
-            # must follow sym link for all of these, yuck
-            files = set()
-            for link in links:
-                try:
-                    files.add(os.readlink(os.path.join("/proc", dname,
-                                                       "fd", link)))
-                except:
-                    continue
-            open_files |= files
-        return open_files
-
-    def warn_too_many_files(self, path=None):
-        print "WARNING: too many files to audit",
-        if path is not None:
-            fields = path.split(os.path.sep)
-            print "in directory %s" % os.path.sep.join(fields[:self.depth + 1])
-
-    def warn_dirsize(self, path):
-        fields = path.split(os.path.sep)
-        print ("WARNING: directory %s has more than %d files"
-               % (os.path.sep.join(fields[:self.depth + 1]), self.MAX_FILES))
-
-    @staticmethod
-    def get_dirname_from_warning(warning):
-        '''
-        some audit output lines warn about directory trees
-        having too many files to audit; grab the dirname
-        out of such a line and return it
-        '''
-        start = "WARNING: directory "
-        if warning.startswith(start):
-            # WARNING: directory %s has more than %d files
-            rindex = warning.rfind(" has more than")
-            if not rindex:
-                return None
-            else:
-                return warning[len(start):rindex]
-
-        start = "WARNING: too many files to audit in directory "
-        if warning.startswith(start):
-            return warning[len(start):]
-
-        return None
-
-    def do_local_audit(self):
-        open_files = FilesAuditor.get_open_files()
-
-        all_files = {}
-        files = self.find_all_files()
-
-        for (f, st) in files:
-            all_files[f] = FileInfo(f, self.magic, st)
-            all_files[f].load_file_info(self.today, self.cutoff, open_files)
-
-        all_files_sorted = sorted(all_files, key=lambda f: all_files[f].path)
-        result = []
-
-        if all_files:
-            max_name_length = max([len(all_files[fname].path)
-                                   for fname in all_files]) + 2
-
-        for fname in all_files_sorted:
-            if (not self.contains(all_files[fname].filetype,
-                                  Config.cf['ignored_types'])
-                    and not all_files[fname].is_empty):
-                result.append(all_files[fname].format_output(
-                    self.show_sample_content,
-                    False if self.show_summary else self.prettyprint,
-                    max_name_length))
-        output = "\n".join(result) + "\n"
-        if self.show_summary:
-            self.display_summary({self.hosts_expr: output})
-        else:
-            print output
-        return output
-
-    def add_stats(self, item, summary):
-        '''
-        gather stats on how many files/dirs
-        may be problematic; summary is where the results
-        are collected, item is the item to include in
-        the summary if needed
-        '''
-        dirname = os.path.dirname(item['path'])
-
-        if dirname not in summary:
-            summary[dirname] = {
-                'binary': {'old': 0, 'maybe_old': 0, 'nonroot': 0},
-                'text': {'old': 0, 'maybe_old': 0, 'nonroot': 0}
-            }
-        if item['binary'] is True:
-            group = 'binary'
-        else:
-            group = 'text'
-
-        if item['old'] == 'T':
-            summary[dirname][group]['old'] += 1
-        elif item['old'] == '-':
-            summary[dirname][group]['maybe_old'] += 1
-        if item['owner'] != 0:
-            summary[dirname][group]['nonroot'] += 1
-        return summary
-
-    def display_host_summary(self):
-        if self.summary is not None:
-            paths = sorted(self.summary.keys())
-            for path in paths:
-                for group in self.summary[path]:
-                    if (self.summary[path][group]['old'] > 0 or
-                            self.summary[path][group]['maybe_old'] > 0 or
-                            self.summary[path][group]['nonroot'] > 0):
-                        print ("in directory %s, (%s), %d old,"
-                               " %d maybe old, %d with non root owner"
-                               % (path, group, self.summary[path][group]['old'],
-                                  self.summary[path][group]['maybe_old'],
-                                  self.summary[path][group]['nonroot']))
-
-    def display_summary(self, result):
-        for host in result:
-            self.summary = {}
-            print "host:", host
-
-            if result[host]:
-                self.summary = {}
-                try:
-                    lines = result[host].split('\n')
-                    for line in lines:
-                        if line == '':
-                            continue
-                        if (line.startswith("WARNING:") or
-                                line.startswith("INFO:")):
-                            print line
-                            continue
-                        else:
-                            try:
-                                item = json.loads(
-                                    line, object_hook=JsonHelper.decode_dict)
-                                if item['empty'] is not True:
-                                    self.add_stats(item, self.summary)
-                            except:
-                                print "WARNING: failed to json load from host",
-                                print host, "this line:", line
-                    self.display_host_summary()
-                except:
-                    print "WARNING: failed to process output from host"
-            else:
-                if self.verbose:
-                    print "WARNING: no output from host", host
-
-    def display_remote_host(self, result):
-        try:
-            lines = result.split('\n')
-            files = []
-            for line in lines:
-                if line == "":
-                    continue
-                elif line.startswith("WARNING:") or line.startswith("INFO:"):
-                    print line
-                else:
-                    files.append(json.loads(line, object_hook=JsonHelper.decode_dict))
-
-            if files == []:
-                return
-            path_justify = max([len(finfo['path']) for finfo in files]) + 2
-            for finfo in files:
-                self.display_from_dict(finfo, self.show_sample_content, path_justify)
-        except:
-            print "WARNING: failed to load json from host"
-
-    def audit_hosts(self):
-        if retention.utils.running_locally(self.hosts_expr):
-            result = self.do_local_audit()
-        else:
-            result = self.runner.run_remotely()
-            if result is None:
-                print "WARNING: failed to get output from audit script on any host"
-            elif self.show_summary:
-                self.display_summary(result)
-            else:
-                for host in result:
-                    print "host:", host
-                    if result[host]:
-                        self.display_remote_host(result[host])
-                    else:
-                        if self.verbose:
-                            print "no output from host", host
-            # add some results to rule store
-            self.update_status_rules_from_report(result)
-        return result, self.ignored
-
-    def update_status_rules_from_report(self, report):
-        hostlist = report.keys()
-        for host in hostlist:
-            try:
-                problem_rules = Rule.get_rules(self.cdb, host, Status.text_to_status('problem'))
-            except:
-                print 'WARNING: problem retrieving problem rules for host', host
-                problem_rules = None
-            if problem_rules is not None:
-                existing_problems = [rule['path'] for rule in problem_rules]
-            else:
-                existing_problems = []
-
-            dirs_problem, dirs_skipped = get_dirs_toexamine(report[host])
-            if dirs_problem is not None:
-                dirs_problem = list(set(dirs_problem))
-                for dirname in dirs_problem:
-                    Rule.do_add_rule(self.cdb, dirname,
-                                     Rule.text_to_entrytype('dir'),
-                                     Status.text_to_status('problem'), host)
-
-            if dirs_skipped is not None:
-                dirs_skipped = list(set(dirs_skipped))
-                for dirname in dirs_skipped:
-                    if dirname in dirs_problem or dirname in existing_problems:
-                        # problem report overrides 'too many to audit'
-                        continue
-                    Rule.do_add_rule(self.cdb, dirname,
-                                     Rule.text_to_entrytype('dir'),
-                                     Status.text_to_status('unreviewed'), host)
-
-
-class LogsAuditor(FilesAuditor):
-    def __init__(self, hosts_expr, audit_type, prettyprint=False,
-                 oldest=False,
-                 show_content=False, show_system_logs=False,
-                 dirsizes=False, summary_report=False, depth=2,
-                 to_check=None, ignore_also=None,
-                 timeout=60, maxfiles=None, store_filepath=None,
-                 verbose=False):
-        super(LogsAuditor, self).__init__(hosts_expr, audit_type, prettyprint,
-                                          show_content, dirsizes,
-                                          summary_report, depth,
-                                          to_check, ignore_also, timeout,
-                                          maxfiles, store_filepath, verbose)
-        self.oldest_only = oldest
-        self.show_system_logs = show_system_logs
-        if self.show_system_logs:
-            self.ignored['files'].pop("/var/log")
-        self.display_from_dict = LogInfo.display_from_dict
-
-    def generate_executor(self):
-        code = ("""
-def executor():
-    la = LogsAuditor('localhost', '%s', False, %s, %s, %s, %s,
-                     False, %d, %s, %s, %d, %d, False)
-    la.audit_hosts()
-""" %
-                (self.audit_type,
-                 self.oldest_only,
-                 self.show_sample_content,
-                 self.dirsizes,
-                 self.show_system_logs,
-                 self.depth - 1,
-                 ('"%s"' % self.to_check
-                  if self.to_check is not None else "None"),
-                 ('"%s"' % ",".join(self.ignore_also)
-                  if self.ignore_also is not None else "None"),
-                 self.timeout, self.MAX_FILES))
-
-        self.write_rules_for_minion()
-
-        return code
-
-    @staticmethod
-    def get_rotated_freq(rotated):
-        '''
-        turn the value you get out of logrotate
-        conf files for 'rotated' into a one
-        char string suitable for our reports
-        '''
-        if rotated == 'weekly':
-            freq = 'w'
-        elif rotated == 'daily':
-            freq = 'd'
-        elif rotated == 'monthly':
-            freq = 'm'
-        elif rotated == 'yearly':
-            freq = 'y'
-        else:
-            freq = None
-        return freq
-
-    @staticmethod
-    def get_rotated_keep(line):
-        fields = line.split()
-        if len(fields) == 2:
-            keep = fields[1]
-        else:
-            keep = None
-        return keep
-
-    @staticmethod
-    def parse_logrotate_contents(contents,
-                                 default_freq='-', default_keep='-'):
-        lines = contents.split('\n')
-        state = 'want_lbracket'
-        logs = {}
-        freq = default_freq
-        keep = default_keep
-        notifempty = '-'
-        log_group = []
-        for line in lines:
-            if line.startswith('#'):
-                continue
-            line = line.strip()
-            if not line:
-                continue
-            if state == 'want_lbracket':
-                if line.endswith('{'):
-                    state = 'want_rbracket'
-                    line = line[:-1].strip()
-                    if not line:
-                        continue
-                if not line.startswith('/'):
-                    # probably a directive or a blank line
-                    continue
-                if '*' in line:
-                    log_group.extend(glob.glob(
-                        os.path.join(Config.cf['rotate_basedir'], line)))
-                else:
-                    log_group.append(line)
-            elif state == 'want_rbracket':
-                tmp_freq = LogsAuditor.get_rotated_freq(line)
-                if tmp_freq:
-                    freq = tmp_freq
-                    continue
-                elif line.startswith('rotate'):
-                    tmp_keep = LogsAuditor.get_rotated_keep(line)
-                    if tmp_keep:
-                        keep = tmp_keep
-                elif line == 'notifempty':
-                    notifempty = 'T'
-                elif line.endswith('}'):
-                    state = 'want_lbracket'
-                    for log in log_group:
-                        logs[log] = [freq, keep, notifempty]
-                    freq = default_freq
-                    keep = default_keep
-                    notifempty = '-'
-                    log_group = []
-        return logs
-
-    def get_logrotate_defaults(self):
-        contents = open(Config.cf['rotate_mainconf']).read()
-        lines = contents.split('\n')
-        skip = False
-        freq = '-'
-        keep = '-'
-        for line in lines:
-            line = line.strip()
-            if not line:
-                continue
-            if line.endswith('{'):
-                skip = True
-                continue
-            elif line.endswith('}'):
-                skip = False
-                continue
-            elif skip:
-                continue
-            tmp_freq = LogsAuditor.get_rotated_freq(line)
-            if tmp_freq:
-                freq = tmp_freq
-                continue
-            elif line.startswith('rotate'):
-                tmp_keep = LogsAuditor.get_rotated_keep(line)
-                if tmp_keep:
-                    keep = tmp_keep
-
-        return freq, keep
-
-    def find_rotated_logs(self):
-        '''
-        gather all names of log files from logrotate
-        config files
-        '''
-        rotated_logs = {}
-        default_freq, default_keep = self.get_logrotate_defaults()
-        rotated_logs.update(LogsAuditor.parse_logrotate_contents(
-            open(Config.cf['rotate_mainconf']).read(),
-            default_freq, default_keep))
-        for fname in os.listdir(Config.cf['rotate_basedir']):
-            pathname = os.path.join(Config.cf['rotate_basedir'], fname)
-            if os.path.isfile(pathname):
-                rotated_logs.update(LogsAuditor.parse_logrotate_contents(
-                    open(pathname).read(), default_freq, default_keep))
-        return rotated_logs
-
-    def check_mysqlconf(self):
-        '''
-        check how long mysql logs are kept around
-        '''
-        # note that I also see my.cnf.s3 and we don't check those (yet)
-        output = ''
-        for filename in Config.cf['mysqlconf']:
-            found = False
-            try:
-                contents = open(filename).read()
-            except:
-                # file or directory probably doesn't exist
-                continue
-            lines = contents.split('\n')
-            for line in lines:
-                line = line.strip()
-                if not line:
-                    continue
-                if line.startswith('datadir'):
-                    fields = line.split('=',1)
-                    fields = [field.strip() for field in fields]
-                    if fields[0] != 'datadir':
-                        continue
-                    if not fields[1].startswith('/'):
-                        continue
-                    datadir = fields[1]
-                    # strip trailing slash if needed
-                    if len(datadir) > 1 and datadir.endswith('/'):
-                        datadir = datadir[:-1]
-                    # we can skip all bin logs, relay logs, and pid files in this
-                    # directory. anything else should get looked at.
-                    if '.' in self.hostname:
-                        hostname = self.hostname.split('.')[0]
-                    else:
-                        hostname = self.hostname
-                    ignore_these = [hostname + '-bin', hostname + '-relay-bin',
-                                    hostname + '.pid', hostname + '-bin.index',
-                                    hostname + '-relay-bin.index']
-
-                    # add these files to ignore list; a one line report on
-                    # mysql log expiry configuration is sufficient
-                    if datadir not in self.ignored['files']:
-                        self.ignored['files'][datadir] = ignore_these
-                    else:
-                        self.ignored['files'][datadir].extend(ignore_these)
-                    # skip the subdirectories in here, they will be full of mysql dbs
-                    if datadir not in self.ignored['dirs']:
-                        self.ignored['files'][datadir] = ['*']
-                    else:
-                        self.ignored['files'][datadir].append('*')
-
-                if line.startswith('expire_logs_days'):
-                    fields = line.split('=',1)
-                    fields = [field.strip() for field in fields]
-                    if fields[0] != 'expire_logs_days':
-                        continue
-                    if not fields[1].isdigit():
-                        continue
-                    found = True
-                    if int(fields[1]) > Config.cf['cutoff']/86400:
-                        if output:
-                            output = output + '\n'
-                        output = output + ('WARNING: some mysql logs expired after %s days in %s'
-                                           % (fields[1], filename))
-            if not found:
-                if output:
-                    output = output + '\n'
-                output = output + 'WARNING: some mysql logs never expired in ' + filename
-        return(output)
-
-    def do_local_audit(self):
-        '''
-        note that no summary report is done for a  single host,
-        for logs we summarize across hosts
-        '''
-        mysql_issues = self.check_mysqlconf()
-        result = []
-        if mysql_issues:
-            result.append(mysql_issues)
-
-        open_files = FilesAuditor.get_open_files()
-        rotated = self.find_rotated_logs()
-
-        all_files = {}
-        files = self.find_all_files()
-
-        for (f, st) in files:
-            all_files[f] = LogInfo(f, self.magic, st)
-            all_files[f].load_file_info(self.today, self.cutoff,
-                                        open_files, rotated)
-
-        all_files_sorted = sorted(all_files,
-                                  key=lambda f: all_files[f].path)
-        last_log_normalized = ''
-        last_log = ''
-        age = 0
-
-        if all_files:
-            max_name_length = max([len(all_files[fname].path)
-                                   for fname in all_files]) + 2
-            max_norm_length = max([len(all_files[fname].normalized)
-                                   for fname in all_files]) + 2
-
-        for fname in all_files_sorted:
-            if self.contains(all_files[fname].filetype,
-                             Config.cf['ignored_types']):
-                continue
-
-            if (self.oldest_only and
-                all_files[fname].normalized == last_log_normalized):
-                # still doing the same group of logs
-                if all_files[fname].age <= age:
-                    continue
-                else:
-                    age = all_files[fname].age
-                    last_log = fname
-            else:
-                if last_log:
-                    result.append(all_files[last_log].format_output(
-                        self.show_sample_content,
-                        self.prettyprint, max_name_length, max_norm_length))
-
-                # starting new set of logs (maybe first set)
-                last_log_normalized = all_files[fname].normalized
-                last_log = fname
-                age = all_files[fname].age
-
-        if last_log:
-            result.append(all_files[last_log].format_output(
-                self.show_sample_content,
-                self.prettyprint, max_name_length, max_norm_length))
-        output = "\n".join(result) + "\n"
-        print output
-        return output
-
-    def display_summary(self, audit_results):
-        logs = {}
-        hosts_count = 0
-        all_hosts = audit_results.keys()
-        hosts_count = len(all_hosts)
-
-        for host in all_hosts:
-            output = None
-            if audit_results[host]:
-                try:
-                    lines = audit_results[host].split('\n')
-                    output = []
-                    for line in lines:
-                        if line == "":
-                            continue
-                        elif (line.startswith("WARNING:") or
-                              line.startswith("INFO:")):
-                            print 'host:', host
-                            print line
-                            continue
-                        output.append(json.loads(
-                            line, object_hook=JsonHelper.decode_dict))
-                except:
-                    if output is not None:
-                        print output
-                    else:
-                        print audit_results[host]
-                    print "WARNING: failed to load json from host", host
-                    continue
-            if output is None:
-                continue
-            for item in output:
-                log_name = item['normalized']
-                if not item['normalized'] in logs:
-                    logs[log_name] = {}
-                    logs[log_name]['old'] = set()
-                    logs[log_name]['maybe_old'] = set()
-                    logs[log_name]['unrot'] = set()
-                    logs[log_name]['notifempty'] = set()
-                if item['old'] == 'T':
-                    logs[log_name]['old'].add(host)
-                elif item['old'] == '-':
-                    logs[log_name]['maybe_old'].add(host)
-                if item['rotated'].startswith('F'):
-                    logs[log_name]['unrot'].add(host)
-                if item['notifempty'] == 'T':
-                    logs[log_name]['notifempty'].add(host)
-        sorted_lognames = sorted(logs.keys())
-        for logname in sorted_lognames:
-            old_count = len(logs[logname]['old'])
-            if not old_count:
-                maybe_old_count = len(logs[logname]['maybe_old'])
-            else:
-                maybe_old_count = 0  # we don't care about possibles now
-            unrot_count = len(logs[logname]['unrot'])
-            notifempty_count = len(logs[logname]['notifempty'])
-            LogsAuditor.display_variance_info(old_count, hosts_count,
-                                              logs[logname]['old'],
-                                              'old', logname)
-            LogsAuditor.display_variance_info(maybe_old_count, hosts_count,
-                                              logs[logname]['maybe_old'],
-                                              'maybe old', logname)
-            LogsAuditor.display_variance_info(unrot_count, hosts_count,
-                                              logs[logname]['unrot'],
-                                              'unrotated', logname)
-            LogsAuditor.display_variance_info(notifempty_count, hosts_count,
-                                              logs[logname]['notifempty'],
-                                              'notifempty', logname)
-
-    @staticmethod
-    def display_variance_info(stat_count, hosts_count,
-                              host_list, stat_name, logname):
-        '''
-        assuming most stats are going to be the same across
-        a group of hosts, try to show just the variances
-        from the norm
-        '''
-        if stat_count == 0:
-            return
-
-        percentage = stat_count * 100 / float(hosts_count)
-
-        if stat_count == 1:
-            output_line = ("1 host has %s as %s" %
-                           (logname, stat_name))
-        else:
-            output_line = ("%s (%.2f%%) hosts have %s as %s" %
-                           (stat_count, percentage,
-                            logname, stat_name))
-
-        if percentage < .20 or stat_count < 6:
-            output_line += ': ' + ','.join(host_list)
-
-        print output_line
-
-    def normalize(self, fname):
-        return LogUtils.normalize(fname)
-
-    def display_remote_host(self, result):
-        '''
-        given the (json) output from the salt run on the remote
-        host, format it nicely and display it
-        '''
-        try:
-            lines = result.split('\n')
-            files = []
-            for line in lines:
-                if line == "":
-                    continue
-                elif line.startswith("WARNING:") or line.startswith("INFO:"):
-                    print line
-                else:
-                    files.append(json.loads(
-                        line, object_hook=JsonHelper.decode_dict))
-
-            if files == []:
-                return
-            path_justify = max([len(finfo['path']) for finfo in files]) + 2
-            norm_justify = max([len(finfo['normalized']) for finfo in files]) + 2
-            for finfo in files:
-                self.display_from_dict(finfo, self.show_sample_content,
-                                       path_justify, norm_justify)
-        except:
-            print "WARNING: failed to load json from host:", result
-
-
-class HomesAuditor(FilesAuditor):
-    '''
-    auditing of home directories on a set of hosts
-
-    users may have a local '.data_retention' file in their
-    home directories with a list, on entry per line, of files
-    or directories (dirs must end in '/') to skip during the audit
-    '''
-
-    def __init__(self, hosts_expr, audit_type, prettyprint=False,
-                 show_content=False, dirsizes=False, summary_report=False,
-                 depth=2, to_check=None, ignore_also=None, timeout=60,
-                 maxfiles=None, store_filepath=None, verbose=False):
-        '''
-        see FilesAuditor for the arguments to the constructor
-        '''
-        super(HomesAuditor, self).__init__(hosts_expr, audit_type, prettyprint,
-                                           show_content, dirsizes,
-                                           summary_report, depth,
-                                           to_check, ignore_also, timeout,
-                                           maxfiles, store_filepath, verbose)
-        self.homes_owners = {}
-
-        local_ignores = HomesAuditor.get_local_ignores(self.locations)
-        local_ignored_dirs, local_ignored_files = HomesAuditor.process_local_ignores(
-            local_ignores, self.ignored)
-        self.show_local_ignores(local_ignored_dirs, local_ignored_files)
-
-    @staticmethod
-    def process_local_ignores(local_ignores, ignored):
-        '''
-        files or dirs listed in data retention conf in homedir
-        are considered 'good' and added to ignore list
-
-        non-absolute paths will be taken as relative to the
-        home dir of the data retention config they were
-        read from
-        '''
-
-        local_ignored_dirs = []
-        local_ignored_files = []
-        for basedir in local_ignores:
-            for path in local_ignores[basedir]:
-                if not path.startswith('/'):
-                    path = os.path.join(basedir, path)
-
-                if path.endswith('/'):
-                    if 'dirs' not in ignored:
-                        ignored['dirs'] = {}
-                    if '/' not in ignored['dirs']:
-                        ignored['dirs']['/'] = []
-
-                    ignored['dirs']['/'].append(path[:-1])
-                    local_ignored_dirs.append(path[:-1])
-                else:
-                    if 'files' not in ignored:
-                        ignored['files'] = {}
-                    if '/' not in ignored['files']:
-                        ignored['files']['/'] = []
-
-                    ignored['files']['/'].append(path)
-                    local_ignored_files.append(path)
-        return local_ignored_dirs, local_ignored_files
-
-    def show_local_ignores(self, dirs, files):
-        '''
-        display a list of files and directories being ignored
-        during the audit; pass these lists in as arguments
-        '''
-        if self.verbose:
-            if len(dirs):
-                sys.stderr.write("INFO: Ignoring the following directories:\n")
-                sys.stderr.write(", ".join(dirs) + "\n")
-
-            if len(files):
-                sys.stderr.write("INFO: Ignoring the following files:\n")
-                sys.stderr.write(", ".join(files) + "\n")
-
-    @staticmethod
-    def get_home_dirs(locations):
-        '''
-        get a list of home directories where the root location(s) for home are
-        specified in the Config class (see 'home_locations'), by reading
-        these root location dirs and grabbing all subdirectory names from them
-        '''
-        home_dirs = []
-
-        for location in Config.cf[locations]:
-            if not os.path.isdir(location):
-                continue
-            home_dirs.extend([os.path.join(location, d)
-                              for d in os.listdir(location)
-                              if os.path.isdir(os.path.join(location, d))])
-        return home_dirs
-
-    @staticmethod
-    def get_local_ignores(locations):
-        '''
-        read a list of absolute paths from /home/blah/.data_retention
-        for all blah.  Dirs are specified by op sep at the end ('/')
-        and files without.
-        '''
-        local_ignores = {}
-        home_dirs = HomesAuditor.get_home_dirs(locations)
-        for hdir in home_dirs:
-            local_ignores[hdir] = []
-            if os.path.exists(os.path.join(hdir, ".data_retention")):
-                try:
-                    filep = open(os.path.join(hdir, ".data_retention"))
-                    entries = filep.read().split("\n")
-                    filep.close()
-                except:
-                    pass
-                entries = filter(None, [e.strip() for e in entries])
-                # fixme should sanity check these? ???
-                # what happens if people put wildcards in the wrong
-                # component, or put utter garbage in there, or...?
-                local_ignores[hdir].extend(entries)
-
-        return local_ignores
-
-    def generate_executor(self):
-        code = ("""
-def executor():
-    ha = HomesAuditor('localhost', '%s', False, %s, %s, False,
-                      %d, %s, %s, %d, %d, False)
-    ha.audit_hosts()
-""" %
-                (self.audit_type,
-                 self.show_sample_content,
-                 self.dirsizes,
-                 self.depth - 1,
-                 ('"%s"' % self.to_check
-                  if self.to_check is not None else "None"),
-                 ('"%s"' % ",".join(self.ignore_also)
-                  if self.ignore_also is not None else "None"),
-                 self.timeout,
-                 self.MAX_FILES))
-
-        self.write_rules_for_minion()
-
-        return code
-
-    def display_host_summary(self):
-        '''
-        instead of a detailed report with oe entry per file
-        that may be problematic, display a summary for each homedir
-        on a host
-        '''
-        if self.summary is not None:
-            paths = sorted(self.summary.keys())
-            for path in paths:
-                for group in self.summary[path]:
-                    if (self.summary[path][group]['old'] > 0 or
-                            self.summary[path][group]['maybe_old'] > 0 or
-                            self.summary[path][group]['odd_owner'] > 0):
-                        print ("in directory %s, (%s), %d old,"
-                               " %d maybe old, %d with odd owner"
-                               % (path, group,
-                                  self.summary[path][group]['old'],
-                                  self.summary[path][group]['maybe_old'],
-                                  self.summary[path][group]['odd_owner']))
-
-    def add_stats(self, item, summary):
-        '''
-        gather stats on how many files/dirs
-        may be problematic; summary is where the results
-        are collected, item is the item to include in
-        the summary if needed
-        '''
-        dirname = os.path.dirname(item['path'])
-
-        if dirname not in summary:
-            summary[dirname] = {
-                'binary': {'old': 0, 'maybe_old': 0, 'odd_owner': 0},
-                'text': {'old': 0, 'maybe_old': 0, 'odd_owner': 0}
-            }
-        if item['binary'] is True:
-            group = 'binary'
-        else:
-            group = 'text'
-
-        if item['old'] == 'T':
-            summary[dirname][group]['old'] += 1
-        elif item['old'] == '-':
-            summary[dirname][group]['maybe_old'] += 1
-
-        if not item['path'].startswith('/home/'):
-            return
-
-        empty, home, user, rest = item['path'].split(os.path.sep, 3)
-        home_dir = os.path.join(os.path.sep, home, user)
-        if home_dir not in self.homes_owners:
-            try:
-                dirstat = os.stat(home_dir)
-            except:
-                return
-            self.homes_owners[home_dir] = str(dirstat.st_uid)
-
-        if item['owner'] != self.homes_owners[home_dir]:
-            summary[dirname][group]['odd_owner'] += 1
-
-
diff --git a/dataretention/retention/cli.py b/dataretention/retention/cli.py
index 6f87093..df474b7 100644
--- a/dataretention/retention/cli.py
+++ b/dataretention/retention/cli.py
@@ -12,14 +12,15 @@
 
 from retention.status import Status
 from retention.rule import Rule, RuleStore
-import retention.auditor
-from retention.auditor import FilesAuditor, HomesAuditor, LogsAuditor
+import retention.remotefileauditor
+from retention.localhomeaudit import LocalHomesAuditor
+from retention.locallogaudit import LocalLogsAuditor
 from retention.fileinfo import FileInfo
 import retention.utils
 from retention.utils import JsonHelper
-from retention.runner import Runner
 from retention.config import Config
 from retention.examiner import DirExaminer, FileExaminer
+import retention.fileutils
 
 class LocalIgnores(object):
     '''
@@ -44,7 +45,7 @@
         local_ignores = {}
 
         if retention.utils.running_locally(self.host):
-            local_ignores = HomesAuditor.get_local_ignores(self.locations)
+            local_ignores = LocalHomesAuditor.get_local_ignores(self.locations)
             output = json.dumps(local_ignores)
             print output
         else:
@@ -164,8 +165,8 @@
                         host in self.perhost_rules_from_file['ignored_files']):
                     for path in 
self.perhost_rules_from_file['ignored_files'][host]:
                         if (path.startswith('/') and
-                            path not in self.perhost_ignores_from_rules[
-                                host]['files']['/']):
+                                path not in self.perhost_ignores_from_rules[
+                                    host]['files']['/']):
                             
self.perhost_ignores_from_rules[host]['files']['/'].append(path)
 
     def get_perhostcf_from_file(self):
@@ -361,17 +362,17 @@
 
     def do_one_host(self, host, report):
         self.set_host(host)
-        if not Runner.running_locally(self.host):
+        if not retention.utils.running_locally(self.host):
             self.get_perhost_ignores_from_rules([host])
 
-        if Runner.running_locally(self.host):
-            self.dirs_problem, self.dirs_skipped = 
retention.auditor.get_dirs_toexamine(report)
+        if retention.utils.running_locally(self.host):
+            self.dirs_problem, self.dirs_skipped = 
retention.remotefileauditor.get_dirs_toexamine(report)
         else:
             if host not in report:
                 self.dirs_problem = None
                 self.dirs_skipped = None
             else:
-                self.dirs_problem, self.dirs_skipped = 
retention.auditor.get_dirs_toexamine(report[host])
+                self.dirs_problem, self.dirs_skipped = 
retention.remotefileauditor.get_dirs_toexamine(report[host])
         if self.dirs_problem is None and self.dirs_skipped is None:
             print "No report available from this host"
         elif len(self.dirs_problem) == 0 and len(self.dirs_skipped) == 0:
@@ -379,7 +380,7 @@
         else:
             dirs_problem_to_depth = [CommandLine.get_path_prefix(
                 d, self.max_depth_top_level)
-                for d in self.dirs_problem]
+                                     for d in self.dirs_problem]
             dirs_skipped = [s for s in self.dirs_skipped
                             if s not in dirs_problem_to_depth]
             relevant_dirs = (sorted(list(set(dirs_problem_to_depth)))
@@ -406,7 +407,7 @@
         add/update rules for those dirs and files
         '''
         self.ignored = ignored
-        if Runner.running_locally(self.hosts_expr):
+        if retention.utils.running_locally(self.hosts_expr):
             host_todo = "localhost"
             self.do_one_host(host_todo, report)
             return
@@ -420,7 +421,7 @@
             else:
                 local_ign = LocalIgnores(host_todo, self.timeout, 
self.audit_type)
                 self.local_ignores = local_ign.run(True)
-                local_ignored_dirs, local_ignored_files = 
HomesAuditor.process_local_ignores(
+                local_ignored_dirs, local_ignored_files = 
LocalHomesAuditor.process_local_ignores(
                     self.local_ignores, self.ignored)
                 self.do_one_host(host_todo, report)
 
@@ -642,39 +643,39 @@
     def entry_is_not_ignored(self, path, entrytype):
         basedir = self.get_basedir_from_path(path)
         if self.audit_type == 'logs' and entrytype == 'file':
-            path = LogsAuditor.normalize(path)
+            path = LocalLogsAuditor.normalize(path)
 
         if entrytype == 'file':
-            if FilesAuditor.file_is_ignored(path, basedir, self.ignored):
+            if retention.fileutils.file_is_ignored(path, basedir, 
self.ignored):
                 return False
 
             # check perhost file
             if self.host in self.perhost_ignores:
-                if FilesAuditor.file_is_ignored(
+                if retention.fileutils.file_is_ignored(
                         path, basedir,
                         self.perhost_ignores[self.host]):
                     return False
 
             # check perhost rules
             if self.host in self.perhost_ignores_from_rules:
-                if FilesAuditor.file_is_ignored(
+                if retention.fileutils.file_is_ignored(
                         path, basedir,
                         self.perhost_ignores_from_rules[self.host]):
                     return False
 
         elif entrytype == 'dir':
-            if FilesAuditor.dir_is_ignored(path, self.ignored):
+            if retention.fileutils.dir_is_ignored(path, self.ignored):
                 return False
 
             # check perhost file
             if self.host in self.perhost_ignores:
-                if FilesAuditor.dir_is_ignored(
+                if retention.fileutils.dir_is_ignored(
                         path, self.perhost_ignores[self.host]):
                     return False
 
             # check perhost rules
             if self.host in self.perhost_ignores_from_rules:
-                if FilesAuditor.dir_is_ignored(
+                if retention.fileutils.dir_is_ignored(
                         path, self.perhost_ignores_from_rules[self.host]):
                     return False
         else:
diff --git a/dataretention/retention/fileutils.py 
b/dataretention/retention/fileutils.py
new file mode 100644
index 0000000..9c0b537
--- /dev/null
+++ b/dataretention/retention/fileutils.py
@@ -0,0 +1,218 @@
+import os
+import re
+import stat
+
+def startswith(string_arg, list_arg):
+    '''
+    check if the string arg starts with any elt in
+    the list_arg
+    '''
+    for elt in list_arg:
+        if string_arg.startswith(elt):
+            return True
+    return False
+
+def contains(string_arg, list_arg):
+    '''
+    check if the string arg contains any elt in
+    the list_arg
+    '''
+    for elt in list_arg:
+        if elt in string_arg:
+            return True
+    return False
+
+def endswith(string_arg, list_arg):
+    '''
+    check if the string arg ends with any elt in
+    the list_arg
+    '''
+    for elt in list_arg:
+        if string_arg.endswith(elt):
+            return True
+    return False
+
+def startswithpath(string_arg, list_arg):
+    '''
+    check if the string arg starts with any elt in
+    the list_arg and the next character, if any,
+    is the os dir separator
+    '''
+
+    for elt in list_arg:
+        if string_arg == elt or string_arg.startswith(elt + "/"):
+            return True
+    return False
+
+def subdir_check(dirname, directories):
+    '''
+    check if one of the directories listed is the
+    specified dirname or the dirname is somewhere in
+    a subtree of one of the listed directories,
+    returning True if so and False otherwise
+    '''
+
+    # fixme test this
+    # also see if this needs to replace dirtree_checkeverywhere or not
+    for dname in directories:
+        if dname == dirname or dirname.startswith(dname + "/"):
+            return True
+    return False
+
+def dirtree_check(dirname, directories):
+    '''
+    check if the dirname is either a directory at or above one of
+    the directories specified in the tree or vice versa, returning
+    True if so and False otherwise
+    '''
+
+    for dname in directories:
+        if dirname == dname or dirname.startswith(dname + "/"):
+            return True
+        if dname.startswith(dirname + "/"):
+            return True
+    return False
+
+def expand_ignored_dirs(basedir, ignored):
+    '''
+    find dirs to ignore relative to the specified
+    basedir, in Config entry.  Fall back to wildcard spec
+    if there is no entry for the basedir.  Dirs in
+    Config entry may have one * in the path, this
+    will be treated as a wildcard for the purposes
+    of checking directories against the entry.
+
+    args: absolute path of basedir being crawled
+          hash of ignored dirs, file, etc
+    returns: list of absolute paths of dirs to ignore,
+    plus separate list of absolute paths containing '*',
+    also to ignore, or the empty list if there are none
+    '''
+
+    dirs = []
+    wildcard_dirs = []
+
+    to_expand = []
+    if 'dirs' in ignored:
+        if '*' in ignored['dirs']:
+            to_expand.extend(ignored['dirs']['*'])
+
+        if '/' in ignored['dirs']:
+            to_expand.extend(ignored['dirs']['/'])
+
+        if basedir in ignored['dirs']:
+            to_expand.extend(ignored['dirs'][basedir])
+
+        for dname in to_expand:
+            if '*' in dname:
+                wildcard_dirs.append(os.path.join(basedir, dname))
+            else:
+                dirs.append(os.path.join(basedir, dname))
+
+    return dirs, wildcard_dirs
+
+def wildcard_matches(dirname, wildcard_dirs, exact=True):
+    '''given a list of absolute paths with exactly one '*'
+    in each entry, see if the passed dirname matches
+    any of the list entries'''
+    for dname in wildcard_dirs:
+        if len(dirname) + 1 < len(dname):
+            continue
+
+        left, right = dname.split('*', 1)
+        if dirname.startswith(left):
+            if dirname.endswith(right):
+                return True
+            elif (not exact and
+                  dirname.rfind(right + "/", len(left)) != -1):
+                return True
+            else:
+                continue
+    return False
+
+def file_is_ignored(fname, basedir, ignored):
+    '''
+    pass normalized name (abs path), basedir (location audited),
+    hash of ignored files, dirs, prefixes, extensions
+    get back True if the file is to be ignored and
+    False otherwise
+    '''
+
+    basename = os.path.basename(fname)
+
+    if 'prefixes' in ignored:
+        if startswith(basename, ignored['prefixes']):
+            return True
+
+    if 'extensions' in ignored:
+        if '*' in ignored['extensions']:
+            if endswith(basename, ignored['extensions']['*']):
+                return True
+        if basedir in ignored['extensions']:
+            if endswith(
+                    basename, ignored['extensions'][basedir]):
+                return True
+
+    if 'files' in ignored:
+        if basename in ignored['files']:
+            return True
+        if '*' in ignored['files']:
+            if endswith(basename, ignored['files']['*']):
+                return True
+
+        if '/' in ignored['files']:
+            if fname in ignored['files']['/']:
+                return True
+            if wildcard_matches(
+                    fname, [w for w in ignored['files']['/'] if '*' in w]):
+                return True
+
+        if basedir in ignored['files']:
+            if endswith(basename, ignored['files'][basedir]):
+                return True
+    return False
+
+def dir_is_ignored(dirname, ignored):
+    expanded_dirs, wildcard_dirs = expand_ignored_dirs(
+        os.path.dirname(dirname), ignored)
+    if dirname in expanded_dirs:
+        return True
+    if wildcard_matches(dirname, wildcard_dirs):
+        return True
+    return False
+
+def dir_is_wrong_type(dirname):
+    try:
+        dirstat = os.lstat(dirname)
+    except:
+        return True
+    if stat.S_ISLNK(dirstat.st_mode):
+        return True
+    if not stat.S_ISDIR(dirstat.st_mode):
+        return True
+    return False
+
+def get_open_files():
+    '''
+    scrounge /proc/nnn/fd and collect all open files
+    '''
+    open_files = set()
+    dirs = os.listdir("/proc")
+    for dname in dirs:
+        if not re.match('^[0-9]+$', dname):
+            continue
+        try:
+            links = os.listdir(os.path.join("/proc", dname, "fd"))
+        except:
+            # process may have gone away
+            continue
+        # must follow sym link for all of these, yuck
+        files = set()
+        for link in links:
+            try:
+                files.add(os.readlink(os.path.join("/proc", dname,
+                                                   "fd", link)))
+            except:
+                continue
+        open_files |= files
+    return open_files
diff --git a/dataretention/retention/localfileaudit.py 
b/dataretention/retention/localfileaudit.py
index 7bdb705..5a19be9 100644
--- a/dataretention/retention/localfileaudit.py
+++ b/dataretention/retention/localfileaudit.py
@@ -1,12 +1,10 @@
 import os
 import sys
 import time
-import re
 import socket
 import runpy
 import stat
 import locale
-import logging
 
 sys.path.append('/srv/audits/retention/scripts/')
 
@@ -15,8 +13,8 @@
 from retention.rule import Rule
 from retention.config import Config
 from retention.fileinfo import FileInfo
+import retention.fileutils
 
-log = logging.getLogger(__name__)
 
 class LocalFilesAuditor(object):
     '''
@@ -185,190 +183,12 @@
                             continue
                 break
 
-    @staticmethod
-    def startswith(string_arg, list_arg):
-        '''
-        check if the string arg starts with any elt in
-        the list_arg
-        '''
-        for elt in list_arg:
-            if string_arg.startswith(elt):
-                return True
-        return False
-
-    def contains(self, string_arg, list_arg):
-        '''
-        check if the string arg cotains any elt in
-        the list_arg
-        '''
-        for elt in list_arg:
-            if elt in string_arg:
-                return True
-        return False
-
-    @staticmethod
-    def endswith(string_arg, list_arg):
-        '''
-        check if the string arg ends with any elt in
-        the list_arg
-        '''
-        for elt in list_arg:
-            if string_arg.endswith(elt):
-                return True
-        return False
-
-    @staticmethod
-    def startswithpath(string_arg, list_arg):
-        '''
-        check if the string arg starts with any elt in
-        the list_arg and the next character, if any,
-        is the os dir separator
-        '''
-
-        for elt in list_arg:
-            if string_arg == elt or string_arg.startswith(elt + "/"):
-                return True
-        return False
-
-    @staticmethod
-    def subdir_check(dirname, directories):
-        '''
-        check if one of the directories listed is the
-        specified dirname or the dirname is somewhere in
-        a subtree of one of the listed directories,
-        returning True if so and fFalse otherwise
-        '''
-
-        # fixme test this
-        # also see if this needs to replace dirtree_checkeverywhere or not
-        for dname in directories:
-            if dname == dirname or dirname.startswith(dname + "/"):
-                return True
-        return False
-
-    @staticmethod
-    def dirtree_check(dirname, directories):
-        '''
-        check if the dirname is either a directory at or above one of
-        the the directories specified in the tree or vice versa, returning
-        True if so and fFalse otherwise
-        '''
-
-        for dname in directories:
-            if dirname == dname or dirname.startswith(dname + "/"):
-                return True
-            if dname.startswith(dirname + "/"):
-                return True
-        return False
-
-    @staticmethod
-    def expand_ignored_dirs(basedir, ignored):
-        '''
-        find dirs to ignore relative to the specified
-        basedir, in Config entry.  Fall back to wildcard spec
-        if there is not entry for the basedir.  Dirs in
-        Config entry may have one * in the path, this
-        will be treated as a wildcard for the purposes
-        of checking directories against the entry.
-
-        args: absolute path of basedir being crawled
-              hash of ignored dirs, file, etc
-        returns: list of absolute paths of dirs to ignore,
-        plus separate list of abslute paths containing '*',
-        also to ignore, or the empty list if there are none
-        '''
-
-        dirs = []
-        wildcard_dirs = []
-
-        to_expand = []
-        if 'dirs' in ignored:
-            if '*' in ignored['dirs']:
-                to_expand.extend(ignored['dirs']['*'])
-
-            if '/' in ignored['dirs']:
-                to_expand.extend(ignored['dirs']['/'])
-
-            if basedir in ignored['dirs']:
-                to_expand.extend(ignored['dirs'][basedir])
-
-            for dname in to_expand:
-                if '*' in dname:
-                    wildcard_dirs.append(os.path.join(basedir, dname))
-                else:
-                    dirs.append(os.path.join(basedir, dname))
-
-        return dirs, wildcard_dirs
-
-    @staticmethod
-    def wildcard_matches(dirname, wildcard_dirs, exact=True):
-        '''given a list of absolute paths with exactly one '*'
-        in each entry, see if the passed dirname matches
-        any of the list entries'''
-        for dname in wildcard_dirs:
-            if len(dirname) + 1 < len(dname):
-                continue
-
-            left, right = dname.split('*', 1)
-            if dirname.startswith(left):
-                if dirname.endswith(right):
-                    return True
-                elif (not exact and
-                      dirname.rfind(right + "/", len(left)) != -1):
-                    return True
-                else:
-                    continue
-        return False
-
     def normalize(self, fname):
         '''
         subclasses may want to do something different, see
         LogsAuditor for an example
         '''
         return fname
-
-    @staticmethod
-    def file_is_ignored(fname, basedir, ignored):
-        '''
-        pass normalized name (abs path), basedir (location audited),
-        hash of ignored files, dirs, prefixes, extensions
-        get back True if the file is to be ignored and
-        False otherwise
-        '''
-
-        basename = os.path.basename(fname)
-
-        if 'prefixes' in ignored:
-            if LocalFilesAuditor.startswith(basename, ignored['prefixes']):
-                return True
-
-        if 'extensions' in ignored:
-            if '*' in ignored['extensions']:
-                if LocalFilesAuditor.endswith(basename, 
ignored['extensions']['*']):
-                    return True
-            if basedir in ignored['extensions']:
-                if LocalFilesAuditor.endswith(
-                        basename, ignored['extensions'][basedir]):
-                    return True
-
-        if 'files' in ignored:
-            if basename in ignored['files']:
-                return True
-            if '*' in ignored['files']:
-                if LocalFilesAuditor.endswith(basename, ignored['files']['*']):
-                    return True
-
-            if '/' in ignored['files']:
-                if fname in ignored['files']['/']:
-                    return True
-                if LocalFilesAuditor.wildcard_matches(
-                        fname, [w for w in ignored['files']['/'] if '*' in w]):
-                    return True
-
-            if basedir in ignored['files']:
-                if LocalFilesAuditor.endswith(basename, 
ignored['files'][basedir]):
-                    return True
-        return False
 
     def file_is_wanted(self, fname, basedir):
         '''
@@ -381,7 +201,7 @@
         '''
         fname = self.normalize(fname)
 
-        if LocalFilesAuditor.file_is_ignored(fname, basedir, self.ignored):
+        if retention.fileutils.file_is_ignored(fname, basedir, self.ignored):
             return False
 
         if (self.filenames_to_check is not None and
@@ -390,34 +210,12 @@
 
         return True
 
-    @staticmethod
-    def dir_is_ignored(dirname, ignored):
-        expanded_dirs, wildcard_dirs = LocalFilesAuditor.expand_ignored_dirs(
-            os.path.dirname(dirname), ignored)
-        if dirname in expanded_dirs:
-            return True
-        if LocalFilesAuditor.wildcard_matches(dirname, wildcard_dirs):
-            return True
-        return False
-
-    @staticmethod
-    def dir_is_wrong_type(dirname):
-        try:
-            dirstat = os.lstat(dirname)
-        except:
-            return True
-        if stat.S_ISLNK(dirstat.st_mode):
-            return True
-        if not stat.S_ISDIR(dirstat.st_mode):
-            return True
-        return False
-
     def get_subdirs_to_do(self, dirname, dirname_depth, todo):
 
         locale.setlocale(locale.LC_ALL, '')
-        if LocalFilesAuditor.dir_is_ignored(dirname, self.ignored):
+        if retention.fileutils.dir_is_ignored(dirname, self.ignored):
             return todo
-        if LocalFilesAuditor.dir_is_wrong_type(dirname):
+        if retention.fileutils.dir_is_wrong_type(dirname):
             return todo
 
         if self.depth < dirname_depth:
@@ -427,7 +225,7 @@
             todo[dirname_depth] = []
 
         if self.dirs_to_check is not None:
-            if LocalFilesAuditor.subdir_check(dirname, self.dirs_to_check):
+            if retention.fileutils.subdir_check(dirname, self.dirs_to_check):
                 todo[dirname_depth].append(dirname)
         else:
             todo[dirname_depth].append(dirname)
@@ -439,7 +237,7 @@
         dirs = [os.path.join(dirname, d)
                 for d in os.listdir(dirname)]
         if self.dirs_to_check is not None:
-            dirs = [d for d in dirs if LocalFilesAuditor.dirtree_check(
+            dirs = [d for d in dirs if retention.fileutils.dirtree_check(
                 d, self.dirs_to_check)]
 
         for dname in dirs:
@@ -448,7 +246,7 @@
 
     def get_dirs_to_do(self, dirname):
         if (self.dirs_to_check is not None and
-                not LocalFilesAuditor.dirtree_check(dirname, 
self.dirs_to_check)):
+                not retention.fileutils.dirtree_check(dirname, 
self.dirs_to_check)):
             return {}
 
         todo = {}
@@ -526,10 +324,10 @@
             results: the result set
         '''
         if self.dirs_to_check is not None:
-            if not LocalFilesAuditor.dirtree_check(subdirpath, 
self.dirs_to_check):
+            if not retention.fileutils.dirtree_check(subdirpath, 
self.dirs_to_check):
                 return
 
-        if LocalFilesAuditor.dir_is_ignored(subdirpath, self.ignored):
+        if retention.fileutils.dir_is_ignored(subdirpath, self.ignored):
             return True
 
         count = 0
@@ -557,16 +355,16 @@
         # cutoff won't be in our list
         temp_results = []
         for base, paths, files in self.walk_nolinks(subdirpath):
-            expanded_dirs, wildcard_dirs = 
LocalFilesAuditor.expand_ignored_dirs(
+            expanded_dirs, wildcard_dirs = 
retention.fileutils.expand_ignored_dirs(
                 base, self.ignored)
             if self.dirs_to_check is not None:
                 paths[:] = [p for p in paths
-                            if 
LocalFilesAuditor.dirtree_check(os.path.join(base, p),
-                                                               
self.dirs_to_check)]
+                            if 
retention.fileutils.dirtree_check(os.path.join(base, p),
+                                                                 
self.dirs_to_check)]
             paths[:] = [p for p in paths if
-                        (not LocalFilesAuditor.startswithpath(os.path.join(
+                        (not retention.fileutils.startswithpath(os.path.join(
                             base, p), expanded_dirs) and
-                         not LocalFilesAuditor.wildcard_matches(os.path.join(
+                         not retention.fileutils.wildcard_matches(os.path.join(
                              base, p), wildcard_dirs, exact=False))]
             count = self.process_files_from_path(location, base, files,
                                                  count, temp_results,
@@ -591,32 +389,6 @@
                         self.process_one_dir(location, dname, depth, results)
         return results
 
-    @staticmethod
-    def get_open_files():
-        '''
-        scrounge /proc/nnn/fd and collect all open files
-        '''
-        open_files = set()
-        dirs = os.listdir("/proc")
-        for dname in dirs:
-            if not re.match('^[0-9]+$', dname):
-                continue
-            try:
-                links = os.listdir(os.path.join("/proc", dname, "fd"))
-            except:
-                # process may have gone away
-                continue
-            # must follow sym link for all of these, yuck
-            files = set()
-            for link in links:
-                try:
-                    files.add(os.readlink(os.path.join("/proc", dname,
-                                                       "fd", link)))
-                except:
-                    continue
-            open_files |= files
-        return open_files
-
     def warn_too_many_files(self, path=None):
         print "WARNING: too many files to audit",
         if path is not None:
@@ -629,7 +401,7 @@
                % (os.path.sep.join(fields[:self.depth + 1]), self.MAX_FILES))
 
     def do_local_audit(self):
-        open_files = LocalFilesAuditor.get_open_files()
+        open_files = retention.fileutils.get_open_files()
 
         all_files = {}
         files = self.find_all_files()
@@ -650,8 +422,8 @@
                                    for fname in all_files]) + 2
 
         for fname in all_files_sorted:
-            if (not self.contains(all_files[fname].filetype,
-                                  Config.cf['ignored_types'])
+            if (not retention.fileutils.contains(all_files[fname].filetype,
+                                                 Config.cf['ignored_types'])
                     and not all_files[fname].is_empty):
                 result.append(all_files[fname].format_output(
                     self.show_sample_content, False,
diff --git a/dataretention/retention/locallogaudit.py 
b/dataretention/retention/locallogaudit.py
index 19d42fc..cf03239 100644
--- a/dataretention/retention/locallogaudit.py
+++ b/dataretention/retention/locallogaudit.py
@@ -9,6 +9,7 @@
 from retention.config import Config
 from retention.fileinfo import LogInfo, LogUtils
 from retention.localfileaudit import LocalFilesAuditor
+import retention.fileutils
 
 global_keys = [key for key, value_unused in
                sys.modules[__name__].__dict__.items()]
@@ -237,7 +238,7 @@
         if mysql_issues:
             result.append(mysql_issues)
 
-        open_files = LocalFilesAuditor.get_open_files()
+        open_files = retention.fileutils.get_open_files()
         rotated = self.find_rotated_logs()
 
         all_files = {}
@@ -261,8 +262,8 @@
                                    for fname in all_files]) + 2
 
         for fname in all_files_sorted:
-            if self.contains(all_files[fname].filetype,
-                             Config.cf['ignored_types']):
+            if retention.fileutils.contains(all_files[fname].filetype,
+                                            Config.cf['ignored_types']):
                 continue
 
             if (self.oldest_only and
diff --git a/dataretention/retention/remotefileauditor.py 
b/dataretention/retention/remotefileauditor.py
index 76540c5..aa45231 100644
--- a/dataretention/retention/remotefileauditor.py
+++ b/dataretention/retention/remotefileauditor.py
@@ -337,16 +337,6 @@
                         self.ignored['extensions'][basedir])
                         + " in " + basedir + '\n')
 
-    def contains(self, string_arg, list_arg):
-        '''
-        check if the string arg contains any elt in
-        the list_arg
-        '''
-        for elt in list_arg:
-            if elt in string_arg:
-                return True
-        return False
-
     def normalize(self, fname):
         '''
         subclasses may want to do something different, see

-- 
To view, visit https://gerrit.wikimedia.org/r/233456
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id89eb2c78d7fb82f71ab1284487d75c3ecfb6708
Gerrit-PatchSet: 2
Gerrit-Project: operations/software
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to