ArielGlenn has submitted this change and it was merged.

Change subject: turn files auditor into salt module
......................................................................


turn files auditor into salt module

remotefileauditor invokes the salt module on the minions
retentionaudit is the module itself, a small interface to
  the local audit class
localfilesaudit handles the actual audit on the local host

next up: convert logs audit in the same fashion

Change-Id: I0cd978a33cd2508197247355355e2141aadcab73
---
M dataretention/data_auditor.py
A dataretention/retention/localfileaudit.py
A dataretention/retention/remotefileauditor.py
A dataretention/retention/retentionaudit.py
M dataretention/retention/runner.py
5 files changed, 1,217 insertions(+), 24 deletions(-)

Approvals:
  ArielGlenn: Verified; Looks good to me, approved



diff --git a/dataretention/data_auditor.py b/dataretention/data_auditor.py
index 739da38..67d9335 100644
--- a/dataretention/data_auditor.py
+++ b/dataretention/data_auditor.py
@@ -3,7 +3,8 @@
 sys.path.append('/srv/audits/retention/scripts/')
 
 from retention.cli import CommandLine
-from retention.auditor import FilesAuditor, LogsAuditor, HomesAuditor
+from retention.auditor import LogsAuditor, HomesAuditor
+from retention.remotefileauditor import RemoteFilesAuditor
 from retention.examiner import FileExaminer, DirExaminer
 
 def usage(message=None):
@@ -256,11 +257,11 @@
             cmdline.run(report, ignored)
 
     elif audit_type == 'root':
-        filesaudit = FilesAuditor(hosts_expr, audit_type, prettyprint,
-                                  show_sample_content, dirsizes,
-                                  summary_report,
-                                  depth, files_to_check, ignore_also,
-                                  timeout, maxfiles, store_filepath, verbose)
+        filesaudit = RemoteFilesAuditor(hosts_expr, audit_type, prettyprint,
+                                        show_sample_content, dirsizes,
+                                        summary_report,
+                                        depth, files_to_check, ignore_also,
+                                        timeout, maxfiles, store_filepath, 
verbose)
         report, ignored = filesaudit.audit_hosts()
         if interactive:
             cmdline = CommandLine(store_filepath, timeout, audit_type, 
hosts_expr)
diff --git a/dataretention/retention/localfileaudit.py 
b/dataretention/retention/localfileaudit.py
new file mode 100644
index 0000000..7bdb705
--- /dev/null
+++ b/dataretention/retention/localfileaudit.py
@@ -0,0 +1,660 @@
+import os
+import sys
+import time
+import re
+import socket
+import runpy
+import stat
+import locale
+import logging
+
+sys.path.append('/srv/audits/retention/scripts/')
+
+import retention.utils
+import retention.magic
+from retention.rule import Rule
+from retention.config import Config
+from retention.fileinfo import FileInfo
+
+log = logging.getLogger(__name__)
+
+class LocalFilesAuditor(object):
+    '''
+    audit files on the local host
+    in a specified set of directories
+    '''
+    def __init__(self, audit_type,
+                 show_content=False, dirsizes=False,
+                 depth=2, to_check=None, ignore_also=None,
+                 timeout=60, maxfiles=None):
+        '''
+        audit_type:   type of audit e.g. 'logs', 'homes'
+        show_content: show the first line or so from problematic files
+        dirsizes:     show only directories which have too many files to
+                      audit properly, don't report on files at all
+        depth:        the auditor will give up if a directory has too many files
+                      in it (saves it from dying on someone's 25gb homedir).
+                      this option tells it how far down the tree to go from
+                      the top dir of the audit, before starting to count.
+                      e.g. do we count in /home/ariel or separately in
+                      /home/ariel/* or in /home/ariel/*/*, etc.
+        to_check:     comma-separated list of dirs (must end in '/') and/or
+                      files that will be checked; if this is None then
+                      all dirs/files will be checked
+        ignore_also:  comma-separated list of dirs (must end in '/') and/or
+                      files that will be skipped in addition to the ones
+                      in the config, rules, etc.
+        timeout:      salt timeout for running remote commands
+        maxfiles:     how many files in a directory tree is too many to audit
+                      (at which point we warn about that and move on)
+        '''
+
+        self.audit_type = audit_type
+        self.locations = audit_type + "_locations"
+        self.show_sample_content = show_content
+        self.dirsizes = dirsizes
+        self.depth = depth + 1  # actually count of path separators in dirname
+        self.to_check = to_check
+
+        self.filenames_to_check = None
+        self.dirs_to_check = None
+        self.set_up_to_check()
+
+        self.ignore_also = ignore_also
+        if self.ignore_also is not None:
+            self.ignore_also = self.ignore_also.split(',')
+        self.timeout = timeout
+
+        self.ignored = {}
+        self.set_up_ignored()
+
+        self.hostname = socket.getfqdn()
+
+        self.cutoff = Config.cf['cutoff']
+
+        self.perhost_rules_from_store = None
+        self.perhost_rules_from_file = None
+        self.set_up_perhost_rules()
+
+        self.today = time.time()
+        self.magic = retention.magic.magic_open(retention.magic.MAGIC_NONE)
+        self.magic.load()
+        self.summary = None
+        self.display_from_dict = FileInfo.display_from_dict
+        self.MAX_FILES = maxfiles
+        self.set_up_max_files()
+
+    def set_up_max_files(self):
+        '''
+        more than this many files in a subdir we won't process,
+        we'll just try to name top offenders
+
+        if we've been asked only to report dir trees that are
+        too large in this manner, we can set defaults much
+        higher, since we don't stat files, open them to guess
+        their filetype, etc; processing then goes much quicker
+        '''
+
+        if self.MAX_FILES is None:
+            if self.dirsizes:
+                self.MAX_FILES = 1000
+            else:
+                self.MAX_FILES = 100
+
+    def set_up_to_check(self):
+        '''
+        turn the to_check arg into lists of dirs and files to check
+        '''
+        if self.to_check is not None:
+            check_list = self.to_check.split(',')
+            self.filenames_to_check = [fname for fname in check_list
+                                       if not fname.startswith(os.sep)]
+            if not len(self.filenames_to_check):
+                self.filenames_to_check = None
+            self.dirs_to_check = [d.rstrip(os.path.sep) for d in check_list
+                                  if d.startswith(os.sep)]
+
+    def set_up_perhost_rules(self):
+        self.perhost_rules_from_store = runpy.run_path(
+            '/srv/audits/retention/configs/%s_store.cf' % 
self.hostname)['rules']
+        self.perhost_rules_from_file = runpy.run_path(
+            '/srv/audits/retention/configs/allhosts_file.cf')['perhostcf']
+
+        if self.perhost_rules_from_store is not None:
+            self.add_perhost_rules_to_ignored()
+
+        if (self.perhost_rules_from_file is not None and
+                'ignored_dirs' in self.perhost_rules_from_file):
+            if '/' not in self.ignored['dirs']:
+                self.ignored['dirs']['/'] = []
+            if self.hostname in self.perhost_rules_from_file['ignored_dirs']:
+                for path in self.perhost_rules_from_file[
+                        'ignored_dirs'][self.hostname]:
+                    if path.startswith('/'):
+                        self.ignored['dirs']['/'].append(path)
+            if '*' in self.perhost_rules_from_file['ignored_dirs']:
+                for path in self.perhost_rules_from_file[
+                        'ignored_dirs'][self.hostname]:
+                    if path.startswith('/'):
+                        self.ignored['dirs']['/'].append(path)
+
+    def set_up_ignored(self):
+        '''
+        collect up initial list of files/dirs to skip during audit
+        '''
+        self.ignored['files'] = Config.cf['ignored_files']
+        self.ignored['dirs'] = Config.cf['ignored_dirs']
+        self.ignored['prefixes'] = Config.cf['ignored_prefixes']
+        self.ignored['extensions'] = Config.cf['ignored_extensions']
+
+        if self.ignore_also is not None:
+            # silently skip paths that are not absolute
+            for path in self.ignore_also:
+                if path.startswith('/'):
+                    if path.endswith('/'):
+                        if '/' not in self.ignored['dirs']:
+                            self.ignored['dirs']['/'] = []
+                        self.ignored['dirs']['/'].append(path[:-1])
+                    else:
+                        if '/' not in self.ignored['files']:
+                            self.ignored['files']['/'] = []
+                        self.ignored['files']['/'].append(path)
+
+    def add_perhost_rules_to_ignored(self):
+        '''
+        add dirs/files to be skipped during audit based
+        on rules in the rule store db
+        '''
+        if '/' not in self.ignored['dirs']:
+            self.ignored['dirs']['/'] = []
+        if '/' not in self.ignored['files']:
+            self.ignored['files']['/'] = []
+        for host in self.perhost_rules_from_store:
+            if host == self.hostname:
+                for rule in self.perhost_rules_from_store[host]:
+                    path = os.path.join(rule['basedir'], rule['name'])
+                    if rule['status'] == 'good':
+                        if Rule.entrytype_to_text(rule['type']) == 'dir':
+                            if path not in self.ignored['dirs']['/']:
+                                self.ignored['dirs']['/'].append(path)
+                        elif Rule.entrytype_to_text(rule['type']) == 'file':
+                            if path not in self.ignored['files']['/']:
+                                self.ignored['files']['/'].append(path)
+                        else:
+                            # some other random type, don't care
+                            continue
+                break
+
+    @staticmethod
+    def startswith(string_arg, list_arg):
+        '''
+        check if the string arg starts with any elt in
+        the list_arg
+        '''
+        for elt in list_arg:
+            if string_arg.startswith(elt):
+                return True
+        return False
+
+    def contains(self, string_arg, list_arg):
+        '''
+        check if the string arg contains any elt in
+        the list_arg
+        '''
+        for elt in list_arg:
+            if elt in string_arg:
+                return True
+        return False
+
+    @staticmethod
+    def endswith(string_arg, list_arg):
+        '''
+        check if the string arg ends with any elt in
+        the list_arg
+        '''
+        for elt in list_arg:
+            if string_arg.endswith(elt):
+                return True
+        return False
+
+    @staticmethod
+    def startswithpath(string_arg, list_arg):
+        '''
+        check if the string arg starts with any elt in
+        the list_arg and the next character, if any,
+        is the os dir separator
+        '''
+
+        for elt in list_arg:
+            if string_arg == elt or string_arg.startswith(elt + "/"):
+                return True
+        return False
+
+    @staticmethod
+    def subdir_check(dirname, directories):
+        '''
+        check if one of the directories listed is the
+        specified dirname or the dirname is somewhere in
+        a subtree of one of the listed directories,
+        returning True if so and False otherwise
+        '''
+
+        # fixme test this
+        # also see if this needs to replace dirtree_checkeverywhere or not
+        for dname in directories:
+            if dname == dirname or dirname.startswith(dname + "/"):
+                return True
+        return False
+
+    @staticmethod
+    def dirtree_check(dirname, directories):
+        '''
+        check if the dirname is either a directory at or above one of
+        the directories specified in the tree or vice versa, returning
+        True if so and False otherwise
+        '''
+
+        for dname in directories:
+            if dirname == dname or dirname.startswith(dname + "/"):
+                return True
+            if dname.startswith(dirname + "/"):
+                return True
+        return False
+
+    @staticmethod
+    def expand_ignored_dirs(basedir, ignored):
+        '''
+        find dirs to ignore relative to the specified
+        basedir, in Config entry.  Fall back to wildcard spec
+        if there is no entry for the basedir.  Dirs in
+        Config entry may have one * in the path, this
+        will be treated as a wildcard for the purposes
+        of checking directories against the entry.
+
+        args: absolute path of basedir being crawled
+              hash of ignored dirs, file, etc
+        returns: list of absolute paths of dirs to ignore,
+        plus separate list of absolute paths containing '*',
+        also to ignore, or the empty list if there are none
+        '''
+
+        dirs = []
+        wildcard_dirs = []
+
+        to_expand = []
+        if 'dirs' in ignored:
+            if '*' in ignored['dirs']:
+                to_expand.extend(ignored['dirs']['*'])
+
+            if '/' in ignored['dirs']:
+                to_expand.extend(ignored['dirs']['/'])
+
+            if basedir in ignored['dirs']:
+                to_expand.extend(ignored['dirs'][basedir])
+
+            for dname in to_expand:
+                if '*' in dname:
+                    wildcard_dirs.append(os.path.join(basedir, dname))
+                else:
+                    dirs.append(os.path.join(basedir, dname))
+
+        return dirs, wildcard_dirs
+
+    @staticmethod
+    def wildcard_matches(dirname, wildcard_dirs, exact=True):
+        '''given a list of absolute paths with exactly one '*'
+        in each entry, see if the passed dirname matches
+        any of the list entries'''
+        for dname in wildcard_dirs:
+            if len(dirname) + 1 < len(dname):
+                continue
+
+            left, right = dname.split('*', 1)
+            if dirname.startswith(left):
+                if dirname.endswith(right):
+                    return True
+                elif (not exact and
+                      dirname.rfind(right + "/", len(left)) != -1):
+                    return True
+                else:
+                    continue
+        return False
+
+    def normalize(self, fname):
+        '''
+        subclasses may want to do something different, see
+        LogsAuditor for an example
+        '''
+        return fname
+
+    @staticmethod
+    def file_is_ignored(fname, basedir, ignored):
+        '''
+        pass normalized name (abs path), basedir (location audited),
+        hash of ignored files, dirs, prefixes, extensions
+        get back True if the file is to be ignored and
+        False otherwise
+        '''
+
+        basename = os.path.basename(fname)
+
+        if 'prefixes' in ignored:
+            if LocalFilesAuditor.startswith(basename, ignored['prefixes']):
+                return True
+
+        if 'extensions' in ignored:
+            if '*' in ignored['extensions']:
+                if LocalFilesAuditor.endswith(basename, 
ignored['extensions']['*']):
+                    return True
+            if basedir in ignored['extensions']:
+                if LocalFilesAuditor.endswith(
+                        basename, ignored['extensions'][basedir]):
+                    return True
+
+        if 'files' in ignored:
+            if basename in ignored['files']:
+                return True
+            if '*' in ignored['files']:
+                if LocalFilesAuditor.endswith(basename, ignored['files']['*']):
+                    return True
+
+            if '/' in ignored['files']:
+                if fname in ignored['files']['/']:
+                    return True
+                if LocalFilesAuditor.wildcard_matches(
+                        fname, [w for w in ignored['files']['/'] if '*' in w]):
+                    return True
+
+            if basedir in ignored['files']:
+                if LocalFilesAuditor.endswith(basename, 
ignored['files'][basedir]):
+                    return True
+        return False
+
+    def file_is_wanted(self, fname, basedir):
+        '''
+        decide if we want to audit the specific file or not
+        (is it ignored, or in an ignored directory, or of a type
+        we skip)
+        args: fname - the abs path to the file / dir
+
+        returns True if wanted or False if not
+        '''
+        fname = self.normalize(fname)
+
+        if LocalFilesAuditor.file_is_ignored(fname, basedir, self.ignored):
+            return False
+
+        if (self.filenames_to_check is not None and
+                fname not in self.filenames_to_check):
+            return False
+
+        return True
+
+    @staticmethod
+    def dir_is_ignored(dirname, ignored):
+        expanded_dirs, wildcard_dirs = LocalFilesAuditor.expand_ignored_dirs(
+            os.path.dirname(dirname), ignored)
+        if dirname in expanded_dirs:
+            return True
+        if LocalFilesAuditor.wildcard_matches(dirname, wildcard_dirs):
+            return True
+        return False
+
+    @staticmethod
+    def dir_is_wrong_type(dirname):
+        try:
+            dirstat = os.lstat(dirname)
+        except:
+            return True
+        if stat.S_ISLNK(dirstat.st_mode):
+            return True
+        if not stat.S_ISDIR(dirstat.st_mode):
+            return True
+        return False
+
+    def get_subdirs_to_do(self, dirname, dirname_depth, todo):
+
+        locale.setlocale(locale.LC_ALL, '')
+        if LocalFilesAuditor.dir_is_ignored(dirname, self.ignored):
+            return todo
+        if LocalFilesAuditor.dir_is_wrong_type(dirname):
+            return todo
+
+        if self.depth < dirname_depth:
+            return todo
+
+        if dirname_depth not in todo:
+            todo[dirname_depth] = []
+
+        if self.dirs_to_check is not None:
+            if LocalFilesAuditor.subdir_check(dirname, self.dirs_to_check):
+                todo[dirname_depth].append(dirname)
+        else:
+            todo[dirname_depth].append(dirname)
+
+        if self.depth == dirname_depth:
+            # don't read below the depth level
+            return todo
+
+        dirs = [os.path.join(dirname, d)
+                for d in os.listdir(dirname)]
+        if self.dirs_to_check is not None:
+            dirs = [d for d in dirs if LocalFilesAuditor.dirtree_check(
+                d, self.dirs_to_check)]
+
+        for dname in dirs:
+            todo = self.get_subdirs_to_do(dname, dirname_depth + 1, todo)
+        return todo
+
+    def get_dirs_to_do(self, dirname):
+        if (self.dirs_to_check is not None and
+                not LocalFilesAuditor.dirtree_check(dirname, 
self.dirs_to_check)):
+            return {}
+
+        todo = {}
+        depth_of_dirname = dirname.count(os.path.sep)
+        todo = self.get_subdirs_to_do(dirname, depth_of_dirname, todo)
+        return todo
+
+    def process_files_from_path(self, location, base, files, count,
+                                results, checklink=True):
+        '''
+        arguments:
+            location: the location being checked
+            base: directory containing the files to be checked
+            files: files to be checked
+            count: number of files in result set so far for this location
+            results: the result set
+        '''
+
+        for fname, st in files:
+            path = os.path.join(base, fname)
+            if self.file_is_wanted(path, location):
+                count += 1
+                if count > self.MAX_FILES:
+                    if self.dirsizes:
+                        self.warn_dirsize(base)
+                    else:
+                        self.warn_too_many_files(base)
+                    return count
+                # for dirsizes option we don't collect or report files
+                if not self.dirsizes:
+                    results.append((path, st))
+        return count
+
+    def walk_nolinks(self, top):
+        '''replaces (and is stolen from) os.walk, checks for and skips
+        links, returns base, paths, files but it's guaranteed that
+        files really are regular files and base/paths are not symlinks
+        the files list is a list of filename, stat of that filename,
+        because we have to do the stat on it anyways to ensure it's a file
+        and not a dir, so the caller might as well get that info'''
+
+        try:
+            names = os.listdir(top)
+        except os.error, err:
+            return
+
+        dirs, files = [], []
+        for name in names:
+            try:
+                filestat = os.lstat(os.path.join(top, name))
+            except:
+                continue
+            if stat.S_ISLNK(filestat.st_mode):
+                continue
+            if stat.S_ISDIR(filestat.st_mode):
+                dirs.append(name)
+            elif stat.S_ISREG(filestat.st_mode):
+                files.append((name, filestat))
+            else:
+                continue
+
+        yield top, dirs, files
+
+        for name in dirs:
+            new_path = os.path.join(top, name)
+            for x in self.walk_nolinks(new_path):
+                yield x
+
+    def process_one_dir(self, location, subdirpath, depth, results):
+        '''
+        arguments:
+            location: the location being checked
+            subdirpath: the path to the subdirectory being checked
+            depth: the depth of the directory being checked (starting at 1)
+            results: the result set
+        '''
+        if self.dirs_to_check is not None:
+            if not LocalFilesAuditor.dirtree_check(subdirpath, 
self.dirs_to_check):
+                return
+
+        if LocalFilesAuditor.dir_is_ignored(subdirpath, self.ignored):
+            return True
+
+        count = 0
+
+        # doing a directory higher up in the tree than our depth cutoff,
+        # only do the files in it, because we have the full list of dirs
+        # up to our cutoff we do them one by one
+        if depth < self.depth:
+            filenames = os.listdir(subdirpath)
+            files = []
+            for fname in filenames:
+                try:
+                    filestat = os.stat(os.path.join(subdirpath, fname))
+                except:
+                    continue
+                if (not stat.S_ISLNK(filestat.st_mode) and
+                        stat.S_ISREG(filestat.st_mode)):
+                    files.append((fname, filestat))
+            self.process_files_from_path(location, subdirpath,
+                                         files, count, results)
+            return
+
+        # doing a directory at our cutoff depth, walk it,
+        # because anything below the depth
+        # cutoff won't be in our list
+        temp_results = []
+        for base, paths, files in self.walk_nolinks(subdirpath):
+            expanded_dirs, wildcard_dirs = 
LocalFilesAuditor.expand_ignored_dirs(
+                base, self.ignored)
+            if self.dirs_to_check is not None:
+                paths[:] = [p for p in paths
+                            if 
LocalFilesAuditor.dirtree_check(os.path.join(base, p),
+                                                               
self.dirs_to_check)]
+            paths[:] = [p for p in paths if
+                        (not LocalFilesAuditor.startswithpath(os.path.join(
+                            base, p), expanded_dirs) and
+                         not LocalFilesAuditor.wildcard_matches(os.path.join(
+                             base, p), wildcard_dirs, exact=False))]
+            count = self.process_files_from_path(location, base, files,
+                                                 count, temp_results,
+                                                 checklink=False)
+            if count > self.MAX_FILES:
+                return
+
+        results.extend(temp_results)
+
+    def find_all_files(self):
+        results = []
+        for location in Config.cf[self.locations]:
+            dirs_to_do = self.get_dirs_to_do(location)
+            if location.count(os.path.sep) >= self.depth + 1:
+                # do the run at least once
+                upper_end = location.count(os.path.sep) + 1
+            else:
+                upper_end = self.depth + 1
+            for depth in range(location.count(os.path.sep), upper_end):
+                if depth in dirs_to_do:
+                    for dname in dirs_to_do[depth]:
+                        self.process_one_dir(location, dname, depth, results)
+        return results
+
+    @staticmethod
+    def get_open_files():
+        '''
+        scrounge /proc/nnn/fd and collect all open files
+        '''
+        open_files = set()
+        dirs = os.listdir("/proc")
+        for dname in dirs:
+            if not re.match('^[0-9]+$', dname):
+                continue
+            try:
+                links = os.listdir(os.path.join("/proc", dname, "fd"))
+            except:
+                # process may have gone away
+                continue
+            # must follow sym link for all of these, yuck
+            files = set()
+            for link in links:
+                try:
+                    files.add(os.readlink(os.path.join("/proc", dname,
+                                                       "fd", link)))
+                except:
+                    continue
+            open_files |= files
+        return open_files
+
+    def warn_too_many_files(self, path=None):
+        print "WARNING: too many files to audit",
+        if path is not None:
+            fields = path.split(os.path.sep)
+            print "in directory %s" % os.path.sep.join(fields[:self.depth + 1])
+
+    def warn_dirsize(self, path):
+        fields = path.split(os.path.sep)
+        print ("WARNING: directory %s has more than %d files"
+               % (os.path.sep.join(fields[:self.depth + 1]), self.MAX_FILES))
+
+    def do_local_audit(self):
+        open_files = LocalFilesAuditor.get_open_files()
+
+        all_files = {}
+        files = self.find_all_files()
+
+        count = 0
+        for (f, st) in files:
+            if count < 10:
+                print "got", f, st
+                count += 1
+            all_files[f] = FileInfo(f, self.magic, st)
+            all_files[f].load_file_info(self.today, self.cutoff, open_files)
+
+        all_files_sorted = sorted(all_files, key=lambda f: all_files[f].path)
+        result = []
+
+        if all_files:
+            max_name_length = max([len(all_files[fname].path)
+                                   for fname in all_files]) + 2
+
+        for fname in all_files_sorted:
+            if (not self.contains(all_files[fname].filetype,
+                                  Config.cf['ignored_types'])
+                    and not all_files[fname].is_empty):
+                result.append(all_files[fname].format_output(
+                    self.show_sample_content, False,
+                    max_name_length))
+        output = "\n".join(result) + "\n"
+        return output
diff --git a/dataretention/retention/remotefileauditor.py 
b/dataretention/retention/remotefileauditor.py
new file mode 100644
index 0000000..3762bfa
--- /dev/null
+++ b/dataretention/retention/remotefileauditor.py
@@ -0,0 +1,516 @@
+import os
+import sys
+import time
+import json
+import socket
+import runpy
+
+sys.path.append('/srv/audits/retention/scripts/')
+
+import retention.utils
+import retention.magic
+from retention.status import Status
+from retention.saltclientplus import LocalClientPlus
+from retention.rule import Rule, RuleStore
+from retention.config import Config
+from retention.fileinfo import FileInfo
+from retention.utils import JsonHelper
+from retention.runner import Runner
+from retention.localfileaudit import LocalFilesAuditor
+
# snapshot of this module's global names, taken at import time before any
# of the audit machinery below adds to the namespace
global_keys = list(sys.modules[__name__].__dict__.keys())
+
+def get_dirs_toexamine(host_report):
+    '''
+    given full report output from host (list of
+    json entries), return the list
+    of directories with at least one possibly old file
+    and the list of directories skipped due to too
+    many entries
+    '''
+    dirs_problem = set()
+    dirs_skipped = set()
+    lines = host_report.split("\n")
+    for json_entry in lines:
+        if json_entry == "":
+            continue
+
+        if json_entry.startswith("WARNING:"):
+            bad_dir = RemoteFilesAuditor.get_dirname_from_warning(json_entry)
+            if bad_dir is not None:
+                dirs_skipped.add(bad_dir)
+                continue
+
+        if (json_entry.startswith("WARNING:") or
+                json_entry.startswith("INFO:")):
+            print json_entry
+            continue
+
+        try:
+            entry = json.loads(json_entry,
+                               object_hook=JsonHelper.decode_dict)
+        except:
+            print "WARNING: failed to load json for", json_entry
+            continue
+        if 'empty' in entry:
+            empty = FileInfo.string_to_bool(entry['empty'])
+            if empty:
+                continue
+        if 'old' in entry:
+            old = FileInfo.string_to_bool(entry['old'])
+            if old is None or old:
+                if os.path.dirname(entry['path']) not in dirs_problem:
+                    dirs_problem.add(os.path.dirname(entry['path']))
+    return sorted(list(dirs_problem)), sorted(list(dirs_skipped))
+
+
+class RemoteFilesAuditor(object):
+    '''
+    audit files across a set of remote hosts,
+    in a specified set of directories
+    '''
    def __init__(self, hosts_expr, audit_type, prettyprint=False,
                 show_content=False, dirsizes=False, summary_report=False,
                 depth=2, to_check=None, ignore_also=None,
                 timeout=60, maxfiles=None,
                 store_filepath=None,
                 verbose=False):
        '''
        hosts_expr:   list or grain-based or wildcard expr for hosts
                      to be audited
        audit_type:   type of audit e.g. 'logs', 'homes'
        prettyprint:  nicely format the output display
        show_content: show the first line or so from problematic files
        dirsizes:     show only directories which have too many files to
                      audit properly, don't report on files at all
        summary_report: do a summary of results instead of detailed
                        this means different things depending on the audit
                        type
        depth:        the auditor will give up if a directory has too many
                      files in it (saves it from dying on someone's 25gb
                      homedir).  this option tells it how far down the tree
                      to go from the top dir of the audit, before starting
                      to count.  e.g. do we count in /home/ariel or
                      separately in /home/ariel/* or in /home/ariel/*/*, etc.
        to_check:     comma-separated list of dirs (must end in '/') and/or
                      files that will be checked; if this is None then
                      all dirs/files will be checked
        ignore_also:  comma-separated list of dirs (must end in '/') and/or
                      files that will be skipped in addition to the ones
                      in the config, rules, etc.
        timeout:      salt timeout for running remote commands
        maxfiles:     how many files in a directory tree is too many to audit
                      (at which point we warn about that and move on)
        store_filepath: full path to rule store (sqlite3 db)
        verbose:      show informative messages during processing
        '''

        # NOTE(review): 'rules' is declared global but never assigned in
        # this method; looks like a leftover -- confirm before removing
        global rules

        self.hosts_expr = hosts_expr
        self.audit_type = audit_type
        self.locations = audit_type + "_locations"
        self.prettyprint = prettyprint
        self.show_sample_content = show_content
        self.dirsizes = dirsizes
        self.show_summary = summary_report
        self.depth = depth + 1  # actually count of path separators in dirname
        self.to_check = to_check

        # comma-separated string from the caller, held as a list from here on
        self.ignore_also = ignore_also
        if self.ignore_also is not None:
            self.ignore_also = self.ignore_also.split(',')
        self.timeout = timeout
        self.store_filepath = store_filepath
        self.verbose = verbose

        self.set_up_ignored()

        # need this for locally running jobs
        self.hostname = socket.getfqdn()

        self.cutoff = Config.cf['cutoff']

        # expand the host expression via salt so we know the exact set of
        # (responding) minions the audit will cover
        client = LocalClientPlus()
        hosts, expr_type = Runner.get_hosts_expr_type(self.hosts_expr)
        self.expanded_hosts = client.cmd_expandminions(
            hosts, "test.ping", expr_form=expr_type)

        self.set_up_max_files(maxfiles)
        # positional args handed to the retentionaudit salt module function
        # on each minion; order must match fileaudit_host's signature
        fileaudit_args = [self.show_sample_content,
                          self.dirsizes,
                          self.depth - 1,
                          self.to_check,
                          ",".join(self.ignore_also)
                          if self.ignore_also is not None else None,
                          self.timeout,
                          self.MAX_FILES]

        self.runner = Runner(hosts_expr,
                             self.expanded_hosts,
                             self.audit_type,
                             fileaudit_args,
                             self.show_sample_content,
                             self.to_check,
                             self.timeout,
                             self.verbose)

        # optional per-host rules shipped alongside the scripts; kept both
        # parsed (perhost_rules_from_file) and raw (perhost_raw, copied out
        # to the minions later).  failures here are deliberately ignored.
        self.perhost_raw = None
        if os.path.exists(
                '/srv/audits/retention/scripts/audit_files_perhost_config.py'):
            try:
                self.perhost_rules_from_file = runpy.run_path(
                    '/srv/audits/retention/scripts/'
                    'audit_files_perhost_config.py')['perhostcf']
                self.perhost_raw = open(
                    '/srv/audits/retention/scripts/'
                    'audit_files_perhost_config.py').read()
            except:
                pass

        self.write_rules_for_minion()

        self.cdb = RuleStore(self.store_filepath)
        self.cdb.store_db_init(self.expanded_hosts)
        self.set_up_and_export_rule_store()

        self.show_ignored(Config.cf[self.locations])

        self.today = time.time()
        self.magic = retention.magic.magic_open(retention.magic.MAGIC_NONE)
        self.magic.load()
        self.summary = None
        self.display_from_dict = FileInfo.display_from_dict
+
+    def set_up_max_files(self, maxfiles):
+        '''
+        more than this many files in a subdir we won't process,
+        we'll just try to name top offenders
+
+        if we've been asked only to report dir trees that are
+        too large in this manner, we can set defaults mich
+        higher, since we don't stat files, open them to guess
+        their filetype, etc; processing then goes much quicker
+        '''
+
+        if maxfiles is None:
+            if self.dirsizes:
+                self.MAX_FILES = 1000
+            else:
+                self.MAX_FILES = 100
+        else:
+            self.MAX_FILES = maxfiles
+
+    def set_up_and_export_rule_store(self):
+        hosts = self.cdb.store_db_list_all_hosts()
+        where_to_put = os.path.join(os.path.dirname(self.store_filepath),
+                                    "data_retention.d")
+        if not os.path.isdir(where_to_put):
+            os.makedirs(where_to_put, 0755)
+        for host in hosts:
+            nicepath = os.path.join(where_to_put, host + ".conf")
+            Rule.export_rules(self.cdb, nicepath, host)
+
+    def set_up_ignored(self):
+        '''
+        collect up initial list of files/dirs to skip during audit
+        '''
+        self.ignored = {}
+        self.ignored['files'] = Config.cf['ignored_files']
+        self.ignored['dirs'] = Config.cf['ignored_dirs']
+        self.ignored['prefixes'] = Config.cf['ignored_prefixes']
+        self.ignored['extensions'] = Config.cf['ignored_extensions']
+
+        if self.ignore_also is not None:
+            # silently skip paths that are not absolute
+            for path in self.ignore_also:
+                if path.startswith('/'):
+                    if path.endswith('/'):
+                        if '/' not in self.ignored['dirs']:
+                            self.ignored['dirs']['/'] = []
+                        self.ignored['dirs']['/'].append(path[:-1])
+                    else:
+                        if '/' not in self.ignored['files']:
+                            self.ignored['files']['/'] = []
+                        self.ignored['files']['/'].append(path)
+
    def get_perhost_rules_as_json(self):
        '''
        this reads from the data_retention.d directory files for the minions
        on which the audit will be run, converts each host's rules to json
        strings, and returns a hash of rules where keys are the hostname and
        values are the list of rules on that host
        '''
        where_to_get = os.path.join(os.path.dirname(self.store_filepath),
                                    "data_retention.d")
        if not os.path.isdir(where_to_get):
            os.mkdir(where_to_get, 0755)
        # really? or just read each file and be done with it?
        # also I would like to check the syntax cause paranoid.
        rules = {}
        # NOTE(review): this re-creates self.cdb even though __init__
        # already opened the rule store -- confirm that is intentional
        self.cdb = RuleStore(self.store_filepath)
        self.cdb.store_db_init(self.expanded_hosts)
        for host in self.expanded_hosts:
            rules[host] = []
            nicepath = os.path.join(where_to_get, host + ".conf")
            if os.path.exists(nicepath):
                dir_rules = None
                try:
                    # exec of the locally generated conf file is expected
                    # to (re)bind 'dir_rules'; python 2 exec accepts an
                    # open file object.  the file object is never closed
                    # explicitly here.
                    text = open(nicepath)
                    exec(text)
                except:
                    # unreadable/unparseable per-host conf: skip this host
                    continue
                if dir_rules is not None:
                    for status in Status.status_cf:
                        if status in dir_rules:
                            for entry in dir_rules[status]:
                                # only absolute paths are legal rule entries
                                if entry[0] != os.path.sep:
                                    print ("WARNING: relative path in rule,"
                                           "skipping:", entry)
                                    continue
                                # trailing separator marks a directory entry
                                if entry[-1] == os.path.sep:
                                    entry = entry[:-1]
                                    entry_type = Rule.text_to_entrytype('dir')
                                else:
                                    entry_type = Rule.text_to_entrytype('file')
                                rule = Rule.get_rule_as_json(
                                    entry, entry_type, status)
                                rules[host].append(rule)
        return rules
+
+    def write_perhost_rules_normal_code(self, indent):
+        rules = self.get_perhost_rules_as_json()
+
+        for host in rules:
+            rulescode = "rules = {}\n\n"
+            rulescode += "rules['%s'] = [\n" % host
+            rulescode += (indent +
+                     (",\n%s" % (indent + indent)).join(rules[host]) + "\n")
+            rulescode += "]\n"
+
+            with open("/srv/salt/audits/retention/configs/%s_store.py" % host, 
"w+") as fp:
+                fp.write(rulescode)
+                fp.close()
+
+    def write_rules_for_minion(self):
+        indent = "    "
+        self.write_perhost_rules_normal_code(indent)
+        if self.perhost_raw is not None:
+            with open("/srv/salt/audits/retention/configs/allhosts_file.py", 
"w+") as fp:
+                fp.write(self.perhost_raw)
+                fp.close()
+
+    def show_ignored(self, basedirs):
+        if self.verbose:
+            sys.stderr.write(
+                "INFO: The below does not include per-host rules\n")
+            sys.stderr.write(
+                "INFO: or rules derived from the directory status entries.\n")
+
+            sys.stderr.write("INFO: Ignoring the following directories:\n")
+
+            for basedir in self.ignored['dirs']:
+                if basedir in basedirs or basedir == '*' or basedir == '/':
+                    sys.stderr.write(
+                        "INFO: " + ','.join(self.ignored['dirs'][basedir])
+                        + " in " + basedir + '\n')
+
+            sys.stderr.write("INFO: Ignoring the following files:\n")
+            for basedir in self.ignored['files']:
+                if basedir in basedirs or basedir == '*' or basedir == '/':
+                    sys.stderr.write(
+                        "INFO: " + ','.join(self.ignored['files'][basedir])
+                        + " in " + basedir + '\n')
+
+            sys.stderr.write(
+                "INFO: Ignoring files starting with the following:\n")
+            sys.stderr.write(
+                "INFO: " + ','.join(self.ignored['prefixes']) + '\n')
+
+            sys.stderr.write(
+                "INFO: Ignoring files ending with the following:\n")
+            for basedir in self.ignored['extensions']:
+                if basedir in basedirs or basedir == '*':
+                    sys.stderr.write("INFO: " + ','.join(
+                        self.ignored['extensions'][basedir])
+                        + " in " + basedir + '\n')
+
+    def contains(self, string_arg, list_arg):
+        '''
+        check if the string arg cotains any elt in
+        the list_arg
+        '''
+        for elt in list_arg:
+            if elt in string_arg:
+                return True
+        return False
+
    def normalize(self, fname):
        '''
        Return the name used to report on this file; the base class
        reports it unchanged.  Subclasses may want to do something
        different, see LogsAuditor for an example.
        '''
        return fname
+
+    @staticmethod
+    def get_dirname_from_warning(warning):
+        '''
+        some audit output lines warn about directory trees
+        having too many files to audit; grab the dirname
+        out of such a line and return it
+        '''
+        start = "WARNING: directory "
+        if warning.startswith(start):
+            # WARNING: directory %s has more than %d files
+            rindex = warning.rfind(" has more than")
+            if not rindex:
+                return None
+            else:
+                return warning[len(start):rindex]
+
+        start = "WARNING: too many files to audit in directory "
+        if warning.startswith(start):
+            return warning[len(start):]
+
+        return None
+
+    def add_stats(self, item, summary):
+        '''
+        gather stats on how many files/dirs
+        may be problematic; summary is where the results
+        are collected, item is the item to include in
+        the summary if needed
+        '''
+        dirname = os.path.dirname(item['path'])
+
+        if dirname not in summary:
+            summary[dirname] = {
+                'binary': {'old': 0, 'maybe_old': 0, 'nonroot': 0},
+                'text': {'old': 0, 'maybe_old': 0, 'nonroot': 0}
+            }
+        if item['binary'] is True:
+            group = 'binary'
+        else:
+            group = 'text'
+
+        if item['old'] == 'T':
+            summary[dirname][group]['old'] += 1
+        elif item['old'] == '-':
+            summary[dirname][group]['maybe_old'] += 1
+        if item['owner'] != 0:
+            summary[dirname][group]['nonroot'] += 1
+        return summary
+
    def display_host_summary(self):
        '''
        Print, for the host whose results were last accumulated into
        self.summary, one line per (directory, binary/text) pair that
        had any old, maybe-old or non-root-owned files.
        '''
        if self.summary is not None:
            paths = sorted(self.summary.keys())
            for path in paths:
                for group in self.summary[path]:
                    # skip directories with nothing worth reporting
                    if (self.summary[path][group]['old'] > 0 or
                            self.summary[path][group]['maybe_old'] > 0 or
                            self.summary[path][group]['nonroot'] > 0):
                        print ("in directory %s, (%s), %d old,"
                               " %d maybe old, %d with non root owner"
                               % (path, group,
                                  self.summary[path][group]['old'],
                                  self.summary[path][group]['maybe_old'],
                                  self.summary[path][group]['nonroot']))
+
    def display_summary(self, result):
        '''
        Print a per-host summary of the audit results; 'result' maps
        hostname to that host's raw output, one json entry (or
        WARNING:/INFO: line) per line.
        '''
        for host in result:
            self.summary = {}
            print "host:", host

            if result[host]:
                self.summary = {}
                try:
                    lines = result[host].split('\n')
                    for line in lines:
                        if line == '':
                            continue
                        # warnings/info from the minion pass straight through
                        if (line.startswith("WARNING:") or
                                line.startswith("INFO:")):
                            print line
                            continue
                        else:
                            try:
                                item = json.loads(
                                    line, object_hook=JsonHelper.decode_dict)
                                if item['empty'] is not True:
                                    self.add_stats(item, self.summary)
                            except:
                                # trailing comma keeps host/line on the same
                                # output line (python 2 print)
                                print "WARNING: failed to json load from host",
                                print host, "this line:", line
                    self.display_host_summary()
                except:
                    # broad catch: a bad host report must not kill the
                    # summary for the remaining hosts
                    print "WARNING: failed to process output from host"
            else:
                if self.verbose:
                    print "WARNING: no output from host", host
+
    def display_remote_host(self, result):
        '''
        Print the detailed audit report for one host; 'result' is that
        host's raw output, one json entry (or WARNING:/INFO: line) per
        line.
        '''
        try:
            lines = result.split('\n')
            files = []
            for line in lines:
                if line == "":
                    continue
                elif line.startswith("WARNING:") or line.startswith("INFO:"):
                    print line
                else:
                    files.append(json.loads(line,
                                            object_hook=JsonHelper.decode_dict))

            if files == []:
                return
            # pad paths so the per-file columns line up
            path_justify = max([len(finfo['path']) for finfo in files]) + 2
            for finfo in files:
                self.display_from_dict(finfo, self.show_sample_content,
                                       path_justify)
        except:
            # broad catch: a malformed report must not abort the display
            # of the other hosts
            print "WARNING: failed to load json from host"
+
+    def audit_hosts(self):
+        result = self.runner.run_remotely()
+        if result is None:
+            print "WARNING: failed to get output from audit script on any host"
+        elif self.show_summary:
+            self.display_summary(result)
+        else:
+            for host in result:
+                print "host:", host
+                if result[host]:
+                    self.display_remote_host(result[host])
+                else:
+                    if self.verbose:
+                        print "no output from host", host
+        # add some results to rule store
+        self.update_status_rules_from_report(result)
+        return result, self.ignored
+
    def update_status_rules_from_report(self, report):
        '''
        Record audit findings in the rule store: directories holding
        possibly-old files are marked 'problem', directories skipped for
        having too many files are marked 'unreviewed' unless already
        known (now or previously) to be problems.
        '''
        hostlist = report.keys()
        for host in hostlist:
            try:
                problem_rules = Rule.get_rules(self.cdb, host,
                                               Status.text_to_status('problem'))
            except:
                print 'WARNING: problem retrieving problem rules for host', host
                problem_rules = None
            if problem_rules is not None:
                existing_problems = [rule['path'] for rule in problem_rules]
            else:
                existing_problems = []

            dirs_problem, dirs_skipped = get_dirs_toexamine(report[host])
            if dirs_problem is not None:
                dirs_problem = list(set(dirs_problem))
                for dirname in dirs_problem:
                    Rule.do_add_rule(self.cdb, dirname,
                                     Rule.text_to_entrytype('dir'),
                                     Status.text_to_status('problem'), host)

            if dirs_skipped is not None:
                dirs_skipped = list(set(dirs_skipped))
                for dirname in dirs_skipped:
                    if dirname in dirs_problem or dirname in existing_problems:
                        # problem report overrides 'too many to audit'
                        continue
                    Rule.do_add_rule(self.cdb, dirname,
                                     Rule.text_to_entrytype('dir'),
                                     Status.text_to_status('unreviewed'), host)
+
+
diff --git a/dataretention/retention/retentionaudit.py 
b/dataretention/retention/retentionaudit.py
new file mode 100644
index 0000000..b7fefc5
--- /dev/null
+++ b/dataretention/retention/retentionaudit.py
@@ -0,0 +1,17 @@
+# salt module
+import sys
+import logging
+
+sys.path.append('/srv/audits/retention/scripts/')
+
+from retention.localfileaudit import LocalFilesAuditor
+
def fileaudit_host(show_content, dirsizes, depth,
                   to_check, ignore_also, timeout,
                   maxfiles):
    '''
    Salt module entry point: run the 'root' files audit on this minion
    and return the report (one json entry per line) as a single string.
    '''
    auditor = LocalFilesAuditor('root', show_content, dirsizes,
                                depth, to_check, ignore_also,
                                timeout, maxfiles)
    return auditor.do_local_audit()
diff --git a/dataretention/retention/runner.py 
b/dataretention/retention/runner.py
index 5a03930..aef8867 100644
--- a/dataretention/retention/runner.py
+++ b/dataretention/retention/runner.py
@@ -11,7 +11,7 @@
     '''
 
     def __init__(self, hosts_expr, expanded_hosts,
-                 audit_type, generate_executor,
+                 audit_type, auditor_args,
                  show_sample_content=False, to_check=None,
                  timeout=30, verbose=False):
         self.hosts_expr = hosts_expr
@@ -19,22 +19,21 @@
         self.hosts, self.hosts_expr_type = Runner.get_hosts_expr_type(
             self.hosts_expr)
         self.audit_type = audit_type
-        self.generate_executor = generate_executor
+        self.auditmodule_args = auditor_args
         self.show_sample_content = show_sample_content
         self.to_check = to_check
         self.timeout = timeout
         self.verbose = verbose
 
-    @staticmethod
-    def running_locally(hosts_expr):
-        '''
-        determine whether this script is to run on the local
-        host or on one or more remote hosts
-        '''
-        if hosts_expr == "127.0.0.1" or hosts_expr == "localhost":
-            return True
+    def get_auditfunction_name(self):
+        if self.audit_type == 'root':
+            return 'fileaudit_host'
+        elif self.audit_type == 'logs':
+            return 'logaudit_host'
+        elif self.audit_type == 'homes':
+            return 'homeaudit_host'
         else:
-            return False
+            return None
 
     def run_remotely(self):
         '''
@@ -46,10 +45,9 @@
         if self.expanded_hosts is None:
             self.expanded_hosts = client.cmd_expandminions(
                 self.hosts, "test.ping", expr_form=self.hosts_expr_type)
-        code = "# -*- coding: utf-8 -*-\n"
-        code += self.generate_executor()
-        with open('/srv/audits/retention/scripts/data_auditor.py', 'r') as fp_:
-            code += fp_.read()
+
+        # fixme instead of this we call the right salt module based on the
+        # audit type and with the self.auditmodule_args which is a list
 
         hostbatches = [self.expanded_hosts[i: i + Config.cf['batchsize']]
                        for i in range(0, len(self.expanded_hosts),
@@ -72,14 +70,15 @@
                                                  'template=jinja'], 
expr_form='list')
             # fixme only copy if exists, check returns
             # fixme this content should be ordered by host instead of by 
ignore-list type
-            # and split into separate files just as the previous files are
+            # and split into separate files just as the previous files are, 
and actually be in one file
+            # with one copy total per client
             new_result = client.cmd_full_return(hosts, 'cp.get_file',
                                                 
['salt://audits/retention/configs/allhosts_file.py',
                                                  
"/srv/audits/retention/configs/allhosts_file.cf",
                                                  'template=jinja'], 
expr_form='list')
-            print "salt-copy (2):", new_result
 
-            new_result = client.cmd(hosts, "cmd.exec_code", ["python2", code],
+            # step two: run the appropriate salt audit module function
+            new_result = client.cmd(hosts, "retentionaudit.%s" % 
self.get_auditfunction_name(), self.auditmodule_args,
                                     expr_form='list', timeout=self.timeout)
 
             if new_result is not None:

-- 
To view, visit https://gerrit.wikimedia.org/r/233453
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I0cd978a33cd2508197247355355e2141aadcab73
Gerrit-PatchSet: 2
Gerrit-Project: operations/software
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to