ArielGlenn has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/366308 )

Change subject: setup for dumpsdata hosts to serve dumps work area via nfs to 
snapshots
......................................................................

setup for dumpsdata hosts to serve dumps work area via nfs to snapshots

[WIP] Still a draft with many FIXMEs.
This is here as a placeholder while pieces of it get filled
in further.

Architecture notes are on the phab task.

Bug: T169849
Change-Id: I5f52a1d3725e5f794573ea293c8978ceeb9027a8
---
A modules/dumpsdata/files/default-nfs-common
A modules/dumpsdata/files/default-nfs-kernel-server
A modules/dumpsdata/files/rsync_completed_dumpjobs.py
A modules/dumpsdata/manifests/dirs.pp
A modules/dumpsdata/manifests/nfs.pp
A modules/dumpsdata/manifests/rsync.pp
A modules/dumpsdata/templates/nfs_exports.erb
M modules/snapshot/manifests/dumps/cron.pp
8 files changed, 511 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/08/366308/1

diff --git a/modules/dumpsdata/files/default-nfs-common 
b/modules/dumpsdata/files/default-nfs-common
new file mode 100644
index 0000000..dd37dd4
--- /dev/null
+++ b/modules/dumpsdata/files/default-nfs-common
@@ -0,0 +1,16 @@
+# If you do not set values for the NEED_ options, they will be attempted
+# autodetected; this should be sufficient for most people. Valid alternatives
+# for the NEED_ options are "yes" and "no".
+
+# Do you want to start the statd daemon? It is not needed for NFSv4.
+NEED_STATD=
+
+# Options for rpc.statd.
+#   Should rpc.statd listen on a specific port? This is especially useful
+#   when you have a port-based firewall. To use a fixed port, set
+#   this variable to a statd argument like: "--port 4000 --outgoing-port 4001".
+#   For more information, see rpc.statd(8) or 
http://wiki.debian.org/SecuringNFS
+STATDOPTS="--port 32765 --outgoing-port 32766"
+
+# Do you want to start the gssd daemon? It is required for Kerberos mounts.
+NEED_GSSD=
diff --git a/modules/dumpsdata/files/default-nfs-kernel-server 
b/modules/dumpsdata/files/default-nfs-kernel-server
new file mode 100644
index 0000000..54eba53
--- /dev/null
+++ b/modules/dumpsdata/files/default-nfs-kernel-server
@@ -0,0 +1,23 @@
+# Number of servers to start up
+# To disable nfsv4 on the server, specify '--no-nfs-version 4' here
+RPCNFSDCOUNT=8
+
+# Runtime priority of server (see nice(1))
+RPCNFSDPRIORITY=0
+
+# Options for rpc.mountd.
+# If you have a port-based firewall, you might want to set up
+# a fixed port here using the --port option. For more information,
+# see rpc.mountd(8) or http://wiki.debian.org/SecuringNFS
+# To disable NFSv4 on the server, specify '--no-nfs-version 4' here
+RPCMOUNTDOPTS="--manage-gids -p 32767"
+
+# Do you want to start the svcgssd daemon? It is only required for Kerberos
+# exports. Valid alternatives are "yes" and "no"; the default is "no".
+NEED_SVCGSSD=
+
+# Options for rpc.svcgssd.
+RPCSVCGSSDOPTS=
+
+# Options for rpc.nfsd.
+RPCNFSDOPTS=
diff --git a/modules/dumpsdata/files/rsync_completed_dumpjobs.py 
b/modules/dumpsdata/files/rsync_completed_dumpjobs.py
new file mode 100644
index 0000000..355ce71
--- /dev/null
+++ b/modules/dumpsdata/files/rsync_completed_dumpjobs.py
@@ -0,0 +1,223 @@
+import getopt
+import json
+import os
+import socket
+import subprocess
+import sys
+
+try:
+    import configparser
+except ImportError:
+    # Python 2 name of the same module
+    import ConfigParser as configparser
+
+
+"""
+rsync completed xml dump jobs to target host/dir
+
+we don't rsync incomplete files because we'll just end up
+copying large partial files on every run and wasting bandwidth
+for the big wiki dumps (e.g. enwiki)
+
+later we need to deal with rsync of the files in other/ etc, ugh
+
+"""
+class Rsyncer(object):
+    def __init__(self, rsync_args, configfile, dryrun, list_only):
+        self.rsync_args = rsync_args
+        self.dryrun = dryrun
+        self.configfile = configfile
+        self.list_only = list_only
+
+    def get_wikis_to_rsync(self):
+        conf = ConfigParser.SafeConfigParser()
+        self.conf.read(configfile)
+        list_of_wikis = self.conf.get("wiki", "dblist")
+        with open(path) as fhandle:
+        wikis = []
+        for line in fhandle:
+            line = line.strip()
+            if line != "":
+                wikis.append(line)
+        fhandle.close()
+        return sorted(wikis)
+
+   def get_dumpstatus(self, dirname):
+       statusfile = os.path.join(dirname, "dumpstatus.json")
+       status = None
+       try:
+           with open(statusfile) as fhandle:
+               contents = fhandle.read()
+               status = json.loads(contents)
+               fhandle.close()
+           return status
+       except Exception:
+           return None
+       
+   def get_specialfiles(self, dirname):
+       infofile = os.path.join(dirname, "dumpspecialfiles.json")
+       specials = None
+       try:
+           with open(infofile) as fhandle:
+               contents = fhandle.read()
+               specials = json.loads(contents)
+               fhandle.close()
+           return specials
+       except Exception:
+           return None
+
+    def get_files_to_rsync_in_dir(self, dirname):
+        dumpstatus = self.get_dumpstatus(dirname)
+        files = []
+        if not dumpstatus:
+            return files
+        try:
+            for job in dumpstatus['jobs']:
+                if dumpstatus['jobs'][job]['status'] != 'done':
+                    next
+                    files.extend(dumpstatus['jobs'][job]['files'].keys())
+        except Exception:
+            pass
+        return [os.path.join(dirname, filename) for filename in files]
+
+    def get_special_files_to_rsync_in_dir(self, dirname):
+        specialfiles = self.get_specialfiles(dirname)
+        files = []
+        if not specialfiles:
+            return files
+        try:
+            for filename in specialfiles['files']:
+                if specialfiles['files'][filename]['status'] != 'present':
+                    next
+                files.extend(filename)
+        except Exception:
+            pass
+        return [os.path.join(dirname, filename) for filename in files]
+
+    def get_files_to_rsync(self, wiki):
+        dirs = get_directories(wiki)
+        files = []
+        for dirname in dirs:
+            files.extend(get_files_to_rsync_in_dir(dirname))
+        # claim is that sorting these makes rsync more efficient when it goes 
to sync them
+        return sorted(files)
+
+    def get_special_files_to_rsync(self, wiki):
+        dirs = get_directories(wiki)
+        files = []
+        for dirname in dirs:
+            files.extend(get_special_files_to_rsync_in_dir(dirname))
+        # claim is that sorting these makes rsync more efficient when it goes 
to sync them
+        return sorted(files)
+        
+    def rsync_files(self, wiki, files):
+        command = [ "/usr/bin/rsync" ]
+        command.append("--files-from=-")
+        command.extend(self.rsync_args)
+        process = Popen(command, stdin=PIPE, stdout=PIPE, stderr=PIPE, 
shell=False)
+        # FIXME there's this issue with the full path and etc.
+        output, errors = process.communicate(input=" ".join(files))
+
+    def get_toplevel_files(self):
+        files = []
+        # get index.htmls, rsync file listings
+        # FIXME write this
+        return files
+
+    def already_running():
+        command = ["/usr/bin/pgrep", "-f", "rsync_completed_dumpjobs.py" ]
+        try:
+            subprocess.check_output(command)
+            # return code 0 = already running, anything else excepts
+            return False
+        except subprocess.CalledProcessError as err:
+            if err.returncode != 1:
+                # genuine error
+                raise
+            else:
+                return True
+
+    def rsync(self):
+        if already_running():
+            return
+        wikis = get_wikis_to_rsync()
+        for wiki in wikis:
+            files = get_files_to_rsync(wiki)
+            rsync_files(wiki, files)
+            files = get_special_files_to_rsync(wiki)
+            rsync_files(wiki, special_files)
+        files = get_toplevel_files()
+        rsync_files(None, files)
+
+
+def usage(message=None):
+    '''
+    display a helpful usage message with
+    an optional introductory message first
+    '''
+
+    if message is not None:
+        sys.stderr.write(message)
+        sys.stderr.write("\n")
+    usage_message = """
+Usage: rsync_completed_dumpjobs.py --configfile <path> [--rsyncargs <args>]
+                  [--dryrun] [--listonly]| --help
+
+Options:
+  --configfile  (-c):  path to confguration file for dumps
+  --rsyncargs   (-r):  additional arguments to be passed to rsync
+  --listonly    (-l):  list files that would be transferred, don't actually
+                       transfer them
+  --dryrun      (-d):  don't rsync, show the commands that would be run
+  --help        (-h):  display this help message
+"""
+    sys.stderr.write(usage_message)
+    sys.exit(1)
+
+
+def validate_args(configfile, remainder):
+    if configfile is None:
+        usage("Mandatory configfile argument is missing")
+    elif len(remainder) > 0:
+        usage("Unknown option(s) specified: <%s>" % remainder[0])
+
+
+def process_args():
+    """
+    get and check validity of command line args
+    """
+    dryrun = False
+    list_only = False
+    rsync_args = []
+    configfile = None
+    try:
+        (options, remainder) = getopt.gnu_getopt(
+            sys.argv[1:], "c:r:l:dh", ["confilefile=", "rsyncargs=", 
"listonly", "dryrun", "help"])
+    except getopt.GetoptError as err:
+        usage("Unknown option specified: " + str(err))
+
+    for (opt, val) in options:
+        if opt in ["-c", "--configfile"]:
+            configfile = val
+        elif opt in ["-r", "--rsyncargs"]:
+            rsyncargs = val.split(',')
+        elif opt in ["-l", "--listonly"]:
+            listonly = True
+        elif opt in ["-d", "--dryrun"]:
+            dryrun = True
+        elif opt in ["-h", "--help"]:
+            usage("Help for this script")
+
+    validate_args(configfile, remainder)
+    return dryrun, list_only, rsync_args, configfile
+
+
+def do_main():
+    dryrun, list_only, rsync_args, configfile = process_args()
+    rsyncer = Rsyncer(rsync_args, configfile, dryrun, list_only)
+    rsyncer.rsync()
+
+    # Primary for '/public/':
+    #   /usr/bin/rsync -v --bwlimit=40000 -a --delete
+    #          --exclude=wikidump_* --exclude=md5temp.*
+    #          --exclude=/dir-done-by-secondary/
+    #          --exclude=/another-dir-done-by-secondary/
+    #          --exclude=/other/
+    #          /data/xmldatadumps/public/
+    #          remotehost::data/xmldatadumps/public/
+    #
+
+
+# run only when executed as a script, not when imported as a module
+if __name__ == '__main__':
+    do_main()
diff --git a/modules/dumpsdata/manifests/dirs.pp 
b/modules/dumpsdata/manifests/dirs.pp
new file mode 100644
index 0000000..b718d21
--- /dev/null
+++ b/modules/dumpsdata/manifests/dirs.pp
@@ -0,0 +1,124 @@
+# Directories of the dumps data tree.
+# The class name follows the file location (modules/dumpsdata/manifests/dirs.pp)
+# so that the puppet autoloader can find it.
+class dumpsdata::dirs {
+    # Please note that this is incomplete, but new directories
+    # should be defined in puppet (here).
+    $datadir                  = '/data/xmldatadumps'
+    $publicdir                = '/data/xmldatadumps/public'
+    $otherdir                 = "${publicdir}/other"
+    $analyticsdir             = "${otherdir}/analytics"
+    $othermiscdir             = "${otherdir}/misc"
+    $othertestfilesdir        = "${otherdir}/testfiles"
+    $otherdir_wikidata_legacy = "${otherdir}/wikidata"
+    $otherdir_wikibase        = "${otherdir}/wikibase/"
+    $relative_wikidatawiki    = 'other/wikibase/wikidatawiki'
+    $xlationdir               = "${otherdir}/contenttranslation"
+    $cirrussearchdir          = "${otherdir}/cirrussearch"
+    $medialistsdir            = "${otherdir}/imageinfo"
+    $pagetitlesdir            = "${otherdir}/pagetitles"
+    $mediatitlesdir           = "${otherdir}/mediatitles"
+
+    file { $datadir:
+        ensure => 'directory',
+        mode   => '0755',
+        owner  => 'root',
+        group  => 'root',
+    }
+
+    # the only directory here that is group writable
+    file { $publicdir:
+        ensure => 'directory',
+        mode   => '0775',
+        owner  => 'datasets',
+        group  => 'datasets',
+    }
+
+    # everything below the public directory shares the same
+    # ownership and permissions
+    file { [
+        $otherdir,
+        $analyticsdir,
+        $othermiscdir,
+        $othertestfilesdir,
+        $otherdir_wikibase,
+        "${publicdir}/${relative_wikidatawiki}",
+        # Legacy
+        $otherdir_wikidata_legacy,
+        $xlationdir,
+        $cirrussearchdir,
+        $medialistsdir,
+        $mediatitlesdir,
+        $pagetitlesdir,
+    ]:
+        ensure => 'directory',
+        mode   => '0755',
+        owner  => 'datasets',
+        group  => 'datasets',
+    }
+
+    # T72385
+    # needs to be relative because it is mounted via NFS at differing names
+    file { "${publicdir}/wikidatawiki/entities":
+        ensure => 'link',
+        target => "../${relative_wikidatawiki}",
+    }
+}
diff --git a/modules/dumpsdata/manifests/nfs.pp 
b/modules/dumpsdata/manifests/nfs.pp
new file mode 100644
index 0000000..03e4776
--- /dev/null
+++ b/modules/dumpsdata/manifests/nfs.pp
@@ -0,0 +1,52 @@
+# NFS server setup: exports /data to the hosts listed in hiera
+# and pins the auxiliary NFS daemons to fixed ports.
+class dumpsdata::nfs() {
+
+    # hosts allowed to mount the export; used by nfs_exports.erb
+    $clients = hiera('dumpsdata_clients_snapshots')
+
+    require_package('nfs-kernel-server', 'nfs-common', 'rpcbind')
+
+    file { '/etc/exports':
+        mode    => '0444',
+        owner   => 'root',
+        group   => 'root',
+        content => template('dumpsdata/nfs_exports.erb'),
+        require => Package['nfs-kernel-server'],
+    }
+
+    # fixed ports for rpc.statd (32765, 32766)
+    file { '/etc/default/nfs-common':
+        mode    => '0444',
+        owner   => 'root',
+        group   => 'root',
+        source  => 'puppet:///modules/dumpsdata/default-nfs-common',
+        require => Package['nfs-kernel-server'],
+    }
+
+    # fixed port for rpc.mountd (32767)
+    file { '/etc/default/nfs-kernel-server':
+        mode    => '0444',
+        owner   => 'root',
+        group   => 'root',
+        source  => 'puppet:///modules/dumpsdata/default-nfs-kernel-server',
+        require => Package['nfs-kernel-server'],
+    }
+
+    # subscribe implies require, so the files need no separate require;
+    # the server is refreshed when its exports or its daemon options change
+    service { 'nfs-kernel-server':
+        ensure    => 'running',
+        require   => Package['nfs-kernel-server'],
+        subscribe => [
+            File['/etc/exports'],
+            File['/etc/default/nfs-kernel-server'],
+        ],
+    }
+
+    monitoring::service { 'nfs':
+        description   => 'NFS',
+        check_command => 'check_tcp!2049',
+    }
+
+    # fixed ports for lockd; the content ends with a newline so that
+    # the file is a well-formed text file
+    file { '/etc/modprobe.d/nfs-lockd.conf':
+        ensure  => present,
+        owner   => 'root',
+        group   => 'root',
+        mode    => '0644',
+        content => "options lockd nlm_udpport=32768 nlm_tcpport=32769\n",
+    }
+}
diff --git a/modules/dumpsdata/manifests/rsync.pp 
b/modules/dumpsdata/manifests/rsync.pp
new file mode 100644
index 0000000..c05ea4b
--- /dev/null
+++ b/modules/dumpsdata/manifests/rsync.pp
@@ -0,0 +1,22 @@
+# Installs the script that rsyncs completed dump job files and runs
+# it hourly from cron.
+# The class name follows the file location (modules/dumpsdata/manifests/rsync.pp)
+# so that the puppet autoloader can find it.
+class dumpsdata::rsync() {
+    # NOTE(review): this still pulls in the dataset module; confirm that
+    # is intended for dumpsdata hosts
+    include ::dataset::common
+
+    file { '/usr/local/bin/rsync_completed_dumpjobs.py':
+        ensure => 'present',
+        mode   => '0755',
+        owner  => 'root',
+        group  => 'root',
+        path   => '/usr/local/bin/rsync_completed_dumpjobs.py',
+        source => 'puppet:///modules/dumpsdata/rsync_completed_dumpjobs.py',
+    }
+
+    # FIXME: the script refuses to run without --configfile <path>, and
+    # none is passed here yet
+    cron { 'rsync-dumps':
+        ensure  => present,
+        # filter out error messages about vanishing files, we don't want
+        # email for that
+        command => '/usr/bin/python /usr/local/bin/rsync_completed_dumpjobs.py 2>&1 | grep -v "vanished" ',
+        user    => 'root',
+        minute  => '10',
+        hour    => '*',
+        require => File['/usr/local/bin/rsync_completed_dumpjobs.py'],
+    }
+}
diff --git a/modules/dumpsdata/templates/nfs_exports.erb 
b/modules/dumpsdata/templates/nfs_exports.erb
new file mode 100644
index 0000000..8d55678
--- /dev/null
+++ b/modules/dumpsdata/templates/nfs_exports.erb
@@ -0,0 +1,15 @@
+# /etc/exports: the access control list for filesystems which may be exported
+#              to NFS clients.  See exports(5).
+#
+#  THIS FILE IS MANAGED BY PUPPET
+#
+#  Source: dumpsdata/templates/nfs_exports.erb
+#
+# Example for NFSv2 and NFSv3:
+# /srv/homes       hostname1(rw,sync,no_subtree_check) 
hostname2(ro,sync,no_subtree_check)
+#
+# Example for NFSv4:
+# /srv/nfs4        gss/krb5i(rw,sync,fsid=0,crossmnt,no_subtree_check)
+# /srv/nfs4/homes  gss/krb5i(rw,sync,no_subtree_check)
+#
+/data -rw,async,no_root_squash,no_subtree_check <%= Array(@clients).join(' ') 
%>
diff --git a/modules/snapshot/manifests/dumps/cron.pp 
b/modules/snapshot/manifests/dumps/cron.pp
index 85e109a..a5e525e 100644
--- a/modules/snapshot/manifests/dumps/cron.pp
+++ b/modules/snapshot/manifests/dumps/cron.pp
@@ -52,4 +52,40 @@
         monthday    => '20-25',
     }
 
+    # scripts run by the 'cleanup' cron job; each is generated from a
+    # template, and the resource title doubles as the path on disk
+    file { '/usr/local/bin/save_prefetches.sh':
+        ensure  => 'present',
+        owner   => 'root',
+        group   => 'root',
+        mode    => '0755',
+        content => template('snapshot/dumps/save_prefetches.sh.erb'),
+    }
+
+    file { '/usr/local/bin/cleanup_old_dumps.sh':
+        ensure  => 'present',
+        owner   => 'root',
+        group   => 'root',
+        mode    => '0755',
+        content => template('snapshot/dumps/cleanup_old_dumps.sh.erb'),
+    }
+
+    file { '/usr/local/bin/cleanup_prefetches.sh':
+        ensure  => 'present',
+        owner   => 'root',
+        group   => 'root',
+        mode    => '0755',
+        content => template('snapshot/dumps/cleanup_prefetches.sh.erb'),
+    }
+
+    # once a week: save prefetch files, then remove old dumps and
+    # old prefetch files, in that order
+    cron { 'cleanup':
+        ensure      => 'present',
+        environment => '[email protected]',
+        user        => $user,
+        command     => '/usr/local/bin/save_prefetches.sh; /usr/local/bin/cleanup_old_dumps.sh; /usr/local/bin/cleanup_prefetches.sh',
+        minute      => '05',
+        hour        => '7',
+        weekday     => '7',
+    }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/366308
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5f52a1d3725e5f794573ea293c8978ceeb9027a8
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: ArielGlenn <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to