coren has submitted this change and it was merged. Change subject: Labs: Script to back labstore filesystems up ......................................................................
Labs: Script to back labstore filesystems up This will create the snapshot as needed to make a time-consistent copy. TODO: clean snapshots up as free space is becoming low or they are becoming full. Bug: T105027 Change-Id: I078179f84a323957a4124f502aea3073d5c993b5 --- A hieradata/role/common/labstore/fileserver.yaml A modules/labstore/files/replication-rsync.conf A modules/labstore/files/storage-replicate M modules/labstore/manifests/fileserver.pp 4 files changed, 294 insertions(+), 2 deletions(-) Approvals: coren: Looks good to me, approved jenkins-bot: Verified diff --git a/hieradata/role/common/labstore/fileserver.yaml b/hieradata/role/common/labstore/fileserver.yaml new file mode 100644 index 0000000..dab033b --- /dev/null +++ b/hieradata/role/common/labstore/fileserver.yaml @@ -0,0 +1,3 @@ +# Paramiko needs to ssh into these for replication/backups +"ssh::server::disable_nist_kex": false +"ssh::server::explicit_macs": false diff --git a/modules/labstore/files/replication-rsync.conf b/modules/labstore/files/replication-rsync.conf new file mode 100644 index 0000000..fb94d9a --- /dev/null +++ b/modules/labstore/files/replication-rsync.conf @@ -0,0 +1,10 @@ +# Do not back log and default output files up +# (they tend to grow a lot, and are not valuable +# enough to keep for DR purposes) +- /tools/**/*.log +- /tools/**/*.err +- /tools/**/*.out +# Not relevant to rsync +- /lost+found +# Allow endusers to filter their own backups, too +: .nobackup diff --git a/modules/labstore/files/storage-replicate b/modules/labstore/files/storage-replicate new file mode 100755 index 0000000..62333a2 --- /dev/null +++ b/modules/labstore/files/storage-replicate @@ -0,0 +1,264 @@ +#! 
/usr/bin/python3 +# -*- coding: utf-8 -*- +# +# Copyright © 2015 Marc-André Pelletier <[email protected]> +# +# Permission to use, copy, modify, and/or distribute this software for any +# purpose with or without fee is hereby granted, provided that the above +# copyright notice and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# +## +## THIS FILE IS MANAGED BY PUPPET +## +## Source: modules/labstore/storage-replicate +## From: tbd +## + +## +## storage-replicate +## +## usage: storage-replicate <mountpoint> <host> <dest> +## +## Replicates the directory at <mountpoint> (which must have a +## volume mounted) to the destination <host>, at mountpoint +## <dest>. A snapshot of the source will be taken (and kept) +## and a temporary snapshot of the destination will be taken +## before the rsync proper (so that there exists a consistent +## snapshot at all times). +## +## This script provides for locking to avoid more than one +## replication taking place at a time and making a mess of things. +## The lock directory is also where the source snapshot +## will be mounted. 
+## + +import argparse +import re +import datetime +import subprocess +import sys +import logging +import logging.handlers +import os +import paramiko +from shlex import quote + + +class RuntimeError(Exception): + def __init__(self, ctx, err): + self.ctx = ctx + self.err = err + + def __str__(self): + if self.ctx.host: + return '[%s] %s' % (self.ctx.host, self.err) + return '[local] ' + repr(self.err) + + +class Context: + """This provides a (trivial) abstraction for executing + commands and reading files either locally (via subprocess + and open) or remotely (via paramiko) such that the same + interface can be used for both.""" + + def __init__(self, host): + if host: + self.host = host + self.client = paramiko.SSHClient() + self.client.load_system_host_keys() + self.client.connect(hostname = host, key_filename = '/root/.ssh/id_labstore') + else: + self.host = None + self.client = None + + def read(self, path): + if self.host: + (out, err) = self.run('/bin/cat', path) + if err and err != "": + raise RuntimeError(self, err) + + return out.splitlines() + + else: + with open(path, 'r') as fd: + return fd.readlines() + + def run(self, *cmd): + """This executes the specified command either as a subprocess + (for local contexts) or via SSH (for remote contexts). + + The method takes the variadic arguments and constructs a + (suitably quoted) argument list in both cases and returns + a tuple of arrays containing, respectively, strings containing + the standard output an error of the command. 
No provision + is made for passing standard input to the command.""" + + if self.host: + command = ' '.join([quote(arg) for arg in list(cmd)]) + (si, so, se) = self.client.exec_command(command) + + out = so.read() + err = se.read() + + if not err or err=='': + return (out, None) + + return (None, err) + + else: + sub = subprocess.Popen(list(cmd), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (out, err) = sub.communicate() + if sub.returncode: + err = err.splitlines(False)[0].strip() + if not err or err=='': + if sub.returncode < 0: + err = "killed by signal %d" % -sub.returncode + else: + err = "exited with %d" % sub.returncode + return (None, err) + return (out, None) + + +class Lockdir: + """Creates a lock directory and mountpoint for the snapshot. + + This sets up a context guard around the specified path + that guarantees exclusive access (because mkdir is atomic) + and a single, predictable mountpoint for the snapshot; and + (more importantly) makes certain that any mounted snapshot + is properly unmounted before the context terminates.""" + + def __init__(self, ctx, path): + self.ctx = ctx + self.path = path + self.mountpoint = "%s/snapshot" % path + self.err = None + + def __enter__(self): + try: + os.mkdir(self.path, 0o700) + os.mkdir(self.mountpoint, 0o700) + except OSError as e: + self.err = "unable to create lock directory %s: %s" % (self.path, e.strerror) + return self + + def __exit__(self, e1, e2, e3): + (out, err) = self.ctx.run('/bin/umount', '-fl', self.mountpoint) + (out, err) = self.ctx.run('/bin/rm', '-rf', self.path); + return None + + +def volume_device(ctx, path): + + # Find the specified path in /proc/mounts, matching only logical volumes + # and extract the volume group and name from the device entry + + vg = None + lv = None + for line in ctx.read('/proc/mounts'): + # This matches lines of the form: + # /dev/mapper/labstore-maps /srv/project/maps ext4 rw,... 
+ # and extracts the volume group and name matching the mountpoints + match = re.match(r'/dev/mapper/([^-]+)-(\S+)\s+(\S+)\s', line) + if match and match.group(3) == path: + vg, lv = match.group(1, 2) + + if not (vg and lv): + raise RuntimeError(ctx, "%s is not a LVM volume mountpoint" % path) + + # Now check that the specified volume has the correct attributes + (out, err) = ctx.run('/sbin/lvs', '--noheadings', '--options', 'lv_attr', '/dev/mapper/%s-%s' % (vg, lv)) + if err: + raise RuntimeError(ctx, "/sbin/lvs: " + err) + + # Must be: not (s)napshot, (-) not mirror, and (a)ctive + # The format of lv_attr (the only output) is detailed in lvs(8) + if not re.match(r'^[^s]..-a...', out.strip()): + raise RuntimeError(ctx, "%s-%s is not a suitable volume for replication" % (vg, lv)) + + return (vg, lv) + +parser = argparse.ArgumentParser() +parser.add_argument('path', help='Path to the mountpoint to replicate') +parser.add_argument('host', help='Destination host for the replica') +parser.add_argument('dest', help='Destination mountpoint for the replica') +args = parser.parse_args() + +local = Context(None) +remote = Context(args.host) + +(srcvg, srclv) = volume_device(local, args.path) +(dstvg, dstlv) = volume_device(remote, args.dest) + +logging.debug("Backing up %s (%s/%s) -> %s:%s (%s/%s)" + % (args.path, srcvg, srclv, args.host, args.dest, dstvg, dstlv)) + +snapshot = srclv + datetime.datetime.utcnow().strftime("%Y%m%d") +lockdir = '/var/run/lock/storage-replicate-%s-%s' % (srcvg, srclv) + +with Lockdir(local, lockdir) as lock: + + if lock.err: + # The lock directory already exists, so the previous + # rsync is running long. Log the event, and exit. + try: + with open('%s/started' % lockdir, 'r') as f: + when = f.readline().strip() + except IOError as e: + when = 'some time ago? 
(no start time file: %s)' % e.strerror + logging.warning("Skipping replication; already in progress since %s" % when) + sys.exit(0) + + with open('%s/started' % lockdir, 'w+') as f: + f.write(datetime.datetime.utcnow().strftime("%Y-%m-%d% H%:M\n")) + + (out, err) = local.run( + '/sbin/lvcreate', '--size', '1T', '--snapshot', '--name', snapshot, '%s/%s' % (srcvg, srclv)) + if err: + logging.critical('unable to create local snapshot (%s-%s): %s' % (srcvg, snapshot, err)) + sys.exit(1) + + (out, err) = local.run( + '/bin/mount', '-oro,noload', + '/dev/mapper/%s-%s' % (srcvg, snapshot), + lock.mountpoint) + if err: + logging.critical('unable to mount local snapshot (%s-%s): %s' % (srcvg, snapshot, err)) + sys.exit(1) + + (out, err) = remote.run( + '/sbin/lvcreate', '--size', '1T', '--snapshot', '--name', snapshot, '%s/%s' % (dstvg, srclv)) + if err: + logging.critical('unable to create remote snapshot (%s-%s): %s' % (dstvg, snapshot, err)) + sys.exit(1) + + logging.info("Replication of %s-%s starting" % (srcvg, snapshot)) + + (out, err) = local.run( + '/usr/bin/ionice', '--class', 'Idle', + '/usr/bin/rsync', '--protect-args', + '--archive', '--update', '--hard-links', '--acls', '--xattrs', '--delete-during', + '--rsh=ssh -i /root/.ssh/id_labstore', + '--inplace', '--append-verify', '--filter=._/etc/replication-rsync.conf', + '%s/.' 
% lock.mountpoint, + '%s:%s' % (args.host, args.dest)) + if err: + logging.critical('rsync failed: %s' % err) + exit(1) + + logging.info("Replication of %s-%s complete" % (srcvg, snapshot)) + + (out, err) = remote.run( + '/sbin/lvremove', '--force', '%s/%s' % (dstvg, snapshot)) + + if err: + logging.warn('unable to remove remote snapshot (%s-%s): %s' % (dstvg, snapshot, err)) + diff --git a/modules/labstore/manifests/fileserver.pp b/modules/labstore/manifests/fileserver.pp index cec9fbc..7d562ed 100644 --- a/modules/labstore/manifests/fileserver.pp +++ b/modules/labstore/manifests/fileserver.pp @@ -17,11 +17,26 @@ } file { '/etc/init/replica-addusers.conf': - source => 'puppet:///modules/labstore/replica-addusers.conf', + source => 'puppet:///modules/labstore/replica-addusers.conf', + owner => 'root', + group => 'root', + mode => '0444', + require => File['/usr/local/sbin/replica-addusers.pl'], + } + + file { '/etc/replication-rsync.conf': + source => 'puppet:///modules/labstore/replication-rsync.conf', owner => 'root', group => 'root', mode => '0444', - require => File['/usr/local/sbin/replica-addusers.pl'], + } + + file { '/usr/local/sbin/storage-replicate': + source => 'puppet:///modules/labstore/storage-replicate', + owner => 'root', + group => 'root', + mode => '0444', + require => File['/etc/replication-rsync.conf'], } # There is no service {} stanza on purpose -- this service -- To view, visit https://gerrit.wikimedia.org/r/224064 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I078179f84a323957a4124f502aea3073d5c993b5 Gerrit-PatchSet: 8 Gerrit-Project: operations/puppet Gerrit-Branch: production Gerrit-Owner: coren <[email protected]> Gerrit-Reviewer: Yuvipanda <[email protected]> Gerrit-Reviewer: coren <[email protected]> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list [email protected] 
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
