ArielGlenn has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/381760 )
Change subject: move hardcoded hostnames out of script for rsync of dumps to
peers
......................................................................
move hardcoded hostnames out of script for rsync of dumps to peers
Change-Id: Id742ff4ff551db154c6de0802b2b00ccb1980654
---
M modules/dumps/files/copying/rsync-dumps.py
M modules/dumps/manifests/copying/peers.pp
M modules/profile/manifests/dumps/web/xmldumps_active.pp
3 files changed, 148 insertions(+), 51 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/60/381760/1
diff --git a/modules/dumps/files/copying/rsync-dumps.py
b/modules/dumps/files/copying/rsync-dumps.py
index eab2cbd..90887ae 100644
--- a/modules/dumps/files/copying/rsync-dumps.py
+++ b/modules/dumps/files/copying/rsync-dumps.py
@@ -1,3 +1,4 @@
+import getopt
import sys
import subprocess
import socket
@@ -29,11 +30,11 @@
excludes.append(host_info[job]['exclude']['dir'])
return excludes
- def rsync_all(self, host_info):
- for job in host_info:
- excludes = self.get_excludes_for_job(job, host_info)
+ def rsync_all(self, rsync_info):
+ for job in rsync_info:
+ excludes = self.get_excludes_for_job(job, rsync_info)
- hosts = host_info[job]['hosts']
+ hosts = rsync_info[job]['hosts']
if self.host not in hosts:
# no rsync job info for this host
continue
@@ -71,7 +72,7 @@
# not a primary, no specific dirs to sync, do nothing
continue
- self.do_rsync(host_info[job]['source'], host_info[job]['dest'],
+ self.do_rsync(rsync_info[job]['source'], rsync_info[job]['dest'],
targets, dir_args)
def do_rsync(self, src, dest, targets, dir_args):
@@ -114,37 +115,7 @@
print errs
-def usage(message):
- if message:
- sys.stderr.write(message + "\n")
- help_message = """Usage: rsync-dumps.py [dryrun] [bw=number] [list]
- dryrun -- show commands that would be run instead of runnning them
- bw -- cap rsync bandwidth to this number (default: 40000)
- list -- only list files that would be transferred instead of sending them
-"""
- sys.stderr.write(help_message)
- sys.exit(1)
-
-
-def do_main():
- dryrun = False
- list_only = False
- max_bandwidth = 40000
-
- for i in range(1, len(sys.argv)):
- if sys.argv[i] == 'dryrun':
- dryrun = True
- elif sys.argv[i].startswith('bw='):
- max_bandwidth = sys.argv[i][3:]
- if not max_bandwidth.isdigit():
- usage("maxbw must be a number")
- elif sys.argv[i] == 'list':
- list_only = True
- else:
- usage("unknown option: " + sys.argv[i])
-
- rsync = Rsyncer(max_bandwidth, dryrun, list_only)
-
+def get_rsync_info_default():
# The rsync commands we would expect to see on...
#
# Primary for '/public/':
@@ -186,18 +157,13 @@
# /data/xmldatadumps/public/other/
# remotehost::data/xmldatadumps/public/other/
- host_info = {
+ rsync_info = {
'public': { # job name
# source is an absolute path
'source': '/data/xmldatadumps/public/',
# dest will be prefixed by 'servername::' in rsync
'dest': 'data/xmldatadumps/public/',
- 'hosts': {
- # everything but a specific list of dirs will be pushed:
- 'dataset1001': {'primary': True},
- # only the specified list of dirs is here:
- 'ms1001': {'dirs': []}
- }
+ 'hosts': {}
},
'other': { # job name
# don't sync this when doing the 'public' job:
@@ -205,13 +171,135 @@
'source': '/data/xmldatadumps/public/other/',
'dest': 'data/xmldatadumps/public/other/',
- 'hosts': {
- 'ms1001': {'dirs': []},
- 'dataset1001': {'primary': True}
- }
+ 'hosts': {}
}
}
- rsync.rsync_all(host_info)
+ return rsync_info
+
+
+def get_source_info(servers, sources_known):
+ """
+ convert servers string argument into a nice structure
+ containing: source, server, is it primary or secondary,
+ and if it is secondary, a list of directories which it manages
+ """
+ sources = {}
+ #
source=name,server=name,type=primary,dirs=a:b:c;source=name,server=name,type=secondary,dirs=a:b:c;...
+ source_entries = servers.split(';')
+ for source_entry in source_entries:
+ source_info = {}
+ args = source_entry.split(',')
+ for arg in args:
+ if '=' not in arg:
+ usage("bad server info supplied: %s (bad arg %s)" % (servers,
arg))
+ name, value = arg.split('=')
+ if name == 'source':
+ if value not in sources_known:
+ usage("bad server info supplied: %s (bad source name %s)"
% (servers, value))
+ source_info['source'] = value
+ elif name == 'server':
+ source_info['server'] = value
+ elif name == 'type':
+ if value not in ['primary', 'secondary']:
+ usage("bad server info supplied: %s (bad type %s)" %
(servers, value))
+ source_info['type'] = value
+ elif name == 'dirs':
+ source_info['dirs'] = value.split(':')
+ else:
+ usage("bad server info supplied: %s (bad arg name %s)" %
(servers, name))
+ if 'source' not in source_info or 'server' not in source_info or
'type' not in source_info:
+ usage("bad server info supplied: %s (missing source, server or
type name)" % servers)
+ if source_info['type'] == 'secondary' and 'dirs' not in source_info:
+ source_info['dirs'] = []
+ if source_info['source'] not in sources:
+ sources[source_info['source']] = []
+ sources[source_info['source']].append(source_info)
+ return sources
+
+
+def rsync_info_update(rsync_info, sources_info):
+ for source in rsync_info:
+ for entry in sources_info[source]:
+ # Note that if a source has multiple entries for a server, the
+ # last entry will override earlier ones. You probably just want
+ # to avoid dups.
+ if entry['type'] == 'primary':
+ rsync_info[source]['hosts'][entry['server']] = {'primary':
True}
+ elif entry['type'] == 'secondary':
+ rsync_info[source]['hosts'][entry['server']] = {'dirs':
entry['dirs']}
+ return rsync_info
+
+
+def usage(message):
+ if message:
+ sys.stderr.write(message + "\n")
+ help_message = """Usage: rsync-dumps.py servers <serverlist>
[--dryrun] [--bandwidth <number>] [--list]
+
+Arguments:
+ servers (-s) -- list of servers and their directories associated with
each source
+ Format:
source=name,server=name,type=primary;source=name,server=name,type=secondary,dirs=a:b:c;...
+ source: one of the sources listed in this script
(currently 'public' or 'other'). default: none
+ server: short name of server, fqdn not needed.
default: none
+ type: primary (all dirs will be rsynced to peer
except those hosted by the secondary)
+ secondary (only listed dirs will be rsynced).
default: none
+ dirs: list of directories for rsync, if server is
secondary. default: empty list
+ bandwidth (-b) -- cap rsync bandwidth to this number (default: 40000)
+
+Flags:
+ list (-l) -- only list files that would be transferred instead of
sending them
+ dryrun (-d) -- show commands that would be run instead of runnning them
+ help (-h) -- show this message
+"""
+ sys.stderr.write(help_message)
+ sys.exit(1)
+
+
+def do_main():
+ dryrun = False
+ list_only = False
+ max_bandwidth = 40000
+ servers = None
+
+ try:
+ (options, remainder) = getopt.gnu_getopt(
+ sys.argv[1:], "b:s:ldh",
+ ["bandwidth=", "servers=", "list", "dryrun", "help"])
+
+ except getopt.GetoptError as err:
+ usage("Unknown option specified: " + str(err))
+ for (opt, val) in options:
+ if opt in ["-b", "--bandwidth"]:
+ max_bandwidth = val
+ if not max_bandwidth.isdigit():
+ usage("maxbw must be a number")
+ elif opt in ["-s", "--servers"]:
+ servers = val
+ elif opt in ["-l", "--list"]:
+ list_only = True
+ elif opt in ["-d", "--dryrun"]:
+ dryrun = True
+ elif opt in ["-h", "--help"]:
+ usage('Help for this script\n')
+ else:
+ usage("Unknown option specified: <%s>" % opt)
+
+ if servers is None:
+ usage("Mandatory 'servers' argument omitted")
+
+ rsync = Rsyncer(max_bandwidth, dryrun, list_only)
+ rsync_info = get_rsync_info_default()
+ source_info = get_source_info(servers, rsync_info.keys())
+ errors = False
+ for source in rsync_info:
+ if source not in source_info:
+ sys.stderr.write("no servers specified for source %s\n" % source)
+ errors = True
+ if errors:
+ sys.exit(1)
+
+ rsync_info_update(rsync_info, source_info)
+
+ rsync.rsync_all(rsync_info)
if __name__ == '__main__':
diff --git a/modules/dumps/manifests/copying/peers.pp
b/modules/dumps/manifests/copying/peers.pp
index cf9f80d..37514f0 100644
--- a/modules/dumps/manifests/copying/peers.pp
+++ b/modules/dumps/manifests/copying/peers.pp
@@ -1,4 +1,6 @@
-class dumps::copying::peers {
+class dumps::copying::peers(
+ $serverinfo = undef,
+) {
file { '/usr/local/bin/rsync-dumps.py':
ensure => 'present',
mode => '0755',
@@ -11,7 +13,7 @@
cron { 'rsync-dumps':
ensure => 'present',
# filter out error messages about vanishing files, we don't want email
for that
- command => '/usr/bin/python /usr/local/bin/rsync-dumps.py 2>&1 | grep
-v "vanished" ',
+ command => "/usr/bin/python /usr/local/bin/rsync-dumps.py --servers
'$serverinfo' 2>&1 | grep -v vanished",
user => 'root',
minute => '0',
hour => '*/2',
diff --git a/modules/profile/manifests/dumps/web/xmldumps_active.pp
b/modules/profile/manifests/dumps/web/xmldumps_active.pp
index 5cacabe..b58ab07 100644
--- a/modules/profile/manifests/dumps/web/xmldumps_active.pp
+++ b/modules/profile/manifests/dumps/web/xmldumps_active.pp
@@ -9,6 +9,13 @@
xmldumps_server => 'dumps.wikimedia.org',
}
# copy dumps and other datasets to fallback host(s) and to labs
- class {'::dumps::copying::peers':}
+ $primaryserver = 'dataset1001.wikimedia.org'
+ $secondaryserver = 'ms1001.wikimedia.org'
+ $publicsourceinfo =
"source=public,server=$primaryserver,type=primary;source=public,server=$secondaryserver,type=secondary"
+ $othersourceinfo =
"source=other,server=$primaryserver,type=primary;source=other,server=$secondaryserver,type=secondary"
+ class {'::dumps::copying::peers':
+ serverinfo => "$publicsourceinfo $othersourceinfo"
+ }
+
class {'::dumps::copying::labs':}
}
--
To view, visit https://gerrit.wikimedia.org/r/381760
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Id742ff4ff551db154c6de0802b2b00ccb1980654
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits