ArielGlenn has uploaded a new change for review.
https://gerrit.wikimedia.org/r/62800
Change subject: option to run until all wikis have dumps more recent than cutoff date
......................................................................
option to run until all wikis have dumps more recent than cutoff date
--cutoff option in wrapper shell script and in main python script
wrapper shell script cleanup (break stuff into functions for
readability)
Change-Id: I405aae9f4c3894eb337d59d487b87e111f37306c
---
M xmldumps-backup/worker
M xmldumps-backup/worker.py
2 files changed, 162 insertions(+), 81 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps refs/changes/00/62800/1
diff --git a/xmldumps-backup/worker b/xmldumps-backup/worker
index afd9944..ced94f8 100755
--- a/xmldumps-backup/worker
+++ b/xmldumps-backup/worker
@@ -1,92 +1,150 @@
#!/bin/bash
-# default number of failures of worker.py in a row before we decide
-# something serious is broken and we refuse to run
-MAXFAILS=3
-# default: don't pass special config file
-CONFIGFILE=""
-# default: no logging to file
-LOG=""
+usage() {
+ echo "$0: Unknown option $1"
+ echo "Usage: $0 [--configfile filename] [--log] [--maxfails num]
[--basedir dir] [--wiki wikiname]"
+ echo " [--cutoff date]"
+ echo "--configfile use specified file for config file (default:
wikidump.conf)"
+ echo "--log write log of (almost) everything written to stderr
(default: no logging)"
+ echo "--maxfails if more than this many dumps fail in a row, exit
(default: 3)"
+ echo "--basedir scripts and files are relative to this dir (default:
location of this script)"
+ echo "--cutoff dump wikis until all have a dump produced more recent
than the specified cutoff,"
+ echo " then exit. format: yyyymmdd OR 'today'"
+ echo "--wiki name of specific wiki db to dump; otherwise all wikis
in list referenced by"
+ echo " config file will be dumped"
+ echo
+ echo "If the file maintenance.txt is present, no more jobs will be run,
and"
+ echo "this script will check the status again in 5 minutes."
+ echo "If the file exit.txt is present, no more new jobs will be started
and"
+ echo "this script will terminate."
+ exit 1
+}
-failures=0
-WIKIDUMP_BASE=`dirname "$0"`
+set_defaults() {
+ # default number of failures of worker.py in a row before we decide
+ # something serious is broken and we refuse to run
+ MAXFAILS=3
+ # default: don't pass special config file
+ CONFIGFILE=""
+ # default: no logging to file
+ LOG=""
+ # default: no cutoff date
+ CUTOFF=""
+}
-while [ $# -gt 0 ]; do
- if [ $1 == "--configfile" ]; then
- CONFIGFILE="$2"
- shift; shift
- elif [ $1 == "--maxfails" ]; then
- MAXFAILS="$2"
- shift; shift
- elif [ $1 == "--basedir" ]; then
- WIKIDUMP_BASE="$2"
- shift; shift
- elif [ $1 == "--log" ]; then
- LOG=true
- shift;
- elif [ $1 == "--wiki" ]; then
- WIKI="$2"
- shift; shift
- else
- echo "$0: Unknown option $1"
- echo "Usage: $0 [--configfile filename] [--log] [--maxfails num]
[--basedir dir] [--wiki wikiname]"
- echo "--configfile use specified file for config file (default:
wikidump.conf)"
- echo "--log write log of (almost) everything written to stderr
(default: no logging)"
- echo "--maxfails if more than this many dumps fail in a row, exit
(default: 3)"
- echo "--basedir scripts and files are relative to this dir
(default: location of this script)"
- echo "--wiki name of specific wiki db to dump; otherwise all
wikis in list referenced by"
- echo " config file will be dumped"
- echo
- echo "If the file maintenance.txt is present, no more jobs will be run,
and"
- echo "this script will check the status again in 5 minutes."
- echo "If the file exit.txt is present, no more new jobs will be started
and"
- echo "this script will terminate."
- exit 1
+process_opts() {
+ # parse the command line args handed in by the caller ("$@") into the
+ # option globals; an unrecognized arg is reported via usage, which exits
+ while [ $# -gt 0 ]; do
+ if [ "$1" == "--configfile" ]; then
+ CONFIGFILE="$2"
+ shift; shift
+ elif [ "$1" == "--maxfails" ]; then
+ MAXFAILS="$2"
+ shift; shift
+ elif [ "$1" == "--basedir" ]; then
+ WIKIDUMP_BASE="$2"
+ shift; shift
+ elif [ "$1" == "--log" ]; then
+ LOG=true
+ shift;
+ elif [ "$1" == "--cutoff" ]; then
+ CUTOFF="$2"
+ shift; shift
+ elif [ "$1" == "--wiki" ]; then
+ WIKI="$2"
+ shift; shift
+ else
+ # inside usage, $1 is that function's own first argument, so the
+ # offending option has to be passed along explicitly
+ usage "$1"
+ fi
+ done
+}
+
+setup_python_args() {
+ # build the worker.py command lines from the option globals:
+ # pythonargs runs a dump; cutoffargs (only built when --cutoff was given)
+ # runs worker.py with --cutoff so main_loop can check for remaining wikis
+ pythonargs=( "$WIKIDUMP_BASE/worker.py" )
+ if [ ! -z "$CONFIGFILE" ]; then
+ pythonargs=( "${pythonargs[@]}" "--configfile" "$CONFIGFILE" )
fi
-done
+ if [ ! -z "$LOG" ]; then
+ pythonargs=( "${pythonargs[@]}" "--log" )
+ fi
+ if [ ! -z "$CUTOFF" ]; then
+ if [ "$CUTOFF" == "today" ]; then
+ # convert this to yyyymmdd, UTC always
+ CUTOFF=$(date -u +"%Y%m%d")
+ else
+ # sanity check of arg: date prints nothing on stdout if it can't parse it
+ result=`date -d "$CUTOFF"`
+ if [ -z "$result" ]; then
+ echo "bad date given for cutoff arg"
+ exit 1
+ fi
+ fi
+ cutoffargs=( "${pythonargs[@]}" "--cutoff" "$CUTOFF" )
+ fi
+ if [ ! -z "$WIKI" ]; then
+ pythonargs=( "${pythonargs[@]}" "$WIKI" )
+ cutoffargs=( "${cutoffargs[@]}" "$WIKI" )
+ fi
+}
-# set up the command
-pythonargs=( "$WIKIDUMP_BASE/worker.py" )
-if [ ! -z "$CONFIGFILE" ]; then
- pythonargs=( "${pythonargs[@]}" "--configfile" "$CONFIGFILE" )
-fi
-if [ ! -z "$LOG" ]; then
- pythonargs=( "${pythonargs[@]}" "--log" )
-fi
-if [ ! -z "$WIKI" ]; then
- pythonargs=( "${pythonargs[@]}" "$WIKI" )
-fi
+dump_wiki() {
+ # run worker.py once; count failures in a row and halt when there are
+ # more than MAXFAILS of them. with --wiki, exit right after the one run.
+ # the array expansions are quoted so an arg containing whitespace (e.g. a
+ # config file path) reaches python as a single word
+ echo python "${pythonargs[@]}"
+ python "${pythonargs[@]}"
+ if [ $? -ne 0 ]; then
+ failures=$(($failures+1))
+ if [ $failures -gt $MAXFAILS ]; then
+ echo "more than $MAXFAILS failures in a row, halting."
+ exit 1
+ fi
+ else
+ failures=0
+ fi
+ if [ ! -z "$WIKI" ]; then
+ # we ran for a specific wiki, we don't loop even if it failed.
+ if [ $failures -gt 0 ]; then
+ echo "Dump of wiki $WIKI failed."
+ exit 1
+ else
+ echo "Dump of wiki $WIKI succeeded."
+ exit 0
+ fi
+ fi
+ echo "sleeping"
+ sleep 30
+}
-while true; do
+main_loop() {
+ # one pass of the worker loop: honor maintenance.txt / exit.txt, then either
+ # check the cutoff before dumping (--cutoff given) or just dump
if [ -e "$WIKIDUMP_BASE/maintenance.txt" ]; then
echo "in maintenance mode (see 'maintenance.txt'), sleeping 5 minutes"
sleep 300
elif [ -e "$WIKIDUMP_BASE/exit.txt" ]; then
echo "exit requested, remove 'exit.txt' to continue normal operations."
exit 0
- else
- echo python ${pythonargs[@]}
- python ${pythonargs[@]}
- if [ $? -ne 0 ]; then
- failures=$(($failures+1))
- if [ $failures -gt $MAXFAILS ]; then
- echo "more than $MAXFAILS failures in a row, halting."
- exit 1
- fi
+ elif [ ! -z "$CUTOFF" ]; then
+ # see if there are any wikis left with dumps that are not more recent than CUTOFF
+ # (quoted so args containing whitespace are passed through intact)
+ echo "${cutoffargs[@]}"
+ result=`python "${cutoffargs[@]}"`
+ if [ -z "$result" ]; then
+ # nope, so we are done
+ echo "All wikis completed after cutoff $CUTOFF"
+ exit 0
else
- failures=0
+ # there's (at least) one wiki left to do... if some other worker
+ # gets it and we do an 'extra' one it's not a disaster, so don't worry about
+ # potential race
+ dump_wiki
fi
- if [ ! -z "$WIKI" ]; then
- # we ran for a specific wiki, we don't loop even if it failed.
- if [ $failures -gt 0 ]; then
- echo "Dump of wiki $WIKI failed."
- exit 1
- else
- echo "Dump of wiki $WIKI succeeded."
- exit 0
- fi
- fi
- echo "sleeping"
- sleep 30
+ else
+ dump_wiki
fi
+}
+
+failures=0
+WIKIDUMP_BASE=`dirname "$0"`
+set_defaults
+process_opts "$@"
+setup_python_args
+
+while true; do
+ main_loop
done
diff --git a/xmldumps-backup/worker.py b/xmldumps-backup/worker.py
index e998f9c..2245b55 100644
--- a/xmldumps-backup/worker.py
+++ b/xmldumps-backup/worker.py
@@ -4032,7 +4032,7 @@
command = runner.dbServerInfo.buildSqlCommand(query,
runner.wiki.config.gzip)
return runner.saveCommand(command, outfile)
-def findAndLockNextWiki(config, locksEnabled):
+def findAndLockNextWiki(config, locksEnabled, cutoff):
if config.halt:
print "Dump process halted by config."
return None
@@ -4040,10 +4040,15 @@
next = config.dbListByAge()
next.reverse()
- print "Finding oldest unlocked wiki..."
+ if verbose and not cutoff:
+ print "Finding oldest unlocked wiki..."
for db in next:
wiki = WikiDump.Wiki(config, db)
+ if (cutoff):
+ lastRan = wiki.latestDump()
+ if lastRan > cutoff:
+ return None
try:
if (locksEnabled):
wiki.lock()
@@ -4060,7 +4065,7 @@
if message:
print message
print "Usage: python worker.py [options] [wikidbname]"
- print "Options: --aftercheckpoint, --checkpoint, --chunk, --configfile,
--date, --job, --addnotice, --delnotice, --force, --noprefetch, --nospawn,
--restartfrom, --log"
+ print "Options: --aftercheckpoint, --checkpoint, --chunk, --configfile,
--date, --job, --addnotice, --delnotice, --force, --noprefetch, --nospawn,
--restartfrom, --log, --cutoff"
print "--aftercheckpoint: Restart thie job from the after specified
checkpoint file, doing the"
print " rest of the job for the appropriate chunk if
chunks are configured"
print " or for the all the rest of the revisions if no
chunks are configured;"
@@ -4095,6 +4100,9 @@
print "--restartfrom: Do all jobs after the one specified via --job,
including that one"
print "--log: Log progress messages and other output to logfile
in addition to"
print " the usual console output"
+ print "--cutoff: Given a cutoff date in yyyymmdd format, display
the next wiki for which"
+ print " dumps should be run, if its last dump was not
after the cutoff date,"
+ print " and exit, or if there are no such wikis, just
exit"
print "--verbose: Print lots of stuff (includes printing full
backtraces for any exception)"
print " This is used primarily for debugging"
@@ -4117,12 +4125,13 @@
afterCheckpoint = False
checkpointFile = None
pageIDRange = None
+ cutoff = None
result = False
verbose = False
try:
(options, remainder) = getopt.gnu_getopt(sys.argv[1:],
"",
- ['date=',
'job=', 'configfile=', 'addnotice=', 'delnotice', 'force', 'dryrun',
'noprefetch', 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'chunk=',
'checkpoint=', 'pageidrange=', 'verbose' ])
+ ['date=',
'job=', 'configfile=', 'addnotice=', 'delnotice', 'force', 'dryrun',
'noprefetch', 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'chunk=',
'checkpoint=', 'pageidrange=', 'cutoff=', 'verbose' ])
except:
usage("Unknown option specified")
@@ -4158,6 +4167,10 @@
htmlNotice = False
elif opt == "--pageidrange":
pageIDRange = val
+ elif opt == "--cutoff":
+ cutoff = val
+ if not cutoff.isdigit() or not len(cutoff) == 8:
+ usage("--cutoff value must be in
yyyymmdd format")
elif opt == "--verbose":
verbose = True
@@ -4188,7 +4201,7 @@
else:
config = WikiDump.Config()
- if dryrun or chunkToDo or (jobRequested and not restart):
+ if dryrun or chunkToDo or (jobRequested and not restart) or
cutoff:
locksEnabled = False
else:
locksEnabled = True
@@ -4200,6 +4213,10 @@
if len(remainder) > 0:
wiki = WikiDump.Wiki(config, remainder[0])
+ if cutoff:
+ lastRan = wiki.latestDump()
+ if lastRan > cutoff:
+ wiki = None
if locksEnabled:
if forceLock and wiki.isLocked():
wiki.unlock()
@@ -4207,7 +4224,13 @@
wiki.lock()
else:
- wiki = findAndLockNextWiki(config, locksEnabled)
+ wiki = findAndLockNextWiki(config, locksEnabled, cutoff)
+
+ if cutoff:
+ if wiki:
+ print wiki.dbName
+ WikiDump.cleanup()
+ sys.exit(0)
if wiki:
# process any per-project configuration options
--
To view, visit https://gerrit.wikimedia.org/r/62800
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I405aae9f4c3894eb337d59d487b87e111f37306c
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits