ArielGlenn has submitted this change and it was merged.
Change subject: dumps: add 'skipdone' to worker script and its wrapper
......................................................................
dumps: add 'skipdone' to worker script and its wrapper
allows us to run some or all jobs across one or all wikis,
skipping any that completed successfully.
Change-Id: Ice5a4e188e28d4a8b55b363f00cbece48b17727d
---
M xmldumps-backup/WikiDump.py
M xmldumps-backup/worker
M xmldumps-backup/worker.py
3 files changed, 111 insertions(+), 24 deletions(-)
Approvals:
ArielGlenn: Looks good to me, approved
jenkins-bot: Verified
diff --git a/xmldumps-backup/WikiDump.py b/xmldumps-backup/WikiDump.py
index 1b4d2fe..1cc4b9c 100644
--- a/xmldumps-backup/WikiDump.py
+++ b/xmldumps-backup/WikiDump.py
@@ -413,7 +413,7 @@
else:
return(self.conf.get(sectionName,itemName))
- def dbListByAge(self):
+ def dbListByAge(self, use_status_time=False):
"""
Sort wikis in reverse order of last successful dump :
@@ -435,6 +435,7 @@
index page links.
"""
available = []
+ today = int(TimeUtils.today())
for db in self.dbList:
wiki = Wiki(self, db)
@@ -445,13 +446,17 @@
if last:
dumpStatus = os.path.join(wiki.publicDir(),
last, "status.html")
try:
+ if use_status_time:
+ # only use the status file
time, not the dir date
+ date = today
+ else:
+ date = today - int(last)
# tack on the file mtime so that if we
have multiple wikis
# dumped on the same day, they get
ordered properly
- date = int(TimeUtils.today()) -
int(last)
- age = FileUtils.fileAge(dumpStatus)
+ age = FileUtils.fileAge(dumpStatus)
status = FileUtils.readFile(dumpStatus)
except:
- print "dump dir %s corrupt?" %
dumpStatus
+ print "dump dir missing status file
%s?" % dumpStatus
dumpFailed = (status == '') or ('dump aborted' in
status)
available.append((dumpFailed, date, age, db))
available.sort()
diff --git a/xmldumps-backup/worker b/xmldumps-backup/worker
index 815cfcd..882e0e1 100755
--- a/xmldumps-backup/worker
+++ b/xmldumps-backup/worker
@@ -3,7 +3,7 @@
usage() {
echo "$0: Unknown option $1"
echo "Usage: $0 [--configfile filename] [--log] [--maxfails num]
[--basedir dir] [--wiki wikiname]"
- echo " [--cutoff date] [--date date] [--job job] [--exclusive]"
+ echo " [--cutoff date] [--date date] [--job job] [--skipdone]
[--exclusive]"
echo "--configfile use specified file for config file (default:
wikidump.conf)"
echo "--log write log of (almost) everything written to stderr
(default: no logging)"
echo "--maxfails if more than this many dumps fail in a row, exit
(default: 3)"
@@ -11,6 +11,7 @@
echo "--cutoff dump wikis until all have a dump produced more recent
than the specified cutoff,"
echo " then exit. format: yyyymmdd OR 'today'"
echo "--date (re)do dump runs of specified date (yyyymmdd) OR
'last'"
+ echo "--skipdone skip any dump jobs that ran successfully (this makes
sense only for reruns)"
echo "--job do only the specified job for wiki(s)"
echo "--exclusive lock the wiki so other runners can't operate on it at
the same time"
echo " default is true for all runs except those where --job
is specified"
@@ -36,6 +37,8 @@
CUTOFF=""
# default: no date specified
DATE=""
+ # default: run all jobs, not just failed ones
+ SKIPDONE=""
# default: run all jobs
JOB=""
# default for one job: no locking
@@ -68,6 +71,9 @@
elif [ $1 == "--job" ]; then
JOB="$2"
shift; shift
+ elif [ $1 == "--skipdone" ]; then
+ SKIPDONE=true
+ shift
elif [ $1 == "--exclusive" ]; then
EXCLUSIVE=true
shift
@@ -89,6 +95,9 @@
if [ ! -z "$JOB" ]; then
pythonargs=( "${pythonargs[@]}" "--job" "$JOB" )
fi
+ if [ ! -z "$SKIPDONE" ]; then
+ pythonargs=( "${pythonargs[@]}" "--skipdone" )
+ fi
if [ ! -z "$EXCLUSIVE" ]; then
pythonargs=( "${pythonargs[@]}" "--exclusive" )
fi
diff --git a/xmldumps-backup/worker.py b/xmldumps-backup/worker.py
index 4db1301..c1edc59 100644
--- a/xmldumps-backup/worker.py
+++ b/xmldumps-backup/worker.py
@@ -743,16 +743,18 @@
# the rest of the names expand to single items)
# and mark the items in the list as such
# return False if there is no such dump or set of dumps
- def markDumpsToRun(self,job):
+ def markDumpsToRun(self,job, skipgood=False):
if (job == "tables"):
for item in self.dumpItems:
if (item.name()[-5:] == "table"):
- item.setToBeRun(True)
+ if not skipgood or item.status() !=
"done":
+ item.setToBeRun(True)
return True
else:
for item in self.dumpItems:
if (item.name() == job):
- item.setToBeRun(True)
+ if not skipgood or item.status() !=
"done":
+ item.setToBeRun(True)
return True
if job == "noop" or job == "latestlinks":
return True
@@ -764,20 +766,22 @@
sys.stderr.write("%s\n" % item.name())
return False
- def markFollowingJobsToRun(self):
+ def markFollowingJobsToRun(self, skipgood=False):
# find the first one marked to run, mark the following ones
i = 0;
for item in self.dumpItems:
i = i + 1;
if item.toBeRun():
for j in range(i,len(self.dumpItems)):
- self.dumpItems[j].setToBeRun(True)
+ if not skipgood or item.status() !=
"done":
+
self.dumpItems[j].setToBeRun(True)
break
- def markAllJobsToRun(self):
+ def markAllJobsToRun(self, skipgood=False):
"""Marks each and every job to be run"""
for item in self.dumpItems:
- item.setToBeRun( True )
+ if not skipgood or item.status() != "done":
+ item.setToBeRun( True )
def findItemByName(self, name):
for item in self.dumpItems:
@@ -1577,7 +1581,7 @@
return os.path.join(self.wiki.publicDir(), self.wiki.date)
class Runner(object):
- def __init__(self, wiki, prefetch=True, spawn=True, job=None,
restart=False, notice="", dryrun = False, loggingEnabled=False, chunkToDo =
False, checkpointFile = None, pageIDRange = None, verbose = False):
+ def __init__(self, wiki, prefetch=True, spawn=True, job=None,
restart=False, notice="", dryrun = False, loggingEnabled=False, chunkToDo =
False, checkpointFile = None, pageIDRange = None, skipdone = False, verbose =
False):
self.wiki = wiki
self.dbName = wiki.dbName
self.prefetch = prefetch
@@ -1590,6 +1594,7 @@
self._chunkToDo = chunkToDo
self.checkpointFile = checkpointFile
self.pageIDRange = pageIDRange
+ self.skipdone = skipdone
self.verbose = verbose
if (self.checkpointFile):
@@ -1801,14 +1806,15 @@
if (not reply in [ "y", "Y" ]):
raise RuntimeError( "No run information
available for previous dump, exiting" )
- if (not
self.dumpItemList.markDumpsToRun(self.jobRequested)):
- # probably no such job
- raise RuntimeError( "No job marked to run,
exiting" )
+ if (not
self.dumpItemList.markDumpsToRun(self.jobRequested, self.skipdone)):
+ # probably no such job
+ sys.stderr.write( "No job marked to run,
exiting" )
+ return None
if (restart):
# mark all the following jobs to run as well
- self.dumpItemList.markFollowingJobsToRun()
+
self.dumpItemList.markFollowingJobsToRun(self.skipdone)
else:
- self.dumpItemList.markAllJobsToRun();
+ self.dumpItemList.markAllJobsToRun(self.skipdone);
Maintenance.exitIfInMaintenanceMode("In maintenance mode,
exiting dump of %s" % self.dbName )
@@ -4059,12 +4065,61 @@
if (error):
raise BackupError("error dumping all titles list")
-def findAndLockNextWiki(config, locksEnabled, cutoff):
+
+def checkJobDone(wiki, date, job, pageIDRange, chunkToDo, checkpointFile):
+ '''
+ see if dump run on specific date completed specific job(s)
+ or if no job was specified, ran to completion
+ '''
+ if not date:
+ return False
+
+ if date == 'last':
+ dumps = sorted(wiki.dumpDirs())
+ if dumps:
+ date = dumps[-1]
+ else:
+ # never dumped so that's the same as 'job didn't run'
+ return False
+
+ wiki.setDate(date)
+
+ runInfoFile = RunInfoFile(wiki, False)
+ chunkInfo = Chunk(wiki, wiki.dbName)
+ dumpDir = DumpDir(wiki, wiki.dbName)
+ dumpItemList = DumpItemList(wiki, False, False, chunkToDo,
checkpointFile, job, chunkInfo, pageIDRange, runInfoFile, dumpDir)
+ if not dumpItemList.oldRunInfoRetrieved:
+ # failed to get the run's info so let's call it 'didn't run'
+ return False
+
+ results = dumpItemList._runInfoFile.getOldRunInfoFromFile()
+ if (results):
+ for runInfoObj in results:
+ dumpItemList._setDumpItemRunInfo(runInfoObj)
+
+ # mark the jobs we would run
+ if (job):
+ dumpItemList.markDumpsToRun(job, True)
+ if (restart):
+ dumpItemList.markFollowingJobsToRun(True)
+ else:
+ dumpItemList.markAllJobsToRun(True)
+
+ # see if there are any to run. no? then return True (all job(s) done)
+ # otherwise return False (still some to do)
+ for item in dumpItemList.dumpItems:
+ if item.toBeRun():
+ return False
+ return True
+
+
+def findAndLockNextWiki(config, locksEnabled, cutoff, bystatustime=False,
check_job_status=False,
+ date=None, job=None, pageIDRange=None, chunkToDo=None,
checkpointFile=None):
if config.halt:
sys.stderr.write("Dump process halted by config.\n")
return None
- next = config.dbListByAge()
+ next = config.dbListByAge(bystatustime)
next.reverse()
if verbose and not cutoff:
@@ -4076,6 +4131,9 @@
lastRan = wiki.latestDump()
if lastRan > cutoff:
return None
+ if check_job_status:
+ if checkJobDone(wiki, date, job, pageIDRange,
chunkToDo, checkpointFile):
+ continue
try:
if (locksEnabled):
wiki.lock()
@@ -4127,6 +4185,7 @@
sys.stderr.write( " (helpful if the previous files may
have corrupt contents)\n" )
sys.stderr.write( "--nospawn: Do not spawn a separate process in
order to retrieve revision texts\n" )
sys.stderr.write( "--restartfrom: Do all jobs after the one specified
via --job, including that one\n" )
sys.stderr.write( "--skipdone: Do only jobs that are not already
successfully completed\n")
sys.stderr.write( "--log: Log progress messages and other
output to logfile in addition to\n" )
sys.stderr.write( " the usual console output\n" )
sys.stderr.write( "--cutoff: Given a cutoff date in yyyymmdd
format, display the next wiki for which\n" )
@@ -4156,12 +4215,13 @@
pageIDRange = None
cutoff = None
result = False
+ skipdone = False
doLocking = False
verbose = False
try:
(options, remainder) = getopt.gnu_getopt(sys.argv[1:],
"",
- ['date=',
'job=', 'configfile=', 'addnotice=', 'delnotice', 'force', 'dryrun',
'noprefetch', 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'chunk=',
'checkpoint=', 'pageidrange=', 'cutoff=', "exclusive", 'verbose' ])
+ ['date=',
'job=', 'configfile=', 'addnotice=', 'delnotice', 'force', 'dryrun',
'noprefetch', 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 'chunk=',
'checkpoint=', 'pageidrange=', 'cutoff=', "skipdone", "exclusive", 'verbose' ])
except:
usage("Unknown option specified")
@@ -4201,6 +4261,8 @@
cutoff = val
if not cutoff.isdigit() or not len(cutoff) == 8:
usage("--cutoff value must be in
yyyymmdd format")
+ elif opt == "--skipdone":
+ skipdone = True
elif opt == "--exclusive":
doLocking = True
elif opt == "--verbose":
@@ -4254,8 +4316,18 @@
wiki.lock()
else:
- wiki = findAndLockNextWiki(config, locksEnabled, cutoff)
-
+ # if the run is across all wikis and we are just doing
one job,
+ # we want the age of the wikis by the latest status
update
+ # and not the date the run started
+ if jobRequested:
+ check_status_time = True
+ else:
+ check_status_time = False
+ if skipdone:
+ check_job_status = True
+ else:
+ check_job_status = False
+ wiki = findAndLockNextWiki(config, locksEnabled,
cutoff, check_status_time, check_job_status, date, jobRequested, pageIDRange,
chunkToDo, checkpointFile)
if cutoff:
if wiki:
print wiki.dbName
@@ -4283,7 +4355,8 @@
if not jobRequested or not jobRequested in [
'articlesdump', 'metacurrentdump', 'metahistorybz2dump' ]:
usage("--aftercheckpoint option
requires --job option with one of %s" % ", ".join(afterCheckpointJobs))
- runner = Runner(wiki, prefetch, spawn, jobRequested,
restart, htmlNotice, dryrun, enableLogging, chunkToDo, checkpointFile,
pageIDRange, verbose)
+ runner = Runner(wiki, prefetch, spawn, jobRequested,
restart, htmlNotice, dryrun, enableLogging, chunkToDo, checkpointFile,
pageIDRange, skipdone, verbose)
+
if (restart):
sys.stderr.write("Running %s, restarting from
job %s...\n" % (wiki.dbName, jobRequested))
elif (jobRequested):
--
To view, visit https://gerrit.wikimedia.org/r/215657
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ice5a4e188e28d4a8b55b363f00cbece48b17727d
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits