ArielGlenn has uploaded a new change for review.
https://gerrit.wikimedia.org/r/233417
Change subject: dumps: redo handling of jobs with unrun prereqs
......................................................................
dumps: redo handling of jobs with unrun prereqs
For a wiki not currently locked by another worker, if a prereq for a
job is waiting or in progress, don't call this run a failure;
just mark it as waiting. Prereqs that failed still result in this
run being marked as failed.
If we have requested specific job(s) and skipdone:
for a wiki locked by another worker, if a prereq job is waiting
or in progress, we won't return "no wikis available
to run", as that won't strictly be true; we're just too early.
Fix a couple of typos in comments.
Change-Id: I8cf8d27dda0d14262a39c56ac73f7bc00005719a
---
M xmldumps-backup/worker
M xmldumps-backup/worker.py
2 files changed, 73 insertions(+), 19 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps
refs/changes/17/233417/1
diff --git a/xmldumps-backup/worker b/xmldumps-backup/worker
index 7c81a12..b00a8e0 100755
--- a/xmldumps-backup/worker
+++ b/xmldumps-backup/worker
@@ -157,9 +157,8 @@
result=$?
if [ $result -eq 255 ]; then
if [ ! -z "$ONEPASS" -o ! -z "$CUTOFF" ]; then
- exit 0
# this isn't a failure but rather 'no wikis available to run'
- result=0
+ exit 0
fi
fi
if [ $result -ne 0 ]; then
diff --git a/xmldumps-backup/worker.py b/xmldumps-backup/worker.py
index 4392c01..f0006a4 100644
--- a/xmldumps-backup/worker.py
+++ b/xmldumps-backup/worker.py
@@ -364,6 +364,9 @@
class BackupError(Exception):
pass
+class BackupPrereqError(Exception):
+ pass
+
class RunInfoFile(object):
def __init__(self, wiki, enabled, verbose = False):
self.wiki = wiki
@@ -1876,7 +1879,8 @@
sys.stderr.write(repr(traceback.format_exception(exc_type, exc_value,
exc_traceback)))
else:
self.debug("*** exception! " +
str(ex))
- item.setStatus("failed")
+ if exc_type != 'BackupPrereqError':
+ item.setStatus("failed")
if item.status() == "done":
self.checksums.cpMd5TmpFileToPermFile()
@@ -1891,7 +1895,7 @@
self.runHandleFailure()
if (self.dumpItemList.allPossibleJobsDone(self.skipJobs)):
- # All jobs are either in status "done", "wating",
"failed",."skipped"
+ # All jobs are either in status "done", "waiting",
"failed", "skipped"
self.status.updateStatusFiles("done")
else:
# This may happen if we start a dump now and abort
before all items are
@@ -2246,8 +2250,10 @@
"""Attempt to run the operation, updating progress/status
info."""
try:
for prerequisiteItem in self._prerequisiteItems:
- if prerequisiteItem.status() != "done":
- raise BackupError("Required job %s not
marked as done, not starting job %s" % ( prerequisiteItem.name(),self.name() ) )
+ if prerequisiteItem.status() == "failed":
+ raise BackupError("Required job %s
failed, not starting job %s" % ( prerequisiteItem.name(),self.name() ) )
+ elif prerequisiteItem.status() != "done":
+ raise BackupPrereqError("Required job
%s not marked as done, not starting job %s" % (
prerequisiteItem.name(),self.name() ) )
self.run(runner)
self.postRun(runner)
@@ -2255,8 +2261,12 @@
if (self.verbose):
exc_type, exc_value, exc_traceback =
sys.exc_info()
sys.stderr.write(repr(traceback.format_exception(exc_type, exc_value,
exc_traceback)))
- self.setStatus("failed")
+ if exc_type == 'BackupPrereqError':
+ self.setStatus("waiting")
+ else:
+ self.setStatus("failed")
raise
+
self.setStatus("done")
def run(self, runner):
@@ -4031,10 +4041,16 @@
raise BackupError("error dumping all titles list")
-def checkJobDone(wiki, date, job, skipjobs, pageIDRange, chunkToDo,
checkpointFile):
+def checkJobs(wiki, date, job, skipjobs, pageIDRange, chunkToDo,
checkpointFile, prereqs=False):
'''
+ if prereqs is False:
see if dump run on specific date completed specific job(s)
or if no job was specified, ran to completion
+
+ if prereqs is True:
+ see if dump run on specific date completed prereqs for specific job(s)
+ or if no job was specified, return True
+
'''
if not date:
return False
@@ -4046,6 +4062,9 @@
else:
# never dumped so that's the same as 'job didn't run'
return False
+
+ if not job and prereqs:
+ return True
wiki.setDate(date)
@@ -4070,16 +4089,31 @@
else:
dumpItemList.markAllJobsToRun(True)
- # see if there are any to run. no? then return True (all job(s) done)
- # otherwise return False (still some to do)
- for item in dumpItemList.dumpItems:
- if item.toBeRun():
- return False
- return True
+ if not prereqs:
+ # see if there are any to run. no? then return True (all
job(s) done)
+ # otherwise return False (still some to do)
+ for item in dumpItemList.dumpItems:
+ if item.toBeRun():
+ return False
+ return True
+ else:
+ # get the list of prereqs, see if they are all status done, if
so
+ # return True, otherwise False (still some to do)
+ prereqItems = []
+ for item in self.dumpItems:
+ if (item.name() == job):
+ prereqItems = item._prerequisiteItems
+ break
+
+ for item in prereqItems:
+ if item.status() != "done":
+ return False
+ return True
def findAndLockNextWiki(config, locksEnabled, cutoff, bystatustime=False,
check_job_status=False,
- date=None, job=None, skipjobs=None, pageIDRange=None,
chunkToDo=None, checkpointFile=None):
+ check_prereq_status, date=None, job=None,
skipjobs=None, pageIDRange=None,
+ chunkToDo=None, checkpointFile=None):
if config.halt:
sys.stderr.write("Dump process halted by config.\n")
return None
@@ -4090,6 +4124,9 @@
if verbose and not cutoff:
sys.stderr.write("Finding oldest unlocked wiki...\n")
+ # if we skip locked wikis which are missing the prereqs for this job,
+ # there are still wikis where this job needs to run
+ missingPrereqs = False
for db in next:
wiki = WikiDump.Wiki(config, db)
if (cutoff):
@@ -4100,16 +4137,25 @@
# return None
continue
if check_job_status:
- if checkJobDone(wiki, date, job, skipjobs,
pageIDRange, chunkToDo, checkpointFile):
+ if checkJobs(wiki, date, job, skipjobs, pageIDRange,
chunkToDo, checkpointFile):
continue
try:
if (locksEnabled):
wiki.lock()
return wiki
except:
+ if check_prereq_status:
+ # if we skip locked wikis which are missing
the prereqs for this job,
+ # there are still wikis where this job needs
to run
+ if checkJobs(wiki, date, job, skipjobs,
pageIDRange, chunkToDo,
+ checkpointFile, prereqs=True):
+ missingPrereqs = True
sys.stderr.write("Couldn't lock %s, someone else must
have got it...\n" % db)
continue
- return None
+ if missingPrereqs:
+ return False
+ else:
+ return None
def xmlEscape(text):
return text.replace("&", "&amp;").replace("<", "&lt;").replace(">",
"&gt;")
@@ -4334,9 +4380,15 @@
check_job_status = True
else:
check_job_status = False
- wiki = findAndLockNextWiki(config, locksEnabled,
cutoff, check_status_time, check_job_status, date, jobRequested, skipJobs,
pageIDRange, chunkToDo, checkpointFile)
+ if jobRequested and skipdone:
+ check_prereq_status = True
+ else:
+ check_prereq_status = False
+ wiki = findAndLockNextWiki(config, locksEnabled,
cutoff, check_status_time,
+ check_job_status,
check_prereq_status,
+ date, jobRequested,
skipJobs, pageIDRange, chunkToDo, checkpointFile)
- if wiki:
+ if wiki is not None and wiki:
# process any per-project configuration options
config.parseConfFilePerProject(wiki.dbName)
@@ -4378,6 +4430,9 @@
# if we are doing one piece only of the dump, we don't
unlock either
if locksEnabled:
wiki.unlock()
+ else:
+ sys.stderr.write("Wikis available to run but prereqs
not complete.\n")
+ exitcode = 0
else:
sys.stderr.write("No wikis available to run.\n")
exitcode = 255
--
To view, visit https://gerrit.wikimedia.org/r/233417
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I8cf8d27dda0d14262a39c56ac73f7bc00005719a
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits