ArielGlenn has uploaded a new change for review.
https://gerrit.wikimedia.org/r/298254
Change subject: add argument for specifying date of dump to use for prefetch
......................................................................
add argument for specifying date of dump to use for prefetch
In case previous dump runs were complete but for some reason we
don't want to use their page content dumps for prefetch, make it
possible to specify a particular dump run by date to use instead.
Bug: T137887
Change-Id: I69d504bc52b4c56851b8e936eaef7b7c4ee15417
---
M xmldumps-backup/dumps/runner.py
M xmldumps-backup/dumps/xmljobs.py
M xmldumps-backup/worker.py
3 files changed, 45 insertions(+), 20 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps
refs/changes/54/298254/1
diff --git a/xmldumps-backup/dumps/runner.py b/xmldumps-backup/dumps/runner.py
index b501c4a..196b3e6 100644
--- a/xmldumps-backup/dumps/runner.py
+++ b/xmldumps-backup/dumps/runner.py
@@ -93,7 +93,7 @@
class DumpItemList(object):
- def __init__(self, wiki, prefetch, spawn, partnum_todo, checkpoint_file,
+ def __init__(self, wiki, prefetch, prefetchdate, spawn, partnum_todo,
checkpoint_file,
singleJob, skip_jobs, filepart, page_id_range, dumpjobdata,
dump_dir,
verbose):
self.wiki = wiki
@@ -103,6 +103,7 @@
self._is_wikidata_client = self.wiki.is_wikidata_client()
self._has_flow = self.wiki.has_flow()
self._prefetch = prefetch
+ self._prefetchdate = prefetchdate
self._spawn = spawn
self.filepart = filepart
self.checkpoint_file = checkpoint_file
@@ -217,7 +218,8 @@
"and primary meta-pages.</b></big>",
"This contains current versions of article content, " +
"and is the archive most mirror sites will probably want.",
- self.find_item_by_name('xmlstubsdump'), self._prefetch,
self._spawn,
+ self.find_item_by_name('xmlstubsdump'), self._prefetch,
+ self._prefetchdate, self._spawn,
self.wiki, self._get_partnum_todo("articlesdump"),
self.filepart.get_pages_per_filepart_history(),
checkpoints,
self.checkpoint_file, self.page_id_range, self.verbose))
@@ -238,6 +240,7 @@
"Discussion and user pages are included in this complete
archive. " +
"Most mirrors won't want this extra material.",
self.find_item_by_name('xmlstubsdump'), self._prefetch,
+ self._prefetchdate,
self._spawn, self.wiki,
self._get_partnum_todo("metacurrentdump"),
self.filepart.get_pages_per_filepart_history(),
checkpoints,
self.checkpoint_file, self.page_id_range, self.verbose))
@@ -311,7 +314,8 @@
"20 times the archive download size. " +
"Suitable for archival and statistical use, " +
"most mirror sites won't want or need this.",
- self.find_item_by_name('xmlstubsdump'), self._prefetch,
self._spawn,
+ self.find_item_by_name('xmlstubsdump'), self._prefetch,
+ self._prefetchdate, self._spawn,
self.wiki, self._get_partnum_todo("metahistorybz2dump"),
self.filepart.get_pages_per_filepart_history(),
checkpoints, self.checkpoint_file, self.page_id_range,
self.verbose))
@@ -485,13 +489,15 @@
class Runner(object):
- def __init__(self, wiki, prefetch=True, spawn=True, job=None,
skip_jobs=None,
+ def __init__(self, wiki, prefetch=True, prefetchdate=None, spawn=True,
+ job=None, skip_jobs=None,
restart=False, notice="", dryrun=False, enabled=None,
partnum_todo=None, checkpoint_file=None, page_id_range=None,
skipdone=False, cleanup=False, verbose=False):
self.wiki = wiki
self.db_name = wiki.db_name
self.prefetch = prefetch
+ self.prefetchdate = prefetchdate
self.spawn = spawn
self.filepart_info = FilePartInfo(wiki, self.db_name,
self.log_and_print)
self.restart = restart
@@ -581,7 +587,8 @@
self.verbose)
# some or all of these dump_items will be marked to run
- self.dump_item_list = DumpItemList(self.wiki, self.prefetch,
self.spawn,
+ self.dump_item_list = DumpItemList(self.wiki, self.prefetch,
self.prefetchdate,
+ self.spawn,
self._partnum_todo,
self.checkpoint_file,
self.job_requested, self.skip_jobs,
self.filepart_info,
self.page_id_range,
diff --git a/xmldumps-backup/dumps/xmljobs.py b/xmldumps-backup/dumps/xmljobs.py
index a6bade7..b06c1ba 100644
--- a/xmldumps-backup/dumps/xmljobs.py
+++ b/xmldumps-backup/dumps/xmljobs.py
@@ -196,13 +196,15 @@
class XmlDump(Dump):
"""Primary XML dumps, one section at a time."""
- def __init__(self, subset, name, desc, detail, item_for_stubs, prefetch,
spawn,
+ def __init__(self, subset, name, desc, detail, item_for_stubs, prefetch,
+ prefetchdate, spawn,
wiki, partnum_todo, parts=False, checkpoints=False,
checkpoint_file=None,
page_id_range=None, verbose=False):
self._subset = subset
self._detail = detail
self._desc = desc
self._prefetch = prefetch
+ self._prefetchdate = prefetchdate
self._spawn = spawn
self._parts = parts
if self._parts:
@@ -751,7 +753,10 @@
start_page_id = 1
end_page_id = None
- dumps = self.wiki.dump_dirs()
+ if self._prefetchdate:
+ dumps = [self._prefetchdate]
+ else:
+ dumps = self.wiki.dump_dirs()
dumps.sort()
dumps.reverse()
for date in dumps:
diff --git a/xmldumps-backup/worker.py b/xmldumps-backup/worker.py
index 9706f3a..14bc674 100644
--- a/xmldumps-backup/worker.py
+++ b/xmldumps-backup/worker.py
@@ -12,7 +12,7 @@
def check_jobs(wiki, date, job, skipjobs, page_id_range, partnum_todo,
- checkpoint_file, prefetch, spawn, dryrun, skipdone, verbose,
+ checkpoint_file, prefetch, prefetchdate, spawn, dryrun,
skipdone, verbose,
html_notice, prereqs=False, restart=False):
'''
if prereqs is False:
@@ -40,7 +40,7 @@
wiki.set_date(date)
- runner = Runner(wiki, prefetch=prefetch, spawn=spawn, job=job,
+ runner = Runner(wiki, prefetch=prefetch, prefetchdate=prefetchdate,
spawn=spawn, job=job,
skip_jobs=skipjobs, restart=restart, notice=html_notice,
dryrun=dryrun,
enabled=None, partnum_todo=partnum_todo,
checkpoint_file=checkpoint_file,
page_id_range=page_id_range, skipdone=skipdone,
verbose=verbose)
@@ -84,8 +84,8 @@
return True
-def find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, spawn, dryrun,
- html_notice, bystatustime=False,
+def find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, prefetchdate,
+ spawn, dryrun, html_notice, bystatustime=False,
check_job_status=False, check_prereq_status=False,
date=None, job=None, skipjobs=None, page_id_range=None,
partnum_todo=None, checkpoint_file=None,
skipdone=False, restart=False,
@@ -117,7 +117,8 @@
if check_job_status:
if check_jobs(wiki, date, job, skipjobs, page_id_range,
partnum_todo, checkpoint_file, restart,
- prefetch, spawn, dryrun, skipdone, verbose,
html_notice):
+ prefetch, prefetchdate, spawn, dryrun,
+ skipdone, verbose, html_notice):
continue
try:
if locks_enabled:
@@ -129,7 +130,8 @@
# if we skip locked wikis which are missing the prereqs for
this job,
# there are still wikis where this job needs to run
if not check_jobs(wiki, date, job, skipjobs, page_id_range,
partnum_todo,
- checkpoint_file, prefetch, spawn, dryrun,
skipdone, verbose,
+ checkpoint_file, prefetch, prefetchdate,
+ spawn, dryrun, skipdone, verbose,
html_notice, prereqs=True, restart=restart):
missing_prereqs = True
sys.stderr.write("Couldn't lock %s, someone else must have got
it...\n" % dbname)
@@ -146,7 +148,7 @@
usage_text = """Usage: python worker.py [options] [wikidbname]
Options: --aftercheckpoint, --checkpoint, --partnum, --configfile, --date,
--job,
--skipjobs, --addnotice, --delnotice, --force, --noprefetch,
- --nospawn, --restartfrom, --log, --cleanup, --cutoff\n")
+ --prefetchdate, --nospawn, --restartfrom, --log, --cleanup,
--cutoff\n")
--aftercheckpoint: Restart this job from the after specified checkpoint file,
doing the
rest of the job for the appropriate part number if parallel
subjobs each
doing one part are configured, or for the all the rest of the
revisions
@@ -184,6 +186,10 @@
runners try to work on that wiki. Default: for single jobs,
don't lock
--noprefetch: Do not use a previous file's contents for speeding up the dumps
(helpful if the previous files may have corrupt contents)
+--prefetchdate: Read page content from the dump of the specified date
(YYYYMMDD)
+ and reuse for the current page content dumps. If not
specified
+ and prefetch is enabled (the default), the most recent good
+ dump will be used.
--nospawn: Do not spawn a separate process in order to retrieve revision
texts
--restartfrom: Do all jobs after the one specified via --job, including that
one
--skipdone: Do only jobs that are not already succefully completed
@@ -209,6 +215,7 @@
config_file = False
force_lock = False
prefetch = True
+ prefetchdate = None
spawn = True
restart = False
jobs_requested = None
@@ -231,8 +238,8 @@
(options, remainder) = getopt.gnu_getopt(
sys.argv[1:], "",
['date=', 'job=', 'skipjobs=', 'configfile=', 'addnotice=',
- 'delnotice', 'force', 'dryrun', 'noprefetch', 'nospawn',
- 'restartfrom', 'aftercheckpoint=', 'log', 'partnum=',
+ 'delnotice', 'force', 'dryrun', 'noprefetch', 'prefetchdate=',
+ 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log',
'partnum=',
'checkpoint=', 'pageidrange=', 'cutoff=', "skipdone",
"exclusive", "cleanup", 'verbose'])
except:
@@ -254,6 +261,8 @@
checkpoint_file = val
elif opt == "--noprefetch":
prefetch = False
+ elif opt == "--prefetchdate":
+ prefetchdate = val
elif opt == "--nospawn":
spawn = False
elif opt == "--dryrun":
@@ -313,7 +322,10 @@
usage("--pageidrange option requires --job")
if page_id_range and checkpoint_file is not None:
usage("--pageidrange option cannot be used with --checkpoint
option")
-
+ if prefetchdate is not None and not prefetch:
+ usage("prefetchdate and noprefetch options may not be specified
together")
+ if prefetchdate is not None and (not prefetchdate.isdigit() or
len(prefetchdate) != 8):
+ usage("prefetchdate must be of the form YYYYMMDD")
if skip_jobs is None:
skip_jobs = []
else:
@@ -400,7 +412,8 @@
check_prereq_status = True
else:
check_prereq_status = False
- wiki = find_lock_next_wiki(config, locks_enabled, cutoff,
prefetch, spawn,
+ wiki = find_lock_next_wiki(config, locks_enabled, cutoff, prefetch,
+ prefetchdate, spawn,
dryrun, html_notice, check_status_time,
check_job_status, check_prereq_status,
date, jobs_todo[0], skip_jobs,
page_id_range,
@@ -452,7 +465,7 @@
# no specific jobs requested, runner will do them all
if not len(jobs_todo):
- runner = Runner(wiki, prefetch, spawn, None, skip_jobs,
+ runner = Runner(wiki, prefetch, prefetchdate, spawn, None,
skip_jobs,
restart, html_notice, dryrun, enabled,
partnum_todo, checkpoint_file, page_id_range,
skipdone,
cleanup_files, verbose)
@@ -464,7 +477,7 @@
else:
# do each job requested one at a time
for job in jobs_todo:
- runner = Runner(wiki, prefetch, spawn, job, skip_jobs,
+ runner = Runner(wiki, prefetch, prefetchdate, spawn, job,
skip_jobs,
restart, html_notice, dryrun, enabled,
partnum_todo, checkpoint_file,
page_id_range, skipdone,
cleanup_files, verbose)
--
To view, visit https://gerrit.wikimedia.org/r/298254
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I69d504bc52b4c56851b8e936eaef7b7c4ee15417
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits