ArielGlenn has submitted this change and it was merged.

Change subject: add argument for specifying date of dump to use for prefetch
......................................................................


add argument for specifying date of dump to use for prefetch

In case previous dump runs completed but for some reason we don't
want to use their page content dumps for prefetch, allow the caller
to choose, by date, the particular dump run to use for prefetch.

Bug: T137887
Change-Id: I69d504bc52b4c56851b8e936eaef7b7c4ee15417
---
M xmldumps-backup/dumps/runner.py
M xmldumps-backup/dumps/xmljobs.py
M xmldumps-backup/worker.py
3 files changed, 45 insertions(+), 20 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/xmldumps-backup/dumps/runner.py b/xmldumps-backup/dumps/runner.py
index b501c4a..196b3e6 100644
--- a/xmldumps-backup/dumps/runner.py
+++ b/xmldumps-backup/dumps/runner.py
@@ -93,7 +93,7 @@
 
 
 class DumpItemList(object):
-    def __init__(self, wiki, prefetch, spawn, partnum_todo, checkpoint_file,
+    def __init__(self, wiki, prefetch, prefetchdate, spawn, partnum_todo, 
checkpoint_file,
                  singleJob, skip_jobs, filepart, page_id_range, dumpjobdata, 
dump_dir,
                  verbose):
         self.wiki = wiki
@@ -103,6 +103,7 @@
         self._is_wikidata_client = self.wiki.is_wikidata_client()
         self._has_flow = self.wiki.has_flow()
         self._prefetch = prefetch
+        self._prefetchdate = prefetchdate
         self._spawn = spawn
         self.filepart = filepart
         self.checkpoint_file = checkpoint_file
@@ -217,7 +218,8 @@
                     "and primary meta-pages.</b></big>",
                     "This contains current versions of article content, " +
                     "and is the archive most mirror sites will probably want.",
-                    self.find_item_by_name('xmlstubsdump'), self._prefetch, 
self._spawn,
+                    self.find_item_by_name('xmlstubsdump'), self._prefetch,
+                    self._prefetchdate, self._spawn,
                     self.wiki, self._get_partnum_todo("articlesdump"),
                     self.filepart.get_pages_per_filepart_history(), 
checkpoints,
                     self.checkpoint_file, self.page_id_range, self.verbose))
@@ -238,6 +240,7 @@
                     "Discussion and user pages are included in this complete 
archive. " +
                     "Most mirrors won't want this extra material.",
                     self.find_item_by_name('xmlstubsdump'), self._prefetch,
+                    self._prefetchdate,
                     self._spawn, self.wiki, 
self._get_partnum_todo("metacurrentdump"),
                     self.filepart.get_pages_per_filepart_history(), 
checkpoints,
                     self.checkpoint_file, self.page_id_range, self.verbose))
@@ -311,7 +314,8 @@
                 "20 times the archive download size. " +
                 "Suitable for archival and statistical use, " +
                 "most mirror sites won't want or need this.",
-                self.find_item_by_name('xmlstubsdump'), self._prefetch, 
self._spawn,
+                self.find_item_by_name('xmlstubsdump'), self._prefetch,
+                self._prefetchdate, self._spawn,
                 self.wiki, self._get_partnum_todo("metahistorybz2dump"),
                 self.filepart.get_pages_per_filepart_history(),
                 checkpoints, self.checkpoint_file, self.page_id_range, 
self.verbose))
@@ -485,13 +489,15 @@
 
 
 class Runner(object):
-    def __init__(self, wiki, prefetch=True, spawn=True, job=None, 
skip_jobs=None,
+    def __init__(self, wiki, prefetch=True, prefetchdate=None, spawn=True,
+                 job=None, skip_jobs=None,
                  restart=False, notice="", dryrun=False, enabled=None,
                  partnum_todo=None, checkpoint_file=None, page_id_range=None,
                  skipdone=False, cleanup=False, verbose=False):
         self.wiki = wiki
         self.db_name = wiki.db_name
         self.prefetch = prefetch
+        self.prefetchdate = prefetchdate
         self.spawn = spawn
         self.filepart_info = FilePartInfo(wiki, self.db_name, 
self.log_and_print)
         self.restart = restart
@@ -581,7 +587,8 @@
                                           self.verbose)
 
         # some or all of these dump_items will be marked to run
-        self.dump_item_list = DumpItemList(self.wiki, self.prefetch, 
self.spawn,
+        self.dump_item_list = DumpItemList(self.wiki, self.prefetch, 
self.prefetchdate,
+                                           self.spawn,
                                            self._partnum_todo, 
self.checkpoint_file,
                                            self.job_requested, self.skip_jobs,
                                            self.filepart_info, 
self.page_id_range,
diff --git a/xmldumps-backup/dumps/xmljobs.py b/xmldumps-backup/dumps/xmljobs.py
index a6bade7..b06c1ba 100644
--- a/xmldumps-backup/dumps/xmljobs.py
+++ b/xmldumps-backup/dumps/xmljobs.py
@@ -196,13 +196,15 @@
 
 class XmlDump(Dump):
     """Primary XML dumps, one section at a time."""
-    def __init__(self, subset, name, desc, detail, item_for_stubs, prefetch, 
spawn,
+    def __init__(self, subset, name, desc, detail, item_for_stubs, prefetch,
+                 prefetchdate, spawn,
                  wiki, partnum_todo, parts=False, checkpoints=False, 
checkpoint_file=None,
                  page_id_range=None, verbose=False):
         self._subset = subset
         self._detail = detail
         self._desc = desc
         self._prefetch = prefetch
+        self._prefetchdate = prefetchdate
         self._spawn = spawn
         self._parts = parts
         if self._parts:
@@ -751,7 +753,10 @@
             start_page_id = 1
             end_page_id = None
 
-        dumps = self.wiki.dump_dirs()
+        if self._prefetchdate:
+            dumps = [self._prefetchdate]
+        else:
+            dumps = self.wiki.dump_dirs()
         dumps.sort()
         dumps.reverse()
         for date in dumps:
diff --git a/xmldumps-backup/worker.py b/xmldumps-backup/worker.py
index 9706f3a..14bc674 100644
--- a/xmldumps-backup/worker.py
+++ b/xmldumps-backup/worker.py
@@ -12,7 +12,7 @@
 
 
 def check_jobs(wiki, date, job, skipjobs, page_id_range, partnum_todo,
-               checkpoint_file, prefetch, spawn, dryrun, skipdone, verbose,
+               checkpoint_file, prefetch, prefetchdate, spawn, dryrun, 
skipdone, verbose,
                html_notice, prereqs=False, restart=False):
     '''
     if prereqs is False:
@@ -40,7 +40,7 @@
 
     wiki.set_date(date)
 
-    runner = Runner(wiki, prefetch=prefetch, spawn=spawn, job=job,
+    runner = Runner(wiki, prefetch=prefetch, prefetchdate=prefetchdate, 
spawn=spawn, job=job,
                     skip_jobs=skipjobs, restart=restart, notice=html_notice, 
dryrun=dryrun,
                     enabled=None, partnum_todo=partnum_todo, 
checkpoint_file=checkpoint_file,
                     page_id_range=page_id_range, skipdone=skipdone, 
verbose=verbose)
@@ -84,8 +84,8 @@
         return True
 
 
-def find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, spawn, dryrun,
-                        html_notice, bystatustime=False,
+def find_lock_next_wiki(config, locks_enabled, cutoff, prefetch, prefetchdate,
+                        spawn, dryrun, html_notice, bystatustime=False,
                         check_job_status=False, check_prereq_status=False,
                         date=None, job=None, skipjobs=None, page_id_range=None,
                         partnum_todo=None, checkpoint_file=None, 
skipdone=False, restart=False,
@@ -117,7 +117,8 @@
         if check_job_status:
             if check_jobs(wiki, date, job, skipjobs, page_id_range,
                           partnum_todo, checkpoint_file, restart,
-                          prefetch, spawn, dryrun, skipdone, verbose, 
html_notice):
+                          prefetch, prefetchdate, spawn, dryrun,
+                          skipdone, verbose, html_notice):
                 continue
         try:
             if locks_enabled:
@@ -129,7 +130,8 @@
                 # if we skip locked wikis which are missing the prereqs for 
this job,
                 # there are still wikis where this job needs to run
                 if not check_jobs(wiki, date, job, skipjobs, page_id_range, 
partnum_todo,
-                                  checkpoint_file, prefetch, spawn, dryrun, 
skipdone, verbose,
+                                  checkpoint_file, prefetch, prefetchdate,
+                                  spawn, dryrun, skipdone, verbose,
                                   html_notice, prereqs=True, restart=restart):
                     missing_prereqs = True
             sys.stderr.write("Couldn't lock %s, someone else must have got 
it...\n" % dbname)
@@ -146,7 +148,7 @@
     usage_text = """Usage: python worker.py [options] [wikidbname]
 Options: --aftercheckpoint, --checkpoint, --partnum, --configfile, --date, 
--job,
          --skipjobs, --addnotice, --delnotice, --force, --noprefetch,
-         --nospawn, --restartfrom, --log, --cleanup, --cutoff\n")
+         --prefetchdate, --nospawn, --restartfrom, --log, --cleanup, 
--cutoff\n")
 --aftercheckpoint: Restart this job from the after specified checkpoint file, 
doing the
                rest of the job for the appropriate part number if parallel 
subjobs each
                doing one part are configured, or for the all the rest of the 
revisions
@@ -184,6 +186,10 @@
                runners try to work on that wiki. Default: for single jobs, 
don't lock
 --noprefetch:  Do not use a previous file's contents for speeding up the dumps
                (helpful if the previous files may have corrupt contents)
+--prefetchdate:  Read page content from the dump of the specified date 
(YYYYMMDD)
+                 and reuse for the current page content dumps.  If not 
specified
+                 and prefetch is enabled (the default), the most recent good
+                 dump will be used.
 --nospawn:     Do not spawn a separate process in order to retrieve revision 
texts
 --restartfrom: Do all jobs after the one specified via --job, including that 
one
 --skipdone:    Do only jobs that are not already succefully completed
@@ -209,6 +215,7 @@
         config_file = False
         force_lock = False
         prefetch = True
+        prefetchdate = None
         spawn = True
         restart = False
         jobs_requested = None
@@ -231,8 +238,8 @@
             (options, remainder) = getopt.gnu_getopt(
                 sys.argv[1:], "",
                 ['date=', 'job=', 'skipjobs=', 'configfile=', 'addnotice=',
-                 'delnotice', 'force', 'dryrun', 'noprefetch', 'nospawn',
-                 'restartfrom', 'aftercheckpoint=', 'log', 'partnum=',
+                 'delnotice', 'force', 'dryrun', 'noprefetch', 'prefetchdate=',
+                 'nospawn', 'restartfrom', 'aftercheckpoint=', 'log', 
'partnum=',
                  'checkpoint=', 'pageidrange=', 'cutoff=', "skipdone",
                  "exclusive", "cleanup", 'verbose'])
         except:
@@ -254,6 +261,8 @@
                 checkpoint_file = val
             elif opt == "--noprefetch":
                 prefetch = False
+            elif opt == "--prefetchdate":
+                prefetchdate = val
             elif opt == "--nospawn":
                 spawn = False
             elif opt == "--dryrun":
@@ -313,7 +322,10 @@
             usage("--pageidrange option requires --job")
         if page_id_range and checkpoint_file is not None:
             usage("--pageidrange option cannot be used with --checkpoint 
option")
-
+        if prefetchdate is not None and not prefetch:
+            usage("prefetchdate and noprefetch options may not be specified 
together")
+        if prefetchdate is not None and (not prefetchdate.isdigit() or 
len(prefetchdate) != 8):
+            usage("prefetchdate must be of the form YYYYMMDD")
         if skip_jobs is None:
             skip_jobs = []
         else:
@@ -400,7 +412,8 @@
                     check_prereq_status = True
                 else:
                     check_prereq_status = False
-            wiki = find_lock_next_wiki(config, locks_enabled, cutoff, 
prefetch, spawn,
+            wiki = find_lock_next_wiki(config, locks_enabled, cutoff, prefetch,
+                                       prefetchdate, spawn,
                                        dryrun, html_notice, check_status_time,
                                        check_job_status, check_prereq_status,
                                        date, jobs_todo[0], skip_jobs, 
page_id_range,
@@ -452,7 +465,7 @@
 
             # no specific jobs requested, runner will do them all
             if not len(jobs_todo):
-                runner = Runner(wiki, prefetch, spawn, None, skip_jobs,
+                runner = Runner(wiki, prefetch, prefetchdate, spawn, None, 
skip_jobs,
                                 restart, html_notice, dryrun, enabled,
                                 partnum_todo, checkpoint_file, page_id_range, 
skipdone,
                                 cleanup_files, verbose)
@@ -464,7 +477,7 @@
             else:
                 # do each job requested one at a time
                 for job in jobs_todo:
-                    runner = Runner(wiki, prefetch, spawn, job, skip_jobs,
+                    runner = Runner(wiki, prefetch, prefetchdate, spawn, job, 
skip_jobs,
                                     restart, html_notice, dryrun, enabled,
                                     partnum_todo, checkpoint_file, 
page_id_range, skipdone,
                                     cleanup_files, verbose)

-- 
To view, visit https://gerrit.wikimedia.org/r/298254
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I69d504bc52b4c56851b8e936eaef7b7c4ee15417
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to