ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/399753 )
Change subject: permit use of 7zip compressed files for prefetch
......................................................................

permit use of 7zip compressed files for prefetch

Bug: T179267
Change-Id: I14d4636c78d81a9bfbf04f7f4c218875fcb870dc
---
M xmldumps-backup/defaults.conf
M xmldumps-backup/dumps/WikiDump.py
M xmldumps-backup/dumps/xmlcontentjobs.py
3 files changed, 73 insertions(+), 42 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified

diff --git a/xmldumps-backup/defaults.conf b/xmldumps-backup/defaults.conf
index 222cb2d..109fad6 100644
--- a/xmldumps-backup/defaults.conf
+++ b/xmldumps-backup/defaults.conf
@@ -80,4 +80,5 @@
 maxrevs=50000
 
 [misc]
-fixeddumporder=0
\ No newline at end of file
+fixeddumporder=0
+sevenzipprefetch=0
diff --git a/xmldumps-backup/dumps/WikiDump.py b/xmldumps-backup/dumps/WikiDump.py
index 005f858..77b86f0 100644
--- a/xmldumps-backup/dumps/WikiDump.py
+++ b/xmldumps-backup/dumps/WikiDump.py
@@ -316,6 +316,11 @@
             self.conf.add_section('wiki')
         self.wiki_dir = self.get_opt_for_proj_or_default("wiki", "dir", 0)
 
+        if not self.conf.has_section('misc'):
+            self.conf.add_section('misc')
+        self.sevenzip_prefetch = self.get_opt_in_overrides_or_default("misc", "sevenzipprefetch", 0)
+        self.sevenzip_prefetch = int(self.sevenzip_prefetch, 0)
+
     def db_latest_status(self):
         '''
         return list of tuples for each wiki:
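The WikiDump.py hunk above reads the new knob via get_opt_in_overrides_or_default and parses it with int(value, 0), i.e. base auto-detection on the raw string. A minimal runnable sketch of that pattern, assuming Python 3's configparser and a hypothetical get_misc_option helper standing in for the repo's own lookup:

    from configparser import ConfigParser

    def get_misc_option(conf, name, default):
        # hypothetical stand-in for get_opt_in_overrides_or_default:
        # return the raw string, falling back to the given default
        if conf.has_section('misc') and conf.has_option('misc', name):
            return conf.get('misc', name)
        return default

    conf = ConfigParser()
    conf.read_string("[misc]\nfixeddumporder=0\nsevenzipprefetch=1\n")
    # as in WikiDump.py above: fetch the raw value, then parse with
    # base 0, so "0", "1", and even "0x1"-style strings all work
    sevenzip_prefetch = int(get_misc_option(conf, 'sevenzipprefetch', '0'), 0)
    print(sevenzip_prefetch)  # 1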
diff --git a/xmldumps-backup/dumps/xmlcontentjobs.py b/xmldumps-backup/dumps/xmlcontentjobs.py
index 2e21232..7b7956a 100644
--- a/xmldumps-backup/dumps/xmlcontentjobs.py
+++ b/xmldumps-backup/dumps/xmlcontentjobs.py
@@ -198,6 +198,55 @@
         pagerange['end'] = None
         return pagerange
 
+    def _find_prefetch_files_from_run(self, runner, date, jobinfo,
+                                      pagerange, file_ext):
+        """
+        for a given wiki and date, see if there are dump content
+        files lying about that can be used for prefetch to the
+        current job, with the given file extension (might be bz2s
+        or 7zs or whatever) for the given range of pages
+        """
+        dfnames = get_checkpt_files(
+            runner.dump_dir, [jobinfo['dumpname']], self.jobinfo['ftype'],
+            file_ext, date, parts=None)
+        possible_prefetch_dfnames = self.get_relevant_prefetch_dfnames(
+            dfnames, pagerange, date, runner)
+        if len(possible_prefetch_dfnames):
+            return possible_prefetch_dfnames
+
+        # ok, let's check for file parts instead, from any run
+        # (may not conform to our numbering for this job)
+        dfnames = get_reg_files(
+            runner.dump_dir, [jobinfo['dumpname']], jobinfo['ftype'],
+            file_ext, date, parts=True)
+        possible_prefetch_dfnames = self.get_relevant_prefetch_dfnames(
+            dfnames, pagerange, date, runner)
+        if len(possible_prefetch_dfnames):
+            return possible_prefetch_dfnames
+
+        # last shot, get output file that contains all the pages, if there is one
+        dfnames = get_reg_files(
+            runner.dump_dir, [jobinfo['dumpname']],
+            jobinfo['ftype'], file_ext, date, parts=False)
+        # there is only one, don't bother to check for relevance :-P
+        possible_prefetch_dfnames = dfnames
+        dfnames = []
+        for prefetch_dfname in possible_prefetch_dfnames:
+            if runner.wiki.is_private():
+                possible_path = runner.dump_dir.filename_private_path(prefetch_dfname, date)
+            else:
+                possible_path = runner.dump_dir.filename_public_path(prefetch_dfname, date)
+            size = os.path.getsize(possible_path)
+            if size < 70000:
+                runner.debug("small %d-byte prefetch dump at %s, skipping" % (
+                    size, possible_path))
+                continue
+            else:
+                dfnames.append(prefetch_dfname)
+        if len(dfnames):
+            return dfnames
+        return None
+
     def _find_previous_dump(self, runner, partnum=None):
         """
         this finds the content file or files from the first previous successful dump
@@ -226,46 +275,15 @@
                 runner.debug("skipping incomplete or failed dump for prefetch date %s" % date)
                 continue
 
-            # first check if there are checkpoint files from this run we can use
-            dfnames = get_checkpt_files(
-                runner.dump_dir, [self.jobinfo['dumpname']], self.jobinfo['ftype'],
-                self.jobinfo['fext'], date, parts=None)
-            possible_prefetch_dfnames = self.get_relevant_prefetch_dfnames(
-                dfnames, pagerange, date, runner)
-            if len(possible_prefetch_dfnames):
-                return possible_prefetch_dfnames
+            # might look first for 7z files, then for bz2,
+            # in any case go through the entire dance for each extension
+            # before giving up and moving to next one
+            for file_ext in self.jobinfo['fexts']:
 
-            # ok, let's check for file parts instead, from any run
-            # (may not conform to our numbering for this job)
-            dfnames = get_reg_files(
-                runner.dump_dir, [self.jobinfo['dumpname']], self.jobinfo['ftype'],
-                self.jobinfo['fext'], date, parts=True)
-            possible_prefetch_dfnames = self.get_relevant_prefetch_dfnames(
-                dfnames, pagerange, date, runner)
-            if len(possible_prefetch_dfnames):
-                return possible_prefetch_dfnames
-
-            # last shot, get output file that contains all the pages, if there is one
-            dfnames = get_reg_files(
-                runner.dump_dir, [self.jobinfo['dumpname']],
-                self.jobinfo['ftype'], self.jobinfo['fext'], date, parts=False)
-            # there is only one, don't bother to check for relevance :-P
-            possible_prefetch_dfnames = dfnames
-            dfnames = []
-            for prefetch_dfname in possible_prefetch_dfnames:
-                if runner.wiki.is_private():
-                    possible_path = runner.dump_dir.filename_private_path(prefetch_dfname, date)
-                else:
-                    possible_path = runner.dump_dir.filename_public_path(prefetch_dfname, date)
-                size = os.path.getsize(possible_path)
-                if size < 70000:
-                    runner.debug("small %d-byte prefetch dump at %s, skipping" % (
-                        size, possible_path))
-                    continue
-                else:
-                    dfnames.append(prefetch_dfname)
-            if len(dfnames):
-                return dfnames
+                dfnames_found = self._find_prefetch_files_from_run(
+                    runner, date, self.jobinfo, pagerange, file_ext)
+                if dfnames_found:
+                    return dfnames_found
 
         runner.debug("Could not locate a prefetchable dump.")
         return None
@@ -304,7 +322,10 @@
         else:
             partnum_str = ""
         if len(sources) > 0:
-            source = "bzip2:%s" % (";".join(sources))
+            if sources[0].endswith('7z'):
+                source = "7zip:%s" % (";".join(sources))
+            else:
+                source = "bzip2:%s" % (";".join(sources))
             runner.show_runner_state("... building %s %s XML dump, with text prefetch from %s..."
                                      % (self.jobinfo['subset'], partnum_str, source))
             prefetch = "--prefetch=%s" % (source)
@@ -676,11 +697,15 @@
             dfnames_todo = self.make_bitesize_jobs(dfnames_todo, stub_pageranges)
 
         if self.jobinfo['prefetch']:
+            if runner.wiki.config.sevenzip_prefetch:
+                file_exts = ['7z', self.file_ext]
+            else:
+                file_exts = [self.file_ext]
             prefetcher = PrefetchFinder(
                 self.wiki,
                 {'name': self.name(), 'desc': self.jobinfo['desc'],
                  'dumpname': self.get_dumpname(),
-                 'ftype': self.file_type, 'fext': self.file_ext,
+                 'ftype': self.file_type, 'fexts': file_exts,
                  'subset': self.jobinfo['subset']},
                 {'date': self.jobinfo['prefetchdate'], 'parts': self._parts},
                 self.verbose)
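Taken together, the xmlcontentjobs.py hunks mean a job tries each candidate extension in order (7z first when sevenzipprefetch is set) and then tags the --prefetch source with the matching decompressor. A minimal sketch of that flow; pick_prefetch_dfnames, find_for_ext, and the sample filename are illustrative stand-ins, not the repo's API:

    def pick_prefetch_dfnames(file_exts, find_for_ext):
        # run the whole search for one extension before falling
        # back to the next, as _find_previous_dump now does
        for file_ext in file_exts:
            dfnames_found = find_for_ext(file_ext)
            if dfnames_found:
                return dfnames_found
        return None

    def build_prefetch_arg(sources):
        # mirror the merged logic: choose the decompressor prefix
        # from the extension of the first source file
        if sources[0].endswith('7z'):
            source = "7zip:%s" % (";".join(sources))
        else:
            source = "bzip2:%s" % (";".join(sources))
        return "--prefetch=%s" % (source)

    # with sevenzipprefetch=1 the job passes file_exts = ['7z', 'bz2']
    sources = pick_prefetch_dfnames(
        ['7z', 'bz2'],
        lambda ext: ['elwiki-20171220-pages-meta-history.xml.' + ext])
    print(build_prefetch_arg(sources))
    # prints: --prefetch=7zip:elwiki-20171220-pages-meta-history.xml.7z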
--
To view, visit https://gerrit.wikimedia.org/r/399753
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I14d4636c78d81a9bfbf04f7f4c218875fcb870dc
Gerrit-PatchSet: 3
Gerrit-Project: operations/dumps
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <ar...@wikimedia.org>
Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits