ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/400226 )
Change subject: make reporting of file sizes for dump steps in progress work again ...................................................................... make reporting of file sizes for dump steps in progress work again This was probably broken when we introduced writing files with a special extension and then moving them into place once the run completes successfully. Bug: T183694 Change-Id: Ib4bc9f6a41f31431e5642a3f7f7415bd2de38ea8 --- M xmldumps-backup/dumps/apijobs.py M xmldumps-backup/dumps/fileutils.py M xmldumps-backup/dumps/flowjob.py M xmldumps-backup/dumps/jobs.py M xmldumps-backup/dumps/recombinejobs.py M xmldumps-backup/dumps/recompressjobs.py M xmldumps-backup/dumps/runnerutils.py M xmldumps-backup/dumps/tablesjobs.py M xmldumps-backup/dumps/xmlcontentjobs.py M xmldumps-backup/dumps/xmljobs.py 10 files changed, 51 insertions(+), 33 deletions(-) Approvals: ArielGlenn: Looks good to me, approved jenkins-bot: Verified diff --git a/xmldumps-backup/dumps/apijobs.py b/xmldumps-backup/dumps/apijobs.py index 3991add..534ca99 100644 --- a/xmldumps-backup/dumps/apijobs.py +++ b/xmldumps-backup/dumps/apijobs.py @@ -1,5 +1,6 @@ import time from dumps.exceptions import BackupError +from dumps.fileutils import DumpFilename from dumps.jobs import Dump @@ -30,11 +31,11 @@ commands = self.build_command(runner) if runner.wiki.is_private(): command_series = runner.get_save_command_series( - commands, self.get_inprogress_name( + commands, DumpFilename.get_inprogress_name( runner.dump_dir.filename_private_path(output_dfname))) else: command_series = runner.get_save_command_series( - commands, self.get_inprogress_name( + commands, DumpFilename.get_inprogress_name( runner.dump_dir.filename_public_path(output_dfname))) self.setup_command_info(runner, command_series, [output_dfname]) diff --git a/xmldumps-backup/dumps/fileutils.py b/xmldumps-backup/dumps/fileutils.py index a264ece..e6b4b67 100644 --- a/xmldumps-backup/dumps/fileutils.py +++ 
b/xmldumps-backup/dumps/fileutils.py @@ -165,6 +165,8 @@ partnum_int part number as int """ + INPROG = ".inprog" # extension for dump output files that are in progress (not fully written) + @staticmethod def make_checkpoint_string(first_page_id, last_page_id): if first_page_id is not None and last_page_id is not None: @@ -172,6 +174,10 @@ else: return None + @staticmethod + def get_inprogress_name(filename): + return filename + DumpFilename.INPROG + def __init__(self, wiki, date=None, dump_name=None, filetype=None, ext=None, partnum=None, checkpoint=None, temp=False): """Constructor. Arguments: the dump name as it should appear in the filename, diff --git a/xmldumps-backup/dumps/flowjob.py b/xmldumps-backup/dumps/flowjob.py index b875101..f1de495 100644 --- a/xmldumps-backup/dumps/flowjob.py +++ b/xmldumps-backup/dumps/flowjob.py @@ -5,6 +5,7 @@ import os from dumps.exceptions import BackupError from dumps.utils import MultiVersion +from dumps.fileutils import DumpFilename from dumps.jobs import Dump @@ -45,7 +46,7 @@ command.extend(script_command) command.extend(["--wiki=%s" % runner.db_name, "--current", "--report=1000", - "--output=bzip2:%s" % self.get_inprogress_name(flow_output_fpath)]) + "--output=bzip2:%s" % DumpFilename.get_inprogress_name(flow_output_fpath)]) if self.history: command.append("--full") pipeline = [command] diff --git a/xmldumps-backup/dumps/jobs.py b/xmldumps-backup/dumps/jobs.py index 8ebbbf1..256056b 100644 --- a/xmldumps-backup/dumps/jobs.py +++ b/xmldumps-backup/dumps/jobs.py @@ -54,8 +54,6 @@ class Dump(object): - INPROG = ".inprog" # extension for dump output files that are in progress (not fully written) - def __init__(self, name, desc, verbose=False): self._desc = desc self.verbose = verbose @@ -84,14 +82,12 @@ if not hasattr(self, '_parts'): self._parts = False - def get_inprogress_name(self, filename): - return filename + self.INPROG - def setup_command_info(self, runner, command_series, output_dfnames, output_dir=None): 
command_info = {} command_info['runner'] = runner command_info['series'] = command_series - command_info['output_files'] = [dfname.filename + self.INPROG for dfname in output_dfnames] + command_info['output_files'] = [dfname.filename + DumpFilename.INPROG + for dfname in output_dfnames] if output_dir is not None: command_info['output_dir'] = output_dir else: @@ -239,12 +235,12 @@ file_truncated = True if runner.wiki.is_private(): dcontents = DumpContents(runner.wiki, - self.get_inprogress_name( + DumpFilename.get_inprogress_name( runner.dump_dir.filename_private_path(dfname)), dfname) else: dcontents = DumpContents(runner.wiki, - self.get_inprogress_name( + DumpFilename.get_inprogress_name( runner.dump_dir.filename_public_path(dfname)), dfname) if exists(dcontents.filename): @@ -319,10 +315,11 @@ if not commands['output_files']: return for inprogress_filename in commands['output_files']: - if not inprogress_filename.endswith(self.INPROG): + if not inprogress_filename.endswith(DumpFilename.INPROG): continue final_dfname = DumpFilename(commands['runner'].wiki) - final_dfname.new_from_filename(inprogress_filename[:-1 * len(self.INPROG)]) + final_dfname.new_from_filename( + inprogress_filename[:-1 * len(DumpFilename.INPROG)]) in_progress_path = os.path.join(commands['output_dir'], inprogress_filename) final_path = os.path.join(commands['output_dir'], final_dfname.filename) @@ -347,10 +344,10 @@ os.remove(dump_dir.filename_public_path(dfname)) elif exists(dump_dir.filename_private_path(dfname)): os.remove(dump_dir.filename_private_path(dfname)) - if exists(dump_dir.filename_public_path(dfname) + self.INPROG): - os.remove(dump_dir.filename_public_path(dfname) + self.INPROG) - elif exists(dump_dir.filename_private_path(dfname) + self.INPROG): - os.remove(dump_dir.filename_private_path(dfname) + self.INPROG) + if exists(dump_dir.filename_public_path(dfname) + DumpFilename.INPROG): + os.remove(dump_dir.filename_public_path(dfname) + DumpFilename.INPROG) + elif 
exists(dump_dir.filename_private_path(dfname) + DumpFilename.INPROG): + os.remove(dump_dir.filename_private_path(dfname) + DumpFilename.INPROG) def cleanup_old_files(self, dump_dir, runner): if "cleanup_old_files" in runner.enabled: diff --git a/xmldumps-backup/dumps/recombinejobs.py b/xmldumps-backup/dumps/recombinejobs.py index 12c49e5..6d1f0a1 100644 --- a/xmldumps-backup/dumps/recombinejobs.py +++ b/xmldumps-backup/dumps/recombinejobs.py @@ -7,6 +7,7 @@ import signal from dumps.exceptions import BackupError from dumps.jobs import Dump +from dumps.fileutils import DumpFilename from dumps.CommandManagement import CommandPipeline @@ -74,7 +75,7 @@ recombines.append(recombine) recombine_command_string = ("(" + ";".join(recombines) + ")" + "|" + "%s %s" % (compression_command, - self.get_inprogress_name(output_filename))) + DumpFilename.get_inprogress_name(output_filename))) return recombine_command_string diff --git a/xmldumps-backup/dumps/recompressjobs.py b/xmldumps-backup/dumps/recompressjobs.py index 90c22c8..49ca8aa 100644 --- a/xmldumps-backup/dumps/recompressjobs.py +++ b/xmldumps-backup/dumps/recompressjobs.py @@ -110,8 +110,8 @@ infilepath = runner.dump_dir.filename_public_path(input_dfname) command_pipe = [["%s -dc %s | %s --pagesperstream 100 --buildindex %s > %s" % (self.wiki.config.bzip2, infilepath, self.wiki.config.recompressxml, - self.get_inprogress_name(outfilepath_index), - self.get_inprogress_name(outfilepath))]] + DumpFilename.get_inprogress_name(outfilepath_index), + DumpFilename.get_inprogress_name(outfilepath))]] return [command_pipe] def run(self, runner): @@ -350,7 +350,7 @@ command_pipe = [["%s -dc %s | %s a -mx=4 -si %s" % (self.wiki.config.bzip2, infilepath, self.wiki.config.sevenzip, - self.get_inprogress_name(outfilepath))]] + DumpFilename.get_inprogress_name(outfilepath))]] command_series.append(command_pipe) return command_series diff --git a/xmldumps-backup/dumps/runnerutils.py b/xmldumps-backup/dumps/runnerutils.py index 
4688900..10b0295 100644 --- a/xmldumps-backup/dumps/runnerutils.py +++ b/xmldumps-backup/dumps/runnerutils.py @@ -272,9 +272,18 @@ status ("in-progress", "missing", ...) """ filename = dump_dir.filename_public_path(dfname) + size = None if exists(filename): size = os.path.getsize(filename) - else: + elif item_status == "in-progress": + # note that because multiple files may be produced for a single dump + # job, some may be complete while others are still in progress. + # therefore we check the normal name first, falling back to the + # inprogress name. + filename = filename + DumpFilename.INPROG + if exists(filename): + size = os.path.getsize(filename) + if size is None: item_status = "missing" size = 0 pretty_size = FileUtils.pretty_size(size) diff --git a/xmldumps-backup/dumps/tablesjobs.py b/xmldumps-backup/dumps/tablesjobs.py index a82e01b..e2ca0ce 100644 --- a/xmldumps-backup/dumps/tablesjobs.py +++ b/xmldumps-backup/dumps/tablesjobs.py @@ -9,6 +9,7 @@ from dumps.exceptions import BackupError from dumps.jobs import Dump +from dumps.fileutils import DumpFilename class PublicTable(Dump): @@ -33,11 +34,11 @@ commands = runner.db_server_info.build_sqldump_command(self._table, runner.wiki.config.gzip) if self.private or runner.wiki.is_private(): command_series = runner.get_save_command_series( - commands, self.get_inprogress_name( + commands, DumpFilename.get_inprogress_name( runner.dump_dir.filename_private_path(output_dfname))) else: command_series = runner.get_save_command_series( - commands, self.get_inprogress_name( + commands, DumpFilename.get_inprogress_name( runner.dump_dir.filename_public_path(output_dfname))) return command_series @@ -136,10 +137,12 @@ series = runner.db_server_info.build_sql_command(query, runner.wiki.config.gzip) if runner.wiki.is_private(): return runner.get_save_command_series( - series, self.get_inprogress_name(runner.dump_dir.filename_private_path(out_dfname))) + series, DumpFilename.get_inprogress_name( + 
runner.dump_dir.filename_private_path(out_dfname))) else: return runner.get_save_command_series( series, DumpFilename.get_inprogress_name( runner.dump_dir.filename_public_path(out_dfname))) def save_sql(self, runner, command_series): """Pass some SQL commands to the server for this DB and save output to a gzipped file.""" diff --git a/xmldumps-backup/dumps/xmlcontentjobs.py b/xmldumps-backup/dumps/xmlcontentjobs.py index b0db10d..2e21232 100644 --- a/xmldumps-backup/dumps/xmlcontentjobs.py +++ b/xmldumps-backup/dumps/xmlcontentjobs.py @@ -765,7 +765,7 @@ bz2mode = "dbzip2" else: bz2mode = "bzip2" - return "--output=%s:%s" % (bz2mode, self.get_inprogress_name(xmlbz2_path)) + return "--output=%s:%s" % (bz2mode, DumpFilename.get_inprogress_name(xmlbz2_path)) def build_command(self, runner, stub_dfname, prefetch, output_dfname): """ diff --git a/xmldumps-backup/dumps/xmljobs.py b/xmldumps-backup/dumps/xmljobs.py index 3b6ea86..4419d42 100644 --- a/xmldumps-backup/dumps/xmljobs.py +++ b/xmldumps-backup/dumps/xmljobs.py @@ -134,9 +134,9 @@ config_file_arg = config_file_arg + ":" + runner.wiki.config.override_section command = ["/usr/bin/python", "xmlstubs.py", "--config", config_file_arg, "--wiki", runner.db_name, - "--articles", self.get_inprogress_name(articles_filepath), - "--history", self.get_inprogress_name(history_filepath), - "--current", self.get_inprogress_name(current_filepath)] + "--articles", DumpFilename.get_inprogress_name(articles_filepath), + "--history", DumpFilename.get_inprogress_name(history_filepath), + "--current", DumpFilename.get_inprogress_name(current_filepath)] if output_dfname.partnum: # set up start and end pageids for this piece @@ -227,7 +227,7 @@ config_file_arg = config_file_arg + ":" + runner.wiki.config.override_section command = ["/usr/bin/python", "xmllogs.py", "--config", config_file_arg, "--wiki", runner.db_name, - "--outfile", 
self.get_inprogress_name(logging_path)] + "--outfile", DumpFilename.get_inprogress_name(logging_path)] pipeline = [command] series = [pipeline] @@ -297,10 +297,10 @@ variant = self.get_variant_from_dumpname(dfname.dumpname) variant_option = self._variant_option(variant) if runner.wiki.is_private(): - output_paths.append(self.get_inprogress_name( + output_paths.append(DumpFilename.get_inprogress_name( runner.dump_dir.filename_private_path(dfname))) else: - output_paths.append(self.get_inprogress_name( + output_paths.append(DumpFilename.get_inprogress_name( runner.dump_dir.filename_public_path(dfname))) variants.append(variant_option) -- To view, visit https://gerrit.wikimedia.org/r/400226 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ib4bc9f6a41f31431e5642a3f7f7415bd2de38ea8 Gerrit-PatchSet: 1 Gerrit-Project: operations/dumps Gerrit-Branch: master Gerrit-Owner: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: ArielGlenn <ar...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits