ArielGlenn has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/400226 )
Change subject: make reporting of file sizes for dump steps in progress work
again
......................................................................
make reporting of file sizes for dump steps in progress work again
This was probably broken when we introduced writing output files with
a special in-progress extension (".inprog") and then moving them into
place once the run completes successfully.
Bug: T183694
Change-Id: Ib4bc9f6a41f31431e5642a3f7f7415bd2de38ea8
---
M xmldumps-backup/dumps/apijobs.py
M xmldumps-backup/dumps/fileutils.py
M xmldumps-backup/dumps/flowjob.py
M xmldumps-backup/dumps/jobs.py
M xmldumps-backup/dumps/recombinejobs.py
M xmldumps-backup/dumps/recompressjobs.py
M xmldumps-backup/dumps/runnerutils.py
M xmldumps-backup/dumps/tablesjobs.py
M xmldumps-backup/dumps/xmlcontentjobs.py
M xmldumps-backup/dumps/xmljobs.py
10 files changed, 51 insertions(+), 33 deletions(-)
Approvals:
ArielGlenn: Looks good to me, approved
jenkins-bot: Verified
diff --git a/xmldumps-backup/dumps/apijobs.py b/xmldumps-backup/dumps/apijobs.py
index 3991add..534ca99 100644
--- a/xmldumps-backup/dumps/apijobs.py
+++ b/xmldumps-backup/dumps/apijobs.py
@@ -1,5 +1,6 @@
import time
from dumps.exceptions import BackupError
+from dumps.fileutils import DumpFilename
from dumps.jobs import Dump
@@ -30,11 +31,11 @@
commands = self.build_command(runner)
if runner.wiki.is_private():
command_series = runner.get_save_command_series(
- commands, self.get_inprogress_name(
+ commands, DumpFilename.get_inprogress_name(
runner.dump_dir.filename_private_path(output_dfname)))
else:
command_series = runner.get_save_command_series(
- commands, self.get_inprogress_name(
+ commands, DumpFilename.get_inprogress_name(
runner.dump_dir.filename_public_path(output_dfname)))
self.setup_command_info(runner, command_series, [output_dfname])
diff --git a/xmldumps-backup/dumps/fileutils.py
b/xmldumps-backup/dumps/fileutils.py
index a264ece..e6b4b67 100644
--- a/xmldumps-backup/dumps/fileutils.py
+++ b/xmldumps-backup/dumps/fileutils.py
@@ -165,6 +165,8 @@
partnum_int part number as int
"""
+ INPROG = ".inprog" # extension for dump output files that are in progress (not fully written)
+
@staticmethod
def make_checkpoint_string(first_page_id, last_page_id):
if first_page_id is not None and last_page_id is not None:
@@ -172,6 +174,10 @@
else:
return None
+ @staticmethod
+ def get_inprogress_name(filename):
+ return filename + DumpFilename.INPROG
+
def __init__(self, wiki, date=None, dump_name=None, filetype=None,
ext=None, partnum=None, checkpoint=None, temp=False):
"""Constructor. Arguments: the dump name as it should appear in the
filename,
diff --git a/xmldumps-backup/dumps/flowjob.py b/xmldumps-backup/dumps/flowjob.py
index b875101..f1de495 100644
--- a/xmldumps-backup/dumps/flowjob.py
+++ b/xmldumps-backup/dumps/flowjob.py
@@ -5,6 +5,7 @@
import os
from dumps.exceptions import BackupError
from dumps.utils import MultiVersion
+from dumps.fileutils import DumpFilename
from dumps.jobs import Dump
@@ -45,7 +46,7 @@
command.extend(script_command)
command.extend(["--wiki=%s" % runner.db_name,
"--current", "--report=1000",
- "--output=bzip2:%s" %
self.get_inprogress_name(flow_output_fpath)])
+ "--output=bzip2:%s" %
DumpFilename.get_inprogress_name(flow_output_fpath)])
if self.history:
command.append("--full")
pipeline = [command]
diff --git a/xmldumps-backup/dumps/jobs.py b/xmldumps-backup/dumps/jobs.py
index 8ebbbf1..256056b 100644
--- a/xmldumps-backup/dumps/jobs.py
+++ b/xmldumps-backup/dumps/jobs.py
@@ -54,8 +54,6 @@
class Dump(object):
- INPROG = ".inprog" # extension for dump output files that are in progress (not fully written)
-
def __init__(self, name, desc, verbose=False):
self._desc = desc
self.verbose = verbose
@@ -84,14 +82,12 @@
if not hasattr(self, '_parts'):
self._parts = False
- def get_inprogress_name(self, filename):
- return filename + self.INPROG
-
def setup_command_info(self, runner, command_series, output_dfnames,
output_dir=None):
command_info = {}
command_info['runner'] = runner
command_info['series'] = command_series
- command_info['output_files'] = [dfname.filename + self.INPROG for
dfname in output_dfnames]
+ command_info['output_files'] = [dfname.filename + DumpFilename.INPROG
+ for dfname in output_dfnames]
if output_dir is not None:
command_info['output_dir'] = output_dir
else:
@@ -239,12 +235,12 @@
file_truncated = True
if runner.wiki.is_private():
dcontents = DumpContents(runner.wiki,
- self.get_inprogress_name(
+ DumpFilename.get_inprogress_name(
runner.dump_dir.filename_private_path(dfname)),
dfname)
else:
dcontents = DumpContents(runner.wiki,
- self.get_inprogress_name(
+ DumpFilename.get_inprogress_name(
runner.dump_dir.filename_public_path(dfname)),
dfname)
if exists(dcontents.filename):
@@ -319,10 +315,11 @@
if not commands['output_files']:
return
for inprogress_filename in commands['output_files']:
- if not inprogress_filename.endswith(self.INPROG):
+ if not inprogress_filename.endswith(DumpFilename.INPROG):
continue
final_dfname = DumpFilename(commands['runner'].wiki)
- final_dfname.new_from_filename(inprogress_filename[:-1 *
len(self.INPROG)])
+ final_dfname.new_from_filename(
+ inprogress_filename[:-1 * len(DumpFilename.INPROG)])
in_progress_path = os.path.join(commands['output_dir'],
inprogress_filename)
final_path = os.path.join(commands['output_dir'],
final_dfname.filename)
@@ -347,10 +344,10 @@
os.remove(dump_dir.filename_public_path(dfname))
elif exists(dump_dir.filename_private_path(dfname)):
os.remove(dump_dir.filename_private_path(dfname))
- if exists(dump_dir.filename_public_path(dfname) + self.INPROG):
- os.remove(dump_dir.filename_public_path(dfname) + self.INPROG)
- elif exists(dump_dir.filename_private_path(dfname) + self.INPROG):
- os.remove(dump_dir.filename_private_path(dfname) + self.INPROG)
+ if exists(dump_dir.filename_public_path(dfname) + DumpFilename.INPROG):
+ os.remove(dump_dir.filename_public_path(dfname) +
DumpFilename.INPROG)
+ elif exists(dump_dir.filename_private_path(dfname) +
DumpFilename.INPROG):
+ os.remove(dump_dir.filename_private_path(dfname) +
DumpFilename.INPROG)
def cleanup_old_files(self, dump_dir, runner):
if "cleanup_old_files" in runner.enabled:
diff --git a/xmldumps-backup/dumps/recombinejobs.py
b/xmldumps-backup/dumps/recombinejobs.py
index 12c49e5..6d1f0a1 100644
--- a/xmldumps-backup/dumps/recombinejobs.py
+++ b/xmldumps-backup/dumps/recombinejobs.py
@@ -7,6 +7,7 @@
import signal
from dumps.exceptions import BackupError
from dumps.jobs import Dump
+from dumps.fileutils import DumpFilename
from dumps.CommandManagement import CommandPipeline
@@ -74,7 +75,7 @@
recombines.append(recombine)
recombine_command_string = ("(" + ";".join(recombines) + ")" + "|" +
"%s %s" % (compression_command,
-
self.get_inprogress_name(output_filename)))
+
DumpFilename.get_inprogress_name(output_filename)))
return recombine_command_string
diff --git a/xmldumps-backup/dumps/recompressjobs.py
b/xmldumps-backup/dumps/recompressjobs.py
index 90c22c8..49ca8aa 100644
--- a/xmldumps-backup/dumps/recompressjobs.py
+++ b/xmldumps-backup/dumps/recompressjobs.py
@@ -110,8 +110,8 @@
infilepath = runner.dump_dir.filename_public_path(input_dfname)
command_pipe = [["%s -dc %s | %s --pagesperstream 100 --buildindex %s
> %s" %
(self.wiki.config.bzip2, infilepath,
self.wiki.config.recompressxml,
- self.get_inprogress_name(outfilepath_index),
- self.get_inprogress_name(outfilepath))]]
+ DumpFilename.get_inprogress_name(outfilepath_index),
+ DumpFilename.get_inprogress_name(outfilepath))]]
return [command_pipe]
def run(self, runner):
@@ -350,7 +350,7 @@
command_pipe = [["%s -dc %s | %s a -mx=4 -si %s" %
(self.wiki.config.bzip2, infilepath,
self.wiki.config.sevenzip,
- self.get_inprogress_name(outfilepath))]]
+ DumpFilename.get_inprogress_name(outfilepath))]]
command_series.append(command_pipe)
return command_series
diff --git a/xmldumps-backup/dumps/runnerutils.py
b/xmldumps-backup/dumps/runnerutils.py
index 4688900..10b0295 100644
--- a/xmldumps-backup/dumps/runnerutils.py
+++ b/xmldumps-backup/dumps/runnerutils.py
@@ -272,9 +272,18 @@
status ("in-progress", "missing", ...)
"""
filename = dump_dir.filename_public_path(dfname)
+ size = None
if exists(filename):
size = os.path.getsize(filename)
- else:
+ elif item_status == "in-progress":
+ # note that because multiple files may be produced for a single
dump
+ # job, some may be complete while others are still in progress.
+ # therefore we check the normal name first, falling back to the
+ # inprogress name.
+ filename = filename + DumpFilename.INPROG
+ if exists(filename):
+ size = os.path.getsize(filename)
+ if size is None:
item_status = "missing"
size = 0
pretty_size = FileUtils.pretty_size(size)
diff --git a/xmldumps-backup/dumps/tablesjobs.py
b/xmldumps-backup/dumps/tablesjobs.py
index a82e01b..e2ca0ce 100644
--- a/xmldumps-backup/dumps/tablesjobs.py
+++ b/xmldumps-backup/dumps/tablesjobs.py
@@ -9,6 +9,7 @@
from dumps.exceptions import BackupError
from dumps.jobs import Dump
+from dumps.fileutils import DumpFilename
class PublicTable(Dump):
@@ -33,11 +34,11 @@
commands = runner.db_server_info.build_sqldump_command(self._table,
runner.wiki.config.gzip)
if self.private or runner.wiki.is_private():
command_series = runner.get_save_command_series(
- commands, self.get_inprogress_name(
+ commands, DumpFilename.get_inprogress_name(
runner.dump_dir.filename_private_path(output_dfname)))
else:
command_series = runner.get_save_command_series(
- commands, self.get_inprogress_name(
+ commands, DumpFilename.get_inprogress_name(
runner.dump_dir.filename_public_path(output_dfname)))
return command_series
@@ -136,10 +137,12 @@
series = runner.db_server_info.build_sql_command(query,
runner.wiki.config.gzip)
if runner.wiki.is_private():
return runner.get_save_command_series(
- series,
self.get_inprogress_name(runner.dump_dir.filename_private_path(out_dfname)))
+ series, DumpFilename.get_inprogress_name(
+ runner.dump_dir.filename_private_path(out_dfname)))
else:
return runner.get_save_command_series(
- series,
self.get_inprogress_name(runner.dump_dir.filename_public_path(out_dfname)))
+ series, DumpFilename.get_inprogress_name(
+ runner.dump_dir.filename_public_path(out_dfname)))
def save_sql(self, runner, command_series):
"""Pass some SQL commands to the server for this DB and save output to
a gzipped file."""
diff --git a/xmldumps-backup/dumps/xmlcontentjobs.py
b/xmldumps-backup/dumps/xmlcontentjobs.py
index b0db10d..2e21232 100644
--- a/xmldumps-backup/dumps/xmlcontentjobs.py
+++ b/xmldumps-backup/dumps/xmlcontentjobs.py
@@ -765,7 +765,7 @@
bz2mode = "dbzip2"
else:
bz2mode = "bzip2"
- return "--output=%s:%s" % (bz2mode,
self.get_inprogress_name(xmlbz2_path))
+ return "--output=%s:%s" % (bz2mode,
DumpFilename.get_inprogress_name(xmlbz2_path))
def build_command(self, runner, stub_dfname, prefetch, output_dfname):
"""
diff --git a/xmldumps-backup/dumps/xmljobs.py b/xmldumps-backup/dumps/xmljobs.py
index 3b6ea86..4419d42 100644
--- a/xmldumps-backup/dumps/xmljobs.py
+++ b/xmldumps-backup/dumps/xmljobs.py
@@ -134,9 +134,9 @@
config_file_arg = config_file_arg + ":" +
runner.wiki.config.override_section
command = ["/usr/bin/python", "xmlstubs.py", "--config",
config_file_arg,
"--wiki", runner.db_name,
- "--articles", self.get_inprogress_name(articles_filepath),
- "--history", self.get_inprogress_name(history_filepath),
- "--current", self.get_inprogress_name(current_filepath)]
+ "--articles",
DumpFilename.get_inprogress_name(articles_filepath),
+ "--history",
DumpFilename.get_inprogress_name(history_filepath),
+ "--current",
DumpFilename.get_inprogress_name(current_filepath)]
if output_dfname.partnum:
# set up start and end pageids for this piece
@@ -227,7 +227,7 @@
config_file_arg = config_file_arg + ":" +
runner.wiki.config.override_section
command = ["/usr/bin/python", "xmllogs.py", "--config",
config_file_arg, "--wiki", runner.db_name,
- "--outfile", self.get_inprogress_name(logging_path)]
+ "--outfile", DumpFilename.get_inprogress_name(logging_path)]
pipeline = [command]
series = [pipeline]
@@ -297,10 +297,10 @@
variant = self.get_variant_from_dumpname(dfname.dumpname)
variant_option = self._variant_option(variant)
if runner.wiki.is_private():
- output_paths.append(self.get_inprogress_name(
+ output_paths.append(DumpFilename.get_inprogress_name(
runner.dump_dir.filename_private_path(dfname)))
else:
- output_paths.append(self.get_inprogress_name(
+ output_paths.append(DumpFilename.get_inprogress_name(
runner.dump_dir.filename_public_path(dfname)))
variants.append(variant_option)
--
To view, visit https://gerrit.wikimedia.org/r/400226
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ib4bc9f6a41f31431e5642a3f7f7415bd2de38ea8
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits