ArielGlenn has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/336849 )
Change subject: Clean up temp files from page content dumps before retry
......................................................................
Clean up temp files from page content dumps before retry
[WIP] not tested yet
Bug:T157704
Change-Id: Ib415fba86b8d026e5b5dd53f9a53b4f2f0639b21
---
M xmldumps-backup/dumps/runner.py
M xmldumps-backup/dumps/xmljobs.py
2 files changed, 48 insertions(+), 3 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps refs/changes/49/336849/1
diff --git a/xmldumps-backup/dumps/runner.py b/xmldumps-backup/dumps/runner.py
index af2d62a..a58aa2f 100644
--- a/xmldumps-backup/dumps/runner.py
+++ b/xmldumps-backup/dumps/runner.py
@@ -466,7 +466,7 @@
for setting in [StatusHtml.NAME, IndexHtml.NAME, Checksummer.NAME,
RunInfoFile.NAME, SymLinks.NAME, RunSettings.NAME,
Feeds.NAME, NoticeFile.NAME, "makedir",
"clean_old_dumps",
- "cleanup_old_files", "check_trunc_files"]:
+ "cleanup_old_files", "check_trunc_files", "cleanup_tmp_files"]:
self.enabled[setting] = True
if not self.cleanup_old_files:
@@ -481,7 +481,7 @@
del self.enabled[setting]
if self.dryrun:
- for setting in ["check_trunc_files"]:
+ for setting in ["check_trunc_files", "cleanup_tmp_files"]:
if setting in self.enabled:
del self.enabled[setting]
if "logging" in self.enabled:
diff --git a/xmldumps-backup/dumps/xmljobs.py b/xmldumps-backup/dumps/xmljobs.py
index 18bfe83..5d0e366 100644
--- a/xmldumps-backup/dumps/xmljobs.py
+++ b/xmldumps-backup/dumps/xmljobs.py
@@ -12,6 +12,7 @@
from dumps.fileutils import DumpFile, DumpFilename
from dumps.utils import MultiVersion, MiscUtils
from dumps.jobs import Dump
+from dumps.WikiDump import Locker
def batcher(items, batchsize):
@@ -384,9 +385,49 @@
else:
return False
+ def cleanup_tmp_files(self, dump_dir, runner):
+ """
+ with checkpoint files turned on, this job writes output
+ to <something>.xml<-maybemorestuff>.bz2-tmp
+ and if those files are lying around after such a job dies,
+ we should clean them up
+ """
+ if not "cleanup_tmp_files" in runner.enabled:
+ return
+
+ # if we don't have the lock it's possible some
+ # other process is writing tmp files, don't touch
+ locker = Locker(self.wiki, self.wiki.date)
+ lockfiles = locker.is_locked()
+ if not lockfiles:
+ return
+ if len(lockfiles) > 1:
+ # more than one process with the lock? should not
+ # be possible, but if it is... touch nothing!
+ return
+ if not locker.check_owner(lockfiles[0], str(os.getpid())):
+ return
+
+ to_delete = self.get_tmp_files(dump_dir)
+ for finfo in to_delete:
+ if exists(dump_dir.filename_public_path(finfo)):
+ os.remove(dump_dir.filename_public_path(finfo))
+ elif exists(dump_dir.filename_private_path(finfo)):
+ os.remove(dump_dir.filename_private_path(finfo))
+
+
def run(self, runner):
- # here we will either clean up or not depending on how we were called FIXME
+ # here we will either clean up or not depending on how we were called
+ # FIXME callers should set this appropriately and they don't right now
self.cleanup_old_files(runner.dump_dir, runner)
+
+ # clean up all tmp output files from previous attempts of this job
+ # for this dump wiki and date, otherwise we'll wind up indexing
+ # them and hashsumming them etc.
+ # they may have been left around from an interrupted or failed earlier
+ # run
+ self.cleanup_tmp_files(runner.dump_dir, runner)
+
commands = []
todo = []
@@ -861,6 +902,10 @@
runner.debug("Could not locate a prefetchable dump.")
return None
+ def get_tmp_files(self, dump_dir, dump_names=None):
+ files = Dump.list_outfiles_for_cleanup(self, dump_dir, dump_names)
+ return [fileinfo for fileinfo in files if fileinfo.is_temp_file]
+
def list_outfiles_for_cleanup(self, dump_dir, dump_names=None):
files = Dump.list_outfiles_for_cleanup(self, dump_dir, dump_names)
files_to_return = []
--
To view, visit https://gerrit.wikimedia.org/r/336849
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib415fba86b8d026e5b5dd53f9a53b4f2f0639b21
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits