ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/336849 )

Change subject: Clean up temp files from page content dumps before retry
......................................................................


Clean up temp files from page content dumps before retry

Bug:T157704
Change-Id: Ib415fba86b8d026e5b5dd53f9a53b4f2f0639b21
---
M xmldumps-backup/dumps/runner.py
M xmldumps-backup/dumps/xmljobs.py
2 files changed, 47 insertions(+), 3 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/xmldumps-backup/dumps/runner.py b/xmldumps-backup/dumps/runner.py
index af2d62a..a58aa2f 100644
--- a/xmldumps-backup/dumps/runner.py
+++ b/xmldumps-backup/dumps/runner.py
@@ -466,7 +466,7 @@
         for setting in [StatusHtml.NAME, IndexHtml.NAME, Checksummer.NAME,
                         RunInfoFile.NAME, SymLinks.NAME, RunSettings.NAME,
                         Feeds.NAME, NoticeFile.NAME, "makedir", "clean_old_dumps",
-                        "cleanup_old_files", "check_trunc_files"]:
+                        "cleanup_old_files", "check_trunc_files", "cleanup_tmp_files"]:
             self.enabled[setting] = True
 
         if not self.cleanup_old_files:
@@ -481,7 +481,7 @@
                     del self.enabled[setting]
 
         if self.dryrun:
-            for setting in ["check_trunc_files"]:
+            for setting in ["check_trunc_files", "cleanup_tmp_files"]:
                 if setting in self.enabled:
                     del self.enabled[setting]
             if "logging" in self.enabled:
diff --git a/xmldumps-backup/dumps/xmljobs.py b/xmldumps-backup/dumps/xmljobs.py
index 18bfe83..2254fdc 100644
--- a/xmldumps-backup/dumps/xmljobs.py
+++ b/xmldumps-backup/dumps/xmljobs.py
@@ -12,6 +12,7 @@
 from dumps.fileutils import DumpFile, DumpFilename
 from dumps.utils import MultiVersion, MiscUtils
 from dumps.jobs import Dump
+from dumps.WikiDump import Locker
 
 
 def batcher(items, batchsize):
@@ -384,9 +385,48 @@
         else:
             return False
 
+    def cleanup_tmp_files(self, dump_dir, runner):
+        """
+        with checkpoint files turned on, this job writes output
+        to <something>.xml<-maybemorestuff>.bz2-tmp
+        and if those files are lying around after such a job dies,
+        we should clean them up
+        """
+        if "cleanup_tmp_files" not in runner.enabled:
+            return
+
+        # if we don't have the lock it's possible some
+        # other process is writing tmp files, don't touch
+        locker = Locker(self.wiki, self.wiki.date)
+        lockfiles = locker.is_locked()
+        if not lockfiles:
+            return
+        if len(lockfiles) > 1:
+            # more than one process with the lock? should not
+            # be possible, but if it is... touch nothing!
+            return
+        if not locker.check_owner(lockfiles[0], str(os.getpid())):
+            return
+
+        to_delete = self.get_tmp_files(dump_dir)
+        for finfo in to_delete:
+            if exists(dump_dir.filename_public_path(finfo)):
+                os.remove(dump_dir.filename_public_path(finfo))
+            elif exists(dump_dir.filename_private_path(finfo)):
+                os.remove(dump_dir.filename_private_path(finfo))
+
     def run(self, runner):
-        # here we will either clean up or not depending on how we were called FIXME
+        # here we will either clean up or not depending on how we were called
+        # FIXME callers should set this appropriately and they don't right now
         self.cleanup_old_files(runner.dump_dir, runner)
+
+        # clean up all tmp output files from previous attempts of this job
+        # for this dump wiki and date, otherwise we'll wind up indexing
+        # them and hashsumming them etc.
+        # they may have been left around from an interrupted or failed earlier
+        # run
+        self.cleanup_tmp_files(runner.dump_dir, runner)
+
         commands = []
 
         todo = []
@@ -861,6 +901,10 @@
         runner.debug("Could not locate a prefetchable dump.")
         return None
 
+    def get_tmp_files(self, dump_dir, dump_names=None):
+        files = Dump.list_outfiles_for_cleanup(self, dump_dir, dump_names)
+        return [fileinfo for fileinfo in files if fileinfo.is_temp_file]
+
     def list_outfiles_for_cleanup(self, dump_dir, dump_names=None):
         files = Dump.list_outfiles_for_cleanup(self, dump_dir, dump_names)
         files_to_return = []

-- 
To view, visit https://gerrit.wikimedia.org/r/336849
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ib415fba86b8d026e5b5dd53f9a53b4f2f0639b21
Gerrit-PatchSet: 3
Gerrit-Project: operations/dumps
Gerrit-Branch: master
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to