ArielGlenn has submitted this change and it was merged.

Change subject: dumps: provide sha1 checksums of all files along with the old md5s
......................................................................


dumps: provide sha1 checksums of all files along with the old md5s

(T101985) If this turns out to be too slow, disabling it is a matter
of changing one line in the code. We could even turn that into
a configuration option.

Change-Id: Ibe83b7248152d815d8743dd794788e7c6a803c91
---
M xmldumps-backup/dumps/fileutils.py
M xmldumps-backup/dumps/jobs.py
M xmldumps-backup/dumps/runnerutils.py
M xmldumps-backup/worker.py
4 files changed, 75 insertions(+), 40 deletions(-)

Approvals:
  ArielGlenn: Verified; Looks good to me, approved
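
The "one line" the commit message mentions is the hashtypes list added in
runnerutils.py below; removing 'sha1' from it turns the extra checksumming
off. For comparison, here is a minimal standalone sketch (hypothetical names,
not code from this change) of the same hashlib pattern, computing both
digests in a single read, whereas the merged change reads each file once per
hash type:

    import hashlib

    HASHTYPES = ['md5', 'sha1']   # dropping 'sha1' here would disable it

    def checksums_for(path, htypes=HASHTYPES):
        # one hashlib object per requested hash type, all fed from one pass
        summers = dict((htype, hashlib.new(htype)) for htype in htypes)
        infile = open(path, "rb")
        buff = infile.read(4192 * 32)   # same buffer size the dump code uses
        while buff:
            for summer in summers.values():
                summer.update(buff)
            buff = infile.read(4192 * 32)
        infile.close()
        return dict((h, s.hexdigest()) for h, s in summers.items())

A single pass would halve the I/O for large dump files, at the cost of
restructuring the per-type checksum listings that the change appends to
independently.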



diff --git a/xmldumps-backup/dumps/fileutils.py b/xmldumps-backup/dumps/fileutils.py
index 10a9064..f3094f5 100644
--- a/xmldumps-backup/dumps/fileutils.py
+++ b/xmldumps-backup/dumps/fileutils.py
@@ -186,6 +186,8 @@
     Methods:
 
     md5sum(): return md5sum of the file contents.
+    sha1sum(): return sha1sum of the file contents.
+    checksum(htype): return checksum of the specified type, of the file contents.
     check_if_truncated(): for compressed files, check if the file is truncated (stops
        abruptly before the end of the compressed data) or not, and set and return
          self.is_truncated accordingly.  This is fast for bzip2 files
@@ -220,10 +222,9 @@
             self.file_obj = DumpFilename(wiki)
             self.file_obj.new_from_filename(os.path.basename(filename))
 
-    def md5sum(self):
+    def _checksum(self, summer):
         if not self.filename:
             return None
-        summer = hashlib.md5()
         infile = file(self.filename, "rb")
         bufsize = 4192 * 32
         buffer = infile.read(bufsize)
@@ -233,6 +234,22 @@
         infile.close()
         return summer.hexdigest()
 
+    def md5sum(self):
+        summer = hashlib.md5()
+        return self._checksum(summer)
+
+    def sha1sum(self):
+        summer = hashlib.sha1()
+        return self._checksum(summer)
+
+    def checksum(self, htype):
+        if htype == "md5":
+            return self.md5sum()
+        elif htype == "sha1":
+            return self.sha1sum()
+        else:
+            return None
+
     def get_first_500_lines(self):
         if self.first_lines:
             return self.first_lines
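
With the refactor above, callers can ask one entry point for either digest.
A hypothetical usage sketch follows; the import path, the wiki object, and
the file path are assumptions for illustration, not taken from this change:

    from dumps.fileutils import DumpFile   # assumed module path

    # wiki is constructed elsewhere in the run; the path is illustrative
    dfile = DumpFile(wiki,
                     "/dumps/public/somewiki/20150901/somewiki-20150901-pages-articles.xml.bz2",
                     None, False)
    print dfile.checksum("md5")    # hex digest, as md5sum(1) prints it
    print dfile.checksum("sha1")   # hex digest, as sha1sum(1) prints it
    print dfile.checksum("crc32")  # unsupported type: returns None
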
diff --git a/xmldumps-backup/dumps/jobs.py b/xmldumps-backup/dumps/jobs.py
index bee577c..77b444a 100644
--- a/xmldumps-backup/dumps/jobs.py
+++ b/xmldumps-backup/dumps/jobs.py
@@ -506,7 +506,7 @@
 # these routines are all used for listing output files for various purposes...
 #
 #
-    # Used for updating md5 lists, index.html
+    # Used for updating md5/sha1 lists, index.html
     # Includes: checkpoints, chunks, chunkless, temp files if they exist. At end of run temp files must be gone.
     # This is *all* output files for the dumpname, regardless of what's being re-run.
     def list_outfiles_to_publish(self, dump_dir, dump_names=None):
diff --git a/xmldumps-backup/dumps/runnerutils.py b/xmldumps-backup/dumps/runnerutils.py
index d0705bb..8bb8a57 100644
--- a/xmldumps-backup/dumps/runnerutils.py
+++ b/xmldumps-backup/dumps/runnerutils.py
@@ -42,52 +42,61 @@
         self.verbose = verbose
         self.timestamp = time.strftime("%Y%m%d%H%M%S", time.gmtime())
         self._enabled = enabled
+        self.hashtypes = ['md5', 'sha1']
 
     def prepare_checksums(self):
         """Create a temporary md5 checksum file.
         Call this at the start of the dump run, and move the file
         into the final location at the completion of the dump run."""
         if self._enabled:
-            checksum_filename = self._get_checksum_filename_tmp()
-            output = file(checksum_filename, "w")
+            for htype in self.hashtypes:
+                checksum_filename = self._get_checksum_filename_tmp(htype)
+                output = file(checksum_filename, "w")
+                output.close()
 
-    def checksum(self, file_obj, runner):
+    def checksums(self, file_obj, runner):
         """Run checksum for an output file, and append to the list."""
         if self._enabled:
-            checksum_filename = self._get_checksum_filename_tmp()
-            output = file(checksum_filename, "a")
-            runner.debug("Checksumming %s" % file_obj.filename)
-            dumpfile = DumpFile(self.wiki, runner.dump_dir.filename_public_path(file_obj), None, self.verbose)
-            checksum = dumpfile.md5sum()
-            if checksum != None:
-                output.write("%s  %s\n" % (checksum, file_obj.filename))
-            output.close()
+            for htype in self.hashtypes:
+                checksum_filename = self._get_checksum_filename_tmp(htype)
+                output = file(checksum_filename, "a")
+                runner.debug("Checksumming %s via %s" % (file_obj.filename, htype))
+                dumpfile = DumpFile(self.wiki, runner.dump_dir.filename_public_path(file_obj), None, self.verbose)
+                checksum = dumpfile.checksum(htype)
+                if checksum is not None:
+                    output.write("%s  %s\n" % (checksum, file_obj.filename))
+                output.close()
 
-    def move_md5file_into_place(self):
+    def move_chksumfiles_into_place(self):
         if self._enabled:
-            tmp_filename = self._get_checksum_filename_tmp()
-            real_filename = self._get_checksum_filename()
-            os.rename(tmp_filename, real_filename)
+            for htype in self.hashtypes:
+                tmp_filename = self._get_checksum_filename_tmp(htype)
+                real_filename = self._get_checksum_filename(htype)
+                os.rename(tmp_filename, real_filename)
 
-    def cp_md5_tmpfile_to_permfile(self):
+    def cp_chksum_tmpfiles_to_permfile(self):
         if self._enabled:
-            tmp_filename = self._get_checksum_filename_tmp()
-            real_filename = self._get_checksum_filename()
-            text = FileUtils.readFile(tmp_filename)
-            FileUtils.writeFile(self.wiki.config.tempDir, real_filename, text, self.wiki.config.fileperms)
+            for htype in self.hashtypes:
+                tmp_filename = self._get_checksum_filename_tmp(htype)
+                real_filename = self._get_checksum_filename(htype)
+                text = FileUtils.readFile(tmp_filename)
+                FileUtils.writeFile(self.wiki.config.tempDir, real_filename, text, self.wiki.config.fileperms)
 
-    def get_checksum_filename_basename(self):
-        return "md5sums.txt"
-
+    def get_checksum_filename_basename(self, htype):
+        if htype == "md5":
+            return "md5sums.txt"
+        elif htype == "sha1":
+            return "sha1sums.txt"
+        else:
+            return None
     #
     # functions internal to the class
     #
-    def _get_checksum_filename(self):
-        file_obj = DumpFilename(self.wiki, None, self.get_checksum_filename_basename())
+    def _get_checksum_filename(self, htype):
+        file_obj = DumpFilename(self.wiki, None, self.get_checksum_filename_basename(htype))
         return self.dump_dir.filename_public_path(file_obj)
 
-    def _get_checksum_filename_tmp(self):
-        file_obj = DumpFilename(self.wiki, None, self.get_checksum_filename_basename() + "." + self.timestamp + ".tmp")
+    def _get_checksum_filename_tmp(self, htype):
+        file_obj = DumpFilename(self.wiki, None, self.get_checksum_filename_basename(htype) + "." + self.timestamp + ".tmp")
         return self.dump_dir.filename_public_path(file_obj)
 
     def _getmd5file_dir_name(self):
@@ -175,13 +184,21 @@
         else:
             return html
 
+    def get_checksum_html(self, htype):
+        basename = self.checksums.get_checksum_filename_basename(htype)
+        path = DumpFilename(self.wiki, None, basename)
+        web_path = self.dump_dir.web_path_relative(path)
+        return '<a href="%s">(%s)</a>' % (web_path, htype)
+
     def _report_database_status_detailed(self, done=False):
         """Put together a status page for this database, with all its 
component dumps."""
         self.notice_file.refresh_notice()
         status_items = [self._report_item(item) for item in self.items]
         status_items.reverse()
         html = "\n".join(status_items)
-        fname = DumpFilename(self.wiki, None, self.checksums.get_checksum_filename_basename())
+        checksums = [self.get_checksum_html(htype)
+            for htype in self.checksums.hashtypes]
+        checksums_html = ", ".join(checksums)
         return self.wiki.config.readTemplate("report.html") % {
             "db": self.db_name,
             "date": self.wiki.date,
@@ -189,7 +206,7 @@
             "status": self._report_status_summary_line(done),
             "previous": self._report_previous_dump(done),
             "items": html,
-            "checksum": self.dump_dir.web_path_relative(fname),
+            "checksum": checksums_html,
             "index": self.wiki.config.index}
 
     def _report_previous_dump(self, done):
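
Each per-type listing written above uses the coreutils format, one
"<hexdigest>  <filename>" line per output file (two spaces between the
fields), so the published md5sums.txt and sha1sums.txt can be fed directly
to "md5sum -c" or "sha1sum -c". A small sketch of reading one back, as a
hypothetical helper rather than code from this change:

    def read_checksum_file(path):
        # map filename -> hexdigest from a "<digest>  <filename>" listing
        digests = {}
        for line in open(path):
            hexdigest, _, filename = line.rstrip("\n").partition("  ")
            digests[filename] = hexdigest
        return digests

The report page then links both listings via get_checksum_html(), joined
with ", " in place of the old single md5sums link.
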
diff --git a/xmldumps-backup/worker.py b/xmldumps-backup/worker.py
index 641bd1a..62c0e2e 100644
--- a/xmldumps-backup/worker.py
+++ b/xmldumps-backup/worker.py
@@ -275,7 +275,7 @@
         if job == "noop" or job == "latestlinks" or job == "createdirs":
             return True
         sys.stderr.write("No job of the name specified exists. Choose one of 
the following:\n")
-        sys.stderr.write("noop (runs no job but rewrites md5sums file and 
resets latest links)\n")
+        sys.stderr.write("noop (runs no job but rewrites checksums files and 
resets latest links)\n")
         sys.stderr.write("latestlinks (runs no job but resets latest links)\n")
         sys.stderr.write("createdirs (runs no job but creates dump dirs for 
the given date)\n")
         sys.stderr.write("tables (includes all items below that end in 
'table')\n")
@@ -546,7 +546,7 @@
                 # were for earlier ones
                 self.sym_links.save_symlink(file_obj)
                 self.feeds.save_feed(file_obj)
-                self.checksums.checksum(file_obj, self)
+                self.checksums.checksums(file_obj, self)
                 self.sym_links.cleanup_symlinks()
                 self.feeds.cleanup_feeds()
 
@@ -616,10 +616,10 @@
                             item.set_status("failed")
 
             if item.status() == "done":
-                self.checksums.cp_md5_tmpfile_to_permfile()
+                self.checksums.cp_chksum_tmpfiles_to_permfile()
                 self.run_update_item_fileinfo(item)
             elif item.status() == "waiting" or item.status() == "skipped":
-                # don't update the md5 file for this item.
+                # don't update the checksum files for this item.
                 continue
             else:
                 # Here for example status is "failed". But maybe also
@@ -710,12 +710,13 @@
 
     def complete_dump(self):
         # note that it's possible for links in "latest" to point to
-        # files from different runs, in which case the md5sums file
+        # files from different runs, in which case the checksum files
         # will have accurate checksums for the run for which it was
         # produced, but not the other files. FIXME
-        self.checksums.move_md5file_into_place()
-        dumpfile = DumpFilename(self.wiki, None, self.checksums.get_checksum_filename_basename())
-        self.sym_links.save_symlink(dumpfile)
+        self.checksums.move_chksumfiles_into_place()
+        for htype in self.checksums.hashtypes:
+            dumpfile = DumpFilename(self.wiki, None, self.checksums.get_checksum_filename_basename(htype))
+            self.sym_links.save_symlink(dumpfile)
         self.sym_links.cleanup_symlinks()
 
         for item in self.dump_item_list.dump_items:
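
Since complete_dump() now moves both listings into place and symlinks each of
them, a downstream consumer can verify a fetched dump file against either
digest. A hedged sketch of such a check; the function name and buffer size
are illustrative, not part of this change:

    import hashlib

    def verify(dump_path, expected_hexdigest, htype="sha1"):
        # recompute the digest of a downloaded file and compare
        summer = hashlib.new(htype)
        infile = open(dump_path, "rb")
        buff = infile.read(4192 * 32)
        while buff:
            summer.update(buff)
            buff = infile.read(4192 * 32)
        infile.close()
        return summer.hexdigest() == expected_hexdigest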

-- 
To view, visit https://gerrit.wikimedia.org/r/242626
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ibe83b7248152d815d8743dd794788e7c6a803c91
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
