ArielGlenn has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/386162 )
Change subject: one-off scripts for fixing up multistream dump mess
......................................................................

one-off scripts for fixing up multistream dump mess

Just in case we ever need them again, hopefully not, here they are.

Change-Id: Iadf6d9d3ab8fc39a89836f08d50fb98f7f12d088
---
A fixups/fixup_hashfiles.py
A fixups/fixup_html.py
A fixups/fixup_recompress_moves.py
A fixups/fixup_report_json.py
4 files changed, 514 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps refs/changes/62/386162/1

diff --git a/fixups/fixup_hashfiles.py b/fixups/fixup_hashfiles.py
new file mode 100644
index 0000000..cf036fb
--- /dev/null
+++ b/fixups/fixup_hashfiles.py
@@ -0,0 +1,170 @@
+import os
+import sys
+import hashlib
+import json
+
+
+def read_wikis(filepath):
+    "read list of wikis, one per line, from file, return the list"
+    fhandle = open(filepath, "r")
+    text = fhandle.read()
+    fhandle.close()
+    return text.splitlines()
+
+
+def checksum(filename, htype):
+    "return hash of specified file in string format, using specified hash type"
+    if htype == 'md5':
+        summer = hashlib.md5()
+    else:
+        summer = hashlib.sha1()
+    infhandle = open(filename, "rb")
+    bufsize = 4192 * 32
+    fbuffer = infhandle.read(bufsize)
+    while fbuffer:
+        summer.update(fbuffer)
+        fbuffer = infhandle.read(bufsize)
+    infhandle.close()
+    return summer.hexdigest()
+
+
+def update_hashes_text(hashed_paths, output_file, hash_strings, dryrun):
+    """
+    we expect the file to contain all the existing hashes;
+    we will append to it
+    """
+    if not os.path.exists(output_file):
+        # no file with old hashes. something's wrong, skip.
+        return
+
+    with open(output_file, "r") as fhandle:
+        content = fhandle.read()
+    new_file = output_file + ".new"
+
+    if not dryrun:
+        output_handle = open(new_file, "wt")
+        output_handle.write(content)
+
+    for idx in range(0, len(hashed_paths)):
+        if hashed_paths[idx] in content:
+            # info already present in hash file. skip.
+            continue
+
+        if dryrun:
+            print "would append: '{hsum} {path}' to".format(
+                hsum=hash_strings[idx], path=hashed_paths[idx]), new_file
+        else:
+            output_handle.write("{hsum} {path}\n".format(hsum=hash_strings[idx],
+                                                         path=hashed_paths[idx]))
+    if not dryrun:
+        output_handle.close()
+
+
+def update_hashes_json(hashed_paths, output_file, hash_strings, htype, dryrun):
+    """
+    we expect the file to contain all the existing hashes;
+    we read it, load the json, add our entries to the dict, convert it
+    back to json and write it back out as a new file
+    """
+    if not os.path.exists(output_file):
+        # no file with old hashes. something's wrong, skip.
+        return
+
+    with open(output_file, "r") as fhandle:
+        contents = fhandle.read()
+    output = json.loads(contents)
+
+    new_file = output_file + ".new"
+    if not dryrun:
+        output_handle = open(new_file, "wt")
+
+    for idx in range(0, len(hashed_paths)):
+        output[htype]["files"][hashed_paths[idx]] = hash_strings[idx]
+
+    if dryrun:
+        print "would write: '{outp}' to".format(outp=json.dumps(output)), new_file
+    else:
+        output_handle.write(json.dumps(output))
+        output_handle.close()
+
+
+def update_hashes(file_paths, hashes_path, hash_strings, htype, ftype, dryrun):
+    filenames = [os.path.basename(path) for path in file_paths]
+    if ftype == 'txt':
+        update_hashes_text(filenames, hashes_path, hash_strings, dryrun)
+    else:
+        update_hashes_json(filenames, hashes_path, hash_strings, htype, dryrun)
+
+
+def get_hashfile_path(dumpstree, wiki, date, hashtype, filetype):
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    filename = '-'.join([wiki, date, '{htype}sums.{ftype}'.format(htype=hashtype, ftype=filetype)])
+    return os.path.join(dumpsdir, filename)
+
+
+def cleanup_hashfiles(wiki, dumpstree, date, filename_bases, dryrun):
+    """
+    For the specified wiki and date, given the base part of the filename,
+    get the md5 and sha1 sums of the corresponding wiki dump file for
+    that date, append these to the plaintext files of hashes and write
+    out new files.
+
+    Also write new json files of hashes to include this information;
+    these values will overwrite old values if present.
+    """
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    if not os.path.exists(dumpsdir):
+        # skip dirs where the file doesn't exist:
+        # the run hasn't happened, or it's a private
+        # wiki with files elsewhere
+        print "skipping this wiki:", dumpsdir
+        return
+
+    filenames = ['-'.join([wiki, date, base]) for base in filename_bases]
+    file_paths = [os.path.join(dumpsdir, filename) for filename in filenames]
+    file_paths = [path for path in file_paths if os.path.exists(path)]
+    for htype in ['md5', 'sha1']:
+        # compute the checksums once per hash type and reuse them for
+        # both the plaintext and the json hash files
+        hash_strings = [checksum(path, htype) for path in file_paths]
+        for ftype in ['txt', 'json']:
+            hashes_path = get_hashfile_path(dumpstree, wiki, date, htype, ftype)
+            update_hashes(file_paths, hashes_path, hash_strings, htype, ftype, dryrun)
+
+
+def usage(message=None):
+    "display a usage message and exit."
+    if message is not None:
+        print message
+
+    usage_message = """Usage: {script} YYYYMMDD [dryrun]
+Adds md5sum and sha1sum of multistream content and index files
+to the plaintext files and the json files with hash lists.
+
+The new files are created with the extension '.new' at the end.
+""".format(script=sys.argv[0])
+    print usage_message
+    sys.exit(1)
+
+
+def do_main(alldbs, dumpstree, date, filename_bases, dryrun):
+    "main entry point"
+    wikis = read_wikis(alldbs)
+    for wiki in wikis:
+        cleanup_hashfiles(wiki, dumpstree, date, filename_bases, dryrun)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2 or len(sys.argv) > 3:
+        usage()
+    if sys.argv[1] in ['-h', '--help']:
+        usage("Help for this script")
+
+    dblist = '/home/datasets/all.dblist.edited'
+    publicdir = '/mnt/data/xmldatadumps/public'
+
+    # dblist = '/home/ariel/dumptesting/dblists/all.dblist'
+    # publicdir = '/home/ariel/dumptesting/dumpruns/public'
+
+    basenames = ['pages-articles-multistream-index.txt.bz2',
+                 'pages-articles-multistream.xml.bz2']
+    do_main(dblist, publicdir, sys.argv[1], basenames,
+            dryrun=True if len(sys.argv) == 3 else False)
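For reference, update_hashes_json() above implies that the json hash files map each hash type to a "files" dict of filename -> hex digest. A minimal sketch of building such a structure, with a made-up wiki, date and digest (the real files may carry additional keys):

    import json

    # hypothetical entry, for illustration only
    sample = {"md5": {"files": {}}}
    sample["md5"]["files"]["elwiki-20171020-pages-articles-multistream.xml.bz2"] = \
        "d41d8cd98f00b204e9800998ecf8427e"
    print json.dumps(sample)
    # {"md5": {"files": {"elwiki-20171020-...xml.bz2": "d41d8cd98f00b204e9800998ecf8427e"}}}

All four scripts in this change take the same arguments, so a trial run against one (hypothetical) date would be e.g. "fixup_hashfiles.py 20171020 dryrun"; note that any second argument at all, not just the literal string "dryrun", switches on dry-run mode.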
+""".format(script=sys.argv[0]) + print usage_message + sys.exit(1) + + +def do_main(alldbs, dumpstree, date, filename_bases, dryrun): + "main entry point" + wikis = read_wikis(alldbs) + for wiki in wikis: + cleanup_hashfiles(wiki, dumpstree, date, filename_bases, dryrun) + + +if __name__ == '__main__': + if len(sys.argv) < 2 or len(sys.argv) > 3: + usage() + if sys.argv[1] in ['-h', '--help']: + usage("Help for this script") + + dblist = '/home/datasets/all.dblist.edited' + publicdir = '/mnt/data/xmldatadumps/public' + + #dblist = '/home/ariel/dumptesting/dblists/all.dblist' + #publicdir = '/home/ariel/dumptesting/dumpruns/public' + + basenames = ['pages-articles-multistream-index.txt.bz2', + 'pages-articles-multistream.xml.bz2'] + do_main(dblist, publicdir, sys.argv[1], basenames, + dryrun=True if len(sys.argv) == 3 else False) diff --git a/fixups/fixup_html.py b/fixups/fixup_html.py new file mode 100644 index 0000000..306027c --- /dev/null +++ b/fixups/fixup_html.py @@ -0,0 +1,132 @@ +import os +import sys + + +def read_wikis(filepath): + "read list of wikis from file, one per line, and return list" + fhandle = open(filepath, "r") + text = fhandle.read() + fhandle.close() + return text.splitlines() + + +def pretty_size(size, quanta): + "return size of file scaled down as much as possible." + if size < 1024 or len(quanta) == 1: + return quanta[0] % size + else: + return pretty_size(size / 1024.0, quanta[1:]) + + +def get_printable_size(filepath): + "return size of file with nice human readable format" + quanta = ("%d bytes", "%d KB", "%0.1f MB", "%0.1f GB", "%0.1f TB") + size = os.path.getsize(filepath) + return pretty_size(size, quanta) + + +def get_new_html(multistream_name, multistr_index_name, + multistream_path, multistr_index_path, + html_path): + """ + read old html content, fix up the lines that are missing info + for the multistream content and index files, return the new + content + """ + with open(html_path, "r") as fhandle: + contents = fhandle.read() + lines = contents.splitlines() + + new_lines = [] + for line in lines: + if 'pages-articles-multistream.xml' in line: + line = line.replace( + "<li class='missing'>", + "<li class='file'>" + '<a href="{path}">'.format(path=multistream_name)) + line = line.replace( + "stream.xml.bz2</li>", + "stream.xml.bz2</a> {size} </li>".format(size=get_printable_size(multistream_path))) + elif 'pages-articles-multistream-index.txt' in line: + line = line.replace( + "<li class='missing'>", + "<li class='file'>" + '<a href="{path}">'.format(path=multistr_index_name)) + line = line.replace( + "index.txt.bz2</li>", + "index.txt.bz2</a> {size} </li>".format( + size=get_printable_size(multistr_index_path))) + new_lines.append(line) + return new_lines + + +def cleanup_html(wiki, dumpstree, date, dryrun): + """ + add size and link for content and index multistream files + to index.html file for the dump of the given wiki and date, + writing out a new file. 
+ """ + dumpsdir = os.path.join(dumpstree, wiki, date) + if not os.path.exists(dumpsdir): + # skip dirs where the file doesn't exist, + # the run hasn't happened, or it's a private + # wiki with files elsewhere + return + multistream_name = '-'.join([wiki, date, 'pages-articles-multistream.xml.bz2']) + multistr_index_name = '-'.join([wiki, date, 'pages-articles-multistream-index.txt.bz2']) + + multistream_path = os.path.join(dumpsdir, multistream_name) + multistr_index_path = os.path.join(dumpsdir, multistr_index_name) + + html_path = os.path.join(dumpsdir, 'index.html') + lines = get_new_html(multistream_name, multistr_index_name, + multistream_path, multistr_index_path, + html_path) + + new_file = html_path + '.new' + if dryrun: + print "would write lines to {out}:".format(out=new_file) + for line in lines: + if 'pages-articles-multistream' in line: + print line + else: + output = '\n'.join(lines) + '\n' + output_handle = file(new_file, "wt") + output_handle.write(output) + output_handle.close() + + +def usage(message=None): + "display a usage message and exit." + if message is not None: + print message + + usage_message = """Usage: {script} YYYYMMDD [dryrun] +Add link and size of multistream content and index files to index.html +for all wikis for the given date. +Writes new html files into a temporary location 'index.html.new'. +""".format(script=sys.argv[0]) + print usage_message + sys.exit(1) + + +def do_main(alldbs, dumpstree, date, dryrun): + "entry point" + wikis = read_wikis(alldbs) + for wiki in wikis: + cleanup_html(wiki, dumpstree, date, dryrun) + + +if __name__ == '__main__': + dblist = '/home/datasets/all.dblist.edited' + publicdir = '/mnt/data/xmldatadumps/public' + + #dblist = '/home/ariel/dumptesting/dblists/all.dblist' + #publicdir = '/home/ariel/dumptesting/dumpruns/public' + + if len(sys.argv) < 2 or len(sys.argv) > 3: + usage() + if sys.argv[1] in ['-h', '--help']: + usage("Help for this script") + + do_main(dblist, + publicdir, + date=sys.argv[1], dryrun=True if len(sys.argv) == 3 else False) diff --git a/fixups/fixup_recompress_moves.py b/fixups/fixup_recompress_moves.py new file mode 100644 index 0000000..14718a7 --- /dev/null +++ b/fixups/fixup_recompress_moves.py @@ -0,0 +1,126 @@ +import os +import sys +from subprocess import Popen + + +def read_wikis(filepath): + "read list of wikis, one per line, from file and return the list" + fhandle = open(filepath, "r") + text = fhandle.read() + fhandle.close() + return text.splitlines() + + +def compress(input_path, output_path, dryrun): + """ + returns True on success, False on failure + """ + command = "/bin/bzip2 -zc {inp} > {out}".format( + inp=input_path, out=output_path) + if dryrun: + print "would run", command + return True + try: + proc = Popen(command, shell=True) + _output, error = proc.communicate() + except Exception: + # fixme display the issue too + return False + + if error is not None: + print error + return False + else: + return True + + +def is_compressed(path): + """ + check if the file is bz2 compressed + return True if so, False otherwise + """ + with open(path) as fhandle: + header = fhandle.read(7) + return bool(header.startswith("BZh91AY")) + + +def cleanup_multistreams(wiki, dumpstree, date, dryrun): + """ + for the specified wiki, if there is a multistream + content file with temp filename, move it into the + permanent location; if there is a multistream index + file with temp filename, bzip2 compress it into the + permanent location + """ + dumpsdir = os.path.join(dumpstree, wiki, date) + if 
diff --git a/fixups/fixup_recompress_moves.py b/fixups/fixup_recompress_moves.py
new file mode 100644
index 0000000..14718a7
--- /dev/null
+++ b/fixups/fixup_recompress_moves.py
@@ -0,0 +1,126 @@
+import os
+import sys
+from subprocess import Popen, PIPE
+
+
+def read_wikis(filepath):
+    "read list of wikis, one per line, from file and return the list"
+    fhandle = open(filepath, "r")
+    text = fhandle.read()
+    fhandle.close()
+    return text.splitlines()
+
+
+def compress(input_path, output_path, dryrun):
+    """
+    bzip2 compress the input file into the output location;
+    returns True on success, False on failure
+    """
+    command = "/bin/bzip2 -zc {inp} > {out}".format(
+        inp=input_path, out=output_path)
+    if dryrun:
+        print "would run", command
+        return True
+    try:
+        proc = Popen(command, shell=True, stderr=PIPE)
+        _output, error = proc.communicate()
+    except Exception as exc:
+        print "failed to run command:", exc
+        return False
+
+    if proc.returncode != 0:
+        if error:
+            print error
+        return False
+    return True
+
+
+def is_compressed(path):
+    """
+    check if the file is bz2 compressed
+    return True if so, False otherwise
+    """
+    with open(path, "rb") as fhandle:
+        header = fhandle.read(7)
+    # "BZh", the block size digit, then the start of the block magic
+    # "1AY&SY"; this assumes the default block size of 9
+    return header.startswith("BZh91AY")
+
+
+def cleanup_multistreams(wiki, dumpstree, date, dryrun):
+    """
+    for the specified wiki, if there is a multistream
+    content file with a temp filename, move it into the
+    permanent location; if there is a multistream index
+    file with a temp filename, bzip2 compress it into the
+    permanent location
+    """
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    if not os.path.exists(dumpsdir):
+        # skip dirs where the file doesn't exist:
+        # the run hasn't happened, or it's a private
+        # wiki with files elsewhere
+        return
+    multistream_name = '-'.join([wiki, date, 'pages-articles-multistream.xml.bz2'])
+    index_name = '-'.join([wiki, date, 'pages-articles-multistream-index.txt.bz2'])
+    extension = '.inprog'
+    multistream_path = os.path.join(dumpsdir, multistream_name)
+    index_path = os.path.join(dumpsdir, index_name)
+    if os.path.exists(multistream_path + extension):
+        if dryrun:
+            print "would rename", multistream_path + extension, "to", multistream_path
+        else:
+            os.rename(multistream_path + extension, multistream_path)
+    if os.path.exists(index_path + extension):
+        if os.path.exists(index_path):
+            print "target file", index_path, "already exists, skipping"
+        elif is_compressed(index_path + extension):
+            # already compressed; don't compress again, just move into place
+            if dryrun:
+                print "would rename", index_path + extension, "to", index_path
+            else:
+                os.rename(index_path + extension, index_path)
+        elif compress(index_path + extension, index_path, dryrun):
+            if dryrun:
+                print "would remove", index_path + extension
+            else:
+                os.unlink(index_path + extension)
+
+
+def do_main(alldbs, dumpstree, date, dryrun):
+    """
+    entry point. for all wikis in the list, for the dump date specified
+    by date (YYYYMMDD), fix up the articles multistream content and
+    index file in the subdir wiki/date under the specified dumpstree.
+    """
+    wikis = read_wikis(alldbs)
+    for wiki in wikis:
+        cleanup_multistreams(wiki, dumpstree, date, dryrun)
+
+
+def usage(message=None):
+    "display a usage message and exit."
+    if message is not None:
+        print message
+
+    usage_message = """Usage: {script} YYYYMMDD [dryrun]
+Moves the multistream content file from its temp to its permanent location;
+bzip2 compresses the index file into its permanent location and removes
+the temp file.
+""".format(script=sys.argv[0])
+    print usage_message
+    sys.exit(1)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2 or len(sys.argv) > 3:
+        usage()
+    if sys.argv[1] in ['-h', '--help']:
+        usage("Help for this script")
+
+    dblist = '/home/datasets/all.dblist.edited'
+    publicdir = '/mnt/data/xmldatadumps/public'
+
+    # dblist = '/home/ariel/dumptesting/dblists/all.dblist'
+    # publicdir = '/home/ariel/dumptesting/dumpruns/public'
+
+    do_main(dblist,
+            publicdir,
+            date=sys.argv[1], dryrun=True if len(sys.argv) == 3 else False)
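A note on the header check in is_compressed(): a bz2 stream starts with "BZh", then the ASCII block-size digit (1-9; bzip2's default is 9), then the six-byte block magic "1AY&SY". The check above pins the block size to 9, which holds for files produced with default settings; a sketch accepting any block size would be:

    def is_bz2_any_blocksize(path):
        "check the bz2 magic, allowing any block size digit 1-9"
        with open(path, "rb") as fhandle:
            header = fhandle.read(10)
        return (len(header) == 10 and header.startswith("BZh") and
                header[3] in "123456789" and header[4:10] == "1AY&SY")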
+""".format(script=sys.argv[0]) + print usage_message + sys.exit(1) + + +if __name__ == '__main__': + if len(sys.argv) < 2 or len(sys.argv) > 3: + usage() + if sys.argv[1] in ['-h', '--help']: + usage("Help for this script") + + dblist = '/home/datasets/all.dblist.edited' + publicdir = '/mnt/data/xmldatadumps/public' + + #dblist = '/home/ariel/dumptesting/dblists/all.dblist' + #publicdir = '/home/ariel/dumptesting/dumpruns/public' + do_main(dblist, + publicdir, + date=sys.argv[1], dryrun=True if len(sys.argv) == 3 else False) diff --git a/fixups/fixup_report_json.py b/fixups/fixup_report_json.py new file mode 100644 index 0000000..079c916 --- /dev/null +++ b/fixups/fixup_report_json.py @@ -0,0 +1,86 @@ +import os +import sys +import json + + +def read_wikis(filepath): + "read list of wkis from file, one per line, and return the list" + fhandle = open(filepath, "r") + text = fhandle.read() + fhandle.close() + return text.splitlines() + + +def cleanup_report_json(wiki, dumpstree, date, dryrun): + """add size and relative url for multistream content and index files + to contents of report.json, and write out a new file.""" + dumpsdir = os.path.join(dumpstree, wiki, date) + if not os.path.exists(dumpsdir): + # skip dirs where the file doesn't exist, + # the run hasn't happened, or it's a private + # wiki with files elsewhere + print "skipping this wiki:", wiki + return + multistream_name = '-'.join([wiki, date, 'pages-articles-multistream.xml.bz2']) + index_name = '-'.join([wiki, date, 'pages-articles-multistream-index.txt.bz2']) + + multistream_path = os.path.join(dumpsdir, multistream_name) + index_path = os.path.join(dumpsdir, index_name) + + report_json_path = os.path.join(dumpstree, wiki, date, 'report.json') + with open(report_json_path, "r") as fhandle: + contents = fhandle.read() + output = json.loads(contents) + + if os.path.exists(multistream_path): + output['jobs']['articlesmultistreamdump']['files'][multistream_name] = { + 'size': os.path.getsize(multistream_path), + 'url': os.path.join('/', wiki, date, multistream_name)} + if os.path.exists(index_path): + output['jobs']['articlesmultistreamdump']['files'][index_name] = { + 'size': os.path.getsize(index_path), + 'url': os.path.join('/', wiki, date, index_name)} + + new_file = report_json_path + '.new' + if dryrun: + print "would write '{inp}' to".format(inp=json.dumps(output)), new_file + else: + output_handle = file(new_file, "w") + output_handle.write(json.dumps(output)) + output_handle.close() + + +def usage(message=None): + "display a usage message and exit." + if message is not None: + print message + + usage_message = """Usage: {script} YYYYMMDD [dryrun] +Adds information about the multistream content file and the +index file to report.json, writing a new temp file. 
+""".format(script=sys.argv[0]) + print usage_message + sys.exit(1) + + +def do_main(alldbs, dumpstree, date, dryrun): + "main entry point" + wikis = read_wikis(alldbs) + for wiki in wikis: + cleanup_report_json(wiki, dumpstree, date, dryrun) + + +if __name__ == '__main__': + dblist = '/home/datasets/all.dblist.edited' + publicdir = '/mnt/data/xmldatadumps/public' + + #dblist = '/home/ariel/dumptesting/dblists/all.dblist' + #publicdir = '/home/ariel/dumptesting/dumpruns/public' + + if len(sys.argv) < 2 or len(sys.argv) > 3: + usage() + if sys.argv[1] in ['-h', '--help']: + usage("Help for this script") + + do_main(dblist, publicdir, date=sys.argv[1], + dryrun=True if len(sys.argv) == 3 else False) -- To view, visit https://gerrit.wikimedia.org/r/386162 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iadf6d9d3ab8fc39a89836f08d50fb98f7f12d088 Gerrit-PatchSet: 1 Gerrit-Project: operations/dumps Gerrit-Branch: ariel Gerrit-Owner: ArielGlenn <ar...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits