ArielGlenn has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/386162 )
Change subject: one-off scripts for fixing up multistream dump mess
......................................................................

one-off scripts for fixing up multistream dump mess

Just in case we ever need them again, hopefully not, here they are.

Change-Id: Iadf6d9d3ab8fc39a89836f08d50fb98f7f12d088
---
A fixups/fixup_hashfiles.py
A fixups/fixup_html.py
A fixups/fixup_recompress_moves.py
A fixups/fixup_report_json.py
4 files changed, 514 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/dumps refs/changes/62/386162/1

diff --git a/fixups/fixup_hashfiles.py b/fixups/fixup_hashfiles.py
new file mode 100644
index 0000000..cf036fb
--- /dev/null
+++ b/fixups/fixup_hashfiles.py
@@ -0,0 +1,170 @@
+import os
+import sys
+import hashlib
+import json
+
+
+def read_wikis(filepath):
+    "read list of wikis, one per line, from file, return the list"
+    fhandle = open(filepath, "r")
+    text = fhandle.read()
+    fhandle.close()
+    return text.splitlines()
+
+
+def checksum(filename, htype):
+    "return hash of specified file in string format, using specified hash type"
+    if htype == 'md5':
+        summer = hashlib.md5()
+    else:
+        summer = hashlib.sha1()
+    infhandle = open(filename, "rb")
+    bufsize = 4192 * 32
+    fbuffer = infhandle.read(bufsize)
+    while fbuffer:
+        summer.update(fbuffer)
+        fbuffer = infhandle.read(bufsize)
+    infhandle.close()
+    return summer.hexdigest()
+
+
+def update_hashes_text(hashed_paths, output_file, hash_strings, dryrun):
+    """
+    we expect the file to contain all the existing hashes;
+    we will append to it
+    """
+    if not os.path.exists(output_file):
+        # no file with old hashes. something's wrong, skip.
+        return
+
+    with open(output_file, "r") as fhandle:
+        content = fhandle.read()
+    new_file = output_file + ".new"
+
+    if not dryrun:
+        output_handle = open(new_file, "wt")
+        output_handle.write(content)
+
+    for idx in range(0, len(hashed_paths)):
+        if hashed_paths[idx] in content:
+            # info already present in hash file. skip.
+            continue
+
+        if dryrun:
+            print "would append: '{hsum} {path}' to".format(
+                hsum=hash_strings[idx], path=hashed_paths[idx]), new_file
+        else:
+            output_handle.write("{hsum} {path}\n".format(hsum=hash_strings[idx],
+                                                         path=hashed_paths[idx]))
+    if not dryrun:
+        output_handle.close()
+
+
+def update_hashes_json(hashed_paths, output_file, hash_strings, htype, dryrun):
+    """
+    we expect the file to contain all the existing hashes;
+    we read it, load the json, add our entries to the dict, convert it
+    back to json and write it back out as a new file
+    """
+    if not os.path.exists(output_file):
+        # no file with old hashes. something's wrong, skip.
+        return
+
+    with open(output_file, "r") as fhandle:
+        contents = fhandle.read()
+    output = json.loads(contents)
+
+    new_file = output_file + ".new"
+    if not dryrun:
+        output_handle = open(new_file, "wt")
+
+    for idx in range(0, len(hashed_paths)):
+        output[htype]["files"][hashed_paths[idx]] = hash_strings[idx]
+
+    if dryrun:
+        print "would write: '{outp}' to".format(outp=json.dumps(output)), new_file
+    else:
+        output_handle.write(json.dumps(output))
+        output_handle.close()
+
+
+def update_hashes(file_paths, hashes_path, hash_strings, htype, ftype, dryrun):
+    filenames = [os.path.basename(path) for path in file_paths]
+    if ftype == 'txt':
+        update_hashes_text(filenames, hashes_path, hash_strings, dryrun)
+    else:
+        update_hashes_json(filenames, hashes_path, hash_strings, htype, dryrun)
+
+
+def get_hashfile_path(dumpstree, wiki, date, hashtype, filetype):
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    filename = '-'.join([wiki, date, '{htype}sums.{ftype}'.format(htype=hashtype, ftype=filetype)])
+    return os.path.join(dumpsdir, filename)
+
+
+def cleanup_hashfiles(wiki, dumpstree, date, filename_bases, dryrun):
+    """
+    For the specified wiki and date, given the base part of the filename,
+    get the md5 and sha1 sums of the corresponding wiki dump file for
+    that date, append these to the plaintext files of hashes and write
+    out new files.
+
+    Also write new json files of hashes to include this information;
+    these values will overwrite old values if present.
+    """
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    if not os.path.exists(dumpsdir):
+        # skip dirs where the file doesn't exist:
+        # the run hasn't happened, or it's a private
+        # wiki with files elsewhere
+        print "skipping this wiki:", dumpsdir
+        return
+
+    filenames = ['-'.join([wiki, date, base]) for base in filename_bases]
+    file_paths = [os.path.join(dumpsdir, filename) for filename in filenames]
+    file_paths = [path for path in file_paths if os.path.exists(path)]
+    for htype in ['md5', 'sha1']:
+        # compute the checksums once per hash type and reuse them for
+        # both the plaintext and the json hash files
+        hash_strings = [checksum(path, htype) for path in file_paths]
+        for ftype in ['txt', 'json']:
+            hashes_path = get_hashfile_path(dumpstree, wiki, date, htype, ftype)
+            update_hashes(file_paths, hashes_path, hash_strings, htype, ftype, dryrun)
+
+
+def usage(message=None):
+    "display a usage message and exit."
+    if message is not None:
+        print message
+
+    usage_message = """Usage: {script} YYYYMMDD [dryrun]
+Adds md5sum and sha1sum of multistream content and index files
+to the plaintext files and the json files with hash lists.
+
+The new files are created with the extension '.new' at the end.
+""".format(script=sys.argv[0])
+    print usage_message
+    sys.exit(1)
+
+
+def do_main(alldbs, dumpstree, date, filename_bases, dryrun):
+    "main entry point"
+    wikis = read_wikis(alldbs)
+    for wiki in wikis:
+        cleanup_hashfiles(wiki, dumpstree, date, filename_bases, dryrun)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2 or len(sys.argv) > 3:
+        usage()
+    if sys.argv[1] in ['-h', '--help']:
+        usage("Help for this script")
+
+    dblist = '/home/datasets/all.dblist.edited'
+    publicdir = '/mnt/data/xmldatadumps/public'
+
+    # dblist = '/home/ariel/dumptesting/dblists/all.dblist'
+    # publicdir = '/home/ariel/dumptesting/dumpruns/public'
+
+    basenames = ['pages-articles-multistream-index.txt.bz2',
+                 'pages-articles-multistream.xml.bz2']
+    do_main(dblist, publicdir, sys.argv[1], basenames,
+            dryrun=True if len(sys.argv) == 3 else False)
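For reference, update_hashes_json() above implies that the json hash files map each hash type to a "files" dict of filename -> hex digest. A minimal sketch of building such a structure, with a made-up wiki, date and digest (the real files may carry additional keys):

    import json

    # hypothetical entry, for illustration only
    sample = {"md5": {"files": {}}}
    sample["md5"]["files"]["elwiki-20171020-pages-articles-multistream.xml.bz2"] = \
        "d41d8cd98f00b204e9800998ecf8427e"
    print json.dumps(sample)
    # {"md5": {"files": {"elwiki-20171020-...xml.bz2": "d41d8cd98f00b204e9800998ecf8427e"}}}

All four scripts in this change take the same arguments, so a trial run against one (hypothetical) date would be e.g. "fixup_hashfiles.py 20171020 dryrun"; note that any second argument at all, not just the literal string "dryrun", switches on dry-run mode.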
+""".format(script=sys.argv[0]) + print usage_message + sys.exit(1) + + +def do_main(alldbs, dumpstree, date, filename_bases, dryrun): + "main entry point" + wikis = read_wikis(alldbs) + for wiki in wikis: + cleanup_hashfiles(wiki, dumpstree, date, filename_bases, dryrun) + + +if __name__ == '__main__': + if len(sys.argv) < 2 or len(sys.argv) > 3: + usage() + if sys.argv[1] in ['-h', '--help']: + usage("Help for this script") + + dblist = '/home/datasets/all.dblist.edited' + publicdir = '/mnt/data/xmldatadumps/public' + + #dblist = '/home/ariel/dumptesting/dblists/all.dblist' + #publicdir = '/home/ariel/dumptesting/dumpruns/public' + + basenames = ['pages-articles-multistream-index.txt.bz2', + 'pages-articles-multistream.xml.bz2'] + do_main(dblist, publicdir, sys.argv[1], basenames, + dryrun=True if len(sys.argv) == 3 else False) diff --git a/fixups/fixup_html.py b/fixups/fixup_html.py new file mode 100644 index 0000000..306027c --- /dev/null +++ b/fixups/fixup_html.py @@ -0,0 +1,132 @@ +import os +import sys + + +def read_wikis(filepath): + "read list of wikis from file, one per line, and return list" + fhandle = open(filepath, "r") + text = fhandle.read() + fhandle.close() + return text.splitlines() + + +def pretty_size(size, quanta): + "return size of file scaled down as much as possible." + if size < 1024 or len(quanta) == 1: + return quanta[0] % size + else: + return pretty_size(size / 1024.0, quanta[1:]) + + +def get_printable_size(filepath): + "return size of file with nice human readable format" + quanta = ("%d bytes", "%d KB", "%0.1f MB", "%0.1f GB", "%0.1f TB") + size = os.path.getsize(filepath) + return pretty_size(size, quanta) + + +def get_new_html(multistream_name, multistr_index_name, + multistream_path, multistr_index_path, + html_path): + """ + read old html content, fix up the lines that are missing info + for the multistream content and index files, return the new + content + """ + with open(html_path, "r") as fhandle: + contents = fhandle.read() + lines = contents.splitlines() + + new_lines = [] + for line in lines: + if 'pages-articles-multistream.xml' in line: + line = line.replace( + "<li class='missing'>", + "<li class='file'>" + '<a href="{path}">'.format(path=multistream_name)) + line = line.replace( + "stream.xml.bz2</li>", + "stream.xml.bz2</a> {size} </li>".format(size=get_printable_size(multistream_path))) + elif 'pages-articles-multistream-index.txt' in line: + line = line.replace( + "<li class='missing'>", + "<li class='file'>" + '<a href="{path}">'.format(path=multistr_index_name)) + line = line.replace( + "index.txt.bz2</li>", + "index.txt.bz2</a> {size} </li>".format( + size=get_printable_size(multistr_index_path))) + new_lines.append(line) + return new_lines + + +def cleanup_html(wiki, dumpstree, date, dryrun): + """ + add size and link for content and index multistream files + to index.html file for the dump of the given wiki and date, + writing out a new file. 
+ """ + dumpsdir = os.path.join(dumpstree, wiki, date) + if not os.path.exists(dumpsdir): + # skip dirs where the file doesn't exist, + # the run hasn't happened, or it's a private + # wiki with files elsewhere + return + multistream_name = '-'.join([wiki, date, 'pages-articles-multistream.xml.bz2']) + multistr_index_name = '-'.join([wiki, date, 'pages-articles-multistream-index.txt.bz2']) + + multistream_path = os.path.join(dumpsdir, multistream_name) + multistr_index_path = os.path.join(dumpsdir, multistr_index_name) + + html_path = os.path.join(dumpsdir, 'index.html') + lines = get_new_html(multistream_name, multistr_index_name, + multistream_path, multistr_index_path, + html_path) + + new_file = html_path + '.new' + if dryrun: + print "would write lines to {out}:".format(out=new_file) + for line in lines: + if 'pages-articles-multistream' in line: + print line + else: + output = '\n'.join(lines) + '\n' + output_handle = file(new_file, "wt") + output_handle.write(output) + output_handle.close() + + +def usage(message=None): + "display a usage message and exit." + if message is not None: + print message + + usage_message = """Usage: {script} YYYYMMDD [dryrun] +Add link and size of multistream content and index files to index.html +for all wikis for the given date. +Writes new html files into a temporary location 'index.html.new'. +""".format(script=sys.argv[0]) + print usage_message + sys.exit(1) + + +def do_main(alldbs, dumpstree, date, dryrun): + "entry point" + wikis = read_wikis(alldbs) + for wiki in wikis: + cleanup_html(wiki, dumpstree, date, dryrun) + + +if __name__ == '__main__': + dblist = '/home/datasets/all.dblist.edited' + publicdir = '/mnt/data/xmldatadumps/public' + + #dblist = '/home/ariel/dumptesting/dblists/all.dblist' + #publicdir = '/home/ariel/dumptesting/dumpruns/public' + + if len(sys.argv) < 2 or len(sys.argv) > 3: + usage() + if sys.argv[1] in ['-h', '--help']: + usage("Help for this script") + + do_main(dblist, + publicdir, + date=sys.argv[1], dryrun=True if len(sys.argv) == 3 else False) diff --git a/fixups/fixup_recompress_moves.py b/fixups/fixup_recompress_moves.py new file mode 100644 index 0000000..14718a7 --- /dev/null +++ b/fixups/fixup_recompress_moves.py @@ -0,0 +1,126 @@ +import os +import sys +from subprocess import Popen + + +def read_wikis(filepath): + "read list of wikis, one per line, from file and return the list" + fhandle = open(filepath, "r") + text = fhandle.read() + fhandle.close() + return text.splitlines() + + +def compress(input_path, output_path, dryrun): + """ + returns True on success, False on failure + """ + command = "/bin/bzip2 -zc {inp} > {out}".format( + inp=input_path, out=output_path) + if dryrun: + print "would run", command + return True + try: + proc = Popen(command, shell=True) + _output, error = proc.communicate() + except Exception: + # fixme display the issue too + return False + + if error is not None: + print error + return False + else: + return True + + +def is_compressed(path): + """ + check if the file is bz2 compressed + return True if so, False otherwise + """ + with open(path) as fhandle: + header = fhandle.read(7) + return bool(header.startswith("BZh91AY")) + + +def cleanup_multistreams(wiki, dumpstree, date, dryrun): + """ + for the specified wiki, if there is a multistream + content file with temp filename, move it into the + permanent location; if there is a multistream index + file with temp filename, bzip2 compress it into the + permanent location + """ + dumpsdir = os.path.join(dumpstree, wiki, date) + if 
diff --git a/fixups/fixup_recompress_moves.py b/fixups/fixup_recompress_moves.py
new file mode 100644
index 0000000..14718a7
--- /dev/null
+++ b/fixups/fixup_recompress_moves.py
@@ -0,0 +1,126 @@
+import os
+import sys
+from subprocess import Popen, PIPE
+
+
+def read_wikis(filepath):
+    "read list of wikis, one per line, from file and return the list"
+    fhandle = open(filepath, "r")
+    text = fhandle.read()
+    fhandle.close()
+    return text.splitlines()
+
+
+def compress(input_path, output_path, dryrun):
+    """
+    bzip2 compress the input file into the output location;
+    returns True on success, False on failure
+    """
+    command = "/bin/bzip2 -zc {inp} > {out}".format(
+        inp=input_path, out=output_path)
+    if dryrun:
+        print "would run", command
+        return True
+    try:
+        proc = Popen(command, shell=True, stderr=PIPE)
+        _output, error = proc.communicate()
+    except Exception as exc:
+        print "failed to run command:", exc
+        return False
+
+    if proc.returncode != 0:
+        if error:
+            print error
+        return False
+    return True
+
+
+def is_compressed(path):
+    """
+    check if the file is bz2 compressed
+    return True if so, False otherwise
+    """
+    with open(path, "rb") as fhandle:
+        header = fhandle.read(7)
+    # "BZh", the block size digit, then the start of the block magic
+    # "1AY&SY"; this assumes the default block size of 9
+    return header.startswith("BZh91AY")
+
+
+def cleanup_multistreams(wiki, dumpstree, date, dryrun):
+    """
+    for the specified wiki, if there is a multistream
+    content file with a temp filename, move it into the
+    permanent location; if there is a multistream index
+    file with a temp filename, bzip2 compress it into the
+    permanent location
+    """
+    dumpsdir = os.path.join(dumpstree, wiki, date)
+    if not os.path.exists(dumpsdir):
+        # skip dirs where the file doesn't exist:
+        # the run hasn't happened, or it's a private
+        # wiki with files elsewhere
+        return
+    multistream_name = '-'.join([wiki, date, 'pages-articles-multistream.xml.bz2'])
+    index_name = '-'.join([wiki, date, 'pages-articles-multistream-index.txt.bz2'])
+    extension = '.inprog'
+    multistream_path = os.path.join(dumpsdir, multistream_name)
+    index_path = os.path.join(dumpsdir, index_name)
+    if os.path.exists(multistream_path + extension):
+        if dryrun:
+            print "would rename", multistream_path + extension, "to", multistream_path
+        else:
+            os.rename(multistream_path + extension, multistream_path)
+    if os.path.exists(index_path + extension):
+        if os.path.exists(index_path):
+            print "target file", index_path, "already exists, skipping"
+        elif is_compressed(index_path + extension):
+            # already compressed; don't compress again, just move into place
+            if dryrun:
+                print "would rename", index_path + extension, "to", index_path
+            else:
+                os.rename(index_path + extension, index_path)
+        elif compress(index_path + extension, index_path, dryrun):
+            if dryrun:
+                print "would remove", index_path + extension
+            else:
+                os.unlink(index_path + extension)
+
+
+def do_main(alldbs, dumpstree, date, dryrun):
+    """
+    entry point. for all wikis in the list, for the dump date specified
+    by date (YYYYMMDD), fix up the articles multistream content and
+    index file in the subdir wiki/date under the specified dumpstree.
+    """
+    wikis = read_wikis(alldbs)
+    for wiki in wikis:
+        cleanup_multistreams(wiki, dumpstree, date, dryrun)
+
+
+def usage(message=None):
+    "display a usage message and exit."
+    if message is not None:
+        print message
+
+    usage_message = """Usage: {script} YYYYMMDD [dryrun]
+Moves the multistream content file from its temp to its permanent location;
+bzip2 compresses the index file into its permanent location and removes
+the temp file.
+""".format(script=sys.argv[0])
+    print usage_message
+    sys.exit(1)
+
+
+if __name__ == '__main__':
+    if len(sys.argv) < 2 or len(sys.argv) > 3:
+        usage()
+    if sys.argv[1] in ['-h', '--help']:
+        usage("Help for this script")
+
+    dblist = '/home/datasets/all.dblist.edited'
+    publicdir = '/mnt/data/xmldatadumps/public'
+
+    # dblist = '/home/ariel/dumptesting/dblists/all.dblist'
+    # publicdir = '/home/ariel/dumptesting/dumpruns/public'
+
+    do_main(dblist,
+            publicdir,
+            date=sys.argv[1], dryrun=True if len(sys.argv) == 3 else False)
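A note on the header check in is_compressed(): a bz2 stream starts with "BZh", then the ASCII block-size digit (1-9; bzip2's default is 9), then the six-byte block magic "1AY&SY". The check above pins the block size to 9, which holds for files produced with default settings; a sketch accepting any block size would be:

    def is_bz2_any_blocksize(path):
        "check the bz2 magic, allowing any block size digit 1-9"
        with open(path, "rb") as fhandle:
            header = fhandle.read(10)
        return (len(header) == 10 and header.startswith("BZh") and
                header[3] in "123456789" and header[4:10] == "1AY&SY")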
+""".format(script=sys.argv[0]) + print usage_message + sys.exit(1) + + +if __name__ == '__main__': + if len(sys.argv) < 2 or len(sys.argv) > 3: + usage() + if sys.argv[1] in ['-h', '--help']: + usage("Help for this script") + + dblist = '/home/datasets/all.dblist.edited' + publicdir = '/mnt/data/xmldatadumps/public' + + #dblist = '/home/ariel/dumptesting/dblists/all.dblist' + #publicdir = '/home/ariel/dumptesting/dumpruns/public' + do_main(dblist, + publicdir, + date=sys.argv[1], dryrun=True if len(sys.argv) == 3 else False) diff --git a/fixups/fixup_report_json.py b/fixups/fixup_report_json.py new file mode 100644 index 0000000..079c916 --- /dev/null +++ b/fixups/fixup_report_json.py @@ -0,0 +1,86 @@ +import os +import sys +import json + + +def read_wikis(filepath): + "read list of wkis from file, one per line, and return the list" + fhandle = open(filepath, "r") + text = fhandle.read() + fhandle.close() + return text.splitlines() + + +def cleanup_report_json(wiki, dumpstree, date, dryrun): + """add size and relative url for multistream content and index files + to contents of report.json, and write out a new file.""" + dumpsdir = os.path.join(dumpstree, wiki, date) + if not os.path.exists(dumpsdir): + # skip dirs where the file doesn't exist, + # the run hasn't happened, or it's a private + # wiki with files elsewhere + print "skipping this wiki:", wiki + return + multistream_name = '-'.join([wiki, date, 'pages-articles-multistream.xml.bz2']) + index_name = '-'.join([wiki, date, 'pages-articles-multistream-index.txt.bz2']) + + multistream_path = os.path.join(dumpsdir, multistream_name) + index_path = os.path.join(dumpsdir, index_name) + + report_json_path = os.path.join(dumpstree, wiki, date, 'report.json') + with open(report_json_path, "r") as fhandle: + contents = fhandle.read() + output = json.loads(contents) + + if os.path.exists(multistream_path): + output['jobs']['articlesmultistreamdump']['files'][multistream_name] = { + 'size': os.path.getsize(multistream_path), + 'url': os.path.join('/', wiki, date, multistream_name)} + if os.path.exists(index_path): + output['jobs']['articlesmultistreamdump']['files'][index_name] = { + 'size': os.path.getsize(index_path), + 'url': os.path.join('/', wiki, date, index_name)} + + new_file = report_json_path + '.new' + if dryrun: + print "would write '{inp}' to".format(inp=json.dumps(output)), new_file + else: + output_handle = file(new_file, "w") + output_handle.write(json.dumps(output)) + output_handle.close() + + +def usage(message=None): + "display a usage message and exit." + if message is not None: + print message + + usage_message = """Usage: {script} YYYYMMDD [dryrun] +Adds information about the multistream content file and the +index file to report.json, writing a new temp file. 
+""".format(script=sys.argv[0]) + print usage_message + sys.exit(1) + + +def do_main(alldbs, dumpstree, date, dryrun): + "main entry point" + wikis = read_wikis(alldbs) + for wiki in wikis: + cleanup_report_json(wiki, dumpstree, date, dryrun) + + +if __name__ == '__main__': + dblist = '/home/datasets/all.dblist.edited' + publicdir = '/mnt/data/xmldatadumps/public' + + #dblist = '/home/ariel/dumptesting/dblists/all.dblist' + #publicdir = '/home/ariel/dumptesting/dumpruns/public' + + if len(sys.argv) < 2 or len(sys.argv) > 3: + usage() + if sys.argv[1] in ['-h', '--help']: + usage("Help for this script") + + do_main(dblist, publicdir, date=sys.argv[1], + dryrun=True if len(sys.argv) == 3 else False) -- To view, visit https://gerrit.wikimedia.org/r/386162 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Iadf6d9d3ab8fc39a89836f08d50fb98f7f12d088 Gerrit-PatchSet: 1 Gerrit-Project: operations/dumps Gerrit-Branch: ariel Gerrit-Owner: ArielGlenn <ar...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits