ArielGlenn has uploaded a new change for review.
https://gerrit.wikimedia.org/r/215666
Change subject: dumps: do xml stubs via streaming
......................................................................
dumps: do xml stubs via streaming
this works around php memory leaks, running dumps for small ranges
and feeding the output files to a compressor for final output,
instead of running the script once over the entire page range
which could run for up to a day or more
Change-Id: I36146797d5fb78717e3ea92125487793982a1ec0
---
M xmldumps-backup/worker.py
A xmldumps-backup/xmlstreams.py
A xmldumps-backup/xmlstubs.py
3 files changed, 381 insertions(+), 10 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps
refs/changes/66/215666/1
diff --git a/xmldumps-backup/worker.py b/xmldumps-backup/worker.py
index 724910b..fe0c9a1 100644
--- a/xmldumps-backup/worker.py
+++ b/xmldumps-backup/worker.py
@@ -2837,16 +2837,10 @@
historyFile =
runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date,
self.historyDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp))
currentFile =
runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date,
self.currentDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp))
scriptCommand =
MultiVersion.MWScriptAsArray(runner.wiki.config, "dumpBackup.php")
- command = [ "%s" % runner.wiki.config.php, "-q" ]
- command.extend(scriptCommand)
- command.extend(["--wiki=%s" % runner.dbName,
- "--full", "--stub", "--report=10000",
- "%s" % runner.forceNormalOption(),
- "--output=gzip:%s" % historyFile,
- "--output=gzip:%s" % currentFile,
- "--filter=latest", "--output=gzip:%s" %
articlesFile,
- "--filter=latest", "--filter=notalk",
"--filter=namespace:!NS_USER"
- ])
+
+ command = [ "/usr/bin/python", "xmlstubs.py", "--config",
runner.wiki.config.files[0], "--wiki", runner.dbName,
+ runner.forceNormalOption(), "--articles",
articlesFile,
+ "--history", historyFile, "--current", currentFile
]
if (f.chunk):
# set up start and end pageids for this piece
diff --git a/xmldumps-backup/xmlstreams.py b/xmldumps-backup/xmlstreams.py
new file mode 100644
index 0000000..83f718e
--- /dev/null
+++ b/xmldumps-backup/xmlstreams.py
@@ -0,0 +1,214 @@
+'''
+generate an xml dump via multiple runs of a php script instead of one
+long run.
+
+avoids memory leak issues, permits retries when a single run fails,
+recovery if db servers go away in the middle of a run by retrying
+the run.
+'''
+
+import os
+import sys
+import time
+import worker
+import WikiDump
+import getopt
+
+from subprocess import Popen, PIPE
+
+# TODO: fix all the error returns and make subroutines out of repeated logic;
+# the current code assembles one long command line with many options inline
+
def do_xml_stream(wikidb, outfiles, command, wikiconf, force_normal,
                  start, end, dryrun, id_field, table,
                  small_interval, max_interval, ends_with):
    '''
    do an xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output

    pieces are produced by rerunning the underlying php script over
    successive id ranges; the first run keeps the header and skips
    the footer, the middle runs are body only, and a final tiny run
    grabs just the footer
    '''
    if start is None:
        start = 1

    interval = None
    if end is None:
        end = get_max_id(wikiconf, wikidb, id_field, table)
        # if the whole wiki is small enough, take
        # arbitrary hopefully reasonable slices
        if start == 1 and end < 1000000:
            interval = small_interval

    if interval is None:
        # hope this is not too awful a guess
        interval = (int(end) - int(start)) // 12
        if interval > max_interval:
            interval = max_interval
    # guard against a zero-sized interval (id range smaller than 12);
    # integer division above would yield 0 and the loop below would
    # then never advance upto, spinning forever
    if interval < 1:
        interval = 1

    # first piece: header included, footer skipped
    piece_command = list(command)
    piece_command.extend(["--start", str(start)])
    piece_command.append("--skip-footer")
    if interval <= end:
        upto = interval + start
        piece_command.extend(["--end", str(upto)])
    else:
        upto = end

    do_xml_piece(piece_command, outfiles, ends_with, dryrun)

    # middle pieces: neither header nor footer
    while upto <= end:
        piece_command = list(command)
        piece_command.append("--skip-header")
        piece_command.extend(["--start=%s" % str(upto)])
        piece_command.append("--skip-footer")
        if upto + interval <= end:
            piece_command.extend(["--end", str(upto + interval)])
        else:
            piece_command.extend(["--end", str(end + 1)])
        upto = upto + interval
        do_xml_piece(piece_command, outfiles, ends_with, dryrun)

    # last run: get just the footer
    piece_command = list(command)
    piece_command.extend(["--skip-header", "--start=1", "--end=1"])

    do_xml_piece(piece_command, outfiles, dryrun=dryrun)

    # close the compressors' stdin so they can flush and finish
    for filetype in outfiles:
        outfiles[filetype]['compr'].stdin.close()

    for filetype in outfiles:
        outfiles[filetype]['compr'].wait()
+
+
def run_script(command, outfiles, shouldendwith=None):
    '''
    run the given command (an argument list); if shouldendwith is
    given, every existing non-empty temp output file must end with
    that string or it is removed and the run counts as failed

    returns True on success, False on failure
    '''
    process = Popen(command)
    # TODO: would be best for there to be a timeout here in case
    # the child hangs
    process.wait()
    if process.returncode:
        return False

    failed = False
    for filetype in outfiles:
        outfile = outfiles[filetype]['temp']
        if not os.path.exists(outfile):
            continue
        size = os.path.getsize(outfile)
        # file could be empty (all pages in the range deleted)
        if size == 0 or shouldendwith is None:
            continue
        if size < len(shouldendwith):
            # too short to even contain the expected closing tag;
            # treat as truncated output
            os.unlink(outfile)
            failed = True
            continue
        # binary mode: end-relative seeks are not permitted on
        # text-mode streams in python 3
        with open(outfile, 'rb') as outfd:
            outfd.seek(-len(shouldendwith), os.SEEK_END)
            remainder = outfd.read().decode('utf-8', 'replace')
        if remainder != shouldendwith:
            os.unlink(outfile)
            failed = True
    return not failed
+
+
def catfile(inputfile, process):
    '''
    stream the contents of inputfile into the stdin of the given
    process in 1MB chunks, then return; exits the whole script
    if the file is missing
    '''
    if not os.path.exists(inputfile):
        sys.stderr.write("no such file: %s\n" % inputfile)
        sys.exit(1)
    # binary mode so the content is passed through byte for byte
    # into the child's pipe
    with open(inputfile, "rb") as filed:
        while True:
            content = filed.read(1048576)
            if not content:
                # the with statement closes the file; no explicit
                # close needed
                break
            process.stdin.write(content)
+
+
def gzippit(outfile):
    '''
    start a gzip process that compresses its stdin to the specified
    file; returns the Popen object (the caller writes to .stdin,
    closes it, then wait()s for the process)
    '''
    # use an argument list plus an explicit output handle instead of
    # shell=True with an interpolated filename: paths containing
    # spaces or shell metacharacters would otherwise break (or be
    # interpreted by the shell)
    outfd = open(outfile, "wb")
    process = Popen(["gzip"], stdin=PIPE, stdout=outfd, bufsize=-1)
    # the child holds its own copy of the descriptor now
    outfd.close()
    return process
+
+
def get_max_id(wikiconf, wikidb, id_field, table):
    '''
    retrieve the largest id for this wiki from the db for a specific
    table; pass in name of id field, name of table

    retries the query a few times with a delay in case the db server
    goes away; exits the script if no value could be retrieved
    '''
    wiki = WikiDump.Wiki(wikiconf, wikidb)

    db_info = worker.DbServerInfo(wiki, wikidb)
    query = "select MAX(%s) from %s%s;" % (id_field, db_info.dBTablePrefix,
                                           table)
    end = 0
    maxretries = 5
    retries = 0
    # one initial attempt plus maxretries retries, all sharing the
    # same parse logic (the old code duplicated it and could return
    # 0 from the first attempt without the error exit below)
    while retries <= maxretries:
        if retries:
            # give the db server a moment to come back
            time.sleep(5)
        results = db_info.runSqlAndGetOutput(query)
        if results:
            lines = results.splitlines()
            # first line is the column header, second the value;
            # guard the index in case only the header came back
            if len(lines) > 1 and lines[1]:
                end = int(lines[1])
                break
        retries += 1

    if not end:
        sys.stderr.write("failed to get max page id from db, exiting\n")
        sys.exit(1)
    return end
+
+
def do_xml_piece(command, outfiles, ends_with=None, dryrun=False):
    '''
    run one piece of an xml dump, output going uncompressed
    to a temporary file and then that file being shovelled
    into the compressor's stdin

    we do three tries with increasing delay in between, in case
    the db server has issues or some other transient problem
    crops up; on final failure the temp files are removed, the
    compressors are closed off and the script exits
    '''

    if dryrun:
        sys.stderr.write("would run command: %s\n" % " ".join(command))
        return

    retries = 0
    maxretries = 3
    timeout = 60
    result = False
    while retries < maxretries:
        result = run_script(command, outfiles, ends_with)
        if result:
            break
        time.sleep(timeout)
        # back off a little more each time
        timeout = timeout * 2
        retries += 1

    if not result:
        sys.stderr.write("failed job after max retries\n")
        for filetype in outfiles:
            # the temp file may never have been created if the
            # child died early; don't let unlink raise and mask
            # the real failure
            if os.path.exists(outfiles[filetype]['temp']):
                os.unlink(outfiles[filetype]['temp'])
            # these partial output files can be used later with a
            # run that dumps the rest of the pages, and a recombine
            outfiles[filetype]['compr'].stdin.close()
        sys.exit(1)

    for filetype in outfiles:
        catfile(outfiles[filetype]['temp'], outfiles[filetype]['compr'])
        os.unlink(outfiles[filetype]['temp'])
diff --git a/xmldumps-backup/xmlstubs.py b/xmldumps-backup/xmlstubs.py
new file mode 100644
index 0000000..768ff9a
--- /dev/null
+++ b/xmldumps-backup/xmlstubs.py
@@ -0,0 +1,163 @@
+'''
+generate an xml dump via multiple runs of a php script instead of one
+long run.
+
+avoids memory leak issues, permits retries when a single run fails,
+recovery if db servers go away in the middle of a run by retrying
+the run.
+'''
+
+import os
+import sys
+import time
+import worker
+import WikiDump
+import getopt
+from xmlstreams import run_script, catfile, gzippit, get_max_id, do_xml_piece,
do_xml_stream
+
+
def dostubsbackup(wikidb, history_file, current_file, articles_file,
                  wikiconf, force_normal, start, end, dryrun):
    '''
    do a stubs xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output
    '''
    outfiles = {'history': {'name': history_file},
                'current': {'name': current_file},
                'articles': {'name': articles_file}}
    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            wikiconf.tempDir,
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        # one long-lived gzip process per final output file; the
        # temp files from each run get catted into it
        outfiles[filetype]['compr'] = gzippit(outfiles[filetype]['name'])

    script_command = worker.MultiVersion.MWScriptAsArray(wikiconf,
                                                         "dumpBackup.php")
    command = [wikiconf.php, "-q"] + script_command

    command.extend(["--wiki=%s" % wikidb,
                    "--full", "--stub", "--report=10000",
                    "--output=file:%s" % outfiles['history']['temp'],
                    "--output=file:%s" % outfiles['current']['temp'],
                    "--filter=latest",
                    "--output=file:%s" % outfiles['articles']['temp'],
                    "--filter=latest", "--filter=notalk",
                    "--filter=namespace:!NS_USER"
                    ])
    # force_normal arrives as a boolean (False by default); the old
    # "is not None" check meant --force-normal was always passed
    if force_normal:
        command.append("--force-normal")

    do_xml_stream(wikidb, outfiles, command, wikiconf, force_normal,
                  start, end, dryrun, 'page_id', 'page',
                  100000, 500000, '</page>\n')
+
+
def usage(message=None):
    """
    write a usage summary to stderr, preceded by an optional
    error message, and terminate the script with exit status 1
    """
    if message is not None:
        sys.stderr.write(message + "\n")
    sys.stderr.write("""
Usage: xmlstubs.py --wiki wikidbname --articles path --current path
          --history path [--start number] [--end number]
          [--force-normal bool] [--config path]

Options:

  --wiki (-w): wiki db name, e.g. enwiki
  --articles (-a): full path of articles xml stub dump that will be created
  --current (-c): full path of current pages xml stub dump that will be created
  --history (-h): full path of xml stub dump with full history that will be created

  --start (-s): starting page to dump (default: 1)
  --end (-e): ending page to dump (default: dump all)

  --force-normal (-f): if set, this argument will be passed through to dumpBackup.php
                 (default: unset)
  --config (-C): path to wikidump configfile (default: "wikidump.conf" in current dir)
  --dryrun (-d): display the commands that would be run to produce the output but
                 don't actually run them
""")
    sys.exit(1)
+
+
def main():
    '''
    main entry point: parse command line options, validate them,
    load the wiki configuration and kick off the stubs dump
    '''
    wiki = None
    articles_file = None
    current_file = None
    history_file = None
    start = None
    end = None
    force_normal = False
    dryrun = False
    configfile = "wikidump.conf"

    try:
        # note: 'h' was previously listed twice in the shortopts
        # string (once with an argument for --history, once bare for
        # --help), which made the help branch unreachable; -h is
        # --history, help is long-form only
        (options, remainder) = getopt.gnu_getopt(
            sys.argv[1:], "w:a:c:h:s:e:C:fd",
            ["wiki=", "articles=", "current=", "history=",
             "start=", "end=", "config=", "force-normal",
             "help", "dryrun"])

    except getopt.GetoptError as err:
        usage("Unknown option specified: " + str(err))

    for (opt, val) in options:
        if opt in ["-w", "--wiki"]:
            wiki = val
        elif opt in ["-a", "--articles"]:
            articles_file = val
        elif opt in ["-c", "--current"]:
            current_file = val
        elif opt in ["-h", "--history"]:
            history_file = val
        elif opt in ["-s", "--start"]:
            start = val
        elif opt in ["-e", "--end"]:
            end = val
        elif opt in ["-f", "--force-normal"]:
            force_normal = True
        elif opt in ["-C", "--config"]:
            configfile = val
        elif opt in ["-d", "--dryrun"]:
            dryrun = True
        elif opt == "--help":
            usage('Help for this script\n')
        else:
            usage("Unknown option specified: <%s>" % opt)

    if len(remainder) > 0:
        usage("Unknown option(s) specified: <%s>" % remainder[0])

    if wiki is None:
        usage("mandatory argument missing: --wiki")
    if articles_file is None:
        usage("mandatory argument missing: --articles")
    if current_file is None:
        usage("mandatory argument missing: --current")
    if history_file is None:
        usage("mandatory argument missing: --history")

    if start is not None:
        if not start.isdigit():
            usage("value for --start must be a number")
        else:
            start = int(start)

    if end is not None:
        if not end.isdigit():
            usage("value for --end must be a number")
        else:
            end = int(end)

    if not os.path.exists(configfile):
        usage("no such file found: " + configfile)

    wikiconf = WikiDump.Config(configfile)
    wikiconf.parseConfFilePerProject(wiki)
    dostubsbackup(wiki, history_file, current_file, articles_file, wikiconf,
                  force_normal, start, end, dryrun)


if __name__ == '__main__':
    main()
--
To view, visit https://gerrit.wikimedia.org/r/215666
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I36146797d5fb78717e3ea92125487793982a1ec0
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits