ArielGlenn has uploaded a new change for review.
https://gerrit.wikimedia.org/r/215666
Change subject: dumps: do xml stubs via streaming
......................................................................
dumps: do xml stubs via streaming
this works around php memory leaks, running dumps for small ranges
and feeding the output files to a compressor for final output,
instead of running the script once over the entire page range
which could run for up to a day or more
Change-Id: I36146797d5fb78717e3ea92125487793982a1ec0
---
M xmldumps-backup/worker.py
A xmldumps-backup/xmlstreams.py
A xmldumps-backup/xmlstubs.py
3 files changed, 381 insertions(+), 10 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/dumps
refs/changes/66/215666/1
diff --git a/xmldumps-backup/worker.py b/xmldumps-backup/worker.py
index 724910b..fe0c9a1 100644
--- a/xmldumps-backup/worker.py
+++ b/xmldumps-backup/worker.py
@@ -2837,16 +2837,10 @@
historyFile =
runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date,
self.historyDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp))
currentFile =
runner.dumpDir.filenamePublicPath(DumpFilename(runner.wiki, f.date,
self.currentDumpName, f.fileType, f.fileExt, f.chunk, f.checkpoint, f.temp))
scriptCommand =
MultiVersion.MWScriptAsArray(runner.wiki.config, "dumpBackup.php")
- command = [ "%s" % runner.wiki.config.php, "-q" ]
- command.extend(scriptCommand)
- command.extend(["--wiki=%s" % runner.dbName,
- "--full", "--stub", "--report=10000",
- "%s" % runner.forceNormalOption(),
- "--output=gzip:%s" % historyFile,
- "--output=gzip:%s" % currentFile,
- "--filter=latest", "--output=gzip:%s" %
articlesFile,
- "--filter=latest", "--filter=notalk",
"--filter=namespace:!NS_USER"
- ])
+
+ command = [ "/usr/bin/python", "xmlstubs.py", "--config",
runner.wiki.config.files[0], "--wiki", runner.dbName,
+ runner.forceNormalOption(), "--articles",
articlesFile,
+ "--history", historyFile, "--current", currentFile
]
if (f.chunk):
# set up start and end pageids for this piece
diff --git a/xmldumps-backup/xmlstreams.py b/xmldumps-backup/xmlstreams.py
new file mode 100644
index 0000000..83f718e
--- /dev/null
+++ b/xmldumps-backup/xmlstreams.py
@@ -0,0 +1,214 @@
+'''
+generate an xml dump via multiple runs of a php script instead of one
+long run.
+
+avoids memory leak issues, permits retries when a single run fails,
+recovery if db servers go away in the middle of a run by retrying
+the run.
+'''
+
+import os
+import sys
+import time
+import worker
+import WikiDump
+import getopt
+
+from subprocess import Popen, PIPE
+
+# TODO: fix all the error returns and make subroutines out of repeated logic;
+# the current code assembles one long command line with many options inline
+
def do_xml_stream(wikidb, outfiles, command, wikiconf, force_normal,
                  start, end, dryrun, id_field, table,
                  small_interval, max_interval, ends_with):
    '''
    do an xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output

    pieces are produced by rerunning the underlying php script over
    successive id ranges; the first run keeps the header and skips
    the footer, the middle runs are body only, and a final tiny run
    grabs just the footer
    '''
    if start is None:
        start = 1

    interval = None
    if end is None:
        end = get_max_id(wikiconf, wikidb, id_field, table)
        # if the whole wiki is small enough, take
        # arbitrary hopefully reasonable slices
        if start == 1 and end < 1000000:
            interval = small_interval

    if interval is None:
        # hope this is not too awful a guess
        interval = (int(end) - int(start)) // 12
        if interval > max_interval:
            interval = max_interval
    # guard against a zero-sized interval (id range smaller than 12);
    # integer division above would yield 0 and the loop below would
    # then never advance upto, spinning forever
    if interval < 1:
        interval = 1

    # first piece: header included, footer skipped
    piece_command = list(command)
    piece_command.extend(["--start", str(start)])
    piece_command.append("--skip-footer")
    if interval <= end:
        upto = interval + start
        piece_command.extend(["--end", str(upto)])
    else:
        upto = end

    do_xml_piece(piece_command, outfiles, ends_with, dryrun)

    # middle pieces: neither header nor footer
    while upto <= end:
        piece_command = list(command)
        piece_command.append("--skip-header")
        piece_command.extend(["--start=%s" % str(upto)])
        piece_command.append("--skip-footer")
        if upto + interval <= end:
            piece_command.extend(["--end", str(upto + interval)])
        else:
            piece_command.extend(["--end", str(end + 1)])
        upto = upto + interval
        do_xml_piece(piece_command, outfiles, ends_with, dryrun)

    # last run: get just the footer
    piece_command = list(command)
    piece_command.extend(["--skip-header", "--start=1", "--end=1"])

    do_xml_piece(piece_command, outfiles, dryrun=dryrun)

    # close the compressors' stdin so they can flush and finish
    for filetype in outfiles:
        outfiles[filetype]['compr'].stdin.close()

    for filetype in outfiles:
        outfiles[filetype]['compr'].wait()
+
+
def run_script(command, outfiles, shouldendwith=None):
    '''
    run the given command (an argument list); if shouldendwith is
    given, every existing non-empty temp output file must end with
    that string or it is removed and the run counts as failed

    returns True on success, False on failure
    '''
    process = Popen(command)
    # TODO: would be best for there to be a timeout here in case
    # the child hangs
    process.wait()
    if process.returncode:
        return False

    failed = False
    for filetype in outfiles:
        outfile = outfiles[filetype]['temp']
        if not os.path.exists(outfile):
            continue
        size = os.path.getsize(outfile)
        # file could be empty (all pages in the range deleted)
        if size == 0 or shouldendwith is None:
            continue
        if size < len(shouldendwith):
            # too short to even contain the expected closing tag;
            # treat as truncated output
            os.unlink(outfile)
            failed = True
            continue
        # binary mode: end-relative seeks are not permitted on
        # text-mode streams in python 3
        with open(outfile, 'rb') as outfd:
            outfd.seek(-len(shouldendwith), os.SEEK_END)
            remainder = outfd.read().decode('utf-8', 'replace')
        if remainder != shouldendwith:
            os.unlink(outfile)
            failed = True
    return not failed
+
+
def catfile(inputfile, process):
    '''
    stream the contents of inputfile into the stdin of the given
    process in 1MB chunks, then return; exits the whole script
    if the file is missing
    '''
    if not os.path.exists(inputfile):
        sys.stderr.write("no such file: %s\n" % inputfile)
        sys.exit(1)
    # binary mode so the content is passed through byte for byte
    # into the child's pipe
    with open(inputfile, "rb") as filed:
        while True:
            content = filed.read(1048576)
            if not content:
                # the with statement closes the file; no explicit
                # close needed
                break
            process.stdin.write(content)
+
+
def gzippit(outfile):
    '''
    start a gzip process that compresses its stdin to the specified
    file; returns the Popen object (the caller writes to .stdin,
    closes it, then wait()s for the process)
    '''
    # use an argument list plus an explicit output handle instead of
    # shell=True with an interpolated filename: paths containing
    # spaces or shell metacharacters would otherwise break (or be
    # interpreted by the shell)
    outfd = open(outfile, "wb")
    process = Popen(["gzip"], stdin=PIPE, stdout=outfd, bufsize=-1)
    # the child holds its own copy of the descriptor now
    outfd.close()
    return process
+
+
def get_max_id(wikiconf, wikidb, id_field, table):
    '''
    retrieve the largest id for this wiki from the db for a specific
    table; pass in name of id field, name of table

    retries the query a few times with a delay in case the db server
    goes away; exits the script if no value could be retrieved
    '''
    wiki = WikiDump.Wiki(wikiconf, wikidb)

    db_info = worker.DbServerInfo(wiki, wikidb)
    query = "select MAX(%s) from %s%s;" % (id_field, db_info.dBTablePrefix,
                                           table)
    end = 0
    maxretries = 5
    retries = 0
    # one initial attempt plus maxretries retries, all sharing the
    # same parse logic (the old code duplicated it and could return
    # 0 from the first attempt without the error exit below)
    while retries <= maxretries:
        if retries:
            # give the db server a moment to come back
            time.sleep(5)
        results = db_info.runSqlAndGetOutput(query)
        if results:
            lines = results.splitlines()
            # first line is the column header, second the value;
            # guard the index in case only the header came back
            if len(lines) > 1 and lines[1]:
                end = int(lines[1])
                break
        retries += 1

    if not end:
        sys.stderr.write("failed to get max page id from db, exiting\n")
        sys.exit(1)
    return end
+
+
def do_xml_piece(command, outfiles, ends_with=None, dryrun=False):
    '''
    run one piece of an xml dump, output going uncompressed
    to a temporary file and then that file being shovelled
    into the compressor's stdin

    we do three tries with increasing delay in between, in case
    the db server has issues or some other transient problem
    crops up; on final failure the temp files are removed, the
    compressors are closed off and the script exits
    '''

    if dryrun:
        sys.stderr.write("would run command: %s\n" % " ".join(command))
        return

    retries = 0
    maxretries = 3
    timeout = 60
    result = False
    while retries < maxretries:
        result = run_script(command, outfiles, ends_with)
        if result:
            break
        time.sleep(timeout)
        # back off a little more each time
        timeout = timeout * 2
        retries += 1

    if not result:
        sys.stderr.write("failed job after max retries\n")
        for filetype in outfiles:
            # the temp file may never have been created if the
            # child died early; don't let unlink raise and mask
            # the real failure
            if os.path.exists(outfiles[filetype]['temp']):
                os.unlink(outfiles[filetype]['temp'])
            # these partial output files can be used later with a
            # run that dumps the rest of the pages, and a recombine
            outfiles[filetype]['compr'].stdin.close()
        sys.exit(1)

    for filetype in outfiles:
        catfile(outfiles[filetype]['temp'], outfiles[filetype]['compr'])
        os.unlink(outfiles[filetype]['temp'])
diff --git a/xmldumps-backup/xmlstubs.py b/xmldumps-backup/xmlstubs.py
new file mode 100644
index 0000000..768ff9a
--- /dev/null
+++ b/xmldumps-backup/xmlstubs.py
@@ -0,0 +1,163 @@
+'''
+generate an xml dump via multiple runs of a php script instead of one
+long run.
+
+avoids memory leak issues, permits retries when a single run fails,
+recovery if db servers go away in the middle of a run by retrying
+the run.
+'''
+
+import os
+import sys
+import time
+import worker
+import WikiDump
+import getopt
+from xmlstreams import run_script, catfile, gzippit, get_max_id, do_xml_piece,
do_xml_stream
+
+
def dostubsbackup(wikidb, history_file, current_file, articles_file,
                  wikiconf, force_normal, start, end, dryrun):
    '''
    do a stubs xml dump one piece at a time, writing into uncompressed
    temporary files and shovelling those into gzip's stdin for the
    concatenated compressed output
    '''
    outfiles = {'history': {'name': history_file},
                'current': {'name': current_file},
                'articles': {'name': articles_file}}
    for filetype in outfiles:
        outfiles[filetype]['temp'] = os.path.join(
            wikiconf.tempDir,
            os.path.basename(outfiles[filetype]['name']) + "_tmp")
        # one long-lived gzip process per final output file; the
        # temp files from each run get catted into it
        outfiles[filetype]['compr'] = gzippit(outfiles[filetype]['name'])

    script_command = worker.MultiVersion.MWScriptAsArray(wikiconf,
                                                         "dumpBackup.php")
    command = [wikiconf.php, "-q"] + script_command

    command.extend(["--wiki=%s" % wikidb,
                    "--full", "--stub", "--report=10000",
                    "--output=file:%s" % outfiles['history']['temp'],
                    "--output=file:%s" % outfiles['current']['temp'],
                    "--filter=latest",
                    "--output=file:%s" % outfiles['articles']['temp'],
                    "--filter=latest", "--filter=notalk",
                    "--filter=namespace:!NS_USER"
                    ])
    # force_normal arrives as a boolean (False by default); the old
    # "is not None" check meant --force-normal was always passed
    if force_normal:
        command.append("--force-normal")

    do_xml_stream(wikidb, outfiles, command, wikiconf, force_normal,
                  start, end, dryrun, 'page_id', 'page',
                  100000, 500000, '</page>\n')
+
+
def usage(message=None):
    """
    write a usage summary to stderr, preceded by an optional
    error message, and terminate the script with exit status 1
    """
    if message is not None:
        sys.stderr.write(message + "\n")
    sys.stderr.write("""
Usage: xmlstubs.py --wiki wikidbname --articles path --current path
          --history path [--start number] [--end number]
          [--force-normal bool] [--config path]

Options:

  --wiki (-w): wiki db name, e.g. enwiki
  --articles (-a): full path of articles xml stub dump that will be created
  --current (-c): full path of current pages xml stub dump that will be created
  --history (-h): full path of xml stub dump with full history that will be created

  --start (-s): starting page to dump (default: 1)
  --end (-e): ending page to dump (default: dump all)

  --force-normal (-f): if set, this argument will be passed through to dumpBackup.php
                 (default: unset)
  --config (-C): path to wikidump configfile (default: "wikidump.conf" in current dir)
  --dryrun (-d): display the commands that would be run to produce the output but
                 don't actually run them
""")
    sys.exit(1)
+
+
def main():
    '''
    main entry point: parse command line options, validate them,
    load the wiki configuration and kick off the stubs dump
    '''
    wiki = None
    articles_file = None
    current_file = None
    history_file = None
    start = None
    end = None
    force_normal = False
    dryrun = False
    configfile = "wikidump.conf"

    try:
        # note: 'h' was previously listed twice in the shortopts
        # string (once with an argument for --history, once bare for
        # --help), which made the help branch unreachable; -h is
        # --history, help is long-form only
        (options, remainder) = getopt.gnu_getopt(
            sys.argv[1:], "w:a:c:h:s:e:C:fd",
            ["wiki=", "articles=", "current=", "history=",
             "start=", "end=", "config=", "force-normal",
             "help", "dryrun"])

    except getopt.GetoptError as err:
        usage("Unknown option specified: " + str(err))

    for (opt, val) in options:
        if opt in ["-w", "--wiki"]:
            wiki = val
        elif opt in ["-a", "--articles"]:
            articles_file = val
        elif opt in ["-c", "--current"]:
            current_file = val
        elif opt in ["-h", "--history"]:
            history_file = val
        elif opt in ["-s", "--start"]:
            start = val
        elif opt in ["-e", "--end"]:
            end = val
        elif opt in ["-f", "--force-normal"]:
            force_normal = True
        elif opt in ["-C", "--config"]:
            configfile = val
        elif opt in ["-d", "--dryrun"]:
            dryrun = True
        elif opt == "--help":
            usage('Help for this script\n')
        else:
            usage("Unknown option specified: <%s>" % opt)

    if len(remainder) > 0:
        usage("Unknown option(s) specified: <%s>" % remainder[0])

    if wiki is None:
        usage("mandatory argument missing: --wiki")
    if articles_file is None:
        usage("mandatory argument missing: --articles")
    if current_file is None:
        usage("mandatory argument missing: --current")
    if history_file is None:
        usage("mandatory argument missing: --history")

    if start is not None:
        if not start.isdigit():
            usage("value for --start must be a number")
        else:
            start = int(start)

    if end is not None:
        if not end.isdigit():
            usage("value for --end must be a number")
        else:
            end = int(end)

    if not os.path.exists(configfile):
        usage("no such file found: " + configfile)

    wikiconf = WikiDump.Config(configfile)
    wikiconf.parseConfFilePerProject(wiki)
    dostubsbackup(wiki, history_file, current_file, articles_file, wikiconf,
                  force_normal, start, end, dryrun)


if __name__ == '__main__':
    main()
--
To view, visit https://gerrit.wikimedia.org/r/215666
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I36146797d5fb78717e3ea92125487793982a1ec0
Gerrit-PatchSet: 1
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits