ArielGlenn has uploaded a new change for review. https://gerrit.wikimedia.org/r/61765
Change subject: script to shovel in pages-logging xml file into logging table ...................................................................... script to shovel in pages-logging xml file into logging table this script is for testing purposes only (populating the logging table from a pages-logging xml file in order to test xml dumps); it also populates the user table with mostly crap data except for the user name and id as retrieved from the pages-logging xml file. Change-Id: Ibe5ab1d8044a2faa60b1f1097dab1c40a553ebe3 --- A xmlfileutils/scripts/pageslogging2sql.py 1 file changed, 441 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/dumps refs/changes/65/61765/1 diff --git a/xmlfileutils/scripts/pageslogging2sql.py b/xmlfileutils/scripts/pageslogging2sql.py new file mode 100644 index 0000000..d43be39 --- /dev/null +++ b/xmlfileutils/scripts/pageslogging2sql.py @@ -0,0 +1,441 @@ +# -*- coding: utf-8 -*- +import os, re, sys, getopt, urllib, gzip, bz2, subprocess, json, time, select, shutil, string +from wikifile import File + +class WikiContentErr(Exception): + pass + +class NsDict(object): + + def __init__(self, langCode, project, verbose = False): + """Constructor. Arguments: + langCode -- language code of project, like en el etc. + project -- type of project, like wiktionary, wikipedia, etc. + verbose -- display progress messages""" + self.langCode = langCode + self.project = project + self.verbose = verbose + + def getNsDict(self): + """Retrieve namespace informtion for a wiki via the MediaWiki api + and store in in dict form. + On error raises an exception.""" + + # http://en.wikipedia.org/w/api.php?action=query&meta=siteinfo&siprop=namespaces&format=json + apiUrl = "http://" + self.langCode + "." + self.project + "." + "org/w/api.php" + "?action=query&meta=siteinfo&siprop=namespaces&format=json" + nsDict = {} + ufd = urllib.urlopen(apiUrl) + if str(ufd.getcode()).startswith("2"): + output = ufd.read() + ufd.close() + siteInfo = json.loads(output) + if 'query' not in siteInfo or 'namespaces' not in siteInfo['query']: + raise WikiContentErr("Error trying to get namespace information from api\n") + for k in siteInfo['query']['namespaces'].keys(): + if '*' in siteInfo['query']['namespaces'][k]: + nsDict[k] = siteInfo['query']['namespaces'][k]['*'].encode('utf8') + else: + raise WikiContentErr("Error trying to get parse namespace information\n") + return nsDict + else: + code = ufd.getcode() + ufd.close() + raise WikiContentErr("Error trying to retrieve namespace info: %s\n" % code); + + return nsDict + +class TitlesDict(object): + def __init__(self, nsDictByString): + """Constructor. Arguments: + nsDictByString -- hash of nstitle => nsnum""" + self.nsDictByString = nsDictByString + + def getTitlesDict(self,sqlFile): + """Arguments: + sqlFile -- file containing pageid whitespace nsnum whitespace pagetitle where the title + is expected to be sql escaped and can be enclosed with single quotes""" + fd = File.openInput(sqlFile) + t = {} + for line in fd: + (pageid, ns, title) = line.split(' ',3) + ns = int(ns) + if title in t: + t[title][ns] = pageid + else: + t[title] = { ns: pageid } + return t + +class LoggingXml(object): + def __init__(self, nsDictByString, titlesDict, xmlFile, outputFile, userOutFile): + """Constructor. Arguments: + nsDictByString -- hash of nstitle => nsnum + titlesDict -- hash of pagetitle => [ pageid, nsnum ] + xmlFile -- path to filename with logging.xml + logOutFile -- path to logging output filename""" + + self.nsDictByString = nsDictByString + self.titlesDict = titlesDict + self.xmlFile = xmlFile + self.logOutFile = logOutFile + self.userOutFile = userOutFile + + self.logitemPattern = "^\s*<logitem>\s*\n$" + self.compiledLogitemPattern = re.compile(self.logitemPattern) + self.idPattern = "^\s*<id>(?P<i>.+)</id>\s*\n$" + self.compiledIdPattern = re.compile(self.idPattern) + self.timestampPattern = "^\s*<timestamp>(?P<t>.+)</timestamp>\s*\n$" + self.compiledTimestampPattern = re.compile(self.timestampPattern) + self.contributorPattern = "^\s*<contributor>\n$" + self.compiledContributorPattern = re.compile(self.contributorPattern) + self.usernamePattern = "^\s*<username>(?P<u>.+)</username>\s*\n$" + self.compiledUsernamePattern = re.compile(self.usernamePattern) + self.endContributorPattern = "^\s*</contributor>\n$" + self.compiledEndContributorPattern = re.compile(self.endContributorPattern) + self.commentPattern = "^\s*<comment>(?P<c>.+)</comment>\s*\n$" + self.compiledCommentPattern = re.compile(self.commentPattern, re.DOTALL) + self.typePattern = "^\s*<type>(?P<t>.+)</type>\s*\n$" + self.compiledTypePattern = re.compile(self.typePattern) + self.actionPattern = "^\s*<action>(?P<a>.+)</action>\s*\n$" + self.compiledActionPattern = re.compile(self.actionPattern) + self.logtitlePattern = "^\s*<logtitle>(?P<l>.+)</logtitle>\s*\n$" + self.compiledLogtitlePattern = re.compile(self.logtitlePattern) + self.paramsPattern = '^\s*<params\s+xml:space="preserve">(?P<p>.+)</params>\s*\n$' + self.compiledParamsPattern = re.compile(self.paramsPattern, re.DOTALL) + self.noParamsPattern = '^\s*<params\s+xml:space="preserve" />\s*\n$' + self.compiledNoParamsPattern = re.compile(self.noParamsPattern) + self.endLogitemPattern = "^\s*</logitem>\s*\n$" + self.compiledEndLogitemPattern = re.compile(self.endLogitemPattern) + self.all=string.maketrans('','') + self.nodigs=self.all.translate(self.all, string.digits) + + def skipHeader(self, fd): + """skip over mediawiki site header etc""" + endHeaderPattern = "^\s*</siteinfo>" + compiledEndHeaderPattern = re.compile(endHeaderPattern) + for line in fd: + if compiledEndHeaderPattern.match(line): + return True + return False # never found it + + def unXMLEscape(self, title): + """Convert XML sanitized title to its regular format. + This expects no newlines, \r or \t in titles and unescapes + these characters: & " ' < > + Arguments: + title -- title to be desantized""" + + title = title.replace(""", '"') + title = title.replace("<", '<') + title = title.replace(">", '>') + title = title.replace("'", "'") + title = title.replace("&", '&') # this one must be last + return title + + def sqlEscape(self, string, underscores = True): + """Escape string in preparation for it to be written + to an sql file for import. + $wgLegalTitleChars = " %!\"$&'()*,\\-.\\/0-9:;=?@A-Z\\\\^_`a-z~\\x80-\\xFF+"; + Escapes these characters: ' " \ by adding leading \ + Note that in the database all titles are stored with underscores instead of spaces + so replace those; also enclose the title in single quotes + Arguments: + string -- string of to escape""" + + string = string.replace('\\', "\\\\") # must insert new backslashs after this step + string = string.replace("\'", "\\'") + string = string.replace('"', '\\"') + if underscores: + string = string.replace(' ', '_') + return "'" + string + "'" + + # format: + # <logitem> + # <id>1</id> + # <timestamp>2005-02-26T19:37:52Z</timestamp> + # <contributor> + # <username>Leonariso</username> + # <id>3</id> + # </contributor> + # <comment>content was: ''''Έντονης γραφής κείμενο'''''Πλάγιας γραφής κείμενο''[[Τίτλος σύνδεσης]]== Headline text ==[[...'</comment> + # <type>delete</type> + # <action>delete</action> + # <logtitle>Βικιλεξικό:By topic</logtitle> + # <params xml:space="preserve" /> + # </logitem> + def doLogItem(self, fd, logOutFd, userOutFd): + # note that it's possible for a comment or the params to have an embedded newline in them + # the rest of the fields, no + + line = fd.readline() + result = self.compiledLogitemPattern.match(line) + if not result: + if "</mediawiki" in line: + return True # eof + else: + raise WikiContentErr("bad line in logging file, expected <logitem>, found <%s>\n" % line) + + line = fd.readline() + result = self.compiledIdPattern.match(line) + if not result: + raise WikiContentErr("bad line in logging file, expected <id>, found <%s>\n" % line) + logid = result.group("i") + + line = fd.readline() + result = self.compiledTimestampPattern.match(line) + if not result: + raise WikiContentErr("bad line in logging file, expected <timestamp>, found <%s>\n" % line) + timestamp = result.group("t") + + line = fd.readline() + result = self.compiledContributorPattern.match(line) + if not result: + if not "<contributor deleted" in line: + raise WikiContentErr("bad line in logging file, expected <contributor>, found <%s>\n" % line) + else: + username = '' + userid = '0' + else: + line = fd.readline() + result = self.compiledUsernamePattern.match(line) + if not result: + raise WikiContentErr("bad line in logging file, expected <username>, found <%s>\n" % line) + username = result.group("u") + + line = fd.readline() + result = self.compiledIdPattern.match(line) + if not result: + raise WikiContentErr("bad line in logging file, expected <id>, found <%s>\n" % line) + userid = result.group("i") + + line = fd.readline() + result = self.compiledEndContributorPattern.match(line) + if not result: + raise WikiContentErr("bad line in logging file, expected </contributor>, found <%s>\n" % line) + + line = fd.readline() + if "<comment>" not in line: + # apparently comment is optional. OR it can be 'deleted'. wonderful. + if "<comment deleted" in line: + line = fd.readline() + comment = '' + else: + while "</comment>" not in line: + line = line + fd.readline() + result = self.compiledCommentPattern.match(line) + if not result: + raise WikiContentErr("bad line in logging file, expected <comment>, found <%s>\n" % line) + comment = result.group("c") + line = fd.readline() + + result = self.compiledTypePattern.match(line) + if not result: + raise WikiContentErr("bad line in logging file, expected <type>, found <%s>\n" % line) + type = result.group("t") + + line = fd.readline() + result = self.compiledActionPattern.match(line) + if not result: + raise WikiContentErr("bad line in logging file, expected <action>, found <%s>\n" % line) + action = result.group("a") + + line = fd.readline() + result = self.compiledLogtitlePattern.match(line) + if not result: + if "<text deleted" in line: + logtitle = '' + else: + raise WikiContentErr("bad line in logging file, expected <logtitle>, found <%s>\n" % line) + else: + logtitle = result.group("l") + + line = fd.readline() + # do the no params case first + result = self.compiledNoParamsPattern.match(line) + if result: + params = '' + line = fd.readline() + else: + if "<params" in line: + # ok it has some params, possibly over more than one line + while "</params>" not in line: + line = line + fd.readline() + result = self.compiledParamsPattern.match(line) + if not result: + raise WikiContentErr("bad line in logging file, expected <params xml:space=\"preserve\" />, found <%s> for %s\n" % (line, logtitle)) + else: + params = result.group("p") + line = fd.readline() + else: # it's some other tag, this elt was missing altogether + params = '' + + result = self.compiledEndLogitemPattern.match(line) + if not result: + raise WikiContentErr("bad line in logging file, expected </logitem>, found <%s>\n" % line) + + # turn logtitle into pageid, namespace, title-with-no-namespace-prefix + sep = logtitle.find(":") + if sep != -1: + prefix = logtitle[:sep] + if prefix in self.nsDictByString: + pagetitle = self.sqlEscape(self.unXMLEscape(logtitle[sep+1:])) + nsnum = self.nsDictByString[prefix] + if pagetitle in self.titlesDict: + pageid = self.titlesDict[pagetitle][nsnum] + else: + pageid = "NULL" + else: + pagetitle = self.sqlEscape(self.unXMLEscape(logtitle)) + nsnum = 0 + if pagetitle in self.titlesDict: + pageid = self.titlesDict[pagetitle][0] + else: + pageid = "NULL" + else: + pagetitle = self.sqlEscape(self.unXMLEscape(logtitle)) + nsnum = 0 + if pagetitle in self.titlesDict: + pageid = self.titlesDict[pagetitle][0] + else: + pageid = "NULL" + + comment = self.sqlEscape(self.unXMLEscape(comment), False) + username = self.sqlEscape(self.unXMLEscape(username), False) + params = self.sqlEscape(self.unXMLEscape(params), False) + + line = "INSERT INTO logging ( log_id, log_type, log_action, log_timestamp, log_user, log_user_text, log_namespace, log_title, log_page, log_comment, log_params, log_deleted ) VALUES " + logOutFd.write(unicode(line).encode('utf-8')) + username = username.decode('utf-8') + pagetitle = pagetitle.decode('utf-8') + comment = comment.decode('utf-8') + params = params.decode('utf-8') + nsnum = str(nsnum) + # need 20130425122902, have 2005-07-23T16:43:37Z + timestamp = timestamp.translate(self.all, self.nodigs) + + line = "( %s );\n" % ", ".join([ logid, "'" + type+ "'", "'" + action + "'" , "'" + timestamp+ "'", userid, username, nsnum, pagetitle, pageid, comment, params, '0' ]) + logOutFd.write(unicode(line).encode('utf-8')) + + if self.userOutFile and userid not in self.userDict: + line = "INSERT INTO user ( user_id, user_name, user_real_name, user_password, user_newpassword, user_newpass_time, user_email, user_touched, user_token, user_email_authenticated, user_email_token, user_email_token_expires, user_registration, user_editcount ) VALUES " + userOutFd.write(unicode(line).encode('utf-8')) + line = "( %s );\n" % ", ".join([ userid, username, "''", "''", "''", "NULL", "''", "'20010101000000'", "'6f9b27b447a7fd49bc525e51cc82320b'", "NULL", "NULL", "NULL", "NULL", "0" ]) + userOutFd.write(unicode(line).encode('utf-8')) + + self.userDict[userid] = True + + return False + + def writeSql(self): + self.userDict = { 1: True } + fd = File.openInput(self.xmlFile) + logOutFd = File.openOutput(self.logOutFile) + if self.userOutFile: + userOutFd = File.openOutput(self.userOutFile) + else: + userOutFd = None + if not self.skipHeader(fd): + raise WikiContentErr("failed to find end of mediawiki/siteinfo header in xml file\n") + eof = False + while not eof: + eof = self.doLogItem(fd, logOutFd, userOutFd) + fd.close() + logOutFd.close() + if self.userOutFile: + userOutFd.close() + return + +def usage(message = None): + """Show usage and help information. Arguments: + message -- message to be shown (e.g. error message) before the help""" + + if message: + sys.stderr.write(message) + sys.stderr.write("\n") + sys.stderr.write("Usage: python %s --lang langcode --project filename --sqlfile filename\n" % sys.argv[0]) + sys.stderr.write(" --logfile filename --logout filename [--userout filename]\n") + sys.stderr.write("\n") + sys.stderr.write("This script converts a pages-logging.xml file to an sql file suitable for\n") + sys.stderr.write("import into the logging table of a MediaWiki installation.\n") + sys.stderr.write("It may get some things wrong because page ids of page titles can change\n") + sys.stderr.write("over time and this program isn't clever about how it looks that up.\n") + sys.stderr.write("We needed this script for testing logging dumps. If you need it for production\n") + sys.stderr.write("purposes, better test it carefully.\n") + sys.stderr.write("\n") + sys.stderr.write("Options:\n") + sys.stderr.write("\n") + sys.stderr.write("--lang the language code of the project from which the logging table was dumped,\n") + sys.stderr.write(" i.e. en, fr, el etc.\n") + sys.stderr.write("--project the type of wiki from which the logging table was dumped, i.e. wikipedia,\n") + sys.stderr.write(" wiktionary, wikisource, etc.\n") + sys.stderr.write("--sqlfile path to an sql fle containing fields pageid namespacenum pagetitle space-\n") + sys.stderr.write(" separated and one triple per line, pagetitle should be sql escaped as it\n") + sys.stderr.write(" would be if written out by mysqldump, and it should not contain the namespace\n") + sys.stderr.write(" prefix.\n") + sys.stderr.write("--loggingfile path to the xml pages-logging file to be converted\n") + sys.stderr.write("--logout path to the file where the converted sql will be written\n") + sys.stderr.write("--userout path to file where fake user table sql will be written, if specified\n") + sys.stderr.write(" the user table is used when generating xml dumps of the log table;\n") + sys.stderr.write(" any user id found in the logging sql file with non null username will\n") + sys.stderr.write(" be added except for the user with uid 1, yes this is a hack\n") + sys.stderr.write(" Make sure that there are no other users except uid 1 already in the table\n") + sys.stderr.write(" and that the username is not in the produced sql BEFORE using it for import\n") + sys.exit(1) + +if __name__ == "__main__": + langCode = None + project = None + sqlFile = None + loggingFile = None + logOutFile = None + userOutFile = None + + try: + (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", [ "lang=", "project=", "sqlfile=", "loggingfile=", "logout=", "userout=" ] ) + except: + usage("Unknown option specified") + + for (opt, val) in options: + + # main opts + if opt == "--lang": + langCode = val; + elif opt == "--project": + project = val + elif opt == "--sqlfile": + sqlFile = val + elif opt == "--loggingfile": + loggingFile = val + elif opt == "--logout": + logOutFile = val + elif opt == "--userout": + userOutFile = val + else: + usage("Unknown option specified: %s" % opt ) + + if len(remainder) > 0: + usage("Unknown option specified: <%s>" % remainder[0]) + + if not langCode: + usage("Missing mandatory option <%s>" % "lang") + if not sqlFile: + usage("Missing mandatory option <%s>" % "sqlfile") + if not project: + usage("Missing mandatory option <%s>" % "project") + if not loggingFile: + usage("Missing mandatory option <%s>" % "loggingfile") + if not logOutFile: + usage("Missing mandatory option <%s>" % "logout") + + ns = NsDict(langCode, project) + nsDict = ns.getNsDict() + + nsDictByString = {} + for nsnum in nsDict.keys(): + nsDictByString[nsDict[nsnum]] = nsnum + + td = TitlesDict(nsDictByString) + titlesDict = td.getTitlesDict(sqlFile) + lx = LoggingXml(nsDictByString, titlesDict, loggingFile, logOutFile, userOutFile) + lx.writeSql() + + -- To view, visit https://gerrit.wikimedia.org/r/61765 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ibe5ab1d8044a2faa60b1f1097dab1c40a553ebe3 Gerrit-PatchSet: 1 Gerrit-Project: operations/dumps Gerrit-Branch: ariel Gerrit-Owner: ArielGlenn <ar...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits