http://www.mediawiki.org/wiki/Special:Code/MediaWiki/90557
Revision: 90557
Author: halfak
Date: 2011-06-21 22:47:39 +0000 (Tue, 21 Jun 2011)
Log Message:
-----------
added wikimedia utilities
Modified Paths:
--------------
trunk/tools/wsor/ts_samples/testing.sql
Added Paths:
-----------
trunk/tools/wsor/wikimedia/
trunk/tools/wsor/wikimedia/setup.py
trunk/tools/wsor/wikimedia/wmf/
trunk/tools/wsor/wikimedia/wmf/__init__.py
trunk/tools/wsor/wikimedia/wmf/dump/
trunk/tools/wsor/wikimedia/wmf/dump/__init__.py
trunk/tools/wsor/wikimedia/wmf/dump/iterator.py
trunk/tools/wsor/wikimedia/wmf/dump/map.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/
trunk/tools/wsor/wikimedia/wmf/dump/tests/__init__.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/large.xml.lzma
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/small.xml.lzma
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py
trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py
trunk/tools/wsor/wikimedia/wmf/util.py
Removed Paths:
-------------
trunk/tools/wsor/scripts/process_dumps.py
Deleted: trunk/tools/wsor/scripts/process_dumps.py
===================================================================
--- trunk/tools/wsor/scripts/process_dumps.py 2011-06-21 22:44:54 UTC (rev 90556)
+++ trunk/tools/wsor/scripts/process_dumps.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -1,186 +0,0 @@
-import sys, logging, re, types, argparse, os, subprocess
-from multiprocessing import Process, Queue, Lock, cpu_count, Value
-from Queue import Empty
-from gl import wp
-
-class FileTypeError(Exception):pass
-
-def encode(v):
- if type(v) == types.FloatType:
- return str(int(v))
- elif v == None:
- return "\\N"
- else:
- return repr(v)
-
-
-
-class SafeOutput:
-
- def __init__(self, fp):
- self.fp = fp
- self.l = Lock()
-
- def push(self, row, encode=encode):
- if __debug__:
- row = tuple(row)
-
- with self.l:
- self.fp.write("\t".join(clean(v) for v in row) + "\n")
-
-class Processor(Process):
-
- def __init__(self, input, processPage, output, callback, logger):
- self.input = input
- self.processPage = processPage
- self.output = output
- self.callback = callback
- self.logger = logger
- Process.__init__(self)
-
- def run(self):
- try:
- while True:
- foo = self.input.qsize()
- fn = self.input.get(block=False)
- self.logger.info("Processing dump file %s." %
fn)
- dump = wp.dump.Iterator(openDumpFile(fn))
- for page in dump.readPages():
- self.logger.debug("Processing page
%s:%s." % (page.getId(), page.getTitle()))
- try:
- for out in self.processPage(dump, page):
- self.output.put(out)
- except Exception as e:
- self.logger.error(
- "Failed to process page
%s:%s - %s" % (
- page.getId(),
- page.getTitle(),
- e
- )
- )
-
-
-
-
- except Empty:
- self.logger.info("Nothing left to do. Shutting down
thread.")
- finally:
- self.callback()
-
-
-
-
-def main(args):
- LOGGING_STREAM = sys.stderr
- if __debug__: level = logging.DEBUG
- else: level = logging.INFO
- logging.basicConfig(
- level=level,
- stream=LOGGING_STREAM,
- format='%(name)s: %(asctime)s %(levelname)-8s %(message)s',
- datefmt='%b-%d %H:%M:%S'
- )
- logging.info("Starting dump processor with %s threads." %
min(args.threads, len(args.dump)))
- for row in process_dumps(args.dump, args.processor.process, args.threads):
- print('\t'.join(encode(v) for v in row))
-
-def process_dumps(dumps, processPage, threads):
- input = dumpFiles(dumps)
- output = Queue(maxsize=10000)
- running = Value('i', 0)
-
- def dec(): running.value -= 1
-
- for i in range(0, min(threads, input.qsize())):
- running.value += 1
- Processor(
- input,
- processPage,
- output,
- dec,
- logging.getLogger("Process %s" % i)
- ).start()
-
-
- #output while processes are running
- while running.value > 0:
- try: yield output.get(timeout=.25)
- except Empty: pass
-
- #finish yielding output buffer
- try:
- while True: yield output.get(block=False)
- except Empty:
- pass
-
-
-
-EXTENSIONS = {
- 'xml': "cat",
- 'bz2': "bzcat",
- '7z': "7z e -so 2>/dev/null",
- 'lzma':"lzcat"
-}
-
-EXT_RE = re.compile(r'\.([^\.]+)$')
-def dumpFile(path):
- path = os.path.expanduser(path)
- if not os.path.isfile(path):
- raise FileTypeError("Can't find file %s" % path)
-
- match = EXT_RE.search(path)
- if match == None:
- raise FileTypeError("No extension found for %s." % path)
- elif match.groups()[0] not in EXTENSIONS:
- raise FileTypeError("File type %r is not supported." % path)
- else:
- return path
-
-def dumpFiles(paths):
- q = Queue()
- for path in paths: q.put(dumpFile(path))
- return q
-
-def openDumpFile(path):
- match = EXT_RE.search(path)
- ext = match.groups()[0]
- p = subprocess.Popen(
- "%s %s" % (EXTENSIONS[ext], path),
- shell=True,
- stdout=subprocess.PIPE
- )
- return p.stdout
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description='Maps a function across pages of MediaWiki dump files'
- )
- parser.add_argument(
- '-o', '--out',
- metavar="<path>",
- type=lambda path:open(path, "w"),
- help='the path to an output file to write output to (defaults to stdout)',
- default=sys.stdout
- )
- parser.add_argument(
- '-t', '--threads',
- metavar="",
- type=int,
- help='the number of threads to start (defaults to # of cores -1)',
- default=cpu_count()-1
- )
- parser.add_argument(
- 'processor',
- type=__import__,
- help='the class path to the function to use to process each page'
- )
- parser.add_argument(
- 'dump',
- type=dumpFile,
- help='the XML dump file(s) to process',
- nargs="+"
- )
- args = parser.parse_args()
- main(args)
-
Modified: trunk/tools/wsor/ts_samples/testing.sql
===================================================================
--- trunk/tools/wsor/ts_samples/testing.sql 2011-06-21 22:44:54 UTC (rev 90556)
+++ trunk/tools/wsor/ts_samples/testing.sql 2011-06-21 22:47:39 UTC (rev 90557)
@@ -40,3 +40,30 @@
CREATE UNIQUE INDEX user_id_idx ON halfak.user_meta (user_id);
CREATE INDEX first_edit_idx ON halfak.user_meta (first_edit);
CREATE INDEX last_edit_idx ON halfak.user_meta (last_edit);
+
+
+SELECT
+ year,
+ biannual,
+ count(*)
+FROM
+(
+SELECT
+ u.user_id,
+ SUBSTRING(first_edit, 1,4) as year,
+ SUBSTRING(first_edit, 5,2) >= "07" as biannual
+FROM halfak.user_meta um
+INNER JOIN user u
+ ON u.user_id = um.user_id
+INNER JOIN page p
+ ON p.page_title = u.user_name
+ AND p.page_namespace = 3
+INNER JOIN revision r
+ ON um.user_id != r.rev_user
+ AND p.page_id = r.rev_page
+GROUP BY
+ user_id,
+ SUBSTRING(first_edit, 1,4),
+ SUBSTRING(first_edit, 5,2)
+) as foo
+GROUP BY year, biannual;
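For reference, the added query buckets each user by the half-year of their first edit. A minimal Python sketch of the same bucketing logic (the sample timestamp is made up):

    # Hypothetical illustration of the cohort bucketing used in the query above.
    first_edit = "20080519014153"        # MediaWiki-style YYYYMMDDHHMMSS timestamp
    year = first_edit[0:4]               # SUBSTRING(first_edit, 1, 4)
    biannual = first_edit[4:6] >= "07"   # SUBSTRING(first_edit, 5, 2) >= "07"
    # -> year == "2008", biannual == False: a first edit in the first half of 2008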
Added: trunk/tools/wsor/wikimedia/setup.py
===================================================================
--- trunk/tools/wsor/wikimedia/setup.py (rev 0)
+++ trunk/tools/wsor/wikimedia/setup.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,27 @@
+
+from setuptools import setup, find_packages
+
+setup(
+ name='util',
+ version='1.0',
+ description="WMF utilities",
+ long_description="""
+ A set of utilities originally authored by Aaron Halfaker
+ during the 2011 Wikimedia Summer of Research. The utilities
+ in this package are intended to aid in processing of
+ MediaWiki data related to Wikimedia projects. Many of the
+ utilities have been specifically designed to allow
+ processing of the massive amount of data (currently) found
+ in the full history dump of the English Wikipedia
+ """
+ author='Aaron Halfaker',
+ author_email='[email protected]',
+ url='http://meta.wikimedia.org/wiki/User:EpochFail',
+ packages=find_packages(),
+ entry_points = {
+ 'distutils.commands': [
+ 'dump_map = util.dump.map:main',
+ ]
+ },
+
+)
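Assuming a standard setuptools checkout, installing with `python setup.py install` makes the package importable. A minimal sketch (the sample timestamp is made up):

    # find_packages() picks up the `wmf` package below, and wmf/__init__.py
    # re-exports the util functions.
    import wmf
    print(wmf.wp2Timestamp("20080519014153"))  # seconds since the Unix epoch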
Added: trunk/tools/wsor/wikimedia/wmf/__init__.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/__init__.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/__init__.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,2 @@
+from __future__ import absolute_import
+from .util import *
Added: trunk/tools/wsor/wikimedia/wmf/dump/__init__.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/__init__.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/__init__.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,2 @@
+from .iterator import Iterator
+from .map import map
Added: trunk/tools/wsor/wikimedia/wmf/dump/iterator.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/iterator.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/iterator.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,220 @@
+from .xml_iterator import XMLIterator
+from ..util import wp2Timestamp
+
+def cleanTag(prefix, raw):
+ return raw[len(prefix):]
+
+
+class Iterator:
+ """
+ WikiFile dump processor. This class is constructed with a file pointer to a
+ Wikipedia XML dump file.
+
+ """
+
+ def __init__(self, fp):
+ """
+ Constructor
+
+ :Parameters:
+ fp : file pointer
+ a file pointer to the xml file to process.
+ """
+
+ self.fp = fp #:The file pointer passed to the constructor
+ self.namespaces = {} #:A map of possible namespaces
+ self.siteName = None #:The name of the site
+ self.base = None #:Base of the xml file
+ self.generator = None #:Generator of the dump
+ self.case = None #:The default title case
+
+ self.mediawikiElement = XMLIterator(fp)
+ self.ns = self.mediawikiElement.tag[:-len('mediawiki')]
+
+ pageCount = 0
+ done = False
+ for element in self.mediawikiElement:
+ tag = cleanTag(self.ns, element.tag)
+ if tag == "siteinfo":
+ self.loadSiteInfo(element)
+ element.clear()
+ break
+
+
+
+ def loadSiteInfo(self, siteInfoElement):
+ for element in siteInfoElement:
+ tag = cleanTag(self.ns, element.tag)
+
+ if tag == 'sitename':
+ self.siteName = element.text
+ elif tag == 'base':
+ self.base = element.text
+ elif tag == 'generator':
+ self.generator = element.text
+ elif tag == 'case':
+ self.case = element.text
+ elif tag == 'namespaces':
+ self.loadNamespaces(element)
+ element.clear()
+
+
+
+ def loadNamespaces(self, namespacesElement):
+ for element in namespacesElement:
+ tag = cleanTag(self.ns, element.tag)
+
+ if tag == "namespace":
+ namespace = Namespace(element)
+ self.namespaces[namespace.getName()] = namespace.getId()
+ else:
+ assert False, "This should never happen"
+
+
+ def readPages(self):
+ for element in self.mediawikiElement:
+ tag = cleanTag(self.ns, element.tag)
+ if tag == "page":
+ yield Page(self.ns, element)
+
+
+
+
+class Namespace:
+
+ def __init__(self, nsElement):
+ self.setId(nsElement.get('key'))
+ self.setName(nsElement.text)
+
+ def setId(self, id): self.id = int(id)
+ def getId(self): return self.id
+
+ def setName(self, name):
+ if name == None:
+ self.name = None
+ else:
+ self.name = unicode(name)
+ def getName(self): return self.name
+
+ def __repr__(self):
+ return "%s(%r, %r)" % (
+ self.__class__.__name__,
+ self.getId(),
+ self.getName()
+ )
+
+ def __eq__(self, other):
+ try:
+ return (
+ self.getId() == other.getId() and
+ self.getName() == other.getName()
+ )
+ except AttributeError:
+ return False
+
+class Page:
+
+ def __init__(self, ns, pageElement):
+ self.id = None
+ self.title = None
+ self.pageElement = pageElement
+ self.ns = ns
+ for element in pageElement:
+ tag = cleanTag(ns, element.tag)
+ if tag == "id":
+ self.setId(element.text)
+ elif tag == "title":
+ self.setTitle(element.text)
+
+ if self.id != None and self.title != None:
+ break
+
+ def readRevisions(self):
+ for element in self.pageElement:
+ tag = cleanTag(self.ns, element.tag)
+ if tag == "revision":
+ yield Revision(self.ns, element)
+ #element.clear()
+
+
+
+ def setId(self, id): self.id = int(id)
+ def getId(self): return self.id
+
+ def setTitle(self, title): self.title = unicode(title)
+ def getTitle(self): return self.title
+
+
+
+class Revision:
+
+ TAG_MAP = {
+ 'id': lambda s,e:s.setId(e.text),
+ 'timestamp': lambda s,e:s.setTimestamp(e.text),
+ 'contributor': lambda s,e:s.setContributor(e),
+ 'minor': lambda s,e:s.setMinor(True),
+ 'comment': lambda s,e:s.setComment(e.text),
+ 'text': lambda s,e:s.setText(e.text)
+ }
+
+ def __init__(self, ns, revisionElement):
+ self.ns = ns
+ self.id = None
+ self.timestamp = None
+ self.contributor = None
+ self.minor = False #No tag means not a minor edit
+ self.comment = None
+ self.text = None
+ for element in revisionElement:
+ tag = cleanTag(ns, element.tag)
+ self.TAG_MAP[tag](self, element)
+
+ def setId(self, id): self.id = int(id)
+ def getId(self): return self.id
+
+ def setTimestamp(self, timestamp):
+ try: self.timestamp = int(timestamp)
+ except ValueError: self.timestamp = wp2Timestamp(timestamp)
+ def getTimestamp(self): return self.timestamp
+
+ def setContributor(self, element):
+ if element.get("deleted", None) == "deleted":
+ self.contributor = None
+ else:
+ self.contributor = Contributor(self.ns, element)
+
+ def getContributor(self): return self.contributor
+
+ def setMinor(self, minor): self.minor = minor == True
+ def getMinor(self): return self.minor
+
+ def setComment(self, comment): self.comment = unicode(comment)
+ def getComment(self): return self.comment
+
+ def setText(self, text):
+ if text == None: self.text = u''
+ else: self.text = unicode(text)
+ def getText(self): return self.text
+
+class Contributor:
+
+ TAG_MAP = {
+ 'id': lambda s,e:s.setId(e.text),
+ 'username': lambda s,e:s.setUsername(e.text),
+ 'ip': lambda s,e:s.setUsername(e.text)
+ }
+
+ def __init__(self, ns, contributorElement):
+ self.id = None
+ for element in contributorElement:
+ tag = cleanTag(ns, element.tag)
+ self.TAG_MAP[tag](self, element)
+
+ def setId(self, id): self.id = int(id)
+ def getId(self): return self.id
+
+ def setUsername(self, username): self.username = unicode(username)
+ def getUsername(self): return self.username
+
+
+
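A minimal usage sketch for the iterator above; the dump path is hypothetical and the file is assumed to be decompressed XML:

    from wmf.dump import Iterator

    dump = Iterator(open("enwiki-sample.xml"))  # hypothetical path
    print("%s (%s)" % (dump.siteName, dump.generator))  # site info is parsed eagerly
    for page in dump.readPages():
        for revision in page.readRevisions():
            print("%s\t%s\t%s" % (page.getId(), revision.getId(), revision.getTimestamp()))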
Added: trunk/tools/wsor/wikimedia/wmf/dump/map.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/map.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/map.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,255 @@
+"""
+Dump Mapper
+
+This script acts as a map/function over the pages in a set of MediaWiki
+database dump files. This script allows the algorithm for processing a set of
+pages to be spread across the available processor cores of a system for faster
+analysis.
+
+This script can also be imported as a module to expose the `map()` function
+that returns an iterator over output rather than printing to stdout.
+
+Examples:
+
+python -O process_dumps.py revision_meta /dumps/enwiki-20110115-pages-meta-history* > ~/data/revision_meta.tsv
+"""
+import sys, logging, re, types, argparse, os, subprocess
+from multiprocessing import Process, Queue, Lock, cpu_count, Value
+from Queue import Empty
+
+from .iterator import Iterator
+
+class FileTypeError(Exception):pass
+
+class Processor(Process):
+ """
+ A processor for managing the reading of dump files from a queue and
+ the application of a function for each 'page'.
+ """
+
+ def __init__(self, input, processPage, output, callback, logger):
+ """
+ Constructor
+
+ :Parameters:
+ input : `multiprocessing.Queue`
+ a queue of paths to dump files to process
+ processPage : function
+ a function to apply to each page of a dump file
+ output : `multiprocessing.Queue`
+ a queue to send processing output to
+ callback : function
+ a function to run upon completion
+ logger : `logging.Logger`
+ a logger object to send logging events to
+ """
+ self.input = input
+ self.processPage = processPage
+ self.output = output
+ self.callback = callback
+ self.logger = logger
+ Process.__init__(self)
+
+ def run(self):
+ try:
+ while True:
+ foo = self.input.qsize()
+ fn = self.input.get(block=False)
+ self.logger.info("Processing dump file %s." %
fn)
+ dump = Iterator(openDumpFile(fn))
+ for page in dump.readPages():
+ self.logger.debug("Processing page
%s:%s." % (page.getId(), page.getTitle()))
+ try:
+ for out in self.processPage(dump, page):
+ self.output.put(out)
+ except Exception as e:
+ self.logger.error(
+ "Failed to process page
%s:%s - %s" % (
+ page.getId(),
+ page.getTitle(),
+ e
+ )
+ )
+
+
+
+
+ except Empty:
+ self.logger.info("Nothing left to do. Shutting down
thread.")
+ finally:
+ self.callback()
+
+
+def map(dumps, processPage, threads=cpu_count()-1):
+ """
+ Maps a function across all of the pages in a set of dump files and returns
+ an (order not guaranteed) iterator over the output.
+
+ :Parameters:
+ dumps : list
+ a list of paths to dump files to process
+ processPage : function
+ a function to run on every page of a set of dump files.
+ threads : int
+ the number of individual processing threads to spool up
+ """
+
+ input = dumpFiles(dumps)
+ output = Queue(maxsize=10000)
+ running = Value('i', 0)
+
+ def dec(): running.value -= 1
+
+ for i in range(0, min(threads, input.qsize())):
+ running.value += 1
+ Processor(
+ input,
+ processPage,
+ output,
+ dec,
+ logging.getLogger("Process %s" % i)
+ ).start()
+
+
+ #output while processes are running
+ while running.value > 0:
+ try: yield output.get(timeout=.25)
+ except Empty: pass
+
+ #finish yielding output buffer
+ try:
+ while True: yield output.get(block=False)
+ except Empty:
+ pass
+
+
+
+EXTENSIONS = {
+ 'xml': "cat",
+ 'bz2': "bzcat",
+ '7z': "7z e -so 2>/dev/null",
+ 'lzma':"lzcat"
+}
+"""
+A map from file extension to the command to run to extract the data to standard out.
+"""
+
+EXT_RE = re.compile(r'\.([^\.]+)$')
+"""
+A regular expression for extracting the final extension of a file.
+"""
+
+
+def dumpFile(path):
+ """
+ Verifies that a file exists at a given path and that the file has a
+ known extension type.
+
+ :Parameters:
+ path : `str`
+ the path to a dump file
+
+ """
+ path = os.path.expanduser(path)
+ if not os.path.isfile(path):
+ raise FileTypeError("Can't find file %s" % path)
+
+ match = EXT_RE.search(path)
+ if match == None:
+ raise FileTypeError("No extension found for %s." % path)
+ elif match.groups()[0] not in EXTENSIONS:
+ raise FileTypeError("File type %r is not supported." % path)
+ else:
+ return path
+
+def dumpFiles(paths):
+ """
+ Produces a `multiprocessing.Queue` containing a path for each value in
+ `paths` to be used by the `Processor`s.
+
+ :Parameters:
+ paths : iterable
+ the paths to add to the processing queue
+ """
+ q = Queue()
+ for path in paths: q.put(dumpFile(path))
+ return q
+
+def openDumpFile(path):
+ """
+ Turns a path to a dump file into a file-like object of (decompressed)
+ XML data.
+
+ :Parameters:
+ path : `str`
+ the path to the dump file to read
+ """
+ match = EXT_RE.search(path)
+ ext = match.groups()[0]
+ p = subprocess.Popen(
+ "%s %s" % (EXTENSIONS[ext], path),
+ shell=True,
+ stdout=subprocess.PIPE
+ )
+ return p.stdout
+
+
+def encode(v):
+ """
+ Encodes an output value as a string intended to be read by eval()
+ """
+ if type(v) == types.FloatType:
+ return str(int(v))
+ elif v == None:
+ return "\\N"
+ else:
+ return repr(v)
+
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description='Maps a function across pages of MediaWiki dump files'
+ )
+ parser.add_argument(
+ '-o', '--out',
+ metavar="<path>",
+ type=lambda path:open(path, "w"),
+ help='the path to an output file to write output to (defaults to stdout)',
+ default=sys.stdout
+ )
+ parser.add_argument(
+ '-t', '--threads',
+ metavar="",
+ type=int,
+ help='the number of threads to start (defaults to # of cores -1)',
+ default=cpu_count()-1
+ )
+ parser.add_argument(
+ 'processor',
+ type=__import__,
+ help='the class path to the module that contains the process() function to be passed each page'
+ )
+ parser.add_argument(
+ 'dump',
+ type=dumpFile,
+ help='the XML dump file(s) to process',
+ nargs="+"
+ )
+ args = parser.parse_args()
+
+ LOGGING_STREAM = sys.stderr
+ if __debug__: level = logging.DEBUG
+ else: level = logging.INFO
+ logging.basicConfig(
+ level=level,
+ stream=LOGGING_STREAM,
+ format='%(name)s: %(asctime)s %(levelname)-8s %(message)s',
+ datefmt='%b-%d %H:%M:%S'
+ )
+ logging.info("Starting dump processor with %s threads." %
min(args.threads, len(args.dump)))
+ for row in map(args.dump, args.processor.process, args.threads):
+ print('\t'.join(encode(v) for v in row))
+
+if __name__ == "__main__":
+ main()
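A minimal sketch of the module form of the mapper above; the dump path and the processPage function are hypothetical:

    from wmf.dump import map as dump_map

    def processPage(dump, page):
        # Yielded tuples become output rows.
        revisions = sum(1 for _ in page.readRevisions())
        yield (page.getId(), page.getTitle(), revisions)

    for row in dump_map(["pages-meta-history.xml.bz2"], processPage, threads=2):
        print("\t".join(str(v) for v in row))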
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/__init__.py
===================================================================
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,28 @@
+import os, subprocess
+
+def extractFile(fileName):
+ decompressCall = "lzma -c -q -d %s" % fileName
+ process = subprocess.Popen(
+ decompressCall,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ shell=True
+ )
+ return process.stdout
+
+def getSmallXMLFilePath():
+ pwd = os.path.dirname(os.path.realpath(__file__))
+ return os.path.join(pwd, "small.xml.lzma")
+
+
+def getLargeXMLFilePath():
+ pwd = os.path.dirname(os.path.realpath(__file__))
+ return os.path.join(pwd, "large.xml.lzma")
+
+
+def getSmallXMLFilePointer():
+ return extractFile(getSmallXMLFilePath())
+
+def getLargeXMLFilePointer():
+ return extractFile(getLargeXMLFilePath())
+
\ No newline at end of file
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/large.xml.lzma
===================================================================
(Binary files differ)
Property changes on: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/large.xml.lzma
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/small.xml.lzma
===================================================================
(Binary files differ)
Property changes on: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/small.xml.lzma
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,4 @@
+import os
+print(__file__)
+print(os.path.realpath(__file__))
+print(os.path.realpath(__file__)[:-1*len(__file__)])
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1 @@
+
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,81 @@
+import sys, logging
+from nose.tools import eq_
+from . import sample
+from ..iterator import Iterator, Namespace
+import util
+
+logging.basicConfig(level=logging.INFO)
+
+def test_small():
+ fp = sample.getSmallXMLFilePointer()
+ wf = Iterator(fp)
+ for key in [
+ -2, -1, 0, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 100,101,108,109
+ ]:
+ assert key in wf.namespaces.values(), "Key %s not found in %s" % (key, wf.namespaces)
+
+ for page in wf.readPages():
+ eq_(
+ page.getTitle(),
+ u'Talk:Pilsbury Block'
+ )
+ for revision in page.readRevisions():
+ eq_(
+ revision.getId(),
+ 213377884
+ )
+ eq_(
+ revision.getTimestamp(),
+ util.wp2Timestamp("2008-05-19T01:41:53Z")
+ )
+ eq_(
+ revision.getContributor().getId(),
+ 905763
+ )
+ eq_(
+ revision.getContributor().getUsername(),
+ u"Swampyank"
+ )
+ eq_(
+ revision.getMinor(),
+ False
+ )
+ eq_(
+ revision.getComment(),
+ u"[[WP:AES|\u2190]]Created page with
'{{WikiProject National Register of Historic Places|class=Stub}} {{WikiProject
Maine|class=Stub|importance=Low}} {{reqphoto|in=Maine}}'"
+ )
+
+ eq_(
+ revision.getText(),
+ u"{{WikiProject National Register of Historic
Places|class=Stub}}\n" +
+ u"{{WikiProject
Maine|class=Stub|importance=Low}}\n" +
+ u"{{reqphoto|in=Maine}}"
+ )
+
+
+
+def test_large():
+ fp = sample.getLargeXMLFilePointer()
+ wf = Iterator(fp)
+ pageCounter = 0
+ revisionCounter = 0
+ for page in wf.readPages():
+ pageCounter += 1
+ for revision in page.readRevisions():
+ assert revision.getId() != None
+ assert revision.getTimestamp() != None
+ __ = revision.getContributor()
+ __ = revision.getComment()
+ assert revision.getMinor() != None
+ assert revision.getText() != None
+ #sys.stderr.write(".")
+ revisionCounter += 1
+ if revisionCounter >= 100: break
+
+
+ eq_(pageCounter, 1)
+ #eq_(revisionCounter, 15180)
+ eq_(revisionCounter, 100)
+
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,25 @@
+import sys, logging
+from nose.tools import eq_
+from gl import wp
+from . import sample
+from ..map import map
+
+
+def test_simple_map():
+ dumps = [sample.getSmallXMLFilePath(), sample.getLargeXMLFilePath()]
+
+ def processPage(dump, page):
+ assert hasattr(dump, "namespaces")
+ assert hasattr(page, "readRevisions")
+
+ count = 0
+ for rev in page.readRevisions():
+ count += 1
+ if count >= 100: break
+
+ yield (page.getId(), count)
+
+ output = dict(map(dumps, processPage))
+
+ eq_(output[17500012], 1)
+ eq_(output[12], 100)
Added: trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,76 @@
+try:
+ import xml.etree.cElementTree as etree
+except ImportError:
+ import xml.etree.ElementTree as etree
+
+def XMLIterator(fp):
+ xmlIterator = etree.iterparse(fp, events=("start","end"))
+ return ElementIterator(xmlIterator.next()[1], xmlIterator)
+
+class ElementIteratorError(Exception): pass
+
+class ElementIterator:
+
+ def __init__(self, element, xmlIterator):
+ self.element = element
+ self.xmlIterator = xmlIterator
+ self.tagStack = [self.element.tag]
+
+ def __iter__(self):
+ if len(self.tagStack) == 0:
+ raise ElementIteratorError("Element has already been
iterated through.")
+
+ for event, element in self.xmlIterator:
+ if event == "start":
+ element = ElementIterator(element, self.xmlIterator)
+ yield element
+ element.clear()
+
+ else: #event == "end"
+ assert element.tag == self.element.tag, "Expected %r, got %r" % (self.element.tag, element.tag)
+ self.tagStack.pop()
+
+ if len(self.tagStack) == 0:
+ break
+
+
+ def get(self, key, alt=None):
+ return self.element.attrib.get(key, alt)
+
+
+ def complete(self):
+ if len(self.tagStack) != 0:
+ for event, element in self.xmlIterator:
+ if event == "start":
+ self.tagStack.append(element.tag)
+ element.clear()
+
+ else: #event == "end"
+ assert self.tagStack[-1] == element.tag, "Expected %r at the end of %r" % (element.tag, self.tagStack)
+ self.tagStack.pop()
+
+ if len(self.tagStack) == 0:
+ break
+
+
+ def clear(self):
+ self.complete()
+ self.element.clear()
+
+
+ def __del__(self):
+ self.clear()
+
+ def __getattr__(self, attr):
+ if attr == "attrib":
+ return self.element.attrib
+ elif attr == "tag":
+ return self.element.tag
+ elif attr == "tail":
+ return self.element.tail
+ elif attr == "text":
+ self.complete()
+ return self.element.text
+ else:
+ raise AttributeError("%s has no attribute %r" %
(self.__class__.__name__, attr))
+
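A minimal sketch of the streaming element wrapper above, against a hypothetical XML file:

    from wmf.dump.xml_iterator import XMLIterator

    # Child elements are yielded lazily and cleared after use, which keeps
    # memory flat even for very large documents.
    root = XMLIterator(open("pages.xml"))  # hypothetical path
    for child in root:
        print(child.tag)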
Added: trunk/tools/wsor/wikimedia/wmf/util.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/util.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/util.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,236 @@
+from __future__ import with_statement, absolute_import
+import re, types
+import time, calendar, datetime
+import hashlib
+import urllib
+
+__docformat__ = "restructuredtext en"
+
+"""
+This module contains utility functions for interacting with Wikipedia.
+"""
+
+LONG_WP_TIME_STRING = '%Y-%m-%dT%H:%M:%SZ'
+"""
+The longhand version of Wikipedia timestamps.
+"""
+
+SHORT_WP_TIME_STRING = '%Y%m%d%H%M%S'
+"""
+The shorthand version of Wikipedia timestamps
+"""
+
+WPAPI_URL = "http://%s.wikipedia.org/w/api.php"
+"""
+The Wikipedia API URL. A positional format token is included so that the
+language-specific prefix can be formatted in. See `wpAPIURL()`.
+"""
+
+
+VLOOSE_RE = re.compile(r'''
+ (^revert\ to.+using)
+ | (^reverted\ edits\ by.+using)
+ | (^reverted\ edits\ by.+to\ last\ version\ by)
+ | (^bot\ -\ rv.+to\ last\ version\ by)
+ | (-assisted\ reversion)
+ | (^(revert(ed)?|rv).+to\ last)
+ | (^undo\ revision.+by)
+ ''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
+
+VSTRICT_RE = re.compile(r'''
+ (\brvv)
+ | (\brv[/ ]v)
+ | (vandal(?!proof|bot))
+ | (\b(rv|rev(ert)?|rm)\b.*(blank|spam|nonsense|porn|mass\sdelet|vand))
+ ''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
+
+NAMESPACES = {
+ 'en': set([
+ 'Media',
+ 'Special',
+ 'Talk',
+ 'User talk',
+ 'Wikipedia talk',
+ 'Image talk',
+ 'MediaWiki talk',
+ 'Template talk',
+ 'Help talk',
+ 'Category talk',
+ 'Portal talk',
+ 'File talk',
+ 'User',
+ 'Wikipedia',
+ 'Image',
+ 'MediaWiki',
+ 'Template',
+ 'Help',
+ 'Category',
+ 'Portal',
+ 'File'
+ ])
+}
+
+NAMESPACE_RE = re.compile(r'^((?:%s)):' % ')|(?:'.join(NAMESPACES['en']), re.IGNORECASE)
+
+def wpAPIURL(prefix="en"):
+ """
+ Creates the URL for the Wikipedia API based on a language prefix.
+
+ :Parameters:
+ prefix : string
+ the prefix to be formatted into the url
+
+ :Return:
+ the Wikipedia API url for a given language prefix
+ """
+ return WPAPI_URL % prefix
+
+
+def wp2Timestamp(wpTime):
+ """
+ Converts a Wikipedia timestamp to a Unix Epoch-based timestamp (seconds
+ since Jan. 1st 1970 GMT). This function will handle both long
+ (see `LONG_WP_TIME_STRING`) and short (see `SHORT_WP_TIME_STRING`)
+ time formats.
+
+ :Parameters:
+ wpTime : string
+ Wikipedia timestamp to be converted
+
+ :Return:
+ integer Unix Epoch-based timestamp (seconds since Jan. 1st 1970
+ GMT) version of the provided wpTime.
+ """
+ try:
+ myTime = time.strptime(wpTime, LONG_WP_TIME_STRING)
+ except ValueError as e:
+ try:
+ myTime = time.strptime(wpTime, SHORT_WP_TIME_STRING)
+ except ValueError as e:
+ raise ValueError("'%s' is not a valid Wikipedia date
format" % wpTime)
+
+ return int(calendar.timegm(myTime))
+
+def timestamp2WP(timestamp):
+ """
+ Converts a Unix Epoch-based timestamp (seconds since Jan. 1st 1970 GMT)
+ timestamp to one acceptable by Wikipedia.
+
+ :Parameters:
+ timestamp : int
+ Unix timestamp to be converted
+
+ :Return:
+ string Wikipedia style timestamp
+ """
+
+ return datetime.datetime.utcfromtimestamp(timestamp).strftime('%Y%m%d%H%M%S')
+
+def digest(content):
+ return hashlib.md5(content.encode("utf-8")).hexdigest()
+
+
+def normalize(name):
+ """
+ Normalizes text from a Wikipedia title/segment by capitalizing the
+ first letter, replacing underscores with spaces, and stripping
+ surrounding whitespace.
+
+ :Parameters:
+ name : string
+ Namespace or title portion of a Wikipedia page name.
+
+ :Return:
+ string Normalized text
+ """
+
+ return name.capitalize().replace("_", " ").strip()
+
+def normalizeTitle(title, namespaces=NAMESPACES['en']):
+ """
+ Normalizes a Wikipedia page title and splits the title into
+ namespace and title pieces.
+
+ :Parameters:
+ title : string
+ The title of a Wikipedia page.
+ namespaces : set
+ A set of namespaces to look for in the title.
+
+ :Return:
+ The namespace, title tuple
+ """
+
+ if type(title) == types.UnicodeType:
+ title = title.encode('utf-8')
+
+ title = title.strip()
+ parts = title.split(":", 1)
+ if len(parts) == 1:
+ namespace = None
+ title = normalize(parts[0])
+ elif parts[1] == '':
+ namespace = None
+ title = normalize(title)
+ else:
+ nsPart = normalize(parts[0])
+ if nsPart in namespaces:
+ namespace = nsPart
+ title = normalize(parts[1])
+ else:
+ namespace = None
+ title = normalize(title)
+
+ return (namespace, title)
+
+def normalizeURLTitle(title, namespaces=NAMESPACES['en']):
+ """
+ Normalizes a Wikipedia page title obtained from a URL and splits
+ the title into namespace and title pieces.
+
+ :Parameters:
+ title : string
+ The title of a Wikipedia page.
+ namespaces : set
+ A set of namespaces to look for in the title.
+
+ :Return:
+ The namespace, title tuple
+ """
+
+ if type(title) == types.UnicodeType:
+ title = title.encode('utf-8')
+ title = urllib.unquote(title).split('#')[0]
+ ns = NAMESPACE_RE.match(title)
+ if not ns:
+ namespace = ""
+ title = normalize(title)
+ else:
+ nsPart = ns.group(1).capitalize()
+ if nsPart in namespaces:
+ namespace = nsPart
+ title = normalize(title[ns.end():])
+ else:
+ namespace = None
+ title = normalize(title)
+ return (namespace, title)
+
+def isVandalismByComment(editComment, testLoose=True, testStrict=True):
+ '''
+ Checks the given edit comment against the VLOOSE and VSTRICT regexes
+ as configured, and returns a boolean indicating whether it matches.
+
+ @param editComment: The edit comment to test.
+ @type editComment: str
+
+ @param testLoose: If the edit comment matches VLOOSE_RE, True is returned
+ @type testLoose: bool
+
+ @param testStrict: If the edit comment matches VSTRICT_RE, True is returned
+ @type testStrict: bool
+ '''
+
+ if testLoose and VLOOSE_RE.search(editComment):
+ return True
+ if testStrict and VSTRICT_RE.search(editComment):
+ return True
+
+ return False
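A minimal sketch of the utility functions above (sample values are made up):

    from wmf import util

    ts = util.wp2Timestamp("2008-05-19T01:41:53Z")     # seconds since the Unix epoch
    print(util.timestamp2WP(ts))                       # -> 20080519014153
    print(util.normalizeTitle("Talk:Pilsbury_Block"))  # -> ('Talk', 'Pilsbury block')
    print(util.isVandalismByComment("rv vandalism"))   # -> True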
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs