http://www.mediawiki.org/wiki/Special:Code/MediaWiki/90557
Revision: 90557
Author: halfak
Date: 2011-06-21 22:47:39 +0000 (Tue, 21 Jun 2011)
Log Message:
-----------
added wikimedia utilities
Modified Paths:
--------------
trunk/tools/wsor/ts_samples/testing.sql
Added Paths:
-----------
trunk/tools/wsor/wikimedia/
trunk/tools/wsor/wikimedia/setup.py
trunk/tools/wsor/wikimedia/wmf/
trunk/tools/wsor/wikimedia/wmf/__init__.py
trunk/tools/wsor/wikimedia/wmf/dump/
trunk/tools/wsor/wikimedia/wmf/dump/__init__.py
trunk/tools/wsor/wikimedia/wmf/dump/iterator.py
trunk/tools/wsor/wikimedia/wmf/dump/map.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/
trunk/tools/wsor/wikimedia/wmf/dump/tests/__init__.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/large.xml.lzma
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/small.xml.lzma
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py
trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py
trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py
trunk/tools/wsor/wikimedia/wmf/util.py
Removed Paths:
-------------
trunk/tools/wsor/scripts/process_dumps.py
Deleted: trunk/tools/wsor/scripts/process_dumps.py
===================================================================
--- trunk/tools/wsor/scripts/process_dumps.py 2011-06-21 22:44:54 UTC (rev 90556)
+++ trunk/tools/wsor/scripts/process_dumps.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -1,186 +0,0 @@
-import sys, logging, re, types, argparse, os, subprocess
-from multiprocessing import Process, Queue, Lock, cpu_count, Value
-from Queue import Empty
-from gl import wp
-
-class FileTypeError(Exception):pass
-
-def encode(v):
- if type(v) == types.FloatType:
- return str(int(v))
- elif v == None:
- return "\\N"
- else:
- return repr(v)
-
-
-
-class SafeOutput:
-
- def __init__(self, fp):
- self.fp = fp
- self.l = Lock()
-
- def push(self, row, encode=encode):
- if __debug__:
- row = tuple(row)
-
- with self.l:
- self.fp.write("\t".join(clean(v) for v in row) + "\n")
-
-class Processor(Process):
-
- def __init__(self, input, processPage, output, callback, logger):
- self.input = input
- self.processPage = processPage
- self.output = output
- self.callback = callback
- self.logger = logger
- Process.__init__(self)
-
- def run(self):
- try:
- while True:
- foo = self.input.qsize()
- fn = self.input.get(block=False)
- self.logger.info("Processing dump file %s." %
fn)
- dump = wp.dump.Iterator(openDumpFile(fn))
- for page in dump.readPages():
- self.logger.debug("Processing page
%s:%s." % (page.getId(), page.getTitle()))
- try:
- for out in self.processPage(dump, page):
- self.output.put(out)
- except Exception as e:
- self.logger.error(
- "Failed to process page
%s:%s - %s" % (
- page.getId(),
- page.getTitle(),
- e
- )
- )
-
-
-
-
- except Empty:
- self.logger.info("Nothing left to do. Shutting down
thread.")
- finally:
- self.callback()
-
-
-
-
-def main(args):
- LOGGING_STREAM = sys.stderr
- if __debug__: level = logging.DEBUG
- else: level = logging.INFO
- logging.basicConfig(
- level=level,
- stream=LOGGING_STREAM,
- format='%(name)s: %(asctime)s %(levelname)-8s %(message)s',
- datefmt='%b-%d %H:%M:%S'
- )
- logging.info("Starting dump processor with %s threads." %
min(args.threads, len(args.dump)))
- for row in process_dumps(args.dump, args.processor.process, args.threads):
- print('\t'.join(encode(v) for v in row))
-
-def process_dumps(dumps, processPage, threads):
- input = dumpFiles(dumps)
- output = Queue(maxsize=10000)
- running = Value('i', 0)
-
- def dec(): running.value -= 1
-
- for i in range(0, min(threads, input.qsize())):
- running.value += 1
- Processor(
- input,
- processPage,
- output,
- dec,
- logging.getLogger("Process %s" % i)
- ).start()
-
-
- #output while processes are running
- while running.value > 0:
- try: yield output.get(timeout=.25)
- except Empty: pass
-
- #finish yielding output buffer
- try:
- while True: yield output.get(block=False)
- except Empty:
- pass
-
-
-
-EXTENSIONS = {
- 'xml': "cat",
- 'bz2': "bzcat",
- '7z': "7z e -so 2>/dev/null",
- 'lzma':"lzcat"
-}
-
-EXT_RE = re.compile(r'\.([^\.]+)$')
-def dumpFile(path):
- path = os.path.expanduser(path)
- if not os.path.isfile(path):
- raise FileTypeError("Can't find file %s" % path)
-
- match = EXT_RE.search(path)
- if match == None:
- raise FileTypeError("No extension found for %s." % path)
- elif match.groups()[0] not in EXTENSIONS:
- raise FileTypeError("File type %r is not supported." % path)
- else:
- return path
-
-def dumpFiles(paths):
- q = Queue()
- for path in paths: q.put(dumpFile(path))
- return q
-
-def openDumpFile(path):
- match = EXT_RE.search(path)
- ext = match.groups()[0]
- p = subprocess.Popen(
- "%s %s" % (EXTENSIONS[ext], path),
- shell=True,
- stdout=subprocess.PIPE
- )
- return p.stdout
-
-
-if __name__ == "__main__":
- parser = argparse.ArgumentParser(
- description='Maps a function across pages of MediaWiki dump files'
- )
- parser.add_argument(
- '-o', '--out',
- metavar="<path>",
- type=lambda path:open(path, "w"),
- help='the path to an output file to write output to (defaults to stdout)',
- default=sys.stdout
- )
- parser.add_argument(
- '-t', '--threads',
- metavar="",
- type=int,
- help='the number of threads to start (defaults to # of cores -1)',
- default=cpu_count()-1
- )
- parser.add_argument(
- 'processor',
- type=__import__,
- help='the class path to the function to use to process each page'
- )
- parser.add_argument(
- 'dump',
- type=dumpFile,
- help='the XML dump file(s) to process',
- nargs="+"
- )
- args = parser.parse_args()
- main(args)
-
Modified: trunk/tools/wsor/ts_samples/testing.sql
===================================================================
--- trunk/tools/wsor/ts_samples/testing.sql 2011-06-21 22:44:54 UTC (rev 90556)
+++ trunk/tools/wsor/ts_samples/testing.sql 2011-06-21 22:47:39 UTC (rev 90557)
@@ -40,3 +40,30 @@
CREATE UNIQUE INDEX user_id_idx ON halfak.user_meta (user_id);
CREATE INDEX first_edit_idx ON halfak.user_meta (first_edit);
CREATE INDEX last_edit_idx ON halfak.user_meta (last_edit);
+
+
+SELECT
+ year,
+ biannual,
+ count(*)
+FROM
+(
+SELECT
+ u.user_id,
+ SUBSTRING(first_edit, 1,4) as year,
+ SUBSTRING(first_edit, 5,2) >= "07" as biannual
+FROM halfak.user_meta um
+INNER JOIN user u
+ ON u.user_id = um.user_id
+INNER JOIN page p
+ ON p.page_title = u.user_name
+ AND p.page_namespace = 3
+INNER JOIN revision r
+ ON um.user_id != r.rev_user
+ AND p.page_id = r.rev_page
+GROUP BY
+ user_id,
+ SUBSTRING(first_edit, 1,4),
+ SUBSTRING(first_edit, 5,2)
+) as foo
+GROUP BY year, biannual;
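For reference, the added query buckets each user by the half-year of their first edit. A minimal Python sketch of the same bucketing logic (the sample timestamp is made up):

    # Hypothetical illustration of the cohort bucketing used in the query above.
    first_edit = "20080519014153"        # MediaWiki-style YYYYMMDDHHMMSS timestamp
    year = first_edit[0:4]               # SUBSTRING(first_edit, 1, 4)
    biannual = first_edit[4:6] >= "07"   # SUBSTRING(first_edit, 5, 2) >= "07"
    # -> year == "2008", biannual == False: a first edit in the first half of 2008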
Added: trunk/tools/wsor/wikimedia/setup.py
===================================================================
--- trunk/tools/wsor/wikimedia/setup.py (rev 0)
+++ trunk/tools/wsor/wikimedia/setup.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,27 @@
+
+from setuptools import setup, find_packages
+
+setup(
+ name='util',
+ version='1.0',
+ description="WMF utilities",
+ long_description="""
+ A set of utilities originally authored by Aaron Halfaker
+ during the 2011 Wikimedia Summer of Research. The utilities
+ in this package are intended to aid in processing of
+ MediaWiki data related to Wikimedia projects. Many of the
+ utilities have been specifically designed to allow
+ processing of the massive amount of data (currently) found
+ in the full history dump of the English Wikipedia
+ """
+ author='Aaron Halfaker',
+ author_email='[email protected]',
+ url='http://meta.wikimedia.org/wiki/User:EpochFail',
+ packages=find_packages(),
+ entry_points = {
+ 'distutils.commands': [
+ 'dump_map = util.dump.map:main',
+ ]
+ },
+
+)
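Assuming a standard setuptools checkout, installing with `python setup.py install` makes the package importable. A minimal sketch (the sample timestamp is made up):

    # find_packages() picks up the `wmf` package below, and wmf/__init__.py
    # re-exports the util functions.
    import wmf
    print(wmf.wp2Timestamp("20080519014153"))  # seconds since the Unix epoch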
Added: trunk/tools/wsor/wikimedia/wmf/__init__.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/__init__.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/__init__.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,2 @@
+from __future__ import absolute_import
+from .util import *
Added: trunk/tools/wsor/wikimedia/wmf/dump/__init__.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/__init__.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/__init__.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,2 @@
+from .iterator import Iterator
+from .map import map
Added: trunk/tools/wsor/wikimedia/wmf/dump/iterator.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/iterator.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/iterator.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,220 @@
+from .xml_iterator import XMLIterator
+from ..util import wp2Timestamp
+
+def cleanTag(prefix, raw):
+ return raw[len(prefix):]
+
+
+class Iterator:
+ """
+ WikiFile dump processor. This class is constructed with a file pointer to a
+ Wikipedia XML dump file.
+
+ """
+
+ def __init__(self, fp):
+ """
+ Constructor
+
+ :Parameters:
+ fp : file pointer
+ a file pointer to the xml file to process.
+ """
+
+ self.fp = fp #:The file pointer passed to the constructor
+ self.namespaces = {} #:A map of possible namespaces
+ self.siteName = None #:The name of the site
+ self.base = None #:Base of the xml file
+ self.generator = None #:Generator of the dump
+ self.case = None #:The default title case
+
+ self.mediawikiElement = XMLIterator(fp)
+ self.ns = self.mediawikiElement.tag[:-len('mediawiki')]
+
+ pageCount = 0
+ done = False
+ for element in self.mediawikiElement:
+ tag = cleanTag(self.ns, element.tag)
+ if tag == "siteinfo":
+ self.loadSiteInfo(element)
+ element.clear()
+ break
+
+
+
+ def loadSiteInfo(self, siteInfoElement):
+ for element in siteInfoElement:
+ tag = cleanTag(self.ns, element.tag)
+
+ if tag == 'sitename':
+ self.siteName = element.text
+ elif tag == 'base':
+ self.base = element.text
+ elif tag == 'generator':
+ self.generator = element.text
+ elif tag == 'case':
+ self.case = element.text
+ elif tag == 'namespaces':
+ self.loadNamespaces(element)
+ element.clear()
+
+
+
+ def loadNamespaces(self, namespacesElement):
+ for element in namespacesElement:
+ tag = cleanTag(self.ns, element.tag)
+
+ if tag == "namespace":
+ namespace = Namespace(element)
+ self.namespaces[namespace.getName()] = namespace.getId()
+ else:
+ assert False, "This should never happen"
+
+
+ def readPages(self):
+ for element in self.mediawikiElement:
+ tag = cleanTag(self.ns, element.tag)
+ if tag == "page":
+ yield Page(self.ns, element)
+
+
+
+
+class Namespace:
+
+ def __init__(self, nsElement):
+ self.setId(nsElement.get('key'))
+ self.setName(nsElement.text)
+
+ def setId(self, id): self.id = int(id)
+ def getId(self): return self.id
+
+ def setName(self, name):
+ if name == None:
+ self.name = None
+ else:
+ self.name = unicode(name)
+ def getName(self): return self.name
+
+ def __repr__(self):
+ return "%s(%r, %r)" % (
+ self.__class__.__name__,
+ self.getId(),
+ self.getName()
+ )
+
+ def __eq__(self, other):
+ try:
+ return (
+ self.getId() == other.getId() and
+ self.getName() == other.getName()
+ )
+ except AttributeError:
+ return False
+
+class Page:
+
+ def __init__(self, ns, pageElement):
+ self.id = None
+ self.title = None
+ self.pageElement = pageElement
+ self.ns = ns
+ for element in pageElement:
+ tag = cleanTag(ns, element.tag)
+ if tag == "id":
+ self.setId(element.text)
+ elif tag == "title":
+ self.setTitle(element.text)
+
+ if self.id != None and self.title != None:
+ break
+
+ def readRevisions(self):
+ for element in self.pageElement:
+ tag = cleanTag(self.ns, element.tag)
+ if tag == "revision":
+ yield Revision(self.ns, element)
+ #element.clear()
+
+
+
+ def setId(self, id): self.id = int(id)
+ def getId(self): return self.id
+
+ def setTitle(self, title): self.title = unicode(title)
+ def getTitle(self): return self.title
+
+
+
+class Revision:
+
+ TAG_MAP = {
+ 'id': lambda s,e:s.setId(e.text),
+ 'timestamp': lambda s,e:s.setTimestamp(e.text),
+ 'contributor': lambda s,e:s.setContributor(e),
+ 'minor': lambda s,e:s.setMinor(True),
+ 'comment': lambda s,e:s.setComment(e.text),
+ 'text': lambda s,e:s.setText(e.text)
+ }
+
+ def __init__(self, ns, revisionElement):
+ self.ns = ns
+ self.id = None
+ self.timestamp = None
+ self.contributor = None
+ self.minor = False #No tag means not a minor edit
+ self.comment = None
+ self.text = None
+ for element in revisionElement:
+ tag = cleanTag(ns, element.tag)
+ self.TAG_MAP[tag](self, element)
+
+ def setId(self, id): self.id = int(id)
+ def getId(self): return self.id
+
+ def setTimestamp(self, timestamp):
+ try: self.timestamp = int(timestamp)
+ except ValueError: self.timestamp = wp2Timestamp(timestamp)
+ def getTimestamp(self): return self.timestamp
+
+ def setContributor(self, element):
+ if element.get("deleted", None) == "deleted":
+ self.contributor = None
+ else:
+ self.contributor = Contributor(self.ns, element)
+
+ def getContributor(self): return self.contributor
+
+ def setMinor(self, minor): self.minor = minor == True
+ def getMinor(self): return self.minor
+
+ def setComment(self, comment): self.comment = unicode(comment)
+ def getComment(self): return self.comment
+
+ def setText(self, text):
+ if text == None: self.text = u''
+ else: self.text = unicode(text)
+ def getText(self): return self.text
+
+class Contributor:
+
+ TAG_MAP = {
+ 'id': lambda s,e:s.setId(e.text),
+ 'username': lambda s,e:s.setUsername(e.text),
+ 'ip': lambda s,e:s.setUsername(e.text)
+ }
+
+ def __init__(self, ns, contributorElement):
+ self.id = None
+ for element in contributorElement:
+ tag = cleanTag(ns, element.tag)
+ self.TAG_MAP[tag](self, element)
+
+ def setId(self, id): self.id = int(id)
+ def getId(self): return self.id
+
+ def setUsername(self, username): self.username = unicode(username)
+ def getUsername(self): return self.username
+
+
+
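A minimal usage sketch for the iterator above; the dump path is hypothetical and the file is assumed to be decompressed XML:

    from wmf.dump import Iterator

    dump = Iterator(open("enwiki-sample.xml"))  # hypothetical path
    print("%s (%s)" % (dump.siteName, dump.generator))  # site info is parsed eagerly
    for page in dump.readPages():
        for revision in page.readRevisions():
            print("%s\t%s\t%s" % (page.getId(), revision.getId(), revision.getTimestamp()))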
Added: trunk/tools/wsor/wikimedia/wmf/dump/map.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/map.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/map.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,255 @@
+"""
+Dump Mapper
+
+This script acts as a map/function over the pages in a set of MediaWiki
+database dump files. This script allows the algorithm for processing a set of
+pages to be spread across the available processor cores of a system for faster
+analysis.
+
+This script can also be imported as a module to expose the `map()` function
+that returns an iterator over output rather than printing to stdout.
+
+Examples:
+
+python -O process_dumps.py revision_meta /dumps/enwiki-20110115-pages-meta-history* > ~/data/revision_meta.tsv
+"""
+import sys, logging, re, types, argparse, os, subprocess
+from multiprocessing import Process, Queue, Lock, cpu_count, Value
+from Queue import Empty
+
+from .iterator import Iterator
+
+class FileTypeError(Exception):pass
+
+class Processor(Process):
+ """
+ A processor for managing the reading of dump files from a queue and
+ the application of a function for each 'page'.
+ """
+
+ def __init__(self, input, processPage, output, callback, logger):
+ """
+ Constructor
+
+ :Parameters:
+ input : `multiprocessing.Queue`
+ a queue of paths to dump files to process
+ processPage : function
+ a function to apply to each page of a dump file
+ output : `multiprocessing.Queue`
+ a queue to send processing output to
+ callback : function
+ a function to run upon completion
+ logger : `logging.Logger`
+ a logger object to send logging events to
+ """
+ self.input = input
+ self.processPage = processPage
+ self.output = output
+ self.callback = callback
+ self.logger = logger
+ Process.__init__(self)
+
+ def run(self):
+ try:
+ while True:
+ foo = self.input.qsize()
+ fn = self.input.get(block=False)
+ self.logger.info("Processing dump file %s." %
fn)
+ dump = Iterator(openDumpFile(fn))
+ for page in dump.readPages():
+ self.logger.debug("Processing page
%s:%s." % (page.getId(), page.getTitle()))
+ try:
+ for out in self.processPage(dump, page):
+ self.output.put(out)
+ except Exception as e:
+ self.logger.error(
+ "Failed to process page
%s:%s - %s" % (
+ page.getId(),
+ page.getTitle(),
+ e
+ )
+ )
+
+
+
+
+ except Empty:
+ self.logger.info("Nothing left to do. Shutting down
thread.")
+ finally:
+ self.callback()
+
+
+def map(dumps, processPage, threads=cpu_count()-1):
+ """
+ Maps a function across all of the pages in a set of dump files and returns
+ an (order not guaranteed) iterator over the output.
+
+ :Parameters:
+ dumps : list
+ a list of paths to dump files to process
+ processPage : function
+ a function to run on every page of a set of dump files.
+ threads : int
+ the number of individual processing threads to spool up
+ """
+
+ input = dumpFiles(dumps)
+ output = Queue(maxsize=10000)
+ running = Value('i', 0)
+
+ def dec(): running.value -= 1
+
+ for i in range(0, min(threads, input.qsize())):
+ running.value += 1
+ Processor(
+ input,
+ processPage,
+ output,
+ dec,
+ logging.getLogger("Process %s" % i)
+ ).start()
+
+
+ #output while processes are running
+ while running.value > 0:
+ try: yield output.get(timeout=.25)
+ except Empty: pass
+
+ #finish yielding output buffer
+ try:
+ while True: yield output.get(block=False)
+ except Empty:
+ pass
+
+
+
+EXTENSIONS = {
+ 'xml': "cat",
+ 'bz2': "bzcat",
+ '7z': "7z e -so 2>/dev/null",
+ 'lzma':"lzcat"
+}
+"""
+A map from file extension to the command to run to extract the data to standard out.
+"""
+
+EXT_RE = re.compile(r'\.([^\.]+)$')
+"""
+A regular expression for extracting the final extension of a file.
+"""
+
+
+def dumpFile(path):
+ """
+ Verifies that a file exists at a given path and that the file has a
+ known extension type.
+
+ :Parameters:
+ path : `str`
+ the path to a dump file
+
+ """
+ path = os.path.expanduser(path)
+ if not os.path.isfile(path):
+ raise FileTypeError("Can't find file %s" % path)
+
+ match = EXT_RE.search(path)
+ if match == None:
+ raise FileTypeError("No extension found for %s." % path)
+ elif match.groups()[0] not in EXTENSIONS:
+ raise FileTypeError("File type %r is not supported." % path)
+ else:
+ return path
+
+def dumpFiles(paths):
+ """
+ Produces a `multiprocessing.Queue` containing a path for each value in
+ `paths` to be used by the `Processor`s.
+
+ :Parameters:
+ paths : iterable
+ the paths to add to the processing queue
+ """
+ q = Queue()
+ for path in paths: q.put(dumpFile(path))
+ return q
+
+def openDumpFile(path):
+ """
+ Turns a path to a dump file into a file-like object of (decompressed)
+ XML data.
+
+ :Parameters:
+ path : `str`
+ the path to the dump file to read
+ """
+ match = EXT_RE.search(path)
+ ext = match.groups()[0]
+ p = subprocess.Popen(
+ "%s %s" % (EXTENSIONS[ext], path),
+ shell=True,
+ stdout=subprocess.PIPE
+ )
+ return p.stdout
+
+
+def encode(v):
+ """
+ Encodes an output value as a string intended to be read by eval()
+ """
+ if type(v) == types.FloatType:
+ return str(int(v))
+ elif v == None:
+ return "\\N"
+ else:
+ return repr(v)
+
+
+
+def main():
+ parser = argparse.ArgumentParser(
+ description='Maps a function across pages of MediaWiki dump files'
+ )
+ parser.add_argument(
+ '-o', '--out',
+ metavar="<path>",
+ type=lambda path:open(path, "w"),
+ help='the path to an output file to write output to (defaults to stdout)',
+ default=sys.stdout
+ )
+ parser.add_argument(
+ '-t', '--threads',
+ metavar="",
+ type=int,
+ help='the number of threads to start (defaults to # of cores -1)',
+ default=cpu_count()-1
+ )
+ parser.add_argument(
+ 'processor',
+ type=__import__,
+ help='the class path to the module that contains the process() function to be passed each page'
+ )
+ parser.add_argument(
+ 'dump',
+ type=dumpFile,
+ help='the XML dump file(s) to process',
+ nargs="+"
+ )
+ args = parser.parse_args()
+
+ LOGGING_STREAM = sys.stderr
+ if __debug__: level = logging.DEBUG
+ else: level = logging.INFO
+ logging.basicConfig(
+ level=level,
+ stream=LOGGING_STREAM,
+ format='%(name)s: %(asctime)s %(levelname)-8s %(message)s',
+ datefmt='%b-%d %H:%M:%S'
+ )
+ logging.info("Starting dump processor with %s threads." %
min(args.threads, len(args.dump)))
+ for row in map(args.dump, args.processor.process, args.threads):
+ print('\t'.join(encode(v) for v in row))
+
+if __name__ == "__main__":
+ main()
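A minimal sketch of the module form of the mapper above; the dump path and the processPage function are hypothetical:

    from wmf.dump import map as dump_map

    def processPage(dump, page):
        # Yielded tuples become output rows.
        revisions = sum(1 for _ in page.readRevisions())
        yield (page.getId(), page.getTitle(), revisions)

    for row in dump_map(["pages-meta-history.xml.bz2"], processPage, threads=2):
        print("\t".join(str(v) for v in row))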
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/__init__.py
===================================================================
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/__init__.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,28 @@
+import os, subprocess
+
+def extractFile(fileName):
+ decompressCall = "lzma -c -q -d %s" % fileName
+ process = subprocess.Popen(
+ decompressCall,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ shell=True
+ )
+ return process.stdout
+
+def getSmallXMLFilePath():
+ pwd = os.path.dirname(os.path.realpath(__file__))
+ return os.path.join(pwd, "small.xml.lzma")
+
+
+def getLargeXMLFilePath():
+ pwd = os.path.dirname(os.path.realpath(__file__))
+ return os.path.join(pwd, "large.xml.lzma")
+
+
+def getSmallXMLFilePointer():
+ return extractFile(getSmallXMLFilePath())
+
+def getLargeXMLFilePointer():
+ return extractFile(getLargeXMLFilePath())
+
\ No newline at end of file
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/large.xml.lzma
===================================================================
(Binary files differ)
Property changes on: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/large.xml.lzma
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/small.xml.lzma
===================================================================
(Binary files differ)
Property changes on: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/small.xml.lzma
___________________________________________________________________
Added: svn:mime-type
+ application/octet-stream
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/sample/test.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,4 @@
+import os
+print(__file__)
+print(os.path.realpath(__file__))
+print(os.path.realpath(__file__)[:-1*len(__file__)])
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/sample.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1 @@
+
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/test_iterator.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,81 @@
+import sys, logging
+from nose.tools import eq_
+from . import sample
+from ..iterator import Iterator, Namespace
+import util
+
+logging.basicConfig(level=logging.INFO)
+
+def test_small():
+ fp = sample.getSmallXMLFilePointer()
+ wf = Iterator(fp)
+ for key in [
+ -2, -1, 0, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 100,101,108,109
+ ]:
+ assert key in wf.namespaces.values(), "Key %s not found in %s" % (key, wf.namespaces)
+
+ for page in wf.readPages():
+ eq_(
+ page.getTitle(),
+ u'Talk:Pilsbury Block'
+ )
+ for revision in page.readRevisions():
+ eq_(
+ revision.getId(),
+ 213377884
+ )
+ eq_(
+ revision.getTimestamp(),
+ util.wp2Timestamp("2008-05-19T01:41:53Z")
+ )
+ eq_(
+ revision.getContributor().getId(),
+ 905763
+ )
+ eq_(
+ revision.getContributor().getUsername(),
+ u"Swampyank"
+ )
+ eq_(
+ revision.getMinor(),
+ False
+ )
+ eq_(
+ revision.getComment(),
+ u"[[WP:AES|\u2190]]Created page with
'{{WikiProject National Register of Historic Places|class=Stub}} {{WikiProject
Maine|class=Stub|importance=Low}} {{reqphoto|in=Maine}}'"
+ )
+
+ eq_(
+ revision.getText(),
+ u"{{WikiProject National Register of Historic
Places|class=Stub}}\n" +
+ u"{{WikiProject
Maine|class=Stub|importance=Low}}\n" +
+ u"{{reqphoto|in=Maine}}"
+ )
+
+
+
+def test_large():
+ fp = sample.getLargeXMLFilePointer()
+ wf = Iterator(fp)
+ pageCounter = 0
+ revisionCounter = 0
+ for page in wf.readPages():
+ pageCounter += 1
+ for revision in page.readRevisions():
+ assert revision.getId() != None
+ assert revision.getTimestamp() != None
+ __ = revision.getContributor()
+ __ = revision.getComment()
+ assert revision.getMinor() != None
+ assert revision.getText() != None
+ #sys.stderr.write(".")
+ revisionCounter += 1
+ if revisionCounter >= 100: break
+
+
+ eq_(pageCounter, 1)
+ #eq_(revisionCounter, 15180)
+ eq_(revisionCounter, 100)
+
Added: trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/tests/test_map.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,25 @@
+import sys, logging
+from nose.tools import eq_
+from gl import wp
+from . import sample
+from ..map import map
+
+
+def test_simple_map():
+ dumps = [sample.getSmallXMLFilePath(), sample.getLargeXMLFilePath()]
+
+ def processPage(dump, page):
+ assert hasattr(dump, "namespaces")
+ assert hasattr(page, "readRevisions")
+
+ count = 0
+ for rev in page.readRevisions():
+ count += 1
+ if count >= 100: break
+
+ yield (page.getId(), count)
+
+ output = dict(map(dumps, processPage))
+
+ eq_(output[17500012], 1)
+ eq_(output[12], 100)
Added: trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/dump/xml_iterator.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,76 @@
+try:
+ import xml.etree.cElementTree as etree
+except ImportError:
+ import xml.etree.ElementTree as etree
+
+def XMLIterator(fp):
+ xmlIterator = etree.iterparse(fp, events=("start","end"))
+ return ElementIterator(xmlIterator.next()[1], xmlIterator)
+
+class ElementIteratorError(Exception): pass
+
+class ElementIterator:
+
+ def __init__(self, element, xmlIterator):
+ self.element = element
+ self.xmlIterator = xmlIterator
+ self.tagStack = [self.element.tag]
+
+ def __iter__(self):
+ if len(self.tagStack) == 0:
+ raise ElementIteratorError("Element has already been
iterated through.")
+
+ for event, element in self.xmlIterator:
+ if event == "start":
+ element = ElementIterator(element, self.xmlIterator)
+ yield element
+ element.clear()
+
+ else: #event == "end"
+ assert element.tag == self.element.tag, "Expected %r, got %r" % (self.element.tag, element.tag)
+ self.tagStack.pop()
+
+ if len(self.tagStack) == 0:
+ break
+
+
+ def get(self, key, alt=None):
+ return self.element.attrib.get(key, alt)
+
+
+ def complete(self):
+ if len(self.tagStack) != 0:
+ for event, element in self.xmlIterator:
+ if event == "start":
+ self.tagStack.append(element.tag)
+ element.clear()
+
+ else: #event == "end"
+ assert self.tagStack[-1] == element.tag, "Expected %r at the end of %r" % (element.tag, self.tagStack)
+ self.tagStack.pop()
+
+ if len(self.tagStack) == 0:
+ break
+
+
+ def clear(self):
+ self.complete()
+ self.element.clear()
+
+
+ def __del__(self):
+ self.clear()
+
+ def __getattr__(self, attr):
+ if attr == "attrib":
+ return self.element.attrib
+ elif attr == "tag":
+ return self.element.tag
+ elif attr == "tail":
+ return self.element.tail
+ elif attr == "text":
+ self.complete()
+ return self.element.text
+ else:
+ raise AttributeError("%s has no attribute %r" %
(self.__class__.__name__, attr))
+
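A minimal sketch of the streaming element wrapper above, against a hypothetical XML file:

    from wmf.dump.xml_iterator import XMLIterator

    # Child elements are yielded lazily and cleared after use, which keeps
    # memory flat even for very large documents.
    root = XMLIterator(open("pages.xml"))  # hypothetical path
    for child in root:
        print(child.tag)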
Added: trunk/tools/wsor/wikimedia/wmf/util.py
===================================================================
--- trunk/tools/wsor/wikimedia/wmf/util.py (rev 0)
+++ trunk/tools/wsor/wikimedia/wmf/util.py 2011-06-21 22:47:39 UTC (rev 90557)
@@ -0,0 +1,236 @@
+from __future__ import with_statement, absolute_import
+import re, types
+import time, calendar, datetime
+import hashlib
+import urllib
+
+__docformat__ = "restructuredtext en"
+
+"""
+This module contains utility functions for interacting with Wikipedia.
+"""
+
+LONG_WP_TIME_STRING = '%Y-%m-%dT%H:%M:%SZ'
+"""
+The longhand version of Wikipedia timestamps.
+"""
+
+SHORT_WP_TIME_STRING = '%Y%m%d%H%M%S'
+"""
+The shorthand version of Wikipedia timestamps
+"""
+
+WPAPI_URL = "http://%s.wikipedia.org/w/api.php"
+"""
+The Wikipedia API URL. A positional format token is included so that the
+language-specific prefix can be formatted in. See `wpAPIURL()`.
+"""
+
+
+VLOOSE_RE = re.compile(r'''
+ (^revert\ to.+using)
+ | (^reverted\ edits\ by.+using)
+ | (^reverted\ edits\ by.+to\ last\ version\ by)
+ | (^bot\ -\ rv.+to\ last\ version\ by)
+ | (-assisted\ reversion)
+ | (^(revert(ed)?|rv).+to\ last)
+ | (^undo\ revision.+by)
+ ''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
+
+VSTRICT_RE = re.compile(r'''
+ (\brvv)
+ | (\brv[/ ]v)
+ | (vandal(?!proof|bot))
+ | (\b(rv|rev(ert)?|rm)\b.*(blank|spam|nonsense|porn|mass\sdelet|vand))
+ ''', re.IGNORECASE | re.DOTALL | re.VERBOSE)
+
+NAMESPACES = {
+ 'en': set([
+ 'Media',
+ 'Special',
+ 'Talk',
+ 'User talk',
+ 'Wikipedia talk',
+ 'Image talk',
+ 'MediaWiki talk',
+ 'Template talk',
+ 'Help talk',
+ 'Category talk',
+ 'Portal talk',
+ 'File talk',
+ 'User',
+ 'Wikipedia',
+ 'Image',
+ 'MediaWiki',
+ 'Template',
+ 'Help',
+ 'Category',
+ 'Portal',
+ 'File'
+ ])
+}
+
+NAMESPACE_RE = re.compile(r'^((?:%s)):' % ')|(?:'.join(NAMESPACES['en']), re.IGNORECASE)
+
+def wpAPIURL(prefix="en"):
+ """
+ Creates the URL for the Wikipedia API based on a language prefix.
+
+ :Parameters:
+ prefix : string
+ the prefix to be formatted into the url
+
+ :Return:
+ the Wikipedia API url for a given language prefix
+ """
+ return WPAPI_URL % prefix
+
+
+def wp2Timestamp(wpTime):
+ """
+ Converts a Wikipedia timestamp to a Unix Epoch-based timestamp (seconds
+ since Jan. 1st 1970 GMT). This function will handle both long
+ (see `LONG_WP_TIME_STRING`) and short (see `SHORT_WP_TIME_STRING`)
+ time formats.
+
+ :Parameters:
+ wpTime : string
+ Wikipedia timestamp to be converted
+
+ :Return:
+ integer Unix Epoch-based timestamp (seconds since Jan. 1st 1970
+ GMT) version of the provided wpTime.
+ """
+ try:
+ myTime = time.strptime(wpTime, LONG_WP_TIME_STRING)
+ except ValueError as e:
+ try:
+ myTime = time.strptime(wpTime, SHORT_WP_TIME_STRING)
+ except ValueError as e:
+ raise ValueError("'%s' is not a valid Wikipedia date
format" % wpTime)
+
+ return int(calendar.timegm(myTime))
+
+def timestamp2WP(timestamp):
+ """
+ Converts a Unix Epoch-based timestamp (seconds since Jan. 1st 1970 GMT)
+ timestamp to one acceptable by Wikipedia.
+
+ :Parameters:
+ timestamp : int
+ Unix timestamp to be converted
+
+ :Return:
+ string Wikipedia style timestamp
+ """
+
+ return datetime.datetime.utcfromtimestamp(timestamp).strftime('%Y%m%d%H%M%S')
+
+def digest(content):
+ return hashlib.md5(content.encode("utf-8")).hexdigest()
+
+
+def normalize(name):
+ """
+ Normalizes text from a Wikipedia title/segment by capitalizing the
+ first letter, replacing underscores with spaces, and stripping
+ surrounding whitespace.
+
+ :Parameters:
+ name : string
+ Namespace or title portion of a Wikipedia page name.
+
+ :Return:
+ string Normalized text
+ """
+
+ return name.capitalize().replace("_", " ").strip()
+
+def normalizeTitle(title, namespaces=NAMESPACES['en']):
+ """
+ Normalizes a Wikipedia page title and splits the title into
+ namespace and title pieces.
+
+ :Parameters:
+ title : string
+ The title of a Wikipedia page.
+ namespaces : set
+ A set of namespaces to look for in the title.
+
+ :Return:
+ The namespace, title tuple
+ """
+
+ if type(title) == types.UnicodeType:
+ title = title.encode('utf-8')
+
+ title = title.strip()
+ parts = title.split(":", 1)
+ if len(parts) == 1:
+ namespace = None
+ title = normalize(parts[0])
+ elif parts[1] == '':
+ namespace = None
+ title = normalize(title)
+ else:
+ nsPart = normalize(parts[0])
+ if nsPart in namespaces:
+ namespace = nsPart
+ title = normalize(parts[1])
+ else:
+ namespace = None
+ title = normalize(title)
+
+ return (namespace, title)
+
+def normalizeURLTitle(title, namespaces=NAMESPACES['en']):
+ """
+ Normalizes a Wikipedia page title obtained from a URL and splits
+ the title into namespace and title pieces.
+
+ :Parameters:
+ title : string
+ The title of a Wikipedia page.
+ namespaces : set
+ A set of namespaces to look for in the title.
+
+ :Return:
+ The namespace, title tuple
+ """
+
+ if type(title) == types.UnicodeType:
+ title = title.encode('utf-8')
+ title = urllib.unquote(title).split('#')[0]
+ ns = NAMESPACE_RE.match(title)
+ if not ns:
+ namespace = ""
+ title = normalize(title)
+ else:
+ nsPart = ns.group(1).capitalize()
+ if nsPart in namespaces:
+ namespace = nsPart
+ title = normalize(title[ns.end():])
+ else:
+ namespace = None
+ title = normalize(title)
+ return (namespace, title)
+
+def isVandalismByComment(editComment, testLoose=True, testStrict=True):
+ '''
+ Checks the given edit comment against the VLOOSE and VSTRICT regexes
+ as configured, and returns a boolean indicating whether it matches.
+
+ @param editComment: The edit comment to test.
+ @type editComment: str
+
+ @param testLoose: If the edit comment matches VLOOSE_RE, True is returned
+ @type testLoose: bool
+
+ @param testStrict: If the edit comment matches VSTRICT_RE, True is returned
+ @type testStrict: bool
+ '''
+
+ if testLoose and VLOOSE_RE.search(editComment):
+ return True
+ if testStrict and VSTRICT_RE.search(editComment):
+ return True
+
+ return False
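A minimal sketch of the utility functions above (sample values are made up):

    from wmf import util

    ts = util.wp2Timestamp("2008-05-19T01:41:53Z")     # seconds since the Unix epoch
    print(util.timestamp2WP(ts))                       # -> 20080519014153
    print(util.normalizeTitle("Talk:Pilsbury_Block"))  # -> ('Talk', 'Pilsbury block')
    print(util.isVandalismByComment("rv vandalism"))   # -> True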
_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs