[MediaWiki-commits] [Gerrit] operations/dumps[ariel]: toy offline reader: pylint and pep8

ArielGlenn (Code Review) Sat, 30 Dec 2017 02:09:08 -0800

ArielGlenn has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/280109 )


Change subject: toy offline reader: pylint and pep8
......................................................................


toy offline reader: pylint and pep8

Change-Id: Ib0bbf9c3bb1db1a8029a7a7a3e660e82433634b7
---
M toys/bz2multistream/INSTALL.txt
M toys/bz2multistream/wikiarticles.py
M toys/bz2multistream/writetoc.py
3 files changed, 755 insertions(+), 675 deletions(-)

Approvals:
  ArielGlenn: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/toys/bz2multistream/INSTALL.txt b/toys/bz2multistream/INSTALL.txt
index 733c686..1d955ef 100644
--- a/toys/bz2multistream/INSTALL.txt
+++ b/toys/bz2multistream/INSTALL.txt
@@ -23,7 +23,7 @@
 (or you can use the helper script sort.sh for this step)
 
 cat elwiki-pages-multistream-index-sorted.txt | \
-python ./writetoc.py --field 3 --separator ':' --indexfile elwiki-pages-multistream-toc.txt
+python ./writetoc.py --field 3 --separator ':' --tocfile elwiki-pages-multistream-toc.txt
 
 After the above, you can optionally create a configuration file "wikiarticles.conf".
 See the wikiarticles.conf.sample for more information.  An example configuration file
diff --git a/toys/bz2multistream/wikiarticles.py b/toys/bz2multistream/wikiarticles.py
index 6ebc5b0..27d1e99 100644
--- a/toys/bz2multistream/wikiarticles.py
+++ b/toys/bz2multistream/wikiarticles.py
@@ -1,209 +1,446 @@
-import getopt, os, sys, re, codecs, bz2, ConfigParser
+import getopt
+import os
+import sys
+import re
+import bz2
+import ConfigParser
+
+
+def get_choice_from_batch(titles, start, batch_size):
+    '''
+    display titles from start to start + batchsize, with count in front
+    ask the caller for a title number or an action
+    actions may be Q (quit), N (next batch), B (previous batch), R (redisplay)
+    if caller enters nothing, treat that as default (R)
+    if caller enters something else, whine and treat that as default (R) too
+    arguments:
+    titles     -- full list of titles
+    start      -- display from this point in the list
+    batch_size -- how many titles to display
+    returns a tuple of (action, title number) where one or the other of these
+    may be None
+    '''
+
+    # yay python, it will silently ignore the fact that you requested
+    # more things in the list than exist. (no this is not sarcasm)
+    count = start
+    for line in titles[start:start+batch_size]:
+        print "%s) %s" % (count+1, WATitleMunger.un_normalize_title(line))
+        count += 1
+    print
+    print("Enter number of choice, or Q/N/P/R to "
+          "quit/next page/prev page/redisplay page (default R): "),
+    choice = sys.stdin.readline()
+    choice = choice.strip()
+    if not choice:
+        choice = 'R'
+    if choice.isdigit():
+        num = int(choice)
+        if num < 1 or num > len(titles):
+            print "Bad number given."
+            return "R", None
+        return None, num
+    else:
+        choice = choice.capitalize()
+        if choice in ['N', 'P', 'Q', 'R']:
+            return choice, None
+        else:
+            print "Bad choice given."
+            return "R", None
+
+
+def process_action(action, start, batch_size, total):
+    '''
+    given a caller action,
+    update title list display pointer to the appropriate position
+    arguments:
+    action     -- Q (quit), N (next batch), P (prev batch), or anything else
+    start      -- title list display pointer, a batch of titles from the list
+                  will be displayed starting from this number
+    batch_size -- how many titles are dispayed in a batch
+    total      -- total titles in the list
+    returns: updated title list display pointer, or exits at user request
+    (action Q)
+    note that any action other than Q/N/P will result in the default R (redisplay
+    current batch of titles) which means no change, return existing value.  This
+    includes the None action.
+    '''
+    if action == 'N' or action == 'n':
+        if start + batch_size < total:
+            start += batch_size
+        else:
+            print "End of list reached."
+    elif action == 'P' or action == 'p':
+        if start > batch_size:
+            start = start - batch_size
+        else:
+            print "Beginning of list reached."
+    elif action == 'Q' or action == 'q':
+        print "Exiting at user's request."
+        sys.exit(0)
+    return start
+
+
+def get_user_title_choice(title_hash):
+    '''
+    show a numbered list of page titles on stdout and read the
+    caller's choice on stdin
+    I guess this is a poor person's pager
+    arguments:
+    titleHash -- hash of page titles and their offsets into the xml file
+    returns: the offset into the xml file for the title selected
+    '''
+    titles = title_hash.keys()
+    titles.sort()
+    total = len(titles)
+
+    choice = None
+    start = 0
+    batch_size = 30
+    print "Multiple titles found, please choose from the following."
+    while not choice:
+        (action, choice) = get_choice_from_batch(titles, start, batch_size)
+        if choice:
+            return titles[choice-1]
+        else:
+            start = process_action(action, start, batch_size, total)
+
+
+def get_text(xml):
+    '''
+    get the contents of the <text>...</text> tags if needed,
+    returns the contents found or None if none found
+    '''
+    text = None
+    if xml is not None:
+        match = re.search("<text[^>]*>(.*?)</text>", xml, flags=re.DOTALL)
+        if match:
+            text = match.group(1)
+    return text
+
+
+def get_redirect(text, localized_redir_string, verbose):
+    '''
+    look for and set page title of a redirect link in the text, if any
+    returns the redirction link or None if none was found
+    format: <text ...>#REDIRECT [[link|show to reader]] ...
+    fixme the redirect keyword really should be case insensitive but
+    have we done any unicode stuff here? nope, so that ain't happening
+
+    arguments:
+    text                   -- the raw text dug out from the <text> tags of page content
+    localized_redir_string -- the string 'REDIR' in the wiki content language
+    verbose
+    '''
+    redirect = None
+    if text is not None:
+        redir_regex = (re.compile(r"<text [^>]+>\s*#(REDIRECT|%s)\s*\[\[([^\|\]]+)"
+                                  % localized_redir_string))
+        match = redir_regex.search(text)
+        if match:
+            if verbose:
+                sys.stderr.write("Found a redirect in the page "
+                                 "text: %s\n" % match.group(2))
+            redirect = match.group(2)
+    return redirect
+
+
+def display(text, file_string, category_string, clean=False, text_only=False):
+    '''
+    display the text, optionally extracting just the content
+    from the <text> tags and optionally doing some cleanup
+    on the text before display
+    arguments:
+    text -- the page text from the xml file, everything between
+    <page>...</page>
+
+    arguments:
+    fileString      --  the string 'File' in the wiki's content language
+    category_string --  the string 'Category' in the wiki's content language
+    clean           --  whether or not to clean up various tags etc or leave the raw text
+                        as retrieved from the xml file
+    text_only       --  whether or not to include the metadata and other stuff for the page
+                        as retrieved from the xml file or just the content from the <text> tags
+    '''
+    if text is None:
+        print "No page text for that title found."
+        return
+
+    if text_only:
+        text = get_text(text)
+
+    if clean:
+        formatter = WATextFormatter(text, file_string, category_string)
+        text = formatter.do_formatting()
+
+    print text
+
+
+def handle_redirects(text, retriever, max_redirs, redir_text, verbose):
+    '''
+    follow redirect link in page text, retrieve the target page text, and
+    check that til we reach a page that's not a redirect or we hit the
+    maxRedirs limit or we hit a redirect to a nonexistent page
+
+    arguments:
+    text      -- initial page text as retrieved from the xml file,
+                 without any cleanup etc.
+    retriever -- WAPageRetriever object (used to follow the redirects)
+    returns the page text, either of the first non-redirect or the
+    last redirect before going over the redir follow limit, or
+    the last page text before following a link to a nonexistent page
+    max_redirs -- follow this many redirect links until giving up, if 0 then
+                  don't follow any (need this so we avoid redirection loops)
+    redir_text -- the string 'REDIRECT' in the wiki's content language
+    verbose    -- whether or not to display messages about processing being done
+    '''
+    if not text:
+        return text
+    redirs_done = 0
+    redir_link = None
+    while redirs_done < max_redirs:
+        if verbose:
+            sys.stderr.write("Checking for redirects in text, "
+                             "redirs done already:%s\n" % redirs_done)
+        redir_link = get_redirect(text, redir_text, verbose)
+        if not redir_link:
+            break
+        old_text = text
+        text = retriever.retrieve(WATitleMunger.normalize_title(redir_link), True)
+        if text is None:
+            # redirect to nonexistent article
+            text = old_text
+            break
+        redirs_done += 1
+    if redir_link and redirs_done >= max_redirs:
+        sys.stderr.write("Too many redirects encountered.\n")
+    return text
+
 
 class WAPageRetriever(object):
-    # retrive the xml page content from a bz2 miltistream xml file
-    # given an index of the offsets of the streams in the file and
-    # the page titles in those streams, along with a table of
-    # contents into the index file by first letter of page titles
-    def __init__(self, xmlFile, idxFile, tocFile, verbose):
-        # constructor; besides setting instance attributes,
-        # initializes a few things so we know they need to be
-        # properly set later
-        # arguments:
-        # xmlFile - name of the bz2 multistream xml file of pages
-        # idxFile - name of the index into the bz2 multistream file
-        # tocFile - name of the toc into the index file
-        # verbose - whether or not to display extra messages about processing
-        self.xmlFile = xmlFile
-        self.idxFile = idxFile
-        self.tocFile = tocFile
+    '''
+    retrive the xml page content from a bz2 miltistream xml file
+    given an index of the offsets of the streams in the file and
+    the page titles in those streams, along with a table of
+    contents into the index file by first letter of page titles
+    '''
+    def __init__(self, xml_file, idx_file, toc_file, verbose):
+        '''
+        constructor; besides setting instance attributes,
+        initializes a few things so we know they need to be
+        properly set later
+        arguments:
+        xml_file - name of the bz2 multistream xml file of pages
+        idx_file - name of the index into the bz2 multistream file
+        toc_file - name of the toc into the index file
+        verbose  - whether or not to display extra messages about processing
+        '''
+        self.xml_file = xml_file
+        self.idx_file = idx_file
+        self.toc_file = toc_file
         self.verbose = verbose
-        self.xmlFd = None
-        self.tocFd = None
-        self.idxFd = None
-        self.titleMatches = None # hash of titles and offsets that partially match, if any
-        self.setupDone = False
+        self.xml_fd = None
+        self.toc_fd = None
+        self.idx_fd = None
+        self.title_matches = None  # hash of titles and offsets that partially match, if any
+        self.setup_done = False
 
     def setup(self):
-        # call just after initalizing the instance, opens files. note that the bz2
-        # compressed file is not opened with a decompressor; since
-        # we seek around in the file we can't do that.
-        if self.setupDone:
+        '''
+        call just after initalizing the instance, opens files. note that the bz2
+        compressed file is not opened with a decompressor; since
+        we seek around in the file we can't do that.
+        '''
+        if self.setup_done:
             return
 
-        if verbose:
+        if self.verbose:
             sys.stderr.write("Opening files\n")
-        self.xmlFd = open(self.xmlFile, "r")
-        self.idxFd = open(self.idxFile, "r")
-        self.tocFd = open(self.tocFile, "r")
-        self.setupDone
+        self.xml_fd = open(self.xml_file, "r")
+        self.idx_fd = open(self.idx_file, "r")
+        self.toc_fd = open(self.toc_file, "r")
+        self.setup_done = True
 
     def teardown(self):
-        # call when the instance is no longer needed
-        # closes all file descriptors and readies
-        # the instance for a new setup() call if desired
-        if verbose:
+        '''
+        call when the instance is no longer needed
+        closes all file descriptors and readies
+        the instance for a new setup() call if desired
+        '''
+        if self.verbose:
             sys.stderr.write("Closing files\n")
-        if self.xmlFd:
-            self.xmlFd.close()
-        if self.idxFd:
-            self.idxFd.close()
-        if self.tocFd:
-            self.tocFd.close()
-        self.xmlFd = None
-        self.tocFd = None
-        self.idxFd = None
-        self.titleMatches = None
-        self.setupDone = False
+        if self.xml_fd:
+            self.xml_fd.close()
+        if self.idx_fd:
+            self.idx_fd.close()
+        if self.toc_fd:
+            self.toc_fd.close()
+        self.xml_fd = None
+        self.toc_fd = None
+        self.idx_fd = None
+        self.title_matches = None
+        self.setup_done = False
 
     def retrieve(self, title, exact):
-        # retrieve the contents of the xml file with the
-        # specified page title.
-        # arguments:
-        # title -- the page title, case sensitive, with spaces, not underscores
-        # exact -- true if the title must match exactly, otherwise a list of
-        #          page titles starting with the specified string is displayed
-        #          on stdout with a prompt for selection from stdin.
-        # returns None if no corresponding page title can be found
-        titleUnicode = title.decode("utf-8")
-        firstChar = titleUnicode[0]
-        idxOffset = self.findCharInToc(firstChar)
-        if idxOffset is None:
+        '''
+        retrieve the contents of the xml file with the
+        specified page title.
+        arguments:
+        title -- the page title, case sensitive, with spaces, not underscores
+        exact -- true if the title must match exactly, otherwise a list of
+                 page titles starting with the specified string is displayed
+                 on stdout with a prompt for selection from stdin.
+        returns None if no corresponding page title can be found
+        '''
+        title_unicode = title.decode("utf-8")
+        first_char = title_unicode[0]
+        idx_offset = self.find_char_in_toc(first_char)
+        if idx_offset is None:
             sys.stderr.write("No such title found in toc.\n")
             return None
         if exact:
             if self.verbose:
-                sys.stderr.write("Found index offset %s\n" % idxOffset)
-            result = self.retrieveExact(title, idxOffset)
+                sys.stderr.write("Found index offset %s\n" % idx_offset)
+            result = self.retrieve_exact(title, idx_offset)
         else:
-            titlesHash = self.findTitleInIndex(title, idxOffset)
+            titles_hash = self.find_title_in_index(title, idx_offset)
             # if there is only one entry in the hash don't ask the user
-            if len(titlesHash.keys()) == 1:
-                title = titlesHash.keys()[0]
+            if len(titles_hash.keys()) == 1:
+                title = titles_hash.keys()[0]
             else:
-                title = self.getUserTitleChoice(titlesHash)
-            xmlOffset = titlesHash[title]
-            result = self.retrieveExact(title, None, xmlOffset)
+                title = get_user_title_choice(titles_hash)
+            xml_offset = titles_hash[title]
+            result = self.retrieve_exact(title, None, xml_offset)
         return result
 
-    def retrieveExact(self, title, idxOffset, xmlOffset=None):
-        # retrieve the contents of the xml file with the
-        # specified page title, first seeking to the appropriate place
-        # in the xml file and possibly the index file
-        # arguments:
-        # title     -- the page title, case sensitive, with spaces, not underscores
-        # idxOffset -- the offset in bytes into the index file with the title
-        #              if this is None, the xmlOffset must be provided
-        # xmlOffset -- the offset into the xml file in bytes of the bz2
-        #              stream containing the page with the specified title
-        #              if this is None, the idxOffset must be provided
-        #              if this is provided it will be used and the idxOffset
-        #              will be ignored, as it would only be used to look up
-        #              this value
-        # returns text if found, None otherwise
-        if xmlOffset is None:
-            xmlOffset = self.findTitleInIndexExactMatch(title, idxOffset)
-        if xmlOffset is None:
+    def retrieve_exact(self, title, idx_offset, xml_offset=None):
+        '''
+        retrieve the contents of the xml file with the
+        specified page title, first seeking to the appropriate place
+        in the xml file and possibly the index file
+        arguments:
+        title      -- the page title, case sensitive, with spaces, not underscores
+        idx_offset -- the offset in bytes into the index file with the title
+                      if this is None, the xmlOffset must be provided
+        xml_offset -- the offset into the xml file in bytes of the bz2
+                      stream containing the page with the specified title
+                      if this is None, the idxOffset must be provided
+                      if this is provided it will be used and the idxOffset
+                      will be ignored, as it would only be used to look up
+                      this value
+        returns text if found, None otherwise
+        '''
+        if xml_offset is None:
+            xml_offset = self.find_title_in_index_exact_match(title, idx_offset)
+        if xml_offset is None:
             sys.stderr.write("No such title found in index.\n")
             return None
         if self.verbose:
-                sys.stderr.write("Found xml offset %s\n" % xmlOffset)
-        text = self.retrieveText(title, xmlOffset)
+            sys.stderr.write("Found xml offset %s\n" % xml_offset)
+        text = self.retrieve_text(title, xml_offset)
         return text
 
-    def findCharInToc(self, char):
-        # given a (possibly multibyte) character, find its entry
-        # in the toc file, read the index file offset listed there
-        # and return it
-        # arguments:
-        # char -- character for which to find the toc entry
-        # returns the index file offset of the character, or None if not found
-        self.tocFd.seek(0)
+    def find_char_in_toc(self, char):
+        '''
+        given a (possibly multibyte) character, find its entry
+        in the toc file, read the index file offset listed there
+        and return it
+        arguments:
+        char -- character for which to find the toc entry
+        returns the index file offset of the character, or None if not found
+        '''
+        self.toc_fd.seek(0)
         # format of these lines is
         # 14815067:A
         # this is offset, first (unicode) character
-        for line in self.tocFd:
+        for line in self.toc_fd:
             stripped = line.rstrip('\n')
             fields = stripped.split(':', 1)
-            if  len(fields) < 2:
-                next
-            indexedCharUnicode = fields[1].decode("utf-8")
-            if indexedCharUnicode == char:
+            if len(fields) < 2:
+                continue
+            indexed_char_unicode = fields[1].decode("utf-8")
+            if indexed_char_unicode == char:
                 return int(fields[0])
-            if indexedCharUnicode > char:
+            if indexed_char_unicode > char:
                 break
         return None
 
-    def findTitleInIndex(self, title, offset):
-        # find entries beginning with the specified in the index file,
-        # first seeking to the specified offset
-        # arguments:
-        # title  -- page title to be found in index
-        # offset -- offset into the index file in bytes of
-        #           the line with the specified title
-        # returns a hash of matching page titles and their offsets into
-        # the xml file, or None if no matches were found
-        titleMatches = {}
-        self.idxFd.seek(offset)
-        titleLen = len(title)
+    def find_title_in_index(self, title, offset):
+        '''
+        find entries beginning with the specified in the index file,
+        first seeking to the specified offset
+        arguments:
+        title  -- page title to be found in index
+        offset -- offset into the index file in bytes of
+                  the line with the specified title
+        returns a hash of matching page titles and their offsets into
+        the xml file, or None if no matches were found
+        '''
+        title_matches = {}
+        self.idx_fd.seek(offset)
+        title_len = len(title)
         # format of these lines is
         # 9186323419:33202778:A Girl like Me (film)
         # this is offset, page id, page title
-        for line in self.idxFd:
+        for line in self.idx_fd:
             stripped = line.rstrip("\n")
             fields = stripped.split(':', 2)
-            if  len(fields) < 2:
-                next
+            if len(fields) < 2:
+                continue
             if fields[2].startswith(title):
-                titleMatches[fields[2]] = int(fields[0]) # offset into xml file
-            if fields[2][:titleLen] > title: # we are past all the matches (if there were any)
+                title_matches[fields[2]] = int(fields[0])  # offset into xml file
+            if fields[2][:title_len] > title:  # we are past all the matches (if there were any)
                 break
-        if not len(titleMatches.keys()):
-            titleMatches = None
-        return titleMatches
+        if not len(title_matches.keys()):
+            title_matches = None
+        return title_matches
 
-    def findTitleInIndexExactMatch(self, title, offset):
-        # find entry in the index file that matches exactly the specified title,
-        # first seeking to the specified offset
-        # arguments:
-        # title  -- page title to be found in index
-        # offset -- offset into the index file in bytes of
-        #           the line with the specified title
-        # returns the offset of the title in the xml file, 
-        # as listed in the index file, or None if no exact match was found
-        self.idxFd.seek(offset)
+    def find_title_in_index_exact_match(self, title, offset):
+        '''
+        find entry in the index file that matches exactly the specified title,
+        first seeking to the specified offset
+        arguments:
+        title  -- page title to be found in index
+        offset -- offset into the index file in bytes of
+                  the line with the specified title
+        returns the offset of the title in the xml file,
+        as listed in the index file, or None if no exact match was found
+        '''
+        self.idx_fd.seek(offset)
         # format of these lines is
         # 9186323419:33202778:A Girl like Me (film)
         # this is offset, page id, page title
-        for line in self.idxFd:
+        for line in self.idx_fd:
             stripped = line.rstrip("\n")
             fields = stripped.split(':', 2)
-            if  len(fields) < 2:
-                next
+            if len(fields) < 2:
+                continue
             if len(fields) < 3:
                 sys.stderr.write("Fewer splits than we expected: line %s\n" % line)
             if fields[2] == title:
-                return int(fields[0]) # xml offset into file
+                return int(fields[0])  # xml offset into file
             if fields[2] > title:
                 break
         return None
-        
-    def retrieveText(self, title, offset):
-        # retrieve the page text for a given title from the xml file
-        # this does decompression of a bz2 stream so it's more expsive than
-        # other parts of this class
-        # arguments:
-        # title  -- the page title, with spaces and not underscores, case sensitive
-        # offset -- the offset in bytes to the bz2 stream in the xml file which contains
-        #           the page text
-        # returns the page text or None if no such page was found
-        self.xmlFd.seek(offset)
-        bz = bz2.BZ2Decompressor()
+
+    def retrieve_text(self, title, offset):
+        '''
+        retrieve the page text for a given title from the xml file
+        this does decompression of a bz2 stream so it's more expsive than
+        other parts of this class
+        arguments:
+        title  -- the page title, with spaces and not underscores, case sensitive
+        offset -- the offset in bytes to the bz2 stream in the xml file which contains
+                  the page text
+        returns the page text or None if no such page was found
+        '''
+        self.xml_fd.seek(offset)
+        unzipper = bz2.BZ2Decompressor()
         out = None
         found = False
         try:
-            block = self.xmlFd.read(262144)
-            out = bz.decompress(block)
+            block = self.xml_fd.read(262144)
+            out = unzipper.decompress(block)
         # hope we got enough back to have the page text
         except:
             raise
@@ -214,9 +451,9 @@
         #   <id>10</id>
         # ...
         #   </page>
-        titleRegex = re.compile("<page>(\s*)<title>%s(\s*)</title>" % re.escape(title))
+        title_regex = re.compile(r"<page>(\s*)<title>%s(\s*)</title>" % re.escape(title))
         while not found:
-            match = titleRegex.search(out)
+            match = title_regex.search(out)
             if match:
                 found = True
                 text = out[match.start():]
@@ -224,16 +461,16 @@
                     sys.stderr.write("Found page title, first 600 characters: %s\n" % text[:600])
                 break
             # we could have a part of the regex at the end of the string, so...
-            if len(out) > 40 + len(title): # length of the above plus extra whitespace
-                out = out[-1 *(40 + len(title)):]
+            if len(out) > 40 + len(title):  # length of the above plus extra whitespace
+                out = out[-1 * (40 + len(title)):]
+                out = out[-1 * (40 + len(title)):]
             try:
-                block = self.xmlFd.read(262144)
+                block = self.xml_fd.read(262144)
             except:
                 # reached end of file (normal case) or
                 # something really broken (other cases)
                 break
             try:
-                out = out + bz.decompress(block)
+                out = out + unzipper.decompress(block)
             except EOFError:
                 # reached end of bz2 stream
                 # EOFError  means we have some data after end of stream, don't care
@@ -254,21 +491,21 @@
                 text = text + out[:ind + len("</page>")]
                 break
             # we could have part of the end page tag at the end of the string
-            text = text + out[:-1 * len("</page>") -1]
+            text = text + out[:-1 * len("</page>") - 1]
             out = out[-1 * len("</page>"):]
             try:
-                block = self.xmlFd.read(262144)
+                block = self.xml_fd.read(262144)
             except:
                 # reached end of file (normal case) or
                 # something really broken (other cases)
                 break
             try:
-                out = out + bz.decompress(block)
+                out = out + unzipper.decompress(block)
             except EOFError:
                 # reached end of bz2 stream
                 # EOFError  means we have some data after end of stream, don't care
                 pass
-        
+
         # if not found this can be partial text. should we return it? no
         if not found:
             if self.verbose:
@@ -278,398 +515,207 @@
             text = None
         return text
 
-    def getUserTitleChoice(self, titleHash):
-        # show a numbered list of page titles on stdout and read the
-        # caller's choice on stdin
-        # I guess this is a poor person's pager
-        # arguments:
-        # titleHash -- hash of page titles and their offsets into the xml file
-        # returns: the offset into the xml file for the title selected
-        titles = titleHash.keys()
-        titles.sort()
-        total = len(titles)
-
-        choice = None
-        start = 0
-        batchSize = 30
-        print "Multiple titles found, please choose from the following."
-        while not choice:
-            (action, choice) = self.getChoiceFromBatch(titles, start, 
batchSize)
-            if choice:
-                return titles[choice-1]
-            else:
-                start = self.processAction(action, start, batchSize, total)
-
-    def getChoiceFromBatch(self, titles, start, batchSize):
-        # display titles from start to start + batchsize, with count in front
-        # ask the caller for a title number or an action
-        # actions may be Q (quit), N (next batch), B (previous batch), R 
(redisplay)
-        # if caller enters nothing, treat that as default (R)
-        # if caller enters something else, whine and treat that as default (R) 
too
-        # arguments:
-        # titles     -- full list of titles
-        # start      -- display from this point in the list
-        # batchSize  -- how many titles to display
-        # returns a tuple of (action, title number) where one or the other of 
these
-        # may be None
-
-        # yay python, it will silently ignore the fact that you requested
-        # more things in the list than exist. (no this is not sarcasm)
-        count = start
-        for line in titles[start:start+batchSize]:
-            print "%s) %s" % (count+1, WATitleMunger.unNormalizeTitle(line))
-            count += 1
-        print
-        print "Enter number of choice, or Q/N/P/R to quit/next page/prev 
page/redisplay page (default R): ",
-        choice = sys.stdin.readline()
-        choice = choice.strip()
-        if not choice:
-            choice = 'R'
-        if choice.isdigit():
-            num = int(choice)
-            if num < 1 or num > len(titles):
-                print "Bad number given."
-                return("R", None)
-            return(None, num)
-        else:
-            choice = choice.capitalize()
-            if choice in [ 'N', 'P', 'Q', 'R' ]:
-                return(choice, None)
-            else:
-                print "Bad choice given."
-                return("R", None)
-
-    def processAction(self, action, start, batchSize, total):
-        # given a caller action,
-        # update title list display pointer to the appropriate position
-        # arguments:
-        # action    --  Q (quit), N (next batch), P (prev batch), or anything 
else
-        # start     --  title list display pointer, a batch of titles from the 
list
-        #               will be displayed starting from this number
-        # batchSize -- how many titles are dispayed in a batch
-        # total     -- total titles in the list
-        # returns: updated title list display pointer, or exits at user request
-        # (action Q)
-        # note that any action other than Q/N/P will result in the default R 
(redisplay
-        # current batch of titles) which means no change, return existing 
value.  This
-        # includes the None action.
-        if action == 'N' or action == 'n':
-            if start + batchSize < total:
-                start += batchSize
-            else:
-                print "End of list reached."
-        elif action == 'P' or action == 'p':
-            if start > batchSize:
-                start = start - batchSize
-            else:
-                print "Beginning of list reached."
-        elif action == 'Q' or action == 'q':
-            print "Exiting at user's request."
-            sys.exit(0)
-        return(start)
 
 class WATextFormatter(object):
-    # format page text for a given title as desired by the caller
-    # we do this since we don't have a real renderer of wikitext
-    # with template expansion and all that crapola
-    def __init__(self, text, localizedFileString, localizedCategoryString):
-        # constructor
-        # arguments:
-        # text                    -- page text, could also include xml tags 
and page metadata
-        # localizedFileString     -- the string 'File' in the local wiki 
language
-        # localizedCategoryString -- the string 'Category' in the local wiki 
language
+    '''
+    format page text for a given title as desired by the caller
+    we do this since we don't have a real renderer of wikitext
+    with template expansion and all that crapola
+    '''
+    def __init__(self, text, localized_file_string, localized_category_string):
+        '''
+        constructor
+        arguments:
+        text                      -- page text, could also include xml tags 
and page metadata
+        localized_file_string     -- the string 'File' in the local wiki 
language
+        localized_category_string -- the string 'Category' in the local wiki 
language
+        '''
         self.text = text
-        self.localizedFileString = localizedFileString
-        self.localizedCategoryString = localizedCategoryString
-        self.formattingDone = False
+        self.localized_file_string = localized_file_string
+        self.localized_category_string = localized_category_string
+        self.formatting_done = False
 
-    def cleanupLinks(self):
-        # for all links (has [[ ]] and maybe | in them  -- no special 
treatment for interwiki links
-        # or categories, sorrry but this is a rough cut), toss the [[ ]] and 
the pipe arg if any.
-        # except file and category
+    def cleanup_links(self):
+        '''
+        for all links (has [[ ]] and maybe | in them  -- no special treatment 
for interwiki links
+        or categories, sorry but this is a rough cut), toss the [[ ]] and the 
pipe arg if any.
+        except file and category
+        '''
         if self.text is not None:
-            nopipes = 
re.sub("\[\[(?!(File|Category|%s|%s))([^\|\]]+)\|([^\]]+)\]\]" %( 
self.localizedFileString, self.localizedCategoryString ), "\\3", self.text)
-            nowikilinks = re.sub("\[\[(?!(File|Category|%s|%s))([^\]]+)\]\]" % 
( self.localizedFileString, self.localizedCategoryString ),"\\2", nopipes)
+            nopipes = 
re.sub(r"\[\[(?!(File|Category|%s|%s))([^\|\]]+)\|([^\]]+)\]\]"
+                             % (self.localized_file_string, 
self.localized_category_string),
+                             "\\3", self.text)
+            nowikilinks = re.sub(r"\[\[(?!(File|Category|%s|%s))([^\]]+)\]\]"
+                                 % (self.localized_file_string, 
self.localized_category_string),
+                                 "\\2", nopipes)
             self.text = nowikilinks
 
-    def cleanupText(self):
-        # convert html entities back into <>"&, remove wiki markup for 
bold/italics, remove <span> tags
+    def cleanup_text(self):
+        '''
+        convert html entities back into <>"&,
+        remove wiki markup for bold/italics, remove <span> tags
+        '''
         if self.text is not None:
-            noampersands = self.text.replace("&lt;", 
'<').replace("&gt;",'>').replace("&quot;",'"').replace("&amp;",'&').replace("&nbsp;",'
 ')
-            nofontstyling = noampersands.replace("'''","").replace("''","")
-            nospans = re.sub("</?span[^>]*>","", nofontstyling)
+            noampersands = self.text.replace(
+                "&lt;", '<').replace("&gt;", '>').replace(
+                    "&quot;", '"').replace("&amp;", '&').replace("&nbsp;", ' ')
+            nofontstyling = noampersands.replace("'''", "").replace("''", "")
+            nospans = re.sub("</?span[^>]*>", "", nofontstyling)
             self.text = nospans
 
-    def cleanupRefs(self):
-        # toss the refs, this should really be overridable by the user. we 
want this so it's
-        # easier to read the plaintext of the article, there will already be a 
ton
-        # of templates and crap in there
+    def cleanup_refs(self):
+        '''
+        toss the refs, this should really be overridable by the user. we want 
this so it's
+        easier to read the plaintext of the article, there will already be a 
ton
+        of templates and crap in there
+        '''
         if self.text is not None:
-            norefs = re.sub("<ref[^>]*>.*?</ref>","", self.text, flags = 
re.DOTALL)
+            norefs = re.sub("<ref[^>]*>.*?</ref>", "", self.text, 
flags=re.DOTALL)
             # <ref name="mises.org"/>
             nosimplerefs = re.sub("<ref.*?/>", "", norefs)
             self.text = nosimplerefs
 
-    def cleanupHtmlComments(self):
-        # toss html (<!-- -->) comments, <nowiki>, <code> and <sup> tags, and 
<br> tags
+    def cleanup_html_comments(self):
+        '''
+        toss html (<!-- -->) comments, <nowiki>, <code> and <sup> tags, and 
<br> tags
+        '''
         if self.text is not None:
-            nocomments = re.sub("<!--.*?-->","", self.text, flags = re.DOTALL)
-            nonowikis = re.sub("</?nowiki>","", nocomments)
-            nocodes = re.sub("</?code>","", nonowikis)
-            nobrs = re.sub("<br\s*/>","",nocodes)
-            nosups = re.sub("</?sup>","",nobrs)
+            nocomments = re.sub(r"<!--.*?-->", "", self.text, flags=re.DOTALL)
+            nonowikis = re.sub(r"</?nowiki>", "", nocomments)
+            nocodes = re.sub(r"</?code>", "", nonowikis)
+            nobrs = re.sub(r"<br\s*/>", "", nocodes)
+            nosups = re.sub(r"</?sup>", "", nobrs)
             self.text = nosups
 
-    def doFormatting(self):
-        # do all the text formatting in some reasonable order
-        # and return the formatted text
-        if not self.formattingDone:
-            self.cleanupLinks()
-            self.cleanupText()
-            self.cleanupRefs()
-            self.cleanupHtmlComments()
-            self.formattingDone = True
-        return(self.text)
-
-class WAXMLExtractor(object):
-    # get various things from the xml page text
-    def __init__(self, XML):
-        # constructor
-        # arguments:
-        # XML -- the xml text of the page, including the <page>...</page>
-        #        tags and everything in between
-        self.XML = XML
-        self.text = None
-
-    def getText(self):
-        # get the contents of the <text>...</text> tags if needed,
-        # returns the contents found or None if none found
-        if self.text is None:
-            if self.XML is not None:
-                match = re.search("<text[^>]*>(.*?)</text>", self.XML, flags = 
re.DOTALL)
-                if match:
-                    self.text = match.group(1)
+    def do_formatting(self):
+        '''
+        do all the text formatting in some reasonable order
+        and return the formatted text
+        '''
+        if not self.formatting_done:
+            self.cleanup_links()
+            self.cleanup_text()
+            self.cleanup_refs()
+            self.cleanup_html_comments()
+            self.formatting_done = True
         return self.text
-
-class WATextExtractor(object):
-    # retrieve various things from page text
-    # right now various = redirection info, but this could have more things 
later
-    def __init__(self, text, localizedRedirString):
-        # constructor
-        # arguments:
-        # text                 -- the raw text dug out from the <text> tags of 
page content
-        # localizedRedirString -- the string 'REDIR' in the wiki content 
language
-        self.text = text
-        self.localizedRedirString = localizedRedirString
-        self.redirect = None
-
-    def getRedirect(self):
-        # look for and set page title of a redirect link in the text, if any
-        # returns the redirction link or None if none was found
-        # format: <text ...>#REDIRECT [[link|show to reader]] ...
-        # fixme the redirect keyword really should be case insensitive but
-        # have we done any unicode stuff here? nope, so that ain't happening
-        if self.redirect is None:
-            if self.text is not None:
-                redirRegex = re.compile("<text 
[^>]+>\s*#(REDIRECT|%s)\s*\[\[([^\|\]]+)" % self.localizedRedirString)
-                match = redirRegex.search(self.text)
-                if match:
-                    if verbose:
-                        sys.stderr.write("Found a redirect in the page text: 
%s\n" % match.group(2))
-                    self.redirect = match.group(2)
-        return self.redirect
 
 
 class WATitleMunger(object):
-    # transform page title to the format in the xml file
-    # or to ordinary plaintext
-
+    '''
+    transform page title to the format in the xml file
+    or to ordinary plaintext
+    '''
     @staticmethod
-    def normalizeTitle(title):
+    def normalize_title(title):
+        '''
         # doesn't do much right now. remember how this is only a proof of 
concept??
-        return title.replace('_', ' ').replace('&','&amp;').replace('"', 
"&quot;")
+        '''
+        return title.replace('_', ' ').replace('&', '&amp;').replace('"', 
"&quot;")
 
     @staticmethod
-    def unNormalizeTitle(title):
+    def un_normalize_title(title):
+        '''
         # not an exact opposite kids cause of the underscore, that's the breaks
-        return title.replace('&amp;','&').replace("&quot;", '"')
+        '''
+        return title.replace('&amp;', '&').replace("&quot;", '"')
 
-class WATextDisplay(object):
-    # process and display text of a page from the xml file of page content
-    def __init__(self, fileString, categoryString, clean = False, textOnly = 
False):
-        # constructor
-        # arguments:
-        # fileString     --  the string 'File' in the wiki's content language
-        # categoryString --  the string 'Category' in the wiki's content 
language
-        # clean          --  whether or not to clean up various tags etc or 
leave the raw text
-        #                    as retrieved from the xml file
-        # textOnly       --  whether or not to include the metadata and other 
stuff for the page
-        #                    as retrieved from the xml file or just the 
content from the <text> tags
-        self.fileString = fileString
-        self.categoryString = categoryString
-        self.clean = clean
-        self.textOnly= textOnly
-    
-    def display(self, text):
-        # display the text, optionally extracting just the content
-        # from the <text> tags and optionally doing some cleanup
-        # on the text before display
-        # arguments:
-        # text -- the page text from the xml file, everything between
-        # <page>...</page>
-        if text is None:
-            print "No page text for that title found."
-            return
 
-        if self.textOnly:
-            xe = WAXMLExtractor(text)
-            text = xe.getText()
-                          
-        if self.clean:
-            tf = WATextFormatter(text, self.fileString, self.categoryString)
-            text = tf.doFormatting()
+def usage(message=None):
+    '''
+    display usage information about the script, after optionally
+    displaying a specified message
+    arguments:
+    message -- message to be displayed before usage information
+               if omitted, only the usage information will be shown
+    '''
+    if message:
+        sys.stderr.write("%s\n" % message)
+    usage_message = """
+Usage: python wikiarticles.py --title titlestring --xmlfile filename
+                 --idxfile filename --tocfile filename [--configfile filename]
+                 [--maxredirs num] [--redirtext string] [--cleanup] [--exact]
+                 [--textonly] [--verbose]
 
-        print text
+Given a bz2-compressed multistream xml file of articles, a sorted plain text
+index file into the article file, and a plain text toc file for the index,
+find and display the xml including article text of any article specified
+by title.
 
-class WAErrorHandler(object):
-    # display warning and error message
-    def __init__(self, whoami):
-        # constructor
-        # arguments:
-        # whoami -- the name of the script being executed
-        self.whoami = whoami
+The user may specify the first so many characters of the title, in which
+case all matching titles will be displayed as a list so that the user
+may select the one desired.
 
-    def usage(self, message = None):
-        # display usage information about the script, after optionally
-        # displaying a specified message
-        # arguments:
-        # message -- message to be displayed before usage information
-        #            if omitted, only the usage information will be shown
-        if message:
-            sys.stderr.write("%s\n" % message)
-        sys.stderr.write("Usage: python %s --title titlestring --xmlfile 
filename\n" % self.whoami)
-        sys.stderr.write("           --idxfile filename --tocfile filename 
[--configfile filename]\n")
-        sys.stderr.write("           [--maxredirs num] [--redirtext string] 
[--cleanup] [--exact]\n")
-        sys.stderr.write("           [--textonly] [--verbose]\n")
-        sys.stderr.write("\n")
-        sys.stderr.write("Given a bz2-compressed multistream xml file of 
articles, a sorted plain text\n")
-        sys.stderr.write("index file into the article file, and a plain text 
toc file for the index,\n")
-        sys.stderr.write("find and display the xml including article text of 
any article specified\n")
-        sys.stderr.write("by title.\n")
-        sys.stderr.write("\n")
-        sys.stderr.write("The user may specify the first so many characters of 
the title, in which\n")
-        sys.stderr.write("case all matching titles will be displayed as a list 
so that the user\n")
-        sys.stderr.write("may select the one desired.\n")
-        sys.stderr.write("\n")
-        sys.stderr.write("If no such title is found, an error message will be 
displayed.\n")
-        sys.stderr.write("\n")
-        sys.stderr.write("Titles are case-sensitive for now.\n")
-        sys.stderr.write("\n")
-        sys.stderr.write("A reasonable front end would parse the xml, strip or 
expand templates,\n")
-        sys.stderr.write("do something interesting with citations, references 
and links, etc.\n")
-        sys.stderr.write("This script does none of that; it is a proof of 
concept only.\n")
-        sys.stderr.write("\n")
-        sys.stderr.write("Arguments:\n")
-        sys.stderr.write("--title:         first so many characters of the 
article title\n")
-        sys.stderr.write("--xmlfile:       path to the bz2 compressed xml 
format article file\n")
-        sys.stderr.write("--idxfile:       plain text file which is the index 
into the bz2 xml file\n")
-        sys.stderr.write("--tocfile:       plain text file which is the toc of 
the index file\n")
-        sys.stderr.write("--configfile:    plain text file which contains 
config options\n")
-        sys.stderr.write("--maxredirs:     maximum number of redirects to 
follow\n")
-        sys.stderr.write("                 default: 3\n")
-        sys.stderr.write("--categorytext:  text of the 'category' string in 
the wiki's content language\n")
-        sys.stderr.write("                 default: Category\n")
-        sys.stderr.write("--filetext:      text of the 'file' string in the 
wiki's content language\n")
-        sys.stderr.write("                 default: File\n")
-        sys.stderr.write("--redirtext:     text in capital letters of the 
'redirect' string in the\n")
-        sys.stderr.write("                 wiki's content language\n")
-        sys.stderr.write("                 default: REDIRECT\n")
-        sys.stderr.write("\n")
-        sys.stderr.write("Flags:\n")
-        sys.stderr.write("--cleanup:   cleanup text (remove refs, font 
stylings, etc) for ease of reading\n")
-        sys.stderr.write("             default: false\n")
-        sys.stderr.write("--exact:     require exact match of specified 
title\n")
-        sys.stderr.write("             default: false\n")
-        sys.stderr.write("--textonly:  print only the contents of the 
xml<text> tag, not the rest of the\n")
-        sys.stderr.write("             page info\n")
-        sys.stderr.write("             default: false\n")
-        sys.stderr.write("--verbose:   print extra message about what is being 
done\n")
-        sys.stderr.write("             default: false\n")
-        sys.stderr.write("\n")
-        sys.stderr.write("Example:\n")
-        sys.stderr.write("python %s --exact --xmlfile 
enwiki-articles-current.xml.bz2 \\\n" % self.whoami)
-        sys.stderr.write("          --idxfile articles-index-sorted.txt 
--tocfile index-toc.txt\n")
-        sys.exit(1)
+If no such title is found, an error message will be displayed.
 
-class WARedirectHandler(object):
-    # follow redirect links in the page text
-    def __init__(self, maxRedirs, redirText, verbose):
-        # constructor
-        # arguments:
-        # maxRedirs -- follow this many redirect links until giving up, if 0 
then
-        #              don't follow any (need this so we avoid redirection 
loops)
-        # redirText -- the string 'REDIRECT' in the wiki's content language
-        # verbose   -- whether or not to display messages about processing 
being done
-        self.maxRedirs = maxRedirs
-        self.redirText = redirText
-        self.verbose = verbose
+Titles are case-sensitive for now.
 
-    def handleRedirects(self, text, retriever):
-        # follow redirect link in page text, retrieve the target page text, and
-        # check that til we reach a page that's not a redirect or we hit the
-        # maxRedirs limit or we hit a redirect to a nonexistent page
-        # arguments:
-        # text      -- initial page text as retrieved from the xml file,
-        #              without any cleanup etc.
-        # retriever -- WAPageRetriever object (used to follow the redirects)
-        # returns the page text, either of the first non-redirect or the
-        # last redirect before going over the redir follow limit, or
-        # the last page text before following a link to a nonexistent page
-        if not text:
-            return text
-        redirsDone = 0
-        redirLink = None
-        while redirsDone < self.maxRedirs:
-            if self.verbose:
-                sys.stderr.write("Checking for redirects in text, redirs done 
already:%s\n" % redirsDone)
-            te = WATextExtractor(text, self.redirText)
-            redirLink = te.getRedirect()
-            if not redirLink:
-                break
-            oldText = text
-            text = retriever.retrieve(WATitleMunger.normalizeTitle(redirLink), 
True)
-            if text is None:
-                # redirect to nonexistent article
-                text = oldText
-                break
-            redirsDone += 1
-        if redirLink and redirsDone >= maxRedirs:
-            sys.stderr.write("Too many redirects encountered.\n")
-        return text
+A reasonable front end would parse the xml, strip or expand templates,
+do something interesting with citations, references and links, etc.
+This script does none of that; it is a proof of concept only.
 
-def readConfig(configFile=None):
-    # set up configuration defaults and read overriding values from files in
-    # the current directory, /etc, and the user's home directory, if they exist
-    # arguments:
-    # configFile -- name of the configuration file in the current dir, if any
-    # returns a ConfigParser object with the configuration values in it
+Arguments:
+
+  --title:         first so many characters of the article title
+  --xmlfile:       path to the bz2 compressed xml format article file
+  --idxfile:       plain text file which is the index into the bz2 xml file
+  --tocfile:       plain text file which is the toc of the index file
+  --configfile:    plain text file which contains config options
+  --maxredirs:     maximum number of redirects to follow
+                   default: 3
+  --categorytext:  text of the 'category' string in the wiki's content language
+                   default: Category
+  --filetext:      text of the 'file' string in the wiki's content language
+                   default: File
+  --redirtext:     text in capital letters of the 'redirect' string in the
+                   wiki's content language
+                   default: REDIRECT
+
+Flags:
+  --cleanup:   cleanup text (remove refs, font stylings, etc) for ease of 
reading
+               default: false
+  --exact:     require exact match of specified title
+               default: false
+  --textonly:  print only the contents of the xml<text> tag, not the rest of 
the
+               page info
+               default: false
+  --verbose:   print extra message about what is being done
+               default: false
+
+Example:
+
+python wikiarticles.py --exact --xmlfile enwiki-articles-current.xml.bz2 \\
+          --idxfile articles-index-sorted.txt --tocfile index-toc.txt
+"""
+    sys.stderr.write(usage_message)
+    sys.exit(1)
+
+
+def read_config(config_file=None):
+    '''
+    set up configuration defaults and read overriding values from files in
+    the current directory, /etc, and the user's home directory, if they exist
+    arguments:
+    config_file -- name of the configuration file in the current dir, if any
+    returns a ConfigParser object with the configuration values in it
+    '''
     home = os.path.dirname(sys.argv[0])
-    if (not configFile):
-        configFile = "wikiarticles.conf"
+    if not config_file:
+        config_file = "wikiarticles.conf"
 
     # fixme I should really check what order these get read in
     # and which files override which
     files = [
-        os.path.join(home,configFile),
+        os.path.join(home, config_file),
         "/etc/wikiarticles.conf",
         os.path.join(os.getenv("HOME"), ".wikiarticles.conf")]
 
     defaults = {
-        #"files": {
+        # "files": {
         "xmlfile": "",
         "idxfile": "",
         "tocfile": "",
-        #"format": {,
+        # "format": {,
         "cleanup": "0",
         "textonly": "0",
         "maxredirs": "3",
@@ -688,92 +734,98 @@
 
     return conf
 
-if __name__ == "__main__":
-    configFileName = None
-    xmlFileName = None
-    indexFileName = None
-    tocFileName = None
-    pageTitle = None
-    exactMatch = None
-    verbose = None
-    maxRedirs = None
-    fileText = None
-    categoryText = None
-    redirText = None
-    cleanup = None
-    textOnly = None
 
-    errs = WAErrorHandler(sys.argv[0])
+def do_main():
+    config_file_name = None
+    xml_file_name = None
+    index_file_name = None
+    toc_file_name = None
+    page_title = None
+    exact_match = None
+    verbose = None
+    max_redirs = None
+    file_text = None
+    category_text = None
+    redir_text = None
+    cleanup = None
+    text_only = None
 
     try:
-        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", 
['xmlfile=', 'idxfile=', 'tocfile=', 'title=', 'configfile=', 'maxredirs=', 
"filetext=", "categorytext=", "redirtext=", 'cleanup', 'exact', 'textonly', 
'verbose' ])
+        (options, remainder) = getopt.gnu_getopt(
+            sys.argv[1:], "", ['xmlfile=', 'idxfile=', 'tocfile=',
+                               'title=', 'configfile=', 'maxredirs=',
+                               "filetext=", "categorytext=", "redirtext=",
+                               'cleanup', 'exact', 'textonly', 'verbose'])
     except:
-        errs.usage("Unknown option specified")
+        usage("Unknown option specified")
 
     for (opt, val) in options:
         if opt == "--xmlfile":
-            xmlFileName = val
+            xml_file_name = val
         elif opt == "--idxfile":
-            indexFileName = val
+            index_file_name = val
         elif opt == "--tocfile":
-            tocFileName = val
+            toc_file_name = val
         elif opt == "--title":
-            pageTitle = val
+            page_title = val
         elif opt == "--configfile":
-            configFileName = val
+            config_file_name = val
         elif opt == "--maxredirs":
             if not val.isdigit() or int(val) < 0:
-                errs.usage("maxredirs must be a non-negative integer.")
-            maxRedirs = int(val)
+                usage("maxredirs must be a non-negative integer.")
+            max_redirs = int(val)
         elif opt == "--redirtext":
-            redirText = val
+            redir_text = val
         elif opt == "--cleanup":
             cleanup = True
         elif opt == "--exact":
-            exactMatch = True
+            exact_match = True
         elif opt == "--textonly":
-            textOnly = True
+            text_only = True
         elif opt == "--verbose":
             verbose = True
 
-    if (len(remainder) > 0):
-        errs.usage("Unknown option specified")
+    if len(remainder) > 0:
+        usage("Unknown option specified")
 
-    conf = readConfig(configFileName)
-    
+    conf = read_config(config_file_name)
+
     # check config file for fallbacks.
-    if xmlFileName is None:
-        xmlFileName = conf.get("files", "xmlfile")
-    if indexFileName is None:
-        indexFileName = conf.get("files", "idxfile")
-    if tocFileName is None:
-        tocFileName = conf.get("files", "tocfile")
-    if maxRedirs is None:
-        maxRedirs = conf.getint("format", "maxredirs")
-    if redirText is None:
-        redirText = conf.get("format", "redirtext")
-    if fileText is None:
-        fileText = conf.get("format", "filetext")
-    if categoryText is None:
-        categoryText = conf.get("format", "categorytext")
+    if xml_file_name is None:
+        xml_file_name = conf.get("files", "xmlfile")
+    if index_file_name is None:
+        index_file_name = conf.get("files", "idxfile")
+    if toc_file_name is None:
+        toc_file_name = conf.get("files", "tocfile")
+    if max_redirs is None:
+        max_redirs = conf.getint("format", "maxredirs")
+    if redir_text is None:
+        redir_text = conf.get("format", "redirtext")
+    if file_text is None:
+        file_text = conf.get("format", "filetext")
+    if category_text is None:
+        category_text = conf.get("format", "categorytext")
     if cleanup is None:
         cleanup = conf.getboolean("format", "cleanup")
-    if textOnly is None:
-        textOnly = conf.getboolean("format", "textonly")
+    if text_only is None:
+        text_only = conf.getboolean("format", "textonly")
 
-    mandatory = [ ("xmlfile", xmlFileName), ("idxfile", indexFileName), 
("tocfile", tocFileName), ("title", pageTitle) ]
-    for (optName, val) in mandatory:
+    mandatory = [("xmlfile", xml_file_name), ("idxfile", index_file_name),
+                 ("tocfile", toc_file_name), ("title", page_title)]
+    for (opt_name, val) in mandatory:
         if not val:
-            errs.usage("Missing required option '%s'" % optName)
+            usage("Missing required option '%s'" % opt_name)
 
-    pr = WAPageRetriever(xmlFileName, indexFileName, tocFileName, verbose)
-    pr.setup()
-    text = pr.retrieve(WATitleMunger.normalizeTitle(pageTitle), exactMatch)
+    retriever = WAPageRetriever(xml_file_name, index_file_name, toc_file_name, 
verbose)
+    retriever.setup()
+    text = retriever.retrieve(WATitleMunger.normalize_title(page_title), 
exact_match)
     if text:
-        rh = WARedirectHandler(maxRedirs, redirText, verbose)
-        text = rh.handleRedirects(text, pr)
+        text = handle_redirects(text, retriever, max_redirs, redir_text, 
verbose)
 
-    td = WATextDisplay(fileText, categoryText, cleanup, textOnly)
-    td.display(text)
+    display(text, file_text, category_text, cleanup, text_only)
 
-    pr.teardown()
+    retriever.teardown()
+
+
+if __name__ == "__main__":
+    do_main()
diff --git a/toys/bz2multistream/writetoc.py b/toys/bz2multistream/writetoc.py
index 70d6d2b..8d2da93 100644
--- a/toys/bz2multistream/writetoc.py
+++ b/toys/bz2multistream/writetoc.py
@@ -1,139 +1,163 @@
-import getopt, os, sys, re, codecs
+import getopt
+import sys
+import codecs
 
-class indexTOC(object):
-    # Generate a table of contents for an index file,
-    # where the TOC will consist of lines containing
-    # offset:char
-    # where char is a unique starting character of
-    # the text field in the index file, and offset is
-    # the offset into the index file of the first
-    # line with an article starting with the specific
-    # character.
 
-    # The index file should have lines of the format:
-    # xxx:xxx:...:text field:xxx:...
-    # or instead of ':' you can use the field separator
-    # of your choice.
-    # For the purposes of this script we only care about
-    # the contents of the text field.
-    # The index file must have been sorted by the text field
-    # so that all entries starting with the same first
-    # character are consecutive in the file.
+class IndexTOC(object):
+    '''
+    Generate a table of contents for an index file,
+    where the TOC will consist of lines containing
+    offset:char
+    where char is a unique starting character of
+    the text field in the index file, and offset is
+    the offset into the index file of the first
+    line with an article starting with the specific
+    character.
 
-    def __init__(self, inputFd, fieldNum, separator, verbose):
-        # constructor
-        # arguments:
-        # inputFd   -- open file descriptor from which index lines will be read
-        #              it better have been opened with utf8 codec if there are
-        #              any unicode characters in the text fields
-        # fieldNum  -- number of field containing text, numbering starts at 1
-        # sep       -- field separator. for wmf index files this is ':'
-        # verbose   -- whether or not to display info about processing of the index lines
-        self.inputFd = inputFd
-        self.fieldNum = fieldNum
+    The index file should have lines of the format:
+    xxx:xxx:...:text field:xxx:...
+    or instead of ':' you can use the field separator
+    of your choice.
+    For the purposes of this script we only care about
+    the contents of the text field.
+    The index file must have been sorted by the text field
+    so that all entries starting with the same first
+    character are consecutive in the file.
+    '''
+
+    def __init__(self, input_fd, field_num, separator, verbose):
+        '''
+        constructor
+        arguments:
+        input_fd  -- open file descriptor from which index
+                     lines will be read; it better have been
+                     opened with utf8 codec if there are any
+                     unicode characters in the text fields
+        field_num -- number of field containing text, numbering
+                     starts at 1
+        sep       -- field separator. for wmf index files this
+                     is ':'
+        verbose   -- whether or not to display info about
+                     processing of the index lines
+        '''
+        self.input_fd = input_fd
+        self.field_num = field_num
         self.sep = separator
         self.verbose = verbose
-        self.currentChar = None
+        self.current_char = None
         self.offset = 0
 
-    def doTOC(self, outFd):
-        # read all input from the input file descriptor
-        # and write a TOC file for that input to the
-        # specified output file descriptor, which should
-        # already have been set up for writing by the
-        # caller
-        outFdUTF8 = codecs.getwriter("utf-8")(outFd)
-        self.currentChar = None
+    def do_toc(self, out_fd):
+        '''
+        read all input from the input file descriptor
+        and write a TOC file for that input to the
+        specified output file descriptor, which should
+        already have been set up for writing by the
+        caller
+        '''
+        out_fd_utf8 = codecs.getwriter("utf-8")(out_fd)
+        self.current_char = None
         self.offset = 0
-        for line in self.inputFd:
-            self.processLine(line, outFdUTF8)
-        
-    def processLine(self, line, outFd):
-        # for a given line of input, see if the
-        # text field in the line starts with a new
-        # unique first character, and if so, write
-        # a TOC entry for that character to the
-        # specified output file descriptor
-        firstChar = self.getFirstCharFromField(line)
-        if not firstChar:
+        for line in self.input_fd:
+            self.process_line(line, out_fd_utf8)
+
+    def process_line(self, line, out_fd):
+        '''
+        for a given line of input, see if the
+        text field in the line starts with a new
+        unique first character, and if so, write
+        a TOC entry for that character to the
+        specified output file descriptor
+        '''
+        first_char = self.get_first_char_from_field(line)
+        if not first_char:
             if self.verbose:
                 sys.stderr.write("no first char retrieved for line: %s, skipping\n" % line)
-            self.offset += len(line.encode('utf-8'))
-            next
-        if not self.currentChar or firstChar != self.currentChar:
+        elif not self.current_char or first_char != self.current_char:
             if self.verbose:
                 sys.stderr.write("new first char for line: %s, recording\n" % line)
-            self.currentChar = firstChar
-            outFd.write("%s:%s\n" % (self.offset, firstChar))
+            self.current_char = first_char
+            out_fd.write("%s:%s\n" % (self.offset, first_char))
         self.offset += len(line.encode('utf-8'))
 
-    def getFirstCharFromField(self, line):
-        # find the text field in the given line
-        # and return the first character (not byte) in the field
-        # or None if there is none
+    def get_first_char_from_field(self, line):
+        '''
+        find the text field in the given line
+        and return the first character (not byte) in the field
+        or None if there is none
+        '''
         stripped = line.rstrip('\n')
-        fields = stripped.split(self.sep, self.fieldNum-1)
-        if  len(fields) < fieldNum:
+        fields = stripped.split(self.sep, self.field_num-1)
+        if  len(fields) < self.field_num:
             return None
-        if not len(fields[fieldNum -1]):
+        if not len(fields[self.field_num -1]):
             return None
-        return fields[fieldNum -1][0]
+        return fields[self.field_num -1][0]
 
-def usage(message = None):
+
+def usage(message=None):
     if message:
         sys.stderr.write("%s\n" % message)
-    sys.stderr.write("Usage: python %s --field=num --separator=char --tocfile=filename [--verbose]\n" % sys.argv[0])
-    sys.stderr.write("\n")
-    sys.stderr.write("Given plain text input consisting lines with several fields with a given\n")
-    sys.stderr.write("separator, which have been sorted by a specified field from each line, write\n")
-    sys.stderr.write("a TOC  (table of contents) which contains a list of the unique first\n")
-    sys.stderr.write("characters of the sort field and the offset to the first line of the file in\n")
-    sys.stderr.write("which the sort field starts with that character.  In other words, if the text\n")
-    sys.stderr.write("fields of the input file all happen to start only with a,b,c, and q, there\n")
-    sys.stderr.write("will be exactly four lines in the created TOC, with offsets to the first\n")
-    sys.stderr.write("line from the input with the sort field starting with a, the first line\n")
-    sys.stderr.write("from the input the sort field starting with b, and so on.\n")
-    sys.stderr.write("\n")
-    sys.stderr.write("This is used to create a TOC into an article XML multistream index\n")
-    sys.stderr.write("(after it has been uncompressed and sorted by article title), so that\n")
-    sys.stderr.write("retrieval of article text from the article XML multistream content file\n")
-    sys.stderr.write("can be done quickly without a database or other server-client model.\n")
-    sys.stderr.write("\n")
-    sys.stderr.write("--field:     the number of the field with which the input file was\n")
-    sys.stderr.write("             alphabetically sorted, starting with 1\n")
-    sys.stderr.write("             default: 1\n")
-    sys.stderr.write("--tocfile:   path to the TOC file which will be created\n")
-    sys.stderr.write("--separator: the string used to separate fields in the input file\n")
-    sys.stderr.write("             default: space\n")
-    sys.stderr.write("--verbose:   display extra messages about what is being done\n")
-    sys.stderr.write("\n")
-    sys.stderr.write("Example: LC_ALL_save=`echo $LC_ALL`; LC_ALL=C; export LC_ALL; \\\n")
-    sys.stderr.write("         bzcat enwiki-20120902-pages-articles-multistream-index.txt.bz2 | \\\n")
-    sys.stderr.write("         sort -k 3 -t ':' > enwiki-20120902-pages-articles-multistream-index-sorted.txt; \\\n")
-    sys.stderr.write("         LC_ALL=${LC_ALL_save}; export LC_ALL\n")
-    sys.stderr.write("\n")
-    sys.stderr.write("         cat enwiki-20120902-pages-articles-multistream-index-sorted.txt | \\\n")
-    sys.stderr.write("         python %s --field 3 --separator ':' --tocfile enwiki-20120902-pages-articles-multistream-index-sorted-idx.txt\n" % sys.argv[0])
+    usage_message = """
+Usage: python writetoc.py --field=num --separator=char
+                --tocfile=filename [--verbose]
+
+Given plain text input consisting lines with several fields with a given
+separator, which have been sorted by a specified field from each line, write
+a TOC  (table of contents) which contains a list of the unique first
+characters of the sort field and the offset to the first line of the file in
+which the sort field starts with that character.  In other words, if the text
+fields of the input file all happen to start only with a,b,c, and q, there
+will be exactly four lines in the created TOC, with offsets to the first
+line from the input with the sort field starting with a, the first line
+from the input the sort field starting with b, and so on.
+
+This is used to create a TOC into an article XML multistream index
+(after it has been uncompressed and sorted by article title), so that
+retrieval of article text from the article XML multistream content file
+can be done quickly without a database or other server-client model.
+
+--field:     the number of the field with which the input file was
+             alphabetically sorted, starting with 1
+             default: 1
+--tocfile:   path to the TOC file which will be created
+--separator: the string used to separate fields in the input file
+             default: space
+--verbose:   display extra messages about what is being done
+
+Example: LC_ALL_save=`echo $LC_ALL`; LC_ALL=C; export LC_ALL; \\
+         bzcat enwiki-20120902-pages-articles-multistream-index.txt.bz2 | \\
+         sort -k 3 -t ':' > \\
+         enwiki-20120902-pages-articles-multistream-index-sorted.txt; \\
+         LC_ALL=${LC_ALL_save}; export LC_ALL
+
+         cat enwiki-20120902-pages-articles-multistream-index-sorted.txt | \\
+         python writetoc.py --field 3 --separator ':' \\
+         --tocfile enwiki-20120902-pages-articles-multistream-index-sorted-idx.txt
+"""
+    sys.stderr.write(usage_message)
     sys.exit(1)
 
-if __name__ == "__main__":
-    tocFileName = None
-    fieldNum = 1
+
+def do_main():
+    toc_file_name = None
+    field_num = 1
     separator = ' '
     verbose = False
 
     try:
-        (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", ['field=', 'tocfile=', 'separator=', 'verbose' ])
-    except:
+        (options, remainder) = getopt.gnu_getopt(
+            sys.argv[1:], "", ['field=', 'tocfile=', 'separator=', 'verbose'])
+    except Exception:
         usage("Unknown option specified")
 
     for (opt, val) in options:
         if opt == "--field":
             if not val.isdigit():
                 usage("Bad value specified for 'field' option")
-            fieldNum = int(val)
+            field_num = int(val)
         elif opt == "--tocfile":
-            tocFileName = val
+            toc_file_name = val
         elif opt == "--separator":
             if len(separator) != 1:
                 usage("Bad value specified for 'separator' option")
@@ -141,23 +165,27 @@
         elif opt == "--verbose":
             verbose = True
 
-    if (len(remainder) > 0):
+    if len(remainder) > 0:
         usage("Unknown option specified")
 
-    if (not tocFileName):
+    if not toc_file_name:
         usage("Missing required option 'tocfile'")
 
     try:
-        outFile = open(tocFileName, "w")
+        out_file = open(toc_file_name, "w")
     except:
-        sys.stderr.write("failed to open file %s for writing\n", tocFileName)
+        sys.stderr.write("failed to open file %s for writing\n", toc_file_name)
         raise
 
-    inFile = codecs.getreader("utf-8")(sys.stdin)
-    
-    toc = indexTOC(inFile, fieldNum, separator, verbose)
-    toc.doTOC(outFile)
+    in_file = codecs.getreader("utf-8")(sys.stdin)
 
-    outFile.close()
+    toc = IndexTOC(in_file, field_num, separator, verbose)
+    toc.do_toc(out_file)
 
-    exit(0);
+    out_file.close()
+
+    exit(0)
+
+
+if __name__ == "__main__":
+    do_main()

-- 
To view, visit https://gerrit.wikimedia.org/r/280109
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ib0bbf9c3bb1db1a8029a7a7a3e660e82433634b7
Gerrit-PatchSet: 3
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] operations/dumps[ariel]: toy offline reader: pylint and pep8

Reply via email to