ArielGlenn has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/280109 )
Change subject: toy offline reader: pylint and pep8
......................................................................
toy offline reader: pylint and pep8
Change-Id: Ib0bbf9c3bb1db1a8029a7a7a3e660e82433634b7
---
M toys/bz2multistream/INSTALL.txt
M toys/bz2multistream/wikiarticles.py
M toys/bz2multistream/writetoc.py
3 files changed, 755 insertions(+), 675 deletions(-)
Approvals:
ArielGlenn: Looks good to me, approved
jenkins-bot: Verified
diff --git a/toys/bz2multistream/INSTALL.txt b/toys/bz2multistream/INSTALL.txt
index 733c686..1d955ef 100644
--- a/toys/bz2multistream/INSTALL.txt
+++ b/toys/bz2multistream/INSTALL.txt
@@ -23,7 +23,7 @@
(or you can use the helper script sort.sh for this step)
cat elwiki-pages-multistream-index-sorted.txt | \
-python ./writetoc.py --field 3 --separator ':' --indexfile elwiki-pages-multistream-toc.txt
+python ./writetoc.py --field 3 --separator ':' --tocfile elwiki-pages-multistream-toc.txt
After the above, you can optionally create a configuration file
"wikiarticles.conf".
See the wikiarticles.conf.sample for more information. An example
configuration file
diff --git a/toys/bz2multistream/wikiarticles.py b/toys/bz2multistream/wikiarticles.py
index 6ebc5b0..27d1e99 100644
--- a/toys/bz2multistream/wikiarticles.py
+++ b/toys/bz2multistream/wikiarticles.py
@@ -1,209 +1,446 @@
-import getopt, os, sys, re, codecs, bz2, ConfigParser
+import getopt
+import os
+import sys
+import re
+import bz2
+import ConfigParser
+
+
+def get_choice_from_batch(titles, start, batch_size):
+ '''
+ display titles from start to start + batchsize, with count in front
+ ask the caller for a title number or an action
+ actions may be Q (quit), N (next batch), B (previous batch), R (redisplay)
+ if caller enters nothing, treat that as default (R)
+ if caller enters something else, whine and treat that as default (R) too
+ arguments:
+ titles -- full list of titles
+ start -- display from this point in the list
+ batch_size -- how many titles to display
+ returns a tuple of (action, title number) where one or the other of these
+ may be None
+ '''
+
+ # yay python, it will silently ignore the fact that you requested
+ # more things in the list than exist. (no this is not sarcasm)
+ count = start
+ for line in titles[start:start+batch_size]:
+ print "%s) %s" % (count+1, WATitleMunger.un_normalize_title(line))
+ count += 1
+ print
+ print("Enter number of choice, or Q/N/P/R to "
+ "quit/next page/prev page/redisplay page (default R): "),
+ choice = sys.stdin.readline()
+ choice = choice.strip()
+ if not choice:
+ choice = 'R'
+ if choice.isdigit():
+ num = int(choice)
+ if num < 1 or num > len(titles):
+ print "Bad number given."
+ return "R", None
+ return None, num
+ else:
+ choice = choice.capitalize()
+ if choice in ['N', 'P', 'Q', 'R']:
+ return choice, None
+ else:
+ print "Bad choice given."
+ return "R", None
+
+
+def process_action(action, start, batch_size, total):
+ '''
+ given a caller action,
+ update title list display pointer to the appropriate position
+ arguments:
+ action -- Q (quit), N (next batch), P (prev batch), or anything else
+ start -- title list display pointer, a batch of titles from the list
+ will be displayed starting from this number
+ batch_size -- how many titles are dispayed in a batch
+ total -- total titles in the list
+ returns: updated title list display pointer, or exits at user request
+ (action Q)
+ note that any action other than Q/N/P will result in the default R (redisplay
+ current batch of titles) which means no change, return existing value. This
+ includes the None action.
+ '''
+ if action == 'N' or action == 'n':
+ if start + batch_size < total:
+ start += batch_size
+ else:
+ print "End of list reached."
+ elif action == 'P' or action == 'p':
+ if start > batch_size:
+ start = start - batch_size
+ else:
+ print "Beginning of list reached."
+ elif action == 'Q' or action == 'q':
+ print "Exiting at user's request."
+ sys.exit(0)
+ return start
+
+
+def get_user_title_choice(title_hash):
+ '''
+ show a numbered list of page titles on stdout and read the
+ caller's choice on stdin
+ I guess this is a poor person's pager
+ arguments:
+ titleHash -- hash of page titles and their offsets into the xml file
+ returns: the offset into the xml file for the title selected
+ '''
+ titles = title_hash.keys()
+ titles.sort()
+ total = len(titles)
+
+ choice = None
+ start = 0
+ batch_size = 30
+ print "Multiple titles found, please choose from the following."
+ while not choice:
+ (action, choice) = get_choice_from_batch(titles, start, batch_size)
+ if choice:
+ return titles[choice-1]
+ else:
+ start = process_action(action, start, batch_size, total)
+
+
+def get_text(xml):
+ '''
+ get the contents of the <text>...</text> tags if needed,
+ returns the contents found or None if none found
+ '''
+ text = None
+ if xml is not None:
+ match = re.search("<text[^>]*>(.*?)</text>", xml, flags=re.DOTALL)
+ if match:
+ text = match.group(1)
+ return text
+
+
+def get_redirect(text, localized_redir_string, verbose):
+ '''
+ look for and set page title of a redirect link in the text, if any
+ returns the redirction link or None if none was found
+ format: <text ...>#REDIRECT [[link|show to reader]] ...
+ fixme the redirect keyword really should be case insensitive but
+ have we done any unicode stuff here? nope, so that ain't happening
+
+ arguments:
+ text -- the raw text dug out from the <text> tags of
page content
+ localized_redir_string -- the string 'REDIR' in the wiki content language
+ verbose
+ '''
+ redirect = None
+ if text is not None:
+ redir_regex = (re.compile(r"<text [^>]+>\s*#(REDIRECT|%s)\s*\[\[([^\|\]]+)"
+ % localized_redir_string))
+ match = redir_regex.search(text)
+ if match:
+ if verbose:
+ sys.stderr.write("Found a redirect in the page "
+ "text: %s\n" % match.group(2))
+ redirect = match.group(2)
+ return redirect
+
+
+def display(text, file_string, category_string, clean=False, text_only=False):
+ '''
+ display the text, optionally extracting just the content
+ from the <text> tags and optionally doing some cleanup
+ on the text before display
+ arguments:
+ text -- the page text from the xml file, everything between
+ <page>...</page>
+
+ arguments:
+ fileString -- the string 'File' in the wiki's content language
+ category_string -- the string 'Category' in the wiki's content language
+ clean -- whether or not to clean up various tags etc or leave
the raw text
+ as retrieved from the xml file
+ text_only -- whether or not to include the metadata and other stuff
for the page
+ as retrieved from the xml file or just the content
from the <text> tags
+ '''
+ if text is None:
+ print "No page text for that title found."
+ return
+
+ if text_only:
+ text = get_text(text)
+
+ if clean:
+ formatter = WATextFormatter(text, file_string, category_string)
+ text = formatter.do_formatting()
+
+ print text
+
+
+def handle_redirects(text, retriever, max_redirs, redir_text, verbose):
+ '''
+ follow redirect link in page text, retrieve the target page text, and
+ check that til we reach a page that's not a redirect or we hit the
+ maxRedirs limit or we hit a redirect to a nonexistent page
+
+ arguments:
+ text -- initial page text as retrieved from the xml file,
+ without any cleanup etc.
+ retriever -- WAPageRetriever object (used to follow the redirects)
+ returns the page text, either of the first non-redirect or the
+ last redirect before going over the redir follow limit, or
+ the last page text before following a link to a nonexistent page
+ max_redirs -- follow this many redirect links until giving up, if 0 then
+ don't follow any (need this so we avoid redirection loops)
+ redir_text -- the string 'REDIRECT' in the wiki's content language
+ verbose -- whether or not to display messages about processing being
done
+ '''
+ if not text:
+ return text
+ redirs_done = 0
+ redir_link = None
+ while redirs_done < max_redirs:
+ if verbose:
+ sys.stderr.write("Checking for redirects in text, "
+ "redirs done already:%s\n" % redirs_done)
+ redir_link = get_redirect(text, redir_text, verbose)
+ if not redir_link:
+ break
+ old_text = text
+ text = retriever.retrieve(WATitleMunger.normalize_title(redir_link), True)
+ if text is None:
+ # redirect to nonexistent article
+ text = old_text
+ break
+ redirs_done += 1
+ if redir_link and redirs_done >= max_redirs:
+ sys.stderr.write("Too many redirects encountered.\n")
+ return text
+
class WAPageRetriever(object):
- # retrive the xml page content from a bz2 miltistream xml file
- # given an index of the offsets of the streams in the file and
- # the page titles in those streams, along with a table of
- # contents into the index file by first letter of page titles
- def __init__(self, xmlFile, idxFile, tocFile, verbose):
- # constructor; besides setting instance attributes,
- # initializes a few things so we know they need to be
- # properly set later
- # arguments:
- # xmlFile - name of the bz2 multistream xml file of pages
- # idxFile - name of the index into the bz2 multistream file
- # tocFile - name of the toc into the index file
- # verbose - whether or not to display extra messages about processing
- self.xmlFile = xmlFile
- self.idxFile = idxFile
- self.tocFile = tocFile
+ '''
+ retrive the xml page content from a bz2 miltistream xml file
+ given an index of the offsets of the streams in the file and
+ the page titles in those streams, along with a table of
+ contents into the index file by first letter of page titles
+ '''
+ def __init__(self, xml_file, idx_file, toc_file, verbose):
+ '''
+ constructor; besides setting instance attributes,
+ initializes a few things so we know they need to be
+ properly set later
+ arguments:
+ xml_file - name of the bz2 multistream xml file of pages
+ idx_file - name of the index into the bz2 multistream file
+ toc_file - name of the toc into the index file
+ verbose - whether or not to display extra messages about processing
+ '''
+ self.xml_file = xml_file
+ self.idx_file = idx_file
+ self.toc_file = toc_file
self.verbose = verbose
- self.xmlFd = None
- self.tocFd = None
- self.idxFd = None
- self.titleMatches = None # hash of titles and offsets that partially
match, if any
- self.setupDone = False
+ self.xml_fd = None
+ self.toc_fd = None
+ self.idx_fd = None
+ self.title_matches = None # hash of titles and offsets that partially
match, if any
+ self.setup_done = False
def setup(self):
- # call just after initalizing the instance, opens files. note that the
bz2
- # compressed file is not opened with a decompressor; since
- # we seek around in the file we can't do that.
- if self.setupDone:
+ '''
+ call just after initalizing the instance, opens files. note that the
bz2
+ compressed file is not opened with a decompressor; since
+ we seek around in the file we can't do that.
+ '''
+ if self.setup_done:
return
- if verbose:
+ if self.verbose:
sys.stderr.write("Opening files\n")
- self.xmlFd = open(self.xmlFile, "r")
- self.idxFd = open(self.idxFile, "r")
- self.tocFd = open(self.tocFile, "r")
- self.setupDone
+ self.xml_fd = open(self.xml_file, "r")
+ self.idx_fd = open(self.idx_file, "r")
+ self.toc_fd = open(self.toc_file, "r")
+ self.setup_done = True
def teardown(self):
- # call when the instance is no longer needed
- # closes all file descriptors and readies
- # the instance for a new setup() call if desired
- if verbose:
+ '''
+ call when the instance is no longer needed
+ closes all file descriptors and readies
+ the instance for a new setup() call if desired
+ '''
+ if self.verbose:
sys.stderr.write("Closing files\n")
- if self.xmlFd:
- self.xmlFd.close()
- if self.idxFd:
- self.idxFd.close()
- if self.tocFd:
- self.tocFd.close()
- self.xmlFd = None
- self.tocFd = None
- self.idxFd = None
- self.titleMatches = None
- self.setupDone = False
+ if self.xml_fd:
+ self.xml_fd.close()
+ if self.idx_fd:
+ self.idx_fd.close()
+ if self.toc_fd:
+ self.toc_fd.close()
+ self.xml_fd = None
+ self.toc_fd = None
+ self.idx_fd = None
+ self.title_matches = None
+ self.setup_done = False
def retrieve(self, title, exact):
- # retrieve the contents of the xml file with the
- # specified page title.
- # arguments:
- # title -- the page title, case sensitive, with spaces, not underscores
- # exact -- true if the title must match exactly, otherwise a list of
- # page titles starting with the specified string is displayed
- # on stdout with a prompt for selection from stdin.
- # returns None if no corresponding page title can be found
- titleUnicode = title.decode("utf-8")
- firstChar = titleUnicode[0]
- idxOffset = self.findCharInToc(firstChar)
- if idxOffset is None:
+ '''
+ retrieve the contents of the xml file with the
+ specified page title.
+ arguments:
+ title -- the page title, case sensitive, with spaces, not underscores
+ exact -- true if the title must match exactly, otherwise a list of
+ page titles starting with the specified string is displayed
+ on stdout with a prompt for selection from stdin.
+ returns None if no corresponding page title can be found
+ '''
+ title_unicode = title.decode("utf-8")
+ first_char = title_unicode[0]
+ idx_offset = self.find_char_in_toc(first_char)
+ if idx_offset is None:
sys.stderr.write("No such title found in toc.\n")
return None
if exact:
if self.verbose:
- sys.stderr.write("Found index offset %s\n" % idxOffset)
- result = self.retrieveExact(title, idxOffset)
+ sys.stderr.write("Found index offset %s\n" % idx_offset)
+ result = self.retrieve_exact(title, idx_offset)
else:
- titlesHash = self.findTitleInIndex(title, idxOffset)
+ titles_hash = self.find_title_in_index(title, idx_offset)
# if there is only one entry in the hash don't ask the user
- if len(titlesHash.keys()) == 1:
- title = titlesHash.keys()[0]
+ if len(titles_hash.keys()) == 1:
+ title = titles_hash.keys()[0]
else:
- title = self.getUserTitleChoice(titlesHash)
- xmlOffset = titlesHash[title]
- result = self.retrieveExact(title, None, xmlOffset)
+ title = get_user_title_choice(titles_hash)
+ xml_offset = titles_hash[title]
+ result = self.retrieve_exact(title, None, xml_offset)
return result
- def retrieveExact(self, title, idxOffset, xmlOffset=None):
- # retrieve the contents of the xml file with the
- # specified page title, first seeking to the appropriate place
- # in the xml file and possibly the index file
- # arguments:
- # title -- the page title, case sensitive, with spaces, not
underscores
- # idxOffset -- the offset in bytes into the index file with the title
- # if this is None, the xmlOffset must be provided
- # xmlOffset -- the offset into the xml file in bytes of the bz2
- # stream containing the page with the specified title
- # if this is None, the idxOffset must be provided
- # if this is provided it will be used and the idxOffset
- # will be ignored, as it would only be used to look up
- # this value
- # returns text if found, None otherwise
- if xmlOffset is None:
- xmlOffset = self.findTitleInIndexExactMatch(title, idxOffset)
- if xmlOffset is None:
+ def retrieve_exact(self, title, idx_offset, xml_offset=None):
+ '''
+ retrieve the contents of the xml file with the
+ specified page title, first seeking to the appropriate place
+ in the xml file and possibly the index file
+ arguments:
+ title -- the page title, case sensitive, with spaces, not
underscores
+ idx_offset -- the offset in bytes into the index file with the title
+ if this is None, the xmlOffset must be provided
+ xml_offset -- the offset into the xml file in bytes of the bz2
+ stream containing the page with the specified title
+ if this is None, the idxOffset must be provided
+ if this is provided it will be used and the idxOffset
+ will be ignored, as it would only be used to look up
+ this value
+ returns text if found, None otherwise
+ '''
+ if xml_offset is None:
+ xml_offset = self.find_title_in_index_exact_match(title, idx_offset)
+ if xml_offset is None:
sys.stderr.write("No such title found in index.\n")
return None
if self.verbose:
- sys.stderr.write("Found xml offset %s\n" % xmlOffset)
- text = self.retrieveText(title, xmlOffset)
+ sys.stderr.write("Found xml offset %s\n" % xml_offset)
+ text = self.retrieve_text(title, xml_offset)
return text
- def findCharInToc(self, char):
- # given a (possibly multibyte) character, find its entry
- # in the toc file, read the index file offset listed there
- # and return it
- # arguments:
- # char -- character for which to find the toc entry
- # returns the index file offset of the character, or None if not found
- self.tocFd.seek(0)
+ def find_char_in_toc(self, char):
+ '''
+ given a (possibly multibyte) character, find its entry
+ in the toc file, read the index file offset listed there
+ and return it
+ arguments:
+ char -- character for which to find the toc entry
+ returns the index file offset of the character, or None if not found
+ '''
+ self.toc_fd.seek(0)
# format of these lines is
# 14815067:A
# this is offset, first (unicode) character
- for line in self.tocFd:
+ for line in self.toc_fd:
stripped = line.rstrip('\n')
fields = stripped.split(':', 1)
- if len(fields) < 2:
- next
- indexedCharUnicode = fields[1].decode("utf-8")
- if indexedCharUnicode == char:
+ if len(fields) < 2:
+ continue
+ indexed_char_unicode = fields[1].decode("utf-8")
+ if indexed_char_unicode == char:
return int(fields[0])
- if indexedCharUnicode > char:
+ if indexed_char_unicode > char:
break
return None
- def findTitleInIndex(self, title, offset):
- # find entries beginning with the specified in the index file,
- # first seeking to the specified offset
- # arguments:
- # title -- page title to be found in index
- # offset -- offset into the index file in bytes of
- # the line with the specified title
- # returns a hash of matching page titles and their offsets into
- # the xml file, or None if no matches were found
- titleMatches = {}
- self.idxFd.seek(offset)
- titleLen = len(title)
+ def find_title_in_index(self, title, offset):
+ '''
+ find entries beginning with the specified in the index file,
+ first seeking to the specified offset
+ arguments:
+ title -- page title to be found in index
+ offset -- offset into the index file in bytes of
+ the line with the specified title
+ returns a hash of matching page titles and their offsets into
+ the xml file, or None if no matches were found
+ '''
+ title_matches = {}
+ self.idx_fd.seek(offset)
+ title_len = len(title)
# format of these lines is
# 9186323419:33202778:A Girl like Me (film)
# this is offset, page id, page title
- for line in self.idxFd:
+ for line in self.idx_fd:
stripped = line.rstrip("\n")
fields = stripped.split(':', 2)
- if len(fields) < 2:
- next
+ if len(fields) < 2:
+ continue
if fields[2].startswith(title):
- titleMatches[fields[2]] = int(fields[0]) # offset into xml file
- if fields[2][:titleLen] > title: # we are past all the matches (if there were any)
+ title_matches[fields[2]] = int(fields[0]) # offset into xml file
+ if fields[2][:title_len] > title: # we are past all the matches (if there were any)
break
- if not len(titleMatches.keys()):
- titleMatches = None
- return titleMatches
+ if not len(title_matches.keys()):
+ title_matches = None
+ return title_matches
- def findTitleInIndexExactMatch(self, title, offset):
- # find entry in the index file that matches exactly the specified
title,
- # first seeking to the specified offset
- # arguments:
- # title -- page title to be found in index
- # offset -- offset into the index file in bytes of
- # the line with the specified title
- # returns the offset of the title in the xml file,
- # as listed in the index file, or None if no exact match was found
- self.idxFd.seek(offset)
+ def find_title_in_index_exact_match(self, title, offset):
+ '''
+ find entry in the index file that matches exactly the specified title,
+ first seeking to the specified offset
+ arguments:
+ title -- page title to be found in index
+ offset -- offset into the index file in bytes of
+ the line with the specified title
+ returns the offset of the title in the xml file,
+ as listed in the index file, or None if no exact match was found
+ '''
+ self.idx_fd.seek(offset)
# format of these lines is
# 9186323419:33202778:A Girl like Me (film)
# this is offset, page id, page title
- for line in self.idxFd:
+ for line in self.idx_fd:
stripped = line.rstrip("\n")
fields = stripped.split(':', 2)
- if len(fields) < 2:
- next
+ if len(fields) < 2:
+ continue
if len(fields) < 3:
sys.stderr.write("Fewer splits than we expected: line %s\n" %
line)
if fields[2] == title:
- return int(fields[0]) # xml offset into file
+ return int(fields[0]) # xml offset into file
if fields[2] > title:
break
return None
-
- def retrieveText(self, title, offset):
- # retrieve the page text for a given title from the xml file
- # this does decompression of a bz2 stream so it's more expsive than
- # other parts of this class
- # arguments:
- # title -- the page title, with spaces and not underscores, case
sensitive
- # offset -- the offset in bytes to the bz2 stream in the xml file
which contains
- # the page text
- # returns the page text or None if no such page was found
- self.xmlFd.seek(offset)
- bz = bz2.BZ2Decompressor()
+
+ def retrieve_text(self, title, offset):
+ '''
+ retrieve the page text for a given title from the xml file
+ this does decompression of a bz2 stream so it's more expsive than
+ other parts of this class
+ arguments:
+ title -- the page title, with spaces and not underscores, case
sensitive
+ offset -- the offset in bytes to the bz2 stream in the xml file which
contains
+ the page text
+ returns the page text or None if no such page was found
+ '''
+ self.xml_fd.seek(offset)
+ unzipper = bz2.BZ2Decompressor()
out = None
found = False
try:
- block = self.xmlFd.read(262144)
- out = bz.decompress(block)
+ block = self.xml_fd.read(262144)
+ out = unzipper.decompress(block)
# hope we got enough back to have the page text
except:
raise
@@ -214,9 +451,9 @@
# <id>10</id>
# ...
# </page>
- titleRegex = re.compile("<page>(\s*)<title>%s(\s*)</title>" % re.escape(title))
+ title_regex = re.compile(r"<page>(\s*)<title>%s(\s*)</title>" % re.escape(title))
while not found:
- match = titleRegex.search(out)
+ match = title_regex.search(out)
if match:
found = True
text = out[match.start():]
@@ -224,16 +461,16 @@
sys.stderr.write("Found page title, first 600 characters: %s\n" % text[:600])
break
# we could have a part of the regex at the end of the string, so...
- if len(out) > 40 + len(title): # length of the above plus extra
whitespace
- out = out[-1 *(40 + len(title)):]
+ if len(out) > 40 + len(title): # length of the above plus extra
whitespace
+ out = out[-1 * (40 + len(title)):]
try:
- block = self.xmlFd.read(262144)
+ block = self.xml_fd.read(262144)
except:
# reached end of file (normal case) or
# something really broken (other cases)
break
try:
- out = out + bz.decompress(block)
+ out = out + unzipper.decompress(block)
except EOFError:
# reached end of bz2 stream
# EOFError means we have some data after end of stream, don't
care
@@ -254,21 +491,21 @@
text = text + out[:ind + len("</page>")]
break
# we could have part of the end page tag at the end of the string
- text = text + out[:-1 * len("</page>") -1]
+ text = text + out[:-1 * len("</page>") - 1]
out = out[-1 * len("</page>"):]
try:
- block = self.xmlFd.read(262144)
+ block = self.xml_fd.read(262144)
except:
# reached end of file (normal case) or
# something really broken (other cases)
break
try:
- out = out + bz.decompress(block)
+ out = out + unzipper.decompress(block)
except EOFError:
# reached end of bz2 stream
# EOFError means we have some data after end of stream, don't
care
pass
-
+
# if not found this can be partial text. should we return it? no
if not found:
if self.verbose:
@@ -278,398 +515,207 @@
text = None
return text
- def getUserTitleChoice(self, titleHash):
- # show a numbered list of page titles on stdout and read the
- # caller's choice on stdin
- # I guess this is a poor person's pager
- # arguments:
- # titleHash -- hash of page titles and their offsets into the xml file
- # returns: the offset into the xml file for the title selected
- titles = titleHash.keys()
- titles.sort()
- total = len(titles)
-
- choice = None
- start = 0
- batchSize = 30
- print "Multiple titles found, please choose from the following."
- while not choice:
- (action, choice) = self.getChoiceFromBatch(titles, start,
batchSize)
- if choice:
- return titles[choice-1]
- else:
- start = self.processAction(action, start, batchSize, total)
-
- def getChoiceFromBatch(self, titles, start, batchSize):
- # display titles from start to start + batchsize, with count in front
- # ask the caller for a title number or an action
- # actions may be Q (quit), N (next batch), B (previous batch), R
(redisplay)
- # if caller enters nothing, treat that as default (R)
- # if caller enters something else, whine and treat that as default (R)
too
- # arguments:
- # titles -- full list of titles
- # start -- display from this point in the list
- # batchSize -- how many titles to display
- # returns a tuple of (action, title number) where one or the other of
these
- # may be None
-
- # yay python, it will silently ignore the fact that you requested
- # more things in the list than exist. (no this is not sarcasm)
- count = start
- for line in titles[start:start+batchSize]:
- print "%s) %s" % (count+1, WATitleMunger.unNormalizeTitle(line))
- count += 1
- print
- print "Enter number of choice, or Q/N/P/R to quit/next page/prev
page/redisplay page (default R): ",
- choice = sys.stdin.readline()
- choice = choice.strip()
- if not choice:
- choice = 'R'
- if choice.isdigit():
- num = int(choice)
- if num < 1 or num > len(titles):
- print "Bad number given."
- return("R", None)
- return(None, num)
- else:
- choice = choice.capitalize()
- if choice in [ 'N', 'P', 'Q', 'R' ]:
- return(choice, None)
- else:
- print "Bad choice given."
- return("R", None)
-
- def processAction(self, action, start, batchSize, total):
- # given a caller action,
- # update title list display pointer to the appropriate position
- # arguments:
- # action -- Q (quit), N (next batch), P (prev batch), or anything
else
- # start -- title list display pointer, a batch of titles from the
list
- # will be displayed starting from this number
- # batchSize -- how many titles are dispayed in a batch
- # total -- total titles in the list
- # returns: updated title list display pointer, or exits at user request
- # (action Q)
- # note that any action other than Q/N/P will result in the default R
(redisplay
- # current batch of titles) which means no change, return existing
value. This
- # includes the None action.
- if action == 'N' or action == 'n':
- if start + batchSize < total:
- start += batchSize
- else:
- print "End of list reached."
- elif action == 'P' or action == 'p':
- if start > batchSize:
- start = start - batchSize
- else:
- print "Beginning of list reached."
- elif action == 'Q' or action == 'q':
- print "Exiting at user's request."
- sys.exit(0)
- return(start)
class WATextFormatter(object):
- # format page text for a given title as desired by the caller
- # we do this since we don't have a real renderer of wikitext
- # with template expansion and all that crapola
- def __init__(self, text, localizedFileString, localizedCategoryString):
- # constructor
- # arguments:
- # text -- page text, could also include xml tags
and page metadata
- # localizedFileString -- the string 'File' in the local wiki
language
- # localizedCategoryString -- the string 'Category' in the local wiki
language
+ '''
+ format page text for a given title as desired by the caller
+ we do this since we don't have a real renderer of wikitext
+ with template expansion and all that crapola
+ '''
+ def __init__(self, text, localized_file_string, localized_category_string):
+ '''
+ constructor
+ arguments:
+ text -- page text, could also include xml tags
and page metadata
+ localized_file_string -- the string 'File' in the local wiki
language
+ localized_category_string -- the string 'Category' in the local wiki
language
+ '''
self.text = text
- self.localizedFileString = localizedFileString
- self.localizedCategoryString = localizedCategoryString
- self.formattingDone = False
+ self.localized_file_string = localized_file_string
+ self.localized_category_string = localized_category_string
+ self.formatting_done = False
- def cleanupLinks(self):
- # for all links (has [[ ]] and maybe | in them -- no special
treatment for interwiki links
- # or categories, sorrry but this is a rough cut), toss the [[ ]] and
the pipe arg if any.
- # except file and category
+ def cleanup_links(self):
+ '''
+ for all links (has [[ ]] and maybe | in them -- no special treatment
for interwiki links
+ or categories, sorrry but this is a rough cut), toss the [[ ]] and the
pipe arg if any.
+ except file and category
+ '''
if self.text is not None:
- nopipes = re.sub("\[\[(?!(File|Category|%s|%s))([^\|\]]+)\|([^\]]+)\]\]" %( self.localizedFileString, self.localizedCategoryString ), "\\3", self.text)
- nowikilinks = re.sub("\[\[(?!(File|Category|%s|%s))([^\]]+)\]\]" % ( self.localizedFileString, self.localizedCategoryString ),"\\2", nopipes)
+ nopipes = re.sub(r"\[\[(?!(File|Category|%s|%s))([^\|\]]+)\|([^\]]+)\]\]"
+ % (self.localized_file_string, self.localized_category_string),
+ "\\3", self.text)
+ nowikilinks = re.sub(r"\[\[(?!(File|Category|%s|%s))([^\]]+)\]\]"
+ % (self.localized_file_string, self.localized_category_string),
+ "\\2", nopipes)
self.text = nowikilinks
- def cleanupText(self):
- # convert html entities back into <>"&, remove wiki markup for
bold/italics, remove <span> tags
+ def cleanup_text(self):
+ '''
+ convert html entities back into <>"&,
+ remove wiki markup for bold/italics, remove <span> tags
+ '''
if self.text is not None:
- noampersands = self.text.replace("&lt;", '<').replace("&gt;",'>').replace("&quot;",'"').replace("&amp;",'&').replace("&nbsp;",' ')
- nofontstyling = noampersands.replace("'''","").replace("''","")
- nospans = re.sub("</?span[^>]*>","", nofontstyling)
+ noampersands = self.text.replace(
+ "&lt;", '<').replace("&gt;", '>').replace(
+ "&quot;", '"').replace("&amp;", '&').replace("&nbsp;", ' ')
+ nofontstyling = noampersands.replace("'''", "").replace("''", "")
+ nospans = re.sub("</?span[^>]*>", "", nofontstyling)
self.text = nospans
- def cleanupRefs(self):
- # toss the refs, this should really be overridable by the user. we
want this so it's
- # easier to read the plaintext of the article, there will already be a
ton
- # of templates and crap in there
+ def cleanup_refs(self):
+ '''
+ toss the refs, this should really be overridable by the user. we want
this so it's
+ easier to read the plaintext of the article, there will already be a
ton
+ of templates and crap in there
+ '''
if self.text is not None:
- norefs = re.sub("<ref[^>]*>.*?</ref>","", self.text, flags = re.DOTALL)
+ norefs = re.sub("<ref[^>]*>.*?</ref>", "", self.text, flags=re.DOTALL)
# <ref name="mises.org"/>
nosimplerefs = re.sub("<ref.*?/>", "", norefs)
self.text = nosimplerefs
- def cleanupHtmlComments(self):
- # toss html (<!-- -->) comments, <nowiki>, <code> and <sup> tags, and
<br> tags
+ def cleanup_html_comments(self):
+ '''
+ toss html (<!-- -->) comments, <nowiki>, <code> and <sup> tags, and
<br> tags
+ '''
if self.text is not None:
- nocomments = re.sub("<!--.*?-->","", self.text, flags = re.DOTALL)
- nonowikis = re.sub("</?nowiki>","", nocomments)
- nocodes = re.sub("</?code>","", nonowikis)
- nobrs = re.sub("<br\s*/>","",nocodes)
- nosups = re.sub("</?sup>","",nobrs)
+ nocomments = re.sub(r"<!--.*?-->", "", self.text, flags=re.DOTALL)
+ nonowikis = re.sub(r"</?nowiki>", "", nocomments)
+ nocodes = re.sub(r"</?code>", "", nonowikis)
+ nobrs = re.sub(r"<br\s*/>", "", nocodes)
+ nosups = re.sub(r"</?sup>", "", nobrs)
self.text = nosups
- def doFormatting(self):
- # do all the text formatting in some reasonable order
- # and return the formatted text
- if not self.formattingDone:
- self.cleanupLinks()
- self.cleanupText()
- self.cleanupRefs()
- self.cleanupHtmlComments()
- self.formattingDone = True
- return(self.text)
-
-class WAXMLExtractor(object):
- # get various things from the xml page text
- def __init__(self, XML):
- # constructor
- # arguments:
- # XML -- the xml text of the page, including the <page>...</page>
- # tags and everything in between
- self.XML = XML
- self.text = None
-
- def getText(self):
- # get the contents of the <text>...</text> tags if needed,
- # returns the contents found or None if none found
- if self.text is None:
- if self.XML is not None:
- match = re.search("<text[^>]*>(.*?)</text>", self.XML, flags =
re.DOTALL)
- if match:
- self.text = match.group(1)
+ def do_formatting(self):
+ '''
+ do all the text formatting in some reasonable order
+ and return the formatted text
+ '''
+ if not self.formatting_done:
+ self.cleanup_links()
+ self.cleanup_text()
+ self.cleanup_refs()
+ self.cleanup_html_comments()
+ self.formatting_done = True
return self.text
-
-class WATextExtractor(object):
- # retrieve various things from page text
- # right now various = redirection info, but this could have more things
later
- def __init__(self, text, localizedRedirString):
- # constructor
- # arguments:
- # text -- the raw text dug out from the <text> tags of
page content
- # localizedRedirString -- the string 'REDIR' in the wiki content
language
- self.text = text
- self.localizedRedirString = localizedRedirString
- self.redirect = None
-
- def getRedirect(self):
- # look for and set page title of a redirect link in the text, if any
- # returns the redirction link or None if none was found
- # format: <text ...>#REDIRECT [[link|show to reader]] ...
- # fixme the redirect keyword really should be case insensitive but
- # have we done any unicode stuff here? nope, so that ain't happening
- if self.redirect is None:
- if self.text is not None:
- redirRegex = re.compile("<text [^>]+>\s*#(REDIRECT|%s)\s*\[\[([^\|\]]+)" % self.localizedRedirString)
- match = redirRegex.search(self.text)
- if match:
- if verbose:
- sys.stderr.write("Found a redirect in the page text: %s\n" % match.group(2))
- self.redirect = match.group(2)
- return self.redirect
class WATitleMunger(object):
- # transform page title to the format in the xml file
- # or to ordinary plaintext
-
+ '''
+ transform page title to the format in the xml file
+ or to ordinary plaintext
+ '''
@staticmethod
- def normalizeTitle(title):
+ def normalize_title(title):
+ '''
# doesn't do much right now. remember how this is only a proof of
concept??
- return title.replace('_', ' ').replace('&','&').replace('"',
""")
+ '''
+ return title.replace('_', ' ').replace('&', '&').replace('"',
""")
@staticmethod
- def unNormalizeTitle(title):
+ def un_normalize_title(title):
+ '''
# not an exact opposite kids cause of the underscore, that's the breaks
- return title.replace('&','&').replace(""", '"')
+ '''
+ return title.replace('&', '&').replace(""", '"')
-class WATextDisplay(object):
- # process and display text of a page from the xml file of page content
- def __init__(self, fileString, categoryString, clean = False, textOnly =
False):
- # constructor
- # arguments:
- # fileString -- the string 'File' in the wiki's content language
- # categoryString -- the string 'Category' in the wiki's content
language
- # clean -- whether or not to clean up various tags etc or
leave the raw text
- # as retrieved from the xml file
- # textOnly -- whether or not to include the metadata and other
stuff for the page
- # as retrieved from the xml file or just the
content from the <text> tags
- self.fileString = fileString
- self.categoryString = categoryString
- self.clean = clean
- self.textOnly= textOnly
-
- def display(self, text):
- # display the text, optionally extracting just the content
- # from the <text> tags and optionally doing some cleanup
- # on the text before display
- # arguments:
- # text -- the page text from the xml file, everything between
- # <page>...</page>
- if text is None:
- print "No page text for that title found."
- return
- if self.textOnly:
- xe = WAXMLExtractor(text)
- text = xe.getText()
-
- if self.clean:
- tf = WATextFormatter(text, self.fileString, self.categoryString)
- text = tf.doFormatting()
+def usage(message=None):
+ '''
+ display usage information about the script, after optionally
+ displaying a specified message
+ arguments:
+ message -- message to be displayed before usage information
+ if omitted, only the usage information will be shown
+ '''
+ if message:
+ sys.stderr.write("%s\n" % message)
+ usage_message = """
+Usage: python wikiarticles.py --title titlestring --xmlfile filename
+ --idxfile filename --tocfile filename [--configfile filename]
+ [--maxredirs num] [--redirtext string] [--cleanup] [--exact]
+ [--textonly] [--verbose]
- print text
+Given a bz2-compressed multistream xml file of articles, a sorted plain text
+index file into the article file, and a plain text toc file for the index,
+find and display the xml including article text of any article specified
+by title.
-class WAErrorHandler(object):
- # display warning and error message
- def __init__(self, whoami):
- # constructor
- # arguments:
- # whoami -- the name of the script being executed
- self.whoami = whoami
+The user may specify the first so many characters of the title, in which
+case all matching titles will be displayed as a list so that the user
+may select the one desired.
- def usage(self, message = None):
- # display usage information about the script, after optionally
- # displaying a specified message
- # arguments:
- # message -- message to be displayed before usage information
- # if omitted, only the usage information will be shown
- if message:
- sys.stderr.write("%s\n" % message)
- sys.stderr.write("Usage: python %s --title titlestring --xmlfile
filename\n" % self.whoami)
- sys.stderr.write(" --idxfile filename --tocfile filename
[--configfile filename]\n")
- sys.stderr.write(" [--maxredirs num] [--redirtext string]
[--cleanup] [--exact]\n")
- sys.stderr.write(" [--textonly] [--verbose]\n")
- sys.stderr.write("\n")
- sys.stderr.write("Given a bz2-compressed multistream xml file of
articles, a sorted plain text\n")
- sys.stderr.write("index file into the article file, and a plain text
toc file for the index,\n")
- sys.stderr.write("find and display the xml including article text of
any article specified\n")
- sys.stderr.write("by title.\n")
- sys.stderr.write("\n")
- sys.stderr.write("The user may specify the first so many characters of
the title, in which\n")
- sys.stderr.write("case all matching titles will be displayed as a list
so that the user\n")
- sys.stderr.write("may select the one desired.\n")
- sys.stderr.write("\n")
- sys.stderr.write("If no such title is found, an error message will be
displayed.\n")
- sys.stderr.write("\n")
- sys.stderr.write("Titles are case-sensitive for now.\n")
- sys.stderr.write("\n")
- sys.stderr.write("A reasonable front end would parse the xml, strip or
expand templates,\n")
- sys.stderr.write("do something interesting with citations, references
and links, etc.\n")
- sys.stderr.write("This script does none of that; it is a proof of
concept only.\n")
- sys.stderr.write("\n")
- sys.stderr.write("Arguments:\n")
- sys.stderr.write("--title: first so many characters of the
article title\n")
- sys.stderr.write("--xmlfile: path to the bz2 compressed xml
format article file\n")
- sys.stderr.write("--idxfile: plain text file which is the index
into the bz2 xml file\n")
- sys.stderr.write("--tocfile: plain text file which is the toc of
the index file\n")
- sys.stderr.write("--configfile: plain text file which contains
config options\n")
- sys.stderr.write("--maxredirs: maximum number of redirects to
follow\n")
- sys.stderr.write(" default: 3\n")
- sys.stderr.write("--categorytext: text of the 'category' string in
the wiki's content language\n")
- sys.stderr.write(" default: Category\n")
- sys.stderr.write("--filetext: text of the 'file' string in the
wiki's content language\n")
- sys.stderr.write(" default: File\n")
- sys.stderr.write("--redirtext: text in capital letters of the
'redirect' string in the\n")
- sys.stderr.write(" wiki's content language\n")
- sys.stderr.write(" default: REDIRECT\n")
- sys.stderr.write("\n")
- sys.stderr.write("Flags:\n")
- sys.stderr.write("--cleanup: cleanup text (remove refs, font
stylings, etc) for ease of reading\n")
- sys.stderr.write(" default: false\n")
- sys.stderr.write("--exact: require exact match of specified
title\n")
- sys.stderr.write(" default: false\n")
- sys.stderr.write("--textonly: print only the contents of the
xml<text> tag, not the rest of the\n")
- sys.stderr.write(" page info\n")
- sys.stderr.write(" default: false\n")
- sys.stderr.write("--verbose: print extra message about what is being
done\n")
- sys.stderr.write(" default: false\n")
- sys.stderr.write("\n")
- sys.stderr.write("Example:\n")
- sys.stderr.write("python %s --exact --xmlfile
enwiki-articles-current.xml.bz2 \\\n" % self.whoami)
- sys.stderr.write(" --idxfile articles-index-sorted.txt
--tocfile index-toc.txt\n")
- sys.exit(1)
+If no such title is found, an error message will be displayed.
-class WARedirectHandler(object):
- # follow redirect links in the page text
- def __init__(self, maxRedirs, redirText, verbose):
- # constructor
- # arguments:
- # maxRedirs -- follow this many redirect links until giving up, if 0
then
- # don't follow any (need this so we avoid redirection
loops)
- # redirText -- the string 'REDIRECT' in the wiki's content language
- # verbose -- whether or not to display messages about processing
being done
- self.maxRedirs = maxRedirs
- self.redirText = redirText
- self.verbose = verbose
+Titles are case-sensitive for now.
- def handleRedirects(self, text, retriever):
- # follow redirect link in page text, retrieve the target page text, and
- # check that til we reach a page that's not a redirect or we hit the
- # maxRedirs limit or we hit a redirect to a nonexistent page
- # arguments:
- # text -- initial page text as retrieved from the xml file,
- # without any cleanup etc.
- # retriever -- WAPageRetriever object (used to follow the redirects)
- # returns the page text, either of the first non-redirect or the
- # last redirect before going over the redir follow limit, or
- # the last page text before following a link to a nonexistent page
- if not text:
- return text
- redirsDone = 0
- redirLink = None
- while redirsDone < self.maxRedirs:
- if self.verbose:
- sys.stderr.write("Checking for redirects in text, redirs done
already:%s\n" % redirsDone)
- te = WATextExtractor(text, self.redirText)
- redirLink = te.getRedirect()
- if not redirLink:
- break
- oldText = text
- text = retriever.retrieve(WATitleMunger.normalizeTitle(redirLink),
True)
- if text is None:
- # redirect to nonexistent article
- text = oldText
- break
- redirsDone += 1
- if redirLink and redirsDone >= maxRedirs:
- sys.stderr.write("Too many redirects encountered.\n")
- return text
+A reasonable front end would parse the xml, strip or expand templates,
+do something interesting with citations, references and links, etc.
+This script does none of that; it is a proof of concept only.
-def readConfig(configFile=None):
- # set up configuration defaults and read overriding values from files in
- # the current directory, /etc, and the user's home directory, if they exist
- # arguments:
- # configFile -- name of the configuration file in the current dir, if any
- # returns a ConfigParser object with the configuration values in it
+Arguments:
+
+ --title: first so many characters of the article title
+ --xmlfile: path to the bz2 compressed xml format article file
+ --idxfile: plain text file which is the index into the bz2 xml file
+ --tocfile: plain text file which is the toc of the index file
+ --configfile: plain text file which contains config options
+ --maxredirs: maximum number of redirects to follow
+ default: 3
+ --categorytext: text of the 'category' string in the wiki's content language
+ default: Category
+ --filetext: text of the 'file' string in the wiki's content language
+ default: File
+ --redirtext: text in capital letters of the 'redirect' string in the
+ wiki's content language
+ default: REDIRECT
+
+Flags:
+ --cleanup: cleanup text (remove refs, font stylings, etc) for ease of reading
+ default: false
+ --exact: require exact match of specified title
+ default: false
+ --textonly: print only the contents of the xml<text> tag, not the rest of the
+ page info
+ default: false
+ --verbose: print extra message about what is being done
+ default: false
+
+Example:
+
+python wikiarticles.py --exact --xmlfile enwiki-articles-current.xml.bz2 \\
+ --idxfile articles-index-sorted.txt --tocfile index-toc.txt
+"""
+ sys.stderr.write(usage_message)
+ sys.exit(1)
+
+
+def read_config(config_file=None):
+ '''
+ set up configuration defaults and read overriding values from files in
+ the current directory, /etc, and the user's home directory, if they exist
+ arguments:
+ config_file -- name of the configuration file in the current dir, if any
+ returns a ConfigParser object with the configuration values in it
+ '''
home = os.path.dirname(sys.argv[0])
- if (not configFile):
- configFile = "wikiarticles.conf"
+ if not config_file:
+ config_file = "wikiarticles.conf"
# fixme I should really check what order these get read in
# and which files override which
files = [
- os.path.join(home,configFile),
+ os.path.join(home, config_file),
"/etc/wikiarticles.conf",
os.path.join(os.getenv("HOME"), ".wikiarticles.conf")]
defaults = {
- #"files": {
+ # "files": {
"xmlfile": "",
"idxfile": "",
"tocfile": "",
- #"format": {,
+ # "format": {,
"cleanup": "0",
"textonly": "0",
"maxredirs": "3",
@@ -688,92 +734,98 @@
return conf
-if __name__ == "__main__":
- configFileName = None
- xmlFileName = None
- indexFileName = None
- tocFileName = None
- pageTitle = None
- exactMatch = None
- verbose = None
- maxRedirs = None
- fileText = None
- categoryText = None
- redirText = None
- cleanup = None
- textOnly = None
- errs = WAErrorHandler(sys.argv[0])
+def do_main():
+ config_file_name = None
+ xml_file_name = None
+ index_file_name = None
+ toc_file_name = None
+ page_title = None
+ exact_match = None
+ verbose = None
+ max_redirs = None
+ file_text = None
+ category_text = None
+ redir_text = None
+ cleanup = None
+ text_only = None
try:
- (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "",
['xmlfile=', 'idxfile=', 'tocfile=', 'title=', 'configfile=', 'maxredirs=',
"filetext=", "categorytext=", "redirtext=", 'cleanup', 'exact', 'textonly',
'verbose' ])
+ (options, remainder) = getopt.gnu_getopt(
+ sys.argv[1:], "", ['xmlfile=', 'idxfile=', 'tocfile=',
+ 'title=', 'configfile=', 'maxredirs=',
+ "filetext=", "categorytext=", "redirtext=",
+ 'cleanup', 'exact', 'textonly', 'verbose'])
except:
- errs.usage("Unknown option specified")
+ usage("Unknown option specified")
for (opt, val) in options:
if opt == "--xmlfile":
- xmlFileName = val
+ xml_file_name = val
elif opt == "--idxfile":
- indexFileName = val
+ index_file_name = val
elif opt == "--tocfile":
- tocFileName = val
+ toc_file_name = val
elif opt == "--title":
- pageTitle = val
+ page_title = val
elif opt == "--configfile":
- configFileName = val
+ config_file_name = val
elif opt == "--maxredirs":
if not val.isdigit() or int(val) < 0:
- errs.usage("maxredirs must be a non-negative integer.")
- maxRedirs = int(val)
+ usage("maxredirs must be a non-negative integer.")
+ max_redirs = int(val)
elif opt == "--redirtext":
- redirText = val
+ redir_text = val
elif opt == "--cleanup":
cleanup = True
elif opt == "--exact":
- exactMatch = True
+ exact_match = True
elif opt == "--textonly":
- textOnly = True
+ text_only = True
elif opt == "--verbose":
verbose = True
- if (len(remainder) > 0):
- errs.usage("Unknown option specified")
+ if len(remainder) > 0:
+ usage("Unknown option specified")
- conf = readConfig(configFileName)
-
+ conf = read_config(config_file_name)
+
# check config file for fallbacks.
- if xmlFileName is None:
- xmlFileName = conf.get("files", "xmlfile")
- if indexFileName is None:
- indexFileName = conf.get("files", "idxfile")
- if tocFileName is None:
- tocFileName = conf.get("files", "tocfile")
- if maxRedirs is None:
- maxRedirs = conf.getint("format", "maxredirs")
- if redirText is None:
- redirText = conf.get("format", "redirtext")
- if fileText is None:
- fileText = conf.get("format", "filetext")
- if categoryText is None:
- categoryText = conf.get("format", "categorytext")
+ if xml_file_name is None:
+ xml_file_name = conf.get("files", "xmlfile")
+ if index_file_name is None:
+ index_file_name = conf.get("files", "idxfile")
+ if toc_file_name is None:
+ toc_file_name = conf.get("files", "tocfile")
+ if max_redirs is None:
+ max_redirs = conf.getint("format", "maxredirs")
+ if redir_text is None:
+ redir_text = conf.get("format", "redirtext")
+ if file_text is None:
+ file_text = conf.get("format", "filetext")
+ if category_text is None:
+ category_text = conf.get("format", "categorytext")
if cleanup is None:
cleanup = conf.getboolean("format", "cleanup")
- if textOnly is None:
- textOnly = conf.getboolean("format", "textonly")
+ if text_only is None:
+ text_only = conf.getboolean("format", "textonly")
- mandatory = [ ("xmlfile", xmlFileName), ("idxfile", indexFileName),
("tocfile", tocFileName), ("title", pageTitle) ]
- for (optName, val) in mandatory:
+ mandatory = [("xmlfile", xml_file_name), ("idxfile", index_file_name),
+ ("tocfile", toc_file_name), ("title", page_title)]
+ for (opt_name, val) in mandatory:
if not val:
- errs.usage("Missing required option '%s'" % optName)
+ usage("Missing required option '%s'" % opt_name)
- pr = WAPageRetriever(xmlFileName, indexFileName, tocFileName, verbose)
- pr.setup()
- text = pr.retrieve(WATitleMunger.normalizeTitle(pageTitle), exactMatch)
+ retriever = WAPageRetriever(xml_file_name, index_file_name, toc_file_name,
verbose)
+ retriever.setup()
+ text = retriever.retrieve(WATitleMunger.normalize_title(page_title),
exact_match)
if text:
- rh = WARedirectHandler(maxRedirs, redirText, verbose)
- text = rh.handleRedirects(text, pr)
+ text = handle_redirects(text, retriever, max_redirs, redir_text,
verbose)
- td = WATextDisplay(fileText, categoryText, cleanup, textOnly)
- td.display(text)
+ display(text, file_text, category_text, cleanup, text_only)
- pr.teardown()
+ retriever.teardown()
+
+
+if __name__ == "__main__":
+ do_main()
diff --git a/toys/bz2multistream/writetoc.py b/toys/bz2multistream/writetoc.py
index 70d6d2b..8d2da93 100644
--- a/toys/bz2multistream/writetoc.py
+++ b/toys/bz2multistream/writetoc.py
@@ -1,139 +1,163 @@
-import getopt, os, sys, re, codecs
+import getopt
+import sys
+import codecs
-class indexTOC(object):
- # Generate a table of contents for an index file,
- # where the TOC will consist of lines containing
- # offset:char
- # where char is a unique starting character of
- # the text field in the index file, and offset is
- # the offset into the index file of the first
- # line with an article starting with the specific
- # character.
- # The index file should have lines of the format:
- # xxx:xxx:...:text field:xxx:...
- # or instead of ':' you can use the field separator
- # of your choice.
- # For the purposes of this script we only care about
- # the contents of the text field.
- # The index file must have been sorted by the text field
- # so that all entries starting with the same first
- # character are consecutive in the file.
+class IndexTOC(object):
+ '''
+ Generate a table of contents for an index file,
+ where the TOC will consist of lines containing
+ offset:char
+ where char is a unique starting character of
+ the text field in the index file, and offset is
+ the offset into the index file of the first
+ line with an article starting with the specific
+ character.
- def __init__(self, inputFd, fieldNum, separator, verbose):
- # constructor
- # arguments:
- # inputFd -- open file descriptor from which index lines will be read
- # it better have been opened with utf8 codec if there are
- # any unicode characters in the text fields
- # fieldNum -- number of field containing text, numbering starts at 1
- # sep -- field separator. for wmf index files this is ':'
- # verbose -- whether or not to display info about processing of the
index lines
- self.inputFd = inputFd
- self.fieldNum = fieldNum
+ The index file should have lines of the format:
+ xxx:xxx:...:text field:xxx:...
+ or instead of ':' you can use the field separator
+ of your choice.
+ For the purposes of this script we only care about
+ the contents of the text field.
+ The index file must have been sorted by the text field
+ so that all entries starting with the same first
+ character are consecutive in the file.
+ '''
+
+ def __init__(self, input_fd, field_num, separator, verbose):
+ '''
+ constructor
+ arguments:
+ input_fd -- open file descriptor from which index
+ lines will be read; it better have been
+ opened with utf8 codec if there are any
+ unicode characters in the text fields
+ field_num -- number of field containing text, numbering
+ starts at 1
+ separator -- field separator. for wmf index files this
+ is ':'
+ verbose -- whether or not to display info about
+ processing of the index lines
+ '''
+ self.input_fd = input_fd
+ self.field_num = field_num
self.sep = separator
self.verbose = verbose
- self.currentChar = None
+ self.current_char = None
self.offset = 0
- def doTOC(self, outFd):
- # read all input from the input file descriptor
- # and write a TOC file for that input to the
- # specified output file descriptor, which should
- # already have been set up for writing by the
- # caller
- outFdUTF8 = codecs.getwriter("utf-8")(outFd)
- self.currentChar = None
+ def do_toc(self, out_fd):
+ '''
+ read all input from the input file descriptor
+ and write a TOC file for that input to the
+ specified output file descriptor, which should
+ already have been set up for writing by the
+ caller
+ '''
+ out_fd_utf8 = codecs.getwriter("utf-8")(out_fd)
+ self.current_char = None
self.offset = 0
- for line in self.inputFd:
- self.processLine(line, outFdUTF8)
-
- def processLine(self, line, outFd):
- # for a given line of input, see if the
- # text field in the line starts with a new
- # unique first character, and if so, write
- # a TOC entry for that character to the
- # specified output file descriptor
- firstChar = self.getFirstCharFromField(line)
- if not firstChar:
+ for line in self.input_fd:
+ self.process_line(line, out_fd_utf8)
+
+ def process_line(self, line, out_fd):
+ '''
+ for a given line of input, see if the
+ text field in the line starts with a new
+ unique first character, and if so, write
+ a TOC entry for that character to the
+ specified output file descriptor
+ '''
+ first_char = self.get_first_char_from_field(line)
+ if not first_char:
if self.verbose:
sys.stderr.write("no first char retrieved for line: %s,
skipping\n" % line)
- self.offset += len(line.encode('utf-8'))
- next
- if not self.currentChar or firstChar != self.currentChar:
+ elif not self.current_char or first_char != self.current_char:
if self.verbose:
sys.stderr.write("new first char for line: %s, recording\n" %
line)
- self.currentChar = firstChar
- outFd.write("%s:%s\n" % (self.offset, firstChar))
+ self.current_char = first_char
+ out_fd.write("%s:%s\n" % (self.offset, first_char))
self.offset += len(line.encode('utf-8'))
- def getFirstCharFromField(self, line):
- # find the text field in the given line
- # and return the first character (not byte) in the field
- # or None if there is none
+ def get_first_char_from_field(self, line):
+ '''
+ find the text field in the given line
+ and return the first character (not byte) in the field
+ or None if there is none
+ '''
stripped = line.rstrip('\n')
- fields = stripped.split(self.sep, self.fieldNum-1)
- if len(fields) < fieldNum:
+ fields = stripped.split(self.sep, self.field_num-1)
+ if len(fields) < self.field_num:
return None
- if not len(fields[fieldNum -1]):
+ if not len(fields[self.field_num -1]):
return None
- return fields[fieldNum -1][0]
+ return fields[self.field_num -1][0]
-def usage(message = None):
+
+def usage(message=None):
if message:
sys.stderr.write("%s\n" % message)
- sys.stderr.write("Usage: python %s --field=num --separator=char
--tocfile=filename [--verbose]\n" % sys.argv[0])
- sys.stderr.write("\n")
- sys.stderr.write("Given plain text input consisting lines with several
fields with a given\n")
- sys.stderr.write("separator, which have been sorted by a specified field
from each line, write\n")
- sys.stderr.write("a TOC (table of contents) which contains a list of the
unique first\n")
- sys.stderr.write("characters of the sort field and the offset to the first
line of the file in\n")
- sys.stderr.write("which the sort field starts with that character. In
other words, if the text\n")
- sys.stderr.write("fields of the input file all happen to start only with
a,b,c, and q, there\n")
- sys.stderr.write("will be exactly four lines in the created TOC, with
offsets to the first\n")
- sys.stderr.write("line from the input with the sort field starting with a,
the first line\n")
- sys.stderr.write("from the input the sort field starting with b, and so
on.\n")
- sys.stderr.write("\n")
- sys.stderr.write("This is used to create a TOC into an article XML
multistream index\n")
- sys.stderr.write("(after it has been uncompressed and sorted by article
title), so that\n")
- sys.stderr.write("retrieval of article text from the article XML
multistream content file\n")
- sys.stderr.write("can be done quickly without a database or other
server-client model.\n")
- sys.stderr.write("\n")
- sys.stderr.write("--field: the number of the field with which the
input file was\n")
- sys.stderr.write(" alphabetically sorted, starting with 1\n")
- sys.stderr.write(" default: 1\n")
- sys.stderr.write("--tocfile: path to the TOC file which will be
created\n")
- sys.stderr.write("--separator: the string used to separate fields in the
input file\n")
- sys.stderr.write(" default: space\n")
- sys.stderr.write("--verbose: display extra messages about what is being
done\n")
- sys.stderr.write("\n")
- sys.stderr.write("Example: LC_ALL_save=`echo $LC_ALL`; LC_ALL=C; export
LC_ALL; \\\n")
- sys.stderr.write(" bzcat
enwiki-20120902-pages-articles-multistream-index.txt.bz2 | \\\n")
- sys.stderr.write(" sort -k 3 -t ':' >
enwiki-20120902-pages-articles-multistream-index-sorted.txt; \\\n")
- sys.stderr.write(" LC_ALL=${LC_ALL_save}; export LC_ALL\n")
- sys.stderr.write("\n")
- sys.stderr.write(" cat
enwiki-20120902-pages-articles-multistream-index-sorted.txt | \\\n")
- sys.stderr.write(" python %s --field 3 --separator ':' --tocfile
enwiki-20120902-pages-articles-multistream-index-sorted-idx.txt\n" %
sys.argv[0])
+ usage_message = """
+Usage: python writetoc.py --field=num --separator=char
+ --tocfile=filename [--verbose]
+
+Given plain text input consisting of lines with several fields with a given
+separator, which have been sorted by a specified field from each line, write
+a TOC (table of contents) which contains a list of the unique first
+characters of the sort field and the offset to the first line of the file in
+which the sort field starts with that character. In other words, if the text
+fields of the input file all happen to start only with a,b,c, and q, there
+will be exactly four lines in the created TOC, with offsets to the first
+line from the input with the sort field starting with a, the first line
+from the input with the sort field starting with b, and so on.
+
+This is used to create a TOC into an article XML multistream index
+(after it has been uncompressed and sorted by article title), so that
+retrieval of article text from the article XML multistream content file
+can be done quickly without a database or other server-client model.
+
+--field: the number of the field with which the input file was
+ alphabetically sorted, starting with 1
+ default: 1
+--tocfile: path to the TOC file which will be created
+--separator: the string used to separate fields in the input file
+ default: space
+--verbose: display extra messages about what is being done
+
+Example: LC_ALL_save=`echo $LC_ALL`; LC_ALL=C; export LC_ALL; \\
+ bzcat enwiki-20120902-pages-articles-multistream-index.txt.bz2 | \\
+ sort -k 3 -t ':' > \\
+ enwiki-20120902-pages-articles-multistream-index-sorted.txt; \\
+ LC_ALL=${LC_ALL_save}; export LC_ALL
+
+ cat enwiki-20120902-pages-articles-multistream-index-sorted.txt | \\
+ python writetoc.py --field 3 --separator ':' \\
+ --tocfile enwiki-20120902-pages-articles-multistream-index-sorted-idx.txt
+"""
+ sys.stderr.write(usage_message)
sys.exit(1)
-if __name__ == "__main__":
- tocFileName = None
- fieldNum = 1
+
+def do_main():
+ toc_file_name = None
+ field_num = 1
separator = ' '
verbose = False
try:
- (options, remainder) = getopt.gnu_getopt(sys.argv[1:], "", ['field=',
'tocfile=', 'separator=', 'verbose' ])
- except:
+ (options, remainder) = getopt.gnu_getopt(
+ sys.argv[1:], "", ['field=', 'tocfile=', 'separator=', 'verbose'])
+ except Exception:
usage("Unknown option specified")
for (opt, val) in options:
if opt == "--field":
if not val.isdigit():
usage("Bad value specified for 'field' option")
- fieldNum = int(val)
+ field_num = int(val)
elif opt == "--tocfile":
- tocFileName = val
+ toc_file_name = val
elif opt == "--separator":
if len(separator) != 1:
usage("Bad value specified for 'separator' option")
@@ -141,23 +165,27 @@
elif opt == "--verbose":
verbose = True
- if (len(remainder) > 0):
+ if len(remainder) > 0:
usage("Unknown option specified")
- if (not tocFileName):
+ if not toc_file_name:
usage("Missing required option 'tocfile'")
try:
- outFile = open(tocFileName, "w")
+ out_file = open(toc_file_name, "w")
except:
- sys.stderr.write("failed to open file %s for writing\n", tocFileName)
+ sys.stderr.write("failed to open file %s for writing\n", toc_file_name)
raise
- inFile = codecs.getreader("utf-8")(sys.stdin)
-
- toc = indexTOC(inFile, fieldNum, separator, verbose)
- toc.doTOC(outFile)
+ in_file = codecs.getreader("utf-8")(sys.stdin)
- outFile.close()
+ toc = IndexTOC(in_file, field_num, separator, verbose)
+ toc.do_toc(out_file)
- exit(0);
+ out_file.close()
+
+ exit(0)
+
+
+if __name__ == "__main__":
+ do_main()
--
To view, visit https://gerrit.wikimedia.org/r/280109
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ib0bbf9c3bb1db1a8029a7a7a3e660e82433634b7
Gerrit-PatchSet: 3
Gerrit-Project: operations/dumps
Gerrit-Branch: ariel
Gerrit-Owner: ArielGlenn <[email protected]>
Gerrit-Reviewer: ArielGlenn <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits