Hello,
While writing a bot, I had to discard redirect pages from the XML
dump. In order to do this early, I modified xmlreader.py to
parse the <redirect /> tag and add it to XmlEntry. I'm attaching the
patch, which has not been extensively tested.
I haven't updated the regex_parse method, since it already looks
outdated (it tries to create an XmlEntry with different arguments than
the other parsing code uses).
Best regards,
--
Santiago M. Mola
Jabber ID: [email protected]
diff --git a/xmlreader.py b/xmlreader.py
index fe99e09..0125528 100644
--- a/xmlreader.py
+++ b/xmlreader.py
@@ -56,7 +56,7 @@ class XmlEntry:
"""
Represents a page.
"""
- def __init__(self, title, id, text, username, ipedit, timestamp, editRestriction, moveRestriction, revisionid, comment):
+ def __init__(self, title, id, text, username, ipedit, timestamp, editRestriction, moveRestriction, revisionid, comment, redirect):
# TODO: there are more tags we can read.
self.title = title
self.id = id
@@ -68,6 +68,7 @@ class XmlEntry:
self.moveRestriction = moveRestriction
self.revisionid = revisionid
self.comment = comment
+ self.redirect = redirect
class XmlHeaderEntry:
@@ -94,6 +95,7 @@ class MediaWikiXmlHandler(xml.sax.handler.ContentHandler):
self.id = u''
self.revisionid = u''
self.comment = u''
+ self.redirect = False
def setCallback(self, callback):
self.callback = callback
@@ -159,6 +161,8 @@ class MediaWikiXmlHandler(xml.sax.handler.ContentHandler):
self.inContributorTag = False
elif name == 'restrictions':
self.editRestriction, self.moveRestriction = parseRestrictions(self.restrictions)
+ elif name == 'redirect':
+ self.redirect = True
elif name == 'revision':
# All done for this.
# Remove trailing newlines and spaces
@@ -178,7 +182,7 @@ class MediaWikiXmlHandler(xml.sax.handler.ContentHandler):
text, self.username,
self.ipedit, timestamp,
self.editRestriction, self.moveRestriction,
- self.revisionid, self.comment)
+ self.revisionid, self.comment, self.redirect)
self.inRevisionTag = False
self.callback(entry)
elif self.headercallback:
@@ -313,6 +317,10 @@ Consider installing the python-celementtree package.''')
self.title = elem.findtext("{%s}title" % self.uri)
self.pageid = elem.findtext("{%s}id" % self.uri)
self.restrictions = elem.findtext("{%s}restrictions" % self.uri)
+ if elem.findtext("{%s}redirect" % self.uri) is None:
+ self.redirect = False
+ else:
+ self.redirect = True
def _create_revision(self, revision):
"""Creates a Single revision"""
@@ -332,7 +340,8 @@ Consider installing the python-celementtree package.''')
editRestriction=editRestriction,
moveRestriction=moveRestriction,
revisionid=revisionid,
- comment=comment
+ comment=comment,
+ redirect=self.redirect
)
def regex_parse(self):
_______________________________________________
Pywikipedia-l mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-l