Hello,

While writing a bot, I needed to skip redirect pages in the XML
dump. To detect them as early as possible, I modified xmlreader.py to
parse the <redirect /> tag and expose it as a boolean `redirect`
attribute on XmlEntry. I'm attaching the patch, which has not been
extensively tested.

I haven't updated the regex_parse method, since it already looks
outdated (it tries to create an XmlEntry with arguments that differ
from the ones the constructor accepts).

Best regards,
-- 
Santiago M. Mola
Jabber ID: [email protected]
diff --git a/xmlreader.py b/xmlreader.py
index fe99e09..0125528 100644
--- a/xmlreader.py
+++ b/xmlreader.py
@@ -56,7 +56,7 @@ class XmlEntry:
     """
     Represents a page.
     """
-    def __init__(self, title, id, text, username, ipedit, timestamp, editRestriction, moveRestriction, revisionid, comment):
+    def __init__(self, title, id, text, username, ipedit, timestamp, editRestriction, moveRestriction, revisionid, comment, redirect):
         # TODO: there are more tags we can read.
         self.title = title
         self.id = id
@@ -68,6 +68,7 @@ class XmlEntry:
         self.moveRestriction = moveRestriction
         self.revisionid = revisionid
         self.comment = comment
+        self.redirect = redirect
 
 
 class XmlHeaderEntry:
@@ -94,6 +95,7 @@ class MediaWikiXmlHandler(xml.sax.handler.ContentHandler):
         self.id = u''
         self.revisionid = u''
         self.comment = u''
+        self.redirect = False
 
     def setCallback(self, callback):
         self.callback = callback
@@ -159,6 +161,8 @@ class MediaWikiXmlHandler(xml.sax.handler.ContentHandler):
             self.inContributorTag = False
         elif name == 'restrictions':
             self.editRestriction, self.moveRestriction = parseRestrictions(self.restrictions)
+        elif name == 'redirect':
+            self.redirect = True
         elif name == 'revision':
             # All done for this.
             # Remove trailing newlines and spaces
@@ -178,7 +182,7 @@ class MediaWikiXmlHandler(xml.sax.handler.ContentHandler):
                              text, self.username, 
                              self.ipedit, timestamp, 
                              self.editRestriction, self.moveRestriction, 
-                             self.revisionid, self.comment)
+                             self.revisionid, self.comment, self.redirect)
             self.inRevisionTag = False
             self.callback(entry)
         elif self.headercallback:
@@ -313,6 +317,10 @@ Consider installing the python-celementtree package.''')
         self.title = elem.findtext("{%s}title" % self.uri)
         self.pageid = elem.findtext("{%s}id" % self.uri)
         self.restrictions = elem.findtext("{%s}restrictions" % self.uri)
+        if elem.findtext("{%s}redirect" % self.uri) is None:
+            self.redirect = False
+        else:
+            self.redirect = True
 
     def _create_revision(self, revision):
         """Creates a Single revision"""
@@ -332,7 +340,8 @@ Consider installing the python-celementtree package.''')
                        editRestriction=editRestriction,
                        moveRestriction=moveRestriction,
                        revisionid=revisionid,
-                       comment=comment
+                       comment=comment,
+                       redirect=self.redirect
                       )
 
     def regex_parse(self):
_______________________________________________
Pywikipedia-l mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/pywikipedia-l

Reply via email to