Revision: 8103
http://svn.sourceforge.net/mailman/?rev=8103&view=rev
Author: bwarsaw
Date: 2006-12-02 16:22:17 -0800 (Sat, 02 Dec 2006)
Log Message:
-----------
A simple attachment scraper, given that SF has fixed the tracker data export
feature.
Added Paths:
-----------
trunk/sf/
trunk/sf/getfiles.py
Property changes on: trunk/sf
___________________________________________________________________
Name: svn:ignore
+ mailman.xml
attachments
Added: trunk/sf/getfiles.py
===================================================================
--- trunk/sf/getfiles.py (rev 0)
+++ trunk/sf/getfiles.py 2006-12-03 00:22:17 UTC (rev 8103)
@@ -0,0 +1,112 @@
+#! /usr/bin/env python
+
+import os
+import re
+import sys
+import errno
+
+from xml.dom import minidom
+
+# The xml doesn't contain a mapping from artifact_type to tracker_id. In
+# fact, tracker_id is nowhere to be (structurally) found in the xml. So we
+# just have to know.
+
+group_id = 103 # SourceForge group_id of the Mailman project
+
+tracker_ids = { # lower-cased artifact_type text -> tracker_id (sent as atid)
+ 'bugs' : 100103,
+ 'feature requests' : 350103,
+ 'patches' : 300103,
+ }
+
+urltmpl = ('http://sourceforge.net/tracker/download.php?'
+ 'group_id=%d&atid=%d&file_id=%d&aid=%d') # filled with (group_id, tracker_id, file_id, aid)
+
+doc = minidom.parse(open(sys.argv[1]))
+doc.normalize()
+
+# In order to download all attachments, we need to know 4 pieces of
+# information: the group_id (e.g. 103 for Mailman), the tracker_id, the
+# file_id and the aid (artifact id).  To find the file_id, you have to search
+# for a field with the name "artifact_history", then see if there's a field
+# named "field_name" inside there with text value "File Added".  If you find
+# that, look for a field named "old_value" in the same history entry and pluck
+# off the number preceding the colon and the patch name.  Yay, couldn't be
+# easier <wink>.
+
+# Attachments to download, as a list of (tracker_id, file_id, aid) tuples
+attachments = []
+
+for node in doc.getElementsByTagName('history'): # one pass per <history> element
+ file_added = False # set once a field_name field reads "file added"
+ file_id = None # digits in front of the colon in an old_value field
+ for child in node.childNodes:
+ if child.nodeType <> minidom.Node.ELEMENT_NODE or \
+ child.nodeName <> 'field':
+ continue # only <field> element children are of interest
+ name = child.getAttribute('name')
+ if name == 'field_name':
+ for grandchild in child.childNodes:
+ if grandchild.nodeType == minidom.Node.TEXT_NODE and \
+ grandchild.nodeValue.lower() == 'file added':
+ file_added = True
+ break
+ elif name == 'old_value':
+ for grandchild in child.childNodes:
+ if grandchild.nodeType == minidom.Node.TEXT_NODE:
+ mo = re.match('(\d+):', grandchild.nodeValue) # text must start with "<digits>:"
+ if mo:
+ file_id = int(mo.group(1))
+ break
+ if not file_added:
+ # Not a "File Added" history entry, so there is nothing to download
+ continue
+ # Walk up the ancestors to the enclosing <artifact> element and read its
+ # artifact_id and artifact_type fields; the latter maps to tracker_id
+ artifact_id = None
+ tracker_id = None
+ parent = node.parentNode
+ while parent:
+ if parent.nodeName <> 'artifact':
+ parent = parent.parentNode
+ continue # not the artifact element yet; keep climbing
+ for child in parent.childNodes:
+ if child.nodeType <> minidom.Node.ELEMENT_NODE \
+ or child.nodeName <> 'field':
+ continue
+ attr = child.getAttribute('name')
+ if attr == 'artifact_id':
+ for grandchild in child.childNodes:
+ if grandchild.nodeType == minidom.Node.TEXT_NODE:
+ artifact_id = int(grandchild.nodeValue)
+ break
+ elif attr == 'artifact_type':
+ for grandchild in child.childNodes:
+ if grandchild.nodeType == minidom.Node.TEXT_NODE:
+ atype = grandchild.nodeValue
+ tracker_id = tracker_ids.get(atype.lower()) # None for a type missing from tracker_ids
+ break
+ # We've parsed the artifact node, so that's all we need
+ break
+ # Sanity check: record the attachment only if all three ids were found
+ if artifact_id is None:
+ print 'missing artifact id'
+ elif tracker_id is None:
+ print 'missing tracker_id for artifact:', artifact_id
+ elif file_id is None:
+ print 'missing file_id for artifact:', artifact_id
+ else:
+ attachments.append((tracker_id, file_id, artifact_id))
+
+print 'attachments found:', len(attachments)
+
+try:
+ os.mkdir('attachments')
+except OSError, e:
+ if e.errno <> errno.EEXIST:
+ raise
+
+for tracker_id, file_id, aid in attachments:
+ url = urltmpl % (group_id, tracker_id, file_id, aid)
+ cmd = "wget '%s' -O attachments/%d" % (url, file_id)
+ os.system(cmd)
Property changes on: trunk/sf/getfiles.py
___________________________________________________________________
Name: svn:executable
+ *
This was sent by the SourceForge.net collaborative development platform, the
world's largest Open Source development site.
_______________________________________________
Mailman-checkins mailing list
[email protected]
Unsubscribe:
http://mail.python.org/mailman/options/mailman-checkins/archive%40jab.org