Revision: 8103
          http://svn.sourceforge.net/mailman/?rev=8103&view=rev
Author:   bwarsaw
Date:     2006-12-02 16:22:17 -0800 (Sat, 02 Dec 2006)

Log Message:
-----------
A simple attachment scraper, given that SF has fixed the tracker data export
feature.

Added Paths:
-----------
    trunk/sf/
    trunk/sf/getfiles.py


Property changes on: trunk/sf
___________________________________________________________________
Name: svn:ignore
   + mailman.xml
attachments


Added: trunk/sf/getfiles.py
===================================================================
--- trunk/sf/getfiles.py                                (rev 0)
+++ trunk/sf/getfiles.py        2006-12-03 00:22:17 UTC (rev 8103)
@@ -0,0 +1,112 @@
+#! /usr/bin/env python
+# Fetch the attachments listed in a tracker XML export into ./attachments/.
+import os
+import re
+import sys
+import errno
+
+from xml.dom import minidom
+
+# The xml doesn't contain a mapping from artifact_type to tracker_id.  In
+# fact, tracker_id is nowhere to be (structurally) found in the xml.  So we
+# just have to know.
+
+group_id = 103 # Mailman
+
+tracker_ids = {
+    'bugs'              : 100103,
+    'feature requests'  : 350103,
+    'patches'           : 300103,
+    }
+
+urltmpl = ('http://sourceforge.net/tracker/download.php?'
+           'group_id=%d&atid=%d&file_id=%d&aid=%d')
+
+doc = minidom.parse(open(sys.argv[1]))  # argv[1]: path of the xml to parse
+doc.normalize()
+
+# In order to download all attachments, we need to know 4 pieces of
+# information: the group_id (e.g. 103 for Mailman), the tracker_id, the
+# file_id and the aid (artifact id).  To find the file_id, look at each
+# <history> element: if it has a field named "field_name" whose text is
+# "File Added" (compared case-insensitively), take its field named
+# "old_value" and pluck off the digits preceding the colon.  The aid and
+# the tracker_id come from the enclosing <artifact> element's "artifact_id"
+# and "artifact_type" fields.  Yay, couldn't be easier <wink>.
+
+# list of tuples (tracker_id, file_id, aid)
+attachments = []
+
+for node in doc.getElementsByTagName('history'):
+    file_added = False  # set once a "File Added" field_name is seen
+    file_id = None  # leading digits of old_value, if any
+    for child in node.childNodes:
+        if     child.nodeType <> minidom.Node.ELEMENT_NODE or \
+               child.nodeName <> 'field':
+            continue
+        name = child.getAttribute('name')
+        if name == 'field_name':
+            for grandchild in child.childNodes:
+                if     grandchild.nodeType == minidom.Node.TEXT_NODE and \
+                       grandchild.nodeValue.lower() == 'file added':
+                    file_added = True
+                    break
+        elif name == 'old_value':
+            for grandchild in child.childNodes:
+                if grandchild.nodeType == minidom.Node.TEXT_NODE:
+                    mo = re.match('(\d+):', grandchild.nodeValue)
+                    if mo:
+                        file_id = int(mo.group(1))
+                        break
+    if not file_added:
+        # This history entry does not record an added file
+        continue
+    # Walk up to the enclosing <artifact> node to find artifact_id and
+    # artifact_type; the latter maps to tracker_id via tracker_ids
+    artifact_id = None
+    tracker_id = None
+    parent = node.parentNode
+    while parent:
+        if parent.nodeName <> 'artifact':
+            parent = parent.parentNode
+            continue
+        for child in parent.childNodes:
+            if     child.nodeType <> minidom.Node.ELEMENT_NODE \
+                   or child.nodeName <> 'field':
+                continue
+            attr = child.getAttribute('name')
+            if attr == 'artifact_id':
+                for grandchild in child.childNodes:
+                    if grandchild.nodeType == minidom.Node.TEXT_NODE:
+                        artifact_id = int(grandchild.nodeValue)
+                        break
+            elif attr == 'artifact_type':
+                for grandchild in child.childNodes:
+                    if grandchild.nodeType == minidom.Node.TEXT_NODE:
+                        atype = grandchild.nodeValue
+                        tracker_id = tracker_ids.get(atype.lower())
+                        break
+        # We've parsed the artifact node, so that's all we need
+        break
+    # Sanity check: only record the attachment if all three ids were found
+    if artifact_id is None:
+        print 'missing artifact id'
+    elif tracker_id is None:
+        print 'missing tracker_id for artifact:', artifact_id
+    elif file_id is None:
+        print 'missing file_id for artifact:', artifact_id
+    else:
+        attachments.append((tracker_id, file_id, artifact_id))
+
+print 'attachments found:', len(attachments)
+
+try:
+    os.mkdir('attachments')
+except OSError, e:
+    if e.errno <> errno.EEXIST:  # an already-existing directory is fine
+        raise
+
+for tracker_id, file_id, aid in attachments:
+    url = urltmpl % (group_id, tracker_id, file_id, aid)
+    cmd = "wget '%s' -O attachments/%d" % (url, file_id)  # saved as file_id
+    os.system(cmd)  # wget's exit status is ignored


Property changes on: trunk/sf/getfiles.py
___________________________________________________________________
Name: svn:executable
   + *


This was sent by the SourceForge.net collaborative development platform, the 
world's largest Open Source development site.
_______________________________________________
Mailman-checkins mailing list
[email protected]
Unsubscribe: 
http://mail.python.org/mailman/options/mailman-checkins/archive%40jab.org

Reply via email to