Author: sebb
Date: Tue Sep 12 20:58:20 2023
New Revision: 1912272

URL: http://svn.apache.org/viewvc?rev=1912272&view=rev
Log:
Use find-ls listing instead of fetching and parsing web pages.

This reduces run-time to less than a minute
compared with about 15 minutes previously

Modified:
    comdev/projects.apache.org/trunk/scripts/cronjobs/parsereleases.py

Modified: comdev/projects.apache.org/trunk/scripts/cronjobs/parsereleases.py
URL: 
http://svn.apache.org/viewvc/comdev/projects.apache.org/trunk/scripts/cronjobs/parsereleases.py?rev=1912272&r1=1912271&r2=1912272&view=diff
==============================================================================
--- comdev/projects.apache.org/trunk/scripts/cronjobs/parsereleases.py 
(original)
+++ comdev/projects.apache.org/trunk/scripts/cronjobs/parsereleases.py Tue Sep 
12 20:58:20 2023
@@ -1,9 +1,15 @@
+#!/usr/bin/env python3
+
 import errtee # this is imported for its side-effects
+from collections import defaultdict
+import gzip
 import re
 import json
-from urlutils import URLopen
+from datetime import datetime
+from urlutils import UrlCache
+
 """
-Reads the list of files in http://www.apache.org/dist/
+Reads the list of files in https://downloads.apache.org/zzz/find-ls.gz
 
 Creates:
 ../../site/json/foundation/releases.json
@@ -17,37 +23,33 @@ The date comes from the first entry
 Format:
 { top-level dir: { release-id: [list of files for that release-id]}, ... }
 
-TODO: it would probably be more efficient to parse the output of
-svn ls -R https://dist.apache.org/repos/dist/release/
-Could cache the output based on the last changed date
-
-Or use https://downloads.apache.org/zzz/find-ls.gz
-
-Or use an rsync listing:
-rsync --list-only -r rsync.apache.org::apache-dist
-Note that rsync excludes hashes, sigs and KEYS files; however they are not 
needed here.
 """
 
-releases = {}
-files = {}
-mainurl = "https://downloads.apache.org/";
-
-# don't try to maintain history for the moment...
-#try:
-#    with open("../../site/json/foundation/releases.json") as f:
-#        releases = json.loads(f.read())
-#        f.close()
-#except Exception as err:
-#    print("Could not read releases.json, assuming blank slate")
-
-def getDirList(url):
-    print(url)
-    try:
-        data = URLopen(url).read().decode('utf-8')
-        for entry, xd, xdate in re.findall(r"<a 
href=\"([^\"/]+)(/?)\">\S+</a>\s+(\d\d\d\d-\d\d-\d\d)", data, re.MULTILINE | 
re.UNICODE):
-            yield(entry, xdate, xd)
-    except:
-        pass
# Listing generated by find(1) -ls on the download server; one line per
# filesystem entry, fetched once instead of crawling the web pages.
FIND_LS = 'https://downloads.apache.org/zzz/find-ls.gz'

# key: committee-id, value: dict(key: release version, value: date)
# The date recorded is that of the first file seen for the release.
releases = defaultdict(dict)

# key: committee-id, value: dict(key: release version, value: list of file names for the release)
files = defaultdict(lambda: defaultdict(list))
+
+"""
+Parse find(1) dates
+These have month and day of month, and either year or HH:MM if the date is in 
the last 6 months or so
+If the year is missing, assume the current year.
+If that generates a date in the future, assume the previous year.
+"""
def getdate(yort, mon, dom):
    """Convert find(1) -ls date fields to an ISO 'YYYY-MM-DD' string.

    yort -- either a 4-digit year or an HH:MM time (find -ls prints the
            time instead of the year for entries from roughly the last
            6 months); mon/dom are abbreviated month name and day.

    When only a time is present, try the current year first; if that
    yields a timestamp in the future, the entry must be from the
    previous year.
    """
    now = datetime.utcnow()
    if ':' not in yort:
        # Explicit year supplied; time of day is not available.
        parsed = datetime.strptime(f'{yort} {mon} {dom}', '%Y %b %d')
    else:
        # Year missing - assume the current one ...
        parsed = datetime.strptime(f'{now.year} {mon} {dom} {yort}', '%Y %b %d %H:%M')
        if parsed > now:
            # ... unless that lands in the future: use the previous year.
            parsed = datetime.strptime(f'{now.year - 1} {mon} {dom} {yort}', '%Y %b %d %H:%M')
    return parsed.strftime('%Y-%m-%d')
 
 def cleanFilename(filename):
     """
@@ -76,49 +78,60 @@ def cleanReleases(committeeId):
         del releases[committeeId]
         del files[committeeId]
 
-def parseDir(committeeId, path):
-    print("              %s..." % path)
-    if len(path) > 100:
-        print("WARN too long path: recursion?")
-        return
-    for f, d, xd in getDirList("%s/%s" % (mainurl, path)):
-        if xd:
-            if ("/%s" % f) not in path and f.lower() not in ['binaries', 
'repos', 'updatesite', 'current', 'stable', 'stable1', 'stable2', 'binary', 
'notes', 'doc', 'eclipse', 'patches', 'docs', 'changes', 'features', 'tmp', 
'cpp', 'php', 'ruby', 'py', 'py3', 'issuesfixed', 'images', 'styles', 
'wikipages']:
-                parseDir(committeeId, "%s/%s" % (path, f))
-        # Eliminate non-source releases
-        elif not 
re.search(r"(MD5SUM|SHA1SUM|\.s?nupkg|\.md5|\.mds|\.sh1|\.sh2|\.sha|\.asc|\.sig|\.bin|\.pom|\.jar|\.whl|\.pdf|\.xml|\.xsd|\.html|\.txt|\.cfg|\.ish|\.pl|RELEASE.NOTES|LICENSE|KEYS|CHANGELOG|NOTICE|MANIFEST|Changes|readme|x86|amd64|-manual\.|-docs\.|-docs-|-doc-|Announcement|current|-deps|-dependencies|binary|-bin-|-bin\.|-javadoc-|-distro|rat_report|\.png|\.jpg|\.gif|\.sqlite|\.yaml|\.yml|\.prov)",
 f, flags=re.IGNORECASE):
-            filename = cleanFilename(f)
-            if len(filename) > 1:
-                if filename not in releases[committeeId]:
-                    releases[committeeId][filename] = d
-                    files[committeeId][filename] = []
-                    print("                  - %s\t\t\t%s" % (filename, f))
-                files[committeeId][filename].append("%s/%s" % (path, f))
-
-
-for committeeId, d, xdir in getDirList(mainurl):
-    if committeeId != 'incubator':
-        if committeeId not in ['xml', 'zzz', 'maven-repository']:
-            print("Parsing /dist/%s content:" % committeeId)
-            releases[committeeId] = releases[committeeId] if committeeId in 
releases else {}
-            files[committeeId] = {}
-            parseDir(committeeId, committeeId)
-            cleanReleases(committeeId)
-    else:
-        for podling, d, xd in getDirList("%s/incubator/" % mainurl):
-            print("Parsing /dist/incubator-%s content:" % podling)
-            committeeId = "incubator-%s" % podling
-            releases[committeeId] = releases[committeeId] if committeeId in 
releases else {}
-            files[committeeId] = {}
-            parseDir(committeeId, "incubator/%s" % podling)
-            cleanReleases(committeeId)
-
-print("Writing releases.json")
-with open("../../site/json/foundation/releases.json", "w") as f:
-    json.dump(releases, f, sort_keys=True, indent=0)
-    f.close()
-with open("../../site/json/foundation/releases-files.json", "w") as f:
-    json.dump(files, f, sort_keys=True, indent=0)
-    f.close()
-
-print("All done!")
def parseFile(committeeId, file, date, path):
    """Record one release artefact for a committee.

    committeeId -- top-level dist directory (or 'incubator-<podling>')
    file        -- base name of the artefact
    date        -- 'YYYY-MM-DD' timestamp from the listing
    path        -- full path of the artefact within the dist tree

    Files matching checksum/signature/doc/binary name patterns are
    ignored; the rest are grouped under a cleaned release id in the
    module-level releases/files dicts.
    """
    # Eliminate non-source releases (hashes, sigs, docs, binaries, ...)
    if not re.search(r"(MD5SUM|SHA1SUM|\.s?nupkg|\.md5|\.mds|\.sh1|\.sh2|\.sha|\.asc|\.sig|\.bin|\.pom|\.jar|\.whl|\.pdf|\.xml|\.xsd|\.html|\.txt|\.cfg|\.ish|\.pl|RELEASE.NOTES|LICENSE|KEYS|CHANGELOG|NOTICE|MANIFEST|Changes|readme|x86|amd64|-manual\.|-docs\.|-docs-|-doc-|Announcement|current|-deps|-dependencies|binary|-bin-|-bin\.|-javadoc-|-distro|rat_report|\.png|\.jpg|\.gif|\.sqlite|\.yaml|\.yml|\.prov)", file, flags=re.IGNORECASE):
        filename = cleanFilename(file)
        if len(filename) > 1:  # skip degenerate release ids
            if filename not in releases[committeeId]:
                # First artefact seen for this release id fixes its date
                releases[committeeId][filename] = date
                files[committeeId][filename] = []
                print(f"                  - (unknown)\t\t\t{file}")
            files[committeeId][filename].append(path)
+
# Don't visit these directories: they hold non-release content such as
# documentation, binaries, website files and maintenance areas.
SKIP_DIRS = ['hidden', 'css', 'META', 'website', 'binaries', 'repos',
             'updatesite', 'current', 'stable', 'stable1', 'stable2', 'binary', 'notes',
             'doc', 'eclipse', 'patches', 'docs', 'changes', 'features', 'tmp', 'cpp',
             'php', 'ruby', 'py', 'py3', 'issuesfixed', 'images', 'styles', 'wikipages']
+
def main():
    """Download (via cache) the find-ls listing and populate the
    module-level releases/files dicts from every plain file it lists.
    """
    uc = UrlCache(silent=True)
    find_ls = uc.get(FIND_LS, name='find-ls.gz')
    # Sample find -ls line and the field indices after split():
    #  45350913      4 drwxr-xr-x 239 svnwc    svnwc        4096 Sep  9 12:08 . [-> other]
    #      0         1     2       3   4         5            6   7   8  9    10
    with gzip.open(find_ls, mode='rt') as r:
        for l in r:
            fields = l.split() # split the find line (the split drops the final LF)
            if not fields[2].startswith('-'): # only want plain files
                continue
            path = fields.pop()[2:] # last entry on line is the path; also drop the ./ prefix
            segs = path.split('/')
            if len(segs) == 1: # ignore top level files
                continue
            file = segs.pop() # basename
            # Ignore invisible files
            if file.startswith('.') or file in ['favicon.ico', 'META']:
                continue
            # Skip any path that passes through a non-release directory
            if any( seg in SKIP_DIRS for seg in segs):
                # print('SKIP', segs)
                continue

            committeeId = segs[0]
            if committeeId in ['zzz']: # meta area holding find-ls.gz itself
                continue
            if committeeId == 'incubator':
                # Podlings are keyed as incubator-<podling>
                podling = segs[1]
                committeeId = f'incubator-{podling}'
            # Now store the info; the date fields precede the path, so
            # pop() them in reverse order (time-or-year, day, month).
            yort = fields.pop()
            dom = fields.pop()
            mon = fields.pop()
            stamp = getdate(yort, mon, dom)
            parseFile(committeeId, file, stamp, path)
+
if __name__ == '__main__':
    main()
    # Serialise both result dicts to the site's JSON data files.
    outputs = (
        ("releases.json", releases),
        ("releases-files.json", files),
    )
    for label, payload in outputs:
        print(f"Writing {label}")
        with open(f"../../site/json/foundation/{label}", "w") as out:
            json.dump(payload, out, sort_keys=True, indent=0)
    print("All done!")


Reply via email to