Author: sebb
Date: Tue Sep 12 20:58:20 2023
New Revision: 1912272
URL: http://svn.apache.org/viewvc?rev=1912272&view=rev
Log:
Use find-ls listing instead of fetching and parsing web pages.
This reduces run-time to less than a minute
compared with about 15 minutes previously
Modified:
comdev/projects.apache.org/trunk/scripts/cronjobs/parsereleases.py
Modified: comdev/projects.apache.org/trunk/scripts/cronjobs/parsereleases.py
URL:
http://svn.apache.org/viewvc/comdev/projects.apache.org/trunk/scripts/cronjobs/parsereleases.py?rev=1912272&r1=1912271&r2=1912272&view=diff
==============================================================================
--- comdev/projects.apache.org/trunk/scripts/cronjobs/parsereleases.py
(original)
+++ comdev/projects.apache.org/trunk/scripts/cronjobs/parsereleases.py Tue Sep
12 20:58:20 2023
@@ -1,9 +1,15 @@
+#!/usr/bin/env python3
+
import errtee # this is imported for its side-effects
+from collections import defaultdict
+import gzip
import re
import json
-from urlutils import URLopen
+from datetime import datetime
+from urlutils import UrlCache
+
"""
-Reads the list of files in http://www.apache.org/dist/
+Reads the list of files in https://downloads.apache.org/zzz/find-ls.gz
Creates:
../../site/json/foundation/releases.json
@@ -17,37 +23,33 @@ The date comes from the first entry
Format:
{ top-level dir: { release-id: [list of files for that release-id]}, ... }
-TODO: it would probably be more efficient to parse the output of
-svn ls -R https://dist.apache.org/repos/dist/release/
-Could cache the output based on the last changed date
-
-Or use https://downloads.apache.org/zzz/find-ls.gz
-
-Or use an rsync listing:
-rsync --list-only -r rsync.apache.org::apache-dist
-Note that rsync excludes hashes, sigs and KEYS files; however they are not
needed here.
"""
-releases = {}
-files = {}
-mainurl = "https://downloads.apache.org/"
-
-# don't try to maintain history for the moment...
-#try:
-# with open("../../site/json/foundation/releases.json") as f:
-# releases = json.loads(f.read())
-# f.close()
-#except Exception as err:
-# print("Could not read releases.json, assuming blank slate")
-
-def getDirList(url):
- print(url)
- try:
- data = URLopen(url).read().decode('utf-8')
- for entry, xd, xdate in re.findall(r"<a
href=\"([^\"/]+)(/?)\">\S+</a>\s+(\d\d\d\d-\d\d-\d\d)", data, re.MULTILINE |
re.UNICODE):
- yield(entry, xdate, xd)
- except:
- pass
+# Listing generated by find(1) -ls
+FIND_LS = 'https://downloads.apache.org/zzz/find-ls.gz'
+
+# key: committee-id, value: dict(key: release version, value: date)
+releases = defaultdict(dict)
+
+# key: committee-id, value: dict(key: release version, value: list of file
names for the release)
+files = defaultdict(lambda: defaultdict(list))
+
def getdate(yort, mon, dom):
    """Convert the date fields of a find(1) -ls line to 'YYYY-MM-DD'.

    find -ls prints month and day of month, followed by either the year
    (for entries older than about 6 months) or HH:MM (for recent entries).
    If the year is missing, assume the current year; if that would place
    the date in the future, assume the previous year instead.

    yort -- year (e.g. '2022') or time-of-day ('HH:MM') string
    mon  -- abbreviated month name, e.g. 'Sep'
    dom  -- day of month, possibly unpadded
    """
    now = datetime.utcnow()
    if ':' in yort:  # HH:MM form: year not provided, try the current year first
        try:
            stamp = datetime.strptime(f'{now.year} {mon} {dom} {yort}', '%Y %b %d %H:%M')
        except ValueError:
            # 'Feb 29' parsed against a non-leap current year: the entry can
            # only belong to the previous (leap) year, so fall through
            stamp = None
        if stamp is None or stamp > now:  # a future date must be last year's
            stamp = datetime.strptime(f'{now.year-1} {mon} {dom} {yort}', '%Y %b %d %H:%M')
    else:
        stamp = datetime.strptime(f'{yort} {mon} {dom}', '%Y %b %d')
    return stamp.strftime('%Y-%m-%d')
def cleanFilename(filename):
"""
@@ -76,49 +78,60 @@ def cleanReleases(committeeId):
del releases[committeeId]
del files[committeeId]
-def parseDir(committeeId, path):
- print(" %s..." % path)
- if len(path) > 100:
- print("WARN too long path: recursion?")
- return
- for f, d, xd in getDirList("%s/%s" % (mainurl, path)):
- if xd:
- if ("/%s" % f) not in path and f.lower() not in ['binaries',
'repos', 'updatesite', 'current', 'stable', 'stable1', 'stable2', 'binary',
'notes', 'doc', 'eclipse', 'patches', 'docs', 'changes', 'features', 'tmp',
'cpp', 'php', 'ruby', 'py', 'py3', 'issuesfixed', 'images', 'styles',
'wikipages']:
- parseDir(committeeId, "%s/%s" % (path, f))
- # Eliminate non-source releases
- elif not
re.search(r"(MD5SUM|SHA1SUM|\.s?nupkg|\.md5|\.mds|\.sh1|\.sh2|\.sha|\.asc|\.sig|\.bin|\.pom|\.jar|\.whl|\.pdf|\.xml|\.xsd|\.html|\.txt|\.cfg|\.ish|\.pl|RELEASE.NOTES|LICENSE|KEYS|CHANGELOG|NOTICE|MANIFEST|Changes|readme|x86|amd64|-manual\.|-docs\.|-docs-|-doc-|Announcement|current|-deps|-dependencies|binary|-bin-|-bin\.|-javadoc-|-distro|rat_report|\.png|\.jpg|\.gif|\.sqlite|\.yaml|\.yml|\.prov)",
f, flags=re.IGNORECASE):
- filename = cleanFilename(f)
- if len(filename) > 1:
- if filename not in releases[committeeId]:
- releases[committeeId][filename] = d
- files[committeeId][filename] = []
- print(" - %s\t\t\t%s" % (filename, f))
- files[committeeId][filename].append("%s/%s" % (path, f))
-
-
-for committeeId, d, xdir in getDirList(mainurl):
- if committeeId != 'incubator':
- if committeeId not in ['xml', 'zzz', 'maven-repository']:
- print("Parsing /dist/%s content:" % committeeId)
- releases[committeeId] = releases[committeeId] if committeeId in
releases else {}
- files[committeeId] = {}
- parseDir(committeeId, committeeId)
- cleanReleases(committeeId)
- else:
- for podling, d, xd in getDirList("%s/incubator/" % mainurl):
- print("Parsing /dist/incubator-%s content:" % podling)
- committeeId = "incubator-%s" % podling
- releases[committeeId] = releases[committeeId] if committeeId in
releases else {}
- files[committeeId] = {}
- parseDir(committeeId, "incubator/%s" % podling)
- cleanReleases(committeeId)
-
-print("Writing releases.json")
-with open("../../site/json/foundation/releases.json", "w") as f:
- json.dump(releases, f, sort_keys=True, indent=0)
- f.close()
-with open("../../site/json/foundation/releases-files.json", "w") as f:
- json.dump(files, f, sort_keys=True, indent=0)
- f.close()
-
-print("All done!")
def parseFile(committeeId, file, date, path):
    """Record one candidate release artifact for a committee.

    Skips files whose names mark them as non-source artifacts (hashes,
    signatures, docs, binaries, ...); otherwise derives the release id
    from the file name and records the date of the first file seen for
    that release plus the full path of every file belonging to it.
    """
    # Pattern matching files that are NOT part of a source release
    nonsource = r"(MD5SUM|SHA1SUM|\.s?nupkg|\.md5|\.mds|\.sh1|\.sh2|\.sha|\.asc|\.sig|\.bin|\.pom|\.jar|\.whl|\.pdf|\.xml|\.xsd|\.html|\.txt|\.cfg|\.ish|\.pl|RELEASE.NOTES|LICENSE|KEYS|CHANGELOG|NOTICE|MANIFEST|Changes|readme|x86|amd64|-manual\.|-docs\.|-docs-|-doc-|Announcement|current|-deps|-dependencies|binary|-bin-|-bin\.|-javadoc-|-distro|rat_report|\.png|\.jpg|\.gif|\.sqlite|\.yaml|\.yml|\.prov)"
    if re.search(nonsource, file, flags=re.IGNORECASE):
        return  # not a source release artifact
    filename = cleanFilename(file)
    if len(filename) <= 1:
        return  # cleaning left nothing usable as a release id
    if filename not in releases[committeeId]:
        # First file seen for this release id: record its date
        releases[committeeId][filename] = date
        files[committeeId][filename] = []
        print(f" - (unknown)\t\t\t{file}")
    files[committeeId][filename].append(path)
+
# Don't visit these directories (frozenset for O(1) membership tests)
SKIP_DIRS = frozenset(['hidden', 'css', 'META', 'website', 'binaries', 'repos',
    'updatesite', 'current', 'stable', 'stable1', 'stable2', 'binary', 'notes',
    'doc', 'eclipse', 'patches', 'docs', 'changes', 'features', 'tmp', 'cpp',
    'php', 'ruby', 'py', 'py3', 'issuesfixed', 'images', 'styles', 'wikipages'])

def parse_find_entry(line):
    """Parse one find(1) -ls listing line.

    Returns (committeeId, file, yort, mon, dom, path) for a plain file
    that should be considered for release tracking, or None for entries
    to be skipped (directories/links, top-level files, invisible files,
    skipped directories, the zzz metadata tree).
    """
    # 45350913    4 drwxr-xr-x 239 svnwc svnwc  4096 Sep  9 12:08 . [-> other]
    # 0           1 2          3   4     5     6     7    8  9    10
    # maxsplit=10 keeps whitespace inside the path intact; a plain split()
    # would silently truncate such paths to their last word
    fields = line.rstrip('\n').split(None, 10)
    if len(fields) < 11 or not fields[2].startswith('-'):  # only want plain files
        return None
    path = fields[10][2:]  # drop the './' prefix
    segs = path.split('/')
    if len(segs) == 1:  # ignore top level files
        return None
    file = segs.pop()  # basename
    # Ignore invisible files and site housekeeping
    if file.startswith('.') or file in ['favicon.ico', 'META']:
        return None
    if any(seg in SKIP_DIRS for seg in segs):
        return None
    committeeId = segs[0]
    if committeeId in ['zzz']:  # listing metadata, not a committee
        return None
    if committeeId == 'incubator':
        # A file directly under incubator/ has no podling segment left
        # after the basename was popped; skip it rather than IndexError
        if len(segs) < 2:
            return None
        committeeId = f'incubator-{segs[1]}'
    mon, dom, yort = fields[7], fields[8], fields[9]
    return (committeeId, file, yort, mon, dom, path)

def main():
    """Fetch the find-ls listing and populate releases/files from it."""
    uc = UrlCache(silent=True)
    find_ls = uc.get(FIND_LS, name='find-ls.gz')
    with gzip.open(find_ls, mode='rt') as r:
        for l in r:
            entry = parse_find_entry(l)
            if entry is None:
                continue
            committeeId, file, yort, mon, dom, path = entry
            stamp = getdate(yort, mon, dom)
            parseFile(committeeId, file, stamp, path)

if __name__ == '__main__':
    main()
    print("Writing releases.json")
    with open("../../site/json/foundation/releases.json", "w") as f:
        json.dump(releases, f, sort_keys=True, indent=0)
    print("Writing releases-files.json")
    with open("../../site/json/foundation/releases-files.json", "w") as f:
        json.dump(files, f, sort_keys=True, indent=0)
    print("All done!")