Author: lucas Date: 2009-11-14 08:18:33 +0000 (Sat, 14 Nov 2009) New Revision: 1628
Added: udd/scripts/fix-removal-timestamps.py udd/udd/removals_gatherer.py Modified: udd/config-org.yaml udd/crontabs udd/sql/setup.sql Log: add removals gatherer Modified: udd/config-org.yaml =================================================================== --- udd/config-org.yaml 2009-11-14 08:18:00 UTC (rev 1627) +++ udd/config-org.yaml 2009-11-14 08:18:33 UTC (rev 1628) @@ -22,6 +22,7 @@ dehs: module udd.dehs_gatherer ldap: module udd.ldap_gatherer wannabuild: module udd.wannabuild_gatherer + removals: module udd.removals_gatherer timestamp-dir: /org/udd.debian.org/timestamps lock-dir: /org/udd.debian.org/locks archs: @@ -445,3 +446,9 @@ i386, ia64, kfreebsd-amd64, kfreebsd-i386, mips, mipsel, powerpc, s390, sparc] +removals: + type: removals + update-command: wget -q http://ftp-master.debian.org/removals-full.txt -O - | scripts/fix-removal-timestamps.py > /org/udd.debian.org/mirrors/removals-full.txt + path: /org/udd.debian.org/mirrors/removals-full.txt + table: package_removal + schema: package_removal Modified: udd/crontabs =================================================================== --- udd/crontabs 2009-11-14 08:18:00 UTC (rev 1627) +++ udd/crontabs 2009-11-14 08:18:33 UTC (rev 1628) @@ -8,7 +8,7 @@ # Ubuntu Sources/Packages 30 2 * * * $UAR ubuntu-lucid ubuntu-karmic ubuntu-hardy ubuntu-intrepid ubuntu-jaunty # Various simple things -0 4 * * * $UAR dehs debian-popcon ubuntu-popcon lintian debtags carnivore ldap | /org/udd.debian.org/udd/scripts/filter-output.rb +0 4,16 * * * $UAR dehs debian-popcon ubuntu-popcon lintian debtags carnivore ldap removals | /org/udd.debian.org/udd/scripts/filter-output.rb 49 */6 * * * $UAR upload-history | /org/udd.debian.org/udd/scripts/filter-output.rb 0 */12 * * * $UAR testing-migrations 11 */4 * * * $UAR wannabuild | /org/udd.debian.org/udd/scripts/filter-output.rb Added: udd/scripts/fix-removal-timestamps.py =================================================================== --- 
#!/usr/bin/env python

# This file is a part of the Ultimate Debian Database
# <http://wiki.debian.org/UltimateDebianDatabase>
#
# Copyright (C) 2009 Serafeim Zanikolas <ser...@hellug.gr>
#
# This file is distributed under the terms of the General Public
# License version 3 or (at your option) any later version.

"""
Quick hack to fix broken timestamp entries in ftp-archive package removals
history file.

Before:

    [Date: Tue, 27 Oct 2009 19:41:19 +0000
    ] [ftpmaster: Archive Administrator]

After applying this script:

    [Date: Tue, 27 Oct 2009 19:41:19 +0000] [ftpmaster: Archive Administrator]

The script is a filter: it reads the removals file on stdin and writes the
repaired text to stdout.
"""

import sys

# A line starting with this text is the second half of a header that was
# split in two; it has to be glued back onto the line before it.
CONTINUATION_PREFIX = "] [ftpmaster:"


def fix_timestamps(lines):
    """Yield the given lines with split "[Date: ...] [ftpmaster: ...]"
    headers joined back into a single line.

    lines -- any iterable of strings (e.g. an open file); trailing
             whitespace, including the newline, is stripped from each one.

    Yields the repaired lines without a trailing newline.
    """
    pending = None  # previous line, held back until we have seen the next one
    for line in lines:
        line = line.rstrip()
        if pending is None:
            pending = line
        elif line.startswith(CONTINUATION_PREFIX):
            # Join even when the held-back line is empty rather than aborting
            # the whole run on an assertion.
            yield pending + line
            pending = None
        else:
            yield pending
            pending = line
    # Compare against None (not truthiness) so that an empty last line is
    # emitted instead of being silently dropped.
    if pending is not None:
        yield pending


def main():
    """Filter stdin to stdout."""
    for fixed in fix_timestamps(sys.stdin):
        # sys.stdout.write instead of the print statement so that the script
        # runs unchanged under both Python 2 and Python 3.
        sys.stdout.write(fixed + "\n")


if __name__ == "__main__":
    main()
#!/usr/bin/env python

# This file is a part of the Ultimate Debian Database
# <http://wiki.debian.org/UltimateDebianDatabase>
#
# Copyright (C) 2009 Serafeim Zanikolas <ser...@hellug.gr>
#
# This file is distributed under the terms of the General Public
# License version 3 or (at your option) any later version.

""" import data about the removal of packages (from the debian archive) in UDD

Raw data source: http://ftp-master.debian.org/removals-full.txt

Sample removal batch from the above file:

=========================================================================
[Date: Tue, 9 Jan 2001 20:52:51 -0500] [ftpmaster: James Troup]
Removed the following packages from unstable:

    dsniff |      2.3-1 | source, i386
Closed bugs: 81709

------------------- Reason -------------------
ROM; moved to non-US (now depends on libssl)
----------------------------------------------
=========================================================================

Note that a removal batch may have many packages removed (unlike the one
above, where only dsniff is removed).

When run as a standalone script this module will not connect to the database
but will instead run a basic sanity test (to make sure that the input file
hasn't changed in a way that would break the script).
"""

import sys
import re

from gatherer import gatherer
from aux import quote


def fail(msg):
    """Write msg to stderr and terminate the process with exit status 1."""
    sys.stderr.write("%s\n" % msg)
    # sys.exit() rather than the bare exit(): the latter is injected by the
    # site module for interactive use and is not guaranteed to exist.
    sys.exit(1)


def parse_removals(stream):
    """Parse a removals file and return the list of removal batches.

    stream -- iterable of text lines (e.g. an open file)

    Returns parser.removal_batches, i.e. one PackageRemovalBatch for every
    batch whose closing dashed line was seen.
    """
    # We expect lines to appear in the order below. parser.curr_func is set to
    # one of several functions based on what we expect to show up next in the
    # file.
    #
    # date; ftp-master name
    # distrib
    # skip_line*
    # pkg name | version | arch[, arch]     <-- >=1 lines like these
    # skip_line*
    #------------------- Reason -------------------
    # requestor; reasons

    parser = Parser()
    for line in stream:
        if parser.skip_line(line):
            continue
        # The handler's return value only tells whether the line matched;
        # lines that do not match the current state are simply ignored.
        parser.curr_func(line)
    return parser.removal_batches


def get_gatherer(connection, config, source):
    """Return the removals_gatherer built from the given arguments."""
    return removals_gatherer(connection, config, source)


class removals_gatherer(gatherer):
    """import removals into the database"""

    def __init__(self, connection, config, source):
        gatherer.__init__(self, connection, config, source)
        self.assert_my_config('path', 'table')

    def run(self):
        """Replace the contents of the removal tables with the parsed file.

        Reads the file at my_config['path'] and fills two tables: the one
        named by my_config['table'] and a second one with the same name plus
        a "_batch" suffix.  As created in sql/setup.sql by the same commit:

          package_removal_batch (id int PRIMARY KEY, time timestamp,
              ftpmaster text, distribution text, requestor text,
              reasons text)
          package_removal (batch_id int REFERENCES package_removal_batch(id),
              name text, version debversion, arch_array text[],
              PRIMARY KEY (batch_id, name, version))

        The batch id is the position of the batch in the input file.
        Terminates the process via fail() if the file cannot be opened.
        """
        conf = self.my_config

        try:
            input_fd = open(conf['path'])
        except IOError:
            fail('failed to open %s' % conf['path'])
        # Close the input file even if parsing raises.
        try:
            batch_removals = parse_removals(input_fd)
        finally:
            input_fd.close()

        pkg_removal_table = conf['table']
        pkg_removal_batch_table = "%s_batch" % conf['table']

        cur = self.cursor()
        # Empty the per-package table first: its rows reference the batch
        # table through batch_id.
        cur.execute('DELETE FROM %s' % pkg_removal_table)
        cur.execute('DELETE FROM %s' % pkg_removal_batch_table)

        # insert data for batches of removals
        cur.execute('PREPARE batch_removals_insert '
                    'AS INSERT INTO %s (id, time, ftpmaster, '
                    'distribution, requestor, reasons) '
                    'VALUES ($1, $2, $3, $4, $5, $6)'
                    % pkg_removal_batch_table)
        for i, batch_removal in enumerate(batch_removals):
            cur.execute('EXECUTE batch_removals_insert '
                        '(%s, %s, %s, %s, %s, %s)'
                        % (i, quote(batch_removal.timestamp),
                           quote(batch_removal.ftpmaster),
                           quote(batch_removal.distribution),
                           quote(batch_removal.requestor),
                           quote(batch_removal.reasons)))
        cur.execute('DEALLOCATE batch_removals_insert')
        cur.execute("ANALYZE %s" % pkg_removal_batch_table)

        # insert data for removals of individual packages; enumerate() hands
        # out the same ids as in the loop above, which ties every package to
        # its batch row
        cur.execute('PREPARE pkg_removal_insert '
                    'AS INSERT INTO %s (batch_id, name, version, '
                    'arch_array) '
                    'VALUES ($1, $2, $3, $4)' % pkg_removal_table)
        for i, batch_removal in enumerate(batch_removals):
            for pkg in batch_removal.packages:
                cur.execute('EXECUTE pkg_removal_insert (%s, %s, %s, %s)'
                            % (i, quote(pkg.name), quote(pkg.version),
                               quote("{%s}" % ",".join(pkg.arches))))
        cur.execute('DEALLOCATE pkg_removal_insert')
        cur.execute("ANALYZE %s" % pkg_removal_table)
def test(filename, removal_batches):
    """compare the number of parsed packages against those counted with a
    shell one-liner

    filename        -- path of the removals file that was parsed
    removal_batches -- the batches that parse_removals() returned for it

    Prints a one-sentence summary when both counts agree; otherwise (or when
    the shell pipeline fails) terminates the process via fail().
    """
    # "commands" only exists in Python 2; Python 3 offers the same function
    # in "subprocess". Same story for the shell-quoting helper.
    try:
        from commands import getstatusoutput
    except ImportError:
        from subprocess import getstatusoutput
    try:
        from shlex import quote as shell_quote
    except ImportError:
        from pipes import quote as shell_quote

    # The file name is quoted so that a path containing blanks or shell
    # metacharacters cannot break (or hijack) the pipeline.
    status, grep_output = getstatusoutput(
        r"egrep '[^ ]+ *\| *[^ ]+ *\| *[^ ]+' %s | "
        "awk '-F|' '{print $1, $2}' | sed 's/ */ /g' | wc -l"
        % shell_quote(filename))
    if status != 0:
        fail("failed to extract removed packages with grep")
    npackage_removals_via_grep = int(grep_output)

    npackage_removals_via_python = 0
    ftpmasters = set()
    distribs = set()
    for pkg_rm_batch in removal_batches:
        npackage_removals_via_python += len(pkg_rm_batch.packages)
        ftpmasters.add(pkg_rm_batch.ftpmaster)
        distribs.add(pkg_rm_batch.distribution)

    if npackage_removals_via_grep != npackage_removals_via_python:
        fail("%d removed packages have been parsed but %d were expected" %
             (npackage_removals_via_python, npackage_removals_via_grep))

    # sys.stdout.write instead of the print statement: works on Python 2 and 3
    sys.stdout.write(
        '%d packages were removed from %d distributions, in %d\n'
        'batches of removals done by %d ftpmaster members\n' %
        (npackage_removals_via_python, len(distribs),
         len(removal_batches), len(ftpmasters)))


class Package(object):
    """container for a single removed package"""
    def __init__(self, name, version, arches):
        """name, version -- as they appear in the removals file
        arches        -- comma-separated architectures, e.g. "source, i386"
        """
        self.name = name
        self.version = version
        # Drop empty entries (e.g. from a trailing comma) so that the array
        # literal built from this list never contains an empty element.
        self.arches = [arch.strip() for arch in arches.split(",")
                       if arch.strip()]

    def __str__(self):
        return '%s-%s' % (self.name, self.version)
class PackageRemovalBatch(object):
    """container for a removal batch (refers to one or more packages)"""
    def __init__(self, timestamp, ftpmaster):
        self.timestamp = timestamp
        self.ftpmaster = ftpmaster
        # the fields below are filled in while the rest of the batch is parsed
        self.distribution = None
        self.packages = []
        self.requestor = None
        self.reasons = None

    def add_pkg(self, pkg):
        """Append one removed package to this batch."""
        self.packages.append(pkg)

    def __str__(self):
        return "removal of %s at %s by %s from %s" \
                % ("\n".join([str(p) for p in self.packages]),
                   self.timestamp, self.ftpmaster, self.distribution)


class Parser(object):
    """Line-driven state machine for the removals file.

    curr_func always points at the handler for the kind of line expected
    next.  Every handler takes one line and returns True when the line
    matched (advancing the state where appropriate) and False otherwise.
    Finished batches are collected in removal_batches.
    """
    date_master_pat = re.compile(r"\[Date: ([^\]]+)] \[ftpmaster: ([^\]]+)\]")
    distrib_pat = re.compile(r"Removed the following packages from ([a-z-]+)[:,]*")
    pkg_version_arches_pat = re.compile(r"\s*(\S*) *\|\s*(\S+)\s*\|\s*(.*)$")
    reason_pat = re.compile(r"-+\s*Reason\s*-+")
    rene_pat = re.compile(r"(\[rene[^\]]*\])\s*(.*)")

    def __init__(self):
        self.removal_batch = None    # batch currently being assembled
        self.removal_batches = []    # batches that have been concluded
        self.curr_func = self.parse_removal

    def skip_line(self, line):
        """Return True for lines that are empty or whitespace only."""
        return not line.strip()

    def parse_removal(self, line):
        """Start a new batch at a "[Date: ...] [ftpmaster: ...]" line."""
        match = Parser.date_master_pat.search(line)
        if match:
            timestamp, ftpmaster = match.groups()
            self.removal_batch = PackageRemovalBatch(timestamp, ftpmaster)
            self.curr_func = self.parse_distrib
            return True
        return False

    def parse_distrib(self, line):
        """Record the distribution the packages were removed from."""
        match = Parser.distrib_pat.search(line)
        if match:
            self.removal_batch.distribution = match.group(1)
            self.curr_func = self.parse_pkg_version_arch_or_reason_header
            return True
        return False

    def parse_pkg_version_arch_or_reason_header(self, line):
        """Handle a "name | version | arches" line or the "Reason" header.

        Package lines keep the parser in this state (a batch may list many
        packages); the reason header moves on to the requestor/reasons line.
        """
        match = Parser.pkg_version_arches_pat.search(line)
        if match:
            pkg, version, arches = match.groups()
            pkg_obj = Package(pkg, version, arches)
            if self.removal_batch:
                self.removal_batch.add_pkg(pkg_obj)
            return True
        if self.removal_batch and Parser.reason_pat.search(line):
            self.curr_func = self.parse_requestor_reasons
            return True
        return False

    def parse_requestor_reasons(self, line):
        """Split the first line after the reason header into requestor and
        reasons.

        "[rene...] text" -> requestor "[rene...]", reasons "text"
        "ROM; text"      -> requestor "ROM",       reasons "text"
        "text"           -> requestor None,        reasons "text"

        NOTE(review): only this single line is stored; any further reason
        lines before the closing dashes are ignored by conclude_batch().
        """
        match = Parser.rene_pat.search(line)
        if match:
            requestor, reasons = match.groups()
        else:
            fields = line.split(';', 1)
            # len() is essential here: comparing the list itself to 1 is
            # never true, which used to turn a reason without a requestor
            # into a requestor with empty reasons.
            if len(fields) == 1:  # assume no requestor
                requestor, reasons = None, line
            else:
                requestor, reasons = fields
        # Strip in both branches so that the stored text never carries the
        # line's newline or the blank that follows the ';'.
        if requestor is not None:
            requestor = requestor.strip()
        self.removal_batch.requestor = requestor
        self.removal_batch.reasons = reasons.strip()
        self.curr_func = self.conclude_batch
        return True  # assume that we always get fed the correct line

    def conclude_batch(self, line):
        """Store the current batch once the closing dashed line shows up."""
        if line.startswith("---------") and self.removal_batch is not None:
            self.removal_batches.append(self.removal_batch)
            self.removal_batch = None
            self.curr_func = self.parse_removal
            return True
        return False


if '__main__' == __name__:
    import os

    # Standalone mode: parse the file given on the command line and run the
    # sanity test on the result; the database is not touched.
    try:
        filename = sys.argv[1]
        input_fd = open(filename)
    except IndexError:
        fail("syntax: %s <removals-file>\n" \
             "(when run from the command line will only prints stats)" \
             % os.path.basename(sys.argv[0]))
    except IOError:
        fail("failed to open %s" % filename)

    # Close the input file even if parsing raises.
    try:
        batch_removals = parse_removals(input_fd)
    finally:
        input_fd.close()
    test(filename, batch_removals)