Bug#710813: lintian: Restructure the laboratory/data storage

Helmut Grohne Sat, 03 Aug 2013 09:46:27 -0700

On Sun, Jun 02, 2013 at 06:49:09PM +0200, Niels Thykier wrote:
> During the current full run, lintian.d.o ran out of inodes (1.9M).  At
> the moment, I have disabled experimental which I hope will work around
> the problem for now.


I was working on something entirely different which happened to relate
to this issue. I sought the answer to the question "What are popular
file types in binary packages?". Lintian happens to be part of the
answer, because it runs file(1) on all packages and saves the results as
file-info.gz (one per package). Since grepping 40k .gz files was not an
option, I converted them to an SQLite database. Replacing all those
file-info.gz files with one SQLite database would surely reduce inode usage!

You can find my import script attached. It expects a lintian laboratory
with the layout on lilburn and writes to ./fileinfo.sqlite3. If run
repeatedly it imports only new packages (incremental runs should be
fast, provided you have an index on the package table). Note that the
output of file is preprocessed to remove ELF BuildIds, image dimensions,
creation times and similar stuff one generally doesn't need when
checking packages. A fresh import took about two hours on lilburn and
was mostly I/O-bound. The resulting database is 431MB in size.

Note that most of the space used is allocated to filenames. So if
index.gz and index-owner.gz are imported into the very same database,
I'd expect it to grow by no more than 100MB (excluding indices).

I'll leave the db around on lilburn.d.o:~helmutg/fileinfo.sqlite3 for
inspection for some time.

Hope this helps.

Helmut

#!/usr/bin/python
# share results with #710813

import gzip
import os
import re
import sqlite3

def find_file_info(labpath="/srv/lintian.debian.org/laboratory"):
    """Locate the file-info.gz of every binary package in a laboratory.

    Walks ``<labpath>/pool/<dir>/<dir>/<entry>`` and, for each entry named
    ``<package>_<version>_<arch>_binary`` that contains a regular file
    ``file-info.gz``, yields ``(package, version, arch, path_to_file_info)``.
    Names starting with a dot and non-directories are skipped on the first
    two levels.
    """
    binary_entry = re.compile("^([^_]+)_([^_]+)_([^_]+)_binary$")
    pool = os.path.join(labpath, "pool")
    for prefix in os.listdir(pool):
        prefix_dir = os.path.join(pool, prefix)
        if prefix.startswith(".") or not os.path.isdir(prefix_dir):
            continue
        for source_name in os.listdir(prefix_dir):
            source_dir = os.path.join(prefix_dir, source_name)
            if source_name.startswith(".") or not os.path.isdir(source_dir):
                continue
            for entry in os.listdir(source_dir):
                parsed = binary_entry.match(entry)
                if parsed is None:
                    continue
                candidate = os.path.join(source_dir, entry, "file-info.gz")
                if os.path.isfile(candidate):
                    name, version, architecture = parsed.groups()
                    yield (name, version, architecture, candidate)

# Registry of sanitizing rules: maps the first character of a file(1) guess
# to a list of (compiled pattern, replacement) pairs in registration order.
saniztizers = {}


def add_sanitizer(firstletter, pattern, repl):
    """Register a substitution rule under the key ``firstletter``.

    ``pattern`` is compiled with ``re.compile`` and stored together with
    ``repl``; rules sharing a key are kept in the order they were added.
    """
    if firstletter not in saniztizers:
        saniztizers[firstletter] = []
    rule = (re.compile(pattern), repl)
    saniztizers[firstletter].append(rule)

# Rule table.  Each call registers (first character of the guess, pattern,
# replacement).  sanitize_guess() looks rules up by the first character of a
# guess and applies every rule for that character in the order registered
# here.  The rules drop per-file details (names, timestamps, dimensions,
# counts, ...) so that guesses for files of the same type compare equal.

# compressors
add_sanitizer("g", '^(gzip compressed data), was "[^"]*"(.*)', r"\1\2")
add_sanitizer("g", "^(gzip compressed data.*), last modified: [A-Za-z0-9: ]+(.*)", r"\1\2")
# binaries
# NOTE(review): the BuildID rule only matches the "0x"-prefixed sha1 form;
# other spellings of the BuildID field would pass through unchanged.
add_sanitizer("E", "^(ELF .*)(?:, BuildID\\[sha1\\]=0x[a-f0-9]{40})+(.*)", r"\1\2")
add_sanitizer("E", "^(ELF .*), for GNU/Linux \\d+\\.\\d+\\.\\d+(.*)", r"\1\2")
# non-files (the link target is dropped)
add_sanitizer("s", "^(symbolic link) to `.*'$", r"\1")
add_sanitizer("b", "^(broken symbolic link) to `.*'$", r"\1")
# image formats (dimensions, comments and colour counts are dropped)
add_sanitizer("G", r"^(GIF image data, version 8[79]a), \d+ x \d+$", r"\1")
add_sanitizer("P", r"^(PNG image data), \d+ x \d+(.*)", r"\1\2")
add_sanitizer("J", r'^(JPEG image data, JFIF standard 1\.0[12]), comment: "[^"]*"$', r"\1")
add_sanitizer("J", r"^(JPEG image data, JFIF standard 1\.0[12].*), \d+x\d+$", r"\1")
add_sanitizer("P", r"^(PCX ver\. 3\.0 image data) bounding box \[\d+, \d+\] - \[\d+, \d+\](.*)", r"\1\2")
add_sanitizer("A", r"^(Adobe Photoshop Image[^,]*), \d+ x \d+(.*)$", r"\1\2")
add_sanitizer("P", r"^(PC bitmap, Windows 3\.x format), \d+ x \d+ x \d+$", r"\1")
add_sanitizer("S", r"^(SGI image data, RLE), [23]-D, \d+ x \d+, \d+ channels?$", r"\1")
add_sanitizer("T", r"^(Targa image data - RGB(?: - RLE)?) \d+ x \d+$", r"\1")
add_sanitizer("M", r"^(MS Windows icon resource)(?: - \d+ icons?)?(?:, \d+x\d+)?(?:, \d+-colors)?$", r"\1")
# audio formats (sample rates, bit rates, titles and encoder names are dropped)
add_sanitizer("O", r"^(Ogg data, Vorbis audio.*), \d+ Hz(.*)", r"\1\2")
add_sanitizer("O", r"^(Ogg data, Vorbis audio.*), ~\d+ bps(.*)", r"\1\2")
add_sanitizer("O", "^(Ogg data, Vorbis audio.*), created by: .*?$", r"\1")
add_sanitizer("S", r"^(Standard MIDI data \(format 1\)) using \d+ tracks at 1/\d+$", r"\1")
add_sanitizer("R", r"^(RIFF \(little-endian\) data, WAVE audio, Microsoft PCM), \d+ bit, (?:mono|stereo) \d+ Hz$", r"\1")
add_sanitizer("R", r"^(RIFF \(little-endian\) data, WAVE audio, Microsoft ADPCM), (?:mono|stereo) \d+ Hz$", r"\1")
add_sanitizer("F", r"^(FLAC audio bitstream data.*), \d+ samples$", r"\1")
add_sanitizer("4", '^(4-channel Protracker module sound data) Title: "[^"]*"$', r"\1")
add_sanitizer("F", '^(Fasttracker II module sound data) Title: "[^"]*"$', r"\1")
add_sanitizer("I", '^(Impulse Tracker module sound data -) ".*?"( compatible .* created .*)$', r"\1\2")
# more image formats (GIMP); listed here after the audio entries
add_sanitizer("G", r"^(GIMP XCF image data.*), \d+ x \d+(.*)$", r"\1\2")
add_sanitizer("G", "^(GIMP pattern data), .*$", r"\1")
# other
add_sanitizer("G", r"^(GNU message catalog \((?:big|little) endian\), revision [01]\.[01]), \d+ messages(, \d+ sysdep messages)?$", r"\1")
add_sanitizer("P", '^(Palm OS dynamic library) data ".*?"$', r"\1")
add_sanitizer("P", r"^(PostScript Type 1 font text) \(.*\)$", r"\1")
add_sanitizer("Q", r"^(Quake II 3D Model file), \d+ skin\(s\), \(\d+ x \d+\), \d+ frame\(s\), Frame size \d+ bytes, \d+ vertices/frame, \d+ texture coordinates, \d+ triangles/frame$", r"\1")
add_sanitizer("Q", r"^(Quake I or II world or extension), \d+ entries$", r"\1")
add_sanitizer("G", r"^(G-IR binary database, v4\.0), \d+ entries/\d+ local$", r"\1")
add_sanitizer("t", r"^(timezone data, version 2), \d+ gmt time flags?, \d+ std time flags?, (?:no|\d+) leap seconds?, (?:no|\d+) transition times?, \d+ abbreviation chars?$", r"\1")
add_sanitizer("T", r"^(TeX font metric data) \(.*\)$", r"\1")
# The pattern below matches a literal backslash followed by "213" in the guess text.
add_sanitizer("T", r"^(TeX DVI file) \(TeX output [0-9.:]+\\213\)$", r"\1")
add_sanitizer("C", "^(Composite Document File V2 Document.*?)(?:, Last Printed: [A-Za-z0-9: ]+)?(?:, Create Time/Date: [A-Za-z0-9: ]+)?(.*?)$", r"\1\2")
add_sanitizer("M", r"^(Minix filesystem.*), \d+ zones$", r"\1")
add_sanitizer("P", r"^(Protein Data Bank data), ID Code \w{4}, \d\d-[A-Z]{3}-\d\d$", r"\1")
add_sanitizer("D", r"^(DBase 3 data file.*) \(\d+ records\)$", r"\1")
add_sanitizer("E", r"^(ESRI Shapefile.*) length \d+(.*)$", r"\1\2")
add_sanitizer("S", r"^(SVR2 (?:pure )?executable .*) - version \d+$", r"\1")
add_sanitizer("S", "^(Sendmail frozen configuration)  - version .*$", r"\1")

def sanitize_guess(guess):
    """Return ``guess`` with every matching sanitizer rule applied.

    Rules are selected by the first character of the incoming guess (an
    empty guess selects none) and applied one after another in
    registration order, each operating on the previous rule's output.
    """
    rules = saniztizers.get(guess[:1], ())
    for regex, replacement in rules:
        guess = regex.sub(replacement, guess)
    return guess

def read_file_info(path):
    """Yield ``(filename, guess)`` pairs from a gzip-compressed file-info file.

    Each line is expected to hold a filename and a file type guess separated
    by a NUL byte.  Lines without a NUL are reported on stdout and skipped.

    The filename is yielded as the raw bytes read from the file so the caller
    can decide how to handle undecodable names.  The guess is stripped,
    decoded to text and passed through ``sanitize_guess`` before it is
    yielded.
    """
    with gzip.GzipFile(path) as fileinfo:
        for line in fileinfo:
            try:
                # GzipFile yields bytes, so the separator must be bytes too;
                # a str separator raises TypeError on Python 3.
                filename, guess = line.split(b"\0", 1)
            except ValueError:
                print("parse error for %s" % path)
                continue
            # The sanitizer patterns are text patterns, so decode the guess
            # before applying them.  Undecodable bytes are replaced rather
            # than aborting the whole import.
            guess = guess.strip().decode("utf-8", "replace")
            yield filename, sanitize_guess(guess)

def main():
    """Import new packages from the laboratory into ./fileinfo.sqlite3.

    Creates the ``package``, ``guess`` and ``content`` tables if they do not
    exist yet, then walks everything ``find_file_info()`` yields.  A package
    whose (name, version, architecture) triple is already in the ``package``
    table is skipped; every other package is imported and committed on its
    own, so an interrupted run loses at most the package in progress.
    Files whose names are not valid UTF-8 are reported and left out.
    """
    db = sqlite3.connect("fileinfo.sqlite3")
    try:
        cur = db.cursor()
        cur.execute("PRAGMA foreign_keys=ON;")
        cur.execute("CREATE TABLE IF NOT EXISTS package (id INTEGER PRIMARY KEY, name TEXT NOT NULL, version NOT NULL, architecture NOT NULL);")
        cur.execute("CREATE TABLE IF NOT EXISTS guess (id INTEGER PRIMARY KEY, guess TEXT UNIQUE NOT NULL);")
        cur.execute("CREATE TABLE IF NOT EXISTS content (pid INTEGER REFERENCES package(id) ON DELETE CASCADE, filename TEXT, gid INTEGER REFERENCES guess(id));")
        # The "already imported?" lookup below runs once per package; without
        # an index it scans the whole package table every time.
        cur.execute("CREATE INDEX IF NOT EXISTS package_name_version_architecture ON package (name, version, architecture);")
        for package, version, arch, fileinfopath in find_file_info():
            cur.execute("SELECT id FROM package WHERE name = ? AND version = ? AND architecture = ?;",
                        (package, version, arch))
            if cur.fetchone():
                continue
            print("processing %s" % package)
            cur.execute("BEGIN;")
            cur.execute("INSERT INTO package (name, version, architecture) VALUES (?, ?, ?);",
                        (package, version, arch))
            pid = cur.lastrowid
            for filename, guess in read_file_info(fileinfopath):
                try:
                    filename = filename.decode("utf-8")
                except UnicodeDecodeError:
                    print("non-utf8 filename found")
                    continue
                # Reuse the id of a guess that was seen before, otherwise
                # add it to the guess table.
                cur.execute("SELECT id FROM guess WHERE guess = ?;", (guess,))
                row = cur.fetchone()
                if row:
                    gid = row[0]
                else:
                    cur.execute("INSERT INTO guess (guess) VALUES (?);", (guess,))
                    gid = cur.lastrowid
                cur.execute("INSERT INTO content (pid, filename, gid) VALUES (?, ?, ?);",
                            (pid, filename, gid))
            db.commit()
    finally:
        # Always release the database handle; closing without a commit
        # discards the partially imported package, if any.
        db.close()
            
# Run the import only when this file is executed as a script.
if __name__ == "__main__":
    main()

Bug#710813: lintian: Restructure the laboratory/data storage

Reply via email to