On Sun, Jun 02, 2013 at 06:49:09PM +0200, Niels Thykier wrote: > During the current full run, lintian.d.o ran out of inodes (1.9M). At > the moment, I have disabled experimental which I hope will work around > the problem for now.
I was working on something entirely different which happened to relate to this issue. I sought the answer to the question "What are popular file types in binary packages?". Lintian happens to be part of the answer, because it runs file on all packages and saves the results as file-info.gz (per package). Since grepping 40k .gz files was not an option, I converted them to a sqlite database. Now replacing all those file-info.gz files with one sqlite database surely reduces inode usage! You can find my import script attached. It expects a lintian laboratory with the layout on lilburn and writes to ./fileinfo.sqlite3. If run repeatedly it imports only new packages (incremental runs should be fast, provided you have an index on the package table). Note that the output of file is preprocessed to remove ELF BuildIds, image dimensions, creation times and similar stuff one generally doesn't need when checking packages. A fresh import took about two hours on lilburn and was mostly I/O-bound. The resulting database is 431MB. Note that most of the space used is allocated to filenames. So if index.gz and index-owner.gz are imported into the very same database, I'd expect it to grow by no more than 100MB (excluding indices). I'll leave the db around on lilburn.d.o:~helmutg/fileinfo.sqlite3 for inspection for some time. Hope this helps. Helmut
#!/usr/bin/python3
# share results with #710813
"""Import lintian ``file-info.gz`` results into one SQLite database.

The script walks the ``pool`` directory of a lintian laboratory, reads the
``file-info.gz`` of every ``<package>_<version>_<arch>_binary`` directory and
stores one row per file in ``fileinfo.sqlite3``.  Volatile details of the
recorded type guesses (BuildIDs, image dimensions, timestamps, ...) are
stripped first so that equal file types collapse to the same guess string.
Packages already present in the database are skipped, so repeated runs are
incremental.
"""

import gzip
import os
import re
import sqlite3
from contextlib import closing

DEFAULT_LABPATH = "/srv/lintian.debian.org/laboratory"
DEFAULT_DBPATH = "fileinfo.sqlite3"

# Laboratory directory name of a binary package: <package>_<version>_<arch>_binary
_BINPKG_DIR_RE = re.compile("^([^_]+)_([^_]+)_([^_]+)_binary$")


def _visible_entries(directory):
    """Return the sorted entries of *directory* that do not start with a dot."""
    return sorted(
        name for name in os.listdir(directory) if not name.startswith(".")
    )


def find_file_info(labpath="/srv/lintian.debian.org/laboratory"):
    """Yield ``(package, version, arch, path)`` for every binary package.

    *labpath* is the root of the laboratory; packages are looked up in
    ``<labpath>/pool/<letter>/<srcpkg>/<package>_<version>_<arch>_binary``.
    *path* is the ``file-info.gz`` inside that directory.  Entries whose name
    starts with a dot, non-directories, directories not matching the binary
    package naming scheme and packages without a ``file-info.gz`` are skipped.
    Results are produced in sorted directory order so runs are reproducible.
    """
    pool = os.path.join(labpath, "pool")
    for letter in _visible_entries(pool):
        subdir = os.path.join(pool, letter)
        if not os.path.isdir(subdir):
            continue
        for srcpkg in _visible_entries(subdir):
            srcdir = os.path.join(subdir, srcpkg)
            if not os.path.isdir(srcdir):
                continue
            for binpkg in sorted(os.listdir(srcdir)):
                match = _BINPKG_DIR_RE.match(binpkg)
                if not match:
                    continue
                package, version, arch = match.groups()
                fileinfo = os.path.join(srcdir, binpkg, "file-info.gz")
                if os.path.isfile(fileinfo):
                    yield (package, version, arch, fileinfo)


# Maps the first character of a guess to the list of (compiled pattern,
# replacement) pairs to apply, in registration order.
saniztizers = dict()


def add_sanitizer(firstletter, pattern, repl):
    """Register a substitution for guesses starting with *firstletter*.

    *pattern* is compiled once here; *repl* is the replacement passed to
    ``pattern.sub``.  Keying by first letter keeps the number of regexes
    tried per guess small.
    """
    saniztizers.setdefault(firstletter, []).append((re.compile(pattern), repl))


# compressors
add_sanitizer("g", '^(gzip compressed data), was "[^"]*"(.*)', r"\1\2")
add_sanitizer("g", "^(gzip compressed data.*), last modified: [A-Za-z0-9: ]+(.*)", r"\1\2")
# binaries
add_sanitizer("E", "^(ELF .*)(?:, BuildID\\[sha1\\]=0x[a-f0-9]{40})+(.*)", r"\1\2")
add_sanitizer("E", "^(ELF .*), for GNU/Linux \\d+\\.\\d+\\.\\d+(.*)", r"\1\2")
# non-files
add_sanitizer("s", "^(symbolic link) to `.*'$", r"\1")
add_sanitizer("b", "^(broken symbolic link) to `.*'$", r"\1")
# image formats
add_sanitizer("G", r"^(GIF image data, version 8[79]a), \d+ x \d+$", r"\1")
add_sanitizer("P", r"^(PNG image data), \d+ x \d+(.*)", r"\1\2")
add_sanitizer("J", r'^(JPEG image data, JFIF standard 1\.0[12]), comment: "[^"]*"$', r"\1")
add_sanitizer("J", r"^(JPEG image data, JFIF standard 1\.0[12].*), \d+x\d+$", r"\1")
add_sanitizer("P", r"^(PCX ver\. 3\.0 image data) bounding box \[\d+, \d+\] - \[\d+, \d+\](.*)", r"\1\2")
add_sanitizer("A", r"^(Adobe Photoshop Image[^,]*), \d+ x \d+(.*)$", r"\1\2")
add_sanitizer("P", r"^(PC bitmap, Windows 3\.x format), \d+ x \d+ x \d+$", r"\1")
add_sanitizer("S", r"^(SGI image data, RLE), [23]-D, \d+ x \d+, \d+ channels?$", r"\1")
add_sanitizer("T", r"^(Targa image data - RGB(?: - RLE)?) \d+ x \d+$", r"\1")
add_sanitizer("M", r"^(MS Windows icon resource)(?: - \d+ icons?)?(?:, \d+x\d+)?(?:, \d+-colors)?$", r"\1")
# audio formats
add_sanitizer("O", r"^(Ogg data, Vorbis audio.*), \d+ Hz(.*)", r"\1\2")
add_sanitizer("O", r"^(Ogg data, Vorbis audio.*), ~\d+ bps(.*)", r"\1\2")
add_sanitizer("O", "^(Ogg data, Vorbis audio.*), created by: .*?$", r"\1")
add_sanitizer("S", r"^(Standard MIDI data \(format 1\)) using \d+ tracks at 1/\d+$", r"\1")
add_sanitizer("R", r"^(RIFF \(little-endian\) data, WAVE audio, Microsoft PCM), \d+ bit, (?:mono|stereo) \d+ Hz$", r"\1")
add_sanitizer("R", r"^(RIFF \(little-endian\) data, WAVE audio, Microsoft ADPCM), (?:mono|stereo) \d+ Hz$", r"\1")
add_sanitizer("F", r"^(FLAC audio bitstream data.*), \d+ samples$", r"\1")
add_sanitizer("4", '^(4-channel Protracker module sound data) Title: "[^"]*"$', r"\1")
add_sanitizer("F", '^(Fasttracker II module sound data) Title: "[^"]*"$', r"\1")
add_sanitizer("I", '^(Impulse Tracker module sound data -) ".*?"( compatible .* created .*)$', r"\1\2")
# GIMP formats (images, registered here to keep the original order)
add_sanitizer("G", r"^(GIMP XCF image data.*), \d+ x \d+(.*)$", r"\1\2")
add_sanitizer("G", "^(GIMP pattern data), .*$", r"\1")
# other
add_sanitizer("G", r"^(GNU message catalog \((?:big|little) endian\), revision [01]\.[01]), \d+ messages(, \d+ sysdep messages)?$", r"\1")
add_sanitizer("P", '^(Palm OS dynamic library) data ".*?"$', r"\1")
add_sanitizer("P", r"^(PostScript Type 1 font text) \(.*\)$", r"\1")
add_sanitizer("Q", r"^(Quake II 3D Model file), \d+ skin\(s\), \(\d+ x \d+\), \d+ frame\(s\), Frame size \d+ bytes, \d+ vertices/frame, \d+ texture coordinates, \d+ triangles/frame$", r"\1")
add_sanitizer("Q", r"^(Quake I or II world or extension), \d+ entries$", r"\1")
add_sanitizer("G", r"^(G-IR binary database, v4\.0), \d+ entries/\d+ local$", r"\1")
add_sanitizer("t", r"^(timezone data, version 2), \d+ gmt time flags?, \d+ std time flags?, (?:no|\d+) leap seconds?, (?:no|\d+) transition times?, \d+ abbreviation chars?$", r"\1")
add_sanitizer("T", r"^(TeX font metric data) \(.*\)$", r"\1")
add_sanitizer("T", r"^(TeX DVI file) \(TeX output [0-9.:]+\\213\)$", r"\1")
add_sanitizer("C", "^(Composite Document File V2 Document.*?)(?:, Last Printed: [A-Za-z0-9: ]+)?(?:, Create Time/Date: [A-Za-z0-9: ]+)?(.*?)$", r"\1\2")
add_sanitizer("M", r"^(Minix filesystem.*), \d+ zones$", r"\1")
add_sanitizer("P", r"^(Protein Data Bank data), ID Code \w{4}, \d\d-[A-Z]{3}-\d\d$", r"\1")
add_sanitizer("D", r"^(DBase 3 data file.*) \(\d+ records\)$", r"\1")
add_sanitizer("E", r"^(ESRI Shapefile.*) length \d+(.*)$", r"\1\2")
add_sanitizer("S", r"^(SVR2 (?:pure )?executable .*) - version \d+$", r"\1")
add_sanitizer("S", "^(Sendmail frozen configuration) - version .*$", r"\1")


def sanitize_guess(guess):
    """Return *guess* with every sanitizer registered for its first letter applied.

    Sanitizers run in registration order, each on the output of the previous
    one.  An empty guess, or one whose first character has no sanitizers, is
    returned unchanged.
    """
    # Slicing (instead of indexing) keeps the empty string from raising.
    for pat, repl in saniztizers.get(guess[0:1], []):
        guess = pat.sub(repl, guess)
    return guess


def read_file_info(path):
    """Yield ``(filename, guess)`` for each record of the gzip file *path*.

    Every line is expected to look like ``<filename>NUL<guess>``.  *filename*
    is yielded as raw bytes so the caller can decide how to treat names that
    are not valid UTF-8; *guess* is yielded as stripped, sanitized text.
    Lines without a NUL separator are reported on stdout and skipped.
    """
    with gzip.GzipFile(path) as fileinfo:
        for line in fileinfo:
            try:
                # The file is read as bytes, so the separator must be bytes
                # too; a text separator raises TypeError on Python 3.
                filename, guess = line.split(b"\0", 1)
            except ValueError:
                print("parse error for %s" % path)
                continue
            # The sanitizer patterns are text patterns, so decode before
            # matching; undecodable bytes are replaced rather than fatal.
            guess = guess.strip().decode("utf-8", "replace")
            yield filename, sanitize_guess(guess)


def _create_schema(cur):
    """Create the package, guess and content tables unless they exist."""
    cur.execute("CREATE TABLE IF NOT EXISTS package (id INTEGER PRIMARY KEY, name TEXT NOT NULL, version NOT NULL, architecture NOT NULL);")
    cur.execute("CREATE TABLE IF NOT EXISTS guess (id INTEGER PRIMARY KEY, guess TEXT UNIQUE NOT NULL);")
    cur.execute("CREATE TABLE IF NOT EXISTS content (pid INTEGER REFERENCES package(id) ON DELETE CASCADE, filename TEXT, gid INTEGER REFERENCES guess(id));")


def _guess_id(cur, guess):
    """Return the id of *guess* in the guess table, inserting it if missing."""
    cur.execute("SELECT id FROM guess WHERE guess = ?;", (guess,))
    row = cur.fetchone()
    if row is not None:
        return row[0]
    cur.execute("INSERT INTO guess (guess) VALUES (?);", (guess,))
    return cur.lastrowid


def _import_package(cur, package, version, arch, fileinfopath):
    """Insert one package row plus a content row per file in *fileinfopath*.

    Files whose name is not valid UTF-8 are reported on stdout and skipped.
    """
    cur.execute("INSERT INTO package (name, version, architecture) VALUES (?, ?, ?);", (package, version, arch))
    pid = cur.lastrowid
    for filename, guess in read_file_info(fileinfopath):
        try:
            filename = filename.decode("utf-8")
        except UnicodeDecodeError:
            print("non-utf8 filename found")
            continue
        gid = _guess_id(cur, guess)
        cur.execute("INSERT INTO content (pid, filename, gid) VALUES (?, ?, ?);", (pid, filename, gid))


def main(labpath=DEFAULT_LABPATH, dbpath=DEFAULT_DBPATH):
    """Import every not yet imported binary package below *labpath* into *dbpath*.

    Each package is written in its own transaction: it is committed once all
    of its files are inserted and rolled back if importing it raises, so the
    database never keeps a half-imported package.  The connection is closed
    on exit, including on error.
    """
    with closing(sqlite3.connect(dbpath)) as db:
        cur = db.cursor()
        cur.execute("PRAGMA foreign_keys=ON;")
        _create_schema(cur)
        for package, version, arch, fileinfopath in find_file_info(labpath):
            cur.execute("SELECT id FROM package WHERE name = ? AND version = ? AND architecture = ?;", (package, version, arch))
            if cur.fetchone():
                continue
            print("processing %s" % package)
            # The connection context manager commits on success and rolls
            # back on an exception; the first INSERT opens the transaction.
            with db:
                _import_package(cur, package, version, arch, fileinfopath)


if __name__ == "__main__":
    main()