filesystem metadata indexing, yet again

Kragen Sitaker Wed, 04 Feb 2004 01:39:45 -0800

Like other things posted here without notices to the contrary, this
code is in the public domain.


I wrote about this before, using nested S-expressions and stuff.  I
got very similar performance results this time around, with a slightly
simpler data format, and a slightly different set of metadata.  I've
been embedded in relational databases for the last year or so, so this
time, the structure is flat, not nested, and this time, the parser for
it actually works.  But I neglected to include CRCs this time.

This time, I'm using a modified RFC-822 format rather than
S-expressions.

Here's the indexing program:
#!/usr/bin/python
import sys, os, urllib, sha, md5, stat

# simple filesystem indexing: basically metadata extraction.

# Resource usage: Output format uses about 322 bytes per file
# uncompressed, about 64 bytes compressed with gzip.  In tests, it
# indexes
# 480 files per CPU second
# 400 files per second
# 6.1 megabytes per CPU second
# 4.4 megabytes per second

# My laptop presently contains 204108 files, 200032 inodes,
# 9 669 636 096 bytes used.  This suggests a maximum indexing type of
# 507 seconds (9 minutes) if indexing files is the bottleneck, or 2198
# seconds (36 minutes) if indexing bytes is the bottleneck, which it
# probably will be.  And the index should occupy 13.1 megabytes or so.

# I have many applications in mind to build atop this.  For example:
# - filesystem change detection: what changed?  What's missing?
# - filesystem access summary: what got accessed more recently than
#   most other things?
# - backup indexing
# - filesystem reconstruction, in whole or in part, from what you have
#   lying around and what you can download
# - backup testing
# - incremental indexing

quote = lambda astring: urllib.quote_plus(astring, "/~,")

def content_info(afile):
    (shasum, md5sum) = (sha.new(), md5.new())
    initial = ''
    while 1:
        block = afile.read(4096)
        if block == '': return { 'fips_180_1_sha': shasum.hexdigest(),
                                 'md5': md5sum.hexdigest(),
                                 'initial_bytes': initial }
        if initial == '': initial = block[:40]
        shasum.update(block); md5sum.update(block)

def metadata(pathname):
    rv = {'name': pathname}
    try:
        st = os.lstat(pathname)  # may raise ENOENT
        rv.update({
            # these are for reconstituting Unix filesystems:
            'mode': oct(st.st_mode), 'uid_gid': ('%s_%s' % (st.st_uid, st.st_gid)),
            # these are for reconstituting hardlink structures:
            'dev_inode': '%s_%s' % (st.st_dev, st.st_ino),
            # these are clues for finding matches:
            'bytes': st.st_size, 'mtime': st.st_mtime, 'ctime': st.st_ctime,
            # this may be interesting to watch change over time, although
            # this script perturbs it:
            'atime': st.st_atime,
            })
        if stat.S_ISLNK(st.st_mode):
            # may raise EPERM, at least on Linux /proc/1/cwd
            rv['symlink'] = os.readlink(pathname) 
        elif stat.S_ISREG(st.st_mode):
            # may raise EPERM
            rv.update(content_info(file(pathname)))
    except (IOError, OSError), exc:
        rv['error'] = '%s %s' % (exc.errno, exc.strerror)
    return rv

def foo(arg, dirname, names):
    names.sort() # to encourage consistency
    for name in names:
        dd = metadata(os.path.join(dirname, name))
        keys = dd.keys()
        keys.sort() # again, to encourage consistency
        for key in keys:
            sys.stdout.write('%s: %s\n' % (quote(key), quote(str(dd[key]))))
        sys.stdout.write("\n")

def main(argv, output):
    os.path.walk(argv[1], foo, output)

if __name__ == '__main__': main(sys.argv, sys.stdout)


Here's a Python module that can read data in the format that the above
program produces:

#!/usr/bin/python
"Reads and indexes sequences of records, where each record is a hash."

# TODO:
# - indexing
# - object ids
# - put writing code in here instead of elsewhere
# - name things sensibly
# - documenting
# - updating
#   - inserting records
#   - removing truncated records at end of file
#   - removing records
#   - more flexibility for .where():
#     - None arguments
#     - dicts
#     - general predicates
#     - sets
# - ephemeral dbs (without files)
# - somedb.foo as somedb.where(table='foo') --- good idea?
# - joins
# - sorts

from __future__ import generators

import urllib

class record:
    "Represents a record read from the file."
    def __init__(self, contents): self.__contents = contents
    def __getitem__(self, name): return self.__contents[name]
    def __getattr__(self, name): return self[name]
    def __repr__(self):
        return 'record(%s)' % `self.__contents`
    

class db:
    "Base class for queries and file-readers, i.e. sequences of records."
    def __len__(self):
        ii = 0
        for rec in self: ii += 1
        return ii
    def where(self, **condition): return query(self, condition)

class filereader(db):
    "Reads records from a file."
    def __init__(self, fn): self.file = file(fn)
    def __iter__(self): return filereader_iter(self)
    def record_at(self, fp):
        self.file.seek(fp)
        rv = {}
        while 1:
            line = self.file.readline()
            if not line.endswith('\n'): return None  # truncated record, XXX test
            line = line[:-1]  # strip off newline
            if line == '': return record(rv)
            name, value = map(urllib.unquote, line.split(': '))
            rv[name] = value
    def tell(self): return self.file.tell()

class filereader_iter:
    "The iterator for a filereader."
    def __init__(self, _db):
        self.db = _db
        self.fp = 0
    def __iter__(self): return self
    def next(self):
        nextrec = self.db.record_at(self.fp)
        if not nextrec: raise StopIteration
        self.fp = self.db.tell()
        return nextrec

# XXX add indexing!
class query(db):
    def __init__(self, contents, condition):
        (self.contents, self.condition) = (contents, condition)
    def __iter__(self):
        for item in self.contents:
            if self.ok(item): yield item
    def ok(self, item):
        for key, value in self.condition.iteritems():
            if item[key] != value: return 0
        return 1


def ok(a, b): assert a == b, (a, b)

def test():
    reader = filereader('sample-fsdex')
    ok(len(reader), 9)
    items = list(reader)
    ok(len(items), 9)
    item = filter(lambda rec: rec['dev_inode'] == '769_1460247', items)[0]
    ok(item['atime'], '1074220012')
    ok(item.atime, '1074220012')
    ok(item['name'], '../4096dict')
    ok(item.name, '../4096dict')
    ok(item['dev_inode'], '769_1460247')

    items = list(reader.where(dev_inode='769_1460247'))
    ok(len(items), 1)
    ok(items[0].name, '../4096dict')

    # test unescaping of names and values
    items = list(reader.where(dev_inode='769_1460288'))
    ok(len(items), 1)
    # unfortunately the %25's decode to % signs, and this file was put
    # here by links, so this is a little confusing
    ok(items[0].name, '../Toshiba%20Pin%20Password%20Reset.htm')
    ok(items[0]['random thingy'], 'yes')

    # .where on a query
    items = list(reader.where(name='../MboxBackward.t~').where(uid_gid='500_500'))
    ok(len(items), 1)
    ok(items[0].bytes, '67')
    # and with negative results
    ok(len(reader.where(name='../MboxBackward.t~').where(bytes='0')), 0)

    # multiple conditions in a single query
    ok(len(reader.where(name='../MboxBackward.t~', bytes='67')), 1)
    ok(len(reader.where(name='../MboxBackward.t~', bytes='0')), 0)

if __name__ == '__main__': test()


The regression tests in the above module refer to a file called
"sample-fsdex".  Here's sample-fsdex, with """ marks at the beginning
and end:

"""atime: 1074220012
bytes: 27570
ctime: 1069308971
dev_inode: 769_1460247
fips_180_1_sha: 9d760fbb24803ae3d464afb1737cac5861ae368d
md5: f098a92989326963b36ce41a8898af9b
mode: 0100664
mtime: 1069308971
name: ../4096dict
uid_gid: 500_500

atime: 1074220012
bytes: 30992
ctime: 1068802397
dev_inode: 769_1459758
fips_180_1_sha: 9963b14069ed6bca7481b952748e1ef1319c96c7
md5: 0eb3334a8dd767d80ef0659543f1d274
mode: 0100664
mtime: 1068802397
name: ../4096words
uid_gid: 500_500

atime: 1074220013
bytes: 103722
ctime: 1069308910
dev_inode: 769_1459802
fips_180_1_sha: 5357c0d526552b7d3d64abbaa13170b7505dd048
md5: 31b07b296cc8b334321685892583f19d
mode: 0100644
mtime: 1069308910
name: ../5530-words
uid_gid: 500_500

atime: 1074220013
bytes: 983
ctime: 1073618036
dev_inode: 769_1460474
fips_180_1_sha: 49b6908ade9312348883a49c6ba7d6f56ed1997a
md5: 71d4fabff9a9803d1d6df43d7326874b
mode: 0100664
mtime: 1073618036
name: ../MboxBackward.pm
uid_gid: 500_500

atime: 1074220013
bytes: 34
ctime: 1073614407
dev_inode: 769_1460473
fips_180_1_sha: 27453a8353aff01fe936bc3c72c993b429dd2d4a
md5: a24d148f6aa1d64b893f3ec693beab88
mode: 0100664
mtime: 1073614406
name: ../MboxBackward.pm~
uid_gid: 500_500

atime: 1074220013
bytes: 1061
ctime: 1073618035
dev_inode: 769_1460476
fips_180_1_sha: c15244778f00e5ade72a9605e756321f08830265
md5: b6c5624303b828a5dd4f49822b45c5e9
mode: 0100755
mtime: 1073618035
name: ../MboxBackward.t
uid_gid: 500_500

atime: 1074220013
bytes: 67
ctime: 1073614459
dev_inode: 769_1460475
fips_180_1_sha: f39fcc72b9296000a831a8e3e7e4b146de23b9b8
md5: cad90f1a600598b91e962d3793dbd7b1
mode: 0100664
mtime: 1073614439
name: ../MboxBackward.t~
uid_gid: 500_500

atime: 1074220014
bytes: 1624
ctime: 1072920796
dev_inode: 769_1460288
fips_180_1_sha: a2e62e2d1bfd0e654eeae0b7976478c418dfcaa4
md5: 5da8553a4f8f3f136853c44e8322529c
mode: 0100664
mtime: 1072920796
name: ../Toshiba%2520Pin%2520Password%2520Reset.htm
random%20thingy: yes
uid_gid: 500_500

atime: 1074220014
bytes: 34
ctime: 1071351217
dev_inode: 769_1459799
fips_180_1_sha: 3e2cecdd58a68e2787eefe4f01ffc50954a82b39
md5: fb0bbac5153f2e0cc53bdad8188e533f
mode: 0100664
mtime: 1071351217
name: ../allmsgs
uid_gid: 500_500

"""

(That blank line at the end is important.  It signifies that the
record is complete.)

filesystem metadata indexing, yet again

Reply via email to