Like other things posted here without notices to the contrary, this code is in the public domain.
I wrote about this before, using nested S-expressions and stuff. I got very similar performance results this time around, with a slightly simpler data format, and a slightly different set of metadata. I've been embedded in relational databases for the last year or so, so this time, the structure is flat, not nested, and this time, the parser for it actually works. But I neglected to include CRCs this time. This time, I'm using a modified RFC-822 format rather than S-expressions. Here's the indexing program: #!/usr/bin/python import sys, os, urllib, sha, md5, stat # simple filesystem indexing: basically metadata extraction. # Resource usage: Output format uses about 322 bytes per file # uncompressed, about 64 bytes compressed with gzip. In tests, it # indexes # 480 files per CPU second # 400 files per second # 6.1 megabytes per CPU second # 4.4 megabytes per second # My laptop presently contains 204108 files, 200032 inodes, # 9 669 636 096 bytes used. This suggests a maximum indexing type of # 507 seconds (9 minutes) if indexing files is the bottleneck, or 2198 # seconds (36 minutes) if indexing bytes is the bottleneck, which it # probably will be. And the index should occupy 13.1 megabytes or so. # I have many applications in mind to build atop this. For example: # - filesystem change detection: what changed? What's missing? # - filesystem access summary: what got accessed more recently than # most other things? # - backup indexing # - filesystem reconstruction, in whole or in part, from what you have # lying around and what you can download # - backup testing # - incremental indexing quote = lambda astring: urllib.quote_plus(astring, "/~,") def content_info(afile): (shasum, md5sum) = (sha.new(), md5.new()) initial = '' while 1: block = afile.read(4096) if block == '': return { 'fips_180_1_sha': shasum.hexdigest(), 'md5': md5sum.hexdigest(), 'initial_bytes': initial } if initial == '': initial = block[:40] shasum.update(block); md5sum.update(block) def metadata(pathname): rv = {'name': pathname} try: st = os.lstat(pathname) # may raise ENOENT rv.update({ # these are for reconstituting Unix filesystems: 'mode': oct(st.st_mode), 'uid_gid': ('%s_%s' % (st.st_uid, st.st_gid)), # these are for reconstituting hardlink structures: 'dev_inode': '%s_%s' % (st.st_dev, st.st_ino), # these are clues for finding matches: 'bytes': st.st_size, 'mtime': st.st_mtime, 'ctime': st.st_ctime, # this may be interesting to watch change over time, although # this script perturbs it: 'atime': st.st_atime, }) if stat.S_ISLNK(st.st_mode): # may raise EPERM, at least on Linux /proc/1/cwd rv['symlink'] = os.readlink(pathname) elif stat.S_ISREG(st.st_mode): # may raise EPERM rv.update(content_info(file(pathname))) except (IOError, OSError), exc: rv['error'] = '%s %s' % (exc.errno, exc.strerror) return rv def foo(arg, dirname, names): names.sort() # to encourage consistency for name in names: dd = metadata(os.path.join(dirname, name)) keys = dd.keys() keys.sort() # again, to encourage consistency for key in keys: sys.stdout.write('%s: %s\n' % (quote(key), quote(str(dd[key])))) sys.stdout.write("\n") def main(argv, output): os.path.walk(argv[1], foo, output) if __name__ == '__main__': main(sys.argv, sys.stdout) Here's a Python module that can read data in the format that the above program produces: #!/usr/bin/python "Reads and indexes sequences of records, where each record is a hash." # TODO: # - indexing # - object ids # - put writing code in here instead of elsewhere # - name things sensibly # - documenting # - updating # - inserting records # - removing truncated records at end of file # - removing records # - more flexibility for .where(): # - None arguments # - dicts # - general predicates # - sets # - ephemeral dbs (without files) # - somedb.foo as somedb.where(table='foo') --- good idea? # - joins # - sorts from __future__ import generators import urllib class record: "Represents a record read from the file." def __init__(self, contents): self.__contents = contents def __getitem__(self, name): return self.__contents[name] def __getattr__(self, name): return self[name] def __repr__(self): return 'record(%s)' % `self.__contents` class db: "Base class for queries and file-readers, i.e. sequences of records." def __len__(self): ii = 0 for rec in self: ii += 1 return ii def where(self, **condition): return query(self, condition) class filereader(db): "Reads records from a file." def __init__(self, fn): self.file = file(fn) def __iter__(self): return filereader_iter(self) def record_at(self, fp): self.file.seek(fp) rv = {} while 1: line = self.file.readline() if not line.endswith('\n'): return None # truncated record, XXX test line = line[:-1] # strip off newline if line == '': return record(rv) name, value = map(urllib.unquote, line.split(': ')) rv[name] = value def tell(self): return self.file.tell() class filereader_iter: "The iterator for a filereader." def __init__(self, _db): self.db = _db self.fp = 0 def __iter__(self): return self def next(self): nextrec = self.db.record_at(self.fp) if not nextrec: raise StopIteration self.fp = self.db.tell() return nextrec # XXX add indexing! class query(db): def __init__(self, contents, condition): (self.contents, self.condition) = (contents, condition) def __iter__(self): for item in self.contents: if self.ok(item): yield item def ok(self, item): for key, value in self.condition.iteritems(): if item[key] != value: return 0 return 1 def ok(a, b): assert a == b, (a, b) def test(): reader = filereader('sample-fsdex') ok(len(reader), 9) items = list(reader) ok(len(items), 9) item = filter(lambda rec: rec['dev_inode'] == '769_1460247', items)[0] ok(item['atime'], '1074220012') ok(item.atime, '1074220012') ok(item['name'], '../4096dict') ok(item.name, '../4096dict') ok(item['dev_inode'], '769_1460247') items = list(reader.where(dev_inode='769_1460247')) ok(len(items), 1) ok(items[0].name, '../4096dict') # test unescaping of names and values items = list(reader.where(dev_inode='769_1460288')) ok(len(items), 1) # unfortunately the %25's decode to % signs, and this file was put # here by links, so this is a little confusing ok(items[0].name, '../Toshiba%20Pin%20Password%20Reset.htm') ok(items[0]['random thingy'], 'yes') # .where on a query items = list(reader.where(name='../MboxBackward.t~').where(uid_gid='500_500')) ok(len(items), 1) ok(items[0].bytes, '67') # and with negative results ok(len(reader.where(name='../MboxBackward.t~').where(bytes='0')), 0) # multiple conditions in a single query ok(len(reader.where(name='../MboxBackward.t~', bytes='67')), 1) ok(len(reader.where(name='../MboxBackward.t~', bytes='0')), 0) if __name__ == '__main__': test() The regression tests in the above module refer to a file called "sample-fsdex". Here's sample-fsdex, with """ marks at the beginning and end: """atime: 1074220012 bytes: 27570 ctime: 1069308971 dev_inode: 769_1460247 fips_180_1_sha: 9d760fbb24803ae3d464afb1737cac5861ae368d md5: f098a92989326963b36ce41a8898af9b mode: 0100664 mtime: 1069308971 name: ../4096dict uid_gid: 500_500 atime: 1074220012 bytes: 30992 ctime: 1068802397 dev_inode: 769_1459758 fips_180_1_sha: 9963b14069ed6bca7481b952748e1ef1319c96c7 md5: 0eb3334a8dd767d80ef0659543f1d274 mode: 0100664 mtime: 1068802397 name: ../4096words uid_gid: 500_500 atime: 1074220013 bytes: 103722 ctime: 1069308910 dev_inode: 769_1459802 fips_180_1_sha: 5357c0d526552b7d3d64abbaa13170b7505dd048 md5: 31b07b296cc8b334321685892583f19d mode: 0100644 mtime: 1069308910 name: ../5530-words uid_gid: 500_500 atime: 1074220013 bytes: 983 ctime: 1073618036 dev_inode: 769_1460474 fips_180_1_sha: 49b6908ade9312348883a49c6ba7d6f56ed1997a md5: 71d4fabff9a9803d1d6df43d7326874b mode: 0100664 mtime: 1073618036 name: ../MboxBackward.pm uid_gid: 500_500 atime: 1074220013 bytes: 34 ctime: 1073614407 dev_inode: 769_1460473 fips_180_1_sha: 27453a8353aff01fe936bc3c72c993b429dd2d4a md5: a24d148f6aa1d64b893f3ec693beab88 mode: 0100664 mtime: 1073614406 name: ../MboxBackward.pm~ uid_gid: 500_500 atime: 1074220013 bytes: 1061 ctime: 1073618035 dev_inode: 769_1460476 fips_180_1_sha: c15244778f00e5ade72a9605e756321f08830265 md5: b6c5624303b828a5dd4f49822b45c5e9 mode: 0100755 mtime: 1073618035 name: ../MboxBackward.t uid_gid: 500_500 atime: 1074220013 bytes: 67 ctime: 1073614459 dev_inode: 769_1460475 fips_180_1_sha: f39fcc72b9296000a831a8e3e7e4b146de23b9b8 md5: cad90f1a600598b91e962d3793dbd7b1 mode: 0100664 mtime: 1073614439 name: ../MboxBackward.t~ uid_gid: 500_500 atime: 1074220014 bytes: 1624 ctime: 1072920796 dev_inode: 769_1460288 fips_180_1_sha: a2e62e2d1bfd0e654eeae0b7976478c418dfcaa4 md5: 5da8553a4f8f3f136853c44e8322529c mode: 0100664 mtime: 1072920796 name: ../Toshiba%2520Pin%2520Password%2520Reset.htm random%20thingy: yes uid_gid: 500_500 atime: 1074220014 bytes: 34 ctime: 1071351217 dev_inode: 769_1459799 fips_180_1_sha: 3e2cecdd58a68e2787eefe4f01ffc50954a82b39 md5: fb0bbac5153f2e0cc53bdad8188e533f mode: 0100664 mtime: 1071351217 name: ../allmsgs uid_gid: 500_500 """ (That blank line at the end is important. It signifies that the record is complete.)