I described some of the problems I have with file and backup
management in a kragen-tol post in March 2002, entitled "file
catalogs and backups":
http://lists.canonical.org/pipermail/kragen-tol/2002-March/000691.html
http://www.mail-archive.com/kragen-tol@lists.canonical.org/msg00037.html
Now, spurred by recent major hardware failures in my life, I've
implemented two pieces of the grandiose system I laid out there:
- a filesystem indexer that spits out all the metadata for my files
(although it doesn't yet look inside of tar and zip files for more
files);
- a broken program to answer the simplest interesting question I could
think of with such a catalog: what files are duplicated in more than one
place on the filesystem?
They follow.
fs-snapshot:
#!/usr/local/bin/python
# extract enough info about your filesystem that you could, at least
# in principle, tell if it had changed, or reconstruct its old state,
# if you had the files lying around somewhere; yet this info should
# remain relatively small, so that it's practical to keep it close at
# hand.
# With just pathname, adler32, CRC32, MD5, and SHA-1, this data adds
# up to about 64 bytes per file gzipped, or 164 ungzipped, and it
# processes about 189 files (or 4.27 megs) per second. This means
# that gentle's root partition's 288613 files consuming 8.2 gigabytes
# should take no more than 1920 seconds to index (or 1527 if the
# bottleneck was per-file and not per-byte in the smaller benchmark),
# and the aggregate database should be 18 megabytes compressed or 47
# uncompressed. That's good enough to eat!
# After adding the rest of the metadata, it takes 78 bytes per file
# gzipped or 244 ungzipped; time to index 931 files totaling 21M has
# increased to 6.17 seconds, or 150 files or 3.4 megabytes per second;
# that's 21.5 mebibytes for an index of gentle's root partition, and
# 2400 seconds or so to index it.
# In actual fact, indexing gentle's entire filesystem produced an
# index file that gzipped to 19 462 116 bytes (76 762 702 bytes
# ungzipped), in 77 minutes 14.45 seconds wallclock time, with 39
# minutes 25.42 seconds user time and 13 minutes 37.88 seconds system
# time. That's a total of 3183 seconds of CPU time to index 13 278
# 767 kiB of data, or 4.3 MB/cpusec. The file indexed 296 440 files,
# so it was only able to index 93 files per CPU second. Wallclock
# speeds were 64 files and 2.9 MB per second. The gzipped index file
# used 65.7 bytes per file; gunzipped, it used 259 bytes per file.
# All of this was on a 500MHz AMD K6-3.
# BUGS:
# - when it scans multiple hardlinks to the same file, it rereads the file
# each time.
# - it doesn't save major and minor numbers of device files, which makes it
# usable but painful for system backups.
# - nothing reads its output yet
# - it doesn't save user and group names
# - it doesn't yet generate THEX/Tiger tree hashes
# - it prints some useless information --- e.g. sizes of directories
# and symlinks
# - it doesn't include metadata about the entire file. It would be nice to
# know not just that some /bin/sh was a symlink to 'bash', but that at
# 2003-03-30, the /bin/sh in gentle.canonical.org's filesystem was a symlink
# to 'bash'. If you know that some '/bin/bash' once had 461400
# bytes that have an MD5 checksum of b5d4cad2a9edb1cd647b7ef86a2488, you
# might then know whether that /bin/bash is the same one you wish you had
# on your current filesystem. But if you know that that /bin/bash is the
# one on panacea.canonical.org as of yesterday, then you can take some
# useful action, like copying it into the place you wish it was!
import sys, os, md5, sha, zlib, string, stat
def dictstr(adict):
    """Render a dict as one "key: value" line per entry."""
    lines = ["%s: %s\n" % item for item in adict.items()]
    return ''.join(lines)
class checksums:
    """Compute Adler-32, CRC-32, MD5, and SHA-1 over a file's contents.

    Reads fileobj to EOF in 4096-byte chunks, then closes it.  The
    results are exposed via as_string() as "Name: hexvalue" lines.
    """
    def __init__(self, fileobj):
        # Seed the rolling checksums with the empty-string value so the
        # incremental updates below compose correctly.
        self.adler32 = zlib.adler32("")
        self.crc32 = zlib.crc32("")
        self.md5 = md5.new()
        self.sha1 = sha.new()
        while 1:
            instr = fileobj.read(4096)
            if instr == '': break
            self.update(instr)
        fileobj.close()
    def update(self, instr):
        # adler32/crc32 take the running value as a second argument;
        # the digest objects accumulate state internally.
        self.crc32 = zlib.crc32(instr, self.crc32)
        self.adler32 = zlib.adler32(instr, self.adler32)
        self.md5.update(instr)
        self.sha1.update(instr)
    def hexstring(self, astr):
        """Return astr as lowercase hex, two digits per byte.

        BUG FIX: was '%x', which dropped the leading zero of any byte
        below 0x10 and so produced ambiguous, variable-length digests
        (note the 30-character MD5 in the header comments above).
        """
        return ''.join(["%02x" % ord(char) for char in astr])
    def as_string(self):
        return dictstr({
            'Adler32': '%x' % self.adler32,
            'CRC32': '%x' % self.crc32,
            'MD5': self.hexstring(self.md5.digest()),
            'SHA-1': self.sha1.hexdigest(),
        })
# Map the file-type bits of an lstat mode (stat.S_IFMT) to a
# human-readable name.  Uses the stat module's symbolic constants
# instead of duplicating their octal values by hand.
typemap = {
    stat.S_IFIFO: 'FIFO',
    stat.S_IFCHR: 'Character device',
    stat.S_IFDIR: 'Dir',
    stat.S_IFBLK: 'Block device',
    stat.S_IFREG: 'File',
    stat.S_IFLNK: 'Symlink',
    stat.S_IFSOCK: 'Socket',
}
class get_metadata:
    """lstat() one path and present the pieces of the result we record."""
    def __init__(self, pathname):
        # lstat, not stat: we want to describe symlinks, not follow them.
        self.statdata = os.lstat(pathname)
        self.mode = self.statdata[stat.ST_MODE]
    def is_regular_file(self):
        return stat.S_ISREG(self.mode)
    def islink(self):
        return stat.S_ISLNK(self.mode)
    def type(self):
        # Human-readable name for the file-type bits of the mode.
        return typemap[stat.S_IFMT(self.mode)]
    def as_string(self):
        # lstat returns: mode ino dev nlink uid gid size atime mtime ctime.
        # nlink, atime, and ctime are deliberately ignored, because they
        # don't help with backup or restore of files.
        sd = self.statdata
        return dictstr({
            'Permissions': '%05o' % stat.S_IMODE(self.mode),
            'Inum.dev': '%d.%d' % (sd[stat.ST_INO], sd[stat.ST_DEV]),
            'uid.gid': '%d.%d' % (sd[stat.ST_UID], sd[stat.ST_GID]),
            'Bytes': sd[stat.ST_SIZE],
            'Mtime': sd[stat.ST_MTIME],
            'Type': self.type(),
        })
def dumpfile(output, pathname):
    """Write one blank-line-terminated metadata record for pathname.

    Regular files additionally get their checksums; symlinks get their
    target.  Path and target are written as Python reprs so they can be
    eval()ed back on the reading side.
    """
    output.write(dictstr({'Path': repr(pathname)}))
    meta = get_metadata(pathname)
    output.write(meta.as_string())
    if meta.is_regular_file():
        output.write(checksums(open(pathname)).as_string())
    if meta.islink():
        output.write(dictstr({'Symlink': repr(os.readlink(pathname))}))
    # XXX save device numbers on block and char devices?
    output.write("\n")
def dumpfiles(output, dirname, filenames):
    """os.path.walk callback: dump one record per directory entry."""
    for entry in filenames:
        pathname = "%s/%s" % (dirname, entry)
        dumpfile(output, pathname)
def main(argv):
    """Walk each directory named on the command line, dumping metadata
    records for everything found to stdout.  Returns a shell exit code."""
    roots = argv[1:]
    if not roots:
        sys.stderr.write("Usage: %s dir [dir [dir ...]]\n" % argv[0])
        return 1
    for root in roots:
        os.path.walk(root, dumpfiles, sys.stdout)
    return 0
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == '__main__': sys.exit(main(sys.argv))
find-dups:
#!/usr/local/bin/python
# read output produced by fs-snapshot and find out which files occur in many
# places
# ook, this code horrifies. it breaks on filenames with spaces, due to a
# quick-and-dirty implementation, and I don't have time to fix it before
# sending it out. Sorry.
import sys, operator, string, tempfile, os
# Fields whose values were written as Python reprs by fs-snapshot;
# the reader eval()s these back into Python values.
evalable = ('path', 'bytes', 'symlink')
def read_file_rec(input):
    """Read one fs-snapshot record ("Name: value" lines ended by a blank
    line) from the file object input.

    Returns a dict keyed by lowercased field name, or None at EOF.
    Any partial record at EOF is discarded.  Fields listed in evalable
    were written as Python reprs and are eval()ed back.

    BUG FIX: removed two unreachable trailing lines that referenced an
    undefined name `filerecs` (dead code from an earlier revision; the
    loop below only ever exits via return).
    """
    rv = {}
    while 1:
        line = input.readline()
        if line == '': return None   # EOF
        if line == '\n': return rv   # blank line terminates the record
        while line[-1:] == '\n': line = line[:-1]
        name, value = line.split(':', 1)
        name = name.lower()
        value = value.lstrip()
        # XXX eval() of catalog data can execute arbitrary code; only
        # feed this listings you trust.
        if name in evalable: value = eval(value)
        rv[name] = value
def addpair(adict, key, val):
    """Append val to the list at adict[key], creating the list on first use.

    Uses setdefault: one dict lookup instead of the old
    has_key()-then-index pair, and has_key() is deprecated.
    """
    adict.setdefault(key, []).append(val)
def main(argv):
if len(argv) != 1:
sys.stderr.write("Usage: %s <filesystemlisting" % argv[0])
return 1
# My first pass just used a hash of hashes. It grew bigger than
# my RAM and started thrashing.
# read in data
tmpfilename = tempfile.mktemp()
tmpfile = open(tmpfilename, 'w') # XXX security hole
while 1:
filerec = read_file_rec(sys.stdin)
if not filerec: break
if filerec['type'] != 'File': continue
tmpfile.write("%(sha-1)s %(inum.dev)s %(path)s %(bytes)s\n" % filerec)
tmpfile.close()
# break it out by SHA-1
os.system("sort -o %s %s" % (tmpfilename, tmpfilename))
tmpfile = open(tmpfilename)
# sort it by number of occurrences
tmpfilename2 = tempfile.mktemp()
tmpfile2 = open(tmpfilename2, 'w') # XXX security hole
oldsha1, inumpaths, oldbytes = None, [], None
while 1:
line = tmpfile.readline()
if line:
sha1, inumdev, path, bytes = string.split(line, ' ', 4)
if sha1 != oldsha1:
if oldsha1 is not None:
tmpfile2.write("%08d %s %s %s" % (count, oldsha1,
string.join(inumpaths),
oldbytes))
oldsha1, inumpaths, oldbytes, count = sha1, [], bytes, 0
if not line: break
count += 1
inumpaths.append(inumdev)
inumpaths.append(path)
tmpfile2.close()
os.system("sort -r -o %s %s" % (tmpfilename2, tmpfilename2))
# output it
tmpfile2 = open(tmpfilename2)
while 1:
line = tmpfile2.readline()
if not line: break
chunks = string.split(line)
count, inumpaths, bytes = eval(chunks[0]), chunks[2:-1], chunks[-1]
print "%d file%s of %d bytes:" % (
count, (count > 1 and 's' or ''), eval(bytes),
)
harddict = {}
while inumpaths:
inumdev, path = inumpaths[:2]
inumpaths = inumpaths[2:]
if not harddict.has_key(inumdev): harddict[inumdev] = []
harddict[inumdev].append(path)
for pathlist in harddict.values():
print " ", ' = '.join(pathlist)
return 0
# Script entry point: propagate main()'s return value as the exit status.
if __name__ == '__main__': sys.exit(main(sys.argv))