Chad has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/330464 )
Change subject: Pull in all upstream changes from https://github.com/jedbrown/git-fat/blob/master/git-fat ...................................................................... Pull in all upstream changes from https://github.com/jedbrown/git-fat/blob/master/git-fat Change-Id: I9916149b4f4e8cd16a384753fda6fd72346ff695 --- M README.md M git-fat M test-retroactive.sh M test.sh 4 files changed, 134 insertions(+), 20 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/operations/debs/git-fat refs/changes/64/330464/1 diff --git a/README.md b/README.md index 3889ad2..1db7de8 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ Some people recommend checking binaries into different repositories or even not versioning them at all, but these are not satisfying solutions for most workflows. ## Features of `git-fat` -* clones of the source repository are small and fast because no binaries are transferred, yet fully functional (unlike `git clone --depth`) +* clones of the source repository are small and fast because no binaries are transferred, yet fully functional with complete metadata and incremental retrieval (`git clone --depth` has limited granularity and couples metadata to content) * `git-fat` supports the same workflow for large binaries and traditionally versioned files, but internally manages the "fat" files separately * `git-bisect` works properly even when versions of the binary files change over time * selective control of which large files to pull into the local store @@ -19,8 +19,9 @@ # Installation and configuration Place `git-fat` in your `PATH`. -Edit `.gitattributes` to regard any desired extensions as fat files. +Edit (or create) `.gitattributes` to regard any desired extensions as fat files. + $ cd path-to-your-repository $ cat >> .gitattributes *.png filter=fat -crlf *.jpg filter=fat -crlf diff --git a/git-fat b/git-fat index 7edb7ba..dd6af72 100755 --- a/git-fat +++ b/git-fat @@ -15,6 +15,10 @@ import time import collections +if not type(sys.version_info) is tuple and sys.version_info.major > 2: + sys.stderr.write('git-fat does not support Python-3 yet. Please use python2.\n') + sys.exit(1) + try: from subprocess import check_output del check_output @@ -105,8 +109,10 @@ args.append(name) p = subprocess.Popen(args, stdout=subprocess.PIPE) output = p.communicate()[0].strip() - if p.returncode != 0: + if p.returncode and file is None: return None + elif p.returncode: + return gitconfig_get(name) else: return output def gitconfig_set(name, value, file=None): @@ -120,7 +126,10 @@ DecodeError = RuntimeError def __init__(self): self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore - self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() + try: + self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip() + except subprocess.CalledProcessError: + sys.exit(1) self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip() self.objdir = os.path.join(self.gitdir, 'fat', 'objects') if os.environ.get('GIT_FAT_VERSION') == '1': @@ -133,6 +142,13 @@ self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions def setup(self): mkdir_p(self.objdir) + def is_init_done(self): + return gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge') + def assert_init_done(self): + if not self.is_init_done(): + sys.stderr.write('fatal: git-fat is not yet configured in this repository.\n') + sys.stderr.write('Run "git fat init" to configure.\n') + sys.exit(1) def get_rsync(self): cfgpath = os.path.join(self.gitroot,'.gitfat') remote = gitconfig_get('rsync.remote', file=cfgpath) @@ -193,7 +209,10 @@ return itertools.chain([preamble], readblocks(stream)), None def decode_file(self, fname): # Fast check - stat = os.lstat(fname) + try: + stat = os.lstat(fname) + except OSError: + return False, None if stat.st_size != self.magiclen: return False, None # read file @@ -282,25 +301,61 @@ rev = '--all' elif rev is None: rev = self.revparse('HEAD') + # Revision list gives us object names to inspect with cat-file... p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE) - p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) def cut_sha1hash(input, output): for line in input: output.write(line.split()[0] + '\n') output.close() + # ...`cat-file --batch-check` filters for git-fat object candidates in bulk... + p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + def filter_gitfat_candidates(input, output): + for line in input: + objhash, objtype, size = line.split() + if objtype == 'blob' and int(size) in self.magiclens: + output.write(objhash + '\n') + output.close() + # ...`cat-file --batch` provides full contents of git-fat candidates in bulk + p3 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE) + # Stream data: p1 | cut_thread | p2 | filter_thread | p3 cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin)) + filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin)) cut_thread.start() - for line in p2.stdout: - objhash, objtype, size = line.split() - if objtype == 'blob' and int(size) in self.magiclens: - try: - fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0] - referenced.add(fathash) - except GitFat.DecodeError: - pass + filter_thread.start() + # Process metadata + content format provided by `cat-file --batch` + while True: + metadata_line = p3.stdout.readline() + if not metadata_line: + break # EOF + objhash, objtype, size_str = metadata_line.split() + size, bytes_read = int(size_str), 0 + # We know from filter that item is a candidate git-fat object and + # is small enough to read into memory and process + content = '' + while bytes_read < size: + data = p3.stdout.read(size - bytes_read) + if not data: + break # EOF + content += data + bytes_read += len(data) + try: + fathash = self.decode(content)[0] + referenced.add(fathash) + except GitFat.DecodeError: + pass + # Consume LF record delimiter in `cat-file --batch` output + bytes_read = 0 + while bytes_read < 1: + data = p3.stdout.read(1) + if not data: + break # EOF + bytes_read += len(data) + # Ensure everything is cleaned up cut_thread.join() + filter_thread.join() p1.wait() p2.wait() + p3.wait() return referenced def orphan_files(self, patterns=[]): @@ -343,8 +398,11 @@ self.verbose('Executing: %s' % ' '.join(cmd)) p = subprocess.Popen(cmd, stdin=subprocess.PIPE) p.communicate(input='\x00'.join(files)) + if p.returncode: + sys.exit(p.returncode) def checkout(self, show_orphans=False): 'Update any stale files in the present working tree' + self.assert_init_done() for digest, fname in self.orphan_files(): objpath = os.path.join(self.objdir, digest) if os.access(objpath, os.R_OK): @@ -377,6 +435,8 @@ self.verbose('Executing: %s' % ' '.join(cmd)) p = subprocess.Popen(cmd, stdin=subprocess.PIPE) p.communicate(input='\x00'.join(files)) + if p.returncode: + sys.exit(p.returncode) self.checkout() def parse_pull_patterns(self, args): @@ -405,9 +465,27 @@ fname = os.path.join(self.objdir, obj) print('%10d %s' % (os.stat(fname).st_size, obj)) os.remove(fname) + + def cmd_verify(self): + """Print details of git-fat objects with incorrect data hash""" + corrupted_objects = [] + for obj in self.catalog_objects(): + fname = os.path.join(self.objdir, obj) + h = hashlib.new('sha1') + for block in readblocks(open(fname)): + h.update(block) + data_hash = h.hexdigest() + if obj != data_hash: + corrupted_objects.append((obj, data_hash)) + if corrupted_objects: + print('Corrupted objects: %d' % len(corrupted_objects)) + for obj, data_hash in corrupted_objects: + print('%s data hash is %s' % (obj, data_hash)) + sys.exit(1) + def cmd_init(self): self.setup() - if gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge'): + if self.is_init_done(): print('Git fat already configured, check configuration in .git/config') else: gitconfig_set('filter.fat.clean', 'git-fat filter-clean') @@ -459,7 +537,7 @@ time1 = time.time() self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0)) maxlen = max(map(len,pathsizes)) if pathsizes else 0 - for path, sizes in sorted(pathsizes.items(), cmp=lambda (p1,s1),(p2,s2): cmp(max(s1),max(s2)), reverse=True): + for path, sizes in sorted(pathsizes.items(), key=lambda p,s: max(s), reverse=True): print('%-*s filter=fat -text # %10d %d' % (maxlen, path,max(sizes),len(sizes))) revlist.wait() difftree.wait() @@ -530,6 +608,8 @@ fat.cmd_pull(sys.argv[2:]) elif cmd == 'gc': fat.cmd_gc() + elif cmd == 'verify': + fat.cmd_verify() elif cmd == 'checkout': fat.cmd_checkout(sys.argv[2:]) elif cmd == 'find': @@ -537,4 +617,4 @@ elif cmd == 'index-filter': fat.cmd_index_filter(sys.argv[2:]) else: - print('Usage: git fat [init|status|push|pull|gc|checkout|find|index-filter]', file=sys.stderr) + print('Usage: git fat [init|status|push|pull|gc|verify|checkout|find|index-filter]', file=sys.stderr) diff --git a/test-retroactive.sh b/test-retroactive.sh index dd02367..51a38ec 100755 --- a/test-retroactive.sh +++ b/test-retroactive.sh @@ -1,8 +1,11 @@ #!/bin/sh -ex +fullpath() { echo "`pwd`/$1"; } + git init retro cd retro cp /usr/share/dict/words words.big +chmod u+w words.big git add words.big git commit -m'Add big file without using git-fat' sort words.big > sorted.big @@ -25,7 +28,7 @@ git log --stat git fat find 10000 | awk '{print $1}' > fat-files -git filter-branch --index-filter "git fat index-filter $(realpath fat-files) --manage-gitattributes" --tag-name-filter cat -- --all +git filter-branch --index-filter "git fat index-filter $(fullpath fat-files) --manage-gitattributes" --tag-name-filter cat -- --all git log --stat git checkout HEAD^ @@ -37,14 +40,14 @@ git checkout master cat > .gitfat <<EOF [rsync] -remote = $(realpath ../retro-store) +remote = $(fullpath ../retro-store) EOF git add .gitfat git commit -m'Add .gitfat for local push' git fat push cd .. -git clone file:///$(realpath retro) retro-clone +git clone file:///$(fullpath retro) retro-clone cd retro-clone git fat init git fat pull diff --git a/test.sh b/test.sh index e4ee3bb..0ee63ea 100755 --- a/test.sh +++ b/test.sh @@ -2,6 +2,9 @@ # Any copyright is dedicated to the Public Domain. # http://creativecommons.org/publicdomain/zero/1.0/ +# Clear out repos and fat store from prior test runs +rm -fR fat-test fat-test2 /tmp/fat-store + git init fat-test cd fat-test git fat init @@ -29,6 +32,33 @@ cd .. git clone fat-test fat-test2 cd fat-test2 +# checkout and pull should fail in repo not yet init'ed for git-fat +git fat checkout && true +if [ $? -eq 0 ] +then + echo 'ERROR: "git fat checkout" in uninitialised repo should fail' + exit 1 +fi +git fat pull -- 'a.fa*' && true +if [ $? -eq 0 ] +then + echo 'ERROR: "git fat pull" in uninitialised repo should fail' + exit 1 +fi git fat init git fat pull -- 'a.fa*' cat a.fat +echo 'file which is committed and removed afterwards' > d +git add d +git commit -m'add d with normal content' +rm d +git fat pull + +# Check verify command finds corrupt object +mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 \ + .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak +echo "Not the right data" > .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 +git fat verify && true +if [ $? -eq 0 ]; then echo "Verify did not detect invalid object"; exit 1; fi +mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak \ + .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 -- To view, visit https://gerrit.wikimedia.org/r/330464 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I9916149b4f4e8cd16a384753fda6fd72346ff695 Gerrit-PatchSet: 1 Gerrit-Project: operations/debs/git-fat Gerrit-Branch: master Gerrit-Owner: Chad <ch...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits