Chad has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/330464 )

Change subject: Pull in all upstream changes from 
https://github.com/jedbrown/git-fat/blob/master/git-fat
......................................................................

Pull in all upstream changes from 
https://github.com/jedbrown/git-fat/blob/master/git-fat

Change-Id: I9916149b4f4e8cd16a384753fda6fd72346ff695
---
M README.md
M git-fat
M test-retroactive.sh
M test.sh
4 files changed, 134 insertions(+), 20 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/debs/git-fat refs/changes/64/330464/1

diff --git a/README.md b/README.md
index 3889ad2..1db7de8 100644
--- a/README.md
+++ b/README.md
@@ -4,7 +4,7 @@
 Some people recommend checking binaries into different repositories or even not versioning them at all, but these are not satisfying solutions for most workflows.
 
 ## Features of `git-fat`
-* clones of the source repository are small and fast because no binaries are transferred, yet fully functional (unlike `git clone --depth`)
+* clones of the source repository are small and fast because no binaries are transferred, yet fully functional with complete metadata and incremental retrieval (`git clone --depth` has limited granularity and couples metadata to content)
 * `git-fat` supports the same workflow for large binaries and traditionally versioned files, but internally manages the "fat" files separately
 * `git-bisect` works properly even when versions of the binary files change over time
 * selective control of which large files to pull into the local store
@@ -19,8 +19,9 @@
 # Installation and configuration
 Place `git-fat` in your `PATH`.
 
-Edit `.gitattributes` to regard any desired extensions as fat files.
+Edit (or create) `.gitattributes` to regard any desired extensions as fat files.
 
+    $ cd path-to-your-repository
     $ cat >> .gitattributes
     *.png filter=fat -crlf
     *.jpg filter=fat -crlf
diff --git a/git-fat b/git-fat
index 7edb7ba..dd6af72 100755
--- a/git-fat
+++ b/git-fat
@@ -15,6 +15,10 @@
 import time
 import collections
 
+if not type(sys.version_info) is tuple and sys.version_info.major > 2:
+    sys.stderr.write('git-fat does not support Python-3 yet.  Please use python2.\n')
+    sys.exit(1)
+
 try:
     from subprocess import check_output
     del check_output
@@ -105,8 +109,10 @@
     args.append(name)
     p = subprocess.Popen(args, stdout=subprocess.PIPE)
     output = p.communicate()[0].strip()
-    if p.returncode != 0:
+    if p.returncode and file is None:
         return None
+    elif p.returncode:
+        return gitconfig_get(name)
     else:
         return output
 def gitconfig_set(name, value, file=None):
@@ -120,7 +126,10 @@
     DecodeError = RuntimeError
     def __init__(self):
         self.verbose = verbose_stderr if os.environ.get('GIT_FAT_VERBOSE') else verbose_ignore
-        self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
+        try:
+            self.gitroot = subprocess.check_output('git rev-parse --show-toplevel'.split()).strip()
+        except subprocess.CalledProcessError:
+            sys.exit(1)
         self.gitdir = subprocess.check_output('git rev-parse --git-dir'.split()).strip()
         self.objdir = os.path.join(self.gitdir, 'fat', 'objects')
         if os.environ.get('GIT_FAT_VERSION') == '1':
@@ -133,6 +142,13 @@
         self.magiclens = [magiclen(enc) for enc in [self.encode_v1, self.encode_v2]] # All prior versions
     def setup(self):
         mkdir_p(self.objdir)
+    def is_init_done(self):
+        return gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge')
+    def assert_init_done(self):
+        if not self.is_init_done():
+            sys.stderr.write('fatal: git-fat is not yet configured in this repository.\n')
+            sys.stderr.write('Run "git fat init" to configure.\n')
+            sys.exit(1)
     def get_rsync(self):
         cfgpath   = os.path.join(self.gitroot,'.gitfat')
         remote    = gitconfig_get('rsync.remote', file=cfgpath)
@@ -193,7 +209,10 @@
             return itertools.chain([preamble], readblocks(stream)), None
     def decode_file(self, fname):
         # Fast check
-        stat = os.lstat(fname)
+        try:
+            stat = os.lstat(fname)
+        except OSError:
+            return False, None
         if stat.st_size != self.magiclen:
             return False, None
         # read file
@@ -282,25 +301,61 @@
             rev = '--all'
         elif rev is None:
             rev = self.revparse('HEAD')
+        # Revision list gives us object names to inspect with cat-file...
         p1 = subprocess.Popen(['git','rev-list','--objects',rev], stdout=subprocess.PIPE)
-        p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
         def cut_sha1hash(input, output):
             for line in input:
                 output.write(line.split()[0] + '\n')
             output.close()
+        # ...`cat-file --batch-check` filters for git-fat object candidates in bulk...
+        p2 = subprocess.Popen(['git','cat-file','--batch-check'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        def filter_gitfat_candidates(input, output):
+            for line in input:
+                objhash, objtype, size = line.split()
+                if objtype == 'blob' and int(size) in self.magiclens:
+                    output.write(objhash + '\n')
+            output.close()
+        # ...`cat-file --batch` provides full contents of git-fat candidates in bulk
+        p3 = subprocess.Popen(['git','cat-file','--batch'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+        # Stream data: p1 | cut_thread | p2 | filter_thread | p3
         cut_thread = threading.Thread(target=cut_sha1hash, args=(p1.stdout, p2.stdin))
+        filter_thread = threading.Thread(target=filter_gitfat_candidates, args=(p2.stdout, p3.stdin))
         cut_thread.start()
-        for line in p2.stdout:
-            objhash, objtype, size = line.split()
-            if objtype == 'blob' and int(size) in self.magiclens:
-                try:
-                    fathash = self.decode(subprocess.check_output(['git', 'cat-file', '-p', objhash]))[0]
-                    referenced.add(fathash)
-                except GitFat.DecodeError:
-                    pass
+        filter_thread.start()
+        # Process metadata + content format provided by `cat-file --batch`
+        while True:
+            metadata_line = p3.stdout.readline()
+            if not metadata_line:
+                break  # EOF
+            objhash, objtype, size_str = metadata_line.split()
+            size, bytes_read = int(size_str), 0
+            # We know from filter that item is a candidate git-fat object and
+            # is small enough to read into memory and process
+            content = ''
+            while bytes_read < size:
+                data = p3.stdout.read(size - bytes_read)
+                if not data:
+                    break  # EOF
+                content += data
+                bytes_read += len(data)
+            try:
+                fathash = self.decode(content)[0]
+                referenced.add(fathash)
+            except GitFat.DecodeError:
+                pass
+            # Consume LF record delimiter in `cat-file --batch` output
+            bytes_read = 0
+            while bytes_read < 1:
+                data = p3.stdout.read(1)
+                if not data:
+                    break  # EOF
+                bytes_read += len(data)
+        # Ensure everything is cleaned up
         cut_thread.join()
+        filter_thread.join()
         p1.wait()
         p2.wait()
+        p3.wait()
         return referenced
 
     def orphan_files(self, patterns=[]):
@@ -343,8 +398,11 @@
         self.verbose('Executing: %s' % ' '.join(cmd))
         p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
         p.communicate(input='\x00'.join(files))
+        if p.returncode:
+            sys.exit(p.returncode)
     def checkout(self, show_orphans=False):
         'Update any stale files in the present working tree'
+        self.assert_init_done()
         for digest, fname in self.orphan_files():
             objpath = os.path.join(self.objdir, digest)
             if os.access(objpath, os.R_OK):
@@ -377,6 +435,8 @@
         self.verbose('Executing: %s' % ' '.join(cmd))
         p = subprocess.Popen(cmd, stdin=subprocess.PIPE)
         p.communicate(input='\x00'.join(files))
+        if p.returncode:
+            sys.exit(p.returncode)
         self.checkout()
 
     def parse_pull_patterns(self, args):
@@ -405,9 +465,27 @@
             fname = os.path.join(self.objdir, obj)
             print('%10d %s' % (os.stat(fname).st_size, obj))
             os.remove(fname)
+
+    def cmd_verify(self):
+        """Print details of git-fat objects with incorrect data hash"""
+        corrupted_objects = []
+        for obj in self.catalog_objects():
+            fname = os.path.join(self.objdir, obj)
+            h = hashlib.new('sha1')
+            for block in readblocks(open(fname)):
+                h.update(block)
+            data_hash = h.hexdigest()
+            if obj != data_hash:
+                corrupted_objects.append((obj, data_hash))
+        if corrupted_objects:
+            print('Corrupted objects: %d' % len(corrupted_objects))
+            for obj, data_hash in corrupted_objects:
+                print('%s data hash is %s' % (obj, data_hash))
+            sys.exit(1)
+
     def cmd_init(self):
         self.setup()
-        if gitconfig_get('filter.fat.clean') or gitconfig_get('filter.fat.smudge'):
+        if self.is_init_done():
             print('Git fat already configured, check configuration in .git/config')
         else:
             gitconfig_set('filter.fat.clean', 'git-fat filter-clean')
@@ -459,7 +537,7 @@
         time1 = time.time()
         self.verbose('Found %d paths in %.3f s' % (len(pathsizes), time1-time0))
         maxlen = max(map(len,pathsizes)) if pathsizes else 0
-        for path, sizes in sorted(pathsizes.items(), cmp=lambda (p1,s1),(p2,s2): cmp(max(s1),max(s2)), reverse=True):
+        for path, sizes in sorted(pathsizes.items(), key=lambda item: max(item[1]), reverse=True):
             print('%-*s filter=fat -text # %10d %d' % (maxlen, path,max(sizes),len(sizes)))
         revlist.wait()
         difftree.wait()
@@ -530,6 +608,8 @@
         fat.cmd_pull(sys.argv[2:])
     elif cmd == 'gc':
         fat.cmd_gc()
+    elif cmd == 'verify':
+        fat.cmd_verify()
     elif cmd == 'checkout':
         fat.cmd_checkout(sys.argv[2:])
     elif cmd == 'find':
@@ -537,4 +617,4 @@
     elif cmd == 'index-filter':
         fat.cmd_index_filter(sys.argv[2:])
     else:
-        print('Usage: git fat [init|status|push|pull|gc|checkout|find|index-filter]', file=sys.stderr)
+        print('Usage: git fat [init|status|push|pull|gc|verify|checkout|find|index-filter]', file=sys.stderr)
diff --git a/test-retroactive.sh b/test-retroactive.sh
index dd02367..51a38ec 100755
--- a/test-retroactive.sh
+++ b/test-retroactive.sh
@@ -1,8 +1,11 @@
 #!/bin/sh -ex
 
+fullpath() { echo "`pwd`/$1"; }
+
 git init retro
 cd retro
 cp /usr/share/dict/words words.big
+chmod u+w words.big
 git add words.big
 git commit -m'Add big file without using git-fat'
 sort words.big > sorted.big
@@ -25,7 +28,7 @@
 git log --stat
 
 git fat find 10000 | awk '{print $1}' > fat-files
-git filter-branch --index-filter "git fat index-filter $(realpath fat-files) --manage-gitattributes" --tag-name-filter cat -- --all
+git filter-branch --index-filter "git fat index-filter $(fullpath fat-files) --manage-gitattributes" --tag-name-filter cat -- --all
 
 git log --stat
 git checkout HEAD^
@@ -37,14 +40,14 @@
 git checkout master
 cat > .gitfat <<EOF
 [rsync]
-remote = $(realpath ../retro-store)
+remote = $(fullpath ../retro-store)
 EOF
 git add .gitfat
 git commit -m'Add .gitfat for local push'
 git fat push
 
 cd ..
-git clone file:///$(realpath retro) retro-clone
+git clone file:///$(fullpath retro) retro-clone
 cd retro-clone
 git fat init
 git fat pull
diff --git a/test.sh b/test.sh
index e4ee3bb..0ee63ea 100755
--- a/test.sh
+++ b/test.sh
@@ -2,6 +2,9 @@
 # Any copyright is dedicated to the Public Domain.
 # http://creativecommons.org/publicdomain/zero/1.0/
 
+# Clear out repos and fat store from prior test runs
+rm -fR fat-test fat-test2 /tmp/fat-store
+
 git init fat-test
 cd fat-test
 git fat init
@@ -29,6 +32,33 @@
 cd ..
 git clone fat-test fat-test2
 cd fat-test2
+# checkout and pull should fail in repo not yet init'ed for git-fat
+git fat checkout && true
+if [ $? -eq 0 ]
+then
+    echo 'ERROR: "git fat checkout" in uninitialised repo should fail'
+    exit 1
+fi
+git fat pull -- 'a.fa*' && true
+if [ $? -eq 0 ]
+then
+    echo 'ERROR: "git fat pull" in uninitialised repo should fail'
+    exit 1
+fi
 git fat init
 git fat pull -- 'a.fa*'
 cat a.fat
+echo 'file which is committed and removed afterwards' > d
+git add d
+git commit -m'add d with normal content'
+rm d
+git fat pull
+
+# Check verify command finds corrupt object
+mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8 \
+   .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak
+echo "Not the right data" > .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8
+git fat verify && true
+if [ $? -eq 0 ]; then echo "Verify did not detect invalid object"; exit 1; fi
+mv .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8.bak \
+   .git/fat/objects/6ecec2e21d3033e7ba53e2db63f69dbd3a011fa8

-- 
To view, visit https://gerrit.wikimedia.org/r/330464
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I9916149b4f4e8cd16a384753fda6fd72346ff695
Gerrit-PatchSet: 1
Gerrit-Project: operations/debs/git-fat
Gerrit-Branch: master
Gerrit-Owner: Chad <ch...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to