Re: [BUG] add_again() off-by-one error in custom format

René Scharfe Sun, 18 Jun 2017 03:59:26 -0700

Am 15.06.2017 um 15:25 schrieb Jeff King:
> On Thu, Jun 15, 2017 at 01:33:34PM +0200, René Scharfe wrote:
>>> I'm not really sure how, though, short of caching the directory
>>> contents. That opens up questions of whether and when to invalidate the
>>> cache. If the cache were _just_ about short hashes, it might be OK to
>>> just assume that it remains valid through the length of the program (so
>>> worst case, a simultaneous write might mean that we generate a sha1
>>> which just became ambiguous, but that's generally going to be racy
>>> anyway).
>>>
>>> The other downside of course is that we'd spend RAM on it. We could
>>> bound the size of the cache, I suppose.
>>
>> You mean like an in-memory pack index for loose objects?  In order to
>> avoid the readdir call in sha1_name.c::find_short_object_filename()?
>> We'd only need to keep the hashes of found objects.  An oid_array
>> would be quite compact.
> 
> Yes, that's what I was thinking of.


An oid_array spends ca. 30 bytes per entry (20 bytes for a hash times
a factor of 1.5 from alloc_nr).  How many loose objects do we have to
expect?  30 MB for a million of them sounds not too bad, but can there
realistically be orders of magnitude more?

>> Non-racy writes inside a process should not be ignored (write, read
>> later) -- e.g. checkout does something like that.
> 
> Ideally, yes. Though thinking on this more, it's racy even today,
> because we rely on the in-memory packed_git list. We usually re-check the
> directory only when we look for an object and it's missing. So any new
> packs which have been added while the process runs will be missed when
> doing short-sha1 lookups (and actually, find_short_packed_object does
> not even seem to do the usual retry-on-miss).
> 
> A normal process like "checkout" isn't writing new packs, but a
> simultaneous repack could be migrating its new objects to a pack behind
> its back. (It also _can_ write packs, but only for large blobs).
> 
> Given that we are talking about finding "unique" abbreviations here, and
> that those abbreviations can become invalidated immediately anyway as
> more objects are added (even by the same process), I'm not sure we need
> to strive for absolute accuracy.

Agreed.  And processes that add objects and use them later probably use
full hashes (at least checkout does).

So here's a patch for caching loose object hashes in an oid_array
without a way to invalidate or release the cache.  It uses a single
oid_array for simplicity, but reads each subdirectory individually and
remembers that fact using a bitmap.
---
 cache.h     |  4 ++++
 sha1_name.c | 69 +++++++++++++++++++++++++++++++++++++++++++------------------
 2 files changed, 53 insertions(+), 20 deletions(-)

diff --git a/cache.h b/cache.h
index 4d92aae0e8..8c6748907b 100644
--- a/cache.h
+++ b/cache.h
@@ -11,6 +11,7 @@
 #include "string-list.h"
 #include "pack-revindex.h"
 #include "hash.h"
+#include "sha1-array.h"
 
 #ifndef platform_SHA_CTX
 /*
@@ -1586,6 +1587,9 @@ extern struct alternate_object_database {
        struct strbuf scratch;
        size_t base_len;
 
+       uint32_t loose_objects_subdir_bitmap[8];
+       struct oid_array loose_objects_cache;
+
        char path[FLEX_ARRAY];
 } *alt_odb_list;
 extern void prepare_alt_odb(void);
diff --git a/sha1_name.c b/sha1_name.c
index 5126853bb5..ae6a5c3abe 100644
--- a/sha1_name.c
+++ b/sha1_name.c
@@ -77,10 +77,46 @@ static void update_candidates(struct disambiguate_state 
*ds, const struct object
        /* otherwise, current can be discarded and candidate is still good */
 }
 
+static void read_loose_object_subdir(struct alternate_object_database *alt,
+                                    int subdir_nr)
+{
+       struct strbuf *buf;
+       char hex[GIT_MAX_HEXSZ];
+       DIR *dir;
+       struct dirent *de;
+       size_t bitmap_index = subdir_nr / 32;
+       uint32_t bitmap_mask = 1 << (subdir_nr % 32);
+
+       if (alt->loose_objects_subdir_bitmap[bitmap_index] & bitmap_mask)
+               return;
+       alt->loose_objects_subdir_bitmap[bitmap_index] |= bitmap_mask;
+
+       buf = alt_scratch_buf(alt);
+       strbuf_addf(buf, "%02x/", subdir_nr);
+       xsnprintf(hex, sizeof(hex), "%02x", subdir_nr);
+
+       dir = opendir(buf->buf);
+       if (!dir)
+               return;
+
+       while ((de = readdir(dir)) != NULL) {
+               struct object_id oid;
+
+               if (strlen(de->d_name) != GIT_SHA1_HEXSZ - 2)
+                       continue;
+               memcpy(hex + 2, de->d_name, GIT_SHA1_HEXSZ - 2);
+               if (!get_oid_hex(hex, &oid))
+                       oid_array_append(&alt->loose_objects_cache, &oid);
+       }
+       closedir(dir);
+}
+
+static int match_sha(unsigned, const unsigned char *, const unsigned char *);
+
 static void find_short_object_filename(struct disambiguate_state *ds)
 {
+       int subdir_nr = ds->bin_pfx.hash[0];
        struct alternate_object_database *alt;
-       char hex[GIT_MAX_HEXSZ];
        static struct alternate_object_database *fakeent;
 
        if (!fakeent) {
@@ -95,29 +131,22 @@ static void find_short_object_filename(struct 
disambiguate_state *ds)
        }
        fakeent->next = alt_odb_list;
 
-       xsnprintf(hex, sizeof(hex), "%.2s", ds->hex_pfx);
        for (alt = fakeent; alt && !ds->ambiguous; alt = alt->next) {
-               struct strbuf *buf = alt_scratch_buf(alt);
-               struct dirent *de;
-               DIR *dir;
-
-               strbuf_addf(buf, "%.2s/", ds->hex_pfx);
-               dir = opendir(buf->buf);
-               if (!dir)
-                       continue;
+               int pos;
 
-               while (!ds->ambiguous && (de = readdir(dir)) != NULL) {
-                       struct object_id oid;
+               read_loose_object_subdir(alt, subdir_nr);
 
-                       if (strlen(de->d_name) != GIT_SHA1_HEXSZ - 2)
-                               continue;
-                       if (memcmp(de->d_name, ds->hex_pfx + 2, ds->len - 2))
-                               continue;
-                       memcpy(hex + 2, de->d_name, GIT_SHA1_HEXSZ - 2);
-                       if (!get_oid_hex(hex, &oid))
-                               update_candidates(ds, &oid);
+               pos = oid_array_lookup(&alt->loose_objects_cache, &ds->bin_pfx);
+               if (pos < 0)
+                       pos = -1 - pos;
+               while (!ds->ambiguous && pos < alt->loose_objects_cache.nr) {
+                       const struct object_id *oid;
+                       oid = alt->loose_objects_cache.oid + pos;
+                       if (!match_sha(ds->len, ds->bin_pfx.hash, oid->hash))
+                               break;
+                       update_candidates(ds, oid);
+                       pos++;
                }
-               closedir(dir);
        }
 }
 
-- 
2.13.0

Re: [BUG] add_again() off-by-one error in custom format

Reply via email to