Re: [PATCH 12/23] midx: write object ids in a chunk

2018-06-09 Thread Duy Nguyen
On Thu, Jun 7, 2018 at 4:07 PM Derrick Stolee  wrote:
>
> Signed-off-by: Derrick Stolee 
> ---
>  Documentation/technical/pack-format.txt |  4 ++
>  builtin/midx.c  |  2 +
>  midx.c  | 50 +++--
>  object-store.h  |  1 +
>  t/t5319-midx.sh |  4 +-
>  5 files changed, 55 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/technical/pack-format.txt 
> b/Documentation/technical/pack-format.txt
> index 29bf87283a..de9ac778b6 100644
> --- a/Documentation/technical/pack-format.txt
> +++ b/Documentation/technical/pack-format.txt
> @@ -307,6 +307,10 @@ CHUNK DATA:
> name. This is the only chunk not guaranteed to be a multiple of 
> four
> bytes in length, so should be the last chunk for alignment 
> reasons.
>
> +   OID Lookup (ID: {'O', 'I', 'D', 'L'}) (N * H bytes)

So N is the number of objects and H is hash size? Please don't let me guess.

> +   The OIDs for all objects in the MIDX are stored in lexicographic
> +   order in this chunk.

The reason we keep all hashes together, packed right, is to reduce
cache footprint. Another observation is it takes us usually just 12
bytes or less to uniquely identify an object, which means we could
pack even tighter if we split he object hash into two chunks, do
bsearch in the first chunk with just  bytes then verify that the
remaining 20- bytes is matched in the second chunk. This may matter
more when we move to larger hashes. The split would of course be
configurable since different project may have different optimal value,
but default value could be 10/10 bytes.

> +
> (This section intentionally left incomplete.)
>
>  TRAILER:
> diff --git a/builtin/midx.c b/builtin/midx.c
> index 3a261e9bbf..86edd30174 100644
> --- a/builtin/midx.c
> +++ b/builtin/midx.c
> @@ -35,6 +35,8 @@ static int read_midx_file(const char *object_dir)
> printf(" pack_lookup");
> if (m->chunk_pack_names)
> printf(" pack_names");
> +   if (m->chunk_oid_lookup)
> +   printf(" oid_lookup");
>
> printf("\n");
>
> diff --git a/midx.c b/midx.c
> index b20d52713c..d06bc6876a 100644
> --- a/midx.c
> +++ b/midx.c
> @@ -14,10 +14,11 @@
>  #define MIDX_HASH_LEN 20
>  #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
>
> -#define MIDX_MAX_CHUNKS 2
> +#define MIDX_MAX_CHUNKS 3
>  #define MIDX_CHUNK_ALIGNMENT 4
>  #define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
>  #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
> +#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
>  #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
>
>  static char *get_midx_filename(const char *object_dir)
> @@ -95,6 +96,10 @@ struct midxed_git *load_midxed_git(const char *object_dir)
> m->chunk_pack_names = m->data + chunk_offset;
> break;
>
> +   case MIDX_CHUNKID_OIDLOOKUP:
> +   m->chunk_oid_lookup = m->data + chunk_offset;
> +   break;


I just now realized, how do you protect from duplicate chunks? From
this patch, it looks like you could accept two oidlookup chunks just
fine then siliently ignore the first one.

> case 0:
> die("terminating MIDX chunk id appears 
> earlier than expected");
> break;
> @@ -112,6 +117,8 @@ struct midxed_git *load_midxed_git(const char *object_dir)
> die("MIDX missing required pack lookup chunk");
> if (!m->chunk_pack_names)
> die("MIDX missing required pack-name chunk");
> +   if (!m->chunk_oid_lookup)
> +   die("MIDX missing required OID lookup chunk");

_()

>
> m->pack_names = xcalloc(m->num_packs, sizeof(const char *));
> for (i = 0; i < m->num_packs; i++) {
> @@ -370,6 +377,32 @@ static size_t write_midx_pack_names(struct hashfile *f,
> return written;
>  }
>
> +static size_t write_midx_oid_lookup(struct hashfile *f, unsigned char 
> hash_len,
> +   struct pack_midx_entry *objects,
> +   uint32_t nr_objects)
> +{
> +   struct pack_midx_entry *list = objects;
> +   uint32_t i;
> +   size_t written = 0;
> +
> +   for (i = 0; i < nr_objects; i++) {
> +   struct pack_midx_entry *obj = list++;
> +
> +   if (i < nr_objects - 1) {
> +   struct pack_midx_entry *next = list;
> +   if (oidcmp(>oid, >oid) >= 0)
> +   BUG("OIDs not in order: %s >= %s",
> +   oid_to_hex(>oid),
> +   oid_to_hex(>oid));

Indentation. I almost thought oid_to_hex() was a separate statement.

> +   }
> +
> +  

[PATCH 12/23] midx: write object ids in a chunk

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 Documentation/technical/pack-format.txt |  4 ++
 builtin/midx.c  |  2 +
 midx.c  | 50 +++--
 object-store.h  |  1 +
 t/t5319-midx.sh |  4 +-
 5 files changed, 55 insertions(+), 6 deletions(-)

diff --git a/Documentation/technical/pack-format.txt 
b/Documentation/technical/pack-format.txt
index 29bf87283a..de9ac778b6 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -307,6 +307,10 @@ CHUNK DATA:
name. This is the only chunk not guaranteed to be a multiple of four
bytes in length, so should be the last chunk for alignment reasons.
 
+   OID Lookup (ID: {'O', 'I', 'D', 'L'}) (N * H bytes)
+   The OIDs for all objects in the MIDX are stored in lexicographic
+   order in this chunk.
+
(This section intentionally left incomplete.)
 
 TRAILER:
diff --git a/builtin/midx.c b/builtin/midx.c
index 3a261e9bbf..86edd30174 100644
--- a/builtin/midx.c
+++ b/builtin/midx.c
@@ -35,6 +35,8 @@ static int read_midx_file(const char *object_dir)
printf(" pack_lookup");
if (m->chunk_pack_names)
printf(" pack_names");
+   if (m->chunk_oid_lookup)
+   printf(" oid_lookup");
 
printf("\n");
 
diff --git a/midx.c b/midx.c
index b20d52713c..d06bc6876a 100644
--- a/midx.c
+++ b/midx.c
@@ -14,10 +14,11 @@
 #define MIDX_HASH_LEN 20
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
 
-#define MIDX_MAX_CHUNKS 2
+#define MIDX_MAX_CHUNKS 3
 #define MIDX_CHUNK_ALIGNMENT 4
 #define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
+#define MIDX_CHUNKID_OIDLOOKUP 0x4f49444c /* "OIDL" */
 #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
 
 static char *get_midx_filename(const char *object_dir)
@@ -95,6 +96,10 @@ struct midxed_git *load_midxed_git(const char *object_dir)
m->chunk_pack_names = m->data + chunk_offset;
break;
 
+   case MIDX_CHUNKID_OIDLOOKUP:
+   m->chunk_oid_lookup = m->data + chunk_offset;
+   break;
+
case 0:
die("terminating MIDX chunk id appears earlier 
than expected");
break;
@@ -112,6 +117,8 @@ struct midxed_git *load_midxed_git(const char *object_dir)
die("MIDX missing required pack lookup chunk");
if (!m->chunk_pack_names)
die("MIDX missing required pack-name chunk");
+   if (!m->chunk_oid_lookup)
+   die("MIDX missing required OID lookup chunk");
 
m->pack_names = xcalloc(m->num_packs, sizeof(const char *));
for (i = 0; i < m->num_packs; i++) {
@@ -370,6 +377,32 @@ static size_t write_midx_pack_names(struct hashfile *f,
return written;
 }
 
+static size_t write_midx_oid_lookup(struct hashfile *f, unsigned char hash_len,
+   struct pack_midx_entry *objects,
+   uint32_t nr_objects)
+{
+   struct pack_midx_entry *list = objects;
+   uint32_t i;
+   size_t written = 0;
+
+   for (i = 0; i < nr_objects; i++) {
+   struct pack_midx_entry *obj = list++;
+
+   if (i < nr_objects - 1) {
+   struct pack_midx_entry *next = list;
+   if (oidcmp(>oid, >oid) >= 0)
+   BUG("OIDs not in order: %s >= %s",
+   oid_to_hex(>oid),
+   oid_to_hex(>oid));
+   }
+
+   hashwrite(f, obj->oid.hash, (int)hash_len);
+   written += hash_len;
+   }
+
+   return written;
+}
+
 int write_midx_file(const char *object_dir)
 {
unsigned char cur_chunk, num_chunks = 0;
@@ -389,6 +422,7 @@ int write_midx_file(const char *object_dir)
uint64_t written = 0;
uint32_t chunk_ids[MIDX_MAX_CHUNKS + 1];
uint64_t chunk_offsets[MIDX_MAX_CHUNKS + 1];
+   struct pack_midx_entry *entries;
uint32_t nr_entries;
 
midx_name = get_midx_filename(object_dir);
@@ -448,14 +482,14 @@ int write_midx_file(const char *object_dir)
ALLOC_ARRAY(pack_perm, nr_packs);
sort_packs_by_name(pack_names, nr_packs, pack_perm);
 
-   get_sorted_entries(packs, pack_perm, nr_packs, _entries);
+   entries = get_sorted_entries(packs, pack_perm, nr_packs, _entries);
 
hold_lock_file_for_update(, midx_name, LOCK_DIE_ON_ERROR);
f = hashfd(lk.tempfile->fd, lk.tempfile->filename.buf);
FREE_AND_NULL(midx_name);
 
cur_chunk = 0;
-   num_chunks = 2;
+   num_chunks = 3;
 
written = write_midx_header(f, num_chunks,