Re: [PATCH 10/23] midx: write a lookup into the pack names chunk

2018-06-21 Thread Derrick Stolee




On 6/9/2018 12:43 PM, Duy Nguyen wrote:

On Thu, Jun 7, 2018 at 7:01 PM Derrick Stolee  wrote:

Signed-off-by: Derrick Stolee 
---
  Documentation/technical/pack-format.txt |  5 +++
  builtin/midx.c  |  7 
  midx.c  | 56 +++--
  object-store.h  |  2 +
  t/t5319-midx.sh | 11 +++--
  5 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/Documentation/technical/pack-format.txt 
b/Documentation/technical/pack-format.txt
index 2b37be7b33..29bf87283a 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -296,6 +296,11 @@ CHUNK LOOKUP:

  CHUNK DATA:

+   Packfile Name Lookup (ID: {'P', 'L', 'O', 'O'}) (P * 4 bytes)
+   P * 4 bytes storing the offset in the packfile name chunk for
+   the null-terminated string containing the filename for the
+   ith packfile.
+

Commit message is too light on this one. Why does this need to be
stored? Isn't the cost of rebuilding this in-core cheap?

Adding this chunk on disk in my opinion only adds more burden. Now you
have to verify that these offsets actually point to the right place.
This is a very good point. I'll drop the chunk and just read the names 
directly to construct the array of strings.


Thanks,
-Stolee


Re: [PATCH 10/23] midx: write a lookup into the pack names chunk

2018-06-09 Thread Duy Nguyen
On Thu, Jun 7, 2018 at 7:01 PM Derrick Stolee  wrote:
>
> Signed-off-by: Derrick Stolee 
> ---
>  Documentation/technical/pack-format.txt |  5 +++
>  builtin/midx.c  |  7 
>  midx.c  | 56 +++--
>  object-store.h  |  2 +
>  t/t5319-midx.sh | 11 +++--
>  5 files changed, 75 insertions(+), 6 deletions(-)
>
> diff --git a/Documentation/technical/pack-format.txt 
> b/Documentation/technical/pack-format.txt
> index 2b37be7b33..29bf87283a 100644
> --- a/Documentation/technical/pack-format.txt
> +++ b/Documentation/technical/pack-format.txt
> @@ -296,6 +296,11 @@ CHUNK LOOKUP:
>
>  CHUNK DATA:
>
> +   Packfile Name Lookup (ID: {'P', 'L', 'O', 'O'}) (P * 4 bytes)
> +   P * 4 bytes storing the offset in the packfile name chunk for
> +   the null-terminated string containing the filename for the
> +   ith packfile.
> +

Commit message is too light on this one. Why does this need to be
stored? Isn't the cost of rebuilding this in-core cheap?

Adding this chunk on disk in my opinion only adds more burden. Now you
have to verify that these offsets actually point to the right place.

> Packfile Names (ID: {'P', 'N', 'A', 'M'})
> Stores the packfile names as concatenated, null-terminated 
> strings.
> Packfiles must be listed in lexicographic order for fast lookups 
> by
> diff --git a/builtin/midx.c b/builtin/midx.c
> index fe56560853..3a261e9bbf 100644
> --- a/builtin/midx.c
> +++ b/builtin/midx.c
> @@ -16,6 +16,7 @@ static struct opts_midx {
>
>  static int read_midx_file(const char *object_dir)
>  {
> +   uint32_t i;
> struct midxed_git *m = load_midxed_git(object_dir);
>
> if (!m)
> @@ -30,11 +31,17 @@ static int read_midx_file(const char *object_dir)
>
> printf("chunks:");
>
> +   if (m->chunk_pack_lookup)
> +   printf(" pack_lookup");
> if (m->chunk_pack_names)
> printf(" pack_names");
>
> printf("\n");
>
> +   printf("packs:\n");
> +   for (i = 0; i < m->num_packs; i++)
> +   printf("%s\n", m->pack_names[i]);
> +
> printf("object_dir: %s\n", m->object_dir);
>
> return 0;
> diff --git a/midx.c b/midx.c
> index d4f4a01a51..923acda72e 100644
> --- a/midx.c
> +++ b/midx.c
> @@ -13,8 +13,9 @@
>  #define MIDX_HASH_LEN 20
>  #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
>
> -#define MIDX_MAX_CHUNKS 1
> +#define MIDX_MAX_CHUNKS 2
>  #define MIDX_CHUNK_ALIGNMENT 4
> +#define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
>  #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
>  #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
>
> @@ -85,6 +86,10 @@ struct midxed_git *load_midxed_git(const char *object_dir)
> uint64_t chunk_offset = get_be64(m->data + 16 + 
> MIDX_CHUNKLOOKUP_WIDTH * i);
>
> switch (chunk_id) {
> +   case MIDX_CHUNKID_PACKLOOKUP:
> +   m->chunk_pack_lookup = (uint32_t *)(m->data + 
> chunk_offset);
> +   break;
> +
> case MIDX_CHUNKID_PACKNAMES:
> m->chunk_pack_names = m->data + chunk_offset;
> break;
> @@ -102,9 +107,32 @@ struct midxed_git *load_midxed_git(const char 
> *object_dir)
> }
> }
>
> +   if (!m->chunk_pack_lookup)
> +   die("MIDX missing required pack lookup chunk");
> if (!m->chunk_pack_names)
> die("MIDX missing required pack-name chunk");
>
> +   m->pack_names = xcalloc(m->num_packs, sizeof(const char *));
> +   for (i = 0; i < m->num_packs; i++) {
> +   if (i) {
> +   if (ntohl(m->chunk_pack_lookup[i]) <= 
> ntohl(m->chunk_pack_lookup[i - 1])) {
> +   error("MIDX pack lookup value %d before %d",
> + ntohl(m->chunk_pack_lookup[i - 1]),
> + ntohl(m->chunk_pack_lookup[i]));
> +   goto cleanup_fail;
> +   }
> +   }
> +
> +   m->pack_names[i] = (const char *)(m->chunk_pack_names + 
> ntohl(m->chunk_pack_lookup[i]));
> +
> +   if (i && strcmp(m->pack_names[i], m->pack_names[i - 1]) <= 0) 
> {
> +   error("MIDX pack names out of order: '%s' before 
> '%s'",
> + m->pack_names[i - 1],
> + m->pack_names[i]);
> +   goto cleanup_fail;
> +   }
> +   }
> +
> return m;
>
>  cleanup_fail:
> @@ -162,6 +190,20 @@ static void sort_packs_by_name(char **pack_names, 
> uint32_t nr_packs, uint32_t *p
> }
>  }
>
> +static size_t write_midx_pack_lookup(struct hashfile *f,
> +   

[PATCH 10/23] midx: write a lookup into the pack names chunk

2018-06-07 Thread Derrick Stolee
Signed-off-by: Derrick Stolee 
---
 Documentation/technical/pack-format.txt |  5 +++
 builtin/midx.c  |  7 
 midx.c  | 56 +++--
 object-store.h  |  2 +
 t/t5319-midx.sh | 11 +++--
 5 files changed, 75 insertions(+), 6 deletions(-)

diff --git a/Documentation/technical/pack-format.txt 
b/Documentation/technical/pack-format.txt
index 2b37be7b33..29bf87283a 100644
--- a/Documentation/technical/pack-format.txt
+++ b/Documentation/technical/pack-format.txt
@@ -296,6 +296,11 @@ CHUNK LOOKUP:
 
 CHUNK DATA:
 
+   Packfile Name Lookup (ID: {'P', 'L', 'O', 'O'}) (P * 4 bytes)
+   P * 4 bytes storing the offset in the packfile name chunk for
+   the null-terminated string containing the filename for the
+   ith packfile.
+
Packfile Names (ID: {'P', 'N', 'A', 'M'})
Stores the packfile names as concatenated, null-terminated strings.
Packfiles must be listed in lexicographic order for fast lookups by
diff --git a/builtin/midx.c b/builtin/midx.c
index fe56560853..3a261e9bbf 100644
--- a/builtin/midx.c
+++ b/builtin/midx.c
@@ -16,6 +16,7 @@ static struct opts_midx {
 
 static int read_midx_file(const char *object_dir)
 {
+   uint32_t i;
struct midxed_git *m = load_midxed_git(object_dir);
 
if (!m)
@@ -30,11 +31,17 @@ static int read_midx_file(const char *object_dir)
 
printf("chunks:");
 
+   if (m->chunk_pack_lookup)
+   printf(" pack_lookup");
if (m->chunk_pack_names)
printf(" pack_names");
 
printf("\n");
 
+   printf("packs:\n");
+   for (i = 0; i < m->num_packs; i++)
+   printf("%s\n", m->pack_names[i]);
+
printf("object_dir: %s\n", m->object_dir);
 
return 0;
diff --git a/midx.c b/midx.c
index d4f4a01a51..923acda72e 100644
--- a/midx.c
+++ b/midx.c
@@ -13,8 +13,9 @@
 #define MIDX_HASH_LEN 20
 #define MIDX_MIN_SIZE (MIDX_HEADER_SIZE + MIDX_HASH_LEN)
 
-#define MIDX_MAX_CHUNKS 1
+#define MIDX_MAX_CHUNKS 2
 #define MIDX_CHUNK_ALIGNMENT 4
+#define MIDX_CHUNKID_PACKLOOKUP 0x504c4f4f /* "PLOO" */
 #define MIDX_CHUNKID_PACKNAMES 0x504e414d /* "PNAM" */
 #define MIDX_CHUNKLOOKUP_WIDTH (sizeof(uint32_t) + sizeof(uint64_t))
 
@@ -85,6 +86,10 @@ struct midxed_git *load_midxed_git(const char *object_dir)
uint64_t chunk_offset = get_be64(m->data + 16 + 
MIDX_CHUNKLOOKUP_WIDTH * i);
 
switch (chunk_id) {
+   case MIDX_CHUNKID_PACKLOOKUP:
+   m->chunk_pack_lookup = (uint32_t *)(m->data + 
chunk_offset);
+   break;
+
case MIDX_CHUNKID_PACKNAMES:
m->chunk_pack_names = m->data + chunk_offset;
break;
@@ -102,9 +107,32 @@ struct midxed_git *load_midxed_git(const char *object_dir)
}
}
 
+   if (!m->chunk_pack_lookup)
+   die("MIDX missing required pack lookup chunk");
if (!m->chunk_pack_names)
die("MIDX missing required pack-name chunk");
 
+   m->pack_names = xcalloc(m->num_packs, sizeof(const char *));
+   for (i = 0; i < m->num_packs; i++) {
+   if (i) {
+   if (ntohl(m->chunk_pack_lookup[i]) <= 
ntohl(m->chunk_pack_lookup[i - 1])) {
+   error("MIDX pack lookup value %d before %d",
+ ntohl(m->chunk_pack_lookup[i - 1]),
+ ntohl(m->chunk_pack_lookup[i]));
+   goto cleanup_fail;
+   }
+   }
+
+   m->pack_names[i] = (const char *)(m->chunk_pack_names + 
ntohl(m->chunk_pack_lookup[i]));
+
+   if (i && strcmp(m->pack_names[i], m->pack_names[i - 1]) <= 0) {
+   error("MIDX pack names out of order: '%s' before '%s'",
+ m->pack_names[i - 1],
+ m->pack_names[i]);
+   goto cleanup_fail;
+   }
+   }
+
return m;
 
 cleanup_fail:
@@ -162,6 +190,20 @@ static void sort_packs_by_name(char **pack_names, uint32_t 
nr_packs, uint32_t *p
}
 }
 
+static size_t write_midx_pack_lookup(struct hashfile *f,
+char **pack_names,
+uint32_t nr_packs)
+{
+   uint32_t i, cur_len = 0;
+
+   for (i = 0; i < nr_packs; i++) {
+   hashwrite_be32(f, cur_len);
+   cur_len += strlen(pack_names[i]) + 1;
+   }
+
+   return sizeof(uint32_t) * (size_t)nr_packs;
+}
+
 static size_t write_midx_pack_names(struct hashfile *f,
char **pack_names,
uint32_t num_packs)
@@ -275,13 +317,17 @@ int write_midx_file(