Re: [PATCH v2 11/14] commit: integrate commit graph with commit parsing

2018-02-06 Thread Derrick Stolee

On 2/1/2018 8:51 PM, Jonathan Tan wrote:

On Tue, 30 Jan 2018 16:39:40 -0500
Derrick Stolee  wrote:


+/* global storage */
+struct commit_graph *commit_graph = 0;

NULL, not 0.


+static int bsearch_graph(struct commit_graph *g, struct object_id *oid, 
uint32_t *pos)
+{
+   uint32_t last, first = 0;
+
+   if (oid->hash[0])
+   first = ntohl(*(uint32_t*)(g->chunk_oid_fanout + 4 * 
(oid->hash[0] - 1)));
+   last = ntohl(*(uint32_t*)(g->chunk_oid_fanout + 4 * oid->hash[0]));
+
+   while (first < last) {
+   uint32_t mid = first + (last - first) / 2;
+   const unsigned char *current;
+   int cmp;
+
+   current = g->chunk_oid_lookup + g->hdr->hash_len * mid;
+   cmp = hashcmp(oid->hash, current);
+   if (!cmp) {
+   *pos = mid;
+   return 1;
+   }
+   if (cmp > 0) {
+   first = mid + 1;
+   continue;
+   }
+   last = mid;
+   }
+
+   *pos = first;
+   return 0;
+}

This would be better in sha1-lookup.c, something like the reverse of commit
f1068efefe6d ("sha1_file: drop experimental GIT_USE_LOOKUP search",
2017-08-09), except that it can be done using a simple binary search.


I rebased my patch onto your binary search patch, so I'll use that in 
the future.





+static int full_parse_commit(struct commit *item, struct commit_graph *g,
+uint32_t pos, const unsigned char *commit_data)
+{
+   struct object_id oid;
+   struct commit *new_parent;
+   uint32_t new_parent_pos;
+   uint32_t *parent_data_ptr;
+   uint64_t date_low, date_high;
+   struct commit_list **pptr;
+
+   item->object.parsed = 1;
+   item->graph_pos = pos;
+
+   hashcpy(oid.hash, commit_data);
+   item->tree = lookup_tree();
+
+   date_high = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 8)) & 
0x3;
+   date_low = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 12));
+   item->date = (timestamp_t)((date_high << 32) | date_low);
+
+   pptr = >parents;
+
+   new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len));
+   if (new_parent_pos == GRAPH_PARENT_NONE)
+   return 1;
+   get_nth_commit_oid(g, new_parent_pos, );
+   new_parent = lookup_commit();
+   if (new_parent) {
+   new_parent->graph_pos = new_parent_pos;
+   pptr = _list_insert(new_parent, pptr)->next;
+   } else {
+   die("could not find commit %s", oid_to_hex());
+   }
+
+   new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 
4));
+   if (new_parent_pos == GRAPH_PARENT_NONE)
+   return 1;
+   if (!(new_parent_pos & GRAPH_LARGE_EDGES_NEEDED)) {
+   get_nth_commit_oid(g, new_parent_pos, );
+   new_parent = lookup_commit();
+   if (new_parent) {
+   new_parent->graph_pos = new_parent_pos;
+   pptr = _list_insert(new_parent, pptr)->next;
+   } else
+   die("could not find commit %s", oid_to_hex());
+   return 1;
+   }
+
+   parent_data_ptr = (uint32_t*)(g->chunk_large_edges + 4 * 
(new_parent_pos ^ GRAPH_LARGE_EDGES_NEEDED));
+   do {
+   new_parent_pos = ntohl(*parent_data_ptr);
+
+   get_nth_commit_oid(g, new_parent_pos & GRAPH_EDGE_LAST_MASK, 
);
+   new_parent = lookup_commit();
+   if (new_parent) {
+   new_parent->graph_pos = new_parent_pos & 
GRAPH_EDGE_LAST_MASK;
+   pptr = _list_insert(new_parent, pptr)->next;
+   } else
+   die("could not find commit %s", oid_to_hex());
+   parent_data_ptr++;
+   } while (!(new_parent_pos & GRAPH_LAST_EDGE));
+
+   return 1;
+}

The part that converts  into 
seems to be duplicated 3 times. Refactor into a function?


Will do.




+/**
+ * Fill 'item' to contain all information that would be parsed by 
parse_commit_buffer.
+ */
+static int fill_commit_in_graph(struct commit *item, struct commit_graph *g, 
uint32_t pos)
+{
+   uint32_t new_parent_pos;
+   uint32_t *parent_data_ptr;
+   const unsigned char *commit_data = g->chunk_commit_data + 
(g->hdr->hash_len + 16) * pos;
+
+   new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len));
+
+   if (new_parent_pos == GRAPH_PARENT_MISSING)
+   return 0;
+
+   if (new_parent_pos == GRAPH_PARENT_NONE)
+   return full_parse_commit(item, g, pos, commit_data);
+
+   new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 
4));
+
+   if (new_parent_pos == GRAPH_PARENT_MISSING)
+   return 0;
+   if (!(new_parent_pos & GRAPH_LARGE_EDGES_NEEDED))
+   return 

Re: [PATCH v2 11/14] commit: integrate commit graph with commit parsing

2018-02-01 Thread Jonathan Tan
On Tue, 30 Jan 2018 16:39:40 -0500
Derrick Stolee  wrote:

> +/* global storage */
> +struct commit_graph *commit_graph = 0;

NULL, not 0.

> +static int bsearch_graph(struct commit_graph *g, struct object_id *oid, 
> uint32_t *pos)
> +{
> + uint32_t last, first = 0;
> +
> + if (oid->hash[0])
> + first = ntohl(*(uint32_t*)(g->chunk_oid_fanout + 4 * 
> (oid->hash[0] - 1)));
> + last = ntohl(*(uint32_t*)(g->chunk_oid_fanout + 4 * oid->hash[0]));
> +
> + while (first < last) {
> + uint32_t mid = first + (last - first) / 2;
> + const unsigned char *current;
> + int cmp;
> +
> + current = g->chunk_oid_lookup + g->hdr->hash_len * mid;
> + cmp = hashcmp(oid->hash, current);
> + if (!cmp) {
> + *pos = mid;
> + return 1;
> + }
> + if (cmp > 0) {
> + first = mid + 1;
> + continue;
> + }
> + last = mid;
> + }
> +
> + *pos = first;
> + return 0;
> +}

This would be better in sha1-lookup.c, something like the reverse of commit
f1068efefe6d ("sha1_file: drop experimental GIT_USE_LOOKUP search",
2017-08-09), except that it can be done using a simple binary search.

> +static int full_parse_commit(struct commit *item, struct commit_graph *g,
> +  uint32_t pos, const unsigned char *commit_data)
> +{
> + struct object_id oid;
> + struct commit *new_parent;
> + uint32_t new_parent_pos;
> + uint32_t *parent_data_ptr;
> + uint64_t date_low, date_high;
> + struct commit_list **pptr;
> +
> + item->object.parsed = 1;
> + item->graph_pos = pos;
> +
> + hashcpy(oid.hash, commit_data);
> + item->tree = lookup_tree();
> +
> + date_high = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 8)) & 
> 0x3;
> + date_low = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 12));
> + item->date = (timestamp_t)((date_high << 32) | date_low);
> +
> + pptr = >parents;
> +
> + new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len));
> + if (new_parent_pos == GRAPH_PARENT_NONE)
> + return 1;
> + get_nth_commit_oid(g, new_parent_pos, );
> + new_parent = lookup_commit();
> + if (new_parent) {
> + new_parent->graph_pos = new_parent_pos;
> + pptr = _list_insert(new_parent, pptr)->next;
> + } else {
> + die("could not find commit %s", oid_to_hex());
> + }
> +
> + new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 
> 4));
> + if (new_parent_pos == GRAPH_PARENT_NONE)
> + return 1;
> + if (!(new_parent_pos & GRAPH_LARGE_EDGES_NEEDED)) {
> + get_nth_commit_oid(g, new_parent_pos, );
> + new_parent = lookup_commit();
> + if (new_parent) {
> + new_parent->graph_pos = new_parent_pos;
> + pptr = _list_insert(new_parent, pptr)->next;
> + } else
> + die("could not find commit %s", oid_to_hex());
> + return 1;
> + }
> +
> + parent_data_ptr = (uint32_t*)(g->chunk_large_edges + 4 * 
> (new_parent_pos ^ GRAPH_LARGE_EDGES_NEEDED));
> + do {
> + new_parent_pos = ntohl(*parent_data_ptr);
> +
> + get_nth_commit_oid(g, new_parent_pos & GRAPH_EDGE_LAST_MASK, 
> );
> + new_parent = lookup_commit();
> + if (new_parent) {
> + new_parent->graph_pos = new_parent_pos & 
> GRAPH_EDGE_LAST_MASK;
> + pptr = _list_insert(new_parent, pptr)->next;
> + } else
> + die("could not find commit %s", oid_to_hex());
> + parent_data_ptr++;
> + } while (!(new_parent_pos & GRAPH_LAST_EDGE));
> +
> + return 1;
> +}

The part that converts  into 
seems to be duplicated 3 times. Refactor into a function?

> +/**
> + * Fill 'item' to contain all information that would be parsed by 
> parse_commit_buffer.
> + */
> +static int fill_commit_in_graph(struct commit *item, struct commit_graph *g, 
> uint32_t pos)
> +{
> + uint32_t new_parent_pos;
> + uint32_t *parent_data_ptr;
> + const unsigned char *commit_data = g->chunk_commit_data + 
> (g->hdr->hash_len + 16) * pos;
> +
> + new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len));
> +
> + if (new_parent_pos == GRAPH_PARENT_MISSING)
> + return 0;
> +
> + if (new_parent_pos == GRAPH_PARENT_NONE)
> + return full_parse_commit(item, g, pos, commit_data);
> +
> + new_parent_pos = ntohl(*(uint32_t*)(commit_data + g->hdr->hash_len + 
> 4));
> +
> + if (new_parent_pos == GRAPH_PARENT_MISSING)
> + return 0;
> + if (!(new_parent_pos & GRAPH_LARGE_EDGES_NEEDED))
> + return full_parse_commit(item, g, pos, commit_data);
> +
> + 

[PATCH v2 11/14] commit: integrate commit graph with commit parsing

2018-01-30 Thread Derrick Stolee
Teach Git to inspect a commit graph file to supply the contents of a
struct commit when calling parse_commit_gently(). This implementation
satisfies all post-conditions on the struct commit, including loading
parents, the root tree, and the commit date. The only loosely-expected
condition is that the commit buffer is loaded into the cache. This
was checked in log-tree.c:show_log(), but the "return;" on failure
produced unexpected results (i.e. the message line was never terminated).
The new behavior of loading the buffer when needed prevents the
unexpected behavior.

If core.commitgraph is false, then do not check graph files.

In test script t5319-commit-graph.sh, add output-matching conditions on
read-only graph operations.

By loading commits from the graph instead of parsing commit buffers, we
save a lot of time on long commit walks. Here are some performance
results for a copy of the Linux repository where 'master' has 704,766
reachable commits and is behind 'origin/master' by 19,610 commits.

| Command  | Before | After  | Rel % |
|--|||---|
| log --oneline --topo-order -1000 |  5.9s  |  0.7s  | -88%  |
| branch -vv   |  0.42s |  0.27s | -35%  |
| rev-list --all   |  6.4s  |  1.0s  | -84%  |
| rev-list --all --objects | 32.6s  | 27.6s  | -15%  |

Signed-off-by: Derrick Stolee 
---
 alloc.c |   1 +
 commit-graph.c  | 237 
 commit-graph.h  |  20 +++-
 commit.c|  10 +-
 commit.h|   4 +
 log-tree.c  |   3 +-
 t/t5318-commit-graph.sh |  47 ++
 7 files changed, 318 insertions(+), 4 deletions(-)

diff --git a/alloc.c b/alloc.c
index 12afadfacd..cf4f8b61e1 100644
--- a/alloc.c
+++ b/alloc.c
@@ -93,6 +93,7 @@ void *alloc_commit_node(void)
struct commit *c = alloc_node(_state, sizeof(struct commit));
c->object.type = OBJ_COMMIT;
c->index = alloc_commit_index();
+   c->graph_pos = COMMIT_NOT_FROM_GRAPH;
return c;
 }
 
diff --git a/commit-graph.c b/commit-graph.c
index 764e016ddb..fc816533c6 100644
--- a/commit-graph.c
+++ b/commit-graph.c
@@ -35,6 +35,9 @@
 #define GRAPH_MIN_SIZE (GRAPH_CHUNKLOOKUP_SIZE + GRAPH_FANOUT_SIZE + \
GRAPH_OID_LEN + sizeof(struct commit_graph_header))
 
+/* global storage */
+struct commit_graph *commit_graph = 0;
+
 struct object_id *get_graph_head_hash(const char *pack_dir, struct object_id 
*hash)
 {
struct strbuf head_filename = STRBUF_INIT;
@@ -209,6 +212,220 @@ struct commit_graph *load_commit_graph_one(const char 
*graph_file, const char *p
return graph;
 }
 
+static void prepare_commit_graph_one(const char *obj_dir)
+{
+   char *graph_file;
+   struct object_id oid;
+   struct strbuf pack_dir = STRBUF_INIT;
+   strbuf_addstr(_dir, obj_dir);
+   strbuf_add(_dir, "/pack", 5);
+
+   if (!get_graph_head_hash(pack_dir.buf, ))
+   return;
+
+   graph_file = get_commit_graph_filename_hash(pack_dir.buf, );
+
+   commit_graph = load_commit_graph_one(graph_file, pack_dir.buf);
+   strbuf_release(_dir);
+}
+
+static int prepare_commit_graph_run_once = 0;
+void prepare_commit_graph(void)
+{
+   struct alternate_object_database *alt;
+   char *obj_dir;
+
+   if (prepare_commit_graph_run_once)
+   return;
+   prepare_commit_graph_run_once = 1;
+
+   obj_dir = get_object_directory();
+   prepare_commit_graph_one(obj_dir);
+   prepare_alt_odb();
+   for (alt = alt_odb_list; !commit_graph && alt; alt = alt->next)
+   prepare_commit_graph_one(alt->path);
+}
+
+static int bsearch_graph(struct commit_graph *g, struct object_id *oid, 
uint32_t *pos)
+{
+   uint32_t last, first = 0;
+
+   if (oid->hash[0])
+   first = ntohl(*(uint32_t*)(g->chunk_oid_fanout + 4 * 
(oid->hash[0] - 1)));
+   last = ntohl(*(uint32_t*)(g->chunk_oid_fanout + 4 * oid->hash[0]));
+
+   while (first < last) {
+   uint32_t mid = first + (last - first) / 2;
+   const unsigned char *current;
+   int cmp;
+
+   current = g->chunk_oid_lookup + g->hdr->hash_len * mid;
+   cmp = hashcmp(oid->hash, current);
+   if (!cmp) {
+   *pos = mid;
+   return 1;
+   }
+   if (cmp > 0) {
+   first = mid + 1;
+   continue;
+   }
+   last = mid;
+   }
+
+   *pos = first;
+   return 0;
+}
+
+struct object_id *get_nth_commit_oid(struct commit_graph *g,
+uint32_t n,
+struct object_id *oid)
+{
+   hashcpy(oid->hash, g->chunk_oid_lookup + g->hdr->hash_len * n);
+   return oid;
+}
+