Re: [PATCH v2 17/18] commit-reach: make can_all_from_reach... linear
On 9/12/2018 12:29 AM, Jeff King wrote: On Wed, Sep 12, 2018 at 12:14:25AM -0400, Jeff King wrote: + ALLOC_ARRAY(list, from->nr); for (i = 0; i < from->nr; i++) { - struct object *from_one = from->objects[i].item; + list[i] = (struct commit *)from->objects[i].item; Some of the objects in my array are not commits, but rather tags, so this is a bogus cast. You can see that the original code peeled them and threw away non-commits: - if (from_one->flags & assign_flag) - continue; - from_one = deref_tag(the_repository, from_one, "a from object", 0); - if (!from_one || from_one->type != OBJ_COMMIT) { - /* no way to tell if this is reachable by -* looking at the ancestry chain alone, so -* leave a note to ourselves not to worry about -* this object anymore. -*/ - from->objects[i].item->flags |= assign_flag; - continue; - } So presumably we'd need to do something similar. This patch seems to fix it for me. It's more or less a reversion of the hunk above, though I didn't dig into whether I'm violating some other assumption in your new code. I think this function leaks "list" both from the location I noted here, as well as from normal exit Thanks for the report and the fix. I'll try to create test that demonstrates this and then push up a full patch. --- commit-reach.c | 25 ++--- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/commit-reach.c b/commit-reach.c index 622eeb313d..abe90a2f55 100644 --- a/commit-reach.c +++ b/commit-reach.c @@ -547,20 +547,31 @@ int can_all_from_reach_with_flag(struct object_array *from, { struct commit **list = NULL; int i; + int nr_commits; int result = 1; ALLOC_ARRAY(list, from->nr); + nr_commits = 0; for (i = 0; i < from->nr; i++) { - list[i] = (struct commit *)from->objects[i].item; + struct object *from_one = from->objects[i].item; - if (parse_commit(list[i]) || - list[i]->generation < min_generation) - return 0; + from_one = deref_tag(the_repository, from_one, +"a from object", 0); + if (!from_one || from_one->type != OBJ_COMMIT) { + from->objects[i].item->flags |= assign_flag; + continue; + } + + list[nr_commits] = (struct commit *)from_one; + if (parse_commit(list[nr_commits]) || + list[nr_commits]->generation < min_generation) + return 0; /* is this a leak? */ + nr_commits++; } - QSORT(list, from->nr, compare_commits_by_gen); + QSORT(list, nr_commits, compare_commits_by_gen); - for (i = 0; i < from->nr; i++) { + for (i = 0; i < nr_commits; i++) { /* DFS from list[i] */ struct commit_list *stack = NULL; @@ -603,7 +614,7 @@ int can_all_from_reach_with_flag(struct object_array *from, } cleanup: - for (i = 0; i < from->nr; i++) { + for (i = 0; i < nr_commits; i++) { clear_commit_marks(list[i], RESULT); clear_commit_marks(list[i], assign_flag); }
Re: [PATCH v2 17/18] commit-reach: make can_all_from_reach... linear
On Wed, Sep 12, 2018 at 12:14:25AM -0400, Jeff King wrote: > > + ALLOC_ARRAY(list, from->nr); > > for (i = 0; i < from->nr; i++) { > > - struct object *from_one = from->objects[i].item; > > + list[i] = (struct commit *)from->objects[i].item; > > Some of the objects in my array are not commits, but rather tags, so > this is a bogus cast. > > You can see that the original code peeled them and threw away > non-commits: > > > > > - if (from_one->flags & assign_flag) > > - continue; > > - from_one = deref_tag(the_repository, from_one, "a from object", > > 0); > > - if (!from_one || from_one->type != OBJ_COMMIT) { > > - /* no way to tell if this is reachable by > > -* looking at the ancestry chain alone, so > > -* leave a note to ourselves not to worry about > > -* this object anymore. > > -*/ > > - from->objects[i].item->flags |= assign_flag; > > - continue; > > - } > > So presumably we'd need to do something similar. This patch seems to fix it for me. It's more or less a reversion of the hunk above, though I didn't dig into whether I'm violating some other assumption in your new code. I think this function leaks "list" both from the location I noted here, as well as from normal exit --- commit-reach.c | 25 ++--- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/commit-reach.c b/commit-reach.c index 622eeb313d..abe90a2f55 100644 --- a/commit-reach.c +++ b/commit-reach.c @@ -547,20 +547,31 @@ int can_all_from_reach_with_flag(struct object_array *from, { struct commit **list = NULL; int i; + int nr_commits; int result = 1; ALLOC_ARRAY(list, from->nr); + nr_commits = 0; for (i = 0; i < from->nr; i++) { - list[i] = (struct commit *)from->objects[i].item; + struct object *from_one = from->objects[i].item; - if (parse_commit(list[i]) || - list[i]->generation < min_generation) - return 0; + from_one = deref_tag(the_repository, from_one, +"a from object", 0); + if (!from_one || from_one->type != OBJ_COMMIT) { + from->objects[i].item->flags |= assign_flag; + continue; + } + + list[nr_commits] = (struct commit *)from_one; + if (parse_commit(list[nr_commits]) || + list[nr_commits]->generation < min_generation) + return 0; /* is this a leak? */ + nr_commits++; } - QSORT(list, from->nr, compare_commits_by_gen); + QSORT(list, nr_commits, compare_commits_by_gen); - for (i = 0; i < from->nr; i++) { + for (i = 0; i < nr_commits; i++) { /* DFS from list[i] */ struct commit_list *stack = NULL; @@ -603,7 +614,7 @@ int can_all_from_reach_with_flag(struct object_array *from, } cleanup: - for (i = 0; i < from->nr; i++) { + for (i = 0; i < nr_commits; i++) { clear_commit_marks(list[i], RESULT); clear_commit_marks(list[i], assign_flag); } -- 2.19.0.600.ga229f7d059
Re: [PATCH v2 17/18] commit-reach: make can_all_from_reach... linear
On Fri, Jul 20, 2018 at 04:33:28PM +, Derrick Stolee wrote: > The can_all_from_reach_with_flags() algorithm is currently quadratic in > the worst case, because it calls the reachable() method for every 'from' > without tracking which commits have already been walked or which can > already reach a commit in 'to'. > > Rewrite the algorithm to walk each commit a constant number of times. I got a segfault in upload-pack from 'next' today which bisected to this patch (which became 4fbcca4eff). I think the problem is the line at the bottom of this hunk: > int can_all_from_reach_with_flag(struct object_array *from, >unsigned int with_flag, >unsigned int assign_flag, > - time_t min_commit_date) > + time_t min_commit_date, > + uint32_t min_generation) > { > + struct commit **list = NULL; > int i; > + int result = 1; > > + ALLOC_ARRAY(list, from->nr); > for (i = 0; i < from->nr; i++) { > - struct object *from_one = from->objects[i].item; > + list[i] = (struct commit *)from->objects[i].item; Some of the objects in my array are not commits, but rather tags, so this is a bogus cast. You can see that the original code peeled them and threw away non-commits: > > - if (from_one->flags & assign_flag) > - continue; > - from_one = deref_tag(the_repository, from_one, "a from object", > 0); > - if (!from_one || from_one->type != OBJ_COMMIT) { > - /* no way to tell if this is reachable by > - * looking at the ancestry chain alone, so > - * leave a note to ourselves not to worry about > - * this object anymore. > - */ > - from->objects[i].item->flags |= assign_flag; > - continue; > - } So presumably we'd need to do something similar. I think when we're called from can_all_from_reach(), we feed only commits. But in this case the stack trace is: #0 can_all_from_reach_with_flag (from=0x55f95ff42f80 , with_flag=2048, assign_flag=16384, min_commit_date=1513037626, min_generation=0) at commit-reach.c:567 #1 0x55f95fe20e92 in ok_to_give_up () at upload-pack.c:346 #2 0x55f95fe20efa in get_common_commits () at upload-pack.c:369 #3 0x55f95fe22ce9 in upload_pack (options=0x7fff7c1b81f0) at upload-pack.c:1065 #4 0x55f95fcdc11b in cmd_upload_pack (argc=1, argv=0x7fff7c1b8498, prefix=0x0) at builtin/upload-pack.c:67 #5 0x55f95fc39574 in run_builtin (p=0x55f95ff02248 , argc=2, argv=0x7fff7c1b8498) at git.c:417 #6 0x55f95fc3987c in handle_builtin (argc=2, argv=0x7fff7c1b8498) at git.c:633 #7 0x55f95fc39b02 in cmd_main (argc=2, argv=0x7fff7c1b8498) at git.c:732 #8 0x55f95fce044b in main (argc=2, argv=0x7fff7c1b8498) at common-main.c:45 and my client was fetching some tags (though it would similarly break with other object types). So I'd think the easy reproduction would be just fetching a tag, but a trivial case didn't seem to trigger (it probably needs a more substantial negotiation). -Peff
Re: [PATCH v2 17/18] commit-reach: make can_all_from_reach... linear
On 7/23/2018 4:41 PM, Jonathan Tan wrote: + if (parse_commit(list[i]) || + list[i]->generation < min_generation) Here... + if (parse_commit(parent->item) || + parent->item->date < min_commit_date || + parent->item->generation < min_generation) ...and here, would parse_commit_or_die() be better? I think that a function that returns a definitive answer (either the commits are reachable or not) should die when the commits cannot be parsed. I'm hesitant to add _or_die() here, when the previous implementation only used parse_object() or parse_commit(), so would not die when parsing fails. The same holds true for the other methods that call can_all_from_reach(). Thanks, -Stolee
Re: [PATCH v2 17/18] commit-reach: make can_all_from_reach... linear
> + if (parse_commit(list[i]) || > + list[i]->generation < min_generation) Here... > + if (parse_commit(parent->item) || > + parent->item->date < > min_commit_date || > + parent->item->generation < > min_generation) ...and here, would parse_commit_or_die() be better? I think that a function that returns a definitive answer (either the commits are reachable or not) should die when the commits cannot be parsed. Other than that, I've compared the commits in this version to v1, and all my review comments have been addressed, thanks. (With the exception of the skip_prefix() one, but that is a minor matter - I suggested that to make it easier to implement my "Ancestor:" and "Descendant:" suggestion which Stolee disagreed on with reason.) [1] https://public-inbox.org/git/20180716230019.257742-1-jonathanta...@google.com/
[PATCH v2 17/18] commit-reach: make can_all_from_reach... linear
The can_all_from_reach_with_flags() algorithm is currently quadratic in the worst case, because it calls the reachable() method for every 'from' without tracking which commits have already been walked or which can already reach a commit in 'to'. Rewrite the algorithm to walk each commit a constant number of times. We also add some optimizations that should work for the main consumer of this method: fetch negotitation (haves/wants). The first step includes using a depth-first-search (DFS) from each 'from' commit, sorted by ascending generation number. We do not walk beyond the minimum generation number or the minimum commit date. This DFS is likely to be faster than the existing reachable() method because we expect previous ref values to be along the first-parent history. If we find a target commit, then we mark everything in the DFS stack as a RESULT. This expands the set of targets for the other 'from' commits. We also mark the visited commits using 'assign_flag' to prevent re- walking the same commits. We still need to clear our flags at the end, which is why we will have a total of three visits to each commit. Performance was measured on the Linux repository using 'test-tool reach can_all_from_reach'. The input included rows seeded by tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as v3.[0-9]*. This mimics a (very large) fetch that says "I have all major v3 releases and want all major v4 releases." The "large" case included X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate tags to the set, which does not greatly increase the number of objects that are considered, but does increase the number of 'from' commits, demonstrating the quadratic nature of the previous code. Small Case: Before: 1.52 s After: 0.26 s Large Case: Before: 3.50 s After: 0.27 s Note how the time increases between the two cases in the two versions. The new code increases relative to the number of commits that need to be walked, but not directly relative to the number of 'from' commits. Signed-off-by: Derrick Stolee --- commit-reach.c | 122 ++--- commit-reach.h | 9 ++-- upload-pack.c | 5 +- 3 files changed, 83 insertions(+), 53 deletions(-) diff --git a/commit-reach.c b/commit-reach.c index f5858944fd..bc522d6840 100644 --- a/commit-reach.c +++ b/commit-reach.c @@ -514,66 +514,87 @@ int commit_contains(struct ref_filter *filter, struct commit *commit, return is_descendant_of(commit, list); } -int reachable(struct commit *from, unsigned int with_flag, - unsigned int assign_flag, time_t min_commit_date) +static int compare_commits_by_gen(const void *_a, const void *_b) { - struct prio_queue work = { compare_commits_by_commit_date }; + const struct commit *a = (const struct commit *)_a; + const struct commit *b = (const struct commit *)_b; - prio_queue_put(&work, from); - while (work.nr) { - struct commit_list *list; - struct commit *commit = prio_queue_get(&work); - - if (commit->object.flags & with_flag) { - from->object.flags |= assign_flag; - break; - } - if (!commit->object.parsed) - parse_object(the_repository, &commit->object.oid); - if (commit->object.flags & REACHABLE) - continue; - commit->object.flags |= REACHABLE; - if (commit->date < min_commit_date) - continue; - for (list = commit->parents; list; list = list->next) { - struct commit *parent = list->item; - if (!(parent->object.flags & REACHABLE)) - prio_queue_put(&work, parent); - } - } - from->object.flags |= REACHABLE; - clear_commit_marks(from, REACHABLE); - clear_prio_queue(&work); - return (from->object.flags & assign_flag); + if (a->generation < b->generation) + return -1; + if (a->generation > b->generation) + return 1; + return 0; } int can_all_from_reach_with_flag(struct object_array *from, unsigned int with_flag, unsigned int assign_flag, -time_t min_commit_date) +time_t min_commit_date, +uint32_t min_generation) { + struct commit **list = NULL; int i; + int result = 1; + ALLOC_ARRAY(list, from->nr); for (i = 0; i < from->nr; i++) { - struct object *from_one = from->objects[i].item; + list[i] = (struct commit *)from->objects[i].item; - if (from_one->flags & assign_flag) - continue; - from_one = deref_tag(the_repository, from_one,