Re: [PATCH v2 17/18] commit-reach: make can_all_from_reach... linear

2018-09-12 Thread Derrick Stolee

On 9/12/2018 12:29 AM, Jeff King wrote:

On Wed, Sep 12, 2018 at 12:14:25AM -0400, Jeff King wrote:


+   ALLOC_ARRAY(list, from->nr);
for (i = 0; i < from->nr; i++) {
-   struct object *from_one = from->objects[i].item;
+   list[i] = (struct commit *)from->objects[i].item;

Some of the objects in my array are not commits, but rather tags, so
this is a bogus cast.

You can see that the original code peeled them and threw away
non-commits:

  
-		if (from_one->flags & assign_flag)

-   continue;
-   from_one = deref_tag(the_repository, from_one, "a from object", 
0);
-   if (!from_one || from_one->type != OBJ_COMMIT) {
-   /* no way to tell if this is reachable by
-* looking at the ancestry chain alone, so
-* leave a note to ourselves not to worry about
-* this object anymore.
-*/
-   from->objects[i].item->flags |= assign_flag;
-   continue;
-   }

So presumably we'd need to do something similar.

This patch seems to fix it for me. It's more or less a reversion of the
hunk above, though I didn't dig into whether I'm violating some other
assumption in your new code.

I think this function leaks "list" both from the location I noted here,
as well as from normal exit
Thanks for the report and the fix. I'll try to create  test that 
demonstrates this and then push up a full patch.

---
  commit-reach.c | 25 ++---
  1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/commit-reach.c b/commit-reach.c
index 622eeb313d..abe90a2f55 100644
--- a/commit-reach.c
+++ b/commit-reach.c
@@ -547,20 +547,31 @@ int can_all_from_reach_with_flag(struct object_array 
*from,
  {
struct commit **list = NULL;
int i;
+   int nr_commits;
int result = 1;
  
  	ALLOC_ARRAY(list, from->nr);

+   nr_commits = 0;
for (i = 0; i < from->nr; i++) {
-   list[i] = (struct commit *)from->objects[i].item;
+   struct object *from_one = from->objects[i].item;
  
-		if (parse_commit(list[i]) ||

-   list[i]->generation < min_generation)
-   return 0;
+   from_one = deref_tag(the_repository, from_one,
+"a from object", 0);
+   if (!from_one || from_one->type != OBJ_COMMIT) {
+   from->objects[i].item->flags |= assign_flag;
+   continue;
+   }
+
+   list[nr_commits] = (struct commit *)from_one;
+   if (parse_commit(list[nr_commits]) ||
+   list[nr_commits]->generation < min_generation)
+   return 0; /* is this a leak? */
+   nr_commits++;
}
  
-	QSORT(list, from->nr, compare_commits_by_gen);

+   QSORT(list, nr_commits, compare_commits_by_gen);
  
-	for (i = 0; i < from->nr; i++) {

+   for (i = 0; i < nr_commits; i++) {
/* DFS from list[i] */
struct commit_list *stack = NULL;
  
@@ -603,7 +614,7 @@ int can_all_from_reach_with_flag(struct object_array *from,

}
  
  cleanup:

-   for (i = 0; i < from->nr; i++) {
+   for (i = 0; i < nr_commits; i++) {
clear_commit_marks(list[i], RESULT);
clear_commit_marks(list[i], assign_flag);
}




Re: [PATCH v2 17/18] commit-reach: make can_all_from_reach... linear

2018-09-11 Thread Jeff King
On Wed, Sep 12, 2018 at 12:14:25AM -0400, Jeff King wrote:

> > +   ALLOC_ARRAY(list, from->nr);
> > for (i = 0; i < from->nr; i++) {
> > -   struct object *from_one = from->objects[i].item;
> > +   list[i] = (struct commit *)from->objects[i].item;
> 
> Some of the objects in my array are not commits, but rather tags, so
> this is a bogus cast.
> 
> You can see that the original code peeled them and threw away
> non-commits:
> 
> >  
> > -   if (from_one->flags & assign_flag)
> > -   continue;
> > -   from_one = deref_tag(the_repository, from_one, "a from object", 
> > 0);
> > -   if (!from_one || from_one->type != OBJ_COMMIT) {
> > -   /* no way to tell if this is reachable by
> > -* looking at the ancestry chain alone, so
> > -* leave a note to ourselves not to worry about
> > -* this object anymore.
> > -*/
> > -   from->objects[i].item->flags |= assign_flag;
> > -   continue;
> > -   }
> 
> So presumably we'd need to do something similar.

This patch seems to fix it for me. It's more or less a reversion of the
hunk above, though I didn't dig into whether I'm violating some other
assumption in your new code.

I think this function leaks "list" both from the location I noted here,
as well as from normal exit

---
 commit-reach.c | 25 ++---
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/commit-reach.c b/commit-reach.c
index 622eeb313d..abe90a2f55 100644
--- a/commit-reach.c
+++ b/commit-reach.c
@@ -547,20 +547,31 @@ int can_all_from_reach_with_flag(struct object_array 
*from,
 {
struct commit **list = NULL;
int i;
+   int nr_commits;
int result = 1;
 
ALLOC_ARRAY(list, from->nr);
+   nr_commits = 0;
for (i = 0; i < from->nr; i++) {
-   list[i] = (struct commit *)from->objects[i].item;
+   struct object *from_one = from->objects[i].item;
 
-   if (parse_commit(list[i]) ||
-   list[i]->generation < min_generation)
-   return 0;
+   from_one = deref_tag(the_repository, from_one,
+"a from object", 0);
+   if (!from_one || from_one->type != OBJ_COMMIT) {
+   from->objects[i].item->flags |= assign_flag;
+   continue;
+   }
+
+   list[nr_commits] = (struct commit *)from_one;
+   if (parse_commit(list[nr_commits]) ||
+   list[nr_commits]->generation < min_generation)
+   return 0; /* is this a leak? */
+   nr_commits++;
}
 
-   QSORT(list, from->nr, compare_commits_by_gen);
+   QSORT(list, nr_commits, compare_commits_by_gen);
 
-   for (i = 0; i < from->nr; i++) {
+   for (i = 0; i < nr_commits; i++) {
/* DFS from list[i] */
struct commit_list *stack = NULL;
 
@@ -603,7 +614,7 @@ int can_all_from_reach_with_flag(struct object_array *from,
}
 
 cleanup:
-   for (i = 0; i < from->nr; i++) {
+   for (i = 0; i < nr_commits; i++) {
clear_commit_marks(list[i], RESULT);
clear_commit_marks(list[i], assign_flag);
}
-- 
2.19.0.600.ga229f7d059



Re: [PATCH v2 17/18] commit-reach: make can_all_from_reach... linear

2018-09-11 Thread Jeff King
On Fri, Jul 20, 2018 at 04:33:28PM +, Derrick Stolee wrote:

> The can_all_from_reach_with_flags() algorithm is currently quadratic in
> the worst case, because it calls the reachable() method for every 'from'
> without tracking which commits have already been walked or which can
> already reach a commit in 'to'.
> 
> Rewrite the algorithm to walk each commit a constant number of times.

I got a segfault in upload-pack from 'next' today which bisected to this
patch (which became 4fbcca4eff). I think the problem is the line at the
bottom of this hunk:

>  int can_all_from_reach_with_flag(struct object_array *from,
>unsigned int with_flag,
>unsigned int assign_flag,
> -  time_t min_commit_date)
> +  time_t min_commit_date,
> +  uint32_t min_generation)
>  {
> + struct commit **list = NULL;
>   int i;
> + int result = 1;
>  
> + ALLOC_ARRAY(list, from->nr);
>   for (i = 0; i < from->nr; i++) {
> - struct object *from_one = from->objects[i].item;
> + list[i] = (struct commit *)from->objects[i].item;

Some of the objects in my array are not commits, but rather tags, so
this is a bogus cast.

You can see that the original code peeled them and threw away
non-commits:

>  
> - if (from_one->flags & assign_flag)
> - continue;
> - from_one = deref_tag(the_repository, from_one, "a from object", 
> 0);
> - if (!from_one || from_one->type != OBJ_COMMIT) {
> - /* no way to tell if this is reachable by
> -  * looking at the ancestry chain alone, so
> -  * leave a note to ourselves not to worry about
> -  * this object anymore.
> -  */
> - from->objects[i].item->flags |= assign_flag;
> - continue;
> - }

So presumably we'd need to do something similar.

I think when we're called from can_all_from_reach(), we feed only
commits. But in this case the stack trace is:

  #0  can_all_from_reach_with_flag (from=0x55f95ff42f80 , 
with_flag=2048, assign_flag=16384, 
  min_commit_date=1513037626, min_generation=0) at commit-reach.c:567
  #1  0x55f95fe20e92 in ok_to_give_up () at upload-pack.c:346
  #2  0x55f95fe20efa in get_common_commits () at upload-pack.c:369
  #3  0x55f95fe22ce9 in upload_pack (options=0x7fff7c1b81f0) at 
upload-pack.c:1065
  #4  0x55f95fcdc11b in cmd_upload_pack (argc=1, argv=0x7fff7c1b8498, 
prefix=0x0)
  at builtin/upload-pack.c:67
  #5  0x55f95fc39574 in run_builtin (p=0x55f95ff02248 , 
argc=2, argv=0x7fff7c1b8498)
  at git.c:417
  #6  0x55f95fc3987c in handle_builtin (argc=2, argv=0x7fff7c1b8498) at 
git.c:633
  #7  0x55f95fc39b02 in cmd_main (argc=2, argv=0x7fff7c1b8498) at git.c:732
  #8  0x55f95fce044b in main (argc=2, argv=0x7fff7c1b8498) at 
common-main.c:45

and my client was fetching some tags (though it would similarly break
with other object types). So I'd think the easy reproduction would be
just fetching a tag, but a trivial case didn't seem to trigger (it
probably needs a more substantial negotiation).

-Peff


Re: [PATCH v2 17/18] commit-reach: make can_all_from_reach... linear

2018-08-01 Thread Derrick Stolee




On 7/23/2018 4:41 PM, Jonathan Tan wrote:

+   if (parse_commit(list[i]) ||
+   list[i]->generation < min_generation)

Here...


+   if (parse_commit(parent->item) ||
+   parent->item->date < 
min_commit_date ||
+   parent->item->generation < 
min_generation)

...and here, would parse_commit_or_die() be better? I think that a
function that returns a definitive answer (either the commits are
reachable or not) should die when the commits cannot be parsed.
I'm hesitant to add _or_die() here, when the previous implementation 
only used parse_object() or parse_commit(), so would not die when 
parsing fails. The same holds true for the other methods that call 
can_all_from_reach().


Thanks,
-Stolee


Re: [PATCH v2 17/18] commit-reach: make can_all_from_reach... linear

2018-07-23 Thread Jonathan Tan
> + if (parse_commit(list[i]) ||
> + list[i]->generation < min_generation)

Here...

> + if (parse_commit(parent->item) ||
> + parent->item->date < 
> min_commit_date ||
> + parent->item->generation < 
> min_generation)

...and here, would parse_commit_or_die() be better? I think that a
function that returns a definitive answer (either the commits are
reachable or not) should die when the commits cannot be parsed.

Other than that, I've compared the commits in this version to v1, and
all my review comments have been addressed, thanks. (With the exception
of the skip_prefix() one, but that is a minor matter - I suggested that
to make it easier to implement my "Ancestor:" and "Descendant:"
suggestion which Stolee disagreed on with reason.)

[1] 
https://public-inbox.org/git/20180716230019.257742-1-jonathanta...@google.com/


[PATCH v2 17/18] commit-reach: make can_all_from_reach... linear

2018-07-20 Thread Derrick Stolee
The can_all_from_reach_with_flags() algorithm is currently quadratic in
the worst case, because it calls the reachable() method for every 'from'
without tracking which commits have already been walked or which can
already reach a commit in 'to'.

Rewrite the algorithm to walk each commit a constant number of times.

We also add some optimizations that should work for the main consumer of
this method: fetch negotitation (haves/wants).

The first step includes using a depth-first-search (DFS) from each
'from' commit, sorted by ascending generation number. We do not walk
beyond the minimum generation number or the minimum commit date. This
DFS is likely to be faster than the existing reachable() method because
we expect previous ref values to be along the first-parent history.

If we find a target commit, then we mark everything in the DFS stack as
a RESULT. This expands the set of targets for the other 'from' commits.
We also mark the visited commits using 'assign_flag' to prevent re-
walking the same commits.

We still need to clear our flags at the end, which is why we will have a
total of three visits to each commit.

Performance was measured on the Linux repository using
'test-tool reach can_all_from_reach'. The input included rows seeded by
tag values. The "small" case included X-rows as v4.[0-9]* and Y-rows as
v3.[0-9]*. This mimics a (very large) fetch that says "I have all major
v3 releases and want all major v4 releases." The "large" case included
X-rows as "v4.*" and Y-rows as "v3.*". This adds all release-candidate
tags to the set, which does not greatly increase the number of objects
that are considered, but does increase the number of 'from' commits,
demonstrating the quadratic nature of the previous code.

Small Case:

Before: 1.52 s
 After: 0.26 s

Large Case:

Before: 3.50 s
 After: 0.27 s

Note how the time increases between the two cases in the two versions.
The new code increases relative to the number of commits that need to be
walked, but not directly relative to the number of 'from' commits.

Signed-off-by: Derrick Stolee 
---
 commit-reach.c | 122 ++---
 commit-reach.h |   9 ++--
 upload-pack.c  |   5 +-
 3 files changed, 83 insertions(+), 53 deletions(-)

diff --git a/commit-reach.c b/commit-reach.c
index f5858944fd..bc522d6840 100644
--- a/commit-reach.c
+++ b/commit-reach.c
@@ -514,66 +514,87 @@ int commit_contains(struct ref_filter *filter, struct 
commit *commit,
return is_descendant_of(commit, list);
 }
 
-int reachable(struct commit *from, unsigned int with_flag,
- unsigned int assign_flag, time_t min_commit_date)
+static int compare_commits_by_gen(const void *_a, const void *_b)
 {
-   struct prio_queue work = { compare_commits_by_commit_date };
+   const struct commit *a = (const struct commit *)_a;
+   const struct commit *b = (const struct commit *)_b;
 
-   prio_queue_put(&work, from);
-   while (work.nr) {
-   struct commit_list *list;
-   struct commit *commit = prio_queue_get(&work);
-
-   if (commit->object.flags & with_flag) {
-   from->object.flags |= assign_flag;
-   break;
-   }
-   if (!commit->object.parsed)
-   parse_object(the_repository, &commit->object.oid);
-   if (commit->object.flags & REACHABLE)
-   continue;
-   commit->object.flags |= REACHABLE;
-   if (commit->date < min_commit_date)
-   continue;
-   for (list = commit->parents; list; list = list->next) {
-   struct commit *parent = list->item;
-   if (!(parent->object.flags & REACHABLE))
-   prio_queue_put(&work, parent);
-   }
-   }
-   from->object.flags |= REACHABLE;
-   clear_commit_marks(from, REACHABLE);
-   clear_prio_queue(&work);
-   return (from->object.flags & assign_flag);
+   if (a->generation < b->generation)
+   return -1;
+   if (a->generation > b->generation)
+   return 1;
+   return 0;
 }
 
 int can_all_from_reach_with_flag(struct object_array *from,
 unsigned int with_flag,
 unsigned int assign_flag,
-time_t min_commit_date)
+time_t min_commit_date,
+uint32_t min_generation)
 {
+   struct commit **list = NULL;
int i;
+   int result = 1;
 
+   ALLOC_ARRAY(list, from->nr);
for (i = 0; i < from->nr; i++) {
-   struct object *from_one = from->objects[i].item;
+   list[i] = (struct commit *)from->objects[i].item;
 
-   if (from_one->flags & assign_flag)
-   continue;
-   from_one = deref_tag(the_repository, from_one,