Re: [PATCH v4 03/21] range-diff: first rudimentary implementation
Hi Thomas, On Mon, 30 Jul 2018, Thomas Gummerer wrote: > On 07/30, Johannes Schindelin wrote: > > > > On Sun, 29 Jul 2018, Thomas Gummerer wrote: > > > > > On 07/21, Johannes Schindelin via GitGitGadget wrote: > > > > > > > > [...] > > > > > > > > +static void find_exact_matches(struct string_list *a, struct > > > > string_list *b) > > > > +{ > > > > + struct hashmap map; > > > > + int i; > > > > + > > > > + hashmap_init(, (hashmap_cmp_fn)patch_util_cmp, NULL, 0); > > > > + > > > > + /* First, add the patches of a to a hash map */ > > > > + for (i = 0; i < a->nr; i++) { > > > > + struct patch_util *util = a->items[i].util; > > > > + > > > > + util->i = i; > > > > + util->patch = a->items[i].string; > > > > + util->diff = util->patch + util->diff_offset; > > > > + hashmap_entry_init(util, strhash(util->diff)); > > > > + hashmap_add(, util); > > > > + } > > > > + > > > > + /* Now try to find exact matches in b */ > > > > + for (i = 0; i < b->nr; i++) { > > > > + struct patch_util *util = b->items[i].util, *other; > > > > + > > > > + util->i = i; > > > > + util->patch = b->items[i].string; > > > > + util->diff = util->patch + util->diff_offset; > > > > + hashmap_entry_init(util, strhash(util->diff)); > > > > + other = hashmap_remove(, util, NULL); > > > > + if (other) { > > > > + if (other->matching >= 0) > > > > + BUG("already assigned!"); > > > > + > > > > + other->matching = i; > > > > + util->matching = other->i; > > > > + } > > > > + } > > > > > > One possibly interesting corner case here is what happens when there > > > are two patches that have the exact same diff, for example in the > > > pathological case of commit A doing something, commit B reverting > > > commit A, and then commit C reverting commit B, so it ends up with the > > > same diff. > > > > > > Having those same commits unchanged in both ranges (e.g. if a commit > > > earlier in the range has been changed, and range B has been rebased on > > > top of that), we'd get the following mapping from range A to range B > > > for the commits in question: > > > > > > A -> C > > > B -> B > > > C -> A > > > > > > Which is not quite what I would expect as the user (even though it is > > > a valid mapping, and it probably doesn't matter too much for the end > > > result of the range diff, as nothing has changed between the commits > > > anyway). So I'm not sure it's worth fixing this, as it is a > > > pathological case, and nothing really breaks. > > > > Indeed. As far as I am concerned, this falls squarely into the "let's > > cross that bridge when, or if, we reach it" category. > > Makes sense, this can definitely be addressed later. > > > > > + > > > > + hashmap_free(, 0); > > > > +} > > > > + > > > > +static void diffsize_consume(void *data, char *line, unsigned long len) > > > > +{ > > > > + (*(int *)data)++; > > > > +} > > > > + > > > > +static int diffsize(const char *a, const char *b) > > > > +{ > > > > + xpparam_t pp = { 0 }; > > > > + xdemitconf_t cfg = { 0 }; > > > > + mmfile_t mf1, mf2; > > > > + int count = 0; > > > > + > > > > + mf1.ptr = (char *)a; > > > > + mf1.size = strlen(a); > > > > + mf2.ptr = (char *)b; > > > > + mf2.size = strlen(b); > > > > + > > > > + cfg.ctxlen = 3; > > > > + if (!xdi_diff_outf(, , diffsize_consume, , , > > > > )) > > > > + return count; > > > > + > > > > + error(_("failed to generate diff")); > > > > + return COST_MAX; > > > > +} > > > > + > > > > +static void get_correspondences(struct string_list *a, struct > > > > string_list *b, > > > > + int creation_factor) > > > > +{ > > > > + int n = a->nr + b->nr; > > > > + int *cost, c, *a2b, *b2a; > > > > + int i, j; > > > > + > > > > + ALLOC_ARRAY(cost, st_mult(n, n)); > > > > + ALLOC_ARRAY(a2b, n); > > > > + ALLOC_ARRAY(b2a, n); > > > > + > > > > + for (i = 0; i < a->nr; i++) { > > > > + struct patch_util *a_util = a->items[i].util; > > > > + > > > > + for (j = 0; j < b->nr; j++) { > > > > + struct patch_util *b_util = b->items[j].util; > > > > + > > > > + if (a_util->matching == j) > > > > + c = 0; > > > > + else if (a_util->matching < 0 && > > > > b_util->matching < 0) > > > > + c = diffsize(a_util->diff, > > > > b_util->diff); > > > > + else > > > > + c = COST_MAX; > > > > + cost[i + n * j] = c; > > > > +
Re: [PATCH v4 03/21] range-diff: first rudimentary implementation
On 07/30, Johannes Schindelin wrote: > Hi Thomas, > > On Sun, 29 Jul 2018, Thomas Gummerer wrote: > > > On 07/21, Johannes Schindelin via GitGitGadget wrote: > > > > > > [...] > > > > > > +static void find_exact_matches(struct string_list *a, struct string_list > > > *b) > > > +{ > > > + struct hashmap map; > > > + int i; > > > + > > > + hashmap_init(, (hashmap_cmp_fn)patch_util_cmp, NULL, 0); > > > + > > > + /* First, add the patches of a to a hash map */ > > > + for (i = 0; i < a->nr; i++) { > > > + struct patch_util *util = a->items[i].util; > > > + > > > + util->i = i; > > > + util->patch = a->items[i].string; > > > + util->diff = util->patch + util->diff_offset; > > > + hashmap_entry_init(util, strhash(util->diff)); > > > + hashmap_add(, util); > > > + } > > > + > > > + /* Now try to find exact matches in b */ > > > + for (i = 0; i < b->nr; i++) { > > > + struct patch_util *util = b->items[i].util, *other; > > > + > > > + util->i = i; > > > + util->patch = b->items[i].string; > > > + util->diff = util->patch + util->diff_offset; > > > + hashmap_entry_init(util, strhash(util->diff)); > > > + other = hashmap_remove(, util, NULL); > > > + if (other) { > > > + if (other->matching >= 0) > > > + BUG("already assigned!"); > > > + > > > + other->matching = i; > > > + util->matching = other->i; > > > + } > > > + } > > > > One possibly interesting corner case here is what happens when there > > are two patches that have the exact same diff, for example in the > > pathological case of commit A doing something, commit B reverting > > commit A, and then commit C reverting commit B, so it ends up with the > > same diff. > > > > Having those same commits unchanged in both ranges (e.g. if a commit > > earlier in the range has been changed, and range B has been rebased on > > top of that), we'd get the following mapping from range A to range B > > for the commits in question: > > > > A -> C > > B -> B > > C -> A > > > > Which is not quite what I would expect as the user (even though it is > > a valid mapping, and it probably doesn't matter too much for the end > > result of the range diff, as nothing has changed between the commits > > anyway). So I'm not sure it's worth fixing this, as it is a > > pathological case, and nothing really breaks. > > Indeed. As far as I am concerned, this falls squarely into the "let's > cross that bridge when, or if, we reach it" category. Makes sense, this can definitely be addressed later. > > > + > > > + hashmap_free(, 0); > > > +} > > > + > > > +static void diffsize_consume(void *data, char *line, unsigned long len) > > > +{ > > > + (*(int *)data)++; > > > +} > > > + > > > +static int diffsize(const char *a, const char *b) > > > +{ > > > + xpparam_t pp = { 0 }; > > > + xdemitconf_t cfg = { 0 }; > > > + mmfile_t mf1, mf2; > > > + int count = 0; > > > + > > > + mf1.ptr = (char *)a; > > > + mf1.size = strlen(a); > > > + mf2.ptr = (char *)b; > > > + mf2.size = strlen(b); > > > + > > > + cfg.ctxlen = 3; > > > + if (!xdi_diff_outf(, , diffsize_consume, , , )) > > > + return count; > > > + > > > + error(_("failed to generate diff")); > > > + return COST_MAX; > > > +} > > > + > > > +static void get_correspondences(struct string_list *a, struct > > > string_list *b, > > > + int creation_factor) > > > +{ > > > + int n = a->nr + b->nr; > > > + int *cost, c, *a2b, *b2a; > > > + int i, j; > > > + > > > + ALLOC_ARRAY(cost, st_mult(n, n)); > > > + ALLOC_ARRAY(a2b, n); > > > + ALLOC_ARRAY(b2a, n); > > > + > > > + for (i = 0; i < a->nr; i++) { > > > + struct patch_util *a_util = a->items[i].util; > > > + > > > + for (j = 0; j < b->nr; j++) { > > > + struct patch_util *b_util = b->items[j].util; > > > + > > > + if (a_util->matching == j) > > > + c = 0; > > > + else if (a_util->matching < 0 && b_util->matching < 0) > > > + c = diffsize(a_util->diff, b_util->diff); > > > + else > > > + c = COST_MAX; > > > + cost[i + n * j] = c; > > > + } > > > + > > > + c = a_util->matching < 0 ? > > > + a_util->diffsize * creation_factor / 100 : COST_MAX; > > > + for (j = b->nr; j < n; j++) > > > + cost[i + n * j] = c; > > > + } > > > + > > > + for (j = 0; j < b->nr; j++) { > > > + struct patch_util *util = b->items[j].util; > > > + > > > + c = util->matching < 0 ? > > > + util->diffsize * creation_factor / 100 : COST_MAX; > > > + for (i = a->nr; i < n; i++) > > > + cost[i + n * j] = c; > > > + } > > > + > > > + for (i = a->nr; i < n; i++) > > > + for (j = b->nr; j < n; j++) > > > + cost[i + n
Re: [PATCH v4 03/21] range-diff: first rudimentary implementation
Hi Thomas, On Sun, 29 Jul 2018, Thomas Gummerer wrote: > On 07/21, Johannes Schindelin via GitGitGadget wrote: > > From: Johannes Schindelin > > > > At this stage, `git range-diff` can determine corresponding commits > > of two related commit ranges. This makes use of the recently introduced > > implementation of the linear assignment algorithm. > > > > The core of this patch is a straight port of the ideas of tbdiff, the > > apparently dormant project at https://github.com/trast/tbdiff. > > > > The output does not at all match `tbdiff`'s output yet, as this patch > > really concentrates on getting the patch matching part right. > > > > Note: due to differences in the diff algorithm (`tbdiff` uses the Python > > module `difflib`, Git uses its xdiff fork), the cost matrix calculated > > by `range-diff` is different (but very similar) to the one calculated > > by `tbdiff`. Therefore, it is possible that they find different matching > > commits in corner cases (e.g. when a patch was split into two patches of > > roughly equal length). > > > > Signed-off-by: Johannes Schindelin > > --- > > Makefile | 1 + > > builtin/range-diff.c | 43 +- > > range-diff.c | 311 +++ > > range-diff.h | 7 + > > 4 files changed, 361 insertions(+), 1 deletion(-) > > create mode 100644 range-diff.c > > create mode 100644 range-diff.h > > > > [...] > > > > diff --git a/range-diff.c b/range-diff.c > > new file mode 100644 > > index 0..15d418afa > > --- /dev/null > > +++ b/range-diff.c > > @@ -0,0 +1,311 @@ > > +#include "cache.h" > > +#include "range-diff.h" > > +#include "string-list.h" > > +#include "run-command.h" > > +#include "argv-array.h" > > +#include "hashmap.h" > > +#include "xdiff-interface.h" > > +#include "linear-assignment.h" > > + > > +struct patch_util { > > + /* For the search for an exact match */ > > + struct hashmap_entry e; > > + const char *diff, *patch; > > + > > + int i; > > + int diffsize; > > + size_t diff_offset; > > + /* the index of the matching item in the other branch, or -1 */ > > + int matching; > > + struct object_id oid; > > +}; > > + > > +/* > > + * Reads the patches into a string list, with the `util` field being > > populated > > + * as struct object_id (will need to be free()d). > > + */ > > +static int read_patches(const char *range, struct string_list *list) > > +{ > > + struct child_process cp = CHILD_PROCESS_INIT; > > + FILE *in; > > + struct strbuf buf = STRBUF_INIT, line = STRBUF_INIT; > > + struct patch_util *util = NULL; > > + int in_header = 1; > > + > > + argv_array_pushl(, "log", "--no-color", "-p", "--no-merges", > > + "--reverse", "--date-order", "--decorate=no", > > + "--no-abbrev-commit", range, > > + NULL); > > Compared to tbdiff, add "--decorate=no", and "--no-abbrev-commit". I > see we're abbreviating the commit hashes later. We don't want ref > names here, so "--decorate=no" makes sense as well. Indeed. Compare also to https://github.com/trast/tbdiff/pull/8 > > + cp.out = -1; > > + cp.no_stdin = 1; > > + cp.git_cmd = 1; > > + > > + if (start_command()) > > + return error_errno(_("could not start `log`")); > > + in = fdopen(cp.out, "r"); > > + if (!in) { > > + error_errno(_("could not read `log` output")); > > + finish_command(); > > + return -1; > > + } > > + > > + while (strbuf_getline(, in) != EOF) { > > + const char *p; > > + > > + if (skip_prefix(line.buf, "commit ", )) { > > + if (util) { > > + string_list_append(list, buf.buf)->util = util; > > + strbuf_reset(); > > + } > > + util = xcalloc(sizeof(*util), 1); > > + if (get_oid(p, >oid)) { > > + error(_("could not parse commit '%s'"), p); > > + free(util); > > + string_list_clear(list, 1); > > + strbuf_release(); > > + strbuf_release(); > > + fclose(in); > > + finish_command(); > > + return -1; > > + } > > + util->matching = -1; > > + in_header = 1; > > + continue; > > + } > > + > > + if (starts_with(line.buf, "diff --git")) { > > + in_header = 0; > > + strbuf_addch(, '\n'); > > + if (!util->diff_offset) > > + util->diff_offset = buf.len; > > + strbuf_addbuf(, ); > > + } else if (in_header) { > > + if (starts_with(line.buf, "Author: ")) { > > + strbuf_addbuf(, ); > > +
Re: [PATCH v4 03/21] range-diff: first rudimentary implementation
On 07/21, Johannes Schindelin via GitGitGadget wrote: > From: Johannes Schindelin > > At this stage, `git range-diff` can determine corresponding commits > of two related commit ranges. This makes use of the recently introduced > implementation of the linear assignment algorithm. > > The core of this patch is a straight port of the ideas of tbdiff, the > apparently dormant project at https://github.com/trast/tbdiff. > > The output does not at all match `tbdiff`'s output yet, as this patch > really concentrates on getting the patch matching part right. > > Note: due to differences in the diff algorithm (`tbdiff` uses the Python > module `difflib`, Git uses its xdiff fork), the cost matrix calculated > by `range-diff` is different (but very similar) to the one calculated > by `tbdiff`. Therefore, it is possible that they find different matching > commits in corner cases (e.g. when a patch was split into two patches of > roughly equal length). > > Signed-off-by: Johannes Schindelin > --- > Makefile | 1 + > builtin/range-diff.c | 43 +- > range-diff.c | 311 +++ > range-diff.h | 7 + > 4 files changed, 361 insertions(+), 1 deletion(-) > create mode 100644 range-diff.c > create mode 100644 range-diff.h > > [...] > > diff --git a/range-diff.c b/range-diff.c > new file mode 100644 > index 0..15d418afa > --- /dev/null > +++ b/range-diff.c > @@ -0,0 +1,311 @@ > +#include "cache.h" > +#include "range-diff.h" > +#include "string-list.h" > +#include "run-command.h" > +#include "argv-array.h" > +#include "hashmap.h" > +#include "xdiff-interface.h" > +#include "linear-assignment.h" > + > +struct patch_util { > + /* For the search for an exact match */ > + struct hashmap_entry e; > + const char *diff, *patch; > + > + int i; > + int diffsize; > + size_t diff_offset; > + /* the index of the matching item in the other branch, or -1 */ > + int matching; > + struct object_id oid; > +}; > + > +/* > + * Reads the patches into a string list, with the `util` field being > populated > + * as struct object_id (will need to be free()d). > + */ > +static int read_patches(const char *range, struct string_list *list) > +{ > + struct child_process cp = CHILD_PROCESS_INIT; > + FILE *in; > + struct strbuf buf = STRBUF_INIT, line = STRBUF_INIT; > + struct patch_util *util = NULL; > + int in_header = 1; > + > + argv_array_pushl(, "log", "--no-color", "-p", "--no-merges", > + "--reverse", "--date-order", "--decorate=no", > + "--no-abbrev-commit", range, > + NULL); Compared to tbdiff, add "--decorate=no", and "--no-abbrev-commit". I see we're abbreviating the commit hashes later. We don't want ref names here, so "--decorate=no" makes sense as well. > + cp.out = -1; > + cp.no_stdin = 1; > + cp.git_cmd = 1; > + > + if (start_command()) > + return error_errno(_("could not start `log`")); > + in = fdopen(cp.out, "r"); > + if (!in) { > + error_errno(_("could not read `log` output")); > + finish_command(); > + return -1; > + } > + > + while (strbuf_getline(, in) != EOF) { > + const char *p; > + > + if (skip_prefix(line.buf, "commit ", )) { > + if (util) { > + string_list_append(list, buf.buf)->util = util; > + strbuf_reset(); > + } > + util = xcalloc(sizeof(*util), 1); > + if (get_oid(p, >oid)) { > + error(_("could not parse commit '%s'"), p); > + free(util); > + string_list_clear(list, 1); > + strbuf_release(); > + strbuf_release(); > + fclose(in); > + finish_command(); > + return -1; > + } > + util->matching = -1; > + in_header = 1; > + continue; > + } > + > + if (starts_with(line.buf, "diff --git")) { > + in_header = 0; > + strbuf_addch(, '\n'); > + if (!util->diff_offset) > + util->diff_offset = buf.len; > + strbuf_addbuf(, ); > + } else if (in_header) { > + if (starts_with(line.buf, "Author: ")) { > + strbuf_addbuf(, ); > + strbuf_addstr(, "\n\n"); > + } else if (starts_with(line.buf, "")) { > + strbuf_addbuf(, ); > + strbuf_addch(, '\n'); > + } > +
[PATCH v4 03/21] range-diff: first rudimentary implementation
From: Johannes Schindelin At this stage, `git range-diff` can determine corresponding commits of two related commit ranges. This makes use of the recently introduced implementation of the linear assignment algorithm. The core of this patch is a straight port of the ideas of tbdiff, the apparently dormant project at https://github.com/trast/tbdiff. The output does not at all match `tbdiff`'s output yet, as this patch really concentrates on getting the patch matching part right. Note: due to differences in the diff algorithm (`tbdiff` uses the Python module `difflib`, Git uses its xdiff fork), the cost matrix calculated by `range-diff` is different (but very similar) to the one calculated by `tbdiff`. Therefore, it is possible that they find different matching commits in corner cases (e.g. when a patch was split into two patches of roughly equal length). Signed-off-by: Johannes Schindelin --- Makefile | 1 + builtin/range-diff.c | 43 +- range-diff.c | 311 +++ range-diff.h | 7 + 4 files changed, 361 insertions(+), 1 deletion(-) create mode 100644 range-diff.c create mode 100644 range-diff.h diff --git a/Makefile b/Makefile index 45c9dea1b..41b93689a 100644 --- a/Makefile +++ b/Makefile @@ -921,6 +921,7 @@ LIB_OBJS += progress.o LIB_OBJS += prompt.o LIB_OBJS += protocol.o LIB_OBJS += quote.o +LIB_OBJS += range-diff.o LIB_OBJS += reachable.o LIB_OBJS += read-cache.o LIB_OBJS += reflog-walk.o diff --git a/builtin/range-diff.c b/builtin/range-diff.c index 36788ea4f..3881da246 100644 --- a/builtin/range-diff.c +++ b/builtin/range-diff.c @@ -1,6 +1,7 @@ #include "cache.h" #include "builtin.h" #include "parse-options.h" +#include "range-diff.h" static const char * const builtin_range_diff_usage[] = { N_("git range-diff [] .. .."), @@ -17,9 +18,49 @@ int cmd_range_diff(int argc, const char **argv, const char *prefix) N_("Percentage by which creation is weighted")), OPT_END() }; + int res = 0; + struct strbuf range1 = STRBUF_INIT, range2 = STRBUF_INIT; argc = parse_options(argc, argv, NULL, options, builtin_range_diff_usage, 0); - return 0; + if (argc == 2) { + if (!strstr(argv[0], "..")) + die(_("no .. in range: '%s'"), argv[0]); + strbuf_addstr(, argv[0]); + + if (!strstr(argv[1], "..")) + die(_("no .. in range: '%s'"), argv[1]); + strbuf_addstr(, argv[1]); + } else if (argc == 3) { + strbuf_addf(, "%s..%s", argv[0], argv[1]); + strbuf_addf(, "%s..%s", argv[0], argv[2]); + } else if (argc == 1) { + const char *b = strstr(argv[0], "..."), *a = argv[0]; + int a_len; + + if (!b) + die(_("single arg format requires a symmetric range")); + + a_len = (int)(b - a); + if (!a_len) { + a = "HEAD"; + a_len = strlen(a); + } + b += 3; + if (!*b) + b = "HEAD"; + strbuf_addf(, "%s..%.*s", b, a_len, a); + strbuf_addf(, "%.*s..%s", a_len, a, b); + } else { + error(_("need two commit ranges")); + usage_with_options(builtin_range_diff_usage, options); + } + + res = show_range_diff(range1.buf, range2.buf, creation_factor); + + strbuf_release(); + strbuf_release(); + + return res; } diff --git a/range-diff.c b/range-diff.c new file mode 100644 index 0..15d418afa --- /dev/null +++ b/range-diff.c @@ -0,0 +1,311 @@ +#include "cache.h" +#include "range-diff.h" +#include "string-list.h" +#include "run-command.h" +#include "argv-array.h" +#include "hashmap.h" +#include "xdiff-interface.h" +#include "linear-assignment.h" + +struct patch_util { + /* For the search for an exact match */ + struct hashmap_entry e; + const char *diff, *patch; + + int i; + int diffsize; + size_t diff_offset; + /* the index of the matching item in the other branch, or -1 */ + int matching; + struct object_id oid; +}; + +/* + * Reads the patches into a string list, with the `util` field being populated + * as struct object_id (will need to be free()d). + */ +static int read_patches(const char *range, struct string_list *list) +{ + struct child_process cp = CHILD_PROCESS_INIT; + FILE *in; + struct strbuf buf = STRBUF_INIT, line = STRBUF_INIT; + struct patch_util *util = NULL; + int in_header = 1; + + argv_array_pushl(, "log", "--no-color", "-p", "--no-merges", + "--reverse", "--date-order", "--decorate=no", + "--no-abbrev-commit", range, + NULL); + cp.out =