On Wed, May 18, 2011 at 8:33 AM, Johan Corveleyn <[email protected]> wrote:
> On Wed, May 18, 2011 at 1:56 AM, Morten Kloster <[email protected]> wrote:
>> Log message:
>>
>> Speed-up of libsvn_diff using token counts
>> By using indices, not node pointers, to refer to tokens, and counting
>> the number of each token, the longest common subsequence (lcs)
>> algorithm at the heart of libsvn_diff becomes much faster in many
>> situations by skipping tokens that are unique to one source or the other.
>>
>> * subversion/libsvn_diff/token.c
>> (svn_diff__node_t): Added 'index' member.
>> (svn_diff__tree_t): Added 'node_count' member.
>> (svn_diff__get_token_count): New method.
>> (tree_insert_token): Assigns indices to nodes.
>> (svn_diff__get_tokens): Uses index, not pointer,
>> to identify node in position struct.
>>
>> * subversion/libsvn_diff/lcs.c
>> (svn_diff__snake): Takes token counts as additional argument.
>> Skips tokens that are unique to one or the other file.
>> (svn_diff__lcs): Takes token counts and number of different tokens
>> as additional arguments. Calculates number of tokens unique to
>> each source and uses this to compute effective length difference.
>>
>> * subversion/libsvn_diff/diff.h
>> (svn_diff__position_t): Changed node member from being
>> a pointer to the node to an integer index for the node.
>> Updated method declarations.
>>
>> * subversion/libsvn_diff/diff.c
>> * subversion/libsvn_diff/diff3.c
>> * subversion/libsvn_diff/diff4.c
>> (svn_diff_diff_2, svn_diff_diff_3, svn_diff_diff_4): Count the
>> number of times each token appears in each source.
>> (svn_diff__resolve_conflict): Takes number of different tokens
>> as additional argument. Counts the number of times
>> each token appears in each (partial) source.
>>
>>
>>
>>
>>
>>
>> Comments:
>> This patch can reduce the time required for a diff by a large factor
>> when comparing long files with large differences. As an extreme case,
>> splitting sqlite.c into two roughly equal parts and diffing them against
>> each other took about a minute on my laptop with the original code,
>> and 7-8 seconds with my patch. The speed-up is gained only if a
>> large fraction of the changed tokens are unique to one source or the
>> other. I can not see that the change would cause a significant
>> performance degradation in any reasonable case.
>>
>> Counting the number of times each token appears is also useful for
>> potential additional features in the future, such as identifying moved
>> blocks outside of the longest common subsequence.
>>
>> I have tested only svn_diff_diff_2 (that it gives identical results as
>> before), but the changes to svn_diff_diff3 and svn_diff_diff4 are
>> completely analogous. Tested on a Windows 7 64-bit machine,
>> TortoiseSVN build script.
>>
>>
>> Morten Kloster (first patch)
>
> Interesting! The patch seems to be missing though. The list software
> tends to strip off attachments if they don't have a text/plain
> mime-type. Can you try sending it again with a .txt extension?
>
> --
> Johan
>
Sorry, here's the patch with .txt extension. I found a bug in the
first version anyway, so it was just as well. :)
And of course I meant sqlite3.c for the stress test.
Morten
Index: subversion/libsvn_diff/diff.c
===================================================================
--- subversion/libsvn_diff/diff.c (revision 1104713)
+++ subversion/libsvn_diff/diff.c (working copy)
@@ -105,6 +105,10 @@
{
svn_diff__tree_t *tree;
svn_diff__position_t *position_list[2];
+ apr_uint32_t num_tokens;
+ apr_uint32_t token_index;
+ svn_diff__position_t *current;
+ apr_uint32_t *token_counts[2];
svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
svn_diff_datasource_modified};
svn_diff__lcs_t *lcs;
@@ -138,6 +142,8 @@
prefix_lines,
subpool));
+ num_tokens = svn_diff__get_node_count(tree);
+
/* The cool part is that we don't need the tokens anymore.
* Allow the app to clean them up if it wants to.
*/
@@ -147,8 +153,35 @@
/* We don't need the nodes in the tree either anymore, nor the tree itself */
svn_pool_destroy(treepool);
+ token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ for (token_index = 0; token_index < num_tokens; token_index++)
+ token_counts[0][token_index] = token_counts[1][token_index] = 0;
+
+ current = position_list[0];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[0][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[0]);
+ }
+ current = position_list[1];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[1][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[1]);
+ }
+
/* Get the lcs */
- lcs = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+ lcs = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+ token_counts[1], num_tokens, prefix_lines,
suffix_lines, subpool);
/* Produce the diff */
Index: subversion/libsvn_diff/diff.h
===================================================================
--- subversion/libsvn_diff/diff.h (revision 1104713)
+++ subversion/libsvn_diff/diff.h (working copy)
@@ -62,7 +62,7 @@
struct svn_diff__position_t
{
svn_diff__position_t *next;
- svn_diff__node_t *node;
+ apr_int32_t node;
apr_off_t offset;
};
@@ -91,12 +91,21 @@
svn_diff__lcs_t *
svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring)
*/
svn_diff__position_t *position_list2, /* pointer to tail (ring)
*/
+ apr_uint32_t *token_counts_list1, /* array of counts */
+ apr_uint32_t *token_counts_list2, /* array of counts */
+ apr_uint32_t num_tokens,
apr_off_t prefix_lines,
apr_off_t suffix_lines,
apr_pool_t *pool);
/*
+ * Returns number of tokens in a tree
+ */
+apr_uint32_t
+svn_diff__get_node_count(svn_diff__tree_t *tree);
+
+/*
* Support functions to build a tree of token positions
*/
void
@@ -128,6 +137,7 @@
svn_diff__resolve_conflict(svn_diff_t *hunk,
svn_diff__position_t **position_list1,
svn_diff__position_t **position_list2,
+ apr_uint32_t num_tokens,
apr_pool_t *pool);
Index: subversion/libsvn_diff/diff3.c
===================================================================
--- subversion/libsvn_diff/diff3.c (revision 1104713)
+++ subversion/libsvn_diff/diff3.c (working copy)
@@ -38,208 +38,232 @@
svn_diff__resolve_conflict(svn_diff_t *hunk,
svn_diff__position_t **position_list1,
svn_diff__position_t **position_list2,
+ apr_uint32_t num_tokens,
apr_pool_t *pool)
{
- apr_off_t modified_start = hunk->modified_start + 1;
- apr_off_t latest_start = hunk->latest_start + 1;
- apr_off_t common_length;
- apr_off_t modified_length = hunk->modified_length;
- apr_off_t latest_length = hunk->latest_length;
- svn_diff__position_t *start_position[2];
- svn_diff__position_t *position[2];
- svn_diff__lcs_t *lcs = NULL;
- svn_diff__lcs_t **lcs_ref = &lcs;
- svn_diff_t **diff_ref = &hunk->resolved_diff;
- apr_pool_t *subpool;
+ apr_off_t modified_start = hunk->modified_start + 1;
+ apr_off_t latest_start = hunk->latest_start + 1;
+ apr_off_t common_length;
+ apr_off_t modified_length = hunk->modified_length;
+ apr_off_t latest_length = hunk->latest_length;
+ svn_diff__position_t *start_position[2];
+ svn_diff__position_t *position[2];
+ apr_uint32_t token_index;
+ svn_diff__position_t *current;
+ apr_uint32_t *token_counts[2];
+ svn_diff__lcs_t *lcs = NULL;
+ svn_diff__lcs_t **lcs_ref = &lcs;
+ svn_diff_t **diff_ref = &hunk->resolved_diff;
+ apr_pool_t *subpool;
- /* First find the starting positions for the
- * comparison
- */
+ /* First find the starting positions for the
+ * comparison
+ */
- start_position[0] = *position_list1;
- start_position[1] = *position_list2;
+ start_position[0] = *position_list1;
+ start_position[1] = *position_list2;
- while (start_position[0]->offset < modified_start)
- start_position[0] = start_position[0]->next;
+ while (start_position[0]->offset < modified_start)
+ start_position[0] = start_position[0]->next;
- while (start_position[1]->offset < latest_start)
- start_position[1] = start_position[1]->next;
+ while (start_position[1]->offset < latest_start)
+ start_position[1] = start_position[1]->next;
- position[0] = start_position[0];
- position[1] = start_position[1];
+ position[0] = start_position[0];
+ position[1] = start_position[1];
- common_length = modified_length < latest_length
- ? modified_length : latest_length;
+ common_length = modified_length < latest_length
+ ? modified_length : latest_length;
- while (common_length > 0
- && position[0]->node == position[1]->node)
- {
- position[0] = position[0]->next;
- position[1] = position[1]->next;
+ while (common_length > 0
+ && position[0]->node == position[1]->node)
+ {
+ position[0] = position[0]->next;
+ position[1] = position[1]->next;
- common_length--;
- }
+ common_length--;
+ }
- if (common_length == 0
- && modified_length == latest_length)
- {
- hunk->type = svn_diff__type_diff_common;
- hunk->resolved_diff = NULL;
+ if (common_length == 0
+ && modified_length == latest_length)
+ {
+ hunk->type = svn_diff__type_diff_common;
+ hunk->resolved_diff = NULL;
- *position_list1 = position[0];
- *position_list2 = position[1];
+ *position_list1 = position[0];
+ *position_list2 = position[1];
- return;
- }
+ return;
+ }
- hunk->type = svn_diff__type_conflict;
+ hunk->type = svn_diff__type_conflict;
- /* ### If we have a conflict we can try to find the
- * ### common parts in it by getting an lcs between
- * ### modified (start to start + length) and
- * ### latest (start to start + length).
- * ### We use this lcs to create a simple diff. Only
- * ### where there is a diff between the two, we have
- * ### a conflict.
- * ### This raises a problem; several common diffs and
- * ### conflicts can occur within the same original
- * ### block. This needs some thought.
- * ###
- * ### NB: We can use the node _pointers_ to identify
- * ### different tokens
- */
+ /* ### If we have a conflict we can try to find the
+ * ### common parts in it by getting an lcs between
+ * ### modified (start to start + length) and
+ * ### latest (start to start + length).
+ * ### We use this lcs to create a simple diff. Only
+ * ### where there is a diff between the two, we have
+ * ### a conflict.
+ * ### This raises a problem; several common diffs and
+ * ### conflicts can occur within the same original
+ * ### block. This needs some thought.
+ * ###
+ * ### NB: We can use the node _pointers_ to identify
+ * ### different tokens
+ */
- subpool = svn_pool_create(pool);
+ subpool = svn_pool_create(pool);
- /* Calculate how much of the two sequences was
- * actually the same.
- */
- common_length = (modified_length < latest_length
- ? modified_length : latest_length)
- - common_length;
+ /* Calculate how much of the two sequences was
+ * actually the same.
+ */
+ common_length = (modified_length < latest_length
+ ? modified_length : latest_length)
+ - common_length;
- /* If there were matching symbols at the start of
- * both sequences, record that fact.
- */
- if (common_length > 0)
- {
- lcs = apr_palloc(subpool, sizeof(*lcs));
- lcs->next = NULL;
- lcs->position[0] = start_position[0];
- lcs->position[1] = start_position[1];
- lcs->length = common_length;
+ /* If there were matching symbols at the start of
+ * both sequences, record that fact.
+ */
+ if (common_length > 0)
+ {
+ lcs = apr_palloc(subpool, sizeof(*lcs));
+ lcs->next = NULL;
+ lcs->position[0] = start_position[0];
+ lcs->position[1] = start_position[1];
+ lcs->length = common_length;
- lcs_ref = &lcs->next;
- }
+ lcs_ref = &lcs->next;
+ }
- modified_length -= common_length;
- latest_length -= common_length;
+ modified_length -= common_length;
+ latest_length -= common_length;
- modified_start = start_position[0]->offset;
- latest_start = start_position[1]->offset;
+ modified_start = start_position[0]->offset;
+ latest_start = start_position[1]->offset;
- start_position[0] = position[0];
- start_position[1] = position[1];
+ start_position[0] = position[0];
+ start_position[1] = position[1];
- /* Create a new ring for svn_diff__lcs to grok.
- * We can safely do this given we don't need the
- * positions we processed anymore.
- */
- if (modified_length == 0)
- {
- *position_list1 = position[0];
- position[0] = NULL;
- }
- else
- {
- while (--modified_length)
- position[0] = position[0]->next;
+ /* Create a new ring for svn_diff__lcs to grok.
+ * We can safely do this given we don't need the
+ * positions we processed anymore.
+ */
+ if (modified_length == 0)
+ {
+ *position_list1 = position[0];
+ position[0] = NULL;
+ }
+ else
+ {
+ while (--modified_length)
+ position[0] = position[0]->next;
- *position_list1 = position[0]->next;
- position[0]->next = start_position[0];
- }
+ *position_list1 = position[0]->next;
+ position[0]->next = start_position[0];
+ }
- if (latest_length == 0)
- {
- *position_list2 = position[1];
- position[1] = NULL;
- }
- else
- {
- while (--latest_length)
- position[1] = position[1]->next;
+ if (latest_length == 0)
+ {
+ *position_list2 = position[1];
+ position[1] = NULL;
+ }
+ else
+ {
+ while (--latest_length)
+ position[1] = position[1]->next;
- *position_list2 = position[1]->next;
- position[1]->next = start_position[1];
- }
+ *position_list2 = position[1]->next;
+ position[1]->next = start_position[1];
+ }
- *lcs_ref = svn_diff__lcs(position[0], position[1], 0, 0,
- subpool);
+ token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ for (token_index = 0; token_index < num_tokens; token_index++)
+ token_counts[0][token_index] = token_counts[1][token_index] = 0;
- /* Fix up the EOF lcs element in case one of
- * the two sequences was NULL.
- */
- if ((*lcs_ref)->position[0]->offset == 1)
- (*lcs_ref)->position[0] = *position_list1;
+ current = position[0];
+ do
+ {
+ token_counts[0][current->node]++;
+ current = current->next;
+ }
+ while (current != position[0]);
+ current = position[1];
+ do
+ {
+ token_counts[1][current->node]++;
+ current = current->next;
+ }
+ while (current != position[1]);
- if ((*lcs_ref)->position[1]->offset == 1)
- (*lcs_ref)->position[1] = *position_list2;
+ *lcs_ref = svn_diff__lcs(position[0], position[1], token_counts[0],
+ token_counts[1], num_tokens, 0, 0, subpool);
- /* Restore modified_length and latest_length */
- modified_length = hunk->modified_length;
- latest_length = hunk->latest_length;
+ /* Fix up the EOF lcs element in case one of
+ * the two sequences was NULL.
+ */
+ if ((*lcs_ref)->position[0]->offset == 1)
+ (*lcs_ref)->position[0] = *position_list1;
- /* Produce the resolved diff */
- while (1)
- {
- if (modified_start < lcs->position[0]->offset
- || latest_start < lcs->position[1]->offset)
- {
- (*diff_ref) = apr_palloc(pool, sizeof(**diff_ref));
+ if ((*lcs_ref)->position[1]->offset == 1)
+ (*lcs_ref)->position[1] = *position_list2;
- (*diff_ref)->type = svn_diff__type_conflict;
- (*diff_ref)->original_start = hunk->original_start;
- (*diff_ref)->original_length = hunk->original_length;
- (*diff_ref)->modified_start = modified_start - 1;
- (*diff_ref)->modified_length = lcs->position[0]->offset
+ /* Restore modified_length and latest_length */
+ modified_length = hunk->modified_length;
+ latest_length = hunk->latest_length;
+
+ /* Produce the resolved diff */
+ while (1)
+ {
+ if (modified_start < lcs->position[0]->offset
+ || latest_start < lcs->position[1]->offset)
+ {
+ (*diff_ref) = apr_palloc(pool, sizeof(**diff_ref));
+
+ (*diff_ref)->type = svn_diff__type_conflict;
+ (*diff_ref)->original_start = hunk->original_start;
+ (*diff_ref)->original_length = hunk->original_length;
+ (*diff_ref)->modified_start = modified_start - 1;
+ (*diff_ref)->modified_length = lcs->position[0]->offset
- modified_start;
- (*diff_ref)->latest_start = latest_start - 1;
- (*diff_ref)->latest_length = lcs->position[1]->offset
+ (*diff_ref)->latest_start = latest_start - 1;
+ (*diff_ref)->latest_length = lcs->position[1]->offset
- latest_start;
- (*diff_ref)->resolved_diff = NULL;
+ (*diff_ref)->resolved_diff = NULL;
- diff_ref = &(*diff_ref)->next;
- }
+ diff_ref = &(*diff_ref)->next;
+ }
- /* Detect the EOF */
- if (lcs->length == 0)
- break;
+ /* Detect the EOF */
+ if (lcs->length == 0)
+ break;
- modified_start = lcs->position[0]->offset;
- latest_start = lcs->position[1]->offset;
+ modified_start = lcs->position[0]->offset;
+ latest_start = lcs->position[1]->offset;
- (*diff_ref) = apr_palloc(pool, sizeof(**diff_ref));
+ (*diff_ref) = apr_palloc(pool, sizeof(**diff_ref));
- (*diff_ref)->type = svn_diff__type_diff_common;
- (*diff_ref)->original_start = hunk->original_start;
- (*diff_ref)->original_length = hunk->original_length;
- (*diff_ref)->modified_start = modified_start - 1;
- (*diff_ref)->modified_length = lcs->length;
- (*diff_ref)->latest_start = latest_start - 1;
- (*diff_ref)->latest_length = lcs->length;
- (*diff_ref)->resolved_diff = NULL;
+ (*diff_ref)->type = svn_diff__type_diff_common;
+ (*diff_ref)->original_start = hunk->original_start;
+ (*diff_ref)->original_length = hunk->original_length;
+ (*diff_ref)->modified_start = modified_start - 1;
+ (*diff_ref)->modified_length = lcs->length;
+ (*diff_ref)->latest_start = latest_start - 1;
+ (*diff_ref)->latest_length = lcs->length;
+ (*diff_ref)->resolved_diff = NULL;
- diff_ref = &(*diff_ref)->next;
+ diff_ref = &(*diff_ref)->next;
- modified_start += lcs->length;
- latest_start += lcs->length;
+ modified_start += lcs->length;
+ latest_start += lcs->length;
- lcs = lcs->next;
- }
+ lcs = lcs->next;
+ }
- *diff_ref = NULL;
+ *diff_ref = NULL;
- svn_pool_destroy(subpool);
+ svn_pool_destroy(subpool);
}
@@ -251,6 +275,10 @@
{
svn_diff__tree_t *tree;
svn_diff__position_t *position_list[3];
+ apr_uint32_t num_tokens;
+ apr_uint32_t token_index;
+ svn_diff__position_t *current;
+ apr_uint32_t *token_counts[3];
svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
svn_diff_datasource_modified,
svn_diff_datasource_latest};
@@ -292,6 +320,8 @@
prefix_lines,
subpool));
+ num_tokens = svn_diff__get_node_count(tree);
+
/* Get rid of the tokens, we don't need them to calc the diff */
if (vtable->token_discard_all != NULL)
vtable->token_discard_all(diff_baton);
@@ -299,10 +329,52 @@
/* We don't need the nodes in the tree either anymore, nor the tree itself */
svn_pool_destroy(treepool);
+ token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[2] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ for (token_index = 0; token_index < num_tokens; token_index++)
+ {
+ token_counts[0][token_index] = token_counts[1][token_index] = 0;
+ token_counts[2][token_index] = 0;
+ }
+
+ current = position_list[0];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[0][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[0]);
+ }
+ current = position_list[1];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[1][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[1]);
+ }
+ current = position_list[2];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[2][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[2]);
+ }
+
/* Get the lcs for original-modified and original-latest */
- lcs_om = svn_diff__lcs(position_list[0], position_list[1], prefix_lines,
+ lcs_om = svn_diff__lcs(position_list[0], position_list[1], token_counts[0],
+ token_counts[1], num_tokens, prefix_lines,
suffix_lines, subpool);
- lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+ lcs_ol = svn_diff__lcs(position_list[0], position_list[2], token_counts[0],
+ token_counts[2], num_tokens, prefix_lines,
suffix_lines, subpool);
/* Produce a merged diff */
@@ -435,6 +507,7 @@
svn_diff__resolve_conflict(*diff_ref,
&position_list[1],
&position_list[2],
+ num_tokens,
pool);
}
else if (is_modified)
Index: subversion/libsvn_diff/diff4.c
===================================================================
--- subversion/libsvn_diff/diff4.c (revision 1104713)
+++ subversion/libsvn_diff/diff4.c (working copy)
@@ -173,6 +173,10 @@
{
svn_diff__tree_t *tree;
svn_diff__position_t *position_list[4];
+ apr_uint32_t num_tokens;
+ apr_uint32_t token_index;
+ svn_diff__position_t *current;
+ apr_uint32_t *token_counts[4];
svn_diff_datasource_e datasource[] = {svn_diff_datasource_original,
svn_diff_datasource_modified,
svn_diff_datasource_latest,
@@ -227,6 +231,8 @@
prefix_lines,
subpool2));
+ num_tokens = svn_diff__get_node_count(tree);
+
/* Get rid of the tokens, we don't need them to calc the diff */
if (vtable->token_discard_all != NULL)
vtable->token_discard_all(diff_baton);
@@ -234,8 +240,61 @@
/* We don't need the nodes in the tree either anymore, nor the tree itself */
svn_pool_clear(subpool3);
+ token_counts[0] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[1] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[2] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ token_counts[3] = apr_palloc(subpool, num_tokens * sizeof(**token_counts));
+ for (token_index = 0; token_index < num_tokens; token_index++)
+ {
+ token_counts[0][token_index] = token_counts[1][token_index] = 0;
+ token_counts[2][token_index] = token_counts[3][token_index] = 0;
+ }
+
+ current = position_list[0];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[0][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[0]);
+ }
+ current = position_list[1];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[1][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[1]);
+ }
+ current = position_list[2];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[2][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[2]);
+ }
+ current = position_list[3];
+ if (current != NULL)
+ {
+ do
+ {
+ token_counts[3][current->node]++;
+ current = current->next;
+ }
+ while (current != position_list[3]);
+ }
+
/* Get the lcs for original - latest */
- lcs_ol = svn_diff__lcs(position_list[0], position_list[2], prefix_lines,
+ lcs_ol = svn_diff__lcs(position_list[0], position_list[2],
+ token_counts[0], token_counts[2],
+ num_tokens, prefix_lines,
suffix_lines, subpool3);
diff_ol = svn_diff__diff(lcs_ol, 1, 1, TRUE, pool);
@@ -257,7 +316,9 @@
/* Get the lcs for common ancestor - original
* Do reverse adjustements
*/
- lcs_adjust = svn_diff__lcs(position_list[3], position_list[2], prefix_lines,
+ lcs_adjust = svn_diff__lcs(position_list[3], position_list[2],
+ token_counts[3], token_counts[2],
+ num_tokens, prefix_lines,
suffix_lines, subpool3);
diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
adjust_diff(diff_ol, diff_adjust);
@@ -267,7 +328,9 @@
/* Get the lcs for modified - common ancestor
* Do forward adjustments
*/
- lcs_adjust = svn_diff__lcs(position_list[1], position_list[3], prefix_lines,
+ lcs_adjust = svn_diff__lcs(position_list[1], position_list[3],
+ token_counts[1], token_counts[3],
+ num_tokens, prefix_lines,
suffix_lines, subpool3);
diff_adjust = svn_diff__diff(lcs_adjust, 1, 1, FALSE, subpool3);
adjust_diff(diff_ol, diff_adjust);
@@ -283,7 +346,7 @@
if (hunk->type == svn_diff__type_conflict)
{
svn_diff__resolve_conflict(hunk, &position_list[1],
- &position_list[2], pool);
+ &position_list[2], num_tokens, pool);
}
}
Index: subversion/libsvn_diff/lcs.c
===================================================================
--- subversion/libsvn_diff/lcs.c (revision 1104713)
+++ subversion/libsvn_diff/lcs.c (working copy)
@@ -51,6 +51,7 @@
svn_diff__snake(apr_off_t k,
svn_diff__snake_t *fp,
int idx,
+ apr_uint32_t *token_counts[2],
svn_diff__lcs_t **freelist,
apr_pool_t *pool)
{
@@ -92,6 +93,11 @@
}
+ if (previous_lcs)
+ {
+ previous_lcs->refcount++;
+ }
+
/* ### Optimization, skip all positions that don't have matchpoints
* ### anyway. Beware of the sentinel, don't skip it!
*/
@@ -99,41 +105,44 @@
position[0] = start_position[0];
position[1] = start_position[1];
- while (position[0]->node == position[1]->node)
+ while (1)
{
- position[0] = position[0]->next;
- position[1] = position[1]->next;
- }
-
- if (position[1] != start_position[1])
- {
- lcs = *freelist;
- if (lcs)
+ while (position[0]->node == position[1]->node)
{
- *freelist = lcs->next;
+ position[0] = position[0]->next;
+ position[1] = position[1]->next;
}
- else
+
+ if (position[1] != start_position[1])
{
- lcs = apr_palloc(pool, sizeof(*lcs));
+ lcs = *freelist;
+ if (lcs)
+ {
+ *freelist = lcs->next;
+ }
+ else
+ {
+ lcs = apr_palloc(pool, sizeof(*lcs));
+ }
+
+ lcs->position[idx] = start_position[0];
+ lcs->position[abs(1 - idx)] = start_position[1];
+ lcs->length = position[1]->offset - start_position[1]->offset;
+ lcs->next = previous_lcs;
+ lcs->refcount = 1;
+ previous_lcs = lcs;
+ start_position[0] = position[0];
+ start_position[1] = position[1];
}
-
- lcs->position[idx] = start_position[0];
- lcs->position[abs(1 - idx)] = start_position[1];
- lcs->length = position[1]->offset - start_position[1]->offset;
- lcs->next = previous_lcs;
- lcs->refcount = 1;
- fp[k].lcs = lcs;
+ if (position[0]->node >= 0 && token_counts[1][position[0]->node] == 0)
+ start_position[0] = position[0] = position[0]->next;
+ else if (position[1]->node >= 0 && token_counts[0][position[1]->node] ==
0)
+ start_position[1] = position[1] = position[1]->next;
+ else
+ break;
}
- else
- {
- fp[k].lcs = previous_lcs;
- }
- if (previous_lcs)
- {
- previous_lcs->refcount++;
- }
-
+ fp[k].lcs = previous_lcs;
fp[k].position[0] = position[0];
fp[k].position[1] = position[1];
@@ -188,12 +197,18 @@
svn_diff__lcs_t *
svn_diff__lcs(svn_diff__position_t *position_list1, /* pointer to tail (ring)
*/
svn_diff__position_t *position_list2, /* pointer to tail (ring)
*/
+ apr_uint32_t *token_counts_list1, /* array of counts */
+ apr_uint32_t *token_counts_list2, /* array of counts */
+ apr_uint32_t num_tokens,
apr_off_t prefix_lines,
apr_off_t suffix_lines,
apr_pool_t *pool)
{
int idx;
apr_off_t length[2];
+ apr_uint32_t *token_counts[2];
+ apr_uint32_t unique_count[2];
+ apr_uint32_t token_index;
svn_diff__snake_t *fp;
apr_off_t d;
apr_off_t k;
@@ -231,10 +246,19 @@
return lcs;
}
+ unique_count[1] = unique_count[0] = 0;
+ for (token_index = 0; token_index < num_tokens; token_index++)
+ {
+ if (token_counts_list1[token_index] == 0)
+ unique_count[1] += token_counts_list2[token_index];
+ if (token_counts_list2[token_index] == 0)
+ unique_count[0] += token_counts_list1[token_index];
+ }
+
/* Calculate length of both sequences to be compared */
length[0] = position_list1->offset - position_list1->next->offset + 1;
length[1] = position_list2->offset - position_list2->next->offset + 1;
- idx = length[0] > length[1] ? 1 : 0;
+ idx = length[0] - unique_count[0] > length[1] - unique_count[1] ? 1 : 0;
/* strikerXXX: here we allocate the furthest point array, which is
* strikerXXX: sized M + N + 3 (!)
@@ -246,18 +270,20 @@
sentinel_position[idx].next = position_list1->next;
position_list1->next = &sentinel_position[idx];
sentinel_position[idx].offset = position_list1->offset + 1;
+ token_counts[idx] = token_counts_list1;
sentinel_position[abs(1 - idx)].next = position_list2->next;
position_list2->next = &sentinel_position[abs(1 - idx)];
sentinel_position[abs(1 - idx)].offset = position_list2->offset + 1;
+ token_counts[abs(1 - idx)] = token_counts_list2;
- /* These are never dereferenced, only compared by value, so we
- * can safely fake these up and the void* cast is OK.
+ /* Negative indices will not be used elsewhere
*/
- sentinel_position[0].node = (void*)&sentinel_position[0];
- sentinel_position[1].node = (void*)&sentinel_position[1];
+ sentinel_position[0].node = -1;
+ sentinel_position[1].node = -2;
- d = length[abs(1 - idx)] - length[idx];
+ d = length[abs(1 - idx)] - unique_count[abs(1 - idx)]
+ - (length[idx] - unique_count[idx]);
/* k = -1 will be the first to be used to get previous
* position information from, make sure it holds sane
@@ -272,12 +298,12 @@
/* Forward */
for (k = -p; k < d; k++)
{
- svn_diff__snake(k, fp, idx, &lcs_freelist, pool);
+ svn_diff__snake(k, fp, idx, token_counts, &lcs_freelist, pool);
}
for (k = d + p; k >= d; k--)
{
- svn_diff__snake(k, fp, idx, &lcs_freelist, pool);
+ svn_diff__snake(k, fp, idx, token_counts, &lcs_freelist, pool);
}
p++;
Index: subversion/libsvn_diff/token.c
===================================================================
--- subversion/libsvn_diff/token.c (revision 1104713)
+++ subversion/libsvn_diff/token.c (working copy)
@@ -46,6 +46,7 @@
svn_diff__node_t *right;
apr_uint32_t hash;
+ apr_int32_t index;
void *token;
};
@@ -53,10 +54,20 @@
{
svn_diff__node_t *root[SVN_DIFF__HASH_SIZE];
apr_pool_t *pool;
+ apr_uint32_t node_count;
};
/*
+ * Returns number of tokens in a tree
+ */
+apr_uint32_t
+svn_diff__get_node_count(svn_diff__tree_t *tree)
+{
+ return tree->node_count;
+}
+
+/*
* Support functions to build a tree of token positions
*/
@@ -65,6 +76,7 @@
{
*tree = apr_pcalloc(pool, sizeof(**tree));
(*tree)->pool = pool;
+ (*tree)->node_count = 0;
}
@@ -122,6 +134,7 @@
new_node->right = NULL;
new_node->hash = hash;
new_node->token = token;
+ new_node->index = tree->node_count++;
*node = *node_ref = new_node;
@@ -168,7 +181,7 @@
/* Create a new position */
position = apr_palloc(pool, sizeof(*position));
position->next = NULL;
- position->node = node;
+ position->node = node->index;
position->offset = offset;
*position_ref = position;