Jkroll has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/404293 )
Change subject: [WiP] Use character runs as an additional measure for change vs add+delete
......................................................................
[WiP] Use character runs as an additional measure for change vs add+delete
Change-Id: I2dafeca326dee2a594f7565d68f05128cf32acef
---
M DiffEngine.h
M Wikidiff2.h
2 files changed, 16 insertions(+), 4 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/php/wikidiff2 refs/changes/93/404293/1
diff --git a/DiffEngine.h b/DiffEngine.h
index 38dbe0e..f32b1a7 100644
--- a/DiffEngine.h
+++ b/DiffEngine.h
@@ -31,7 +31,7 @@
// helper function to calculate similarity of text lines, based on existing diff code.
// used in DiffEngine and Wikidiff2.
-double calculateSimilarity(TextUtil::WordVector& words1, TextUtil::WordVector&
words2, long long bailoutComplexity, int *opCountPtr = nullptr);
+double calculateSimilarity(TextUtil::WordVector& words1, TextUtil::WordVector&
words2, long long bailoutComplexity, int *runCountPtr = nullptr, int
*opCountPtr = nullptr);
/**
* Diff operation
@@ -195,7 +195,9 @@
TextUtil::WordVector words1, words2;
TextUtil::explodeWords(del, words1);
TextUtil::explodeWords(add, words2);
- return calculateSimilarity(words1, words2, bailoutComplexity) >
looksLikeChangeThreshold();
+ int runCount;
+ double similarity = calculateSimilarity(words1, words2,
bailoutComplexity, &runCount);
+ return similarity > looksLikeChangeThreshold();
}
// go through list of changed lines. if they are too dissimilar, convert to del+add.
@@ -677,7 +679,7 @@
engine.diff(from_lines, to_lines, *this, bailoutComplexity);
}
-inline double calculateSimilarity(TextUtil::WordVector& words1,
TextUtil::WordVector& words2, long long bailoutComplexity, int *opCountPtr /* =
nullptr*/)
+inline double calculateSimilarity(TextUtil::WordVector& words1,
TextUtil::WordVector& words2, long long bailoutComplexity, int *runCountPtr /*
= nullptr */, int *opCountPtr /* = nullptr*/)
{
typedef Diff<Word> WordDiff;
WordDiff diff(words1, words2, bailoutComplexity);
@@ -689,6 +691,8 @@
return a + (b->suffixEnd - b->bodyStart);
});
};
+ int runCount = 0;
+ int lastOp = -1;
for (int i = 0; i < diff.size(); ++i) {
int op = diff[i].op;
int charCount;
@@ -706,6 +710,10 @@
}
opCharCount[op] += charCount;
charsTotal += charCount;
+ if(op != lastOp) {
+ runCount++;
+ lastOp = op;
+ }
}
if (opCharCount[DiffOp<Word>::copy] == 0) {
similarity = 0.0;
@@ -723,6 +731,9 @@
}
}
+ if (runCountPtr)
+ *runCountPtr = runCount;
+
return similarity;
}
diff --git a/Wikidiff2.h b/Wikidiff2.h
index 185bfb6..91d0c03 100644
--- a/Wikidiff2.h
+++ b/Wikidiff2.h
@@ -40,6 +40,7 @@
struct DiffMapEntry
{
double similarity;
+ int runCount; // number of ChangeOp sequences (character counts).
int opCharCount[4] = { 0 };
int opIndexFrom, opLineFrom, opIndexTo, opLineTo;
bool lhsDisplayed = false, rhsDisplayed = false;
@@ -81,7 +82,7 @@
inline Wikidiff2::DiffMapEntry::DiffMapEntry(Wikidiff2::WordVector& words1,
Wikidiff2::WordVector& words2, int opIndexFrom_, int opLineFrom_, int
opIndexTo_, int opLineTo_):
opIndexFrom(opIndexFrom_), opLineFrom(opLineFrom_),
opIndexTo(opIndexTo_), opLineTo(opLineTo_)
{
- similarity = calculateSimilarity(words1, words2,
MAX_WORD_LEVEL_DIFF_COMPLEXITY, opCharCount);
+ similarity = calculateSimilarity(words1, words2,
MAX_WORD_LEVEL_DIFF_COMPLEXITY, &runCount, opCharCount);
}
inline bool Wikidiff2::AllowPrintMovedLineDiff::operator () (StringDiff &
linediff, int maxMovedLines)
--
To view, visit https://gerrit.wikimedia.org/r/404293
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I2dafeca326dee2a594f7565d68f05128cf32acef
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/php/wikidiff2
Gerrit-Branch: master
Gerrit-Owner: Jkroll <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits