Santhosh has uploaded a new change for review.
https://gerrit.wikimedia.org/r/179424
Change subject: MT: Improve the annotation mapping implementation
......................................................................
MT: Improve the annotation mapping implementation
1. Correct the range overlap checking algorithm. Test case:
Editor-in-chief page from en to ca, see the first paragraph
2. Annotation Mapping: Use best match when multiple approximate
matches are found
Test case included
Change-Id: I9b318734646dcd04697ef0086fbc1be043b6b657
---
M mt/MTClient.js
M mt/annotationmapper/SubsequenceMatcher.js
M tests/mt/Apertium.test.js
3 files changed, 57 insertions(+), 8 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver
refs/changes/24/179424/1
diff --git a/mt/MTClient.js b/mt/MTClient.js
index 0ad344b..d6d5225 100644
--- a/mt/MTClient.js
+++ b/mt/MTClient.js
@@ -84,6 +84,7 @@
targetDoc.items = Array.prototype.slice.call( arguments, 0 );
deferred.resolve( targetDoc.getHtml() );
}, function ( error ) {
+ logger.error( error.toString() );
deferred.reject( error );
} );
@@ -227,12 +228,15 @@
* range array
*/
function isOverlappingRange( range, rangeArray ) {
- var i;
+ var i, rangeStart, rangeEnd, start, end;
+ rangeStart = range.start;
+ rangeEnd = range.start + range.length;
for ( i = 0; i < rangeArray.length; i++ ) {
- if ( rangeArray[ i ].start <= range.start &&
- rangeArray[ i ].start + rangeArray[ i ].length >=
range.start + range.length
- ) {
+ start = rangeArray[ i ].start;
+ end = start + rangeArray[ i ].length;
+ if ( start >= rangeStart && end <= rangeEnd ||
+ start <= rangeStart && end >= rangeEnd ) {
return true;
}
}
@@ -290,7 +294,6 @@
} );
}
}
-
return rangeMappings;
};
@@ -309,15 +312,19 @@
* @return.length {number} Length of matched sequence in the text.
*/
MTClient.prototype.findSubSequence = function ( text, sequence, language,
occurance ) {
- var indices, matcher = new SubSequenceMatcher( language );
+ var indices, matcher;
+ matcher = new SubSequenceMatcher( language );
indices = matcher.findFuzzyMatch( text, sequence );
// Find the nth occurance position
+
if ( !indices || indices.length < occurance ) {
return null;
- } else {
- return indices[ occurance ];
}
+ if ( occurance === 0 ) {
+ return matcher.bestMatch( indices );
+ }
+ return indices[ occurance ];
};
/**
diff --git a/mt/annotationmapper/SubsequenceMatcher.js
b/mt/annotationmapper/SubsequenceMatcher.js
index e6f61d0..e88878a 100644
--- a/mt/annotationmapper/SubsequenceMatcher.js
+++ b/mt/annotationmapper/SubsequenceMatcher.js
@@ -78,6 +78,7 @@
return null;
}
+ //console.log( 'Searching [' + substring + '] in [' + text + ']' );
substringNGrams = this.getWords( substring );
substringWordsLength = substringNGrams.length;
textNGrams = this.getNGrams( text, substringNGrams.length );
@@ -91,6 +92,12 @@
if ( this.isApproximateEqual( word, substringNGrams[ j
] ) ) {
match = !match ? word : match + ' ' + word;
} else {
+ // The match sequence broke.
+ // Example:
+ // Searching [editor de página del editorial] in
[the new york times, el cual tiene un editor
+ // ejecutivo sobre las páginas noticiosas y un
editor de página del editorial encima páginas de opinión.],
+ // [editor ejecutivo] will be the match till
here. We do not ignore that match, but will look for better
+ // matches.
break;
}
}
@@ -115,7 +122,31 @@
startIndex = index + match.length;
}
+
return indices;
};
+/**
+ * Sort function for matching positions based on length.
+ */
+function comparePositions( positionA, positionB ) {
+ if ( positionA.length < positionB.length ) {
+ return -1;
+ }
+ if ( positionA.length > positionB.length ) {
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * Find the best match among candidate positions by longest match.
+ * @param {Object[]} positions
+ * @return {Object} best match position.
+ */
+SubSequenceMatcher.prototype.bestMatch = function ( positions ) {
+ positions.sort( comparePositions );
+ return positions[ positions.length - 1 ];
+};
+
module.exports = SubSequenceMatcher;
diff --git a/tests/mt/Apertium.test.js b/tests/mt/Apertium.test.js
index b589fe4..c2c86d6 100644
--- a/tests/mt/Apertium.test.js
+++ b/tests/mt/Apertium.test.js
@@ -76,6 +76,17 @@
big: 'Grande',
red: 'Rojo'
}
+ },
+ {
+ title: 'Find longest match among multiple matches',
+ source: '<p id="8"><span class="cx-segment"
data-segmentid="9"><a class="cx-link" data-linkid="17"
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The
New York Times</a>, which has an <b>executive editor</b> over the news pages
and an <b>editorial page editor</b> over opinion pages.</span></p>',
+ target: '<p id="8"><span class="cx-segment"
data-segmentid="9"><a class="cx-link" data-linkid="17"
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The
New York Times</a>, el cual tiene un <b>editor ejecutivo</b> sobre las páginas
noticiosas y un <b>editor de página del editorial</b> encima páginas de
opinión.</span></p>',
+ textTranslations: {
+ 'The New York Times, which has an executive editor over
the news pages and an editorial page editor over opinion pages.': 'The New York
Times, el cual tiene un editor ejecutivo sobre las páginas noticiosas y un
editor de página del editorial encima páginas de opinión.',
+ 'The New York Times': 'The New York Times',
+ 'executive editor': 'editor ejecutivo',
+ 'editorial page editor': 'editor de página del
editorial'
+ }
}
];
--
To view, visit https://gerrit.wikimedia.org/r/179424
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I9b318734646dcd04697ef0086fbc1be043b6b657
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits