Santhosh has uploaded a new change for review.
https://gerrit.wikimedia.org/r/179421
Change subject: Annotation Mapping: Use best match when multiple approximate
matches found
......................................................................
Annotation Mapping: Use best match when multiple approximate matches found
Change-Id: I75e43ea766e48ce40066ae5aa76849d5e0dd763f
---
M mt/MTClient.js
M mt/annotationmapper/SubsequenceMatcher.js
M tests/mt/Apertium.test.js
3 files changed, 49 insertions(+), 3 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver
refs/changes/21/179421/1
diff --git a/mt/MTClient.js b/mt/MTClient.js
index 129b2a5..7aa17b6 100644
--- a/mt/MTClient.js
+++ b/mt/MTClient.js
@@ -84,6 +84,7 @@
targetDoc.items = Array.prototype.slice.call( arguments, 0 );
deferred.resolve( targetDoc.getHtml() );
}, function ( error ) {
+ logger.error( error.toString() );
deferred.reject( error );
} );
@@ -291,7 +292,6 @@
} );
}
}
-
return rangeMappings;
};
@@ -310,15 +310,20 @@
* @return.length {number} Length of matched sequence in the text.
*/
MTClient.prototype.findSubSequence = function ( text, sequence, language,
occurance ) {
- var indices, matcher = new SubSequenceMatcher( language );
+ var indices, matcher;
+ matcher = new SubSequenceMatcher( language );
indices = matcher.findFuzzyMatch( text, sequence );
// Find the nth occurance position
+
if ( !indices || indices.length < occurance ) {
return null;
- } else {
+ }
+ if ( indices.length > occurance ) {
return indices[ occurance ];
}
+
+ return matcher.bestMatch( indices );
};
/**
diff --git a/mt/annotationmapper/SubsequenceMatcher.js
b/mt/annotationmapper/SubsequenceMatcher.js
index e6f61d0..d051e5f 100644
--- a/mt/annotationmapper/SubsequenceMatcher.js
+++ b/mt/annotationmapper/SubsequenceMatcher.js
@@ -78,6 +78,7 @@
return null;
}
+ // console.log( 'Searching [' + substring + '] in [' + text + ']' );
substringNGrams = this.getWords( substring );
substringWordsLength = substringNGrams.length;
textNGrams = this.getNGrams( text, substringNGrams.length );
@@ -91,6 +92,12 @@
if ( this.isApproximateEqual( word, substringNGrams[ j
] ) ) {
match = !match ? word : match + ' ' + word;
} else {
+ // The match sequence broke.
+ // Example:
+ // Searching [editor de página del editorial] in
[the new york times, el cual tiene un editor
+ // ejecutivo sobre las páginas noticiosas y un
editor de página del editorial encima páginas de opinión.],
+ // [editor ejecutivo] will be the match till
here. We do not ignore that match, but will look for better
+ // matches.
break;
}
}
@@ -118,4 +125,27 @@
return indices;
};
+/**
+ * Sort function for matching positions based on length.
+ */
+function comparePositions( positionA, positionB ) {
+ if ( positionA.length < positionB.length ) {
+ return -1;
+ }
+ if ( positionA.length > positionB.length ) {
+ return 1;
+ }
+ return 0;
+}
+
+/**
+ * Find the best match among candidate positions by longest match.
+ * @param {Object[]} positions
+ * @return {Object} best match position.
+ */
+SubSequenceMatcher.prototype.bestMatch = function ( positions ) {
+ positions.sort( comparePositions );
+ return positions[ positions.length - 1 ];
+};
+
module.exports = SubSequenceMatcher;
diff --git a/tests/mt/Apertium.test.js b/tests/mt/Apertium.test.js
index b589fe4..c2c86d6 100644
--- a/tests/mt/Apertium.test.js
+++ b/tests/mt/Apertium.test.js
@@ -76,6 +76,17 @@
big: 'Grande',
red: 'Rojo'
}
+ },
+ {
+ title: 'Find longest match among multiple matches',
+ source: '<p id="8"><span class="cx-segment"
data-segmentid="9"><a class="cx-link" data-linkid="17"
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The
New York Times</a>, which has an <b>executive editor</b> over the news pages
and an <b>editorial page editor</b> over opinion pages.</span></p>',
+ target: '<p id="8"><span class="cx-segment"
data-segmentid="9"><a class="cx-link" data-linkid="17"
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The
New York Times</a>, el cual tiene un <b>editor ejecutivo</b> sobre las páginas
noticiosas y un <b>editor de página del editorial</b> encima páginas de
opinión.</span></p>',
+ textTranslations: {
+ 'The New York Times, which has an executive editor over
the news pages and an editorial page editor over opinion pages.': 'The New York
Times, el cual tiene un editor ejecutivo sobre las páginas noticiosas y un
editor de página del editorial encima páginas de opinión.',
+ 'The New York Times': 'The New York Times',
+ 'executive editor': 'editor ejecutivo',
+ 'editorial page editor': 'editor de página del
editorial'
+ }
}
];
--
To view, visit https://gerrit.wikimedia.org/r/179421
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I75e43ea766e48ce40066ae5aa76849d5e0dd763f
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits