Santhosh has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/179421

Change subject: Annotation Mapping: Use best match when multiple approximate 
matches found
......................................................................

Annotation Mapping: Use best match when multiple approximate matches found

Change-Id: I75e43ea766e48ce40066ae5aa76849d5e0dd763f
---
M mt/MTClient.js
M mt/annotationmapper/SubsequenceMatcher.js
M tests/mt/Apertium.test.js
3 files changed, 49 insertions(+), 3 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver 
refs/changes/21/179421/1

diff --git a/mt/MTClient.js b/mt/MTClient.js
index 129b2a5..7aa17b6 100644
--- a/mt/MTClient.js
+++ b/mt/MTClient.js
@@ -84,6 +84,7 @@
                targetDoc.items = Array.prototype.slice.call( arguments, 0 );
                deferred.resolve( targetDoc.getHtml() );
        }, function ( error ) {
+               logger.error( error.toString() );
                deferred.reject( error );
        } );
 
@@ -291,7 +292,6 @@
                        } );
                }
        }
-
        return rangeMappings;
 };
 
@@ -310,15 +310,20 @@
  * @return.length {number} Length of matched sequence in the text.
  */
 MTClient.prototype.findSubSequence = function ( text, sequence, language, 
occurance ) {
-       var indices, matcher = new SubSequenceMatcher( language );
+       var indices, matcher;
 
+       matcher = new SubSequenceMatcher( language );
        indices = matcher.findFuzzyMatch( text, sequence );
        // Find the nth occurance position
+
        if ( !indices || indices.length < occurance ) {
                return null;
-       } else {
+       }
+       if ( indices.length > occurance ) {
                return indices[ occurance ];
        }
+
+       return matcher.bestMatch( indices );
 };
 
 /**
diff --git a/mt/annotationmapper/SubsequenceMatcher.js 
b/mt/annotationmapper/SubsequenceMatcher.js
index e6f61d0..d051e5f 100644
--- a/mt/annotationmapper/SubsequenceMatcher.js
+++ b/mt/annotationmapper/SubsequenceMatcher.js
@@ -78,6 +78,7 @@
                return null;
        }
 
+       // console.log( 'Searching [' + substring + '] in [' + text + ']' );
        substringNGrams = this.getWords( substring );
        substringWordsLength = substringNGrams.length;
        textNGrams = this.getNGrams( text, substringNGrams.length );
@@ -91,6 +92,12 @@
                        if ( this.isApproximateEqual( word, substringNGrams[ j 
] ) ) {
                                match = !match ? word : match + ' ' + word;
                        } else {
+                               // The match sequence broke.
+                               // Example:
+                               // Searching [editor de página del editorial] in 
[the new york times, el cual tiene un editor
+                               // ejecutivo sobre las páginas noticiosas y un 
editor de página del editorial encima páginas de opinión.],
+                               // [editor ejecutivo] will be the match till 
here. We do not ignore that match, but will look for better
+                               // matches.
                                break;
                        }
                }
@@ -118,4 +125,27 @@
        return indices;
 };
 
+/**
+ * Sort function for matching positions based on length.
+ */
+function comparePositions( positionA, positionB ) {
+       if ( positionA.length < positionB.length ) {
+               return -1;
+       }
+       if ( positionA.length > positionB.length ) {
+               return 1;
+       }
+       return 0;
+}
+
+/**
+ * Find the best match among candidate positions by longest match.
+ * @param {Object[]} positions
+ * @return {Object} best match position.
+ */
+SubSequenceMatcher.prototype.bestMatch = function ( positions ) {
+       positions.sort( comparePositions );
+       return positions[ positions.length - 1 ];
+};
+
 module.exports = SubSequenceMatcher;
diff --git a/tests/mt/Apertium.test.js b/tests/mt/Apertium.test.js
index b589fe4..c2c86d6 100644
--- a/tests/mt/Apertium.test.js
+++ b/tests/mt/Apertium.test.js
@@ -76,6 +76,17 @@
                        big: 'Grande',
                        red: 'Rojo'
                }
+       },
+       {
+               title: 'Find longest match among multiple matches',
+               source: '<p id="8"><span class="cx-segment" 
data-segmentid="9"><a class="cx-link" data-linkid="17" 
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The 
New York Times</a>, which has an <b>executive editor</b> over the news pages 
and an <b>editorial page editor</b> over opinion pages.</span></p>',
+               target: '<p id="8"><span class="cx-segment" 
data-segmentid="9"><a class="cx-link" data-linkid="17" 
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The 
New York Times</a>, el cual tiene un <b>editor ejecutivo</b> sobre las páginas 
noticiosas y un <b>editor de página del editorial</b> encima páginas de 
opinión.</span></p>',
+               textTranslations: {
+                       'The New York Times, which has an executive editor over 
the news pages and an editorial page editor over opinion pages.': 'The New York 
Times, el cual tiene un editor ejecutivo sobre las páginas noticiosas y un 
editor de página del editorial encima páginas de opinión.',
+                       'The New York Times': 'The New York Times',
+                       'executive editor': 'editor ejecutivo',
+                       'editorial page editor': 'editor de página del 
editorial'
+               }
        }
  ];
 

-- 
To view, visit https://gerrit.wikimedia.org/r/179421
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I75e43ea766e48ce40066ae5aa76849d5e0dd763f
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to