Santhosh has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/179424

Change subject: MT: Improve the annotation mapping implementation
......................................................................

MT: Improve the annotation mapping implementation

1. Correct the range overlap checking algorithm. Test case:
   Editor-in-chief page from en to ca, see the first paragraph
2. Annotation Mapping: Use best match when multiple approximate
   matches found

Test case included

Change-Id: I9b318734646dcd04697ef0086fbc1be043b6b657
---
M mt/MTClient.js
M mt/annotationmapper/SubsequenceMatcher.js
M tests/mt/Apertium.test.js
3 files changed, 57 insertions(+), 8 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver 
refs/changes/24/179424/1

diff --git a/mt/MTClient.js b/mt/MTClient.js
index 0ad344b..d6d5225 100644
--- a/mt/MTClient.js
+++ b/mt/MTClient.js
@@ -84,6 +84,7 @@
                targetDoc.items = Array.prototype.slice.call( arguments, 0 );
                deferred.resolve( targetDoc.getHtml() );
        }, function ( error ) {
+               logger.error( error.toString() );
                deferred.reject( error );
        } );
 
@@ -227,12 +228,15 @@
  *   range array
  */
 function isOverlappingRange( range, rangeArray ) {
-       var i;
+       var i, rangeStart, rangeEnd, start, end;
 
+       rangeStart = range.start;
+       rangeEnd = range.start + range.length;
        for ( i = 0; i < rangeArray.length; i++ ) {
-               if ( rangeArray[ i ].start <= range.start &&
-                       rangeArray[ i ].start + rangeArray[ i ].length >= 
range.start + range.length
-               ) {
+               start = rangeArray[ i ].start;
+               end = start + rangeArray[ i ].length;
+               if ( start >= rangeStart && end <= rangeEnd ||
+                       start <= rangeStart && end >= rangeEnd ) {
                        return true;
                }
        }
@@ -290,7 +294,6 @@
                        } );
                }
        }
-
        return rangeMappings;
 };
 
@@ -309,15 +312,19 @@
  * @return.length {number} Length of matched sequence in the text.
  */
 MTClient.prototype.findSubSequence = function ( text, sequence, language, 
occurance ) {
-       var indices, matcher = new SubSequenceMatcher( language );
+       var indices, matcher;
 
+       matcher = new SubSequenceMatcher( language );
        indices = matcher.findFuzzyMatch( text, sequence );
        // Find the nth occurance position
+
        if ( !indices || indices.length < occurance ) {
                return null;
-       } else {
-               return indices[ occurance ];
        }
+       if ( occurance === 0 ) {
+               return matcher.bestMatch( indices );
+       }
+       return indices[ occurance ];
 };
 
 /**
diff --git a/mt/annotationmapper/SubsequenceMatcher.js 
b/mt/annotationmapper/SubsequenceMatcher.js
index e6f61d0..e88878a 100644
--- a/mt/annotationmapper/SubsequenceMatcher.js
+++ b/mt/annotationmapper/SubsequenceMatcher.js
@@ -78,6 +78,7 @@
                return null;
        }
 
+       //console.log( 'Searching [' + substring + '] in [' + text + ']' );
        substringNGrams = this.getWords( substring );
        substringWordsLength = substringNGrams.length;
        textNGrams = this.getNGrams( text, substringNGrams.length );
@@ -91,6 +92,12 @@
                        if ( this.isApproximateEqual( word, substringNGrams[ j 
] ) ) {
                                match = !match ? word : match + ' ' + word;
                        } else {
+                               // The match sequence broke.
+                               // Example:
+                               // Searching [editor de página del editorial] in 
[the new york times, el cual tiene un editor
+                               // ejecutivo sobre las páginas noticiosas y un 
editor de página del editorial encima páginas de opinión.],
+                               // [editor ejecutivo] will be the match till 
here. We do not ignore that match, but will look for better
+                               // matches.
                                break;
                        }
                }
@@ -115,7 +122,31 @@
 
                startIndex = index + match.length;
        }
+
        return indices;
 };
 
+/**
+ * Sort function for matching positions based on length.
+ */
+function comparePositions( positionA, positionB ) {
+       if ( positionA.length < positionB.length ) {
+               return -1;
+       }
+       if ( positionA.length > positionB.length ) {
+               return 1;
+       }
+       return 0;
+}
+
+/**
+ * Find the best match among candidate positions by longest match.
+ * @param {Object[]} positions
+ * @return {Object} best match position.
+ */
+SubSequenceMatcher.prototype.bestMatch = function ( positions ) {
+       positions.sort( comparePositions );
+       return positions[ positions.length - 1 ];
+};
+
 module.exports = SubSequenceMatcher;
diff --git a/tests/mt/Apertium.test.js b/tests/mt/Apertium.test.js
index b589fe4..c2c86d6 100644
--- a/tests/mt/Apertium.test.js
+++ b/tests/mt/Apertium.test.js
@@ -76,6 +76,17 @@
                        big: 'Grande',
                        red: 'Rojo'
                }
+       },
+       {
+               title: 'Find longest match among multiple matches',
+               source: '<p id="8"><span class="cx-segment" 
data-segmentid="9"><a class="cx-link" data-linkid="17" 
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The 
New York Times</a>, which has an <b>executive editor</b> over the news pages 
and an <b>editorial page editor</b> over opinion pages.</span></p>',
+               target: '<p id="8"><span class="cx-segment" 
data-segmentid="9"><a class="cx-link" data-linkid="17" 
href="./The_New_York_Times" rel="mw:WikiLink" title="The New York Times">The 
New York Times</a>, el cual tiene un <b>editor ejecutivo</b> sobre las páginas 
noticiosas y un <b>editor de página del editorial</b> encima páginas de 
opinión.</span></p>',
+               textTranslations: {
+                       'The New York Times, which has an executive editor over 
the news pages and an editorial page editor over opinion pages.': 'The New York 
Times, el cual tiene un editor ejecutivo sobre las páginas noticiosas y un 
editor de página del editorial encima páginas de opinión.',
+                       'The New York Times': 'The New York Times',
+                       'executive editor': 'editor ejecutivo',
+                       'editorial page editor': 'editor de página del 
editorial'
+               }
        }
  ];
 

-- 
To view, visit https://gerrit.wikimedia.org/r/179424
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I9b318734646dcd04697ef0086fbc1be043b6b657
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to