[MediaWiki-commits] [Gerrit] WIP MT: Subsequence extraction and mapping algorithm impleme... - change (mediawiki...cxserver)

Santhosh (Code Review) Mon, 24 Nov 2014 04:06:07 -0800

Santhosh has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/175420


Change subject: WIP MT: Subsequence extraction and mapping algorithm 
implementation
......................................................................

WIP MT: Subsequence extraction and mapping algorithm implementation

Change-Id: I5b97362d1bd75f7719eabd85bea19169ef3bc230
---
M index.js
M lineardoc/LinearDoc.js
2 files changed, 99 insertions(+), 20 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/20/175420/1

diff --git a/index.js b/index.js
index f955bdb..167639e 100644
--- a/index.js
+++ b/index.js
@@ -1,6 +1,8 @@
 module.exports = {
        Segmenter: require( './segmentation/CXSegmenter.js' ).CXSegmenter,
        Apertium: require( './mt/Apertium.js' ),
+       Yandex: require( './mt/Yandex.js' ),
+       MTClient: require( './mt/MTClient.js' ),
        LinearDoc: require( './lineardoc/LinearDoc.js' ),
        Dictionary: require( './dictionary' )
 };
diff --git a/lineardoc/LinearDoc.js b/lineardoc/LinearDoc.js
index 3144bad..bb3581a 100644
--- a/lineardoc/LinearDoc.js
+++ b/lineardoc/LinearDoc.js
@@ -67,7 +67,7 @@
        }
        attributes.sort();
        for ( i = 0, len = attributes.length; i < len; i++ ) {
-               attr = attributes[i];
+               attr = attributes[ i ];
                html.push( ' ' + esc( attr ) + '="' + escAttr( tag.attributes[ attr ] ) + '"' );
        }
        if ( tag.isSelfClosing ) {
@@ -84,9 +84,12 @@
  * @return {Object} Cloned tag
  */
 function cloneOpenTag( tag ) {
-       var attr, newTag = { name: tag.name, attributes: {} };
+       var attr, newTag = {
+               name: tag.name,
+               attributes: {}
+       };
        for ( attr in tag.attributes ) {
-               newTag.attributes[attr] = tag.attributes[attr];
+               newTag.attributes[ attr ] = tag.attributes[ attr ];
        }
        return newTag;
 }
@@ -202,7 +205,7 @@
        'map', 'object', 'pre', 'progress', 'video',
        // non-annotation inline tags
        'img', 'br'
-] ) );
+ ] ) );
 
 /**
  * Find the boundaries that lie in each chunk
@@ -225,16 +228,18 @@
 
        // Get boundaries in order, disregarding the start of the first chunk
        boundaries = boundaries.slice();
-       boundaries.sort( function ( a, b ) { return a - b; } );
-       while ( boundaries[boundaryPtr] === 0 ) {
+       boundaries.sort( function ( a, b ) {
+               return a - b;
+       } );
+       while ( boundaries[ boundaryPtr ] === 0 ) {
                boundaryPtr++;
        }
        for ( i = 0, len = chunks.length; i < len; i++ ) {
                groupBoundaries = [];
-               chunk = chunks[i];
+               chunk = chunks[ i ];
                chunkLength = getLength( chunk );
                while ( true ) {
-                       boundary = boundaries[boundaryPtr];
+                       boundary = boundaries[ boundaryPtr ];
                        if ( boundary === undefined || boundary > offset + chunkLength - 1 ) {
                                // beyond the interior of this chunk
                                break;
@@ -285,7 +290,10 @@
        this.offsets = [];
        cursor = 0;
        for ( i = 0, len = this.textChunks.length; i < len; i++ ) {
-               this.offsets[ i ] = { start: cursor, length: this.textChunks[ i ].text.length };
+               this.offsets[ i ] = {
+                       start: cursor,
+                       length: this.textChunks[ i ].text.length
+               };
                cursor += this.offsets[ i ].length;
        }
 }
@@ -382,7 +390,7 @@
                offset = this.offsets[ i ].start;
                if ( textChunk.text.length > 0 ) {
                        continue;
-               } 
+               }
                if ( !emptyTextChunks[ offset ] ) {
                        emptyTextChunks[ offset ] = [];
                }
@@ -391,7 +399,9 @@
        for ( offset in emptyTextChunks ) {
                emptyTextChunkOffsets.push( offset );
        }
-       emptyTextChunkOffsets.sort( function ( a, b ) { return a - b; } );
+       emptyTextChunkOffsets.sort( function ( a, b ) {
+               return a - b;
+       } );
 
        for ( i = 0, iLen = rangeMappings.length; i < iLen; i++ ) {
                // Copy tags from source text start offset
@@ -488,7 +498,9 @@
                } );
                pos += tail.length;
        }
-       return new TextBlock( textChunks.map( function ( x ) { return x.textChunk; } ) );
+       return new TextBlock( textChunks.map( function ( x ) {
+               return x.textChunk;
+       } ) );
 };
 
 /**
@@ -642,14 +654,14 @@
        // Setup: currentTextChunks for current segment, and allTextChunks for all segments
        allTextChunks = [];
        currentTextChunks = [];
+
        function flushChunks() {
                var modifiedTextChunks;
                if ( currentTextChunks.length === 0 ) {
                        return;
                }
                modifiedTextChunks = addCommonTag(
-                       currentTextChunks,
-                       {
+                       currentTextChunks, {
                                name: 'span',
                                attributes: {
                                        class: 'cx-segment',
@@ -666,16 +678,18 @@
        groups = getChunkBoundaryGroups(
                getBoundaries( this.getPlainText() ),
                this.textChunks,
-               function ( textChunk ) { return textChunk.text.length; }
+               function ( textChunk ) {
+                       return textChunk.text.length;
+               }
        );
 
        offset = 0;
        for ( i = 0, iLen = groups.length; i < iLen; i++ ) {
-               group = groups[i];
+               group = groups[ i ];
                textChunk = group.chunk;
                boundaries = group.boundaries;
                for ( j = 0, jLen = boundaries.length; j < jLen; j++ ) {
-                       relOffset = boundaries[j] - offset;
+                       relOffset = boundaries[ j ] - offset;
                        if ( relOffset === 0 ) {
                                flushChunks();
                        } else {
@@ -700,6 +714,47 @@
        }
        flushChunks();
        return new TextBlock( allTextChunks );
+};
+
+/**
+ * Extract subsequences: text chunks that have both text and tags, joined with text of chunks that follow
+ *
+ * @method
+ * @return {Array} Extracted chunks, plus whatever the getSubSequences of inline content returns
+ */
+TextBlock.prototype.getSubSequences = function () {
+       var i, j, len, chunk, tagsDump, tagsAttr, nextTagsAttr,
+               nextChunk, nextTagsDump,
+               subsequences = [];
+       for ( i = 0, len = this.textChunks.length; i < len; i++ ) {
+               chunk = this.textChunks[ i ];
+               tagsDump = dumpTags( chunk.tags );
+               tagsAttr = tagsDump ? ' tags="' + tagsDump + '"' : '';
+               if ( chunk.text && chunk.tags && chunk.tags.length > 0 ) {
+                       j = i + 1;
+                       while ( j < len ) {
+                               nextChunk = this.textChunks[ j ];
+                               nextTagsDump = dumpTags( chunk.tags );
+                               nextTagsAttr = tagsDump ? ' tags="' + tagsDump + '"' : '';
+                               if ( nextChunk.tags && nextChunk.tags.length > 0 && nextTagsDump.indexOf( tagsDump ) >= 0 ) {
+                                       chunk = util._extend( {}, chunk );
+                                       chunk.text += nextChunk.text;
+                               } else {
+                                       break;
+                               }
+                               j++;
+                       }
+                       subsequences.push( chunk );
+               }
+               if ( chunk.inlineContent ) {
+                       if ( chunk.inlineContent.getSubSequences ) {
+                               // sub-doc: concatenate
+                                       subsequences.push( chunk.inlineContent.getSubSequences( '' ) );
+                       }
+               }
+       }
+       return subsequences;
+
 };
 
 /**
@@ -930,6 +985,26 @@
 };
 
 /**
+ * Collect the subsequences of every text block in the document
+ * @method
+ * @return {Array} One entry per item of type 'textblock': the result of its getSubSequences()
+ */
+Doc.prototype.getSubSequences = function () {
+       var i, len, type, item, tag, textBlock,
+               subsequences = [];
+
+       for ( i = 0, len = this.items.length; i < len; i++ ) {
+               type = this.items[ i ].type;
+               item = this.items[ i ].item;
+               if ( type === 'textblock' ) {
+                       // Block of inline text
+                       subsequences.push( item.getSubSequences() );
+               }
+       }
+       return subsequences;
+};
+
+/**
  * Extract the text segments from the document
  * @method
  * @return {string[]} balanced html fragments, one per segment
@@ -1060,13 +1135,13 @@
                return;
        }
        for ( i = 0, len = this.textChunks.length; i < len; i++ ) {
-               textChunk = this.textChunks[i];
+               textChunk = this.textChunks[ i ];
                if ( textChunk.inlineContent || textChunk.text.match( /\S/ ) ) {
                        whitespaceOnly = false;
                        whitespace = undefined;
                        break;
                } else {
-                       whitespace.push( this.textChunks[i].text );
+                       whitespace.push( this.textChunks[ i ].text );
                }
        }
        if ( whitespaceOnly ) {
@@ -1140,7 +1215,9 @@
  * @constructor
  */
 function Normalizer() {
-       SAXParser.call( this, false, { lowercase: true } );
+       SAXParser.call( this, false, {
+               lowercase: true
+       } );
 }
 util.inherits( Normalizer, SAXParser );
 

-- 
To view, visit https://gerrit.wikimedia.org/r/175420
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5b97362d1bd75f7719eabd85bea19169ef3bc230
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] WIP MT: Subsequence extraction and mapping algorithm impleme... - change (mediawiki...cxserver)

Reply via email to