Santhosh has uploaded a new change for review. https://gerrit.wikimedia.org/r/175420
Change subject: WIP MT: Subsequence extraction and mapping algorithm implementation ...................................................................... WIP MT: Subsequence extraction and mapping algorithm implementation Change-Id: I5b97362d1bd75f7719eabd85bea19169ef3bc230 --- M index.js M lineardoc/LinearDoc.js 2 files changed, 99 insertions(+), 20 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/20/175420/1 diff --git a/index.js b/index.js index f955bdb..167639e 100644 --- a/index.js +++ b/index.js @@ -1,6 +1,8 @@ module.exports = { Segmenter: require( './segmentation/CXSegmenter.js' ).CXSegmenter, Apertium: require( './mt/Apertium.js' ), + Yandex: require( './mt/Yandex.js' ), + MTClient: require( './mt/MTClient.js' ), LinearDoc: require( './lineardoc/LinearDoc.js' ), Dictionary: require( './dictionary' ) }; diff --git a/lineardoc/LinearDoc.js b/lineardoc/LinearDoc.js index 3144bad..bb3581a 100644 --- a/lineardoc/LinearDoc.js +++ b/lineardoc/LinearDoc.js @@ -67,7 +67,7 @@ } attributes.sort(); for ( i = 0, len = attributes.length; i < len; i++ ) { - attr = attributes[i]; + attr = attributes[ i ]; html.push( ' ' + esc( attr ) + '="' + escAttr( tag.attributes[ attr ] ) + '"' ); } if ( tag.isSelfClosing ) { @@ -84,9 +84,12 @@ * @return {Object} Cloned tag */ function cloneOpenTag( tag ) { - var attr, newTag = { name: tag.name, attributes: {} }; + var attr, newTag = { + name: tag.name, + attributes: {} + }; for ( attr in tag.attributes ) { - newTag.attributes[attr] = tag.attributes[attr]; + newTag.attributes[ attr ] = tag.attributes[ attr ]; } return newTag; } @@ -202,7 +205,7 @@ 'map', 'object', 'pre', 'progress', 'video', // non-annotation inline tags 'img', 'br' -] ) ); + ] ) ); /** * Find the boundaries that lie in each chunk @@ -225,16 +228,18 @@ // Get boundaries in order, disregarding the start of the first chunk boundaries = boundaries.slice(); - boundaries.sort( function ( a, b ) { return a - b; } ); - while ( boundaries[boundaryPtr] === 0 ) { + boundaries.sort( function ( a, b ) { + return a - b; + } ); + while ( boundaries[ boundaryPtr ] === 0 ) { boundaryPtr++; } for ( i = 0, len = chunks.length; i < len; i++ ) { groupBoundaries = []; - chunk = chunks[i]; + chunk = chunks[ i ]; chunkLength = getLength( chunk ); while ( true ) { - boundary = boundaries[boundaryPtr]; + boundary = boundaries[ boundaryPtr ]; if ( boundary === undefined || boundary > offset + chunkLength - 1 ) { // beyond the interior of this chunk break; @@ -285,7 +290,10 @@ this.offsets = []; cursor = 0; for ( i = 0, len = this.textChunks.length; i < len; i++ ) { - this.offsets[ i ] = { start: cursor, length: this.textChunks[ i ].text.length }; + this.offsets[ i ] = { + start: cursor, + length: this.textChunks[ i ].text.length + }; cursor += this.offsets[ i ].length; } } @@ -382,7 +390,7 @@ offset = this.offsets[ i ].start; if ( textChunk.text.length > 0 ) { continue; - } + } if ( !emptyTextChunks[ offset ] ) { emptyTextChunks[ offset ] = []; } @@ -391,7 +399,9 @@ for ( offset in emptyTextChunks ) { emptyTextChunkOffsets.push( offset ); } - emptyTextChunkOffsets.sort( function ( a, b ) { return a - b; } ); + emptyTextChunkOffsets.sort( function ( a, b ) { + return a - b; + } ); for ( i = 0, iLen = rangeMappings.length; i < iLen; i++ ) { // Copy tags from source text start offset @@ -488,7 +498,9 @@ } ); pos += tail.length; } - return new TextBlock( textChunks.map( function ( x ) { return x.textChunk; } ) ); + return new TextBlock( textChunks.map( function ( x ) { + return x.textChunk; + } ) ); }; /** @@ -642,14 +654,14 @@ // Setup: currentTextChunks for current segment, and allTextChunks for all segments allTextChunks = []; currentTextChunks = []; + function flushChunks() { var modifiedTextChunks; if ( currentTextChunks.length === 0 ) { return; } modifiedTextChunks = addCommonTag( - currentTextChunks, - { + currentTextChunks, { name: 'span', attributes: { class: 'cx-segment', @@ -666,16 +678,18 @@ groups = getChunkBoundaryGroups( getBoundaries( this.getPlainText() ), this.textChunks, - function ( textChunk ) { return textChunk.text.length; } + function ( textChunk ) { + return textChunk.text.length; + } ); offset = 0; for ( i = 0, iLen = groups.length; i < iLen; i++ ) { - group = groups[i]; + group = groups[ i ]; textChunk = group.chunk; boundaries = group.boundaries; for ( j = 0, jLen = boundaries.length; j < jLen; j++ ) { - relOffset = boundaries[j] - offset; + relOffset = boundaries[ j ] - offset; if ( relOffset === 0 ) { flushChunks(); } else { @@ -700,6 +714,47 @@ } flushChunks(); return new TextBlock( allTextChunks ); +}; + +/** + * Dump an XML Array version of the linear representation, for debugging + * + * @method + * @return {string[]} Array that will concatenate to an XML string representation + */ +TextBlock.prototype.getSubSequences = function () { + var i, j, len, chunk, tagsDump, tagsAttr, nextTagsAttr, + nextChunk, nextTagsDump, + subsequences = []; + for ( i = 0, len = this.textChunks.length; i < len; i++ ) { + chunk = this.textChunks[ i ]; + tagsDump = dumpTags( chunk.tags ); + tagsAttr = tagsDump ? ' tags="' + tagsDump + '"' : ''; + if ( chunk.text && chunk.tags && chunk.tags.length > 0 ) { + j = i + 1; + while ( j < len ) { + nextChunk = this.textChunks[ j ]; + nextTagsDump = dumpTags( chunk.tags ); + nextTagsAttr = tagsDump ? ' tags="' + tagsDump + '"' : ''; + if ( nextChunk.tags && nextChunk.tags.length > 0 && nextTagsDump.indexOf( tagsDump ) >= 0 ) { + chunk = util._extend( {}, chunk ); + chunk.text += nextChunk.text; + } else { + break; + } + j++; + } + subsequences.push( chunk ); + } + if ( chunk.inlineContent ) { + if ( chunk.inlineContent.getSubSequences ) { + // sub-doc: concatenate + subsequences.push( chunk.inlineContent.getSubSequences( '' ) ); + } + } + } + return subsequences; + }; /** @@ -930,6 +985,26 @@ }; /** + * Dump an XML Array version of the linear representation, for debugging + * @method + * @return {string[]} Array that will concatenate to an XML string representation + */ +Doc.prototype.getSubSequences = function () { + var i, len, type, item, tag, textBlock, + subsequences = []; + + for ( i = 0, len = this.items.length; i < len; i++ ) { + type = this.items[ i ].type; + item = this.items[ i ].item; + if ( type === 'textblock' ) { + // Block of inline text + subsequences.push( item.getSubSequences() ); + } + } + return subsequences; +}; + +/** * Extract the text segments from the document * @method * @return {string[]} balanced html fragments, one per segment @@ -1060,13 +1135,13 @@ return; } for ( i = 0, len = this.textChunks.length; i < len; i++ ) { - textChunk = this.textChunks[i]; + textChunk = this.textChunks[ i ]; if ( textChunk.inlineContent || textChunk.text.match( /\S/ ) ) { whitespaceOnly = false; whitespace = undefined; break; } else { - whitespace.push( this.textChunks[i].text ); + whitespace.push( this.textChunks[ i ].text ); } } if ( whitespaceOnly ) { @@ -1140,7 +1215,9 @@ * @constructor */ function Normalizer() { - SAXParser.call( this, false, { lowercase: true } ); + SAXParser.call( this, false, { + lowercase: true + } ); } util.inherits( Normalizer, SAXParser ); -- To view, visit https://gerrit.wikimedia.org/r/175420 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I5b97362d1bd75f7719eabd85bea19169ef3bc230 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Santhosh <santhosh.thottin...@gmail.com> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits