Divec has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/370869 )
Change subject: WIP: Avoid segmenting within certain types of tag ...................................................................... WIP: Avoid segmenting within certain types of tag To avoid adding spans in places that violate the MediaWiki DOM spec https://www.mediawiki.org/wiki/Specs/HTML/1.4.0#Images Change-Id: I7d25b5996876951f675cdf4b797dbb1371a0d16e --- M config.dev.yaml M lib/lineardoc/Builder.js M lib/lineardoc/Doc.js M lib/lineardoc/Parser.js M lib/lineardoc/TextBlock.js M lib/routes/v1.js M lib/translationunits/MWImage.js 7 files changed, 60 insertions(+), 17 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/69/370869/1 diff --git a/config.dev.yaml b/config.dev.yaml index f171eef..0962b13 100644 --- a/config.dev.yaml +++ b/config.dev.yaml @@ -61,7 +61,7 @@ mt: # Apertium web API URL apertium: - api: http://apertium.wmflabs.org + api: http://localhost:2737 yandex: api: https://translate.yandex.net key: null diff --git a/lib/lineardoc/Builder.js b/lib/lineardoc/Builder.js index 57d69f8..7097d2d 100644 --- a/lib/lineardoc/Builder.js +++ b/lib/lineardoc/Builder.js @@ -22,6 +22,7 @@ this.doc = new Doc( wrapperTag || null ); this.textChunks = []; this.parent = parent || null; + this.noSegmentBlock = false; } Builder.prototype.createChildBuilder = function ( wrapperTag ) { @@ -109,9 +110,12 @@ return tag; }; -Builder.prototype.addTextChunk = function ( text ) { +Builder.prototype.addTextChunk = function ( text, noSegment ) { this.textChunks.push( new TextChunk( text, this.inlineAnnotationTags.slice() ) ); this.inlineAnnotationTagsUsed = this.inlineAnnotationTags.length; + if ( noSegment ) { + this.noSegmentBlock = true; + } }; /** @@ -119,10 +123,14 @@ * * @method * @param {Object} content Sub-document or empty SAX tag + * @param {boolean} noSegment */ -Builder.prototype.addInlineContent = function ( content ) { +Builder.prototype.addInlineContent = function ( content, noSegment ) { this.textChunks.push( new TextChunk( '', this.inlineAnnotationTags.slice(), content ) ); this.inlineAnnotationTagsUsed = this.inlineAnnotationTags.length; + if ( noSegment ) { + this.noSegmentBlock = true; + } }; Builder.prototype.finishTextBlock = function () { @@ -145,7 +153,7 @@ if ( whitespaceOnly ) { this.doc.addItem( 'blockspace', whitespace.join( '' ) ); } else { - this.doc.addItem( 'textblock', new TextBlock( this.textChunks ) ); + this.doc.addItem( 'textblock', new TextBlock( this.textChunks, this.noSegmentBlock ) ); } this.textChunks = []; }; diff --git a/lib/lineardoc/Doc.js b/lib/lineardoc/Doc.js index e9f4157..5240cd3 100644 --- a/lib/lineardoc/Doc.js +++ b/lib/lineardoc/Doc.js @@ -122,7 +122,7 @@ textBlock = item.item; newDoc.addItem( 'textblock', - textBlock.segment( getBoundaries, getNextId ) + textBlock.noSegment ? textBlock : textBlock.segment( getBoundaries, getNextId ) ); } } diff --git a/lib/lineardoc/Parser.js b/lib/lineardoc/Parser.js index beb4c80..96ad429 100644 --- a/lib/lineardoc/Parser.js +++ b/lib/lineardoc/Parser.js @@ -4,7 +4,9 @@ Builder = require( './Builder.js' ), Utils = require( './Utils.js' ), util = require( 'util' ), - blockTags; + blockTags, + relBlacklist, + relWhitelist; blockTags = [ 'html', 'head', 'body', 'script', @@ -33,6 +35,26 @@ 'img', 'br' ]; +relBlacklist = [ /^mw:/ ]; + +relWhitelist = [ 'mw:Entity', 'mw:ExtLink', 'mw:MediaLink', 'mw:WikiLink', 'mw:referencedBy' ]; + +function matches( patterns, text ) { + return patterns.filter( function ( pattern ) { + return typeof pattern === 'string' ? + text.indexOf( pattern ) > -1 : + pattern.exec( text ); + } ).length > 0; +}; + +function isBlacklisted( tag ) { + var attr = tag.attributes.typeof || tag.attributes.rel; + if ( !attr ) { + return false; + } + return matches( relBlacklist, attr ) && !matches( relWhitelist, attr ); +} + /** * Parser to read an HTML stream into a Doc * @@ -53,6 +75,9 @@ Parser.prototype.init = function () { this.rootBuilder = new Builder(); this.builder = this.rootBuilder; + // Number of currently open tags within which translation should not occur + this.allTags = []; + this.noSegmentTags = 0; }; Parser.prototype.onopentag = function ( tag ) { @@ -68,21 +93,28 @@ // Start a reference: create a child builder, and move into it this.builder = this.builder.createChildBuilder( tag ); } else if ( Utils.isInlineEmptyTag( tag.name ) ) { - this.builder.addInlineContent( tag ); + this.builder.addInlineContent( tag, this.noSegmentTags > 0 ); } else if ( this.isInlineAnnotationTag( tag.name ) ) { this.builder.pushInlineAnnotationTag( tag ); } else { this.builder.pushBlockTag( tag ); } + this.allTags.push( tag ); + if ( isBlacklisted( tag ) ) { + this.noSegmentTags++; + } }; Parser.prototype.onclosetag = function ( tagName ) { - var tag, + var tag = this.allTags.pop( tag ), isAnn = this.isInlineAnnotationTag( tagName ); + if ( isBlacklisted( tag ) ) { + this.noSegmentTags--; + } if ( Utils.isInlineEmptyTag( tagName ) ) { return; } else if ( isAnn && this.builder.inlineAnnotationTags.length > 0 ) { - tag = this.builder.popInlineAnnotationTag( tagName ); + this.builder.popInlineAnnotationTag( tagName ); if ( this.options.isolateSegments && Utils.isSegment( tag ) ) { this.builder.popBlockTag( 'div' ); } @@ -103,7 +135,7 @@ }; Parser.prototype.ontext = function ( text ) { - this.builder.addTextChunk( text ); + this.builder.addTextChunk( text, this.noTranslationTags > 0 ); }; /** diff --git a/lib/lineardoc/TextBlock.js b/lib/lineardoc/TextBlock.js index 3f9ece6..fd572f9 100644 --- a/lib/lineardoc/TextBlock.js +++ b/lib/lineardoc/TextBlock.js @@ -10,11 +10,13 @@ * * @constructor * - * @param {string} textChunks annotated inline text + * @param {string} textChunks Annotated inline text + * @param {boolean} noSegment This is a block which should not be segmented */ -function TextBlock( textChunks ) { +function TextBlock( textChunks, noSegment ) { var i, len, cursor; this.textChunks = textChunks; + this.noSegment = noSegment; this.offsets = []; cursor = 0; for ( i = 0, len = this.textChunks.length; i < len; i++ ) { diff --git a/lib/routes/v1.js b/lib/routes/v1.js index cfd948e..73e8aba 100644 --- a/lib/routes/v1.js +++ b/lib/routes/v1.js @@ -254,7 +254,7 @@ contents: adaptedDoc.getHtml() } ); }, ( error ) => { - res.status( 500 ).end( error.toString() ); + res.status( 500 ).end( error.stack ); app.logger.log( 'error', 'MT processing error: ' + error.stack ); } ); } ); diff --git a/lib/translationunits/MWImage.js b/lib/translationunits/MWImage.js index da3ce35..6ae9071 100644 --- a/lib/translationunits/MWImage.js +++ b/lib/translationunits/MWImage.js @@ -50,10 +50,11 @@ } MWImage.prototype.adapt = cxutil.async( function* () { - var i, len, chunk, sourceImage, imageLink, targetResource, namespaceAlias; + var textChunks, i, len, chunk, sourceImage, imageLink, targetResource, namespaceAlias; - for ( i = 0, len = this.node.children.textChunks.length; i < len; i++ ) { - chunk = this.node.children.textChunks[ i ]; + textChunks = this.node.children ? this.node.children.textChunks.length : []; + for ( i = 0, len = textChunks.length; i < len; i++ ) { + chunk = textChunks[ i ]; if ( chunk.tags[ 0 ].name === 'a' ) { imageLink = chunk.tags[ 0 ]; } @@ -67,7 +68,7 @@ throw new Error( 'img tag not found in the figure with mw:Image/Thumb for id: ' + this.node.attributes.id ); } - this.sourceResource = sourceImage.attributes[ 'resource' ]; + this.sourceResource = sourceImage.attributes[ 'resource' ] || ''; this.adaptImageAlignment(); if ( this.isCommonsImage( sourceImage.attributes[ 'src' ] ) ) { -- To view, visit https://gerrit.wikimedia.org/r/370869 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I7d25b5996876951f675cdf4b797dbb1371a0d16e Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/cxserver Gerrit-Branch: master Gerrit-Owner: Divec <da...@troi.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits