Divec has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/370869 )

Change subject: WIP: Avoid segmenting within certain types of tag
......................................................................

WIP: Avoid segmenting within certain types of tag

This avoids adding spans in places that would violate the MediaWiki DOM spec:

https://www.mediawiki.org/wiki/Specs/HTML/1.4.0#Images

Change-Id: I7d25b5996876951f675cdf4b797dbb1371a0d16e
---
M config.dev.yaml
M lib/lineardoc/Builder.js
M lib/lineardoc/Doc.js
M lib/lineardoc/Parser.js
M lib/lineardoc/TextBlock.js
M lib/routes/v1.js
M lib/translationunits/MWImage.js
7 files changed, 60 insertions(+), 17 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver refs/changes/69/370869/1

diff --git a/config.dev.yaml b/config.dev.yaml
index f171eef..0962b13 100644
--- a/config.dev.yaml
+++ b/config.dev.yaml
@@ -61,7 +61,7 @@
       mt:
         # Apertium web API URL
         apertium:
-          api: http://apertium.wmflabs.org
+          api: http://localhost:2737
         yandex:
           api: https://translate.yandex.net
           key: null
diff --git a/lib/lineardoc/Builder.js b/lib/lineardoc/Builder.js
index 57d69f8..7097d2d 100644
--- a/lib/lineardoc/Builder.js
+++ b/lib/lineardoc/Builder.js
@@ -22,6 +22,7 @@
        this.doc = new Doc( wrapperTag || null );
        this.textChunks = [];
        this.parent = parent || null;
+       this.noSegmentBlock = false;
 }
 
 Builder.prototype.createChildBuilder = function ( wrapperTag ) {
@@ -109,9 +110,12 @@
        return tag;
 };
 
-Builder.prototype.addTextChunk = function ( text ) {
+Builder.prototype.addTextChunk = function ( text, noSegment ) {
        this.textChunks.push( new TextChunk( text, 
this.inlineAnnotationTags.slice() ) );
        this.inlineAnnotationTagsUsed = this.inlineAnnotationTags.length;
+       if ( noSegment ) {
+               this.noSegmentBlock = true;
+       }
 };
 
 /**
@@ -119,10 +123,14 @@
  *
  * @method
  * @param {Object} content Sub-document or empty SAX tag
+ * @param {boolean} noSegment
  */
-Builder.prototype.addInlineContent = function ( content ) {
+Builder.prototype.addInlineContent = function ( content, noSegment ) {
        this.textChunks.push( new TextChunk( '', 
this.inlineAnnotationTags.slice(), content ) );
        this.inlineAnnotationTagsUsed = this.inlineAnnotationTags.length;
+       if ( noSegment ) {
+               this.noSegmentBlock = true;
+       }
 };
 
 Builder.prototype.finishTextBlock = function () {
@@ -145,7 +153,7 @@
        if ( whitespaceOnly ) {
                this.doc.addItem( 'blockspace', whitespace.join( '' ) );
        } else {
-               this.doc.addItem( 'textblock', new TextBlock( this.textChunks ) 
);
+               this.doc.addItem( 'textblock', new TextBlock( this.textChunks, 
this.noSegmentBlock ) );
        }
        this.textChunks = [];
 };
diff --git a/lib/lineardoc/Doc.js b/lib/lineardoc/Doc.js
index e9f4157..5240cd3 100644
--- a/lib/lineardoc/Doc.js
+++ b/lib/lineardoc/Doc.js
@@ -122,7 +122,7 @@
                        textBlock = item.item;
                        newDoc.addItem(
                                'textblock',
-                               textBlock.segment( getBoundaries, getNextId )
+                               textBlock.noSegment ? textBlock : 
textBlock.segment( getBoundaries, getNextId )
                        );
                }
        }
diff --git a/lib/lineardoc/Parser.js b/lib/lineardoc/Parser.js
index beb4c80..96ad429 100644
--- a/lib/lineardoc/Parser.js
+++ b/lib/lineardoc/Parser.js
@@ -4,7 +4,9 @@
        Builder = require( './Builder.js' ),
        Utils = require( './Utils.js' ),
        util = require( 'util' ),
-       blockTags;
+       blockTags,
+       relBlacklist,
+       relWhitelist;
 
 blockTags = [
        'html', 'head', 'body', 'script',
@@ -33,6 +35,26 @@
        'img', 'br'
 ];
 
+relBlacklist = [ /^mw:/ ];
+
+relWhitelist = [ 'mw:Entity', 'mw:ExtLink', 'mw:MediaLink', 'mw:WikiLink', 
'mw:referencedBy' ];
+
+function matches( patterns, text ) {
+       return patterns.filter( function ( pattern ) {
+               return typeof pattern === 'string' ?
+                       text.indexOf( pattern ) > -1 :
+                       pattern.exec( text );
+       } ).length > 0;
+};
+
+function isBlacklisted( tag ) {
+       var attr = tag.attributes.typeof || tag.attributes.rel;
+       if ( !attr ) {
+               return false;
+       }
+       return matches( relBlacklist, attr ) && !matches( relWhitelist, attr );
+}
+
 /**
  * Parser to read an HTML stream into a Doc
  *
@@ -53,6 +75,9 @@
 Parser.prototype.init = function () {
        this.rootBuilder = new Builder();
        this.builder = this.rootBuilder;
+       // Number of currently open tags within which translation should not 
occur
+       this.allTags = [];
+       this.noSegmentTags = 0;
 };
 
 Parser.prototype.onopentag = function ( tag ) {
@@ -68,21 +93,28 @@
                // Start a reference: create a child builder, and move into it
                this.builder = this.builder.createChildBuilder( tag );
        } else if ( Utils.isInlineEmptyTag( tag.name ) ) {
-               this.builder.addInlineContent( tag );
+               this.builder.addInlineContent( tag, this.noSegmentTags > 0 );
        } else if ( this.isInlineAnnotationTag( tag.name ) ) {
                this.builder.pushInlineAnnotationTag( tag );
        } else {
                this.builder.pushBlockTag( tag );
        }
+       this.allTags.push( tag );
+       if ( isBlacklisted( tag ) ) {
+               this.noSegmentTags++;
+       }
 };
 
 Parser.prototype.onclosetag = function ( tagName ) {
-       var tag,
+       var tag = this.allTags.pop( tag ),
                isAnn = this.isInlineAnnotationTag( tagName );
+       if ( isBlacklisted( tag ) ) {
+               this.noSegmentTags--;
+       }
        if ( Utils.isInlineEmptyTag( tagName ) ) {
                return;
        } else if ( isAnn && this.builder.inlineAnnotationTags.length > 0 ) {
-               tag = this.builder.popInlineAnnotationTag( tagName );
+               this.builder.popInlineAnnotationTag( tagName );
                if ( this.options.isolateSegments && Utils.isSegment( tag ) ) {
                        this.builder.popBlockTag( 'div' );
                }
@@ -103,7 +135,7 @@
 };
 
 Parser.prototype.ontext = function ( text ) {
-       this.builder.addTextChunk( text );
+       this.builder.addTextChunk( text, this.noTranslationTags > 0 );
 };
 
 /**
diff --git a/lib/lineardoc/TextBlock.js b/lib/lineardoc/TextBlock.js
index 3f9ece6..fd572f9 100644
--- a/lib/lineardoc/TextBlock.js
+++ b/lib/lineardoc/TextBlock.js
@@ -10,11 +10,13 @@
  *
  * @constructor
  *
- * @param {string} textChunks annotated inline text
+ * @param {string} textChunks Annotated inline text
+ * @param {boolean} noSegment This is a block which should not be segmented
  */
-function TextBlock( textChunks ) {
+function TextBlock( textChunks, noSegment ) {
        var i, len, cursor;
        this.textChunks = textChunks;
+       this.noSegment = noSegment;
        this.offsets = [];
        cursor = 0;
        for ( i = 0, len = this.textChunks.length; i < len; i++ ) {
diff --git a/lib/routes/v1.js b/lib/routes/v1.js
index cfd948e..73e8aba 100644
--- a/lib/routes/v1.js
+++ b/lib/routes/v1.js
@@ -254,7 +254,7 @@
                                        contents: adaptedDoc.getHtml()
                                } );
                        }, ( error ) => {
-                               res.status( 500 ).end( error.toString() );
+                               res.status( 500 ).end( error.stack );
                                app.logger.log( 'error', 'MT processing error: 
' + error.stack );
                        } );
        } );
diff --git a/lib/translationunits/MWImage.js b/lib/translationunits/MWImage.js
index da3ce35..6ae9071 100644
--- a/lib/translationunits/MWImage.js
+++ b/lib/translationunits/MWImage.js
@@ -50,10 +50,11 @@
 }
 
 MWImage.prototype.adapt = cxutil.async( function* () {
-       var i, len, chunk, sourceImage, imageLink, targetResource, 
namespaceAlias;
+       var textChunks, i, len, chunk, sourceImage, imageLink, targetResource, 
namespaceAlias;
 
-       for ( i = 0, len = this.node.children.textChunks.length; i < len; i++ ) 
{
-               chunk = this.node.children.textChunks[ i ];
+       textChunks = this.node.children ? this.node.children.textChunks.length 
: [];
+       for ( i = 0, len = textChunks.length; i < len; i++ ) {
+               chunk = textChunks[ i ];
                if ( chunk.tags[ 0 ].name === 'a' ) {
                        imageLink = chunk.tags[ 0 ];
                }
@@ -67,7 +68,7 @@
                throw new Error( 'img tag not found in the figure with 
mw:Image/Thumb for id: ' + this.node.attributes.id );
        }
 
-       this.sourceResource = sourceImage.attributes[ 'resource' ];
+       this.sourceResource = sourceImage.attributes[ 'resource' ] || '';
        this.adaptImageAlignment();
 
        if ( this.isCommonsImage( sourceImage.attributes[ 'src' ] ) ) {

-- 
To view, visit https://gerrit.wikimedia.org/r/370869
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7d25b5996876951f675cdf4b797dbb1371a0d16e
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/cxserver
Gerrit-Branch: master
Gerrit-Owner: Divec <da...@troi.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to