[MediaWiki-commits] [Gerrit] mediawiki...cxserver[master]: Section wrapping at cxserver for v2 page fetch api

2017-11-23 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/392790 )

Change subject: Section wrapping at cxserver for v2 page fetch api
..


Section wrapping at cxserver for v2 page fetch api

The v2 page fetch API will wrap the translatable sections
with a <section> tag. Immediate children of <body>
will be considered translatable sections.

Bug: T177752

Change-Id: Ibb5937061c6980579d35cd24a0ba8205b109f8c7
---
M bin/segment
M lib/lineardoc/Doc.js
M lib/lineardoc/MwContextualizer.js
M lib/lineardoc/Parser.js
M lib/mw/MWPageLoader.js
M lib/routes/v2.js
A test/mw/SectionWrap.test.js
M test/segmentation/data/result-18.html
M test/segmentation/data/result-4.html
M test/segmentation/data/test-18.html
10 files changed, 109 insertions(+), 25 deletions(-)

Approvals:
  Divec: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/bin/segment b/bin/segment
index 2a18675..a58e532 100755
--- a/bin/segment
+++ b/bin/segment
@@ -6,12 +6,14 @@
 function normalize( html ) {
const normalizer = new LinearDoc.Normalizer();
normalizer.init();
-   normalizer.write( html.replace( /(\r\n|\n|\t|\r)/gm, '' ) );
+   normalizer.write( html.replace( /[\t\r\n]+/g, '' ) );
return normalizer.getHtml();
 }
 
 function getParsedDoc( content ) {
-   const parser = new LinearDoc.Parser( new LinearDoc.MwContextualizer() );
+   const parser = new LinearDoc.Parser( new LinearDoc.MwContextualizer(), {
+   wrapSections: true
+   } );
parser.init();
parser.write( content );
return parser.builder.doc;
diff --git a/lib/lineardoc/Doc.js b/lib/lineardoc/Doc.js
index 6c4047d..30d2909 100644
--- a/lib/lineardoc/Doc.js
+++ b/lib/lineardoc/Doc.js
@@ -77,10 +77,14 @@
 Doc.prototype.segment = function ( getBoundaries ) {
var i, len, item, tag, textBlock, hash,
newDoc = new Doc(),
+   nextSectionId = 0,
nextId = 0;
 
// TODO: return different counters depending on type
-   function getNextId( type ) {
+   function getNextId( type, tagName ) {
+   if ( tagName === 'section' ) {
+   return String( 'cxSourceSection' + nextSectionId++ );
+   }
if ( type === 'segment' || type === 'link' || type === 'block' 
) {
return String( nextId++ );
} else {
@@ -115,7 +119,7 @@
).substr( 0, 30 );
}
} else {
-   tag.attributes.id = getNextId( 'block' );
+   tag.attributes.id = getNextId( 'block', 
tag.name );
}
newDoc.addItem( item.type, tag );
} else if ( this.items[ i ].type !== 'textblock' ) {
diff --git a/lib/lineardoc/MwContextualizer.js 
b/lib/lineardoc/MwContextualizer.js
index c5d3742..d964443 100644
--- a/lib/lineardoc/MwContextualizer.js
+++ b/lib/lineardoc/MwContextualizer.js
@@ -1,7 +1,7 @@
 'use strict';
 
 const Contextualizer = require( './Contextualizer' );
-const contentBranchNodeNames = [ 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 
'h6', 'p', 'pre' ];
+const contentBranchNodeNames = [ 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 
'h6', 'p', 'pre', 'div', 'table', 'ol', 'li' ];
 
 /**
  * Contextualizer for MediaWiki DOM HTML
@@ -31,13 +31,18 @@
return 'media';
}
 
+   // Otherwise, figure is media
+   if ( context === undefined && tag.name === 'body' ) {
+   return 'section';
+   }
+
// And figure//figcaption is contentBranch
if ( context === 'media' && tag.name === 'figcaption' ) {
return 'contentBranch';
}
 
// And ContentBranchNodes are contentBranch
-   if ( context === undefined && contentBranchNodeNames.indexOf( 
tag.name ) > -1 ) {
+   if ( ( context === 'section' || context === undefined ) && 
contentBranchNodeNames.indexOf( tag.name ) > -1 ) {
return 'contentBranch';
}
 
diff --git a/lib/lineardoc/Parser.js b/lib/lineardoc/Parser.js
index d2410da..5291228 100644
--- a/lib/lineardoc/Parser.js
+++ b/lib/lineardoc/Parser.js
@@ -57,7 +57,7 @@
this.builder = this.rootBuilder;
// Stack of tags currently open
this.allTags = [];
-   // context for each tag currently open; 
undefined|'verbatim'|'media'|'contentBranch'
+   // context for each tag currently open; 
undefined|'verbatim'|'media'|'contentBranch'|'section'
this.contexts = [];
 };
 
@@ -78,6 +78,14 @@
} else if ( this.isInlineAnnotationTag( tag.name ) ) {
this.builder.pushInlineAnnotationTag( tag );
} else {
+   if ( 

[MediaWiki-commits] [Gerrit] mediawiki...cxserver[master]: Section wrapping at cxserver for v2 page fetch api

2017-11-22 Thread Santhosh (Code Review)
Santhosh has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/392790 )

Change subject: Section wrapping at cxserver for v2 page fetch api
..

Section wrapping at cxserver for v2 page fetch api

The v2 page fetch API will wrap the translatable sections
with a <section> tag. Immediate children of <body>
will be considered translatable sections.

Bug: T177752

Change-Id: Ibb5937061c6980579d35cd24a0ba8205b109f8c7
---
M bin/segment
M lib/lineardoc/Doc.js
M lib/lineardoc/MwContextualizer.js
M lib/lineardoc/Parser.js
M lib/mw/MWPageLoader.js
M lib/routes/v2.js
A test/mw/SectionWrap.test.js
M test/segmentation/data/result-18.html
M test/segmentation/data/result-4.html
M test/segmentation/data/test-18.html
10 files changed, 108 insertions(+), 24 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/cxserver 
refs/changes/90/392790/1

diff --git a/bin/segment b/bin/segment
index 2a18675..c23f7b0 100755
--- a/bin/segment
+++ b/bin/segment
@@ -11,7 +11,9 @@
 }
 
 function getParsedDoc( content ) {
-   const parser = new LinearDoc.Parser( new LinearDoc.MwContextualizer() );
+   const parser = new LinearDoc.Parser( new LinearDoc.MwContextualizer(), {
+   wrapSections: true
+   } );
parser.init();
parser.write( content );
return parser.builder.doc;
diff --git a/lib/lineardoc/Doc.js b/lib/lineardoc/Doc.js
index 6c4047d..30d2909 100644
--- a/lib/lineardoc/Doc.js
+++ b/lib/lineardoc/Doc.js
@@ -77,10 +77,14 @@
 Doc.prototype.segment = function ( getBoundaries ) {
var i, len, item, tag, textBlock, hash,
newDoc = new Doc(),
+   nextSectionId = 0,
nextId = 0;
 
// TODO: return different counters depending on type
-   function getNextId( type ) {
+   function getNextId( type, tagName ) {
+   if ( tagName === 'section' ) {
+   return String( 'cxSourceSection' + nextSectionId++ );
+   }
if ( type === 'segment' || type === 'link' || type === 'block' 
) {
return String( nextId++ );
} else {
@@ -115,7 +119,7 @@
).substr( 0, 30 );
}
} else {
-   tag.attributes.id = getNextId( 'block' );
+   tag.attributes.id = getNextId( 'block', 
tag.name );
}
newDoc.addItem( item.type, tag );
} else if ( this.items[ i ].type !== 'textblock' ) {
diff --git a/lib/lineardoc/MwContextualizer.js 
b/lib/lineardoc/MwContextualizer.js
index c5d3742..d964443 100644
--- a/lib/lineardoc/MwContextualizer.js
+++ b/lib/lineardoc/MwContextualizer.js
@@ -1,7 +1,7 @@
 'use strict';
 
 const Contextualizer = require( './Contextualizer' );
-const contentBranchNodeNames = [ 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 
'h6', 'p', 'pre' ];
+const contentBranchNodeNames = [ 'blockquote', 'h1', 'h2', 'h3', 'h4', 'h5', 
'h6', 'p', 'pre', 'div', 'table', 'ol', 'li' ];
 
 /**
  * Contextualizer for MediaWiki DOM HTML
@@ -31,13 +31,18 @@
return 'media';
}
 
+   // Otherwise, figure is media
+   if ( context === undefined && tag.name === 'body' ) {
+   return 'section';
+   }
+
// And figure//figcaption is contentBranch
if ( context === 'media' && tag.name === 'figcaption' ) {
return 'contentBranch';
}
 
// And ContentBranchNodes are contentBranch
-   if ( context === undefined && contentBranchNodeNames.indexOf( 
tag.name ) > -1 ) {
+   if ( ( context === 'section' || context === undefined ) && 
contentBranchNodeNames.indexOf( tag.name ) > -1 ) {
return 'contentBranch';
}
 
diff --git a/lib/lineardoc/Parser.js b/lib/lineardoc/Parser.js
index d2410da..5291228 100644
--- a/lib/lineardoc/Parser.js
+++ b/lib/lineardoc/Parser.js
@@ -57,7 +57,7 @@
this.builder = this.rootBuilder;
// Stack of tags currently open
this.allTags = [];
-   // context for each tag currently open; 
undefined|'verbatim'|'media'|'contentBranch'
+   // context for each tag currently open; 
undefined|'verbatim'|'media'|'contentBranch'|'section'
this.contexts = [];
 };
 
@@ -78,6 +78,14 @@
} else if ( this.isInlineAnnotationTag( tag.name ) ) {
this.builder.pushInlineAnnotationTag( tag );
} else {
+   if ( this.options.wrapSections && 
this.contextualizer.getContext() === 'section' ) {
+   this.builder.pushBlockTag( {
+   name: 'section',
+   attributes: {
+