jenkins-bot has submitted this change and it was merged.

Change subject: Bug 50603, 44498: Handle templates straddling cell attributes & 
content
......................................................................


Bug 50603, 44498: Handle templates straddling cell attributes & content

This patch adds a DOM visitor to fix up content like

{|
|{{nom|Bar}}
|title="foo" {{nom|Bar}}
|}

where {{nom|Bar}} expands to attributes and a table cell, for example

style="foo" class="bar"|Bar

The attributes are re-tokenized and stripped from the table contents. The
template encapsulation metadata is moved to the table cell and updated.

This implementation renders and round-trips the templates documented in
- https://en.wikipedia.org/wiki/Template:Table_cell_templates/doc
- http://en.wikipedia.org/wiki/User:Nicolas1981/Sandbox (from bug 44498)

Nevertheless, there are some limitations:
- We assume that attributes don't contain wiki markup (apart from <nowiki>)
  and end up in text or nowiki nodes.

- Only a single table cell is produced / opened by the template that
  contains the attributes. This limitation could be lifted with more
  aggressive re-parsing if really needed in practice.

- There is only a single transclusion in the table cell content. This
  limitation can be lifted with more advanced data-mw construction.

Bug: 50603
Bug: 44498

Change-Id: Ibbd16952784be6768622ed6df65d3475d2c37e91
---
D js/lib/dom.t.TDFixups.js
A js/lib/dom.t.TableFixups.js
M js/lib/ext.core.Sanitizer.js
M js/lib/mediawiki.DOMPostProcessor.js
M js/lib/mediawiki.DOMUtils.js
M js/lib/mediawiki.tokenizer.peg.js
M js/tests/parserTests-blacklist.js
M js/tests/parserTests.txt
8 files changed, 384 insertions(+), 79 deletions(-)

Approvals:
  Subramanya Sastry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/js/lib/dom.t.TDFixups.js b/js/lib/dom.t.TDFixups.js
deleted file mode 100644
index 65fb3ad..0000000
--- a/js/lib/dom.t.TDFixups.js
+++ /dev/null
@@ -1,72 +0,0 @@
-var DU = require('./mediawiki.DOMUtils.js').DOMUtils;
-
-
-/**
- * DOM visitor that strips the double td for this test case:
- * |{{echo|{{!}} Foo}}
- *
- * See https://bugzilla.wikimedia.org/show_bug.cgi?id=50603
- */
-function stripDoubleTDs (env, node) {
-       var nextNode = node.nextSibling;
-
-       if (!DU.isLiteralHTMLNode(node) &&
-               nextNode !== null &&
-           nextNode.nodeName === 'TD' &&
-           !DU.isLiteralHTMLNode(nextNode) &&
-               DU.nodeEssentiallyEmpty(node) &&
-               (// FIXME: will not be set for nested templates
-                DU.isEncapsulatedElt(nextNode) ||
-                // Hacky work-around for nested templates
-                /^{{.*?}}$/.test(nextNode.data.parsoid.src))
-           )
-       {
-               // Update the dsr. Since we are coalescing the first
-               // node with the second (or, more precisely, deleting
-               // the first node), we have to update the second DSR's
-               // starting point and start tag width.
-               var nodeDSR     = node.data.parsoid.dsr,
-                   nextNodeDSR = nextNode.data.parsoid.dsr;
-
-               if (nodeDSR && nextNodeDSR) {
-                       nextNodeDSR[0] = nodeDSR[0];
-               }
-
-               // Now update data-mw
-               // XXX: use data.mw for data-mw as well!
-               var dataMW = DU.getJSONAttribute(nextNode, 'data-mw'),
-                       nodeSrc = DU.getWTSource(env, node);
-
-               if (!dataMW.parts) {
-                       dataMW.parts = [
-                               nodeSrc,
-                               {
-                                       // XXX: Should we always use parts or 
at least the
-                                       // template wrapper? This will need to 
be updated whenever
-                                       // we change the template info.
-                                       template: {
-                                               target: dataMW.target,
-                                               params: dataMW.params,
-                                               i: 0
-                                       }
-                               }
-                       ];
-                       dataMW.target = undefined;
-                       dataMW.params = undefined;
-               } else {
-                       dataMW.parts.unshift(nodeSrc);
-               }
-               DU.setJSONAttribute(nextNode, 'data-mw', dataMW);
-
-               // Delete the duplicated <td> node.
-               node.parentNode.removeChild(node);
-               // This node was deleted, so don't continue processing on it.
-               return nextNode;
-       }
-
-       return true;
-}
-
-if (typeof module === "object") {
-       module.exports.stripDoubleTDs = stripDoubleTDs;
-}
diff --git a/js/lib/dom.t.TableFixups.js b/js/lib/dom.t.TableFixups.js
new file mode 100644
index 0000000..96e5b33
--- /dev/null
+++ b/js/lib/dom.t.TableFixups.js
@@ -0,0 +1,330 @@
+var DU = require('./mediawiki.DOMUtils.js').DOMUtils,
+    PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
+       Sanitizer = require('./ext.core.Sanitizer.js').Sanitizer,
+    defines = require('./mediawiki.parser.defines.js');
+
+// define some constructor shortcuts
+var TagTk = defines.TagTk;
+
+/**
+ * TableFixups class
+ *
+ * Provides two DOMTraverser visitors that implement the two parts of
+ * https://bugzilla.wikimedia.org/show_bug.cgi?id=50603:
+ * - stripDoubleTDs
+ * - reparseTemplatedAttributes
+ */
+function TableFixups (env) {
+       /**
+        * Set up some helper objects for reparseTemplatedAttributes
+        */
+
+       /**
+        * Actually the regular tokenizer, but we'll use
+        * tokenizeTableCellAttributes only.
+        */
+       this.tokenizer = new PegTokenizer( env );
+       // XXX: Don't require us to fake a manager!
+       var fakeManager = {
+               env: env,
+               addTransform: function(){}
+       };
+       this.sanitizer = new Sanitizer( fakeManager );
+}
+
+/**
+ * DOM visitor that strips the double td for this test case:
+ * |{{echo|{{!}} Foo}}
+ *
+ * @public
+ *
+ * See https://bugzilla.wikimedia.org/show_bug.cgi?id=50603
+ */
+TableFixups.prototype.stripDoubleTDs = function (env, node) {
+       var nextNode = node.nextSibling;
+
+       if (!DU.isLiteralHTMLNode(node) &&
+               nextNode !== null &&
+           nextNode.nodeName === 'TD' &&
+           !DU.isLiteralHTMLNode(nextNode) &&
+               DU.nodeEssentiallyEmpty(node) &&
+               (// FIXME: will not be set for nested templates
+                DU.isEncapsulatedElt(nextNode) ||
+                // Hacky work-around for nested templates
+                /^{{.*?}}$/.test(nextNode.data.parsoid.src))
+           )
+       {
+               // Update the dsr. Since we are coalescing the first
+               // node with the second (or, more precisely, deleting
+               // the first node), we have to update the second DSR's
+               // starting point and start tag width.
+               var nodeDSR     = node.data.parsoid.dsr,
+                   nextNodeDSR = nextNode.data.parsoid.dsr;
+
+               if (nodeDSR && nextNodeDSR) {
+                       nextNodeDSR[0] = nodeDSR[0];
+               }
+
+               // Now update data-mw
+               // XXX: use data.mw for data-mw as well!
+               var dataMW = DU.getJSONAttribute(nextNode, 'data-mw'),
+                       nodeSrc = DU.getWTSource(env, node);
+
+               if (!dataMW.parts) {
+                       dataMW.parts = [
+                               nodeSrc,
+                               {
+                                       // XXX: Should we always use parts or 
at least the
+                                       // template wrapper? This will need to 
be updated whenever
+                                       // we change the template info.
+                                       template: {
+                                               target: dataMW.target,
+                                               params: dataMW.params,
+                                               i: 0
+                                       }
+                               }
+                       ];
+                       dataMW.target = undefined;
+                       dataMW.params = undefined;
+               } else {
+                       dataMW.parts.unshift(nodeSrc);
+               }
+               DU.setJSONAttribute(nextNode, 'data-mw', dataMW);
+
+               // Delete the duplicated <td> node.
+               node.parentNode.removeChild(node);
+               // This node was deleted, so don't continue processing on it.
+               return nextNode;
+       }
+
+       return true;
+};
+
+
+/**
+ * Helpers for reparseTemplatedAttributes
+ */
+TableFixups.prototype.hasOnlyOneTransclusionChild = function (node) {
+       var n = 0;
+       node.childNodes.forEach(function(n) {
+               if(DU.isElt(n) && DU.isEncapsulatedElt(n)) {
+                       n++;
+               }
+               if (n > 1) {
+                       return false;
+               }
+       });
+       return true;
+};
+
+
+/**
+ * Collect potential attribute content
+ *
+ * We expect this to be text nodes without a pipe character followed by one or
+ * more nowiki spans, followed by a template encapsulation with pure-text and
+ * nowiki content. Collection stops when encountering other nodes or a pipe
+ * character.
+ */
+TableFixups.prototype.collectAttributishContent = function (node) {
+       var buf = [],
+               nodes = [],
+               transclusionNode,
+               // Build the result.
+               buildRes = function () {
+                       return {
+                               txt: buf.join(''),
+                               nodes: nodes,
+                               transclusionNode: transclusionNode
+                       };
+               },
+               child = node.firstChild;
+       while (child) {
+               if (!transclusionNode &&
+                               child.nodeType === node.TEXT_NODE &&
+                               ! /[|]/.test(child.nodeValue))
+               {
+                       buf.push(child.nodeValue);
+               } else if (transclusionNode &&
+                               child.nodeName === 'SPAN' &&
+                               DU.hasTypeOf(child, 'mw:Nowiki'))
+               {
+                       buf.push(child.textContent);
+               } else if (child.nodeName === 'SPAN' &&
+                               child.childNodes.length === 1 &&
+                               (child.getAttribute('typeof') === null &&
+                                transclusionNode &&
+                                child.getAttribute('about') === 
transclusionNode.getAttribute('about') ||
+                                DU.hasTypeOf(child, 'mw:Transclusion')))
+               {
+                       buf.push(child.textContent);
+                       if (!transclusionNode && DU.hasTypeOf(child, 
'mw:Transclusion')) {
+                               transclusionNode = child;
+                       }
+               } else {
+                       return buildRes();
+               }
+               nodes.push(child);
+               if (/[|]/.test(buf.last())) {
+                       return buildRes();
+               }
+               child = child.nextSibling;
+       }
+       return buildRes();
+};
+
+/**
+ * Bug 44498, second part of bug 50603
+ *
+ * @public
+ *
+ * Handle wikitext like
+ *
+ * {|
+ * |{{nom|Bar}}
+ * |}
+ *
+ * where nom expands to style="foo" class="bar"|Bar. The attributes are
+ * tokenized and stripped from the table contents.
+ *
+ * This method works well for the templates documented in
+ * https://en.wikipedia.org/wiki/Template:Table_cell_templates/doc
+ *
+ * Nevertheless, there are some limitations:
+ * - We assume that attributes don't contain wiki markup (apart from <nowiki>)
+ *   and end up in text or nowiki nodes.
+ * - Only a single table cell is produced / opened by the template that
+ *   contains the attributes. This limitation could be lifted with more
+ *   aggressive re-parsing if really needed in practice.
+ * - There is only a single transclusion in the table cell content. This
+ *   limitation can be lifted with more advanced data-mw construction.
+ */
+TableFixups.prototype.reparseTemplatedAttributes = function (env, node) {
+
+       // Cheap checks first
+       if (!DU.isLiteralHTMLNode(node) &&
+               // We use the dsr start tag width as a proxy for 'has no 
attributes
+               // yet'. We accept '|' and '||' (row-based syntax), so at most 
two
+               // chars.
+               node.data.parsoid.dsr &&
+               node.data.parsoid.dsr[2] !== null &&
+               node.data.parsoid.dsr[2] <= 2)
+       {
+               // Now actually look at the content
+               var attributishContent = this.collectAttributishContent(node),
+                       transclusionNode = attributishContent.transclusionNode;
+
+                       // First of all make sure we have a transclusion that 
produces leading
+                       // text content
+               if ( transclusionNode &&
+                               // Check for the pipe character in the 
attributish text.
+                               // Also make sure that we only trigger for 
simple
+                               // attribute-only cases for now.  Don't handle 
|{{multicells}}
+                               // where multicells expands to something like 
style="foo"| Bar
+                               // || Baz
+                               /^[^|]+[|][^|]*$/.test(attributishContent.txt) 
&&
+                               // And only handle a single nested transclusion 
for now
+                               // TODO: Handle data-mw construction for 
multi-transclusion content as
+                               // well, then relax this restriction.
+                               this.hasOnlyOneTransclusionChild(node)
+                  )
+               {
+                       //console.log(node.data.parsoid.dsr, 
JSON.stringify(attributishText));
+
+                       // Try to re-parse the attributish text content
+                       var attributishPrefix = 
attributishContent.txt.match(/^[^|]+\|/)[0],
+                               // re-parse the attributish prefix
+                               attributeTokens = this.tokenizer
+                                       
.tokenizeTableCellAttributes(attributishPrefix);
+                       if (attributeTokens) {
+                               // Found attributes.
+
+                               // Sanitize them
+                               var sanitizedToken = this.sanitizer
+                                       .sanitizeTokens(
+                                                       [new 
TagTk(node.nodeName.toLowerCase(),
+                                                               
attributeTokens[0])])[0];
+                               //console.log(JSON.stringify(sanitizedToken));
+
+                               // and transfer the sanitized attributes to the 
td node
+                               sanitizedToken.attribs.forEach(function(kv) {
+                                       node.setAttribute(kv.k, kv.v);
+                               });
+
+                               // Update the template encapsulation including 
data-mw
+
+                               // Lift up the about group to our td node.
+                               node.setAttribute('typeof', 
transclusionNode.getAttribute('typeof'));
+                               node.setAttribute('about', 
transclusionNode.getAttribute('about'));
+                               var dataMW = 
DU.getJSONAttribute(transclusionNode, 'data-mw'),
+                                       // FIXME:
+                                       parts = dataMW.parts ||
+                                               // XXX: Should we always use 
parts or at least the
+                                               // template wrapper? This will 
need to be updated whenever
+                                               // we change the template info.
+                                               [
+                                               {
+                                                       template: {
+                                                               target: 
dataMW.target,
+                                                               params: 
dataMW.params,
+                                                               i: 0
+                                                       }
+                                               }
+                               ];
+
+                               // Construct a new data-mw
+                               // Get the td and content source up to the 
transclusion start
+                               parts.unshift(env.page.src.substring(
+                                                       
node.data.parsoid.dsr[0],
+                                                       
transclusionNode.data.parsoid.dsr[0]));
+                               // Add wikitext for the table cell content 
following the
+                               // transclusion. This is safe as we are 
currently only
+                               // handling a single transclusion in the 
content, which is
+                               // guaranteed to have a dsr that covers the 
transclusion
+                               // itself.
+                               parts.push(env.page.src.substring(
+                                                       
transclusionNode.data.parsoid.dsr[1],
+                                                       
node.data.parsoid.dsr[1]));
+
+                               // Save the new data-mw on the td node
+                               // XXX: use data.mw instead, see
+                               // 
https://bugzilla.wikimedia.org/show_bug.cgi?id=53109
+                               DU.setJSONAttribute(node, 'data-mw', {parts: 
parts});
+                               node.data.parsoid.pi = 
transclusionNode.data.parsoid.pi;
+
+                               // Remove the span wrapper
+                               var attributishNodes = attributishContent.nodes;
+                               while(attributishNodes.length) {
+                                       var n = attributishNodes.shift();
+                                       if (/[|]/.test(n.textContent)) {
+                                               // Remove the consumed prefix 
from the text node
+                                               var nValue;
+                                               if (n.nodeName === '#text') {
+                                                       nValue = n.nodeValue;
+                                               } else {
+                                                       nValue = n.textContent;
+                                               }
+                                               // and convert it into a simple 
text node
+                                               
node.replaceChild(node.ownerDocument.createTextNode(nValue.replace(/^[^|]*[|]/, 
'')), n);
+                                       } else {
+                                               // content was consumed by 
attributes, so just drop it
+                                               // from the cell
+                                               node.removeChild(n);
+                                       }
+                               }
+                               // Remove template encapsulation from other 
children. The
+                               // table cell wraps everything now.
+                               node.childNodes.map(function(childNode) {
+                                       if (childNode.getAttribute && 
childNode.getAttribute('about')) {
+                                               
childNode.removeAttribute('about');
+                                       }
+                               });
+                       }
+               }
+       }
+       return true;
+};
+
+if (typeof module === "object") {
+       module.exports.TableFixups = TableFixups;
+}
diff --git a/js/lib/ext.core.Sanitizer.js b/js/lib/ext.core.Sanitizer.js
index 00b5b8e..8ce9547 100644
--- a/js/lib/ext.core.Sanitizer.js
+++ b/js/lib/ext.core.Sanitizer.js
@@ -579,12 +579,33 @@
  * @param {TokenTransformManager} manager The manager for this part of the 
pipeline.
  */
 function Sanitizer ( manager ) {
+       // FIXME: would be good to make the sanitizer independent of the manager
+       // so that it can be used separately. See
+       // https://bugzilla.wikimedia.org/show_bug.cgi?id=52941
        this.manager = manager;
        this.register( manager );
        this.constants = SanitizerConstants;
        this.attrWhiteListCache = {};
 }
 
+/**
+ * Utility function: Sanitize an array of tokens. Not used in normal token
+ * pipelines. The only caller is currently in dom.t.TDFixups.js.
+ *
+ * TODO: Move to Util / generalize when working on bug 52941?
+ */
+Sanitizer.prototype.sanitizeTokens = function (tokens) {
+       var res = [],
+               sanitizer = this;
+       tokens.forEach(function(token) {
+               if(token.name && token.name === 'a') {
+                       token = sanitizer.onAnchor(token).token;
+               }
+               res.push(sanitizer.onAny(token).token);
+       });
+       return res;
+};
+
 Sanitizer.prototype.getAttrWhiteList = function(tag) {
        var awlCache = this.attrWhiteListCache;
        if (!awlCache[tag]) {
diff --git a/js/lib/mediawiki.DOMPostProcessor.js 
b/js/lib/mediawiki.DOMPostProcessor.js
index 6e50d62..da2fb3e 100644
--- a/js/lib/mediawiki.DOMPostProcessor.js
+++ b/js/lib/mediawiki.DOMPostProcessor.js
@@ -20,7 +20,7 @@
        markTreeBuilderFixups = 
require('./dom.markTreeBuilderFixups.js').markTreeBuilderFixups,
        migrateStartMetas = 
require('./dom.migrateStartMetas.js').migrateStartMetas,
        migrateTrailingNLs = 
require('./dom.migrateTrailingNLs.js').migrateTrailingNLs,
-       stripDoubleTDs = require('./dom.t.TDFixups.js').stripDoubleTDs,
+       TableFixups = require('./dom.t.TableFixups.js'),
        stripMarkerMetas = CleanUp.stripMarkerMetas,
        unpackDOMFragments = 
require('./dom.t.unpackDOMFragments.js').unpackDOMFragments,
        wrapTemplates = require('./dom.wrapTemplates.js').wrapTemplates;
@@ -136,14 +136,18 @@
        this.processors.push(generateRefs.bind(null,
                                
env.conf.parsoid.nativeExtensions.cite.references));
 
+       var domVisitor2 = new DOMTraverser(),
+               tableFixer = new TableFixups.TableFixups(env);
        // 1. Strip marker metas -- removes left over marker metas (ex: metas
        //    nested in expanded tpl/extension output).
-       // 2. Fix up DOM for li-hack.
-       // 3. Save data.parsoid into data-parsoid html attribute.
-       var domVisitor2 = new DOMTraverser();
        domVisitor2.addHandler( 'meta', stripMarkerMetas.bind(null, 
env.conf.parsoid.editMode) );
+       // 2. Fix up DOM for li-hack.
        domVisitor2.addHandler( 'li', handleLIHack.bind( null, env ) );
-       domVisitor2.addHandler( 'td', stripDoubleTDs.bind( null, env ) );
+       // 3. Fix up issues from templated table cells and table cell attributes
+       domVisitor2.addHandler( 'td', tableFixer.stripDoubleTDs.bind( 
tableFixer, env ) );
+       domVisitor2.addHandler( 'td', 
tableFixer.reparseTemplatedAttributes.bind( tableFixer, env ) );
+       domVisitor2.addHandler( 'th', 
tableFixer.reparseTemplatedAttributes.bind( tableFixer, env ) );
+       // 4. Save data.parsoid into data-parsoid html attribute.
        domVisitor2.addHandler( null, cleanupAndSaveDataParsoid );
        this.processors.push(domVisitor2.traverse.bind(domVisitor2));
 }
diff --git a/js/lib/mediawiki.DOMUtils.js b/js/lib/mediawiki.DOMUtils.js
index 867bdec..d9d3369 100644
--- a/js/lib/mediawiki.DOMUtils.js
+++ b/js/lib/mediawiki.DOMUtils.js
@@ -95,6 +95,20 @@
        },
 
        /**
+        * Test if a node matches a given typeof.
+        */
+       hasTypeOf: function(node, type) {
+               if (!node.getAttribute) {
+                       return false;
+               }
+               var typeOfs = node.getAttribute('typeof');
+               if (!typeOfs) {
+                       return false;
+               }
+               return typeOfs.split(' ').indexOf(type) !== -1;
+       },
+
+       /**
         * Decode a JSON object into the data member of DOM nodes
         *
         * @param {Node} node
diff --git a/js/lib/mediawiki.tokenizer.peg.js 
b/js/lib/mediawiki.tokenizer.peg.js
index dd69e99..7ed2b0e 100644
--- a/js/lib/mediawiki.tokenizer.peg.js
+++ b/js/lib/mediawiki.tokenizer.peg.js
@@ -256,6 +256,13 @@
        return this.tokenize(text, 'img_options');
 };
 
+/**
+ * Tokenize table cell attributes
+ */
+PegTokenizer.prototype.tokenizeTableCellAttributes = function( text ) {
+       return this.tokenize(text, "single_cell_table_args");
+};
+
 /*
  * Inline breaks, flag-enabled production which detects end positions for
  * active higher-level productions in inline and other nested productions.
diff --git a/js/tests/parserTests-blacklist.js 
b/js/tests/parserTests-blacklist.js
index bb5ec3e..222912d 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -64,8 +64,6 @@
 add("wt2html", "Bracketed external links with template-generated invalid 
target");
 add("wt2html", "External link containing double-single-quotes in text embedded 
in italics (bug 4598 sanity check)");
 add("wt2html", "Non-extlinks in brackets");
-add("wt2html", "Template-generated table cell attributes and cell content");
-add("wt2html", "Template-generated table cell attributes and cell content 
(2)");
 add("wt2html", "Template-generated table cell attributes and cell content 
(3)");
 add("wt2html", "Link containing \"#<\" and \"#>\" % as a hex sequences- these 
are valid section anchors\nExample for such a section: == < ==");
 add("wt2html", "Link containing \"<#\" and \">#\" as a hex sequences");
diff --git a/js/tests/parserTests.txt b/js/tests/parserTests.txt
index 7f33f47..1582bdc 100644
--- a/js/tests/parserTests.txt
+++ b/js/tests/parserTests.txt
@@ -4067,11 +4067,14 @@
 !! input
 {|
 |{{table_attribs}}
+| {{table_attribs}}
 |}
 !! result
 <table>
 <tr>
 <td style="color: red"> Foo
+</td>
+<td style="color: red"> Foo
 </td></tr></table>
 
 !! end

-- 
To view, visit https://gerrit.wikimedia.org/r/79943
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ibbd16952784be6768622ed6df65d3475d2c37e91
Gerrit-PatchSet: 8
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: GWicke <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: GWicke <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to