jenkins-bot has submitted this change and it was merged.
Change subject: Bug 50603, 44498: Handle templates straddling cell attributes &
content
......................................................................
Bug 50603, 44498: Handle templates straddling cell attributes & content
This patch adds a DOM visitor to fix up content like
{|
|{{nom|Bar}}
|title="foo" {{nom|Bar}}
|}
where {{nom|Bar}} expands to attributes and a table cell, for example
style="foo" class="bar"|Bar
The attributes are re-tokenized and stripped from the table contents. The
template encapsulation metadata is moved to the table cell and updated.
This implementation renders and round-trips the templates documented in
- https://en.wikipedia.org/wiki/Template:Table_cell_templates/doc
- http://en.wikipedia.org/wiki/User:Nicolas1981/Sandbox (from bug 44498)
Nevertheless, there are some limitations:
- We assume that attributes don't contain wiki markup (apart from <nowiki>)
and end up in text or nowiki nodes.
- Only a single table cell is produced / opened by the template that
contains the attributes. This limitation could be lifted with more
aggressive re-parsing if really needed in practice.
- There is only a single transclusion in the table cell content. This
limitation can be lifted with more advanced data-mw construction.
Bug: 50603
Bug: 44498
Change-Id: Ibbd16952784be6768622ed6df65d3475d2c37e91
---
D js/lib/dom.t.TDFixups.js
A js/lib/dom.t.TableFixups.js
M js/lib/ext.core.Sanitizer.js
M js/lib/mediawiki.DOMPostProcessor.js
M js/lib/mediawiki.DOMUtils.js
M js/lib/mediawiki.tokenizer.peg.js
M js/tests/parserTests-blacklist.js
M js/tests/parserTests.txt
8 files changed, 384 insertions(+), 79 deletions(-)
Approvals:
Subramanya Sastry: Looks good to me, approved
jenkins-bot: Verified
diff --git a/js/lib/dom.t.TDFixups.js b/js/lib/dom.t.TDFixups.js
deleted file mode 100644
index 65fb3ad..0000000
--- a/js/lib/dom.t.TDFixups.js
+++ /dev/null
@@ -1,72 +0,0 @@
-var DU = require('./mediawiki.DOMUtils.js').DOMUtils;
-
-
-/**
- * DOM visitor that strips the double td for this test case:
- * |{{echo|{{!}} Foo}}
- *
- * See https://bugzilla.wikimedia.org/show_bug.cgi?id=50603
- */
-function stripDoubleTDs (env, node) {
- var nextNode = node.nextSibling;
-
- if (!DU.isLiteralHTMLNode(node) &&
- nextNode !== null &&
- nextNode.nodeName === 'TD' &&
- !DU.isLiteralHTMLNode(nextNode) &&
- DU.nodeEssentiallyEmpty(node) &&
- (// FIXME: will not be set for nested templates
- DU.isEncapsulatedElt(nextNode) ||
- // Hacky work-around for nested templates
- /^{{.*?}}$/.test(nextNode.data.parsoid.src))
- )
- {
- // Update the dsr. Since we are coalescing the first
- // node with the second (or, more precisely, deleting
- // the first node), we have to update the second DSR's
- // starting point and start tag width.
- var nodeDSR = node.data.parsoid.dsr,
- nextNodeDSR = nextNode.data.parsoid.dsr;
-
- if (nodeDSR && nextNodeDSR) {
- nextNodeDSR[0] = nodeDSR[0];
- }
-
- // Now update data-mw
- // XXX: use data.mw for data-mw as well!
- var dataMW = DU.getJSONAttribute(nextNode, 'data-mw'),
- nodeSrc = DU.getWTSource(env, node);
-
- if (!dataMW.parts) {
- dataMW.parts = [
- nodeSrc,
- {
- // XXX: Should we always use parts or
at least the
- // template wrapper? This will need to
be updated whenever
- // we change the template info.
- template: {
- target: dataMW.target,
- params: dataMW.params,
- i: 0
- }
- }
- ];
- dataMW.target = undefined;
- dataMW.params = undefined;
- } else {
- dataMW.parts.unshift(nodeSrc);
- }
- DU.setJSONAttribute(nextNode, 'data-mw', dataMW);
-
- // Delete the duplicated <td> node.
- node.parentNode.removeChild(node);
- // This node was deleted, so don't continue processing on it.
- return nextNode;
- }
-
- return true;
-}
-
-if (typeof module === "object") {
- module.exports.stripDoubleTDs = stripDoubleTDs;
-}
diff --git a/js/lib/dom.t.TableFixups.js b/js/lib/dom.t.TableFixups.js
new file mode 100644
index 0000000..96e5b33
--- /dev/null
+++ b/js/lib/dom.t.TableFixups.js
@@ -0,0 +1,330 @@
+var DU = require('./mediawiki.DOMUtils.js').DOMUtils,
+ PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
+ Sanitizer = require('./ext.core.Sanitizer.js').Sanitizer,
+ defines = require('./mediawiki.parser.defines.js');
+
+// define some constructor shortcuts
+var TagTk = defines.TagTk;
+
+/**
+ * TableFixups class
+ *
+ * Provides two DOMTraverser visitors that implement the two parts of
+ * https://bugzilla.wikimedia.org/show_bug.cgi?id=50603:
+ * - stripDoubleTDs
+ * - reparseTemplatedAttributes
+ */
+function TableFixups (env) {
+ /**
+ * Set up some helper objects for reparseTemplatedAttributes
+ */
+
+ /**
+ * Actually the regular tokenizer, but we'll use
+ * tokenizeTableCellAttributes only.
+ */
+ this.tokenizer = new PegTokenizer( env );
+ // XXX: Don't require us to fake a manager!
+ var fakeManager = {
+ env: env,
+ addTransform: function(){}
+ };
+ this.sanitizer = new Sanitizer( fakeManager );
+}
+
+/**
+ * DOM visitor that strips the double td for this test case:
+ * |{{echo|{{!}} Foo}}
+ *
+ * @public
+ *
+ * See https://bugzilla.wikimedia.org/show_bug.cgi?id=50603
+ */
+TableFixups.prototype.stripDoubleTDs = function (env, node) {
+ var nextNode = node.nextSibling;
+
+ if (!DU.isLiteralHTMLNode(node) &&
+ nextNode !== null &&
+ nextNode.nodeName === 'TD' &&
+ !DU.isLiteralHTMLNode(nextNode) &&
+ DU.nodeEssentiallyEmpty(node) &&
+ (// FIXME: will not be set for nested templates
+ DU.isEncapsulatedElt(nextNode) ||
+ // Hacky work-around for nested templates
+ /^{{.*?}}$/.test(nextNode.data.parsoid.src))
+ )
+ {
+ // Update the dsr. Since we are coalescing the first
+ // node with the second (or, more precisely, deleting
+ // the first node), we have to update the second DSR's
+ // starting point and start tag width.
+ var nodeDSR = node.data.parsoid.dsr,
+ nextNodeDSR = nextNode.data.parsoid.dsr;
+
+ if (nodeDSR && nextNodeDSR) {
+ nextNodeDSR[0] = nodeDSR[0];
+ }
+
+ // Now update data-mw
+ // XXX: use data.mw for data-mw as well!
+ var dataMW = DU.getJSONAttribute(nextNode, 'data-mw'),
+ nodeSrc = DU.getWTSource(env, node);
+
+ if (!dataMW.parts) {
+ dataMW.parts = [
+ nodeSrc,
+ {
+ // XXX: Should we always use parts or
at least the
+ // template wrapper? This will need to
be updated whenever
+ // we change the template info.
+ template: {
+ target: dataMW.target,
+ params: dataMW.params,
+ i: 0
+ }
+ }
+ ];
+ dataMW.target = undefined;
+ dataMW.params = undefined;
+ } else {
+ dataMW.parts.unshift(nodeSrc);
+ }
+ DU.setJSONAttribute(nextNode, 'data-mw', dataMW);
+
+ // Delete the duplicated <td> node.
+ node.parentNode.removeChild(node);
+ // This node was deleted, so don't continue processing on it.
+ return nextNode;
+ }
+
+ return true;
+};
+
+
+/**
+ * Helpers for reparseTemplatedAttributes
+ */
+TableFixups.prototype.hasOnlyOneTransclusionChild = function (node) {
+ var n = 0;
+ node.childNodes.forEach(function(n) {
+ if(DU.isElt(n) && DU.isEncapsulatedElt(n)) {
+ n++;
+ }
+ if (n > 1) {
+ return false;
+ }
+ });
+ return true;
+};
+
+
+/**
+ * Collect potential attribute content
+ *
+ * We expect this to be text nodes without a pipe character followed by one or
+ * more nowiki spans, followed by a template encapsulation with pure-text and
+ * nowiki content. Collection stops when encountering other nodes or a pipe
+ * character.
+ */
+TableFixups.prototype.collectAttributishContent = function (node) {
+ var buf = [],
+ nodes = [],
+ transclusionNode,
+ // Build the result.
+ buildRes = function () {
+ return {
+ txt: buf.join(''),
+ nodes: nodes,
+ transclusionNode: transclusionNode
+ };
+ },
+ child = node.firstChild;
+ while (child) {
+ if (!transclusionNode &&
+ child.nodeType === node.TEXT_NODE &&
+ ! /[|]/.test(child.nodeValue))
+ {
+ buf.push(child.nodeValue);
+ } else if (transclusionNode &&
+ child.nodeName === 'SPAN' &&
+ DU.hasTypeOf(child, 'mw:Nowiki'))
+ {
+ buf.push(child.textContent);
+ } else if (child.nodeName === 'SPAN' &&
+ child.childNodes.length === 1 &&
+ (child.getAttribute('typeof') === null &&
+ transclusionNode &&
+ child.getAttribute('about') ===
transclusionNode.getAttribute('about') ||
+ DU.hasTypeOf(child, 'mw:Transclusion')))
+ {
+ buf.push(child.textContent);
+ if (!transclusionNode && DU.hasTypeOf(child,
'mw:Transclusion')) {
+ transclusionNode = child;
+ }
+ } else {
+ return buildRes();
+ }
+ nodes.push(child);
+ if (/[|]/.test(buf.last())) {
+ return buildRes();
+ }
+ child = child.nextSibling;
+ }
+ return buildRes();
+};
+
+/**
+ * Bug 44498, second part of bug 50603
+ *
+ * @public
+ *
+ * Handle wikitext like
+ *
+ * {|
+ * |{{nom|Bar}}
+ * |}
+ *
+ * where nom expands to style="foo" class="bar"|Bar. The attributes are
+ * tokenized and stripped from the table contents.
+ *
+ * This method works well for the templates documented in
+ * https://en.wikipedia.org/wiki/Template:Table_cell_templates/doc
+ *
+ * Nevertheless, there are some limitations:
+ * - We assume that attributes don't contain wiki markup (apart from <nowiki>)
+ * and end up in text or nowiki nodes.
+ * - Only a single table cell is produced / opened by the template that
+ * contains the attributes. This limitation could be lifted with more
+ * aggressive re-parsing if really needed in practice.
+ * - There is only a single transclusion in the table cell content. This
+ * limitation can be lifted with more advanced data-mw construction.
+ */
+TableFixups.prototype.reparseTemplatedAttributes = function (env, node) {
+
+ // Cheap checks first
+ if (!DU.isLiteralHTMLNode(node) &&
+ // We use the dsr start tag width as a proxy for 'has no
attributes
+ // yet'. We accept '|' and '||' (row-based syntax), so at most
two
+ // chars.
+ node.data.parsoid.dsr &&
+ node.data.parsoid.dsr[2] !== null &&
+ node.data.parsoid.dsr[2] <= 2)
+ {
+ // Now actually look at the content
+ var attributishContent = this.collectAttributishContent(node),
+ transclusionNode = attributishContent.transclusionNode;
+
+ // First of all make sure we have a transclusion that
produces leading
+ // text content
+ if ( transclusionNode &&
+ // Check for the pipe character in the
attributish text.
+ // Also make sure that we only trigger for
simple
+ // attribute-only cases for now. Don't handle
|{{multicells}}
+ // where multicells expands to something like
style="foo"| Bar
+ // || Baz
+ /^[^|]+[|][^|]*$/.test(attributishContent.txt)
&&
+ // And only handle a single nested transclusion
for now
+ // TODO: Handle data-mw construction for
multi-transclusion content as
+ // well, then relax this restriction.
+ this.hasOnlyOneTransclusionChild(node)
+ )
+ {
+ //console.log(node.data.parsoid.dsr,
JSON.stringify(attributishText));
+
+ // Try to re-parse the attributish text content
+ var attributishPrefix =
attributishContent.txt.match(/^[^|]+\|/)[0],
+ // re-parse the attributish prefix
+ attributeTokens = this.tokenizer
+
.tokenizeTableCellAttributes(attributishPrefix);
+ if (attributeTokens) {
+ // Found attributes.
+
+ // Sanitize them
+ var sanitizedToken = this.sanitizer
+ .sanitizeTokens(
+ [new
TagTk(node.nodeName.toLowerCase(),
+
attributeTokens[0])])[0];
+ //console.log(JSON.stringify(sanitizedToken));
+
+ // and transfer the sanitized attributes to the
td node
+ sanitizedToken.attribs.forEach(function(kv) {
+ node.setAttribute(kv.k, kv.v);
+ });
+
+ // Update the template encapsulation including
data-mw
+
+ // Lift up the about group to our td node.
+ node.setAttribute('typeof',
transclusionNode.getAttribute('typeof'));
+ node.setAttribute('about',
transclusionNode.getAttribute('about'));
+ var dataMW =
DU.getJSONAttribute(transclusionNode, 'data-mw'),
+ // FIXME:
+ parts = dataMW.parts ||
+ // XXX: Should we always use
parts or at least the
+ // template wrapper? This will
need to be updated whenever
+ // we change the template info.
+ [
+ {
+ template: {
+ target:
dataMW.target,
+ params:
dataMW.params,
+ i: 0
+ }
+ }
+ ];
+
+ // Construct a new data-mw
+ // Get the td and content source up to the
transclusion start
+ parts.unshift(env.page.src.substring(
+
node.data.parsoid.dsr[0],
+
transclusionNode.data.parsoid.dsr[0]));
+ // Add wikitext for the table cell content
following the
+ // transclusion. This is safe as we are
currently only
+ // handling a single transclusion in the
content, which is
+ // guaranteed to have a dsr that covers the
transclusion
+ // itself.
+ parts.push(env.page.src.substring(
+
transclusionNode.data.parsoid.dsr[1],
+
node.data.parsoid.dsr[1]));
+
+ // Save the new data-mw on the td node
+ // XXX: use data.mw instead, see
+ //
https://bugzilla.wikimedia.org/show_bug.cgi?id=53109
+ DU.setJSONAttribute(node, 'data-mw', {parts:
parts});
+ node.data.parsoid.pi =
transclusionNode.data.parsoid.pi;
+
+ // Remove the span wrapper
+ var attributishNodes = attributishContent.nodes;
+ while(attributishNodes.length) {
+ var n = attributishNodes.shift();
+ if (/[|]/.test(n.textContent)) {
+ // Remove the consumed prefix
from the text node
+ var nValue;
+ if (n.nodeName === '#text') {
+ nValue = n.nodeValue;
+ } else {
+ nValue = n.textContent;
+ }
+ // and convert it into a simple
text node
+
node.replaceChild(node.ownerDocument.createTextNode(nValue.replace(/^[^|]*[|]/,
'')), n);
+ } else {
+ // content was consumed by
attributes, so just drop it
+ // from the cell
+ node.removeChild(n);
+ }
+ }
+ // Remove template encapsulation from other
children. The
+ // table cell wraps everything now.
+ node.childNodes.map(function(childNode) {
+ if (childNode.getAttribute &&
childNode.getAttribute('about')) {
+
childNode.removeAttribute('about');
+ }
+ });
+ }
+ }
+ }
+ return true;
+};
+
+if (typeof module === "object") {
+ module.exports.TableFixups = TableFixups;
+}
diff --git a/js/lib/ext.core.Sanitizer.js b/js/lib/ext.core.Sanitizer.js
index 00b5b8e..8ce9547 100644
--- a/js/lib/ext.core.Sanitizer.js
+++ b/js/lib/ext.core.Sanitizer.js
@@ -579,12 +579,33 @@
* @param {TokenTransformManager} manager The manager for this part of the
pipeline.
*/
function Sanitizer ( manager ) {
+ // FIXME: would be good to make the sanitizer independent of the manager
+ // so that it can be used separately. See
+ // https://bugzilla.wikimedia.org/show_bug.cgi?id=52941
this.manager = manager;
this.register( manager );
this.constants = SanitizerConstants;
this.attrWhiteListCache = {};
}
+/**
+ * Utility function: Sanitize an array of tokens. Not used in normal token
+ * pipelines. The only caller is currently in dom.t.TDFixups.js.
+ *
+ * TODO: Move to Util / generalize when working on bug 52941?
+ */
+Sanitizer.prototype.sanitizeTokens = function (tokens) {
+ var res = [],
+ sanitizer = this;
+ tokens.forEach(function(token) {
+ if(token.name && token.name === 'a') {
+ token = sanitizer.onAnchor(token).token;
+ }
+ res.push(sanitizer.onAny(token).token);
+ });
+ return res;
+};
+
Sanitizer.prototype.getAttrWhiteList = function(tag) {
var awlCache = this.attrWhiteListCache;
if (!awlCache[tag]) {
diff --git a/js/lib/mediawiki.DOMPostProcessor.js
b/js/lib/mediawiki.DOMPostProcessor.js
index 6e50d62..da2fb3e 100644
--- a/js/lib/mediawiki.DOMPostProcessor.js
+++ b/js/lib/mediawiki.DOMPostProcessor.js
@@ -20,7 +20,7 @@
markTreeBuilderFixups =
require('./dom.markTreeBuilderFixups.js').markTreeBuilderFixups,
migrateStartMetas =
require('./dom.migrateStartMetas.js').migrateStartMetas,
migrateTrailingNLs =
require('./dom.migrateTrailingNLs.js').migrateTrailingNLs,
- stripDoubleTDs = require('./dom.t.TDFixups.js').stripDoubleTDs,
+ TableFixups = require('./dom.t.TableFixups.js'),
stripMarkerMetas = CleanUp.stripMarkerMetas,
unpackDOMFragments =
require('./dom.t.unpackDOMFragments.js').unpackDOMFragments,
wrapTemplates = require('./dom.wrapTemplates.js').wrapTemplates;
@@ -136,14 +136,18 @@
this.processors.push(generateRefs.bind(null,
env.conf.parsoid.nativeExtensions.cite.references));
+ var domVisitor2 = new DOMTraverser(),
+ tableFixer = new TableFixups.TableFixups(env);
// 1. Strip marker metas -- removes left over marker metas (ex: metas
// nested in expanded tpl/extension output).
- // 2. Fix up DOM for li-hack.
- // 3. Save data.parsoid into data-parsoid html attribute.
- var domVisitor2 = new DOMTraverser();
domVisitor2.addHandler( 'meta', stripMarkerMetas.bind(null,
env.conf.parsoid.editMode) );
+ // 2. Fix up DOM for li-hack.
domVisitor2.addHandler( 'li', handleLIHack.bind( null, env ) );
- domVisitor2.addHandler( 'td', stripDoubleTDs.bind( null, env ) );
+ // 3. Fix up issues from templated table cells and table cell attributes
+ domVisitor2.addHandler( 'td', tableFixer.stripDoubleTDs.bind(
tableFixer, env ) );
+ domVisitor2.addHandler( 'td',
tableFixer.reparseTemplatedAttributes.bind( tableFixer, env ) );
+ domVisitor2.addHandler( 'th',
tableFixer.reparseTemplatedAttributes.bind( tableFixer, env ) );
+ // 4. Save data.parsoid into data-parsoid html attribute.
domVisitor2.addHandler( null, cleanupAndSaveDataParsoid );
this.processors.push(domVisitor2.traverse.bind(domVisitor2));
}
diff --git a/js/lib/mediawiki.DOMUtils.js b/js/lib/mediawiki.DOMUtils.js
index 867bdec..d9d3369 100644
--- a/js/lib/mediawiki.DOMUtils.js
+++ b/js/lib/mediawiki.DOMUtils.js
@@ -95,6 +95,20 @@
},
/**
+ * Test if a node matches a given typeof.
+ */
+ hasTypeOf: function(node, type) {
+ if (!node.getAttribute) {
+ return false;
+ }
+ var typeOfs = node.getAttribute('typeof');
+ if (!typeOfs) {
+ return false;
+ }
+ return typeOfs.split(' ').indexOf(type) !== -1;
+ },
+
+ /**
* Decode a JSON object into the data member of DOM nodes
*
* @param {Node} node
diff --git a/js/lib/mediawiki.tokenizer.peg.js
b/js/lib/mediawiki.tokenizer.peg.js
index dd69e99..7ed2b0e 100644
--- a/js/lib/mediawiki.tokenizer.peg.js
+++ b/js/lib/mediawiki.tokenizer.peg.js
@@ -256,6 +256,13 @@
return this.tokenize(text, 'img_options');
};
+/**
+ * Tokenize table cell attributes
+ */
+PegTokenizer.prototype.tokenizeTableCellAttributes = function( text ) {
+ return this.tokenize(text, "single_cell_table_args");
+};
+
/*
* Inline breaks, flag-enabled production which detects end positions for
* active higher-level productions in inline and other nested productions.
diff --git a/js/tests/parserTests-blacklist.js
b/js/tests/parserTests-blacklist.js
index bb5ec3e..222912d 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -64,8 +64,6 @@
add("wt2html", "Bracketed external links with template-generated invalid
target");
add("wt2html", "External link containing double-single-quotes in text embedded
in italics (bug 4598 sanity check)");
add("wt2html", "Non-extlinks in brackets");
-add("wt2html", "Template-generated table cell attributes and cell content");
-add("wt2html", "Template-generated table cell attributes and cell content
(2)");
add("wt2html", "Template-generated table cell attributes and cell content
(3)");
add("wt2html", "Link containing \"#<\" and \"#>\" % as a hex sequences- these
are valid section anchors\nExample for such a section: == < ==");
add("wt2html", "Link containing \"<#\" and \">#\" as a hex sequences");
diff --git a/js/tests/parserTests.txt b/js/tests/parserTests.txt
index 7f33f47..1582bdc 100644
--- a/js/tests/parserTests.txt
+++ b/js/tests/parserTests.txt
@@ -4067,11 +4067,14 @@
!! input
{|
|{{table_attribs}}
+| {{table_attribs}}
|}
!! result
<table>
<tr>
<td style="color: red"> Foo
+</td>
+<td style="color: red"> Foo
</td></tr></table>
!! end
--
To view, visit https://gerrit.wikimedia.org/r/79943
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ibbd16952784be6768622ed6df65d3475d2c37e91
Gerrit-PatchSet: 8
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: GWicke <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: GWicke <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits