Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/75253
Change subject: WIP: (Bug 51217) Detect fostered content and ignore in selser
......................................................................
WIP: (Bug 51217) Detect fostered content and ignore in selser
* Added a dom pass to detect fostered nodes (and text in some
scenarios -- see comments in code for more info about this)
* Removed older fostered content tests and reused new flag.
* 8 selser tests failing (because wt2wt is at fault, not selser.
See Bug 51718 for this -- we really need to fix this some way
sooner than later) and blacklisted for now.
* Manually tested with this snippet by making changes to
para-1 and para-2 and ensuring that selser doesn't duplicate
'will be fostered' outside the table
------------
para-1
{|
will be fostered
|-
|bar
|}
para-2
------------
* Tested with the following pages which were used to test early
handling of fostered content from templates and verified that
nothing is broken.
- en:Almirante_Latorre-class_battleship
- en:HMS_Salisbury_(F32)
Change-Id: I36f2746509791059d8dcbf88f59f4bc7d57582b0
---
M js/lib/mediawiki.DOMPostProcessor.js
M js/lib/mediawiki.WikitextSerializer.js
M js/tests/parserTests-blacklist.js
3 files changed, 151 insertions(+), 76 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/53/75253/1
diff --git a/js/lib/mediawiki.DOMPostProcessor.js
b/js/lib/mediawiki.DOMPostProcessor.js
index f569fe4..863f19b 100644
--- a/js/lib/mediawiki.DOMPostProcessor.js
+++ b/js/lib/mediawiki.DOMPostProcessor.js
@@ -70,6 +70,47 @@
// what about span, figure, caption, figcaption?
};
+// TSR info on all these tags are only valid for the opening tag.
+// (closing tags dont have attrs since tree-builder strips them
+// and adds meta-tags tracking the corresponding TSR)
+//
+// On other tags, a, hr, br, meta-marker tags, the tsr spans
+// the entire DOM, not just the tag.
+var WT_tagsWithLimitedTSR = {
+ "b" : true,
+ "i" : true,
+ "h1" : true,
+ "h2" : true,
+ "h3" : true,
+ "h4" : true,
+ "h5" : true,
+ "ul" : true,
+ "ol" : true,
+ "dl" : true,
+ "li" : true,
+ "dt" : true,
+ "dd" : true,
+ "table" : true,
+ "caption" : true,
+ "tr" : true,
+ "td" : true,
+ "th" : true,
+ "hr" : true, // void element
+ "br" : true, // void element
+ "pre" : true
+};
+
+function tsrSpansTagDOM(n, parsoidData) {
+ // - tags known to have tag-specific tsr
+ // - html tags with 'stx' set
+ // - span tags with 'mw:Nowiki' type
+ var name = n.nodeName.toLowerCase();
+ return !WT_tagsWithLimitedTSR[name] &&
+ !DU.hasLiteralHTMLMarker(parsoidData) &&
+ !DU.isNodeOfType(n, 'span', 'mw:Nowiki');
+}
+
+
/* ------------- utility functions on DOM nodes/Node attributes ------------ */
// SSS FIXME: Should we convert some of these functions to properties
@@ -843,6 +884,73 @@
}
}
+function markFosteredContent(node, env) {
+ function findFosteredContent(table) {
+ var tableTagId = table.data.parsoid.tagId,
+ n = table.previousSibling,
+ initPos = table.data.parsoid.tsr ?
table.data.parsoid.tsr[0] : null,
+ fosteredText = "",
+ nodeBuf = [],
+ tsrGap = 0;
+
+ while (n) {
+ if (DU.isElt(n)) {
+ if (typeof(n.data.parsoid.tagId) !== 'number'
|| n.data.parsoid.tagId < tableTagId) {
+ if (initPos && n.data.parsoid.tsr &&
tsrSpansTagDOM(n, n.data.parsoid)) {
+ var expectedGap = initPos -
n.data.parsoid.tsr[1];
+ if (tsrGap !== expectedGap) {
+ console.log("Fostered
text/comments: " +
+
JSON.stringify(fosteredText.substring(expectedGap)));
+ while (nodeBuf.length >
0) {
+ // Wrap each
node in a span wrapper
+ var x =
nodeBuf.pop();
+ var span =
table.ownerDocument.createElement('span');
+ span.data = {
parsoid: { fostered: true } };
+
x.parentNode.insertBefore(span, x);
+
span.appendChild(x);
+ }
+ }
+ } else {
+ // No clue if the text in
fosteredText is really fostered content.
+ // If we ran this pass
post-dsr-computation, we might be able to
+ // detect this in more
scenarios. Something to consider.
+
+ /**
+ console.warn("initPos: " +
initPos);
+ console.warn("have tsr: " +
n.data.parsoid.tsr);
+ console.warn("spans tsr: " +
(n.data.parsoid.tsr && tsrSpansTagDOM(n, n.data.parsoid)));
+ **/
+ }
+ // All good at this point
+ break;
+ } else {
+ n.data.parsoid.fostered = true;
+ }
+ } else {
+ var str = DU.isText(n) ? n.nodeValue : "<!--" +
n.nodeValue + "-->";
+ tsrGap += str.length;
+ fosteredText = str + fosteredText;
+ nodeBuf.push(n);
+ }
+ n = n.previousSibling;
+ }
+ }
+
+ var c = node.firstChild;
+ while (c) {
+ var sibling = c.nextSibling;
+
+ if (DU.isElt(c) && c.nodeName === 'TABLE') {
+ findFosteredContent(c);
+ }
+
+ if (c.childNodes.length > 0) {
+ markFosteredContent(c, env);
+ }
+ c = sibling;
+ }
+}
+
// If the last child of a node is a start-meta, simply
// move it up and make it the parent's sibling.
// This will move the start-meta closest to the content
@@ -1027,7 +1135,6 @@
if (!range.end.data) {
range.end.data = {};
}
- range.end.data.tmp_fostered = true;
}
return range;
@@ -1400,7 +1507,7 @@
var endDsr = dp2.dsr[0];
if (DU.hasNodeName(tcEnd, 'table') &&
((endDsr !== null && endDsr <
dp1.dsr[0]) ||
- (tcStart.data &&
tcStart.data.tmp_fostered)))
+ (tcStart.data &&
tcStart.data.parsoid.fostered)))
{
dp1.dsr[0] = endDsr;
}
@@ -1480,27 +1587,6 @@
}
deleteNode(range.endElem);
- }
-}
-
-function swallowTableIfNestedDSR(elt, tbl) {
- var eltDP = DU.getDataParsoid( elt ),
- eltDSR = eltDP.dsr,
- tblDP = DU.getDataParsoid( tbl ),
- tblTSR = tblDP.tsr;
-
- // IMPORTANT: Do not use dsr to compare because the table may not
- // have a valid dsr[1] (if the table's end-tag is generated by
- // a template transcluded into the table) always. But it is
- // sufficient to check against tsr[1] because the only way 'elt'
- // could have a dsr[0] after the table-start-tag but show up before
- // 'tbl' is if 'elt' got fostered out of the table.
- if (eltDSR && tblTSR && eltDSR[0] >= tblTSR[1]) {
- eltDP.dsr[0] = tblTSR[0];
- eltDP.dsr[1] = null;
- return true;
- } else {
- return false;
}
}
@@ -1627,10 +1713,14 @@
tbl = tbl.nextSibling;
}
+ var dp =
sm.parentNode.data.parsoid;
if (tbl &&
DU.hasNodeName(tbl,
'table') &&
-
swallowTableIfNestedDSR(sm.parentNode, tbl))
+ dp.fostered)
{
+ if (dp.tsr && dp.tsr[0]
!== null && tbl.dsr[0] === null) {
+ tbl.dsr[0] =
dp.tsr[0];
+ }
tbl.setAttribute('about', about); // set about on elem
ee = tbl;
}
@@ -1867,47 +1957,6 @@
// [s,e) -- if defined, start/end position of wikitext source that generated
// node's subtree
function computeNodeDSR(env, node, s, e, dsrCorrection, traceDSR) {
-
- // TSR info on all these tags are only valid for the opening tag.
- // (closing tags dont have attrs since tree-builder strips them
- // and adds meta-tags tracking the corresponding TSR)
- //
- // On other tags, a, hr, br, meta-marker tags, the tsr spans
- // the entire DOM, not just the tag.
- var WT_tagsWithLimitedTSR = {
- "b" : true,
- "i" : true,
- "h1" : true,
- "h2" : true,
- "h3" : true,
- "h4" : true,
- "h5" : true,
- "ul" : true,
- "ol" : true,
- "dl" : true,
- "li" : true,
- "dt" : true,
- "dd" : true,
- "table" : true,
- "caption" : true,
- "tr" : true,
- "td" : true,
- "th" : true,
- "hr" : true, // void element
- "br" : true, // void element
- "pre" : true
- };
-
- function tsrSpansTagDOM(n, parsoidData) {
- // - tags known to have tag-specific tsr
- // - html tags with 'stx' set
- // - span tags with 'mw:Nowiki' type
- var name = n.nodeName.toLowerCase();
- return !WT_tagsWithLimitedTSR[name] &&
- !DU.hasLiteralHTMLMarker(parsoidData) &&
- !DU.isNodeOfType(n, 'span', 'mw:Nowiki');
- }
-
function computeListEltWidth(li, nodeName) {
if (!li.previousSibling && li.firstChild) {
var n = li.firstChild.nodeName.toLowerCase();
@@ -2059,12 +2108,14 @@
var isMarkerTag = false,
child = children[i],
cType = child.nodeType,
- endTagWidth = null;
+ endTagWidth = null,
+ fosteredNode = false;
cs = null;
// In edit mode, StrippedTag marker tags will be removed and
wont
// be around to miss in the filling gap. So, absorb its width
into
- // the DSR of its previous sibling.
+ // the DSR of its previous sibling. Currently, this fix is
only for
+ // B and I tags where the fix is clear-cut and obvious.
if (editMode) {
var next = child.nextSibling;
if (next) {
@@ -2098,6 +2149,8 @@
propagateRight = false,
stWidth = null, etWidth = null;
+ fosteredNode = dp.fostered;
+
// In edit-mode, we are making dsr corrections to
account for
// stripped tags (end tags usually). When stripping
happens,
// in most common use cases, a corresponding end tag is
added
@@ -2106,6 +2159,9 @@
// So, when an autoInsertedEnd tag is encountered and a
matching
// dsr-correction is found, make a 1-time correction in
the
// other direction.
+ //
+ // Currently, this fix is only for
+ // B and I tags where the fix is clear-cut and obvious.
if (editMode && ce !== null && dp.autoInsertedEnd &&
child.nodeName in {B:1, I:1}) {
correction = (3 + child.nodeName.length);
if (correction === dsrCorrection) {
@@ -2330,10 +2386,13 @@
node.removeChild(child); // No use for this marker tag
after this
}
- // ce for next child = cs of current child
- ce = cs;
- // end-tag width from marker meta tag
- savedEndTagWidth = endTagWidth;
+ // Dont change state if we processed a fostered node
+ if (!fosteredNode) {
+ // ce for next child = cs of current child
+ ce = cs;
+ // end-tag width from marker meta tag
+ savedEndTagWidth = endTagWidth;
+ }
}
if (cs === undefined || cs === null) {
@@ -2855,10 +2914,18 @@
saveDataParsoid = function( node, debugDump ) {
if ( node.nodeType === node.ELEMENT_NODE && node.data ) {
if (!debugDump) {
- if (node.data.parsoid) {
- node.data.parsoid.tagId = undefined;
- if (node.data.parsoid.tsr) {
- node.data.parsoid.tsr = undefined;
+ var dp = node.data.parsoid;
+ if (dp) {
+ dp.tagId = undefined;
+ if (dp.tsr) {
+ dp.tsr = undefined;
+ }
+
+ // Make dsr zero-range for fostered content
+ // to prevent selser from duplicating this
content
+ // outside the table from where this came.
+ if (dp.fostered) {
+ dp.dsr[0] = dp.dsr[1];
}
}
}
@@ -2890,6 +2957,7 @@
this.processors = [
dataParsoidLoader.traverse.bind( dataParsoidLoader ),
handleUnbalancedTableTags,
+ markFosteredContent,
migrateStartMetas,
//normalizeDocument,
findAndFixBuilderCorrectedTags,
diff --git a/js/lib/mediawiki.WikitextSerializer.js
b/js/lib/mediawiki.WikitextSerializer.js
index f78e751..c08f30c 100644
--- a/js/lib/mediawiki.WikitextSerializer.js
+++ b/js/lib/mediawiki.WikitextSerializer.js
@@ -3860,7 +3860,7 @@
// to every node of a subtree (rather than an
indication that some node
// in the subtree is modified).
if (state.selserMode && !state.inModifiedContent &&
- dp && isValidDSR(dp.dsr) && dp.dsr[1] >
dp.dsr[0]) {
+ dp && isValidDSR(dp.dsr) && (dp.dsr[1] >
dp.dsr[0] || dp.fostered)) {
// To serialize from source, we need 3 things
of the node:
// -- it should not have a diff marker
// -- it should have valid, usable DSR
diff --git a/js/tests/parserTests-blacklist.js
b/js/tests/parserTests-blacklist.js
index 735269d..9376d7d 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -3295,8 +3295,11 @@
add("selser", "Fuzz testing: Parser22 [[1],4,2]");
add("selser", "Fuzz testing: Parser22 [3,0,0]");
add("selser", "Fuzz testing: Parser22 [1,0,2]");
+add("selser", "Fuzz testing: Parser24 [[0,2],3]");
+add("selser", "Fuzz testing: Parser24 [[[4],3],[0,0,3]]");
add("selser", "Fuzz testing: Parser24 [3,2]");
add("selser", "Fuzz testing: Parser24 [[[3],0],2]");
+add("selser", "Fuzz testing: Parser24 [2,4]");
add("selser", "Fuzz testing: Parser24 [0,2]");
add("selser", "Fuzz testing: Parser24 [1,2]");
add("selser", "Fuzz testing: Parser24 [[3,0],2]");
@@ -3814,12 +3817,16 @@
add("selser", "Empty table rows go away [[3,[[[2],0,[4]],0,0,0,2,4]]]");
add("selser", "RT-ed inter-element separators should be valid separators
[[2],[2,[[3],4]]]");
add("selser", "RT-ed inter-element separators should be valid separators
[1,1]");
+add("selser", "RT-ed inter-element separators should be valid separators
[0,[0,3]]");
+add("selser", "RT-ed inter-element separators should be valid separators
[0,3]");
add("selser", "RT-ed inter-element separators should be valid separators
[0,4]");
add("selser", "RT-ed inter-element separators should be valid separators
[4,[0,[[3],0]]]");
add("selser", "RT-ed inter-element separators should be valid separators
[1,[0,[[4],3]]]");
add("selser", "RT-ed inter-element separators should be valid separators
[0,1]");
add("selser", "RT-ed inter-element separators should be valid separators
[1,2]");
add("selser", "RT-ed inter-element separators should be valid separators
[3,[0,[0,4]]]");
+add("selser", "RT-ed inter-element separators should be valid separators
[0,[0,[4,0]]]");
+add("selser", "RT-ed inter-element separators should be valid separators
[0,[3,4]]");
add("selser", "RT-ed inter-element separators should be valid separators
[0,[2,[0,3]]]");
add("selser", "RT-ed inter-element separators should be valid separators
[2,[0,[[2],0]]]");
add("selser", "RT-ed inter-element separators should be valid separators
[0,2]");
--
To view, visit https://gerrit.wikimedia.org/r/75253
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I36f2746509791059d8dcbf88f59f4bc7d57582b0
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits