Subramanya Sastry has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/75253


Change subject: WIP: (Bug 51217) Detect fostered content and ignore in selser
......................................................................

WIP: (Bug 51217) Detect fostered content and ignore in selser

* Added a dom pass to detect fostered nodes (and text in some
  scenarios -- see comments in code for more info about this)

* Removed older fostered content tests and reused new flag.

* 8 selser tests failing (because wt2wt is at fault, not selser.
  See Bug 51718 for this -- we really need to fix this some way
  sooner than later) and blacklisted for now.

* Manually tested with this snippet by making changes to
  para-1 and para-2 and ensuring that selser doesn't duplicate
  'will be fostered' outside the table
------------
para-1
{|
will be fostered
|-
|bar
|}
para-2
------------

* Tested with the following pages which were used to test early
  handling of fostered content from templates and verified that
  nothing is broken.

  - en:Almirante_Latorre-class_battleship
  - en:HMS_Salisbury_(F32)

Change-Id: I36f2746509791059d8dcbf88f59f4bc7d57582b0
---
M js/lib/mediawiki.DOMPostProcessor.js
M js/lib/mediawiki.WikitextSerializer.js
M js/tests/parserTests-blacklist.js
3 files changed, 151 insertions(+), 76 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid 
refs/changes/53/75253/1

diff --git a/js/lib/mediawiki.DOMPostProcessor.js 
b/js/lib/mediawiki.DOMPostProcessor.js
index f569fe4..863f19b 100644
--- a/js/lib/mediawiki.DOMPostProcessor.js
+++ b/js/lib/mediawiki.DOMPostProcessor.js
@@ -70,6 +70,47 @@
        // what about span, figure, caption, figcaption?
 };
 
+// TSR info on all these tags are only valid for the opening tag.
+// (closing tags dont have attrs since tree-builder strips them
+//  and adds meta-tags tracking the corresponding TSR)
+//
+// On other tags, a, hr, br, meta-marker tags, the tsr spans
+// the entire DOM, not just the tag.
+var WT_tagsWithLimitedTSR = {
+       "b" : true,
+       "i" : true,
+       "h1" : true,
+       "h2" : true,
+       "h3" : true,
+       "h4" : true,
+       "h5" : true,
+       "ul" : true,
+       "ol" : true,
+       "dl" : true,
+       "li" : true,
+       "dt" : true,
+       "dd" : true,
+       "table" : true,
+       "caption" : true,
+       "tr" : true,
+       "td" : true,
+       "th" : true,
+       "hr" : true, // void element
+       "br" : true, // void element
+       "pre" : true
+};
+
+function tsrSpansTagDOM(n, parsoidData) {
+       // - tags known to have tag-specific tsr
+       // - html tags with 'stx' set
+       // - span tags with 'mw:Nowiki' type
+       var name = n.nodeName.toLowerCase();
+       return !WT_tagsWithLimitedTSR[name] &&
+               !DU.hasLiteralHTMLMarker(parsoidData) &&
+               !DU.isNodeOfType(n, 'span', 'mw:Nowiki');
+}
+
+
 /* ------------- utility functions on DOM nodes/Node attributes ------------ */
 
 // SSS FIXME: Should we convert some of these functions to properties
@@ -843,6 +884,73 @@
        }
 }
 
+function markFosteredContent(node, env) {
+       function findFosteredContent(table) {
+               var tableTagId = table.data.parsoid.tagId,
+                       n = table.previousSibling,
+                       initPos = table.data.parsoid.tsr ? 
table.data.parsoid.tsr[0] : null,
+                       fosteredText = "",
+                       nodeBuf = [],
+                       tsrGap = 0;
+
+               while (n) {
+                       if (DU.isElt(n)) {
+                               if (typeof(n.data.parsoid.tagId) !== 'number' 
|| n.data.parsoid.tagId < tableTagId) {
+                                       if (initPos && n.data.parsoid.tsr && 
tsrSpansTagDOM(n, n.data.parsoid)) {
+                                               var expectedGap = initPos - 
n.data.parsoid.tsr[1];
+                                               if (tsrGap !== expectedGap) {
+                                                       console.log("Fostered 
text/comments: " +
+                                                               
JSON.stringify(fosteredText.substring(expectedGap)));
+                                                       while (nodeBuf.length > 
0) {
+                                                               // Wrap each 
node in a span wrapper
+                                                               var x = 
nodeBuf.pop();
+                                                               var span = 
table.ownerDocument.createElement('span');
+                                                               span.data = { 
parsoid: { fostered: true } };
+                                                               
x.parentNode.insertBefore(span, x);
+                                                               
span.appendChild(x);
+                                                       }
+                                               }
+                                       } else {
+                                               // No clue if the text in 
fosteredText is really fostered content.
+                                               // If we ran this pass 
post-dsr-computation, we might be able to
+                                               // detect this in more 
scenarios. Something to consider.
+
+                                               /**
+                                               console.warn("initPos: " + 
initPos);
+                                               console.warn("have tsr: " + 
n.data.parsoid.tsr);
+                                               console.warn("spans tsr: " + 
(n.data.parsoid.tsr && tsrSpansTagDOM(n, n.data.parsoid)));
+                                               **/
+                                       }
+                                       // All good at this point
+                                       break;
+                               } else {
+                                       n.data.parsoid.fostered = true;
+                               }
+                       } else {
+                               var str = DU.isText(n) ? n.nodeValue : "<!--" + 
n.nodeValue + "-->";
+                               tsrGap += str.length;
+                               fosteredText = str + fosteredText;
+                               nodeBuf.push(n);
+                       }
+                       n = n.previousSibling;
+               }
+       }
+
+       var c = node.firstChild;
+       while (c) {
+               var sibling = c.nextSibling;
+
+               if (DU.isElt(c) && c.nodeName === 'TABLE') {
+                       findFosteredContent(c);
+               }
+
+               if (c.childNodes.length > 0) {
+                       markFosteredContent(c, env);
+               }
+               c = sibling;
+       }
+}
+
 // If the last child of a node is a start-meta, simply
 // move it up and make it the parent's sibling.
 // This will move the start-meta closest to the content
@@ -1027,7 +1135,6 @@
                if (!range.end.data) {
                        range.end.data = {};
                }
-               range.end.data.tmp_fostered = true;
        }
 
        return range;
@@ -1400,7 +1507,7 @@
                                var endDsr = dp2.dsr[0];
                                if (DU.hasNodeName(tcEnd, 'table') &&
                                        ((endDsr !== null && endDsr < 
dp1.dsr[0]) ||
-                                        (tcStart.data && 
tcStart.data.tmp_fostered)))
+                                        (tcStart.data && 
tcStart.data.parsoid.fostered)))
                                {
                                        dp1.dsr[0] = endDsr;
                                }
@@ -1480,27 +1587,6 @@
                }
 
                deleteNode(range.endElem);
-       }
-}
-
-function swallowTableIfNestedDSR(elt, tbl) {
-       var eltDP = DU.getDataParsoid( elt ),
-               eltDSR = eltDP.dsr,
-               tblDP = DU.getDataParsoid( tbl ),
-               tblTSR = tblDP.tsr;
-
-       // IMPORTANT: Do not use dsr to compare because the table may not
-       // have a valid dsr[1] (if the  table's end-tag is generated by
-       // a template transcluded into the table) always.  But it is
-       // sufficient to check against tsr[1] because the only way 'elt'
-       // could have a dsr[0] after the table-start-tag but show up before
-       // 'tbl' is if 'elt' got fostered out of the table.
-       if (eltDSR && tblTSR && eltDSR[0] >= tblTSR[1]) {
-               eltDP.dsr[0] = tblTSR[0];
-               eltDP.dsr[1] = null;
-               return true;
-       } else {
-               return false;
        }
 }
 
@@ -1627,10 +1713,14 @@
                                                        tbl = tbl.nextSibling;
                                                }
 
+                                               var dp = 
sm.parentNode.data.parsoid;
                                                if (tbl &&
                                                        DU.hasNodeName(tbl, 
'table') &&
-                                                       
swallowTableIfNestedDSR(sm.parentNode, tbl))
+                                                       dp.fostered)
                                                {
+                                                       if (dp.tsr && dp.tsr[0] 
!== null && tbl.dsr[0] === null) {
+                                                               tbl.dsr[0] = 
dp.tsr[0];
+                                                       }
                                                        
tbl.setAttribute('about', about); // set about on elem
                                                        ee = tbl;
                                                }
@@ -1867,47 +1957,6 @@
 // [s,e) -- if defined, start/end position of wikitext source that generated
 //          node's subtree
 function computeNodeDSR(env, node, s, e, dsrCorrection, traceDSR) {
-
-       // TSR info on all these tags are only valid for the opening tag.
-       // (closing tags dont have attrs since tree-builder strips them
-       //  and adds meta-tags tracking the corresponding TSR)
-       //
-       // On other tags, a, hr, br, meta-marker tags, the tsr spans
-       // the entire DOM, not just the tag.
-       var WT_tagsWithLimitedTSR = {
-               "b" : true,
-               "i" : true,
-               "h1" : true,
-               "h2" : true,
-               "h3" : true,
-               "h4" : true,
-               "h5" : true,
-               "ul" : true,
-               "ol" : true,
-               "dl" : true,
-               "li" : true,
-               "dt" : true,
-               "dd" : true,
-               "table" : true,
-               "caption" : true,
-               "tr" : true,
-               "td" : true,
-               "th" : true,
-               "hr" : true, // void element
-               "br" : true, // void element
-               "pre" : true
-       };
-
-       function tsrSpansTagDOM(n, parsoidData) {
-               // - tags known to have tag-specific tsr
-               // - html tags with 'stx' set
-               // - span tags with 'mw:Nowiki' type
-               var name = n.nodeName.toLowerCase();
-               return !WT_tagsWithLimitedTSR[name] &&
-                       !DU.hasLiteralHTMLMarker(parsoidData) &&
-                       !DU.isNodeOfType(n, 'span', 'mw:Nowiki');
-       }
-
        function computeListEltWidth(li, nodeName) {
                if (!li.previousSibling && li.firstChild) {
                        var n = li.firstChild.nodeName.toLowerCase();
@@ -2059,12 +2108,14 @@
                var isMarkerTag = false,
                        child = children[i],
                    cType = child.nodeType,
-                       endTagWidth = null;
+                       endTagWidth = null,
+                       fosteredNode = false;
                cs = null;
 
                // In edit mode, StrippedTag marker tags will be removed and 
wont
                // be around to miss in the filling gap.  So, absorb its width 
into
-               // the DSR of its previous sibling.
+               // the DSR of its previous sibling.  Currently, this fix is 
only for
+               // B and I tags where the fix is clear-cut and obvious.
                if (editMode) {
                        var next = child.nextSibling;
                        if (next) {
@@ -2098,6 +2149,8 @@
                                propagateRight = false,
                                stWidth = null, etWidth = null;
 
+                       fosteredNode = dp.fostered;
+
                        // In edit-mode, we are making dsr corrections to 
account for
                        // stripped tags (end tags usually).  When stripping 
happens,
                        // in most common use cases, a corresponding end tag is 
added
@@ -2106,6 +2159,9 @@
                        // So, when an autoInsertedEnd tag is encountered and a 
matching
                        // dsr-correction is found, make a 1-time correction in 
the
                        // other direction.
+                       //
+                       // Currently, this fix is only for
+                       // B and I tags where the fix is clear-cut and obvious.
                        if (editMode && ce !== null && dp.autoInsertedEnd && 
child.nodeName in {B:1, I:1}) {
                                correction = (3 + child.nodeName.length);
                                if (correction === dsrCorrection) {
@@ -2330,10 +2386,13 @@
                        node.removeChild(child); // No use for this marker tag 
after this
                }
 
-               // ce for next child = cs of current child
-               ce = cs;
-               // end-tag width from marker meta tag
-               savedEndTagWidth = endTagWidth;
+               // Dont change state if we processed a fostered node
+               if (!fosteredNode) {
+                       // ce for next child = cs of current child
+                       ce = cs;
+                       // end-tag width from marker meta tag
+                       savedEndTagWidth = endTagWidth;
+               }
        }
 
        if (cs === undefined || cs === null) {
@@ -2855,10 +2914,18 @@
 saveDataParsoid = function( node, debugDump ) {
        if ( node.nodeType === node.ELEMENT_NODE && node.data ) {
                if (!debugDump) {
-                       if (node.data.parsoid) {
-                               node.data.parsoid.tagId = undefined;
-                               if (node.data.parsoid.tsr) {
-                                       node.data.parsoid.tsr = undefined;
+                       var dp = node.data.parsoid;
+                       if (dp) {
+                               dp.tagId = undefined;
+                               if (dp.tsr) {
+                                       dp.tsr = undefined;
+                               }
+
+                               // Make dsr zero-range for fostered content
+                               // to prevent selser from duplicating this 
content
+                               // outside the table from where this came.
+                               if (dp.fostered) {
+                                       dp.dsr[0] = dp.dsr[1];
                                }
                        }
                }
@@ -2890,6 +2957,7 @@
        this.processors = [
                dataParsoidLoader.traverse.bind( dataParsoidLoader ),
                handleUnbalancedTableTags,
+               markFosteredContent,
                migrateStartMetas,
                //normalizeDocument,
                findAndFixBuilderCorrectedTags,
diff --git a/js/lib/mediawiki.WikitextSerializer.js 
b/js/lib/mediawiki.WikitextSerializer.js
index f78e751..c08f30c 100644
--- a/js/lib/mediawiki.WikitextSerializer.js
+++ b/js/lib/mediawiki.WikitextSerializer.js
@@ -3860,7 +3860,7 @@
                        // to every node of a subtree (rather than an 
indication that some node
                        // in the subtree is modified).
                        if (state.selserMode && !state.inModifiedContent &&
-                               dp && isValidDSR(dp.dsr) && dp.dsr[1] > 
dp.dsr[0]) {
+                               dp && isValidDSR(dp.dsr) && (dp.dsr[1] > 
dp.dsr[0] || dp.fostered)) {
                                // To serialize from source, we need 3 things 
of the node:
                                // -- it should not have a diff marker
                                // -- it should have valid, usable DSR
diff --git a/js/tests/parserTests-blacklist.js 
b/js/tests/parserTests-blacklist.js
index 735269d..9376d7d 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -3295,8 +3295,11 @@
 add("selser", "Fuzz testing: Parser22 [[1],4,2]");
 add("selser", "Fuzz testing: Parser22 [3,0,0]");
 add("selser", "Fuzz testing: Parser22 [1,0,2]");
+add("selser", "Fuzz testing: Parser24 [[0,2],3]");
+add("selser", "Fuzz testing: Parser24 [[[4],3],[0,0,3]]");
 add("selser", "Fuzz testing: Parser24 [3,2]");
 add("selser", "Fuzz testing: Parser24 [[[3],0],2]");
+add("selser", "Fuzz testing: Parser24 [2,4]");
 add("selser", "Fuzz testing: Parser24 [0,2]");
 add("selser", "Fuzz testing: Parser24 [1,2]");
 add("selser", "Fuzz testing: Parser24 [[3,0],2]");
@@ -3814,12 +3817,16 @@
 add("selser", "Empty table rows go away [[3,[[[2],0,[4]],0,0,0,2,4]]]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[[2],[2,[[3],4]]]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[1,1]");
+add("selser", "RT-ed inter-element separators should be valid separators 
[0,[0,3]]");
+add("selser", "RT-ed inter-element separators should be valid separators 
[0,3]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[0,4]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[4,[0,[[3],0]]]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[1,[0,[[4],3]]]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[0,1]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[1,2]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[3,[0,[0,4]]]");
+add("selser", "RT-ed inter-element separators should be valid separators 
[0,[0,[4,0]]]");
+add("selser", "RT-ed inter-element separators should be valid separators 
[0,[3,4]]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[0,[2,[0,3]]]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[2,[0,[[2],0]]]");
 add("selser", "RT-ed inter-element separators should be valid separators 
[0,2]");

-- 
To view, visit https://gerrit.wikimedia.org/r/75253
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I36f2746509791059d8dcbf88f59f4bc7d57582b0
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to