Subramanya Sastry has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/70768


Change subject: (Bug 50068) Make DSR adjustments based on 
treebuilder-corrections
......................................................................

(Bug 50068) Make DSR adjustments based on treebuilder-corrections

* mw:Placeholder/StrippedTag meta tags record information about
  deleted tags (and represent non-zero width wikitext in source).

  Similarly, autoInsertedEnd/Start flags on data-parsoid of nodes
  record information about added tags (and represent zero-width
  wikitext in source).

* In edit-mode, mw:Placeholder/StrippedTag meta tags are removed
  which effectively creates a dsr-hole where it was removed from
  and can lead to missing wikitext when selser uses that DSR
  during serialization. Bug 50068 is an instance of this.

  "Foo '''''bar''''' baz" parses to the HTML below.  Notice that
  the <i> tag spans from 4-14 and loses 3-chars worth of information
  about the closing </b> tag.
-------------
<p data-parsoid='{"dsr":[0,21,0,0]}'>Foo <i data-parsoid='{"dsr":[4,14,2,2]}'>
<b data-parsoid='{"autoInsertedEnd":true,"dsr":[6,12,3,0]}'>bar</b></i>
baz</p>
-------------

  In editMode false, the HTML output is (notice the placeholder below)

-------------
<p data-parsoid='{"dsr":[0,21,0,0]}'>Foo <i data-parsoid='{"dsr":[4,14,2,2]}'>
<b data-parsoid='{"autoInsertedEnd":true,"dsr":[6,12,3,0]}'>bar</b></i>
<meta typeof="mw:Placeholder/StrippedTag" 
data-parsoid='{"src":"&#39;&#39;&#39;","dsr":[14,17,null,null]}'>
baz</p>
-------------

* This patch attempts to patch DSR by assimilating the gap into
  previous siblings.  Given that some instances of deleting an end
  tag correspond to the same end tag being reinserted elsewhere in
  the DOM (ex: badly nested HTML <span><s>x</span></s>), we also
  have to make a fixup in the reverse direction on a node that has
  autoInsertedEnd set.

  This strategy is at best a heuristic that will not always work
  properly -- work in progress.

* Fixes the bug + several selser tests around quotes.  We have two
  selser failures on a misnested HTML example.

Change-Id: I69bda7db141a3302cee6ece5af5ffdc57e748d89
---
M js/lib/mediawiki.DOMPostProcessor.js
M js/lib/mediawiki.Util.js
M js/tests/parserTests-blacklist.js
3 files changed, 71 insertions(+), 39 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid 
refs/changes/68/70768/1

diff --git a/js/lib/mediawiki.DOMPostProcessor.js 
b/js/lib/mediawiki.DOMPostProcessor.js
index b86d863..c610205 100644
--- a/js/lib/mediawiki.DOMPostProcessor.js
+++ b/js/lib/mediawiki.DOMPostProcessor.js
@@ -1909,7 +1909,7 @@
 // node  -- node to process
 // [s,e) -- if defined, start/end position of wikitext source that generated
 //          node's subtree
-function computeNodeDSR(env, node, s, e, traceDSR) {
+function computeNodeDSR(env, node, s, e, dsrCorrection, traceDSR) {
 
        // TSR info on all these tags are only valid for the opening tag.
        // (closing tags dont have attrs since tree-builder strips them
@@ -2058,6 +2058,23 @@
                return [stWidth, etWidth];
        }
 
+       function trace() {
+               if (traceDSR) {
+                       Util.debug_pp.apply(Util, ['', 
''].concat([].slice.apply(arguments)));
+               }
+       }
+
+       function traceNode(node, i, cs, ce) {
+               if (traceDSR) {
+                       trace(
+                               "-- Processing <", node.parentNode.nodeName, 
":", i,
+                               ">=", DU.isElt(node) ? '' : (DU.isText(node) ? 
'#' : '!'),
+                               DU.isElt(node) ? (node.nodeName === 'META' ? 
node.outerHTML : node.nodeName) : node.data,
+                               " with [", cs, ",", ce, "]"
+                       );
+               }
+       }
+
        // No undefined values here onwards.
        // NOTE: Never use !s, !e, !cs, !ce for testing for non-null
        // because any of them could be zero.
@@ -2069,7 +2086,7 @@
                e = null;
        }
 
-       if (traceDSR) { console.warn("Received " + s + ", " + e + " for " + 
node.nodeName + " --"); }
+       trace("Received ", s, ", ", e, " for ", node.nodeName);
 
        var children = node.childNodes,
                savedEndTagWidth = null,
@@ -2078,31 +2095,62 @@
                // if this node has no child content, then the start and end for
                // the child dom are indeed identical.  Alternatively, we could
                // explicitly code this check before everything and bypass this.
-               cs = ce;
+               cs = ce,
+               editMode = env.conf.parsoid.editMode;
        for (var n = children.length, i = n-1; i >= 0; i--) {
                var isMarkerTag = false,
                        child = children[i],
                    cType = child.nodeType,
                        endTagWidth = null;
                cs = null;
+
+               // In edit mode, StrippedTag marker tags will be removed and 
wont
+               // be around to miss in the filling gap.  So, absorb its width 
into
+               // the DSR of its previous sibling.
+               if (editMode) {
+                       var next = child.nextSibling;
+                       if (next) {
+                               if (DU.isElt(next) && 
/\bmw:Placeholder\/StrippedTag\b/.test(next.getAttribute("typeof"))) {
+                                       var correction = 
next.data.parsoid.src.length;
+                                       ce += correction;
+                                       dsrCorrection = correction;
+                               }
+                       }
+               }
+
+               traceNode(child, i, cs, ce);
+
                if (cType === Node.TEXT_NODE) {
-                       if (traceDSR) { console.warn("-- Processing <" + 
node.nodeName + ":" + i + ">=#" + child.data + " with [" + cs + "," + ce + 
"]"); }
                        if (ce !== null) {
                                cs = ce - child.data.length - 
DU.indentPreDSRCorrection(child);
                        }
                } else if (cType === Node.COMMENT_NODE) {
-                       if (traceDSR) { console.warn("-- Processing <" + 
node.nodeName + ":" + i + ">=!" + child.data + " with [" + cs + "," + ce + 
"]"); }
                        if (ce !== null) {
                                cs = ce - child.data.length - 7; // 7 chars for 
"<!--" and "-->"
                        }
                } else if (cType === Node.ELEMENT_NODE) {
-                       if (traceDSR) { console.warn("-- Processing <" + 
node.nodeName + ":" + i + ">=" + child.nodeName + " with [" + cs + "," + ce + 
"]"); }
                        var cTypeOf = child.getAttribute("typeof"),
                                dp = DU.getDataParsoid( child ),
                                tsr = dp.tsr,
                                oldCE = tsr ? tsr[1] : null,
                                propagateRight = false,
                                stWidth = null, etWidth = null;
+
+                       // In edit-mode, we are making dsr corrections to 
account for
+                       // stripped tags (end tags usually).  When stripping 
happens,
+                       // in most common use cases, a corresponding end tag is 
added
+                       // back elsewhere in the DOM.
+                       //
+                       // So, when an autoInsertedEnd tag is encountered and a 
matching
+                       // dsr-correction is found, make a 1-time correction in 
the
+                       // other direction.
+                       if (editMode && ce !== null && dp.autoInsertedEnd) {
+                               var correction = (3 + child.nodeName.length);
+                               if (correction === dsrCorrection) {
+                                       ce -= correction;
+                                       dsrCorrection = 0;
+                               }
+                       }
 
                        if (DU.hasNodeName(child, "meta")) {
                                // Unless they have been foster-parented,
@@ -2181,7 +2229,10 @@
                                                } else {
                                                        stWidth = tsr[1] - 
tsr[0];
                                                }
-                                               if (traceDSR) { 
console.warn("TSR: " + JSON.stringify(tsr) + "; cs: " + cs + "; ce: " + ce); }
+
+                                               if (traceDSR) {
+                                                       trace("TSR: ", tsr, "; 
cs: ", cs, "; ce: ", ce);
+                                               }
                                        } else if (s && child.previousSibling 
=== null) {
                                                cs = s;
                                        }
@@ -2201,11 +2252,10 @@
 
                                ccs = cs !== null && stWidth !== null ? cs + 
stWidth : null;
                                cce = ce !== null && etWidth !== null ? ce - 
etWidth : null;
-                               if (traceDSR) {
-                                       console.warn("Before recursion, 
[cs,ce]=" + cs + "," + ce +
-                                               "; [sw,ew]=" + stWidth + "," + 
etWidth +
-                                               "; [ccs,cce]=" + ccs + "," + 
cce);
-                               }
+
+                               trace("Before recursion, [cs,ce]=", cs, ",", ce,
+                                       "; [sw,ew]=", stWidth, ",", etWidth,
+                                       "; [ccs,cce]=", ccs + ",", cce);
 
                                /* 
-----------------------------------------------------------------
                                 * Process DOM rooted at 'child'.
@@ -2240,7 +2290,7 @@
                                         * 
------------------------------------------------------------- */
                                        newDsr = [ccs, cce];
                                } else {
-                                       newDsr = computeNodeDSR(env, child, 
ccs, cce, traceDSR);
+                                       newDsr = computeNodeDSR(env, child, 
ccs, cce, dsrCorrection, traceDSR);
                                }
 
                                // Min(child-dom-tree dsr[0] - tag-width, 
current dsr[0])
@@ -2260,7 +2310,7 @@
                        if (cs !== null || ce !== null) {
                                dp.dsr = [cs, ce, stWidth, etWidth];
                                if (traceDSR) {
-                                       console.warn("-- UPDATING; " + 
child.nodeName + " with [" + cs + "," + ce + "]; typeof: " + cTypeOf);
+                                       trace("-- UPDATING; ", child.nodeName, 
" with [", cs, ",", ce, "]; typeof: ", cTypeOf);
                                        // Set up 'dbsrc' so we can debug this
                                        dp.dbsrc = env.page.src.substring(cs, 
ce);
                                }
@@ -2293,7 +2343,7 @@
 
                                                // Update and move right
                                                if (traceDSR) {
-                                                       console.warn("CHANGING 
ce.start of " + sibling.nodeName + " from " + siblingDP.dsr[0] + " to " + 
newCE);
+                                                       trace("CHANGING 
ce.start of ", sibling.nodeName, " from ", siblingDP.dsr[0], " to ", newCE);
                                                        // debug info
                                                        if (siblingDP.dsr[1]) {
                                                                siblingDP.dbsrc 
= env.page.src.substring(newCE, siblingDP.dsr[1]);
@@ -2333,9 +2383,8 @@
                console.warn("WARNING: DSR inconsistency: cs/s mismatch for 
node: " +
                        node.nodeName + " s: " + s + "; cs: " + cs);
        }
-       if (traceDSR) {
-               console.warn("For " + node.nodeName + ", returning: " + cs + ", 
" + e);
-       }
+
+       trace("For ", node.nodeName, ", returning: ", cs, ", ", e);
 
        return [cs, e];
 }
@@ -2382,7 +2431,7 @@
 
        // The actual computation buried in trace/debug stmts.
        var body = root.body;
-       computeNodeDSR(env, body, startOffset, endOffset, traceDSR);
+       computeNodeDSR(env, body, startOffset, endOffset, 0, traceDSR);
 
        var dp = DU.getDataParsoid( body );
        dp.dsr = [startOffset, endOffset, 0, 0];
diff --git a/js/lib/mediawiki.Util.js b/js/lib/mediawiki.Util.js
index ae275d6..27530cf 100644
--- a/js/lib/mediawiki.Util.js
+++ b/js/lib/mediawiki.Util.js
@@ -926,7 +926,7 @@
                                out.push('undefined');
                        } else if (a.constructor === Boolean) {
                                out.push(a ? '1' : '0');
-                       } else if (a.constructor !== String || 
a.match(/\n|^\s*$/)) {
+                       } else if (a.constructor !== String || 
a.match(/\n|^\s+$/)) {
                                out.push(JSON.stringify(a));
                        } else {
                                out.push(a);
diff --git a/js/tests/parserTests-blacklist.js 
b/js/tests/parserTests-blacklist.js
index 00998b0..9ebd3bb 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -2089,7 +2089,6 @@
 add("selser", "Italics and bold 
[[[4],3,2,3,[0,0,4,[2],0],0,4,0,[3,2,0,0,2],0,[3,4,4,[2],3],0,[0,3,4,[3],0],2,4,4,[0,1,4],0,1,0,[2,2,1,2],3,[0,[3,1],4,0],2,2,0,[2,3,0],2,0]]");
 add("selser", "Italics and bold 
[[1,2,[2,0,2],3,[0,[3],4,0,2],0,1,4,1,2,[0,0,0,[2],0],3,1,4,[0,[0,3,0],0],2,1,0,1,4,[2,0,[2],2],0,1,0,[0,3,2],3,[0,[3],0],0,3]]");
 add("selser", "Italics and bold 
[[4,2,4,3,1,4,1,0,[4,1,2,4,0],0,[0,[3],2,[2],0],3,[2,[3],0,4,0],3,4,2,[0,[3,2,0],0],2,[0,[[3],3],0],2,[0,1,2,4],0,[0,[0,3],1,3],0,1,4,3,4,1]]");
-add("selser", "Italics and bold 
[[[3],2,[4,3,0],4,[2,1,2,2,2],0,[2,[2],0],2,2,4,[0,2,3,3,0],0,[0,2,3,0,3],4,[4,[3,4,0],2],3,3,2,2,2,4,0,1,0,[2,2,0],0,[0,4,3],0,0]]");
 add("selser", "Italics and bold 
[[[3],0,3,2,4,0,3,0,[2,3,0,1,4],0,[0,3,3,0,3],0,[0,0,0,0,4],0,1,2,[4,[0,2,3],4],0,[2,[0,3],4],0,[4,1,0,4],0,[4,4,1,0],0,1,0,[0,0,2],0,1]]");
 add("selser", "Italics and possessives [2]");
 add("selser", "Italics and possessives [[3,2]]");
@@ -2189,13 +2188,6 @@
 add("selser", "Italics and bold: 5-quote opening sequence: (5,4) [[1]]");
 add("selser", "Italics and bold: 5-quote opening sequence: (5,4) [[[2]]]");
 add("selser", "Italics and bold: 5-quote opening sequence: (5,4) [[[[4]]]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [1]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[[2],0]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[2,0]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[1,0]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[1,2]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[1,4]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[2,2]]");
 add("selser", "Italics and bold: multiple quote sequences: (2,4,2) [[2]]");
 add("selser", "Italics and bold: multiple quote sequences: (2,4,2) [[1]]");
 add("selser", "Italics and bold: multiple quote sequences: (2,4,2) [[[2,0]]]");
@@ -2260,11 +2252,6 @@
 add("selser", "Italics and bold: multiple quote sequences: (3,4,3) (parsoid) 
[[[2],4,1]]");
 add("selser", "Italics and bold: multiple quote sequences: (3,4,3) (parsoid) 
[[0,2,1]]");
 add("selser", "Italics and bold: multiple quote sequences: (3,4,3) (parsoid) 
[[2,3,1]]");
-add("selser", "Italics and bold: other quote tests: (2,3,5) [[1,2]]");
-add("selser", "Italics and bold: other quote tests: (2,3,5) [1]");
-add("selser", "Italics and bold: other quote tests: (2,3,5) [[1,0]]");
-add("selser", "Italics and bold: other quote tests: (2,3,5) [[[2,2],4]]");
-add("selser", "Italics and bold: other quote tests: (2,3,5) [[1]]");
 add("selser", "Italics and bold: other quote tests: (3,2,3,2) [1]");
 add("selser", "Italics and bold: other quote tests: (3,2,3,2) [[[0,2],1]]");
 add("selser", "Italics and bold: other quote tests: (3,2,3,2) [[1,[3]]]");
@@ -2664,12 +2651,6 @@
 add("selser", "External link containing double-single-quotes with no space 
separating the url from text in italics [[[0,4],4,3]]");
 add("selser", "External link containing double-single-quotes with no space 
separating the url from text in italics [[4,0,3]]");
 add("selser", "External link containing double-single-quotes with no space 
separating the url from text in italics [[[1,2],2,4]]");
-add("selser", "Quotes [1,0,[4,2,4,4]]");
-add("selser", "Quotes [2,0,1]");
-add("selser", "Quotes [[4,0,2,0],0,1]");
-add("selser", "Quotes [[3,[3],0,3],0,[2,2,2,3]]");
-add("selser", "Quotes [[2,[3],0,[3]],3,[0,2,0,4]]");
-add("selser", "Unclosed and unmatched quotes (parsoid) 
[[2,0],2,[3,4],4,3,2,3,3,3,3,[4],3,4,0,4,0,3,3,[2,3,3],2,[0,1,3]]");
 add("selser", "Unclosed and unmatched quotes (parsoid) 
[4,0,1,0,3,0,3,0,1,3,2,0,0,3,2,0,[[3],3,0,0,0,0],2,[[3,4],2,0],0,[2,[2],3]]");
 add("selser", "Unclosed and unmatched quotes (parsoid) 
[4,0,2,4,3,3,1,3,[[3]],0,2,0,0,4,0,0,1,4,[2,1,0],0,2]");
 add("selser", "Unclosed and unmatched quotes (parsoid) 
[2,0,[[2],[3,1]],0,3,0,1,3,4,4,2,3,[4],0,3,0,[4,3,[3],0,[3],0],0,[[0,1],3,0],4,4]");
@@ -3425,6 +3406,8 @@
 add("selser", "Parsing of overlapping (improperly nested) inline html tags 
(Parsoid) [2]");
 add("selser", "Parsing of overlapping (improperly nested) inline html tags 
(Parsoid) [[[[3]],1]]");
 add("selser", "Parsing of overlapping (improperly nested) inline html tags 
(Parsoid) [[[[3]],2]]");
+add("selser", "Parsing of overlapping (improperly nested) inline html tags 
(Parsoid) [1]");
+add("selser", "Parsing of overlapping (improperly nested) inline html tags 
(Parsoid) [[2,3]]");
 add("selser", "Parsing of overlapping (improperly nested) inline html tags 
(Parsoid) [[0,4]]");
 add("selser", "Simple template in language variants [[3]]");
 add("selser", "Simple template in language variants [[2]]");

-- 
To view, visit https://gerrit.wikimedia.org/r/70768
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I69bda7db141a3302cee6ece5af5ffdc57e748d89
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to