Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/70768
Change subject: (Bug 50068) Make DSR adjustments based on
treebuilder-corrections
......................................................................
(Bug 50068) Make DSR adjustments based on treebuilder-corrections
* mw:Placeholder/StrippedTag meta tags record information about
deleted tags (and represent non-zero width wikitext in source).
Similarly, autoInsertedEnd/Start flags on data-parsoid of nodes
record information about added tags (and represent zero-width
wikitext in source).
* In edit-mode, mw:Placeholder/StrippedTag meta tags are removed
which effectively creates a dsr-hole where it was removed from
and can lead to missing wikitext when selser uses that DSR
during serialization. Bug 50068 is an instance of this.
"Foo '''''bar''''' baz" parses to the HTML below. Notice that
the <i> tag spans from 4-14 and loses 3-chars worth of information
about the closing </b> tag.
-------------
<p data-parsoid='{"dsr":[0,21,0,0]}'>Foo <i data-parsoid='{"dsr":[4,14,2,2]}'>
<b data-parsoid='{"autoInsertedEnd":true,"dsr":[6,12,3,0]}'>bar</b></i>
baz</p>
-------------
In editMode false, the HTML output is (notice the placeholder below)
-------------
<p data-parsoid='{"dsr":[0,21,0,0]}'>Foo <i data-parsoid='{"dsr":[4,14,2,2]}'>
<b data-parsoid='{"autoInsertedEnd":true,"dsr":[6,12,3,0]}'>bar</b></i>
<meta typeof="mw:Placeholder/StrippedTag"
data-parsoid='{"src":"'''","dsr":[14,17,null,null]}'>
baz</p>
-------------
* This patch attempts to patch DSR by assimilating the gap into
previous siblings. Given that some instances of deleting an end
tag correspond to the same end tag being reinserted elsewhere in
the DOM (ex: badly nested HTML <span><s>x</span></s>), we also
have to make a fixup in the reverse direction on a node that has
autoInsertedEnd set.
This strategy is at best a heuristic that will not always work
properly -- work in progress.
* Fixes the bug + several selser tests around quotes. We have two
selser failures on a misnested HTML example.
Change-Id: I69bda7db141a3302cee6ece5af5ffdc57e748d89
---
M js/lib/mediawiki.DOMPostProcessor.js
M js/lib/mediawiki.Util.js
M js/tests/parserTests-blacklist.js
3 files changed, 71 insertions(+), 39 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/68/70768/1
diff --git a/js/lib/mediawiki.DOMPostProcessor.js
b/js/lib/mediawiki.DOMPostProcessor.js
index b86d863..c610205 100644
--- a/js/lib/mediawiki.DOMPostProcessor.js
+++ b/js/lib/mediawiki.DOMPostProcessor.js
@@ -1909,7 +1909,7 @@
// node -- node to process
// [s,e) -- if defined, start/end position of wikitext source that generated
// node's subtree
-function computeNodeDSR(env, node, s, e, traceDSR) {
+function computeNodeDSR(env, node, s, e, dsrCorrection, traceDSR) {
// TSR info on all these tags are only valid for the opening tag.
// (closing tags dont have attrs since tree-builder strips them
@@ -2058,6 +2058,23 @@
return [stWidth, etWidth];
}
+ function trace() {
+ if (traceDSR) {
+ Util.debug_pp.apply(Util, ['',
''].concat([].slice.apply(arguments)));
+ }
+ }
+
+ function traceNode(node, i, cs, ce) {
+ if (traceDSR) {
+ trace(
+ "-- Processing <", node.parentNode.nodeName,
":", i,
+ ">=", DU.isElt(node) ? '' : (DU.isText(node) ?
'#' : '!'),
+ DU.isElt(node) ? (node.nodeName === 'META' ?
node.outerHTML : node.nodeName) : node.data,
+ " with [", cs, ",", ce, "]"
+ );
+ }
+ }
+
// No undefined values here onwards.
// NOTE: Never use !s, !e, !cs, !ce for testing for non-null
// because any of them could be zero.
@@ -2069,7 +2086,7 @@
e = null;
}
- if (traceDSR) { console.warn("Received " + s + ", " + e + " for " +
node.nodeName + " --"); }
+ trace("Received ", s, ", ", e, " for ", node.nodeName);
var children = node.childNodes,
savedEndTagWidth = null,
@@ -2078,31 +2095,62 @@
// if this node has no child content, then the start and end for
// the child dom are indeed identical. Alternatively, we could
// explicitly code this check before everything and bypass this.
- cs = ce;
+ cs = ce,
+ editMode = env.conf.parsoid.editMode;
for (var n = children.length, i = n-1; i >= 0; i--) {
var isMarkerTag = false,
child = children[i],
cType = child.nodeType,
endTagWidth = null;
cs = null;
+
+ // In edit mode, StrippedTag marker tags will be removed and
wont
+ // be around to miss in the filling gap. So, absorb its width
into
+ // the DSR of its previous sibling.
+ if (editMode) {
+ var next = child.nextSibling;
+ if (next) {
+ if (DU.isElt(next) &&
/\bmw:Placeholder\/StrippedTag\b/.test(next.getAttribute("typeof"))) {
+ var correction =
next.data.parsoid.src.length;
+ ce += correction;
+ dsrCorrection = correction;
+ }
+ }
+ }
+
+ traceNode(child, i, cs, ce);
+
if (cType === Node.TEXT_NODE) {
- if (traceDSR) { console.warn("-- Processing <" +
node.nodeName + ":" + i + ">=#" + child.data + " with [" + cs + "," + ce +
"]"); }
if (ce !== null) {
cs = ce - child.data.length -
DU.indentPreDSRCorrection(child);
}
} else if (cType === Node.COMMENT_NODE) {
- if (traceDSR) { console.warn("-- Processing <" +
node.nodeName + ":" + i + ">=!" + child.data + " with [" + cs + "," + ce +
"]"); }
if (ce !== null) {
cs = ce - child.data.length - 7; // 7 chars for
"<!--" and "-->"
}
} else if (cType === Node.ELEMENT_NODE) {
- if (traceDSR) { console.warn("-- Processing <" +
node.nodeName + ":" + i + ">=" + child.nodeName + " with [" + cs + "," + ce +
"]"); }
var cTypeOf = child.getAttribute("typeof"),
dp = DU.getDataParsoid( child ),
tsr = dp.tsr,
oldCE = tsr ? tsr[1] : null,
propagateRight = false,
stWidth = null, etWidth = null;
+
+ // In edit-mode, we are making dsr corrections to
account for
+ // stripped tags (end tags usually). When stripping
happens,
+ // in most common use cases, a corresponding end tag is
added
+ // back elsewhere in the DOM.
+ //
+ // So, when an autoInsertedEnd tag is encountered and a
matching
+ // dsr-correction is found, make a 1-time correction in
the
+ // other direction.
+ if (editMode && ce !== null && dp.autoInsertedEnd) {
+ var correction = (3 + child.nodeName.length);
+ if (correction === dsrCorrection) {
+ ce -= correction;
+ dsrCorrection = 0;
+ }
+ }
if (DU.hasNodeName(child, "meta")) {
// Unless they have been foster-parented,
@@ -2181,7 +2229,10 @@
} else {
stWidth = tsr[1] -
tsr[0];
}
- if (traceDSR) {
console.warn("TSR: " + JSON.stringify(tsr) + "; cs: " + cs + "; ce: " + ce); }
+
+ if (traceDSR) {
+ trace("TSR: ", tsr, ";
cs: ", cs, "; ce: ", ce);
+ }
} else if (s && child.previousSibling
=== null) {
cs = s;
}
@@ -2201,11 +2252,10 @@
ccs = cs !== null && stWidth !== null ? cs +
stWidth : null;
cce = ce !== null && etWidth !== null ? ce -
etWidth : null;
- if (traceDSR) {
- console.warn("Before recursion,
[cs,ce]=" + cs + "," + ce +
- "; [sw,ew]=" + stWidth + "," +
etWidth +
- "; [ccs,cce]=" + ccs + "," +
cce);
- }
+
+ trace("Before recursion, [cs,ce]=", cs, ",", ce,
+ "; [sw,ew]=", stWidth, ",", etWidth,
+ "; [ccs,cce]=", ccs + ",", cce);
/*
-----------------------------------------------------------------
* Process DOM rooted at 'child'.
@@ -2240,7 +2290,7 @@
*
------------------------------------------------------------- */
newDsr = [ccs, cce];
} else {
- newDsr = computeNodeDSR(env, child,
ccs, cce, traceDSR);
+ newDsr = computeNodeDSR(env, child,
ccs, cce, dsrCorrection, traceDSR);
}
// Min(child-dom-tree dsr[0] - tag-width,
current dsr[0])
@@ -2260,7 +2310,7 @@
if (cs !== null || ce !== null) {
dp.dsr = [cs, ce, stWidth, etWidth];
if (traceDSR) {
- console.warn("-- UPDATING; " +
child.nodeName + " with [" + cs + "," + ce + "]; typeof: " + cTypeOf);
+ trace("-- UPDATING; ", child.nodeName,
" with [", cs, ",", ce, "]; typeof: ", cTypeOf);
// Set up 'dbsrc' so we can debug this
dp.dbsrc = env.page.src.substring(cs,
ce);
}
@@ -2293,7 +2343,7 @@
// Update and move right
if (traceDSR) {
- console.warn("CHANGING
ce.start of " + sibling.nodeName + " from " + siblingDP.dsr[0] + " to " +
newCE);
+ trace("CHANGING
ce.start of ", sibling.nodeName, " from ", siblingDP.dsr[0], " to ", newCE);
// debug info
if (siblingDP.dsr[1]) {
siblingDP.dbsrc
= env.page.src.substring(newCE, siblingDP.dsr[1]);
@@ -2333,9 +2383,8 @@
console.warn("WARNING: DSR inconsistency: cs/s mismatch for
node: " +
node.nodeName + " s: " + s + "; cs: " + cs);
}
- if (traceDSR) {
- console.warn("For " + node.nodeName + ", returning: " + cs + ",
" + e);
- }
+
+ trace("For ", node.nodeName, ", returning: ", cs, ", ", e);
return [cs, e];
}
@@ -2382,7 +2431,7 @@
// The actual computation buried in trace/debug stmts.
var body = root.body;
- computeNodeDSR(env, body, startOffset, endOffset, traceDSR);
+ computeNodeDSR(env, body, startOffset, endOffset, 0, traceDSR);
var dp = DU.getDataParsoid( body );
dp.dsr = [startOffset, endOffset, 0, 0];
diff --git a/js/lib/mediawiki.Util.js b/js/lib/mediawiki.Util.js
index ae275d6..27530cf 100644
--- a/js/lib/mediawiki.Util.js
+++ b/js/lib/mediawiki.Util.js
@@ -926,7 +926,7 @@
out.push('undefined');
} else if (a.constructor === Boolean) {
out.push(a ? '1' : '0');
- } else if (a.constructor !== String ||
a.match(/\n|^\s*$/)) {
+ } else if (a.constructor !== String ||
a.match(/\n|^\s+$/)) {
out.push(JSON.stringify(a));
} else {
out.push(a);
diff --git a/js/tests/parserTests-blacklist.js
b/js/tests/parserTests-blacklist.js
index 00998b0..9ebd3bb 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -2089,7 +2089,6 @@
add("selser", "Italics and bold
[[[4],3,2,3,[0,0,4,[2],0],0,4,0,[3,2,0,0,2],0,[3,4,4,[2],3],0,[0,3,4,[3],0],2,4,4,[0,1,4],0,1,0,[2,2,1,2],3,[0,[3,1],4,0],2,2,0,[2,3,0],2,0]]");
add("selser", "Italics and bold
[[1,2,[2,0,2],3,[0,[3],4,0,2],0,1,4,1,2,[0,0,0,[2],0],3,1,4,[0,[0,3,0],0],2,1,0,1,4,[2,0,[2],2],0,1,0,[0,3,2],3,[0,[3],0],0,3]]");
add("selser", "Italics and bold
[[4,2,4,3,1,4,1,0,[4,1,2,4,0],0,[0,[3],2,[2],0],3,[2,[3],0,4,0],3,4,2,[0,[3,2,0],0],2,[0,[[3],3],0],2,[0,1,2,4],0,[0,[0,3],1,3],0,1,4,3,4,1]]");
-add("selser", "Italics and bold
[[[3],2,[4,3,0],4,[2,1,2,2,2],0,[2,[2],0],2,2,4,[0,2,3,3,0],0,[0,2,3,0,3],4,[4,[3,4,0],2],3,3,2,2,2,4,0,1,0,[2,2,0],0,[0,4,3],0,0]]");
add("selser", "Italics and bold
[[[3],0,3,2,4,0,3,0,[2,3,0,1,4],0,[0,3,3,0,3],0,[0,0,0,0,4],0,1,2,[4,[0,2,3],4],0,[2,[0,3],4],0,[4,1,0,4],0,[4,4,1,0],0,1,0,[0,0,2],0,1]]");
add("selser", "Italics and possessives [2]");
add("selser", "Italics and possessives [[3,2]]");
@@ -2189,13 +2188,6 @@
add("selser", "Italics and bold: 5-quote opening sequence: (5,4) [[1]]");
add("selser", "Italics and bold: 5-quote opening sequence: (5,4) [[[2]]]");
add("selser", "Italics and bold: 5-quote opening sequence: (5,4) [[[[4]]]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [1]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[[2],0]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[2,0]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[1,0]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[1,2]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[1,4]]");
-add("selser", "Italics and bold: 5-quote opening sequence: (5,5) [[2,2]]");
add("selser", "Italics and bold: multiple quote sequences: (2,4,2) [[2]]");
add("selser", "Italics and bold: multiple quote sequences: (2,4,2) [[1]]");
add("selser", "Italics and bold: multiple quote sequences: (2,4,2) [[[2,0]]]");
@@ -2260,11 +2252,6 @@
add("selser", "Italics and bold: multiple quote sequences: (3,4,3) (parsoid)
[[[2],4,1]]");
add("selser", "Italics and bold: multiple quote sequences: (3,4,3) (parsoid)
[[0,2,1]]");
add("selser", "Italics and bold: multiple quote sequences: (3,4,3) (parsoid)
[[2,3,1]]");
-add("selser", "Italics and bold: other quote tests: (2,3,5) [[1,2]]");
-add("selser", "Italics and bold: other quote tests: (2,3,5) [1]");
-add("selser", "Italics and bold: other quote tests: (2,3,5) [[1,0]]");
-add("selser", "Italics and bold: other quote tests: (2,3,5) [[[2,2],4]]");
-add("selser", "Italics and bold: other quote tests: (2,3,5) [[1]]");
add("selser", "Italics and bold: other quote tests: (3,2,3,2) [1]");
add("selser", "Italics and bold: other quote tests: (3,2,3,2) [[[0,2],1]]");
add("selser", "Italics and bold: other quote tests: (3,2,3,2) [[1,[3]]]");
@@ -2664,12 +2651,6 @@
add("selser", "External link containing double-single-quotes with no space
separating the url from text in italics [[[0,4],4,3]]");
add("selser", "External link containing double-single-quotes with no space
separating the url from text in italics [[4,0,3]]");
add("selser", "External link containing double-single-quotes with no space
separating the url from text in italics [[[1,2],2,4]]");
-add("selser", "Quotes [1,0,[4,2,4,4]]");
-add("selser", "Quotes [2,0,1]");
-add("selser", "Quotes [[4,0,2,0],0,1]");
-add("selser", "Quotes [[3,[3],0,3],0,[2,2,2,3]]");
-add("selser", "Quotes [[2,[3],0,[3]],3,[0,2,0,4]]");
-add("selser", "Unclosed and unmatched quotes (parsoid)
[[2,0],2,[3,4],4,3,2,3,3,3,3,[4],3,4,0,4,0,3,3,[2,3,3],2,[0,1,3]]");
add("selser", "Unclosed and unmatched quotes (parsoid)
[4,0,1,0,3,0,3,0,1,3,2,0,0,3,2,0,[[3],3,0,0,0,0],2,[[3,4],2,0],0,[2,[2],3]]");
add("selser", "Unclosed and unmatched quotes (parsoid)
[4,0,2,4,3,3,1,3,[[3]],0,2,0,0,4,0,0,1,4,[2,1,0],0,2]");
add("selser", "Unclosed and unmatched quotes (parsoid)
[2,0,[[2],[3,1]],0,3,0,1,3,4,4,2,3,[4],0,3,0,[4,3,[3],0,[3],0],0,[[0,1],3,0],4,4]");
@@ -3425,6 +3406,8 @@
add("selser", "Parsing of overlapping (improperly nested) inline html tags
(Parsoid) [2]");
add("selser", "Parsing of overlapping (improperly nested) inline html tags
(Parsoid) [[[[3]],1]]");
add("selser", "Parsing of overlapping (improperly nested) inline html tags
(Parsoid) [[[[3]],2]]");
+add("selser", "Parsing of overlapping (improperly nested) inline html tags
(Parsoid) [1]");
+add("selser", "Parsing of overlapping (improperly nested) inline html tags
(Parsoid) [[2,3]]");
add("selser", "Parsing of overlapping (improperly nested) inline html tags
(Parsoid) [[0,4]]");
add("selser", "Simple template in language variants [[3]]");
add("selser", "Simple template in language variants [[2]]");
--
To view, visit https://gerrit.wikimedia.org/r/70768
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I69bda7db141a3302cee6ece5af5ffdc57e748d89
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits