Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/67635
Change subject: Detect cycles in tpl-range nesting and pick a winner.
......................................................................
Detect cycles in tpl-range nesting and pick a winner.
* Because multiple templates can collectively generate a DOM subtree
(whereas individual elements of this collection dont), the
expanded template ranges for each member of the collection can
end up being identical. In addition, fostering of content from
tables can result in situations where some ranges can be flipped
over (end marker occuring earlier than the start marker).
Both these conditions are true with en:Mujigen_Hunter_Fandora
* Previous logic to not introduce cycles in the nesting detection
was simplistic and assumed that at most 2 templates can participate
in cycles, whereas in the above page, 3 templates participated in
a cycle. This sent the template encapsulation logic in an infinite
loop (in an attempt to find the topmost enclosing range).
* This patch eliminates the checks and introduces a more robust cycle
detection check that breaks cycles no matter how they were introduced.
It arbitrarily picks the smallest template range id as the winner,
i.e. the template range within which all the other elements of the
cycle are considered nested in. This works in eliminating loops
during template encapsulation.
However, the strategy of picking the smallest range always wont
work well if that range happens to be a flipped range and introduces
duplicate content during RTing. (See new wt2wt failure in parserTests)
This failure can be handled either in the serializer or by not
picking a flipped range as the outermost range in the parser --
to be decided what is better.
* 1 new wt2wt test failure (see explanation above)
1 new selser of the same test
Change-Id: I8e3042466719c98e2e6364fc756398d5ce6543ef
---
M js/lib/mediawiki.DOMPostProcessor.js
M js/tests/parserTests-blacklist.js
2 files changed, 57 insertions(+), 31 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/35/67635/1
diff --git a/js/lib/mediawiki.DOMPostProcessor.js
b/js/lib/mediawiki.DOMPostProcessor.js
index 711edee..8189d1e 100644
--- a/js/lib/mediawiki.DOMPostProcessor.js
+++ b/js/lib/mediawiki.DOMPostProcessor.js
@@ -1058,10 +1058,47 @@
}
}
- function findToplevelEnclosingRange(nestingInfo, rId) {
+ function findToplevelEnclosingRange(nestingInfo, startId) {
// Walk up the implicit nesting tree to the find the
// top-level range within which rId is nested.
+ //
+ // Detect cycles and return the smallest id for all
+ // elements in the cycle.
+ //
+ // Cycles can show up because of > 2 identical ranges (some of
which
+ // might be flipped ranges because of foster parenting
scenarios)
+
+ function findCycle(start, edges) {
+ var visited = {},
+ elt = start;
+ while (!visited[elt]) {
+ visited[elt] = true;
+ elt = edges[elt];
+ }
+ return Object.keys(visited);
+ }
+
+ var visitedIds = {},
+ rId = startId;
while (nestingInfo[rId]) {
+ if (visitedIds[rId]) {
+ // We have a cycle -- find members of the cycle
and return
+ // the smallest id in the cycle.
+ //
+ // NOTE: Cannot use members of visitedIds to
detect cycles
+ // since it can contain elements outside the
cycle.
+ var cycle = findCycle(rId, nestingInfo),
+ minId = Math.min.apply(null, cycle),
+ // minId is a number, rId, startId are
strings
+ minId = minId.toString();
+
+ // console.warn("Found cycle: " +
JSON.stringify(cycle) + "; Min id: " + minId);
+
+ // The smallest element will contain all the
other elements.
+ // So, minId itself should return null
+ return (minId === startId) ? null : minId;
+ }
+ visitedIds[rId] = true;
rId = nestingInfo[rId];
}
return rId;
@@ -1147,11 +1184,6 @@
// 'r' is nested for sure
// Record a range in which 'r' is
nested in.
nestedRangesMap[r.id] =
Object.keys(n.data.tmp_tplRanges)[0];
- if (nestedRangesMap[r.id] === r.id) {
- console.error("BUG! BUG! range
" + r.id + " is being reported as nested in itself!");
- console.error("Clearing the
flag to eliminate possibility of infinite loops, but output might be corrupt.");
- nestedRangesMap[r.id] = null;
- }
break;
} else {
// n === r.start
@@ -1161,6 +1193,9 @@
// compute their intersection. If this
intersection has
// another tpl range besides r itself,
we have a winner!
//
+ // Array A - B functionality that Ruby
has would have simplified
+ // this code!
+ //
// The code below does the above check
efficiently.
var s_tpls = r.start.data.tmp_tplRanges,
e_tpls =
r.end.data.tmp_tplRanges,
@@ -1168,21 +1203,8 @@
foundIntersection = false;
for (var j = 0; j < s_keys.length; j++)
{
-
- // Because of fostered of
end-tags, a range's end and start
- // tags might be flipped
(r.flipped). In such a scenario,
- // two ranges A and B might
have exactly identical ranges.
- //
- // A = {start: e1, end: e2}
- // B = {start: e2, end: e1}
where B is the flipped range.
- //
- // Hence we also need an
additional check to make sure
- // nestedRangesMap[other] !==
r.id. Without this check,
- // we will record that A is
nested in B and B is nested in A
- // which will introduce a loop
in findToplevelEnclosingRange!
-
var other = s_keys[j];
- if (other !== r.id &&
e_tpls[other] && nestedRangesMap[other] !== r.id) {
+ if (other !== r.id &&
e_tpls[other]) {
foundIntersection =
true;
// Record a range in
which 'r' is nested in.
nestedRangesMap[r.id] =
other;
@@ -1241,25 +1263,27 @@
}
*/
+ var enclosingRangeId = null;
if (nestedRangesMap[r.id]) {
- // console.warn("--nested--");
+ // console.warn("--possibly nested--");
+ enclosingRangeId =
findToplevelEnclosingRange(nestedRangesMap, r.id);
+ }
- // Nested -- ignore
+ if (nestedRangesMap[r.id] && enclosingRangeId) {
+ // console.warn("--nested--");
+ // Nested -- ignore r
startTagToStrip = r.startElem;
endTagToRemove = r.endElem;
-
if (argInfo) {
- var containingRangeId =
findToplevelEnclosingRange(nestedRangesMap, r.id);
-
- // 'r' is nested in 'containingRange' at the
top-level
- // So, containingRange gets r's argInfo
- if (!compoundTpls[containingRangeId]) {
- compoundTpls[containingRangeId] = [];
+ // 'r' is nested in 'enclosingRange' at the
top-level
+ // So, enclosingRange gets r's argInfo
+ if (!compoundTpls[enclosingRangeId]) {
+ compoundTpls[enclosingRangeId] = [];
}
- recordTemplateInfo(compoundTpls,
containingRangeId, r, argInfo);
+ recordTemplateInfo(compoundTpls,
enclosingRangeId, r, argInfo);
}
} else if (prev && !r.flipped && r.start === prev.end) {
- // console.warn("--overlapped--");
+ // /console.warn("--overlapped--");
// Overlapping ranges.
// r is the regular kind
diff --git a/js/tests/parserTests-blacklist.js
b/js/tests/parserTests-blacklist.js
index 05583b4..f7a640b 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -540,6 +540,7 @@
add("wt2wt", "2. includeonly in html attr value");
add("wt2wt", "Templates: HTML Tag: 2. Generation of HTML attr. value");
add("wt2wt", "Templates: HTML Tag: 3. Generation of HTML attr key and value");
+add("wt2wt", "Templates: Wiki Tables: 1b. Fostering of entire template
content");
add("wt2wt", "pre-save transform: nonexistent template");
add("wt2wt", "pre-save transform: mixed tag case");
add("wt2wt", "pre-save transform: Signature expansion");
@@ -2801,6 +2802,7 @@
add("selser", "2. includeonly in html attr value [1]");
add("selser", "Templates: HTML Tag: 2. Generation of HTML attr. value 0");
add("selser", "Templates: HTML Tag: 3. Generation of HTML attr key and value
0");
+add("selser", "Templates: Wiki Tables: 1b. Fostering of entire template
content 0");
add("selser", "pre-save transform: nonexistent template [[4]]");
add("selser", "pre-save transform: nonexistent template [[3]]");
add("selser", "pre-save transform: nonexistent template 0");
--
To view, visit https://gerrit.wikimedia.org/r/67635
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I8e3042466719c98e2e6364fc756398d5ce6543ef
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits