Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/60970
Change subject: Improved wikitext escaping of headings
......................................................................
Improved wikitext escaping of headings
* Headings don't need to be escaped unless the heading chars are at
the extremities (start and end) of the line.
In some cases (Ex: "==bogus== <s>a</s>"), the escaping didn't
account for additional non-heading chars in EOL position.
* With these fixes, 2 more wt2wt, 2 more html2wt, and 32 more
selser tests now pass.
* Updated parserTests-blacklist
Change-Id: I65cfa51c7508145881c6b370d9ed452cc678903b
---
M js/lib/mediawiki.DOMUtils.js
M js/lib/mediawiki.WikitextSerializer.js
M js/tests/parserTests-blacklist.js
3 files changed, 32 insertions(+), 51 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/70/60970/1
diff --git a/js/lib/mediawiki.DOMUtils.js b/js/lib/mediawiki.DOMUtils.js
index e5f1a57..fd46824 100644
--- a/js/lib/mediawiki.DOMUtils.js
+++ b/js/lib/mediawiki.DOMUtils.js
@@ -484,7 +484,7 @@
* Get the first child element or non-IEW text node, ignoring
* whitespace-only text nodes and comments.
*/
- getFirstNonSepChildNode: function(node) {
+ firstNonSepChildNode: function(node) {
var child = node.firstChild;
while (child && !this.isContentNode(child)) {
child = child.nextSibling;
diff --git a/js/lib/mediawiki.WikitextSerializer.js
b/js/lib/mediawiki.WikitextSerializer.js
index 099d56f..85a2a2c 100644
--- a/js/lib/mediawiki.WikitextSerializer.js
+++ b/js/lib/mediawiki.WikitextSerializer.js
@@ -246,9 +246,13 @@
}
if (!linksOnly && tc === pd.TagTk) {
- // mw:Entity tokens
+ // Ignore mw:Entity tokens
if (t.name === 'span' && t.getAttribute('typeof') ===
'mw:Entity') {
numEntities++;
+ continue;
+ }
+ // Ignore heading tokens
+ if (t.name.match(/^h\d$/)) {
continue;
}
@@ -256,9 +260,13 @@
}
if (!linksOnly && tc === pd.EndTagTk) {
- // mw:Entity tokens
+ // Ignore mw:Entity tokens
if (numEntities > 0 && t.name === 'span') {
numEntities--;
+ continue;
+ }
+ // Ignore heading tokens
+ if (t.name.match(/^h\d$/)) {
continue;
}
@@ -615,10 +623,22 @@
text = text.replace(/<(\/?nowiki)>/g, '&lt;$1&gt;');
// Use the tokenizer to see if we have any wikitext tokens
+ //
+ // Ignores headings & entities -- headings have additional
+ // EOL matching requirements which are not captured by the
+ // hasWikitextTokens check
if (this.wteHandlers.hasWikitextTokens(state, sol, text) || hasTildes) {
// console.warn("---EWT:DBG1---");
return escapedText(text);
- } else if (!state.onSOL) {
+ } else if (state.onSOL) {
+ if (text.match(/^=+[^=]+=+$/)) {
+ // console.warn("---EWT:DBG2---");
+ return escapedText(text);
+ } else {
+ // console.warn("---EWT:DBG3---");
+ return text;
+ }
+ } else {
// Detect if we have open brackets or heading chars -- we use
'processed' flag
// as a performance opt. to run this detection only if/when
required.
//
@@ -636,7 +656,7 @@
// - a text node: (Ex: <p>=foo=</p>)
// - the first child of a heading node: (Ex:
<h1>=foo=</h1>)
if (cl.text.match(/^=/) &&
- (DU.isText(cl.firstNode) ||
+
(DU.isText(DU.firstNonSepChildNode(cl.firstNode.parentNode)) ||
cl.firstNode.nodeName.match(/^H/) &&
cl.firstNode.firstChild && DU.isText(cl.firstNode.firstChild)))
{
cl.hasOpenHeadingChar = true;
@@ -659,15 +679,12 @@
cl.hasOpenBrackets && text.match(/^[^\[]*\]/) &&
this.wteHandlers.hasWikitextTokens(state, sol,
cl.text + text, true))
{
- // console.warn("---EWT:DBG2---");
+ // console.warn("---EWT:DBG4---");
return escapedText(text);
} else {
- // console.warn("---EWT:DBG3---");
+ // console.warn("---EWT:DBG5---");
return text;
}
- } else {
- // console.warn("---EWT:DBG4---");
- return text;
}
};
@@ -1317,13 +1334,13 @@
return {
handle: function (node, state, cb) {
- var firstChildElt = DU.getFirstNonSepChildNode(node);
+ var firstChildElt = DU.firstNonSepChildNode(node);
// Skip builder-inserted wrappers
// Ex: <ul><s
auto-inserted-start-and-end-><li>..</li><li>..</li></s>...</ul>
// output from: <s>\n*a\n*b\n*c</s>
while (firstChildElt &&
isBuilderInsertedElt(firstChildElt)) {
- firstChildElt =
DU.getFirstNonSepChildNode(firstChildElt);
+ firstChildElt =
DU.firstNonSepChildNode(firstChildElt);
}
if (!firstChildElt || ! (firstChildElt.nodeName in
firstChildNames)) {
@@ -1354,7 +1371,7 @@
li: {
handle: function (node, state, cb) {
- var firstChildElement =
DU.getFirstNonSepChildNode(node);
+ var firstChildElement = DU.firstNonSepChildNode(node);
if (!DU.isList(firstChildElement)) {
cb(state.serializer._getListBullets(node),
node);
}
@@ -1384,7 +1401,7 @@
dt: {
handle: function (node, state, cb) {
- var firstChildElement =
DU.getFirstNonSepChildNode(node);
+ var firstChildElement = DU.firstNonSepChildNode(node);
if (!DU.isList(firstChildElement)) {
cb(state.serializer._getListBullets(node),
node);
}
@@ -1412,7 +1429,7 @@
dd: {
handle: function (node, state, cb) {
- var firstChildElement =
DU.getFirstNonSepChildNode(node);
+ var firstChildElement = DU.firstNonSepChildNode(node);
if (!DU.isList(firstChildElement)) {
// XXX: handle stx: row
if (node.data.parsoid.stx === 'row') {
diff --git a/js/tests/parserTests-blacklist.js
b/js/tests/parserTests-blacklist.js
index 1ccff8d..c58aa79 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -588,8 +588,6 @@
add("wt2wt", "Transclusion of nonexistent MediaWiki message");
add("wt2wt", "Transclusion of MediaWiki message with underscore");
add("wt2wt", "Transclusion of MediaWiki message with space");
-add("wt2wt", "Section extraction test with bogus <nowiki> heading (section
1)");
-add("wt2wt", "Section extraction test with bogus <nowiki> heading (section
2)");
add("wt2wt", "Section extraction, <pre> around bogus header (bug 10309)");
add("wt2wt", "Section replacement, <pre> around bogus header (bug 10309)");
add("wt2wt", "5 quotes, code coverage +1 line (parsoid)");
@@ -1748,8 +1746,6 @@
add("html2wt", "Section replacement test (section 9)");
add("html2wt", "Section replacement test (section 10)");
add("html2wt", "Section replacement test with initial whitespace (bug 13728)");
-add("html2wt", "Section extraction, heading followed by pre with 20 spaces
(bug 6398)");
-add("html2wt", "Section extraction, heading followed by pre with 19 spaces
(bug 6398 sanity check)");
add("html2wt", "Section extraction, <pre> around bogus header (bug 10309)");
add("html2wt", "Section replacement, <pre> around bogus header (bug 10309)");
add("html2wt", "Handling of &#x0A; in URLs");
@@ -2953,38 +2949,6 @@
add("selser", "Transclusion of MediaWiki message with space [1]");
add("selser", "Transclusion of MediaWiki message with space [4]");
add("selser", "Transclusion of MediaWiki message with space [2]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[4,0,[0,1]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[1,0,1,0,1]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[2,0,[0,4]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[1,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[2,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[3,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[3,0,2,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[3,0,[0,1]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[1,0,1,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[1,0,[0,4]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[4,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[2,0,4,0,1]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[4,0,1,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1)
[4,0,4,0,1]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[2,0,4,0,1]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[3,0,[0,1]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[1,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[3,0,4,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[2,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[4,0,1,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[1,0,[0,1]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[4,0,2,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[4,0,1,0,4]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[2,0,2,0,2]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[3,0,2,0,2]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[3,0,1,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[2,0,1,0,1]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[1,0,1,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[2,0,4,0,2]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2)
[2,0,4,0,[0]]");
add("selser", "Section extraction, <pre> around bogus header (bug 10309)
[1,0,4,0,1,4,0,3]");
add("selser", "Section extraction, <pre> around bogus header (bug 10309)
[1,0,[0]]");
add("selser", "Section extraction, <pre> around bogus header (bug 10309)
[1,0,2,0,2,[0]]");
--
To view, visit https://gerrit.wikimedia.org/r/60970
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I65cfa51c7508145881c6b370d9ed452cc678903b
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits