jenkins-bot has submitted this change and it was merged.

Change subject: Improved wikitext escaping of headings
......................................................................


Improved wikitext escaping of headings

* Headings don't need to be escaped unless they are in the extremities
  In some cases (Ex: "==bogus== <s>a</s>"), the escaping didn't
  account for additional non-heading chars in eol position.

* The fixes now lead to 2 more wt2wt, 2 more html2wt and 32 more
  passing selser tests.

* Updated parserTests-blacklist

Change-Id: I65cfa51c7508145881c6b370d9ed452cc678903b
---
M js/lib/mediawiki.DOMUtils.js
M js/lib/mediawiki.WikitextSerializer.js
M js/tests/parserTests-blacklist.js
3 files changed, 32 insertions(+), 51 deletions(-)

Approvals:
  Cscott: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/js/lib/mediawiki.DOMUtils.js b/js/lib/mediawiki.DOMUtils.js
index e5f1a57..fd46824 100644
--- a/js/lib/mediawiki.DOMUtils.js
+++ b/js/lib/mediawiki.DOMUtils.js
@@ -484,7 +484,7 @@
         * Get the first child element or non-IEW text node, ignoring
         * whitespace-only text nodes and comments.
         */
-       getFirstNonSepChildNode: function(node) {
+       firstNonSepChildNode: function(node) {
                var child = node.firstChild;
                while (child && !this.isContentNode(child)) {
                        child = child.nextSibling;
diff --git a/js/lib/mediawiki.WikitextSerializer.js 
b/js/lib/mediawiki.WikitextSerializer.js
index 099d56f..85a2a2c 100644
--- a/js/lib/mediawiki.WikitextSerializer.js
+++ b/js/lib/mediawiki.WikitextSerializer.js
@@ -246,9 +246,13 @@
                }
 
                if (!linksOnly && tc === pd.TagTk) {
-                       // mw:Entity tokens
+                       // Ignore mw:Entity tokens
                        if (t.name === 'span' && t.getAttribute('typeof') === 
'mw:Entity') {
                                numEntities++;
+                               continue;
+                       }
+                       // Ignore heading tokens
+                       if (t.name.match(/^h\d$/)) {
                                continue;
                        }
 
@@ -256,9 +260,13 @@
                }
 
                if (!linksOnly && tc === pd.EndTagTk) {
-                       // mw:Entity tokens
+                       // Ignore mw:Entity tokens
                        if (numEntities > 0 && t.name === 'span') {
                                numEntities--;
+                               continue;
+                       }
+                       // Ignore heading tokens
+                       if (t.name.match(/^h\d$/)) {
                                continue;
                        }
 
@@ -615,10 +623,22 @@
        text = text.replace(/<(\/?nowiki)>/g, '&lt;$1&gt;');
 
        // Use the tokenizer to see if we have any wikitext tokens
+       //
+       // Ignores headings & entities -- headings have additional
+       // EOL matching requirements which are not captured by the
+       // hasWikitextTokens check
        if (this.wteHandlers.hasWikitextTokens(state, sol, text) || hasTildes) {
                // console.warn("---EWT:DBG1---");
                return escapedText(text);
-       } else if (!state.onSOL) {
+       } else if (state.onSOL) {
+               if (text.match(/^=+[^=]+=+$/)) {
+                       // console.warn("---EWT:DBG2---");
+                       return escapedText(text);
+               } else {
+                       // console.warn("---EWT:DBG3---");
+                       return text;
+               }
+       } else {
                // Detect if we have open brackets or heading chars -- we use 
'processed' flag
                // as a performance opt. to run this detection only if/when 
required.
                //
@@ -636,7 +656,7 @@
                        // - a text node: (Ex: <p>=foo=</p>)
                        // - the first child of a heading node: (Ex: 
<h1>=foo=</h1>)
                        if (cl.text.match(/^=/) &&
-                               (DU.isText(cl.firstNode) ||
+                               
(DU.isText(DU.firstNonSepChildNode(cl.firstNode.parentNode)) ||
                                cl.firstNode.nodeName.match(/^H/) && 
cl.firstNode.firstChild && DU.isText(cl.firstNode.firstChild)))
                        {
                                cl.hasOpenHeadingChar = true;
@@ -659,15 +679,12 @@
                    cl.hasOpenBrackets && text.match(/^[^\[]*\]/) &&
                                this.wteHandlers.hasWikitextTokens(state, sol, 
cl.text + text, true))
                {
-                       // console.warn("---EWT:DBG2---");
+                       // console.warn("---EWT:DBG4---");
                        return escapedText(text);
                } else {
-                       // console.warn("---EWT:DBG3---");
+                       // console.warn("---EWT:DBG5---");
                        return text;
                }
-       } else {
-               // console.warn("---EWT:DBG4---");
-               return text;
        }
 };
 
@@ -1317,13 +1334,13 @@
 
        return {
                handle: function (node, state, cb) {
-                       var firstChildElt = DU.getFirstNonSepChildNode(node);
+                       var firstChildElt = DU.firstNonSepChildNode(node);
 
                        // Skip builder-inserted wrappers
                        // Ex: <ul><s 
auto-inserted-start-and-end-><li>..</li><li>..</li></s>...</ul>
                        // output from: <s>\n*a\n*b\n*c</s>
                        while (firstChildElt && 
isBuilderInsertedElt(firstChildElt)) {
-                               firstChildElt = 
DU.getFirstNonSepChildNode(firstChildElt);
+                               firstChildElt = 
DU.firstNonSepChildNode(firstChildElt);
                        }
 
                        if (!firstChildElt || ! (firstChildElt.nodeName in 
firstChildNames)) {
@@ -1354,7 +1371,7 @@
 
        li: {
                handle: function (node, state, cb) {
-                       var firstChildElement = 
DU.getFirstNonSepChildNode(node);
+                       var firstChildElement = DU.firstNonSepChildNode(node);
                        if (!DU.isList(firstChildElement)) {
                                cb(state.serializer._getListBullets(node), 
node);
                        }
@@ -1384,7 +1401,7 @@
 
        dt: {
                handle: function (node, state, cb) {
-                       var firstChildElement = 
DU.getFirstNonSepChildNode(node);
+                       var firstChildElement = DU.firstNonSepChildNode(node);
                        if (!DU.isList(firstChildElement)) {
                                cb(state.serializer._getListBullets(node), 
node);
                        }
@@ -1412,7 +1429,7 @@
 
        dd: {
                handle: function (node, state, cb) {
-                       var firstChildElement = 
DU.getFirstNonSepChildNode(node);
+                       var firstChildElement = DU.firstNonSepChildNode(node);
                        if (!DU.isList(firstChildElement)) {
                                // XXX: handle stx: row
                                if (node.data.parsoid.stx === 'row') {
diff --git a/js/tests/parserTests-blacklist.js 
b/js/tests/parserTests-blacklist.js
index 1ccff8d..c58aa79 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -588,8 +588,6 @@
 add("wt2wt", "Transclusion of nonexistent MediaWiki message");
 add("wt2wt", "Transclusion of MediaWiki message with underscore");
 add("wt2wt", "Transclusion of MediaWiki message with space");
-add("wt2wt", "Section extraction test with bogus <nowiki> heading (section 
1)");
-add("wt2wt", "Section extraction test with bogus <nowiki> heading (section 
2)");
 add("wt2wt", "Section extraction, <pre> around bogus header (bug 10309)");
 add("wt2wt", "Section replacement, <pre> around bogus header (bug 10309)");
 add("wt2wt", "5 quotes, code coverage +1 line (parsoid)");
@@ -1748,8 +1746,6 @@
 add("html2wt", "Section replacement test (section 9)");
 add("html2wt", "Section replacement test (section 10)");
 add("html2wt", "Section replacement test with initial whitespace (bug 13728)");
-add("html2wt", "Section extraction, heading followed by pre with 20 spaces 
(bug 6398)");
-add("html2wt", "Section extraction, heading followed by pre with 19 spaces 
(bug 6398 sanity check)");
 add("html2wt", "Section extraction, <pre> around bogus header (bug 10309)");
 add("html2wt", "Section replacement, <pre> around bogus header (bug 10309)");
 add("html2wt", "Handling of &#x0A; in URLs");
@@ -2953,38 +2949,6 @@
 add("selser", "Transclusion of MediaWiki message with space [1]");
 add("selser", "Transclusion of MediaWiki message with space [4]");
 add("selser", "Transclusion of MediaWiki message with space [2]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[4,0,[0,1]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[1,0,1,0,1]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[2,0,[0,4]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[1,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[2,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[3,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[3,0,2,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[3,0,[0,1]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[1,0,1,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[1,0,[0,4]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[4,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[2,0,4,0,1]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[4,0,1,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 1) 
[4,0,4,0,1]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[2,0,4,0,1]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[3,0,[0,1]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[1,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[3,0,4,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[2,0,[0,[0]]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[4,0,1,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[1,0,[0,1]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[4,0,2,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[4,0,1,0,4]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[2,0,2,0,2]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[3,0,2,0,2]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[3,0,1,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[2,0,1,0,1]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[1,0,1,0,[0]]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[2,0,4,0,2]");
-add("selser", "Section extraction test with bogus <nowiki> heading (section 2) 
[2,0,4,0,[0]]");
 add("selser", "Section extraction, <pre> around bogus header (bug 10309) 
[1,0,4,0,1,4,0,3]");
 add("selser", "Section extraction, <pre> around bogus header (bug 10309) 
[1,0,[0]]");
 add("selser", "Section extraction, <pre> around bogus header (bug 10309) 
[1,0,2,0,2,[0]]");

-- 
To view, visit https://gerrit.wikimedia.org/r/60970
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I65cfa51c7508145881c6b370d9ed452cc678903b
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org>
Gerrit-Reviewer: Cscott <wikime...@cscott.net>
Gerrit-Reviewer: GWicke <gwi...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to