Arlolra has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/325507

Change subject: Support extension tags which shadow block-level elements
......................................................................

Support extension tags which shadow block-level elements

Change-Id: Ieadcc21966dc30511fd9c56365b1abfcdadee3fe
---
M lib/utils/Util.js
M lib/wt2html/pegTokenizer.pegjs
M tests/parserTests-blacklist.js
3 files changed, 155 insertions(+), 140 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid 
refs/changes/07/325507/1

diff --git a/lib/utils/Util.js b/lib/utils/Util.js
index 590406e..0e71410 100644
--- a/lib/utils/Util.js
+++ b/lib/utils/Util.js
@@ -1377,11 +1377,6 @@
        }).join('');
 };
 
-Util.isHTMLElementName = function(name) {
-       name = name.toUpperCase();
-       return Consts.HTML.HTML5Tags.has(name) || 
Consts.HTML.OlderHTMLTags.has(name);
-};
-
 /**
  * Determine whether the protocol of a link is potentially valid. Use the
  * environment's per-wiki config to do so.
diff --git a/lib/wt2html/pegTokenizer.pegjs b/lib/wt2html/pegTokenizer.pegjs
index 60d6e4f..68fd213 100644
--- a/lib/wt2html/pegTokenizer.pegjs
+++ b/lib/wt2html/pegTokenizer.pegjs
@@ -72,6 +72,148 @@
         }
     };
 
+    var isXMLTag = function(name, block) {
+        var lName = name.toLowerCase();
+        var uName = name.toUpperCase();
+
+        // FIXME: These are installed extension tags which we, for some
+        // historical reason, are special casing in the grammar.  Ignore them
+        // here, they have their own rules.
+        //
+        // For <pre>, see https://gerrit.wikimedia.org/r/#/c/281076/
+        // where we'll clean this up.  Notice how much we can remove!
+        //
+        // For <nowiki>, see https://gerrit.wikimedia.org/r/#/c/232313/
+        // which has some relevant info for serialization.
+        var ignoredExtTag = lName === 'pre' || lName === 'nowiki';
+
+        var isInstalledExt = env.conf.wiki.extensionTags.has(lName) && 
!ignoredExtTag;
+        var isIncludeTag = lName === 'includeonly' ||
+                lName === 'noinclude' || lName === 'onlyinclude';
+
+        var isHtmlTag = block ?
+                // We need to ignore them here too because block tags have
+                // higher precedence than our questionable rules.
+                constants.HTML.BlockTags.has(uName) && !ignoredExtTag :
+                constants.HTML.HTML5Tags.has(uName) || 
constants.HTML.OlderHTMLTags.has(uName);
+
+        return isHtmlTag || isInstalledExt || isIncludeTag;
+    };
+
+    var maybeExtensionTag = function(t) {
+        var tagName = t.name.toLowerCase();
+
+        var isInstalledExt = env.conf.wiki.extensionTags.has(tagName);
+        var isIncludeTag = tagName === 'includeonly' ||
+                tagName === 'noinclude' || tagName === 'onlyinclude';
+
+        // Extensions have higher precedence when they shadow html tags.
+        if (!(isInstalledExt || isIncludeTag)) {
+            return t;
+        }
+
+        var dp = t.dataAttribs;
+        var skipLen = 0;
+
+        switch (t.constructor) {
+        case EndTagTk:
+            return t;
+        case SelfclosingTagTk:
+            dp.src = input.substring(dp.tsr[0], dp.tsr[1]);
+            dp.tagWidths = [dp.tsr[1] - dp.tsr[0], 0];
+            if (isIncludeTag) {
+                return t;
+            }
+            break;
+        case TagTk:
+            var tsr0 = dp.tsr[0];
+            var endTagRE = new RegExp("^[\\s\\S]*?(</\\s*" + tagName + 
"\\s*>)", "mi");
+            var restOfInput = input.substring(tsr0);
+            var tagContent = restOfInput.match(endTagRE);
+
+            if (!tagContent) {
+                dp.src = input.substring(dp.tsr[0], dp.tsr[1]);
+                dp.tagWidths = [dp.tsr[1] - dp.tsr[0], 0];
+                if (isIncludeTag) {
+                    return t;
+                } else {
+                    // This is undefined behaviour.  The php parser currently
+                    // returns a tag here as well, which results in unclosed
+                    // extension tags that shadow html tags falling back to
+                    // their html equivalent.  The sanitizer will take care
+                    // of converting to text where necessary.  We do this to
+                    // simplify `hasWikitextTokens` when escaping wikitext,
+                    // which wants these as tokens because it's otherwise
+                    // lacking in context.
+                    return t;  // not text()
+                }
+            }
+
+            var extSrc = tagContent[0];
+            var endTagWidth = tagContent[1].length;
+
+            // FIXME: This should be removed in favour of a native parser 
function
+            // for `tag`, which invokes the extension handler directly.
+            if (tagName === 'ref') {
+                // Support 1-level nesting of <ref> tags during tokenizing.
+                // <ref> tags are the exception to the rule (no nesting of ext 
tags)
+                //
+                // Expand extSrc as long as there is a <ref> tag found in the
+                // extension source body.
+                var s = extSrc.substring(endOffset() - tsr0);
+                while (s && s.match(new RegExp("<" + tagName + "[^<>]*>"))) {
+                    tagContent = 
restOfInput.substring(extSrc.length).match(endTagRE);
+                    if (tagContent) {
+                        s = tagContent[0];
+                        endTagWidth = tagContent[1].length;
+                        extSrc += s;
+                    } else {
+                        s = null;
+                    }
+                }
+            }
+
+            // Extension content source
+            dp.src = extSrc;
+            dp.tagWidths = [endOffset() - tsr0, endTagWidth];
+
+            skipLen = extSrc.length - dp.tagWidths[0] - dp.tagWidths[1];
+
+            // If the xml-tag is a known installed (not native) extension,
+            // skip the end-tag as well.
+            if (isInstalledExt) {
+                skipLen += endTagWidth;
+            }
+            break;
+        default:
+            console.assert(false, 'Should not be reachable.');
+        }
+
+        peg$currPos += skipLen;
+
+        if (isInstalledExt) {
+            // update tsr[1] to span the start and end tags.
+            dp.tsr[1] = endOffset();  // was just modified above
+            return new SelfclosingTagTk('extension', [
+                new KV('typeof', 'mw:Extension'),
+                new KV('name', tagName),
+                new KV('about', env.newAboutId()),
+                new KV('source', dp.src),
+                new KV('options', t.attribs),
+            ], dp);
+        } else if (isIncludeTag) {
+            // Parse ext-content, strip eof, and shift tsr
+            var extContent = dp.src.substring(dp.tagWidths[0], dp.src.length - 
dp.tagWidths[1]);
+            var extContentToks = (new 
PegTokenizer(env)).tokenizeSync(extContent);
+            if (dp.tagWidths[1] > 0) {
+                extContentToks = Util.stripEOFTkfromTokens(extContentToks);
+            }
+            Util.shiftTokenTSR(extContentToks, dp.tsr[0] + dp.tagWidths[0]);
+            return [t].concat(extContentToks);
+        } else {
+            console.assert(false, 'Should not be reachable.');
+        }
+    };
 }
 
 /*********************************************************
@@ -971,126 +1113,7 @@
  * ----------------------------------------------------------------------- */
 
 xmlish_tag =
-    t:generic_tag & {
-        var tagName = t.name.toLowerCase();
-        var isHtmlTag = Util.isHTMLElementName(tagName);
-        var isInstalledExt = env.conf.wiki.extensionTags.has(tagName);
-        var isIncludeTag = tagName === 'includeonly' ||
-                tagName === 'noinclude' || tagName === 'onlyinclude';
-        return isHtmlTag || isInstalledExt || isIncludeTag;
-    } {
-        var tagName = t.name.toLowerCase();
-        var isHtmlTag = Util.isHTMLElementName(tagName);
-        var isInstalledExt = env.conf.wiki.extensionTags.has(tagName);
-        var isIncludeTag = tagName === 'includeonly' ||
-                tagName === 'noinclude' || tagName === 'onlyinclude';
-        var dp = t.dataAttribs;
-        var skipLen = 0;
-
-        // Extensions have higher precedence when they shadow html tags.
-        if (!(isInstalledExt || isIncludeTag)) {
-            return t;
-        }
-
-        switch (t.constructor) {
-        case EndTagTk:
-            return t;
-        case SelfclosingTagTk:
-            dp.src = input.substring(dp.tsr[0], dp.tsr[1]);
-            dp.tagWidths = [dp.tsr[1] - dp.tsr[0], 0];
-            if (isIncludeTag) {
-                return t;
-            }
-            break;
-        case TagTk:
-            var tsr0 = dp.tsr[0];
-            var endTagRE = new RegExp("^[\\s\\S]*?(</\\s*" + tagName + 
"\\s*>)", "mi");
-            var restOfInput = input.substring(tsr0);
-            var tagContent = restOfInput.match(endTagRE);
-
-            if (!tagContent) {
-                dp.src = input.substring(dp.tsr[0], dp.tsr[1]);
-                dp.tagWidths = [dp.tsr[1] - dp.tsr[0], 0];
-                if (isIncludeTag) {
-                    return t;
-                } else {
-                    // This is undefined behaviour.  The php parser currently
-                    // returns a tag here as well, which results in unclosed
-                    // extension tags that shadow html tags falling back to
-                    // their html equivalent.  The sanitizer will take care
-                    // of converting to text where necessary.  We do this to
-                    // simplify `hasWikitextTokens` when escaping wikitext,
-                    // which wants these as tokens because it's otherwise
-                    // lacking in context.
-                    return t;  // not text()
-                }
-            }
-
-            var extSrc = tagContent[0];
-            var endTagWidth = tagContent[1].length;
-
-            // FIXME: This should be removed in favour of a native parser 
function
-            // for `tag`, which invokes the extension handler directly.
-            if (tagName === 'ref') {
-                // Support 1-level nesting of <ref> tags during tokenizing.
-                // <ref> tags are the exception to the rule (no nesting of ext 
tags)
-                //
-                // Expand extSrc as long as there is a <ref> tag found in the
-                // extension source body.
-                var s = extSrc.substring(endOffset() - tsr0);
-                while (s && s.match(new RegExp("<" + tagName + "[^<>]*>"))) {
-                    tagContent = 
restOfInput.substring(extSrc.length).match(endTagRE);
-                    if (tagContent) {
-                        s = tagContent[0];
-                        endTagWidth = tagContent[1].length;
-                        extSrc += s;
-                    } else {
-                        s = null;
-                    }
-                }
-            }
-
-            // Extension content source
-            dp.src = extSrc;
-            dp.tagWidths = [endOffset() - tsr0, endTagWidth];
-
-            skipLen = extSrc.length - dp.tagWidths[0] - dp.tagWidths[1];
-
-            // If the xml-tag is a known installed (not native) extension,
-            // skip the end-tag as well.
-            if (isInstalledExt) {
-                skipLen += endTagWidth;
-            }
-            break;
-        default:
-            console.assert(false, 'Should not be reachable.');
-        }
-
-        peg$currPos += skipLen;
-
-        if (isInstalledExt) {
-            // update tsr[1] to span the start and end tags.
-            dp.tsr[1] = endOffset();  // was just modified above
-            return new SelfclosingTagTk('extension', [
-                new KV('typeof', 'mw:Extension'),
-                new KV('name', tagName),
-                new KV('about', env.newAboutId()),
-                new KV('source', dp.src),
-                new KV('options', t.attribs),
-            ], dp);
-        } else if (isIncludeTag) {
-            // Parse ext-content, strip eof, and shift tsr
-            var extContent = dp.src.substring(dp.tagWidths[0], dp.src.length - 
dp.tagWidths[1]);
-            var extContentToks = (new 
PegTokenizer(env)).tokenizeSync(extContent);
-            if (dp.tagWidths[1] > 0) {
-                extContentToks = Util.stripEOFTkfromTokens(extContentToks);
-            }
-            Util.shiftTokenTSR(extContentToks, dp.tsr[0] + dp.tagWidths[0]);
-            return [t].concat(extContentToks);
-        } else {
-            console.assert(false, 'Should not be reachable.');
-        }
-    }
+    t:generic_tag & { return isXMLTag(t.name, false); } { return 
maybeExtensionTag(t); }
 
 /*
  * Nowiki treats anything inside it as plain text. It could thus also be
@@ -1337,19 +1360,13 @@
  */
 block_tag
   = "<" end:"/"?
-    name:$(tn:tag_name & {
-      var lcTn = tn.toLowerCase();
-      return lcTn !== "pre" && lcTn !== "hr" &&
-        constants.HTML.BlockTags.has(tn.toUpperCase());
-    })
+    name:$(tn:tag_name & { return isXMLTag(tn, true); })
     attribs:generic_newline_attributes
     space_or_newline*
     selfclose:"/"?
     ">" {
-      return [
-        tu.buildXMLTag(name, name.toLowerCase(), attribs, end, selfclose,
-          tsrOffsets()),
-      ];
+      var t = tu.buildXMLTag(name, name.toLowerCase(), attribs, end, 
selfclose, tsrOffsets());
+      return [maybeExtensionTag(t)];
     }
 
 
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 05f6075..96f5976 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -297,7 +297,7 @@
 add("wt2wt", "Parsoid-centric test: Whitespace in ext- and wiki-links should 
be preserved", "[[Foo|  bar]]\n\n[[Foo|  ''bar'']]\n\n[http://wp.org 
foo]\n\n[http://wp.org ''foo'']\n");
 add("wt2wt", "Handling html with a div self-closing tag", "<div title=\"\" 
/>\n<div title=\"\" />\n<div title=\"\" />\n<div title=\"bar\" />\n<div 
title=\"bar\" />\n<div title=\"bar/\">");
 add("wt2wt", "Handling html with a br self-closing tag", "<br title=\"\" 
/>\n<br title=\"\" />\n<br title=\"\" />\n<br title=\"bar\" />\n<br 
title=\"bar\" />\n<br title=\"bar/\">\n");
-add("wt2wt", "Horizontal ruler (should it add that extra space?)", 
"<hr>\n<hr>\nfoo <hr> bar\n");
+add("wt2wt", "Horizontal ruler (should it add that extra space?)", "<hr 
/>\n<hr />\nfoo <hr> bar\n");
 add("wt2wt", "Nested lists 3 (first element empty)", "\n**bar\n");
 add("wt2wt", "Nested lists 6 (both elements empty)", "\n**\n");
 add("wt2wt", "Unbalanced closing non-block tags don't break a list\n(php 
parser relies on Tidy to fix up)", "<span>\n*a<span>\n*b");
@@ -1587,15 +1587,18 @@
 add("selser", "Horizontal ruler (should it add that extra space?) 
[2,2,0,2,[3],0,[4]]", "mfej6cl2tvlpu8fr<hr>g7w5lx5r4xz85mi\n<hr 
>n8aj1jq5r9e5qaor\n<hr\n>4tlzqynij4lk57b9");
 add("selser", "Horizontal ruler (should it add that extra space?) 
[0,4,3,4,2,0,2]", 
"<hr>efqnkd9nqlwel8fr\n\nxmvwrb0llpz7u8fr\n\njplm7w9i7t10dx6r\n\nfoo 
<hr\n>wmoqdkg19vtgwrk9\n\n<nowiki> </nowiki>bar");
 add("selser", "Horizontal ruler (should it add that extra space?) 
[2,0,0,2,4,2,1]", "zxpf8ei52ttn8kt9<hr>\n<hr 
>f75t6r98vikn8kt9\n16a3ff1niy7m0a4i\n\nznvihrl0p4hiwwmi<hr\n> bar");
-add("selser", "Horizontal ruler (should it add that extra space?) 
[1,0,0,0,2,0,4]", "<hr data-foobar=\"zbky74okstc5wmi\">\n<hr 
>\n1cdauyobybltbj4i\n\nfoo <hr\n>72ti2j1ve5pl23xr\n");
+add("selser", "Horizontal ruler (should it add that extra space?) 
[0,4,3,4,1,4,[2]]", "<hr>504w47ur2i8gp66r\n\n89w0alf7mwhtzkt9\n\nfoo 
\n\nqj7pzeukhe7gy14i\n\nwcqqssn8slkdquxr bar");
+add("selser", "Horizontal ruler (should it add that extra space?) 
[1,0,0,0,2,0,4]", "<hr data-foobar=\"zbky74okstc5wmi\" />\n<hr 
>\n1cdauyobybltbj4i\n\nfoo <hr\n>72ti2j1ve5pl23xr\n");
 add("selser", "Horizontal ruler (should it add that extra space?) 
[2,3,0,0,[4],3,2]", "o5rwiutljxjsv2t9<hr><hr 
>\nd35ei25187b0ggb9\n\nl2gmcvf139bsatt9\n\n<nowiki> </nowiki>bar");
-add("selser", "Horizontal ruler (should it add that extra space?) 
[3,3,1,0,0,0,0]", "<hr data-foobar=\"lf04si3r3323xr\">\nfoo <hr\n> bar");
+add("selser", "Horizontal ruler (should it add that extra space?) 
[0,4,1,4,0,4,[2]]", "<hr>bd1fbirckdkg9zfr<hr data-foobar=\"p7mw3becxflxr\" 
/>ziabhrhonumlhaor\n\nfoo \n\nrtsgmqphxg5vcxr\n\nrj4m9beenwjnhfr bar");
+add("selser", "Horizontal ruler (should it add that extra space?) 
[3,3,1,0,0,0,0]", "<hr data-foobar=\"lf04si3r3323xr\" />\nfoo <hr\n> bar");
 add("selser", "Horizontal ruler (should it add that extra space?) 
[4,0,0,3,2,3,[2]]", "x425cn04w3y833di\n<hr >w1vtj40el2x4j9k9\n\nfoo 
\n\n1k0l0e76yfbnjyvi bar");
-add("selser", "Horizontal ruler (should it add that extra space?) 
[1,3,0,3,0,3,[3]]", "<hr data-foobar=\"l0b3o734w50o1or\"><hr >foo \n");
+add("selser", "Horizontal ruler (should it add that extra space?) 
[1,3,0,3,0,3,[3]]", "<hr data-foobar=\"l0b3o734w50o1or\" /><hr >foo \n");
 add("selser", "Horizontal ruler (should it add that extra space?) 
[0,0,0,0,[4],0,1]", "<hr>\n<hr >\n2bhtkpyrargj5rk9<hr\n> bar");
+add("selser", "Horizontal ruler (should it add that extra space?) 
[0,3,3,2,4,1,3]", "<hr>ct10s4snbyq77gb9\nvb66koxr72oqolxr<hr 
data-foobar=\"trqbxkno4n6irudi\">");
 add("selser", "Horizontal ruler (should it add that extra space?) 
[3,4,4,2,3,0,0]", 
"o6enrwkz4kl9dx6r\n\nqc1zz05ztk9sh5mi\n\npt363zha88q9qkt9\n<hr\n> bar");
-add("selser", "Horizontal ruler (should it add that extra space?) 
[0,0,1,0,[3],0,3]", "<hr>\n<hr data-foobar=\"h3lfecmid64unmi\">\n<hr\n>");
-add("selser", "Horizontal ruler (should it add that extra space?) 
[1,2,4,2,1,0,0]", "<hr 
data-foobar=\"n0w4kgj63z5jnhfr\">2tme0t8y5iwfjemi\no1nlliabwat9be29\n\n77m2ud715kfyldi\nfoo
 <hr\n> bar");
+add("selser", "Horizontal ruler (should it add that extra space?) 
[0,0,1,0,[3],0,3]", "<hr>\n<hr data-foobar=\"h3lfecmid64unmi\" />\n<hr\n>");
+add("selser", "Horizontal ruler (should it add that extra space?) 
[1,2,4,2,1,0,0]", "<hr data-foobar=\"n0w4kgj63z5jnhfr\" 
/>2tme0t8y5iwfjemi\no1nlliabwat9be29\n\n77m2ud715kfyldi\nfoo <hr\n> bar");
 add("selser", "Horizontal ruler (should it add that extra space?) 
[0,0,0,4,0,3,0]", "<hr>\n<hr >3nksf5u6o17nwmi\n\nfoo \n\n<nowiki> 
</nowiki>bar\n");
 add("selser", "Horizontal ruler (should it add that extra space?) 
[0,0,0,3,4,1,0]", "<hr>\n<hr >d13gtt9304e1m7vi<hr 
data-foobar=\"fbu9qf9y7h7aatt9\"> bar");
 add("selser", "Mixed list [3,0,0]", "#** Level 3, but ordered");

-- 
To view, visit https://gerrit.wikimedia.org/r/325507
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ieadcc21966dc30511fd9c56365b1abfcdadee3fe
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Arlolra <abrea...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to