Arlolra has uploaded a new change for review. https://gerrit.wikimedia.org/r/325507
Change subject: Support extension tags which shadows block level elements ...................................................................... Support extension tags which shadows block level elements Change-Id: Ieadcc21966dc30511fd9c56365b1abfcdadee3fe --- M lib/utils/Util.js M lib/wt2html/pegTokenizer.pegjs M tests/parserTests-blacklist.js 3 files changed, 155 insertions(+), 140 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/07/325507/1 diff --git a/lib/utils/Util.js b/lib/utils/Util.js index 590406e..0e71410 100644 --- a/lib/utils/Util.js +++ b/lib/utils/Util.js @@ -1377,11 +1377,6 @@ }).join(''); }; -Util.isHTMLElementName = function(name) { - name = name.toUpperCase(); - return Consts.HTML.HTML5Tags.has(name) || Consts.HTML.OlderHTMLTags.has(name); -}; - /** * Determine whether the protocol of a link is potentially valid. Use the * environment's per-wiki config to do so. diff --git a/lib/wt2html/pegTokenizer.pegjs b/lib/wt2html/pegTokenizer.pegjs index 60d6e4f..68fd213 100644 --- a/lib/wt2html/pegTokenizer.pegjs +++ b/lib/wt2html/pegTokenizer.pegjs @@ -72,6 +72,148 @@ } }; + var isXMLTag = function(name, block) { + var lName = name.toLowerCase(); + var uName = name.toUpperCase(); + + // FIXME: These are installed extension tags which we, for some + // historical reason, are special casing in the grammar. Ignore them + // here, they have their own rules. + // + // For <pre>, see https://gerrit.wikimedia.org/r/#/c/281076/ + // where we'll clean this up. Notice how much we can remove! + // + // For <nowiki>, see https://gerrit.wikimedia.org/r/#/c/232313/ + // which has some relevant info for serialization. + var ignoredExtTag = lName === 'pre' || lName === 'nowiki'; + + var isInstalledExt = env.conf.wiki.extensionTags.has(lName) && !ignoredExtTag; + var isIncludeTag = lName === 'includeonly' || + lName === 'noinclude' || lName === 'onlyinclude'; + + var isHtmlTag = block ? + // We need to ignore them here too because block tags have + // higher precedence than our questionable rules. + constants.HTML.BlockTags.has(uName) && !ignoredExtTag : + constants.HTML.HTML5Tags.has(uName) || constants.HTML.OlderHTMLTags.has(uName); + + return isHtmlTag || isInstalledExt || isIncludeTag; + }; + + var maybeExtensionTag = function(t) { + var tagName = t.name.toLowerCase(); + + var isInstalledExt = env.conf.wiki.extensionTags.has(tagName); + var isIncludeTag = tagName === 'includeonly' || + tagName === 'noinclude' || tagName === 'onlyinclude'; + + // Extensions have higher precedence when they shadow html tags. + if (!(isInstalledExt || isIncludeTag)) { + return t; + } + + var dp = t.dataAttribs; + var skipLen = 0; + + switch (t.constructor) { + case EndTagTk: + return t; + case SelfclosingTagTk: + dp.src = input.substring(dp.tsr[0], dp.tsr[1]); + dp.tagWidths = [dp.tsr[1] - dp.tsr[0], 0]; + if (isIncludeTag) { + return t; + } + break; + case TagTk: + var tsr0 = dp.tsr[0]; + var endTagRE = new RegExp("^[\\s\\S]*?(</\\s*" + tagName + "\\s*>)", "mi"); + var restOfInput = input.substring(tsr0); + var tagContent = restOfInput.match(endTagRE); + + if (!tagContent) { + dp.src = input.substring(dp.tsr[0], dp.tsr[1]); + dp.tagWidths = [dp.tsr[1] - dp.tsr[0], 0]; + if (isIncludeTag) { + return t; + } else { + // This is undefined behaviour. The php parser currently + // returns a tag here as well, which results in unclosed + // extension tags that shadow html tags falling back to + // their html equivalent. The sanitizer will take care + // of converting to text where necessary. We do this to + // simplify `hasWikitextTokens` when escaping wikitext, + // which wants these as tokens because it's otherwise + // lacking in context. + return t; // not text() + } + } + + var extSrc = tagContent[0]; + var endTagWidth = tagContent[1].length; + + // FIXME: This should be removed in favour of a native parser function + // for `tag`, which invokes the extension handler directly. + if (tagName === 'ref') { + // Support 1-level nesting of <ref> tags during tokenizing. + // <ref> tags are the exception to the rule (no nesting of ext tags) + // + // Expand extSrc as long as there is a <ref> tag found in the + // extension source body. + var s = extSrc.substring(endOffset() - tsr0); + while (s && s.match(new RegExp("<" + tagName + "[^<>]*>"))) { + tagContent = restOfInput.substring(extSrc.length).match(endTagRE); + if (tagContent) { + s = tagContent[0]; + endTagWidth = tagContent[1].length; + extSrc += s; + } else { + s = null; + } + } + } + + // Extension content source + dp.src = extSrc; + dp.tagWidths = [endOffset() - tsr0, endTagWidth]; + + skipLen = extSrc.length - dp.tagWidths[0] - dp.tagWidths[1]; + + // If the xml-tag is a known installed (not native) extension, + // skip the end-tag as well. + if (isInstalledExt) { + skipLen += endTagWidth; + } + break; + default: + console.assert(false, 'Should not be reachable.'); + } + + peg$currPos += skipLen; + + if (isInstalledExt) { + // update tsr[1] to span the start and end tags. + dp.tsr[1] = endOffset(); // was just modified above + return new SelfclosingTagTk('extension', [ + new KV('typeof', 'mw:Extension'), + new KV('name', tagName), + new KV('about', env.newAboutId()), + new KV('source', dp.src), + new KV('options', t.attribs), + ], dp); + } else if (isIncludeTag) { + // Parse ext-content, strip eof, and shift tsr + var extContent = dp.src.substring(dp.tagWidths[0], dp.src.length - dp.tagWidths[1]); + var extContentToks = (new PegTokenizer(env)).tokenizeSync(extContent); + if (dp.tagWidths[1] > 0) { + extContentToks = Util.stripEOFTkfromTokens(extContentToks); + } + Util.shiftTokenTSR(extContentToks, dp.tsr[0] + dp.tagWidths[0]); + return [t].concat(extContentToks); + } else { + console.assert(false, 'Should not be reachable.'); + } + }; } /********************************************************* @@ -971,126 +1113,7 @@ * ----------------------------------------------------------------------- */ xmlish_tag = - t:generic_tag & { - var tagName = t.name.toLowerCase(); - var isHtmlTag = Util.isHTMLElementName(tagName); - var isInstalledExt = env.conf.wiki.extensionTags.has(tagName); - var isIncludeTag = tagName === 'includeonly' || - tagName === 'noinclude' || tagName === 'onlyinclude'; - return isHtmlTag || isInstalledExt || isIncludeTag; - } { - var tagName = t.name.toLowerCase(); - var isHtmlTag = Util.isHTMLElementName(tagName); - var isInstalledExt = env.conf.wiki.extensionTags.has(tagName); - var isIncludeTag = tagName === 'includeonly' || - tagName === 'noinclude' || tagName === 'onlyinclude'; - var dp = t.dataAttribs; - var skipLen = 0; - - // Extensions have higher precedence when they shadow html tags. - if (!(isInstalledExt || isIncludeTag)) { - return t; - } - - switch (t.constructor) { - case EndTagTk: - return t; - case SelfclosingTagTk: - dp.src = input.substring(dp.tsr[0], dp.tsr[1]); - dp.tagWidths = [dp.tsr[1] - dp.tsr[0], 0]; - if (isIncludeTag) { - return t; - } - break; - case TagTk: - var tsr0 = dp.tsr[0]; - var endTagRE = new RegExp("^[\\s\\S]*?(</\\s*" + tagName + "\\s*>)", "mi"); - var restOfInput = input.substring(tsr0); - var tagContent = restOfInput.match(endTagRE); - - if (!tagContent) { - dp.src = input.substring(dp.tsr[0], dp.tsr[1]); - dp.tagWidths = [dp.tsr[1] - dp.tsr[0], 0]; - if (isIncludeTag) { - return t; - } else { - // This is undefined behaviour. The php parser currently - // returns a tag here as well, which results in unclosed - // extension tags that shadow html tags falling back to - // their html equivalent. The sanitizer will take care - // of converting to text where necessary. We do this to - // simplify `hasWikitextTokens` when escaping wikitext, - // which wants these as tokens because it's otherwise - // lacking in context. - return t; // not text() - } - } - - var extSrc = tagContent[0]; - var endTagWidth = tagContent[1].length; - - // FIXME: This should be removed in favour of a native parser function - // for `tag`, which invokes the extension handler directly. - if (tagName === 'ref') { - // Support 1-level nesting of <ref> tags during tokenizing. - // <ref> tags are the exception to the rule (no nesting of ext tags) - // - // Expand extSrc as long as there is a <ref> tag found in the - // extension source body. - var s = extSrc.substring(endOffset() - tsr0); - while (s && s.match(new RegExp("<" + tagName + "[^<>]*>"))) { - tagContent = restOfInput.substring(extSrc.length).match(endTagRE); - if (tagContent) { - s = tagContent[0]; - endTagWidth = tagContent[1].length; - extSrc += s; - } else { - s = null; - } - } - } - - // Extension content source - dp.src = extSrc; - dp.tagWidths = [endOffset() - tsr0, endTagWidth]; - - skipLen = extSrc.length - dp.tagWidths[0] - dp.tagWidths[1]; - - // If the xml-tag is a known installed (not native) extension, - // skip the end-tag as well. - if (isInstalledExt) { - skipLen += endTagWidth; - } - break; - default: - console.assert(false, 'Should not be reachable.'); - } - - peg$currPos += skipLen; - - if (isInstalledExt) { - // update tsr[1] to span the start and end tags. - dp.tsr[1] = endOffset(); // was just modified above - return new SelfclosingTagTk('extension', [ - new KV('typeof', 'mw:Extension'), - new KV('name', tagName), - new KV('about', env.newAboutId()), - new KV('source', dp.src), - new KV('options', t.attribs), - ], dp); - } else if (isIncludeTag) { - // Parse ext-content, strip eof, and shift tsr - var extContent = dp.src.substring(dp.tagWidths[0], dp.src.length - dp.tagWidths[1]); - var extContentToks = (new PegTokenizer(env)).tokenizeSync(extContent); - if (dp.tagWidths[1] > 0) { - extContentToks = Util.stripEOFTkfromTokens(extContentToks); - } - Util.shiftTokenTSR(extContentToks, dp.tsr[0] + dp.tagWidths[0]); - return [t].concat(extContentToks); - } else { - console.assert(false, 'Should not be reachable.'); - } - } + t:generic_tag & { return isXMLTag(t.name, false); } { return maybeExtensionTag(t); } /* * Nowiki treats anything inside it as plain text. It could thus also be @@ -1337,19 +1360,13 @@ */ block_tag = "<" end:"/"? - name:$(tn:tag_name & { - var lcTn = tn.toLowerCase(); - return lcTn !== "pre" && lcTn !== "hr" && - constants.HTML.BlockTags.has(tn.toUpperCase()); - }) + name:$(tn:tag_name & { return isXMLTag(tn, true); }) attribs:generic_newline_attributes space_or_newline* selfclose:"/"? ">" { - return [ - tu.buildXMLTag(name, name.toLowerCase(), attribs, end, selfclose, - tsrOffsets()), - ]; + var t = tu.buildXMLTag(name, name.toLowerCase(), attribs, end, selfclose, tsrOffsets()); + return [maybeExtensionTag(t)]; } diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js index 05f6075..96f5976 100644 --- a/tests/parserTests-blacklist.js +++ b/tests/parserTests-blacklist.js @@ -297,7 +297,7 @@ add("wt2wt", "Parsoid-centric test: Whitespace in ext- and wiki-links should be preserved", "[[Foo| bar]]\n\n[[Foo| ''bar'']]\n\n[http://wp.org foo]\n\n[http://wp.org ''foo'']\n"); add("wt2wt", "Handling html with a div self-closing tag", "<div title=\"\" />\n<div title=\"\" />\n<div title=\"\" />\n<div title=\"bar\" />\n<div title=\"bar\" />\n<div title=\"bar/\">"); add("wt2wt", "Handling html with a br self-closing tag", "<br title=\"\" />\n<br title=\"\" />\n<br title=\"\" />\n<br title=\"bar\" />\n<br title=\"bar\" />\n<br title=\"bar/\">\n"); -add("wt2wt", "Horizontal ruler (should it add that extra space?)", "<hr>\n<hr>\nfoo <hr> bar\n"); +add("wt2wt", "Horizontal ruler (should it add that extra space?)", "<hr />\n<hr />\nfoo <hr> bar\n"); add("wt2wt", "Nested lists 3 (first element empty)", "\n**bar\n"); add("wt2wt", "Nested lists 6 (both elements empty)", "\n**\n"); add("wt2wt", "Unbalanced closing non-block tags don't break a list\n(php parser relies on Tidy to fix up)", "<span>\n*a<span>\n*b"); @@ -1587,15 +1587,18 @@ add("selser", "Horizontal ruler (should it add that extra space?) [2,2,0,2,[3],0,[4]]", "mfej6cl2tvlpu8fr<hr>g7w5lx5r4xz85mi\n<hr >n8aj1jq5r9e5qaor\n<hr\n>4tlzqynij4lk57b9"); add("selser", "Horizontal ruler (should it add that extra space?) [0,4,3,4,2,0,2]", "<hr>efqnkd9nqlwel8fr\n\nxmvwrb0llpz7u8fr\n\njplm7w9i7t10dx6r\n\nfoo <hr\n>wmoqdkg19vtgwrk9\n\n<nowiki> </nowiki>bar"); add("selser", "Horizontal ruler (should it add that extra space?) [2,0,0,2,4,2,1]", "zxpf8ei52ttn8kt9<hr>\n<hr >f75t6r98vikn8kt9\n16a3ff1niy7m0a4i\n\nznvihrl0p4hiwwmi<hr\n> bar"); -add("selser", "Horizontal ruler (should it add that extra space?) [1,0,0,0,2,0,4]", "<hr data-foobar=\"zbky74okstc5wmi\">\n<hr >\n1cdauyobybltbj4i\n\nfoo <hr\n>72ti2j1ve5pl23xr\n"); +add("selser", "Horizontal ruler (should it add that extra space?) [0,4,3,4,1,4,[2]]", "<hr>504w47ur2i8gp66r\n\n89w0alf7mwhtzkt9\n\nfoo \n\nqj7pzeukhe7gy14i\n\nwcqqssn8slkdquxr bar"); +add("selser", "Horizontal ruler (should it add that extra space?) [1,0,0,0,2,0,4]", "<hr data-foobar=\"zbky74okstc5wmi\" />\n<hr >\n1cdauyobybltbj4i\n\nfoo <hr\n>72ti2j1ve5pl23xr\n"); add("selser", "Horizontal ruler (should it add that extra space?) [2,3,0,0,[4],3,2]", "o5rwiutljxjsv2t9<hr><hr >\nd35ei25187b0ggb9\n\nl2gmcvf139bsatt9\n\n<nowiki> </nowiki>bar"); -add("selser", "Horizontal ruler (should it add that extra space?) [3,3,1,0,0,0,0]", "<hr data-foobar=\"lf04si3r3323xr\">\nfoo <hr\n> bar"); +add("selser", "Horizontal ruler (should it add that extra space?) [0,4,1,4,0,4,[2]]", "<hr>bd1fbirckdkg9zfr<hr data-foobar=\"p7mw3becxflxr\" />ziabhrhonumlhaor\n\nfoo \n\nrtsgmqphxg5vcxr\n\nrj4m9beenwjnhfr bar"); +add("selser", "Horizontal ruler (should it add that extra space?) [3,3,1,0,0,0,0]", "<hr data-foobar=\"lf04si3r3323xr\" />\nfoo <hr\n> bar"); add("selser", "Horizontal ruler (should it add that extra space?) [4,0,0,3,2,3,[2]]", "x425cn04w3y833di\n<hr >w1vtj40el2x4j9k9\n\nfoo \n\n1k0l0e76yfbnjyvi bar"); -add("selser", "Horizontal ruler (should it add that extra space?) [1,3,0,3,0,3,[3]]", "<hr data-foobar=\"l0b3o734w50o1or\"><hr >foo \n"); +add("selser", "Horizontal ruler (should it add that extra space?) [1,3,0,3,0,3,[3]]", "<hr data-foobar=\"l0b3o734w50o1or\" /><hr >foo \n"); add("selser", "Horizontal ruler (should it add that extra space?) [0,0,0,0,[4],0,1]", "<hr>\n<hr >\n2bhtkpyrargj5rk9<hr\n> bar"); +add("selser", "Horizontal ruler (should it add that extra space?) [0,3,3,2,4,1,3]", "<hr>ct10s4snbyq77gb9\nvb66koxr72oqolxr<hr data-foobar=\"trqbxkno4n6irudi\">"); add("selser", "Horizontal ruler (should it add that extra space?) [3,4,4,2,3,0,0]", "o6enrwkz4kl9dx6r\n\nqc1zz05ztk9sh5mi\n\npt363zha88q9qkt9\n<hr\n> bar"); -add("selser", "Horizontal ruler (should it add that extra space?) [0,0,1,0,[3],0,3]", "<hr>\n<hr data-foobar=\"h3lfecmid64unmi\">\n<hr\n>"); -add("selser", "Horizontal ruler (should it add that extra space?) [1,2,4,2,1,0,0]", "<hr data-foobar=\"n0w4kgj63z5jnhfr\">2tme0t8y5iwfjemi\no1nlliabwat9be29\n\n77m2ud715kfyldi\nfoo <hr\n> bar"); +add("selser", "Horizontal ruler (should it add that extra space?) [0,0,1,0,[3],0,3]", "<hr>\n<hr data-foobar=\"h3lfecmid64unmi\" />\n<hr\n>"); +add("selser", "Horizontal ruler (should it add that extra space?) [1,2,4,2,1,0,0]", "<hr data-foobar=\"n0w4kgj63z5jnhfr\" />2tme0t8y5iwfjemi\no1nlliabwat9be29\n\n77m2ud715kfyldi\nfoo <hr\n> bar"); add("selser", "Horizontal ruler (should it add that extra space?) [0,0,0,4,0,3,0]", "<hr>\n<hr >3nksf5u6o17nwmi\n\nfoo \n\n<nowiki> </nowiki>bar\n"); add("selser", "Horizontal ruler (should it add that extra space?) [0,0,0,3,4,1,0]", "<hr>\n<hr >d13gtt9304e1m7vi<hr data-foobar=\"fbu9qf9y7h7aatt9\"> bar"); add("selser", "Mixed list [3,0,0]", "#** Level 3, but ordered"); -- To view, visit https://gerrit.wikimedia.org/r/325507 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ieadcc21966dc30511fd9c56365b1abfcdadee3fe Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Arlolra <abrea...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits