jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/328434 )
Change subject: Get rid of the generic_tag rule ...................................................................... Get rid of the generic_tag rule * It's only used in xmlish_tag. Change-Id: Ic16d5ef900a03df06336b3497cc6977cb0ef0eb9 --- M lib/wt2html/pegTokenizer.pegjs 1 file changed, 42 insertions(+), 41 deletions(-) Approvals: Subramanya Sastry: Looks good to me, approved jenkins-bot: Verified diff --git a/lib/wt2html/pegTokenizer.pegjs b/lib/wt2html/pegTokenizer.pegjs index b4bd3e0..f487a3c 100644 --- a/lib/wt2html/pegTokenizer.pegjs +++ b/lib/wt2html/pegTokenizer.pegjs @@ -76,6 +76,24 @@ } }; + /* ------------------------------------------------------------------------ + * Extension tags should be parsed with higher priority than anything else. + * + * The trick we use is to strip out the content inside a matching tag-pair + * and not tokenize it. The content, if it needs to parsed (for example, + * for <ref>, <*include*> tags), is parsed in a fresh tokenizer context + * which means any error correction that needs to happen is restricted to + * the scope of the extension content and doesn't spill over to the higher + * level. Ex: <math><!--foo</math>. + * + * IGNORE: {{ this just balances the blocks in this comment for pegjs + * + * This trick also lets us prevent extension content (that don't accept WT) + * from being parsed as wikitext (Ex: <math>\frac{foo\frac{bar}}</math>) + * We don't want the "}}" being treated as a template closing tag and + * closing outer templates. + * --------------------------------------------------------------------- */ + var isXMLTag = function(name, block) { var lName = name.toLowerCase(); var uName = name.toUpperCase(); @@ -1117,24 +1135,6 @@ } / & { return stops.dec('pre'); } -/* ----------------------------------------------------------------------- - * Extension tags should be parsed with higher priority than anything else. - * The trick we use is to strip out the content inside a matching tag-pair - * and not tokenize it. The content, if it needs to parsed (for example, - * for <ref>, <*include*> tags), is parsed in a fresh tokenizer context - * which means any error correction that needs to happen is restricted to - * the scope of the extension content and doesn't spill over to the higher - * level. Ex: <math><!--foo</math>. - * - * This trick also lets us prevent extension content (that don't accept WT) - * from being parsed as wikitext (Ex: <math>\frac{foo\frac{bar}}</math>) - * We don't want the "}}" being treated as a template closing tag and closing - * outer templates. - * ----------------------------------------------------------------------- */ - -xmlish_tag = - t:generic_tag & { return isXMLTag(t.name, false); } { return maybeExtensionTag(t); } - /* * Nowiki treats anything inside it as plain text. It could thus also be * defined as an extension that returns its raw input text, possibly wrapped @@ -1211,7 +1211,7 @@ tag_name_chars = [^\t\n\v />\0] tag_name = $([A-Za-z] tag_name_chars*) -generic_tag +xmlish_tag = & { // By the time we get to `doTableStuff` in the php parser, we've already // safely encoded element attributes. See 55313f4e in core. @@ -1221,13 +1221,14 @@ return stops.push('tableCellArg', false); } "<" - end:"/"? name:tag_name + end:"/"? name:$(tn:tag_name & { return isXMLTag(tn, false); }) attribs:generic_newline_attributes space_or_newline* // No need to preserve this -- canonicalize on RT via dirty diff selfclose:"/"? bad_ws:space* // No need to preserve this -- canonicalize on RT via dirty diff ">" { stops.pop('tableCellArg'); + var lcName = name.toLowerCase(); var isVoidElt = Util.isVoidElement(lcName) ? true : null; // Support </br> @@ -1248,9 +1249,27 @@ if (broken || bad_ws.length > 0) { res.dataAttribs.brokenHTMLTag = true; } - return res; + + return maybeExtensionTag(res); } / & { return stops.pop('tableCellArg'); } + +/* + * A variant of xmlish_tag, but also checks if the tag name is a block-level + * tag as defined in + * http://www.w3.org/TR/html5/syntax.html#tag-open-state and + * following paragraphs. + */ +block_tag + = "<" end:"/"? + name:$(tn:tag_name & { return isXMLTag(tn, true); }) + attribs:generic_newline_attributes + space_or_newline* + selfclose:"/"? + ">" { + var t = tu.buildXMLTag(name, name.toLowerCase(), attribs, end, selfclose, tsrOffsets()); + return [maybeExtensionTag(t)]; + } // A generic attribute that can span multiple lines. generic_newline_attribute @@ -1333,7 +1352,7 @@ // Accept insane tags-inside-attributes as attribute names. // The sanitizer will strip and shadow them for roundtripping. // Example: <hiddentext>generated with.. </hiddentext> - / &generic_tag nb:nested_block_line + / &xmlish_tag nb:nested_block_line // `nested_block_line` can return zero or more blocks. // Assure that we've got at least one, otherwise that plus // below is trouble. @@ -1371,24 +1390,6 @@ / s:$space* t:table_attribute_preprocessor_text &(space_or_newline/ eof / '!!' / '|') { return tu.getAttrVal(t, startOffset() + s.length, endOffset()); } - -/* - * A variant of generic_tag, but also checks if the tag name is a block-level - * tag as defined in - * http://www.w3.org/TR/html5/syntax.html#tag-open-state and - * following paragraphs. - */ -block_tag - = "<" end:"/"? - name:$(tn:tag_name & { return isXMLTag(tn, true); }) - attribs:generic_newline_attributes - space_or_newline* - selfclose:"/"? - ">" { - var t = tu.buildXMLTag(name, name.toLowerCase(), attribs, end, selfclose, tsrOffsets()); - return [maybeExtensionTag(t)]; - } - /********************************************************* * Lists @@ -1915,7 +1916,7 @@ /** * noinclude / includeonly / onlyinclude rules. These are normally - * handled by the generic_tag rule, except where generic tags are not + * handled by the xmlish_tag rule, except where generic tags are not * allowed- for example in directives, which are allowed in various attribute * names and -values. * -- To view, visit https://gerrit.wikimedia.org/r/328434 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ic16d5ef900a03df06336b3497cc6977cb0ef0eb9 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: C. Scott Ananian <canan...@wikimedia.org> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits