[MediaWiki-commits] [Gerrit] mediawiki...parsoid[master]: Support extension tags which shadows block level elements

2016-12-06 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged.

Change subject: Support extension tags which shadows block level elements
..


Support extension tags which shadows block level elements

Change-Id: Ieadcc21966dc30511fd9c56365b1abfcdadee3fe
---
M lib/config/WikitextConstants.js
M lib/html2wt/WikitextSerializer.js
M lib/utils/Util.js
M lib/wt2html/pegTokenizer.pegjs
M tests/parserTests-blacklist.js
5 files changed, 158 insertions(+), 142 deletions(-)

Approvals:
  Subramanya Sastry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/config/WikitextConstants.js b/lib/config/WikitextConstants.js
index 430cc28..b39a872 100644
--- a/lib/config/WikitextConstants.js
+++ b/lib/config/WikitextConstants.js
@@ -174,8 +174,7 @@
"METER", "NAV", "NOSCRIPT", "OBJECT", "OL", "OPTGROUP", 
"OPTION",
"OUTPUT", "P", "PARAM", "PRE", "PROGRESS", "Q", "RB", 
"RP", "RT",
"RTC", "RUBY", "S", "SAMP", "SCRIPT", "SECTION", 
"SELECT", "SMALL",
-   // "SOURCE", Support the deprecated <source> alias for 
syntaxhighlight
-   "SPAN", "STRONG", "STYLE", "SUB", "SUMMARY", "SUP",
+   "SOURCE", "SPAN", "STRONG", "STYLE", "SUB", "SUMMARY", 
"SUP",
"TABLE", "TBODY", "TD", "TEXTAREA", "TFOOT", "TH", 
"THEAD", "TIME",
"TITLE", "TR", "TRACK", "U", "UL", "VAR", "VIDEO", 
"WBR",
]),
diff --git a/lib/html2wt/WikitextSerializer.js 
b/lib/html2wt/WikitextSerializer.js
index 3fd9f84..d8c2882 100644
--- a/lib/html2wt/WikitextSerializer.js
+++ b/lib/html2wt/WikitextSerializer.js
@@ -1128,6 +1128,8 @@
reqd = true;
break;
} else if (Consts.HTML.BlockTags.has(tagName)) {
+   // FIXME: Extension tags shadowing 
html5 tags might not
+   // have block semantics.
// Block tags on a line suppress nowikis
reqd = false;
}
diff --git a/lib/utils/Util.js b/lib/utils/Util.js
index b285c15..d86a459 100644
--- a/lib/utils/Util.js
+++ b/lib/utils/Util.js
@@ -1383,11 +1383,6 @@
}).join('');
 };
 
-Util.isHTMLElementName = function(name) {
-   name = name.toUpperCase();
-   return Consts.HTML.HTML5Tags.has(name) || 
Consts.HTML.OlderHTMLTags.has(name);
-};
-
 /**
  * Determine whether the protocol of a link is potentially valid. Use the
  * environment's per-wiki config to do so.
diff --git a/lib/wt2html/pegTokenizer.pegjs b/lib/wt2html/pegTokenizer.pegjs
index 60d6e4f..68fd213 100644
--- a/lib/wt2html/pegTokenizer.pegjs
+++ b/lib/wt2html/pegTokenizer.pegjs
@@ -72,6 +72,148 @@
 }
 };
 
+var isXMLTag = function(name, block) {
+var lName = name.toLowerCase();
+var uName = name.toUpperCase();
+
+// FIXME: These are installed extension tags which we, for some
+// historical reason, are special casing in the grammar.  Ignore them
+// here, they have their own rules.
+//
+// For <pre>, see https://gerrit.wikimedia.org/r/#/c/281076/
+// where we'll clean this up.  Notice how much we can remove!
+//
+// For <nowiki>, see https://gerrit.wikimedia.org/r/#/c/232313/
+// which has some relevant info for serialization.
+var ignoredExtTag = lName === 'pre' || lName === 'nowiki';
+
+var isInstalledExt = env.conf.wiki.extensionTags.has(lName) && 
!ignoredExtTag;
+var isIncludeTag = lName === 'includeonly' ||
+lName === 'noinclude' || lName === 'onlyinclude';
+
+var isHtmlTag = block ?
+// We need to ignore them here too because block tags have
+// higher precedence than our questionable rules.
+constants.HTML.BlockTags.has(uName) && !ignoredExtTag :
+constants.HTML.HTML5Tags.has(uName) || 
constants.HTML.OlderHTMLTags.has(uName);
+
+return isHtmlTag || isInstalledExt || isIncludeTag;
+};
+
+var maybeExtensionTag = function(t) {
+var tagName = t.name.toLowerCase();
+
+var isInstalledExt = env.conf.wiki.extensionTags.has(tagName);
+var isIncludeTag = tagName === 'includeonly' ||
+tagName === 'noinclude' || tagName === 'onlyinclude';
+
+// Extensions have higher precedence when they shadow html tags.
+if (!(isInstalledExt || isIncludeTag)) {
+return t;
+}
+
+var dp = t.dataAttribs;
+var skipLen = 0;
+
+switch (t.constructor) {
+case EndTagTk:
+return t;
+case SelfclosingTagTk:
+dp.src = input.substring(dp.tsr[0], dp.tsr[1]);
+dp.tagWidths = 

[MediaWiki-commits] [Gerrit] mediawiki...parsoid[master]: Support extension tags which shadows block level elements

2016-12-05 Thread Arlolra (Code Review)
Arlolra has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/325507

Change subject: Support extension tags which shadows block level elements
..

Support extension tags which shadows block level elements

Change-Id: Ieadcc21966dc30511fd9c56365b1abfcdadee3fe
---
M lib/utils/Util.js
M lib/wt2html/pegTokenizer.pegjs
M tests/parserTests-blacklist.js
3 files changed, 155 insertions(+), 140 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid 
refs/changes/07/325507/1

diff --git a/lib/utils/Util.js b/lib/utils/Util.js
index 590406e..0e71410 100644
--- a/lib/utils/Util.js
+++ b/lib/utils/Util.js
@@ -1377,11 +1377,6 @@
}).join('');
 };
 
-Util.isHTMLElementName = function(name) {
-   name = name.toUpperCase();
-   return Consts.HTML.HTML5Tags.has(name) || 
Consts.HTML.OlderHTMLTags.has(name);
-};
-
 /**
  * Determine whether the protocol of a link is potentially valid. Use the
  * environment's per-wiki config to do so.
diff --git a/lib/wt2html/pegTokenizer.pegjs b/lib/wt2html/pegTokenizer.pegjs
index 60d6e4f..68fd213 100644
--- a/lib/wt2html/pegTokenizer.pegjs
+++ b/lib/wt2html/pegTokenizer.pegjs
@@ -72,6 +72,148 @@
 }
 };
 
+var isXMLTag = function(name, block) {
+var lName = name.toLowerCase();
+var uName = name.toUpperCase();
+
+// FIXME: These are installed extension tags which we, for some
+// historical reason, are special casing in the grammar.  Ignore them
+// here, they have their own rules.
+//
+// For <pre>, see https://gerrit.wikimedia.org/r/#/c/281076/
+// where we'll clean this up.  Notice how much we can remove!
+//
+// For <nowiki>, see https://gerrit.wikimedia.org/r/#/c/232313/
+// which has some relevant info for serialization.
+var ignoredExtTag = lName === 'pre' || lName === 'nowiki';
+
+var isInstalledExt = env.conf.wiki.extensionTags.has(lName) && 
!ignoredExtTag;
+var isIncludeTag = lName === 'includeonly' ||
+lName === 'noinclude' || lName === 'onlyinclude';
+
+var isHtmlTag = block ?
+// We need to ignore them here too because block tags have
+// higher precedence than our questionable rules.
+constants.HTML.BlockTags.has(uName) && !ignoredExtTag :
+constants.HTML.HTML5Tags.has(uName) || 
constants.HTML.OlderHTMLTags.has(uName);
+
+return isHtmlTag || isInstalledExt || isIncludeTag;
+};
+
+var maybeExtensionTag = function(t) {
+var tagName = t.name.toLowerCase();
+
+var isInstalledExt = env.conf.wiki.extensionTags.has(tagName);
+var isIncludeTag = tagName === 'includeonly' ||
+tagName === 'noinclude' || tagName === 'onlyinclude';
+
+// Extensions have higher precedence when they shadow html tags.
+if (!(isInstalledExt || isIncludeTag)) {
+return t;
+}
+
+var dp = t.dataAttribs;
+var skipLen = 0;
+
+switch (t.constructor) {
+case EndTagTk:
+return t;
+case SelfclosingTagTk:
+dp.src = input.substring(dp.tsr[0], dp.tsr[1]);
+dp.tagWidths = [dp.tsr[1] - dp.tsr[0], 0];
+if (isIncludeTag) {
+return t;
+}
+break;
+case TagTk:
+var tsr0 = dp.tsr[0];
+var endTagRE = new RegExp("^[\\s\\S]*?(</\\s*" + tagName + "\\s*>)", "mi");
+var restOfInput = input.substring(tsr0);
+var tagContent = restOfInput.match(endTagRE);
+
+if (!tagContent) {
+dp.src = input.substring(dp.tsr[0], dp.tsr[1]);
+dp.tagWidths = [dp.tsr[1] - dp.tsr[0], 0];
+if (isIncludeTag) {
+return t;
+} else {
+// This is undefined behaviour.  The php parser currently
+// returns a tag here as well, which results in unclosed
+// extension tags that shadow html tags falling back to
+// their html equivalent.  The sanitizer will take care
+// of converting to text where necessary.  We do this to
+// simplify `hasWikitextTokens` when escaping wikitext,
+// which wants these as tokens because it's otherwise
+// lacking in context.
+return t;  // not text()
+}
+}
+
+var extSrc = tagContent[0];
+var endTagWidth = tagContent[1].length;
+
+// FIXME: This should be removed in favour of a native parser 
function
+// for `tag`, which invokes the extension handler directly.
+if (tagName === 'ref') {
+// Support 1-level nesting of <ref> tags during tokenizing.
+// <ref> tags are the exception