jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/379430 )
Change subject: Linter: Flag misnested tags with different behavior in HTML5 vs Tidy ...................................................................... Linter: Flag misnested tags with different behavior in HTML5 vs Tidy * This will make a different when Tidy is replaced by Remex and will also make Parsoid rendering more compatible with Tidy. Bug: T176363 Change-Id: I93d779eba9b7738c309b8a8a4a89f337bb4ac168 --- M lib/utils/DOMUtils.js M lib/wt2html/pp/handlers/linter.js M tests/mocha/linter.js 3 files changed, 140 insertions(+), 9 deletions(-) Approvals: jenkins-bot: Verified Arlolra: Looks good to me, approved diff --git a/lib/utils/DOMUtils.js b/lib/utils/DOMUtils.js index e073bde..5de498f 100644 --- a/lib/utils/DOMUtils.js +++ b/lib/utils/DOMUtils.js @@ -1244,6 +1244,12 @@ return next; }, + hasFollowingContent: function(node) { + return !DU.isBody(node) && ( + DU.nextNonSepSibling(node) || DU.hasFollowingContent(node.parentNode) + ); + }, + numNonDeletedChildNodes: function(node) { var n = 0; var child = node.firstChild; diff --git a/lib/wt2html/pp/handlers/linter.js b/lib/wt2html/pp/handlers/linter.js index 305b023..fa50353 100644 --- a/lib/wt2html/pp/handlers/linter.js +++ b/lib/wt2html/pp/handlers/linter.js @@ -17,6 +17,62 @@ var Util = require('../../../utils/Util.js').Util; var Consts = require('../../../config/WikitextConstants.js').WikitextConstants; +/* ------------------------------------------------------------------------------ + * We are trying to find HTML5 tags that have different behavior compared to HTML4 + * in some misnesting scenarios around wikitext paragraphs. + * + * Ex: Input: <p><small>a</p><p>b</small></p> + * Tidy output: <p><small>a</small></p><p><small>b</small></p> + * HTML5 output: <p><small>a</small></p><p><small>b</small></p> + * + * So, all good here. + * But, see how output changes when we use <span> instead + * + * Ex: Input: <p><span>a</p><p>b</span></p> + * Tidy output: <p><span>a</span></p><p><span>b</span></p> + * HTML5 output: <p><span>a</span></p><p>b</p> + * + * The source wikitext is "<span>a\n\nb</span>". The difference persists even + * when you have "<span>a\n\n<div>b</div>" or "<span>a\n\n{|\n|x\n|}\nbar". + * + * This is because Tidy seems to be doing the equivalent of HTM5-treebuilder's + * active formatting element reconstruction step on all *inline* elements. + * However, HTML5 parsers only do that on formatting elements. So, we need + * to compute which HTML5 tags are subject to this differential behavior. + * + * We compute that by excluding the following tags from the list of all HTML5 tags + * - If our sanitizer doesn't whitelist them, they will be escaped => ignore them + * - HTML4 block tags are excluded (obviously) + * - Void tags don't matter since they cannot wrap anything (obviously) + * - Active formatting elements have special handling in the HTML5 tree building + * algorithm where they are reconstructed to wrap all originally intended content. + * (ex: <small> above) + * + * Here is the list of 22 HTML5 tags that are affected: + * ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, KBD, MARK, + * Q, RB, RP, RT, RTC, RUBY, SAMP, SPAN, SUB, SUP, TIME, VAR + * + * https://phabricator.wikimedia.org/T176363#3628173 verifies that this list of + * tags all demonstrate this behavior. + * ------------------------------------------------------------------------------ */ +var tagsWithChangedMisnestingBehavior; +function getTagsWithChangedMisnestingBehavior() { + if (!tagsWithChangedMisnestingBehavior) { + tagsWithChangedMisnestingBehavior = new Set(); + Consts.HTML.HTML5Tags.forEach(function(t) { + if (Consts.Sanitizer.TagWhiteList.has(t) && + !Consts.HTML.BlockTags.has(t) && + !Consts.HTML.FormattingTags.has(t) && + !Consts.HTML.VoidTags.has(t) + ) { + tagsWithChangedMisnestingBehavior.add(t); + } + }); + } + + return tagsWithChangedMisnestingBehavior; +} + /* * Log Transclusion with more than one parts * Ex - {{table-start}} @@ -176,16 +232,31 @@ templateInfo: templateInfo, params: { name: cNodeName }, }; - var adjNode = getNextMatchingNode(c, c); - if (adjNode) { - var adjDp = DU.getDataParsoid(adjNode); - if (!adjDp.tmp) { - adjDp.tmp = {}; + + // FIXME: This literal html marker check is strictly not required + // (a) we've already checked that above and know that isQuoteElt is + // not one of our tags. + // (b) none of the tags in the list have native wikitext syntax => + // they will show up as literal html tags. + // But, in the interest of long-term maintenance in the face of + // changes (to wikitext or html specs), let us make it explicit. + if (DU.hasLiteralHTMLMarker(dp) && + getTagsWithChangedMisnestingBehavior().has(c.nodeName) && + DU.hasFollowingContent(c) + ) { + env.log('lint/html5-misnesting', lintObj); + } else { + var adjNode = getNextMatchingNode(c, c); + if (adjNode) { + var adjDp = DU.getDataParsoid(adjNode); + if (!adjDp.tmp) { + adjDp.tmp = {}; + } + adjDp.tmp.linted = true; + env.log('lint/misnested-tag', lintObj); + } else if (DU.hasLiteralHTMLMarker(dp)) { + env.log('lint/missing-end-tag', lintObj); } - adjDp.tmp.linted = true; - env.log('lint/misnested-tag', lintObj); - } else if (DU.hasLiteralHTMLMarker(dp)) { - env.log('lint/missing-end-tag', lintObj); } } diff --git a/tests/mocha/linter.js b/tests/mocha/linter.js index 857bd91..ef303fa 100644 --- a/tests/mocha/linter.js +++ b/tests/mocha/linter.js @@ -639,4 +639,58 @@ }); }); + describe('HTML5 MISNESTED TAGS', function() { + it('should not trigger html5 misnesting if there is no following content', function() { + return parseWT('<del>foo\nbar').then(function(result) { + result.should.have.length(1); + result[0].should.have.a.property("type", "missing-end-tag"); + result[0].should.have.a.property("params"); + result[0].params.should.have.a.property("name", "del"); + }); + }); + it('should trigger html5 misnesting correctly', function() { + return parseWT('<del>foo\n\nbar').then(function(result) { + result.should.have.length(1); + result[0].should.have.a.property("type", "html5-misnesting"); + result[0].dsr.should.deep.equal([ 0, 8, 5, 0 ]); + result[0].should.have.a.property("params"); + result[0].params.should.have.a.property("name", "del"); + }); + }); + it('should trigger html5 misnesting for span (1)', function() { + return parseWT('<span>foo\n\nbar').then(function(result) { + result.should.have.length(1); + result[0].should.have.a.property("type", "html5-misnesting"); + result[0].dsr.should.deep.equal([ 0, 9, 6, 0 ]); + result[0].should.have.a.property("params"); + result[0].params.should.have.a.property("name", "span"); + }); + }); + it('should trigger html5 misnesting for span (2)', function() { + return parseWT('<span>foo\n\n<div>bar</div>').then(function(result) { + result.should.have.length(1); + result[0].should.have.a.property("type", "html5-misnesting"); + result[0].dsr.should.deep.equal([ 0, 9, 6, 0 ]); + result[0].should.have.a.property("params"); + result[0].params.should.have.a.property("name", "span"); + }); + }); + it('should trigger html5 misnesting for span (3)', function() { + return parseWT('<span>foo\n\n{|\n|x\n|}\nboo').then(function(result) { + result.should.have.length(1); + result[0].should.have.a.property("type", "html5-misnesting"); + result[0].dsr.should.deep.equal([ 0, 9, 6, 0 ]); + result[0].should.have.a.property("params"); + result[0].params.should.have.a.property("name", "span"); + }); + }); + it('should not trigger html5 misnesting for formatting tags', function() { + return parseWT('<small>foo\n\nbar').then(function(result) { + result.should.have.length(1); + result[0].should.have.a.property("type", "missing-end-tag"); + result[0].should.have.a.property("params"); + result[0].params.should.have.a.property("name", "small"); + }); + }); + }); }); -- To view, visit https://gerrit.wikimedia.org/r/379430 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I93d779eba9b7738c309b8a8a4a89f337bb4ac168 Gerrit-PatchSet: 14 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: C. Scott Ananian <canan...@wikimedia.org> Gerrit-Reviewer: Legoktm <lego...@member.fsf.org> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits