jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/379430 )

Change subject: Linter: Flag misnested tags with different behavior in HTML5 vs 
Tidy
......................................................................


Linter: Flag misnested tags with different behavior in HTML5 vs Tidy

* This will make a different when Tidy is replaced by Remex and
  will also make Parsoid rendering more compatible with Tidy.

Bug: T176363
Change-Id: I93d779eba9b7738c309b8a8a4a89f337bb4ac168
---
M lib/utils/DOMUtils.js
M lib/wt2html/pp/handlers/linter.js
M tests/mocha/linter.js
3 files changed, 140 insertions(+), 9 deletions(-)

Approvals:
  jenkins-bot: Verified
  Arlolra: Looks good to me, approved



diff --git a/lib/utils/DOMUtils.js b/lib/utils/DOMUtils.js
index e073bde..5de498f 100644
--- a/lib/utils/DOMUtils.js
+++ b/lib/utils/DOMUtils.js
@@ -1244,6 +1244,12 @@
                return next;
        },
 
+       hasFollowingContent: function(node) {
+               return !DU.isBody(node) && (
+                       DU.nextNonSepSibling(node) || 
DU.hasFollowingContent(node.parentNode)
+               );
+       },
+
        numNonDeletedChildNodes: function(node) {
                var n = 0;
                var child = node.firstChild;
diff --git a/lib/wt2html/pp/handlers/linter.js 
b/lib/wt2html/pp/handlers/linter.js
index 305b023..fa50353 100644
--- a/lib/wt2html/pp/handlers/linter.js
+++ b/lib/wt2html/pp/handlers/linter.js
@@ -17,6 +17,62 @@
 var Util = require('../../../utils/Util.js').Util;
 var Consts = require('../../../config/WikitextConstants.js').WikitextConstants;
 
+/* 
------------------------------------------------------------------------------
+ * We are trying to find HTML5 tags that have different behavior compared to 
HTML4
+ * in some misnesting scenarios around wikitext paragraphs.
+ *
+ * Ex: Input: <p><small>a</p><p>b</small></p>
+ *     Tidy  output: <p><small>a</small></p><p><small>b</small></p>
+ *     HTML5 output: <p><small>a</small></p><p><small>b</small></p>
+ *
+ * So, all good here.
+ * But, see how output changes when we use <span> instead
+ *
+ * Ex: Input: <p><span>a</p><p>b</span></p>
+ *     Tidy  output: <p><span>a</span></p><p><span>b</span></p>
+ *     HTML5 output: <p><span>a</span></p><p>b</p>
+ *
+ * The source wikitext is "<span>a\n\nb</span>". The difference persists even
+ * when you have "<span>a\n\n<div>b</div>" or "<span>a\n\n{|\n|x\n|}\nbar".
+ *
+ * This is because Tidy seems to be doing the equivalent of HTM5-treebuilder's
+ * active formatting element reconstruction step on all *inline* elements.
+ * However, HTML5 parsers only do that on formatting elements. So, we need
+ * to compute which HTML5 tags are subject to this differential behavior.
+ *
+ * We compute that by excluding the following tags from the list of all HTML5 
tags
+ * - If our sanitizer doesn't whitelist them, they will be escaped => ignore 
them
+ * - HTML4 block tags are excluded (obviously)
+ * - Void tags don't matter since they cannot wrap anything (obviously)
+ * - Active formatting elements have special handling in the HTML5 tree 
building
+ *   algorithm where they are reconstructed to wrap all originally intended 
content.
+ *   (ex: <small> above)
+ *
+ * Here is the list of 22 HTML5 tags that are affected:
+ *    ABBR, BDI, BDO, CITE, DATA, DEL, DFN, INS, KBD, MARK,
+ *    Q, RB, RP, RT, RTC, RUBY, SAMP, SPAN, SUB, SUP, TIME, VAR
+ *
+ * https://phabricator.wikimedia.org/T176363#3628173 verifies that this list of
+ * tags all demonstrate this behavior.
+ * 
------------------------------------------------------------------------------ 
*/
+var tagsWithChangedMisnestingBehavior;
+function getTagsWithChangedMisnestingBehavior() {
+       if (!tagsWithChangedMisnestingBehavior) {
+               tagsWithChangedMisnestingBehavior = new Set();
+               Consts.HTML.HTML5Tags.forEach(function(t) {
+                       if (Consts.Sanitizer.TagWhiteList.has(t) &&
+                               !Consts.HTML.BlockTags.has(t) &&
+                               !Consts.HTML.FormattingTags.has(t) &&
+                               !Consts.HTML.VoidTags.has(t)
+                       ) {
+                               tagsWithChangedMisnestingBehavior.add(t);
+                       }
+               });
+       }
+
+       return tagsWithChangedMisnestingBehavior;
+}
+
 /*
  * Log Transclusion with more than one parts
  * Ex - {{table-start}}
@@ -176,16 +232,31 @@
                                templateInfo: templateInfo,
                                params: { name: cNodeName },
                        };
-                       var adjNode = getNextMatchingNode(c, c);
-                       if (adjNode) {
-                               var adjDp = DU.getDataParsoid(adjNode);
-                               if (!adjDp.tmp) {
-                                       adjDp.tmp = {};
+
+                       // FIXME: This literal html marker check is strictly 
not required
+                       // (a) we've already checked that above and know that 
isQuoteElt is
+                       //     not one of our tags.
+                       // (b) none of the tags in the list have native 
wikitext syntax =>
+                       //     they will show up as literal html tags.
+                       // But, in the interest of long-term maintenance in the 
face of
+                       // changes (to wikitext or html specs), let us make it 
explicit.
+                       if (DU.hasLiteralHTMLMarker(dp) &&
+                               
getTagsWithChangedMisnestingBehavior().has(c.nodeName) &&
+                               DU.hasFollowingContent(c)
+                       ) {
+                               env.log('lint/html5-misnesting', lintObj);
+                       } else {
+                               var adjNode = getNextMatchingNode(c, c);
+                               if (adjNode) {
+                                       var adjDp = DU.getDataParsoid(adjNode);
+                                       if (!adjDp.tmp) {
+                                               adjDp.tmp = {};
+                                       }
+                                       adjDp.tmp.linted = true;
+                                       env.log('lint/misnested-tag', lintObj);
+                               } else if (DU.hasLiteralHTMLMarker(dp)) {
+                                       env.log('lint/missing-end-tag', 
lintObj);
                                }
-                               adjDp.tmp.linted = true;
-                               env.log('lint/misnested-tag', lintObj);
-                       } else if (DU.hasLiteralHTMLMarker(dp)) {
-                               env.log('lint/missing-end-tag', lintObj);
                        }
                }
 
diff --git a/tests/mocha/linter.js b/tests/mocha/linter.js
index 857bd91..ef303fa 100644
--- a/tests/mocha/linter.js
+++ b/tests/mocha/linter.js
@@ -639,4 +639,58 @@
                });
        });
 
+       describe('HTML5 MISNESTED TAGS', function() {
+               it('should not trigger html5 misnesting if there is no 
following content', function() {
+                       return parseWT('<del>foo\nbar').then(function(result) {
+                               result.should.have.length(1);
+                               result[0].should.have.a.property("type", 
"missing-end-tag");
+                               result[0].should.have.a.property("params");
+                               result[0].params.should.have.a.property("name", 
"del");
+                       });
+               });
+               it('should trigger html5 misnesting correctly', function() {
+                       return parseWT('<del>foo\n\nbar').then(function(result) 
{
+                               result.should.have.length(1);
+                               result[0].should.have.a.property("type", 
"html5-misnesting");
+                               result[0].dsr.should.deep.equal([ 0, 8, 5, 0 ]);
+                               result[0].should.have.a.property("params");
+                               result[0].params.should.have.a.property("name", 
"del");
+                       });
+               });
+               it('should trigger html5 misnesting for span (1)', function() {
+                       return 
parseWT('<span>foo\n\nbar').then(function(result) {
+                               result.should.have.length(1);
+                               result[0].should.have.a.property("type", 
"html5-misnesting");
+                               result[0].dsr.should.deep.equal([ 0, 9, 6, 0 ]);
+                               result[0].should.have.a.property("params");
+                               result[0].params.should.have.a.property("name", 
"span");
+                       });
+               });
+               it('should trigger html5 misnesting for span (2)', function() {
+                       return 
parseWT('<span>foo\n\n<div>bar</div>').then(function(result) {
+                               result.should.have.length(1);
+                               result[0].should.have.a.property("type", 
"html5-misnesting");
+                               result[0].dsr.should.deep.equal([ 0, 9, 6, 0 ]);
+                               result[0].should.have.a.property("params");
+                               result[0].params.should.have.a.property("name", 
"span");
+                       });
+               });
+               it('should trigger html5 misnesting for span (3)', function() {
+                       return 
parseWT('<span>foo\n\n{|\n|x\n|}\nboo').then(function(result) {
+                               result.should.have.length(1);
+                               result[0].should.have.a.property("type", 
"html5-misnesting");
+                               result[0].dsr.should.deep.equal([ 0, 9, 6, 0 ]);
+                               result[0].should.have.a.property("params");
+                               result[0].params.should.have.a.property("name", 
"span");
+                       });
+               });
+               it('should not trigger html5 misnesting for formatting tags', 
function() {
+                       return 
parseWT('<small>foo\n\nbar').then(function(result) {
+                               result.should.have.length(1);
+                               result[0].should.have.a.property("type", 
"missing-end-tag");
+                               result[0].should.have.a.property("params");
+                               result[0].params.should.have.a.property("name", 
"small");
+                       });
+               });
+       });
 });

-- 
To view, visit https://gerrit.wikimedia.org/r/379430
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I93d779eba9b7738c309b8a8a4a89f337bb4ac168
Gerrit-PatchSet: 14
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org>
Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org>
Gerrit-Reviewer: C. Scott Ananian <canan...@wikimedia.org>
Gerrit-Reviewer: Legoktm <lego...@member.fsf.org>
Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to