[MediaWiki-commits] [Gerrit] mediawiki...parsoid[master]: Remove html5 treebuilder in favour of domino's

jenkins-bot (Code Review) Sat, 15 Oct 2016 12:14:49 -0700

jenkins-bot has submitted this change and it was merged.

Change subject: Remove html5 treebuilder in favour of domino's
......................................................................



Remove html5 treebuilder in favour of domino's

 * The html5 library did the stripping of the leading newline following
   pre as part of tree building.  We worked around this by adding an
   extra newline after pre to compensate, which can now be removed.  In
   domino, correctly, this is part of tokenizing.

   https://html.spec.whatwg.org/multipage/semantics.html#the-pre-element

 * The blacklist changes are because the above hack was adding an extra
   newline when the pre was in a blockquote, and hence wouldn't have any
   stripping.  So, this fixes a bug.

Change-Id: I45ffb07723a6842be6a39d52ab0165676536be0a
---
M lib/wt2html/HTML5TreeBuilder.js
M npm-shrinkwrap.json
M package.json
M tests/parserTests-blacklist.js
4 files changed, 54 insertions(+), 75 deletions(-)

Approvals:
  Subramanya Sastry: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/wt2html/HTML5TreeBuilder.js b/lib/wt2html/HTML5TreeBuilder.js
index 9be9bf4..6a58fa5 100644
--- a/lib/wt2html/HTML5TreeBuilder.js
+++ b/lib/wt2html/HTML5TreeBuilder.js
@@ -7,8 +7,7 @@
 
 var events = require('events');
 var util = require('util');
-var DOMTreeBuilder = require('html5').DOMTreeBuilder;
-var domino = require('domino');
+var HTMLParser = require('domino').impl.HTMLParser;
 var defines = require('./parser.defines.js');
 var Util = require('../utils/Util.js').Util;
 var SanitizerConstants = require('./tt/Sanitizer.js').SanitizerConstants;
@@ -81,24 +80,44 @@
        // We only need one for every run of strings and newline tokens.
        this.haveTransclusionShadow = false;
 
-       if (!this._treeBuilder) {
-               // Set up a new tree builder.  Note that when adding attributes 
to
-               // elements, this will call the public DOM method 
`setAttributeNS`
-               // with all its checks that aren't part of the HTML5 parsing 
spec.
-               this._treeBuilder = new DOMTreeBuilder(domino);
-               this.addListener('token',
-                       this._treeBuilder.processToken.bind(this._treeBuilder));
+       this.parser = new HTMLParser();
+       this.insertToken({ type: 'DOCTYPE', name: 'html' });
+       this.insertToken({ type: 'StartTag', name: 'body' });
+};
+
+var types = new Map(Object.entries({
+       EOF: -1,
+       Characters: 1,
+       StartTag: 2,
+       EndTag: 3,
+       Comment: 4,
+       DOCTYPE: 5,
+}));
+
+// FIXME: This conversion code can be eliminated by cleaning up processToken.
+TreeBuilder.prototype.insertToken = function(tok) {
+       var t = types.get(tok.type);
+       var value, arg3;
+       switch (tok.type) {
+               case 'StartTag':
+               case 'EndTag':
+               case 'DOCTYPE':
+                       value = tok.name;
+                       if (Array.isArray(tok.data)) {
+                               arg3 = tok.data.map(function(a) {
+                                       return [a.nodeName, a.nodeValue];
+                               });
+                       }
+                       break;
+               case 'Characters':
+               case 'Comment':
+               case 'EOF':
+                       value = tok.data;
+                       break;
+               default:
+                       console.assert(false, "Unexpected type: " + tok.type);
        }
-
-       // Reset the tree builder
-       this._treeBuilder.startTokenization(this);
-
-       // At this point, domino has already created a document element for us 
but
-       // the html5 library would like to use its own (keeps an internal state 
of
-       // open elements). Remove it and process a body token to trigger 
rebuilding.
-       this.doc = this._treeBuilder.document;
-       this.doc.removeChild(this.doc.lastChild);
-       this.processToken(new TagTk('body'));
+       this.parser.insertToken(t, value, arg3);
 };
 
 TreeBuilder.prototype.onChunk = function(tokens) {
@@ -116,7 +135,7 @@
        if (this.lastToken && this.lastToken.constructor !== EOFTk) {
                this.env.log("error", "EOFTk was lost in page", 
this.env.page.name);
        }
-       this.emit('document', this.doc);
+       this.emit('document', this.parser.document());
        this.emit('end');
        this.resetState();
 };
@@ -171,12 +190,7 @@
                case String:
                case NlTk:
                        data = (token.constructor === NlTk) ? '\n' : token;
-                       if (this.preceededByPre && data[0] === '\n') {
-                               // Emit two newlines when preceded by a pre 
because the
-                               // treebuilder will eat one.
-                               data = '\n' + data;
-                       }
-                       this.emit('token', { type: 'Characters', data: data });
+                       this.insertToken({ type: 'Characters', data: data });
                        // NlTks are only fostered when accompanied by
                        // non-whitespace. Safe to ignore.
                        if (this.inTransclusion && this.tableDepth > 0 &&
@@ -185,7 +199,7 @@
                                // after every text node so that we can detect
                                // fostered content that came from a 
transclusion.
                                this.env.log("debug/html", this.pipelineId, 
"Inserting shadow transclusion meta");
-                               this.emit('token', {
+                               this.insertToken({
                                        type: 'StartTag',
                                        name: 'meta',
                                        data: [{ nodeName: "typeof", nodeValue: 
"mw:TransclusionShadow" }],
@@ -203,22 +217,21 @@
                                // like the navbox
                                if (!this.inTransclusion) {
                                        this.env.log("debug/html", 
this.pipelineId, "Inserting foster box meta");
-                                       this.emit('token', {
+                                       this.insertToken({
                                                type: 'StartTag',
                                                name: 'table',
-                                               self_closing: true,
                                                data: [{ nodeName: "typeof", 
nodeValue: "mw:FosterBox" }],
                                        });
                                }
                        }
-                       this.emit('token', {type: 'StartTag', name: tName, 
data: this._att(attribs)});
+                       this.insertToken({ type: 'StartTag', name: tName, data: 
this._att(attribs) });
                        this.env.log("debug/html", this.pipelineId, "Inserting 
shadow meta for", tName);
                        attrs = [
                                { nodeName: "typeof", nodeValue: "mw:StartTag" 
},
                                { nodeName: "data-stag", nodeValue: tName + ":" 
+ dataAttribs.tmp.tagId },
                                { nodeName: "data-parsoid", nodeValue: 
JSON.stringify(dataAttribs) },
                        ];
-                       this.emit('token', {
+                       this.insertToken({
                                type: 'Comment',
                                data: JSON.stringify({
                                        "@type": "mw:shadow",
@@ -280,7 +293,7 @@
                                        if (tTypeOf.match(/^mw:Transclusion/)) {
                                                this.inTransclusion = 
/^mw:Transclusion$/.test(tTypeOf);
                                        }
-                                       this.emit('token', {
+                                       this.insertToken({
                                                type: 'Comment',
                                                data: JSON.stringify({
                                                        '@type': tTypeOf,
@@ -292,11 +305,11 @@
                        }
 
                        var newAttrs = this._att(attribs);
-                       this.emit('token', { type: 'StartTag', name: tName, 
data: newAttrs });
+                       this.insertToken({ type: 'StartTag', name: tName, data: 
newAttrs });
                        if (!Util.isVoidElement(tName)) {
                                // VOID_ELEMENTS are automagically treated as 
self-closing by
                                // the tree builder
-                               this.emit('token', { type: 'EndTag', name: 
tName, data: newAttrs });
+                               this.insertToken({ type: 'EndTag', name: tName, 
data: newAttrs });
                        }
                        break;
                case EndTagTk:
@@ -304,7 +317,7 @@
                        if (tName === 'table' && this.tableDepth > 0) {
                                this.tableDepth--;
                        }
-                       this.emit('token', {type: 'EndTag', name: tName});
+                       this.insertToken({ type: 'EndTag', name: tName });
                        if (dataAttribs && !dataAttribs.autoInsertedEnd) {
                                attrs = this._att(attribs).concat([
                                        { nodeName: "typeof", nodeValue: 
"mw:EndTag" },
@@ -312,7 +325,7 @@
                                        { nodeName: "data-parsoid", nodeValue: 
JSON.stringify(dataAttribs) },
                                ]);
                                this.env.log("debug/html", this.pipelineId, 
"Inserting shadow meta for", tName);
-                               this.emit('token', {
+                               this.insertToken({
                                        type: 'Comment',
                                        data: JSON.stringify({
                                                "@type": "mw:shadow",
@@ -322,10 +335,10 @@
                        }
                        break;
                case CommentTk:
-                       this.emit('token', { type: 'Comment', data: token.value 
});
+                       this.insertToken({ type: 'Comment', data: token.value 
});
                        break;
                case EOFTk:
-                       this.emit('token', { type: 'EOF' });
+                       this.insertToken({ type: 'EOF' });
                        break;
                default:
                        var errors = [
@@ -342,21 +355,6 @@
        // those tokens will need transclusion shadow protection again.
        if (token.constructor !== String && token.constructor !== NlTk) {
                this.haveTransclusionShadow = false;
-       }
-
-       // Keep track of preceeding by pre
-       if (token.constructor === TagTk) {
-               if (token.name === "pre") {
-                       this.preceededByPre = true;
-               } else if (token.name === "span" &&
-                       token.getAttribute('typeof') === "mw:Nowiki") {
-                       // Nowikis are emitted as elements in pres
-                       // and should be ignored for the sake of first nl 
eating.
-               } else {
-                       this.preceededByPre = false;
-               }
-       } else {
-               this.preceededByPre = false;
        }
 
        // Store the last token
diff --git a/npm-shrinkwrap.json b/npm-shrinkwrap.json
index 26de604..db9ef6b 100644
--- a/npm-shrinkwrap.json
+++ b/npm-shrinkwrap.json
@@ -606,8 +606,8 @@
       "resolved": "https://registry.npmjs.org/diff/-/diff-1.0.7.tgz";
     },
     "domino": {
-      "version": "1.0.25",
-      "resolved": "https://registry.npmjs.org/domino/-/domino-1.0.25.tgz";
+      "version": "1.0.26",
+      "resolved": "https://registry.npmjs.org/domino/-/domino-1.0.26.tgz";
     },
     "entities": {
       "version": "1.1.1",
@@ -1088,20 +1088,6 @@
         "gelfling": {
           "version": "0.2.0",
           "resolved": 
"https://registry.npmjs.org/gelfling/-/gelfling-0.2.0.tgz";
-        }
-      }
-    },
-    "html5": {
-      "version": "1.0.5",
-      "resolved": "https://registry.npmjs.org/html5/-/html5-1.0.5.tgz";,
-      "dependencies": {
-        "html5-entities": {
-          "version": "1.0.0",
-          "resolved": 
"https://registry.npmjs.org/html5-entities/-/html5-entities-1.0.0.tgz";
-        },
-        "opts": {
-          "version": "1.2.2",
-          "resolved": "https://registry.npmjs.org/opts/-/opts-1.2.2.tgz";
         }
       }
     },
diff --git a/package.json b/package.json
index e7fe3f9..ea71182 100644
--- a/package.json
+++ b/package.json
@@ -12,13 +12,12 @@
     "content-type": "git+https://github.com/wikimedia/content-type#master";,
     "core-js": "^2.4.1",
     "diff": "^1.0.7",
-    "domino": "^1.0.25",
+    "domino": "^1.0.26",
     "entities": "^1.1.1",
     "express": "^4.14.0",
     "express-handlebars": "^3.0.0",
     "finalhandler": "^0.5.0",
     "gelf-stream": "^0.2.4",
-    "html5": "^1.0.5",
     "js-yaml": "^3.6.1",
     "mediawiki-title": "^0.5.6",
     "negotiator": 
"git+https://github.com/arlolra/negotiator#full-parse-access";,
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 17c577e..e283b6a 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -1364,10 +1364,6 @@
 add("selser", "Templates: Handle comments in the target [0,0,0,0,0,4,0]", 
"{{echo\n<!-- should be ignored -->\n|foo}}\n\n{{echo<!-- should be ignored 
-->\n|foo}}\n\n{{echo<!-- should be ignored 
-->|foo}}\n\nhlr2vnanjhflayvi\n\n{{<!-- should be ignored -->echo|foo}}");
 add("selser", "4. Indent-Pre and extension tags 5", "<nowiki> </nowiki>a <tag 
/>");
 add("selser", "4. Indent-Pre and extension tags [2,0]", "z2q1uxsvdja9vn29\n\n 
a <tag />");
-add("selser", "Render paragraphs when indent-pre is suppressed in blocklevels 
[[1,3,0,0]]", "<blockquote> foo\n\n bar\n</blockquote>");
-add("selser", "Render paragraphs when indent-pre is suppressed in blocklevels 
[[1,0,0,0]]", "<blockquote> foo\n\n bar\n</blockquote>");
-add("selser", "Render paragraphs when indent-pre is suppressed in blocklevels 
[[1,0,[4],3]]", "<blockquote> foo\n\ngn7ylzy40ftj4i\n</blockquote>");
-add("selser", "Render paragraphs when indent-pre is suppressed in blocklevels 
[[2,4,2,0]]", "<blockquote>400w4vubqf0nqaor\n\n 
foo\ngzulwh9x29xgk3xrmyw8xai0kyg8ehfr\n bar\n</blockquote>");
 add("selser", "5a. White-space in indent-pre [[0,0,4]]", " a<br />\n 
7tmueg6dyhqia4i");
 add("selser", "5a. White-space in indent-pre [1]", " a<br />\n \n \n b");
 add("selser", "5a. White-space in indent-pre [[3,0,0]]", " <br />\n \n \n b");

-- 
To view, visit https://gerrit.wikimedia.org/r/315644
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I45ffb07723a6842be6a39d52ab0165676536be0a
Gerrit-PatchSet: 8
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Arlolra <abrea...@wikimedia.org>
Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org>
Gerrit-Reviewer: C. Scott Ananian <canan...@wikimedia.org>
Gerrit-Reviewer: Legoktm <legoktm.wikipe...@gmail.com>
Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org>
Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] mediawiki...parsoid[master]: Remove html5 treebuilder in favour of domino's

Reply via email to