jenkins-bot has submitted this change and it was merged. Change subject: Remove html5 treebuilder in favour of domino's ......................................................................
Remove html5 treebuilder in favour of domino's * The html5 library did the stripping of the leading newline following pre as part of tree building. We worked around this by adding an extra newline after pre to compensate, which can now be removed. In domino, correctly, this is part of tokenizing. https://html.spec.whatwg.org/multipage/semantics.html#the-pre-element * The blacklist changes are because the above hack was adding an extra newline when the pre was in a blockquote, and hence wouldn't have any stripping. So, this fixes a bug. Change-Id: I45ffb07723a6842be6a39d52ab0165676536be0a --- M lib/wt2html/HTML5TreeBuilder.js M npm-shrinkwrap.json M package.json M tests/parserTests-blacklist.js 4 files changed, 54 insertions(+), 75 deletions(-) Approvals: Subramanya Sastry: Looks good to me, approved jenkins-bot: Verified diff --git a/lib/wt2html/HTML5TreeBuilder.js b/lib/wt2html/HTML5TreeBuilder.js index 9be9bf4..6a58fa5 100644 --- a/lib/wt2html/HTML5TreeBuilder.js +++ b/lib/wt2html/HTML5TreeBuilder.js @@ -7,8 +7,7 @@ var events = require('events'); var util = require('util'); -var DOMTreeBuilder = require('html5').DOMTreeBuilder; -var domino = require('domino'); +var HTMLParser = require('domino').impl.HTMLParser; var defines = require('./parser.defines.js'); var Util = require('../utils/Util.js').Util; var SanitizerConstants = require('./tt/Sanitizer.js').SanitizerConstants; @@ -81,24 +80,44 @@ // We only need one for every run of strings and newline tokens. this.haveTransclusionShadow = false; - if (!this._treeBuilder) { - // Set up a new tree builder. Note that when adding attributes to - // elements, this will call the public DOM method `setAttributeNS` - // with all its checks that aren't part of the HTML5 parsing spec. - this._treeBuilder = new DOMTreeBuilder(domino); - this.addListener('token', - this._treeBuilder.processToken.bind(this._treeBuilder)); + this.parser = new HTMLParser(); + this.insertToken({ type: 'DOCTYPE', name: 'html' }); + this.insertToken({ type: 'StartTag', name: 'body' }); +}; + +var types = new Map(Object.entries({ + EOF: -1, + Characters: 1, + StartTag: 2, + EndTag: 3, + Comment: 4, + DOCTYPE: 5, +})); + +// FIXME: This conversion code can be eliminated by cleaning up processToken. +TreeBuilder.prototype.insertToken = function(tok) { + var t = types.get(tok.type); + var value, arg3; + switch (tok.type) { + case 'StartTag': + case 'EndTag': + case 'DOCTYPE': + value = tok.name; + if (Array.isArray(tok.data)) { + arg3 = tok.data.map(function(a) { + return [a.nodeName, a.nodeValue]; + }); + } + break; + case 'Characters': + case 'Comment': + case 'EOF': + value = tok.data; + break; + default: + console.assert(false, "Unexpected type: " + tok.type); } - - // Reset the tree builder - this._treeBuilder.startTokenization(this); - - // At this point, domino has already created a document element for us but - // the html5 library would like to use its own (keeps an internal state of - // open elements). Remove it and process a body token to trigger rebuilding. - this.doc = this._treeBuilder.document; - this.doc.removeChild(this.doc.lastChild); - this.processToken(new TagTk('body')); + this.parser.insertToken(t, value, arg3); }; TreeBuilder.prototype.onChunk = function(tokens) { @@ -116,7 +135,7 @@ if (this.lastToken && this.lastToken.constructor !== EOFTk) { this.env.log("error", "EOFTk was lost in page", this.env.page.name); } - this.emit('document', this.doc); + this.emit('document', this.parser.document()); this.emit('end'); this.resetState(); }; @@ -171,12 +190,7 @@ case String: case NlTk: data = (token.constructor === NlTk) ? '\n' : token; - if (this.preceededByPre && data[0] === '\n') { - // Emit two newlines when preceded by a pre because the - // treebuilder will eat one. - data = '\n' + data; - } - this.emit('token', { type: 'Characters', data: data }); + this.insertToken({ type: 'Characters', data: data }); // NlTks are only fostered when accompanied by // non-whitespace. Safe to ignore. if (this.inTransclusion && this.tableDepth > 0 && @@ -185,7 +199,7 @@ // after every text node so that we can detect // fostered content that came from a transclusion. this.env.log("debug/html", this.pipelineId, "Inserting shadow transclusion meta"); - this.emit('token', { + this.insertToken({ type: 'StartTag', name: 'meta', data: [{ nodeName: "typeof", nodeValue: "mw:TransclusionShadow" }], @@ -203,22 +217,21 @@ // like the navbox if (!this.inTransclusion) { this.env.log("debug/html", this.pipelineId, "Inserting foster box meta"); - this.emit('token', { + this.insertToken({ type: 'StartTag', name: 'table', - self_closing: true, data: [{ nodeName: "typeof", nodeValue: "mw:FosterBox" }], }); } } - this.emit('token', {type: 'StartTag', name: tName, data: this._att(attribs)}); + this.insertToken({ type: 'StartTag', name: tName, data: this._att(attribs) }); this.env.log("debug/html", this.pipelineId, "Inserting shadow meta for", tName); attrs = [ { nodeName: "typeof", nodeValue: "mw:StartTag" }, { nodeName: "data-stag", nodeValue: tName + ":" + dataAttribs.tmp.tagId }, { nodeName: "data-parsoid", nodeValue: JSON.stringify(dataAttribs) }, ]; - this.emit('token', { + this.insertToken({ type: 'Comment', data: JSON.stringify({ "@type": "mw:shadow", @@ -280,7 +293,7 @@ if (tTypeOf.match(/^mw:Transclusion/)) { this.inTransclusion = /^mw:Transclusion$/.test(tTypeOf); } - this.emit('token', { + this.insertToken({ type: 'Comment', data: JSON.stringify({ '@type': tTypeOf, @@ -292,11 +305,11 @@ } var newAttrs = this._att(attribs); - this.emit('token', { type: 'StartTag', name: tName, data: newAttrs }); + this.insertToken({ type: 'StartTag', name: tName, data: newAttrs }); if (!Util.isVoidElement(tName)) { // VOID_ELEMENTS are automagically treated as self-closing by // the tree builder - this.emit('token', { type: 'EndTag', name: tName, data: newAttrs }); + this.insertToken({ type: 'EndTag', name: tName, data: newAttrs }); } break; case EndTagTk: @@ -304,7 +317,7 @@ if (tName === 'table' && this.tableDepth > 0) { this.tableDepth--; } - this.emit('token', {type: 'EndTag', name: tName}); + this.insertToken({ type: 'EndTag', name: tName }); if (dataAttribs && !dataAttribs.autoInsertedEnd) { attrs = this._att(attribs).concat([ { nodeName: "typeof", nodeValue: "mw:EndTag" }, @@ -312,7 +325,7 @@ { nodeName: "data-parsoid", nodeValue: JSON.stringify(dataAttribs) }, ]); this.env.log("debug/html", this.pipelineId, "Inserting shadow meta for", tName); - this.emit('token', { + this.insertToken({ type: 'Comment', data: JSON.stringify({ "@type": "mw:shadow", @@ -322,10 +335,10 @@ } break; case CommentTk: - this.emit('token', { type: 'Comment', data: token.value }); + this.insertToken({ type: 'Comment', data: token.value }); break; case EOFTk: - this.emit('token', { type: 'EOF' }); + this.insertToken({ type: 'EOF' }); break; default: var errors = [ @@ -342,21 +355,6 @@ // those tokens will need transclusion shadow protection again. if (token.constructor !== String && token.constructor !== NlTk) { this.haveTransclusionShadow = false; - } - - // Keep track of preceeding by pre - if (token.constructor === TagTk) { - if (token.name === "pre") { - this.preceededByPre = true; - } else if (token.name === "span" && - token.getAttribute('typeof') === "mw:Nowiki") { - // Nowikis are emitted as elements in pres - // and should be ignored for the sake of first nl eating. - } else { - this.preceededByPre = false; - } - } else { - this.preceededByPre = false; } // Store the last token diff --git a/npm-shrinkwrap.json b/npm-shrinkwrap.json index 26de604..db9ef6b 100644 --- a/npm-shrinkwrap.json +++ b/npm-shrinkwrap.json @@ -606,8 +606,8 @@ "resolved": "https://registry.npmjs.org/diff/-/diff-1.0.7.tgz" }, "domino": { - "version": "1.0.25", - "resolved": "https://registry.npmjs.org/domino/-/domino-1.0.25.tgz" + "version": "1.0.26", + "resolved": "https://registry.npmjs.org/domino/-/domino-1.0.26.tgz" }, "entities": { "version": "1.1.1", @@ -1088,20 +1088,6 @@ "gelfling": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/gelfling/-/gelfling-0.2.0.tgz" - } - } - }, - "html5": { - "version": "1.0.5", - "resolved": "https://registry.npmjs.org/html5/-/html5-1.0.5.tgz", - "dependencies": { - "html5-entities": { - "version": "1.0.0", - "resolved": "https://registry.npmjs.org/html5-entities/-/html5-entities-1.0.0.tgz" - }, - "opts": { - "version": "1.2.2", - "resolved": "https://registry.npmjs.org/opts/-/opts-1.2.2.tgz" } } }, diff --git a/package.json b/package.json index e7fe3f9..ea71182 100644 --- a/package.json +++ b/package.json @@ -12,13 +12,12 @@ "content-type": "git+https://github.com/wikimedia/content-type#master", "core-js": "^2.4.1", "diff": "^1.0.7", - "domino": "^1.0.25", + "domino": "^1.0.26", "entities": "^1.1.1", "express": "^4.14.0", "express-handlebars": "^3.0.0", "finalhandler": "^0.5.0", "gelf-stream": "^0.2.4", - "html5": "^1.0.5", "js-yaml": "^3.6.1", "mediawiki-title": "^0.5.6", "negotiator": "git+https://github.com/arlolra/negotiator#full-parse-access", diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js index 17c577e..e283b6a 100644 --- a/tests/parserTests-blacklist.js +++ b/tests/parserTests-blacklist.js @@ -1364,10 +1364,6 @@ add("selser", "Templates: Handle comments in the target [0,0,0,0,0,4,0]", "{{echo\n<!-- should be ignored -->\n|foo}}\n\n{{echo<!-- should be ignored -->\n|foo}}\n\n{{echo<!-- should be ignored -->|foo}}\n\nhlr2vnanjhflayvi\n\n{{<!-- should be ignored -->echo|foo}}"); add("selser", "4. Indent-Pre and extension tags 5", "<nowiki> </nowiki>a <tag />"); add("selser", "4. Indent-Pre and extension tags [2,0]", "z2q1uxsvdja9vn29\n\n a <tag />"); -add("selser", "Render paragraphs when indent-pre is suppressed in blocklevels [[1,3,0,0]]", "<blockquote> foo\n\n bar\n</blockquote>"); -add("selser", "Render paragraphs when indent-pre is suppressed in blocklevels [[1,0,0,0]]", "<blockquote> foo\n\n bar\n</blockquote>"); -add("selser", "Render paragraphs when indent-pre is suppressed in blocklevels [[1,0,[4],3]]", "<blockquote> foo\n\ngn7ylzy40ftj4i\n</blockquote>"); -add("selser", "Render paragraphs when indent-pre is suppressed in blocklevels [[2,4,2,0]]", "<blockquote>400w4vubqf0nqaor\n\n foo\ngzulwh9x29xgk3xrmyw8xai0kyg8ehfr\n bar\n</blockquote>"); add("selser", "5a. White-space in indent-pre [[0,0,4]]", " a<br />\n 7tmueg6dyhqia4i"); add("selser", "5a. White-space in indent-pre [1]", " a<br />\n \n \n b"); add("selser", "5a. White-space in indent-pre [[3,0,0]]", " <br />\n \n \n b"); -- To view, visit https://gerrit.wikimedia.org/r/315644 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I45ffb07723a6842be6a39d52ab0165676536be0a Gerrit-PatchSet: 8 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: C. Scott Ananian <canan...@wikimedia.org> Gerrit-Reviewer: Legoktm <legoktm.wikipe...@gmail.com> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits