jenkins-bot has submitted this change and it was merged. Change subject: Allow extensions to handle specific contentmodels. ......................................................................
Allow extensions to handle specific contentmodels. Some extensions (for example, Extension:ProofreadPage) do more than register specific extension tags: they also hook the parser to declare responsibility for a specific contentmodel (like "proofread-page" or "json"). These are https://www.mediawiki.org/wiki/Category:ContentHandler_extensions (as opposed to https://www.mediawiki.org/wiki/Category:Tag_extensions). See https://www.mediawiki.org/wiki/Manual:ContentHandler for more details. We abstract the top-level parser entry points to allow dispatching to extensions to parse alternative content models and add a core extension as a demonstration that handles the "json" content model, rendering it in DOM as an HTML table (as the json content model in mediawiki core does). Bug: T48580 Bug: T133320 Change-Id: I7ca31c99de8e04b1359bc521df121db0eb69e384 --- M bin/parse.js M bin/parserTests.js M bin/roundtrip-test.js M lib/api/apiUtils.js M lib/api/routes.js M lib/config/MWParserEnvironment.js M lib/config/ParsoidConfig.js M lib/config/WikiConfig.js M lib/config/extapi.js A lib/ext/JSON/index.js M lib/jsapi.js M lib/utils/DOMUtils.js M lib/wt2html/DOMPostProcessor.js M package.json M tests/mocha/api.js M tests/mocha/parse.js M tests/mocha/test.helpers.js M tests/mockAPI.js 18 files changed, 544 insertions(+), 89 deletions(-) Approvals: Arlolra: Looks good to me, approved jenkins-bot: Verified diff --git a/bin/parse.js b/bin/parse.js index ac81cb9..39c939b 100755 --- a/bin/parse.js +++ b/bin/parse.js @@ -73,6 +73,11 @@ 'boolean': false, 'default': ParserEnv.prototype.defaultPageName, }, + 'contentmodel': { + description: 'The content model of the input. Defaults to "wikitext" but extensions may support others (for example, "json").', + 'boolean': false, + 'default': null, + }, 'oldid': { description: 'Oldid of the given page.', 'boolean': false, @@ -171,7 +176,9 @@ if (pb) { DU.applyPageBundle(doc, pb); } - return DU.serializeDOM(env, doc.body, argv.selser).then(function(out) { + var handler = env.getContentHandler(argv.contentmodel); + return handler.fromHTML(env, doc.body, argv.selser) + .then(function(out) { if (argv.html2wt || argv.wt2wt) { return { trailingNL: true, out: out, env: env }; } else { @@ -182,8 +189,8 @@ startsAtWikitext = function(argv, env, input) { env.setPageSrcInfo(input); - // Kick off the pipeline by feeding the input into the parser pipeline - return env.pipelineFactory.parse(env.page.src) + var handler = env.getContentHandler(argv.contentmodel); + return handler.toHTML(env) .then(function(doc) { if (argv.lint) { env.log("end/parse"); @@ -298,7 +305,12 @@ var target = env.normalizeAndResolvePageTitle(); return TemplateRequest .setPageSrcInfo(env, target, argv.oldid) - .then(function() { return env.page.src; }); + .then(function() { + // Preserve fetched contentmodel. + argv.contentmodel = argv.contentmodel || + env.page.meta.revision.contentmodel; + return env.page.src; + }); }); }).then(function(str) { str = str.replace(/\r/g, ''); diff --git a/bin/parserTests.js b/bin/parserTests.js index c0aaef4..f05c36f 100755 --- a/bin/parserTests.js +++ b/bin/parserTests.js @@ -21,6 +21,7 @@ var yargs = require('yargs'); var Alea = require('alea'); var DU = require('../lib/utils/DOMUtils.js').DOMUtils; +var Promise = require('../lib/utils/promise.js'); var ParsoidLogger = require('../lib/logger/ParsoidLogger.js').ParsoidLogger; var PEG = require('pegjs'); var Util = require('../lib/utils/Util.js').Util; @@ -417,30 +418,27 @@ * @param {string|null} processWikitextCB.res */ ParserTests.prototype.convertHtml2Wt = function(options, mode, item, body, processWikitextCB) { - var startsAtWikitext = mode === 'wt2wt' || mode === 'wt2html' || mode === 'selser'; var self = this; - var cb = function(err, wt) { - self.env.setPageSrcInfo(null); - self.env.page.dom = null; - processWikitextCB(err, wt); - }; - try { + return Promise.try(function() { + var startsAtWikitext = mode === 'wt2wt' || mode === 'wt2html' || mode === 'selser'; if (startsAtWikitext) { // FIXME: All tests share an env. // => we need to initialize this each time over here. - this.env.page.dom = DU.parseHTML(item.cachedBODYstr).body; + self.env.page.dom = DU.parseHTML(item.cachedBODYstr).body; } if (mode === 'selser') { - this.env.setPageSrcInfo(item.wikitext); + self.env.setPageSrcInfo(item.wikitext); } else if (booleanOption(options.use_source) && startsAtWikitext) { - this.env.setPageSrcInfo(item.wikitext); + self.env.setPageSrcInfo(item.wikitext); } else { - this.env.setPageSrcInfo(null); + self.env.setPageSrcInfo(null); } - DU.serializeDOM(this.env, body, (mode === 'selser'), cb); - } catch (err) { - cb(err, null); - } + var handler = self.env.getContentHandler(); + return handler.fromHTML(self.env, body, (mode === 'selser')); + }).finally(function() { + self.env.setPageSrcInfo(null); + self.env.page.dom = null; + }).nodify(processWikitextCB); }; /** @@ -890,7 +888,7 @@ ParserTests.prototype.convertWt2Html = function(mode, wikitext, processHtmlCB) { var env = this.env; env.setPageSrcInfo(wikitext); - env.pipelineFactory.parse(env.page.src) + env.getContentHandler().toHTML(env) .then(function(doc) { return doc.body; }) diff --git a/bin/roundtrip-test.js b/bin/roundtrip-test.js index 77e5305..e408b2b 100755 --- a/bin/roundtrip-test.js +++ b/bin/roundtrip-test.js @@ -534,9 +534,10 @@ var offsets = Diff.convertDiffToOffsetPairs(diff); if (!diff.length || !offsets.length) { return []; } + var contentmodel = data.contentmodel || 'wikitext'; var options = Object.assign({ wt2html: true, - data: { wikitext: data.newWt }, + data: { wikitext: data.newWt, contentmodel: contentmodel }, }, parsoidOptions); return parsoidPost(profile, options).then(function(body) { data.newHTML = body.html; @@ -619,11 +620,12 @@ // oldid for later use in selser. data.oldid = res.request.path.replace(/^(.*)\//, ''); data.oldWt = body; + data.contentmodel = res.headers['x-contentmodel'] || 'wikitext'; // First, fetch the HTML for the requested page's wikitext var opts = Object.assign({ wt2html: true, recordSizes: true, - data: { wikitext: data.oldWt }, + data: { wikitext: data.oldWt, contentmodel: data.contentmodel }, }, parsoidOptions); return parsoidPost(profile, opts); }).then(function(body) { @@ -636,6 +638,7 @@ recordSizes: true, data: { html: data.oldHTML.body, + contentmodel: data.contentmodel, original: { 'data-parsoid': data.oldDp, 'data-mw': data.oldMw, @@ -662,6 +665,7 @@ oldid: data.oldid, data: { html: newDocument.outerHTML, + contentmodel: data.contentmodel, original: { 'data-parsoid': data.oldDp, 'data-mw': data.oldMw, diff --git a/lib/api/apiUtils.js b/lib/api/apiUtils.js index 175c375..e3d77e9 100644 --- a/lib/api/apiUtils.js +++ b/lib/api/apiUtils.js @@ -159,7 +159,8 @@ // Re-parse the HTML to uncover foster-parenting issues doc = domino.createDocument(doc.outerHTML); - return DU.serializeDOM(env, doc.body, useSelser).then(function(out) { + var handler = env.getContentHandler(); + return handler.fromHTML(env, doc.body, useSelser).then(function(out) { // Strip selser trigger comment out = out.replace(/<!--rtSelserEditTestComment-->\n*$/, ''); @@ -494,6 +495,7 @@ apiUtils.wt2htmlRes = function(env, res, html, pb) { if (env.pageBundle) { var response = { + contentmodel: env.page.meta.revision.contentmodel, html: { headers: { 'content-type': apiUtils.htmlContentType(env) }, body: html, diff --git a/lib/api/routes.js b/lib/api/routes.js index 92fbad5..9a56651 100644 --- a/lib/api/routes.js +++ b/lib/api/routes.js @@ -320,7 +320,7 @@ return TemplateRequest.setPageSrcInfo(env, target, oldid).then(function() { env.log('info', 'started parsing'); - return env.pipelineFactory.parse(env.page.src); + return env.getContentHandler().toHTML(env); }) .then(apiUtils.roundTripDiff.bind(null, env, req, res, false)) // .timeout(REQ_TIMEOUT) @@ -348,7 +348,7 @@ return TemplateRequest.setPageSrcInfo(env, target, oldid).then(function() { env.log('info', 'started parsing'); - return env.pipelineFactory.parse(env.page.src); + return env.getContentHandler().toHTML(env); }).then(function(doc) { // strip newlines from the html var html = doc.innerHTML.replace(/[\r\n]/g, ''); @@ -378,7 +378,7 @@ return TemplateRequest.setPageSrcInfo(env, target, oldid).then(function() { env.log('info', 'started parsing'); - return env.pipelineFactory.parse(env.page.src); + return env.getContentHandler().toHTML(env); }).then(function(doc) { doc = DU.parseHTML(DU.toXML(doc)); var comment = doc.createComment('rtSelserEditTestComment'); @@ -412,7 +412,7 @@ env.setPageSrcInfo(req.body.content); env.log('info', 'started parsing'); - return env.pipelineFactory.parse(env.page.src) + return env.getContentHandler().toHTML(env) .then(apiUtils.roundTripDiff.bind(null, env, req, res, false)) .then(apiUtils.rtResponse.bind(null, env, req, res)) .catch(function(err) { @@ -426,6 +426,7 @@ var wt2html = Promise.method(function(req, res, wt) { var env = res.locals.env; + var opts = res.locals.opts; var oldid = res.locals.oldid; var target = env.normalizeAndResolvePageTitle(); @@ -475,6 +476,9 @@ var p2; if (typeof wikitext === 'string') { env.setPageSrcInfo(wikitext); + if (opts.contentmodel) { + env.page.meta.revision.contentmodel = opts.contentmodel; + } // Don't cache requests when wt is set in case somebody uses // GET for wikitext parsing @@ -492,7 +496,7 @@ env.page.name = ''; } - p2 = env.pipelineFactory.parse(env.page.src); + p2 = env.getContentHandler().toHTML(env); } else if (oldid) { // Indicate the MediaWiki revision in a header as well for // ease of extraction in clients. @@ -505,7 +509,7 @@ metrics.timing('wt2html.pageWithOldid.size.input', env.page.src.length); } - p2 = env.pipelineFactory.parse(env.page.src) + p2 = env.getContentHandler().toHTML(env) .tap(function() { if (req.headers.cookie) { // Don't cache requests with a session. @@ -560,6 +564,10 @@ env.page.reset(); env.page.meta.revision.revid = res.locals.oldid; + env.page.meta.revision.contentmodel = + opts.contentmodel || + (opts.original && opts.original.contentmodel) || + env.page.meta.revision.contentmodel; env.bumpSerializerResourceUse('htmlSize', html.length); env.log('info', 'started serializing'); @@ -660,7 +668,8 @@ var hasOldId = !!env.page.meta.revision.revid; var useSelser = hasOldId && env.conf.parsoid.useSelser; - return DU.serializeDOM(env, doc.body, useSelser) + var handler = env.getContentHandler(); + return handler.fromHTML(env, doc.body, useSelser) // .timeout(REQ_TIMEOUT) .then(function(output) { if (metrics) { @@ -687,6 +696,15 @@ if (env.originalVersion === null) { return apiUtils.fatalRequest(env, 'Content-type of revision html is missing.', 400); } + + // Set the contentmodel here for downgrades. + // Reuse will overwrite it when setting the src. + if (!env.page.meta) { + env.page.meta = { revision: {} }; + } + env.page.meta.revision.contentmodel = + (revision && revision.contentmodel) || + env.page.meta.revision.contentmodel; // Downgrade (2 -> 1) if (revision === opts.original && // Maybe provide a stronger assertion. @@ -716,6 +734,9 @@ return apiUtils.redirectToOldid(req, res); } apiUtils.setHeader(res, env, 'content-type', apiUtils.wikitextContentType(env)); + if (env.page.meta && env.page.meta.revision && env.page.meta.revision.contentmodel) { + apiUtils.setHeader(res, env, 'x-contentmodel', env.page.meta.revision.contentmodel); + } apiUtils.sendResponse(res, env, env.page.src); }); } else { diff --git a/lib/config/MWParserEnvironment.js b/lib/config/MWParserEnvironment.js index d25f2e7..dfccf46 100644 --- a/lib/config/MWParserEnvironment.js +++ b/lib/config/MWParserEnvironment.js @@ -322,7 +322,7 @@ * @param {String|Object} srcOrMetadata page source or metadata */ MWParserEnvironment.prototype.setPageSrcInfo = function(srcOrMetadata) { - if (typeof (srcOrMetadata) === 'string' || srcOrMetadata === null) { + if (typeof srcOrMetadata === 'string' || !srcOrMetadata) { this.page.reset(); this.page.src = srcOrMetadata || ''; return; @@ -773,6 +773,27 @@ } }; +/** + * @method + * + * Get an appropriate content handler, given a contentmodel. + * + * @param {String} [forceContentModel] An optional content model + * which will override whatever the source specifies. + * @return {Object} An appropriate content handler with `toHTML` and `fromHTML` + * methods. + */ +MWParserEnvironment.prototype.getContentHandler = function(forceContentModel) { + var contentmodel = forceContentModel || + this.page.meta.revision.contentmodel || + 'wikitext'; + if (!this.conf.wiki.extContentModel.has(contentmodel)) { + this.log('error', 'Unknown contentmodel', contentmodel); + contentmodel = 'wikitext'; + } + return this.conf.wiki.extContentModel.get(contentmodel); +}; + if (typeof module === "object") { module.exports.MWParserEnvironment = MWParserEnvironment; diff --git a/lib/config/ParsoidConfig.js b/lib/config/ParsoidConfig.js index 337923c..10a5df5 100644 --- a/lib/config/ParsoidConfig.js +++ b/lib/config/ParsoidConfig.js @@ -464,6 +464,9 @@ // Give them some default extensions. if (!Array.isArray(apiConf.extensions)) { // Native support for certain extensions (Cite, etc) + // Note that in order to remain compatible with mediawiki core, + // core extensions (for example, for the JSON content model) + // must take precedence over other extensions. apiConf.extensions = Util.clone(this.defaultNativeExtensions); /* Include global user extensions */ ParsoidConfig._collectExtensions( @@ -578,7 +581,11 @@ try { if (!fs.statSync(base).isDirectory()) { return; /* not dir */} } catch (e) { return; /* no file there */ } - fs.readdirSync(base).forEach(function(d) { + var files = fs.readdirSync(base); + // Sort! To ensure that we have a repeatable order in which we load + // and process extensions. + files.sort(); + files.forEach(function(d) { var p = isNative ? path.join(base, d) : path.join(base, d, 'parsoid'); try { if (!fs.statSync(p).isDirectory()) { return; /* not dir */ } diff --git a/lib/config/WikiConfig.js b/lib/config/WikiConfig.js index 06cb9d0..905d89d 100644 --- a/lib/config/WikiConfig.js +++ b/lib/config/WikiConfig.js @@ -7,6 +7,7 @@ var semver = require('semver'); var baseConfig = require('./baseconfig/enwiki.json').query; var JSUtils = require('../utils/jsutils.js').JSUtils; +var DU = require('../utils/DOMUtils.js').DOMUtils; var Util = require('../utils/Util.js').Util; // Make sure our base config is never modified @@ -498,6 +499,17 @@ // Register native extension handlers second to overwrite the above. this.extensionPostProcessors = []; this.extensionStyles = new Set(); + this.extContentModel = new Map(); + this.extContentModel.set('wikitext', { + toHTML: function(env) { + // Default: wikitext parser. + return env.pipelineFactory.parse(env.page.src); + }, + fromHTML: function(env, body, useSelser) { + // Default: wikitext serializer. + return DU.serializeDOM(env, body, useSelser); + }, + }); mwApiConf.extensions.forEach(function(Ext) { var ext = new Ext(); var tags = ext.config.hasOwnProperty('tags') ? ext.config.tags : []; @@ -515,6 +527,12 @@ this.extensionStyles.add(s); }, this); } + Object.keys(ext.config.contentmodels || {}).forEach(function(cm) { + // For compatibility with mediawiki core, the first + // registered extension wins. + if (this.extContentModel.has(cm)) { return; } + this.extContentModel.set(cm, ext.config.contentmodels[cm]); + }, this); }, this); // Function hooks on this wiki, indexed by their normalized form diff --git a/lib/config/extapi.js b/lib/config/extapi.js index 2f5a4f7..02f3dbc 100644 --- a/lib/config/extapi.js +++ b/lib/config/extapi.js @@ -30,6 +30,7 @@ // functions are changed. Util: require('../utils/Util.js').Util, DOMUtils: require('../utils/DOMUtils.js').DOMUtils, + addMetaData: require('../wt2html/DOMPostProcessor.js').DOMPostProcessor.addMetaData, defines: require('../wt2html/parser.defines.js'), }; }, diff --git a/lib/ext/JSON/index.js b/lib/ext/JSON/index.js new file mode 100644 index 0000000..37797ae --- /dev/null +++ b/lib/ext/JSON/index.js @@ -0,0 +1,246 @@ +/* ---------------------------------------------------------------------- + * This is a demonstration of content model handling in extensions for + * Parsoid. It implements the "json" content model, to allow editing + * JSON data structures using Visual Editor. It represents the JSON + * structure as a nested table. + * ---------------------------------------------------------------------- */ +'use strict'; + +var ParsoidExtApi = module.parent.require('./extapi.js').versionCheck('^0.5.1'); +var DU = ParsoidExtApi.DOMUtils; +var Promise = ParsoidExtApi.Promise; +var addMetaData = ParsoidExtApi.addMetaData; + +/** + * Native Parsoid implementation of the "json" contentmodel. + */ +var JSONExt = function() { + this.config = { + contentmodels: { + json: this, + }, + }; +}; + +var PARSE_ERROR_HTML = + '<!DOCTYPE html><html>' + + '<body>' + + '<table data-mw=\'{"errors":[{"key":"bad-json"}]}\' typeof="mw:Error">' + + '</body>'; + +// JSON to HTML +// Implementation matches that from includes/content/JsonContent.php in +// mediawiki core, except that we add some additional classes to distinguish +// value types. +JSONExt.prototype.toHTML = Promise.method(function(env) { + var document = DU.parseHTML('<!DOCTYPE html><html><body>'); + var rootValueTable; + var objectTable; + var objectRow; + var arrayTable; + var valueCell; + var primitiveValue; + var src; + + rootValueTable = function(parent, val) { + if (Array.isArray(val)) { + // Wrap arrays in another array so they're visually boxed in a + // container. Otherwise they are visually indistinguishable from + // a single value. + return arrayTable(parent, [ val ]); + } + if (val && typeof val === "object") { + return objectTable(parent, val); + } + parent.innerHTML = + '<table class="mw-json mw-json-single-value"><tbody><tr><td>'; + return primitiveValue(parent.querySelector('td'), val); + }; + objectTable = function(parent, val) { + parent.innerHTML = '<table class="mw-json mw-json-object"><tbody>'; + var tbody = parent.firstElementChild.firstElementChild; + var keys = Object.keys(val); + if (keys.length) { + keys.forEach(function(k) { + objectRow(tbody, k, val[k]); + }); + } else { + tbody.innerHTML = + '<tr><td class="mw-json-empty">'; + } + }; + objectRow = function(parent, key, val) { + var tr = document.createElement('tr'); + if (key !== undefined) { + var th = document.createElement('th'); + th.textContent = key; + tr.appendChild(th); + } + valueCell(tr, val); + parent.appendChild(tr); + }; + arrayTable = function(parent, val) { + parent.innerHTML = '<table class="mw-json mw-json-array"><tbody>'; + var tbody = parent.firstElementChild.firstElementChild; + if (val.length) { + for (var i = 0; i < val.length; i++) { + objectRow(tbody, undefined, val[i]); + } + } else { + tbody.innerHTML = + '<tr><td class="mw-json-empty">'; + } + }; + valueCell = function(parent, val) { + var td = document.createElement('td'); + if (Array.isArray(val)) { + arrayTable(td, val); + } else if (val && typeof val === 'object') { + objectTable(td, val); + } else { + td.classList.add('value'); + primitiveValue(td, val); + } + parent.appendChild(td); + }; + primitiveValue = function(parent, val) { + if (val === null) { + parent.classList.add('mw-json-null'); + } else if (val === true || val === false) { + parent.classList.add('mw-json-boolean'); + } else if (typeof val === 'number') { + parent.classList.add('mw-json-number'); + } else if (typeof val === 'string') { + parent.classList.add('mw-json-string'); + } + parent.textContent = '' + val; + }; + + try { + src = JSON.parse(env.page.src); + rootValueTable(document.body, src); + } catch (e) { + document = DU.parseHTML(PARSE_ERROR_HTML); + } + // We're responsible for running the standard DOMPostProcessor on our + // resulting document. + if (env.pageBundle) { + DU.setDataParsoid(document, { + pagebundle: { + parsoid: { counter: -1, ids: {} }, + mw: { ids: {} }, + }, + }); + DU.visitDOM(document.body, DU.storeDataAttribs, { + storeInPageBundle: env.pageBundle, + env: env, + }); + } + addMetaData(env, document); + return document; +}); + +// HTML to JSON +JSONExt.prototype.fromHTML = Promise.method(function(env, body, useSelser) { + var rootValueTable; + var objectTable; + var objectRow; + var arrayTable; + var valueCell; + var primitiveValue; + + console.assert(DU.isBody(body), 'Expected a body node.'); + + rootValueTable = function(el) { + if (el.classList.contains('mw-json-single-value')) { + return primitiveValue(el.querySelector('tr > td')); + } else if (el.classList.contains('mw-json-array')) { + return arrayTable(el)[0]; + } else { + return objectTable(el); + } + }; + objectTable = function(el) { + console.assert(el.classList.contains('mw-json-object')); + var tbody = el; + if ( + tbody.firstElementChild && + tbody.firstElementChild.tagName === 'TBODY' + ) { + tbody = tbody.firstElementChild; + } + var rows = tbody.children; + var obj = {}; + var empty = rows.length === 0 || ( + rows[0].firstElementChild && + rows[0].firstElementChild.classList.contains('mw-json-empty') + ); + if (!empty) { + for (var i = 0; i < rows.length; i++) { + objectRow(rows[i], obj, undefined); + } + } + return obj; + }; + objectRow = function(tr, obj, key) { + var td = tr.firstElementChild; + if (key === undefined) { + key = td.textContent; + td = td.nextElementSibling; + } + obj[key] = valueCell(td); + }; + arrayTable = function(el) { + console.assert(el.classList.contains('mw-json-array')); + var tbody = el; + if ( + tbody.firstElementChild && + tbody.firstElementChild.tagName === 'TBODY' + ) { + tbody = tbody.firstElementChild; + } + var rows = tbody.children; + var arr = []; + var empty = rows.length === 0 || ( + rows[0].firstElementChild && + rows[0].firstElementChild.classList.contains('mw-json-empty') + ); + if (!empty) { + for (var i = 0; i < rows.length; i++) { + objectRow(rows[i], arr, i); + } + } + return arr; + }; + valueCell = function(el) { + console.assert(el.tagName === 'TD'); + var table = el.firstElementChild; + if (table && table.classList.contains('mw-json-array')) { + return arrayTable(table); + } else if (table && table.classList.contains('mw-json-object')) { + return objectTable(table); + } else { + return primitiveValue(el); + } + }; + primitiveValue = function(el) { + if (el.classList.contains('mw-json-null')) { + return null; + } else if (el.classList.contains('mw-json-boolean')) { + return /true/.test(el.textContent); + } else if (el.classList.contains('mw-json-number')) { + return +el.textContent; + } else if (el.classList.contains('mw-json-string')) { + return '' + el.textContent; + } else { + return undefined; // shouldn't happen. + } + }; + var table = body.firstElementChild; + console.assert(table && table.tagName === 'TABLE'); + return JSON.stringify(rootValueTable(table), null, 4); +}); + +if (typeof module === "object") { + module.exports = JSONExt; +} diff --git a/lib/jsapi.js b/lib/jsapi.js index 0888f32..4c8d166 100644 --- a/lib/jsapi.js +++ b/lib/jsapi.js @@ -34,7 +34,7 @@ body.appendChild(nodes[i].cloneNode(true)); } } - return DU.serializeDOM(env, body, false); + return env.getContentHandler().fromHTML(env, body, false); }; // toString helper diff --git a/lib/utils/DOMUtils.js b/lib/utils/DOMUtils.js index c2fbd0e..d44c928 100644 --- a/lib/utils/DOMUtils.js +++ b/lib/utils/DOMUtils.js @@ -2642,7 +2642,59 @@ /** * @method * - * The main serializer handler. + * Fetch prior DOM for selser. This is factored out of + * DU.serializeDOM so that it can be reused by alternative + * content handlers which support selser. + * + * @param {Object} env The environment. + * @param {Boolean} useSelser Use the selective serializer, or not. + * @return {Promise} a promise that is resolved after selser information + * has been loaded. + */ +DOMUtils.fetchSelser = function(env, useSelser) { + var hasOldId = !!env.page.meta.revision.revid; + var needsContent = useSelser && hasOldId && (env.page.src === null); + var needsOldDOM = useSelser && !(env.page.dom || env.page.domdiff); + + var p = Promise.resolve(); + if (needsContent) { + p = p.then(function() { + var target = env.normalizeAndResolvePageTitle(); + return TemplateRequest.setPageSrcInfo( + env, target, env.page.meta.revision.revid + ).catch(function(err) { + env.log('error', 'Error while fetching page source.'); + }); + }); + } + if (needsOldDOM) { + p = p.then(function() { + if (env.page.src === null) { + // The src fetch failed or we never had an oldid. + // We'll just fallback to non-selser. + return; + } + return env.getContentHandler().toHTML(env) + .then(function(doc) { + env.page.dom = DU.parseHTML(DU.toXML(doc)).body; + }) + .catch(function(err) { + env.log('error', 'Error while parsing original DOM.'); + }); + }); + } + + return p; +}; + +/** + * @method + * + * The main serializer from DOM to *wikitext*. + * + * If you could be handling non-wikitext content, use + * `env.getContentHandler().fromHTML(env, body, useSelser)` instead. + * See {@link MWParserEnvironment#getContentHandler}. * * @param {Object} env The environment. * @param {Node} body The document body to serialize. @@ -2657,52 +2709,9 @@ SelectiveSerializer = require('../html2wt/SelectiveSerializer.js') .SelectiveSerializer; } - console.assert(DU.isBody(body), 'Expected a body node.'); - var hasOldId = !!env.page.meta.revision.revid; - var needsWt = useSelser && hasOldId && (env.page.src === null); - var needsOldDOM = useSelser && !(env.page.dom || env.page.domdiff); - - var steps = []; - if (needsWt) { - steps.push(function() { - var target = env.normalizeAndResolvePageTitle(); - return TemplateRequest.setPageSrcInfo( - env, target, env.page.meta.revision.revid - ).catch(function(err) { - env.log('error', 'Error while fetching page source.'); - }); - }); - } - if (needsOldDOM) { - steps.push(function() { - if (env.page.src === null) { - // The src fetch failed or we never had an oldid. - // We'll just fallback to non-selser. - return; - } - return env.pipelineFactory.parse(env.page.src) - .then(function(doc) { - env.page.dom = DU.parseHTML(DU.toXML(doc)).body; - }) - .catch(function(err) { - env.log('error', 'Error while parsing original DOM.'); - }); - }); - } - - // If we can, perform these steps in parallel (w/ map). - var p; - if (!useSelser) { - p = Promise.resolve(); - } else { - p = Promise.reduce(steps, function(prev, func) { - return func(); - }, null); - } - - return p.then(function() { + return DOMUtils.fetchSelser(env, useSelser).then(function() { var Serializer = useSelser ? SelectiveSerializer : WikitextSerializer; var serializer = new Serializer({ env: env }); // TODO(arlolra): There's probably an opportunity to refactor callers diff --git a/lib/wt2html/DOMPostProcessor.js b/lib/wt2html/DOMPostProcessor.js index 08d5180..35f7d58 100644 --- a/lib/wt2html/DOMPostProcessor.js +++ b/lib/wt2html/DOMPostProcessor.js @@ -187,7 +187,7 @@ DOMPostProcessor.prototype.resetState = function(opts) { this.atTopLevel = opts && opts.toplevel; - this.displayTitle = null; + this.env.page.meta.displayTitle = null; }; /** @@ -206,7 +206,7 @@ // Set title to display when present (last one wins). if (DU.hasNodeName(node, "meta") && node.getAttribute("property") === "mw:PageProp/displaytitle") { - this.displayTitle = node.getAttribute("content"); + env.page.meta.displayTitle = node.getAttribute("content"); } } else if (DU.isComment(node) && /^\{[^]+\}$/.test(node.data)) { // Convert serialized meta tags back from comments. @@ -239,9 +239,8 @@ return true; }; -DOMPostProcessor.prototype.addMetaData = function(document) { - var env = this.env; - +// FIXME: consider moving to DOMUtils or MWParserEnvironment. +DOMPostProcessor.addMetaData = function(env, document) { // add <head> element if it was missing if (!document.head) { document.documentElement. @@ -320,7 +319,7 @@ appendToHead(document, 'link', { rel: 'dc:isVersionOf', href: wikiPageUrl }); - document.title = this.displayTitle || env.page.meta.title || ''; + document.title = env.page.meta.displayTitle || env.page.meta.title || ''; // Add base href pointing to the wiki root appendToHead(document, 'base', { href: env.conf.wiki.baseURI }); @@ -441,11 +440,10 @@ // For sub-pipeline documents, we are done. // For the top-level document, we generate <head> and add it. if (this.atTopLevel) { - this.addMetaData(document); + DOMPostProcessor.addMetaData(env, document); if (psd.traceFlags && psd.traceFlags.indexOf('time') !== -1) { env.printTimeProfile(); } - if (psd.dumpFlags && psd.dumpFlags.indexOf('wt2html:limits') !== -1) { env.printParserResourceUsage({'HTML Size': document.outerHTML.length}); } diff --git a/package.json b/package.json index ea71182..8b7c07b 100644 --- a/package.json +++ b/package.json @@ -57,7 +57,7 @@ "dump-tokenizer": "node lib/wt2html/tokenizer.js", "mocha": "mocha --opts tests/mocha/mocha.opts tests/mocha", "parserTests": "node bin/parserTests.js --wt2html --wt2wt --html2wt --html2html --selser --no-color --quiet --blacklist", - "roundtrip": "node bin/roundtrip-test.js -c 'Barack Obama' && node bin/roundtrip-test.js -c --prefix frwiki Chope && node bin/roundtrip-test.js -c --xml Parkour", + "roundtrip": "node bin/roundtrip-test.js -c 'Barack Obama' && node bin/roundtrip-test.js -c --prefix frwiki Chope && node bin/roundtrip-test.js -c --xml Parkour && node bin/roundtrip-test.js -c --domain www.mediawiki.org --oldid 2170316 'User:Legoktm/test_this_is_json'", "toolcheck": "bin/toolcheck.sh", "test": "npm run nsp && npm run lint && npm run parserTests && npm run mocha", "cover-mocha": "istanbul cover _mocha --dir ./coverage/mocha -- --opts tests/mocha/mocha.opts tests/mocha", diff --git a/tests/mocha/api.js b/tests/mocha/api.js index 6af99b5..e3f6d55 100644 --- a/tests/mocha/api.js +++ b/tests/mocha/api.js @@ -416,10 +416,28 @@ .end(done); }); + it('should get from a title and revision (html, json content)', function(done) { + request(api) + .get(mockDomain + '/v3/page/html/JSON_Page/101') + .expect(validHtmlResponse(function(doc) { + doc.body.firstChild.nodeName.should.equal('TABLE'); + })) + .end(done); + }); + it('should get from a title and revision (pagebundle)', function(done) { request(api) .get(mockDomain + '/v3/page/pagebundle/Main_Page/1') .expect(validPageBundleResponse()) + .end(done); + }); + + it('should get from a title and revision (pagebundle, json content)', function(done) { + request(api) + .get(mockDomain + '/v3/page/pagebundle/JSON_Page/101') + .expect(validPageBundleResponse(function(doc) { + doc.body.firstChild.nodeName.should.equal('TABLE'); + })) .end(done); }); @@ -442,6 +460,19 @@ .end(done); }); + it('should accept json contentmodel as a string for html', function(done) { + request(api) + .post(mockDomain + '/v3/transform/wikitext/to/html/') + .send({ + wikitext: '{"1":2}', + contentmodel: 'json', + }) + .expect(validHtmlResponse(function(doc) { + doc.body.firstChild.nodeName.should.equal('TABLE'); + })) + .end(done); + }); + it('should accept wikitext as a string for pagebundle', function(done) { request(api) .post(mockDomain + '/v3/transform/wikitext/to/pagebundle/') @@ -450,6 +481,20 @@ }) .expect(validPageBundleResponse(function(doc) { doc.body.firstChild.nodeName.should.equal('H2'); + })) + .end(done); + }); + + it('should accept json contentmodel as a string for pagebundle', function(done) { + request(api) + .post(mockDomain + '/v3/transform/wikitext/to/pagebundle/') + .send({ + wikitext: '{"1":2}', + contentmodel: 'json', + }) + .expect(validPageBundleResponse(function(doc) { + doc.body.firstChild.nodeName.should.equal('TABLE'); + should.not.exist(doc.querySelector('*[typeof="mw:Error"]')); })) .end(done); }); @@ -753,6 +798,17 @@ .end(done); }); + it('should accept html for json contentmodel as a string', function(done) { + request(api) + .post(mockDomain + '/v3/transform/html/to/wikitext/') + .send({ + html: '<!DOCTYPE html>\n<html prefix="dc: http://purl.org/dc/terms/ mw: http://mediawiki.org/rdf/"><head prefix="mwr: http://en.wikipedia.org/wiki/Special:Redirect/"><meta charset="utf-8"/><meta property="mw:articleNamespace" content="0"/><meta property="mw:html:version" content="1.2.1"/><meta property="mw:data-parsoid:version" content="0.0.2"/><link rel="dc:isVersionOf" href="//en.wikipedia.org/wiki/Main_Page"/><title></title><base href="//en.wikipedia.org/wiki/"/><link rel="stylesheet" href="//en.wikipedia.org/w/load.php?modules=mediawiki.legacy.commonPrint,shared|mediawiki.skinning.elements|mediawiki.skinning.content|mediawiki.skinning.interface|skins.vector.styles|site|mediawiki.skinning.content.parsoid|ext.cite.style&only=styles&skin=vector"/></head><body lang="en" class="mw-content-ltr sitedir-ltr ltr mw-body mw-body-content mediawiki" dir="ltr"><table class="mw-json mw-json-object"><tbody><tr><th>a</th><td class="value mw-json-number">4</td></tr><tr><th>b</th><td class="value mw-json-number">3</td></tr></tbody></table></body></html>', + contentmodel: 'json', + }) + .expect(validWikitextResponse('{\n "a": 4,\n "b": 3\n}')) + .end(done); + }); + it('should accept html with headers', function(done) { request(api) .post(mockDomain + '/v3/transform/html/to/wikitext/') diff --git a/tests/mocha/parse.js b/tests/mocha/parse.js index fd82f8e..a65f2f5 100644 --- a/tests/mocha/parse.js +++ b/tests/mocha/parse.js @@ -47,6 +47,36 @@ }); }); + it('should support json contentmodel', function() { + var opts = { contentmodel: 'json' }; + var testval = {a: "a", b: [2, true, ""], c: null}; + return parse(JSON.stringify(testval), opts).then(function(doc) { + doc.should.have.property('nodeName', '#document'); + doc.outerHTML.startsWith('<!DOCTYPE html><html').should.equal(true); + doc.outerHTML.endsWith('</body></html>').should.equal(true); + // verify that body has only one <html> tag, one <body> tag, etc. + doc.childNodes.length.should.equal(2);// <!DOCTYPE> and <html> + doc.firstChild.nodeName.should.equal('html'); + doc.lastChild.nodeName.should.equal('HTML'); + // <html> children should be <head> and <body> + var html = doc.documentElement; + html.childNodes.length.should.equal(2); + html.firstChild.nodeName.should.equal('HEAD'); + html.lastChild.nodeName.should.equal('BODY'); + // <body> should have one child, <table> + var body = doc.body; + body.childElementCount.should.equal(1); + body.firstElementChild.nodeName.should.equal('TABLE'); + var table = doc.body.firstElementChild; + table.classList.contains('mw-json').should.equal(true); + // Now convert back to JSON + return serialize(doc, null, opts); + }).then(function(result) { + var v = JSON.parse(result); // shouldn't throw an error! + v.should.eql(testval); + }); + }); + ['no subpages', 'subpages'].forEach(function(desc, subpages) { describe('should handle page titles with embedded ? (' + desc + ')', function() { var linktests = [ diff --git a/tests/mocha/test.helpers.js b/tests/mocha/test.helpers.js index 7838bc3..b028c2f 100644 --- a/tests/mocha/test.helpers.js +++ b/tests/mocha/test.helpers.js @@ -12,7 +12,10 @@ env = options.tweakEnv(env) || env; } env.setPageSrcInfo(src); - return env.pipelineFactory.parse(env.page.src) + if (options.contentmodel) { + env.page.meta.revision.contentmodel = options.contentmodel; + } + return env.getContentHandler().toHTML(env) .then(function(doc) { // linter tests need the env object return { env: env, doc: doc }; @@ -30,11 +33,17 @@ if (options.tweakEnv) { env = options.tweakEnv(env) || env; } + if (!env.page.meta) { + env.page.meta = { revision: {} }; + } + if (options.contentmodel) { + env.page.meta.revision.contentmodel = options.contentmodel; + } pb = pb || DU.extractPageBundle(doc); if (pb) { DU.applyPageBundle(doc, pb); } - return DU.serializeDOM(env, doc.body, false); + return env.getContentHandler().fromHTML(env, doc.body, false); }); }; diff --git a/tests/mockAPI.js b/tests/mockAPI.js index 3729893..55b4ba1 100644 --- a/tests/mockAPI.js +++ b/tests/mockAPI.js @@ -151,6 +151,27 @@ }, }; +var jsonPage = { + query: { + pages: { + '101': { + pageid: 101, + ns: 0, + title: 'JSON_Page', + revisions: [ + { + revid: 101, + parentid: 0, + contentmodel: 'json', + contentformat: 'text/json', + '*': '[1]', + }, + ], + }, + }, + }, +}; + var fnames = { 'Image:Foobar.jpg': 'Foobar.jpg', 'File:Foobar.jpg': 'Foobar.jpg', @@ -251,6 +272,8 @@ return cb(null , largePage); } else if (body.revids === '100' || body.titles === 'Reuse_Page') { return cb(null , reusePage); + } else if (body.revids === '101' || body.titles === 'JSON_Page') { + return cb(null , jsonPage); } } -- To view, visit https://gerrit.wikimedia.org/r/295707 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I7ca31c99de8e04b1359bc521df121db0eb69e384 Gerrit-PatchSet: 27 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: C. Scott Ananian <canan...@wikimedia.org> Gerrit-Reviewer: Arlolra <abrea...@wikimedia.org> Gerrit-Reviewer: C. Scott Ananian <canan...@wikimedia.org> Gerrit-Reviewer: GWicke <gwi...@wikimedia.org> Gerrit-Reviewer: Jforrester <jforres...@wikimedia.org> Gerrit-Reviewer: Subramanya Sastry <ssas...@wikimedia.org> Gerrit-Reviewer: Tim Starling <tstarl...@wikimedia.org> Gerrit-Reviewer: Tpt <thoma...@hotmail.fr> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits