C. Scott Ananian has uploaded a new change for review. https://gerrit.wikimedia.org/r/316237
Change subject: WIP: allstar wikitext serialization. ...................................................................... WIP: allstar wikitext serialization. Change-Id: Ibbf98aa4420650707813ec67addcac1bf827a7b5 --- A bin/allstar.js M package.json 2 files changed, 488 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/37/316237/1 diff --git a/bin/allstar.js b/bin/allstar.js new file mode 100755 index 0000000..1329bbc --- /dev/null +++ b/bin/allstar.js @@ -0,0 +1,487 @@ +#!/usr/bin/env node +/** + * Allstar serialization. + * Read from STDIN, write to STDOUT. + */ +'use strict'; +require('../core-upgrade.js'); + +var ParserEnv = require('../lib/config/MWParserEnvironment.js').MWParserEnvironment; +var ParsoidConfig = require('../lib/config/ParsoidConfig.js').ParsoidConfig; +var Parse = require('./parse.js'); +var TemplateRequest = require('../lib/mw/ApiRequest.js').TemplateRequest; +var Util = require('../lib/utils/Util.js').Util; +var DU = require('../lib/utils/DOMUtils.js').DOMUtils; +var DOMTraverser = require('../lib/utils/DOMTraverser.js').DOMTraverser; +var Promise = require('../lib/utils/promise.js'); +var fs = require('fs'); +var path = require('path'); +var yargs = require('yargs'); +var yaml = require('js-yaml'); + +var standardOpts = Util.addStandardOptions({ + 'config': { + description: "Path to a config.yaml file. Use --config w/ no argument to default to the server's config.yaml", + 'default': false, + }, + 'prefix': { + description: 'Which wiki prefix to use; e.g. "enwiki" for English wikipedia, "eswiki" for Spanish, "mediawikiwiki" for mediawiki.org', + 'boolean': false, + 'default': null, + }, + 'domain': { + description: 'Which wiki to use; e.g. "en.wikipedia.org" for English wikipedia, "es.wikipedia.org" for Spanish, "mediawiki.org" for mediawiki.org', + 'boolean': false, + 'default': null, + }, + 'page': { + description: 'The page name, returned for {{PAGENAME}}. If no input is given (ie. empty/stdin closed), it downloads and parses the page. This should be the actual title of the article (that is, not including any URL-encoding that might be necessary in wikitext).', + 'boolean': false, + 'default': ParserEnv.prototype.defaultPageName, + }, + 'inputfile': { + description: 'File containing input as an alternative to stdin', + 'boolean': false, + 'default': false, + }, + 'contentVersion': { + description: 'The acceptable content version.', + 'boolean': false, + 'default': ParserEnv.prototype.contentVersion, + }, + 'loadWMF': { + description: 'Use WMF mediawiki API config', + 'boolean': true, + 'default': true, + }, + 'offline': { + description: 'Shortcut to turn off various network fetches during parse.', + 'boolean': true, + 'default': false, + }, +}); + +var serialize = function(env, doc) { + var result = []; + var dt = new DOMTraverser(env); + var atSOL = function() { + if (result.length === 0) { return true; } + var i = result.length - 1; + while (i>0 && result[i].length === 0) { i--; } + return /\n$/.test(result[i]); + }; + var atListSOL = function() { + if (result.length === 0) { return true; } + var i = result.length - 1; + while (i>0 && /^[*#:;]*$/.test(result[i])) { i--; } + if (i==0) { return /(^|\n)[*#:;]*$/.test(result[i]); } + return /\n[*#:;]*$/.test(result[i]); + }; + var forceSOL = function() { + if (!atSOL()) { result.push('\n'); } + }; + var typeofContains = function(node, type) { + if (node.nodeType !== 1) { return false; } + var ty = (node.getAttribute('typeof') || '').split(/\s+/g); + if (typeof type === 'string') { + type = new RegExp('^' + Util.escapeRegExp(type) + '$'); + } + for (var i=0; i<ty.length; i++) { + if (type.test(ty[i])) { return true; } + } + return false; + }; + var urlToTitle = function(env, href) { + href = Util.decodeURI(href); + href = href.replace(/^\.\//, ''); + href = env.resolveTitle(href); + var title = env.normalizedTitleKey(href); + title = title.replace(/_/g, ' '); + return title; + }; + var entity = function(c) { + // xxx in the future, could choose textual entity names in some + // situations. + return '&#' + c.codePointAt(0) + ';'; + }; + var escapeAttr = function(s) { + return '"' + s.replace(/[&"]/g, entity) + '"'; + }; + var escape = function(s) { + if (result.length && /\{(={1,6}|\'{1,2})$/.test(result[result.length-1]) && + s[0] === result[result.length-1].slice(-1)) { + return entity(s[0]) + escape(s.slice(1)); + } + return s.replace(/[\n\[\]\{\}|*#:;=]/g, function(c) { + switch (c) { + case '\n': return ' '; + case '*': case '#': case ':': case ';': + if (!atSOL()) { return c; } + // fall through + default: + return entity(c); + } + }); + }; + var options = {}; + var listStack = []; + var handlers = [ + function body(node, env) { return node.firstChild; }, + function _all_transclusion(node, env) { + if (!typeofContains(node, 'mw:Transclusion')) { return true; } + var about = node.getAttribute('about'); + var dmw = JSON.parse(node.getAttribute('data-mw')); + dmw.parts.forEach(function(part) { + if (typeof part === 'string') { + result.push(escape(part)); + } else if (part.template) { + // XXX, the 'wt' part should be unescaped from wt and + // then escaped for allstar. Alternatively, it should + // be rendered to HTML and then we should serialize it. + result.push('{{'); + result.push(part.template.target.wt); + Object.keys(part.template.params).forEach(function(k) { + result.push('|'); + if (!/^[0-9]+$/.test(k)) { + result.push(escape(k)); + result.push('='); + } + // XXX escaping + // XXX block arguments? + result.push(part.template.params[k].wt); + }); + result.push('}}'); + } else { + throw new Error('unexpected template part:'+JSON.stringify(part)); + } + }); + var next = node.nextSibling; + while (next && next.nodeType === 1 && next.getAttribute('about') == about) { + next = next.nextSibling; + } + return next; + }, + function _all_figure(node, env) { + if (!typeofContains(node, /^mw:Image(\/|$)/)) { return true; } + var media = node.firstChild.firstChild; + var href = media.getAttribute('resource'); + var title = urlToTitle(env, href); + title = escape(title); + result.push('[['); + result.push(title); + result.push('|'); + var alt = media.getAttribute('alt'); + if (alt) { + result.push('alt='); + result.push(escape(alt)); + result.push('|'); + } + // XXX other options + result.pop(); // remove trailing '|' + result.push(']]'); + return node.nextSibling; + }, + function _all_ref(node, env, atTopLevel, tplInfo) { + if (!typeofContains(node, 'mw:Extension/ref')) { return true; } + // This one is weird, we need to look elsewhere for the source + var dmw = JSON.parse(node.getAttribute('data-mw')); + var id = dmw.body && dmw.body.id; + var actual = id && node.ownerDocument.getElementById(id); + // XXX dmx.name? + result.push('<ref'); + Object.keys(dmw.attrs).forEach(function(k) { + result.push(' '); + result.push(escape(k)); + result.push('='); + result.push(escapeAttr(dmw.attrs[k])); + }); + if (actual) { + result.push('>'); + dt.traverse(actual.firstChild, env, options, false, tplInfo); + result.push('</ref>') + } else { + result.push('/>'); + } + return node.nextSibling; + }, + function h1_or_h2_or_h3_or_h4_or_h5_or_h6(node, env, atTopLevel, tplInfo) { + var count = +(node.nodeName.replace(/^H/i, '')); + forceSOL(); + var partial = result.length; + result.push('='.repeat(count)); + + // note, we ensure child wikitext doesn't start with = + dt.traverse(node.firstChild, env, options, false, tplInfo); + // XXX ensure child wikitext doesn't end with = + var partial2 = result.length; + result.push('='.repeat(count)); + forceSOL(); + var text = result.slice(partial).join(''); + if (/\n[^]/.test(text)) { + // if block content, use {= ... =} + result[partial] = '{' + result[partial]; + result[partial2] = result[partial2] + '}'; + } + return node.nextSibling; + }, + function p(node, env, atTopLevel, tplInfo) { + dt.traverse(node.firstChild, env, options, false, tplInfo); + forceSOL(); + result.push('\n'); // what about <p></p><p></p>? + return node.nextSibling; + }, + function b_or_i(node, env, atTopLevel, tplInfo) { + var count = node.nodeName === 'B' ? 2 : 1; + result.push('{' + "'".repeat(count)); + // note first char of result can't be ' + dt.traverse(node.firstChild, env, options, false, tplInfo); + // XXX do we care about last char of result? + result.push("'".repeat(count)+'}'); + return node.nextSibling; + }, + function ul_or_ol(node, env, atTopLevel, tplInfo) { + listStack.push(node.nodeName==='UL'?'*':'#'); + dt.traverse(node.firstChild, env, options, false, tplInfo); + listStack.pop(); + return node.nextSibling; + }, + function li(node, env, atTopLevel, tplInfo) { + // XXX after ^* or {* always at SOL context again. + var type = listStack[listStack.length-1] || '*'; + var beforeSOL = result.length; + if (!atListSOL()) { result.push('\n'); } + var partial = result.length; + result.push(type); + dt.traverse(node.firstChild, env, options, false, tplInfo); + forceSOL(); + // if this includes block content, emit '{* ... *}' + var text = result.slice(partial+1).join(''); + if (/\n[^]/.test(text)) { + result[beforeSOL] = ''; + result[partial] = '{' + type; + result.push(type + '}'); + } + return node.nextSibling; + }, + function span_entity(node, env, atTopLevel, tplInfo) { + if (!typeofContains(node, 'mw:Entity')) { return true; } + result.push(entity(node.textContent)); + return node.nextSibling; + }, + function a_extlink(node, env, atTopLevel, tplInfo) { + var rel = node.getAttribute('rel'); + if (rel !== 'mw:ExtLink') { return true; } + result.push('['); + var href = escape(node.getAttribute('href')); + result.push(href); + var partial = result.length; + result.push('|'); + dt.traverse(node.firstChild, env, options, false, tplInfo); + var linkText = result.slice(partial+1).join(''); + // Maybe suppress the explicit link text + if (href === linkText) { + result.length = partial; // truncate! + } + result.push(']'); + return node.nextSibling; + }, + function a_wikilink(node, env, atTopLevel, tplInfo) { + var rel = node.getAttribute('rel'); + if (rel !== 'mw:WikiLink') { return true; } + var href = node.getAttribute('href'); + var title = urlToTitle(env, href); + title = escape(title); + result.push('[['); + result.push(title); + var partial = result.length; + result.push('|'); + dt.traverse(node.firstChild, env, options, false, tplInfo); + var linkText = result.slice(partial+1).join(''); + // Maybe suppress the explicit link text + if (title === linkText) { + result.length = partial; // truncate! + } + result.push(']]'); + return node.nextSibling; + }, + function div(node, env, atTopLevel, tplInfo) { + result.push('<div>'); + dt.traverse(node.firstChild, env, options, false, tplInfo); + result.push('</div>'); + return node.nextSibling; + }, + function link_category(node, env) { + var rel = node.getAttribute('rel'); + if (rel !== 'mw:PageProp/Category') { return true; } + var href = node.getAttribute('href'); + result.push('[['); + result.push(href.replace(/^\.\//, '').replace(/_/g, ' ')); + result.push(']]'); + return node.nextSibling; + }, + function _text(node, env, atTopLevel, tplInfo) { + node.data.split(/\n/g).forEach(function(s, idx) { + if (idx > 0) { forceSOL(); } + if (s.length) result.push(escape(s)); + }); + return node.nextSibling; + }, + function _comment(node, env, atTopLevel, tplInfo) { + result.push('<!--'); + result.push(DU.decodeComment(node.data)); + result.push('-->'); + return node.nextSibling; + }, + function _all(node, env) { + if (node.nodeType !== 1) throw new Error('woah:'+node.nodeName); + result.push('<parsoid>'); + result.push(node.outerHTML); + result.push('</parsoid>'); + return node.nextSibling; // skip further processing for this node + }, + ]; + handlers.forEach(function(h) { + h.name.split(/_or_/g).forEach(function(name) { + name = name.replace(/^_/, '#').replace(/_.*$/, ''); + if (name === '#all') { name = null; } // all nodes + dt.addHandler(name, h); + }); + }); + dt.traverse(doc.body, env, options, true/*atTopLevel*/); + return result.join(''); +}; + +var allstar = exports.allstar = function(input, argv, parsoidConfig, prefix, domain) { + var env; + return ParserEnv.getParserEnv(parsoidConfig, { + prefix: prefix, + domain: domain, + pageName: argv.page, + }).then(function(_env) { + env = _env; + + // fetch templates from enwiki by default. + if (argv.wgScriptPath) { + env.conf.wiki.wgScriptPath = argv.wgScriptPath; + } + // The content version to output + if (argv.contentVersion) { + env.setContentVersion(argv.contentVersion); + } + if (typeof input === 'string') { + return input; + } + + if (argv.inputfile) { + // read input from the file, then process + var fileContents = fs.readFileSync(argv.inputfile, 'utf8'); + return fileContents; + } + + // Send a message to stderr if there is no input for a while, since the + // convention that --page must be used with </dev/null is confusing. + var stdinTimer = setTimeout(function() { + console.error('Waiting for stdin...'); + }, 1000); + + return new Promise(function(resolve) { + // collect input + var inputChunks = []; + var stdin = process.stdin; + stdin.resume(); + stdin.setEncoding('utf8'); + stdin.on('data', function(chunk) { + inputChunks.push(chunk); + }); + stdin.on('end', function() { + resolve(inputChunks); + }); + }).then(function(inputChunks) { + clearTimeout(stdinTimer); + // parse page if no input + if (inputChunks.length > 0) { + return inputChunks.join(''); + } + var target = env.normalizeAndResolvePageTitle(); + argv.wt2html = true; + return TemplateRequest + .setPageSrcInfo(env, target, argv.oldid) + .then(function() { + return Parse.parse(env.page.src, argv, parsoidConfig, prefix, domain); + }).then(function(res) { return res.out; }); + }); + }).then(function(html) { + var doc = DU.parseHTML(html); + + // Serialize! + var wt = serialize(env, doc); + + return { trailingNL: true, out: wt || '', env: env }; + }); +} + +if (require.main === module) { + (function() { + var opts = yargs.usage( + 'Usage: $0 --inputfile <filename>', + standardOpts + ).strict(); + + var argv = opts.argv; + + if (Util.booleanOption(argv.help)) { + opts.showHelp(); + return; + } + + // Offline shortcut + if (argv.offline) { + argv.fetchConfig = false; + argv.fetchTemplates = false; + argv.fetchImageInfo = false; + argv.usephppreprocessor = false; + } + + var prefix = argv.prefix || null; + var domain = argv.domain || null; + + if (argv.apiURL) { + prefix = 'customwiki'; + domain = null; + } else if (!(prefix || domain)) { + domain = 'en.wikipedia.org'; + } + + var config = null; + if (Util.booleanOption(argv.config)) { + var p = (typeof (argv.config) === 'string') ? + path.resolve('.', argv.config) : + path.resolve(__dirname, '../config.yaml'); + // Assuming Parsoid is the first service in the list + config = yaml.load(fs.readFileSync(p, 'utf8')).services[0].conf; + } + + var setup = function(parsoidConfig) { + parsoidConfig.loadWMF = argv.loadWMF; + if (config && config.localsettings) { + var local = require(path.resolve(__dirname, config.localsettings)); + local.setup(parsoidConfig); + } + Util.setTemplatingAndProcessingFlags(parsoidConfig, argv); + Util.setDebuggingFlags(parsoidConfig, argv); + }; + + var parsoidConfig = new ParsoidConfig({ setup: setup }, config); + + parsoidConfig.defaultWiki = prefix ? prefix : + parsoidConfig.reverseMwApiMap.get(domain); + + return allstar(null, argv, parsoidConfig, prefix, domain).then(function(res) { + var stdout = process.stdout; + stdout.write(res.out); + if (res.trailingNL && stdout.isTTY) { + stdout.write('\n'); + } + }).done(); + })(); +} diff --git a/package.json b/package.json index 1718df1..d7ea2a2 100644 --- a/package.json +++ b/package.json @@ -4,6 +4,7 @@ "version": "0.5.1+git", "license": "GPL-2.0+", "dependencies": { + "antlr4": "^4.5.3", "async": "^0.9.2", "babybird": "^0.0.1", "body-parser": "^1.15.2", -- To view, visit https://gerrit.wikimedia.org/r/316237 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ibbf98aa4420650707813ec67addcac1bf827a7b5 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: C. Scott Ananian <canan...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits