C. Scott Ananian has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/316237

Change subject: WIP: allstar wikitext serialization.
......................................................................

WIP: allstar wikitext serialization.

Change-Id: Ibbf98aa4420650707813ec67addcac1bf827a7b5
---
A bin/allstar.js
M package.json
2 files changed, 488 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/37/316237/1

diff --git a/bin/allstar.js b/bin/allstar.js
new file mode 100755
index 0000000..1329bbc
--- /dev/null
+++ b/bin/allstar.js
@@ -0,0 +1,487 @@
+#!/usr/bin/env node
+/**
+ * Allstar serialization.
+ * Read from STDIN, write to STDOUT.
+ */
+'use strict';
+require('../core-upgrade.js');
+
+var ParserEnv = 
require('../lib/config/MWParserEnvironment.js').MWParserEnvironment;
+var ParsoidConfig = require('../lib/config/ParsoidConfig.js').ParsoidConfig;
+var Parse = require('./parse.js');
+var TemplateRequest = require('../lib/mw/ApiRequest.js').TemplateRequest;
+var Util = require('../lib/utils/Util.js').Util;
+var DU = require('../lib/utils/DOMUtils.js').DOMUtils;
+var DOMTraverser = require('../lib/utils/DOMTraverser.js').DOMTraverser;
+var Promise = require('../lib/utils/promise.js');
+var fs = require('fs');
+var path = require('path');
+var yargs = require('yargs');
+var yaml = require('js-yaml');
+
+// Command-line options specific to this script, passed through
+// Util.addStandardOptions (presumably merged with the options shared by
+// the other bin/ tools -- confirm in Util).
+var standardOpts = Util.addStandardOptions({
+       config: {
+               description: "Path to a config.yaml file.  Use --config w/ no argument to default to the server's config.yaml",
+               'default': false,
+       },
+       prefix: {
+               description: 'Which wiki prefix to use; e.g. "enwiki" for English wikipedia, "eswiki" for Spanish, "mediawikiwiki" for mediawiki.org',
+               'boolean': false,
+               'default': null,
+       },
+       domain: {
+               description: 'Which wiki to use; e.g. "en.wikipedia.org" for English wikipedia, "es.wikipedia.org" for Spanish, "mediawiki.org" for mediawiki.org',
+               'boolean': false,
+               'default': null,
+       },
+       page: {
+               description: 'The page name, returned for {{PAGENAME}}. If no input is given (ie. empty/stdin closed), it downloads and parses the page. This should be the actual title of the article (that is, not including any URL-encoding that might be necessary in wikitext).',
+               'boolean': false,
+               'default': ParserEnv.prototype.defaultPageName,
+       },
+       inputfile: {
+               description: 'File containing input as an alternative to stdin',
+               'boolean': false,
+               'default': false,
+       },
+       contentVersion: {
+               description: 'The acceptable content version.',
+               'boolean': false,
+               'default': ParserEnv.prototype.contentVersion,
+       },
+       loadWMF: {
+               description: 'Use WMF mediawiki API config',
+               'boolean': true,
+               'default': true,
+       },
+       offline: {
+               description: 'Shortcut to turn off various network fetches during parse.',
+               'boolean': true,
+               'default': false,
+       },
+});
+
+var serialize = function(env, doc) {
+       var result = [];
+       var dt = new DOMTraverser(env);
+       /**
+        * Report whether the output emitted so far ends at start-of-line.
+        * Empty chunks are skipped when looking for the last emitted text.
+        * If no non-empty chunk exists yet, nothing has been written, so we
+        * are at the start of the output, which counts as start-of-line.
+        *
+        * @return {boolean}
+        */
+       var atSOL = function() {
+               var i = result.length - 1;
+               while (i >= 0 && result[i].length === 0) { i--; }
+               return i < 0 || /\n$/.test(result[i]);
+       };
+       /**
+        * Like a start-of-line test, but a run of list markers (`*#:;`)
+        * directly after the line start still counts.  Trailing chunks made
+        * only of markers (or empty) are skipped; the first other chunk must
+        * end in a newline optionally followed by markers.  The very first
+        * chunk may also consist of markers alone.
+        *
+        * @return {boolean}
+        */
+       var atListSOL = function() {
+               if (result.length === 0) { return true; }
+               var i = result.length - 1;
+               while (i > 0 && /^[*#:;]*$/.test(result[i])) { i--; }
+               if (i === 0) { return /(^|\n)[*#:;]*$/.test(result[i]); }
+               return /\n[*#:;]*$/.test(result[i]);
+       };
+       // Terminate the current output line unless we already sit at SOL.
+       var forceSOL = function() {
+               if (atSOL()) { return; }
+               result.push('\n');
+       };
+       /**
+        * Test whether an element's `typeof` attribute lists a given type.
+        * `type` is either an exact string or a RegExp tried against each
+        * whitespace-separated token.  Non-element nodes never match.
+        */
+       var typeofContains = function(node, type) {
+               if (node.nodeType !== 1) { return false; }
+               var tokens = (node.getAttribute('typeof') || '').split(/\s+/g);
+               var matcher = (typeof type === 'string') ?
+                       new RegExp('^' + Util.escapeRegExp(type) + '$') : type;
+               return tokens.some(function(tok) { return matcher.test(tok); });
+       };
+       /**
+        * Turn a link href into a display title: URI-decode it, drop a
+        * leading "./", run it through env.resolveTitle and
+        * env.normalizedTitleKey, then show underscores as spaces.
+        */
+       var urlToTitle = function(env, href) {
+               var decoded = Util.decodeURI(href).replace(/^\.\//, '');
+               var resolved = env.resolveTitle(decoded);
+               return env.normalizedTitleKey(resolved).replace(/_/g, ' ');
+       };
+       // Build a decimal numeric character reference for the first code
+       // point of `c`.
+       var entity = function(c) {
+               // xxx in the future, could choose textual entity names in some
+               // situations.
+               var codePoint = c.codePointAt(0);
+               return '&#' + String(codePoint) + ';';
+       };
+       // Quote an attribute value, replacing `&` and `"` with numeric
+       // entities.
+       var escapeAttr = function(s) {
+               var body = s.replace(/[&"]/g, entity);
+               return '"' + body + '"';
+       };
+       /**
+        * Escape literal text for output.
+        *
+        * If the previous chunk ends with an opening `{=...` or `{'...`
+        * marker and the text starts with that same marker character, the
+        * first character is entity-encoded so it cannot extend the marker.
+        * Otherwise newlines become spaces, brackets/braces/pipe/equals are
+        * always entity-encoded, and `* # : ;` are encoded only when the
+        * output is currently at start-of-line.
+        */
+       var escape = function(s) {
+               var last = result.length ? result[result.length-1] : null;
+               if (last !== null && /\{(={1,6}|\'{1,2})$/.test(last) &&
+                       s[0] === last.slice(-1)) {
+                       return entity(s[0]) + escape(s.slice(1));
+               }
+               return s.replace(/[\n\[\]\{\}|*#:;=]/g, function(c) {
+                       if (c === '\n') { return ' '; }
+                       var solOnly = (c === '*' || c === '#' || c === ':' || c === ';');
+                       // list/indent markers only matter at start of line
+                       return (solOnly && !atSOL()) ? c : entity(c);
+               });
+       };
+       var options = {};
+       var listStack = [];
+       var handlers = [
+               // <body> emits nothing itself; hand back its first child
+               // (presumably so the traverser continues with the children).
+               function body(node, env) {
+                       return node.firstChild;
+               },
+               /**
+                * Serialize a template-generated region from its `data-mw`.
+                * Nodes whose typeof lacks mw:Transclusion return `true`
+                * (presumably "not handled, try the next handler" -- confirm
+                * against DOMTraverser).  String parts are escaped and emitted;
+                * template parts are emitted as `{{target|k=v|...}}` using the
+                * recorded wikitext.  Returns the first sibling after the run of
+                * elements sharing this node's `about` id.
+                *
+                * @throws {Error} on a part that is neither string nor template
+                */
+               function _all_transclusion(node, env) {
+                       if (!typeofContains(node, 'mw:Transclusion')) { return true; }
+                       var about = node.getAttribute('about');
+                       var dmw = JSON.parse(node.getAttribute('data-mw'));
+                       dmw.parts.forEach(function(part) {
+                               if (typeof part === 'string') {
+                                       result.push(escape(part));
+                               } else if (part.template) {
+                                       // XXX, the 'wt' part should be unescaped from wt and
+                                       // then escaped for allstar.  Alternatively, it should
+                                       // be rendered to HTML and then we should serialize it.
+                                       result.push('{{');
+                                       result.push(part.template.target.wt);
+                                       Object.keys(part.template.params).forEach(function(k) {
+                                               result.push('|');
+                                               // numeric keys are positional: no "name="
+                                               if (!/^[0-9]+$/.test(k)) {
+                                                       result.push(escape(k));
+                                                       result.push('=');
+                                               }
+                                               // XXX escaping
+                                               // XXX block arguments?
+                                               result.push(part.template.params[k].wt);
+                                       });
+                                       result.push('}}');
+                               } else {
+                                       throw new Error('unexpected template part:'+JSON.stringify(part));
+                               }
+                       });
+                       // Skip the rest of the about-group.  Without an `about` id
+                       // there is no group; comparing null to null would otherwise
+                       // swallow every following element that also lacks one.
+                       var next = node.nextSibling;
+                       while (about !== null && next && next.nodeType === 1 &&
+                               next.getAttribute('about') === about) {
+                               next = next.nextSibling;
+                       }
+                       return next;
+               },
+               /**
+                * Emit an image (typeof mw:Image or mw:Image/...) as
+                * `[[Title]]` or `[[Title|alt=...]]`.  The title comes from the
+                * `resource` attribute of the node's first grandchild.
+                */
+               function _all_figure(node, env) {
+                       if (!typeofContains(node, /^mw:Image(\/|$)/)) { return true; }
+                       var media = node.firstChild.firstChild;
+                       var title = escape(urlToTitle(env, media.getAttribute('resource')));
+                       result.push('[[', title);
+                       var alt = media.getAttribute('alt');
+                       if (alt) {
+                               result.push('|', 'alt=');
+                               result.push(escape(alt));
+                       }
+                       // XXX other options
+                       result.push(']]');
+                       return node.nextSibling;
+               },
+               /**
+                * Serialize a <ref> extension tag (typeof mw:Extension/ref).
+                * Attributes come from `data-mw.attrs`.  The reference content
+                * lives elsewhere in the document, in the element whose id is
+                * `data-mw.body.id`.  If that element is found, its children are
+                * serialized between <ref ...> and </ref>; otherwise a
+                * self-closing <ref .../> is emitted.
+                */
+               function _all_ref(node, env, atTopLevel, tplInfo) {
+                       if (!typeofContains(node, 'mw:Extension/ref')) { return true; }
+                       // This one is weird, we need to look elsewhere for the source
+                       var dmw = JSON.parse(node.getAttribute('data-mw'));
+                       var id = dmw.body && dmw.body.id;
+                       var actual = id && node.ownerDocument.getElementById(id);
+                       // XXX dmw.name?
+                       result.push('<ref');
+                       // data-mw may carry no attrs object at all
+                       var attrs = dmw.attrs || {};
+                       Object.keys(attrs).forEach(function(k) {
+                               result.push(' ');
+                               result.push(escape(k));
+                               result.push('=');
+                               result.push(escapeAttr(attrs[k]));
+                       });
+                       if (actual) {
+                               result.push('>');
+                               dt.traverse(actual.firstChild, env, options, false, tplInfo);
+                               result.push('</ref>');
+                       } else {
+                               result.push('/>');
+                       }
+                       return node.nextSibling;
+               },
+               /**
+                * Headings: `==text==` on a line of its own, with one `=` per
+                * heading level.  When the serialized content spans more than
+                * one line the block form `{== ... ==}` is used instead.
+                */
+               function h1_or_h2_or_h3_or_h4_or_h5_or_h6(node, env, atTopLevel, tplInfo) {
+                       var level = Number(node.nodeName.replace(/^H/i, ''));
+                       var marker = '='.repeat(level);
+                       forceSOL();
+                       var openIdx = result.length;
+                       result.push(marker);
+
+                       // note, we ensure child wikitext doesn't start with =
+                       dt.traverse(node.firstChild, env, options, false, tplInfo);
+                       // XXX ensure child wikitext doesn't end with =
+                       var closeIdx = result.length;
+                       result.push(marker);
+                       forceSOL();
+                       // A newline followed by anything means block content; the
+                       // trailing newline just forced does not count.
+                       var emitted = result.slice(openIdx).join('');
+                       if (/\n[^]/.test(emitted)) {
+                               // if block content, use {= ... =}
+                               result[openIdx] = '{' + result[openIdx];
+                               result[closeIdx] = result[closeIdx] + '}';
+                       }
+                       return node.nextSibling;
+               },
+               // Paragraph: serialize the children, end the line if needed,
+               // then add a blank line.
+               function p(node, env, atTopLevel, tplInfo) {
+                       var firstKid = node.firstChild;
+                       dt.traverse(firstKid, env, options, false, tplInfo);
+                       forceSOL();
+                       result.push('\n'); // what about <p></p><p></p>?
+                       return node.nextSibling;
+               },
+               // Bold / italic: wrap the content in {'' ... ''} for <b> and
+               // {' ... '} for <i>.
+               function b_or_i(node, env, atTopLevel, tplInfo) {
+                       var quotes = (node.nodeName === 'B') ? "''" : "'";
+                       result.push('{' + quotes);
+                       // note first char of result can't be '
+                       dt.traverse(node.firstChild, env, options, false, tplInfo);
+                       // XXX do we care about last char of result?
+                       result.push(quotes + '}');
+                       return node.nextSibling;
+               },
+               // Lists: record the bullet type (`*` for UL, `#` otherwise) on
+               // listStack while the items are serialized.
+               function ul_or_ol(node, env, atTopLevel, tplInfo) {
+                       var marker = (node.nodeName === 'UL') ? '*' : '#';
+                       listStack.push(marker);
+                       dt.traverse(node.firstChild, env, options, false, tplInfo);
+                       listStack.pop();
+                       return node.nextSibling;
+               },
+               /**
+                * List item: a bullet at list-start-of-line followed by the
+                * content.  If the content spans several lines, the separating
+                * newline is blanked out and the item is rewritten in block
+                * form `{* ... *}`.
+                */
+               function li(node, env, atTopLevel, tplInfo) {
+                       // XXX after ^* or {* always at SOL context again.
+                       var bullet = listStack.length ? listStack[listStack.length-1] : '*';
+                       var sepIdx = result.length;
+                       if (!atListSOL()) { result.push('\n'); }
+                       var bulletIdx = result.length;
+                       result.push(bullet);
+                       dt.traverse(node.firstChild, env, options, false, tplInfo);
+                       forceSOL();
+                       // if this includes block content, emit '{* ... *}'
+                       var inner = result.slice(bulletIdx + 1).join('');
+                       if (/\n[^]/.test(inner)) {
+                               result[sepIdx] = '';
+                               result[bulletIdx] = '{' + bullet;
+                               result.push(bullet + '}');
+                       }
+                       return node.nextSibling;
+               },
+               // <span typeof="mw:Entity">: re-emit the text as a numeric
+               // entity; other spans are left to later handlers.
+               function span_entity(node, env, atTopLevel, tplInfo) {
+                       if (typeofContains(node, 'mw:Entity')) {
+                               result.push(entity(node.textContent));
+                               return node.nextSibling;
+                       }
+                       return true;
+               },
+               /**
+                * External link (rel="mw:ExtLink"): `[href|text]`, shortened to
+                * `[href]` when the serialized text equals the escaped href.
+                */
+               function a_extlink(node, env, atTopLevel, tplInfo) {
+                       if (node.getAttribute('rel') !== 'mw:ExtLink') { return true; }
+                       result.push('[');
+                       var target = escape(node.getAttribute('href'));
+                       result.push(target);
+                       var pipeIdx = result.length;
+                       result.push('|');
+                       dt.traverse(node.firstChild, env, options, false, tplInfo);
+                       var label = result.slice(pipeIdx + 1).join('');
+                       // Maybe suppress the explicit link text
+                       if (label === target) {
+                               result.splice(pipeIdx); // drop '|' and the text
+                       }
+                       result.push(']');
+                       return node.nextSibling;
+               },
+               /**
+                * Internal link (rel="mw:WikiLink"): `[[Title|text]]`, shortened
+                * to `[[Title]]` when the serialized text equals the title.
+                */
+               function a_wikilink(node, env, atTopLevel, tplInfo) {
+                       if (node.getAttribute('rel') !== 'mw:WikiLink') { return true; }
+                       var title = escape(urlToTitle(env, node.getAttribute('href')));
+                       result.push('[[', title);
+                       var pipeIdx = result.length;
+                       result.push('|');
+                       dt.traverse(node.firstChild, env, options, false, tplInfo);
+                       var label = result.slice(pipeIdx + 1).join('');
+                       // Maybe suppress the explicit link text
+                       if (label === title) {
+                               result.splice(pipeIdx); // drop '|' and the text
+                       }
+                       result.push(']]');
+                       return node.nextSibling;
+               },
+               // <div>: emitted as a literal, attribute-less <div>...</div>
+               // around the serialized children.
+               function div(node, env, atTopLevel, tplInfo) {
+                       var firstKid = node.firstChild;
+                       result.push('<div>');
+                       dt.traverse(firstKid, env, options, false, tplInfo);
+                       result.push('</div>');
+                       return node.nextSibling;
+               },
+               // Category link (rel="mw:PageProp/Category"): `[[href]]`, with
+               // a leading "./" removed and underscores shown as spaces.
+               function link_category(node, env) {
+                       if (node.getAttribute('rel') !== 'mw:PageProp/Category') { return true; }
+                       var target = node.getAttribute('href')
+                               .replace(/^\.\//, '')
+                               .replace(/_/g, ' ');
+                       result.push('[[', target, ']]');
+                       return node.nextSibling;
+               },
+               // Text node: each source line is escaped and emitted; a line
+               // break in the source forces start-of-line in the output.
+               function _text(node, env, atTopLevel, tplInfo) {
+                       var lines = node.data.split(/\n/g);
+                       for (var i = 0; i < lines.length; i++) {
+                               if (i !== 0) { forceSOL(); }
+                               if (lines[i].length > 0) { result.push(escape(lines[i])); }
+                       }
+                       return node.nextSibling;
+               },
+               // Comment node: emitted as <!--...--> with the data passed
+               // through DU.decodeComment.
+               function _comment(node, env, atTopLevel, tplInfo) {
+                       result.push('<!--');
+                       var decoded = DU.decodeComment(node.data);
+                       result.push(decoded, '-->');
+                       return node.nextSibling;
+               },
+               // Fallback for any element no other handler consumed: embed
+               // its HTML verbatim inside <parsoid>...</parsoid>.  Any other
+               // node type reaching this point is an error.
+               function _all(node, env) {
+                       if (node.nodeType !== 1) {
+                               throw new Error('woah:'+node.nodeName);
+                       }
+                       result.push('<parsoid>');
+                       result.push(node.outerHTML, '</parsoid>');
+                       // skip further processing for this node
+                       return node.nextSibling;
+               },
+       ];
+       // Register every handler under the node names encoded in its
+       // function name.  Alternatives are joined with "_or_", a leading "_"
+       // stands for "#" (as in "#text"), and anything from the next "_"
+       // onward is only a descriptive suffix.  "#all" registers for all
+       // nodes by passing null.
+       handlers.forEach(function(handler) {
+               var targets = handler.name.split(/_or_/g);
+               targets.forEach(function(raw) {
+                       var nodeName = raw.replace(/^_/, '#').replace(/_.*$/, '');
+                       dt.addHandler(nodeName === '#all' ? null : nodeName, handler);
+               });
+       });
+       // Walk the document, then glue the collected chunks together.
+       dt.traverse(doc.body, env, options, true/*atTopLevel*/);
+       return result.join('');
+};
+
+/**
+ * Serialize HTML to allstar wikitext.
+ *
+ * The HTML comes from the first available source:
+ *  1. `input`, when it is a string;
+ *  2. the file named by `argv.inputfile`;
+ *  3. stdin;
+ *  4. if stdin is empty, the page named by the environment is fetched and
+ *     parsed with Parse.parse (this sets `argv.wt2html = true`).
+ *
+ * @param {string|null} input HTML to serialize, or a non-string to read
+ *   from file/stdin/the wiki.
+ * @param {Object} argv Parsed command-line options.
+ * @param {ParsoidConfig} parsoidConfig
+ * @param {string|null} prefix
+ * @param {string|null} domain
+ * @return {Promise} Resolves to `{ trailingNL: true, out: <wikitext>, env }`.
+ *   Rejects if reading stdin fails.
+ */
+var allstar = exports.allstar = function(input, argv, parsoidConfig, prefix, domain) {
+       var env;
+       return ParserEnv.getParserEnv(parsoidConfig, {
+               prefix: prefix,
+               domain: domain,
+               pageName: argv.page,
+       }).then(function(_env) {
+               env = _env;
+
+               // Let the command line override the wiki's script path.
+               if (argv.wgScriptPath) {
+                       env.conf.wiki.wgScriptPath = argv.wgScriptPath;
+               }
+               // The content version to output
+               if (argv.contentVersion) {
+                       env.setContentVersion(argv.contentVersion);
+               }
+               if (typeof input === 'string') {
+                       return input;
+               }
+
+               if (argv.inputfile) {
+                       // read input from the file, then process
+                       var fileContents = fs.readFileSync(argv.inputfile, 'utf8');
+                       return fileContents;
+               }
+
+               // Send a message to stderr if there is no input for a while, since the
+               // convention that --page must be used with </dev/null is confusing.
+               var stdinTimer = setTimeout(function() {
+                       console.error('Waiting for stdin...');
+               }, 1000);
+
+               return new Promise(function(resolve, reject) {
+                       // collect input
+                       var inputChunks = [];
+                       var stdin = process.stdin;
+                       stdin.resume();
+                       stdin.setEncoding('utf8');
+                       stdin.on('data', function(chunk) {
+                               inputChunks.push(chunk);
+                       });
+                       stdin.on('end', function() {
+                               resolve(inputChunks);
+                       });
+                       // A failed read never emits 'end'; reject instead of hanging.
+                       stdin.on('error', reject);
+               }).then(function(inputChunks) {
+                       clearTimeout(stdinTimer);
+                       // parse page if no input
+                       if (inputChunks.length > 0) {
+                               return inputChunks.join('');
+                       }
+                       var target = env.normalizeAndResolvePageTitle();
+                       argv.wt2html = true;
+                       return TemplateRequest
+                               .setPageSrcInfo(env, target, argv.oldid)
+                               .then(function() {
+                                       return Parse.parse(env.page.src, argv, parsoidConfig, prefix, domain);
+                               }).then(function(res) { return res.out; });
+               }, function(err) {
+                       // don't print the "waiting" hint after a read failure
+                       clearTimeout(stdinTimer);
+                       throw err;
+               });
+       }).then(function(html) {
+               var doc = DU.parseHTML(html);
+
+               // Serialize!
+               var wt = serialize(env, doc);
+
+               return { trailingNL: true, out: wt || '', env: env };
+       });
+};
+
+// Command-line entry point: only runs when this file is executed directly.
+if (require.main === module) {
+       (function() {
+               var opts = yargs.usage('Usage: $0 --inputfile <filename>', standardOpts).strict();
+               var argv = opts.argv;
+
+               if (Util.booleanOption(argv.help)) {
+                       opts.showHelp();
+                       return;
+               }
+
+               // Offline shortcut: switch off each of these flags
+               if (argv.offline) {
+                       ['fetchConfig', 'fetchTemplates', 'fetchImageInfo', 'usephppreprocessor'].forEach(function(flag) {
+                               argv[flag] = false;
+                       });
+               }
+
+               // Pick the wiki: a custom API URL wins, then an explicit prefix
+               // or domain, and finally English Wikipedia.
+               var prefix = argv.prefix || null;
+               var domain = argv.domain || null;
+               if (argv.apiURL) {
+                       prefix = 'customwiki';
+                       domain = null;
+               } else if (!prefix && !domain) {
+                       domain = 'en.wikipedia.org';
+               }
+
+               var config = null;
+               if (Util.booleanOption(argv.config)) {
+                       var configPath;
+                       if (typeof (argv.config) === 'string') {
+                               configPath = path.resolve('.', argv.config);
+                       } else {
+                               configPath = path.resolve(__dirname, '../config.yaml');
+                       }
+                       // Assuming Parsoid is the first service in the list
+                       // NOTE(review): yaml.load runs on a file chosen on the
+                       // command line -- confirm a safe-load variant isn't needed.
+                       var parsedYaml = yaml.load(fs.readFileSync(configPath, 'utf8'));
+                       config = parsedYaml.services[0].conf;
+               }
+
+               // Passed to ParsoidConfig as its `setup` option.
+               var setup = function(parsoidConfig) {
+                       parsoidConfig.loadWMF = argv.loadWMF;
+                       if (config && config.localsettings) {
+                               var local = require(path.resolve(__dirname, config.localsettings));
+                               local.setup(parsoidConfig);
+                       }
+                       Util.setTemplatingAndProcessingFlags(parsoidConfig, argv);
+                       Util.setDebuggingFlags(parsoidConfig, argv);
+               };
+
+               var parsoidConfig = new ParsoidConfig({ setup: setup }, config);
+
+               parsoidConfig.defaultWiki = prefix ||
+                       parsoidConfig.reverseMwApiMap.get(domain);
+
+               return allstar(null, argv, parsoidConfig, prefix, domain).then(function(res) {
+                       var stdout = process.stdout;
+                       stdout.write(res.out);
+                       // add the final newline only for interactive terminals
+                       if (res.trailingNL && stdout.isTTY) {
+                               stdout.write('\n');
+                       }
+               }).done();
+       })();
+}
diff --git a/package.json b/package.json
index 1718df1..d7ea2a2 100644
--- a/package.json
+++ b/package.json
@@ -4,6 +4,7 @@
   "version": "0.5.1+git",
   "license": "GPL-2.0+",
   "dependencies": {
+    "antlr4": "^4.5.3",
     "async": "^0.9.2",
     "babybird": "^0.0.1",
     "body-parser": "^1.15.2",

-- 
To view, visit https://gerrit.wikimedia.org/r/316237
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibbf98aa4420650707813ec67addcac1bf827a7b5
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: C. Scott Ananian <canan...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to