Yurik has uploaded a new change for review. https://gerrit.wikimedia.org/r/192273
Change subject: Instrumented and rewrote service logic ...................................................................... Instrumented and rewrote service logic Change-Id: I34cfbe6ac7b6ea57cc346f280ef4d2b5bf3c4457 --- A config.labs.yaml M config.yaml M routes/v1.js 3 files changed, 503 insertions(+), 218 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/graphoid refs/changes/73/192273/1 diff --git a/config.labs.yaml b/config.labs.yaml new file mode 100644 index 0000000..4540b43 --- /dev/null +++ b/config.labs.yaml @@ -0,0 +1,69 @@ +# Info about this config. Used for packaging & other purposes. +info: + name: service-template-node + version: 0.0.1 + description: A blueprint for MediaWiki REST API services + +# Number of worker processes to spawn. +# Set to 0 to run everything in a single process without clustering. +# Use 'ncpu' to run as many workers as there are CPU units +num_workers: 0 + +# Logger info +logging: + level: trace +# streams: +# # Use gelf-stream -> logstash +# - type: gelf +# host: logstash1003.eqiad.wmnet +# port: 12201 + +# Statsd metrics reporter +metrics: + type: txstatsd + host: localhost + port: 8125 + +services: + - name: service-template-node + # a relative path or the name of an npm package, if different from name + module: ./app.js + # optionally, a version constraint of the npm package + # version: ^0.4.0 + # per-service config + conf: + port: 19000 + # interface: localhost # uncomment to only listen on localhost + # compressionLevel: 3 # output gzip compression level, 6 by default + # more per-service config settings + + # Graphoid-specific settings + # List of domains the service is allowed to access (with their subdomains) + domains: + - localhost + - 127.0.0.1 + - wmflabs.org + - mediawiki.org + - wikibooks.org + - wikidata.org + - wikimedia.org + - wikimediafoundation.org + - wikinews.org + - wikipedia.org + - wikiquote.org + - wikisource.org + - wikiversity.org + - wikivoyage.org + - wiktionary.org + # Additional list of allowed domains with their replacements + # Even though the subdomains will also be allowed, the substitution + # will be done only on fully matched hostname. + # For example, it allows "graph.wmflabs.org" to be replaced with "localhost:12345" + domainMap: + zero.wmflabs.org: localhost + graphoid.wmflabs.org: localhost + + # Which protocol should service use for protocol-relative URLs like //example.com/path + defaultProtocol: http + # Error if it takes longer than this to handle the request + timeout: 10000 diff --git a/config.yaml b/config.yaml index c11eec8..ef3b865 120000 --- a/config.yaml +++ b/config.yaml @@ -1 +1,66 @@ -config.dev.yaml \ No newline at end of file +# Info about this config. Used for packaging & other purposes. +info: + name: service-template-node + version: 0.0.1 + description: A blueprint for MediaWiki REST API services + +# Number of worker processes to spawn. +# Set to 0 to run everything in a single process without clustering. +# Use 'ncpu' to run as many workers as there are CPU units +num_workers: 0 + +# Logger info +logging: + level: trace +# streams: +# # Use gelf-stream -> logstash +# - type: gelf +# host: logstash1003.eqiad.wmnet +# port: 12201 + +# Statsd metrics reporter +metrics: + type: txstatsd + host: localhost + port: 8125 + +services: + - name: service-template-node + # a relative path or the name of an npm package, if different from name + module: ./app.js + # optionally, a version constraint of the npm package + # version: ^0.4.0 + # per-service config + conf: + port: 6930 + # interface: localhost # uncomment to only listen on localhost + # compressionLevel: 3 # output gzip compression level, 6 by default + # more per-service config settings + + # Graphoid-specific settings + # List of domains the service is allowed to access (with their subdomains) + domains: + - localhost + - 127.0.0.1 + - wmflabs.org + - mediawiki.org + - wikibooks.org + - wikidata.org + - wikimedia.org + - wikimediafoundation.org + - wikinews.org + - wikipedia.org + - wikiquote.org + - wikisource.org + - wikiversity.org + - wikivoyage.org + - wiktionary.org + # Additional list of allowed domains with their replacements + # Even though the subdomains will also be allowed, the substitution + # will be done only on fully matched hostname. + # For example, it allows "graph.wmflabs.org" to be replaced with "localhost:12345" + domainMap: {} + # Which protocol should service use for protocol-relative URLs like //example.com/path + defaultProtocol: http + # Error if it takes longer than this to handle the request + timeout: -1 diff --git a/routes/v1.js b/routes/v1.js index f416ea2..99fbc65 100644 --- a/routes/v1.js +++ b/routes/v1.js @@ -3,15 +3,18 @@ var express = require('express'), preq = require('preq'), Promise = require('bluebird'), + urllib = require('url'), vega = null; // Visualization grammar - https://github.com/trifacta/vega -try{ - // Simplify debugging when vega is not available - vega = require('vega'); -} catch(err) { - console.error(err); -} +/** + * Main log function + */ +var log; +/** + * Metrics object + */ +var metrics; /** * The main router object @@ -19,14 +22,9 @@ var router = express.Router(); /** - * A list of allowed hosts - */ -var domains = []; - -/** * A set of 'oldHost' => 'newHost' mappings */ -var domainMap = {}; +var domainMap = false; /** * For protocol-relative URLs (they begin with //), which protocol should we use @@ -39,14 +37,32 @@ var timeout = 10000; /** - * Regex to validate server parameter + * Regex to validate host parameter */ var serverRe = null; -function init(conf) { - domains = conf.domains || domains; - domainMap = conf.domainMap || domainMap; +function init(app) { + + // The very first operation should set up our logger + log = app.logger.log.bind(app.logger); + metrics = app.metrics; + + // Uncomment to console.log metrics calls + //metrics = wrapMetrics(app.metrics); + + log('info/init', 'starting v1' ); + metrics.increment('v1.init'); + + try{ + // Simplify debugging when vega is not available + vega = require('vega'); + } catch(err) { + log('fatal/vega', err); + } + + var conf = app.conf; + var domains = conf.domains || domains; timeout = conf.timeout || timeout; defaultProtocol = conf.defaultProtocol || defaultProtocol; if (!defaultProtocol.endsWith(':')) { @@ -54,38 +70,335 @@ defaultProtocol = defaultProtocol + ':'; } - var validDomains = domains.concat(Object.getOwnPropertyNames(domainMap)); + var validDomains = domains; + if (conf.domainMap && Object.getOwnPropertyNames(domainMap) > 0) { + domainMap = conf.domainMap; + validDomains = validDomains.concat(Object.getOwnPropertyNames(domainMap)) + } if (validDomains.length == 0) { - console.error('Config must have non-empty "domains" (list) and/or "domainMap" (dict)'); + log('fatal/config', 'Config must have non-empty "domains" (list) and/or "domainMap" (dict)'); process.exit(1); } serverRe = new RegExp('^([-a-z0-9]+\\.)?(m\\.|zero\\.)?(' + validDomains.join('|') + ')$'); - - if (vega) { - vega.config.domainWhiteList = domains; - vega.config.defaultProtocol = defaultProtocol; - vega.config.safeMode = true; - if (Object.getOwnPropertyNames(domainMap) > 0) { - var originalSanitize = vega.data.load.sanitizeUrl; - vega.data.load.sanitizeUrl = function(url) { - url = originalSanitize(url); - if (url) { - url = url.replace(/^(https?:\/\/)([-a-z0-9.]+)/, function(match, prot, host){ - var repl = domainMap[host]; - return repl ? prot + repl : match; - }); - } - return url; - }; - } - } + initVega(domains); } + + +/** + * Init vega rendering + * @param domains array of strings - which domains are valid + */ +function initVega(domains) { + if (!vega) { + return; + } + vega.config.domainWhiteList = domains; + vega.config.defaultProtocol = defaultProtocol; + vega.config.safeMode = true; + + // + // TODO/BUG: In multithreaded env, we cannot set global vega.config var + // while handling multiple requests from multiple hosts. + // Until vega is capable of per-rendering context, we must bail on any + // relative (no hostname) data or image URLs. + // + // Do not set vega.config.baseURL. Current sanitizer implementation will fail + // because of the missing protocol (safeMode == true). Still, lets double check + // here, in case user has 'http:pathname', which for some strange reason is + // parsed as correct by url lib. + // + var originalSanitize = vega.data.load.sanitizeUrl; + vega.data.load.sanitizeUrl = function (urlOrig) { + url = originalSanitize(urlOrig); + if (url) { + var parts = urllib.parse(url); + if (!parts.protocol || !parts.hostname) { + url = null; + } + } + if (url && domainMap) { + url = url.replace(/^(https?:\/\/)([^#?\/]+)/, function (match, prot, host) { + var repl = domainMap[host]; + return repl ? prot + repl : match; + }); + } + + if (!url) { + log('info/url-deny', urlOrig); + } else if (urlOrig !== url) { + log('info/url-fix', {'req': urlOrig, 'repl': url}); + } else { + log('info/url-ok', urlOrig); + } + return url; + }; +} + + +/** + * Parse and validate request parameters + */ +function validateRequest(state) { + + var start = Date.now(); + + var p = state.request.params, + host = p.host, + title = p.title, + revid = p.revid, + id = p.id; + + state.log = p; // log all parameters of the request + + state.apiRequest = { + format: 'json', + action: 'query', + prop: 'pageprops', + ppprop: 'graph_specs', + continue: '' + }; + + if (revid) { + if (!/^[0-9]+$/.test(revid)) { + // must be a non-negative integer + throw new Err('info/param-revid', 'req.revid'); + } + revid = parseInt(revid); + } + + if (revid) { + state.apiRequest.revids = revid; + } else if (title) { + if (title.indexOf('|') > -1) { + throw new Err('info/param-title', 'req.title'); + } + state.apiRequest.titles = title; + } else { + throw new Err('info/param-page', 'req.page'); + } + + if (!/^[0-9a-f]+$/.test(id)) { + throw new Err('info/param-id', 'req.id'); + } + state.graphId = id; + + var parts = serverRe.exec(host); + if (!parts) { + throw new Err('info/param-host', 'req.host'); + } + // Remove optional part #2 from host (makes m. links appear as desktop to optimize cache) + // 1 2 3 + // en.m.wikipedia.org + var host2 = parts[3]; + if (parts[1]) { + host2 = parts[1] + host2; + } + host2 = (domainMap && domainMap[host2]) || host2; + + state.host = host2; + state.apiUrl = defaultProtocol + '//' + host2 + '/w/api.php'; + if (host !== host2) { + state.log.backend = host2; + } + + metrics.endTiming('req.time', start); + + return state; +} + +/** + * Retrieve graph specifications from the host + * @param state is the object with the current state of the request processing + */ +function downloadGraphDef(state) { + + var startDefDownload = Date.now(); + state.log.calls = []; + + // http://stackoverflow.com/questions/24660096/correct-way-to-write-loops-for-promise + var loopAsync = Promise.method(function (action, condition, value) { + var req = condition(value); + if (req) { + return action(req).then(loopAsync.bind(null, action, condition)); + } + }); + + return loopAsync(function (req) { + + var startApiReq = Date.now(); + state.log.calls.push(req); + var requestOpts = { + uri: state.apiUrl, + query: req, + headers: {'User-Agent': 'graph.ext backend (yurik at wikimedia)'} + }; + return preq(requestOpts) + .then(function (resp) { + metrics.endTiming('host.time', startApiReq); + return resp; + }); + + }, function (apiRes) { + + // If first run, always allow + if (!apiRes) { + return state.apiRequest; + } + + if (apiRes.status !== 200) { + state.log.apiRetStatus = apiRes.status; + throw new Err('error/host-status', 'host.status'); + } + + var res = apiRes.body; + if (res.hasOwnProperty('error')) { + state.log.apiRetError = res.error; + throw new Err('error/host-error', 'host.error'); + } + + if (res.hasOwnProperty('warnings')) { + state.log.apiWarning = res.warnings; + log('warn/host-warning', state.log); + // Warnings are usually safe to continue + } + + if (res.hasOwnProperty('query') && res.query.hasOwnProperty('pages')) { + var pages = res.query.pages, + graphData = null; + + Object.getOwnPropertyNames(pages).some(function (k) { + var page = pages[k]; + if (page.hasOwnProperty('pageprops') && page.pageprops.hasOwnProperty('graph_specs')) { + var gs = JSON.parse(page.pageprops.graph_specs); + if (gs.hasOwnProperty(state.graphId)) { + graphData = gs[state.graphId]; + return true; + } + } + return false; + }); + + if (graphData) { + state.graphData = graphData; + return false; // found needed result + } + } + if (res.hasOwnProperty('continue')) { + return merge(state.apiRequest, res.continue); + } + throw new Err('info/host-no-graph', 'host.no-graph'); + + }).then(function () { + metrics.endTiming('host.total', startDefDownload); + return state; + }); +} + +function renderOnCanvas(state) { + return new Promise(function (fulfill, reject){ + if (!vega) { + // If vega is down, keep reporting it + throw new Err('fatal/vega', 'vega.missing'); + } + + var start = Date.now(); + + // BUG: see comment above at vega.data.load.sanitizeUrl = ... + // In case of non-absolute URLs, use requesting host as "local" + //vega.config.baseURL = defaultProtocol + '//' + state.host; + + vega.headless.render({spec: state.graphData, renderer: 'canvas'}, function (err, result) { + if (err) { + state.log.vegaerr = err; + reject(new Err('error/vega', 'vega.error')); + } else { + var stream = result.canvas.pngStream(); + state.response.status(200).type('png'); + stream.on('data', function (chunk) { + state.response.write(chunk); + }); + stream.on('end', function () { + state.response.end(); + metrics.endTiming('vega.time', start); + fulfill(state); + }); + } + }); + }); +} + +/** + * Main entry point for graphoid + */ +router.get('/:host/:title/:revid/:id.png', function(req, res) { + + var start = Date.now(); + var state = {request: req, response: res}; + + var render = Promise + .resolve(state) + .then(validateRequest) + .then(downloadGraphDef) + .then(renderOnCanvas); + + failOnTimeout(render, timeout) + .then(function () { + + // SUCCESS + // For now, record everything, but soon we should scale it back + log('info/ok', state.log); + metrics.endTiming('total.time', start); + + },function (reason) { + + // FAILURE + var l = state.log; + var msg = 'error/unknown', + mx = 'error.unknown'; + + if (reason instanceof Err) { + l = merge(reason, l); + msg = reason.message; + mx = reason.metrics; + delete l.message; + delete l.metrics; + } else if (reason !== null && typeof reason === 'object') { + l = merge(reason, l); + } else { + l.exception = reason; + } + + res.status(400).json(msg); + metrics.increment(mx); + log(msg, l); + }); +}); + + +module.exports = function(app) { + + init(app); + + return { + path: '/v1', + router: router + }; +}; + + /* * Utility functions */ + +function Err(message, metrics) { + this.message = message; + this.metrics = metrics; +} +Err.prototype = Object.create(Error.prototype); +Err.prototype.constructor = Err; // NOTE: there are a few libraries that do this function merge() { @@ -111,191 +424,29 @@ function failOnTimeout(promise, time) { return time <= 0 ? promise : Promise.race([promise, delay(time).then(function () { - throw 'Operation timed out'; + throw 'timeout'; // we later compare on this value })]); } /** - * Parse and validate request parameters + * When enabled, logs metrics functions calls + * @param obj + * @returns {{increment: *, endTiming: *}} */ -function validateRequest(state) { - - var p = state.request.params, - server = p.server, - title = p.title, - revid = p.revid, - id = p.id; - - state.apiRequest = { - format: 'json', - action: 'query', - prop: 'pageprops', - ppprop: 'graph_specs', - continue: '' - }; - - if (revid) { - if (!/^[0-9]+$/.test(revid)) { - // must be a non-negative integer - throw 'bad revid param'; - } - revid = parseInt(revid); - } - - if (revid) { - state.apiRequest.revids = revid; - } else if (title) { - if (title.indexOf('|') > -1) { - throw 'bad title param'; - } - state.apiRequest.titles = title; - } else { - throw 'no revid or title given'; - } - - if (!/^[0-9a-f]+$/.test(id)) { - throw 'bad id param'; - } - state.graphId = id; - - // Remove optional part #2 from host (makes m. links appear as desktop to optimize cache) - // 1 2 3 - // en.m.wikipedia.org - var srvParts = serverRe.exec(server); - if (!srvParts) { - throw 'bad server param'; - } - server = (srvParts[1] || '') + srvParts[3]; - - state.server = domainMap[server] || server; - state.apiUrl = defaultProtocol + '//' + server + '/w/api.php'; - - return state; -} - -/** - * Retrieve graph specifications from the server - * @param state is the object with the current state of the request processing - */ -function getSpec(state) { - - var callApiInt; - - var processResult = function (apiRes) { - if (apiRes.status !== 200) { - throw 'API result error code ' + apiRes.status; - } - var res = apiRes.body; - if (res.hasOwnProperty('error')) { - throw 'API result error: ' + JSON.stringify(res.error); - } - - if (res.hasOwnProperty('warnings')) { - console.error('API warning: ' + JSON.stringify(res.warnings) + - ' from ' + state.server + JSON.stringify(state.apiRequest)); - } - if (res.hasOwnProperty('query') && res.query.hasOwnProperty('pages')) { - var pages = res.query.pages, - graphData = null; - - Object.getOwnPropertyNames(pages).some(function (k) { - var page = pages[k]; - if (page.hasOwnProperty('pageprops') && page.pageprops.hasOwnProperty('graph_specs')) { - var gs = JSON.parse(page.pageprops.graph_specs); - if (gs.hasOwnProperty(state.graphId)) { - graphData = gs[state.graphId]; - return true; - } - } - return false; - }); - - if (graphData) { - state.graphData = graphData; - return state; - } - } - if (res.hasOwnProperty('continue')) { - callApiInt(state.apiUrl, merge(state.apiRequest, res.continue)); - } - throw 'Unable to find graph_specs with the given id'; - }; - - callApiInt = function(url, req) { - var reqOpts = { - uri: url, - query: req, - headers: { - 'User-Agent': 'graph.ext backend (yurik at wikimedia)' - } +function wrapMetrics(obj) { + function logWrap(name){ + return function(){ + console.log(name + JSON.stringify([].slice.call(arguments))); + return obj[name].apply(obj, arguments); }; - return preq(reqOpts) - .then(processResult) - .catch(function (reason) { - delete reqOpts.headers; - console.error('API call failed: ' + state.server + JSON.stringify(state.apiRequest)); - throw reason; // re-throw - }); - }; - - return callApiInt(state.apiUrl, state.apiRequest); -} - -function renderOnCanvas(state) { - return new Promise(function (fulfill, reject){ - if (!vega) { - throw 'Unable to load Vega npm module'; - } - - // In case of non-absolute URLs, use requesting server as "local" - vega.config.baseURL = defaultProtocol + '//' + state.server; - - vega.headless.render({spec: state.graphData, renderer: 'canvas'}, function (err, result) { - if (err) { - reject(err); - } else { - var stream = result.canvas.pngStream(); - state.response.status(200).type('png'); - stream.on('data', function (chunk) { - state.response.write(chunk); - }); - stream.on('end', function () { - state.response.end(); - fulfill(state); - }); + } + var result = {}; + for (var id in obj) { + try { + if (typeof(obj[id]) == "function") { + result[id] = logWrap(id); } - }); - }); + } catch (err) {} + } + return result; } - -/** - * Main entry point for graphoid - */ -router.get('/:server/:title/:revid/:id.png', function(req, res) { - - var render = Promise - .resolve({request: req, response: res}) - .then(validateRequest) - .then(getSpec) - .then(renderOnCanvas); - - failOnTimeout(render, timeout) - .catch(function (reason) { - console.error('Failed ' + JSON.stringify(req.params) + ' ' + reason); - if (reason.hasOwnProperty('stack')) { - console.error(reason.stack); - } - res.status(400).json(reason); - }); -}); - - -module.exports = function(app) { - - init(app.conf); - - return { - path: '/v1', - router: router - }; -}; -- To view, visit https://gerrit.wikimedia.org/r/192273 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I34cfbe6ac7b6ea57cc346f280ef4d2b5bf3c4457 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/graphoid Gerrit-Branch: master Gerrit-Owner: Yurik <yu...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits