Marcoil has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/192790

Change subject: Dissect, a script to measure Parsoid HTML sizes
......................................................................

Dissect, a script to measure Parsoid HTML sizes

Change-Id: I4a892ea499c3666b92f20b7b129ec5c00b8c2251
---
A tests/dissect.js
1 file changed, 202 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/90/192790/1

diff --git a/tests/dissect.js b/tests/dissect.js
new file mode 100644
index 0000000..0f6f42b
--- /dev/null
+++ b/tests/dissect.js
@@ -0,0 +1,202 @@
+/*
+ * Fetch pages from parsoid and output statistics on them.
+ */
+
+"use strict";
+require( '../lib/core-upgrade.js' );
+
+var    request = require('request'),
+       yargs = require('yargs'),
+       domino = require('domino'),
+       url = require('url'),
+       zlib = require('zlib'),
+       JSUtils = require('../lib/jsutils.js').JSUtils,
+       Util = require('../lib/mediawiki.Util.js').Util,
+       DU = require('../lib/mediawiki.DOMUtils.js').DOMUtils;
+
+var outputLenghts = function (prefix, separator, titles, lengths) {
+       var out = prefix;
+       // Sort the keys so we always get them in some order
+       Object.keys(lengths).sort().forEach(function (l) {
+               out += separator;
+               if (titles) {
+                       out += l + ":";
+               }
+               out += lengths[l];
+       });
+       console.log(out);
+};
+
+var outputText = function (results) {
+       results.forEach(function (r) {
+               console.log("%s:%s", r.prefix, r.title);
+               if (r.err) {
+                       console.log("\tError:", r.err);
+               } else {
+                       outputLenghts("\tno compression:", "\t", true, 
r.uncompressed);
+                       outputLenghts("\tgzipped:", "\t", true, r.gzipped);
+               }
+       });
+};
+
+var outputCSV = function (results) {
+       var header = "title";
+       Object.keys(results[0].uncompressed).sort().forEach(function (column) {
+               header += "," + column;
+       });
+       console.log(header);
+       results.forEach(function (r) {
+               if (!r.err) {
+                       outputLenghts(r.prefix + ":" + r.title,
+                               ",", false, r.uncompressed);
+                       outputLenghts(r.prefix + ":" + r.title + " gzipped",
+                               ",", false, r.gzipped);
+               }
+       });
+};
+
+var extractParts = function (htmlText, parts) {
+       var doc = domino.createDocument(htmlText),
+               result = {
+                       full: htmlText
+               };
+
+       // Go over the whole tree and extract an attribute, moving it into an 
array
+       var extractAttribute = function(out, attribute, node) {
+               if (!DU.isElt(node)) {
+                       return;
+               }
+               if (node.hasAttribute(attribute)) {
+                       out.push(node.getAttribute(attribute));
+                       node.removeAttribute(attribute);
+               }
+               if (node.hasChildNodes()) {
+                       var children = node.childNodes;
+                       for (var c = 0; c < children.length; c++) {
+                               extractAttribute(out, attribute, children[c]);
+                       }
+               }
+       };
+
+       parts.forEach(function(part) {
+               var buffer = [];
+               extractAttribute(buffer, part, doc.body);
+               result[part] = JSON.stringify(buffer);
+       });
+       result.stripped = doc.outerHTML;
+       return result;
+};
+
+var fetch = function (parsoidURL, prefix, title) {
+       var options = {
+                       uri: parsoidURL + prefix + '/' + 
encodeURIComponent(title),
+                       followRedirect: true
+               };
+       return new Promise(function (resolve, reject) {
+               request.get(options.uri, function(err, res, body) {
+                       if (!err && res.statusCode === 200) {
+                               resolve(body);
+                       } else {
+                               reject(err);
+                       }
+               });
+       });
+};
+
+if ( !module.parent ) {
+       var standardOpts = Util.addStandardOptions({
+               'prefix': {
+                       description: 'Which wiki prefix to use; e.g. "enwiki" 
for English wikipedia, "eswiki" for Spanish, "mediawikiwiki" for mediawiki.org',
+                       'default': 'enwiki'
+               },
+               'parsoidURL': {
+                       description: 'The URL for the Parsoid API',
+               },
+               'file': {
+                       alias: 'f',
+                       description: "A JSON file with the pages to be 
dissected"
+               },
+               'csv': {
+                       description: "Output in CSV format",
+                       'boolean': true,
+                       'default': false
+               }
+       });
+
+       var opts = yargs.usage(
+               'Usage: $0 [options] <page-title> \n\n',
+               standardOpts
+       ).check(Util.checkUnknownArgs.bind(null, standardOpts));
+
+       var argv = opts.argv;
+       var title = argv._[0];
+       if (!title && !argv.file) {
+               opts.showHelp();
+               return;
+       }
+
+       var results = [];
+       if (title) {
+               results.push({
+                       prefix: argv.prefix,
+                       title: title,
+                       uncompressed: {},
+                       gzipped: {}
+               });
+       } else {
+               var titles = require(argv.file);
+               titles.forEach(function (t) {
+                       results.push({
+                               prefix: argv.prefix,
+                               title: t,
+                               uncompressed: {},
+                               gzipped: {}
+                       });
+               });
+       }
+
+       var p = new Promise(function (resolve) {
+               if (argv.parsoidURL) {
+                       resolve(argv.parsoidURL);
+               } else {
+                       // Start our own Parsoid server
+                       var apiServer = require('./apiServer.js');
+                       apiServer.exitOnProcessTerm();
+                       apiServer.startParsoidServer({quiet: 
true}).then(function (ret) {
+                               resolve(ret.url);
+                       });
+               }
+       })
+       .then(function (parsoidURL) {
+               return Promise.all(results.map(function (r) {
+                       return fetch(parsoidURL, argv.prefix, r.title)
+                       .then(function (htmlText) {
+                               var parts = extractParts(htmlText, ['data-mw', 
'data-parsoid']),
+                                       promisifiedGzip = 
Promise.promisify(zlib.gzip);
+                               return 
Promise.all(Object.keys(parts).map(function (part) {
+                                       r.uncompressed[part] = 
parts[part].length;
+
+                                       // Compress
+                                       return promisifiedGzip(parts[part])
+                                               .then(function (gzipped) {
+                                                       r.gzipped[part] = 
gzipped.length;
+                                               }, function (err) {
+                                                       r.err = err;
+                                               });
+                               })).then(function() {
+                                       return r;
+                               });
+                       }, function (err) {
+                               r.err = err;
+                               return r;
+                       });
+               }));
+       })
+       .then(argv.csv ? outputCSV : outputText)
+       .then(process.exit)
+       .catch(function (err) {
+               console.warn(err);
+               console.warn(err.stack);
+               process.exit(-1);
+       });
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/192790
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I4a892ea499c3666b92f20b7b129ec5c00b8c2251
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Marcoil <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to