Marcoil has uploaded a new change for review.
https://gerrit.wikimedia.org/r/192790
Change subject: Dissect, a script to measure Parsoid HTML sizes
......................................................................
Dissect, a script to measure Parsoid HTML sizes
Change-Id: I4a892ea499c3666b92f20b7b129ec5c00b8c2251
---
A tests/dissect.js
1 file changed, 202 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/90/192790/1
diff --git a/tests/dissect.js b/tests/dissect.js
new file mode 100644
index 0000000..0f6f42b
--- /dev/null
+++ b/tests/dissect.js
@@ -0,0 +1,202 @@
+/*
+ * Fetch pages from parsoid and output statistics on them.
+ */
+
+"use strict";
+require( '../lib/core-upgrade.js' );
+
+var request = require('request'),
+ yargs = require('yargs'),
+ domino = require('domino'),
+ url = require('url'),
+ zlib = require('zlib'),
+ JSUtils = require('../lib/jsutils.js').JSUtils,
+ Util = require('../lib/mediawiki.Util.js').Util,
+ DU = require('../lib/mediawiki.DOMUtils.js').DOMUtils;
+
+/**
+ * Print a single line of measurements to stdout.
+ *
+ * The line starts with `prefix`; every entry of `lengths` is then appended,
+ * each one preceded by `separator`. Entries are emitted in sorted key order
+ * so that every call lists its columns in the same order.
+ *
+ * @param {string} prefix Text the line starts with.
+ * @param {string} separator Text inserted before every value.
+ * @param {boolean} titles When truthy, each value is written as "key:value".
+ * @param {Object} lengths Map from measurement name to its value.
+ */
+var outputLenghts = function (prefix, separator, titles, lengths) {
+	var line = Object.keys(lengths).sort().reduce(function (soFar, key) {
+		var withSeparator = soFar + separator;
+		if (titles) {
+			withSeparator += key + ":";
+		}
+		return withSeparator + lengths[key];
+	}, prefix);
+	console.log(line);
+};
+
+/**
+ * Print the results in a human-readable, tab-indented text layout.
+ *
+ * Each page gets a "prefix:title" heading, followed either by its error or
+ * by one line of uncompressed sizes and one line of gzipped sizes.
+ *
+ * @param {Object[]} results Records with `prefix`, `title`, and either `err`
+ *   or the `uncompressed` and `gzipped` measurement maps.
+ */
+var outputText = function (results) {
+	var printPage = function (page) {
+		console.log("%s:%s", page.prefix, page.title);
+		if (page.err) {
+			console.log("\tError:", page.err);
+			return;
+		}
+		outputLenghts("\tno compression:", "\t", true, page.uncompressed);
+		outputLenghts("\tgzipped:", "\t", true, page.gzipped);
+	};
+	results.forEach(printPage);
+};
+
+/**
+ * Print the results as CSV: a header row, then two rows per successfully
+ * measured page (uncompressed sizes first, gzipped sizes second). Pages
+ * that carry an `err` are left out.
+ *
+ * @param {Object[]} results Records with `prefix`, `title`, and either `err`
+ *   or the `uncompressed` and `gzipped` measurement maps. May be empty.
+ */
+var outputCSV = function (results) {
+	// Quote a field that contains a comma, a double quote or a line break
+	// (doubling embedded quotes), so that a title such as "Washington, D.C."
+	// stays in a single column.
+	var csvField = function (value) {
+		var text = String(value);
+		if (!(/[",\r\n]/).test(text)) {
+			return text;
+		}
+		return '"' + text.replace(/"/g, '""') + '"';
+	};
+
+	// Only pages without an error have measurements to report.
+	var measured = results.filter(function (r) {
+		return !r.err;
+	});
+
+	// Take the column names from the first measured page; an empty result
+	// list or a failed first page must neither crash nor empty the header.
+	var header = "title";
+	if (measured.length > 0) {
+		Object.keys(measured[0].uncompressed).sort().forEach(function (column) {
+			header += "," + column;
+		});
+	}
+	console.log(header);
+
+	measured.forEach(function (r) {
+		var name = r.prefix + ":" + r.title;
+		outputLenghts(csvField(name), ",", false, r.uncompressed);
+		outputLenghts(csvField(name + " gzipped"), ",", false, r.gzipped);
+	});
+};
+
+/**
+ * Split a page's HTML into the pieces whose sizes are to be measured.
+ *
+ * Every attribute named in `parts` is removed from <body> and the elements
+ * below it, and its values are collected in document order.
+ *
+ * @param {string} htmlText The HTML of the page.
+ * @param {string[]} parts Names of the attributes to pull out.
+ * @return {Object} `full` holds the untouched input, each attribute name maps
+ *   to the JSON-encoded array of the values that were removed, and `stripped`
+ *   holds the document serialized after the removal.
+ */
+var extractParts = function (htmlText, parts) {
+	var doc = domino.createDocument(htmlText);
+	var result = { full: htmlText };
+
+	// Visit `root` and its descendants in pre-order, detaching `attribute`
+	// wherever it is present and returning the detached values. Nodes for
+	// which DU.isElt() is false are skipped.
+	var pullAttribute = function (attribute, root) {
+		var values = [];
+		var pending = [ root ];
+		var node, kids, i;
+		while (pending.length > 0) {
+			node = pending.pop();
+			if (!DU.isElt(node)) {
+				continue;
+			}
+			if (node.hasAttribute(attribute)) {
+				values.push(node.getAttribute(attribute));
+				node.removeAttribute(attribute);
+			}
+			// Push the children last-to-first so that the first child is
+			// the next node popped, which keeps the walk in document order.
+			kids = node.childNodes;
+			for (i = kids.length - 1; i >= 0; i--) {
+				pending.push(kids[i]);
+			}
+		}
+		return values;
+	};
+
+	parts.forEach(function (part) {
+		result[part] = JSON.stringify(pullAttribute(part, doc.body));
+	});
+	result.stripped = doc.outerHTML;
+	return result;
+};
+
+/**
+ * Fetch the HTML of one page from a Parsoid API.
+ *
+ * @param {string} parsoidURL Base URL of the Parsoid API. The request URI is
+ *   built by appending "prefix/title" directly to it.
+ *   NOTE(review): this presumably requires a trailing slash -- confirm.
+ * @param {string} prefix Wiki prefix placed in the request path.
+ * @param {string} title Page title; it is URI-encoded before being appended.
+ * @return {Promise} Resolves with the response body when the status code is
+ *   200. Rejects with the transport error, or with an Error naming the
+ *   status code and URI for any other response.
+ */
+var fetch = function (parsoidURL, prefix, title) {
+	var options = {
+		uri: parsoidURL + prefix + '/' + encodeURIComponent(title),
+		followRedirect: true
+	};
+	return new Promise(function (resolve, reject) {
+		// Hand over the whole options object so followRedirect is applied,
+		// not just the URI.
+		request.get(options, function (err, res, body) {
+			if (err) {
+				reject(err);
+				return;
+			}
+			if (res.statusCode !== 200) {
+				// There is no transport error in this case, so build one;
+				// rejecting with a falsy value would look like a success
+				// to code that checks for a truthy error.
+				reject(new Error('Got HTTP status ' + res.statusCode +
+					' for ' + options.uri));
+				return;
+			}
+			resolve(body);
+		});
+	});
+};
+
+// Command-line entry point; skipped when this file is loaded by another module.
+if ( !module.parent ) {
+	var standardOpts = Util.addStandardOptions({
+		'prefix': {
+			description: 'Which wiki prefix to use; e.g. "enwiki" for English wikipedia, "eswiki" for Spanish, "mediawikiwiki" for mediawiki.org',
+			'default': 'enwiki'
+		},
+		'parsoidURL': {
+			description: 'The URL for the Parsoid API',
+		},
+		'file': {
+			alias: 'f',
+			description: "A JSON file with the pages to be dissected"
+		},
+		'csv': {
+			description: "Output in CSV format",
+			'boolean': true,
+			'default': false
+		}
+	});
+
+	var opts = yargs.usage(
+		'Usage: $0 [options] <page-title> \n\n',
+		standardOpts
+	).check(Util.checkUnknownArgs.bind(null, standardOpts));
+
+	var argv = opts.argv;
+	var title = argv._[0];
+	if (!title && !argv.file) {
+		opts.showHelp();
+		return;
+	}
+
+	// A title given on the command line takes precedence over --file.
+	var pageTitles;
+	if (title) {
+		pageTitles = [ title ];
+	} else {
+		// Resolve the path against the working directory first: require()
+		// would otherwise resolve a relative path against this script's
+		// own directory.
+		var path = require('path');
+		pageTitles = require(path.resolve(argv.file));
+	}
+
+	// One record per page; the measurement maps are filled in as the
+	// sizes become known.
+	var results = pageTitles.map(function (pageTitle) {
+		return {
+			prefix: argv.prefix,
+			title: pageTitle,
+			uncompressed: {},
+			gzipped: {}
+		};
+	});
+
+	var promisifiedGzip = Promise.promisify(zlib.gzip);
+
+	// Fetch one page and record the size of each of its parts in `r`.
+	// A fetch or gzip failure is stored in `r.err` instead of rejecting, so
+	// one bad page does not abort the others. Always resolves with `r`.
+	var measurePage = function (parsoidURL, r) {
+		return fetch(parsoidURL, argv.prefix, r.title)
+		.then(function (htmlText) {
+			var parts = extractParts(htmlText, ['data-mw', 'data-parsoid']);
+			return Promise.all(Object.keys(parts).map(function (part) {
+				// Count bytes rather than UTF-16 code units, so the number
+				// is comparable with the gzipped size, which is a byte count.
+				r.uncompressed[part] = Buffer.byteLength(parts[part], 'utf8');
+
+				// Compress
+				return promisifiedGzip(parts[part])
+				.then(function (gzipped) {
+					r.gzipped[part] = gzipped.length;
+				}, function (err) {
+					r.err = err;
+				});
+			})).then(function () {
+				return r;
+			});
+		}, function (err) {
+			// Guarantee a truthy error so the page is reported as failed
+			// even when the rejection carried no value.
+			r.err = err || new Error('Could not fetch ' + r.prefix + ':' + r.title);
+			return r;
+		});
+	};
+
+	Promise.resolve()
+	.then(function () {
+		if (argv.parsoidURL) {
+			return argv.parsoidURL;
+		}
+		// Start our own Parsoid server. Returning its promise lets a
+		// start-up failure reach the catch handler below instead of
+		// leaving the chain pending forever.
+		var apiServer = require('./apiServer.js');
+		apiServer.exitOnProcessTerm();
+		return apiServer.startParsoidServer({quiet: true})
+		.then(function (ret) {
+			return ret.url;
+		});
+	})
+	.then(function (parsoidURL) {
+		return Promise.all(results.map(function (r) {
+			return measurePage(parsoidURL, r);
+		}));
+	})
+	.then(argv.csv ? outputCSV : outputText)
+	.then(function () {
+		process.exit(0);
+	})
+	.catch(function (err) {
+		console.warn(err);
+		// Not every rejection value is an Error with a stack.
+		if (err && err.stack) {
+			console.warn(err.stack);
+		}
+		process.exit(-1);
+	});
+}
--
To view, visit https://gerrit.wikimedia.org/r/192790
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I4a892ea499c3666b92f20b7b129ec5c00b8c2251
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Marcoil <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits