Marcoil has uploaded a new change for review.
https://gerrit.wikimedia.org/r/177590
Change subject: Add API to retrieve JSON-serialized token stream
......................................................................
Add API to retrieve JSON-serialized token stream
This will be used to parallelize template expansion.
Also add a new option to parse.js, --wt2tokens.
Change-Id: Iedcfba28daf7ed226a10b96f651aedee1f81210a
---
M api/ParsoidService.js
M api/routes.js
A lib/ext.core.TokenBuilder.js
M lib/mediawiki.parser.js
M tests/mocha/api.js
M tests/parse.js
6 files changed, 101 insertions(+), 8 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/90/177590/1
diff --git a/api/ParsoidService.js b/api/ParsoidService.js
index 6de81c1..956a3d8 100644
--- a/api/ParsoidService.js
+++ b/api/ParsoidService.js
@@ -89,6 +89,7 @@
app.get( re('/_rtselser/(' + iwRe + ')/(.*)'), i, p,
routes.roundtripSelser );
app.get( re('/_rtform/(?:(' + iwRe + ')/(.*))?'), i, p,
routes.get_rtForm );
app.post( re('/_rtform/(?:(' + iwRe + ')/(.*))?'), i, p,
routes.post_rtForm );
+ app.post( re('/_tokens/(?:(' + iwRe + ')/(.*))?'), i, p,
routes.post_tokens );
app.get( re('/(' + iwRe + ')/(.*)'), i, p, routes.get_article );
app.post( re('/(' + iwRe + ')/(.*)'), i, p, routes.post_article );
diff --git a/api/routes.js b/api/routes.js
index 7182a66..fa2d623 100644
--- a/api/routes.js
+++ b/api/routes.js
@@ -277,6 +277,8 @@
html: DU.serializeNode( res.local('body') ?
doc.body : doc ),
"data-parsoid": JSON.parse(dp.text)
});
+ } else if (res.local("format") === "tokens") {
+ apiUtils.jsonResponse(res, env, doc);
} else {
apiUtils.setHeader(res, env, 'Content-Type',
'text/html; charset=UTF-8');
apiUtils.endResponse(res, env, DU.serializeNode(
res.local('body') ? doc.body : doc ));
@@ -291,7 +293,9 @@
env.page.name = '';
}
return new Promise(function( resolve, reject ) {
- var parser =
env.pipelineFactory.getPipeline('text/x-mediawiki/full');
+ var pipelineType = (res.local("format") === "tokens")?
+ 'text/x-mediawiki/tokens' :
'text/x-mediawiki/full',
+ parser =
env.pipelineFactory.getPipeline(pipelineType);
parser.once('document', function( doc ) {
// Don't cache requests when wt is set in case somebody uses
// GET for wikitext parsing
@@ -611,6 +615,16 @@
});
};
+routes.post_tokens = function (req, res) {
+ var body = req.body;
+ if (req.body.wt) {
+ res.local("format", "tokens");
+ wt2html(req, res, body.wt);
+ } else {
+ res.local('env').log("fatal/request", "Can't post to tokens without wt");
+ }
+};
+
routes.get_article = function( req, res ) {
// Regular article parsing
wt2html( req, res );
diff --git a/lib/ext.core.TokenBuilder.js b/lib/ext.core.TokenBuilder.js
new file mode 100644
index 0000000..e10e305
--- /dev/null
+++ b/lib/ext.core.TokenBuilder.js
@@ -0,0 +1,43 @@
+"use strict";
+
+/* This class just accumulates tokens into an array and outputs them as a
+ * finished document. Intended to be used to provide direct output of tokens
+ * for internal processing.
+ */
+
+var events = require('events'),
+ util = require('util');
+
+function TokenBuilder(env) {
+ this.env = env;
+ this.resetState();
+}
+
+util.inherits(TokenBuilder, events.EventEmitter);
+
+TokenBuilder.prototype.addListenersOn = function (emitter) {
+ emitter.addListener('chunk', this.onChunk.bind(this));
+ emitter.addListener('end', this.onEnd.bind(this));
+};
+
+TokenBuilder.prototype.setPipelineId = function (id) {
+ this.pipelineId = id;
+};
+
+TokenBuilder.prototype.resetState = function () {
+ this.out = [];
+};
+
+TokenBuilder.prototype.onChunk = function (tokens) {
+ this.out = this.out.concat(tokens);
+};
+
+TokenBuilder.prototype.onEnd = function () {
+ this.emit('document', this.out);
+ this.emit('end');
+ this.resetState();
+};
+
+if (typeof module === "object") {
+ module.exports.TokenBuilder = TokenBuilder;
+}
diff --git a/lib/mediawiki.parser.js b/lib/mediawiki.parser.js
index 6859cac..31b65ce 100644
--- a/lib/mediawiki.parser.js
+++ b/lib/mediawiki.parser.js
@@ -37,7 +37,8 @@
BehaviorSwitchPreprocessor = BehaviorSwitch.BehaviorSwitchPreprocessor,
DOMFragmentBuilder =
require('./ext.core.DOMFragmentBuilder.js').DOMFragmentBuilder,
TreeBuilder =
require('./mediawiki.HTML5TreeBuilder.node.js').TreeBuilder,
- DOMPostProcessor =
require('./mediawiki.DOMPostProcessor.js').DOMPostProcessor;
+ DOMPostProcessor =
require('./mediawiki.DOMPostProcessor.js').DOMPostProcessor,
+ TokenBuilder = require('./ext.core.TokenBuilder.js').TokenBuilder;
var ParserPipeline; // forward declaration
var globalPipelineId = 0;
@@ -130,9 +131,7 @@
]
],
- // Final stages of main pipeline, operating on fully expanded tokens of
- // potentially mixed origin.
- 'tokens/x-mediawiki/expanded': [
+ 'tokens/x-mediawiki/transformed': [
// Synchronous in-order on fully expanded token stream (including
// expanded templates etc). In order to support mixed input (from
// wikitext and plain HTML, say) all applicable transforms need to be
@@ -159,8 +158,13 @@
// require the existence of p-tags for its functioning.
ParagraphWrapper // 2.95 -- 2.97
]
- ],
+ ]
+ ],
+ // Final stages of main pipeline, operating on fully expanded tokens of
+ // potentially mixed origin.
+ 'tokens/x-mediawiki/expanded': [
+ 'tokens/x-mediawiki/transformed',
// Build a tree out of the fully processed token stream
[ TreeBuilder, [] ],
@@ -174,6 +178,14 @@
* (Template wrapping, broken wikitext/html detection, etc.)
*/
[ DOMPostProcessor, [] ]
+ ],
+
+ 'text/x-mediawiki/tokens': [
+ [ PegTokenizer, [] ],
+ 'tokens/x-mediawiki',
+ 'tokens/x-mediawiki/transformed',
+ // Concatenate all tokens and pass them as the result.
+ [ TokenBuilder, [] ]
]
};
diff --git a/tests/mocha/api.js b/tests/mocha/api.js
index 19b8975..95f5d24 100644
--- a/tests/mocha/api.js
+++ b/tests/mocha/api.js
@@ -63,4 +63,17 @@
.expect(200)
.expect(/^<body/, done);
});
+
+ it("supports _token API end-point", function (done) {
+ request(api)
+ .post('/_tokens/localhost/Main_Page')
+ .send({wt: "foo"})
+ .expect(200)
+ .expect(function (res) {
+ var tokens = res.body;
+ tokens[1].should.equal('foo');
+ tokens.last().type.should.equal('EOFTk');
+ })
+ .end(done);
+ });
});
diff --git a/tests/parse.js b/tests/parse.js
index 659325a..86981c7 100755
--- a/tests/parse.js
+++ b/tests/parse.js
@@ -47,6 +47,11 @@
'boolean': true,
'default': false
},
+ 'wt2tokens': {
+ description: 'Wikitext -> Tokens',
+ 'boolean': true,
+ 'default': false
+ },
'selser': {
description: 'Use the selective serializer to go from HTML to Wikitext.',
'boolean': true,
@@ -144,7 +149,9 @@
startsAtWikitext = function( argv, env, input ) {
return new Promise(function( resolve ) {
- var parser =
env.pipelineFactory.getPipeline('text/x-mediawiki/full');
+ var pipelineType = argv.wt2tokens ?
+ 'text/x-mediawiki/tokens' :
'text/x-mediawiki/full',
+ parser = env.pipelineFactory.getPipeline(pipelineType);
parser.once( 'document', resolve );
// Kick off the pipeline by feeding the input into the parser pipeline
env.setPageSrcInfo( input );
@@ -163,6 +170,9 @@
} else {
out = DU.serializeNode( doc );
}
+ return { trailingNL: true, out: out };
+ } else if (argv.wt2tokens) {
+ out = JSON.stringify(doc);
return { trailingNL: true, out: out };
} else {
out = DU.serializeNode( doc.body, true );
@@ -289,7 +299,7 @@
}
// Default conversion mode
- if ( !argv.html2wt && !argv.wt2wt && !argv.html2html ) {
+ if ( !argv.html2wt && !argv.wt2wt && !argv.html2html &&
!argv.wt2tokens ) {
argv.wt2html = true;
}
--
To view, visit https://gerrit.wikimedia.org/r/177590
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Iedcfba28daf7ed226a10b96f651aedee1f81210a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Marcoil <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits