Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/49856
Change subject: Yield control to event-loop after tokenizing a top-level block.
......................................................................
Yield control to event-loop after tokenizing a top-level block.
* Currently, the tokenizer parses the whole page at once queueing
up a lot of async actions in one shot. This is not so good for
a couple of reasons:
- Tokenized chunks aren't processed further till the whole page
is parsed.
- This can cause the first template requests issued during
tokenization to time out in some cases, since the request
handler might never get scheduled in time.
- It can result in template requests going out in big bursts
rather than being spread out over the page parse.
* This patch fixes the tokenizer to yield control back to the
event loop after every top-level block. The following fixes
needed to be done:
- It schedules a new tokenization event before yielding.
- It fixes up tsr offsets (on each restart, the source offset
is zero which would be incorrect relative to the full source).
The code for this was extracted out of TokenStreamPatcher,
extended, and moved to mediawiki.Util.js.
- It records sol state before yielding and restores it on
resume (by fixing up the sol production).
* The patch also adds a 'fullParse' mode to the tokenizer where
it does not yield and tokenizes the entire text synchronously.
This mode is used by the serializer and link handler which use
tokenizer instances to parse short substrings and assume the
tokenization to be synchronous. They use their tokenizer
instances in fullParse mode.
* No change in parser test results from this patch.
* Also verified output on a few pages via 'node server'
Change-Id: Ic82e3048b6f2c0d8d12f45bdd75ec351f770e3cd
---
M js/lib/ext.core.LinkHandler.js
M js/lib/ext.core.TokenStreamPatcher.js
M js/lib/mediawiki.TokenTransformManager.js
M js/lib/mediawiki.Util.js
M js/lib/mediawiki.WikitextSerializer.js
M js/lib/mediawiki.parser.environment.js
M js/lib/mediawiki.tokenizer.peg.js
M js/lib/pegTokenizer.pegjs.txt
M js/tests/parserTests.js
9 files changed, 171 insertions(+), 54 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/56/49856/1
diff --git a/js/lib/ext.core.LinkHandler.js b/js/lib/ext.core.LinkHandler.js
index eebc6f5..d115060 100644
--- a/js/lib/ext.core.LinkHandler.js
+++ b/js/lib/ext.core.LinkHandler.js
@@ -21,7 +21,9 @@
if ( !this.urlParser ) {
// Actually the regular tokenizer, but we'll call it with the
// img_options production only.
- WikiLinkHandler.prototype.urlParser = new PegTokenizer();
+ var tokenizer = new PegTokenizer();
+ tokenizer.fullParse = true;
+ WikiLinkHandler.prototype.urlParser = tokenizer;
}
}
@@ -608,7 +610,9 @@
if ( !this.urlParser ) {
// Actually the regular tokenizer, but we'll call it with the
// img_options production only.
- ExternalLinkHandler.prototype.urlParser = new PegTokenizer();
+ var tokenizer = new PegTokenizer();
+ tokenizer.fullParse = true;
+ ExternalLinkHandler.prototype.urlParser = tokenizer;
}
this._reset();
}
diff --git a/js/lib/ext.core.TokenStreamPatcher.js
b/js/lib/ext.core.TokenStreamPatcher.js
index b5a6d99..b086210 100644
--- a/js/lib/ext.core.TokenStreamPatcher.js
+++ b/js/lib/ext.core.TokenStreamPatcher.js
@@ -10,11 +10,13 @@
* -------------------------------------------------------------------- */
"use strict";
-var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer;
+var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
+ Util = require('./mediawiki.Util.js').Util;
function TokenStreamPatcher( manager, options ) {
this.manager = manager;
this.tokenizer = new PegTokenizer();
+ this.tokenizer.fullParse = true;
manager.addTransform(this.onNewline.bind(this),
"TokenStreamPatcher:onNewline", this.nlRank, 'newline');
@@ -55,23 +57,6 @@
};
TokenStreamPatcher.prototype.onAny = function(token, manager) {
-
- function shiftTokenTSR(tokens, offset) {
- // update/clear tsr
- for (var i = 0, n = tokens.length; i < n; i++) {
- var tsr = tokens[i].dataAttribs.tsr;
- if (tsr) {
- if (offset) {
- // console.warn("--updating--");
- tokens[i].dataAttribs.tsr = [tsr[0] +
offset, tsr[1] + offset];
- } else {
- // console.warn("--nulling--");
- tokens[i].dataAttribs.tsr = null;
- }
- }
- }
- }
-
// console.warn("T: " + JSON.stringify(token));
var tokens = [token];
switch (token.constructor) {
@@ -84,7 +69,7 @@
// Reparse string with the
'table_start_tag' production
// and shift tsr of result tokens by
source offset
tokens = this.tokenizer.tokenize(token,
'table_start_tag');
- shiftTokenTSR(tokens, this.srcOffset);
+ Util.shiftTokenTSR(tokens,
this.srcOffset);
} else if (token.match(/^\s*$/)) {
// White-space doesn't change SOL state
// Update srcOffset
diff --git a/js/lib/mediawiki.TokenTransformManager.js
b/js/lib/mediawiki.TokenTransformManager.js
index e2cd36c..4595769 100644
--- a/js/lib/mediawiki.TokenTransformManager.js
+++ b/js/lib/mediawiki.TokenTransformManager.js
@@ -257,7 +257,6 @@
// Reset state between uses
AsyncTokenTransformManager.prototype.reset = function() {
this.tailAccumulator = null;
- // initial top-level callback, emits chunks
this.tokenCB = this.emitChunk.bind( this );
};
diff --git a/js/lib/mediawiki.Util.js b/js/lib/mediawiki.Util.js
index 09efccb..8ebb0fe 100644
--- a/js/lib/mediawiki.Util.js
+++ b/js/lib/mediawiki.Util.js
@@ -191,6 +191,73 @@
return true;
},
+ shiftTokenTSR: function(tokens, offset) {
+ // offset should either be a valid number or null
+ if (offset === undefined) {
+ offset = null;
+ }
+
+ // update/clear tsr
+ for (var i = 0, n = tokens.length; i < n; i++) {
+ var t = tokens[i];
+ switch (t.constructor) {
+ case TagTk:
+ case SelfclosingTagTk:
+ case NlTk:
+ case CommentTk:
+ case EndTagTk:
+ var da = tokens[i].dataAttribs;
+ var tsr = da.tsr;
+ if (tsr) {
+ if (offset !== null) {
+ da.tsr = [tsr[0] +
offset, tsr[1] + offset];
+ } else {
+ da.tsr = null;
+ }
+ }
+
+ // SSS FIXME: offset will always be
available in
+ // chunky-tokenizer mode in which case
we wont have
+ // buggy offsets below. The null
scenario is only
+ // for when the token-stream-patcher
attempts to
+ // reparse a string -- it is likely to
only patch up
+ // small string fragments and the
complicated use cases
+ // below should not materialize.
+
+ // target offset
+ if (offset && da.targetOff) {
+ da.targetOff += offset;
+ }
+
+ // Process attributes
+ if (t.attribs) {
+ for (var j = 0, m =
t.attribs.length; j < m; j++) {
+ var a = t.attribs[j];
+ if (a.k.constructor ===
Array) {
+
this.shiftTokenTSR(a.k, offset);
+ }
+ if (a.v.constructor ===
Array) {
+
this.shiftTokenTSR(a.v, offset);
+ }
+
+ // src offsets used to
set mw:TemplateParams
+ if (offset === null) {
+ a.srcOffsets =
null;
+ } else if
(a.srcOffsets) {
+ for (var k = 0;
k < a.srcOffsets.length; k++) {
+
a.srcOffsets[k] += offset;
+ }
+ }
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+ },
+
toStringTokens: function(tokens, indent) {
if (!indent) {
indent = "";
diff --git a/js/lib/mediawiki.WikitextSerializer.js
b/js/lib/mediawiki.WikitextSerializer.js
index 57ea7a8..517b6b1 100644
--- a/js/lib/mediawiki.WikitextSerializer.js
+++ b/js/lib/mediawiki.WikitextSerializer.js
@@ -55,6 +55,7 @@
var WEHP = WikitextEscapeHandlers.prototype;
WEHP.urlParser = new PegTokenizer();
+WEHP.urlParser.fullParse = true;
WEHP.headingHandler = function(state, text) {
// replace heading-handler with the default handler
@@ -139,8 +140,10 @@
});
p.on('end', function(){ });
- // Tokenizer.process is synchronous -- this call wont return till
everything is parsed.
+ // Tokenizer.process is synchronous if fullParse is enabled --
+ // this call wont return till everything is parsed.
// The code below will break if tokenization becomes async.
+ p.fullParse = true;
p.process( prefixedText );
// If the token stream has a TagTk, SelfclosingTagTk, EndTagTk or
CommentTk
diff --git a/js/lib/mediawiki.parser.environment.js
b/js/lib/mediawiki.parser.environment.js
index 3993468..aa774bd 100644
--- a/js/lib/mediawiki.parser.environment.js
+++ b/js/lib/mediawiki.parser.environment.js
@@ -390,7 +390,8 @@
* call it from places you notice errors happening.
*/
MWParserEnvironment.prototype.errCB = function ( error ) {
- console.log( 'ERROR in ' + this.page.name + ':\n' + error.stack );
+ console.log( 'ERROR in ' + this.page.name + ':\n' + error.message);
+ console.log("stack trace: " + error.stack);
process.exit( 1 );
};
diff --git a/js/lib/mediawiki.tokenizer.peg.js
b/js/lib/mediawiki.tokenizer.peg.js
index a94a8f8..0528b40 100644
--- a/js/lib/mediawiki.tokenizer.peg.js
+++ b/js/lib/mediawiki.tokenizer.peg.js
@@ -109,30 +109,20 @@
} else {
chunkCB = this.emit.bind( this, 'chunk' );
}
- // XXX: Commented out exception handling during development to get
- // reasonable traces.
- if ( ! this.env.conf.parsoid.debug ) {
- try {
- this.tokenizer.tokenize(text, 'start',
- // callback
- chunkCB,
- // inline break test
- this
- );
- this.onEnd();
- } catch (e) {
- this.env.errCB(e);
- //chunkCB( ['Tokenizer error in ' + cacheKey + ': ' +
e.stack] );
- //this.onEnd();
+
+ // Kick it off!
+ if (this.fullParse) {
+ if ( ! this.env.conf.parsoid.debug ) {
+ try {
+ this.tokenizer.tokenize(text, 'start', chunkCB,
this);
+ } catch (e) {
+ this.env.errCB(e);
+ }
+ } else {
+ this.tokenizer.tokenize(text, 'start', chunkCB, this);
}
} else {
- this.tokenizer.tokenize(text, 'start',
- // callback
- chunkCB,
- // inline break test
- this
- );
- this.onEnd();
+ this.tokenizeTLB(text, 0, chunkCB);
}
};
@@ -157,16 +147,24 @@
this.emit('end');
};
-PegTokenizer.prototype.processImageOptions = function( text ) {
- return this.tokenizer.tokenize(text, 'img_options', null, this
);
-};
+PegTokenizer.prototype.tokenizeTLB = function( text, srcOffset, cb ) {
+ if ( ! this.env.conf.parsoid.debug ) {
+ try {
+ this.tokenizer.tokenize(text, 'toplevelblock', cb,
this, srcOffset);
+ } catch (e) {
+ this.env.errCB(e);
+ }
+ } else {
+ this.tokenizer.tokenize(text, 'toplevelblock', cb, this,
srcOffset);
+ }
+}
/**
* Tokenize via a production passed in as an arg
*/
PegTokenizer.prototype.tokenize = function( text, production ) {
try {
- return this.tokenizer.tokenize(text, production, null, this );
+ return this.tokenizer.tokenize(text, production || "start",
null, this, 0);
} catch ( e ) {
return false;
}
@@ -179,6 +177,10 @@
return this.tokenize(text, "url");
};
+PegTokenizer.prototype.processImageOptions = function( text ) {
+ return this.tokenize(text, 'img_options');
+};
+
/*
* Inline breaks, flag-enabled production which detects end positions for
* active higher-level productions in inline and other nested productions.
diff --git a/js/lib/pegTokenizer.pegjs.txt b/js/lib/pegTokenizer.pegjs.txt
index ff0b54e..27b102e 100644
--- a/js/lib/pegTokenizer.pegjs.txt
+++ b/js/lib/pegTokenizer.pegjs.txt
@@ -380,6 +380,11 @@
* current expression can return an empty list (true).
*/
var emitChunk = function (tokens) {
+ // shift tsr of all tokens by offset
+ var srcOffset = __parseArgs[4];
+ if (srcOffset && srcOffset > 0) {
+ Util.shiftTokenTSR(tokens, srcOffset);
+ }
__parseArgs[2](tokens);
};
}
@@ -389,18 +394,57 @@
*********************************************************/
start
- = e:toplevelblock* newline* {
+ = tlb* newline* {
// end is passed inline as a token, as well as a separate event for now.
emitChunk( [ new EOFTk( ) ] );
- return []; //flattenIfArray(e);
+ __parseArgs[3].onEnd();
+ return true;
}
+/*
+ * This production exists to support tokenizing the document in chunks.
+ * It stops tokenization after each block and yields to the node.js
+ * event-loop to schedule other pending event handlers.
+ *
+ * It needs to keep track of sol-state so when tokenization resumes, it
+ * knows whether it is in sol-state or not.
+ */
+toplevelblock =
+ tlb save_sol_state {
+ // Yield
+ var parsedInput = input.substring(0, pos),
+ newInput = input.substring(pos);
+
+ // console.warn("Yield @pos: " + ((__parseArgs[4] || 0) + pos) + ";
input len: " + newInput.length);
+ // Trick the tokenizer into ending parsing
+ input = parsedInput;
+ inputLength = pos;
+
+ // Schedule parse of next chunk
+ process.nextTick(function() {
+ // console.warn("new input: " + JSON.stringify(newInput));
+ // console.warn("offset : " + ((__parseArgs[4] || 0) + pos));
+ __parseArgs[3].tokenizeTLB(newInput, (__parseArgs[4] || 0) + pos,
__parseArgs[2]);
+ });
+ }
+ / newline* eof {
+ // Clear saved sol state!
+ __parseArgs[3].savedSOL = null;
+ // console.warn("-- EOF!");
+ emitChunk( [ new EOFTk( ) ] );
+ __parseArgs[3].onEnd();
+ return true;
+ }
+
+save_sol_state =
+ & sol { __parseArgs[3].savedSOL = true; return true; }
+ / & { __parseArgs[3].savedSOL = false; return true; }
/*
* A document (start production) is a sequence of toplevelblocks. Tokens are
* emitted in chunks per toplevelblock to avoid buffering the full document.
*/
-toplevelblock
+tlb
= !eof b:block {
// Clear the tokenizer's backtracking cache after matching each
// toplevelblock. There won't be any backtracking as a document is just a
@@ -1910,7 +1954,18 @@
// Start of line
sol
- = nl:(newlineToken / & { return pos === 0; } { return [] })
+ = & {
+ // Use saved sol-state only at start of input
+ if (pos === 0) {
+ // If we have saved state of not being in sol posn, fail the
production
+ var oldSOL = __parseArgs[3].savedSOL;
+ if (oldSOL === false) {
+ return false;
+ }
+ }
+ return true;
+ }
+ nl:(newlineToken / & { return pos === 0; } { return [] })
// Eat multi-line comment, so that syntax after still matches as if it
// was actually preceded by a newline
cn:( c:comment n:newlineToken? {
diff --git a/js/tests/parserTests.js b/js/tests/parserTests.js
index ffb9b75..e96992a 100644
--- a/js/tests/parserTests.js
+++ b/js/tests/parserTests.js
@@ -1064,6 +1064,7 @@
// to be basically empty, since the parserTests environment is
very bare.
this.env = env;
this.env.errCB = function ( e ) {
+ console.warn("ERROR: " + JSON.stringify(e));
console.error( e.stack );
};
Util.setDebuggingFlags( this.env.conf.parsoid, options );
--
To view, visit https://gerrit.wikimedia.org/r/49856
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic82e3048b6f2c0d8d12f45bdd75ec351f770e3cd
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits