Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/49856
Change subject: Yield control to event-loop after tokenizing a top-level block.
......................................................................
Yield control to event-loop after tokenizing a top-level block.
* Currently, the tokenizer parses the whole page at once queueing
up a lot of async actions in one shot. This is not so good for
a couple of reasons:
- Tokenized chunks aren't processed further till the whole page
is parsed.
- This can cause the first template requests issued during
tokenization to time out in some cases, since the request
handler might never get scheduled in time.
- It can result in template requests going out in big bursts
rather than being spread out over the page parse.
* This patch fixes the tokenizer to yield control back to the
event loop after every top-level block. The following fixes
needed to be done:
- It schedules a new tokenization event before yielding.
- It fixes up tsr offsets (on each restart, the source offset
is zero which would be incorrect relative to the full source).
The code for this was extracted out of TokenStreamPatcher,
extended, and moved to mediawiki.Util.js.
- It records sol state before yielding and restores it on
resume (by fixing up the sol production).
* The patch also adds a 'fullParse' mode to the tokenizer where
it does not yield and tokenizes the entire text synchronously.
This mode is used by the serializer and link handler which use
tokenizer instances to parse short substrings and assume the
tokenization to be synchronous. They use their tokenizer
instances in fullParse mode.
* No change in parser test results from this patch.
* Also verified output on a few pages via 'node server'
Change-Id: Ic82e3048b6f2c0d8d12f45bdd75ec351f770e3cd
---
M js/lib/ext.core.LinkHandler.js
M js/lib/ext.core.TokenStreamPatcher.js
M js/lib/mediawiki.TokenTransformManager.js
M js/lib/mediawiki.Util.js
M js/lib/mediawiki.WikitextSerializer.js
M js/lib/mediawiki.parser.environment.js
M js/lib/mediawiki.tokenizer.peg.js
M js/lib/pegTokenizer.pegjs.txt
M js/tests/parserTests.js
9 files changed, 171 insertions(+), 54 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/56/49856/1
diff --git a/js/lib/ext.core.LinkHandler.js b/js/lib/ext.core.LinkHandler.js
index eebc6f5..d115060 100644
--- a/js/lib/ext.core.LinkHandler.js
+++ b/js/lib/ext.core.LinkHandler.js
@@ -21,7 +21,9 @@
if ( !this.urlParser ) {
// Actually the regular tokenizer, but we'll call it with the
// img_options production only.
- WikiLinkHandler.prototype.urlParser = new PegTokenizer();
+ var tokenizer = new PegTokenizer();
+ tokenizer.fullParse = true;
+ WikiLinkHandler.prototype.urlParser = tokenizer;
}
}
@@ -608,7 +610,9 @@
if ( !this.urlParser ) {
// Actually the regular tokenizer, but we'll call it with the
// img_options production only.
- ExternalLinkHandler.prototype.urlParser = new PegTokenizer();
+ var tokenizer = new PegTokenizer();
+ tokenizer.fullParse = true;
+ ExternalLinkHandler.prototype.urlParser = tokenizer;
}
this._reset();
}
diff --git a/js/lib/ext.core.TokenStreamPatcher.js
b/js/lib/ext.core.TokenStreamPatcher.js
index b5a6d99..b086210 100644
--- a/js/lib/ext.core.TokenStreamPatcher.js
+++ b/js/lib/ext.core.TokenStreamPatcher.js
@@ -10,11 +10,13 @@
* -------------------------------------------------------------------- */
"use strict";
-var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer;
+var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
+ Util = require('./mediawiki.Util.js').Util;
function TokenStreamPatcher( manager, options ) {
this.manager = manager;
this.tokenizer = new PegTokenizer();
+ this.tokenizer.fullParse = true;
manager.addTransform(this.onNewline.bind(this),
"TokenStreamPatcher:onNewline", this.nlRank, 'newline');
@@ -55,23 +57,6 @@
};
TokenStreamPatcher.prototype.onAny = function(token, manager) {
-
- function shiftTokenTSR(tokens, offset) {
- // update/clear tsr
- for (var i = 0, n = tokens.length; i < n; i++) {
- var tsr = tokens[i].dataAttribs.tsr;
- if (tsr) {
- if (offset) {
- // console.warn("--updating--");
- tokens[i].dataAttribs.tsr = [tsr[0] +
offset, tsr[1] + offset];
- } else {
- // console.warn("--nulling--");
- tokens[i].dataAttribs.tsr = null;
- }
- }
- }
- }
-
// console.warn("T: " + JSON.stringify(token));
var tokens = [token];
switch (token.constructor) {
@@ -84,7 +69,7 @@
// Reparse string with the
'table_start_tag' production
// and shift tsr of result tokens by
source offset
tokens = this.tokenizer.tokenize(token,
'table_start_tag');
- shiftTokenTSR(tokens, this.srcOffset);
+ Util.shiftTokenTSR(tokens,
this.srcOffset);
} else if (token.match(/^\s*$/)) {
// White-space doesn't change SOL state
// Update srcOffset
diff --git a/js/lib/mediawiki.TokenTransformManager.js
b/js/lib/mediawiki.TokenTransformManager.js
index e2cd36c..4595769 100644
--- a/js/lib/mediawiki.TokenTransformManager.js
+++ b/js/lib/mediawiki.TokenTransformManager.js
@@ -257,7 +257,6 @@
// Reset state between uses
AsyncTokenTransformManager.prototype.reset = function() {
this.tailAccumulator = null;
- // initial top-level callback, emits chunks
this.tokenCB = this.emitChunk.bind( this );
};
diff --git a/js/lib/mediawiki.Util.js b/js/lib/mediawiki.Util.js
index 09efccb..8ebb0fe 100644
--- a/js/lib/mediawiki.Util.js
+++ b/js/lib/mediawiki.Util.js
@@ -191,6 +191,73 @@
return true;
},
+ shiftTokenTSR: function(tokens, offset) {
+ // offset should either be a valid number or null
+ if (offset === undefined) {
+ offset = null;
+ }
+
+ // update/clear tsr
+ for (var i = 0, n = tokens.length; i < n; i++) {
+ var t = tokens[i];
+ switch (t.constructor) {
+ case TagTk:
+ case SelfclosingTagTk:
+ case NlTk:
+ case CommentTk:
+ case EndTagTk:
+ var da = tokens[i].dataAttribs;
+ var tsr = da.tsr;
+ if (tsr) {
+ if (offset !== null) {
+ da.tsr = [tsr[0] +
offset, tsr[1] + offset];
+ } else {
+ da.tsr = null;
+ }
+ }
+
+ // SSS FIXME: offset will always be
available in
+ // chunky-tokenizer mode in which case
we wont have
+ // buggy offsets below. The null
scenario is only
+ // for when the token-stream-patcher
attempts to
+ // reparse a string -- it is likely to
only patch up
+ // small string fragments and the
complicated use cases
+ // below should not materialize.
+
+ // target offset
+ if (offset && da.targetOff) {
+ da.targetOff += offset;
+ }
+
+ // Process attributes
+ if (t.attribs) {
+ for (var j = 0, m =
t.attribs.length; j < m; j++) {
+ var a = t.attribs[j];
+ if (a.k.constructor ===
Array) {
+
this.shiftTokenTSR(a.k, offset);
+ }
+ if (a.v.constructor ===
Array) {
+
this.shiftTokenTSR(a.v, offset);
+ }
+
+ // src offsets used to
set mw:TemplateParams
+ if (offset === null) {
+ a.srcOffsets =
null;
+ } else if
(a.srcOffsets) {
+ for (var k = 0;
k < a.srcOffsets.length; k++) {
+
a.srcOffsets[k] += offset;
+ }
+ }
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+ }
+ },
+
toStringTokens: function(tokens, indent) {
if (!indent) {
indent = "";
diff --git a/js/lib/mediawiki.WikitextSerializer.js
b/js/lib/mediawiki.WikitextSerializer.js
index 57ea7a8..517b6b1 100644
--- a/js/lib/mediawiki.WikitextSerializer.js
+++ b/js/lib/mediawiki.WikitextSerializer.js
@@ -55,6 +55,7 @@
var WEHP = WikitextEscapeHandlers.prototype;
WEHP.urlParser = new PegTokenizer();
+WEHP.urlParser.fullParse = true;
WEHP.headingHandler = function(state, text) {
// replace heading-handler with the default handler
@@ -139,8 +140,10 @@
});
p.on('end', function(){ });
- // Tokenizer.process is synchronous -- this call wont return till
everything is parsed.
+ // Tokenizer.process is synchronous if fullParse is enabled --
+ // this call wont return till everything is parsed.
// The code below will break if tokenization becomes async.
+ p.fullParse = true;
p.process( prefixedText );
// If the token stream has a TagTk, SelfclosingTagTk, EndTagTk or
CommentTk
diff --git a/js/lib/mediawiki.parser.environment.js
b/js/lib/mediawiki.parser.environment.js
index 3993468..aa774bd 100644
--- a/js/lib/mediawiki.parser.environment.js
+++ b/js/lib/mediawiki.parser.environment.js
@@ -390,7 +390,8 @@
* call it from places you notice errors happening.
*/
MWParserEnvironment.prototype.errCB = function ( error ) {
- console.log( 'ERROR in ' + this.page.name + ':\n' + error.stack );
+ console.log( 'ERROR in ' + this.page.name + ':\n' + error.message);
+ console.log("stack trace: " + error.stack);
process.exit( 1 );
};
diff --git a/js/lib/mediawiki.tokenizer.peg.js
b/js/lib/mediawiki.tokenizer.peg.js
index a94a8f8..0528b40 100644
--- a/js/lib/mediawiki.tokenizer.peg.js
+++ b/js/lib/mediawiki.tokenizer.peg.js
@@ -109,30 +109,20 @@
} else {
chunkCB = this.emit.bind( this, 'chunk' );
}
- // XXX: Commented out exception handling during development to get
- // reasonable traces.
- if ( ! this.env.conf.parsoid.debug ) {
- try {
- this.tokenizer.tokenize(text, 'start',
- // callback
- chunkCB,
- // inline break test
- this
- );
- this.onEnd();
- } catch (e) {
- this.env.errCB(e);
- //chunkCB( ['Tokenizer error in ' + cacheKey + ': ' +
e.stack] );
- //this.onEnd();
+
+ // Kick it off!
+ if (this.fullParse) {
+ if ( ! this.env.conf.parsoid.debug ) {
+ try {
+ this.tokenizer.tokenize(text, 'start', chunkCB,
this);
+ } catch (e) {
+ this.env.errCB(e);
+ }
+ } else {
+ this.tokenizer.tokenize(text, 'start', chunkCB, this);
}
} else {
- this.tokenizer.tokenize(text, 'start',
- // callback
- chunkCB,
- // inline break test
- this
- );
- this.onEnd();
+ this.tokenizeTLB(text, 0, chunkCB);
}
};
@@ -157,16 +147,24 @@
this.emit('end');
};
-PegTokenizer.prototype.processImageOptions = function( text ) {
- return this.tokenizer.tokenize(text, 'img_options', null, this
);
-};
+PegTokenizer.prototype.tokenizeTLB = function( text, srcOffset, cb ) {
+ if ( ! this.env.conf.parsoid.debug ) {
+ try {
+ this.tokenizer.tokenize(text, 'toplevelblock', cb,
this, srcOffset);
+ } catch (e) {
+ this.env.errCB(e);
+ }
+ } else {
+ this.tokenizer.tokenize(text, 'toplevelblock', cb, this,
srcOffset);
+ }
+}
/**
* Tokenize via a production passed in as an arg
*/
PegTokenizer.prototype.tokenize = function( text, production ) {
try {
- return this.tokenizer.tokenize(text, production, null, this );
+ return this.tokenizer.tokenize(text, production || "start",
null, this, 0);
} catch ( e ) {
return false;
}
@@ -179,6 +177,10 @@
return this.tokenize(text, "url");
};
+PegTokenizer.prototype.processImageOptions = function( text ) {
+ return this.tokenize(text, 'img_options');
+};
+
/*
* Inline breaks, flag-enabled production which detects end positions for
* active higher-level productions in inline and other nested productions.
diff --git a/js/lib/pegTokenizer.pegjs.txt b/js/lib/pegTokenizer.pegjs.txt
index ff0b54e..27b102e 100644
--- a/js/lib/pegTokenizer.pegjs.txt
+++ b/js/lib/pegTokenizer.pegjs.txt
@@ -380,6 +380,11 @@
* current expression can return an empty list (true).
*/
var emitChunk = function (tokens) {
+ // shift tsr of all tokens by offset
+ var srcOffset = __parseArgs[4];
+ if (srcOffset && srcOffset > 0) {
+ Util.shiftTokenTSR(tokens, srcOffset);
+ }
__parseArgs[2](tokens);
};
}
@@ -389,18 +394,57 @@
*********************************************************/
start
- = e:toplevelblock* newline* {
+ = tlb* newline* {
// end is passed inline as a token, as well as a separate event for now.
emitChunk( [ new EOFTk( ) ] );
- return []; //flattenIfArray(e);
+ __parseArgs[3].onEnd();
+ return true;
}
+/*
+ * This production exists to support tokenizing the document in chunks.
+ * It stops tokenization after each block and yields to the node.js
+ * event-loop to schedule other pending event handlers.
+ *
+ * It needs to keep track of sol-state so when tokenization resumes, it
+ * knows whether it is in sol-state or not.
+ */
+toplevelblock =
+ tlb save_sol_state {
+ // Yield
+ var parsedInput = input.substring(0, pos),
+ newInput = input.substring(pos);
+
+ // console.warn("Yield @pos: " + ((__parseArgs[4] || 0) + pos) + ";
input len: " + newInput.length);
+ // Trick the tokenizer into ending parsing
+ input = parsedInput;
+ inputLength = pos;
+
+ // Schedule parse of next chunk
+ process.nextTick(function() {
+ // console.warn("new input: " + JSON.stringify(newInput));
+ // console.warn("offset : " + ((__parseArgs[4] || 0) + pos));
+ __parseArgs[3].tokenizeTLB(newInput, (__parseArgs[4] || 0) + pos,
__parseArgs[2]);
+ });
+ }
+ / newline* eof {
+ // Clear saved sol state!
+ __parseArgs[3].savedSOL = null;
+ // console.warn("-- EOF!");
+ emitChunk( [ new EOFTk( ) ] );
+ __parseArgs[3].onEnd();
+ return true;
+ }
+
+save_sol_state =
+ & sol { __parseArgs[3].savedSOL = true; return true; }
+ / & { __parseArgs[3].savedSOL = false; return true; }
/*
* A document (start production) is a sequence of toplevelblocks. Tokens are
* emitted in chunks per toplevelblock to avoid buffering the full document.
*/
-toplevelblock
+tlb
= !eof b:block {
// Clear the tokenizer's backtracking cache after matching each
// toplevelblock. There won't be any backtracking as a document is just a
@@ -1910,7 +1954,18 @@
// Start of line
sol
- = nl:(newlineToken / & { return pos === 0; } { return [] })
+ = & {
+ // Use saved sol-state only at start of input
+ if (pos === 0) {
+ // If we have saved state of not being in sol posn, fail the
production
+ var oldSOL = __parseArgs[3].savedSOL;
+ if (oldSOL === false) {
+ return false;
+ }
+ }
+ return true;
+ }
+ nl:(newlineToken / & { return pos === 0; } { return [] })
// Eat multi-line comment, so that syntax after still matches as if it
// was actually preceded by a newline
cn:( c:comment n:newlineToken? {
diff --git a/js/tests/parserTests.js b/js/tests/parserTests.js
index ffb9b75..e96992a 100644
--- a/js/tests/parserTests.js
+++ b/js/tests/parserTests.js
@@ -1064,6 +1064,7 @@
// to be basically empty, since the parserTests environment is
very bare.
this.env = env;
this.env.errCB = function ( e ) {
+ console.warn("ERROR: " + JSON.stringify(e));
console.error( e.stack );
};
Util.setDebuggingFlags( this.env.conf.parsoid, options );
--
To view, visit https://gerrit.wikimedia.org/r/49856
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic82e3048b6f2c0d8d12f45bdd75ec351f770e3cd
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits