[MediaWiki-commits] [Gerrit] Generalized operation of TokenStreamPatcher a bit - change (mediawiki...Parsoid)

Subramanya Sastry (Code Review) Fri, 26 Apr 2013 02:32:30 -0700

Subramanya Sastry has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/60971



Change subject: Generalized operation of TokenStreamPatcher a bit
......................................................................

Generalized operation of TokenStreamPatcher a bit

* The token stream patcher is a workaround/hack to deal with the
  limitations of not having preprocessing template expansion.
  So far, it had a special case for re-parsing table tags
  and nothing else.

* This commit generalizes this to handle other start-of-line (SOL)
  reparsing scenarios (lists, table cells).  Reparsing strings found
  in a SOL context is the most useful/necessary use case right now,
  since a template could end in a SOL context, which might not be
  evident to the tokenizer until the template is fully expanded.

  Headings are not reparsed since they have end-of-line (EOL)
  requirements in addition to SOL requirements.

* Parsoid generates output identical to that of the PHP parser on
  the following wikitext.
--------------------
{{echo|}}{| width = '100%'
{{echo|}}!heading
{{echo|}}|-
{{echo|}}|foo
{{echo|}}|}

{{echo|}}*a
--------------------

* No change in parser test results.

Change-Id: I54bed19d4a3ab1a096d8756fe62667b59fa0cdf4
---
M js/lib/ext.core.TokenStreamPatcher.js
M js/lib/mediawiki.tokenizer.peg.js
2 files changed, 94 insertions(+), 56 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid refs/changes/71/60971/1

diff --git a/js/lib/ext.core.TokenStreamPatcher.js 
b/js/lib/ext.core.TokenStreamPatcher.js
index e9fadeb..65c5e7a 100644
--- a/js/lib/ext.core.TokenStreamPatcher.js
+++ b/js/lib/ext.core.TokenStreamPatcher.js
@@ -13,6 +13,7 @@
 var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
        Util = require('./mediawiki.Util.js').Util,
        defines = require('./mediawiki.parser.defines.js');
+
 // define some constructor shortcuts
 var CommentTk = defines.CommentTk,
     TagTk = defines.TagTk,
@@ -21,7 +22,7 @@
 
 function TokenStreamPatcher( manager, options ) {
        this.manager = manager;
-       this.tokenizer = new PegTokenizer();
+       this.tokenizer = new PegTokenizer(this.manager.env);
 
        manager.addTransform(this.onNewline.bind(this),
                "TokenStreamPatcher:onNewline", this.nlRank, 'newline');
@@ -37,20 +38,30 @@
 TokenStreamPatcher.prototype.anyRank  = 2.002;
 TokenStreamPatcher.prototype.endRank  = 2.003;
 
-TokenStreamPatcher.prototype.reset = function() {
-       this.inNowiki = false;
-       this.sol = true;
-       this.srcOffset = null;
-       this.buf = [];
+TokenStreamPatcher.prototype.updateBuf = function() {
+       if (this.buf.length === 0) {
+               this.buf.sol = this.sol;
+               this.buf.srcOffset = this.srcOffset;
+       }
 };
 
-TokenStreamPatcher.prototype.onNewline = function(token, manager) {
+TokenStreamPatcher.prototype.reset = function() {
+       this.inNowiki = false;
+       this.inPre = false;
+       this.sol = true;
+       this.srcOffset = 0;
+       this.buf = [];
+       this.updateBuf();
+};
+
+TokenStreamPatcher.prototype.onNewline = function(token) {
        this.sol = true;
        this.srcOffset = (token.dataAttribs.tsr || [null,null])[1];
+       this.updateBuf();
        return {tokens: [token]};
 };
 
-TokenStreamPatcher.prototype.onEnd = function(token, manager) {
+TokenStreamPatcher.prototype.onEnd = function(token) {
        this.reset();
        return {tokens: [token]};
 };
@@ -61,61 +72,85 @@
        this.sol = false;
 };
 
-TokenStreamPatcher.prototype.onAny = function(token, manager) {
+TokenStreamPatcher.prototype.onAny = function(token) {
        // console.warn("T: " + JSON.stringify(token));
-       var tokens = [token];
-       switch (token.constructor) {
-               case String:
-                       // TRICK #1:
-                       // Attempt to match "{|" after a newline and convert
-                       // it to a table token.
-                       if (this.sol && !this.inNowiki) {
-                               if (token.match(/^\{\|/)) {
-                                       // Reparse string with the 
'table_start_tag' production
-                                       // and shift tsr of result tokens by 
source offset
-                                       tokens = this.tokenizer.tokenize(token, 
'table_start_tag');
-                                       Util.shiftTokenTSR(tokens, 
this.srcOffset, true);
-                               } else if (token.match(/^\s*$/)) {
-                                       // White-space doesn't change SOL state
-                                       // Update srcOffset
-                                       this.srcOffset += token.length;
+       var tokens = [];
+       if (token.constructor === String) {
+               if (this.inNowiki || this.inPre) {
+                       tokens.push(token);
+                       this.clearSOL();
+               } else {
+                       if (!token.match(/^\s*$/)) {
+                               this.clearSOL();
+                       }
+                       this.buf.push(token);
+                       this.srcOffset += token.length;
+               }
+       } else if (token.constructor === CommentTk) {
+               // Comments don't change SOL state
+               // Update srcOffset
+               this.buf.push(token);
+               this.srcOffset = (token.dataAttribs.tsr || [null,null])[1];
+       } else {
+               var str = this.buf.join('');
+               // Attempt reparse of tables, table-cells, hrs, lists
+               // (anything that doesn't require end-of-line matching
+               //  => headings are out).
+               if (this.buf.sol && /^[\{\|\!\-\*#:;]/.test(str)) {
+                       // console.warn("--str: " + JSON.stringify(str));
+                       // console.warn("--off: " + this.buf.srcOffset);
+                       tokens = [];
+                       var args = {
+                               cb: function(r) { tokens = tokens.concat(r); },
+                               pegTokenizer: this.tokenizer,
+                               srcOffset: this.buf.srcOffset,
+                               env: this.manager.env
+                       };
+                       this.tokenizer.savedSOL = this.buf.sol;
+                       this.tokenizer.tokenize(str, 'start', args);
+                       tokens = Util.stripEOFTkfromTokens(tokens);
+               } else {
+                       tokens = this.buf;
+               }
+
+               switch (token.constructor) {
+                       case SelfclosingTagTk:
+                               if (token.name === "meta" && 
token.dataAttribs.stx !== "html") {
+                                       this.srcOffset = (token.dataAttribs.tsr 
|| [null,null])[1];
                                } else {
                                        this.clearSOL();
                                }
-                       } else {
+                               break;
+
+                       case TagTk:
+                               if (token.getAttribute("typeof") === 
"mw:Nowiki") {
+                                       this.inNowiki = true;
+                               }
+                               if (token.name === "pre" && token.isHTMLTag()) {
+                                       this.inPre = true;
+                               }
                                this.clearSOL();
-                       }
-                       break;
+                               break;
 
-               case CommentTk:
-                       // Comments don't change SOL state
-                       // Update srcOffset
-                       this.srcOffset = (token.dataAttribs.tsr || 
[null,null])[1];
-                       break;
-
-               case SelfclosingTagTk:
-                       if (token.name === "meta" && token.dataAttribs.stx !== 
"html") {
-                               this.srcOffset = (token.dataAttribs.tsr || 
[null,null])[1];
-                       } else {
+                       case EndTagTk:
+                               if (token.getAttribute("typeof") === 
"mw:Nowiki") {
+                                       this.inNowiki = false;
+                               }
+                               if (token.name === "pre" && token.isHTMLTag()) {
+                                       this.inPre = false;
+                               }
                                this.clearSOL();
-                       }
-                       break;
+                               break;
+               }
 
-               case TagTk:
-                       if (token.getAttribute("typeof") === "mw:Nowiki") {
-                               this.inNowiki = true;
-                       }
-                       this.clearSOL();
-                       break;
+               tokens.push(token);
 
-               case EndTagTk:
-                       if (token.getAttribute("typeof") === "mw:Nowiki") {
-                               this.inNowiki = false;
-                       }
-                       this.clearSOL();
-                       break;
+               // Reset buf
+               this.buf = [];
+               this.updateBuf();
        }
 
+       // console.warn("toks: " + JSON.stringify(tokens));
        return {tokens: tokens};
 };
 
diff --git a/js/lib/mediawiki.tokenizer.peg.js 
b/js/lib/mediawiki.tokenizer.peg.js
index 664a73d..77a560c 100644
--- a/js/lib/mediawiki.tokenizer.peg.js
+++ b/js/lib/mediawiki.tokenizer.peg.js
@@ -208,17 +208,20 @@
  * Tokenize via a production passed in as an arg.
  * The text is tokenized synchronously in one shot.
  */
-PegTokenizer.prototype.tokenize = function( text, production ) {
+PegTokenizer.prototype.tokenize = function( text, production, args ) {
        try {
                // Some productions use callbacks: start, tlb, toplevelblock.
                // All other productions return tokens directly.
-               var toks = [],
-                       retToks = this.tokenizer.tokenize(text, production, {
+               var toks = [];
+               if (!args) {
+                       args = {
                                cb: function(r) { toks = toks.concat(r); },
                                pegTokenizer: this,
                                srcOffset: this.options.startOffset || 0,
                                env: this.env
-                       });
+                       }
+               }
+               var retToks = this.tokenizer.tokenize(text, production, args);
 
                if (retToks.constructor === Array && retToks.length > 0) {
                        toks = toks.concat(retToks);

-- 
To view, visit https://gerrit.wikimedia.org/r/60971
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I54bed19d4a3ab1a096d8756fe62667b59fa0cdf4
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Generalized operation of TokenStreamPatcher a bit - change (mediawiki...Parsoid)

Reply via email to