Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/60971
Change subject: Generalized operation of TokenStreamPatcher a bit
......................................................................
Generalized operation of TokenStreamPatcher a bit
* The token stream patcher is a workaround for the limitation
that there is no preprocessing template-expansion pass.
Until now, it special-cased only the re-parsing of
table tags and handled nothing else.
* This commit generalizes this to handle other SOL reparsing
scenarios (lists, table-cells). Reparsing strings found in a
SOL context is the most useful/necessary use case right now,
since templates can end in a SOL context, which might not be
evident to the tokenizer until the template is fully expanded.
Headings are not reparsed since they have EOL requirements
in addition to SOL requirements.
* Parsoid generates output identical to that of the PHP parser on
the following wikitext.
--------------------
{{echo|}}{| width = '100%'
{{echo|}}!heading
{{echo|}}|-
{{echo|}}|foo
{{echo|}}|}
{{echo|}}*a
--------------------
* No change in parser test results.
Change-Id: I54bed19d4a3ab1a096d8756fe62667b59fa0cdf4
---
M js/lib/ext.core.TokenStreamPatcher.js
M js/lib/mediawiki.tokenizer.peg.js
2 files changed, 94 insertions(+), 56 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/71/60971/1
diff --git a/js/lib/ext.core.TokenStreamPatcher.js
b/js/lib/ext.core.TokenStreamPatcher.js
index e9fadeb..65c5e7a 100644
--- a/js/lib/ext.core.TokenStreamPatcher.js
+++ b/js/lib/ext.core.TokenStreamPatcher.js
@@ -13,6 +13,7 @@
var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer,
Util = require('./mediawiki.Util.js').Util,
defines = require('./mediawiki.parser.defines.js');
+
// define some constructor shortcuts
var CommentTk = defines.CommentTk,
TagTk = defines.TagTk,
@@ -21,7 +22,7 @@
function TokenStreamPatcher( manager, options ) {
this.manager = manager;
- this.tokenizer = new PegTokenizer();
+ this.tokenizer = new PegTokenizer(this.manager.env);
manager.addTransform(this.onNewline.bind(this),
"TokenStreamPatcher:onNewline", this.nlRank, 'newline');
@@ -37,20 +38,30 @@
TokenStreamPatcher.prototype.anyRank = 2.002;
TokenStreamPatcher.prototype.endRank = 2.003;
-TokenStreamPatcher.prototype.reset = function() {
- this.inNowiki = false;
- this.sol = true;
- this.srcOffset = null;
- this.buf = [];
+TokenStreamPatcher.prototype.updateBuf = function() {
+ if (this.buf.length === 0) {
+ this.buf.sol = this.sol;
+ this.buf.srcOffset = this.srcOffset;
+ }
};
-TokenStreamPatcher.prototype.onNewline = function(token, manager) {
+TokenStreamPatcher.prototype.reset = function() {
+ this.inNowiki = false;
+ this.inPre = false;
+ this.sol = true;
+ this.srcOffset = 0;
+ this.buf = [];
+ this.updateBuf();
+};
+
+TokenStreamPatcher.prototype.onNewline = function(token) {
this.sol = true;
this.srcOffset = (token.dataAttribs.tsr || [null,null])[1];
+ this.updateBuf();
return {tokens: [token]};
};
-TokenStreamPatcher.prototype.onEnd = function(token, manager) {
+TokenStreamPatcher.prototype.onEnd = function(token) {
this.reset();
return {tokens: [token]};
};
@@ -61,61 +72,85 @@
this.sol = false;
};
-TokenStreamPatcher.prototype.onAny = function(token, manager) {
+TokenStreamPatcher.prototype.onAny = function(token) {
// console.warn("T: " + JSON.stringify(token));
- var tokens = [token];
- switch (token.constructor) {
- case String:
- // TRICK #1:
- // Attempt to match "{|" after a newline and convert
- // it to a table token.
- if (this.sol && !this.inNowiki) {
- if (token.match(/^\{\|/)) {
- // Reparse string with the
'table_start_tag' production
- // and shift tsr of result tokens by
source offset
- tokens = this.tokenizer.tokenize(token,
'table_start_tag');
- Util.shiftTokenTSR(tokens,
this.srcOffset, true);
- } else if (token.match(/^\s*$/)) {
- // White-space doesn't change SOL state
- // Update srcOffset
- this.srcOffset += token.length;
+ var tokens = [];
+ if (token.constructor === String) {
+ if (this.inNowiki || this.inPre) {
+ tokens.push(token);
+ this.clearSOL();
+ } else {
+ if (!token.match(/^\s*$/)) {
+ this.clearSOL();
+ }
+ this.buf.push(token);
+ this.srcOffset += token.length;
+ }
+ } else if (token.constructor === CommentTk) {
+ // Comments don't change SOL state
+ // Update srcOffset
+ this.buf.push(token);
+ this.srcOffset = (token.dataAttribs.tsr || [null,null])[1];
+ } else {
+ var str = this.buf.join('');
+ // Attempt reparse of tables, table-cells, hrs, lists
+ // (anything that doesn't require end-of-line matching
+ // => headings are out).
+ if (this.buf.sol && /^[\{\|\!\-\*#:;]/.test(str)) {
+ // console.warn("--str: " + JSON.stringify(str));
+ // console.warn("--off: " + this.buf.srcOffset);
+ tokens = [];
+ var args = {
+ cb: function(r) { tokens = tokens.concat(r); },
+ pegTokenizer: this.tokenizer,
+ srcOffset: this.buf.srcOffset,
+ env: this.manager.env
+ };
+ this.tokenizer.savedSOL = this.buf.sol;
+ this.tokenizer.tokenize(str, 'start', args);
+ tokens = Util.stripEOFTkfromTokens(tokens);
+ } else {
+ tokens = this.buf;
+ }
+
+ switch (token.constructor) {
+ case SelfclosingTagTk:
+ if (token.name === "meta" &&
token.dataAttribs.stx !== "html") {
+ this.srcOffset = (token.dataAttribs.tsr
|| [null,null])[1];
} else {
this.clearSOL();
}
- } else {
+ break;
+
+ case TagTk:
+ if (token.getAttribute("typeof") ===
"mw:Nowiki") {
+ this.inNowiki = true;
+ }
+ if (token.name === "pre" && token.isHTMLTag()) {
+ this.inPre = true;
+ }
this.clearSOL();
- }
- break;
+ break;
- case CommentTk:
- // Comments don't change SOL state
- // Update srcOffset
- this.srcOffset = (token.dataAttribs.tsr ||
[null,null])[1];
- break;
-
- case SelfclosingTagTk:
- if (token.name === "meta" && token.dataAttribs.stx !==
"html") {
- this.srcOffset = (token.dataAttribs.tsr ||
[null,null])[1];
- } else {
+ case EndTagTk:
+ if (token.getAttribute("typeof") ===
"mw:Nowiki") {
+ this.inNowiki = false;
+ }
+ if (token.name === "pre" && token.isHTMLTag()) {
+ this.inPre = false;
+ }
this.clearSOL();
- }
- break;
+ break;
+ }
- case TagTk:
- if (token.getAttribute("typeof") === "mw:Nowiki") {
- this.inNowiki = true;
- }
- this.clearSOL();
- break;
+ tokens.push(token);
- case EndTagTk:
- if (token.getAttribute("typeof") === "mw:Nowiki") {
- this.inNowiki = false;
- }
- this.clearSOL();
- break;
+ // Reset buf
+ this.buf = [];
+ this.updateBuf();
}
+ // console.warn("toks: " + JSON.stringify(tokens));
return {tokens: tokens};
};
diff --git a/js/lib/mediawiki.tokenizer.peg.js
b/js/lib/mediawiki.tokenizer.peg.js
index 664a73d..77a560c 100644
--- a/js/lib/mediawiki.tokenizer.peg.js
+++ b/js/lib/mediawiki.tokenizer.peg.js
@@ -208,17 +208,20 @@
* Tokenize via a production passed in as an arg.
* The text is tokenized synchronously in one shot.
*/
-PegTokenizer.prototype.tokenize = function( text, production ) {
+PegTokenizer.prototype.tokenize = function( text, production, args ) {
try {
// Some productions use callbacks: start, tlb, toplevelblock.
// All other productions return tokens directly.
- var toks = [],
- retToks = this.tokenizer.tokenize(text, production, {
+ var toks = [];
+ if (!args) {
+ args = {
cb: function(r) { toks = toks.concat(r); },
pegTokenizer: this,
srcOffset: this.options.startOffset || 0,
env: this.env
- });
+ }
+ }
+ var retToks = this.tokenizer.tokenize(text, production, args);
if (retToks.constructor === Array && retToks.length > 0) {
toks = toks.concat(retToks);
--
To view, visit https://gerrit.wikimedia.org/r/60971
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I54bed19d4a3ab1a096d8756fe62667b59fa0cdf4
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits