Subramanya Sastry has uploaded a new change for review. https://gerrit.wikimedia.org/r/60971
Change subject: Generalized operation of TokenStreamPatcher a bit
......................................................................

Generalized operation of TokenStreamPatcher a bit

* The token stream patcher is a workaround/hack to deal with the limitations of not having a preprocessing template-expansion pass. So far, it had a special case for tackling re-parsing of table tags and nothing else.

* This commit generalizes this to handle other SOL (start-of-line) reparsing scenarios (lists, table-cells). Reparsing strings found in a SOL context is the most useful/necessary use-case right now, since templates could end in a SOL context, which might not be evident to the tokenizer until the template is fully expanded. Headings are not reparsed since they have EOL (end-of-line) requirements in addition to SOL requirements.

* Parsoid generates output identical to that of the PHP parser on the following wikitext.
--------------------
{{echo|}}{| width = '100%'
{{echo|}}!heading
{{echo|}}|-
{{echo|}}|foo
{{echo|}}|}
{{echo|}}*a
--------------------

* No change in parser test results.
Change-Id: I54bed19d4a3ab1a096d8756fe62667b59fa0cdf4 --- M js/lib/ext.core.TokenStreamPatcher.js M js/lib/mediawiki.tokenizer.peg.js 2 files changed, 94 insertions(+), 56 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid refs/changes/71/60971/1 diff --git a/js/lib/ext.core.TokenStreamPatcher.js b/js/lib/ext.core.TokenStreamPatcher.js index e9fadeb..65c5e7a 100644 --- a/js/lib/ext.core.TokenStreamPatcher.js +++ b/js/lib/ext.core.TokenStreamPatcher.js @@ -13,6 +13,7 @@ var PegTokenizer = require('./mediawiki.tokenizer.peg.js').PegTokenizer, Util = require('./mediawiki.Util.js').Util, defines = require('./mediawiki.parser.defines.js'); + // define some constructor shortcuts var CommentTk = defines.CommentTk, TagTk = defines.TagTk, @@ -21,7 +22,7 @@ function TokenStreamPatcher( manager, options ) { this.manager = manager; - this.tokenizer = new PegTokenizer(); + this.tokenizer = new PegTokenizer(this.manager.env); manager.addTransform(this.onNewline.bind(this), "TokenStreamPatcher:onNewline", this.nlRank, 'newline'); @@ -37,20 +38,30 @@ TokenStreamPatcher.prototype.anyRank = 2.002; TokenStreamPatcher.prototype.endRank = 2.003; -TokenStreamPatcher.prototype.reset = function() { - this.inNowiki = false; - this.sol = true; - this.srcOffset = null; - this.buf = []; +TokenStreamPatcher.prototype.updateBuf = function() { + if (this.buf.length === 0) { + this.buf.sol = this.sol; + this.buf.srcOffset = this.srcOffset; + } }; -TokenStreamPatcher.prototype.onNewline = function(token, manager) { +TokenStreamPatcher.prototype.reset = function() { + this.inNowiki = false; + this.inPre = false; + this.sol = true; + this.srcOffset = 0; + this.buf = []; + this.updateBuf(); +}; + +TokenStreamPatcher.prototype.onNewline = function(token) { this.sol = true; this.srcOffset = (token.dataAttribs.tsr || [null,null])[1]; + this.updateBuf(); return {tokens: [token]}; }; -TokenStreamPatcher.prototype.onEnd = function(token, manager) { 
+TokenStreamPatcher.prototype.onEnd = function(token) { this.reset(); return {tokens: [token]}; }; @@ -61,61 +72,85 @@ this.sol = false; }; -TokenStreamPatcher.prototype.onAny = function(token, manager) { +TokenStreamPatcher.prototype.onAny = function(token) { // console.warn("T: " + JSON.stringify(token)); - var tokens = [token]; - switch (token.constructor) { - case String: - // TRICK #1: - // Attempt to match "{|" after a newline and convert - // it to a table token. - if (this.sol && !this.inNowiki) { - if (token.match(/^\{\|/)) { - // Reparse string with the 'table_start_tag' production - // and shift tsr of result tokens by source offset - tokens = this.tokenizer.tokenize(token, 'table_start_tag'); - Util.shiftTokenTSR(tokens, this.srcOffset, true); - } else if (token.match(/^\s*$/)) { - // White-space doesn't change SOL state - // Update srcOffset - this.srcOffset += token.length; + var tokens = []; + if (token.constructor === String) { + if (this.inNowiki || this.inPre) { + tokens.push(token); + this.clearSOL(); + } else { + if (!token.match(/^\s*$/)) { + this.clearSOL(); + } + this.buf.push(token); + this.srcOffset += token.length; + } + } else if (token.constructor === CommentTk) { + // Comments don't change SOL state + // Update srcOffset + this.buf.push(token); + this.srcOffset = (token.dataAttribs.tsr || [null,null])[1]; + } else { + var str = this.buf.join(''); + // Attempt reparse of tables, table-cells, hrs, lists + // (anything that doesn't require end-of-line matching + // => headings are out). 
+ if (this.buf.sol && /^[\{\|\!\-\*#:;]/.test(str)) { + // console.warn("--str: " + JSON.stringify(str)); + // console.warn("--off: " + this.buf.srcOffset); + tokens = []; + var args = { + cb: function(r) { tokens = tokens.concat(r); }, + pegTokenizer: this.tokenizer, + srcOffset: this.buf.srcOffset, + env: this.manager.env + }; + this.tokenizer.savedSOL = this.buf.sol; + this.tokenizer.tokenize(str, 'start', args); + tokens = Util.stripEOFTkfromTokens(tokens); + } else { + tokens = this.buf; + } + + switch (token.constructor) { + case SelfclosingTagTk: + if (token.name === "meta" && token.dataAttribs.stx !== "html") { + this.srcOffset = (token.dataAttribs.tsr || [null,null])[1]; } else { this.clearSOL(); } - } else { + break; + + case TagTk: + if (token.getAttribute("typeof") === "mw:Nowiki") { + this.inNowiki = true; + } + if (token.name === "pre" && token.isHTMLTag()) { + this.inPre = true; + } this.clearSOL(); - } - break; + break; - case CommentTk: - // Comments don't change SOL state - // Update srcOffset - this.srcOffset = (token.dataAttribs.tsr || [null,null])[1]; - break; - - case SelfclosingTagTk: - if (token.name === "meta" && token.dataAttribs.stx !== "html") { - this.srcOffset = (token.dataAttribs.tsr || [null,null])[1]; - } else { + case EndTagTk: + if (token.getAttribute("typeof") === "mw:Nowiki") { + this.inNowiki = false; + } + if (token.name === "pre" && token.isHTMLTag()) { + this.inPre = false; + } this.clearSOL(); - } - break; + break; + } - case TagTk: - if (token.getAttribute("typeof") === "mw:Nowiki") { - this.inNowiki = true; - } - this.clearSOL(); - break; + tokens.push(token); - case EndTagTk: - if (token.getAttribute("typeof") === "mw:Nowiki") { - this.inNowiki = false; - } - this.clearSOL(); - break; + // Reset buf + this.buf = []; + this.updateBuf(); } + // console.warn("toks: " + JSON.stringify(tokens)); return {tokens: tokens}; }; diff --git a/js/lib/mediawiki.tokenizer.peg.js b/js/lib/mediawiki.tokenizer.peg.js index 
664a73d..77a560c 100644 --- a/js/lib/mediawiki.tokenizer.peg.js +++ b/js/lib/mediawiki.tokenizer.peg.js @@ -208,17 +208,20 @@ * Tokenize via a production passed in as an arg. * The text is tokenized synchronously in one shot. */ -PegTokenizer.prototype.tokenize = function( text, production ) { +PegTokenizer.prototype.tokenize = function( text, production, args ) { try { // Some productions use callbacks: start, tlb, toplevelblock. // All other productions return tokens directly. - var toks = [], - retToks = this.tokenizer.tokenize(text, production, { + var toks = []; + if (!args) { + args = { cb: function(r) { toks = toks.concat(r); }, pegTokenizer: this, srcOffset: this.options.startOffset || 0, env: this.env - }); + } + } + var retToks = this.tokenizer.tokenize(text, production, args); if (retToks.constructor === Array && retToks.length > 0) { toks = toks.concat(retToks); -- To view, visit https://gerrit.wikimedia.org/r/60971 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I54bed19d4a3ab1a096d8756fe62667b59fa0cdf4 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Parsoid Gerrit-Branch: master Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits