Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/54506
Change subject: Tokenize *include* tags hierarchically just like extension tags.
......................................................................
Tokenize *include* tags hierarchically just like extension tags.
* First pass at cleaning up the tokenization of *include* tags.
- Applied technique used in 1a0b8840 to parse extension tags.
* Eliminated the use of TokenAndAttrCollector since the complex
delimiter matching across token and attribute levels is no longer
required -- this is now handled in the tokenizer.
* Fixed up handlers in ext.core.NoIncludeOnly to use TokenCollector.
* No change in parser test results, but some changes in the output of
failing tests -- nothing significant that we should be worried about.
Verified that es:Anexo:Monumentos_Históricos_de_Panamá continues
to parse correctly -- this page was what necessitated the coding
of the complex TokenAndAttrCollector handlers.
Change-Id: Id92b7bbfef730a6f32a2df52240e17dbd0cc0ae9
---
M js/lib/ext.core.NoIncludeOnly.js
D js/lib/ext.util.TokenAndAttrCollector.js
M js/lib/ext.util.TokenCollector.js
M js/lib/pegTokenizer.pegjs.txt
4 files changed, 99 insertions(+), 600 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/06/54506/1
diff --git a/js/lib/ext.core.NoIncludeOnly.js b/js/lib/ext.core.NoIncludeOnly.js
index 90b3fa1..e3a1cc2 100644
--- a/js/lib/ext.core.NoIncludeOnly.js
+++ b/js/lib/ext.core.NoIncludeOnly.js
@@ -4,7 +4,7 @@
* noinclude sections.
*/
-var Collector = require( './ext.util.TokenAndAttrCollector.js'
).TokenAndAttrCollector;
+var Collector = require( './ext.util.TokenCollector.js' ).TokenCollector;
/**
* This helper function will build a meta token in the right way for these
@@ -141,68 +141,8 @@
}
};
-function defaultNestedDelimiterHandler(nestedDelimiterInfo) {
- // Always clone the container token before modifying it
- var token = nestedDelimiterInfo.token.clone();
-
- // Strip the delimiter token wherever it is nested
- // and strip upto/from the delimiter depending on the
- // token type and where in the stream we are.
- var i = nestedDelimiterInfo.attrIndex;
- var delimiter = nestedDelimiterInfo.delimiter;
- var isOpenTag = delimiter.constructor === TagTk;
- var stripFrom = ((delimiter.name === "noinclude") && isOpenTag) ||
- ((delimiter.name === "includeOnly") &&
!isOpenTag);
- var stripUpto = ((delimiter.name === "noinclude") && !isOpenTag) ||
- ((delimiter.name === "includeOnly") &&
isOpenTag);
-
- if (nestedDelimiterInfo.k >= 0) {
- if (stripFrom) {
- token.attribs.splice(i+1);
- token.attribs[i].k.splice(nestedDelimiterInfo.k);
- }
- if (stripUpto) {
- // Since we are stripping upto the delimiter,
- // change the token to a simple span.
- // SSS FIXME: For sure in the case of table tags
(tr,td,th,etc.) but, always??
- token.name = 'span';
- token.attribs.splice(0, i);
- i = 0;
- token.attribs[i].k.splice(0, nestedDelimiterInfo.k);
- }
-
- // default -- not sure when this might be triggered
- if (!stripFrom && !stripUpto) {
- token.attribs[i].k.splice(nestedDelimiterInfo.k, 1);
- }
- token.attribs[i].ksrc = undefined;
- } else {
- if (stripFrom) {
- token.attribs.splice(i+1);
- token.attribs[i].v.splice(nestedDelimiterInfo.v);
- }
- if (stripUpto) {
- // Since we are stripping upto the delimiter,
- // change the token to a simple span.
- // SSS FIXME: For sure in the case of table tags
(tr,td,th,etc.) but, always??
- token.name = 'span';
- token.attribs.splice(0, i);
- i = 0;
- token.attribs[i].v.splice(0, nestedDelimiterInfo.v);
- }
-
- // default -- not sure when this might be triggered
- if (!stripFrom && !stripUpto) {
- token.attribs[i].v.splice(nestedDelimiterInfo.v, 1);
- }
- token.attribs[i].vsrc = undefined;
- }
-
- return {containerToken: token, delimiter: delimiter};
-}
-
function noIncludeHandler(manager, options, collection) {
- var start = collection.start, end = collection.end;
+ var start = collection.shift(), end = collection.pop();
// Handle self-closing tag case specially!
if (start.constructor === SelfclosingTagTk) {
@@ -213,41 +153,24 @@
var tokens = [];
- // Deal with nested opening delimiter found in another token
- if (start.constructor !== TagTk) {
- // FIXME: May use other handlers later.
- // May abort collection, convert to text, whatever ....
- // For now, this is just an intermediate solution while we
- // figure out other smarter strategies and how to plug them in
here.
-
tokens.push(defaultNestedDelimiterHandler(start).containerToken);
- }
-
if (!options.isInclude) {
// Content is preserved
var curriedBuildMetaToken = buildMetaToken.bind( null, manager,
'mw:NoInclude' ),
- // TODO: abstract this!
- startTSR = start &&
- start.dataAttribs &&
- start.dataAttribs.tsr,
- endTSR = end &&
- end.dataAttribs &&
- end.dataAttribs.tsr;
+ startTSR = start && start.dataAttribs &&
start.dataAttribs.tsr,
+ endTSR = end && end.dataAttribs && end.dataAttribs.tsr;
tokens.push(curriedBuildMetaToken(false, startTSR));
- tokens = tokens.concat(collection.tokens);
- if ( end ) {
+ tokens = tokens.concat(collection);
+ if ( end && end.constructor == EndTagTk ) {
tokens.push(curriedBuildMetaToken(true, endTSR));
- } else if ( tokens.last().constructor === EOFTk ) {
- tokens.pop();
}
} else if (options.wrapTemplates) {
// content is stripped
- tokens.push(buildStrippedMetaToken(manager, 'mw:NoInclude',
- start, end));
+ tokens.push(buildStrippedMetaToken(manager, 'mw:NoInclude',
start, end));
}
- // Deal with nested closing delimiter found in another token
- if (end && end.constructor !== EndTagTk) {
- tokens.push(defaultNestedDelimiterHandler(end).containerToken);
+ // Preserve EOF
+ if ( end.constructor === EOFTk ) {
+ tokens.push(end);
}
return { tokens: tokens };
@@ -259,12 +182,14 @@
noIncludeHandler.bind(null, manager, options),
true, // match the end-of-input if </noinclude> is
missing
0.02, // very early in stage 1, to avoid any further
processing.
+ 'tag',
'noinclude'
);
}
function includeOnlyHandler(manager, options, collection) {
- var start = collection.start, end = collection.end;
+ var start = collection.shift(),
+ end = collection.pop();
// Handle self-closing tag case specially!
if (start.constructor === SelfclosingTagTk) {
@@ -273,48 +198,19 @@
{ tokens: [ buildMetaToken(manager, 'mw:IncludeOnly',
false, (start.dataAttribs || {}).tsr) ] };
}
- // Deal with nested opening delimiter found in another token
- var startDelim, startHead;
- if (start.constructor !== TagTk) {
- // FIXME: May use other handlers later.
- // May abort collection, convert to text, whatever ....
- // For now, this is just an intermediate solution while we
- // figure out other smarter strategies and how to plug them in
here.
- var s = defaultNestedDelimiterHandler(start);
- startHead = s.containerToken;
- startDelim = s.delimiter;
- } else {
- startDelim = start;
- }
-
- // Deal with nested closing delimiter found in another token
- var endDelim, endTail;
- if (end) {
- if (end.constructor !== EndTagTk) {
- var e = defaultNestedDelimiterHandler(end);
- endTail = e.containerToken;
- endDelim = e.delimiter;
- } else {
- endDelim = end;
- }
- }
-
- var tokens = [];
- if (startHead) {
- tokens.push(startHead);
- }
+ var tokens = [], eof = end.constructor === EOFTk;
if (options.isInclude) {
// Just pass through the full collection including delimiters
- tokens = tokens.concat(collection.tokens);
+ tokens = tokens.concat(collection);
} else if (options.wrapTemplates) {
// Content is stripped, add a meta for round-tripping
- tokens.push(buildStrippedMetaToken(manager, 'mw:IncludeOnly',
- startDelim, endDelim));
+ tokens = [buildStrippedMetaToken(manager, 'mw:IncludeOnly',
start, eof ? null : end )];
}
- if (endTail) {
- tokens.push(endTail);
+ // Preserve EOF
+ if ( eof ) {
+ tokens.push(end);
}
return { tokens: tokens };
@@ -328,6 +224,7 @@
includeOnlyHandler.bind(null, manager, options),
true, // match the end-of-input if </noinclude> is
missing
0.03, // very early in stage 1, to avoid any further
processing.
+ 'tag',
'includeonly'
);
}
diff --git a/js/lib/ext.util.TokenAndAttrCollector.js
b/js/lib/ext.util.TokenAndAttrCollector.js
deleted file mode 100644
index 9a90bd4..0000000
--- a/js/lib/ext.util.TokenAndAttrCollector.js
+++ /dev/null
@@ -1,452 +0,0 @@
-/* ------------------------------------------------------------------------
- * Summary
- * -------
- * This whole handler collects delimiter-separated tokens on behalf of other
- * transformers. It is also one giant hack of sorts to get around:
- *
- * (1) precedence issues in a single-pass tokenizer (without
- * preprocessing passes like in the multi-pass PHP parser)
- * to process noinclude/includeonly and extension content tags.
- * (2) unbalanced/misnested tags in source wikitext.
- * ------------------------------------------------------------------------
- *
- * Token attributes can have one or both of their key/value information
- * come from a token stream. Ex: <div {{echo|id}}="{{echo|test}}">
- *
- * However if noinclude/includeonly or an extension tag shows up in
- * an attribute key/value position (as far as the tokenizer is concerned),
- * these delimiter tags get buried inside token attributes rather than
- * being present at the top-level of the token stream.
- *
- * Examples:
- * - <div <noinclude>id</noinclude><includeonly>about</includeonly>='foo'>
- * - <noinclude>{|</noinclude> ...
- * - {|<noinclude>style='color:red'</noinclude>
- * - [[Image:foo.jpg| .. <math>a|b</math> ..]]
- *
- * This class attempts to match up opening and closing tags across token
- * nesting boundaries when the parser cannot always accurately match them
- * up within the restricted parsing context. The broad strategy is to
- * find matching pairs of delimiters within attributes of the same token
- * and merge those attributes into a single attribute to let the
- * Attribute Expander handle them. After this is done, there should only
- * be atmost one unmatched open/closing delimiter within each token.
- *
- * The current strategy has been adopted to not crash while handling uses
- * of such tags that may not be properly nested vis-a-vis other tags:
- *
- * Ex: <p id="<noinclude>"> foo </noinclude></p>
- *
- * This use of <noinclude> tags spans a DOM-attribute and a DOM child
- * across levels and is not really well-structured wrt DOM semantics and
- * ideally should not be supported/seen in wikitext. This support may
- * evolve in the future to issue appropriate warnings/error messages to
- * encourage fixing up the relevant pages.
- * ------------------------------------------------------------------------ */
-
-"use strict";
-
-function TokenAndAttrCollector(manager, transformation, toEnd, rank, name) {
- this.transformation = transformation;
- this.manager = manager;
- this.rank = rank;
- this.tagName = name;
- this.toEnd = toEnd;
- this.hasOpenTag = false;
- // this.uid = this.manager.env.generateUID();
- manager.addTransform(this.onAnyToken.bind( this ),
"TokenAndAttrCollector:onAnyToken", rank, 'any');
-}
-
-TokenAndAttrCollector.prototype.init = function(start) {
- /*
- * start and end are usually delimiter tokens (ex: <noinclude> and
</noinclude>)
- * but, if they are nested in attributes of another token, then start
and end
- * will be an object with info about the nesting.
- *
- * tokens are all tokens in between start and end.
- *
- * The nesting info object has the following fields:
- * - delimiter : the nested delimiter (ex: </noinclude>, <includeonly>
..)
- * - token : the token that nested the delimiter
- * - attrIndex : index of the attribute where the delimiter was found
- * - k : if >= 0, the index of the delimiter with the k-array
of the attribute
- * - v : if >= 0, the index of the delimiter with the v-array
of the attribute
- */
- this.collection = {
- start : null,
- end : null,
- tokens : []
- };
- this.hasOpenTag = true;
- this.collection.start = start;
-};
-
-TokenAndAttrCollector.prototype.inspectAttrs = function(token) {
- /* --------------------------------------------------
- * NOTE: This function assumes:
- * - balanced open/closed delimiters.
- * - no nesting of delimiters.
- * -------------------------------------------------- */
-
- function findMatchingDelimIndex(delims, opts) {
- // Finds first/last open/closed delimiter tag
- var i, n = delims.length;
- if (opts.first) {
- i = 0;
- // xor to detect unmet condition
- while (i < n && (opts.open ^ delims[i].open)) {
- i++;
- }
-
- // failure case
- if (i === n) {
- i = -1;
- }
- } else {
- i = n - 1;
- // xor to detect unmet condition
- while (i >= 0 && (opts.open ^ delims[i].open)) {
- i--;
- }
- }
-
- return i;
- }
-
- function nothingSpecialToDo(collector, token) {
- if (collector.hasOpenTag) {
- collector.collection.tokens.push(token);
- return {};
- } else {
- return {tokens: [token]};
- }
- }
-
- function collectNestedDelimiters(collector, containerToken, attrIndex,
isK, tagArray) {
- // Don't collect balanced pairs of open-closed tags.
- // They will be taken care of by the attribute-handler.
- //
- // This let us distinguish between (a) and (b).
- // (a) <div style="<noinclude>">...</div>
- // (b) <div style="<noinclude>foo</noinclude>">...</div>
- var delims = [], openTag = null, closedTag = null;
- for (var j = 0, m = tagArray.length; j < m; j++) {
- var t = tagArray[j];
- var tc = t.constructor;
- if ((tc === TagTk) && (t.name === collector.tagName)) {
- openTag = {
- delimiter: t,
- open: true,
- token: containerToken,
- attrIndex: attrIndex,
- k: isK ? j : -1,
- v: !isK ? j : -1
- };
- } else if ((tc === EndTagTk) && (t.name ===
collector.tagName)) {
- closedTag = {
- delimiter: t,
- open: false,
- token: containerToken,
- attrIndex: attrIndex,
- k: isK ? j : -1,
- v: !isK ? j : -1
- };
-
- // Collect any unbalanced closed tag
- if (!openTag) {
- closedTag.unbalanced = true;
- delims.push(closedTag);
- }
-
- openTag = closedTag = null;
- }
- // FIXME: Not recursing down into t's attributes above
- }
-
- // Collect any unbalanced open tag
- if (openTag) {
- delims.push(openTag);
- }
-
- return delims;
- }
-
- function reuniteSeparatedPairs(token, delims) {
- /* -----------------------------------------------------------
- * FIXME: Merging attributes is not necessarily the right
- * solution in all cases. In certain parsing contexts,
- * we shouldn't be merging the attributes at all.
- *
- * Ex:
[[Image:foo.jpg|thumb|<noinclude>foo|bar|baz</noinclude>]]
- *
- * PHP parser treats these as 3 different attributes and
- * discards everything but 'baz'. But, by merging the 3 attrs,
- * this handler will include everything. This is an edge case,
- * so, not worrying about it now.
- *
- * FIXME: Later on, we may implement smarter merging strategies.
- * ------------------------------------------------------------
*/
-
- // helper function
- function mergeToks(toks, t) {
- if (t.constructor === Array) {
- return toks.concat(t);
- } else {
- toks.push(t);
- return toks;
- }
- }
-
- // helper function
- function mergeAttr(toks, a) {
- // Compute toks + a.k + "=" + a.v
- if (a.k === "mw:maybeContent") {
- /*
-----------------------------------------------------
- * FIXME: This is not the right solution in all
cases.
- * This is appropriate only when we are
processing
- * extension content where "|" has no special
meaning.
- * For now, we are turning a blind eye since
this is
- * likely an edge case:
- *
- *
[[Image:foo.jpg|thumb|<noinclude>foo|bar|baz</noinclude>]]
- *
---------------------------------------------------------- */
- toks.push("|");
- toks = mergeToks(toks, a.v);
- } else {
- toks = mergeToks(toks, a.k);
- if (a.v !== "") {
- toks.push('=');
- toks = mergeToks(toks, a.v);
- }
- }
- return toks;
- }
-
- // console.warn("T: " + JSON.stringify(token));
-
- // find the first open delim -- will be delims[0/1] for
well-formed WT
- var i = findMatchingDelimIndex(delims, {first: true, open:
true});
- var openD = i === -1 ? null : delims[i];
-
- // find the last closed delim -- will be delims[n-2/n-1] for
well-formed WT
- var j = findMatchingDelimIndex(delims, {first: false, open:
false});
- var closeD = j === -1 ? null : delims[j];
-
- // Merge all attributes between openD.attrIndex and
closeD.attrIndex
- // Tricky bits:
- // every attribute is a (k,v) pair, and we need to merge
- // both the k and v into one set of tokens and insert a "="
token
- // in between.
- // - we need to handle the first/last attribute specially since
the
- // openD/closeD may show up in either k/v of those attrs.
That
- // will determine what the merged k/v value will be.
- if (openD && closeD && i < j) {
- var attrs = token.attribs,
- toks, mergedK, mergedV;
-
- // console.warn("openD: " + JSON.stringify(openD));
- // console.warn("closeD: " + JSON.stringify(closeD));
-
- if (openD.attrIndex === closeD.attrIndex) {
- // Special Case: openD and closeD showed up in
k and v
- // of the same attr. In this case, openD would
have showed up
- // in k and closeD in v.
- //
- // assert(openD.k !== -1);
- // assert(closeD.k === -1);
- mergedK = mergeAttr([], attrs[openD.attrIndex]);
- mergedV = [];
- } else {
- if (openD.k === -1) {
- // openD didn't show up in k. Start
with v
- toks = mergeToks([],
attrs[openD.attrIndex].v);
- } else {
- // openD showed up in k. Merge k & v
- toks = mergeAttr([],
attrs[openD.attrIndex]);
- }
-
- var x = openD.attrIndex + 1;
- while (x < closeD.attrIndex) {
- // Compute toks + a.k + "=" + a.v
- toks = mergeAttr(toks, attrs[x]);
- x++;
- }
-
- // Compute merged (k,v)
- if (openD.k === -1) {
- // openD didn't show up in k.
- // Use orig-k for the merged KV
- // Merge closeD's attr into toks and
use it for v
- mergedK = attrs[openD.attrIndex].k;
- mergedV = mergeAttr(toks,
attrs[closeD.attrIndex]);
- } else {
- // openD showed up in k.
- // check where closedD showed up.
- if (closeD.k !== -1) {
- mergedK = mergeToks(toks,
attrs[closeD.attrIndex].k);
- mergedV =
attrs[closeD.attrIndex].v;
- } else {
- mergedK = mergeAttr(toks,
attrs[closeD.attrIndex]);
- mergedV = [];
- }
- }
- }
-
- // console.warn("t-delims: " + JSON.stringify(delims));
- // console.warn("-------------");
- // console.warn("t-orig: " + JSON.stringify(token));
- // console.warn("merged k: " + JSON.stringify(mergedK));
- // console.warn("merged v: " + JSON.stringify(mergedV));
-
- // clone token and splice in merged attribute
- var numDeleted = closeD.attrIndex - openD.attrIndex;
- token = token.clone();
-
- // FIXME: Blindly using src-offsets from the attribute
where the
- // opening tag showed up. This is a temporary hack. We
need a
- // better solution to this whole token-and-attr
collector mess.
- var newKV = new KV(mergedK, mergedV,
attrs[openD.attrIndex].srcOffsets);
- token.attribs.splice(openD.attrIndex, numDeleted + 1,
newKV);
-
- // console.warn("-------------");
- // console.warn("t-merged: " + JSON.stringify(token));
-
- // remove merged delims and update attr-index for
remaining delimiters
- delims.splice(i,j-i+1);
- while (i < delims.length) {
- delims.attrIndex -= numDeleted;
- i++;
- }
- }
-
- return [token, delims];
- }
-
- // Check tags to see if we have a nested delimiter
- var attrs = token.attribs;
- var delims = [];
- var i, n;
-
- for (i = 0, n = attrs.length; i < n; i++) {
- var a = attrs[i];
- var k = a.k;
- if (k.constructor === Array && k.length > 0) {
- delims = delims.concat(collectNestedDelimiters(this,
token, i, true, k));
- }
- var v = a.v;
- if (v.constructor === Array && v.length > 0) {
- delims = delims.concat(collectNestedDelimiters(this,
token, i, false, v));
- }
- }
-
- // console.warn("delims: " + JSON.stringify(delims));
-
- if (delims.length === 0) {
- return nothingSpecialToDo(this, token);
- } else {
- if (delims.length > 1) {
- // we will have delims.length %2 matched pairs across
- // attributes and their .k and .v properties. Merge
- // them into a unified attribute since this separation
- // is a Parsoid parsing artefact.
- var ret = reuniteSeparatedPairs(token, delims);
- token = ret[0];
- delims = ret[1];
- }
-
- if (delims.length === 0) {
- // we merged matching pairs and eliminated all nested
- // delims to process in this pass.
- return nothingSpecialToDo(this, token);
- } else {
- var openDelim, closedDelim;
-
- // Find first closed delim -- should always be the
delims[0]
- // if everything is working properly.
- i = findMatchingDelimIndex(delims, {first: true, open:
false });
- closedDelim = i === -1 ? null : delims[i];
-
- // Find last open delim -- should always be the
delims[numDelims-1]
- // if everything is working properly.
- i = findMatchingDelimIndex(delims, {first: false, open:
true });
- openDelim = i === -1 ? null : delims[i];
-
- if (this.hasOpenTag) {
- if (closedDelim) {
- this.collection.end = closedDelim;
- this.hasOpenTag = false;
- return
this.transformation(this.collection);
- } else {
- // nested/extra tag? we'll ignore it.
- return nothingSpecialToDo(this, token);
- }
- } else {
- if (openDelim) {
- this.init(openDelim);
- return {tokens: null};
- } else {
- // nested/extra tag? we'll ignore it.
- return nothingSpecialToDo(this, token);
- }
- }
- }
- }
-};
-
-TokenAndAttrCollector.prototype.onAnyToken = function( token, frame, cb ) {
- //console.warn("T<" + this.tagName + ":" + this.rank + ":" +
this.hasOpenTag + ">:" + JSON.stringify(token));
- var tc = token.constructor, res;
- if ((tc === TagTk) && (token.name === this.tagName)) {
- this.init(token);
- return {tokens: null};
- } else if (this.hasOpenTag) {
- if ((tc === EndTagTk) && (token.name === this.tagName)) {
- this.hasOpenTag = false;
- this.collection.end = token;
- return this.transformation(this.collection);
- } else if (tc === EOFTk) {
- if (this.toEnd) {
- this.collection.tokens.push(token);
- this.hasOpenTag = false;
- res = this.transformation(this.collection);
- // make sure we preserve the EOFTk
- if ( res.tokens && res.tokens.length &&
- res.tokens.last().constructor
!== EOFTk ) {
- res.tokens.push(token);
- } else {
- res = { tokens: [token] };
- }
- return res;
- } else {
- this.collection.tokens.push(token);
- return { tokens: this.collection.tokens };
- }
- } else if (tc === TagTk || tc === EndTagTk || tc ===
SelfclosingTagTk){
- return this.inspectAttrs(token);
- } else {
- this.collection.tokens.push(token);
- return { };
- }
- } else {
- if ((tc === EndTagTk) && (token.name === this.tagName)) {
- // ERROR! unbalanced closing token! -- convert to
string!
- // Spit out error somewhere.
- // FIXME: Copy over tsr
- return {tokens: [new String("</" + this.tagName +
">")]};
- } else if (tc === SelfclosingTagTk && token.name ===
this.tagName) {
- return this.transformation({
- start : token,
- end : null,
- tokens : []
- });
- } else if (tc === TagTk || tc === EndTagTk || tc ===
SelfclosingTagTk){
- return this.inspectAttrs(token);
- } else {
- return {tokens: [token]};
- }
- }
-};
-
-if (typeof module === "object") {
- module.exports.TokenAndAttrCollector = TokenAndAttrCollector;
-}
diff --git a/js/lib/ext.util.TokenCollector.js
b/js/lib/ext.util.TokenCollector.js
index 35cc08c..cee386f 100644
--- a/js/lib/ext.util.TokenCollector.js
+++ b/js/lib/ext.util.TokenCollector.js
@@ -1,9 +1,3 @@
-/* FIXME: Currently not used by anything now.
- * But once include-directives are handled better in the tokenizer,
- * token-and-attr-collector might become defunct and we might use
- * this instead. So keeping around till then.
- */
-
/**
* @class
*
@@ -52,7 +46,6 @@
* ExtensionContentCollector for example.
*/
TokenCollector.prototype._anyDelta = 0.00000001;
-
/**
* @private
@@ -122,9 +115,8 @@
// preserve the EOFTk
res.tokens.push(token);
- } else {
- res = { tokens: [token] };
}
+
return res;
}
} else {
diff --git a/js/lib/pegTokenizer.pegjs.txt b/js/lib/pegTokenizer.pegjs.txt
index be7dbff..76a04c7 100644
--- a/js/lib/pegTokenizer.pegjs.txt
+++ b/js/lib/pegTokenizer.pegjs.txt
@@ -296,10 +296,9 @@
return pos === inputLength;
};
- // Current extension tag being parsed.
+ // Current extension/include tag being parsed.
var currExtTag = null;
- // SSS FIXME: Temporary hack till the next round of cleanup and
refactoring.
var includeTags = Util.arrayToHash([
"includeonly", "noinclude", "onlyinclude"
]);
@@ -1454,7 +1453,9 @@
// Parse ext-content, strip eof, and shift tsr
var extContent = dp.src.substring(dp.tagWidths[0],
dp.src.length - dp.tagWidths[1]);
var extContentToks = (new
PegTokenizer(pegArgs.env)).tokenize(extContent);
- extContentToks = Util.stripEOFTkfromTokens(extContentToks);
+ if (dp.tagWidths[1] > 0) {
+ extContentToks = Util.stripEOFTkfromTokens(extContentToks);
+ }
Util.shiftTokenTSR(extContentToks, dp.tsr[0] +
dp.tagWidths[0]);
ret = [t2].concat(extContentToks);
}
@@ -2195,34 +2196,96 @@
* |}
*/
-include_limits = "<" end:"/"? name:[0-9a-zA-Z]+ (space / newline)* ">" {
+include_limits =
+ "</" name:[0-9a-zA-Z]+ (space / newline)* ">" {
+ // End tag only
name = name.join('');
var incl = name.toLowerCase();
if (incl === "noinclude" || incl === "onlyinclude" || incl ===
"includeonly") {
- var da = {tsr: [pos0, pos]};
+ var dp = {tsr: [pos0, pos]};
+ // Record variant since tag is not in normalized lower case
if (name !== incl) {
- da.srcTagName = name;
+ dp.srcTagName = name;
}
- if (end) {
- return [new EndTagTk(name, [], da)];
- } else {
- return [new TagTk(name, [], da)];
- }
+ return new EndTagTk(name, [], dp);
} else {
return null;
}
}
+ / inclTag:("<" name:[0-9a-zA-Z]+ (space / newline)* ">" {
+ // Start tag only
+ name = name.join('');
+ var incl = name.toLowerCase();
+ if (incl === "noinclude" || incl === "onlyinclude" || incl ===
"includeonly") {
+ var dp = {tsr: [pos0, pos]},
+ restOfInput = input.substring(pos0),
+ tagContent = restOfInput.match(new RegExp("^(.|\n)*?(</\s*" +
incl + ">)", "m")),
+ tagWidths = [pos-pos0, (tagContent ? tagContent[2].length : 0)],
+ inclSrc = tagContent ? tagContent[0] : restOfInput,
+ inclContentLen = inclSrc.length - tagWidths[0] - tagWidths[1],
+ skipLen = inclContentLen;
+
+ dp.src = inclSrc;
+ dp.origInput = input;
+
+ // Replace incl-content with '#' or '_'
+ var skipChar = '#';
+ if (input.length >= pos+skipLen && input[pos+skipLen] === '#') {
+ skipChar = '_';
+ }
+ input = input.slice(0,pos) +
+ Util.charSequence('', skipChar, skipLen) +
+ input.slice(pos+skipLen);
+
+ // Temporary state
+ dp.skipLen = skipLen;
+ dp.tagWidths = tagWidths;
+ dp.skipChar = skipChar;
+
+ // Record variant since tag is not in normalized lower case
+ if (name !== incl) {
+ dp.srcTagName = name;
+ }
+
+ return new TagTk(name, [], dp);
+ } else {
+ return null;
+ }
+ }) dummyText:('#'+ / '_'+) {
+ var dp = inclTag.dataAttribs;
+ if (dummyText.length !== dp.skipLen || dummyText[0] !== dp.skipChar) {
+ return null;
+ }
+
+ // Tokenize include content in a new tokenizer
+ var inclContent = dp.src.substring(dp.tagWidths[0], dp.src.length -
dp.tagWidths[1]),
+ inclContentToks = (new
PegTokenizer(pegArgs.env)).tokenize(inclContent);
+
+ if (dp.tagWidths[1] > 0) {
+ inclContentToks = Util.stripEOFTkfromTokens(inclContentToks);
+ }
+
+ // shift tsr
+ Util.shiftTokenTSR(inclContentToks, dp.tsr[0] + dp.tagWidths[0]);
+
+ // Reset input
+ input = dp.origInput;
+
+ // Clear temporary state
+ dp.skipChar = undefined;
+ dp.skipLen = undefined;
+ dp.origInput = undefined;
+
+ return [inclTag].concat(inclContentToks);
+ }
eof = & { return isEOF(pos); } { return true; }
-
-newline
- = '\n' / '\r\n'
+newline = '\n' / '\r\n'
newlineToken = newline { return [new NlTk([pos0, pos])] }
eolf = newline / eof
-
// 'Preprocessor' directive- higher-level things that can occur in otherwise
// plain-text content.
@@ -2381,7 +2444,6 @@
// SSS FIXME: what about |{{!}} and {{!}}|
pipe_pipe = "||" / "{{!}}{{!}}"
-
// Similar, for tables..
exclam = "!" / "{{;}}"
--
To view, visit https://gerrit.wikimedia.org/r/54506
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Id92b7bbfef730a6f32a2df52240e17dbd0cc0ae9
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits