[MediaWiki-commits] [Gerrit] Tokenize include tags hierarchically just like extension t... - change (mediawiki...Parsoid)

Subramanya Sastry (Code Review) Mon, 18 Mar 2013 11:35:42 -0700

Subramanya Sastry has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/54506



Change subject: Tokenize *include* tags hierarchically just like extension tags.
......................................................................

Tokenize *include* tags hierarchically just like extension tags.

* First pass cleaning up tokenizing of *include* tags.
  - Applied technique used in 1a0b8840 to parse extension tags.

* Eliminated the use of TokenAndAttrCollector since the complex
  delimiter matching across token and attribute levels is no longer
  required -- this is now handled in the tokenizer.

* Fixed up handlers in ext.core.NoIncludeOnly to use TokenCollector.

* No change in parser test results, but some changes in the output of
  failed tests -- nothing significant that we should be worried about.

  Verified that es:Anexo:Monumentos_Históricos_de_Panamá continues
  to parse correctly -- this page was what necessitated the coding
  of the complex TokenAndAttrCollector handlers.

Change-Id: Id92b7bbfef730a6f32a2df52240e17dbd0cc0ae9
---
M js/lib/ext.core.NoIncludeOnly.js
D js/lib/ext.util.TokenAndAttrCollector.js
M js/lib/ext.util.TokenCollector.js
M js/lib/pegTokenizer.pegjs.txt
4 files changed, 99 insertions(+), 600 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid refs/changes/06/54506/1

diff --git a/js/lib/ext.core.NoIncludeOnly.js b/js/lib/ext.core.NoIncludeOnly.js
index 90b3fa1..e3a1cc2 100644
--- a/js/lib/ext.core.NoIncludeOnly.js
+++ b/js/lib/ext.core.NoIncludeOnly.js
@@ -4,7 +4,7 @@
  * noinclude sections.
  */
 
-var Collector = require( './ext.util.TokenAndAttrCollector.js' 
).TokenAndAttrCollector;
+var Collector = require( './ext.util.TokenCollector.js' ).TokenCollector;
 
 /**
  * This helper function will build a meta token in the right way for these
@@ -141,68 +141,8 @@
        }
 };
 
-function defaultNestedDelimiterHandler(nestedDelimiterInfo) {
-       // Always clone the container token before modifying it
-       var token = nestedDelimiterInfo.token.clone();
-
-       // Strip the delimiter token wherever it is nested
-       // and strip upto/from the delimiter depending on the
-       // token type and where in the stream we are.
-       var i = nestedDelimiterInfo.attrIndex;
-       var delimiter = nestedDelimiterInfo.delimiter;
-       var isOpenTag = delimiter.constructor === TagTk;
-       var stripFrom = ((delimiter.name === "noinclude") && isOpenTag) ||
-                                       ((delimiter.name === "includeOnly") && 
!isOpenTag);
-       var stripUpto = ((delimiter.name === "noinclude") && !isOpenTag) ||
-                                       ((delimiter.name === "includeOnly") && 
isOpenTag);
-
-       if (nestedDelimiterInfo.k >= 0) {
-               if (stripFrom) {
-                       token.attribs.splice(i+1);
-                       token.attribs[i].k.splice(nestedDelimiterInfo.k);
-               }
-               if (stripUpto) {
-                       // Since we are stripping upto the delimiter,
-                       // change the token to a simple span.
-                       // SSS FIXME: For sure in the case of table tags 
(tr,td,th,etc.) but, always??
-                       token.name = 'span';
-                       token.attribs.splice(0, i);
-                       i = 0;
-                       token.attribs[i].k.splice(0, nestedDelimiterInfo.k);
-               }
-
-               // default -- not sure when this might be triggered
-               if (!stripFrom && !stripUpto) {
-                       token.attribs[i].k.splice(nestedDelimiterInfo.k, 1);
-               }
-               token.attribs[i].ksrc = undefined;
-       } else {
-               if (stripFrom) {
-                       token.attribs.splice(i+1);
-                       token.attribs[i].v.splice(nestedDelimiterInfo.v);
-               }
-               if (stripUpto) {
-                       // Since we are stripping upto the delimiter,
-                       // change the token to a simple span.
-                       // SSS FIXME: For sure in the case of table tags 
(tr,td,th,etc.) but, always??
-                       token.name = 'span';
-                       token.attribs.splice(0, i);
-                       i = 0;
-                       token.attribs[i].v.splice(0, nestedDelimiterInfo.v);
-               }
-
-               // default -- not sure when this might be triggered
-               if (!stripFrom && !stripUpto) {
-                       token.attribs[i].v.splice(nestedDelimiterInfo.v, 1);
-               }
-               token.attribs[i].vsrc = undefined;
-       }
-
-       return {containerToken: token, delimiter: delimiter};
-}
-
 function noIncludeHandler(manager, options, collection) {
-       var start = collection.start, end = collection.end;
+       var start = collection.shift(), end = collection.pop();
 
        // Handle self-closing tag case specially!
        if (start.constructor === SelfclosingTagTk) {
@@ -213,41 +153,24 @@
 
        var tokens = [];
 
-       // Deal with nested opening delimiter found in another token
-       if (start.constructor !== TagTk) {
-               // FIXME: May use other handlers later.
-               // May abort collection, convert to text, whatever ....
-               // For now, this is just an intermediate solution while we
-               // figure out other smarter strategies and how to plug them in 
here.
-               
tokens.push(defaultNestedDelimiterHandler(start).containerToken);
-       }
-
        if (!options.isInclude) {
                // Content is preserved
                var curriedBuildMetaToken = buildMetaToken.bind( null, manager, 
'mw:NoInclude' ),
-                       // TODO: abstract this!
-                       startTSR = start &&
-                               start.dataAttribs &&
-                               start.dataAttribs.tsr,
-                       endTSR = end &&
-                               end.dataAttribs &&
-                               end.dataAttribs.tsr;
+                       startTSR = start && start.dataAttribs && 
start.dataAttribs.tsr,
+                       endTSR = end && end.dataAttribs && end.dataAttribs.tsr;
                tokens.push(curriedBuildMetaToken(false, startTSR));
-               tokens = tokens.concat(collection.tokens);
-               if ( end ) {
+               tokens = tokens.concat(collection);
+               if ( end && end.constructor == EndTagTk ) {
                        tokens.push(curriedBuildMetaToken(true, endTSR));
-               } else if ( tokens.last().constructor === EOFTk ) {
-                       tokens.pop();
                }
        } else if (options.wrapTemplates) {
                // content is stripped
-               tokens.push(buildStrippedMetaToken(manager, 'mw:NoInclude',
-                                       start, end));
+               tokens.push(buildStrippedMetaToken(manager, 'mw:NoInclude', 
start, end));
        }
 
-       // Deal with nested closing delimiter found in another token
-       if (end && end.constructor !== EndTagTk) {
-               tokens.push(defaultNestedDelimiterHandler(end).containerToken);
+       // Preserve EOF
+       if ( end.constructor === EOFTk ) {
+               tokens.push(end);
        }
 
        return { tokens: tokens };
@@ -259,12 +182,14 @@
                        noIncludeHandler.bind(null, manager, options),
                        true, // match the end-of-input if </noinclude> is 
missing
                        0.02, // very early in stage 1, to avoid any further 
processing.
+                       'tag',
                        'noinclude'
                        );
 }
 
 function includeOnlyHandler(manager, options, collection) {
-       var start = collection.start, end = collection.end;
+       var start = collection.shift(),
+               end = collection.pop();
 
        // Handle self-closing tag case specially!
        if (start.constructor === SelfclosingTagTk) {
@@ -273,48 +198,19 @@
                        { tokens: [ buildMetaToken(manager, 'mw:IncludeOnly', 
false, (start.dataAttribs || {}).tsr) ] };
        }
 
-       // Deal with nested opening delimiter found in another token
-       var startDelim, startHead;
-       if (start.constructor !== TagTk) {
-               // FIXME: May use other handlers later.
-               // May abort collection, convert to text, whatever ....
-               // For now, this is just an intermediate solution while we
-               // figure out other smarter strategies and how to plug them in 
here.
-               var s = defaultNestedDelimiterHandler(start);
-               startHead  = s.containerToken;
-               startDelim = s.delimiter;
-       } else {
-               startDelim = start;
-       }
-
-       // Deal with nested closing delimiter found in another token
-       var endDelim, endTail;
-       if (end) {
-               if (end.constructor !== EndTagTk) {
-                       var e = defaultNestedDelimiterHandler(end);
-                       endTail  = e.containerToken;
-                       endDelim = e.delimiter;
-               } else {
-                       endDelim = end;
-               }
-       }
-
-       var tokens = [];
-       if (startHead) {
-               tokens.push(startHead);
-       }
+       var tokens = [], eof = end.constructor === EOFTk;
 
        if (options.isInclude) {
                // Just pass through the full collection including delimiters
-               tokens = tokens.concat(collection.tokens);
+               tokens = tokens.concat(collection);
        } else if (options.wrapTemplates) {
                // Content is stripped, add a meta for round-tripping
-               tokens.push(buildStrippedMetaToken(manager, 'mw:IncludeOnly',
-                                       startDelim, endDelim));
+               tokens = [buildStrippedMetaToken(manager, 'mw:IncludeOnly', 
start, eof ? null : end )];
        }
 
-       if (endTail) {
-               tokens.push(endTail);
+       // Preserve EOF
+       if ( eof ) {
+               tokens.push(end);
        }
 
        return { tokens: tokens };
@@ -328,6 +224,7 @@
                        includeOnlyHandler.bind(null, manager, options),
                        true, // match the end-of-input if </noinclude> is 
missing
                        0.03, // very early in stage 1, to avoid any further 
processing.
+                       'tag',
                        'includeonly'
                        );
 }
diff --git a/js/lib/ext.util.TokenAndAttrCollector.js 
b/js/lib/ext.util.TokenAndAttrCollector.js
deleted file mode 100644
index 9a90bd4..0000000
--- a/js/lib/ext.util.TokenAndAttrCollector.js
+++ /dev/null
@@ -1,452 +0,0 @@
-/* ------------------------------------------------------------------------
- * Summary
- * -------
- * This whole handler collects delimiter-separated tokens on behalf of other
- * transformers.  It is also one giant hack of sorts to get around:
- *
- * (1) precedence issues in a single-pass tokenizer (without
- *     preprocessing passes like in the multi-pass PHP parser)
- *     to process noinclude/includeonly and extension content tags.
- * (2) unbalanced/misnested tags in source wikitext.
- * ------------------------------------------------------------------------
- *
- * Token attributes can have one or both of their key/value information
- * come from a token stream.  Ex: <div {{echo|id}}="{{echo|test}}">
- *
- * However if noinclude/includeonly or an extension tag shows up in
- * an attribute key/value position (as far as the tokenizer is concerned),
- * these delimiter tags get buried inside token attributes rather than
- * being present at the top-level of the token stream.
- *
- * Examples:
- * - <div <noinclude>id</noinclude><includeonly>about</includeonly>='foo'>
- * - <noinclude>{|</noinclude> ...
- * - {|<noinclude>style='color:red'</noinclude>
- * - [[Image:foo.jpg| .. <math>a|b</math> ..]]
- *
- * This class attempts to match up opening and closing tags across token
- * nesting boundaries when the parser cannot always accurately match them
- * up within the restricted parsing context.  The broad strategy is to
- * find matching pairs of delimiters within attributes of the same token
- * and merge those attributes into a single attribute to let the
- * Attribute Expander handle them.  After this is done, there should only
- * be atmost one unmatched open/closing delimiter within each token.
- *
- * The current strategy has been adopted to not crash while handling uses
- * of such tags that may not be properly nested vis-a-vis other tags:
- *
- * Ex: <p id="<noinclude>"> foo </noinclude></p>
- *
- * This use of <noinclude> tags spans a DOM-attribute and a DOM child
- * across levels and is not really well-structured wrt DOM semantics and
- * ideally should not be supported/seen in wikitext.  This support may
- * evolve in the future to issue appropriate warnings/error messages to
- * encourage fixing up the relevant pages.
- * ------------------------------------------------------------------------ */
-
-"use strict";
-
-function TokenAndAttrCollector(manager, transformation, toEnd, rank, name) {
-       this.transformation = transformation;
-       this.manager = manager;
-       this.rank = rank;
-       this.tagName = name;
-       this.toEnd = toEnd;
-       this.hasOpenTag = false;
-       // this.uid = this.manager.env.generateUID();
-       manager.addTransform(this.onAnyToken.bind( this ), 
"TokenAndAttrCollector:onAnyToken", rank, 'any');
-}
-
-TokenAndAttrCollector.prototype.init = function(start) {
-       /*
-        * start and end are usually delimiter tokens (ex: <noinclude> and 
</noinclude>)
-        * but, if they are nested in attributes of another token, then start 
and end
-        * will be an object with info about the nesting.
-        *
-        * tokens are all tokens in between start and end.
-        *
-        * The nesting info object has the following fields:
-        * - delimiter : the nested delimiter (ex: </noinclude>, <includeonly> 
..)
-        * - token     : the token that nested the delimiter
-        * - attrIndex : index of the attribute where the delimiter was found
-        * - k         : if >= 0, the index of the delimiter with the k-array 
of the attribute
-        * - v         : if >= 0, the index of the delimiter with the v-array 
of the attribute
-        */
-       this.collection = {
-               start  : null,
-               end    : null,
-               tokens : []
-       };
-       this.hasOpenTag = true;
-       this.collection.start = start;
-};
-
-TokenAndAttrCollector.prototype.inspectAttrs = function(token) {
-       /* --------------------------------------------------
-        * NOTE: This function assumes:
-        * - balanced open/closed delimiters.
-        * - no nesting of delimiters.
-        * -------------------------------------------------- */
-
-       function findMatchingDelimIndex(delims, opts) {
-               // Finds first/last open/closed delimiter tag
-               var i, n = delims.length;
-               if (opts.first) {
-                       i = 0;
-                       // xor to detect unmet condition
-                       while (i < n && (opts.open ^ delims[i].open)) {
-                               i++;
-                       }
-
-                       // failure case
-                       if (i === n) {
-                               i = -1;
-                       }
-               } else {
-                       i = n - 1;
-                       // xor to detect unmet condition
-                       while (i >= 0 && (opts.open ^ delims[i].open)) {
-                               i--;
-                       }
-               }
-
-               return i;
-       }
-
-       function nothingSpecialToDo(collector, token) {
-               if (collector.hasOpenTag) {
-                       collector.collection.tokens.push(token);
-                       return {};
-               } else {
-                       return {tokens: [token]};
-               }
-       }
-
-       function collectNestedDelimiters(collector, containerToken, attrIndex, 
isK, tagArray) {
-               // Don't collect balanced pairs of open-closed tags.
-               // They will be taken care of by the attribute-handler.
-               //
-               // This let us distinguish between (a) and (b).
-               // (a) <div style="<noinclude>">...</div>
-               // (b) <div style="<noinclude>foo</noinclude>">...</div>
-               var delims = [], openTag = null, closedTag = null;
-               for (var j = 0, m = tagArray.length; j < m; j++) {
-                       var t  = tagArray[j];
-                       var tc = t.constructor;
-                       if ((tc === TagTk) && (t.name === collector.tagName)) {
-                               openTag = {
-                                       delimiter: t,
-                                       open: true,
-                                       token: containerToken,
-                                       attrIndex: attrIndex,
-                                       k: isK  ? j : -1,
-                                       v: !isK ? j : -1
-                               };
-                       } else if ((tc === EndTagTk) && (t.name === 
collector.tagName)) {
-                               closedTag = {
-                                       delimiter: t,
-                                       open: false,
-                                       token: containerToken,
-                                       attrIndex: attrIndex,
-                                       k: isK  ? j : -1,
-                                       v: !isK ? j : -1
-                               };
-
-                               // Collect any unbalanced closed tag
-                               if (!openTag) {
-                                       closedTag.unbalanced = true;
-                                       delims.push(closedTag);
-                               }
-
-                               openTag = closedTag = null;
-                       }
-                       // FIXME: Not recursing down into t's attributes above
-               }
-
-               // Collect any unbalanced open tag
-               if (openTag) {
-                       delims.push(openTag);
-               }
-
-               return delims;
-       }
-
-       function reuniteSeparatedPairs(token, delims) {
-               /* -----------------------------------------------------------
-                * FIXME: Merging attributes is not necessarily the right
-                * solution in all cases.  In certain parsing contexts,
-                * we shouldn't be merging the attributes at all.
-                *
-                * Ex: 
[[Image:foo.jpg|thumb|<noinclude>foo|bar|baz</noinclude>]]
-                *
-                * PHP parser treats these as 3 different attributes and
-                * discards everything but 'baz'.  But, by merging the 3 attrs,
-                * this handler will include everything.  This is an edge case,
-                * so, not worrying about it now.
-                *
-                * FIXME: Later on, we may implement smarter merging strategies.
-                * ------------------------------------------------------------ 
*/
-
-               // helper function
-               function mergeToks(toks, t) {
-                       if (t.constructor === Array) {
-                               return toks.concat(t);
-                       } else {
-                               toks.push(t);
-                               return toks;
-                       }
-               }
-
-               // helper function
-               function mergeAttr(toks, a) {
-                       // Compute toks + a.k + "=" + a.v
-                       if (a.k === "mw:maybeContent") {
-                               /* 
-----------------------------------------------------
-                                * FIXME: This is not the right solution in all 
cases.
-                                * This is appropriate only when we are 
processing
-                                * extension content where "|" has no special 
meaning.
-                                * For now, we are turning a blind eye since 
this is
-                                * likely an edge case:
-                                *
-                                * 
[[Image:foo.jpg|thumb|<noinclude>foo|bar|baz</noinclude>]]
-                                * 
---------------------------------------------------------- */
-                               toks.push("|");
-                               toks = mergeToks(toks, a.v);
-                       } else {
-                               toks = mergeToks(toks, a.k);
-                               if (a.v !== "") {
-                                       toks.push('=');
-                                       toks = mergeToks(toks, a.v);
-                               }
-                       }
-                       return toks;
-               }
-
-               // console.warn("T: " + JSON.stringify(token));
-
-               // find the first open delim -- will be delims[0/1] for 
well-formed WT
-               var i = findMatchingDelimIndex(delims, {first: true, open: 
true});
-               var openD = i === -1 ? null : delims[i];
-
-               // find the last closed delim -- will be delims[n-2/n-1] for 
well-formed WT
-               var j = findMatchingDelimIndex(delims, {first: false, open: 
false});
-               var closeD = j === -1 ? null : delims[j];
-
-               // Merge all attributes between openD.attrIndex and 
closeD.attrIndex
-               // Tricky bits:
-               //  every attribute is a (k,v) pair, and we need to merge
-               //   both the k and v into one set of tokens and insert a "=" 
token
-               //   in between.
-               // - we need to handle the first/last attribute specially since 
the
-               //   openD/closeD may show up in either k/v of those attrs.  
That
-               //   will determine what the merged k/v value will be.
-               if (openD && closeD && i < j) {
-                       var attrs = token.attribs,
-                               toks, mergedK, mergedV;
-
-                       // console.warn("openD: " + JSON.stringify(openD));
-                       // console.warn("closeD: " + JSON.stringify(closeD));
-
-                       if (openD.attrIndex === closeD.attrIndex) {
-                               // Special Case: openD and closeD showed up in 
k and v
-                               // of the same attr. In this case, openD would 
have showed up
-                               // in k and closeD in v.
-                               //
-                               // assert(openD.k !== -1);
-                               // assert(closeD.k === -1);
-                               mergedK = mergeAttr([], attrs[openD.attrIndex]);
-                               mergedV = [];
-                       } else {
-                               if (openD.k === -1) {
-                                       // openD didn't show up in k. Start 
with v
-                                       toks = mergeToks([], 
attrs[openD.attrIndex].v);
-                               } else {
-                                       // openD showed up in k.  Merge k & v
-                                       toks = mergeAttr([], 
attrs[openD.attrIndex]);
-                               }
-
-                               var x = openD.attrIndex + 1;
-                               while (x < closeD.attrIndex) {
-                                       // Compute toks + a.k + "=" + a.v
-                                       toks = mergeAttr(toks, attrs[x]);
-                                       x++;
-                               }
-
-                               // Compute merged (k,v)
-                               if (openD.k === -1) {
-                                       // openD didn't show up in k.
-                                       // Use orig-k for the merged KV
-                                       // Merge closeD's attr into toks and 
use it for v
-                                       mergedK = attrs[openD.attrIndex].k;
-                                       mergedV = mergeAttr(toks, 
attrs[closeD.attrIndex]);
-                               } else {
-                                       // openD showed up in k.
-                                       // check where closedD showed up.
-                                       if (closeD.k !== -1) {
-                                               mergedK = mergeToks(toks, 
attrs[closeD.attrIndex].k);
-                                               mergedV = 
attrs[closeD.attrIndex].v;
-                                       } else {
-                                               mergedK = mergeAttr(toks, 
attrs[closeD.attrIndex]);
-                                               mergedV = [];
-                                       }
-                               }
-                       }
-
-                       // console.warn("t-delims: " + JSON.stringify(delims));
-                       // console.warn("-------------");
-                       // console.warn("t-orig: " + JSON.stringify(token));
-                       // console.warn("merged k: " + JSON.stringify(mergedK));
-                       // console.warn("merged v: " + JSON.stringify(mergedV));
-
-                       // clone token and splice in merged attribute
-                       var numDeleted = closeD.attrIndex - openD.attrIndex;
-                       token = token.clone();
-
-                       // FIXME: Blindly using src-offsets from the attribute 
where the
-                       // opening tag showed up. This is a temporary hack.  We 
need a
-                       // better solution to this whole token-and-attr 
collector mess.
-                       var newKV = new KV(mergedK, mergedV, 
attrs[openD.attrIndex].srcOffsets);
-                       token.attribs.splice(openD.attrIndex, numDeleted + 1, 
newKV);
-
-                       // console.warn("-------------");
-                       // console.warn("t-merged: " + JSON.stringify(token));
-
-                       // remove merged delims and update attr-index for 
remaining delimiters
-                       delims.splice(i,j-i+1);
-                       while (i < delims.length) {
-                               delims.attrIndex -= numDeleted;
-                               i++;
-                       }
-               }
-
-               return [token, delims];
-       }
-
-       // Check tags to see if we have a nested delimiter
-       var attrs = token.attribs;
-       var delims = [];
-       var i, n;
-
-       for (i = 0, n = attrs.length; i < n; i++) {
-               var a = attrs[i];
-               var k = a.k;
-               if (k.constructor === Array && k.length > 0) {
-                       delims = delims.concat(collectNestedDelimiters(this, 
token, i, true, k));
-               }
-               var v = a.v;
-               if (v.constructor === Array && v.length > 0) {
-                       delims = delims.concat(collectNestedDelimiters(this, 
token, i, false, v));
-               }
-       }
-
-       // console.warn("delims: " + JSON.stringify(delims));
-
-       if (delims.length === 0) {
-               return nothingSpecialToDo(this, token);
-       } else {
-               if (delims.length > 1) {
-                       // we will have delims.length %2 matched pairs across
-                       // attributes and their .k and .v properties.  Merge
-                       // them into a unified attribute since this separation
-                       // is a Parsoid parsing artefact.
-                       var ret = reuniteSeparatedPairs(token, delims);
-                       token  = ret[0];
-                       delims = ret[1];
-               }
-
-               if (delims.length === 0) {
-                       // we merged matching pairs and eliminated all nested
-                       // delims to process in this pass.
-                       return nothingSpecialToDo(this, token);
-               } else {
-                       var openDelim, closedDelim;
-
-                       // Find first closed delim -- should always be the 
delims[0]
-                       // if everything is working properly.
-                       i = findMatchingDelimIndex(delims, {first: true, open: 
false });
-                       closedDelim = i === -1 ? null : delims[i];
-
-                       // Find last open delim -- should always be the 
delims[numDelims-1]
-                       // if everything is working properly.
-                       i = findMatchingDelimIndex(delims, {first: false, open: 
true });
-                       openDelim = i === -1 ? null : delims[i];
-
-                       if (this.hasOpenTag) {
-                               if (closedDelim) {
-                                       this.collection.end = closedDelim;
-                                       this.hasOpenTag = false;
-                                       return 
this.transformation(this.collection);
-                               } else {
-                                       // nested/extra tag?  we'll ignore it.
-                                       return nothingSpecialToDo(this, token);
-                               }
-                       } else {
-                               if (openDelim) {
-                                       this.init(openDelim);
-                                       return {tokens: null};
-                               } else {
-                                       // nested/extra tag?  we'll ignore it.
-                                       return nothingSpecialToDo(this, token);
-                               }
-                       }
-               }
-       }
-};
-
-TokenAndAttrCollector.prototype.onAnyToken = function( token, frame, cb ) {
-       //console.warn("T<" + this.tagName + ":" + this.rank + ":" + 
this.hasOpenTag + ">:" + JSON.stringify(token));
-       var tc = token.constructor, res;
-       if ((tc === TagTk) && (token.name === this.tagName)) {
-               this.init(token);
-               return {tokens: null};
-       } else if (this.hasOpenTag) {
-               if ((tc === EndTagTk) && (token.name === this.tagName)) {
-                       this.hasOpenTag = false;
-                       this.collection.end = token;
-                       return this.transformation(this.collection);
-               } else if (tc === EOFTk) {
-                       if (this.toEnd) {
-                               this.collection.tokens.push(token);
-                               this.hasOpenTag = false;
-                               res = this.transformation(this.collection);
-                               // make sure we preserve the EOFTk
-                               if ( res.tokens && res.tokens.length &&
-                                               res.tokens.last().constructor 
!== EOFTk ) {
-                                       res.tokens.push(token);
-                               } else {
-                                       res = { tokens: [token] };
-                               }
-                               return res;
-                       } else {
-                               this.collection.tokens.push(token);
-                               return { tokens: this.collection.tokens };
-                       }
-               } else if (tc === TagTk || tc === EndTagTk || tc === 
SelfclosingTagTk){
-                       return this.inspectAttrs(token);
-               } else {
-                       this.collection.tokens.push(token);
-                       return { };
-               }
-       } else {
-               if ((tc === EndTagTk) && (token.name === this.tagName)) {
-                       // ERROR! unbalanced closing token! -- convert to 
string!
-                       // Spit out error somewhere.
-                       // FIXME: Copy over tsr
-                       return {tokens: [new String("</" + this.tagName + 
">")]};
-               } else if (tc === SelfclosingTagTk && token.name === 
this.tagName) {
-                       return this.transformation({
-                               start  : token,
-                               end    : null,
-                               tokens : []
-                       });
-               } else if (tc === TagTk || tc === EndTagTk || tc === 
SelfclosingTagTk){
-                       return this.inspectAttrs(token);
-               } else {
-                       return {tokens: [token]};
-               }
-       }
-};
-
-if (typeof module === "object") {
-       module.exports.TokenAndAttrCollector = TokenAndAttrCollector;
-}
diff --git a/js/lib/ext.util.TokenCollector.js 
b/js/lib/ext.util.TokenCollector.js
index 35cc08c..cee386f 100644
--- a/js/lib/ext.util.TokenCollector.js
+++ b/js/lib/ext.util.TokenCollector.js
@@ -1,9 +1,3 @@
-/* FIXME: Currently not used by anything now.
- * But once include-directives are handled better in the tokenizer,
- * token-and-attr-collector might become defunct and we might use
- * this instead.  So keeping around till then.
- */
-
 /**
  * @class
  *
@@ -52,7 +46,6 @@
  * ExtensionContentCollector for example.
  */
 TokenCollector.prototype._anyDelta = 0.00000001;
-
 
 /**
  * @private
@@ -122,9 +115,8 @@
 
                                // preserve the EOFTk
                                res.tokens.push(token);
-                       } else {
-                               res = { tokens: [token] };
                        }
+
                        return res;
                }
        } else {
diff --git a/js/lib/pegTokenizer.pegjs.txt b/js/lib/pegTokenizer.pegjs.txt
index be7dbff..76a04c7 100644
--- a/js/lib/pegTokenizer.pegjs.txt
+++ b/js/lib/pegTokenizer.pegjs.txt
@@ -296,10 +296,9 @@
         return pos === inputLength;
     };
 
-    // Current extension tag being parsed.
+    // Current extension/include tag being parsed.
     var currExtTag = null;
 
-    // SSS FIXME: Temporary hack till the next round of cleanup and refactoring.
     var includeTags = Util.arrayToHash([
         "includeonly", "noinclude", "onlyinclude"
     ]);
@@ -1454,7 +1453,9 @@
                 // Parse ext-content, strip eof, and shift tsr
                 var extContent = dp.src.substring(dp.tagWidths[0], 
dp.src.length - dp.tagWidths[1]);
                 var extContentToks = (new 
PegTokenizer(pegArgs.env)).tokenize(extContent);
-                extContentToks = Util.stripEOFTkfromTokens(extContentToks);
+                if (dp.tagWidths[1] > 0) {
+                    extContentToks = Util.stripEOFTkfromTokens(extContentToks);
+                }
                 Util.shiftTokenTSR(extContentToks, dp.tsr[0] + 
dp.tagWidths[0]);
                 ret = [t2].concat(extContentToks);
             }
@@ -2195,34 +2196,96 @@
  * |}
  */
 
-include_limits = "<" end:"/"? name:[0-9a-zA-Z]+ (space / newline)* ">" {
+include_limits =
+  "</" name:[0-9a-zA-Z]+ (space / newline)* ">" {
+     // End tag only
      name = name.join('');
      var incl = name.toLowerCase();
      if (incl === "noinclude" || incl === "onlyinclude" || incl === 
"includeonly") {
-         var da = {tsr: [pos0, pos]};
+         var dp = {tsr: [pos0, pos]};
+         // Record variant since tag is not in normalized lower case
          if (name !== incl) {
-             da.srcTagName = name;
+             dp.srcTagName = name;
          }
-         if (end) {
-            return [new EndTagTk(name, [], da)];
-         } else {
-            return [new TagTk(name, [], da)];
-         }
+         return new EndTagTk(name, [], dp);
      } else {
          return null;
      }
   }
+  / inclTag:("<" name:[0-9a-zA-Z]+ (space / newline)* ">" {
+     // Start tag only
+     name = name.join('');
+     var incl = name.toLowerCase();
+     if (incl === "noinclude" || incl === "onlyinclude" || incl === 
"includeonly") {
+         var dp = {tsr: [pos0, pos]},
+             restOfInput = input.substring(pos0),
+             tagContent = restOfInput.match(new RegExp("^(.|\n)*?(</\s*" + 
incl + ">)", "m")),
+             tagWidths = [pos-pos0, (tagContent ? tagContent[2].length : 0)],
+             inclSrc = tagContent ? tagContent[0] : restOfInput,
+             inclContentLen = inclSrc.length - tagWidths[0] - tagWidths[1],
+             skipLen = inclContentLen;
+
+         dp.src = inclSrc;
+         dp.origInput = input;
+
+         // Replace incl-content with '#' or '_'
+         var skipChar = '#';
+         if (input.length >= pos+skipLen && input[pos+skipLen] === '#') {
+             skipChar = '_';
+         }
+         input = input.slice(0,pos) +
+             Util.charSequence('', skipChar, skipLen) +
+             input.slice(pos+skipLen);
+
+         // Temporary state
+         dp.skipLen = skipLen;
+         dp.tagWidths = tagWidths;
+         dp.skipChar = skipChar;
+
+         // Record variant since tag is not in normalized lower case
+         if (name !== incl) {
+             dp.srcTagName = name;
+         }
+
+         return new TagTk(name, [], dp);
+     } else {
+         return null;
+     }
+  }) dummyText:('#'+ / '_'+) {
+      var dp = inclTag.dataAttribs;
+      if (dummyText.length !== dp.skipLen || dummyText[0] !== dp.skipChar) {
+          return null;
+      }
+
+      // Tokenize include content in a new tokenizer
+      var inclContent = dp.src.substring(dp.tagWidths[0], dp.src.length - 
dp.tagWidths[1]),
+          inclContentToks = (new 
PegTokenizer(pegArgs.env)).tokenize(inclContent);
+
+      if (dp.tagWidths[1] > 0) {
+          inclContentToks = Util.stripEOFTkfromTokens(inclContentToks);
+      }
+
+      // shift tsr
+      Util.shiftTokenTSR(inclContentToks, dp.tsr[0] + dp.tagWidths[0]);
+
+      // Reset input
+      input = dp.origInput;
+
+      // Clear temporary state
+      dp.skipChar = undefined;
+      dp.skipLen = undefined;
+      dp.origInput = undefined;
+
+      return [inclTag].concat(inclContentToks);
+  }
 
 eof = & { return isEOF(pos); } { return true; }
 
-
-newline
-  = '\n' / '\r\n'
+newline = '\n' / '\r\n'
 
 newlineToken = newline { return [new NlTk([pos0, pos])] }
 
 eolf = newline / eof
-
 
 // 'Preprocessor' directive- higher-level things that can occur in otherwise
 // plain-text content.
@@ -2381,7 +2444,6 @@
 
 // SSS FIXME: what about |{{!}} and {{!}}|
 pipe_pipe = "||" / "{{!}}{{!}}"
-
 
 // Similar, for tables..
 exclam = "!" / "{{;}}"

-- 
To view, visit https://gerrit.wikimedia.org/r/54506
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id92b7bbfef730a6f32a2df52240e17dbd0cc0ae9
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] Tokenize *include* tags hierarchically just like extension t... - change (mediawiki...Parsoid)

Reply via email to

[MediaWiki-commits] [Gerrit] Tokenize include tags hierarchically just like extension t... - change (mediawiki...Parsoid)