Subramanya Sastry has uploaded a new change for review.
https://gerrit.wikimedia.org/r/81014
Change subject: (Bug 52762) Lines with 1+ comments and WS are transparent to
handlers
......................................................................
(Bug 52762) Lines with 1+ comments and WS are transparent to handlers
* This was being handled explicitly in pre- and paragraph- handlers
and the list handler wasn't yet handling this special case.
The PHP preprocessor strips out such lines from wikitext which
effectively means that Parsoid needs to pass through those lines
without changing state of any handler in the pipeline.
* This commit uses the TokenStreamPatcher to collect these empty
lines and wrap them in a meta-token which all handlers ignore and
which is re-expanded in the HTML tree builder.
* Removed custom code from the paragraph handler.
* Tweaked the list-handler to close list elements at end of line.
* Added an additional rule to the output normalization code to
strip such lines from output.
* 4 wt2html tests are now green. Selser tests got changed around
because of tweaks to some parser tests and because of minor
changes in HTML output.
Change-Id: I6aada39ff966c328688b085c4b1ba4c46faf713c
---
M js/lib/ext.core.ListHandler.js
M js/lib/ext.core.ParagraphWrapper.js
M js/lib/ext.core.Sanitizer.js
M js/lib/ext.core.TokenStreamPatcher.js
M js/lib/mediawiki.HTML5TreeBuilder.node.js
M js/lib/mediawiki.Util.js
M js/tests/parserTests-blacklist.js
M js/tests/parserTests.txt
8 files changed, 115 insertions(+), 77 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/14/81014/1
diff --git a/js/lib/ext.core.ListHandler.js b/js/lib/ext.core.ListHandler.js
index c6c3a27..bfc5787 100644
--- a/js/lib/ext.core.ListHandler.js
+++ b/js/lib/ext.core.ListHandler.js
@@ -269,16 +269,18 @@
// same list item types and same nesting level
itemToken = this.currListFrame.endtags.pop();
this.currListFrame.endtags.push(new EndTagTk( itemToken.name ));
- res = [
- itemToken,
- // this list item gets all the bullets since this is
- // a list item at the same level
- //
- // **a
- // **b
- this.currListFrame.nlTk || '',
- new TagTk( itemToken.name, [], makeDP( 0, bn.length ) )
- ];
+ res = [ itemToken ].concat(
+ this.currListFrame.solTokens,
+ [
+ // this list item gets all the bullets since
this is
+ // a list item at the same level
+ //
+ // **a
+ // **b
+ this.currListFrame.nlTk || '',
+ new TagTk( itemToken.name, [], makeDP( 0,
bn.length ) )
+ ]
+ );
} else {
var prefixCorrection = 0;
var tokens = [];
@@ -298,6 +300,7 @@
* ------------------------------------------------ */
tokens = this.popTags(bs.length - prefixLen - 1);
+ tokens = this.currListFrame.solTokens.concat(tokens);
var newName = this.bulletCharsMap[bn[prefixLen]].item;
var endTag = this.currListFrame.endtags.pop();
this.currListFrame.endtags.push(new EndTagTk( newName
));
@@ -330,6 +333,7 @@
console.warn(" -> reduced nesting");
}
tokens = tokens.concat( this.popTags(bs.length -
prefixLen) );
+ tokens = this.currListFrame.solTokens.concat(tokens);
if (this.currListFrame.nlTk) {
tokens.push(this.currListFrame.nlTk);
}
@@ -398,7 +402,6 @@
}
// clear out sol-tokens
- res = this.currListFrame.solTokens.concat(res);
res.rank = this.anyRank + 0.01;
this.currListFrame.solTokens = [];
this.currListFrame.nlTk = null;
diff --git a/js/lib/ext.core.ParagraphWrapper.js
b/js/lib/ext.core.ParagraphWrapper.js
index 2bd9440..5e920c2 100644
--- a/js/lib/ext.core.ParagraphWrapper.js
+++ b/js/lib/ext.core.ParagraphWrapper.js
@@ -102,7 +102,6 @@
this.currLine = {
tokens: [],
isNewline: atEOL,
- hasComments: false,
hasBlockToken: false,
hasWrappableTokens: false
};
@@ -122,13 +121,6 @@
this.hasOpenPTag = true;
}
- // PHP parser ignores (= strips out during preprocessing) lines with
- // a comment and other white-space. This flag checks if we are on such
a line.
- var emptyLineWithComments =
- l.isNewline &&
- !l.hasWrappableTokens &&
- l.hasComments;
-
// this.nonNlTokens += this.currLine.tokens
this.nonNlTokens = this.nonNlTokens.concat(l.tokens);
@@ -143,16 +135,6 @@
this.hasOpenHTMLPTag = false;
this.reset();
return { tokens: res };
- } else if (emptyLineWithComments) {
- // 1. Dont increment newline count on "empty" lines with
- // one or more comments -- see comment above
- //
- // 2. Convert the NlTk to a String-representation so that
- // it doesn't get processed by discardOneNlTk -- this
- // newline needs to be emitted (so it gets RTed) without
- // being processed for p-wrapping.
- this.nlWsTokens.push("\n");
- return {};
} else {
this.newLineCount++;
this.nlWsTokens.push(token);
@@ -302,11 +284,10 @@
}
} else if (tc === EOFTk || this.inPre) {
return { tokens: [token] };
- } else if ((tc === String && token.match( /^[\t ]*$/)) || tc ===
CommentTk) {
- if (tc === CommentTk) {
- this.currLine.hasComments = true;
- }
-
+ } else if (tc === CommentTk ||
+ tc === String && token.match(/^[\t ]*$/) ||
+ Util.isEmptyLineMetaToken(token))
+ {
if (this.newLineCount === 0) {
this.currLine.tokens.push(token);
// Since we have no pending newlines to trip us up,
diff --git a/js/lib/ext.core.Sanitizer.js b/js/lib/ext.core.Sanitizer.js
index 8ce9547..64425e7 100644
--- a/js/lib/ext.core.Sanitizer.js
+++ b/js/lib/ext.core.Sanitizer.js
@@ -685,6 +685,11 @@
* attribute in the DOM).
*/
Sanitizer.prototype.onAny = function ( token ) {
+ // Pass through a transparent line meta-token
+ if (Util.isEmptyLineMetaToken(token)) {
+ return { token: token };
+ }
+
// XXX: validate token type according to whitelist and convert non-ok
ones
// back to text.
diff --git a/js/lib/ext.core.TokenStreamPatcher.js
b/js/lib/ext.core.TokenStreamPatcher.js
index 95ed483..cab3fe3 100644
--- a/js/lib/ext.core.TokenStreamPatcher.js
+++ b/js/lib/ext.core.TokenStreamPatcher.js
@@ -16,6 +16,8 @@
// define some constructor shortcuts
var CommentTk = defines.CommentTk,
+ KV = defines.KV,
+ NlTk = defines.NlTk,
TagTk = defines.TagTk,
SelfclosingTagTk = defines.SelfclosingTagTk,
EndTagTk = defines.EndTagTk;
@@ -31,33 +33,45 @@
manager.addTransform( this.onAny.bind(this),
"TokenStreamPatcher:onAny", this.anyRank, 'any' );
+ this.buf = [];
this.reset();
}
-TokenStreamPatcher.prototype.nlRank = 2.001;
-TokenStreamPatcher.prototype.anyRank = 2.002;
+TokenStreamPatcher.prototype.anyRank = 2.001;
+TokenStreamPatcher.prototype.nlRank = 2.002;
TokenStreamPatcher.prototype.endRank = 2.003;
-TokenStreamPatcher.prototype.updateBuf = function() {
- if (this.buf.length === 0) {
- this.buf.sol = this.sol;
- this.buf.srcOffset = this.srcOffset;
+TokenStreamPatcher.prototype.processBuf = function(nlTk) {
+ var ret;
+
+ this.buf.push(nlTk);
+ if (this.buf.length === 1 || !this.inCollectionMode ||
!this.buf.hasComment) {
+ ret = this.buf;
+ } else {
+ ret = [
+ new SelfclosingTagTk("meta", [
+ new KV("typeof", "mw:Line"),
+ new KV("tokens", this.buf)
+ ])
+ ];
}
+ // console.warn("RET-1: " + JSON.stringify(this.buf));
+ this.buf = [];
+ this.inCollectionMode = true;
+ return ret;
};
TokenStreamPatcher.prototype.reset = function() {
this.inNowiki = false;
- this.sol = true;
this.srcOffset = 0;
- this.buf = [];
- this.updateBuf();
+ this.sol = true;
+ this.processBuf();
};
TokenStreamPatcher.prototype.onNewline = function(token) {
- this.sol = true;
this.srcOffset = (token.dataAttribs.tsr || [null,null])[1];
- this.updateBuf();
- return {tokens: [token]};
+ this.sol = true;
+ return {tokens: this.processBuf(token)};
};
TokenStreamPatcher.prototype.onEnd = function(token) {
@@ -72,9 +86,28 @@
};
TokenStreamPatcher.prototype.onAny = function(token) {
- // console.warn("T: " + JSON.stringify(token));
- var tokens = [token];
- switch (token.constructor) {
+ // console.warn("T: " + JSON.stringify(token) + ": cm: " +
this.inCollectionMode);
+
+ var tokens = [], tc = token.constructor;
+ if (this.inCollectionMode) {
+ if ((tc === String && token.match(/^\s*$/)) || tc ===
CommentTk) {
+ this.buf.push(token);
+ if (tc === CommentTk) {
+ this.buf.hasComment = true;
+ }
+ return { tokens: [] };
+ } else if (tc === NlTk) {
+ // console.warn("RET-2: " + JSON.stringify(token));
+ return { tokens: [token] };
+ } else {
+ tokens = this.buf;
+ this.buf = [];
+ this.inCollectionMode = false;
+ }
+ }
+
+ tokens.push(token);
+ switch (tc) {
case String:
// TRICK #1:
// Attempt to match "{|" after a newline and convert
@@ -124,8 +157,12 @@
}
this.clearSOL();
break;
+
+ default:
+ break;
}
+ // console.warn("RET-3: " + JSON.stringify(tokens));
return {tokens: tokens};
};
diff --git a/js/lib/mediawiki.HTML5TreeBuilder.node.js
b/js/lib/mediawiki.HTML5TreeBuilder.node.js
index 185c31a..2c759a6 100644
--- a/js/lib/mediawiki.HTML5TreeBuilder.node.js
+++ b/js/lib/mediawiki.HTML5TreeBuilder.node.js
@@ -182,6 +182,13 @@
break;
case SelfclosingTagTk:
tName = token.name;
+
+ // SSS FIXME: Re-expand the line into constituent tokens
+ if (tName === "meta" && attribs[0].v === "mw:Line") {
+ this.onChunk(attribs[1].v);
+ break;
+ }
+
this.emit('token', {type: 'StartTag', name: tName,
data: this._att(attribs)});
if ( HTML5.VOID_ELEMENTS.indexOf( tName ) < 0 ) {
// VOID_ELEMENTS are automagically treated as
self-closing by
diff --git a/js/lib/mediawiki.Util.js b/js/lib/mediawiki.Util.js
index 932187c..5c99a89 100644
--- a/js/lib/mediawiki.Util.js
+++ b/js/lib/mediawiki.Util.js
@@ -195,8 +195,14 @@
if (token.name === 'meta' &&
/\bmw:Extension\//.test(token.getAttribute('typeof'))) {
return false;
} else {
- return true;
+ return token.dataAttribs.stx !== 'html';
}
+ },
+
+ isEmptyLineMetaToken: function(token) {
+ return token.constructor === pd.SelfclosingTagTk &&
+ token.name === "meta" &&
+ token.getAttribute("typeof") === "mw:Line";
},
/*
@@ -888,6 +894,8 @@
if
(!/[^<]*(<\w+(\s+[^\0-\cZ\s"'>\/=]+(="[^"]*")?)*\/?>[^<]*)*/.test(out)) {
throw new Error("normalizeOut input is not in standard
serialized form");
}
+ // Strip comment-and-ws-only lines that PHP parser strips out
+ out = out.replace(/\n[ \t]*<!--([^-]|-(?!->))*-->([
\t]|<!--([^-]|-(?!->))*-->)*\n/g, '\n');
out = normalizeNewlines( out );
if ( !parsoidOnly ) {
// ignore troublesome attributes
diff --git a/js/tests/parserTests-blacklist.js
b/js/tests/parserTests-blacklist.js
index 8ec4525..60d214e 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -35,7 +35,6 @@
// Blacklist for wt2html
add("wt2html", "Paragraphs with newline spacing with non-empty white-space
lines in between");
add("wt2html", "Paragraphs with newline spacing with non-empty mixed comment
and white-space lines in between");
-add("wt2html", "Extra newlines: More paragraphs with indented comment");
add("wt2html", "Extra newlines between heading and content are swallowed");
add("wt2html", "Non-word characters don't terminate tag names (bug 17663,
40670, 52022)");
add("wt2html", "Bare pipe character (bug 52363)");
@@ -52,7 +51,6 @@
add("wt2html", "2a. Indent-Pre and tables");
add("wt2html", "2b. Indent-Pre and tables");
add("wt2html", "2c. Indent-Pre and tables (bug 42252)");
-add("wt2html", "4. Multiple spaces at start-of-line");
add("wt2html", "6. Pre-blocks should extend across lines with leading WS even
when there is no wrappable content");
add("wt2html", "Definition Lists: Nesting: Multi-level (Parsoid only)");
add("wt2html", "Definition Lists: Nesting: Test 2 (Parsoid only)");
@@ -78,8 +76,6 @@
add("wt2html", "Handling html with a div self-closing tag");
add("wt2html", "2. Lists with start-of-line-transparent tokens before bullets:
Template close");
add("wt2html", "List interrupted by empty line or heading");
-add("wt2html", "Single-comment whitespace lines dont break lists, and neither
do multi-comment whitespace lines");
-add("wt2html", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327)");
add("wt2html", "Test the li-hack\n(Cannot test this with PHP parser since it
relies on Tidy for the hack)");
add("wt2html", "Unclosed formatting tags that straddle lists are closed and
reopened\n(Parsoid-only since php parser generates broken html -- relies on
Tidy to fix up)");
add("wt2html", "Magic Word: {{CURRENTMONTH1}}");
@@ -2460,9 +2456,12 @@
add("selser", "<nowiki> and <pre> preference (first one wins)
[2,4,[2,0],4,[[3],0],0]");
add("selser", "2a. Indent-Pre and tables [4,[0,[[3,0,3,3,[4],2],0]]]");
add("selser", "2c. Indent-Pre and tables (bug 42252) [[0,0,3,2]]");
-add("selser", "4. Multiple spaces at start-of-line [2,0,0,2]");
-add("selser", "4. Multiple spaces at start-of-line [2,2,0,2]");
-add("selser", "4. Multiple spaces at start-of-line [0,3,0,2]");
+add("selser", "4. Multiple spaces at start-of-line [2,0,0,2,0,2]");
+add("selser", "4. Multiple spaces at start-of-line [4,1,0,0,0,[3,[0,4]]]");
+add("selser", "4. Multiple spaces at start-of-line [0,1,2,3,4,2]");
+add("selser", "4. Multiple spaces at start-of-line [4,[3],2,[3],2,[3,[1,3]]]");
+add("selser", "4. Multiple spaces at start-of-line [0,4,0,4,0,2]");
+add("selser", "4. Multiple spaces at start-of-line [2,[2],0,0,0,2]");
add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd
line is significant [0,2]");
add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd
line is significant [0,[4,0,3]]");
add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd
line is significant [4,1]");
@@ -3038,11 +3037,13 @@
add("selser", "Nested lists 7 (skip initial nesting levels) [[[[[1]]]]]");
add("selser", "List interrupted by empty line or heading [[1],0,[1],2,3,0,0]");
add("selser", "List interrupted by empty line or heading
[[[4]],4,[[1]],0,[3],2,2]");
-add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [0,4,4,4,[[4]],4,3,3,0,4,3,4,2,4,2,0,[[4]]]");
-add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [[4],2,0,0,1,2,0,4,0,0,2,2,0,3,0,3,2]");
-add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [[4],3,0,0,2,2,3,3,[[4]],0,2,0,0,3,0,2,0]");
-add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [2,3,2,2,[2],2,4,0,0,3,4,4,4,0,2,0,4]");
-add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [[3],3,3,4,1,2,3,2,[[3]],0,0,4,0,3,2,2,1]");
+add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [[1,3,2,4,[3],0,4,0,0,2,0,0,4,4,0,0,[2]]]");
+add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [2]");
+add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [[2,0,2,4,2,0,2,0,0,3,0,3,3,4,2,0,1]]");
+add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [[2,3,0,0,[4],4,4,0,[4],3,3,3,0,2,3,0,0]]");
+add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [1]");
+add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [[4,3,0,0,2,2,3,3,0,4,0,2,0,0,3,0,1]]");
+add("selser", "Replacing whitespace with tabs still doesn't break the list
(gerrit 78327) [[[4],4,0,0,3,0,2,2,0,0,0,4,0,0,4,2,0]]");
add("selser", "Test the li-hack\n(Cannot test this with PHP parser since it
relies on Tidy for the hack) [1,2,[3,2,[3],0]]");
add("selser", "Test the li-hack\n(Cannot test this with PHP parser since it
relies on Tidy for the hack) [[1,0,0,0,0,0,[0,2,2],2],2,[2,0,[4],4]]");
add("selser", "Test the li-hack\n(Cannot test this with PHP parser since it
relies on Tidy for the hack) [[4,2,[3],0,0,0,[4,2,3],3],0,2]");
@@ -3350,21 +3351,18 @@
add("selser", "Sanitizer: Validating the contents of the id attribute (bug
4515) [1]");
add("selser", "Sanitizer: Validating the contents of the id attribute (bug
4515) [2]");
add("selser", "Sanitizer: Validating the contents of the id attribute (bug
4515) [[2]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,1,4,[0,0,0,4,0,0],0]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[2,0,2,[0,0,0,0,0,2],0]]");
+add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[2,[0,0,2,0,0,0,0],0]]");
add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [2]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,1,4,[0,0,0,0,0,2],2]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[4,1,0,[0,2,0,0,0,0],2]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[4,[3],0,0,3]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,0,4,[0,0,0,2,0,3],4]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[4,1,0,[0,0,0,0,0,4],3]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,3,4,[0,2,0,0,0,0],4]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,[2],0,0,0]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,0,0,2,3]]");
+add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,[2,0,2,0,0,0,3],3]]");
+add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[4,[3,0,0,0,0,0,0],0]]");
+add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,[0,0,4,0,0,0,0],2]]");
+add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,[4,0,3,0,0,0,0],0]]");
+add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,[2,0,0,0,0,0,0],0]]");
+add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,[0,0,0,0,3,0,3],0]]");
add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [1]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[2,2,0,2,0]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[2,4,0,2,3]]");
-add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[3,0,4,2,0]]");
+add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[2,2,0]]");
+add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[3,[0,0,4,0,3,0,0],4]]");
+add("selser", "Sanitizer: Validating that <meta> and <link> work, but only for
Microdata [[0,2,0]]");
add("selser", "Fuzz testing: Parser13 [2]");
add("selser", "Fuzz testing: Parser13 [1]");
add("selser", "Fuzz testing: Parser13 [[2,2]]");
diff --git a/js/tests/parserTests.txt b/js/tests/parserTests.txt
index 57b7894..70c9f33 100644
--- a/js/tests/parserTests.txt
+++ b/js/tests/parserTests.txt
@@ -7375,8 +7375,7 @@
{{echo|Bar}}
!!result
<!-- foo -->
-<p typeof="mw:Transclusion">Bar
-</p>
+<p><span typeof="mw:Transclusion">Bar</span></p>
!!end
!!test
--
To view, visit https://gerrit.wikimedia.org/r/81014
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I6aada39ff966c328688b085c4b1ba4c46faf713c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits