Cscott has uploaded a new change for review.
https://gerrit.wikimedia.org/r/202058
Change subject: T95039: encode/decode arbitrary data in comments.
..
T95039: encode/decode arbitrary data in comments.
Use HTML5 entity encoding to ensure that arbitrary data can be represented
in comments without triggering premature end of comment.
Do a little bit entity encoding on the wikitext side as well to allow the
user to author the string "-->" inside a comment. They probably won't want
to (!) but we want to avoid exposing arbitrary content restrictions to
DOM consumers (like Visual Editor).
If you were wondering, "-->" is represented as "--&gt;" in wikitext.
Change-Id: Id1cb60d201abfc14633a353830ea9fbc89f633a8
---
M lib/dom.computeDSR.js
M lib/mediawiki.tokenizer.utils.js
M lib/pegTokenizer.pegjs.txt
M lib/wts.utils.js
M tests/parserTests-blacklist.js
M tests/parserTests.txt
6 files changed, 113 insertions(+), 66 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid
refs/changes/58/202058/1
diff --git a/lib/dom.computeDSR.js b/lib/dom.computeDSR.js
index 89110a1..763fea3 100644
--- a/lib/dom.computeDSR.js
+++ b/lib/dom.computeDSR.js
@@ -2,6 +2,7 @@
var Consts = require('./mediawiki.wikitext.constants.js').WikitextConstants,
DU = require('./mediawiki.DOMUtils.js').DOMUtils,
+ TU = require('./mediawiki.tokenizer.utils.js'),
Util = require('./mediawiki.Util.js').Util,
dumpDOM = require('./dom.dumper.js').dumpDOM;
@@ -249,7 +250,10 @@
}
} else if (cType === node.COMMENT_NODE) {
if (ce !== null) {
- cs = ce - child.data.length - 7; // 7 chars for "<!--" and "-->"
+ // decode html entities & reencode as wikitext to get the
+ // length of the data. Then subtract 7 chars for the
+ // "<!--" and "-->"
+ cs = ce - TU.decodeComment(child.data).length - 7;
}
} else if (cType === node.ELEMENT_NODE) {
var cTypeOf = child.getAttribute("typeof"),
diff --git a/lib/mediawiki.tokenizer.utils.js b/lib/mediawiki.tokenizer.utils.js
index 0f2aec3..4c8b6a7 100644
--- a/lib/mediawiki.tokenizer.utils.js
+++ b/lib/mediawiki.tokenizer.utils.js
@@ -12,10 +12,7 @@
EndTagTk = defines.EndTagTk,
CommentTk = defines.CommentTk;
-
-var replaceMatchWithSpaces = function( match ) {
- return " ".repeat( match.length );
-};
+var Util = require('./mediawiki.Util.js').Util;
var tu = {
@@ -298,35 +295,80 @@
}
},
- // Comment normalizing
+ // Comment encoding/decoding.
//
- // * Some relevant phab tickets: T94055, T70146, T60184.
+ // * Some relevant phab tickets: T94055, T70146, T60184, T95039
+ //
+ // The wikitext comment production is very simple: "-->" ends a comment.
+ // This means we can have almost anything as the contents of a comment
+ // (except the string "-->", but see below), including several things
+ // that are not valid in HTML5 comments:
//
// * For one, the html5 comment parsing algorithm [0] leniently accepts
//   --!> as a closing comment tag, which differs from the php+tidy combo.
- //   We suppress that behaviour at the cost of introducing a dirty diff,
- //   but selser should take care of it.
//
- // * If the comment's data matches /^-?>/, html5 will end the comment,
- //   introducing roundtrip diffs. For example, <!-->stuff<--> breaks up as
+ // * If the comment's data matches /^-?>/, html5 will end the comment.
+ //   For example, <!-->stuff<--> breaks up as
//   <!--> (the comment) followed by stuff<--> (as text).
//
// * Finally, comment data shouldn't contain two consecutive hyphen-minus
//   characters (--), nor end in a hyphen-minus character (/-$/) as defined
- //   in the spec [1]. Normalize to this syntax in order to emit valid XML.
+ //   in the spec [1].
+ //
+ // We work around all these problems by using HTML entity encoding inside
+ // the comment body. The characters -, >, and & must be encoded in order
+ // to prevent premature termination of the comment by one of the cases
+ // above. Encoding other characters is optional; all entities will be
+ // decoded during wikitext serialization.
+ //
+ // In order to allow *arbitrary* content inside a wikitext comment,
+ // including the forbidden string "-->" we also do some minimal entity
+ // decoding on the wikitext. We are also limited by our inability
+ // to encode DSR attributes on the comment node, so our wikitext entity
+ // decoding must be 1-to-1: that is, there must be a unique "decoded"
+ // string for every wikitext seq