jenkins-bot has submitted this change and it was merged. Change subject: Bunch of <pre> related fixes ......................................................................
Bunch of <pre> related fixes 1. Updated <pre> sep constraints in serializer Fixes bug 57276 2. Since selser now expects trailing newlines to not be part of DSR (see 8fa522eb), updated trailing newline stripping handler to strip trailing newlines from indent-pre nodes as well. It is safe to do so since empty newlines cannot be represented as indent-pre in wikitext anyway. 3. Update <pre> serializer to not add leading space on comment-and-ws-only "empty lines". Those lines get through the parser without being transformed by any handlers. So, PreHandler will not add a leading space on those lines. So, serializer should not add a space on those lines either. 4. Double-newline normalization was being applied to <pre> content as well which caused newlines in indent-pre to get stripped (bug 50906). This patch fixes that. The "<pre>a\n\nb</pre>" was an edge case that accidentally worked. Updated existing whitespace tests and added new tests to document correct behavior and prevent regressions. Parser test results ------------------- * Added new parser test for bug 57276 (which now passes html2wt) - html2html fails the more complex case because of differences in newline handling -- not important. * One additional html2html test now passes as well. * Because of fixes 2 and 3, six additional selser tests now pass. Unrelated cleanup ----------------- * Cleanup in HTML5TreeBuilder (from https://gerrit.wikimedia.org/r/#/c/98852/) * Cleanup in WikitextSerializer (from https://gerrit.wikimedia.org/r/#/c/98982/) BUG: 50906 BUG: 57276 Change-Id: I24ff084301d7f188e0d64a7034659193fa20ae34 --- M js/lib/dom.migrateTrailingNLs.js M js/lib/mediawiki.DOMUtils.js M js/lib/mediawiki.HTML5TreeBuilder.node.js M js/lib/mediawiki.WikitextSerializer.js M js/tests/parserTests-blacklist.js M js/tests/parserTests.txt 6 files changed, 155 insertions(+), 75 deletions(-) Approvals: GWicke: Looks good to me, approved jenkins-bot: Verified diff --git a/js/lib/dom.migrateTrailingNLs.js b/js/lib/dom.migrateTrailingNLs.js index 19893a2..08d0067 100644 --- a/js/lib/dom.migrateTrailingNLs.js +++ b/js/lib/dom.migrateTrailingNLs.js @@ -11,7 +11,7 @@ // SSS FIXME: Given condition 2, we may not need to check th/td anymore // (if we can rely on auto inserted start/end tags being present always). var nodesToMigrateFrom = JSUtils.arrayToSet([ - "TH", "TD", "TR", "LI", "DD", "OL", "UL", "DL", "CAPTION", "P" + "PRE", "TH", "TD", "TR", "LI", "DD", "OL", "UL", "DL", "CAPTION", "P" ]); function nodeEndsLineInWT(node) { @@ -48,10 +48,6 @@ } return c === null; - } - - if (DU.hasNodeName(elt, "pre")) { - return; } // 1. Process DOM rooted at 'elt' first diff --git a/js/lib/mediawiki.DOMUtils.js b/js/lib/mediawiki.DOMUtils.js index 3ae5481..bcf6f1c 100644 --- a/js/lib/mediawiki.DOMUtils.js +++ b/js/lib/mediawiki.DOMUtils.js @@ -646,6 +646,20 @@ }, /** + * Check if node is a text-node and has a leading newline. + */ + nodeHasLeadingNL: function(node) { + return node && this.isText(node) && node.nodeValue.match(/^\n/); + }, + + /** + * Check if node is a text-node and has a trailing newline. + */ + nodeHasTrailingNL: function(node) { + return node && this.isText(node) && node.nodeValue.match(/\n$/); + }, + + /** * Find how much offset is necessary for the DSR of an * indent-originated pre tag. * diff --git a/js/lib/mediawiki.HTML5TreeBuilder.node.js b/js/lib/mediawiki.HTML5TreeBuilder.node.js index 7338c99..8e78687 100644 --- a/js/lib/mediawiki.HTML5TreeBuilder.node.js +++ b/js/lib/mediawiki.HTML5TreeBuilder.node.js @@ -299,10 +299,10 @@ case EndTagTk: tName = token.name; this.emit('token', {type: 'EndTag', name: tName}); - attrs = this._att( attribs ); - attrs.push({ name: "typeof", value: "mw:EndTag" }); - attrs.push({ name: "data-etag", value: tName }); if (dataAttribs && !dataAttribs.autoInsertedEnd) { + attrs = this._att( attribs ); + attrs.push({ name: "typeof", value: "mw:EndTag" }); + attrs.push({ name: "data-etag", value: tName }); if ( this.trace ) { console.warn('inserting shadow meta for ' + tName); } this.emit('token', {type: 'Comment', data: JSON.stringify({ "@type": "mw:shadow", diff --git a/js/lib/mediawiki.WikitextSerializer.js b/js/lib/mediawiki.WikitextSerializer.js index 8e3c33c..720a8af 100644 --- a/js/lib/mediawiki.WikitextSerializer.js +++ b/js/lib/mediawiki.WikitextSerializer.js @@ -2707,28 +2707,51 @@ state.inIndentPre = true; var content = state.serializeChildrenToString(node); - // Insert indentation - content = ' ' + - content.replace(/(\n(<!--(?:[^\-]|\-(?!\->))*\-\->)*)(?!$)/g, '$1 ' ); + // Strip (only the) trailing newline + var trailingNL = content.match(/\n$/); + content = content.replace(/\n$/, ''); - // Strip trailing separators - //var trailingSep = content.match(/\s*$/); - //content = content.replace(/\s*$/, ''); + // Insert indentation + content = ' ' + content.replace(/(\n(<!--(?:[^\-]|\-(?!\->))*\-\->)*)/g, '$1 '); + + // But skip "empty lines" (lines with 1+ comment and optional whitespace) + // since empty-lines sail through all handlers without being affected. + // See empty_line_with_comments production in pegTokenizer.pegjs.txt + // + // We could use 'split' to split content into lines and selectively add + // indentation, but the code will get unnecessarily complex for questionable + // benefits. So, going this route for now. + content = content.replace(/(^|\n) ((?:[ \t]*<!--(?:[^\-]|\-(?!\->))*\-\->[ \t]*)+)(?=\n|$)/, '$1$2'); cb(content, node); // Preserve separator source - //state.sep.src = trailingSep && trailingSep[0] || ''; - state.sep.src = ''; + state.sep.src = trailingNL && trailingNL[0] || ''; } state.inIndentPre = false; }, sepnls: { before: function(node, otherNode) { - return node.data.parsoid.stx === 'html' ? {} : {min:1}; + if (node.data.parsoid.stx === 'html') { + return {}; + } else if (otherNode.nodeName === 'PRE' && + otherNode.data.parsoid.stx !== 'html') + { + return {min:2}; + } else { + return {min:1}; + } }, after: function(node, otherNode) { - return node.data.parsoid.stx === 'html' ? {} : {min:1}; + if (node.data.parsoid.stx === 'html') { + return {}; + } else if (otherNode.nodeName === 'PRE' && + otherNode.data.parsoid.stx !== 'html') + { + return {min:2}; + } else { + return {min:1}; + } } } }, @@ -2843,11 +2866,11 @@ var child = node.firstChild; while(child) { if (DU.isElt(child)) { - if (child.nodeName === 'SPAN' && - child.getAttribute('typeof') === 'mw:Entity') - { - state.serializer._serializeNode(child, state, cb); - } else if (DU.isMarkerMeta(child, "mw:DiffMarker")) { + /* jshint noempty: false */ + if (DU.isMarkerMeta(child, "mw:DiffMarker")) { + // nothing to do + } else if (child.nodeName === 'SPAN' && + child.getAttribute('typeof') === 'mw:Entity') { state.serializer._serializeNode(child, state, cb); } else { cb(child.outerHTML, node); @@ -3557,30 +3580,32 @@ var newSepMatch = res.match(/\n\s*$/); res = res.replace(/\n\s*$/, ''); - // Don't strip two newlines for wikitext like this: - // <div>foo - // - // bar</div> - // The PHP parser won't create paragraphs on lines that also contain - // block-level tags. - if (node.parentNode.childNodes.length !== 1 || - !DU.isBlockNode(node.parentNode) || - //node.parentNode.data.parsoid.stx !== 'html' || - doubleNewlineCount !== 1) - { - // Strip more than one consecutive newline - res = res.replace(/\n([ \t]*\n)+/g, '\n'); - } - // Strip trailing newlines from text content - //if (node.nextSibling && node.nextSibling.nodeType === node.ELEMENT_NODE) { - // res = res.replace(/\n$/, ' '); - //} else { - // res = res.replace(/\n$/, ''); - //} + if (!state.inIndentPre) { + // Don't strip two newlines for wikitext like this: + // <div>foo + // + // bar</div> + // The PHP parser won't create paragraphs on lines that also contain + // block-level tags. + if (node.parentNode.childNodes.length !== 1 || + !DU.isBlockNode(node.parentNode) || + //node.parentNode.data.parsoid.stx !== 'html' || + doubleNewlineCount !== 1) + { + // Strip more than one consecutive newline + res = res.replace(/\n([ \t]*\n)+/g, '\n'); + } + // Strip trailing newlines from text content + //if (node.nextSibling && node.nextSibling.nodeType === node.ELEMENT_NODE) { + // res = res.replace(/\n$/, ' '); + //} else { + // res = res.replace(/\n$/, ''); + //} - // Strip leading newlines. They are already added to the separator source - // in handleSeparatorText. - res = res.replace(/^\n/, ''); + // Strip leading newlines. They are already added to the separator source + // in handleSeparatorText. + res = res.replace(/^\n/, ''); + } // Always escape entities res = Util.escapeEntities(res); @@ -3668,7 +3693,7 @@ * XXX: Support separator-transparent elements! */ WSP.handleSeparatorText = function ( node, state ) { - if (DU.isText(node)) { + if (!state.inIndentPre && DU.isText(node)) { if (node.nodeValue.match(/^\s*$/)) { state.sep.src = (state.sep.src || '') + node.nodeValue; //if (!state.sep.lastSourceNode) { @@ -4087,7 +4112,11 @@ dsrA = prevNode.parentNode.data.parsoid.dsr; } else if (prevNode.previousSibling && prevNode.previousSibling.nodeType === prevNode.ELEMENT_NODE && + // FIXME: Not sure why we need this check because data-parsoid + // is loaded on all nodes. mw:Diffmarker maybe? But, if so, why? + // Should be fixed. prevNode.previousSibling.data && + prevNode.previousSibling.data.parsoid && prevNode.previousSibling.data.parsoid.dsr && // Don't extrapolate if the string was potentially changed // or we didn't diff (selser disabled) @@ -4386,9 +4415,7 @@ state.prevNodeUnmodified = state.currNodeUnmodified; state.currNodeUnmodified = false; - if (state.selserMode) { - this.trace("TEXT: ", node.nodeValue); - } + this.trace("TEXT: ", node.nodeValue); if (!this.handleSeparatorText(node, state)) { // Text is not just whitespace @@ -4413,9 +4440,7 @@ state.prevNodeUnmodified = state.currNodeUnmodified; state.currNodeUnmodified = false; - if (state.selserMode) { - this.trace("COMMENT: ", node.nodeValue); - } + this.trace("COMMENT: ", node.nodeValue); // delay the newline creation until after the comment if (!this.handleSeparatorText(node, state)) { diff --git a/js/tests/parserTests-blacklist.js b/js/tests/parserTests-blacklist.js index 5dcd5c2..6469302 100644 --- a/js/tests/parserTests-blacklist.js +++ b/js/tests/parserTests-blacklist.js @@ -541,8 +541,6 @@ add("wt2wt", "Comment semantics: unclosed comment at end"); add("wt2wt", "<nowiki> inside <pre> (bug 13238)"); add("wt2wt", "<nowiki> and <pre> preference (first one wins)"); -add("wt2wt", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant"); -add("wt2wt", "6. Pre-blocks should extend across lines with leading WS even when there is no wrappable content"); add("wt2wt", "HTML-pre: 1. embedded newlines"); add("wt2wt", "Definition lists: self-closed tag"); add("wt2wt", "External links: open square bracket forbidden in URL (named) (bug 4377)"); @@ -711,7 +709,6 @@ add("html2html", "<pre> with forbidden attribute values (bug 3202)"); add("html2html", "<nowiki> inside <pre> (bug 13238)"); add("html2html", "Empty pre; pre inside other HTML tags (bug 54946)"); -add("html2html", "HTML pre followed by indent-pre"); add("html2html", "3a. Indent-Pre and block tags (single-line html)"); add("html2html", "3b. Indent-Pre and block tags (pre-content on separate line)"); add("html2html", "4. Multiple spaces at start-of-line"); @@ -1188,6 +1185,7 @@ add("html2html", "Empty TR followed by a template-generated TR\n(Parsoid-specific since PHP parser doesn't handle this mixed tbl-wikitext)"); add("html2html", "Empty TR followed by mixed-ws-comment line should RT correctly"); add("html2html", "Multi-line image caption generated by templates with/without trailing newlines"); +add("html2html", "Consecutive <pre>s should not get merged"); // Blacklist for html2wt @@ -1286,7 +1284,7 @@ add("html2wt", "3a. Indent-Pre and block tags (single-line html)"); add("html2wt", "3b. Indent-Pre and block tags (pre-content on separate line)"); add("html2wt", "4. Multiple spaces at start-of-line"); -add("html2wt", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant"); +add("html2wt", "5a. White-space in indent-pre"); add("html2wt", "6. Pre-blocks should extend across lines with leading WS even when there is no wrappable content"); add("html2wt", "HTML-pre: 1. embedded newlines"); add("html2wt", "HTML-pre: 2: indented text"); @@ -2376,21 +2374,15 @@ add("selser", "<nowiki> and <pre> preference (first one wins) [0,0,[2,2],0,2,0]", "<pre>\n<nowiki>\n</pre>\nayl0rt9j2rvvlsor</nowiki>ku5wz6d4oixjemi\n</pre>\n\naoia6vijp7722o6r\n\n<nowiki>\n<pre>\n<nowiki>\n</pre>\n</nowiki>\n</pre>\n"); add("selser", "2c. Indent-Pre and tables (bug 42252) [[4,[3],3,4]]", "{|<!--ypj7hfyfeggv6lxr-->\n|+<!--budgqhu8gdfb6gvi-->\n|}"); add("selser", "2c. Indent-Pre and tables (bug 42252) [[4,0,0,4]]", "{|<!--75hp80omksqhncdi-->\n|+ foo\n <!--8xrgwr6xnqj8aor-->|}"); -add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant [[3,0,4]]", " <br/>\n a3l22m2436f4unmi"); -add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant [2]", "zj3bbk6ucui8uxr\n a<br/>\n \n b"); -add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant [1]", " a<br/>\n \n \n b"); -add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant [[3,0,0]]", " <br/>\n \n \n b"); -add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant [[0,0,3]]", " a<br/>\n"); -add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant [[0,0,2]]", " a<br/>\n cta0eum223wbqpvi\n \n b"); -add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant [[0,2,0]]", " ae5r26yt10dx6r<br/>\n \n \n b"); -add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant [[4,0,2]]", " 9cb82yu0k8adcxr<br/>\n y4lz0dsrgxkr19k9\n \n b"); -add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd line is significant [[4,2,3]]", " lmoi3q8misaqyqfrdzigxr658zen4s4i<br/>\n"); -add("selser", "6. Pre-blocks should extend across lines with leading WS even when there is no wrappable content [0,3,[2],2,1]", " a\n \n <!-- continue -->\n b\n\n 0m6r034mkmon7b9c\n\n8yvdh6pxp8tfn7b9\n \nd"); -add("selser", "6. Pre-blocks should extend across lines with leading WS even when there is no wrappable content [2,0,1,0,[3]]", "opfgacofr6rh33di\n a\n \n <!-- continue -->\n b\n\n c\n \n"); -add("selser", "6. Pre-blocks should extend across lines with leading WS even when there is no wrappable content [0,2,0,0,0]", " a\n \n <!-- continue -->\n b\n4iktogo6ehw4gqfr\n\n c\n \nd"); -add("selser", "6. Pre-blocks should extend across lines with leading WS even when there is no wrappable content [2,4,0,0,3]", "iinv6rmhb9e8kt9\n a\n \n <!-- continue -->\n b\ng7k5nb5u0z035wmi\n c\n \n"); -add("selser", "6. Pre-blocks should extend across lines with leading WS even when there is no wrappable content [0,0,0,4,[2]]", " a\n \n <!-- continue -->\n b\n\n c\n\nf0hedm0999j54s4i\n\nbf387lq47n4uz0k9d"); -add("selser", "6. Pre-blocks should extend across lines with leading WS even when there is no wrappable content [3,3,1,2,[4]]", " c\n\nq4fdwd5681t3mcxr \no14qbnr6610jm7vi"); +add("selser", "5a. White-space in indent-pre [[0,0,4]]", " a<br />\n 7tmueg6dyhqia4i"); +add("selser", "5a. White-space in indent-pre [1]", " a<br />\n \n \n b"); +add("selser", "5a. White-space in indent-pre [[3,0,0]]", " <br />\n \n \n b"); +add("selser", "5a. White-space in indent-pre [[3,2,0]]", " ghzkfux2q7pynwmi<br />\n \n \n b"); +add("selser", "5a. White-space in indent-pre [[0,2,2]]", " avnpy3yvtyhh93sor<br />\n tryc19803p4lsor\n \n b"); +add("selser", "5a. White-space in indent-pre [[0,0,2]]", " a<br />\n m7led9fbke8nnrk9\n \n b"); +add("selser", "5a. White-space in indent-pre [[2,0,4]]", " d3ppdqvqzdxi529a<br />\n 82567nb6q9hbmx6r"); +add("selser", "5a. White-space in indent-pre [[2,0,0]]", " 9blj98gpz0py14ia<br />\n \n \n b"); +add("selser", "5a. White-space in indent-pre [[0,2,0]]", " am3m3vd8oxzu84cxr<br />\n \n \n b"); add("selser", "HTML-pre: 1. embedded newlines [0,0,0,0,2,4,[4]]", "<pre>foo</pre>\n\n<pre>\nfoo\n</pre>\n\n0j7m5dwvy5t81tt9<pre>\n\nfoo\n</pre>gznvus2wm2ikke29<pre>3kx0qbxm4do39pb9</pre>"); add("selser", "HTML-pre: 1. embedded newlines [0,2,[4],0,[4],3,2]", "<pre>foo</pre>7ysmnukvidpctyb9\n\n<pre>9g47a95uexywrk9</pre>\n\n<pre>frywx1s1mtervn29</pre>bk0krhbpq3nb3xr<pre>\n\n\nfoo\n</pre>"); add("selser", "HTML-pre: 1. embedded newlines [1,0,[3],2,0,4,0]", "<pre data-foobar=\"kvsknufiw5r0be29\">foo</pre>\n\n<pre></pre>7p6kn1agspz4zpvi\n\n<pre>\n\nfoo\n</pre>2twixsqcqcdz33di<pre>\n\n\nfoo\n</pre>"); diff --git a/js/tests/parserTests.txt b/js/tests/parserTests.txt index 5e690f5..6a96970 100644 --- a/js/tests/parserTests.txt +++ b/js/tests/parserTests.txt @@ -2213,11 +2213,11 @@ !!end +## NOTE: the leading white-space chars on empty line are significant !! test -5. White-space in indent-pre -NOTE: the white-space char on 2nd line is significant +5a. White-space in indent-pre !! input - a<br/> + a<br /> b !! result @@ -2226,6 +2226,27 @@ b </pre> !! end + +## NOTE: the leading white-space chars on empty line are significant +!! test +5b. White-space in indent-pre +!! input + a + + b + + + c +!! result +<pre>a + +b + + +c +</pre> +!! end + !! test 6. Pre-blocks should extend across lines with leading WS even when there is no wrappable content @@ -18915,6 +18936,38 @@ !! end !! test +Consecutive <pre>s should not get merged +!! options +parsoid=html2wt,html2html +!! input + a + + b + + c + + d + + e + + + + f +!! result +<pre>a</pre><pre>b</pre> + +<pre>c +</pre><pre> +d</pre> + +<pre>e + +</pre><pre> + +f</pre> +!! end + +!! test Edited ISBN links not serializable as ISBN links should serialize as wikilinks !! options parsoid=html2wt -- To view, visit https://gerrit.wikimedia.org/r/97165 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I24ff084301d7f188e0d64a7034659193fa20ae34 Gerrit-PatchSet: 9 Gerrit-Project: mediawiki/extensions/Parsoid Gerrit-Branch: master Gerrit-Owner: Subramanya Sastry <[email protected]> Gerrit-Reviewer: GWicke <[email protected]> Gerrit-Reviewer: Subramanya Sastry <[email protected]> Gerrit-Reviewer: jenkins-bot _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
