GWicke has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/101337


Change subject: Bunch of <pre> related fixes
......................................................................

Bunch of <pre> related fixes

1. Updated <pre> sep constraints in serializer
   Fixes bug 57276

2. Since selser now expects trailing newlines to not
   be part of DSR (see 8fa522eb), updated trailing newline
   stripping handler to strip trailing newlines from
   indent-pre nodes as well. It is safe to do so since
   empty newlines cannot be represented as indent-pre
   in wikitext anyway.

3. Update <pre> serializer to not add leading space
   on comment-and-ws-only "empty lines". Those lines
   get through the parser without being transformed
   by any handlers. So, PreHandler will not add a leading
   space on those lines. So, serializer should not add
   a space on those lines either.

4. Double-newline normalization was being applied to
   <pre> content as well which caused newlines in indent-pre
   to get stripped (bug 50906). This patch fixes that.
   The "<pre>a\n\nb</pre>" was an edge case that accidentally
   worked.

   Updated existing whitespace tests and added new tests
   to document correct behavior and prevent regressions.

Parser test results
-------------------

* Added new parser test for bug 57276 (which now passes html2wt)
  - html2html fails the more complex case because of
   differences in newline handling -- not important.

* One additional html2html test now passes as well.

* Because of fixes 2 and 3, six additional selser tests
  now pass.

Unrelated cleanup
-----------------
* Cleanup in HTML5TreeBuilder
  (from https://gerrit.wikimedia.org/r/#/c/98852/)

* Cleanup in WikitextSerializer
  (from https://gerrit.wikimedia.org/r/#/c/98982/)

BUG: 50906
BUG: 57276
Change-Id: I24ff084301d7f188e0d64a7034659193fa20ae34
---
M js/lib/dom.migrateTrailingNLs.js
M js/lib/mediawiki.DOMUtils.js
M js/lib/mediawiki.HTML5TreeBuilder.node.js
M js/lib/mediawiki.WikitextSerializer.js
M js/tests/parserTests-blacklist.js
M js/tests/parserTests.txt
6 files changed, 155 insertions(+), 75 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid 
refs/changes/37/101337/1

diff --git a/js/lib/dom.migrateTrailingNLs.js b/js/lib/dom.migrateTrailingNLs.js
index 19893a2..08d0067 100644
--- a/js/lib/dom.migrateTrailingNLs.js
+++ b/js/lib/dom.migrateTrailingNLs.js
@@ -11,7 +11,7 @@
 // SSS FIXME: Given condition 2, we may not need to check th/td anymore
 // (if we can rely on auto inserted start/end tags being present always).
 var nodesToMigrateFrom = JSUtils.arrayToSet([
-       "TH", "TD", "TR", "LI", "DD", "OL", "UL", "DL", "CAPTION", "P"
+       "PRE", "TH", "TD", "TR", "LI", "DD", "OL", "UL", "DL", "CAPTION", "P"
 ]);
 
 function nodeEndsLineInWT(node) {
@@ -48,10 +48,6 @@
                }
 
                return c === null;
-       }
-
-       if (DU.hasNodeName(elt, "pre")) {
-               return;
        }
 
        // 1. Process DOM rooted at 'elt' first
diff --git a/js/lib/mediawiki.DOMUtils.js b/js/lib/mediawiki.DOMUtils.js
index 3ae5481..bcf6f1c 100644
--- a/js/lib/mediawiki.DOMUtils.js
+++ b/js/lib/mediawiki.DOMUtils.js
@@ -646,6 +646,20 @@
        },
 
        /**
+        * Check if node is a text-node and has a leading newline.
+        */
+       nodeHasLeadingNL: function(node) {
+               return node && this.isText(node) && node.nodeValue.match(/^\n/);
+       },
+
+       /**
+        * Check if node is a text-node and has a trailing newline.
+        */
+       nodeHasTrailingNL: function(node) {
+               return node && this.isText(node) && node.nodeValue.match(/\n$/);
+       },
+
+       /**
         * Find how much offset is necessary for the DSR of an
         * indent-originated pre tag.
         *
diff --git a/js/lib/mediawiki.HTML5TreeBuilder.node.js 
b/js/lib/mediawiki.HTML5TreeBuilder.node.js
index 7338c99..8e78687 100644
--- a/js/lib/mediawiki.HTML5TreeBuilder.node.js
+++ b/js/lib/mediawiki.HTML5TreeBuilder.node.js
@@ -299,10 +299,10 @@
                case EndTagTk:
                        tName = token.name;
                        this.emit('token', {type: 'EndTag', name: tName});
-                       attrs = this._att( attribs );
-                       attrs.push({ name: "typeof", value: "mw:EndTag" });
-                       attrs.push({ name: "data-etag", value: tName });
                        if (dataAttribs && !dataAttribs.autoInsertedEnd) {
+                               attrs = this._att( attribs );
+                               attrs.push({ name: "typeof", value: "mw:EndTag" 
});
+                               attrs.push({ name: "data-etag", value: tName });
                                if ( this.trace ) { console.warn('inserting 
shadow meta for ' + tName); }
                                this.emit('token', {type: 'Comment', data: 
JSON.stringify({
                                        "@type": "mw:shadow",
diff --git a/js/lib/mediawiki.WikitextSerializer.js 
b/js/lib/mediawiki.WikitextSerializer.js
index 8e3c33c..720a8af 100644
--- a/js/lib/mediawiki.WikitextSerializer.js
+++ b/js/lib/mediawiki.WikitextSerializer.js
@@ -2707,28 +2707,51 @@
                                state.inIndentPre = true;
                                var content = 
state.serializeChildrenToString(node);
 
-                               // Insert indentation
-                               content = ' ' +
-                                       
content.replace(/(\n(<!--(?:[^\-]|\-(?!\->))*\-\->)*)(?!$)/g, '$1 ' );
+                           // Strip (only the) trailing newline
+                           var trailingNL = content.match(/\n$/);
+                           content = content.replace(/\n$/, '');
 
-                               // Strip trailing separators
-                               //var trailingSep = content.match(/\s*$/);
-                               //content = content.replace(/\s*$/, '');
+                               // Insert indentation
+                               content = ' ' + 
content.replace(/(\n(<!--(?:[^\-]|\-(?!\->))*\-\->)*)/g, '$1 ');
+
+                               // But skip "empty lines" (lines with 1+ 
comment and optional whitespace)
+                               // since empty-lines sail through all handlers 
without being affected.
+                               // See empty_line_with_comments production in 
pegTokenizer.pegjs.txt
+                               //
+                               // We could use 'split' to split content into 
lines and selectively add
+                               // indentation, but the code will get 
unnecessarily complex for questionable
+                               // benefits. So, going this route for now.
+                               content = content.replace(/(^|\n) ((?:[ 
\t]*<!--(?:[^\-]|\-(?!\->))*\-\->[ \t]*)+)(?=\n|$)/, '$1$2');
 
                                cb(content, node);
 
                                // Preserve separator source
-                               //state.sep.src = trailingSep && trailingSep[0] 
|| '';
-                               state.sep.src = '';
+                               state.sep.src = trailingNL && trailingNL[0] || 
'';
                        }
                        state.inIndentPre = false;
                },
                sepnls: {
                        before: function(node, otherNode) {
-                               return node.data.parsoid.stx === 'html' ? {} : 
{min:1};
+                               if (node.data.parsoid.stx === 'html') {
+                                       return {};
+                               } else if (otherNode.nodeName === 'PRE' &&
+                                       otherNode.data.parsoid.stx !== 'html')
+                               {
+                                       return {min:2};
+                               } else {
+                                       return {min:1};
+                               }
                        },
                        after: function(node, otherNode) {
-                               return node.data.parsoid.stx === 'html' ? {} : 
{min:1};
+                               if (node.data.parsoid.stx === 'html') {
+                                       return {};
+                               } else if (otherNode.nodeName === 'PRE' &&
+                                       otherNode.data.parsoid.stx !== 'html')
+                               {
+                                       return {min:2};
+                               } else {
+                                       return {min:1};
+                               }
                        }
                }
        },
@@ -2843,11 +2866,11 @@
                                                var child = node.firstChild;
                                                while(child) {
                                                        if (DU.isElt(child)) {
-                                                               if 
(child.nodeName === 'SPAN' &&
-                                                                               
child.getAttribute('typeof') === 'mw:Entity')
-                                                               {
-                                                                       
state.serializer._serializeNode(child, state, cb);
-                                                               } else if 
(DU.isMarkerMeta(child, "mw:DiffMarker")) {
+                                                               /* jshint 
noempty: false */
+                                                               if 
(DU.isMarkerMeta(child, "mw:DiffMarker")) {
+                                                                       // 
nothing to do
+                                                               } else if 
(child.nodeName === 'SPAN' &&
+                                                                               
child.getAttribute('typeof') === 'mw:Entity') {
                                                                        
state.serializer._serializeNode(child, state, cb);
                                                                } else {
                                                                        
cb(child.outerHTML, node);
@@ -3557,30 +3580,32 @@
        var newSepMatch = res.match(/\n\s*$/);
        res = res.replace(/\n\s*$/, '');
 
-       // Don't strip two newlines for wikitext like this:
-       // <div>foo
-       //
-       // bar</div>
-       // The PHP parser won't create paragraphs on lines that also contain
-       // block-level tags.
-       if (node.parentNode.childNodes.length !== 1 ||
-                       !DU.isBlockNode(node.parentNode) ||
-                       //node.parentNode.data.parsoid.stx !== 'html' ||
-                       doubleNewlineCount !== 1)
-       {
-               // Strip more than one consecutive newline
-               res = res.replace(/\n([ \t]*\n)+/g, '\n');
-       }
-       // Strip trailing newlines from text content
-       //if (node.nextSibling && node.nextSibling.nodeType === 
node.ELEMENT_NODE) {
-       //      res = res.replace(/\n$/, ' ');
-       //} else {
-       //      res = res.replace(/\n$/, '');
-       //}
+       if (!state.inIndentPre) {
+               // Don't strip two newlines for wikitext like this:
+               // <div>foo
+               //
+               // bar</div>
+               // The PHP parser won't create paragraphs on lines that also 
contain
+               // block-level tags.
+               if (node.parentNode.childNodes.length !== 1 ||
+                               !DU.isBlockNode(node.parentNode) ||
+                               //node.parentNode.data.parsoid.stx !== 'html' ||
+                               doubleNewlineCount !== 1)
+               {
+                       // Strip more than one consecutive newline
+                       res = res.replace(/\n([ \t]*\n)+/g, '\n');
+               }
+               // Strip trailing newlines from text content
+               //if (node.nextSibling && node.nextSibling.nodeType === 
node.ELEMENT_NODE) {
+               //      res = res.replace(/\n$/, ' ');
+               //} else {
+               //      res = res.replace(/\n$/, '');
+               //}
 
-       // Strip leading newlines. They are already added to the separator 
source
-       // in handleSeparatorText.
-       res = res.replace(/^\n/, '');
+               // Strip leading newlines. They are already added to the 
separator source
+               // in handleSeparatorText.
+               res = res.replace(/^\n/, '');
+       }
 
        // Always escape entities
        res = Util.escapeEntities(res);
@@ -3668,7 +3693,7 @@
  * XXX: Support separator-transparent elements!
  */
 WSP.handleSeparatorText = function ( node, state ) {
-       if (DU.isText(node)) {
+       if (!state.inIndentPre && DU.isText(node)) {
                if (node.nodeValue.match(/^\s*$/)) {
                        state.sep.src = (state.sep.src || '') + node.nodeValue;
                        //if (!state.sep.lastSourceNode) {
@@ -4087,7 +4112,11 @@
                                dsrA = prevNode.parentNode.data.parsoid.dsr;
                        } else if (prevNode.previousSibling &&
                                        prevNode.previousSibling.nodeType === 
prevNode.ELEMENT_NODE &&
+                                       // FIXME: Not sure why we need this 
check because data-parsoid
+                                       // is loaded on all nodes. 
mw:Diffmarker maybe? But, if so, why?
+                                       // Should be fixed.
                                        prevNode.previousSibling.data &&
+                                       prevNode.previousSibling.data.parsoid &&
                                        
prevNode.previousSibling.data.parsoid.dsr &&
                                        // Don't extrapolate if the string was 
potentially changed
                                        // or we didn't diff (selser disabled)
@@ -4386,9 +4415,7 @@
                        state.prevNodeUnmodified = state.currNodeUnmodified;
                        state.currNodeUnmodified = false;
 
-                       if (state.selserMode) {
-                               this.trace("TEXT: ", node.nodeValue);
-                       }
+                       this.trace("TEXT: ", node.nodeValue);
 
                        if (!this.handleSeparatorText(node, state)) {
                                // Text is not just whitespace
@@ -4413,9 +4440,7 @@
                        state.prevNodeUnmodified = state.currNodeUnmodified;
                        state.currNodeUnmodified = false;
 
-                       if (state.selserMode) {
-                               this.trace("COMMENT: ", node.nodeValue);
-                       }
+                       this.trace("COMMENT: ", node.nodeValue);
 
                        // delay the newline creation until after the comment
                        if (!this.handleSeparatorText(node, state)) {
diff --git a/js/tests/parserTests-blacklist.js 
b/js/tests/parserTests-blacklist.js
index 5dcd5c2..6469302 100644
--- a/js/tests/parserTests-blacklist.js
+++ b/js/tests/parserTests-blacklist.js
@@ -541,8 +541,6 @@
 add("wt2wt", "Comment semantics: unclosed comment at end");
 add("wt2wt", "<nowiki> inside <pre> (bug 13238)");
 add("wt2wt", "<nowiki> and <pre> preference (first one wins)");
-add("wt2wt", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd 
line is significant");
-add("wt2wt", "6. Pre-blocks should extend across lines with leading WS even 
when there is no wrappable content");
 add("wt2wt", "HTML-pre: 1. embedded newlines");
 add("wt2wt", "Definition lists: self-closed tag");
 add("wt2wt", "External links: open square bracket forbidden in URL (named) 
(bug 4377)");
@@ -711,7 +709,6 @@
 add("html2html", "<pre> with forbidden attribute values (bug 3202)");
 add("html2html", "<nowiki> inside <pre> (bug 13238)");
 add("html2html", "Empty pre; pre inside other HTML tags (bug 54946)");
-add("html2html", "HTML pre followed by indent-pre");
 add("html2html", "3a. Indent-Pre and block tags (single-line html)");
 add("html2html", "3b. Indent-Pre and block tags (pre-content on separate 
line)");
 add("html2html", "4. Multiple spaces at start-of-line");
@@ -1188,6 +1185,7 @@
 add("html2html", "Empty TR followed by a template-generated 
TR\n(Parsoid-specific since PHP parser doesn't handle this mixed 
tbl-wikitext)");
 add("html2html", "Empty TR followed by mixed-ws-comment line should RT 
correctly");
 add("html2html", "Multi-line image caption generated by templates with/without 
trailing newlines");
+add("html2html", "Consecutive <pre>s should not get merged");
 
 
 // Blacklist for html2wt
@@ -1286,7 +1284,7 @@
 add("html2wt", "3a. Indent-Pre and block tags (single-line html)");
 add("html2wt", "3b. Indent-Pre and block tags (pre-content on separate line)");
 add("html2wt", "4. Multiple spaces at start-of-line");
-add("html2wt", "5. White-space in indent-pre\nNOTE: the white-space char on 
2nd line is significant");
+add("html2wt", "5a. White-space in indent-pre");
 add("html2wt", "6. Pre-blocks should extend across lines with leading WS even 
when there is no wrappable content");
 add("html2wt", "HTML-pre: 1. embedded newlines");
 add("html2wt", "HTML-pre: 2: indented text");
@@ -2376,21 +2374,15 @@
 add("selser", "<nowiki> and <pre> preference (first one wins) 
[0,0,[2,2],0,2,0]", 
"<pre>\n<nowiki>\n</pre>\nayl0rt9j2rvvlsor</nowiki>ku5wz6d4oixjemi\n</pre>\n\naoia6vijp7722o6r\n\n<nowiki>\n<pre>\n<nowiki>\n</pre>\n</nowiki>\n</pre>\n");
 add("selser", "2c. Indent-Pre and tables (bug 42252) [[4,[3],3,4]]", 
"{|<!--ypj7hfyfeggv6lxr-->\n|+<!--budgqhu8gdfb6gvi-->\n|}");
 add("selser", "2c. Indent-Pre and tables (bug 42252) [[4,0,0,4]]", 
"{|<!--75hp80omksqhncdi-->\n|+ foo\n <!--8xrgwr6xnqj8aor-->|}");
-add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd 
line is significant [[3,0,4]]", " <br/>\n a3l22m2436f4unmi");
-add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd 
line is significant [2]", "zj3bbk6ucui8uxr\n a<br/>\n \n b");
-add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd 
line is significant [1]", " a<br/>\n \n \n b");
-add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd 
line is significant [[3,0,0]]", " <br/>\n \n \n b");
-add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd 
line is significant [[0,0,3]]", " a<br/>\n");
-add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd 
line is significant [[0,0,2]]", " a<br/>\n cta0eum223wbqpvi\n \n b");
-add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd 
line is significant [[0,2,0]]", " ae5r26yt10dx6r<br/>\n \n \n b");
-add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd 
line is significant [[4,0,2]]", " 9cb82yu0k8adcxr<br/>\n y4lz0dsrgxkr19k9\n \n 
b");
-add("selser", "5. White-space in indent-pre\nNOTE: the white-space char on 2nd 
line is significant [[4,2,3]]", " lmoi3q8misaqyqfrdzigxr658zen4s4i<br/>\n");
-add("selser", "6. Pre-blocks should extend across lines with leading WS even 
when there is no wrappable content [0,3,[2],2,1]", " a\n \n <!-- continue -->\n 
b\n\n 0m6r034mkmon7b9c\n\n8yvdh6pxp8tfn7b9\n \nd");
-add("selser", "6. Pre-blocks should extend across lines with leading WS even 
when there is no wrappable content [2,0,1,0,[3]]", "opfgacofr6rh33di\n a\n \n 
<!-- continue -->\n b\n\n c\n \n");
-add("selser", "6. Pre-blocks should extend across lines with leading WS even 
when there is no wrappable content [0,2,0,0,0]", " a\n \n <!-- continue -->\n 
b\n4iktogo6ehw4gqfr\n\n c\n \nd");
-add("selser", "6. Pre-blocks should extend across lines with leading WS even 
when there is no wrappable content [2,4,0,0,3]", "iinv6rmhb9e8kt9\n a\n \n <!-- 
continue -->\n b\ng7k5nb5u0z035wmi\n c\n \n");
-add("selser", "6. Pre-blocks should extend across lines with leading WS even 
when there is no wrappable content [0,0,0,4,[2]]", " a\n \n <!-- continue -->\n 
b\n\n c\n\nf0hedm0999j54s4i\n\nbf387lq47n4uz0k9d");
-add("selser", "6. Pre-blocks should extend across lines with leading WS even 
when there is no wrappable content [3,3,1,2,[4]]", " c\n\nq4fdwd5681t3mcxr 
\no14qbnr6610jm7vi");
+add("selser", "5a. White-space in indent-pre [[0,0,4]]", " a<br />\n 
7tmueg6dyhqia4i");
+add("selser", "5a. White-space in indent-pre [1]", " a<br />\n \n \n b");
+add("selser", "5a. White-space in indent-pre [[3,0,0]]", " <br />\n \n \n b");
+add("selser", "5a. White-space in indent-pre [[3,2,0]]", " ghzkfux2q7pynwmi<br 
/>\n \n \n b");
+add("selser", "5a. White-space in indent-pre [[0,2,2]]", " 
avnpy3yvtyhh93sor<br />\n tryc19803p4lsor\n \n b");
+add("selser", "5a. White-space in indent-pre [[0,0,2]]", " a<br />\n 
m7led9fbke8nnrk9\n \n b");
+add("selser", "5a. White-space in indent-pre [[2,0,4]]", " d3ppdqvqzdxi529a<br 
/>\n 82567nb6q9hbmx6r");
+add("selser", "5a. White-space in indent-pre [[2,0,0]]", " 9blj98gpz0py14ia<br 
/>\n \n \n b");
+add("selser", "5a. White-space in indent-pre [[0,2,0]]", " 
am3m3vd8oxzu84cxr<br />\n \n \n b");
 add("selser", "HTML-pre: 1. embedded newlines [0,0,0,0,2,4,[4]]", 
"<pre>foo</pre>\n\n<pre>\nfoo\n</pre>\n\n0j7m5dwvy5t81tt9<pre>\n\nfoo\n</pre>gznvus2wm2ikke29<pre>3kx0qbxm4do39pb9</pre>");
 add("selser", "HTML-pre: 1. embedded newlines [0,2,[4],0,[4],3,2]", 
"<pre>foo</pre>7ysmnukvidpctyb9\n\n<pre>9g47a95uexywrk9</pre>\n\n<pre>frywx1s1mtervn29</pre>bk0krhbpq3nb3xr<pre>\n\n\nfoo\n</pre>");
 add("selser", "HTML-pre: 1. embedded newlines [1,0,[3],2,0,4,0]", "<pre 
data-foobar=\"kvsknufiw5r0be29\">foo</pre>\n\n<pre></pre>7p6kn1agspz4zpvi\n\n<pre>\n\nfoo\n</pre>2twixsqcqcdz33di<pre>\n\n\nfoo\n</pre>");
diff --git a/js/tests/parserTests.txt b/js/tests/parserTests.txt
index 5e690f5..6a96970 100644
--- a/js/tests/parserTests.txt
+++ b/js/tests/parserTests.txt
@@ -2213,11 +2213,11 @@
 
 !!end
 
+## NOTE: the leading white-space chars on empty line are significant
 !! test
-5. White-space in indent-pre
-NOTE: the white-space char on 2nd line is significant
+5a. White-space in indent-pre
 !! input
- a<br/>
+ a<br />
  
  b
 !! result
@@ -2226,6 +2226,27 @@
 b
 </pre>
 !! end
+
+## NOTE: the leading white-space chars on empty line are significant
+!! test
+5b. White-space in indent-pre
+!! input
+ a
+ 
+ b
+ 
+ 
+ c
+!! result
+<pre>a
+
+b
+
+
+c
+</pre>
+!! end
+
 
 !! test
 6. Pre-blocks should extend across lines with leading WS even when there is no 
wrappable content
@@ -18915,6 +18936,38 @@
 !! end
 
 !! test
+Consecutive <pre>s should not get merged
+!! options
+parsoid=html2wt,html2html
+!! input
+ a
+
+ b
+
+ c
+
+ d
+
+ e
+ 
+
+ 
+ f
+!! result
+<pre>a</pre><pre>b</pre>
+
+<pre>c
+</pre><pre>
+d</pre>
+
+<pre>e
+
+</pre><pre>
+
+f</pre>
+!! end
+
+!! test
 Edited ISBN links not serializable as ISBN links should serialize as wikilinks
 !! options
 parsoid=html2wt

-- 
To view, visit https://gerrit.wikimedia.org/r/101337
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I24ff084301d7f188e0d64a7034659193fa20ae34
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: GWicke <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to