jenkins-bot has submitted this change and it was merged.

Change subject: Fix sol handling in separators + tag handler cleanup
......................................................................


Fix sol handling in separators + tag handler cleanup

* Only /\n$/ puts serializer in SOL-state, not /\n\s*$/.

* However, fixing this bug exposed another fairly serious one.
  NL constraints for various HTML tags don't indicate whether
  the wikitext for the HTML tags should force SOL state by stripping
  excess whitespace characters at the start of line.

[subbu@earth lib] echo "<ul>\n<li>a</li>\n <li>b</li></ul>" | node parse
--html2wt
* a
 * b

  This is clearly a bug. The reason we didn't trip over it all this
  while is because 'make-indent-pre' stripped the excess whitespace
  to prevent indent-pre-ing the content. However, that is a scenario
  of one bug papering over another bug.

* This patch also introduces a new flag in tag handlers.
  Tags can declare if they force SOL state on separators.
  This flag is used to suppress non-nl whitespace before emitting wikitext
  for those tags.

* The fly-in-the-ointment there were some tags that have different
  behavior in different contexts. Ex: ";a:b" vs ";a\n:b". The <dd> tag
  in the second example forces SOL context but not the former. Added
  additional handling to deal with this scenario.

  However, this also led to some nice code cleanup in the tag handlers.

* No net parser test changes after all these fixes.

Change-Id: I7a5ccd33b31fa4222f016e33a9576f4127590b2d
---
M lib/html2wt/SerializerState.js
M lib/html2wt/TagHandlers.js
M lib/html2wt/WikitextSerializer.js
M lib/html2wt/separators.js
4 files changed, 96 insertions(+), 87 deletions(-)

Approvals:
  Arlolra: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/html2wt/SerializerState.js b/lib/html2wt/SerializerState.js
index 1ede07f..ee69f37 100644
--- a/lib/html2wt/SerializerState.js
+++ b/lib/html2wt/SerializerState.js
@@ -218,7 +218,7 @@
 SSP.sepIntroducedSOL = function(sep) {
        // Don't get tripped by newlines in comments!  Be wary of nowikis added
        // by makeSepIndentPreSafe on the last line.
-       if (sep.replace(Util.COMMENT_REGEXP_G, '').search(/\n\s*$/) !== -1) {
+       if (sep.replace(Util.COMMENT_REGEXP_G, '').search(/\n$/) !== -1) {
                this.onSOL = true;
        }
 };
diff --git a/lib/html2wt/TagHandlers.js b/lib/html2wt/TagHandlers.js
index 5f792f4..6eaa66d 100644
--- a/lib/html2wt/TagHandlers.js
+++ b/lib/html2wt/TagHandlers.js
@@ -33,6 +33,7 @@
 
 function buildHeadingHandler(headingWT) {
        return {
+               forceSOL: true,
                handle: function(node, state, cb) {
                        // For new elements, for prettier wikitext 
serialization,
                        // emit a space after the last '=' char.
@@ -244,6 +245,7 @@
        }
 
        return {
+               forceSOL: true,
                handle: function(node, state, cb) {
                        // Disable single-line context here so that separators 
aren't
                        // suppressed between nested list elements.
@@ -288,6 +290,43 @@
                                }
                        },
                        after: wtListEOL,
+               },
+       };
+}
+
+function buildDDHandler(stx) {
+       return {
+               forceSOL: stx !== 'row',
+               handle: function(node, state, cb) {
+                       var firstChildElement = DU.firstNonSepChildNode(node);
+                       if (!DU.isList(firstChildElement) || 
isTplListWithoutSharedPrefix(firstChildElement)) {
+                               if (stx === 'row') {
+                                       cb(':', node);
+                               } else {
+                                       cb(getListBullets(state, node), node);
+                               }
+                       }
+                       var liHandler = 
state.serializer.wteHandlers.liHandler.bind(state.serializer.wteHandlers, node);
+                       state.singleLineContext.enforce();
+                       state.serializeChildren(node, cb, liHandler);
+                       state.singleLineContext.pop();
+               },
+               sepnls: {
+                       before: function(node, othernode) {
+                               if (stx === 'row') {
+                                       return { min: 0, max: 0 };
+                               } else {
+                                       return { min: 1, max: 2 };
+                               }
+                       },
+                       after: wtListEOL,
+                       firstChild: function(node, otherNode) {
+                               if (!DU.isList(otherNode)) {
+                                       return { min: 0, max: 0 };
+                               } else {
+                                       return {};
+                               }
+                       },
                },
        };
 }
@@ -469,6 +508,7 @@
        ol: buildListHandler({ LI: 1 }),
 
        li: {
+               forceSOL: true,
                handle: function(node, state, cb) {
                        var firstChildElement = DU.firstNonSepChildNode(node);
                        if (!DU.isList(firstChildElement) || 
isTplListWithoutSharedPrefix(firstChildElement)) {
@@ -500,6 +540,7 @@
        },
 
        dt: {
+               forceSOL: true,
                handle: function(node, state, cb) {
                        var firstChildElement = DU.firstNonSepChildNode(node);
                        if (!DU.isList(firstChildElement) || 
isTplListWithoutSharedPrefix(firstChildElement)) {
@@ -529,41 +570,8 @@
                },
        },
 
-       dd: {
-               handle: function(node, state, cb) {
-                       var firstChildElement = DU.firstNonSepChildNode(node);
-                       if (!DU.isList(firstChildElement) || 
isTplListWithoutSharedPrefix(firstChildElement)) {
-                               // XXX: handle stx: row
-                               if (DU.getDataParsoid(node).stx === 'row') {
-                                       cb(':', node);
-                               } else {
-                                       cb(getListBullets(state, node), node);
-                               }
-                       }
-                       var liHandler = 
state.serializer.wteHandlers.liHandler.bind(state.serializer.wteHandlers, node);
-                       state.singleLineContext.enforce();
-                       state.serializeChildren(node, cb, liHandler);
-                       state.singleLineContext.pop();
-               },
-               sepnls: {
-                       before: function(node, othernode) {
-                               // Handle single-line dt/dd
-                               if (DU.getDataParsoid(node).stx === 'row') {
-                                       return { min: 0, max: 0 };
-                               } else {
-                                       return { min: 1, max: 2 };
-                               }
-                       },
-                       after: wtListEOL,
-                       firstChild: function(node, otherNode) {
-                               if (!DU.isList(otherNode)) {
-                                       return { min: 0, max: 0 };
-                               } else {
-                                       return {};
-                               }
-                       },
-               },
-       },
+       dd_row: buildDDHandler('row'), // single-line dt/dd
+       dd: buildDDHandler(), // multi-line dt/dd
 
        // XXX: handle options
        table: {
@@ -745,6 +753,11 @@
        // Insert the text handler here too?
        '#text': { },
        p: {
+               // Counterintuitive but seems right.
+               // Otherwise the generated wikitext will parse as an indent-pre
+               // escapeWikitext nowiking will deal with leading space for 
content
+               // inside the p-tag, but forceSOL suppresses whitespace before 
the p-tag.
+               forceSOL: true,
                handle: function(node, state, cb) {
                        // XXX: Handle single-line mode by switching to HTML 
handler!
                        state.serializeChildren(node, cb, null);
@@ -813,6 +826,7 @@
                        },
                },
        },
+       // Wikitext indent pre generated with leading space
        pre: {
                handle: function(node, state, cb) {
                        // Handle indent pre
@@ -860,9 +874,7 @@
                },
                sepnls: {
                        before: function(node, otherNode) {
-                               if (DU.getDataParsoid(node).stx === 'html') {
-                                       return {};
-                               } else if (otherNode.nodeName === 'PRE' &&
+                               if (otherNode.nodeName === 'PRE' &&
                                        DU.getDataParsoid(otherNode).stx !== 
'html') {
                                        return {min: 2};
                                } else {
@@ -870,29 +882,27 @@
                                }
                        },
                        after: function(node, otherNode) {
-                               if (DU.getDataParsoid(node).stx === 'html') {
-                                       return {};
-                               } else if (otherNode.nodeName === 'PRE' &&
+                               if (otherNode.nodeName === 'PRE' &&
                                        DU.getDataParsoid(otherNode).stx !== 
'html') {
                                        return {min: 2};
                                } else {
                                        return {min: 1};
                                }
                        },
-                       firstChild: function(node, otherNode) {
-                               if (DU.getDataParsoid(node).stx === 'html') {
-                                       return { max: Number.MAX_VALUE };
-                               } else {
-                                       return {};
-                               }
-                       },
-                       lastChild: function(node, otherNode) {
-                               if (DU.getDataParsoid(node).stx === 'html') {
-                                       return { max: Number.MAX_VALUE };
-                               } else {
-                                       return {};
-                               }
-                       },
+                       firstChild: id({}),
+                       lastChild: id({}),
+               },
+       },
+       // HTML pre
+       pre_html: {
+               handle: function(node, state, cb) {
+                       return state.serializer._htmlElementHandler(node, 
state, cb);
+               },
+               sepnls: {
+                       before: id({}),
+                       after: id({}),
+                       firstChild: id({ max: Number.MAX_VALUE }),
+                       lastChild:  id({ max: Number.MAX_VALUE }),
                },
        },
        meta: {
diff --git a/lib/html2wt/WikitextSerializer.js 
b/lib/html2wt/WikitextSerializer.js
index 2749f6f..5905c6e 100644
--- a/lib/html2wt/WikitextSerializer.js
+++ b/lib/html2wt/WikitextSerializer.js
@@ -659,8 +659,6 @@
 
        var self = this;
        var dp = DU.getDataParsoid(node);
-       var nodeName = node.nodeName.toLowerCase();
-       var handler;
        var typeOf = node.getAttribute('typeof') || '';
 
        // XXX: Convert into separate handlers?
@@ -751,6 +749,8 @@
        }
 
        if (dp.src !== undefined) {
+               // Uneditable forms wrapped with mw:Placeholder tags
+               // OR unedited nowikis
                if (/(^|\s)mw:Placeholder(\/\w*)?$/.test(typeOf) ||
                                (typeOf === "mw:Nowiki" && node.textContent === 
dp.src)) {
                        // implement generic src round-tripping:
@@ -766,7 +766,10 @@
                                        }
                                },
                        };
-               } else if (typeOf === "mw:Entity" && node.childNodes.length === 
1) {
+               }
+
+               // Entities
+               if (typeOf === "mw:Entity" && node.childNodes.length === 1) {
                        var contentSrc = node.textContent || node.innerHTML;
                        return {
                                handle: function() {
@@ -780,39 +783,32 @@
                }
        }
 
-       // Handle html pres
-       if (nodeName === 'pre' && dp.stx === 'html') {
-               return {
-                       handle: self._htmlElementHandler.bind(self),
-                       sepnls: self.tagHandlers[nodeName].sepnls,
-               };
+       var htmlTagHandler = { handle: self._htmlElementHandler.bind(self) };
+
+       // If available, use a specialized handler for serializing
+       // to the specialized syntactic form of the tag.
+       var nodeName = node.nodeName.toLowerCase();
+       var altHandler = self.tagHandlers[nodeName + "_" + dp.stx];
+
+       // Unless a specialized handler is available, use the HTML handler
+       // for html-stx tags. But, <a> tags should never serialize as HTML.
+       if (!altHandler && dp.stx === 'html' && nodeName !== 'a') {
+               return htmlTagHandler;
+       }
+
        // If parent node is a list or table tag in html-syntax, then serialize
        // new elements in html-syntax rather than wiki-syntax.
-       // Anchor elements should never serialize as html, regardless of what
-       // data-parsoid has to say about the matter.
-       } else if (nodeName !== "a" && (dp.stx === 'html' ||
-               (DU.isNewElt(node) && !DU.atTheTop(node) &&
+       if (DU.isNewElt(node) && !DU.atTheTop(node) &&
                !DU.isDocumentFragment(node.parentNode) &&
                DU.getDataParsoid(node.parentNode).stx === 'html' &&
                ((DU.isList(node.parentNode) && DU.isListItem(node)) ||
                        (Consts.ParentTableTags.has(node.parentNode.nodeName) &&
-                       Consts.ChildTableTags.has(node.nodeName)))
-       ))) {
-               return { handle: self._htmlElementHandler.bind(self) };
-       } else if (self.tagHandlers[nodeName]) {
-               handler = self.tagHandlers[nodeName];
-               if (!handler.handle) {
-                       return {
-                               handle: self._htmlElementHandler.bind(self),
-                               sepnls: handler.sepnls,
-                       };
-               } else {
-                       return handler || null;
-               }
-       } else {
-               // XXX: check against element whitelist and drop those not on 
it?
-               return { handle: self._htmlElementHandler.bind(self) };
+                       Consts.ChildTableTags.has(node.nodeName)))) {
+               return htmlTagHandler;
        }
+
+       // Pick the best available handler
+       return altHandler || self.tagHandlers[nodeName] || htmlTagHandler;
 };
 
 WSP.separatorREs = {
diff --git a/lib/html2wt/separators.js b/lib/html2wt/separators.js
index 1674aeb..8b85e90 100644
--- a/lib/html2wt/separators.js
+++ b/lib/html2wt/separators.js
@@ -378,6 +378,8 @@
 
        state.sep.constraints.constraintInfo = {
                onSOL: state.onSOL,
+               // force SOL state when separator is built/emitted
+               forceSOL: handlerB && handlerB.forceSOL,
                sepType: sepType,
                nodeA: nodeA,
                nodeB: nodeB,
@@ -404,6 +406,7 @@
        var sepType = constraintInfo.sepType;
        var nodeA = constraintInfo.nodeA;
        var nodeB = constraintInfo.nodeB;
+       var forceSOL = constraintInfo.forceSOL && sepType !== 'child-parent';
        var origNodeB = nodeB;
 
        // Ex: "<div>foo</div>\n <span>bar</span>"
@@ -418,7 +421,7 @@
        // which has the same purpose.
        if (state.serializer.options.extName !== 'ref' && !state.inIndentPre &&
                (NL_WS_COMMENTS_SEP_REGEXP.test(sep)
-               || (constraintInfo.onSOL && 
WS_COMMENTS_SEP_TEST_REGEXP.test(sep)))
+               || WS_COMMENTS_SEP_TEST_REGEXP.test(sep) && 
(constraintInfo.onSOL || forceSOL))
        ) {
                // 'sep' is the separator before 'nodeB' and it has leading 
spaces on a newline.
                // We have to decide whether that leading space will trigger 
indent-pres in wikitext.
@@ -496,7 +499,7 @@
                        }
                }
 
-               var stripLeadingSpace = constraintInfo.onSOL && nodeB && 
Consts.SolSpaceSensitiveTags.has(nodeB.nodeName);
+               var stripLeadingSpace = (constraintInfo.onSOL || forceSOL) && 
nodeB && Consts.SolSpaceSensitiveTags.has(nodeB.nodeName);
                if (!isIndentPreSafe || stripLeadingSpace) {
                        // Wrap non-nl ws from last line, but preserve comments.
                        // This avoids triggering indent-pres.

-- 
To view, visit https://gerrit.wikimedia.org/r/247177
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I7a5ccd33b31fa4222f016e33a9576f4127590b2d
Gerrit-PatchSet: 8
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: Arlolra <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to