jenkins-bot has submitted this change and it was merged.
Change subject: Fix sol handling in separators + tag handler cleanup
......................................................................
Fix sol handling in separators + tag handler cleanup
* Only /\n$/ puts serializer in SOL-state, not /\n\s*$/.
* However, fixing this bug exposed another fairly serious one.
NL constraints for various HTML tags don't indicate whether
the wikitext for the HTML tags should force SOL state by stripping
excess whitespace characters at the start of line.
[subbu@earth lib] echo "<ul>\n<li>a</li>\n <li>b</li></ul>" | node parse
--html2wt
* a
* b
This is clearly a bug. The reason we didn't trip over it all this
while is because 'make-indent-pre' stripped the excess whitespace
to prevent indent-pre-ing the content. However, that is a scenario
of one bug papering over another bug.
* This patch also introduces a new flag in tag handlers.
Tags can declare if they force SOL state on separators.
This flag is used to suppress non-nl whitespace before emitting wikitext
for those tags.
* The fly-in-the-ointment there were some tags that have different
behavior in different contexts. Ex: ";a:b" vs ";a\n:b". The <dd> tag
in the second example forces SOL context but not the former. Added
additional handling to deal with this scenario.
However, this also led to some nice code cleanup in the tag handlers.
* No net parser test changes after all these fixes.
Change-Id: I7a5ccd33b31fa4222f016e33a9576f4127590b2d
---
M lib/html2wt/SerializerState.js
M lib/html2wt/TagHandlers.js
M lib/html2wt/WikitextSerializer.js
M lib/html2wt/separators.js
4 files changed, 96 insertions(+), 87 deletions(-)
Approvals:
Arlolra: Looks good to me, approved
jenkins-bot: Verified
diff --git a/lib/html2wt/SerializerState.js b/lib/html2wt/SerializerState.js
index 1ede07f..ee69f37 100644
--- a/lib/html2wt/SerializerState.js
+++ b/lib/html2wt/SerializerState.js
@@ -218,7 +218,7 @@
SSP.sepIntroducedSOL = function(sep) {
// Don't get tripped by newlines in comments! Be wary of nowikis added
// by makeSepIndentPreSafe on the last line.
- if (sep.replace(Util.COMMENT_REGEXP_G, '').search(/\n\s*$/) !== -1) {
+ if (sep.replace(Util.COMMENT_REGEXP_G, '').search(/\n$/) !== -1) {
this.onSOL = true;
}
};
diff --git a/lib/html2wt/TagHandlers.js b/lib/html2wt/TagHandlers.js
index 5f792f4..6eaa66d 100644
--- a/lib/html2wt/TagHandlers.js
+++ b/lib/html2wt/TagHandlers.js
@@ -33,6 +33,7 @@
function buildHeadingHandler(headingWT) {
return {
+ forceSOL: true,
handle: function(node, state, cb) {
// For new elements, for prettier wikitext
serialization,
// emit a space after the last '=' char.
@@ -244,6 +245,7 @@
}
return {
+ forceSOL: true,
handle: function(node, state, cb) {
// Disable single-line context here so that separators
aren't
// suppressed between nested list elements.
@@ -288,6 +290,43 @@
}
},
after: wtListEOL,
+ },
+ };
+}
+
+function buildDDHandler(stx) {
+ return {
+ forceSOL: stx !== 'row',
+ handle: function(node, state, cb) {
+ var firstChildElement = DU.firstNonSepChildNode(node);
+ if (!DU.isList(firstChildElement) ||
isTplListWithoutSharedPrefix(firstChildElement)) {
+ if (stx === 'row') {
+ cb(':', node);
+ } else {
+ cb(getListBullets(state, node), node);
+ }
+ }
+ var liHandler =
state.serializer.wteHandlers.liHandler.bind(state.serializer.wteHandlers, node);
+ state.singleLineContext.enforce();
+ state.serializeChildren(node, cb, liHandler);
+ state.singleLineContext.pop();
+ },
+ sepnls: {
+ before: function(node, othernode) {
+ if (stx === 'row') {
+ return { min: 0, max: 0 };
+ } else {
+ return { min: 1, max: 2 };
+ }
+ },
+ after: wtListEOL,
+ firstChild: function(node, otherNode) {
+ if (!DU.isList(otherNode)) {
+ return { min: 0, max: 0 };
+ } else {
+ return {};
+ }
+ },
},
};
}
@@ -469,6 +508,7 @@
ol: buildListHandler({ LI: 1 }),
li: {
+ forceSOL: true,
handle: function(node, state, cb) {
var firstChildElement = DU.firstNonSepChildNode(node);
if (!DU.isList(firstChildElement) ||
isTplListWithoutSharedPrefix(firstChildElement)) {
@@ -500,6 +540,7 @@
},
dt: {
+ forceSOL: true,
handle: function(node, state, cb) {
var firstChildElement = DU.firstNonSepChildNode(node);
if (!DU.isList(firstChildElement) ||
isTplListWithoutSharedPrefix(firstChildElement)) {
@@ -529,41 +570,8 @@
},
},
- dd: {
- handle: function(node, state, cb) {
- var firstChildElement = DU.firstNonSepChildNode(node);
- if (!DU.isList(firstChildElement) ||
isTplListWithoutSharedPrefix(firstChildElement)) {
- // XXX: handle stx: row
- if (DU.getDataParsoid(node).stx === 'row') {
- cb(':', node);
- } else {
- cb(getListBullets(state, node), node);
- }
- }
- var liHandler =
state.serializer.wteHandlers.liHandler.bind(state.serializer.wteHandlers, node);
- state.singleLineContext.enforce();
- state.serializeChildren(node, cb, liHandler);
- state.singleLineContext.pop();
- },
- sepnls: {
- before: function(node, othernode) {
- // Handle single-line dt/dd
- if (DU.getDataParsoid(node).stx === 'row') {
- return { min: 0, max: 0 };
- } else {
- return { min: 1, max: 2 };
- }
- },
- after: wtListEOL,
- firstChild: function(node, otherNode) {
- if (!DU.isList(otherNode)) {
- return { min: 0, max: 0 };
- } else {
- return {};
- }
- },
- },
- },
+ dd_row: buildDDHandler('row'), // single-line dt/dd
+ dd: buildDDHandler(), // multi-line dt/dd
// XXX: handle options
table: {
@@ -745,6 +753,11 @@
// Insert the text handler here too?
'#text': { },
p: {
+ // Counterintuitive but seems right.
+ // Otherwise the generated wikitext will parse as an indent-pre
+ // escapeWikitext nowiking will deal with leading space for
content
+ // inside the p-tag, but forceSOL suppresses whitespace before
the p-tag.
+ forceSOL: true,
handle: function(node, state, cb) {
// XXX: Handle single-line mode by switching to HTML
handler!
state.serializeChildren(node, cb, null);
@@ -813,6 +826,7 @@
},
},
},
+ // Wikitext indent pre generated with leading space
pre: {
handle: function(node, state, cb) {
// Handle indent pre
@@ -860,9 +874,7 @@
},
sepnls: {
before: function(node, otherNode) {
- if (DU.getDataParsoid(node).stx === 'html') {
- return {};
- } else if (otherNode.nodeName === 'PRE' &&
+ if (otherNode.nodeName === 'PRE' &&
DU.getDataParsoid(otherNode).stx !==
'html') {
return {min: 2};
} else {
@@ -870,29 +882,27 @@
}
},
after: function(node, otherNode) {
- if (DU.getDataParsoid(node).stx === 'html') {
- return {};
- } else if (otherNode.nodeName === 'PRE' &&
+ if (otherNode.nodeName === 'PRE' &&
DU.getDataParsoid(otherNode).stx !==
'html') {
return {min: 2};
} else {
return {min: 1};
}
},
- firstChild: function(node, otherNode) {
- if (DU.getDataParsoid(node).stx === 'html') {
- return { max: Number.MAX_VALUE };
- } else {
- return {};
- }
- },
- lastChild: function(node, otherNode) {
- if (DU.getDataParsoid(node).stx === 'html') {
- return { max: Number.MAX_VALUE };
- } else {
- return {};
- }
- },
+ firstChild: id({}),
+ lastChild: id({}),
+ },
+ },
+ // HTML pre
+ pre_html: {
+ handle: function(node, state, cb) {
+ return state.serializer._htmlElementHandler(node,
state, cb);
+ },
+ sepnls: {
+ before: id({}),
+ after: id({}),
+ firstChild: id({ max: Number.MAX_VALUE }),
+ lastChild: id({ max: Number.MAX_VALUE }),
},
},
meta: {
diff --git a/lib/html2wt/WikitextSerializer.js
b/lib/html2wt/WikitextSerializer.js
index 2749f6f..5905c6e 100644
--- a/lib/html2wt/WikitextSerializer.js
+++ b/lib/html2wt/WikitextSerializer.js
@@ -659,8 +659,6 @@
var self = this;
var dp = DU.getDataParsoid(node);
- var nodeName = node.nodeName.toLowerCase();
- var handler;
var typeOf = node.getAttribute('typeof') || '';
// XXX: Convert into separate handlers?
@@ -751,6 +749,8 @@
}
if (dp.src !== undefined) {
+ // Uneditable forms wrapped with mw:Placeholder tags
+ // OR unedited nowikis
if (/(^|\s)mw:Placeholder(\/\w*)?$/.test(typeOf) ||
(typeOf === "mw:Nowiki" && node.textContent ===
dp.src)) {
// implement generic src round-tripping:
@@ -766,7 +766,10 @@
}
},
};
- } else if (typeOf === "mw:Entity" && node.childNodes.length ===
1) {
+ }
+
+ // Entities
+ if (typeOf === "mw:Entity" && node.childNodes.length === 1) {
var contentSrc = node.textContent || node.innerHTML;
return {
handle: function() {
@@ -780,39 +783,32 @@
}
}
- // Handle html pres
- if (nodeName === 'pre' && dp.stx === 'html') {
- return {
- handle: self._htmlElementHandler.bind(self),
- sepnls: self.tagHandlers[nodeName].sepnls,
- };
+ var htmlTagHandler = { handle: self._htmlElementHandler.bind(self) };
+
+ // If available, use a specialized handler for serializing
+ // to the specialized syntactic form of the tag.
+ var nodeName = node.nodeName.toLowerCase();
+ var altHandler = self.tagHandlers[nodeName + "_" + dp.stx];
+
+ // Unless a specialized handler is available, use the HTML handler
+ // for html-stx tags. But, <a> tags should never serialize as HTML.
+ if (!altHandler && dp.stx === 'html' && nodeName !== 'a') {
+ return htmlTagHandler;
+ }
+
// If parent node is a list or table tag in html-syntax, then serialize
// new elements in html-syntax rather than wiki-syntax.
- // Anchor elements should never serialize as html, regardless of what
- // data-parsoid has to say about the matter.
- } else if (nodeName !== "a" && (dp.stx === 'html' ||
- (DU.isNewElt(node) && !DU.atTheTop(node) &&
+ if (DU.isNewElt(node) && !DU.atTheTop(node) &&
!DU.isDocumentFragment(node.parentNode) &&
DU.getDataParsoid(node.parentNode).stx === 'html' &&
((DU.isList(node.parentNode) && DU.isListItem(node)) ||
(Consts.ParentTableTags.has(node.parentNode.nodeName) &&
- Consts.ChildTableTags.has(node.nodeName)))
- ))) {
- return { handle: self._htmlElementHandler.bind(self) };
- } else if (self.tagHandlers[nodeName]) {
- handler = self.tagHandlers[nodeName];
- if (!handler.handle) {
- return {
- handle: self._htmlElementHandler.bind(self),
- sepnls: handler.sepnls,
- };
- } else {
- return handler || null;
- }
- } else {
- // XXX: check against element whitelist and drop those not on
it?
- return { handle: self._htmlElementHandler.bind(self) };
+ Consts.ChildTableTags.has(node.nodeName)))) {
+ return htmlTagHandler;
}
+
+ // Pick the best available handler
+ return altHandler || self.tagHandlers[nodeName] || htmlTagHandler;
};
WSP.separatorREs = {
diff --git a/lib/html2wt/separators.js b/lib/html2wt/separators.js
index 1674aeb..8b85e90 100644
--- a/lib/html2wt/separators.js
+++ b/lib/html2wt/separators.js
@@ -378,6 +378,8 @@
state.sep.constraints.constraintInfo = {
onSOL: state.onSOL,
+ // force SOL state when separator is built/emitted
+ forceSOL: handlerB && handlerB.forceSOL,
sepType: sepType,
nodeA: nodeA,
nodeB: nodeB,
@@ -404,6 +406,7 @@
var sepType = constraintInfo.sepType;
var nodeA = constraintInfo.nodeA;
var nodeB = constraintInfo.nodeB;
+ var forceSOL = constraintInfo.forceSOL && sepType !== 'child-parent';
var origNodeB = nodeB;
// Ex: "<div>foo</div>\n <span>bar</span>"
@@ -418,7 +421,7 @@
// which has the same purpose.
if (state.serializer.options.extName !== 'ref' && !state.inIndentPre &&
(NL_WS_COMMENTS_SEP_REGEXP.test(sep)
- || (constraintInfo.onSOL &&
WS_COMMENTS_SEP_TEST_REGEXP.test(sep)))
+ || WS_COMMENTS_SEP_TEST_REGEXP.test(sep) &&
(constraintInfo.onSOL || forceSOL))
) {
// 'sep' is the separator before 'nodeB' and it has leading
spaces on a newline.
// We have to decide whether that leading space will trigger
indent-pres in wikitext.
@@ -496,7 +499,7 @@
}
}
- var stripLeadingSpace = constraintInfo.onSOL && nodeB &&
Consts.SolSpaceSensitiveTags.has(nodeB.nodeName);
+ var stripLeadingSpace = (constraintInfo.onSOL || forceSOL) &&
nodeB && Consts.SolSpaceSensitiveTags.has(nodeB.nodeName);
if (!isIndentPreSafe || stripLeadingSpace) {
// Wrap non-nl ws from last line, but preserve comments.
// This avoids triggering indent-pres.
--
To view, visit https://gerrit.wikimedia.org/r/247177
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I7a5ccd33b31fa4222f016e33a9576f4127590b2d
Gerrit-PatchSet: 8
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: Arlolra <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits