[MediaWiki-commits] [Gerrit] T115018: Some tweaks to separator handling to handle unicode... - change (mediawiki...parsoid)

jenkins-bot (Code Review) Thu, 22 Oct 2015 17:00:39 -0700

jenkins-bot has submitted this change and it was merged.

Change subject: T115018: Some tweaks to separator handling to handle unicode 
chars.
......................................................................



T115018: Some tweaks to separator handling to handle unicode chars.

* Refactored regular expressions used to match/extract separator
  text in different scenarios so that we can tweak them when we
  discover more nuances and keep them consistent across uses.

* In this patch, I've fixed up a couple of regular expressions to
  deal with unicode space and other bidi characters that show up
  in the HTML. I've left behind a couple FIXMEs to investigate later
  if the other regexps need similar generalization.

* Editors expect categories to appear to be on separate lines, but if
  there is some extra invisible whitespace there, that's okay.
  Directionality markers should be ignored here, even though they are
  pretty pointless immediately before a newline (since they only affect
  the "current paragraph").

* This fixes the roundtrip diffs seen in hewiki:בישעה with the
  latest master.

* Parser test verifies the expectation.

Change-Id: Ie85669374618da5e0d809411220346943d193436
---
M lib/mediawiki.WikitextSerializer.js
M lib/wts.separators.js
M tests/parserTests.txt
3 files changed, 53 insertions(+), 12 deletions(-)

Approvals:
  Arlolra: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/lib/mediawiki.WikitextSerializer.js 
b/lib/mediawiki.WikitextSerializer.js
index 53ea27d..7e72f8e 100644
--- a/lib/mediawiki.WikitextSerializer.js
+++ b/lib/mediawiki.WikitextSerializer.js
@@ -26,6 +26,7 @@
 require('./core-upgrade.js');
 
 var util = require('util');
+var JSUtils = require('./jsutils.js').JSUtils;
 var wtConsts = require('./mediawiki.wikitext.constants.js');
 var Util = require('./mediawiki.Util.js').Util;
 var DU = require('./mediawiki.DOMUtils.js').DOMUtils;
@@ -814,18 +815,43 @@
        }
 };
 
+// Accept unicode spaces (\p{Zs}) and unicode directionality
+// markers ("bidi_control": [\u200E\u200F\u202A-\u202E]).
+// We could accept any invisible non-printable character,
+// but let's not go crazy quite yet.
+// Looks like some pages on hewiki use the \u200f character
+// and we don't want to be tripping over them. Include them
+// in the separator string so that newlines surrounded by these
+// control characters still count towards required
+// inter(-wikitext-)line-separators.
+var unicodeSpacesAndBIDIChars = 
'\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000\u200E\u200F\u202A-\u202E';
+var anySpaceCharRE = JSUtils.rejoin('[\\s', unicodeSpacesAndBIDIChars, ']');
+
+WSP.separatorREs = {
+       pureSepRE: JSUtils.rejoin('^', anySpaceCharRE, '*$'),
+       // We only care about the unicode and BIDI chars before the \n
+       // since we want those newlines to count towards required
+       // inter(-wikitext-)line-separators.
+       sepPrefixWithNlsRE: JSUtils.rejoin('^(?:[ \\t', 
unicodeSpacesAndBIDIChars, ']*\\n)+\\s*'),
+       // SSS FIXME: I am not sure that unicode char business applies here.
+       // sepSuffixWithNlsRE: JSUtils.rejoin(/\n/, anySpaceCharRE, '*$'),
+       sepSuffixWithNlsRE: /\n\s*$/,
+       // SSS FIXME: I am not sure that unicode char business applies here.
+       doubleNewlineRE_G: /\n([ \t]*\n)+/g,
+};
+
 /**
  * Serialize the content of a text node
  */
 WSP._serializeTextNode = function(node, state, cb) {
        // write out a potential separator?
        var res = node.nodeValue;
-       var doubleNewlineMatch = res.match(/\n([ \t]*\n)+/g);
+       var doubleNewlineMatch = res.match(this.separatorREs.doubleNewlineRE_G);
        var doubleNewlineCount = doubleNewlineMatch && 
doubleNewlineMatch.length || 0;
 
        // Deal with trailing separator-like text (at least 1 newline and other 
whitespace)
-       var newSepMatch = res.match(/\n\s*$/);
-       res = res.replace(/\n\s*$/, '');
+       var newSepMatch = res.match(this.separatorREs.sepSuffixWithNlsRE);
+       res = res.replace(this.separatorREs.sepSuffixWithNlsRE, '');
 
        if (!state.inIndentPre) {
                // Don't strip two newlines for wikitext like this:
@@ -837,7 +863,7 @@
                if (!state.inHTMLPre && 
(!DU.allChildrenAreText(node.parentNode) ||
                        doubleNewlineCount !== 1)) {
                        // Strip more than one consecutive newline
-                       res = res.replace(/\n([ \t]*\n)+/g, '\n');
+                       res = res.replace(this.separatorREs.doubleNewlineRE_G, 
'\n');
                }
                // Strip trailing newlines from text content
                // if (node.nextSibling && DU.isElt(node.nextSibling)) {
@@ -848,7 +874,7 @@
 
                // Strip leading newlines and other whitespace
                // They are already added to the separator source in 
handleSeparatorText.
-               res = res.replace(/^[ \t]*\n+\s*/, '');
+               res = res.replace(this.separatorREs.sepPrefixWithNlsRE, '');
        }
 
        // Always escape entities
@@ -895,8 +921,8 @@
        // in handleSeparatorText.
        var res = text.replace(/^\n/, '');
        // Deal with trailing newlines
-       var newSepMatch = res.match(/\n\s*$/);
-       res = res.replace(/\n\s*$/, '');
+       var newSepMatch = res.match(this.separatorREs.sepSuffixWithNlsRE);
+       res = res.replace(this.separatorREs.sepSuffixWithNlsRE, '');
        cb(res, node);
        state.sep.lastSourceNode = node;
        // Move trailing newlines into the next separator
diff --git a/lib/wts.separators.js b/lib/wts.separators.js
index 75fe35d..248a315 100644
--- a/lib/wts.separators.js
+++ b/lib/wts.separators.js
@@ -137,8 +137,9 @@
  * XXX: Support separator-transparent elements!
  */
 var handleSeparatorText = function(node, state) {
+       var separatorREs = state.serializer.separatorREs;
        if (!state.inIndentPre && DU.isText(node)) {
-               if (node.nodeValue.match(/^\s*$/)) {
+               if (node.nodeValue.match(separatorREs.pureSepRE)) {
                        state.sep.src = (state.sep.src || '') + node.nodeValue;
 
                        // Same caveat about onSOL and <li> nodes
@@ -147,7 +148,7 @@
 
                        return true;
                } else {
-                       var match = node.nodeValue.match(/^[ \t]*\n+\s*/);
+                       var match = 
node.nodeValue.match(separatorREs.sepPrefixWithNlsRE);
                        if (match) {
                                state.sep.src = (state.sep.src || '') + 
match[0];
 
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index 5789cc7..c3eb2ac 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -25666,11 +25666,11 @@
 [http://boo.org http://boohoo.org]
 !! end
 
-# Misnested is an indication that selser can reuse the source but these have
-# shown to sneak through on occasion. See T101768.
+# Misnested marks are an indication that selser can reuse the source
+# but these have snuck through on occasion. See T101768.
 # The original wikitext here is: [http://test.com [[one]] two three]
 !! test
-Strip span tags added to mark as misnested
+Strip span tags added to mark misnested links
 !! options
 parsoid=html2wt
 !! html/parsoid
@@ -25679,6 +25679,20 @@
 [http://test.com][[one]] two three
 !! end
 
+# Careful while editing this test. There are \u200f characters on all lines
+# in both wikitext and HTML - they shouldn't be stripped.
+!! test
+RTL (\u200f) and LTR (\u200e) unicode chars should not trip up separator 
handling
+!! options
+parsoid=html2wt
+!! html
+<p>‏<link rel="mw:PageProp/Category" href="./קטגוריה:טקסים" />‏
+‏<link rel="mw:PageProp/Category" href="./קטגוריה:_שיטות_משפט" />‏</p>
+!! wikitext
+‏[[קטגוריה:טקסים]]‏
+‏[[קטגוריה: שיטות משפט]]‏
+!! end
+
 # --------------------------------------------
 # Tests spec'ing wikitext serialization norms |
 # --------------------------------------------

-- 
To view, visit https://gerrit.wikimedia.org/r/244472
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie85669374618da5e0d809411220346943d193436
Gerrit-PatchSet: 7
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: Arlolra <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] T115018: Some tweaks to separator handling to handle unicode... - change (mediawiki...parsoid)

Reply via email to