Subramanya Sastry has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/248942

Change subject: Revert "T115018: Some tweaks to separator handling to handle 
unicode chars."
......................................................................

Revert "T115018: Some tweaks to separator handling to handle unicode chars."

The problem is actually more complicated than the reverted patch accounts for.
For now, reverting the "serialize link on own line" set of patches.

This reverts commit b319c3d76ec3cae4b6a49f82050762b0cd743ba3.

Change-Id: If1aa02844b86449a23e702ba9317825c3ed52b87
---
M lib/mediawiki.WikitextSerializer.js
M lib/wts.separators.js
M tests/parserTests.txt
3 files changed, 12 insertions(+), 53 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid 
refs/changes/42/248942/1

diff --git a/lib/mediawiki.WikitextSerializer.js 
b/lib/mediawiki.WikitextSerializer.js
index 7e72f8e..53ea27d 100644
--- a/lib/mediawiki.WikitextSerializer.js
+++ b/lib/mediawiki.WikitextSerializer.js
@@ -26,7 +26,6 @@
 require('./core-upgrade.js');
 
 var util = require('util');
-var JSUtils = require('./jsutils.js').JSUtils;
 var wtConsts = require('./mediawiki.wikitext.constants.js');
 var Util = require('./mediawiki.Util.js').Util;
 var DU = require('./mediawiki.DOMUtils.js').DOMUtils;
@@ -815,43 +814,18 @@
        }
 };
 
-// Accept unicode spaces (\p{Zs}) and unicode directionality
-// markers ("bidi_control": [\u200E\u200F\u202A-\u202E]).
-// We could accept any invisible non-printable character,
-// but let's not go crazy quite yet.
-// Looks like some pages on hewiki use the \u200f character
-// and we don't want to be tripping over them. Include them
-// in the separator string so that newlines surrounded by these
-// control characters still count towards required
-// inter(-wikitext-)line-separators.
-var unicodeSpacesAndBIDIChars = 
'\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000\u200E\u200F\u202A-\u202E';
-var anySpaceCharRE = JSUtils.rejoin('[\\s', unicodeSpacesAndBIDIChars, ']');
-
-WSP.separatorREs = {
-       pureSepRE: JSUtils.rejoin('^', anySpaceCharRE, '*$'),
-       // We only care about the unicode and BIDI chars before the \n
-       // since we want those newlines to count towards required
-       // inter(-wikitext-)line-separators.
-       sepPrefixWithNlsRE: JSUtils.rejoin('^(?:[ \\t', 
unicodeSpacesAndBIDIChars, ']*\\n)+\\s*'),
-       // SSS FIXME: I am not sure that unicode char business applies here.
-       // sepSuffixWithNlsRE: JSUtils.rejoin(/\n/, anySpaceCharRE, '*$'),
-       sepSuffixWithNlsRE: /\n\s*$/,
-       // SSS FIXME: I am not sure that unicode char business applies here.
-       doubleNewlineRE_G: /\n([ \t]*\n)+/g,
-};
-
 /**
  * Serialize the content of a text node
  */
 WSP._serializeTextNode = function(node, state, cb) {
        // write out a potential separator?
        var res = node.nodeValue;
-       var doubleNewlineMatch = res.match(this.separatorREs.doubleNewlineRE_G);
+       var doubleNewlineMatch = res.match(/\n([ \t]*\n)+/g);
        var doubleNewlineCount = doubleNewlineMatch && 
doubleNewlineMatch.length || 0;
 
        // Deal with trailing separator-like text (at least 1 newline and other 
whitespace)
-       var newSepMatch = res.match(this.separatorREs.sepSuffixWithNlsRE);
-       res = res.replace(this.separatorREs.sepSuffixWithNlsRE, '');
+       var newSepMatch = res.match(/\n\s*$/);
+       res = res.replace(/\n\s*$/, '');
 
        if (!state.inIndentPre) {
                // Don't strip two newlines for wikitext like this:
@@ -863,7 +837,7 @@
                if (!state.inHTMLPre && 
(!DU.allChildrenAreText(node.parentNode) ||
                        doubleNewlineCount !== 1)) {
                        // Strip more than one consecutive newline
-                       res = res.replace(this.separatorREs.doubleNewlineRE_G, 
'\n');
+                       res = res.replace(/\n([ \t]*\n)+/g, '\n');
                }
                // Strip trailing newlines from text content
                // if (node.nextSibling && DU.isElt(node.nextSibling)) {
@@ -874,7 +848,7 @@
 
                // Strip leading newlines and other whitespace
                // They are already added to the separator source in 
handleSeparatorText.
-               res = res.replace(this.separatorREs.sepPrefixWithNlsRE, '');
+               res = res.replace(/^[ \t]*\n+\s*/, '');
        }
 
        // Always escape entities
@@ -921,8 +895,8 @@
        // in handleSeparatorText.
        var res = text.replace(/^\n/, '');
        // Deal with trailing newlines
-       var newSepMatch = res.match(this.separatorREs.sepSuffixWithNlsRE);
-       res = res.replace(this.separatorREs.sepSuffixWithNlsRE, '');
+       var newSepMatch = res.match(/\n\s*$/);
+       res = res.replace(/\n\s*$/, '');
        cb(res, node);
        state.sep.lastSourceNode = node;
        // Move trailing newlines into the next separator
diff --git a/lib/wts.separators.js b/lib/wts.separators.js
index 248a315..75fe35d 100644
--- a/lib/wts.separators.js
+++ b/lib/wts.separators.js
@@ -137,9 +137,8 @@
  * XXX: Support separator-transparent elements!
  */
 var handleSeparatorText = function(node, state) {
-       var separatorREs = state.serializer.separatorREs;
        if (!state.inIndentPre && DU.isText(node)) {
-               if (node.nodeValue.match(separatorREs.pureSepRE)) {
+               if (node.nodeValue.match(/^\s*$/)) {
                        state.sep.src = (state.sep.src || '') + node.nodeValue;
 
                        // Same caveat about onSOL and <li> nodes
@@ -148,7 +147,7 @@
 
                        return true;
                } else {
-                       var match = 
node.nodeValue.match(separatorREs.sepPrefixWithNlsRE);
+                       var match = node.nodeValue.match(/^[ \t]*\n+\s*/);
                        if (match) {
                                state.sep.src = (state.sep.src || '') + 
match[0];
 
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index c3eb2ac..5789cc7 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -25666,31 +25666,17 @@
 [http://boo.org http://boohoo.org]
 !! end
 
-# Misnested marks are an indication that selser can reuse the source
-# but these have snuck through on occasion. See T101768.
+# Misnested is an indication that selser can reuse the source but these have
+# shown to sneak through on occasion. See T101768.
 # The original wikitext here is: [http://test.com [[one]] two three]
 !! test
-Strip span tags added to mark misnested links
+Strip span tags added to mark as misnested
 !! options
 parsoid=html2wt
 !! html/parsoid
 <p data-parsoid='{}'><a rel="mw:ExtLink" href="http://test.com" 
data-parsoid='{"targetOff":17,"contentOffsets":[17,34]}'></a><a 
rel="mw:WikiLink" href="./One" title="One" 
data-parsoid='{"stx":"simple","a":{"href":"./One"},"sa":{"href":"one"},"misnested":true}'>one</a><span
 data-parsoid='{"misnested":true}'> two three</span></p>
 !! wikitext
 [http://test.com][[one]] two three
-!! end
-
-# Careful while editing this test. There are \u200f characters on all lines
-# in both wikitext and HTML - they shouldn't be stripped.
-!! test
-RTL (\u200f) and LTR (\u200e) unicode chars should not trip up separator 
handling
-!! options
-parsoid=html2wt
-!! html
-<p>‏<link rel="mw:PageProp/Category" href="./קטגוריה:טקסים" />‏
-‏<link rel="mw:PageProp/Category" href="./קטגוריה:_שיטות_משפט" />‏</p>
-!! wikitext
-‏[[קטגוריה:טקסים]]‏
-‏[[קטגוריה: שיטות משפט]]‏
 !! end
 
 # --------------------------------------------

-- 
To view, visit https://gerrit.wikimedia.org/r/248942
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If1aa02844b86449a23e702ba9317825c3ed52b87
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to