jenkins-bot has submitted this change and it was merged.

Change subject: Normalize DOM by stripping \u200e, \u200f next to category links
......................................................................


Normalize DOM by stripping \u200e, \u200f next to category links

* The stripping is done around category links when the
  scrubWikitext API flag is enabled.

  Without the stripping,
    "\u200f<link>\u200f<link>\u200f foo"
  serializes to:
    "\u200f\n<link>\n\u200f\n<link>\n\u200f foo"

  The first \u200f line is useless, since the \u200f is terminated
  by the \n. The second \u200f appears as an anomalous "extra" newline
  (this is what's most important to fix). The third \u200f should
  be kept, since it affects the rendering of 'foo'.

* In this version of the patch, the stripping happens everywhere
  in the document, not just around modified content.

* For now, since we are unsure if we want to enable this, I am hiding
  this behind a parsoidConfig.scrubBidiChars flag which we can turn on
  by making config changes.

  This is turned on during rt-testing and parser tests so we have
  a sense of how pages will be affected.

Change-Id: I9eaf81d6e27429d4a1d9f7a2465f31259b0ae0e3
---
M bin/parserTests.js
M lib/config/ParsoidConfig.js
M lib/html2wt/normalizeDOM.js
M tests/parserTests.txt
M tests/rttest.localsettings.js
5 files changed, 98 insertions(+), 5 deletions(-)

Approvals:
  Cscott: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/bin/parserTests.js b/bin/parserTests.js
index 0adf139..e692d5a 100755
--- a/bin/parserTests.js
+++ b/bin/parserTests.js
@@ -1661,6 +1661,9 @@
                parsoidConfig.loadWMF = false;
                parsoidConfig.initMwApiMap();
 
+               // Needed for bidi-char-scrubbing html2wt tests.
+               parsoidConfig.scrubBidiChars = true;
+
                // Send all requests to the mock API server.
                parsoidConfig.mwApiMap.forEach(function(apiConf) {
                        parsoidConfig.setMwApi({
diff --git a/lib/config/ParsoidConfig.js b/lib/config/ParsoidConfig.js
index 4a5182a..a863227 100644
--- a/lib/config/ParsoidConfig.js
+++ b/lib/config/ParsoidConfig.js
@@ -373,6 +373,12 @@
 ParsoidConfig.prototype.suppressMwApiWarnings = /modulemessages is deprecated/;
 
 /**
+ * If enabled, bidi chars adjacent to category links will be stripped
+ * in the html -> wt serialization pass.
+ */
+ParsoidConfig.prototype.scrubBidiChars = false;
+
+/**
  * @property {number} How often should we emit a heap sample? Time in ms.
  *
  * Only relevant if performance timing is enabled
diff --git a/lib/html2wt/normalizeDOM.js b/lib/html2wt/normalizeDOM.js
index 9caeaaa..f7db151 100644
--- a/lib/html2wt/normalizeDOM.js
+++ b/lib/html2wt/normalizeDOM.js
@@ -246,20 +246,68 @@
        }
 };
 
+Normalizer.prototype.stripBidiCharsAroundCategories = function(node) {
+       if (!DU.isText(node) ||
+               (!DU.isCategoryLink(node.previousSibling) && 
!DU.isCategoryLink(node.nextSibling))) {
+               // Not a text node and not adjacent to a category link
+               return node;
+       }
+
+       var next = node.nextSibling;
+       if (!next || DU.isCategoryLink(next)) {
+               // The following can leave behind an empty text node.
+               var oldLength = node.nodeValue.length;
+               node.nodeValue = 
node.nodeValue.replace(/([\u200e\u200f]+\n)?[\u200e\u200f]+$/g, '');
+               var newLength = node.nodeValue.length;
+
+               if (oldLength !== newLength) {
+                       // Log changes for editors benefit
+                       this.env.log('warning/html2wt/bidi',
+                               'LRM/RLM unicode chars stripped around 
categories');
+               }
+
+               if (newLength === 0) {
+                       // Remove empty text nodes to keep DOM in normalized 
form
+                       var ret = DU.nextNonDeletedSibling(node);
+                       node.parentNode.removeChild(node);
+                       this.addDiffMarks(node, 'deleted');
+                       return ret;
+               }
+
+               // Treat modified node as having been newly inserted
+               this.addDiffMarks(node, 'inserted');
+               this.addDiffMarks(node.parentNode, 'children-changed');
+       }
+       return node;
+};
+
 /**
- * Normalizations implemented right now:
- * -------------------------------------
+ * scrubWikitext normalizations implemented right now:
+ * ---------------------------------------------------
  * 1. Tag minimization (I/B tags) in normalizeSiblingPair
  * 2. Strip empty headings and style tags
  * 3. Force SOL transparent links to serialize before/after heading
  * 4. Trailing spaces are migrated out of links
  * 5. Space is added before escapable prefixes in table cells
  * 6. Strip <br/> from headings
+ * 7. Strip bidi chars around categories
  */
 Normalizer.prototype.normalizeNode = function(node) {
-       // Only if scrubWikitext flag is enabled
+       // The following are done only if scrubWikitext flag is enabled
        if (!this.env.scrubWikitext) {
                return node;
+       }
+
+       var next;
+
+       if (this.env.conf.parsoid.scrubBidiChars) {
+               // Strip bidirectional chars around categories
+               // Note that this is being done everywhere,
+               // not just in selser mode
+               next = this.stripBidiCharsAroundCategories(node);
+               if (next !== node) {
+                       return next;
+               }
        }
 
        // Skip unmodified content
@@ -270,8 +318,6 @@
                DU.origSrcValidInEditedContext(this.env, node)) {
                return node;
        }
-
-       var next;
 
        // Headings
        if (/^H[1-6]$/.test(node.nodeName)) {
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index 2581218..683092b 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -25784,6 +25784,41 @@
 <p>foo <span about="#mwt1" typeof="mw:Transclusion" 
data-mw='{"parts":[{"template":{"target":{"wt":"echo","href":"./Template:Echo"},"params":{"1":{"wt":"&lt;span>bar&lt;/span>
 [[Category:baz]]"}},"i":0}}]}'>bar</span><span about="#mwt1"> </span><link 
rel="mw:PageProp/Category" href="./Category:Baz" about="#mwt1" 
data-parsoid='{"stx":"simple","a":{"href":"./Category:Baz"},"sa":{"href":"Category:baz"}}'/>
 bar</p>
 !! end
 
+# Careful while editing these next 2 tests. There are \u200f characters
+# before and after the <link> tags in the HTML and following some
+# of the categories in wikitext
+# Do not remove these characters in edits.
+#
+# As part of the serialization, these bidi characters will get stripped.
+!! test
+RTL (\u200f) and LTR (\u200e) markers around category tags should be stripped
+!! options
+parsoid={
+  "modes": ["html2wt"],
+  "scrubWikitext": true
+}
+!! html/parsoid
+<p>‏<link rel="mw:PageProp/Category" href="./קטגוריה:טקסים" />‏
+‏<link rel="mw:PageProp/Category" href="./קטגוריה:_שיטות_משפט" />‏</p>
+!! wikitext
+[[קטגוריה:טקסים]]
+[[קטגוריה: שיטות משפט]]
+!! end
+
+!! test
+RTL (\u200f) and LTR (\u200e) markers should not be stripped if followed by a text node
+!! options
+parsoid={
+  "modes": ["html2wt"],
+  "scrubWikitext": true
+}
+!! html/parsoid
+<p>‏<link rel="mw:PageProp/Category" href="./קטגוריה:טקסים" />‏y</p>
+!! wikitext
+[[קטגוריה:טקסים]]
+‏y
+!! end
+
 !! test
 Lists: Add space after bullets
 !! options
diff --git a/tests/rttest.localsettings.js b/tests/rttest.localsettings.js
index bc7f248..dab319f 100644
--- a/tests/rttest.localsettings.js
+++ b/tests/rttest.localsettings.js
@@ -65,6 +65,9 @@
        // Set rtTestMode to true for round-trip testing
        parsoidConfig.rtTestMode = true;
 
+       // Enable bidi char stripping during serialization
+       parsoidConfig.scrubBidiChars = true;
+
        // Set to true to enable Performance timing
        parsoidConfig.useDefaultPerformanceTimer = false;
        // Peformance timing options for testing

-- 
To view, visit https://gerrit.wikimedia.org/r/259430
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I9eaf81d6e27429d4a1d9f7a2465f31259b0ae0e3
Gerrit-PatchSet: 10
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: Arlolra <[email protected]>
Gerrit-Reviewer: Cscott <[email protected]>
Gerrit-Reviewer: Subramanya Sastry <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to