[MediaWiki-commits] [Gerrit] mediawiki...mobileapps[master]: WIP: Remove parentheses using state machine instead of regex

BearND (Code Review) Thu, 11 Jan 2018 11:13:07 -0800

BearND has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/403725 )


Change subject: WIP: Remove parentheses using state machine instead of regex
......................................................................

WIP: Remove parentheses using state machine instead of regex

Do not merge!

Bug: T184557
Change-Id: Ia135df4eb6fbb26064eb95764d1529f294ea4dbe
---
M lib/transformations/summarize.js
M test/lib/transformations/summarize.js
2 files changed, 245 insertions(+), 25 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps 
refs/changes/25/403725/1

diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js
index 06c777b..1072f0f 100644
--- a/lib/transformations/summarize.js
+++ b/lib/transformations/summarize.js
@@ -34,31 +34,229 @@
     }
 }
 
+// Non-ASCII examples of parentheses are taken from
+// https://www.mediawiki.org/wiki/Topic:Tn0k9fjn0g8e9fpe.
+// They were all the same in the Asian languages.
+const TOKEN = {
+    Open: { ASCII:  '(', NON_ASCII: '（' }, // %28, %uFF08
+    Close: { ASCII:  ')', NON_ASCII: '）' }, // %29, %uFF09
+    Space: { NORMAL:  ' ' }
+};
+
+class Range {
+    constructor(startNode, startNodeIndex, startCharacterIndex) {
+        this._startNode = startNode;
+        this._startNodeIndex = startNodeIndex;
+        this._startCharacterIndex = startCharacterIndex;
+    }
+
+    endRange(endNode, endNodeIndex, endCharacterIndex) {
+        this._endNode = endNode;
+        this._endNodeIndex = endNodeIndex;
+        this._endCharacterIndex = endCharacterIndex;
+    }
+
+    getStartNode() {
+        return this._startNode;
+    }
+
+    getStartNodeIndex() {
+        return this._startNodeIndex;
+    }
+
+    getStartCharacterIndex() {
+        return this._startCharacterIndex;
+    }
+
+    getEndNode() {
+        return this._endNode;
+    }
+
+    getEndNodeIndex() {
+        return this._endNodeIndex;
+    }
+
+    getEndIndex() {
+        return this._endCharacterIndex;
+    }
+
+    startsBefore(nextRange) {
+        return this._startNodeIndex < nextRange._startNodeIndex
+            || (
+                this._startNodeIndex === nextRange._startNodeIndex
+                && this._startCharacterIndex < nextRange._startCharacterIndex
+            );
+    }
+
+    endsBefore(nextRange) {
+        return this._endNodeIndex < nextRange._endNodeIndex
+            || (
+                this._endNodeIndex === nextRange._endNodeIndex
+                && this._endCharacterIndex < nextRange._endCharacterIndex
+            );
+    }
+
+    setSpace() {
+        this._hasSpace = true;
+    }
+
+    hasSpaces() {
+        return this._hasSpace;
+    }
+
+    debug() {
+        if (this._startNode === this._endNode) {
+            const text = this._startNode.textContent;
+            return `${text.slice(this._startCharacterIndex, 
this._endCharacterIndex + 1)}`;
+        } else {
+            const startText = 
this._startNode.textContent.slice(this._startCharacterIndex);
+            const endText = this._endNode.textContent.slice(0, 
this._endCharacterIndex + 1);
+            return `'${startText}' to '${endText}'`;
+        }
+    }
+}
+
+function markParentheses(node, state) {
+    const text = node.textContent;
+    for (let i = 0; i < text.length; i++) {
+        const char = text[i];
+        if (char === TOKEN.Open.ASCII || char === TOKEN.Open.NON_ASCII) {
+            const range = new Range(node, node._nid, i);
+            state.ranges.push(range);
+            state.stack.push(range);
+        } else if (char === TOKEN.Close.ASCII || char === 
TOKEN.Close.NON_ASCII) {
+            const range = state.stack[state.stack.length - 1];
+            if (range) {
+                range.endRange(node, node._nid, i);
+                state.stack.pop();
+            }
+        } else if (char === TOKEN.Space.NORMAL) {
+            // TODO: do for all in stack???
+            const range = state.stack[state.stack.length - 1];
+            if (range && !range.hasSpaces()) {
+                range.setSpace();
+            }
+        }
+    }
+}
+
+// '(d e)'
+function discardIneligibleRanges(ranges) {
+    for (let i = 0; i < ranges.length; i++) {
+        const currentRange = ranges[i];
+        if (!currentRange.hasSpaces()) {
+            ranges.splice(i, 1); // remove current range
+        }
+    }
+}
+
+// 'a(b'
+function discardUnclosedRanges(ranges) {
+    for (let i = 0; i < ranges.length; i++) {
+        const currentRange = ranges[i];
+        if (!currentRange.getEndNode()) {
+            ranges.splice(i, 1); // remove current range
+        }
+    }
+}
+
+// 'a(b (d e) f)g'
+function discardNestedRanges(ranges) {
+    for (let i = 0; i < ranges.length - 1; i++) {
+        const currentRange = ranges[i];
+        const nextRange = ranges[i + 1];
+        if (currentRange.startsBefore(nextRange)
+                && nextRange.endsBefore(currentRange)
+                && currentRange.hasSpaces()) {
+            // next range is nested inside current eligible range
+            ranges.splice(i + 1, 1); // remove next range
+        }
+    }
+}
+
+function removeParenthesesInTextNode(node, state) {
+    let range = state.ranges[0];
+    let offset = 0;
+    while (range) {
+        if (state.deleting && node !== range.getEndNode()) {
+            node.remove();
+        }
+
+        if (node === range.getStartNode() && node === range.getEndNode()) {
+            // start and end in this text node
+            const originalText = node.textContent;
+            const startText = originalText.slice(0, 
range.getStartCharacterIndex() - offset);
+            const endText = originalText.slice(range.getEndIndex() + 1 - 
offset);
+            node.textContent = `${startText} ${endText}`;
+            offset = range.getEndIndex() - range.getStartCharacterIndex();
+            state.ranges.shift();
+            range = state.ranges[0];
+        } else if (node === range.getStartNode()) {
+            // just start
+            state.deleting = true;
+            const originalText = node.textContent;
+            const startText = originalText.slice(0, 
range.getStartCharacterIndex() - offset);
+            node.textContent = startText;
+            break;
+        } else if (node === range.getEndNode()) {
+            // just end
+            state.deleting = false;
+            const originalText = node.textContent;
+            const endText = originalText.slice(range.getEndIndex() + 1 - 
offset);
+            node.textContent = endText;
+            offset = range.getEndIndex();
+            state.ranges.shift();
+            range = state.ranges[0];
+        } else {
+            break;
+        }
+    }
+}
+
 /**
  * Visits one DOM node. Do the stuff that needs to be done when a single DOM 
node is handled.
  * In this case, remove DOM nodes and some attributes we don't want to keep.
  * @param {!Node} node the node to visit
+ * @param {!Object} state some state to pass around to the visitor
  */
-function visitor(node) {
+function visitorPass1(node, state) {
     if (node.nodeType === NodeType.ELEMENT_NODE) {
         rmDisallowedElements(node);
         if (node.tagName !== 'IMG') {
             rmUnwantedAttributes(node);
         }
-    } else if (node.nodeType !== NodeType.TEXT_NODE) {
+    } else if (node.nodeType === NodeType.TEXT_NODE) {
+        markParentheses(node, state);
+    } else {
         node.remove();
+    }
+}
+
+/**
+ * Visits one DOM node. Do the stuff that needs to be done when a single DOM 
node is handled.
+ * In this case, remove DOM nodes and some attributes we don't want to keep.
+ * @param {!Node} node the node to visit
+ * @param {!Object} state some state to pass around to the visitor
+ */
+function visitorPass2(node, state) {
+    if (node.nodeType === NodeType.ELEMENT_NODE && state.deleting) {
+        node.remove();
+    } else if (node.nodeType === NodeType.TEXT_NODE) {
+        removeParenthesesInTextNode(node, state);
     }
 }
 
 /**
  * Traverses DOM tree iteratively (depth first).
  * @param {!Element} rootElement the root of the DOM tree which needs to be 
traversed
+ * @param {!Function} visitor called whenever a node gets encountered
+ * @param {!Object} state some state to pass around to the visitor
  */
-function traverseDF(rootElement) {
+function traverseDF(rootElement, visitor, state) {
     let nodesToVisit = [ rootElement ];
     while (nodesToVisit.length > 0) {
         const currentNode = nodesToVisit.shift();
-        visitor(currentNode);
+        visitor(currentNode, state);
         nodesToVisit = [
             ...(currentNode.childNodes || []), // depth first
             ...nodesToVisit,
@@ -71,25 +269,32 @@
  * @param {!Document} document the DOM document
  */
 function removeUnwantedNodes(document) {
-    traverseDF(document.body);
-}
-
-/**
- * Recursively discard any parentheticals that themselves are inside 
parentheticals
- * @param {string} html
- * @return {string} html summary with nested parentheticals removed.
- */
-function removeNestedParentheticals(html) {
-    // Remove any nested parentheticals
-    const regex = /(\([^()]+)(\([^()]+\))/g;
-    const newHtml = html.replace(regex, '$1 ');
-
-    if (newHtml.match(regex)) {
-        return removeNestedParentheticals(newHtml);
-    } else {
-        return newHtml;
+    const state = { ranges: [], stack: [] };
+    traverseDF(document.body, visitorPass1, state);
+    if (state.ranges.length > 0) { // found parentheses in previous pass
+        discardIneligibleRanges(state.ranges);
+        discardUnclosedRanges(state.ranges);
+        discardNestedRanges(state.ranges);
+        traverseDF(document.body, visitorPass2, state);
     }
 }
+
+// /**
+//  * Recursively discard any parentheticals that themselves are inside 
parentheticals
+//  * @param {string} html
+//  * @return {string} html summary with nested parentheticals removed.
+//  */
+// function removeNestedParentheticals(html) {
+//     // Remove any nested parentheticals
+//     const regex = /(\([^()]+)(\([^()]+\))/g;
+//     const newHtml = html.replace(regex, '$1 ');
+//
+//     if (newHtml.match(regex)) {
+//         return removeNestedParentheticals(newHtml);
+//     } else {
+//         return newHtml;
+//     }
+// }
 
 /**
  * Given a chunk of HTML, create a summary of the content
@@ -108,15 +313,15 @@
     removeUnwantedNodes(doc);
 
     html = doc.body.innerHTML;
-    html = removeNestedParentheticals(html);
+    // html = removeNestedParentheticals(html);
     // 1. Replace any parentheticals which have at least one space inside
-    html = html.replace(/\([^\)]+ [^\)]+\)/g, ' '); // eslint-disable-line 
no-useless-escape
+    // html = html.replace(/\([^\)]+ [^\)]+\)/g, ' '); // eslint-disable-line 
no-useless-escape
     // 2. Remove any empty parentheticals due to transformations
-    html = html.replace(/\(\)/g, ' ');
+    // html = html.replace(/\(\)/g, ' ');
 
     // 3. Remove content inside any other non-latin parentheticals. The 
behaviour is
     // the same as 1 but for languages that are not latin based
-    html = html.replace(/[（（（].+ .+[）））]/g, ' ');
+    // html = html.replace(/[（（（].+ .+[）））]/g, ' ');
 
     // 4. remove all double spaces created by the above
     html = html.replace(/ +/g, ' ');
diff --git a/test/lib/transformations/summarize.js 
b/test/lib/transformations/summarize.js
index 5658cb9..3eb0f95 100644
--- a/test/lib/transformations/summarize.js
+++ b/test/lib/transformations/summarize.js
@@ -8,6 +8,21 @@
 describe('summarize', () => {
     it('matches the spec', () => {
         const testCases = [
+            // Should remove nested parens
+            [
+                '1a(b (d e) f)g',
+                '1a g'
+            ],
+            // Should remove sequential parens
+            [
+                '2a(b c)d(e f)g',
+                '2a d g'
+            ],
+            // Should remove sequential parens except open one at the end
+            [
+                '3a(b c)d(e f)g(',
+                '3a d g('
+            ],
             // Should remove unwanted elements
             [
                 '<style>f</style><object>o</object>o',

-- 
To view, visit https://gerrit.wikimedia.org/r/403725
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia135df4eb6fbb26064eb95764d1529f294ea4dbe
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: BearND <bsitzm...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

[MediaWiki-commits] [Gerrit] mediawiki...mobileapps[master]: WIP: Remove parentheses using state machine instead of regex

Reply via email to