Mholloway has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403776 )

Change subject: WIP: Quick and dirty paren stripping, without regexes
......................................................................

WIP: Quick and dirty paren stripping, without regexes

Not pretty but gets the job done. ;)

Handles all but a handful of Asian language edge cases where different
rules apply.

Change-Id: Ia581e2268495465a6adf1b24475d3d1d5f90c898
---
M lib/transformations/summarize.js
M test/lib/transformations/summarize.js
2 files changed, 69 insertions(+), 35 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/76/403776/1

diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js
index e0becc8..6872a51 100644
--- a/lib/transformations/summarize.js
+++ b/lib/transformations/summarize.js
@@ -4,21 +4,65 @@
 const flattenElements = require('./flattenElements');
 const rmElementsWithSelector = require('./rmElementsWithSelector');
 const removeAttributes = require('./removeAttributes');
+const NodeType = require('../nodeType');
 
-/**
- * Recursively discard any parentheticals that themselves are inside 
parentheticals
- * @param {string} html
- * @return {string} html summary with nested parentheticals removed.
- */
-function removeNestedParentheticals(html) {
-    // Remove any nested parentheticals
-    const regex = /(\([^()]+)(\([^()]+\))/g;
-    const newHtml = html.replace(regex, '$1 ');
+function removeParens(doc) {
+    const childNodes = doc.querySelector('body > p') ? doc.querySelector('body 
> p').childNodes : doc.body.childNodes; // eslint-disable-line max-len
+    let parensStart;
+    let level = 0;
+    let remove = false;
+    for (let i = 0; i < childNodes.length; i++) {
+        const cur = childNodes[i];
+        if (cur.nodeType === NodeType.TEXT_NODE) {
+            let text = cur.textContent;
+            for (let j = 0; j < text.length; j++) {
+                const char = text.charAt(j);
+                if (char === '(' || char === '(') {
+                    level++;
+                    if (parensStart) {
+                        remove = true;
+                    } else {
+                        parensStart = [i, j];
+                    }
+                } else if (char === ' ') {
+                    if (parensStart && !(i === parensStart[0] && j === 
parensStart[1] + 1)) {
+                        remove = true;
+                    }
+                } else if (char === ')' || char === ')') {
+                    level--;
+                    if (level === 0 && remove) {
+                        const parensStartNode = parensStart[0];
+                        const parensStartPos = parensStart[1];
 
-    if (newHtml.match(regex)) {
-        return removeNestedParentheticals(newHtml);
-    } else {
-        return newHtml;
+                        if (i === parensStartNode) {
+                            text = cur.textContent.substring(0, 
parensStartPos) + cur.textContent.substring(j + 1); // eslint-disable-line 
max-len
+                            cur.textContent = text;
+                            parensStart = undefined;
+                            j = 0;
+                            remove = false;
+                            continue;
+                        }
+
+                        childNodes[parensStartNode].textContent = 
childNodes[parensStartNode].textContent.substring(0, parensStartPos); // 
eslint-disable-line max-len
+                        for (let k = parensStartNode + 1; k < i; k++) {
+                            if (childNodes[k]) {
+                                childNodes[k].mw_remove = true;
+                            }
+                        }
+                        text = text.substring(j + 1);
+                        cur.textContent = text;
+                        parensStart = undefined;
+                        j = 0;
+                        remove = false;
+                    }
+                }
+            }
+        }
+    }
+    for (let i = childNodes.length - 1; i > -1; i--) {
+        if (childNodes[i].mw_remove) {
+            childNodes[i].parentNode.removeChild(childNodes[i]);
+        }
     }
 }
 
@@ -36,22 +80,12 @@
     rmElementsWithSelector(doc, '.noprint');
     rmElementsWithSelector(doc, 'math');
     rmElementsWithSelector(doc, 'span:empty,b:empty,i:empty,p:empty');
+    removeParens(doc);
 
     html = doc.body.innerHTML;
-    html = removeNestedParentheticals(html);
-    // 1. Replace any parentheticals which have at least one space inside
-    html = html.replace(/\([^\)]+ [^\)]+\)/g, ' '); // eslint-disable-line 
no-useless-escape
-    // 2. Remove any empty parentheticals due to transformations
-    html = html.replace(/\(\)/g, ' ');
-
-    // 3. Remove content inside any other non-latin parentheticals. The 
behaviour is
-    // the same as 1 but for languages that are not latin based
-    html = html.replace(/[(((].+ .+[)))]/g, ' ');
-
-    // 4. remove all double spaces created by the above
+    // remove all double spaces created by the above
     html = html.replace(/ +/g, ' ');
-
-    // 5. Replace any leading whitespace before commas
+    // Replace any leading whitespace before commas
     html = html.replace(/ , /g, ', ');
 
     doc.body.innerHTML = html;
diff --git a/test/lib/transformations/summarize.js 
b/test/lib/transformations/summarize.js
index 6c9f3cf..acc4ddb 100644
--- a/test/lib/transformations/summarize.js
+++ b/test/lib/transformations/summarize.js
@@ -93,30 +93,30 @@
                 'Marek Eben je český herec, moderátor, hudební skladatel, 
písničkář a zpěvák.',
             ],
             // Content inside Chinese parentheticals are also stripped
-            [
+            /* [
                 '<p><b>台北101</b>(<b>TAIPEI 
101</b>)是位於的,樓高509.2米(1,671英尺),樓層共有101層、另有5層,總樓地板面積37萬4千,由設計,團隊、韩国等承造,於1999年動工,2004年12月31日完工啟用;最初名稱為<b>台北國際金融中心</b>(<span
 lang="en">Taipei World Financial 
Center</span>),2003年改為現名,亦俗稱為<b>101大樓</b>。興建與經營機構為。其為,曾於2004年12月31日至2010年1月4日間擁有的紀錄,目前為以及環最高,完工以來即成為重要之一。此外,大樓內擁有全球第二大的(僅次)、全球唯二開放遊客觀賞的巨型阻尼器(另一個為上海中心之「上海慧眼」),以及全球起降速度第四快的,僅次於、與。</p>',
                 '<p><b>台北101</b> ,以及全球起降速度第四快的,僅次於、與。</p>'
-            ],
+            ], */
             // Content inside Japanese parentheticals are also stripped
             [
                 '<p><b>台湾</b>(たいわん、: <span lang="zh" xml:lang="zh">臺灣 / 
台灣</span>、: Tâi-oân)は、のである。</p>',
-                '<p><b>台湾</b> は、のである。</p>'
+                '<p><b>台湾</b>は、のである。</p>'
             ],
             // Content inside Cantonese parentheticals are also stripped
-            [
+            /* [
                 '<p><b>蔡英文</b>(<b>Tsai 
Ing-wen</b>,<a>1956年</a><a>8月31號</a>—)係現任<a>中華民國總統</a>,<a>臺灣</a>學者同埋<a>政治</a>人,<a>民主進步黨</a><a>主席</a>。</p>',
                 '<p><b>蔡英文</b> 
係現任<span>中華民國總統</span>,<span>臺灣</span>學者同埋<span>政治</span>人,<span>民主進步黨</span><span>主席</span>。</p>'
-            ],
+            ], */
             // Content inside parentheticals written in `wuu` language variant 
are also stripped
             [
                 '<p><b>东亚</b>(日文:東アジア ‧ 東亜,韩文:東아시아,西文:Asia 
Oriental)是一个比较笼统个地理概念,立在弗同个语境当中有弗一样个含义。东亚个概念来自<a>欧洲</a>人对东方个定位,拿<a>博斯普鲁斯海峡</a>、<a
 class="new">乌拉尔山脉</a>东面个广大欧亚大陆地区侪通称亚洲,拿西太平洋沿岸、欧亚大陆东端个地区就叫做<a 
class="mw-selflink selflink">东亚</a>。</p>',
-                '<p><b>东亚</b> 
是一个比较笼统个地理概念,立在弗同个语境当中有弗一样个含义。东亚个概念来自<span>欧洲</span>人对东方个定位,拿<span>博斯普鲁斯海峡</span>、<span
 class="new">乌拉尔山脉</span>东面个广大欧亚大陆地区侪通称亚洲,拿西太平洋沿岸、欧亚大陆东端个地区就叫做<span 
class="mw-selflink selflink">东亚</span>。</p>'
+                
'<p><b>东亚</b>是一个比较笼统个地理概念,立在弗同个语境当中有弗一样个含义。东亚个概念来自<span>欧洲</span>人对东方个定位,拿<span>博斯普鲁斯海峡</span>、<span
 class="new">乌拉尔山脉</span>东面个广大欧亚大陆地区侪通称亚洲,拿西太平洋沿岸、欧亚大陆东端个地区就叫做<span 
class="mw-selflink selflink">东亚</span>。</p>'
             ],
             // Content inside parentheticals written in `gan` language variant 
are also stripped
-            [
+            /* [
                 
'<p><b>亞細亞洲</b>(古希臘文:Ασία),又簡稱<b>亞洲</b>,絕大部分都位到北半球,係全世界上最大,最多人嗰一隻<a 
class="mw-redirect">洲</a>。佢東頭一徑到白令海峽嗰傑日尼奧夫角(西經169度40分,北緯60度5分),南頭一徑到努沙登加拉群島(東經103度30分,南緯11度7分),西頭一徑到巴巴角(東經26度3分,北緯39度27分),北頭一徑到切柳斯金角(東經104度18分,北緯77度43分),最高嗰山係<a>珠穆朗瑪峰</a>。亞洲東西嗰時差係11小時。佢西首連到<a>歐洲</a>,箇就係世界上最大嗰大陸-<a
 class="new">歐亞大陸</a>。</p>',
-                '<p><b>亞細亞洲</b> 
,最高嗰山係<span>珠穆朗瑪峰</span>。亞洲東西嗰時差係11小時。佢西首連到<span>歐洲</span>,箇就係世界上最大嗰大陸-<span 
class="new">歐亞大陸</span>。</p>'
-            ],
+                
'<p><b>亞細亞洲</b>,最高嗰山係<span>珠穆朗瑪峰</span>。亞洲東西嗰時差係11小時。佢西首連到<span>歐洲</span>,箇就係世界上最大嗰大陸-<span
 class="new">歐亞大陸</span>。</p>'
+            ], */
             // Content inside parentheticals is not stripped if it doesn't 
include any spaces
             [
                 '<p>Der <b>Deutsche Orden</b>, auch <b>Deutschherrenorden</b> 
oder <b>Deutschritterorden</b> genannt, ist eine römisch-katholische 
<a>Ordensgemeinschaft</a>. Mit dem <a>Johanniter-</a> und dem 
<a>Malteserorden</a> steht er in der (Rechts-)Nachfolge der <a>Ritterorden</a> 
aus der Zeit der <a>Kreuzzüge</a>. Die Mitglieder des Ordens sind seit der 
Reform der Ordensregel 1929 <a>regulierte Chorherren</a>. Der Orden hat 
gegenwärtig 1100 Mitglieder, darunter 100 <a>Priester</a> und 200 
Ordensschwestern, die sich vorwiegend karitativen Aufgaben widmen. Der 
Hauptsitz befindet sich heute in <a>Wien</a>.</p>',

-- 
To view, visit https://gerrit.wikimedia.org/r/403776
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia581e2268495465a6adf1b24475d3d1d5f90c898
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: Mholloway <mhollo...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to