Mholloway has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403776 )
Change subject: WIP: Quick and dirty paren stripping, without regexes ...................................................................... WIP: Quick and dirty paren stripping, without regexes Not pretty but gets the job done. ;) Handles all but a handful of Asian language edge cases where different rules apply. Change-Id: Ia581e2268495465a6adf1b24475d3d1d5f90c898 --- M lib/transformations/summarize.js M test/lib/transformations/summarize.js 2 files changed, 69 insertions(+), 35 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/76/403776/1 diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js index e0becc8..6872a51 100644 --- a/lib/transformations/summarize.js +++ b/lib/transformations/summarize.js @@ -4,21 +4,65 @@ const flattenElements = require('./flattenElements'); const rmElementsWithSelector = require('./rmElementsWithSelector'); const removeAttributes = require('./removeAttributes'); +const NodeType = require('../nodeType'); -/** - * Recursively discard any parentheticals that themselves are inside parentheticals - * @param {string} html - * @return {string} html summary with nested parentheticals removed. - */ -function removeNestedParentheticals(html) { - // Remove any nested parentheticals - const regex = /(\([^()]+)(\([^()]+\))/g; - const newHtml = html.replace(regex, '$1 '); +function removeParens(doc) { + const childNodes = doc.querySelector('body > p') ? 
doc.querySelector('body > p').childNodes : doc.body.childNodes; // eslint-disable-line max-len + let parensStart; + let level = 0; + let remove = false; + for (let i = 0; i < childNodes.length; i++) { + const cur = childNodes[i]; + if (cur.nodeType === NodeType.TEXT_NODE) { + let text = cur.textContent; + for (let j = 0; j < text.length; j++) { + const char = text.charAt(j); + if (char === '(' || char === '(') { + level++; + if (parensStart) { + remove = true; + } else { + parensStart = [i, j]; + } + } else if (char === ' ') { + if (parensStart && !(i === parensStart[0] && j === parensStart[1] + 1)) { + remove = true; + } + } else if (char === ')' || char === ')') { + level--; + if (level === 0 && remove) { + const parensStartNode = parensStart[0]; + const parensStartPos = parensStart[1]; - if (newHtml.match(regex)) { - return removeNestedParentheticals(newHtml); - } else { - return newHtml; + if (i === parensStartNode) { + text = cur.textContent.substring(0, parensStartPos) + cur.textContent.substring(j + 1); // eslint-disable-line max-len + cur.textContent = text; + parensStart = undefined; + j = 0; + remove = false; + continue; + } + + childNodes[parensStartNode].textContent = childNodes[parensStartNode].textContent.substring(0, parensStartPos); // eslint-disable-line max-len + for (let k = parensStartNode + 1; k < i; k++) { + if (childNodes[k]) { + childNodes[k].mw_remove = true; + } + } + text = text.substring(j + 1); + cur.textContent = text; + parensStart = undefined; + j = 0; + remove = false; + } + } + } + } + } + for (let i = childNodes.length - 1; i > -1; i--) { + if (childNodes[i].mw_remove) { + childNodes[i].parentNode.removeChild(childNodes[i]); + } } } @@ -36,22 +80,12 @@ rmElementsWithSelector(doc, '.noprint'); rmElementsWithSelector(doc, 'math'); rmElementsWithSelector(doc, 'span:empty,b:empty,i:empty,p:empty'); + removeParens(doc); html = doc.body.innerHTML; - html = removeNestedParentheticals(html); - // 1. 
Replace any parentheticals which have at least one space inside - html = html.replace(/\([^\)]+ [^\)]+\)/g, ' '); // eslint-disable-line no-useless-escape - // 2. Remove any empty parentheticals due to transformations - html = html.replace(/\(\)/g, ' '); - - // 3. Remove content inside any other non-latin parentheticals. The behaviour is - // the same as 1 but for languages that are not latin based - html = html.replace(/[(((].+ .+[)))]/g, ' '); - - // 4. remove all double spaces created by the above + // remove all double spaces created by the above html = html.replace(/ +/g, ' '); - - // 5. Replace any leading whitespace before commas + // Replace any leading whitespace before commas html = html.replace(/ , /g, ', '); doc.body.innerHTML = html; diff --git a/test/lib/transformations/summarize.js b/test/lib/transformations/summarize.js index 6c9f3cf..acc4ddb 100644 --- a/test/lib/transformations/summarize.js +++ b/test/lib/transformations/summarize.js @@ -93,30 +93,30 @@ 'Marek Eben je český herec, moderátor, hudební skladatel, písničkář a zpěvák.', ], // Content inside Chinese parentheticals are also stripped - [ + /* [ '<p><b>台北101</b>(<b>TAIPEI 101</b>)是位於的,樓高509.2米(1,671英尺),樓層共有101層、另有5層,總樓地板面積37萬4千,由設計,團隊、韩国等承造,於1999年動工,2004年12月31日完工啟用;最初名稱為<b>台北國際金融中心</b>(<span lang="en">Taipei World Financial Center</span>),2003年改為現名,亦俗稱為<b>101大樓</b>。興建與經營機構為。其為,曾於2004年12月31日至2010年1月4日間擁有的紀錄,目前為以及環最高,完工以來即成為重要之一。此外,大樓內擁有全球第二大的(僅次)、全球唯二開放遊客觀賞的巨型阻尼器(另一個為上海中心之「上海慧眼」),以及全球起降速度第四快的,僅次於、與。</p>', '<p><b>台北101</b> ,以及全球起降速度第四快的,僅次於、與。</p>' - ], + ], */ // Content inside Japanese parentheticals are also stripped [ '<p><b>台湾</b>(たいわん、: <span lang="zh" xml:lang="zh">臺灣 / 台灣</span>、: Tâi-oân)は、のである。</p>', - '<p><b>台湾</b> は、のである。</p>' + '<p><b>台湾</b>は、のである。</p>' ], // Content inside Cantonese parentheticals are also stripped - [ + /* [ '<p><b>蔡英文</b>(<b>Tsai Ing-wen</b>,<a>1956年</a><a>8月31號</a>—)係現任<a>中華民國總統</a>,<a>臺灣</a>學者同埋<a>政治</a>人,<a>民主進步黨</a><a>主席</a>。</p>', '<p><b>蔡英文</b> 
係現任<span>中華民國總統</span>,<span>臺灣</span>學者同埋<span>政治</span>人,<span>民主進步黨</span><span>主席</span>。</p>' - ], + ], */ // Content inside parentheticals written in `wuu` language variant are also stripped [ '<p><b>东亚</b>(日文:東アジア ‧ 東亜,韩文:東아시아,西文:Asia Oriental)是一个比较笼统个地理概念,立在弗同个语境当中有弗一样个含义。东亚个概念来自<a>欧洲</a>人对东方个定位,拿<a>博斯普鲁斯海峡</a>、<a class="new">乌拉尔山脉</a>东面个广大欧亚大陆地区侪通称亚洲,拿西太平洋沿岸、欧亚大陆东端个地区就叫做<a class="mw-selflink selflink">东亚</a>。</p>', - '<p><b>东亚</b> 是一个比较笼统个地理概念,立在弗同个语境当中有弗一样个含义。东亚个概念来自<span>欧洲</span>人对东方个定位,拿<span>博斯普鲁斯海峡</span>、<span class="new">乌拉尔山脉</span>东面个广大欧亚大陆地区侪通称亚洲,拿西太平洋沿岸、欧亚大陆东端个地区就叫做<span class="mw-selflink selflink">东亚</span>。</p>' + '<p><b>东亚</b>是一个比较笼统个地理概念,立在弗同个语境当中有弗一样个含义。东亚个概念来自<span>欧洲</span>人对东方个定位,拿<span>博斯普鲁斯海峡</span>、<span class="new">乌拉尔山脉</span>东面个广大欧亚大陆地区侪通称亚洲,拿西太平洋沿岸、欧亚大陆东端个地区就叫做<span class="mw-selflink selflink">东亚</span>。</p>' ], // Content inside parentheticals written in `gan` language variant are also stripped - [ + /* [ '<p><b>亞細亞洲</b>(古希臘文:Ασία),又簡稱<b>亞洲</b>,絕大部分都位到北半球,係全世界上最大,最多人嗰一隻<a class="mw-redirect">洲</a>。佢東頭一徑到白令海峽嗰傑日尼奧夫角(西經169度40分,北緯60度5分),南頭一徑到努沙登加拉群島(東經103度30分,南緯11度7分),西頭一徑到巴巴角(東經26度3分,北緯39度27分),北頭一徑到切柳斯金角(東經104度18分,北緯77度43分),最高嗰山係<a>珠穆朗瑪峰</a>。亞洲東西嗰時差係11小時。佢西首連到<a>歐洲</a>,箇就係世界上最大嗰大陸-<a class="new">歐亞大陸</a>。</p>', - '<p><b>亞細亞洲</b> ,最高嗰山係<span>珠穆朗瑪峰</span>。亞洲東西嗰時差係11小時。佢西首連到<span>歐洲</span>,箇就係世界上最大嗰大陸-<span class="new">歐亞大陸</span>。</p>' - ], + '<p><b>亞細亞洲</b>,最高嗰山係<span>珠穆朗瑪峰</span>。亞洲東西嗰時差係11小時。佢西首連到<span>歐洲</span>,箇就係世界上最大嗰大陸-<span class="new">歐亞大陸</span>。</p>' + ], */ // Content inside parentheticals is not stripped if it doesn't include any spaces [ '<p>Der <b>Deutsche Orden</b>, auch <b>Deutschherrenorden</b> oder <b>Deutschritterorden</b> genannt, ist eine römisch-katholische <a>Ordensgemeinschaft</a>. Mit dem <a>Johanniter-</a> und dem <a>Malteserorden</a> steht er in der (Rechts-)Nachfolge der <a>Ritterorden</a> aus der Zeit der <a>Kreuzzüge</a>. 
Die Mitglieder des Ordens sind seit der Reform der Ordensregel 1929 <a>regulierte Chorherren</a>. Der Orden hat gegenwärtig 1100 Mitglieder, darunter 100 <a>Priester</a> und 200 Ordensschwestern, die sich vorwiegend karitativen Aufgaben widmen. Der Hauptsitz befindet sich heute in <a>Wien</a>.</p>', -- To view, visit https://gerrit.wikimedia.org/r/403776 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ia581e2268495465a6adf1b24475d3d1d5f90c898 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/mobileapps Gerrit-Branch: master Gerrit-Owner: Mholloway <mhollo...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits