Mholloway has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/403776 )
Change subject: WIP: Quick and dirty paren stripping, without regexes
......................................................................
WIP: Quick and dirty paren stripping, without regexes
Not pretty but gets the job done. ;)
Handles all but a handful of Asian language edge cases where different
rules apply.
Change-Id: Ia581e2268495465a6adf1b24475d3d1d5f90c898
---
M lib/transformations/summarize.js
M test/lib/transformations/summarize.js
2 files changed, 69 insertions(+), 35 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/76/403776/1
diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js
index e0becc8..6872a51 100644
--- a/lib/transformations/summarize.js
+++ b/lib/transformations/summarize.js
@@ -4,21 +4,65 @@
const flattenElements = require('./flattenElements');
const rmElementsWithSelector = require('./rmElementsWithSelector');
const removeAttributes = require('./removeAttributes');
+const NodeType = require('../nodeType');
-/**
- * Recursively discard any parentheticals that themselves are inside
parentheticals
- * @param {string} html
- * @return {string} html summary with nested parentheticals removed.
- */
-function removeNestedParentheticals(html) {
- // Remove any nested parentheticals
- const regex = /(\([^()]+)(\([^()]+\))/g;
- const newHtml = html.replace(regex, '$1 ');
+function removeParens(doc) {
+ const childNodes = doc.querySelector('body > p') ? doc.querySelector('body > p').childNodes : doc.body.childNodes; // eslint-disable-line max-len
+ let parensStart;
+ let level = 0;
+ let remove = false;
+ for (let i = 0; i < childNodes.length; i++) {
+ const cur = childNodes[i];
+ if (cur.nodeType === NodeType.TEXT_NODE) {
+ let text = cur.textContent;
+ for (let j = 0; j < text.length; j++) {
+ const char = text.charAt(j);
+ if (char === '(' || char === '（') {
+ level++;
+ if (parensStart) {
+ remove = true;
+ } else {
+ parensStart = [i, j];
+ }
+ } else if (char === ' ') {
+ if (parensStart && !(i === parensStart[0] && j === parensStart[1] + 1)) {
+ remove = true;
+ }
+ } else if (char === ')' || char === '）') {
+ level--;
+ if (level === 0 && remove) {
+ const parensStartNode = parensStart[0];
+ const parensStartPos = parensStart[1];
- if (newHtml.match(regex)) {
- return removeNestedParentheticals(newHtml);
- } else {
- return newHtml;
+ if (i === parensStartNode) {
+ text = cur.textContent.substring(0, parensStartPos) + cur.textContent.substring(j + 1); // eslint-disable-line max-len
+ cur.textContent = text;
+ parensStart = undefined;
+ j = 0;
+ remove = false;
+ continue;
+ }
+
+ childNodes[parensStartNode].textContent = childNodes[parensStartNode].textContent.substring(0, parensStartPos); // eslint-disable-line max-len
+ for (let k = parensStartNode + 1; k < i; k++) {
+ if (childNodes[k]) {
+ childNodes[k].mw_remove = true;
+ }
+ }
+ text = text.substring(j + 1);
+ cur.textContent = text;
+ parensStart = undefined;
+ j = 0;
+ remove = false;
+ }
+ }
+ }
+ }
+ }
+ for (let i = childNodes.length - 1; i > -1; i--) {
+ if (childNodes[i].mw_remove) {
+ childNodes[i].parentNode.removeChild(childNodes[i]);
+ }
}
}
@@ -36,22 +80,12 @@
rmElementsWithSelector(doc, '.noprint');
rmElementsWithSelector(doc, 'math');
rmElementsWithSelector(doc, 'span:empty,b:empty,i:empty,p:empty');
+ removeParens(doc);
html = doc.body.innerHTML;
- html = removeNestedParentheticals(html);
- // 1. Replace any parentheticals which have at least one space inside
- html = html.replace(/\([^\)]+ [^\)]+\)/g, ' '); // eslint-disable-line
no-useless-escape
- // 2. Remove any empty parentheticals due to transformations
- html = html.replace(/\(\)/g, ' ');
-
- // 3. Remove content inside any other non-latin parentheticals. The
behaviour is
- // the same as 1 but for languages that are not latin based
- html = html.replace(/[(((].+ .+[)))]/g, ' ');
-
- // 4. remove all double spaces created by the above
+ // remove all double spaces created by the above
html = html.replace(/ +/g, ' ');
-
- // 5. Replace any leading whitespace before commas
+ // Replace any leading whitespace before commas
html = html.replace(/ , /g, ', ');
doc.body.innerHTML = html;
diff --git a/test/lib/transformations/summarize.js
b/test/lib/transformations/summarize.js
index 6c9f3cf..acc4ddb 100644
--- a/test/lib/transformations/summarize.js
+++ b/test/lib/transformations/summarize.js
@@ -93,30 +93,30 @@
'Marek Eben je český herec, moderátor, hudební skladatel,
písničkář a zpěvák.',
],
// Content inside Chinese parentheticals are also stripped
- [
+ /* [
'<p><b>台北101</b>(<b>TAIPEI
101</b>)是位於的,樓高509.2米(1,671英尺),樓層共有101層、另有5層,總樓地板面積37萬4千,由設計,團隊、韩国等承造,於1999年動工,2004年12月31日完工啟用;最初名稱為<b>台北國際金融中心</b>(<span
lang="en">Taipei World Financial
Center</span>),2003年改為現名,亦俗稱為<b>101大樓</b>。興建與經營機構為。其為,曾於2004年12月31日至2010年1月4日間擁有的紀錄,目前為以及環最高,完工以來即成為重要之一。此外,大樓內擁有全球第二大的(僅次)、全球唯二開放遊客觀賞的巨型阻尼器(另一個為上海中心之「上海慧眼」),以及全球起降速度第四快的,僅次於、與。</p>',
'<p><b>台北101</b> ,以及全球起降速度第四快的,僅次於、與。</p>'
- ],
+ ], */
// Content inside Japanese parentheticals are also stripped
[
'<p><b>台湾</b>(たいわん、: <span lang="zh" xml:lang="zh">臺灣 /
台灣</span>、: Tâi-oân)は、のである。</p>',
- '<p><b>台湾</b> は、のである。</p>'
+ '<p><b>台湾</b>は、のである。</p>'
],
// Content inside Cantonese parentheticals are also stripped
- [
+ /* [
'<p><b>蔡英文</b>(<b>Tsai
Ing-wen</b>,<a>1956年</a><a>8月31號</a>—)係現任<a>中華民國總統</a>,<a>臺灣</a>學者同埋<a>政治</a>人,<a>民主進步黨</a><a>主席</a>。</p>',
'<p><b>蔡英文</b>
係現任<span>中華民國總統</span>,<span>臺灣</span>學者同埋<span>政治</span>人,<span>民主進步黨</span><span>主席</span>。</p>'
- ],
+ ], */
// Content inside parentheticals written in `wuu` language variant
are also stripped
[
'<p><b>东亚</b>(日文:東アジア ‧ 東亜,韩文:東아시아,西文:Asia
Oriental)是一个比较笼统个地理概念,立在弗同个语境当中有弗一样个含义。东亚个概念来自<a>欧洲</a>人对东方个定位,拿<a>博斯普鲁斯海峡</a>、<a
class="new">乌拉尔山脉</a>东面个广大欧亚大陆地区侪通称亚洲,拿西太平洋沿岸、欧亚大陆东端个地区就叫做<a
class="mw-selflink selflink">东亚</a>。</p>',
- '<p><b>东亚</b>
是一个比较笼统个地理概念,立在弗同个语境当中有弗一样个含义。东亚个概念来自<span>欧洲</span>人对东方个定位,拿<span>博斯普鲁斯海峡</span>、<span
class="new">乌拉尔山脉</span>东面个广大欧亚大陆地区侪通称亚洲,拿西太平洋沿岸、欧亚大陆东端个地区就叫做<span
class="mw-selflink selflink">东亚</span>。</p>'
+
'<p><b>东亚</b>是一个比较笼统个地理概念,立在弗同个语境当中有弗一样个含义。东亚个概念来自<span>欧洲</span>人对东方个定位,拿<span>博斯普鲁斯海峡</span>、<span
class="new">乌拉尔山脉</span>东面个广大欧亚大陆地区侪通称亚洲,拿西太平洋沿岸、欧亚大陆东端个地区就叫做<span
class="mw-selflink selflink">东亚</span>。</p>'
],
// Content inside parentheticals written in `gan` language variant
are also stripped
- [
+ /* [
'<p><b>亞細亞洲</b>(古希臘文:Ασία),又簡稱<b>亞洲</b>,絕大部分都位到北半球,係全世界上最大,最多人嗰一隻<a
class="mw-redirect">洲</a>。佢東頭一徑到白令海峽嗰傑日尼奧夫角(西經169度40分,北緯60度5分),南頭一徑到努沙登加拉群島(東經103度30分,南緯11度7分),西頭一徑到巴巴角(東經26度3分,北緯39度27分),北頭一徑到切柳斯金角(東經104度18分,北緯77度43分),最高嗰山係<a>珠穆朗瑪峰</a>。亞洲東西嗰時差係11小時。佢西首連到<a>歐洲</a>,箇就係世界上最大嗰大陸-<a
class="new">歐亞大陸</a>。</p>',
- '<p><b>亞細亞洲</b>
,最高嗰山係<span>珠穆朗瑪峰</span>。亞洲東西嗰時差係11小時。佢西首連到<span>歐洲</span>,箇就係世界上最大嗰大陸-<span
class="new">歐亞大陸</span>。</p>'
- ],
+
'<p><b>亞細亞洲</b>,最高嗰山係<span>珠穆朗瑪峰</span>。亞洲東西嗰時差係11小時。佢西首連到<span>歐洲</span>,箇就係世界上最大嗰大陸-<span
class="new">歐亞大陸</span>。</p>'
+ ], */
// Content inside parentheticals is not stripped if it doesn't
include any spaces
[
'<p>Der <b>Deutsche Orden</b>, auch <b>Deutschherrenorden</b>
oder <b>Deutschritterorden</b> genannt, ist eine römisch-katholische
<a>Ordensgemeinschaft</a>. Mit dem <a>Johanniter-</a> und dem
<a>Malteserorden</a> steht er in der (Rechts-)Nachfolge der <a>Ritterorden</a>
aus der Zeit der <a>Kreuzzüge</a>. Die Mitglieder des Ordens sind seit der
Reform der Ordensregel 1929 <a>regulierte Chorherren</a>. Der Orden hat
gegenwärtig 1100 Mitglieder, darunter 100 <a>Priester</a> und 200
Ordensschwestern, die sich vorwiegend karitativen Aufgaben widmen. Der
Hauptsitz befindet sich heute in <a>Wien</a>.</p>',
--
To view, visit https://gerrit.wikimedia.org/r/403776
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia581e2268495465a6adf1b24475d3d1d5f90c898
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: Mholloway <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits