BearND has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403725 )
Change subject: WIP: Remove parentheses using state machine instead of regex ...................................................................... WIP: Remove parentheses using state machine instead of regex Do not merge! Bug: T184557 Change-Id: Ia135df4eb6fbb26064eb95764d1529f294ea4dbe --- M lib/transformations/summarize.js M test/lib/transformations/summarize.js 2 files changed, 245 insertions(+), 25 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/25/403725/1 diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js index 06c777b..1072f0f 100644 --- a/lib/transformations/summarize.js +++ b/lib/transformations/summarize.js @@ -34,31 +34,229 @@ } } +// Non-ASCII examples of parentheses are taken from +// https://www.mediawiki.org/wiki/Topic:Tn0k9fjn0g8e9fpe. +// They were all the same in the Asian languages. +const TOKEN = { + Open: { ASCII: '(', NON_ASCII: '(' }, // %28, %uFF08 + Close: { ASCII: ')', NON_ASCII: ')' }, // %29, %uFF09 + Space: { NORMAL: ' ' } +}; + +class Range { + constructor(startNode, startNodeIndex, startCharacterIndex) { + this._startNode = startNode; + this._startNodeIndex = startNodeIndex; + this._startCharacterIndex = startCharacterIndex; + } + + endRange(endNode, endNodeIndex, endCharacterIndex) { + this._endNode = endNode; + this._endNodeIndex = endNodeIndex; + this._endCharacterIndex = endCharacterIndex; + } + + getStartNode() { + return this._startNode; + } + + getStartNodeIndex() { + return this._startNodeIndex; + } + + getStartCharacterIndex() { + return this._startCharacterIndex; + } + + getEndNode() { + return this._endNode; + } + + getEndNodeIndex() { + return this._endNodeIndex; + } + + getEndIndex() { + return this._endCharacterIndex; + } + + startsBefore(nextRange) { + return this._startNodeIndex < nextRange._startNodeIndex + || ( + this._startNodeIndex === nextRange._startNodeIndex + && this._startCharacterIndex < nextRange._startCharacterIndex + ); + } + + endsBefore(nextRange) { + return this._endNodeIndex < nextRange._endNodeIndex + || ( + this._endNodeIndex === nextRange._endNodeIndex + && this._endCharacterIndex < nextRange._endCharacterIndex + ); + } + + setSpace() { + this._hasSpace = true; + } + + hasSpaces() { + return this._hasSpace; + } + + debug() { + if (this._startNode === this._endNode) { + const text = this._startNode.textContent; + return `${text.slice(this._startCharacterIndex, this._endCharacterIndex + 1)}`; + } else { + const startText = this._startNode.textContent.slice(this._startCharacterIndex); + const endText = this._endNode.textContent.slice(0, this._endCharacterIndex + 1); + return `'${startText}' to '${endText}'`; + } + } +} + +function markParentheses(node, state) { + const text = node.textContent; + for (let i = 0; i < text.length; i++) { + const char = text[i]; + if (char === TOKEN.Open.ASCII || char === TOKEN.Open.NON_ASCII) { + const range = new Range(node, node._nid, i); + state.ranges.push(range); + state.stack.push(range); + } else if (char === TOKEN.Close.ASCII || char === TOKEN.Close.NON_ASCII) { + const range = state.stack[state.stack.length - 1]; + if (range) { + range.endRange(node, node._nid, i); + state.stack.pop(); + } + } else if (char === TOKEN.Space.NORMAL) { + // TODO: do for all in stack??? + const range = state.stack[state.stack.length - 1]; + if (range && !range.hasSpaces()) { + range.setSpace(); + } + } + } +} + +// '(d e)' +function discardIneligibleRanges(ranges) { + for (let i = 0; i < ranges.length; i++) { + const currentRange = ranges[i]; + if (!currentRange.hasSpaces()) { + ranges.splice(i, 1); // remove current range + } + } +} + +// 'a(b' +function discardUnclosedRanges(ranges) { + for (let i = 0; i < ranges.length; i++) { + const currentRange = ranges[i]; + if (!currentRange.getEndNode()) { + ranges.splice(i, 1); // remove current range + } + } +} + +// 'a(b (d e) f)g' +function discardNestedRanges(ranges) { + for (let i = 0; i < ranges.length - 1; i++) { + const currentRange = ranges[i]; + const nextRange = ranges[i + 1]; + if (currentRange.startsBefore(nextRange) + && nextRange.endsBefore(currentRange) + && currentRange.hasSpaces()) { + // next range is nested inside current eligible range + ranges.splice(i + 1, 1); // remove next range + } + } +} + +function removeParenthesesInTextNode(node, state) { + let range = state.ranges[0]; + let offset = 0; + while (range) { + if (state.deleting && node !== range.getEndNode()) { + node.remove(); + } + + if (node === range.getStartNode() && node === range.getEndNode()) { + // start and end in this text node + const originalText = node.textContent; + const startText = originalText.slice(0, range.getStartCharacterIndex() - offset); + const endText = originalText.slice(range.getEndIndex() + 1 - offset); + node.textContent = `${startText} ${endText}`; + offset = range.getEndIndex() - range.getStartCharacterIndex(); + state.ranges.shift(); + range = state.ranges[0]; + } else if (node === range.getStartNode()) { + // just start + state.deleting = true; + const originalText = node.textContent; + const startText = originalText.slice(0, range.getStartCharacterIndex() - offset); + node.textContent = startText; + break; + } else if (node === range.getEndNode()) { + // just end + state.deleting = false; + const originalText = node.textContent; + const endText = originalText.slice(range.getEndIndex() + 1 - offset); + node.textContent = endText; + offset = range.getEndIndex(); + state.ranges.shift(); + range = state.ranges[0]; + } else { + break; + } + } +} + /** * Visits one DOM node. Do the stuff that needs to be done when a single DOM node is handled. * In this case, remove DOM nodes and some attributes we don't want to keep. * @param {!Node} node the node to visit + * @param {!Object} state some state to pass around to the visitor */ -function visitor(node) { +function visitorPass1(node, state) { if (node.nodeType === NodeType.ELEMENT_NODE) { rmDisallowedElements(node); if (node.tagName !== 'IMG') { rmUnwantedAttributes(node); } - } else if (node.nodeType !== NodeType.TEXT_NODE) { + } else if (node.nodeType === NodeType.TEXT_NODE) { + markParentheses(node, state); + } else { node.remove(); + } +} + +/** + * Visits one DOM node. Do the stuff that needs to be done when a single DOM node is handled. + * In this case, remove DOM nodes and some attributes we don't want to keep. + * @param {!Node} node the node to visit + * @param {!Object} state some state to pass around to the visitor + */ +function visitorPass2(node, state) { + if (node.nodeType === NodeType.ELEMENT_NODE && state.deleting) { + node.remove(); + } else if (node.nodeType === NodeType.TEXT_NODE) { + removeParenthesesInTextNode(node, state); } } /** * Traverses DOM tree iteratively (depth first). * @param {!Element} rootElement the root of the DOM tree which needs to be traversed + * @param {!Function} visitor called whenever a node gets encountered + * @param {!Object} state some state to pass around to the visitor */ -function traverseDF(rootElement) { +function traverseDF(rootElement, visitor, state) { let nodesToVisit = [ rootElement ]; while (nodesToVisit.length > 0) { const currentNode = nodesToVisit.shift(); - visitor(currentNode); + visitor(currentNode, state); nodesToVisit = [ ...(currentNode.childNodes || []), // depth first ...nodesToVisit, @@ -71,25 +269,32 @@ * @param {!Document} document the DOM document */ function removeUnwantedNodes(document) { - traverseDF(document.body); -} - -/** - * Recursively discard any parentheticals that themselves are inside parentheticals - * @param {string} html - * @return {string} html summary with nested parentheticals removed. - */ -function removeNestedParentheticals(html) { - // Remove any nested parentheticals - const regex = /(\([^()]+)(\([^()]+\))/g; - const newHtml = html.replace(regex, '$1 '); - - if (newHtml.match(regex)) { - return removeNestedParentheticals(newHtml); - } else { - return newHtml; + const state = { ranges: [], stack: [] }; + traverseDF(document.body, visitorPass1, state); + if (state.ranges.length > 0) { // found parentheses in previous pass + discardIneligibleRanges(state.ranges); + discardUnclosedRanges(state.ranges); + discardNestedRanges(state.ranges); + traverseDF(document.body, visitorPass2, state); } } + +// /** +// * Recursively discard any parentheticals that themselves are inside parentheticals +// * @param {string} html +// * @return {string} html summary with nested parentheticals removed. +// */ +// function removeNestedParentheticals(html) { +// // Remove any nested parentheticals +// const regex = /(\([^()]+)(\([^()]+\))/g; +// const newHtml = html.replace(regex, '$1 '); +// +// if (newHtml.match(regex)) { +// return removeNestedParentheticals(newHtml); +// } else { +// return newHtml; +// } +// } /** * Given a chunk of HTML, create a summary of the content @@ -108,15 +313,15 @@ removeUnwantedNodes(doc); html = doc.body.innerHTML; - html = removeNestedParentheticals(html); + // html = removeNestedParentheticals(html); // 1. Replace any parentheticals which have at least one space inside - html = html.replace(/\([^\)]+ [^\)]+\)/g, ' '); // eslint-disable-line no-useless-escape + // html = html.replace(/\([^\)]+ [^\)]+\)/g, ' '); // eslint-disable-line no-useless-escape // 2. Remove any empty parentheticals due to transformations - html = html.replace(/\(\)/g, ' '); + // html = html.replace(/\(\)/g, ' '); // 3. Remove content inside any other non-latin parentheticals. The behaviour is // the same as 1 but for languages that are not latin based - html = html.replace(/[(((].+ .+[)))]/g, ' '); + // html = html.replace(/[(((].+ .+[)))]/g, ' '); // 4. remove all double spaces created by the above html = html.replace(/ +/g, ' '); diff --git a/test/lib/transformations/summarize.js b/test/lib/transformations/summarize.js index 5658cb9..3eb0f95 100644 --- a/test/lib/transformations/summarize.js +++ b/test/lib/transformations/summarize.js @@ -8,6 +8,21 @@ describe('summarize', () => { it('matches the spec', () => { const testCases = [ + // Should remove nested parens + [ + '1a(b (d e) f)g', + '1a g' + ], + // Should remove sequential parens + [ + '2a(b c)d(e f)g', + '2a d g' + ], + // Should remove sequential parens except open one at the end + [ + '3a(b c)d(e f)g(', + '3a d g(' + ], // Should remove unwanted elements [ '<style>f</style><object>o</object>o', -- To view, visit https://gerrit.wikimedia.org/r/403725 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ia135df4eb6fbb26064eb95764d1529f294ea4dbe Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/mobileapps Gerrit-Branch: master Gerrit-Owner: BearND <bsitzm...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits