BearND has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403568 )
Change subject: Remove unwanted nodes and attributes from summary ...................................................................... Remove unwanted nodes and attributes from summary Bug: T184557 Change-Id: Ib4f21f5f68913fa10b9005d10de23b94616dcc94 --- M lib/transformations/summarize.js M test/lib/transformations/summarize.js 2 files changed, 88 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/68/403568/1 diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js index e0becc8..f558d86 100644 --- a/lib/transformations/summarize.js +++ b/lib/transformations/summarize.js @@ -4,6 +4,77 @@ const flattenElements = require('./flattenElements'); const rmElementsWithSelector = require('./rmElementsWithSelector'); const removeAttributes = require('./removeAttributes'); +const NodeType = require('../nodeType'); + +const ALLOWED_ATTRIBUTES = ['class', 'style']; +const DISALLOWED_ELEMENTS = ['object', 'script', 'style']; + +/** + * Removes blacklisted elements. + * @param {!Node} node the node to visit + */ +function rmDisallowedElements(node) { + if (DISALLOWED_ELEMENTS.includes(node.tagName)) { + node.remove(); + } +} + +/** + * Removes attributes except white-listed ones. + * @param {!Node} node the node to visit + */ +function rmUnwantedAttributes(node) { + if (node.tagName !== 'IMG') { + const attrs = node.attributes; + for (let i = attrs.length - 1; i >= 0; i--) { + const attribute = attrs.item(i); + if (attribute && attribute.localName + && !ALLOWED_ATTRIBUTES.includes(attribute.localName)) { + node.removeAttribute(attribute.localName); + } + } + } +} + +/** + * Visits one DOM node. Do the stuff that needs to be done when a single DOM node is handled. + * In this case, remove DOM nodes and their children we don't want to keep. + * @param {!Node} node the node to visit + */ +function visitor(node) { + if (node.nodeType === NodeType.TEXT_NODE) { + // keep as is for now + } else if (node.nodeType === NodeType.ELEMENT_NODE) { + rmDisallowedElements(node); + rmUnwantedAttributes(node); + } else { + node.remove(); + } +} + +/** + * Traverses DOM tree iteratively (depth first). + * @param {!Element} rootElement the root of the DOM tree which needs to be traversed + */ +function traverseDF(rootElement) { + let nodesToVisit = [ rootElement ]; + while (nodesToVisit.length > 0) { + const currentNode = nodesToVisit.shift(); + visitor(currentNode); + nodesToVisit = [ + ...(currentNode.childNodes || []), // depth first + ...nodesToVisit, + ]; + } +} + +/** + * Removes unwanted nodes and element attributes. + * @param {!Document} document the DOM document + */ +function removeUnwantedNodes(document) { + traverseDF(document.body); +} /** * Recursively discard any parentheticals that themselves are inside parentheticals @@ -36,6 +107,7 @@ rmElementsWithSelector(doc, '.noprint'); rmElementsWithSelector(doc, 'math'); rmElementsWithSelector(doc, 'span:empty,b:empty,i:empty,p:empty'); + removeUnwantedNodes(doc); html = doc.body.innerHTML; html = removeNestedParentheticals(html); diff --git a/test/lib/transformations/summarize.js b/test/lib/transformations/summarize.js index 6c9f3cf..7031bf4 100644 --- a/test/lib/transformations/summarize.js +++ b/test/lib/transformations/summarize.js @@ -8,6 +8,21 @@ describe('summarize', () => { it('matches the spec', () => { const testCases = [ + // Should remove unwanted attributes + [ + '<span bogus="dummy">f</span><b invalid="whateva">o</b>o', + '<span>f</span><b>o</b>o' + ], + // Should keep white-listed attributes + [ + '<span style="we-got-style">f</span><span class="we-got-class">o</span>o', + '<span style="we-got-style">f</span><span class="we-got-class">o</span>o' + ], + // Should remove comments + [ + 'foo<!-- a comment -->bar', + 'foobar' + ], // Should flatten empty nodes [ '<span></span><b></b><i></i><p><span>f</span></p>', @@ -46,7 +61,7 @@ // math tags are stripped but any math images are shown [ '<p>The Planck–Einstein relation connects the particulate photon energy <span class="texhtml "><i>E</i></span> with its associated wave frequency <span class="texhtml "><i>f</i></span>:</p>\n\n<dl id="mwmQ"><dd id="mwmg"><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML">\n <semantics>\n <mrow class="MJX-TeXAtom-ORD">\n <mstyle displaystyle="true" scriptlevel="0">\n <mi>E</mi>\n <mo>=</mo>\n <mi>h</mi>\n <mi>f</mi>\n </mstyle>\n </mrow>\n <annotation encoding="application/x-tex">{\\displaystyle E=hf}</annotation>\n </semantics>\n</math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6" class="mwe-math-fallback-image-inline" aria-hidden="true" style="vertical-align: -0.671ex; width:7.533ex; height:2.509ex;"></span></dd></dl>', - '<p>The Planck–Einstein relation connects the particulate photon energy <span class="texhtml "><i>E</i></span> with its associated wave frequency <span class="texhtml "><i>f</i></span>:</p>\n\n<dl id="mwmQ"><dd id="mwmg"><span class="mwe-math-element"><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6" class="mwe-math-fallback-image-inline" aria-hidden="true" style="vertical-align: -0.671ex; width:7.533ex; height:2.509ex;"></span></dd></dl>' + '<p>The Planck–Einstein relation connects the particulate photon energy <span class="texhtml "><i>E</i></span> with its associated wave frequency <span class="texhtml "><i>f</i></span>:</p>\n\n<dl><dd><span class="mwe-math-element"><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6" class="mwe-math-fallback-image-inline" aria-hidden="true" style="vertical-align: -0.671ex; width:7.533ex; height:2.509ex;"></span></dd></dl>' ], // Parentheticals will be stripped [ -- To view, visit https://gerrit.wikimedia.org/r/403568 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ib4f21f5f68913fa10b9005d10de23b94616dcc94 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/mobileapps Gerrit-Branch: master Gerrit-Owner: BearND <bsitzm...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits