BearND has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/403568 )

Change subject: Remove unwanted nodes and attributes from summary
......................................................................

Remove unwanted nodes and attributes from summary

Bug: T184557
Change-Id: Ib4f21f5f68913fa10b9005d10de23b94616dcc94
---
M lib/transformations/summarize.js
M test/lib/transformations/summarize.js
2 files changed, 88 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/68/403568/1

diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js
index e0becc8..f558d86 100644
--- a/lib/transformations/summarize.js
+++ b/lib/transformations/summarize.js
@@ -4,6 +4,77 @@
 const flattenElements = require('./flattenElements');
 const rmElementsWithSelector = require('./rmElementsWithSelector');
 const removeAttributes = require('./removeAttributes');
+const NodeType = require('../nodeType');
+
+const ALLOWED_ATTRIBUTES = ['class', 'style'];
+const DISALLOWED_ELEMENTS = ['object', 'script', 'style'];
+
+/**
+ * Removes blacklisted elements.
+ * @param {!Node} node the node to visit
+ */
+function rmDisallowedElements(node) {
+    if (DISALLOWED_ELEMENTS.includes(node.tagName)) {
+        node.remove();
+    }
+}
+
+/**
+ * Removes attributes except white-listed ones.
+ * @param {!Node} node the node to visit
+ */
+function rmUnwantedAttributes(node) {
+    if (node.tagName !== 'IMG') {
+        const attrs = node.attributes;
+        for (let i = attrs.length - 1; i >= 0; i--) {
+            const attribute = attrs.item(i);
+            if (attribute && attribute.localName
+                    && !ALLOWED_ATTRIBUTES.includes(attribute.localName)) {
+                node.removeAttribute(attribute.localName);
+            }
+        }
+    }
+}
+
+/**
+ * Visits one DOM node. Do the stuff that needs to be done when a single DOM node is handled.
+ * In this case, remove DOM nodes and their children we don't want to keep.
+ * @param {!Node} node the node to visit
+ */
+function visitor(node) {
+    if (node.nodeType === NodeType.TEXT_NODE) {
+        // keep as is for now
+    } else if (node.nodeType === NodeType.ELEMENT_NODE) {
+        rmDisallowedElements(node);
+        rmUnwantedAttributes(node);
+    } else {
+        node.remove();
+    }
+}
+
+/**
+ * Traverses DOM tree iteratively (depth first).
+ * @param {!Element} rootElement the root of the DOM tree which needs to be traversed
+ */
+function traverseDF(rootElement) {
+    let nodesToVisit = [ rootElement ];
+    while (nodesToVisit.length > 0) {
+        const currentNode = nodesToVisit.shift();
+        visitor(currentNode);
+        nodesToVisit = [
+            ...(currentNode.childNodes || []), // depth first
+            ...nodesToVisit,
+        ];
+    }
+}
+
+/**
+ * Removes unwanted nodes and element attributes.
+ * @param {!Document} document the DOM document
+ */
+function removeUnwantedNodes(document) {
+    traverseDF(document.body);
+}
 
 /**
  * Recursively discard any parentheticals that themselves are inside parentheticals
@@ -36,6 +107,7 @@
     rmElementsWithSelector(doc, '.noprint');
     rmElementsWithSelector(doc, 'math');
     rmElementsWithSelector(doc, 'span:empty,b:empty,i:empty,p:empty');
+    removeUnwantedNodes(doc);
 
     html = doc.body.innerHTML;
     html = removeNestedParentheticals(html);
diff --git a/test/lib/transformations/summarize.js b/test/lib/transformations/summarize.js
index 6c9f3cf..7031bf4 100644
--- a/test/lib/transformations/summarize.js
+++ b/test/lib/transformations/summarize.js
@@ -8,6 +8,21 @@
 describe('summarize', () => {
     it('matches the spec', () => {
         const testCases = [
+            // Should remove unwanted attributes
+            [
+                '<span bogus="dummy">f</span><b invalid="whateva">o</b>o',
+                '<span>f</span><b>o</b>o'
+            ],
+            // Should keep white-listed attributes
+            [
+                '<span style="we-got-style">f</span><span class="we-got-class">o</span>o',
+                '<span style="we-got-style">f</span><span class="we-got-class">o</span>o'
+            ],
+            // Should remove comments
+            [
+                'foo<!-- a comment -->bar',
+                'foobar'
+            ],
             // Should flatten empty nodes
             [
                 '<span></span><b></b><i></i><p><span>f</span></p>',
@@ -46,7 +61,7 @@
             // math tags are stripped but any math images are shown
             [
                 '<p>The Planck–Einstein relation connects the particulate photon energy <span class="texhtml "><i>E</i></span> with its associated wave frequency <span class="texhtml "><i>f</i></span>:</p>\n\n<dl id="mwmQ"><dd id="mwmg"><span class="mwe-math-element"><span class="mwe-math-mathml-inline mwe-math-mathml-a11y" style="display: none;"><math xmlns="http://www.w3.org/1998/Math/MathML">\n  <semantics>\n    <mrow class="MJX-TeXAtom-ORD">\n      <mstyle displaystyle="true" scriptlevel="0">\n        <mi>E</mi>\n        <mo>=</mo>\n        <mi>h</mi>\n        <mi>f</mi>\n      </mstyle>\n    </mrow>\n    <annotation encoding="application/x-tex">{\\displaystyle E=hf}</annotation>\n  </semantics>\n</math></span><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6" class="mwe-math-fallback-image-inline" aria-hidden="true" style="vertical-align: -0.671ex; width:7.533ex; height:2.509ex;"></span></dd></dl>',
-                '<p>The Planck–Einstein relation connects the particulate photon energy <span class="texhtml "><i>E</i></span> with its associated wave frequency <span class="texhtml "><i>f</i></span>:</p>\n\n<dl id="mwmQ"><dd id="mwmg"><span class="mwe-math-element"><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6" class="mwe-math-fallback-image-inline" aria-hidden="true" style="vertical-align: -0.671ex; width:7.533ex; height:2.509ex;"></span></dd></dl>'
+                '<p>The Planck–Einstein relation connects the particulate photon energy <span class="texhtml "><i>E</i></span> with its associated wave frequency <span class="texhtml "><i>f</i></span>:</p>\n\n<dl><dd><span class="mwe-math-element"><img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6" class="mwe-math-fallback-image-inline" aria-hidden="true" style="vertical-align: -0.671ex; width:7.533ex; height:2.509ex;"></span></dd></dl>'
             ],
             // Parentheticals will be stripped
             [

-- 
To view, visit https://gerrit.wikimedia.org/r/403568
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib4f21f5f68913fa10b9005d10de23b94616dcc94
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: BearND <bsitzm...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to