jenkins-bot has submitted this change and it was merged.
Change subject: Stop stripping HTML content and use library for decoding
entities
......................................................................
Stop stripping HTML content and use library for decoding entities
Our HTML-stripping functions are unsafe and unnecessary and should be
removed.
Also, we should use a popular and actively maintained library for decoding
HTML entities rather than using a hand-rolled function.
Change-Id: Iaa737b0ac40f40d14c92fe44bcec7f96e170f99a
---
M lib/mobile-util.js
M lib/parseDefinition.js
M package.json
M test/features/definition/definition.js
M test/lib/mobile-util/mobile-util-test.js
5 files changed, 7 insertions(+), 53 deletions(-)
Approvals:
BearND: Looks good to me, approved
jenkins-bot: Verified
diff --git a/lib/mobile-util.js b/lib/mobile-util.js
index c6ef0e5..551df36 100644
--- a/lib/mobile-util.js
+++ b/lib/mobile-util.js
@@ -167,13 +167,6 @@
return dateString + '/' + uuid.now().toString();
};
-/**
- * Strip HTML markup from a string.
- */
-mUtil.stripMarkup = function(text) {
- return text.replace(/<[^>]*>/g, '');
-};
-
mUtil.throw404 = function(message) {
throw new HTTPError({
status: 404,
diff --git a/lib/parseDefinition.js b/lib/parseDefinition.js
index b76fa1a..8728a32 100644
--- a/lib/parseDefinition.js
+++ b/lib/parseDefinition.js
@@ -8,6 +8,7 @@
'use strict';
var domino = require('domino');
+var he = require('he');
var sUtil = require('./util');
var mUtil = require('./mobile-util');
var transforms = require('./transforms');
@@ -64,21 +65,6 @@
*/
};
-function stripSpanTags(text) {
- return text.replace(/<\/?span[^>]*>/g, '');
-}
-
-function stripItalicTags(text) {
- return text.replace(/<\/?i[^>]*>/g, '');
-}
-
-function decodeHTML(text) {
- return text.replace(/</g,'<')
- .replace(/>/g,'>')
- .replace(/&/g,'&')
- .replace(/ /g, ' ');
-}
-
function hasUsageExamples(text, langCode) {
return langCode === 'en' ? text.indexOf('<dl') > -1 : text.indexOf('<ul')
> -1;
}
@@ -119,18 +105,11 @@
currentDefinition.examples = [];
examples = wikiLangCode === 'en' ? element.querySelectorAll('dd') :
element.querySelectorAll('li');
for (j = 0; j < examples.length; j++) {
- example = examples[j].innerHTML.trim();
- example = decodeHTML(example);
- example = stripSpanTags(example);
- example = stripItalicTags(example);
+ example = he.decode(examples[j].innerHTML.trim());
currentDefinition.examples.push(example);
}
} else {
- defn = element.innerHTML.trim();
- defn = decodeHTML(defn);
- defn = stripSpanTags(defn);
- defn = stripItalicTags(defn);
- currentDefinition.definition = defn;
+ currentDefinition.definition = he.decode(element.innerHTML.trim());
}
return currentDefinition;
}
@@ -160,7 +139,7 @@
for (j = 0; j < sectionDivs.length; j++) {
currentSectionDiv = sectionDivs[j];
- header = mUtil.stripMarkup(currentSectionDiv.title);
+ header = currentSectionDiv.title;
/* Get the language from the first H2 header, and begin iterating over
sections.
Per the English Wiktionary style guide (linked in header above), H2
headings
diff --git a/package.json b/package.json
index 4a5dad2..0490526 100644
--- a/package.json
+++ b/package.json
@@ -46,6 +46,7 @@
"core-js": "^2.4.1",
"domino": "^1.0.25",
"express": "^4.14.0",
+ "he": "^1.1.0",
"js-yaml": "^3.6.1",
"locutus": "^2.0.5",
"mediawiki-title": "^0.5.4",
diff --git a/test/features/definition/definition.js
b/test/features/definition/definition.js
index 40c583b..af95843 100644
--- a/test/features/definition/definition.js
+++ b/test/features/definition/definition.js
@@ -36,7 +36,7 @@
}
}
assert.deepEqual(en[0].partOfSpeech, 'Noun');
- assert.ok(en[0].definitions[0].definition.indexOf('An animal
of the family <a href=\"/wiki/Felidae\">') === 0, 'Expected different start of
definition');
+ assert.ok(en[0].definitions[0].definition.indexOf('An animal
of the family <i><a href="/wiki/Felidae">Felidae</a></i>') === 0, 'Expected
different start of definition');
assert.deepEqual(en[1].partOfSpeech, 'Verb');
assert.ok(en[1].definitions[0].definition.indexOf('To <a
href=\"/wiki/hoist\">hoist</a>') === 0, 'Expected different start of
definition');
});
diff --git a/test/lib/mobile-util/mobile-util-test.js
b/test/lib/mobile-util/mobile-util-test.js
index 9027296..93bb986 100644
--- a/test/lib/mobile-util/mobile-util-test.js
+++ b/test/lib/mobile-util/mobile-util-test.js
@@ -19,18 +19,6 @@
var arr6 = [ obj1, obj3 ];
var arr7 = [ obj4, obj2 ];
-// If a news story string breaks the regex in mUtil.stripMarkup, fix the regex,
-// then add the offending string here for use in future unit testing!
-var newsHtml1 = '<!--June 20 --> In <a rel="mw:WikiLink" href="./Basketball"
title="Basketball" id="mwBA">basketball</a>, the <a rel="mw:WikiLink"
href="./Cleveland_Cavaliers" title="Cleveland Cavaliers" id="mwBQ">Cleveland
Cavaliers</a> defeat the <a rel="mw:WikiLink" href="./Golden_State_Warriors"
title="Golden State Warriors" id="mwBg">Golden State Warriors</a> to win the <b
id="mwBw"><a rel="mw:WikiLink" href="./2016_NBA_Finals" title="2016 NBA Finals"
id="mwCA">NBA Finals</a></b> <i id="mwCQ">(<a rel="mw:WikiLink"
href="./Bill_Russell_NBA_Finals_Most_Valuable_Player_Award" title="Bill Russell
NBA Finals Most Valuable Player Award" id="mwCg">MVP</a> <a rel="mw:WikiLink"
href="./LeBron_James" title="LeBron James" id="mwCw">LeBron James</a>
pictured)</i>.';
-var newsHtml2 = '<!--June 18--> The <a rel="mw:WikiLink"
href="./United_Nations" title="United Nations" id="mwDQ">United Nations</a>
reports 80,000 civilians have fled <a rel="mw:WikiLink" href="./Fallujah"
title="Fallujah" id="mwDg">Fallujah</a>, as <a rel="mw:WikiLink"
href="./Iraqi_Armed_Forces" title="Iraqi Armed Forces" id="mwDw">Iraqi Army</a>
and <a rel="mw:WikiLink" href="./Popular_Mobilization_Forces_(Iraq)"
title="Popular Mobilization Forces (Iraq)" id="mwEA">Shia militias</a> <b
id="mwEQ"><a rel="mw:WikiLink" href="./Battle_of_Fallujah_(2016)" title="Battle
of Fallujah (2016)" id="mwEg">retake most of the city</a></b> from <a
rel="mw:WikiLink" href="./Islamic_State_of_Iraq_and_the_Levant" title="Islamic
State of Iraq and the Levant" id="mwEw">ISIL</a>.';
-var newsHtml3 = '<!--June 16--> British <a rel="mw:WikiLink"
href="./Member_of_parliament" title="Member of parliament" id="mwFQ">Member of
Parliament</a> <b id="mwFg"><a rel="mw:WikiLink" href="./Jo_Cox" title="Jo Cox"
id="mwFw">Jo Cox</a></b> dies after being <a rel="mw:WikiLink"
href="./Killing_of_Jo_Cox" title="Killing of Jo Cox" id="mwGA">shot and
stabbed</a> in <a rel="mw:WikiLink" href="./Birstall,_West_Yorkshire"
title="Birstall, West Yorkshire" id="mwGQ">Birstall, West Yorkshire</a>.';
-var newsHtml4 = '<!--June 14--> In <a rel="mw:WikiLink"
href="./Association_football" title="Association football"
id="mwGw">association football</a>, <b id="mwHA"><a rel="mw:WikiLink"
href="./Violence_at_UEFA_Euro_2016" title="Violence at UEFA Euro 2016"
id="mwHQ">fan violence</a></b> at <a rel="mw:WikiLink" href="./UEFA_Euro_2016"
title="UEFA Euro 2016" id="mwHg">UEFA Euro 2016</a> results in arrests and
deportation of fans, and fines for the <a rel="mw:WikiLink"
href="./Croatian_national_football_team" title="Croatian national football
team" id="mwHw">Croatian</a>, <a rel="mw:WikiLink"
href="./Hungarian_national_football_team" title="Hungarian national football
team" id="mwIA">Hungarian</a> and <a rel="mw:WikiLink"
href="./Russia_national_football_team" title="Russia national football team"
id="mwIQ">Russian</a> teams.';
-
-var newsText1 = 'In basketball, the Cleveland Cavaliers defeat the Golden
State Warriors to win the NBA Finals (MVP LeBron James pictured).';
-var newsText2 = 'The United Nations reports 80,000 civilians have fled
Fallujah, as Iraqi Army and Shia militias retake most of the city from ISIL.';
-var newsText3 = 'British Member of Parliament Jo Cox dies after being shot and
stabbed in Birstall, West Yorkshire.';
-var newsText4 = 'In association football, fan violence at UEFA Euro 2016
results in arrests and deportation of fans, and fines for the Croatian,
Hungarian and Russian teams.';
-
describe('lib:mobile-util', function() {
this.timeout(20000);
@@ -54,12 +42,5 @@
it('fillInMemberKeys should make the specified adjustments', function() {
mUtil.fillInMemberKeys(arr3, [['world', 'goodbye'], ['again', 'sea']]);
assert.deepEqual(arr3, arr4);
- })
-
- it('HTML stripping regex should strip HTML as expected', function() {
- assert.deepEqual(mUtil.stripMarkup(newsHtml1).trim(), newsText1);
- assert.deepEqual(mUtil.stripMarkup(newsHtml2).trim(), newsText2);
- assert.deepEqual(mUtil.stripMarkup(newsHtml3).trim(), newsText3);
- assert.deepEqual(mUtil.stripMarkup(newsHtml4).trim(), newsText4);
- })
+ });
});
--
To view, visit https://gerrit.wikimedia.org/r/302288
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Iaa737b0ac40f40d14c92fe44bcec7f96e170f99a
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: Mholloway <[email protected]>
Gerrit-Reviewer: BearND <[email protected]>
Gerrit-Reviewer: Dbrant <[email protected]>
Gerrit-Reviewer: Fjalapeno <[email protected]>
Gerrit-Reviewer: GWicke <[email protected]>
Gerrit-Reviewer: Jhernandez <[email protected]>
Gerrit-Reviewer: Mholloway <[email protected]>
Gerrit-Reviewer: Mhurd <[email protected]>
Gerrit-Reviewer: Mobrovac <[email protected]>
Gerrit-Reviewer: Niedzielski <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits