Jdlrobson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/404882 )
Change subject: WIP: Cleanup trailing punctuation
......................................................................
WIP: Cleanup trailing punctuation
Bug: T185161
Change-Id: I20d786e3a46c06fdd47a9faf7a50dcf56267b531
---
M lib/transformations/summarize.js
M test/lib/transformations/summarize.js
2 files changed, 14 insertions(+), 7 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps refs/changes/82/404882/1
diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js
index 6c55b25..97101e2 100644
--- a/lib/transformations/summarize.js
+++ b/lib/transformations/summarize.js
@@ -130,14 +130,16 @@
// 4. remove all double spaces created by the above
html = html.replace(/ +/g, ' ');
- // 5. Replace any leading whitespace before commas
+ // 5. Remove any whitespace before commas, full stops, exclamation marks and question marks
// (which could be the result of earlier transformations)
- html = html.replace(/ , /g, ', ');
+ html = html.replace(/ ([,.!?]) /g, '$1 ');
// 6. Same as 5 but for non-latin comma and no space afterwards
- html = html.replace(/ ,/g, ',');
+ html = html.replace(/ ([,.!?])/g, '$1 ');
- doc.body.innerHTML = html;
+ // Remove any spaces left between a full stop or exclamation mark and the closing tag that follows it.
+ html = html.replace(/([.!]) +<\//g, '$1</');
+
return {
extract: doc.body.textContent,
extract_html: html
diff --git a/test/lib/transformations/summarize.js b/test/lib/transformations/summarize.js
index ec25458..9503c0c 100644
--- a/test/lib/transformations/summarize.js
+++ b/test/lib/transformations/summarize.js
@@ -8,6 +8,11 @@
describe('summarize', () => {
it('matches the spec', () => {
const testCases = [
+ // Should not leave stray spaces before punctuation after stripping parentheticals
+ [
+ '<p>Justice League (both 2017). Justice (is forever)!</p>',
+ '<p>Justice League. Justice!</p>'
+ ],
// Should remove unwanted elements
[
'.<style>f</style><object>o</object><script>o</script>.',
@@ -65,8 +70,8 @@
],
// math tags are stripped but any math images are shown
[
- '<p>The Planck–Einstein relation connects the particulate
photon energy <span class="texhtml "><i>E</i></span> with its associated wave
frequency <span class="texhtml "><i>f</i></span>:</p>\n\n<dl id="mwmQ"><dd
id="mwmg"><span class="mwe-math-element"><span class="mwe-math-mathml-inline
mwe-math-mathml-a11y" style="display: none;"><math
xmlns="http://www.w3.org/1998/Math/MathML">\n <semantics>\n <mrow
class="MJX-TeXAtom-ORD">\n <mstyle displaystyle="true" scriptlevel="0">\n
<mi>E</mi>\n <mo>=</mo>\n <mi>h</mi>\n <mi>f</mi>\n
</mstyle>\n </mrow>\n <annotation
encoding="application/x-tex">{\\displaystyle E=hf}</annotation>\n
</semantics>\n</math></span><img
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6"
class="mwe-math-fallback-image-inline" aria-hidden="true"
style="vertical-align: -0.671ex; width:7.533ex;
height:2.509ex;"></span></dd></dl>',
- '<p>The Planck–Einstein relation connects the particulate
photon energy <span class="texhtml "><i>E</i></span> with its associated wave
frequency <span class="texhtml "><i>f</i></span>:</p>\n\n<dl><dd><span
class="mwe-math-element"><img
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6"
class="mwe-math-fallback-image-inline" aria-hidden="true"
style="vertical-align: -0.671ex; width:7.533ex;
height:2.509ex;"></span></dd></dl>'
+ '<p>The Planck–Einstein relation connects the particulate
photon energy <span class="texhtml "><i>E</i></span> with its associated wave
frequency <span class="texhtml "><i>f</i></span>:</p><dl id="mwmQ"><dd
id="mwmg"><span class="mwe-math-element"><span class="mwe-math-mathml-inline
mwe-math-mathml-a11y" style="display: none;"><math
xmlns="http://www.w3.org/1998/Math/MathML"> <semantics> <mrow
class="MJX-TeXAtom-ORD"> <mstyle displaystyle="true" scriptlevel="0">
<mi>E</mi> <mo>=</mo> <mi>h</mi> <mi>f</mi>
</mstyle> </mrow> <annotation
encoding="application/x-tex">{\\displaystyle E=hf}</annotation>
</semantics></math></span><img
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6"
class="mwe-math-fallback-image-inline" aria-hidden="true"
style="vertical-align: -0.671ex; width:7.533ex;
height:2.509ex;"></span></dd></dl>',
+ '<p>The Planck–Einstein relation connects the particulate
photon energy <span class="texhtml "><i>E</i></span> with its associated wave
frequency <span class="texhtml "><i>f</i></span>:</p><dl><dd><span
class="mwe-math-element"><img
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6"
class="mwe-math-fallback-image-inline" aria-hidden="true"
style="vertical-align: -0.671ex; width:7.533ex;
height:2.509ex;"></span></dd></dl>'
],
// Parentheticals will be stripped
[
@@ -169,7 +174,7 @@
]
];
testCases.forEach((test) => {
- assert.equal(summarize(test[0]).extract_html, test[1], test[2]);
+ assert.equal(summarize(test[0]).extract_html.replace(/\n/g, ''),
test[1], test[2]);
});
});
});
--
To view, visit https://gerrit.wikimedia.org/r/404882
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I20d786e3a46c06fdd47a9faf7a50dcf56267b531
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: Jdlrobson <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits