Jdlrobson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/404882 )

Change subject: WIP: Cleanup trailing punctuation
......................................................................

WIP: Cleanup trailing punctuation

Bug: T185161
Change-Id: I20d786e3a46c06fdd47a9faf7a50dcf56267b531
---
M lib/transformations/summarize.js
M test/lib/transformations/summarize.js
2 files changed, 14 insertions(+), 7 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/mobileapps 
refs/changes/82/404882/1

diff --git a/lib/transformations/summarize.js b/lib/transformations/summarize.js
index 6c55b25..97101e2 100644
--- a/lib/transformations/summarize.js
+++ b/lib/transformations/summarize.js
@@ -130,14 +130,16 @@
     // 4. remove all double spaces created by the above
     html = html.replace(/ +/g, ' ');
 
-    // 5. Replace any leading whitespace before commas
+    // 5. Remove any whitespace before punctuation (comma, full stop, ! or ?)
     // (which could be the result of earlier transformations)
-    html = html.replace(/ , /g, ', ');
+    html = html.replace(/ ([,.!?]) /g, '$1 ');
 
     // 6. Same as 5 but for non-latin comma and no space afterwards
-    html = html.replace(/ ,/g, ',');
+    html = html.replace(/ ([,.!?])/g, '$1 ');
 
-    doc.body.innerHTML = html;
+    // Remove any spaces between a full stop or exclamation mark and a closing tag.
+    html = html.replace(/([.!]) +<\//g, '$1</');
+
     return {
         extract: doc.body.textContent,
         extract_html: html
diff --git a/test/lib/transformations/summarize.js 
b/test/lib/transformations/summarize.js
index ec25458..9503c0c 100644
--- a/test/lib/transformations/summarize.js
+++ b/test/lib/transformations/summarize.js
@@ -8,6 +8,11 @@
 describe('summarize', () => {
     it('matches the spec', () => {
         const testCases = [
+            // Should not leave stray spaces before punctuation after stripping parentheticals
+            [
+                '<p>Justice League (both 2017). Justice (is forever)!</p>',
+                '<p>Justice League. Justice!</p>'
+            ],
             // Should remove unwanted elements
             [
                 '.<style>f</style><object>o</object><script>o</script>.',
@@ -65,8 +70,8 @@
             ],
             // math tags are stripped but any math images are shown
             [
-                '<p>The Planck–Einstein relation connects the particulate 
photon energy <span class="texhtml "><i>E</i></span> with its associated wave 
frequency <span class="texhtml "><i>f</i></span>:</p>\n\n<dl id="mwmQ"><dd 
id="mwmg"><span class="mwe-math-element"><span class="mwe-math-mathml-inline 
mwe-math-mathml-a11y" style="display: none;"><math 
xmlns="http://www.w3.org/1998/Math/MathML">\n  <semantics>\n    <mrow 
class="MJX-TeXAtom-ORD">\n      <mstyle displaystyle="true" scriptlevel="0">\n  
      <mi>E</mi>\n        <mo>=</mo>\n        <mi>h</mi>\n        <mi>f</mi>\n  
    </mstyle>\n    </mrow>\n    <annotation 
encoding="application/x-tex">{\\displaystyle E=hf}</annotation>\n  
</semantics>\n</math></span><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6"
 class="mwe-math-fallback-image-inline" aria-hidden="true" 
style="vertical-align: -0.671ex; width:7.533ex; 
height:2.509ex;"></span></dd></dl>',
-                '<p>The Planck–Einstein relation connects the particulate 
photon energy <span class="texhtml "><i>E</i></span> with its associated wave 
frequency <span class="texhtml "><i>f</i></span>:</p>\n\n<dl><dd><span 
class="mwe-math-element"><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6"
 class="mwe-math-fallback-image-inline" aria-hidden="true" 
style="vertical-align: -0.671ex; width:7.533ex; 
height:2.509ex;"></span></dd></dl>'
+                '<p>The Planck–Einstein relation connects the particulate 
photon energy <span class="texhtml "><i>E</i></span> with its associated wave 
frequency <span class="texhtml "><i>f</i></span>:</p><dl id="mwmQ"><dd 
id="mwmg"><span class="mwe-math-element"><span class="mwe-math-mathml-inline 
mwe-math-mathml-a11y" style="display: none;"><math 
xmlns="http://www.w3.org/1998/Math/MathML">  <semantics>    <mrow 
class="MJX-TeXAtom-ORD">      <mstyle displaystyle="true" scriptlevel="0">      
  <mi>E</mi>        <mo>=</mo>        <mi>h</mi>        <mi>f</mi>      
</mstyle>    </mrow>    <annotation 
encoding="application/x-tex">{\\displaystyle E=hf}</annotation>  
</semantics></math></span><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6"
 class="mwe-math-fallback-image-inline" aria-hidden="true" 
style="vertical-align: -0.671ex; width:7.533ex; 
height:2.509ex;"></span></dd></dl>',
+                '<p>The Planck–Einstein relation connects the particulate 
photon energy <span class="texhtml "><i>E</i></span> with its associated wave 
frequency <span class="texhtml "><i>f</i></span>:</p><dl><dd><span 
class="mwe-math-element"><img 
src="https://wikimedia.org/api/rest_v1/media/math/render/svg/f39fac3593bb1e2dec0282c112c4dff7a99007f6"
 class="mwe-math-fallback-image-inline" aria-hidden="true" 
style="vertical-align: -0.671ex; width:7.533ex; 
height:2.509ex;"></span></dd></dl>'
             ],
             // Parentheticals will be stripped
             [
@@ -169,7 +174,7 @@
             ]
         ];
         testCases.forEach((test) => {
-            assert.equal(summarize(test[0]).extract_html, test[1], test[2]);
+            assert.equal(summarize(test[0]).extract_html.replace(/\n/g, ''), 
test[1], test[2]);
         });
     });
 });

-- 
To view, visit https://gerrit.wikimedia.org/r/404882
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I20d786e3a46c06fdd47a9faf7a50dcf56267b531
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/mobileapps
Gerrit-Branch: master
Gerrit-Owner: Jdlrobson <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to