jenkins-bot has submitted this change and it was merged.
Change subject: Strip wikitext comments out of parsed values in templates
......................................................................
Strip wikitext comments out of parsed values in templates
The monuments lists may contain HTML-style comments, such as
| lon = 2.51058<!--coordenades de patmapa ajustades automàticament-->
in the Catalan lists.
These comments should be removed: in general because they pollute the data,
and in particular because they break latitude/longitude processing.
- Extract method sanitize_wikitext_string
from extract_elements_from_template_param
- Add to sanitize_wikitext_string the removal of HTML comments
using a regular expression
- Add unit tests for new method sanitize_wikitext_string
Bug: T134727
Change-Id: I03407520f649a842527467cb3b77d3756e100f08
---
M erfgoedbot/converters.py
M tests/test_converters.py
2 files changed, 48 insertions(+), 4 deletions(-)
Approvals:
Lokal Profil: Looks good to me, approved
jenkins-bot: Verified
diff --git a/erfgoedbot/converters.py b/erfgoedbot/converters.py
index 2f8a95b..8087356 100644
--- a/erfgoedbot/converters.py
+++ b/erfgoedbot/converters.py
@@ -78,5 +78,13 @@
(field, _, value) = template_param.partition(u'=')
# Remove leading or trailing spaces
field = field.strip()
+ return (field, sanitize_wikitext_string(value))
+
+
+def sanitize_wikitext_string(value):
+
+ """Removes undesirable wikitext features from a string."""
+
value = value.split("<ref")[0].strip()
- return (field, value)
+ value = re.sub(r"\s?<!--.*?-->\s?", ' ', value)
+ return value.strip()
diff --git a/tests/test_converters.py b/tests/test_converters.py
index f9933d0..20ecb45 100644
--- a/tests/test_converters.py
+++ b/tests/test_converters.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
"""Unit tests for converters."""
import unittest
@@ -6,6 +7,7 @@
extractWikilink,
extract_elements_from_template_param,
remove_commons_category_prefix,
+ sanitize_wikitext_string,
CH1903Converter
)
@@ -105,7 +107,41 @@
expected = (u'id', u'identifiant')
self.assertEquals(extract_elements_from_template_param(input_value),
expected)
- def test_extract_elements_from_template_param_with_reference(self):
- input_value = 'name=My monument name<ref>Serious reference</ref>'
- expected = (u'name', u'My monument name')
+ def test_extract_elements_from_template_param_with_spaces(self):
+ input_value = 'id = identifiant'
+ expected = (u'id', u'identifiant')
self.assertEquals(extract_elements_from_template_param(input_value),
expected)
+
+
+class TestSanitizeWikitextString(unittest.TestCase):
+
+ """Test the sanitize_wikitext_string method."""
+
+ def test_sanitize_wikitext_string_empty_string(self):
+ self.assertEquals(sanitize_wikitext_string(''), '')
+
+ def test_sanitize_wikitext_string_no_features(self):
+ input_value = 'My monument name'
+ expected = u'My monument name'
+ self.assertEquals(sanitize_wikitext_string(input_value), expected)
+
+ def test_sanitize_wikitext_string_with_reference_at_the_end(self):
+ input_value = 'My monument name<ref>Serious reference</ref>'
+ expected = u'My monument name'
+ self.assertEquals(sanitize_wikitext_string(input_value), expected)
+
+ def test_sanitize_wikitext_string_with_reused_reference_at_the_end(self):
+ input_value = 'My monument name<ref name="refA"/>'
+ expected = u'My monument name'
+ self.assertEquals(sanitize_wikitext_string(input_value), expected)
+
+ def test_sanitize_wikitext_string_with_comment_at_the_end(self):
+ input_value = u'2.51058<!--coordenades de patmapa ajustades automàticament-->'
+ self.assertEquals(sanitize_wikitext_string(input_value), '2.51058')
+ self.assertEquals(sanitize_wikitext_string(u'Aaa Ccc <!-- B -->'), u'Aaa Ccc')
+
+ def test_sanitize_wikitext_string_with_comment_in_the_middle(self):
+ expected = u'Aaa Ccc'
+ self.assertEquals(sanitize_wikitext_string(u'Aaa <!-- B -->Ccc'), expected)
+ self.assertEquals(sanitize_wikitext_string(u'Aaa <!-- B b b --> Ccc'), expected)
+ self.assertEquals(sanitize_wikitext_string(u'Aaa<!-- B b b --> Ccc'), expected)
--
To view, visit https://gerrit.wikimedia.org/r/287792
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I03407520f649a842527467cb3b77d3756e100f08
Gerrit-PatchSet: 3
Gerrit-Project: labs/tools/heritage
Gerrit-Branch: master
Gerrit-Owner: Jean-Frédéric <[email protected]>
Gerrit-Reviewer: Jean-Frédéric <[email protected]>
Gerrit-Reviewer: Lokal Profil <[email protected]>
Gerrit-Reviewer: Multichill <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits