Jean-Frédéric has uploaded a new change for review.
https://gerrit.wikimedia.org/r/287792
Change subject: Strip wikitext comments out of parsed values in templates
......................................................................
Strip wikitext comments out of parsed values in templates
The monument lists may contain HTML-style comments, such as
| lon = 2.51058<!--coordenades de patmapa ajustades automàticament-->
in the Catalan lists.
The comments should be removed: in general because they pollute the data,
and in particular because they break latitude/longitude processing.
- Extract method sanitize_wikitext_string
from extract_elements_from_template_param
- Add to sanitize_wikitext_string the removal of HTML comments
using a regular expression
- Add unit tests for new method sanitize_wikitext_string
Bug: T134727
Change-Id: I03407520f649a842527467cb3b77d3756e100f08
---
M erfgoedbot/converters.py
M tests/test_converters.py
2 files changed, 46 insertions(+), 4 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/labs/tools/heritage
refs/changes/92/287792/1
diff --git a/erfgoedbot/converters.py b/erfgoedbot/converters.py
index 2f8a95b..05b1885 100644
--- a/erfgoedbot/converters.py
+++ b/erfgoedbot/converters.py
@@ -78,5 +78,13 @@
(field, _, value) = template_param.partition(u'=')
# Remove leading or trailing spaces
field = field.strip()
+ return (field, sanitize_wikitext_string(value))
+
+
+def sanitize_wikitext_string(value):
+
+ """Removes undesirable wikitext features from a string."""
+
value = value.split("<ref")[0].strip()
- return (field, value)
+ value = re.sub(r"\s?<!--.*?-->", '', value)
+ return value
diff --git a/tests/test_converters.py b/tests/test_converters.py
index f9933d0..a081af3 100644
--- a/tests/test_converters.py
+++ b/tests/test_converters.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
"""Unit tests for converters."""
import unittest
@@ -6,6 +7,7 @@
extractWikilink,
extract_elements_from_template_param,
remove_commons_category_prefix,
+ sanitize_wikitext_string,
CH1903Converter
)
@@ -105,7 +107,39 @@
expected = (u'id', u'identifiant')
self.assertEquals(extract_elements_from_template_param(input_value),
expected)
- def test_extract_elements_from_template_param_with_reference(self):
- input_value = 'name=My monument name<ref>Serious reference</ref>'
- expected = (u'name', u'My monument name')
+ def test_extract_elements_from_template_param_with_spaces(self):
+ input_value = 'id = identifiant'
+ expected = (u'id', u'identifiant')
self.assertEquals(extract_elements_from_template_param(input_value),
expected)
+
+
+class TestSanitizeWikitextString(unittest.TestCase):
+
+ """Test the sanitize_wikitext_string method."""
+
+ def test_sanitize_wikitext_string_empty_string(self):
+ self.assertEquals(sanitize_wikitext_string(''), '')
+
+ def test_sanitize_wikitext_string_no_features(self):
+ input_value = 'My monument name'
+ expected = u'My monument name'
+ self.assertEquals(sanitize_wikitext_string(input_value), expected)
+
+ def test_sanitize_wikitext_string_with_reference_at_the_end(self):
+ input_value = 'My monument name<ref>Serious reference</ref>'
+ expected = u'My monument name'
+ self.assertEquals(sanitize_wikitext_string(input_value), expected)
+
+ def test_sanitize_wikitext_string_with_reused_reference_at_the_end(self):
+ input_value = 'My monument name<ref name="refA"/>'
+ expected = u'My monument name'
+ self.assertEquals(sanitize_wikitext_string(input_value), expected)
+
+ def test_sanitize_wikitext_string_with_comment_at_the_end(self):
+ input_value = u'2.51058<!--coordenades de patmapa ajustades automàticament-->'
+ self.assertEquals(sanitize_wikitext_string(input_value), '2.51058')
+
+ def test_sanitize_wikitext_string_with_comment_in_the_middle(self):
+ input_value = u'A monument <!-- apparently --> somewhere'
+ expected = u'A monument somewhere'
+ self.assertEquals(sanitize_wikitext_string(input_value), expected)
--
To view, visit https://gerrit.wikimedia.org/r/287792
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I03407520f649a842527467cb3b77d3756e100f08
Gerrit-PatchSet: 1
Gerrit-Project: labs/tools/heritage
Gerrit-Branch: master
Gerrit-Owner: Jean-Frédéric <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits