Subramanya Sastry has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/384171 )
Change subject: Update reverse interwiki map to prefer language prefixes over
others
......................................................................
Update reverse interwiki map to prefer language prefixes over others
* Updated a bunch of parser tests to reflect the change.
Bug: T177784
Change-Id: I5cf93950a6da69263fb9da59fba2b33cc2e8931f
---
M lib/config/WikiConfig.js
M tests/parserTests-blacklist.js
M tests/parserTests.txt
3 files changed, 38 insertions(+), 21 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/71/384171/1
diff --git a/lib/config/WikiConfig.js b/lib/config/WikiConfig.js
index e9f8e48..52e1c4f 100644
--- a/lib/config/WikiConfig.js
+++ b/lib/config/WikiConfig.js
@@ -232,14 +232,12 @@
}
});
- var cachedMatcher = null;
- this.interWikiMatcher = function() {
- if (cachedMatcher) {
- return cachedMatcher;
- }
- var keys = [];
- var patterns = [];
+ var updatePatterns = function(keys, patterns, filter) {
conf.interwikiMap.forEach(function(val, key) {
+ if (!filter(val)) {
+ return;
+ }
+
var url = val.url;
var protocolRelative = url.startsWith('//');
if (val.protorel !== undefined) {
@@ -271,6 +269,20 @@
patterns.push('^' + val.prefix + '%3A(.*?)');
}
});
+ }
+
+ var cachedMatcher = null;
+ this.interWikiMatcher = function() {
+ if (cachedMatcher) {
+ return cachedMatcher;
+ }
+ var keys = [];
+ var patterns = [];
+ // For html -> wt reverse mapping, prefer language interwiki prefixes
+ // over other interwiki prefixes. So, use "en" instead of "wikipedia"
+ // for English wikipedia interwiki links.
+ updatePatterns(keys, patterns, function(val) { return !!val.language; });
+ updatePatterns(keys, patterns, function(val) { return !val.language; });
var reString = '^(?:' + patterns.join('|') + ')$';
var regExp = new RegExp(reString, 'i');
var matchFunc = function(s) {
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 0c418d9..0cd0c61 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -358,6 +358,7 @@
add("html2html", "Internal link with is link prefix", "<p
data-parsoid='{\"dsr\":[0,45,0,0]}'>Aðrir <a rel=\"mw:WikiLink\"
href=\"./Wiki/Söfnuður\" title=\"Wiki/Söfnuður\"
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Söfnuður\"},\"sa\":{\"href\":\"wiki/Söfnuður\"},\"dsr\":[6,42,16,2]}'>mótmælendasöfnuðir</a>
og</p>\n");
add("html2html", "Internal link with is link trail and link prefix", "<p
data-parsoid='{\"dsr\":[0,181,0,0]}'><a rel=\"mw:WikiLink\"
href=\"./Wiki/Mótmælendatrú\" title=\"Wiki/Mótmælendatrú\"
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Mótmælendatrú\"},\"sa\":{\"href\":\"wiki/Mótmælendatrú\"},\"dsr\":[0,28,21,2]}'>xxxar</a>\n<a
rel=\"mw:WikiLink\" href=\"./Wiki/Mótmælendatrú\" title=\"Wiki/Mótmælendatrú\"
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Mótmælendatrú\"},\"sa\":{\"href\":\"wiki/Mótmælendatrú\"},\"dsr\":[29,67,21,2]}'>mótmælendatrúar</a>\n<a
rel=\"mw:WikiLink\" href=\"./Wiki/Söfnuður\" title=\"Wiki/Söfnuður\"
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Söfnuður\"},\"sa\":{\"href\":\"wiki/Söfnuður\"},\"dsr\":[68,104,16,2]}'>mótmælendasöfnuður</a>\n<a
rel=\"mw:WikiLink\" href=\"./Wiki/Söfnuður\" title=\"Wiki/Söfnuður\"
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Söfnuður\"},\"sa\":{\"href\":\"wiki/Söfnuður\"},\"dsr\":[105,141,16,2]}'>mótmælendasöfnuðir</a>\n<a
rel=\"mw:WikiLink\" href=\"./Wiki/Söfnuður\" title=\"Wiki/Söfnuður\"
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Söfnuður\"},\"sa\":{\"href\":\"wiki/Söfnuður\"},\"dsr\":[142,181,16,2]}'>mótmælendasöfnuðirxxx</a></p>\n");
add("html2html", "Parsoid-centric test: Whitespace in ext- and wiki-links
should be preserved", "<p data-parsoid='{\"dsr\":[0,18,0,0]}'><a
rel=\"mw:WikiLink\" href=\"./Wiki/Foo\" title=\"Wiki/Foo\"
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Foo\"},\"sa\":{\"href\":\"wiki/Foo\"},\"dsr\":[0,18,11,2]}'>
bar</a></p>\n\n<p data-parsoid='{\"dsr\":[20,42,0,0]}'><a rel=\"mw:WikiLink\"
href=\"./Wiki/Foo\" title=\"Wiki/Foo\"
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Foo\"},\"sa\":{\"href\":\"wiki/Foo\"},\"dsr\":[20,42,11,2]}'>
<i data-parsoid='{\"dsr\":[33,40,2,2]}'>bar</i></a></p>\n\n<p
data-parsoid='{\"dsr\":[44,63,0,0]}'><a rel=\"mw:ExtLink\"
href=\"http://wp.org\"
data-parsoid='{\"targetOff\":59,\"contentOffsets\":[59,62],\"dsr\":[44,63,15,1]}'>foo</a></p>\n\n<p
data-parsoid='{\"dsr\":[65,88,0,0]}'><a rel=\"mw:ExtLink\"
href=\"http://wp.org\"
data-parsoid='{\"targetOff\":80,\"contentOffsets\":[80,87],\"dsr\":[65,88,15,1]}'><i
data-parsoid='{\"dsr\":[80,87,2,2]}'>foo</i></a></p>\n");
+add("html2html", "Interwiki link encoding conversion (T3636)\n+!!
options\n+parsoid=wt2html,wt2wt\n+## html2wt and html2html will fail because we
will prefer the :en: interwiki prefix over wikipedia:", "<ul
data-parsoid='{\"dsr\":[0,87,0,0]}'><li data-parsoid='{\"dsr\":[0,43,1,0]}'> <a
rel=\"mw:ExtLink\" href=\"//en.wikipedia.org/wiki/ro:Olteniţa\"
title=\"en:ro:Olteniţa\"
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"//en.wikipedia.org/wiki/ro:Olteniţa\"},\"sa\":{\"href\":\":en:ro:Olteniţa\"},\"isIW\":true,\"dsr\":[2,43,18,2]}'>Wikipedia:ro:Olteniţa</a></li>\n<li
data-parsoid='{\"dsr\":[44,87,1,0]}'> <a rel=\"mw:ExtLink\"
href=\"//en.wikipedia.org/wiki/ro:Olteniţa\" title=\"en:ro:Olteniţa\"
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"//en.wikipedia.org/wiki/ro:Olteniţa\"},\"sa\":{\"href\":\":en:ro:Olteniţa\"},\"isIW\":true,\"dsr\":[46,87,18,2]}'>Wikipedia:ro:Olteniţa</a></li></ul>\n");
add("html2html", "Space and question mark encoding in interlanguage links
(T95473)", "<p data-parsoid='{\"dsr\":[0,14,0,0]}'>Blah blah blah</p>\n<link
rel=\"mw:PageProp/Language\" href=\"http://es.wikipedia.org/wiki/Foo_bar?\"
data-parsoid='{\"stx\":\"simple\",\"a\":{\"href\":\"http://es.wikipedia.org/wiki/Foo_bar?\"},\"sa\":{\"href\":\"es:Foo_bar?\"},\"dsr\":[15,30,null,null]}'/>");
add("html2html", "Parsoid-specific test: Wikilinks with should RT
properly", "<p
data-parsoid='{\"dsr\":[0,52,0,0]}'>[/index.php?title=WW_II&action=edit&redlink=1
WW II]</p>\n");
add("html2html", "<br> to <br />", "<p
data-parsoid='{\"dsr\":[0,5,0,0]}'>1\n2\n3</p>\n");
@@ -652,7 +653,7 @@
add("html2wt", "Internal link with is link prefix", "Aðrir
[[wiki/Söfnuður|mótmælendasöfnuðir]] og\n");
add("html2wt", "Internal link with is link trail and link prefix",
"[[wiki/Mótmælendatrú|xxxar]]\n[[wiki/Mótmælendatrú|mótmælendatrúar]]\n[[wiki/Söfnuður|mótmælendasöfnuður]]\n[[wiki/Söfnuður|mótmælendasöfnuðir]]\n[[wiki/Söfnuður|mótmælendasöfnuðirxxx]]\n");
add("html2wt", "Parsoid-centric test: Whitespace in ext- and wiki-links should
be preserved", "[[wiki/Foo| bar]]\n\n[[wiki/Foo| ''bar'']]\n\n[http://wp.org
foo]\n\n[http://wp.org ''foo'']\n");
-add("html2wt", "Interwiki link encoding conversion (T3636)", "*
[[wikipedia:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n*
[[wikipedia:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n");
+add("html2wt", "Interwiki link encoding conversion (T3636)\n+!!
options\n+parsoid=wt2html,wt2wt\n+## html2wt and html2html will fail because we
will prefer the :en: interwiki prefix over wikipedia:", "*
[[:en:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n*
[[:en:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n");
add("html2wt", "Interwiki link with fragment (T4130)",
"[[meatball:SoftSecurity#foo|MeatBall:SoftSecurity#foo]]\n");
add("html2wt", "Escaping of interlanguage links (T129218, T156308)", "Blah
blah blah\n[[:es:Spanish]]\n[[:zh:Chinese| zh : Chinese ]]\n");
add("html2wt", "Parsoid-specific test: Wikilinks with should RT
properly", "[/index.php?title=WW_II&action=edit&redlink=1 WW II]\n");
@@ -1158,8 +1159,8 @@
add("selser", "External link containing double-single-quotes with no space
separating the url from text in italics [[1,3,0]]",
"[http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm ''La muerte de
Casagemas'' (1901) en el sitio de ]\n");
add("selser", "External link containing double-single-quotes with no space
separating the url from text in italics [[4,0,3]]", "1jnda7a\n");
add("selser", "External link containing double-single-quotes with no space
separating the url from text in italics [[[1,2],2,4]]",
"[http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm ''La muerte de
Casagemas''1svf0oe (1901) en el sitio de ]mqtmyg6n94l9");
-add("selser", "mw:ExtLink linking to a interwiki URL can be round-tripped
losslessly (T94723) [[[4]]]", "[[wikipedia:European_Robin|1rmduf6]]");
-add("selser", "mw:ExtLink linking to a interwiki URL can be round-tripped
losslessly (T94723) [[[2]]]", "[[wikipedia:European_Robin|134iwocEuropean
Robin]]");
+add("selser", "mw:ExtLink linking to a interwiki URL can be round-tripped
losslessly (T94723) [[[4]]]", "[[:en:European_Robin|1rmduf6]]");
+add("selser", "mw:ExtLink linking to a interwiki URL can be round-tripped
losslessly (T94723) [[[2]]]", "[[:en:European_Robin|134iwocEuropean Robin]]");
add("selser", "Unclosed and unmatched quotes
[[[0,0,4]],2,3,3,[1],0,4,0,1,2,[[2]],3,3,3,2,0,3,4,1,2,2]", "'''''Bold italic
text '''with bold deactivatedmik1b''\n\n194mnir\n\n'''Bold
text..'''\n\n1vh6b8p\n\n'''Bold tag left open\n\n118iayy\n\n''hxqm6fItalic tag
left open''\n\n8zfmbl<!-- Unmatching number of opening, closing tags:
-->\n\nv7yj6u\n\n''Tom'''s car is bigger than
'''''<nowiki/>'''Susan'''s.\n\n1suii2h\n\n1qt3jiw\n\nPlain ''italic'''s plain");
add("selser", "Unclosed and unmatched quotes
[[1],0,[1],0,[3],0,[2,2],0,4,0,4,2,4,0,3,0,4,0,[[0,[4]],0,3],0,4]", "'''''Bold
italic text '''with bold deactivated''' in between.'''''\n\n'''''Bold italic
text ''with italic deactivated'' in between.'''''\n\n1vbvxxl..spanning two
paragraphs (should not
work).6tygj0'''\n\n750fcg\n\n1ke2xol\n\nqpzby4\n\n1fztsq9\n\n1qf0akm\n\n''Tom'''3ftppf'''''<nowiki/>'''Susan'''\n\namwrge\n");
add("selser", "Unclosed and unmatched quotes
[[[3,0,[3]]],0,[1],0,2,2,3,0,[4],4,[4],0,0,3,0,4,[2,3,3,4,3,0],0,4,2,[0,3,4]]",
"''with bold deactivated'''<nowiki/>'''''\n\n'''''Bold italic text ''with
italic deactivated'' in between.'''''\n\n1o7p7gt\n\n'''Bold
text..\n\nypestg\n\n9sn2o4\n\n1clpd1j\n\n1933mb7\n\nNormal text.<!-- Unmatching
number of opening, closing tags: -->\n\nr7rcfr\n\n9vn9he'''This
year''''1bex21s.\n\n23segf\n\n15xu7jn\n\nPlain 1038n5m");
@@ -1311,13 +1312,11 @@
add("selser", "Parsoid-centric test: Whitespace in ext- and wiki-links should
be preserved [3,4,2,2,2,0,4]", "1ft87cu\n\n13xu2qq\n\n[[Foo|
''bar'']]\n\n15lnhl6\n\n1djvh1q\n\n[http://wp.org foo]\n\n1rzfwwg\n");
add("selser", "Parsoid-centric test: Whitespace in ext- and wiki-links should
be preserved [0,2,1,3,[[4]],3,0]", "[[Foo| bar]]\n\np3atih\n\n[[Foo|
''bar'']]\n\n[http://wp.org rop4jb]\n\n[http://wp.org ''foo'']");
add("selser", "Parsoid-centric test: Whitespace in ext- and wiki-links should
be preserved [3,4,4,2,2,4,0]",
"1bj9bbq\n\n11icu1m\n\n1f2hrph\n\n1d06nta\n\n[http://wp.org
foo]\n\n1npgg6a\n\n[http://wp.org ''foo'']");
-add("selser", "Interwiki link encoding conversion (T3636) [[[3],2,[2]]]",
"*\n* edo4is\n*1k978vr[[Wikipedia:ro:Olteniţa]]");
-add("selser", "Different interwiki prefixes mapping to the same URL
[2,0,[4],0,2,4,3,0,3,3,[2],2,[4]]",
"1cmr4k9\n\n[[:en:Foo]]\n\nhvdis9\n\na1dr3q\n\n[[wikipedia:Foo]]\n\ne49fuw\n\nnsemot[[wikipedia:en:Foo]]\n\nm67gy7\n\n1jea487");
-add("selser", "Different interwiki prefixes mapping to the same URL
[3,4,0,0,[1],4,3,0,3,0,[2],0,3]",
"1nj1lgj\n\n[[:en:Foo|Foo]]\n\n[[wikipedia:Foo]]\n\n1iwxfz6\n\n1v5pb3c[[wikipedia:en:Foo]]\n");
-add("selser", "Different interwiki prefixes mapping to the same URL
[4,3,[4],4,2,0,[1],2,1,0,[2],0,3]",
"1niy9um\n\n1rj1oh1\n\n17bubmt\n\nwa0uv2\n\n[[wikipedia:Foo]]\n\n[[:wikipedia:Foo|Foo]]\n\n1gcjzpc\n\n[[wikipedia:en:Foo]]\n\n1x968no[[wikipedia:en:Foo]]\n");
-add("selser", "Different interwiki prefixes mapping to the same URL
[4,4,1,4,[3],4,[[4]],0,4,0,[1],3,[4]]",
"1ts2wsm\n\n1ha756h\n\n[[:en:Foo|Foo]]\n\n1oy27y4\n\n1wctm47\n\n[[:wikipedia:Foo|3wyj0g]]\n\n13tzutw\n\n[[wikipedia:en:Foo]]\n\ndrkjrg");
-add("selser", "Different interwiki prefixes mapping to the same URL
[[4],0,[3],0,0,2,[2],0,[4],0,[2],4,[[2]]]",
"vp9v9k\n\n[[wikipedia:Foo]]\n\n1dr7otu\n\n1ws4mui[[:wikipedia:Foo|Foo]]\n\n1szk0ja\n\nkvaw0k[[wikipedia:en:Foo]]\n\n1cbqxcb\n\n[[
wikiPEdia :Foo|qzsy1a wikiPEdia :Foo]]");
-add("selser", "Different interwiki prefixes mapping to the same URL
[1,3,1,4,[[4]],0,3,2,[3],0,1,0,3]",
"[[:en:Foo]]\n\n[[:en:Foo|Foo]]\n\n1qaad10\n\n[[wikipedia:Foo|zc9a9s]]\n\n1kh1ssv\n\n[[wikipedia:en:Foo]]\n");
+add("selser", "Interwiki link encoding conversion (T3636)\n+!!
options\n+parsoid=wt2html,wt2wt\n+## html2wt and html2html will fail because we
will prefer the :en: interwiki prefix over wikipedia: [[0,4,[2]]]",
"*[[Wikipedia:ro:Olteniţa]]\n*
1nfwe0o\n*13bu3xr[[:Wikipedia:ro:Olteniţa]]");
+add("selser", "Different interwiki prefixes mapping to the same URL
[[1],0,4,0,1,0,3,0,[1],0,[3],0,[1]]",
"[[:en:Foo]]\n\nv7i85g\n\n[[wikipedia:Foo]]\n\n[[:wikipedia:en:Foo]]\n\n[[:
wikiPEdia :Foo]]\n");
+add("selser", "Different interwiki prefixes mapping to the same URL
[4,0,[[3]],0,1,0,[[4]],2,0,0,[1],4,1]",
"1l14vkx\n\n[:en:Foo]\n\n[[wikipedia:Foo]]\n\n[[:wikipedia:Foo|nbuvsk]]\n\nk4ccf8\n\n[[wikipedia:en:Foo]]\n\n[[:wikipedia:en:Foo]]\n\n1gr9ugr\n\n[[:
wikiPEdia :Foo]]");
+add("selser", "Different interwiki prefixes mapping to the same URL
[2,0,[[4]],3,3,0,[[3]],0,1,3,0,3,[2]]",
"1belm8p\n\n[[:en:Foo]]\n\n[[:en:Foo|jvi0fn]]\n\n[:wikipedia:Foo]\n\n[[:wikipedia:en:Foo]]\n\n[[:wikipedia:en:Foo]]\n\nrqndks[[
wikiPEdia :Foo]]");
+add("selser", "Different interwiki prefixes mapping to the same URL
[4,3,[[2]],2,[2],0,4,4,[[4]],4,[[2]],3,[4]]",
"1vnl7bc\n\n[[:en:Foo|1mo1igaFoo]]\n\n15p8av9\n\nt1tnk7[[:wikipedia:Foo]]\n\nik65u\n\ncp91v8\n\n[[wikipedia:en:Foo|vzo3uh]]\n\n1hyadw9\n\n[[:wikipedia:en:Foo|11hwcmxwikipedia:en:Foo]]\n\n8ja2ps");
add("selser", "Parsoid: recognize interwiki links without a target page
[2,2,1]", "13h7yqp\n\n[[:es:]]\n\nn76wcf\n\n[[ko:]]");
add("selser", "Parsoid: recognize interwiki links without a target page
[[2],2,1]", "1fz9jlc[[:es:]]\n\n2alzzr\n\n[[ko:]]");
add("selser", "Parsoid: recognize interwiki links without a target page
[1,4,1]", "[[:es:]]\n\neviwhn\n\n[[ko:]]");
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index 7cff659..a9140fe 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -4835,6 +4835,9 @@
!! test
External links: with no contents
+!! options
+parsoid=wt2html,wt2wt
+## html2wt and html2html will fail because we will prefer the :en: interwiki prefix over wikipedia:
!! wikitext
[http://en.wikipedia.org/wiki/Foo]
@@ -5962,11 +5965,11 @@
!! wikitext
[[Foo|Bar]]
[[Foo|Bar]]
-[[wikipedia:Foo|Bar]]
-[[wikipedia:Foo|Bar]]
+[[:en:Foo|Bar]]
+[[:en:Foo|Bar]]
-[[wikipedia:European_Robin|European Robin]]
-[[wikipedia:European_Robin|European Robin]]
+[[:en:European_Robin|European Robin]]
+[[:en:European_Robin|European Robin]]
!! end
!! test
@@ -8613,6 +8616,9 @@
!! test
Interwiki link encoding conversion (T3636)
++!! options
++parsoid=wt2html,wt2wt
++## html2wt and html2html will fail because we will prefer the :en: interwiki prefix over wikipedia:
!! wikitext
*[[Wikipedia:ro:Olteniţa]]
*[[Wikipedia:ro:Olteniţa]]
--
To view, visit https://gerrit.wikimedia.org/r/384171
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I5cf93950a6da69263fb9da59fba2b33cc2e8931f
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits