Subramanya Sastry has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/384171 )

Change subject: Update reverse interwiki map to prefer language prefixes over others
......................................................................

Update reverse interwiki map to prefer language prefixes over others

* Updated a bunch of parser tests to reflect the change.

Bug: T177784
Change-Id: I5cf93950a6da69263fb9da59fba2b33cc2e8931f
---
M lib/config/WikiConfig.js
M tests/parserTests-blacklist.js
M tests/parserTests.txt
3 files changed, 38 insertions(+), 21 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/71/384171/1

diff --git a/lib/config/WikiConfig.js b/lib/config/WikiConfig.js
index e9f8e48..52e1c4f 100644
--- a/lib/config/WikiConfig.js
+++ b/lib/config/WikiConfig.js
@@ -232,14 +232,12 @@
                }
        });
 
-       var cachedMatcher = null;
-       this.interWikiMatcher = function() {
-               if (cachedMatcher) {
-                       return cachedMatcher;
-               }
-               var keys = [];
-               var patterns = [];
+       var updatePatterns = function(keys, patterns, filter) {
                conf.interwikiMap.forEach(function(val, key) {
+                       if (!filter(val)) {
+                               return;
+                       }
+
                        var url = val.url;
                        var protocolRelative = url.startsWith('//');
                        if (val.protorel !== undefined) {
@@ -271,6 +269,20 @@
                                patterns.push('^' + val.prefix + '%3A(.*?)');
                        }
                });
+       }
+
+       var cachedMatcher = null;
+       this.interWikiMatcher = function() {
+               if (cachedMatcher) {
+                       return cachedMatcher;
+               }
+               var keys = [];
+               var patterns = [];
+               // For html -> wt reverse mapping, prefer language interwiki prefixes
+               // over other interwiki prefixes. So, use "en" instead of "wikipedia"
+               // for English wikipedia interwiki links.
+               updatePatterns(keys, patterns, function(val) { return !!val.language; });
+               updatePatterns(keys, patterns, function(val) { return !val.language; });
                var reString = '^(?:' + patterns.join('|') + ')$';
                var regExp = new RegExp(reString, 'i');
                var matchFunc = function(s) {
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 0c418d9..0cd0c61 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -358,6 +358,7 @@
 add("html2html", "Internal link with is link prefix", "<p 
data-parsoid='{\"dsr\":[0,45,0,0]}'>Aðrir <a rel=\"mw:WikiLink\" 
href=\"./Wiki/Söfnuður\" title=\"Wiki/Söfnuður\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Söfnuður\"},\"sa\":{\"href\":\"wiki/Söfnuður\"},\"dsr\":[6,42,16,2]}'>mótmælendasöfnuðir</a>
 og</p>\n");
 add("html2html", "Internal link with is link trail and link prefix", "<p 
data-parsoid='{\"dsr\":[0,181,0,0]}'><a rel=\"mw:WikiLink\" 
href=\"./Wiki/Mótmælendatrú\" title=\"Wiki/Mótmælendatrú\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Mótmælendatrú\"},\"sa\":{\"href\":\"wiki/Mótmælendatrú\"},\"dsr\":[0,28,21,2]}'>xxxar</a>\n<a
 rel=\"mw:WikiLink\" href=\"./Wiki/Mótmælendatrú\" title=\"Wiki/Mótmælendatrú\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Mótmælendatrú\"},\"sa\":{\"href\":\"wiki/Mótmælendatrú\"},\"dsr\":[29,67,21,2]}'>mótmælendatrúar</a>\n<a
 rel=\"mw:WikiLink\" href=\"./Wiki/Söfnuður\" title=\"Wiki/Söfnuður\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Söfnuður\"},\"sa\":{\"href\":\"wiki/Söfnuður\"},\"dsr\":[68,104,16,2]}'>mótmælendasöfnuður</a>\n<a
 rel=\"mw:WikiLink\" href=\"./Wiki/Söfnuður\" title=\"Wiki/Söfnuður\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Söfnuður\"},\"sa\":{\"href\":\"wiki/Söfnuður\"},\"dsr\":[105,141,16,2]}'>mótmælendasöfnuðir</a>\n<a
 rel=\"mw:WikiLink\" href=\"./Wiki/Söfnuður\" title=\"Wiki/Söfnuður\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Söfnuður\"},\"sa\":{\"href\":\"wiki/Söfnuður\"},\"dsr\":[142,181,16,2]}'>mótmælendasöfnuðirxxx</a></p>\n");
 add("html2html", "Parsoid-centric test: Whitespace in ext- and wiki-links 
should be preserved", "<p data-parsoid='{\"dsr\":[0,18,0,0]}'><a 
rel=\"mw:WikiLink\" href=\"./Wiki/Foo\" title=\"Wiki/Foo\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Foo\"},\"sa\":{\"href\":\"wiki/Foo\"},\"dsr\":[0,18,11,2]}'>
  bar</a></p>\n\n<p data-parsoid='{\"dsr\":[20,42,0,0]}'><a rel=\"mw:WikiLink\" 
href=\"./Wiki/Foo\" title=\"Wiki/Foo\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"./Wiki/Foo\"},\"sa\":{\"href\":\"wiki/Foo\"},\"dsr\":[20,42,11,2]}'>
  <i data-parsoid='{\"dsr\":[33,40,2,2]}'>bar</i></a></p>\n\n<p 
data-parsoid='{\"dsr\":[44,63,0,0]}'><a rel=\"mw:ExtLink\" 
href=\"http://wp.org\" 
data-parsoid='{\"targetOff\":59,\"contentOffsets\":[59,62],\"dsr\":[44,63,15,1]}'>foo</a></p>\n\n<p
 data-parsoid='{\"dsr\":[65,88,0,0]}'><a rel=\"mw:ExtLink\" 
href=\"http://wp.org\" 
data-parsoid='{\"targetOff\":80,\"contentOffsets\":[80,87],\"dsr\":[65,88,15,1]}'><i
 data-parsoid='{\"dsr\":[80,87,2,2]}'>foo</i></a></p>\n");
+add("html2html", "Interwiki link encoding conversion (T3636)\n+!! 
options\n+parsoid=wt2html,wt2wt\n+## html2wt and html2html will fail because we 
will prefer the :en: interwiki prefix over wikipedia:", "<ul 
data-parsoid='{\"dsr\":[0,87,0,0]}'><li data-parsoid='{\"dsr\":[0,43,1,0]}'> <a 
rel=\"mw:ExtLink\" href=\"//en.wikipedia.org/wiki/ro:Olteniţa\" 
title=\"en:ro:Olteniţa\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"//en.wikipedia.org/wiki/ro:Olteniţa\"},\"sa\":{\"href\":\":en:ro:Olteniţa\"},\"isIW\":true,\"dsr\":[2,43,18,2]}'>Wikipedia:ro:Olteniţa</a></li>\n<li
 data-parsoid='{\"dsr\":[44,87,1,0]}'> <a rel=\"mw:ExtLink\" 
href=\"//en.wikipedia.org/wiki/ro:Olteniţa\" title=\"en:ro:Olteniţa\" 
data-parsoid='{\"stx\":\"piped\",\"a\":{\"href\":\"//en.wikipedia.org/wiki/ro:Olteniţa\"},\"sa\":{\"href\":\":en:ro:Olteniţa\"},\"isIW\":true,\"dsr\":[46,87,18,2]}'>Wikipedia:ro:Olteniţa</a></li></ul>\n");
 add("html2html", "Space and question mark encoding in interlanguage links 
(T95473)", "<p data-parsoid='{\"dsr\":[0,14,0,0]}'>Blah blah blah</p>\n<link 
rel=\"mw:PageProp/Language\" href=\"http://es.wikipedia.org/wiki/Foo_bar?\" 
data-parsoid='{\"stx\":\"simple\",\"a\":{\"href\":\"http://es.wikipedia.org/wiki/Foo_bar?\"},\"sa\":{\"href\":\"es:Foo_bar?\"},\"dsr\":[15,30,null,null]}'/>");
 add("html2html", "Parsoid-specific test: Wikilinks with &nbsp; should RT 
properly", "<p 
data-parsoid='{\"dsr\":[0,52,0,0]}'>[/index.php?title=WW_II&amp;action=edit&amp;redlink=1
 WW II]</p>\n");
 add("html2html", "<br> to <br />", "<p 
data-parsoid='{\"dsr\":[0,5,0,0]}'>1\n2\n3</p>\n");
@@ -652,7 +653,7 @@
 add("html2wt", "Internal link with is link prefix", "Aðrir 
[[wiki/Söfnuður|mótmælendasöfnuðir]] og\n");
 add("html2wt", "Internal link with is link trail and link prefix", 
"[[wiki/Mótmælendatrú|xxxar]]\n[[wiki/Mótmælendatrú|mótmælendatrúar]]\n[[wiki/Söfnuður|mótmælendasöfnuður]]\n[[wiki/Söfnuður|mótmælendasöfnuðir]]\n[[wiki/Söfnuður|mótmælendasöfnuðirxxx]]\n");
 add("html2wt", "Parsoid-centric test: Whitespace in ext- and wiki-links should 
be preserved", "[[wiki/Foo|  bar]]\n\n[[wiki/Foo|  ''bar'']]\n\n[http://wp.org 
foo]\n\n[http://wp.org ''foo'']\n");
-add("html2wt", "Interwiki link encoding conversion (T3636)", "* 
[[wikipedia:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n* 
[[wikipedia:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n");
+add("html2wt", "Interwiki link encoding conversion (T3636)\n+!! 
options\n+parsoid=wt2html,wt2wt\n+## html2wt and html2html will fail because we 
will prefer the :en: interwiki prefix over wikipedia:", "* 
[[:en:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n* 
[[:en:ro:Olteniţa|Wikipedia:ro:Olteniţa]]\n");
 add("html2wt", "Interwiki link with fragment (T4130)", 
"[[meatball:SoftSecurity#foo|MeatBall:SoftSecurity#foo]]\n");
 add("html2wt", "Escaping of interlanguage links (T129218, T156308)", "Blah 
blah blah\n[[:es:Spanish]]\n[[:zh:Chinese| zh : Chinese ]]\n");
 add("html2wt", "Parsoid-specific test: Wikilinks with &nbsp; should RT 
properly", "[/index.php?title=WW_II&action=edit&redlink=1 WW II]\n");
@@ -1158,8 +1159,8 @@
 add("selser", "External link containing double-single-quotes with no space 
separating the url from text in italics [[1,3,0]]", 
"[http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm ''La muerte de 
Casagemas'' (1901) en el sitio de ]\n");
 add("selser", "External link containing double-single-quotes with no space 
separating the url from text in italics [[4,0,3]]", "1jnda7a\n");
 add("selser", "External link containing double-single-quotes with no space 
separating the url from text in italics [[[1,2],2,4]]", 
"[http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm ''La muerte de 
Casagemas''1svf0oe (1901) en el sitio de ]mqtmyg6n94l9");
-add("selser", "mw:ExtLink linking to a interwiki URL can be round-tripped 
losslessly (T94723) [[[4]]]", "[[wikipedia:European_Robin|1rmduf6]]");
-add("selser", "mw:ExtLink linking to a interwiki URL can be round-tripped 
losslessly (T94723) [[[2]]]", "[[wikipedia:European_Robin|134iwocEuropean 
Robin]]");
+add("selser", "mw:ExtLink linking to a interwiki URL can be round-tripped 
losslessly (T94723) [[[4]]]", "[[:en:European_Robin|1rmduf6]]");
+add("selser", "mw:ExtLink linking to a interwiki URL can be round-tripped 
losslessly (T94723) [[[2]]]", "[[:en:European_Robin|134iwocEuropean Robin]]");
 add("selser", "Unclosed and unmatched quotes 
[[[0,0,4]],2,3,3,[1],0,4,0,1,2,[[2]],3,3,3,2,0,3,4,1,2,2]", "'''''Bold italic 
text '''with bold deactivatedmik1b''\n\n194mnir\n\n'''Bold 
text..'''\n\n1vh6b8p\n\n'''Bold tag left open\n\n118iayy\n\n''hxqm6fItalic tag 
left open''\n\n8zfmbl<!-- Unmatching number of opening, closing tags: 
-->\n\nv7yj6u\n\n''Tom'''s car is bigger than 
'''''<nowiki/>'''Susan'''s.\n\n1suii2h\n\n1qt3jiw\n\nPlain ''italic'''s plain");
 add("selser", "Unclosed and unmatched quotes 
[[1],0,[1],0,[3],0,[2,2],0,4,0,4,2,4,0,3,0,4,0,[[0,[4]],0,3],0,4]", "'''''Bold 
italic text '''with bold deactivated''' in between.'''''\n\n'''''Bold italic 
text ''with italic deactivated'' in between.'''''\n\n1vbvxxl..spanning two 
paragraphs (should not 
work).6tygj0'''\n\n750fcg\n\n1ke2xol\n\nqpzby4\n\n1fztsq9\n\n1qf0akm\n\n''Tom'''3ftppf'''''<nowiki/>'''Susan'''\n\namwrge\n");
 add("selser", "Unclosed and unmatched quotes 
[[[3,0,[3]]],0,[1],0,2,2,3,0,[4],4,[4],0,0,3,0,4,[2,3,3,4,3,0],0,4,2,[0,3,4]]", 
"''with bold deactivated'''<nowiki/>'''''\n\n'''''Bold italic text ''with 
italic deactivated'' in between.'''''\n\n1o7p7gt\n\n'''Bold 
text..\n\nypestg\n\n9sn2o4\n\n1clpd1j\n\n1933mb7\n\nNormal text.<!-- Unmatching 
number of opening, closing tags: -->\n\nr7rcfr\n\n9vn9he'''This 
year''''1bex21s.\n\n23segf\n\n15xu7jn\n\nPlain 1038n5m");
@@ -1311,13 +1312,11 @@
 add("selser", "Parsoid-centric test: Whitespace in ext- and wiki-links should 
be preserved [3,4,2,2,2,0,4]", "1ft87cu\n\n13xu2qq\n\n[[Foo|  
''bar'']]\n\n15lnhl6\n\n1djvh1q\n\n[http://wp.org   foo]\n\n1rzfwwg\n");
 add("selser", "Parsoid-centric test: Whitespace in ext- and wiki-links should 
be preserved [0,2,1,3,[[4]],3,0]", "[[Foo|  bar]]\n\np3atih\n\n[[Foo|  
''bar'']]\n\n[http://wp.org rop4jb]\n\n[http://wp.org   ''foo'']");
 add("selser", "Parsoid-centric test: Whitespace in ext- and wiki-links should 
be preserved [3,4,4,2,2,4,0]", 
"1bj9bbq\n\n11icu1m\n\n1f2hrph\n\n1d06nta\n\n[http://wp.org   
foo]\n\n1npgg6a\n\n[http://wp.org   ''foo'']");
-add("selser", "Interwiki link encoding conversion (T3636) [[[3],2,[2]]]", 
"*\n* edo4is\n*1k978vr[[Wikipedia:ro:Olteniţa]]");
-add("selser", "Different interwiki prefixes mapping to the same URL 
[2,0,[4],0,2,4,3,0,3,3,[2],2,[4]]", 
"1cmr4k9\n\n[[:en:Foo]]\n\nhvdis9\n\na1dr3q\n\n[[wikipedia:Foo]]\n\ne49fuw\n\nnsemot[[wikipedia:en:Foo]]\n\nm67gy7\n\n1jea487");
-add("selser", "Different interwiki prefixes mapping to the same URL 
[3,4,0,0,[1],4,3,0,3,0,[2],0,3]", 
"1nj1lgj\n\n[[:en:Foo|Foo]]\n\n[[wikipedia:Foo]]\n\n1iwxfz6\n\n1v5pb3c[[wikipedia:en:Foo]]\n");
-add("selser", "Different interwiki prefixes mapping to the same URL 
[4,3,[4],4,2,0,[1],2,1,0,[2],0,3]", 
"1niy9um\n\n1rj1oh1\n\n17bubmt\n\nwa0uv2\n\n[[wikipedia:Foo]]\n\n[[:wikipedia:Foo|Foo]]\n\n1gcjzpc\n\n[[wikipedia:en:Foo]]\n\n1x968no[[wikipedia:en:Foo]]\n");
-add("selser", "Different interwiki prefixes mapping to the same URL 
[4,4,1,4,[3],4,[[4]],0,4,0,[1],3,[4]]", 
"1ts2wsm\n\n1ha756h\n\n[[:en:Foo|Foo]]\n\n1oy27y4\n\n1wctm47\n\n[[:wikipedia:Foo|3wyj0g]]\n\n13tzutw\n\n[[wikipedia:en:Foo]]\n\ndrkjrg");
-add("selser", "Different interwiki prefixes mapping to the same URL 
[[4],0,[3],0,0,2,[2],0,[4],0,[2],4,[[2]]]", 
"vp9v9k\n\n[[wikipedia:Foo]]\n\n1dr7otu\n\n1ws4mui[[:wikipedia:Foo|Foo]]\n\n1szk0ja\n\nkvaw0k[[wikipedia:en:Foo]]\n\n1cbqxcb\n\n[[
  wikiPEdia :Foo|qzsy1a  wikiPEdia :Foo]]");
-add("selser", "Different interwiki prefixes mapping to the same URL 
[1,3,1,4,[[4]],0,3,2,[3],0,1,0,3]", 
"[[:en:Foo]]\n\n[[:en:Foo|Foo]]\n\n1qaad10\n\n[[wikipedia:Foo|zc9a9s]]\n\n1kh1ssv\n\n[[wikipedia:en:Foo]]\n");
+add("selser", "Interwiki link encoding conversion (T3636)\n+!! 
options\n+parsoid=wt2html,wt2wt\n+## html2wt and html2html will fail because we 
will prefer the :en: interwiki prefix over wikipedia: [[0,4,[2]]]", 
"*[[Wikipedia:ro:Olteni&#0355;a]]\n* 
1nfwe0o\n*13bu3xr[[:Wikipedia:ro:Olteniţa]]");
+add("selser", "Different interwiki prefixes mapping to the same URL 
[[1],0,4,0,1,0,3,0,[1],0,[3],0,[1]]", 
"[[:en:Foo]]\n\nv7i85g\n\n[[wikipedia:Foo]]\n\n[[:wikipedia:en:Foo]]\n\n[[:  
wikiPEdia :Foo]]\n");
+add("selser", "Different interwiki prefixes mapping to the same URL 
[4,0,[[3]],0,1,0,[[4]],2,0,0,[1],4,1]", 
"1l14vkx\n\n[:en:Foo]\n\n[[wikipedia:Foo]]\n\n[[:wikipedia:Foo|nbuvsk]]\n\nk4ccf8\n\n[[wikipedia:en:Foo]]\n\n[[:wikipedia:en:Foo]]\n\n1gr9ugr\n\n[[:
  wikiPEdia :Foo]]");
+add("selser", "Different interwiki prefixes mapping to the same URL 
[2,0,[[4]],3,3,0,[[3]],0,1,3,0,3,[2]]", 
"1belm8p\n\n[[:en:Foo]]\n\n[[:en:Foo|jvi0fn]]\n\n[:wikipedia:Foo]\n\n[[:wikipedia:en:Foo]]\n\n[[:wikipedia:en:Foo]]\n\nrqndks[[
  wikiPEdia :Foo]]");
+add("selser", "Different interwiki prefixes mapping to the same URL 
[4,3,[[2]],2,[2],0,4,4,[[4]],4,[[2]],3,[4]]", 
"1vnl7bc\n\n[[:en:Foo|1mo1igaFoo]]\n\n15p8av9\n\nt1tnk7[[:wikipedia:Foo]]\n\nik65u\n\ncp91v8\n\n[[wikipedia:en:Foo|vzo3uh]]\n\n1hyadw9\n\n[[:wikipedia:en:Foo|11hwcmxwikipedia:en:Foo]]\n\n8ja2ps");
 add("selser", "Parsoid: recognize interwiki links without a target page 
[2,2,1]", "13h7yqp\n\n[[:es:]]\n\nn76wcf\n\n[[ko:]]");
 add("selser", "Parsoid: recognize interwiki links without a target page 
[[2],2,1]", "1fz9jlc[[:es:]]\n\n2alzzr\n\n[[ko:]]");
 add("selser", "Parsoid: recognize interwiki links without a target page 
[1,4,1]", "[[:es:]]\n\neviwhn\n\n[[ko:]]");
diff --git a/tests/parserTests.txt b/tests/parserTests.txt
index 7cff659..a9140fe 100644
--- a/tests/parserTests.txt
+++ b/tests/parserTests.txt
@@ -4835,6 +4835,9 @@
 
 !! test
 External links: with no contents
+!! options
+parsoid=wt2html,wt2wt
+## html2wt and html2html will fail because we will prefer the :en: interwiki prefix over wikipedia:
 !! wikitext
 [http://en.wikipedia.org/wiki/Foo]
 
@@ -5962,11 +5965,11 @@
 !! wikitext
 [[Foo|Bar]]
 [[Foo|Bar]]
-[[wikipedia:Foo|Bar]]
-[[wikipedia:Foo|Bar]]
+[[:en:Foo|Bar]]
+[[:en:Foo|Bar]]
 
-[[wikipedia:European_Robin|European Robin]]
-[[wikipedia:European_Robin|European Robin]]
+[[:en:European_Robin|European Robin]]
+[[:en:European_Robin|European Robin]]
 !! end
 
 !! test
@@ -8613,6 +8616,9 @@
 
 !! test
 Interwiki link encoding conversion (T3636)
++!! options
++parsoid=wt2html,wt2wt
++## html2wt and html2html will fail because we will prefer the :en: interwiki prefix over wikipedia:
 !! wikitext
 *[[Wikipedia:ro:Olteni&#0355;a]]
 *[[Wikipedia:ro:Olteni&#355;a]]

-- 
To view, visit https://gerrit.wikimedia.org/r/384171
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5cf93950a6da69263fb9da59fba2b33cc2e8931f
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to