Subramanya Sastry has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/293799

Change subject: WIP: html2wt: Always emit canonical wikitext for url links
......................................................................

WIP: html2wt: Always emit canonical wikitext for url links

* This patch removes the conditional emitting of url-links for
  modified content and normalizes anything that goes through the
  non-selective serializer. In reality, we expect non-canonical
  forms like [http://foo.com http://foo.com] to be rare in the
  corpus, and we have the selective serializer that will prevent
  dirty diffs in unedited content.

* TODO: Change the failing tests since we don't expect them to
  pass going forward.

Change-Id: I2e15bffb5c1e778444620e76576c0ebd24f93cae
---
M lib/html2wt/LinkHandler.js
M tests/parserTests-blacklist.js
2 files changed, 21 insertions(+), 3 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid 
refs/changes/99/293799/1

diff --git a/lib/html2wt/LinkHandler.js b/lib/html2wt/LinkHandler.js
index ede3af4..ba70554 100644
--- a/lib/html2wt/LinkHandler.js
+++ b/lib/html2wt/LinkHandler.js
@@ -545,9 +545,7 @@
                        // Can we minimize this?
                        (target.value === contentStr  ||
                        getHref(env, node) === contentStr) &&
-                       Util.isProtocolValid(contentStr, env) &&
-                       // But preserve non-minimal encoding
-                       (target.modified || linkData.contentModified || dp.stx 
=== 'url'));
+                       Util.isProtocolValid(contentStr, env));
 };
 
 var serializeAsExtLink = Promise.method(function(node, state, linkData) {
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 4330e2b..f570847 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -296,6 +296,7 @@
 add("wt2wt", "4. Indent-Pre and extension tags", " a <gallery>\n 
File:foobar.jpg\n </gallery>\n");
 add("wt2wt", "Definition lists: self-closed tag", ";one<br />two : two-line 
fun");
 add("wt2wt", "Definition Lists: colons occurring in tags", 
";a:b\n;'''a:b'''\n;<i>a:b</i>\n;<span>a:b</span>\n;<div>a:b</div>\n;<div>a\n:b\n;{{echo|a:b}}\n;{{echo|''a:b''}}\n;;;''a:b''\n");
+add("wt2wt", "External links: URL in text", "URL in text: 
http://example.com\n");
 add("wt2wt", "BUG 289: \">\"-token in bracketed URL", 
"[http://www.example.com/ <hello> stuff]\n");
 add("wt2wt", "BUG 289: literal \">\"-token in bracketed URL", 
"[http://www.example.com/ <b>html</b> stuff]\n");
 add("wt2wt", "BUG 289: literal double quote in bracketed URL", 
"[http://www.example.com/ \"hello\" stuff]\n");
@@ -365,6 +366,7 @@
 add("wt2wt", "Image with page parameter", "[[File:LoremIpsum.djvu]]\n");
 add("wt2wt", "Don't fall for the self-closing div", "<div>hello world</div>");
 add("wt2wt", "Parsing of overlapping (improperly nested) inline html tags", 
"<span><s>x</span>\n");
+add("wt2wt", "Proper conversion of text in external links", 
"http://www.google.com\ngopher://www.google.com\nhttp://www.google.com\ngopher://www.google.com\n[https://www.google.com
 irc://www.google.com]\n[ftp://www.google.com 
www.google.com/ftp://dir]\n[//www.google.com www.google.com]\n");
 add("wt2wt", "Don't break table handling if language converter markup is in 
the cell.", "{|\n|-\n| -{R |B}-\n|}");
 add("wt2wt", "Bug 529: Uncovered bullet leaving empty list, normally removed 
by tidy", "******* Foo \n{{bullet}}");
 add("wt2wt", "HHP2.1: Heuristics for headings in preprocessor parenthetical 
structures", "<nowiki>{{foo|</nowiki>\n=heading=\n");
@@ -1410,6 +1412,13 @@
 add("selser", "Definition Lists: colons occurring in tags 
[[3,[4],0,[[2]],4,0,4,[4],0,[[2]],2,[[4]],2,0,0,0,0,4]]", 
":5lkbigrdwdx11yvi\n;'''cqslr7dvsnd5z5mia:b'''\n: 
masspab3x9don7b9\n;<i>a:b</i>\n: 
oa29nopebnoy9zfr\n;98d7as2k961ug14i\n;<div>1bwhvaem3l0izfra:b</div>\n: 
vnpj9u2knmmzehfr\n;<div>zv7k43lu5scq5mi\n: 
1f68p993haor\n:b</div>\n;{{echo|a:b}}\n: hxhtxsqncdi");
 add("selser", "Definition Lists: colons occurring in tags 
[[0,0,4,[2],3,[4],0,[[3]],4,[[3]],0,2,3,0,0,0,0,[0,0,1]]]", ";a:b\n: 
h3264ujrqrkq33di\n;h56kdmfdrrv34n29'''a:b'''\n;fxo6p8aww4p9zfr\n;<span></span>\n:
 h438k9e0o3aexw29\n;<div></div>\n: 
diwu40dj1dims4i\n;<div>a\n:b</div>\n;{{echo|a:b}}\n;{{echo|''a:b''}}\n;;;''a:b''");
 add("selser", "Definition Lists: colons occurring in tags [1]", 
";a:b\n;'''a:b'''\n;<i>a:b</i>\n;<span>a:b</span>\n;<div>a:b</div>\n;<div>a\n:b</div>\n;{{echo|a:b}}\n;{{echo|''a:b''}}\n;;;''a:b''");
+add("selser", "External links: URL in text [2]", "phopaichqnsqyqfr\n\nURL in 
text: [http://example.com http://example.com]");
+add("selser", "External links: URL in text [1]", "URL in text: 
[http://example.com http://example.com]");
+add("selser", "External links: URL in text [[2,2]]", "rqwk38dsmcboi529URL in 
text: f02w1rnzv2qehfr[http://example.com http://example.com]");
+add("selser", "External links: URL in text [[2,0]]", "oho4izp0vk1brzfrURL in 
text: [http://example.com http://example.com]");
+add("selser", "External links: URL in text [[0,2]]", "URL in text: 
g47x5c7350y0hpvi[http://example.com http://example.com]");
+add("selser", "External links: URL in text [[4,0]]", 
"21h3qp7binlba9k9[http://example.com http://example.com]");
+add("selser", "External links: URL in text [[3,0]]", "[http://example.com 
http://example.com]");
 add("selser", "BUG 289: \">\"-token in bracketed URL [2]", 
"ll0lxyd1rfgpsyvi\n\n[http://www.example.com/<hello> stuff]");
 add("selser", "BUG 289: \">\"-token in bracketed URL [1]", 
"[http://www.example.com/<hello> stuff]");
 add("selser", "BUG 289: \">\"-token in bracketed URL [[2]]", 
"kemi29jie4b1emi[http://www.example.com/<hello> stuff]");
@@ -1976,6 +1985,17 @@
 add("selser", "Don't fall for the self-closing div [2]", 
"03k36yoyy20jatt9<div>hello world</div/>");
 add("selser", "Don't fall for the self-closing div [[2]]", 
"<div>3554xqvubx83erk9hello world</div/>");
 add("selser", "Parsing of overlapping (improperly nested) inline html tags 
[2]", "y9cin9njblr6n7b9\n\n<span><s>x</span></s>");
+add("selser", "Proper conversion of text in external links 
[[[3],0,0,4,0,3,1,0,0,0,[3],0,[3]]]", 
"[http://www.google.com]\ngopher://www.google.com<nowiki/>aot4gtd8s7bsxlxr[http://www.google.com
 http://www.google.com]gopher://www.google.com\n[https://www.google.com 
irc://www.google.com]\n[ftp://www.google.com]\n[//www.google.com]");
+add("selser", "Proper conversion of text in external links 
[[1,4,1,3,[2],0,0,0,0,0,[4],3,[3]]]", 
"http://www.google.com<nowiki/>3nvd9lor01lnxw29<nowiki/>gopher://www.google.com[http://www.google.com
 278eudoroal40a4ihttp://www.google.com]\n[gopher://www.google.com 
gopher://www.google.com]\n[https://www.google.com 
irc://www.google.com]\n[ftp://www.google.com 
o619jmpvbwjw0zfr][//www.google.com]");
+add("selser", "Proper conversion of text in external links [2]", 
"z6fnzp7yy08gp66r\n\nhttp://www.google.com\ngopher://www.google.com\n[http://www.google.com
 http://www.google.com]\n[gopher://www.google.com 
gopher://www.google.com]\n[https://www.google.com 
irc://www.google.com]\n[ftp://www.google.com 
www.google.com/ftp://dir]\n[//www.google.com www.google.com]");
+add("selser", "Proper conversion of text in external links 
[[[3],2,[2],2,0,0,1,0,[2],0,0,0,[4]]]", 
"[http://www.google.com]vskj1nt5fbkpgb9\n[gopher://www.google.com 
gvn2m1xpf3aexw29gopher://www.google.com]hqebupr7tc81if6r\n[http://www.google.com
 http://www.google.com]\ngopher://www.google.com\n[https://www.google.com 
6v0985zzjpqlg14iirc://www.google.com]\n[ftp://www.google.com 
www.google.com/ftp://dir]\n[//www.google.com pfkt5od54jkyb9]");
+add("selser", "Proper conversion of text in external links 
[[0,0,3,3,1,4,2,2,0,2,1,0,3]]", 
"http://www.google.com\nhttp://www.google.com<nowiki/>e818ud1zg6u8r5298k2fc5smu606n7b9[gopher://www.google.com
 gopher://www.google.com]kmbnl7c5lbn2vs4i\n[https://www.google.com 
irc://www.google.com]d1s5j5iraj3e4s4i\n[ftp://www.google.com 
www.google.com/ftp://dir]\n");
+add("selser", "Proper conversion of text in external links 
[[[2],0,4,3,0,4,[2],3,0,0,3,0,[4]]]", "[http://www.google.com 
07wy8z55g3vgf1orhttp://www.google.com]\n3h27bqa01zwipb9[http://www.google.com 
http://www.google.com]7uw8a4k5l5etrzfr[gopher://www.google.com 
off2bxlyo8js8aorgopher://www.google.com][https://www.google.com 
irc://www.google.com]\n\n[//www.google.com nogw6oiiytbotj4i]\n");
+add("selser", "Proper conversion of text in external links 
[[0,4,[3],0,0,0,1,2,[2],0,3,0,0]]", 
"http://www.google.com<nowiki/>17vt7qajl6usor[gopher://www.google.com]\n[http://www.google.com
 
http://www.google.com]\ngopher://www.google.com<nowiki/>0p217x17xx14te29\n[https://www.google.com
 nsw2thd2boab57b9irc://www.google.com]\n\n[//www.google.com www.google.com]");
+add("selser", "Proper conversion of text in external links [1]", 
"http://www.google.com\ngopher://www.google.com\n[http://www.google.com 
http://www.google.com]\n[gopher://www.google.com 
gopher://www.google.com]\n[https://www.google.com 
irc://www.google.com]\n[ftp://www.google.com 
www.google.com/ftp://dir]\n[//www.google.com www.google.com]");
+add("selser", "Proper conversion of text in external links 
[[2,0,[4],2,[3],0,0,0,0,3,0,2,0]]", 
"w7hh5ei3lyvkuik9<nowiki/>http://www.google.com\n[gopher://www.google.com 
gb1rz0o0gasmj9k9]u2ld5b4ed9w45cdi\n[http://www.google.com]\n[gopher://www.google.com
 gopher://www.google.com]\n[https://www.google.com 
irc://www.google.com][ftp://www.google.com 
www.google.com/ftp://dir]l2e1a2sttdszto6r\n[//www.google.com www.google.com]");
+add("selser", "Proper conversion of text in external links 
[[3,2,0,0,0,3,3,0,0,0,1,0,[3]]]", 
"wxkvboht8yxk1emi\ngopher://www.google.com\n[http://www.google.com 
http://www.google.com]\n[https://www.google.com 
irc://www.google.com]\n[ftp://www.google.com 
www.google.com/ftp://dir]\n[//www.google.com]");
+add("selser", "Proper conversion of text in external links 
[[[4],0,4,0,[3],0,0,0,0,4,0,3,0]]", "[http://www.google.com 
n8jxqjarsd8nz5mi]\nfhhy3vvlyozs38fr\n[http://www.google.com]\n[gopher://www.google.com
 gopher://www.google.com]\n[https://www.google.com 
irc://www.google.com]3lgdl8pppd4mquxr[ftp://www.google.com 
www.google.com/ftp://dir][//www.google.com www.google.com]");
 add("selser", "Don't break table handling if language converter markup is in 
the cell. [2]", "ixer2na9oirudi\n{|\n|-\n| -{R|B}-\n|}");
 add("selser", "Don't break table handling if language converter markup is in 
the cell. [[3,[1,4]]]", "{|\n|- data-foobar=\"lxrh34k2mmoq1tt9\"\n| 
-{R|B}-<!--czk30pw9nitl0udi-->\n|}");
 add("selser", "Don't break table handling if language converter markup is in 
the cell. [[3,[2,3]]]", "{|<!--kvgh4eibaoiggb9-->\n|-\n| -{R|B}-\n|}");

-- 
To view, visit https://gerrit.wikimedia.org/r/293799
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2e15bffb5c1e778444620e76576c0ebd24f93cae
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/services/parsoid
Gerrit-Branch: master
Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to