Subramanya Sastry has uploaded a new change for review. https://gerrit.wikimedia.org/r/293799
Change subject: WIP: html2wt: Always emit canonical wikitext for url links
......................................................................

WIP: html2wt: Always emit canonical wikitext for url links

* This patch removes the conditional emitting of url-links for
  modified content and normalizes anything that goes through the
  non-selective serializer. In reality, we expect non-canonical forms
  like [http://foo.com http://foo.com] to be rare in the corpus, and
  we have the selective serializer that will prevent dirty diffs in
  unedited content.

* TODO: Change the failing tests since we don't expect them to pass
  going forward.

Change-Id: I2e15bffb5c1e778444620e76576c0ebd24f93cae
---
M lib/html2wt/LinkHandler.js
M tests/parserTests-blacklist.js
2 files changed, 21 insertions(+), 3 deletions(-)

  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/services/parsoid refs/changes/99/293799/1

diff --git a/lib/html2wt/LinkHandler.js b/lib/html2wt/LinkHandler.js
index ede3af4..ba70554 100644
--- a/lib/html2wt/LinkHandler.js
+++ b/lib/html2wt/LinkHandler.js
@@ -545,9 +545,7 @@
  // Can we minimize this?
  (target.value === contentStr ||
  getHref(env, node) === contentStr) &&
- Util.isProtocolValid(contentStr, env) &&
- // But preserve non-minimal encoding
- (target.modified || linkData.contentModified || dp.stx === 'url'));
+ Util.isProtocolValid(contentStr, env));
  };

  var serializeAsExtLink = Promise.method(function(node, state, linkData) {
diff --git a/tests/parserTests-blacklist.js b/tests/parserTests-blacklist.js
index 4330e2b..f570847 100644
--- a/tests/parserTests-blacklist.js
+++ b/tests/parserTests-blacklist.js
@@ -296,6 +296,7 @@
  add("wt2wt", "4.
Indent-Pre and extension tags", " a <gallery>\n File:foobar.jpg\n </gallery>\n"); add("wt2wt", "Definition lists: self-closed tag", ";one<br />two : two-line fun"); add("wt2wt", "Definition Lists: colons occurring in tags", ";a:b\n;'''a:b'''\n;<i>a:b</i>\n;<span>a:b</span>\n;<div>a:b</div>\n;<div>a\n:b\n;{{echo|a:b}}\n;{{echo|''a:b''}}\n;;;''a:b''\n"); +add("wt2wt", "External links: URL in text", "URL in text: http://example.com\n"); add("wt2wt", "BUG 289: \">\"-token in bracketed URL", "[http://www.example.com/ <hello> stuff]\n"); add("wt2wt", "BUG 289: literal \">\"-token in bracketed URL", "[http://www.example.com/ <b>html</b> stuff]\n"); add("wt2wt", "BUG 289: literal double quote in bracketed URL", "[http://www.example.com/ \"hello\" stuff]\n"); @@ -365,6 +366,7 @@ add("wt2wt", "Image with page parameter", "[[File:LoremIpsum.djvu]]\n"); add("wt2wt", "Don't fall for the self-closing div", "<div>hello world</div>"); add("wt2wt", "Parsing of overlapping (improperly nested) inline html tags", "<span><s>x</span>\n"); +add("wt2wt", "Proper conversion of text in external links", "http://www.google.com\ngopher://www.google.com\nhttp://www.google.com\ngopher://www.google.com\n[https://www.google.com irc://www.google.com]\n[ftp://www.google.com www.google.com/ftp://dir]\n[//www.google.com www.google.com]\n"); add("wt2wt", "Don't break table handling if language converter markup is in the cell.", "{|\n|-\n| -{R |B}-\n|}"); add("wt2wt", "Bug 529: Uncovered bullet leaving empty list, normally removed by tidy", "******* Foo \n{{bullet}}"); add("wt2wt", "HHP2.1: Heuristics for headings in preprocessor parenthetical structures", "<nowiki>{{foo|</nowiki>\n=heading=\n"); @@ -1410,6 +1412,13 @@ add("selser", "Definition Lists: colons occurring in tags [[3,[4],0,[[2]],4,0,4,[4],0,[[2]],2,[[4]],2,0,0,0,0,4]]", ":5lkbigrdwdx11yvi\n;'''cqslr7dvsnd5z5mia:b'''\n: masspab3x9don7b9\n;<i>a:b</i>\n: oa29nopebnoy9zfr\n;98d7as2k961ug14i\n;<div>1bwhvaem3l0izfra:b</div>\n: 
vnpj9u2knmmzehfr\n;<div>zv7k43lu5scq5mi\n: 1f68p993haor\n:b</div>\n;{{echo|a:b}}\n: hxhtxsqncdi"); add("selser", "Definition Lists: colons occurring in tags [[0,0,4,[2],3,[4],0,[[3]],4,[[3]],0,2,3,0,0,0,0,[0,0,1]]]", ";a:b\n: h3264ujrqrkq33di\n;h56kdmfdrrv34n29'''a:b'''\n;fxo6p8aww4p9zfr\n;<span></span>\n: h438k9e0o3aexw29\n;<div></div>\n: diwu40dj1dims4i\n;<div>a\n:b</div>\n;{{echo|a:b}}\n;{{echo|''a:b''}}\n;;;''a:b''"); add("selser", "Definition Lists: colons occurring in tags [1]", ";a:b\n;'''a:b'''\n;<i>a:b</i>\n;<span>a:b</span>\n;<div>a:b</div>\n;<div>a\n:b</div>\n;{{echo|a:b}}\n;{{echo|''a:b''}}\n;;;''a:b''"); +add("selser", "External links: URL in text [2]", "phopaichqnsqyqfr\n\nURL in text: [http://example.com http://example.com]"); +add("selser", "External links: URL in text [1]", "URL in text: [http://example.com http://example.com]"); +add("selser", "External links: URL in text [[2,2]]", "rqwk38dsmcboi529URL in text: f02w1rnzv2qehfr[http://example.com http://example.com]"); +add("selser", "External links: URL in text [[2,0]]", "oho4izp0vk1brzfrURL in text: [http://example.com http://example.com]"); +add("selser", "External links: URL in text [[0,2]]", "URL in text: g47x5c7350y0hpvi[http://example.com http://example.com]"); +add("selser", "External links: URL in text [[4,0]]", "21h3qp7binlba9k9[http://example.com http://example.com]"); +add("selser", "External links: URL in text [[3,0]]", "[http://example.com http://example.com]"); add("selser", "BUG 289: \">\"-token in bracketed URL [2]", "ll0lxyd1rfgpsyvi\n\n[http://www.example.com/<hello> stuff]"); add("selser", "BUG 289: \">\"-token in bracketed URL [1]", "[http://www.example.com/<hello> stuff]"); add("selser", "BUG 289: \">\"-token in bracketed URL [[2]]", "kemi29jie4b1emi[http://www.example.com/<hello> stuff]"); @@ -1976,6 +1985,17 @@ add("selser", "Don't fall for the self-closing div [2]", "03k36yoyy20jatt9<div>hello world</div/>"); add("selser", "Don't fall for the self-closing div [[2]]", 
"<div>3554xqvubx83erk9hello world</div/>"); add("selser", "Parsing of overlapping (improperly nested) inline html tags [2]", "y9cin9njblr6n7b9\n\n<span><s>x</span></s>"); +add("selser", "Proper conversion of text in external links [[[3],0,0,4,0,3,1,0,0,0,[3],0,[3]]]", "[http://www.google.com]\ngopher://www.google.com<nowiki/>aot4gtd8s7bsxlxr[http://www.google.com http://www.google.com]gopher://www.google.com\n[https://www.google.com irc://www.google.com]\n[ftp://www.google.com]\n[//www.google.com]"); +add("selser", "Proper conversion of text in external links [[1,4,1,3,[2],0,0,0,0,0,[4],3,[3]]]", "http://www.google.com<nowiki/>3nvd9lor01lnxw29<nowiki/>gopher://www.google.com[http://www.google.com 278eudoroal40a4ihttp://www.google.com]\n[gopher://www.google.com gopher://www.google.com]\n[https://www.google.com irc://www.google.com]\n[ftp://www.google.com o619jmpvbwjw0zfr][//www.google.com]"); +add("selser", "Proper conversion of text in external links [2]", "z6fnzp7yy08gp66r\n\nhttp://www.google.com\ngopher://www.google.com\n[http://www.google.com http://www.google.com]\n[gopher://www.google.com gopher://www.google.com]\n[https://www.google.com irc://www.google.com]\n[ftp://www.google.com www.google.com/ftp://dir]\n[//www.google.com www.google.com]"); +add("selser", "Proper conversion of text in external links [[[3],2,[2],2,0,0,1,0,[2],0,0,0,[4]]]", "[http://www.google.com]vskj1nt5fbkpgb9\n[gopher://www.google.com gvn2m1xpf3aexw29gopher://www.google.com]hqebupr7tc81if6r\n[http://www.google.com http://www.google.com]\ngopher://www.google.com\n[https://www.google.com 6v0985zzjpqlg14iirc://www.google.com]\n[ftp://www.google.com www.google.com/ftp://dir]\n[//www.google.com pfkt5od54jkyb9]"); +add("selser", "Proper conversion of text in external links [[0,0,3,3,1,4,2,2,0,2,1,0,3]]", "http://www.google.com\nhttp://www.google.com<nowiki/>e818ud1zg6u8r5298k2fc5smu606n7b9[gopher://www.google.com gopher://www.google.com]kmbnl7c5lbn2vs4i\n[https://www.google.com 
irc://www.google.com]d1s5j5iraj3e4s4i\n[ftp://www.google.com www.google.com/ftp://dir]\n"); +add("selser", "Proper conversion of text in external links [[[2],0,4,3,0,4,[2],3,0,0,3,0,[4]]]", "[http://www.google.com 07wy8z55g3vgf1orhttp://www.google.com]\n3h27bqa01zwipb9[http://www.google.com http://www.google.com]7uw8a4k5l5etrzfr[gopher://www.google.com off2bxlyo8js8aorgopher://www.google.com][https://www.google.com irc://www.google.com]\n\n[//www.google.com nogw6oiiytbotj4i]\n"); +add("selser", "Proper conversion of text in external links [[0,4,[3],0,0,0,1,2,[2],0,3,0,0]]", "http://www.google.com<nowiki/>17vt7qajl6usor[gopher://www.google.com]\n[http://www.google.com http://www.google.com]\ngopher://www.google.com<nowiki/>0p217x17xx14te29\n[https://www.google.com nsw2thd2boab57b9irc://www.google.com]\n\n[//www.google.com www.google.com]"); +add("selser", "Proper conversion of text in external links [1]", "http://www.google.com\ngopher://www.google.com\n[http://www.google.com http://www.google.com]\n[gopher://www.google.com gopher://www.google.com]\n[https://www.google.com irc://www.google.com]\n[ftp://www.google.com www.google.com/ftp://dir]\n[//www.google.com www.google.com]"); +add("selser", "Proper conversion of text in external links [[2,0,[4],2,[3],0,0,0,0,3,0,2,0]]", "w7hh5ei3lyvkuik9<nowiki/>http://www.google.com\n[gopher://www.google.com gb1rz0o0gasmj9k9]u2ld5b4ed9w45cdi\n[http://www.google.com]\n[gopher://www.google.com gopher://www.google.com]\n[https://www.google.com irc://www.google.com][ftp://www.google.com www.google.com/ftp://dir]l2e1a2sttdszto6r\n[//www.google.com www.google.com]"); +add("selser", "Proper conversion of text in external links [[3,2,0,0,0,3,3,0,0,0,1,0,[3]]]", "wxkvboht8yxk1emi\ngopher://www.google.com\n[http://www.google.com http://www.google.com]\n[https://www.google.com irc://www.google.com]\n[ftp://www.google.com www.google.com/ftp://dir]\n[//www.google.com]"); +add("selser", "Proper conversion of text in external links 
[[[4],0,4,0,[3],0,0,0,0,4,0,3,0]]", "[http://www.google.com n8jxqjarsd8nz5mi]\nfhhy3vvlyozs38fr\n[http://www.google.com]\n[gopher://www.google.com gopher://www.google.com]\n[https://www.google.com irc://www.google.com]3lgdl8pppd4mquxr[ftp://www.google.com www.google.com/ftp://dir][//www.google.com www.google.com]"); add("selser", "Don't break table handling if language converter markup is in the cell. [2]", "ixer2na9oirudi\n{|\n|-\n| -{R|B}-\n|}"); add("selser", "Don't break table handling if language converter markup is in the cell. [[3,[1,4]]]", "{|\n|- data-foobar=\"lxrh34k2mmoq1tt9\"\n| -{R|B}-<!--czk30pw9nitl0udi-->\n|}"); add("selser", "Don't break table handling if language converter markup is in the cell. [[3,[2,3]]]", "{|<!--kvgh4eibaoiggb9-->\n|-\n| -{R|B}-\n|}"); -- To view, visit https://gerrit.wikimedia.org/r/293799 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I2e15bffb5c1e778444620e76576c0ebd24f93cae Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/services/parsoid Gerrit-Branch: master Gerrit-Owner: Subramanya Sastry <ssas...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits