https://www.mediawiki.org/wiki/Special:Code/MediaWiki/113136
Revision: 113136 Author: gwicke Date: 2012-03-06 13:49:37 +0000 (Tue, 06 Mar 2012) Log Message: ----------- Reworked percent encoding handling for URIs to get closer to the 'url construction' part of the HTML5 spec: http://www.whatwg.org/specs/web-apps/current-work/multipage/urls.html#url-manipulation-and-creation Removed a few whitelisted test cases that are now passing directly. The encoding canonicalization could also be moved to the Sanitizer. Doing this early in token stream processing however has the advantage of providing further transformations uniform data to work with. We could even consider to move this even further into the tokenizer. Modified Paths: -------------- trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js Modified: trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js =================================================================== --- trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js 2012-03-06 13:43:46 UTC (rev 113135) +++ trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js 2012-03-06 13:49:37 UTC (rev 113136) @@ -54,7 +54,7 @@ } content = out; } else { - content = href; + content = [ env.decodeURI( env.tokensToString( href ) ) ]; } if ( tail ) { content.push( tail ); @@ -105,19 +105,19 @@ var content = token.attribs.slice(1, -1); - // XXX: get /wiki from config! + // TODO: get /wiki from config! var a = new TagTk( 'a', [ new KV( 'href', '/wiki' + title.makeLink() ) ] ); a.dataAttribs = token.dataAttribs; var MD5 = new jshashes.MD5(), hash = MD5.hex( title.key ), - // XXX: Hackhack.. + // TODO: Hackhack.. Move to proper test harness setup! 
path = 'http://example.com/images/' + [ hash[0], hash.substr(0, 2) ].join('/') + '/' + title.key; - // XXX: extract options + // extract options var options = [], caption = null; for( var i = 0, l = content.length; i<l; i++ ) { @@ -132,7 +132,7 @@ } } else { var bits = oText[0].split( '=', 2 ); - if ( bits.length > 1 && this._prefixImageOptions[ bits[0].strip ] ) { + if ( bits.length > 1 && this._prefixImageOptions[ bits[0].trim() ] ) { console.log('handle prefix ' + bits ); } else { caption = oContent; @@ -217,7 +217,9 @@ }; ExternalLinkHandler.prototype.onUrlLink = function ( token, manager, cb ) { - var href = this.manager.env.lookupKV( token.attribs, 'href' ).v; + var href = this.manager.env.sanitizeURI( + this.manager.env.lookupKV( token.attribs, 'href' ).v + ); if ( this._isImageLink( href ) ) { return { token: new SelfclosingTagTk( 'img', [ @@ -241,6 +243,8 @@ ExternalLinkHandler.prototype.onExtLink = function ( token, manager, cb ) { var href = this.manager.env.lookupKV( token.attribs, 'href' ).v, content= this.manager.env.lookupKV( token.attribs, 'content' ).v; + href = this.manager.env.sanitizeURI( href ); + console.warn('extlink href: ' + href ); //console.warn( 'content: ' + JSON.stringify( content, null, 2 ) ); // validate the href if ( this.imageParser.parseURL( href ) ) { Modified: trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js =================================================================== --- trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js 2012-03-06 13:43:46 UTC (rev 113135) +++ trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js 2012-03-06 13:49:37 UTC (rev 113136) @@ -10,7 +10,7 @@ Title.prototype.makeLink = function () { // XXX: links always point to the canonical namespace name. 
if ( false && this.nskey ) { - return this.env.wgScriptPath + this.nskey + ':' + this.key; + return this.env.sanitizeURI( this.env.wgScriptPath + this.nskey + ':' + this.key ); } else { var l = this.env.wgScriptPath, ns = this.ns.getDefaultName(); @@ -18,7 +18,7 @@ if ( ns ) { l += ns + ':'; } - return l + this.key; + return this.env.sanitizeURI( l + this.key ); } }; Modified: trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js =================================================================== --- trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js 2012-03-06 13:43:46 UTC (rev 113135) +++ trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js 2012-03-06 13:49:37 UTC (rev 113136) @@ -208,7 +208,45 @@ return out.join(''); }; +MWParserEnvironment.prototype.decodeURI = function ( s ) { + return s.replace( /%[0-9a-f][0-9a-f]/g, function( m ) { + try { + return decodeURI( m ); + } catch ( e ) { + return m; + } + } ); +}; +MWParserEnvironment.prototype.sanitizeURI = function ( s ) { + var host = s.match(/^[a-zA-Z]+:\/\/[^\/]+(?:\/|$)/), + path = s, + anchor = null; + console.warn( 'host: ' + host ); + if ( host ) { + path = s.substr( host[0].length ); + host = host[0]; + } else { + host = ''; + } + var bits = path.split('#'); + if ( bits.length > 1 ) { + anchor = bits[bits.length - 1]; + path = path.substr(0, path.length - anchor.length - 1); + } + host = host.replace( /%(?![0-9a-fA-F][0-9a-fA-F])|[#|]/g, function ( m ) { + return encodeURIComponent( m ); + } ); + path = path.replace( /%(?![0-9a-fA-F][0-9a-fA-F])|[\[\]#|]/g, function ( m ) { + return encodeURIComponent( m ); + } ); + s = host + path; + if ( anchor !== null ) { + s += '#' + anchor; + } + return s; +}; + /** * Simple debug helper */ Modified: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt =================================================================== --- 
trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt 2012-03-06 13:43:46 UTC (rev 113135) +++ trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt 2012-03-06 13:49:37 UTC (rev 113136) @@ -645,7 +645,7 @@ ) / s:[.:,] !(space / eolf) { return s } / htmlentity - / urlencoded_char + /// urlencoded_char / [&%] )+ { return proto + addr + rest.join(''); @@ -1674,17 +1674,19 @@ wikilink_preprocessor_text - = r:( t:[^%<~[{\n\r\t|!\]} &=]+ { return t.join(''); } - / urlencoded_char + = r:( t:[^<~[{\n\r\t|!\]} &=]+ { return t.join(''); } + /// urlencoded_char / directive / !inline_breaks !"|" !"]]" text_char )+ { return flatten_stringlist ( r ); } extlink_preprocessor_text + // added special separator character class inline: separates url from + // description / text = r:( t:[^'<~[{\n\r|!\]}\t&="' \u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return t.join(''); } / directive - / urlencoded_char + /// urlencoded_char / !inline_breaks no_punctuation_char / s:[.:,] !(space / eolf) { return s } / [&%] )+ { Modified: trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js =================================================================== --- trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js 2012-03-06 13:43:46 UTC (rev 113135) +++ trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js 2012-03-06 13:49:37 UTC (rev 113136) @@ -46,36 +46,24 @@ /* Missing token transform functionality */ -// We don't implement percent encoding for URIs yet. -testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a data-mw-type=\"internal\" href=\"/wiki/Lista d''e paise d''o munno\">Lista d''e paise d''o munno</a></p>"; +// Single quotes are legal in HTML5 URIs. 
See +// http://www.whatwg.org/specs/web-apps/current-work/multipage/urls.html#url-manipulation-and-creation +testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a href=\"/wiki/Lista_d''e_paise_d''o_munno\" data-mw-type=\"internal\">Lista d''e paise d''o munno</a></p>"; -testWhiteList["Link containing \"<#\" and \">#\" as a hex sequences"] = "<p><a data-mw-type=\"internal\" href=\"/wiki/<%23\"><%23</a><a data-mw-type=\"internal\" href=\"/wiki/>%23\">>%23</a></p>"; - // Sanitizer testWhiteList["Invalid attributes in table cell (bug 1830)"] = "<table><tbody><tr><td Cell:=\"\">broken</td></tr></tbody></table>"; testWhiteList["Table security: embedded pipes (http://lists.wikimedia.org/mailman/htdig/wikitech-l/2006-April/022293.html)"] = "<table><tbody><tr><td> |<a href=\"ftp://|x||\">[1]</a>\" onmouseover=\"alert(document.cookie)\">test</td></tr></tbody></table>"; -// Sanitizer, but UTF8 in link might actually be ok in HTML5 +// Sanitizer, but UTF8 in link is ok in HTML5 testWhiteList["External link containing double-single-quotes with no space separating the url from text in italics"] = "<p><a href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\" data-mw-type=\"external\" data-mw-rt=\"{&quot;sourcePos&quot;:[0,146]}\"><i>La muerte de Casagemas</i> (1901) en el sitio de </a><a href=\"/wiki/Museo_Picasso_(París)\" data-mw-type=\"internal\">Museo Picasso</a>.</p>"; -// plain percent sign is also valid in HTML5 -testWhiteList["Bug 4781, 5267: %28, %29 in URL"] = "<p><a href=\"http://www.example.com/?title=Ben-Hur_(1959_film)\" data-mw-sourcePos=\"0:53\">http://www.example.com/?title=Ben-Hur_(1959_film)</a></p>"; - testWhiteList["External links: wiki links within external link (Bug 3695)"] = "<p><a href=\"http://example.com\" data-mw-type=\"external\" data-mw-sourcePos=\"0:54\"></a><a data-mw-type=\"internal\" href=\"/wiki/Wikilink\">wikilink</a> embedded in ext link</p>"; -testWhiteList["Bug 4781, 5267: %25 in URL"] = "<p><a 
href=\"http://www.example.com/?title=100%_Bran\" data-mw-sourcePos=\"0:41\">http://www.example.com/?title=100%_Bran</a></p>"; - testWhiteList["<pre> with forbidden attribute values (bug 3202)"] = "<pre width=\"8\" style=\"\">Narrow screen goodies</pre>"; -testWhiteList["Link containing % (not as a hex sequence)"] = "<p><a href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% Solution</a></p>"; +//testWhiteList["Piped link to URL"] = "<p>Piped link to URL: [<a href=\"http://www.example.com|an\" data-mw-type=\"external\">example URL</a>]</p>"; -testWhiteList["Link containing % as a single hex sequence interpreted to char"] = "<p><a href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% Solution</a></p>"; - -testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a href=\"/wiki/Lista_d''e_paise_d''o_munno\" data-mw-type=\"internal\">Lista d''e paise d''o munno</a></p>"; - -testWhiteList["Brackets in urls"] = "<p><a href=\"http://example.com/index.php?foozoid[]=bar\">http://example.com/index.php?foozoid[]=bar</a></p><p><a href=\"http://example.com/index.php?foozoid[]=bar\">http://example.com/index.php?foozoid[]=bar</a></p>"; - if (typeof module == "object") { module.exports.testWhiteList = testWhiteList; } _______________________________________________ MediaWiki-CVS mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs
