https://www.mediawiki.org/wiki/Special:Code/MediaWiki/113136

Revision: 113136
Author:   gwicke
Date:     2012-03-06 13:49:37 +0000 (Tue, 06 Mar 2012)
Log Message:
-----------
Reworked percent encoding handling for URIs to get closer to the 'url
construction' part of the HTML5 spec:
http://www.whatwg.org/specs/web-apps/current-work/multipage/urls.html#url-manipulation-and-creation

Removed a few whitelisted test cases that are now passing directly.

The encoding canonicalization could also be moved to the Sanitizer. Doing this
early in token stream processing, however, has the advantage of giving later
transformations uniform data to work with. We could even consider moving it
further up, into the tokenizer.

Modified Paths:
--------------
    trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js
    trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js
    trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js
    trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
    trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js

Modified: trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js        
2012-03-06 13:43:46 UTC (rev 113135)
+++ trunk/extensions/VisualEditor/modules/parser/ext.core.LinkHandler.js        
2012-03-06 13:49:37 UTC (rev 113136)
@@ -54,7 +54,7 @@
                        }
                        content = out;
                } else {
-                       content = href;
+                       content = [ env.decodeURI( env.tokensToString( href ) ) 
];
                }
                if ( tail ) {
                        content.push( tail );
@@ -105,19 +105,19 @@
        
        var content = token.attribs.slice(1, -1);
 
-       // XXX: get /wiki from config!
+       // TODO: get /wiki from config!
        var a = new TagTk( 'a', [ new KV( 'href', '/wiki' + title.makeLink() ) 
] );
        a.dataAttribs = token.dataAttribs;
 
        var MD5 = new jshashes.MD5(),
                hash = MD5.hex( title.key ),
-               // XXX: Hackhack..
+               // TODO: Hackhack.. Move to proper test harness setup!
                path = 'http://example.com/images/' + 
                        [ hash[0], hash.substr(0, 2) ].join('/') + '/' + 
title.key;
        
        
 
-       // XXX: extract options
+       // extract options
        var options = [],
                caption = null;
        for( var i = 0, l = content.length; i<l; i++ ) {
@@ -132,7 +132,7 @@
                        } 
                } else {
                        var bits = oText[0].split( '=', 2 );
-                       if ( bits.length > 1 && this._prefixImageOptions[ 
bits[0].strip ] ) {
+                       if ( bits.length > 1 && this._prefixImageOptions[ 
bits[0].trim() ] ) {
                                console.log('handle prefix ' + bits );
                        } else {
                                caption = oContent;
@@ -217,7 +217,9 @@
 };
 
 ExternalLinkHandler.prototype.onUrlLink = function ( token, manager, cb ) {
-       var href = this.manager.env.lookupKV( token.attribs, 'href' ).v;
+       var href = this.manager.env.sanitizeURI( 
+                       this.manager.env.lookupKV( token.attribs, 'href' ).v 
+                       );
        if ( this._isImageLink( href ) ) {
                return { token: new SelfclosingTagTk( 'img', 
                                [ 
@@ -241,6 +243,8 @@
 ExternalLinkHandler.prototype.onExtLink = function ( token, manager, cb ) {
        var href = this.manager.env.lookupKV( token.attribs, 'href' ).v,
                content=  this.manager.env.lookupKV( token.attribs, 'content' 
).v;
+       href = this.manager.env.sanitizeURI( href );
+       console.warn('extlink href: ' + href );
        //console.warn( 'content: ' + JSON.stringify( content, null, 2 ) );
        // validate the href
        if ( this.imageParser.parseURL( href ) ) {

Modified: trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js     
2012-03-06 13:43:46 UTC (rev 113135)
+++ trunk/extensions/VisualEditor/modules/parser/mediawiki.Title.js     
2012-03-06 13:49:37 UTC (rev 113136)
@@ -10,7 +10,7 @@
 Title.prototype.makeLink = function () {
        // XXX: links always point to the canonical namespace name.
        if ( false && this.nskey ) {
-               return this.env.wgScriptPath + this.nskey + ':' + this.key;
+               return this.env.sanitizeURI( this.env.wgScriptPath + this.nskey 
+ ':' + this.key );
        } else {
                var l = this.env.wgScriptPath,
                        ns = this.ns.getDefaultName();
@@ -18,7 +18,7 @@
                if ( ns ) {
                        l += ns + ':';
                }
-               return l + this.key;
+               return this.env.sanitizeURI( l + this.key );
        }
 };
 

Modified: 
trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js
===================================================================
--- 
trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js    
    2012-03-06 13:43:46 UTC (rev 113135)
+++ 
trunk/extensions/VisualEditor/modules/parser/mediawiki.parser.environment.js    
    2012-03-06 13:49:37 UTC (rev 113136)
@@ -208,7 +208,45 @@
        return out.join('');
 };
 
+// Decode percent-escapes in s. Each match is the escape sequence of one UTF-8
+// character (1-4 bytes, either hex case), so multi-byte characters decode too.
+// Escapes decodeURI rejects (URIError) or preserves (reserved, %23) stay as-is.
+MWParserEnvironment.prototype.decodeURI = function ( s ) {
+       var utf8Escapes = /%[0-7][0-9a-f]|%[cd][0-9a-f]%[89ab][0-9a-f]|%e[0-9a-f](?:%[89ab][0-9a-f]){2}|%f[0-7](?:%[89ab][0-9a-f]){3}/gi;
+       return s.replace( utf8Escapes, function ( m ) {
+               try { return decodeURI( m ); } catch ( e ) { return m; }
+       } );
+};
 
+// Percent-encode characters that are not valid raw in a URI: a '%' that does
+// not start a two-digit hex escape, and '#' or '|' before the fragment; '['
+// and ']' are also encoded outside the host. Text after the last '#' is the
+// fragment and is returned unchanged. s must be a string.
+MWParserEnvironment.prototype.sanitizeURI = function ( s ) {
+       // The host part ends at the first '/', '?' or '#' (plus one optional
+       // '/'), so the '#' in 'http://host#frag' is not encoded as host text.
+       var host = s.match( /^[a-zA-Z]+:\/\/[^\/?#]+\/?/ ),
+               anchor = null,
+               path, hashPos,
+               encode = function ( m ) {
+                       return encodeURIComponent( m );
+               };
+       host = host ? host[0] : '';
+       path = s.substring( host.length );
+       hashPos = path.lastIndexOf( '#' );
+       if ( hashPos !== -1 ) {
+               anchor = path.substring( hashPos + 1 );
+               path = path.substring( 0, hashPos );
+       }
+       host = host.replace( /%(?![0-9a-fA-F][0-9a-fA-F])|[#|]/g, encode );
+       path = path.replace( /%(?![0-9a-fA-F][0-9a-fA-F])|[\[\]#|]/g, encode );
+       s = host + path;
+       if ( anchor !== null ) {
+               s += '#' + anchor;
+       }
+       return s;
+};
+
 /**
  * Simple debug helper
  */

Modified: trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt
===================================================================
--- trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt 
2012-03-06 13:43:46 UTC (rev 113135)
+++ trunk/extensions/VisualEditor/modules/parser/pegTokenizer.pegjs.txt 
2012-03-06 13:49:37 UTC (rev 113136)
@@ -645,7 +645,7 @@
             )
             / s:[.:,] !(space / eolf) { return s } 
             / htmlentity
-            / urlencoded_char
+            /// urlencoded_char
             / [&%] )+ 
 { 
     return proto + addr + rest.join(''); 
@@ -1674,17 +1674,19 @@
 
 
 wikilink_preprocessor_text 
-  = r:( t:[^%<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
-  / urlencoded_char
+  = r:( t:[^<~[{\n\r\t|!\]} &=]+ { return t.join(''); }
+  /// urlencoded_char
   / directive
   / !inline_breaks !"|" !"]]" text_char )+ {
       return flatten_stringlist ( r );
   }
 
 extlink_preprocessor_text
+  // added special separator character class inline: separates url from
+  // description / text
   = r:( t:[^'<~[{\n\r|!\]}\t&="' 
\u00A0\u1680\u180E\u2000-\u200A\u202F\u205F\u3000]+ { return t.join(''); }
   / directive
-  / urlencoded_char
+  /// urlencoded_char
   / !inline_breaks no_punctuation_char
   / s:[.:,] !(space / eolf) { return s } 
   / [&%] )+ {

Modified: trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js
===================================================================
--- trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js 
2012-03-06 13:43:46 UTC (rev 113135)
+++ trunk/extensions/VisualEditor/tests/parser/parserTests-whitelist.js 
2012-03-06 13:49:37 UTC (rev 113136)
@@ -46,36 +46,24 @@
 
 /* Missing token transform functionality */
 
-// We don't implement percent encoding for URIs yet.
-testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a 
data-mw-type=\"internal\" href=\"/wiki/Lista d''e paise d''o munno\">Lista d''e 
paise d''o munno</a></p>";
+// Single quotes are legal in HTML5 URIs. See 
+// 
http://www.whatwg.org/specs/web-apps/current-work/multipage/urls.html#url-manipulation-and-creation
+testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a 
href=\"/wiki/Lista_d''e_paise_d''o_munno\" data-mw-type=\"internal\">Lista d''e 
paise d''o munno</a></p>";
 
-testWhiteList["Link containing \"<#\" and \">#\" as a hex sequences"] = "<p><a 
data-mw-type=\"internal\" href=\"/wiki/&lt;%23\">&lt;%23</a><a 
data-mw-type=\"internal\" href=\"/wiki/&gt;%23\">&gt;%23</a></p>";
 
-
 // Sanitizer
 testWhiteList["Invalid attributes in table cell (bug 1830)"] = 
"<table><tbody><tr><td Cell:=\"\">broken</td></tr></tbody></table>";
 testWhiteList["Table security: embedded pipes 
(http://lists.wikimedia.org/mailman/htdig/wikitech-l/2006-April/022293.html)"] 
= "<table><tbody><tr><td> |<a href=\"ftp://|x||\">[1]</a>\" 
onmouseover=\"alert(document.cookie)\"&gt;test</td></tr></tbody></table>";
 
-// Sanitizer, but UTF8 in link might actually be ok in HTML5
+// Sanitizer, but UTF8 in link is ok in HTML5
 testWhiteList["External link containing double-single-quotes with no space 
separating the url from text in italics"] = "<p><a 
href=\"http://www.musee-picasso.fr/pages/page_id18528_u1l2.htm\"; 
data-mw-type=\"external\" data-mw-rt=\"{&quot;sourcePos&quot;:[0,146]}\"><i>La 
muerte de Casagemas</i> (1901) en el sitio de </a><a 
href=\"/wiki/Museo_Picasso_(ParĂ­s)\" data-mw-type=\"internal\">Museo 
Picasso</a>.</p>";
 
-// plain percent sign is also valid in HTML5
-testWhiteList["Bug 4781, 5267: %28, %29 in URL"] = "<p><a 
href=\"http://www.example.com/?title=Ben-Hur_(1959_film)\" 
data-mw-sourcePos=\"0:53\">http://www.example.com/?title=Ben-Hur_(1959_film)</a></p>";
-
 testWhiteList["External links: wiki links within external link (Bug 3695)"] = 
"<p><a href=\"http://example.com\"; data-mw-type=\"external\" 
data-mw-sourcePos=\"0:54\"></a><a data-mw-type=\"internal\" 
href=\"/wiki/Wikilink\">wikilink</a> embedded in ext link</p>";
 
-testWhiteList["Bug 4781, 5267: %25 in URL"] = "<p><a 
href=\"http://www.example.com/?title=100%_Bran\"; 
data-mw-sourcePos=\"0:41\">http://www.example.com/?title=100%_Bran</a></p>";
-
 testWhiteList["<pre> with forbidden attribute values (bug 3202)"] = "<pre 
width=\"8\" style=\"\">Narrow screen goodies</pre>";
 
-testWhiteList["Link containing % (not as a hex sequence)"] = "<p><a 
href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% Solution</a></p>";
+//testWhiteList["Piped link to URL"] = "<p>Piped link to URL: [<a 
href=\"http://www.example.com|an\" data-mw-type=\"external\">example 
URL</a>]</p>";
 
-testWhiteList["Link containing % as a single hex sequence interpreted to 
char"] = "<p><a href=\"/wiki/7%_Solution\" data-mw-type=\"internal\">7% 
Solution</a></p>";
-
-testWhiteList["Link containing double-single-quotes '' (bug 4598)"] = "<p><a 
href=\"/wiki/Lista_d''e_paise_d''o_munno\" data-mw-type=\"internal\">Lista d''e 
paise d''o munno</a></p>";
-
-testWhiteList["Brackets in urls"] = "<p><a 
href=\"http://example.com/index.php?foozoid[]=bar\";>http://example.com/index.php?foozoid[]=bar</a></p><p><a
 
href=\"http://example.com/index.php?foozoid[]=bar\";>http://example.com/index.php?foozoid[]=bar</a></p>";
-
 if (typeof module == "object") {
        module.exports.testWhiteList = testWhiteList;
 }


_______________________________________________
MediaWiki-CVS mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-cvs

Reply via email to