Legoktm has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/370982 )
Change subject: Don't urldecode user input, and document problems with test cases ...................................................................... Don't urldecode user input, and document problems with test cases As demonstrated by the wm-bot URLs, blindly decoding user input doesn't work. Leave a TODO and add lots of test cases that demonstrate URLs we need to worry about in the future when trying to smartly decode/encode URLs. Change-Id: Icbe96ea53f937b902dc9b7999a673cd8b70ed824 Co-Authored-By: Derk-Jan Hartman <hartman.w...@gmail.com> --- M UrlShortener.utils.php M tests/phpunit/UrlShortenerUtilsTest.php 2 files changed, 43 insertions(+), 8 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/UrlShortener refs/changes/82/370982/1 diff --git a/UrlShortener.utils.php b/UrlShortener.utils.php index fa302ba..d13af55 100755 --- a/UrlShortener.utils.php +++ b/UrlShortener.utils.php @@ -81,11 +81,12 @@ } /** - * Normalizes URL from its `/w/index.php?title=$1` form to - * `/wiki/$1`. + * Normalizes URL into a somewhat canonical form, including: + * * protocol to HTTP + * * from its `/w/index.php?title=$1` form to `/wiki/$1`. * - * @param string $url - * @return string + * @param string $url might be encoded or decoded (raw user input) + * @return string URL that is saved in DB and used in Location header */ public static function normalizeUrl( $url ) { global $wgArticlePath; @@ -93,8 +94,9 @@ // it to a different one when redirecting $url = self::convertToProtocol( $url, PROTO_HTTP ); - // Decode it... - $url = urldecode( $url ); + // TODO: We should ideally decode/encode the URL for normalization, + // but we don't want to double-encode, nor unencode the URL that + // is directly provided by users (see test cases) // If the wiki is using an article path (e.g. /wiki/$1) try // and convert plain index.php?title=$1 URLs to the canonical form diff --git a/tests/phpunit/UrlShortenerUtilsTest.php b/tests/phpunit/UrlShortenerUtilsTest.php index 20a09c6..08afb75 100644 --- a/tests/phpunit/UrlShortenerUtilsTest.php +++ b/tests/phpunit/UrlShortenerUtilsTest.php @@ -95,15 +95,48 @@ 'http://example.com/w/index.php?title=Special:Version&baz=bar', 'http://example.com/w/index.php?title=Special:Version&baz=bar' ], - // urldecoded + // urlencoded [ 'http://example.org/wiki/Scott_Morrison_%28politician%29', - 'http://example.org/wiki/Scott_Morrison_(politician)' + 'http://example.org/wiki/Scott_Morrison_%28politician%29' ], + // urldecoded [ 'http://example.org/wiki/Scott_Morrison_(politician)', 'http://example.org/wiki/Scott_Morrison_(politician)' ], + // Ideally spaces should be replaced with underscores + [ + 'http://example.org/wiki/Scott Morrison (politician)', + 'http://example.org/wiki/Scott Morrison (politician)' + ], + // encoded # in URL that is not an anchor + [ + 'http://bots.wmflabs.org/logs/%23mediawiki/', + 'http://bots.wmflabs.org/logs/%23mediawiki/', + ], + // encoded + in URL that is not a space + [ + 'http://en.wikipedia.org/wiki/%2B_(disambiguation)', + 'http://en.wikipedia.org/wiki/%2B_(disambiguation)' + ], + // encoded + but not encoded / in URL + [ + 'http://en.wikipedia.org/wiki/Talk:C%2B%2B/Archive_1', + 'http://en.wikipedia.org/wiki/Talk:C%2B%2B/Archive_1', + ], + // Bad characters sourced from + // https://perishablepress.com/stop-using-unsafe-characters-in-urls/ + // These should ideally be escaped after the first ? + [ + 'http://example.org/?;/?:@=&"<>#%{}|\^~[]`', + 'http://example.org/?;/?:@=&"<>#%{}|\^~[]`', + ], + // a real anchor + [ + 'http://example.org/this/is/#anchor', + 'http://example.org/this/is/#anchor', + ], ]; } -- To view, visit https://gerrit.wikimedia.org/r/370982 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Icbe96ea53f937b902dc9b7999a673cd8b70ed824 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/UrlShortener Gerrit-Branch: master Gerrit-Owner: Legoktm <lego...@member.fsf.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits