GWicke has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/273146

Change subject: REST path escaping
......................................................................

REST path escaping

REST APIs like their slashes preserved, while MediaWiki likes them decoded.
This patch adds a REST API normalization variant that decodes everything but
the path delimiters [/?#] as well as % itself. This includes the `unreserved`
set and `sub-delim` classes from RFC3986, as well as those characters from
`gen-delim` that are only used in the host / auth portion of a URL ([:@]).

As discussed on the task, the MediaWiki variant should probably include at
least `unreserved` and much of `gen-delim` as well. This patch does not touch
this just yet, leaving it to a follow-up once consensus on those changes is
reached.

Change-Id: I23d5baf099e4a04aa451efde7cd7d952c202e6cd
---
M templates/varnish/text-frontend.inc.vcl.erb
1 file changed, 78 insertions(+), 3 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/46/273146/1

diff --git a/templates/varnish/text-frontend.inc.vcl.erb 
b/templates/varnish/text-frontend.inc.vcl.erb
index 93c197e..6d63d94 100644
--- a/templates/varnish/text-frontend.inc.vcl.erb
+++ b/templates/varnish/text-frontend.inc.vcl.erb
@@ -50,7 +50,7 @@
 C{
        #include <string.h>
 }C
-sub normalize_path {
+sub normalize_mediawiki_path {
        /* Rewrite the path part of the URL, replacing unnecessarily escaped
         * punctuation with the actual characters. The character list is from
         * MediaWiki's wfUrlencode(), so the URLs produced here will be the 
same as
@@ -107,6 +107,79 @@
                                        /* Reached the query part. Just copy 
the rest of the URL
                                         * to the destination.
                                         */
+                                       memcpy(destBuffer + outPos, url + i, 
sizeof(char) * (urlLength - i));
+                                       outPos += urlLength - i;
+                                       i = urlLength;
+                               } else {
+                                       destBuffer[outPos++] = url[i];
+                               }
+                       }
+                       destBuffer[outPos] = '\0';
+
+                       /* Set req.url. This will copy our stack buffer into 
the workspace.
+                        * VRT_l_req_url() is varadic, and concatenates its 
arguments. The
+                        * vrt_magic_string_end marks the end of the list.
+                        */
+                       if (dirty) {
+                               VRT_l_req_url(sp, destBuffer, 
vrt_magic_string_end);
+                       }
+               }
+               #undef NP_IS_HEX
+               #undef NP_HEX_DIGIT
+               #undef NP_HEXCHAR
+       }C
+}
+
+sub normalize_rest_path {
+       /* Rewrite the path part of the URL, replacing unnecessarily escaped
+        * punctuation with the actual characters. The character list is based 
on
+        * the discussion in T127387, aiming for typical REST API use cases,
+        * including RESTBase. */
+       C{
+               /* DIY hexadecimal conversion, since it is simple enough for a 
fixed
+                * width, and all the relevant standard C library functions 
promise to
+                * malfunction if the locale is set to anything other than "C"
+                */
+               #define NP_HEX_DIGIT(c) ( \
+                       (c) >= '0' && (c) <= '9' ? (c) - '0' : ( \
+                               (c) >= 'A' && (c) <= 'F' ? (c) - 'A' + 0x0a : ( 
\
+                                       (c) >= 'a' && (c) <= 'f' ? (c) - 'a' + 
0x0a : -1 ) ) )
+               #define NP_IS_HEX(c) (NP_HEX_DIGIT(c) != -1)
+               #define NP_HEXCHAR(c1, c2) (char)( (NP_HEX_DIGIT(c1) << 4) | 
NP_HEX_DIGIT(c2) )
+               const char * url = VRT_r_req_url(sp);
+               size_t i, outPos;
+               const size_t urlLength = strlen(url);
+                // index for the last position %XX can start at:
+               const size_t lastConvertIdx = urlLength > 2 ? urlLength - 3 : 0;
+               char c;
+               int dirty = 0;
+
+               /* Allocate destination memory from the stack using the C99
+                * variable-length automatic feature. We know the length in 
advance
+                * because this function can only shorten the input string.
+                */
+               char destBuffer[urlLength + 1];
+               if (url) {
+                       for (i = 0, outPos = 0; i < urlLength; i++) {
+                               if (i <= lastConvertIdx && url[i] == '%' && 
NP_IS_HEX(url[i+1]) && NP_IS_HEX(url[i+2])) {
+                                       c = NP_HEXCHAR(url[i+1], url[i+2]);
+                                       /* Don't decode the subset of 
gen-delims from RFC3986
+                                        * that apply to the path portion of 
URLs */
+                                       if (c != '/'
+                                               && c != '?'
+                                               && c != '#'
+                                               /* Also protect percents */
+                                               && c != '%')
+                                       {
+                                               destBuffer[outPos++] = c;
+                                               dirty = 1;
+                                               i += 2;
+                                       } else {
+                                               destBuffer[outPos++] = url[i];
+                                       }
+                               } else if (url[i] == '?' || url[i] == '#') {
+                                       /* Reached the query part or stray 
fragment id. Just
+                                         * copy the rest of the URL to the 
destination. */
                                        memcpy(destBuffer + outPos, url + i, 
sizeof(char) * (urlLength - i));
                                        outPos += urlLength - i;
                                        i = urlLength;
@@ -207,8 +280,10 @@
        }
 
        // Don't decode percent-encoded slashes in paths for REST APIs
-       if (req.url !~ "^/api/rest_v1/" && req.http.host !~ 
"cxserver|citoid|restbase|^rest\.") {
-               call normalize_path;
+       if (req.url ~ "^/api/rest_v1/" || req.http.host ~ 
"cxserver|citoid|restbase|^rest\.") {
+               call normalize_rest_path;
+       } else {
+               call normalize_mediawiki_path;
        }
 
        call mobile_redirect;

-- 
To view, visit https://gerrit.wikimedia.org/r/273146
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I23d5baf099e4a04aa451efde7cd7d952c202e6cd
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: GWicke <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to