BBlack has uploaded a new change for review.
https://gerrit.wikimedia.org/r/274089
Change subject: normalize_path: fully parameterize the decoded set
......................................................................
normalize_path: fully parameterize the decoded set
Bug: T127387
Change-Id: Ie95526896305be7eb15374728a01d51d2f6d5f52
---
M templates/varnish/normalize_path.inc.vcl.erb
1 file changed, 69 insertions(+), 25 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/operations/puppet
refs/changes/89/274089/1
diff --git a/templates/varnish/normalize_path.inc.vcl.erb
b/templates/varnish/normalize_path.inc.vcl.erb
index 6970310..de503e5 100644
--- a/templates/varnish/normalize_path.inc.vcl.erb
+++ b/templates/varnish/normalize_path.inc.vcl.erb
@@ -1,5 +1,36 @@
C{
#include <string.h>
+#include <inttypes.h>
+
+/******************************************************************************
+* normalize_url_path(sp, "..."):
+*
+* Rewrites the path part of the URL, decoding some %-encoded characters into
+* the actual characters they represent. This stops conversion when it first
+* runs into a literal '?' or '#', indicating the end of the path part of the
+* URL. This sort of canonicalization helps with cache fragmentation and PURGE
+* issues in general, but the set of decoded characters can be
+* application/service -specific.
+*
+* By default this function only decodes the Unreserved Set of universally-safe
+* characters, which are [-A-Za-z0-9_.~]. If an argument string is provided,
+* all of the characters in that string are also decoded.
+*
+* As a general rule, it would never be wise to decode any of the following:
+* Anything unprintable in US-ASCII (0x00 - 0x1F, 0x7F - 0xFF).
+* Any of these four: ' ' (0x20), '#' (0x23), '%' (0x25), '?' (0x3F)
+*
+* Combining all of the information above, the total set of potentially useful
+* characters to place in an argument string which aren't already converted by
+* default would be:
+* !"$&'()*+,-./;<=>@[\]^_{|}~
+* or in C string form with a couple of required escapes:
+* "!\"$&'()*+,-./;<=>@[\\]^_{|}~"
+*
+* Whether it's safe to decode any these within the path segment is really up
+* to application-layer semantics. In the absence of known application-layer
+* semantics, we can't assume we can decode any of them safely.
+******************************************************************************/
/* DIY hexadecimal conversion, since it is simple enough for a fixed
* width, and all the relevant standard C library functions promise to
@@ -12,14 +43,11 @@
#define NP_IS_HEX(c) (NP_HEX_DIGIT(c) != -1)
#define NP_HEXCHAR(c1, c2) (char)( (NP_HEX_DIGIT(c1) << 4) | NP_HEX_DIGIT(c2) )
-void raw_normalize_path(struct sess *sp, const int doslash) {
- /* Rewrite the path part of the URL, replacing unnecessarily escaped
- * punctuation with the actual characters. The character list is from
- * MediaWiki's wfUrlencode(), so the URLs produced here will be the
same as
- * the ones produced by MediaWiki in href attributes. Doing this reduces
- * cache fragmentation and fixes T29935, i.e. stale cache entries due to
- * MediaWiki purging only the wfUrlencode'd version of the URL.
- */
+// note these evaluate their argument twice!
+#define NP_SET(_x) (decode_ok[_x >> 6] |= (1 << (63 - (_x & 63))))
+#define NP_CHECK(_x) (decode_ok[_x >> 6] & (1 << (63 - (_x & 63))))
+
+void raw_normalize_path(struct sess *sp, const char* addtl_chars) {
const char * url = VRT_r_req_url(sp);
assert(url);
@@ -29,25 +57,34 @@
const size_t lastConvertIdx = urlLength > 2 ? urlLength - 3 : 0;
int dirty = 0;
- /* Allocate destination memory from the stack using the C99
- * variable-length automatic feature. We know the length in advance
- * because this function can only shorten the input string.
- */
+ // default decoding map: [-A-Za-z0-9_.~]
+ // X in commnts below is unprintable, except where it's obviously the
real X
+ uint64_t decode_ok[4] = {
+ // 0x00 - 0x3F:
+ //XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
!"#$%&'()*+,-./0123456789:;<=>?
+
0b0000000000000000000000000000000000000000000001101111111111000000,
+ // 0x40 - 0x7F:
+
//@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~X
+
0b0111111111111111111111111110000101111111111111111111111111100010,
+ // 0x80 - 0xBF: (all unprintable)
+
0b0000000000000000000000000000000000000000000000000000000000000000,
+ // 0xC0 - 0xFF: (all unprintable)
+
0b0000000000000000000000000000000000000000000000000000000000000000,
+ };
+
+ if(addtl_chars) {
+ // Turn on bits for the additional characters requested
+ char addtl;
+ while ((addtl = *addtl_chars++))
+ NP_SET(addtl);
+ }
+
+ // We know the length in advance because this function can only shorten
the input string
char destBuffer[urlLength + 1];
while (url[i] && url[i] != '?' && url[i] != '#') {
if (i <= lastConvertIdx && url[i] == '%' && NP_IS_HEX(url[i+1])
&& NP_IS_HEX(url[i+2])) {
const char c = NP_HEXCHAR(url[i+1], url[i+2]);
- if (c == ';'
- || c == '@'
- || c == '$'
- || c == '!'
- || c == '*'
- || c == '('
- || c == ')'
- || c == ','
- || c == ':'
- || (doslash && c == '/'))
- {
+ if (NP_CHECK(c)) {
destBuffer[outPos++] = c;
dirty = 1;
i += 3;
@@ -75,13 +112,20 @@
#undef NP_IS_HEX
#undef NP_HEX_DIGIT
#undef NP_HEXCHAR
+#undef NP_SET
+#undef NP_CHECK
}C
+// Use raw_normalize_path to decode certain charaters in the path part
+// of the URL. The character list is from MediaWiki's wfUrlencode(), so the
+// URLs produced here will be the same as the ones produced by
+// MediaWiki in href attributes.
sub normalize_mediawiki_path {
- C{ raw_normalize_path(sp, 1); }C
+ C{ raw_normalize_path(sp, ";@$!*(),/:"); }C
}
+// RB initially has the same set as MW, minus the forward slash
sub normalize_rest_path {
- C{ raw_normalize_path(sp, 0); }C
+ C{ raw_normalize_path(sp, ";@$!*(),:"); }C
}
--
To view, visit https://gerrit.wikimedia.org/r/274089
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie95526896305be7eb15374728a01d51d2f6d5f52
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: BBlack <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits