BBlack has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/391216 )

Change subject: Fully normalize upload paths
......................................................................


Fully normalize upload paths

This also contains disabled code to do the same for cache_text,
but leaves the legacy cache_text normalization in place the way it
was for now.  We can deploy this now in the more-urgent case of
cache_upload and then evaluate switching cache_text afterwards.
The VTC can be a WIP for now...

Bug: T171881
Bug: T127387
Change-Id: I5887c54a9295e6a344911621b2963c9df9ad4e24
---
M modules/profile/manifests/cache/upload.pp
A modules/varnish/files/tests/text/16-normalize-path.vtc
M modules/varnish/templates/normalize_path.inc.vcl.erb
M modules/varnish/templates/upload-backend.inc.vcl.erb
M modules/varnish/templates/upload-common.inc.vcl.erb
M modules/varnish/templates/upload-frontend.inc.vcl.erb
6 files changed, 305 insertions(+), 6 deletions(-)

Approvals:
  BBlack: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/modules/profile/manifests/cache/upload.pp 
b/modules/profile/manifests/cache/upload.pp
index 89dc150..1cc5659 100644
--- a/modules/profile/manifests/cache/upload.pp
+++ b/modules/profile/manifests/cache/upload.pp
@@ -87,8 +87,8 @@
         app_def_be_opts  => $app_def_be_opts,
         fe_vcl_config    => $fe_vcl_config,
         be_vcl_config    => $be_vcl_config,
-        fe_extra_vcl     => ['upload-common'],
-        be_extra_vcl     => ['upload-common'],
+        fe_extra_vcl     => ['upload-common', 'normalize_path'],
+        be_extra_vcl     => ['upload-common', 'normalize_path'],
         be_storage       => $upload_storage_args,
         fe_cache_be_opts => $fe_cache_be_opts,
         be_cache_be_opts => $be_cache_be_opts,
diff --git a/modules/varnish/files/tests/text/16-normalize-path.vtc 
b/modules/varnish/files/tests/text/16-normalize-path.vtc
new file mode 100644
index 0000000..23f2558
--- /dev/null
+++ b/modules/varnish/files/tests/text/16-normalize-path.vtc
@@ -0,0 +1,72 @@
+varnishtest "Path Normalization"
+
+server s1 {
+    loop 10 {
+        rxreq
+        txresp
+    }
+} -start
+
+varnish v1 -arg "-p cc_command='exec cc -fpic -shared -Wl,-x -L/usr/local/lib/ 
-o %o %s -lmaxminddb' -p vcc_allow_inline_c=true -p vcc_err_unref=false" 
-vcl+backend {
+    backend vtc_backend {
+        .host = "${s1_addr}"; .port = "${s1_port}";
+    }
+
+    sub vcl_deliver {
+        set resp.http.req-url = req.url;
+    }
+
+    include "/usr/share/varnish/tests/wikimedia_text-frontend.vcl";
+} -start
+
+client c1 {
+    # Sanity-check test mechanism
+    txreq -url "/"
+    rxresp
+    expect resp.http.req-url == "/"
+
+    # Send all chars in raw unencoded form, except for NUL, space, and ?, 
which would otherwise abort processing early and/or malform the HTTP 
request-line, and thus not let us test the rest
+    txreq -url 
"/\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x21\x22\x23\x24\x25\x26\x27\x28\x29\x2a\x2b\x2c\x2d\x2e\x2f\x30\x31\x32\x33\x34\x35\x36\x37\x38\x39\x3a\x3b\x3c\x3d\x3e\x40\x41\x42\x43\x44\x45\x46\x47\x48\x49\x4a\x4b\x4c\x4d\x4e\x4f\x50\x51\x52\x53\x54\x55\x56\x57\x58\x59\x5a\x5b\x5c\x5d\x5e\x5f\x60\x61\x62\x63\x64\x65\x66\x67\x68\x69\x6a\x6b\x6c\x6d\x6e\x6f\x70\x71\x72\x73\x74\x75\x76\x77\x78\x79\x7a\x7b\x7c\x7d\x7e\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff"
+    rxresp
+    expect resp.http.req-url == 
"/%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%1A%1B%1C%1D%1E%1F!%22%23$%25%26%27()*%2B,-./0123456789%3A%3B%3C%3D%3E@ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F%80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F%90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F%A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF%B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF%C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF%D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF%E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF%F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF"
+
+    # Send all chars in %-encoded lowercase form, with no exceptions.  Output 
should be identical to the above except for the addition of %-encoded NUL, 
space, and ?.
+    txreq -url 
"/%00%01%02%03%04%05%06%07%08%09%0a%0b%0c%0d%0e%0f%10%11%12%13%14%15%16%17%18%19%1a%1b%1c%1d%1e%1f%20%21%22%23%24%25%26%27%28%29%2a%2b%2c%2d%2e%2f%30%31%32%33%34%35%36%37%38%39%3a%3b%3c%3d%3e%3f%40%41%42%43%44%45%46%47%48%49%4a%4b%4c%4d%4e%4f%50%51%52%53%54%55%56%57%58%59%5a%5b%5c%5d%5e%5f%60%61%62%63%64%65%66%67%68%69%6a%6b%6c%6d%6e%6f%70%71%72%73%74%75%76%77%78%79%7a%7b%7c%7d%7e%7f%80%81%82%83%84%85%86%87%88%89%8a%8b%8c%8d%8e%8f%90%91%92%93%94%95%96%97%98%99%9a%9b%9c%9d%9e%9f%a0%a1%a2%a3%a4%a5%a6%a7%a8%a9%aa%ab%ac%ad%ae%af%b0%b1%b2%b3%b4%b5%b6%b7%b8%b9%ba%bb%bc%bd%be%bf%c0%c1%c2%c3%c4%c5%c6%c7%c8%c9%ca%cb%cc%cd%ce%cf%d0%d1%d2%d3%d4%d5%d6%d7%d8%d9%da%db%dc%dd%de%df%e0%e1%e2%e3%e4%e5%e6%e7%e8%e9%ea%eb%ec%ed%ee%ef%f0%f1%f2%f3%f4%f5%f6%f7%f8%f9%fa%fb%fc%fd%fe%ff"
+    rxresp
+    expect resp.http.req-url == 
"/%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%20%1A%1B%1C%1D%1E%1F!%22%23$%25%26%27()*%2B,-./0123456789%3A%3B%3C%3D%3E%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F%80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F%90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F%A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF%B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF%C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF%D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF%E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF%F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF"
+
+    # As above but for RB path prefix, which should alter the output to leave 
the forward-slash percent-encoded (but uppercased)
+    txreq -url 
"/api/rest_v1/%00%01%02%03%04%05%06%07%08%09%0a%0b%0c%0d%0e%0f%10%11%12%13%14%15%16%17%18%19%1a%1b%1c%1d%1e%1f%20%21%22%23%24%25%26%27%28%29%2a%2b%2c%2d%2e%2f%30%31%32%33%34%35%36%37%38%39%3a%3b%3c%3d%3e%3f%40%41%42%43%44%45%46%47%48%49%4a%4b%4c%4d%4e%4f%50%51%52%53%54%55%56%57%58%59%5a%5b%5c%5d%5e%5f%60%61%62%63%64%65%66%67%68%69%6a%6b%6c%6d%6e%6f%70%71%72%73%74%75%76%77%78%79%7a%7b%7c%7d%7e%7f%80%81%82%83%84%85%86%87%88%89%8a%8b%8c%8d%8e%8f%90%91%92%93%94%95%96%97%98%99%9a%9b%9c%9d%9e%9f%a0%a1%a2%a3%a4%a5%a6%a7%a8%a9%aa%ab%ac%ad%ae%af%b0%b1%b2%b3%b4%b5%b6%b7%b8%b9%ba%bb%bc%bd%be%bf%c0%c1%c2%c3%c4%c5%c6%c7%c8%c9%ca%cb%cc%cd%ce%cf%d0%d1%d2%d3%d4%d5%d6%d7%d8%d9%da%db%dc%dd%de%df%e0%e1%e2%e3%e4%e5%e6%e7%e8%e9%ea%eb%ec%ed%ee%ef%f0%f1%f2%f3%f4%f5%f6%f7%f8%f9%fa%fb%fc%fd%fe%ff"
+    rxresp
+    expect resp.http.req-url == 
"/api/rest_v1/%00%01%02%03%04%05%06%07%08%09%0A%0B%0C%0D%0E%0F%10%11%12%13%14%15%16%17%18%19%20%1A%1B%1C%1D%1E%1F!%22%23$%25%26%27()*%2B,-.%2F0123456789%3A%3B%3C%3D%3E%3F@ABCDEFGHIJKLMNOPQRSTUVWXYZ%5B%5C%5D%5E_%60abcdefghijklmnopqrstuvwxyz%7B%7C%7D~%7F%80%81%82%83%84%85%86%87%88%89%8A%8B%8C%8D%8E%8F%90%91%92%93%94%95%96%97%98%99%9A%9B%9C%9D%9E%9F%A0%A1%A2%A3%A4%A5%A6%A7%A8%A9%AA%AB%AC%AD%AE%AF%B0%B1%B2%B3%B4%B5%B6%B7%B8%B9%BA%BB%BC%BD%BE%BF%C0%C1%C2%C3%C4%C5%C6%C7%C8%C9%CA%CB%CC%CD%CE%CF%D0%D1%D2%D3%D4%D5%D6%D7%D8%D9%DA%DB%DC%DD%DE%DF%E0%E1%E2%E3%E4%E5%E6%E7%E8%E9%EA%EB%EC%ED%EE%EF%F0%F1%F2%F3%F4%F5%F6%F7%F8%F9%FA%FB%FC%FD%FE%FF"
+
+    # Literal % that needs encoding:
+    txreq -url "/lmno%pqrs"
+    rxresp
+    expect resp.http.req-url == "/lmno%25pqrs"
+
+    # Literal % at end of path
+    txreq -url "/lmno%"
+    rxresp
+    expect resp.http.req-url == "/lmno%25"
+
+    # Literal % cases with half a hex byte after
+    txreq -url "/lmno%1x%x2"
+    rxresp
+    expect resp.http.req-url == "/lmno%251x%25x2"
+
+    # Literal % followed by 1x non-hex at end of path
+    txreq -url "/lmno%x"
+    rxresp
+    expect resp.http.req-url == "/lmno%25x"
+
+    # Literal % followed by 1x hex at end of path
+    txreq -url "/lmno%x1"
+    rxresp
+    expect resp.http.req-url == "/lmno%251"
+
+    # Test query part is unaffected (if path-normalization were applied, query 
part would become "?foo%26bar")
+    txreq -url "/as%64f?fo%6f&bar"
+    rxresp
+    expect resp.http.req-url == "/asdf?fo%6f&bar"
+} -run
diff --git a/modules/varnish/templates/normalize_path.inc.vcl.erb 
b/modules/varnish/templates/normalize_path.inc.vcl.erb
index 98e4242..3147cf5 100644
--- a/modules/varnish/templates/normalize_path.inc.vcl.erb
+++ b/modules/varnish/templates/normalize_path.inc.vcl.erb
@@ -1,4 +1,204 @@
 C{
+
+/*******************************************************************************
+ * URL-Path (not query!) normalization:
+ *
+ * For MediaWiki's purposes, the 256 characters can be divided into two sets
+ * named Always-Decode and Always-Encode, which means every path has exactly
+ * one canonical encoding:
+ *
+ * Always-Decode:
+ *   Unreserved Set (RFC 3986): 0-9 A-Z a-z - . _ ~
+ *   MW-specific from wfUrlencode: ! $ ( ) * , : ; / @
+ * Always-Encode:
+ *   Unprintables and space: 0x00-0x20 0x7F-0xFF
+ *   MediaWiki disallowed title chars: # < > [ ] | { }
+ *   Observed canonical encodes: " % & ' + = \ ^ ` ?
+ *
+ * Additional notes:-------------------
+ * Canonical form for percent-encoding hex digits is uppercase.
+ * "Observed canonical encodes" - tested live WP titles containing these
+ *   characters, observed MW rel=canonical uses the percent-encoded form.
+ * Won't ever actually get encoded:
+ *   space ( ) - Can't be transmitted in HTTP request anyways
+ *   question (?) - Starts query part, used to delimit path below
+ * Literal Percent (%) - Obviously, only encode if not followed by hex digits
+ *
+ * Restbase:-------------------
+ * Believed to use MW encoding rules above, but has a special exception for
+ *   forward-slash: We can neither encode nor decode either form of the
+ *   forward-slash for RB; it must be preserved.  This is because RB needs
+ *   forward-slashes from MediaWiki titles to be in %2F form, but still needs
+ *   its own functional path-delimiting slashes unencoded.
+ *
+ * upload.wikimedia.org:---------
+ *   MW seems to use PHP's urlencode() directly when translating from MediaWiki
+ *   File titles to storage URLs.  Note the filenames never contain spaces
+ *   (would be underscores), and cannot have forward slashes (which will still
+ *   exist in the rest of the path un-encoded).  The rules we should obey to
+ *   be consistent with it would be:
+ *
+ * Always-Decode: 0-9 A-Z a-z - . _ /
+ * Always-Encode: Everything Else
+ 
******************************************************************************/
+
+#include <inttypes.h>
+#include <string.h>
+
+<%- if @cluster == "cache_upload" -%>
+
+static const uintptr_t decoder_ring[256] = {
+  // 0x00-0x1F (all unprintable)
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  //  ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+    0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
+  //@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
+    0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,
+  //` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ <DEL>
+    0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,
+  // 0x80-0xFF (all unprintable)
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+<%- else -%>
+
+static const uintptr_t decoder_ring[256] = {
+  // 0x00-0x1F (all unprintable)
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  //  ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ?
+    0,1,0,0,1,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,
+  //@ A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,
+  //` a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ <DEL>
+    0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,1,0,
+  // 0x80-0xFF (all unprintable)
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+<%- end -%>
+
+static const uintptr_t hex_finder[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  //  ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ...
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,
+  //@ A B C D E F G ...
+    0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+  //` a b c d e f g ...
+    0,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+static const uintptr_t hex_dec[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,2,3,4,5,6,7,8,9,0,0,0,0,0,0,
+    0,10,11,12,13,14,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,10,11,12,13,14,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+};
+
+static const char hex_enc[16] = {
+    '0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'
+};
+
+#define DO_DECODE(_c) (decoder_ring[(uint8_t)(_c)])
+#define IS_HEX(_c) (hex_finder[(uint8_t)(_c)])
+#define HEX_DECODE(_c1,_c2) \
+    (hex_dec[(uint8_t)(_c1)] << 4 | hex_dec[(uint8_t)(_c2)])
+#define HEX_ENC_TOP(_c) (hex_enc[((uint8_t)(_c)) >> 4])
+#define HEX_ENC_BOTTOM(_c) (hex_enc[((uint8_t)(_c)) & 0xF])
+#define HEX_IS_LOWER(_c) \
+    (uint8_t)((((uint8_t)(_c)) & 0x40) >> 6 & (((uint8_t)(_c)) & 0x20) >> 5)
+
+void normalize_path_encoding(const struct vrt_ctx *ctx, const int doslash) {
+    const char* vrt_url = VRT_r_req_url(ctx);
+    const size_t in_url_len = strlen(vrt_url);
+
+    // Copy input URL to new array with double-NUL termination (this allows
+    // us to not worry about running off the end when checking for %XX hex
+    // encodings):
+    char in_url[in_url_len + 2];
+    memcpy(in_url, vrt_url, in_url_len + 1);
+    in_url[in_url_len + 1] = '\0';
+
+    // worst-case is every input character requires encoding:
+    char out_url[(in_url_len * 3) + 1];
+
+    char* in_p = in_url;
+    char* out_p = out_url;
+    unsigned dirty = 0;
+
+    while(1) {
+        const char c = *in_p++;
+        if (c == '\0') {
+            *out_p = '\0';
+            break;
+        }
+        else if (c == '?') {
+            if (dirty) {
+                *out_p++ = '?';
+                strcpy(out_p, in_p);
+            }
+            break;
+        }
+        else if (c == '%' && IS_HEX(*in_p) && IS_HEX(*(in_p + 1))) {
+            const char x1 = *in_p++;
+            const char x2 = *in_p++;
+            const char x = HEX_DECODE(x1, x2);
+            if (DO_DECODE(x) && (doslash || x != '/')) {
+                dirty = 1;
+                *out_p++ = x;
+            }
+            else {
+                // mark dirty if input hex had lowercase letters
+                dirty |= HEX_IS_LOWER(x1) | HEX_IS_LOWER(x2);
+                *out_p++ = '%';
+                *out_p++ = HEX_ENC_TOP(x);
+                *out_p++ = HEX_ENC_BOTTOM(x);
+            }
+        }
+        else if (DO_DECODE(c)) {
+            *out_p++ = c;
+        }
+        else {
+            dirty = 1;
+            *out_p++ = '%';
+            *out_p++ = HEX_ENC_TOP(c);
+            *out_p++ = HEX_ENC_BOTTOM(c);
+        }
+    }
+
+    /* Set req.url. This will copy our stack buffer into the workspace.
+     * VRT_l_req_url() is varadic, and concatenates its arguments. The
+     * vrt_magic_string_end marks the end of the list.
+     */
+    if (dirty)
+        VRT_l_req_url(ctx, out_url, vrt_magic_string_end);
+}
+
+#undef DO_DECODE
+#undef IS_HEX
+#undef HEX_DECODE
+#undef HEX_ENC_TOP
+#undef HEX_ENC_BOTTOM
+#undef HEX_IS_LOWER
+
+}C
+
+/************ Previous implementation, incomplete text normalization *********/
+
+C{
 #include <string.h>
 
 /* DIY hexadecimal conversion, since it is simple enough for a fixed
@@ -83,10 +283,22 @@
 
 }C
 
+<%- if @cluster == "cache_upload" -%>
+// Upload uses the new full-normalization code
+
+sub normalize_upload_path {
+    C{ normalize_path_encoding(ctx, 1); }C
+}
+
+<%- else -%>
+// Text still uses the previous implementation
+
 sub normalize_mediawiki_path {
-       C{ raw_normalize_path(ctx, 1); }C
+    C{ raw_normalize_path(ctx, 1); }C
 }
 
 sub normalize_rest_path {
-       C{ raw_normalize_path(ctx, 0); }C
+    C{ raw_normalize_path(ctx, 0); }C
 }
+
+<%- end -%>
diff --git a/modules/varnish/templates/upload-backend.inc.vcl.erb 
b/modules/varnish/templates/upload-backend.inc.vcl.erb
index 04932ff..02b835f 100644
--- a/modules/varnish/templates/upload-backend.inc.vcl.erb
+++ b/modules/varnish/templates/upload-backend.inc.vcl.erb
@@ -27,7 +27,11 @@
        }
 }
 
-sub cluster_be_recv_pre_purge { }
+sub cluster_be_recv_pre_purge {
+       if (req.method == "PURGE") {
+               call upload_normalize_path;
+       }
+}
 
 sub cluster_be_recv {
        call upload_common_recv;
diff --git a/modules/varnish/templates/upload-common.inc.vcl.erb 
b/modules/varnish/templates/upload-common.inc.vcl.erb
index 1dea5f9..fa87167 100644
--- a/modules/varnish/templates/upload-common.inc.vcl.erb
+++ b/modules/varnish/templates/upload-common.inc.vcl.erb
@@ -1,5 +1,13 @@
 // common to upload front+backend VCL
 
+include "normalize_path.inc.vcl";
+
+sub upload_normalize_path {
+       if (req.http.host ~ "^upload\.wikimedia\.org$") {
+               call normalize_upload_path;
+       }
+}
+
 sub upload_common_recv {
        unset req.http.X-Range;
 
diff --git a/modules/varnish/templates/upload-frontend.inc.vcl.erb 
b/modules/varnish/templates/upload-frontend.inc.vcl.erb
index a00a667..fe8e1b6 100644
--- a/modules/varnish/templates/upload-frontend.inc.vcl.erb
+++ b/modules/varnish/templates/upload-frontend.inc.vcl.erb
@@ -1,7 +1,10 @@
 // Varnish VCL include file for upload frontends
 include "upload-common.inc.vcl";
 
-sub cluster_fe_recv_pre_purge { }
+sub cluster_fe_recv_pre_purge {
+       // Normalize paths before purging
+       call upload_normalize_path;
+}
 
 sub cluster_fe_recv {
        // The upload cluster does not serve page views or authenticated

-- 
To view, visit https://gerrit.wikimedia.org/r/391216
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I5887c54a9295e6a344911621b2963c9df9ad4e24
Gerrit-PatchSet: 10
Gerrit-Project: operations/puppet
Gerrit-Branch: production
Gerrit-Owner: BBlack <[email protected]>
Gerrit-Reviewer: BBlack <[email protected]>
Gerrit-Reviewer: Ema <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to