This normalizer decodes percent encoded characters within the RFC 3986
unreserved set.

See GitHub Issue #714.
---
 doc/configuration.txt                  | 44 +++++++++++-
 include/haproxy/action-t.h             |  2 +
 include/haproxy/uri_normalizer.h       |  1 +
 reg-tests/http-rules/normalize_uri.vtc | 75 +++++++++++++++++++-
 src/http_act.c                         | 33 +++++++++
 src/uri_normalizer.c                   | 95 ++++++++++++++++++++++++++
 6 files changed, 247 insertions(+), 3 deletions(-)

diff --git a/doc/configuration.txt b/doc/configuration.txt
index 322932b5e..df01dd848 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -6015,6 +6015,7 @@ http-request normalize-uri <normalizer> [ { if | unless } 
<condition> ]
 http-request normalize-uri path-merge-slashes [ { if | unless } <condition> ]
 http-request normalize-uri path-strip-dot [ { if | unless } <condition> ]
 http-request normalize-uri path-strip-dotdot [ full ] [ { if | unless } 
<condition> ]
+http-request normalize-uri percent-decode-unreserved [ strict ] [ { if | 
unless } <condition> ]
 http-request normalize-uri percent-to-uppercase [ strict ] [ { if | unless } 
<condition> ]
 http-request normalize-uri query-sort-by-name [ { if | unless } <condition> ]
 
@@ -6034,11 +6035,25 @@ http-request normalize-uri query-sort-by-name [ { if | 
unless } <condition> ]
   filesystem. However it might break routing of an API that expects a specific
   number of segments in the path.
 
+  It is important to note that some normalizers might result in unsafe
+  transformations for broken URIs. It might also be possible that a combination
+  of normalizers that are safe by themselves results in unsafe transformations
+  when improperly combined.
+
+  As an example the "percent-decode-unreserved" normalizer might result in
+  unexpected results when a broken URI includes bare percent characters. One
+  such a broken URI is "/%%36%36" which would be decoded to "/%66" which in
+  turn is equivalent to "/f". By specifying the "strict" option requests to
+  such a broken URI would safely be rejected.
+
   The following normalizers are available:
 
   - path-strip-dot: Removes "/./" segments within the "path" component
       (RFC 3986#6.2.2.3).
 
+      Segments including percent encoded dots ("%2E") will not be detected. Use
+      the "percent-decode-unreserved" normalizer first if this is undesired.
+
       Example:
       - /.            -> /
       - /./bar/       -> /bar/
@@ -6049,8 +6064,13 @@ http-request normalize-uri query-sort-by-name [ { if | 
unless } <condition> ]
       (RFC 3986#6.2.2.3).
 
       This merges segments that attempt to access the parent directory with
-      their preceding segment. Empty segments do not receive special treatment.
-      Use the "path-merge-slashes" normalizer first if this is undesired.
+      their preceding segment.
+
+      Empty segments do not receive special treatment. Use the "merge-slashes"
+      normalizer first if this is undesired.
+
+      Segments including percent encoded dots ("%2E") will not be detected. Use
+      the "percent-decode-unreserved" normalizer first if this is undesired.
 
       Example:
       - /foo/../     -> /
@@ -6059,6 +6079,7 @@ http-request normalize-uri query-sort-by-name [ { if | 
unless } <condition> ]
       - /../bar/     -> /../bar/
       - /bar/../../  -> /../
       - /foo//../    -> /foo/
+      - /foo/%2E%2E/ -> /foo/%2E%2E/
 
       If the "full" option is specified then "../" at the beginning will be
       removed as well:
@@ -6074,6 +6095,25 @@ http-request normalize-uri query-sort-by-name [ { if | 
unless } <condition> ]
       - //        -> /
       - /foo//bar -> /foo/bar
 
+  - percent-decode-unreserved: Decodes unreserved percent encoded characters to
+      their representation as a regular character (RFC 3986#6.2.2.2).
+
+      The set of unreserved characters includes all letters, all digits, "-",
+      ".", "_", and "~".
+
+      Example:
+      - /%61dmin       -> /admin
+      - /foo%3Fbar=baz -> /foo%3Fbar=baz (no change)
+      - /%%36%36       -> /%66           (unsafe)
+      - /%ZZ           -> /%ZZ
+
+      If the "strict" option is specified then invalid sequences will result
+      in a HTTP 400 Bad Request being returned.
+
+      Example:
+      - /%%36%36 -> HTTP 400
+      - /%ZZ     -> HTTP 400
+
   - percent-to-uppercase: Uppercases letters within percent-encoded sequences
       (RFC 3986#6.2.2.1).
 
diff --git a/include/haproxy/action-t.h b/include/haproxy/action-t.h
index 2e3edeaf3..1d0b0d229 100644
--- a/include/haproxy/action-t.h
+++ b/include/haproxy/action-t.h
@@ -109,6 +109,8 @@ enum act_normalize_uri {
        ACT_NORMALIZE_URI_QUERY_SORT_BY_NAME,
        ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE,
        ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE_STRICT,
+       ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED,
+       ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT,
 };
 
 /* NOTE: if <.action_ptr> is defined, the referenced function will always be
diff --git a/include/haproxy/uri_normalizer.h b/include/haproxy/uri_normalizer.h
index 81c7e00ce..06f237e45 100644
--- a/include/haproxy/uri_normalizer.h
+++ b/include/haproxy/uri_normalizer.h
@@ -18,6 +18,7 @@
 
 #include <haproxy/uri_normalizer-t.h>
 
+enum uri_normalizer_err uri_normalizer_percent_decode_unreserved(const struct 
ist input, int strict, struct ist *dst);
 enum uri_normalizer_err uri_normalizer_percent_upper(const struct ist input, 
int strict, struct ist *dst);
 enum uri_normalizer_err uri_normalizer_path_dot(const struct ist path, struct 
ist *dst);
 enum uri_normalizer_err uri_normalizer_path_dotdot(const struct ist path, int 
full, struct ist *dst);
diff --git a/reg-tests/http-rules/normalize_uri.vtc 
b/reg-tests/http-rules/normalize_uri.vtc
index 9884b6c43..cc8806066 100644
--- a/reg-tests/http-rules/normalize_uri.vtc
+++ b/reg-tests/http-rules/normalize_uri.vtc
@@ -8,7 +8,7 @@ feature ignore_unknown_macro
 server s1 {
     rxreq
     txresp
-} -repeat 54 -start
+} -repeat 63 -start
 
 haproxy h1 -conf {
     defaults
@@ -94,6 +94,30 @@ haproxy h1 -conf {
 
         default_backend be
 
+    frontend fe_percent_decode_unreserved
+        bind "fd@${fe_percent_decode_unreserved}"
+
+        http-request set-var(txn.before) url
+        http-request normalize-uri percent-decode-unreserved
+        http-request set-var(txn.after) url
+
+        http-response add-header before  %[var(txn.before)]
+        http-response add-header after  %[var(txn.after)]
+
+        default_backend be
+
+    frontend fe_percent_decode_unreserved_strict
+        bind "fd@${fe_percent_decode_unreserved_strict}"
+
+        http-request set-var(txn.before) url
+        http-request normalize-uri percent-decode-unreserved strict
+        http-request set-var(txn.after) url
+
+        http-response add-header before  %[var(txn.before)]
+        http-response add-header after  %[var(txn.after)]
+
+        default_backend be
+
     backend be
         server s1 ${s1_addr}:${s1_port}
 
@@ -391,3 +415,52 @@ client c6 -connect ${h1_fe_dot_sock} {
     expect resp.http.before == "/?a=/./"
     expect resp.http.after == "/?a=/./"
 } -run
+
+client c7 -connect ${h1_fe_percent_decode_unreserved_sock} {
+    txreq -url "/a?a=a"
+    rxresp
+    expect resp.http.before == "/a?a=a"
+    expect resp.http.after == "/a?a=a"
+
+    txreq -url "/%61?%61=%61"
+    rxresp
+    expect resp.http.before == "/%61?%61=%61"
+    expect resp.http.after == "/a?a=a"
+
+    txreq -url "/%3F?foo=bar"
+    rxresp
+    expect resp.http.before == "/%3F?foo=bar"
+    expect resp.http.after == "/%3F?foo=bar"
+
+    txreq -url "/%%36%36"
+    rxresp
+    expect resp.status == 200
+    expect resp.http.before == "/%%36%36"
+    expect resp.http.after == "/%66"
+
+    txreq -req OPTIONS -url "*"
+    rxresp
+    expect resp.http.before == "*"
+    expect resp.http.after == "*"
+} -run
+
+client c8 -connect ${h1_fe_percent_decode_unreserved_strict_sock} {
+    txreq -url "/a?a=a"
+    rxresp
+    expect resp.http.before == "/a?a=a"
+    expect resp.http.after == "/a?a=a"
+
+    txreq -url "/%61?%61=%61"
+    rxresp
+    expect resp.http.before == "/%61?%61=%61"
+    expect resp.http.after == "/a?a=a"
+
+    txreq -url "/%3F?foo=bar"
+    rxresp
+    expect resp.http.before == "/%3F?foo=bar"
+    expect resp.http.after == "/%3F?foo=bar"
+
+    txreq -url "/%%36%36"
+    rxresp
+    expect resp.status == 400
+} -run
diff --git a/src/http_act.c b/src/http_act.c
index df2bbe41b..58e1bb857 100644
--- a/src/http_act.c
+++ b/src/http_act.c
@@ -294,6 +294,24 @@ static enum act_return http_action_normalize_uri(struct 
act_rule *rule, struct p
 
                        err = uri_normalizer_percent_upper(path, rule->action 
== ACT_NORMALIZE_URI_PERCENT_TO_UPPERCASE_STRICT, &newpath);
 
+                       if (err != URI_NORMALIZER_ERR_NONE)
+                               break;
+
+                       if (!http_replace_req_path(htx, newpath, 1))
+                               goto fail_rewrite;
+
+                       break;
+               }
+               case ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED:
+               case ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT: {
+                       const struct ist path = http_get_path(uri);
+                       struct ist newpath = ist2(replace->area, replace->size);
+
+                       if (!isttest(path))
+                               goto leave;
+
+                       err = uri_normalizer_percent_decode_unreserved(path, 
rule->action == ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT, &newpath);
+
                        if (err != URI_NORMALIZER_ERR_NONE)
                                break;
 
@@ -407,6 +425,21 @@ static enum act_parse_ret parse_http_normalize_uri(const 
char **args, int *orig_
                        return ACT_RET_PRS_ERR;
                }
        }
+       else if (strcmp(args[cur_arg], "percent-decode-unreserved") == 0) {
+               cur_arg++;
+
+               if (strcmp(args[cur_arg], "strict") == 0) {
+                       cur_arg++;
+                       rule->action = 
ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED_STRICT;
+               }
+               else if (!*args[cur_arg]) {
+                       rule->action = 
ACT_NORMALIZE_URI_PERCENT_DECODE_UNRESERVED;
+               }
+               else if (strcmp(args[cur_arg], "if") != 0 && 
strcmp(args[cur_arg], "unless") != 0) {
+                       memprintf(err, "unknown argument '%s' for 
'percent-decode-unreserved' normalizer", args[cur_arg]);
+                       return ACT_RET_PRS_ERR;
+               }
+       }
        else {
                memprintf(err, "unknown normalizer '%s'", args[cur_arg]);
                return ACT_RET_PRS_ERR;
diff --git a/src/uri_normalizer.c b/src/uri_normalizer.c
index 8d959365e..4fd783d4a 100644
--- a/src/uri_normalizer.c
+++ b/src/uri_normalizer.c
@@ -18,6 +18,101 @@
 #include <haproxy/tools.h>
 #include <haproxy/uri_normalizer.h>
 
+/* Returns 1 if the given character is part of the 'unreserved' set in the
+ * RFC 3986 ABNF.
+ * Returns 0 if not.
+ */
+static int is_unreserved_character(unsigned char c)
+{
+       switch (c) {
+       case 'A'...'Z': /* ALPHA */
+       case 'a'...'z': /* ALPHA */
+       case '0'...'9': /* DIGIT */
+       case '-':
+       case '.':
+       case '_':
+       case '~':
+               return 1;
+       default:
+               return 0;
+       }
+}
+
+/* Decodes percent encoded characters that are part of the 'unreserved' set.
+ *
+ * RFC 3986, section 2.3:
+ * >  URIs that differ in the replacement of an unreserved character with
+ * >  its corresponding percent-encoded US-ASCII octet are equivalent [...]
+ * >  when found in a URI, should be decoded to their corresponding unreserved
+ * >  characters by URI normalizers.
+ *
+ * If `strict` is set to 0 then percent characters that are not followed by a
+ * hexadecimal digit are returned as-is without performing any decoding.
+ * If `strict` is set to 1 then `URI_NORMALIZER_ERR_INVALID_INPUT` is returned
+ * for invalid sequences.
+ */
+enum uri_normalizer_err uri_normalizer_percent_decode_unreserved(const struct 
ist input, int strict, struct ist *dst)
+{
+       enum uri_normalizer_err err;
+
+       const size_t size = istclear(dst);
+       struct ist output = *dst;
+
+       struct ist scanner = input;
+
+       /* The output will either be shortened or have the same length. */
+       if (size < istlen(input)) {
+               err = URI_NORMALIZER_ERR_ALLOC;
+               goto fail;
+       }
+
+       while (istlen(scanner)) {
+               const char current = istshift(&scanner);
+
+               if (current == '%') {
+                       if (istlen(scanner) >= 2) {
+                               if (ishex(istptr(scanner)[0]) && 
ishex(istptr(scanner)[1])) {
+                                       char hex1, hex2, c;
+
+                                       hex1 = istshift(&scanner);
+                                       hex2 = istshift(&scanner);
+                                       c = (hex2i(hex1) << 4) + hex2i(hex2);
+
+                                       if (is_unreserved_character(c)) {
+                                               output = __istappend(output, c);
+                                       }
+                                       else {
+                                               output = __istappend(output, 
current);
+                                               output = __istappend(output, 
hex1);
+                                               output = __istappend(output, 
hex2);
+                                       }
+
+                                       continue;
+                               }
+                       }
+
+                       if (strict) {
+                               err = URI_NORMALIZER_ERR_INVALID_INPUT;
+                               goto fail;
+                       }
+                       else {
+                               output = __istappend(output, current);
+                       }
+               }
+               else {
+                       output = __istappend(output, current);
+               }
+       }
+
+       *dst = output;
+
+       return URI_NORMALIZER_ERR_NONE;
+
+  fail:
+
+       return err;
+}
+
 /* Uppercases letters used in percent encoding.
  *
  * If `strict` is set to 0 then percent characters that are not followed by a
-- 
2.31.1


Reply via email to