This is an automated email from the ASF dual-hosted git repository.

bneradt pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficserver.git


The following commit(s) were added to refs/heads/master by this push:
     new 232f546ca6 Correctly handle encoding for cache hash generation (#10126)
232f546ca6 is described below

commit 232f546ca6ccbda9bb3a5b04411c50d179f92cf0
Author: Brian Neradt <[email protected]>
AuthorDate: Wed Aug 16 13:57:38 2023 -0500

    Correctly handle encoding for cache hash generation (#10126)
    
    Since origins may treat URL encoded or unencoded paths, query
    parameters, or fragments differently, we should cache them separately.
    This updates our URL cache hashing logic to not unencode these
    components of a URI.
---
 proxy/hdrs/URL.cc                    |  10 +++
 proxy/hdrs/unit_tests/test_URL.cc    | 127 +++++++++++++++++++++++++++++++++++
 tests/gold_tests/url/uri.replay.yaml | 111 ++++++++++++++++++++++++++++--
 tests/gold_tests/url/uri.test.py     |   8 ++-
 4 files changed, 248 insertions(+), 8 deletions(-)

diff --git a/proxy/hdrs/URL.cc b/proxy/hdrs/URL.cc
index e7536ba547..3b053699e2 100644
--- a/proxy/hdrs/URL.cc
+++ b/proxy/hdrs/URL.cc
@@ -1867,6 +1867,16 @@ url_CryptoHash_get_general(const URLImpl *url, 
CryptoContext &ctx, CryptoHash &h
       while (t < ends[i]) {
         if ((i == 0) || (i == 6)) { // scheme and host
           unescape_str_tolower(p, e, t, ends[i], s);
+        } else if (i == 8 || i == 10 || i == 12) { // path, params, query
+          // Don't unescape the parts of the URI that are processed by the
+          // origin since it may behave differently based upon whether these 
are
+          // escaped or not. Therefore differently encoded strings should be
+          // cached separately via differentiated hashes.
+          int path_len = ends[i] - t;
+          int min_len  = std::min(path_len, static_cast<int>(e - p));
+          memcpy(p, t, min_len);
+          p += min_len;
+          t += min_len;
         } else {
           unescape_str(p, e, t, ends[i], s);
         }
diff --git a/proxy/hdrs/unit_tests/test_URL.cc 
b/proxy/hdrs/unit_tests/test_URL.cc
index 3f11b98c6d..75a5abc601 100644
--- a/proxy/hdrs/unit_tests/test_URL.cc
+++ b/proxy/hdrs/unit_tests/test_URL.cc
@@ -23,6 +23,7 @@
 #include "catch.hpp"
 
 #include "URL.h"
+#include "tscore/CryptoHash.h"
 
 TEST_CASE("ValidateURL", "[proxy][validurl]")
 {
@@ -570,3 +571,129 @@ TEST_CASE("UrlParse", "[proxy][parseurl]")
     test_parse(test_case, URL_PARSE_REGEX);
   }
 }
+
+struct get_hash_test_case {
+  const std::string description;
+  const std::string uri_1;
+  const std::string uri_2;
+  const bool ignore_query;
+  const bool has_equal_hash;
+};
+
+constexpr bool HAS_EQUAL_HASH = true;
+constexpr bool IGNORE_QUERY   = true;
+
+// clang-format off
+std::vector<get_hash_test_case> get_hash_test_cases = {
+  {
+    "No encoding: equal hashes",
+    
"http://one.example.com/a/path?name=value#some=value?with_question#fragment";,
+    
"http://one.example.com/a/path?name=value#some=value?with_question#fragment";,
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Scheme encoded: equal hashes",
+    
"http%3C://one.example.com/a/path?name=value#some=value?with_question#fragment",
+    
"http<://one.example.com/a/path?name=value#some=value?with_question#fragment",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Host encoded: equal hashes",
+    
"http://one%2Eexample.com/a/path?name=value#some=value?with_question#fragment";,
+    
"http://one.example.com/a/path?name=value#some=value?with_question#fragment";,
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Path encoded: differing hashes",
+    
"http://one.example.com/a%2Fpath?name=value#some=value?with_question#fragment";,
+    
"http://one.example.com/a/path?name=value#some=value?with_question#fragment";,
+    !IGNORE_QUERY,
+    !HAS_EQUAL_HASH,
+  },
+  {
+    "Query = encoded: differing hashes",
+    
"http://one.example.com/a/path?name%3Dvalue#some=value?with_question#fragment";,
+    
"http://one.example.com/a/path?name=value#some=value?with_question#fragment";,
+    !IGNORE_QUERY,
+    !HAS_EQUAL_HASH,
+  },
+  {
+    "Query = encoded but ignore_query: equal hashes",
+    
"http://one.example.com/a/path?name%3Dvalue#some=value?with_question#fragment";,
+    
"http://one.example.com/a/path?name=value#some=value?with_question#fragment";,
+    IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Query internal encoded: differing hashes",
+    
"http://one.example.com/a/path?name=valu%5D#some=value?with_question#fragment";,
+    
"http://one.example.com/a/path?name=valu]#some=value?with_question#fragment";,
+    !IGNORE_QUERY,
+    !HAS_EQUAL_HASH,
+  },
+  {
+    "Query internal encoded but ignore_query: equal hashes",
+    
"http://one.example.com/a/path?name=valu%5D#some=value?with_question#fragment";,
+    
"http://one.example.com/a/path?name=valu]#some=value?with_question#fragment";,
+    IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Fragment encoded: fragment is not part of the hash",
+    
"http://one.example.com/a/path?name=value#some=value?with_question#frag%7Dent";,
+    
"http://one.example.com/a/path?name=value#some=value?with_question/frag}ent";,
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Username encoded: equal hashes",
+    "mysql://my%7Eser:mypassword@localhost/mydatabase",
+    "mysql://my~ser:mypassword@localhost/mydatabase",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+  {
+    "Password encoded: equal hashes",
+    "mysql://myuser:mypa%24sword@localhost/mydatabase",
+    "mysql://myuser:mypa$sword@localhost/mydatabase",
+    !IGNORE_QUERY,
+    HAS_EQUAL_HASH,
+  },
+};
+
+/** Return the hash related to a URI.
+  *
+  * @param[in] uri The URI to hash.
+  * @return The hash of the URI.
+ */
+CryptoHash
+get_hash(const std::string &uri, bool ignore_query)
+{
+  URL url;
+  HdrHeap *heap = new_HdrHeap();
+  url.create(heap);
+  url.parse(uri);
+  CryptoHash hash;
+  url.hash_get(&hash, ignore_query);
+  heap->destroy();
+  return hash;
+}
+
+TEST_CASE("UrlHashGet", "[url][hash_get]")
+{
+  for (auto const &test_case : get_hash_test_cases) {
+    std::string description = test_case.description + ": " + test_case.uri_1 + 
" vs " + test_case.uri_2;
+    SECTION(description) {
+      CryptoHash hash1 = get_hash(test_case.uri_1, test_case.ignore_query);
+      CryptoHash hash2 = get_hash(test_case.uri_2, test_case.ignore_query);
+      if (test_case.has_equal_hash) {
+        CHECK(hash1 == hash2);
+      } else {
+        CHECK(hash1 != hash2);
+      }
+    }
+  }
+}
diff --git a/tests/gold_tests/url/uri.replay.yaml 
b/tests/gold_tests/url/uri.replay.yaml
index 4935ff7951..d6689db11f 100644
--- a/tests/gold_tests/url/uri.replay.yaml
+++ b/tests/gold_tests/url/uri.replay.yaml
@@ -33,12 +33,20 @@ meta:
           - [ Content-Length, 16 ]
           - [ Cache-Control, max-age=300 ]
 
+  - 404_ok_response: &404_ok_response
+      server-response:
+        status: 404
+        reason: "Not Found"
+        headers:
+          fields:
+          - [ Content-Length, 16 ]
+          - [ Cache-Control, max-age=300 ]
+
 sessions:
 - transactions:
 
   # A simply path
-  - all: { headers: { fields: [[ uuid, 1 ]]}}
-    client-request:
+  - client-request:
       method: "GET"
       version: "1.1"
       scheme: "http"
@@ -46,6 +54,7 @@ sessions:
       headers:
         fields:
         - [ Host, example.com ]
+        - [ uuid, 1 ]
 
     <<: *200_ok_response
 
@@ -54,8 +63,7 @@ sessions:
 
   # An initial // without an authority is not valid per RFC 3986 section-3.3,
   # but we have historically accepted it and will continue to do so.
-  - all: { headers: { fields: [[ uuid, 2 ]]}}
-    client-request:
+  - client-request:
       method: "GET"
       version: "1.1"
       scheme: "http"
@@ -63,6 +71,7 @@ sessions:
       headers:
         fields:
         - [ Host, example.com ]
+        - [ uuid, 2 ]
 
     <<: *200_ok_response
 
@@ -70,8 +79,7 @@ sessions:
       status: 200
 
   # A '^' is an invalid path.
-  - all: { headers: { fields: [[ uuid, 3 ]]}}
-    client-request:
+  - client-request:
       method: "GET"
       version: "1.1"
       scheme: "http"
@@ -79,8 +87,99 @@ sessions:
       headers:
         fields:
         - [ Host, example.com ]
+        - [ uuid, 3 ]
 
     <<: *200_ok_response
 
     proxy-response:
       status: 400
+
+- transactions:
+  #
+  # Make sure that we consistently treat encoding of paths.
+  #
+  - client-request:
+      method: "GET"
+      version: "1.1"
+      url: /path%2ffoo
+      headers:
+        fields:
+        - [ Host, test.com ]
+        - [ Connection, keep-alive ]
+        - [ uuid, encoded_path ]
+
+    proxy-request:
+      # Verify that the origin gets the encoded path.
+      url: /path%2ffoo
+
+    <<: *404_ok_response
+
+    proxy-response:
+      status: 404
+
+  # Request with an unencoded path: go to origin.
+  - client-request:
+      # Give plenty of time to cache the response.
+      delay: 200ms
+
+      method: "GET"
+      version: "1.1"
+      # Note that the path is not encoded.
+      url: /path/foo
+      headers:
+        fields:
+        - [ Host, test.com ]
+        - [ Connection, keep-alive ]
+        - [ uuid, unencoded_path ]
+
+    proxy-request:
+      url: /path/foo
+
+    <<: *200_ok_response
+
+    proxy-response:
+      # ATS should treat the unencoded path as different and go to the origin,
+      # not serve the encoded path out of cache.
+      status: 200
+
+  # Request again with an encoded path, should be served from cache.
+  - client-request:
+      method: "GET"
+      version: "1.1"
+      url: /path%2ffoo
+      headers:
+        fields:
+        - [ Host, test.com ]
+        - [ Connection, keep-alive ]
+        - [ uuid, encoded_path_again ]
+
+    # The origin should not receive this request, but if it does, reply with a
+    # non-404 response so we can detect it.
+    <<: *200_ok_response
+
+    proxy-response:
+      # ATS should serve the encoded path response from cache.
+      status: 404
+
+  # Request again with an encoded path, should be served from cache.
+  - client-request:
+      # Give plenty of time to cache the unencoded response.
+      delay: 200ms
+
+      method: "GET"
+      version: "1.1"
+      # Note that the path is not encoded.
+      url: /path/foo
+      headers:
+        fields:
+        - [ Host, test.com ]
+        - [ Connection, keep-alive ]
+        - [ uuid, unencoded_path_again ]
+
+    # The origin should not receive this request, but if it does, reply with a
+    # non-200 response so we can detect it.
+    <<: *404_ok_response
+
+    proxy-response:
+      # Verify that ATS returns out of cache for the uneencoded path object.
+      status: 200
diff --git a/tests/gold_tests/url/uri.test.py b/tests/gold_tests/url/uri.test.py
index 8589d07f0f..f1ccd77e00 100644
--- a/tests/gold_tests/url/uri.test.py
+++ b/tests/gold_tests/url/uri.test.py
@@ -26,7 +26,7 @@ replay_file = "uri.replay.yaml"
 server = Test.MakeVerifierServerProcess("server", replay_file)
 ts.Disk.records_config.update({
     'proxy.config.diags.debug.enabled': 1,
-    'proxy.config.diags.debug.tags': 'http',
+    'proxy.config.diags.debug.tags': 'http|cache|url',
 })
 ts.Disk.remap_config.AddLine(
     'map / http://127.0.0.1:{0}'.format(server.Variables.http_port)
@@ -34,4 +34,8 @@ ts.Disk.remap_config.AddLine(
 tr = Test.AddTestRun("Verify correct URI parsing behavior.")
 tr.Processes.Default.StartBefore(server)
 tr.Processes.Default.StartBefore(ts)
-tr.AddVerifierClientProcess("client", replay_file, 
http_ports=[ts.Variables.port])
+tr.AddVerifierClientProcess(
+    "client",
+    replay_file,
+    http_ports=[ts.Variables.port],
+    other_args='--thread-limit 1')

Reply via email to