[trafficserver] branch 8.1.x updated: Remove duplicate slashes at the beginning of the incoming URL (#10133)

eze Tue, 01 Aug 2023 14:52:44 -0700

This is an automated email from the ASF dual-hosted git repository.

eze pushed a commit to branch 8.1.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git



The following commit(s) were added to refs/heads/8.1.x by this push:
     new 3640b5f8da Remove duplicate slashes at the beginning of the incoming 
URL (#10133)
3640b5f8da is described below

commit 3640b5f8daefe7f9a8d7c060f3f2a0cb04eb7532
Author: Bryan Call <[email protected]>
AuthorDate: Tue Aug 1 14:52:34 2023 -0700

    Remove duplicate slashes at the beginning of the incoming URL (#10133)
---
 proxy/hdrs/URL.cc                 |   8 +
 proxy/hdrs/unit_tests/test_URL.cc | 389 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 397 insertions(+)

diff --git a/proxy/hdrs/URL.cc b/proxy/hdrs/URL.cc
index e38df3379b..9977ad6a5b 100644
--- a/proxy/hdrs/URL.cc
+++ b/proxy/hdrs/URL.cc
@@ -1473,6 +1473,10 @@ done:
     if (!path_end) {
       path_end = cur;
     }
+    // Remove all preceding slashes
+    while (path_start < path_end && *path_start == '/') {
+      ++path_start;
+    }
     url_path_set(heap, url, path_start, path_end - path_start, copy_strings);
   }
   if (params_start) {
@@ -1553,6 +1557,10 @@ url_parse_http_no_path_component_breakdown(HdrHeap 
*heap, URLImpl *url, const ch
 
   // path is anything that's left.
   if (cur < end) {
+    // Remove all preceding slashes
+    while (cur < end && *cur == '/') {
+      cur++;
+    }
     url_path_set(heap, url, cur, end - cur, copy_strings);
     cur = end;
   }
diff --git a/proxy/hdrs/unit_tests/test_URL.cc 
b/proxy/hdrs/unit_tests/test_URL.cc
index 44f621c76f..d5fc7f471c 100644
--- a/proxy/hdrs/unit_tests/test_URL.cc
+++ b/proxy/hdrs/unit_tests/test_URL.cc
@@ -145,3 +145,392 @@ TEST_CASE("UrlHashGet", "[url][hash_get]")
     }
   }
 }
+
+struct url_parse_test_case {
+  const std::string input_uri;
+  const std::string expected_printed_url;
+  const bool verify_host_characters;
+  const std::string expected_printed_url_regex;
+  const bool is_valid;
+  const bool is_valid_regex;
+};
+
+constexpr bool IS_VALID               = true;
+constexpr bool VERIFY_HOST_CHARACTERS = true;
+
+// clang-format off
+std::vector<url_parse_test_case> url_parse_test_cases = {
+  {
+    "///dir////index.html",
+    "/dir////index.html",
+    VERIFY_HOST_CHARACTERS,
+    "/dir////index.html",
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "/index.html",
+    "/index.html",
+    VERIFY_HOST_CHARACTERS,
+    "/index.html",
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "//index.html",
+    "/index.html",
+    VERIFY_HOST_CHARACTERS,
+    "/index.html",
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    // The following scheme-only URI is technically valid per the spec, but we
+    // have historically returned this as invalid and I'm not comfortable
+    // changing it in case something depends upon this behavior. Besides, a
+    // scheme-only URI is probably not helpful to us nor something likely
+    // Traffic Server will see.
+    "http://";,
+    "",
+    VERIFY_HOST_CHARACTERS,
+    "",
+    !IS_VALID,
+    !IS_VALID
+  },
+  {
+    "https:///";,
+    "https:///";,
+    VERIFY_HOST_CHARACTERS,
+    "https:///";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    // RFC 3986 section-3: When authority is not present, the path cannot begin
+    // with two slash characters ("//"). We have historically allowed this,
+    // however, and will continue to do so.
+    "https:////";,
+    "https:///";,
+    VERIFY_HOST_CHARACTERS,
+    "https:///";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    // By convention, our url_print() function adds a path of '/' at the end of
+    // URLs that have no path, query, or fragment after the authority.
+    "mailto:[email protected]";,
+    "mailto:[email protected]/";,
+    VERIFY_HOST_CHARACTERS,
+    "mailto:[email protected]/";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "mailto:[email protected]:25";,
+    "mailto:[email protected]:25/";,
+    VERIFY_HOST_CHARACTERS,
+    "mailto:[email protected]:25/";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com";,
+    "https://www.example.com/";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com/";,
+    "https://www.example.com/";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com//";,
+    "https://www.example.com/";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://127.0.0.1";,
+    "https://127.0.0.1/";,
+    VERIFY_HOST_CHARACTERS,
+    "https://127.0.0.1/";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://[::1]";,
+    "https://[::1]/";,
+    VERIFY_HOST_CHARACTERS,
+    "https://[::1]/";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://127.0.0.1/";,
+    "https://127.0.0.1/";,
+    VERIFY_HOST_CHARACTERS,
+    "https://127.0.0.1/";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com:8888";,
+    "https://www.example.com:8888/";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com:8888/";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com:8888/";,
+    "https://www.example.com:8888/";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com:8888/";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com/a/path";,
+    "https://www.example.com/a/path";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com//a/path";,
+    "https://www.example.com/a/path";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path";,
+    IS_VALID,
+    IS_VALID
+  },
+
+  // Technically a trailing '?' with an empty query string is valid, but we
+  // drop the '?'. The parse_regex, however, makes no distinction between
+  // query, fragment, and path components so it does not cut it out.
+  {
+    "https://www.example.com/a/path?";,
+    "https://www.example.com/a/path";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path?";,
+    IS_VALID,
+    IS_VALID},
+  {
+    "https://www.example.com/a/path?name=value";,
+    "https://www.example.com/a/path?name=value";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path?name=value";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com/a/path?name=/a/path/value";,
+    "https://www.example.com/a/path?name=/a/path/value";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path?name=/a/path/value";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com/a/path?name=/a/path/value;some=other_value";,
+    "https://www.example.com/a/path?name=/a/path/value;some=other_value";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path?name=/a/path/value;some=other_value";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com/a/path?name=/a/path/value;some=other_value/";,
+    "https://www.example.com/a/path?name=/a/path/value;some=other_value/";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path?name=/a/path/value;some=other_value/";,
+    IS_VALID,
+    IS_VALID
+  },
+
+  // XXX - Tests didn't pass before this change, so commenting out for now.
+  // Again, URL::parse drops a final '?'.
+  // {
+  //   "https://www.example.com?";,
+  //   "https://www.example.com";,
+  //   VERIFY_HOST_CHARACTERS,
+  //   "https://www.example.com?/";,
+  //   IS_VALID,
+  //   IS_VALID
+  // },
+  // {
+  //   "https://www.example.com?name=value";,
+  //   "https://www.example.com?name=value";,
+  //   VERIFY_HOST_CHARACTERS,
+  //   "https://www.example.com?name=value/";,
+  //   IS_VALID,
+  //   IS_VALID
+  // },
+  // {
+  //   "https://www.example.com?name=value/";,
+  //   "https://www.example.com?name=value/";,
+  //   VERIFY_HOST_CHARACTERS,
+  //   "https://www.example.com?name=value/";,
+  //   IS_VALID,
+  //   IS_VALID
+  // },
+
+  // URL::parse also drops the final '#'.
+  // {
+  //   "https://www.example.com#";,
+  //   "https://www.example.com";,
+  //   VERIFY_HOST_CHARACTERS,
+  //   "https://www.example.com#/";,
+  //   IS_VALID,
+  //   IS_VALID
+  // },
+  // {
+  //   "https://www.example.com#some=value";,
+  //   "https://www.example.com#some=value";,
+  //   VERIFY_HOST_CHARACTERS,
+  //   "https://www.example.com#some=value/";,
+  //   IS_VALID,
+  //   IS_VALID
+  // },
+  {
+    "https://www.example.com/a/path#";,
+    "https://www.example.com/a/path";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path#";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com/a/path#some=value";,
+    "https://www.example.com/a/path#some=value";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path#some=value";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    // Note that this final '?' is not for a query parameter but is a part of
+    // the fragment.
+    "https://www.example.com/a/path#some=value?";,
+    "https://www.example.com/a/path#some=value?";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path#some=value?";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    "https://www.example.com/a/path#some=value?with_question";,
+    "https://www.example.com/a/path#some=value?with_question";,
+    VERIFY_HOST_CHARACTERS,
+    "https://www.example.com/a/path#some=value?with_question";,
+    IS_VALID,
+    IS_VALID
+  },
+  {
+    
"https://www.example.com/a/path?name=value?_with_question#some=value?with_question/";,
+    
"https://www.example.com/a/path?name=value?_with_question#some=value?with_question/";,
+    VERIFY_HOST_CHARACTERS,
+    
"https://www.example.com/a/path?name=value?_with_question#some=value?with_question/";,
+    IS_VALID,
+    IS_VALID
+  },
+
+  // The following are some examples of strings we expect from regex_map in
+  // remap.config.  The "From" portion, which are regular expressions, are
+  // often not parsible by URL::parse but are by URL::parse_regex, which is the
+  // purpose of its existence.
+  {
+    R"(http://(.*)?reactivate\.mail\.yahoo\.com/)",
+    "",
+    VERIFY_HOST_CHARACTERS,
+    R"(http://(.*)?reactivate\.mail\.yahoo\.com/)",
+    !IS_VALID,
+    IS_VALID
+  },
+  {
+    // The following is an example of a "To" URL in a regex_map line. We'll
+    // first verify that the '$' is flagged as invalid for a host in this case.
+    "http://$1reactivate.real.mail.yahoo.com/";,
+    "http://$1reactivate.real.mail.yahoo.com/";,
+    VERIFY_HOST_CHARACTERS,
+    "http://$1reactivate.real.mail.yahoo.com/";,
+    !IS_VALID,
+    IS_VALID
+  },
+  {
+    // Same as above, but this time we pass in !VERIFY_HOST_CHARACTERS. This is
+    // how RemapConfig will call this parse() function.
+    "http://$1reactivate.real.mail.yahoo.com/";,
+    "http://$1reactivate.real.mail.yahoo.com/";,
+    !VERIFY_HOST_CHARACTERS,
+    "http://$1reactivate.real.mail.yahoo.com/";,
+    IS_VALID,
+    IS_VALID
+  }
+};
+// clang-format on
+
+constexpr bool URL_PARSE       = true;
+constexpr bool URL_PARSE_REGEX = false;
+
+/** Test the specified url.parse function.
+ *
+ * URL::parse and URL::parse_regex should behave the same. This function
+ * performs the same behavior for each.
+ *
+ * @param[in] test_case The test case specification to run.
+ *
+ * @param[in] parse_function Whether to run parse() or
+ * parse_regex().
+ */
+void
+test_parse(url_parse_test_case const &test_case, bool parse_function)
+{
+  URL url;
+  HdrHeap *heap = new_HdrHeap();
+  url.create(heap);
+  ParseResult result = PARSE_RESULT_OK;
+  if (test_case.verify_host_characters) {
+    result = url.parse(test_case.input_uri.c_str(), 
test_case.input_uri.size());
+  } else {
+    heap->destroy();
+    return;
+    //result = url.parse_no_host_check(test_case.input_uri.c_str(), 
test_case.input_uri.size());
+  }
+  bool expected_is_valid = test_case.is_valid;
+
+  if (expected_is_valid && result != PARSE_RESULT_DONE) {
+    std::printf("Parse URI: \"%s\", expected it to be valid but it was parsed 
invalid (%d)\n", test_case.input_uri.c_str(), result);
+    CHECK(false);
+  } else if (!expected_is_valid && result != PARSE_RESULT_ERROR) {
+    std::printf("Parse URI: \"%s\", expected it to be invalid but it was 
parsed valid (%d)\n", test_case.input_uri.c_str(), result);
+    CHECK(false);
+  }
+  if (result == PARSE_RESULT_DONE) {
+    char buf[1024];
+    int index  = 0;
+    int offset = 0;
+    url.print(buf, sizeof(buf), &index, &offset);
+    std::string printed_url{buf, static_cast<size_t>(index)};
+    CHECK(test_case.expected_printed_url == printed_url);
+    CHECK(test_case.expected_printed_url.size() == printed_url.size());
+  }
+  heap->destroy();
+}
+
+TEST_CASE("UrlParse", "[proxy][parseurl]")
+{
+  for (auto const &test_case : url_parse_test_cases) {
+    test_parse(test_case, URL_PARSE);
+  }
+}

[trafficserver] branch 8.1.x updated: Remove duplicate slashes at the beginning of the incoming URL (#10133)

Reply via email to