This is an automated email from the ASF dual-hosted git repository.
eze pushed a commit to branch 8.1.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git
The following commit(s) were added to refs/heads/8.1.x by this push:
new 3640b5f8da Remove duplicate slashes at the beginning of the incoming
URL (#10133)
3640b5f8da is described below
commit 3640b5f8daefe7f9a8d7c060f3f2a0cb04eb7532
Author: Bryan Call <[email protected]>
AuthorDate: Tue Aug 1 14:52:34 2023 -0700
Remove duplicate slashes at the beginning of the incoming URL (#10133)
---
proxy/hdrs/URL.cc | 8 +
proxy/hdrs/unit_tests/test_URL.cc | 389 ++++++++++++++++++++++++++++++++++++++
2 files changed, 397 insertions(+)
diff --git a/proxy/hdrs/URL.cc b/proxy/hdrs/URL.cc
index e38df3379b..9977ad6a5b 100644
--- a/proxy/hdrs/URL.cc
+++ b/proxy/hdrs/URL.cc
@@ -1473,6 +1473,10 @@ done:
if (!path_end) {
path_end = cur;
}
+ // Remove all preceding slashes
+ while (path_start < path_end && *path_start == '/') {
+ ++path_start;
+ }
url_path_set(heap, url, path_start, path_end - path_start, copy_strings);
}
if (params_start) {
@@ -1553,6 +1557,10 @@ url_parse_http_no_path_component_breakdown(HdrHeap
*heap, URLImpl *url, const ch
// path is anything that's left.
if (cur < end) {
+ // Remove all preceding slashes
+ while (cur < end && *cur == '/') {
+ cur++;
+ }
url_path_set(heap, url, cur, end - cur, copy_strings);
cur = end;
}
diff --git a/proxy/hdrs/unit_tests/test_URL.cc
b/proxy/hdrs/unit_tests/test_URL.cc
index 44f621c76f..d5fc7f471c 100644
--- a/proxy/hdrs/unit_tests/test_URL.cc
+++ b/proxy/hdrs/unit_tests/test_URL.cc
@@ -145,3 +145,392 @@ TEST_CASE("UrlHashGet", "[url][hash_get]")
}
}
}
+
+struct url_parse_test_case {
+ const std::string input_uri;
+ const std::string expected_printed_url;
+ const bool verify_host_characters;
+ const std::string expected_printed_url_regex;
+ const bool is_valid;
+ const bool is_valid_regex;
+};
+
+constexpr bool IS_VALID = true;
+constexpr bool VERIFY_HOST_CHARACTERS = true;
+
+// clang-format off
+std::vector<url_parse_test_case> url_parse_test_cases = {
+ {
+ "///dir////index.html",
+ "/dir////index.html",
+ VERIFY_HOST_CHARACTERS,
+ "/dir////index.html",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "/index.html",
+ "/index.html",
+ VERIFY_HOST_CHARACTERS,
+ "/index.html",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "//index.html",
+ "/index.html",
+ VERIFY_HOST_CHARACTERS,
+ "/index.html",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ // The following scheme-only URI is technically valid per the spec, but we
+ // have historically returned this as invalid and I'm not comfortable
+ // changing it in case something depends upon this behavior. Besides, a
+ // scheme-only URI is probably not helpful to us nor something likely
+ // Traffic Server will see.
+ "http://",
+ "",
+ VERIFY_HOST_CHARACTERS,
+ "",
+ !IS_VALID,
+ !IS_VALID
+ },
+ {
+ "https:///",
+ "https:///",
+ VERIFY_HOST_CHARACTERS,
+ "https:///",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ // RFC 3986 section-3: When authority is not present, the path cannot begin
+ // with two slash characters ("//"). We have historically allowed this,
+ // however, and will continue to do so.
+ "https:////",
+ "https:///",
+ VERIFY_HOST_CHARACTERS,
+ "https:///",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ // By convention, our url_print() function adds a path of '/' at the end of
+ // URLs that have no path, query, or fragment after the authority.
+ "mailto:[email protected]",
+ "mailto:[email protected]/",
+ VERIFY_HOST_CHARACTERS,
+ "mailto:[email protected]/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "mailto:[email protected]:25",
+ "mailto:[email protected]:25/",
+ VERIFY_HOST_CHARACTERS,
+ "mailto:[email protected]:25/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com",
+ "https://www.example.com/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/",
+ "https://www.example.com/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com//",
+ "https://www.example.com/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://127.0.0.1",
+ "https://127.0.0.1/",
+ VERIFY_HOST_CHARACTERS,
+ "https://127.0.0.1/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://[::1]",
+ "https://[::1]/",
+ VERIFY_HOST_CHARACTERS,
+ "https://[::1]/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://127.0.0.1/",
+ "https://127.0.0.1/",
+ VERIFY_HOST_CHARACTERS,
+ "https://127.0.0.1/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com:8888",
+ "https://www.example.com:8888/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com:8888/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com:8888/",
+ "https://www.example.com:8888/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com:8888/",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path",
+ "https://www.example.com/a/path",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com//a/path",
+ "https://www.example.com/a/path",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path",
+ IS_VALID,
+ IS_VALID
+ },
+
+ // Technically a trailing '?' with an empty query string is valid, but we
+ // drop the '?'. The parse_regex, however, makes no distinction between
+ // query, fragment, and path components so it does not cut it out.
+ {
+ "https://www.example.com/a/path?",
+ "https://www.example.com/a/path",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?",
+ IS_VALID,
+ IS_VALID},
+ {
+ "https://www.example.com/a/path?name=value",
+ "https://www.example.com/a/path?name=value",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?name=value",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path?name=/a/path/value",
+ "https://www.example.com/a/path?name=/a/path/value",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?name=/a/path/value",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value",
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value/",
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value/",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path?name=/a/path/value;some=other_value/",
+ IS_VALID,
+ IS_VALID
+ },
+
+ // XXX - Tests didn't pass before this change, so commenting out for now.
+ // Again, URL::parse drops a final '?'.
+ // {
+ // "https://www.example.com?",
+ // "https://www.example.com",
+ // VERIFY_HOST_CHARACTERS,
+ // "https://www.example.com?/",
+ // IS_VALID,
+ // IS_VALID
+ // },
+ // {
+ // "https://www.example.com?name=value",
+ // "https://www.example.com?name=value",
+ // VERIFY_HOST_CHARACTERS,
+ // "https://www.example.com?name=value/",
+ // IS_VALID,
+ // IS_VALID
+ // },
+ // {
+ // "https://www.example.com?name=value/",
+ // "https://www.example.com?name=value/",
+ // VERIFY_HOST_CHARACTERS,
+ // "https://www.example.com?name=value/",
+ // IS_VALID,
+ // IS_VALID
+ // },
+
+ // URL::parse also drops the final '#'.
+ // {
+ // "https://www.example.com#",
+ // "https://www.example.com",
+ // VERIFY_HOST_CHARACTERS,
+ // "https://www.example.com#/",
+ // IS_VALID,
+ // IS_VALID
+ // },
+ // {
+ // "https://www.example.com#some=value",
+ // "https://www.example.com#some=value",
+ // VERIFY_HOST_CHARACTERS,
+ // "https://www.example.com#some=value/",
+ // IS_VALID,
+ // IS_VALID
+ // },
+ {
+ "https://www.example.com/a/path#",
+ "https://www.example.com/a/path",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path#",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path#some=value",
+ "https://www.example.com/a/path#some=value",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path#some=value",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ // Note that this final '?' is not for a query parameter but is a part of
+ // the fragment.
+ "https://www.example.com/a/path#some=value?",
+ "https://www.example.com/a/path#some=value?",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path#some=value?",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+ "https://www.example.com/a/path#some=value?with_question",
+ "https://www.example.com/a/path#some=value?with_question",
+ VERIFY_HOST_CHARACTERS,
+ "https://www.example.com/a/path#some=value?with_question",
+ IS_VALID,
+ IS_VALID
+ },
+ {
+
"https://www.example.com/a/path?name=value?_with_question#some=value?with_question/",
+
"https://www.example.com/a/path?name=value?_with_question#some=value?with_question/",
+ VERIFY_HOST_CHARACTERS,
+
"https://www.example.com/a/path?name=value?_with_question#some=value?with_question/",
+ IS_VALID,
+ IS_VALID
+ },
+
+ // The following are some examples of strings we expect from regex_map in
+ // remap.config. The "From" portion, which are regular expressions, are
+ // often not parsible by URL::parse but are by URL::parse_regex, which is the
+ // purpose of its existence.
+ {
+ R"(http://(.*)?reactivate\.mail\.yahoo\.com/)",
+ "",
+ VERIFY_HOST_CHARACTERS,
+ R"(http://(.*)?reactivate\.mail\.yahoo\.com/)",
+ !IS_VALID,
+ IS_VALID
+ },
+ {
+ // The following is an example of a "To" URL in a regex_map line. We'll
+ // first verify that the '$' is flagged as invalid for a host in this case.
+ "http://$1reactivate.real.mail.yahoo.com/",
+ "http://$1reactivate.real.mail.yahoo.com/",
+ VERIFY_HOST_CHARACTERS,
+ "http://$1reactivate.real.mail.yahoo.com/",
+ !IS_VALID,
+ IS_VALID
+ },
+ {
+ // Same as above, but this time we pass in !VERIFY_HOST_CHARACTERS. This is
+ // how RemapConfig will call this parse() function.
+ "http://$1reactivate.real.mail.yahoo.com/",
+ "http://$1reactivate.real.mail.yahoo.com/",
+ !VERIFY_HOST_CHARACTERS,
+ "http://$1reactivate.real.mail.yahoo.com/",
+ IS_VALID,
+ IS_VALID
+ }
+};
+// clang-format on
+
+constexpr bool URL_PARSE = true;
+constexpr bool URL_PARSE_REGEX = false;
+
+/** Test the specified url.parse function.
+ *
+ * URL::parse and URL::parse_regex should behave the same. This function
+ * performs the same behavior for each.
+ *
+ * @param[in] test_case The test case specification to run.
+ *
+ * @param[in] parse_function Whether to run parse() or
+ * parse_regex().
+ */
+void
+test_parse(url_parse_test_case const &test_case, bool parse_function)
+{
+ URL url;
+ HdrHeap *heap = new_HdrHeap();
+ url.create(heap);
+ ParseResult result = PARSE_RESULT_OK;
+ if (test_case.verify_host_characters) {
+ result = url.parse(test_case.input_uri.c_str(),
test_case.input_uri.size());
+ } else {
+ heap->destroy();
+ return;
+ //result = url.parse_no_host_check(test_case.input_uri.c_str(),
test_case.input_uri.size());
+ }
+ bool expected_is_valid = test_case.is_valid;
+
+ if (expected_is_valid && result != PARSE_RESULT_DONE) {
+ std::printf("Parse URI: \"%s\", expected it to be valid but it was parsed
invalid (%d)\n", test_case.input_uri.c_str(), result);
+ CHECK(false);
+ } else if (!expected_is_valid && result != PARSE_RESULT_ERROR) {
+ std::printf("Parse URI: \"%s\", expected it to be invalid but it was
parsed valid (%d)\n", test_case.input_uri.c_str(), result);
+ CHECK(false);
+ }
+ if (result == PARSE_RESULT_DONE) {
+ char buf[1024];
+ int index = 0;
+ int offset = 0;
+ url.print(buf, sizeof(buf), &index, &offset);
+ std::string printed_url{buf, static_cast<size_t>(index)};
+ CHECK(test_case.expected_printed_url == printed_url);
+ CHECK(test_case.expected_printed_url.size() == printed_url.size());
+ }
+ heap->destroy();
+}
+
+TEST_CASE("UrlParse", "[proxy][parseurl]")
+{
+ for (auto const &test_case : url_parse_test_cases) {
+ test_parse(test_case, URL_PARSE);
+ }
+}