Updated Branches: refs/heads/master 4dd567102 -> 6e93cff1b
TS-1315 Fix URL parsing to handle non-HTTP schemes correctly. Project: http://git-wip-us.apache.org/repos/asf/trafficserver/repo Commit: http://git-wip-us.apache.org/repos/asf/trafficserver/commit/6e93cff1 Tree: http://git-wip-us.apache.org/repos/asf/trafficserver/tree/6e93cff1 Diff: http://git-wip-us.apache.org/repos/asf/trafficserver/diff/6e93cff1 Branch: refs/heads/master Commit: 6e93cff1bd8fc1b7cba4c92b8e31ca2d9c5734d2 Parents: 4dd5671 Author: Alan M. Carroll <[email protected]> Authored: Wed Jul 18 13:54:54 2012 -0500 Committer: Alan M. Carroll <[email protected]> Committed: Mon Jul 23 08:01:29 2012 -0500 ---------------------------------------------------------------------- CHANGES | 2 + proxy/InkAPITest.cc | 83 ++++++++++++++------- proxy/hdrs/URL.cc | 180 +++++++++++----------------------------------- 3 files changed, 101 insertions(+), 164 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/trafficserver/blob/6e93cff1/CHANGES ---------------------------------------------------------------------- diff --git a/CHANGES b/CHANGES index 9c8129d..3eeedb9 100644 --- a/CHANGES +++ b/CHANGES @@ -1,5 +1,7 @@ -*- coding: utf-8 -*- Changes with Apache Traffic Server 3.3.0 + *) [TS-1315] Fix URL parsing to handle non-HTTP schemes correctly. + *) [TS-1322] CONNECT to parent proxy has URL with a trailing slash Author: Yakov Kopel http://git-wip-us.apache.org/repos/asf/trafficserver/blob/6e93cff1/proxy/InkAPITest.cc ---------------------------------------------------------------------- diff --git a/proxy/InkAPITest.cc b/proxy/InkAPITest.cc index e518f24..e072edb 100644 --- a/proxy/InkAPITest.cc +++ b/proxy/InkAPITest.cc @@ -5119,7 +5119,26 @@ REGRESSION_TEST(SDK_API_TSMimeHdrParse) (RegressionTest * test, int atype, int * REGRESSION_TEST(SDK_API_TSUrlParse) (RegressionTest * test, int atype, int *pstatus) { NOWARN_UNUSED(atype); - const char *url = "http://abc:[email protected]:3426/homepage.cgi;ab?abc=def#abc"; + static char const * const urls[] = { + "file:///test.dat;ab?abc=def#abc", + "http://www.example.com", + "http://abc:[email protected]", + "http://www.example.com:3426", + "http://abc:[email protected]:3426", + "http://www.example.com/homepage.cgi", + "http://www.example.com/homepage.cgi;ab?abc=def#abc", + "http://abc:[email protected]:3426/homepage.cgi;ab?abc=def#abc", + "https://abc:[email protected]:3426/homepage.cgi;ab?abc=def#abc", + "ftp://abc:[email protected]:3426/homepage.cgi;ab?abc=def#abc", + "file:///c:/test.dat;ab?abc=def#abc", // Note: file://c: is malformed URL because no host is present. + "file:///test.dat;ab?abc=def#abc", + "foo://bar.com/baz/" + }; + + static int const num_urls = sizeof(urls) / sizeof(urls[0]); + bool test_passed[num_urls] = {false}; + + const char *start; const char *end; char *temp; @@ -5128,47 +5147,57 @@ REGRESSION_TEST(SDK_API_TSUrlParse) (RegressionTest * test, int atype, int *psta TSMBuffer bufp; TSMLoc url_loc = (TSMLoc)NULL; - bool test_passed_parse_url = false; int length; *pstatus = REGRESSION_TEST_INPROGRESS; - bufp = TSMBufferCreate(); - if (TSUrlCreate(bufp, &url_loc) != TS_SUCCESS) { - SDK_RPRINT(test, "TSUrlParse", "TestCase1", TC_FAIL, "Cannot create Url for parsing the url"); - if (TSMBufferDestroy(bufp) == TS_ERROR) { - SDK_RPRINT(test, "TSUrlParse", "TestCase1", TC_FAIL, "Error in Destroying MBuffer"); - } - } else { - start = url; - end = url + strlen(url) + 1; - if ((retval = TSUrlParse(bufp, url_loc, &start, end)) == TS_PARSE_ERROR) { - SDK_RPRINT(test, "TSUrlParse", "TestCase1", TC_FAIL, "TSUrlParse returns TS_PARSE_ERROR"); + int idx; + for (idx = 0; idx < num_urls; idx++) { + char const *url = urls[idx]; + + bufp = TSMBufferCreate(); + if (TSUrlCreate(bufp, &url_loc) != TS_SUCCESS) { + SDK_RPRINT(test, "TSUrlParse", url, TC_FAIL, "Cannot create Url for parsing the url"); + if (TSMBufferDestroy(bufp) == TS_ERROR) { + SDK_RPRINT(test, "TSUrlParse", url, TC_FAIL, "Error in Destroying MBuffer"); + } } else { - if (retval == TS_PARSE_DONE) { - temp = TSUrlStringGet(bufp, url_loc, &length); - if (strncmp(url, temp, length) == 0) { - SDK_RPRINT(test, "TSUrlParse", "TestCase1", TC_PASS, "ok"); - test_passed_parse_url = true; + start = url; + end = url + strlen(url) + 1; + if ((retval = TSUrlParse(bufp, url_loc, &start, end)) == TS_PARSE_ERROR) { + SDK_RPRINT(test, "TSUrlParse", url, TC_FAIL, "TSUrlParse returns TS_PARSE_ERROR"); + } else { + if (retval == TS_PARSE_DONE) { + temp = TSUrlStringGet(bufp, url_loc, &length); + if (strncmp(url, temp, length) == 0) { + SDK_RPRINT(test, "TSUrlParse", url, TC_PASS, "ok"); + test_passed[idx] = true; + } else { + SDK_RPRINT(test, "TSUrlParse", url, TC_FAIL, "Value's Mismatch"); + } + TSfree(temp); } else { - SDK_RPRINT(test, "TSUrlParse", "TestCase1", TC_FAIL, "Value's Mismatch"); + SDK_RPRINT(test, "TSUrlParse", url, TC_FAIL, "Parsing Error"); } - TSfree(temp); - } else { - SDK_RPRINT(test, "TSHttpHdrParseReq", "TestCase1", TC_FAIL, "Parsing Error"); } } + + TSHandleMLocRelease(bufp, TS_NULL_MLOC, url_loc); + TSMBufferDestroy(bufp); } - if (test_passed_parse_url != true) { - *pstatus = REGRESSION_TEST_FAILED; - } else { + for (idx = 0; idx < num_urls; idx++) { + if (test_passed[idx] != true) { + *pstatus = REGRESSION_TEST_FAILED; + break; + } + } + + if (idx >= num_urls) { *pstatus = REGRESSION_TEST_PASSED; } - TSHandleMLocRelease(bufp, TS_NULL_MLOC, url_loc); - TSMBufferDestroy(bufp); return; } http://git-wip-us.apache.org/repos/asf/trafficserver/blob/6e93cff1/proxy/hdrs/URL.cc ---------------------------------------------------------------------- diff --git a/proxy/hdrs/URL.cc b/proxy/hdrs/URL.cc index 33416b5..9a9c754 100644 --- a/proxy/hdrs/URL.cc +++ b/proxy/hdrs/URL.cc @@ -1050,152 +1050,57 @@ url_unescapify(Arena * arena, const char *str, int length) } MIMEParseResult -url_parse(HdrHeap * heap, URLImpl * url, const char **start, const char *end, bool copy_strings) +url_parse_scheme(HdrHeap * heap, URLImpl * url, const char **start, const char *end, bool copy_strings_p) { - const char *cur; + const char *cur = *start; const char *scheme_wks; const char *scheme_start = NULL; const char *scheme_end = NULL; int scheme_wks_idx; - cur = *start; - -skip_ws: - if (ParseRules::is_ws(*cur)) { - GETNEXT(eof); - goto skip_ws; - } - ///////////////////////////////// - // fast case starts with http: // - ///////////////////////////////// - - if ((end - cur >= 5) && (((cur[0] ^ 'h') | (cur[1] ^ 't') | (cur[2] ^ 't') | (cur[3] ^ 'p') | (cur[4] ^ ':')) == 0)) { - scheme_start = cur; - cur += 4; // point to colon - scheme_end = cur; - scheme_wks_idx = URL_WKSIDX_HTTP; - scheme_wks = URL_SCHEME_HTTP; - } else { - scheme_start = cur; - - parse_scheme2: - if (*cur != ':') { - GETNEXT(done); - goto parse_scheme2; + while(' ' == *cur && ++cur < end) + ; + if (cur < end) { + scheme_start = scheme_end = cur; + // special case 'http:' for performance + if ((end - cur >= 5) && (((cur[0] ^ 'h') | (cur[1] ^ 't') | (cur[2] ^ 't') | (cur[3] ^ 'p') | (cur[4] ^ ':')) == 0)) { + scheme_end = cur + 4; // point to colon + url_scheme_set(heap, url, scheme_start, URL_WKSIDX_HTTP, 4, copy_strings_p); + } else { // some other scheme, try to parse it. + while (':' != *cur && ++cur < end) + ; + if (cur < end) { // found a colon + scheme_wks_idx = hdrtoken_tokenize(scheme_start, cur - scheme_start, &scheme_wks); + + /* Distinguish between a scheme only and a username by looking past the colon. If it is missing + or it's a slash, presume scheme. Otherwise it's a username with a password. + */ + if ((scheme_wks_idx > 0 && hdrtoken_wks_to_token_type(scheme_wks) == HDRTOKEN_TYPE_SCHEME) || // known scheme + (cur >= end-1 || cur[1] == '/')) // no more data or slash past colon + { + scheme_end = cur; + url_scheme_set(heap, url, scheme_start, scheme_wks_idx, scheme_end - scheme_start, copy_strings_p); + } + } } - scheme_end = cur; - scheme_wks_idx = hdrtoken_tokenize(scheme_start, scheme_end - scheme_start, &scheme_wks); - - // FIX: need to improve parser to support unknown schemes --- this code is - // a hack --- if no well-known scheme is found, the parser assumes - // the scheme has been omitted, and the colon is really the colon - // from "username:password" or "host:port", so it calls the scheme - // "http" and continues parsing there. - // - // If URLs can really arrive without a scheme:// prefix, then it's - // difficult to disambiguate non-well-known schemes from missing - // schemes. I'm not sure how such malformed URLs could arrive at - // the proxy --- perhaps (a) some web browsers request files with - // no leading slash, or (b) some web browsers with explicit proxy - // set may omit schemes if the user types in "www.apache.org", etc? - - // if we parsed a scheme which is well known as a scheme, use it as the - // scheme, otherwise we'll assume the scheme is missing and treat as HTTP. - - if ((scheme_wks_idx < 0) || (hdrtoken_wks_to_token_type(scheme_wks) != HDRTOKEN_TYPE_SCHEME)) - goto done; + *start = scheme_end; + return PARSE_CONT; } - - /////////////////////////////////////////// - // we get here if we have a legal scheme // - /////////////////////////////////////////// - - url_scheme_set(heap, url, scheme_start, scheme_wks_idx, scheme_end - scheme_start, copy_strings); - - *start = scheme_end; - return url_parse_http(heap, url, start, end, copy_strings); - -done: - *start = scheme_start; - return url_parse_http(heap, url, start, end, copy_strings); - -eof: - return PARSE_ERROR; + return PARSE_ERROR; // no non-whitespace found } MIMEParseResult -url_parse_no_path_component_breakdown(HdrHeap * heap, - URLImpl * url, const char **start, const char *end, bool copy_strings) +url_parse(HdrHeap * heap, URLImpl * url, const char **start, const char *end, bool copy_strings_p) { - const char *cur; - const char *scheme_wks; - const char *scheme_start = NULL; - const char *scheme_end = NULL; - int scheme_wks_idx; - - cur = *start; - -skip_ws: - if (ParseRules::is_ws(*cur)) { - GETNEXT(eof); - goto skip_ws; - } - ///////////////////////////////// - // fast case starts with http: // - ///////////////////////////////// - - if ((end - cur >= 5) && (((cur[0] ^ 'h') | (cur[1] ^ 't') | (cur[2] ^ 't') | (cur[3] ^ 'p') | (cur[4] ^ ':')) == 0)) { - scheme_start = cur; - cur += 4; // point to colon - scheme_end = cur; - scheme_wks_idx = URL_WKSIDX_HTTP; - scheme_wks = URL_SCHEME_HTTP; - } else { - scheme_start = cur; - - parse_scheme2: - if (*cur != ':') { - GETNEXT(done); - goto parse_scheme2; - } - scheme_end = cur; - scheme_wks_idx = hdrtoken_tokenize(scheme_start, scheme_end - scheme_start, &scheme_wks); - - // FIX: need to improve parser to support unknown schemes --- this code is - // a hack --- if no well-known scheme is found, the parser assumes - // the scheme has been omitted, and the colon is really the colon - // from "username:password" or "host:port", so it calls the scheme - // "http" and continues parsing there. - // - // If URLs can really arrive without a scheme:// prefix, then it's - // difficult to disambiguate non-well-known schemes from missing - // schemes. I'm not sure how such malformed URLs could arrive at - // the proxy --- perhaps (a) some web browsers request files with - // no leading slash, or (b) some web browsers with explicit proxy - // set may omit schemes if the user types in "www.apache.org", etc? - - // if we parsed a scheme which is well known as a scheme, use it as the - // scheme, otherwise we'll assume the scheme is missing and treat as HTTP. - - if ((scheme_wks_idx < 0) || (hdrtoken_wks_to_token_type(scheme_wks) != HDRTOKEN_TYPE_SCHEME)) - goto done; - } - - /////////////////////////////////////////// - // we get here if we have a legal scheme // - /////////////////////////////////////////// - - url_scheme_set(heap, url, scheme_start, scheme_wks_idx, scheme_end - scheme_start, copy_strings); - - *start = scheme_end; - return url_parse_http_no_path_component_breakdown(heap, url, start, end, copy_strings); - -done: - *start = scheme_start; - return url_parse_http_no_path_component_breakdown(heap, url, start, end, copy_strings); + MIMEParseResult zret = url_parse_scheme(heap, url, start, end, copy_strings_p); + return PARSE_CONT == zret ? url_parse_http(heap, url, start, end, copy_strings_p) : zret; +} -eof: - return PARSE_ERROR; +MIMEParseResult +url_parse_no_path_component_breakdown(HdrHeap * heap, URLImpl * url, const char **start, const char *end, bool copy_strings_p) +{ + MIMEParseResult zret = url_parse_scheme(heap, url, start, end, copy_strings_p); + return PARSE_CONT == zret ? url_parse_http_no_path_component_breakdown(heap, url, start, end, copy_strings_p) : zret; } /** @@ -1516,11 +1421,12 @@ url_print(URLImpl * url, char *buf_start, int buf_length, int *buf_index_inout, if (url->m_ptr_scheme) { TRY(mime_mem_print(url->m_ptr_scheme, url->m_len_scheme, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout)); - if ((url->m_scheme_wks_idx >= 0) && (hdrtoken_index_to_wks(url->m_scheme_wks_idx) == URL_SCHEME_FILE)) { - TRY(mime_mem_print(":", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout)); - } else { + // [amc] Why is "file:" special cased to be wrong? +// if ((url->m_scheme_wks_idx >= 0) && (hdrtoken_index_to_wks(url->m_scheme_wks_idx) == URL_SCHEME_FILE)) { +// TRY(mime_mem_print(":", 1, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout)); +// } else { TRY(mime_mem_print("://", 3, buf_start, buf_length, buf_index_inout, buf_chars_to_skip_inout)); - } +// } } if (url->m_ptr_user) {
