git commit: TS-1315 Fix URL parsing to handle non-HTTP schemes correctly.

amc Mon, 23 Jul 2012 06:02:13 -0700

Updated Branches:
  refs/heads/master 4dd567102 -> 6e93cff1b


TS-1315 Fix URL parsing to handle non-HTTP schemes correctly.


Project: http://git-wip-us.apache.org/repos/asf/trafficserver/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafficserver/commit/6e93cff1
Tree: http://git-wip-us.apache.org/repos/asf/trafficserver/tree/6e93cff1
Diff: http://git-wip-us.apache.org/repos/asf/trafficserver/diff/6e93cff1

Branch: refs/heads/master
Commit: 6e93cff1bd8fc1b7cba4c92b8e31ca2d9c5734d2
Parents: 4dd5671
Author: Alan M. Carroll <[email protected]>
Authored: Wed Jul 18 13:54:54 2012 -0500
Committer: Alan M. Carroll <[email protected]>
Committed: Mon Jul 23 08:01:29 2012 -0500

----------------------------------------------------------------------
 CHANGES             |    2 +
 proxy/InkAPITest.cc |   83 ++++++++++++++-------
 proxy/hdrs/URL.cc   |  180 +++++++++++-----------------------------------
 3 files changed, 101 insertions(+), 164 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/trafficserver/blob/6e93cff1/CHANGES
----------------------------------------------------------------------
diff --git a/CHANGES b/CHANGES
index 9c8129d..3eeedb9 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,5 +1,7 @@
                                                          -*- coding: utf-8 -*-
 Changes with Apache Traffic Server 3.3.0
+  *) [TS-1315] Fix URL parsing to handle non-HTTP schemes correctly.
+
   *) [TS-1322] CONNECT to parent proxy has URL with a trailing slash
     Author: Yakov Kopel
 

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/6e93cff1/proxy/InkAPITest.cc
----------------------------------------------------------------------
diff --git a/proxy/InkAPITest.cc b/proxy/InkAPITest.cc
index e518f24..e072edb 100644
--- a/proxy/InkAPITest.cc
+++ b/proxy/InkAPITest.cc
@@ -5119,7 +5119,26 @@ REGRESSION_TEST(SDK_API_TSMimeHdrParse) (RegressionTest 
* test, int atype, int *
 REGRESSION_TEST(SDK_API_TSUrlParse) (RegressionTest * test, int atype, int 
*pstatus)
 {
   NOWARN_UNUSED(atype);
-  const char *url = 
"http://abc:[email protected]:3426/homepage.cgi;ab?abc=def#abc";;
+  static char const * const urls[] = {
+      "file:///test.dat;ab?abc=def#abc",
+      "http://www.example.com";,
+      "http://abc:[email protected]";,
+      "http://www.example.com:3426";,
+      "http://abc:[email protected]:3426";,
+      "http://www.example.com/homepage.cgi";,
+      "http://www.example.com/homepage.cgi;ab?abc=def#abc";,
+      "http://abc:[email protected]:3426/homepage.cgi;ab?abc=def#abc";,
+      "https://abc:[email protected]:3426/homepage.cgi;ab?abc=def#abc";,
+      "ftp://abc:[email protected]:3426/homepage.cgi;ab?abc=def#abc";,
+      "file:///c:/test.dat;ab?abc=def#abc", // Note: file://c: is malformed 
URL because no host is present.
+      "file:///test.dat;ab?abc=def#abc",
+      "foo://bar.com/baz/"
+  };
+
+  static int const num_urls = sizeof(urls) / sizeof(urls[0]);
+  bool test_passed[num_urls] = {false};
+
+
   const char *start;
   const char *end;
   char *temp;
@@ -5128,47 +5147,57 @@ REGRESSION_TEST(SDK_API_TSUrlParse) (RegressionTest * 
test, int atype, int *psta
 
   TSMBuffer bufp;
   TSMLoc url_loc = (TSMLoc)NULL;
-  bool test_passed_parse_url = false;
   int length;
 
   *pstatus = REGRESSION_TEST_INPROGRESS;
 
 
-  bufp = TSMBufferCreate();
-  if (TSUrlCreate(bufp, &url_loc) != TS_SUCCESS) {
-    SDK_RPRINT(test, "TSUrlParse", "TestCase1", TC_FAIL, "Cannot create Url 
for parsing the url");
-    if (TSMBufferDestroy(bufp) == TS_ERROR) {
-      SDK_RPRINT(test, "TSUrlParse", "TestCase1", TC_FAIL, "Error in 
Destroying MBuffer");
-    }
-  } else {
-    start = url;
-    end = url + strlen(url) + 1;
-    if ((retval = TSUrlParse(bufp, url_loc, &start, end)) == TS_PARSE_ERROR) {
-      SDK_RPRINT(test, "TSUrlParse", "TestCase1", TC_FAIL, "TSUrlParse returns 
TS_PARSE_ERROR");
+  int idx;
+  for (idx = 0; idx < num_urls; idx++) {
+    char const *url = urls[idx];
+
+    bufp = TSMBufferCreate();
+    if (TSUrlCreate(bufp, &url_loc) != TS_SUCCESS) {
+      SDK_RPRINT(test, "TSUrlParse", url, TC_FAIL, "Cannot create Url for 
parsing the url");
+      if (TSMBufferDestroy(bufp) == TS_ERROR) {
+        SDK_RPRINT(test, "TSUrlParse", url, TC_FAIL, "Error in Destroying 
MBuffer");
+      }
     } else {
-      if (retval == TS_PARSE_DONE) {
-        temp = TSUrlStringGet(bufp, url_loc, &length);
-        if (strncmp(url, temp, length) == 0) {
-          SDK_RPRINT(test, "TSUrlParse", "TestCase1", TC_PASS, "ok");
-          test_passed_parse_url = true;
+      start = url;
+      end = url + strlen(url) + 1;
+      if ((retval = TSUrlParse(bufp, url_loc, &start, end)) == TS_PARSE_ERROR) 
{
+        SDK_RPRINT(test, "TSUrlParse", url, TC_FAIL, "TSUrlParse returns 
TS_PARSE_ERROR");
+      } else {
+        if (retval == TS_PARSE_DONE) {
+          temp = TSUrlStringGet(bufp, url_loc, &length);
+          if (strncmp(url, temp, length) == 0) {
+            SDK_RPRINT(test, "TSUrlParse", url, TC_PASS, "ok");
+            test_passed[idx] = true;
+          } else {
+            SDK_RPRINT(test, "TSUrlParse", url, TC_FAIL, "Value's Mismatch");
+          }
+          TSfree(temp);
         } else {
-          SDK_RPRINT(test, "TSUrlParse", "TestCase1", TC_FAIL, "Value's 
Mismatch");
+          SDK_RPRINT(test, "TSUrlParse", url, TC_FAIL, "Parsing Error");
         }
-        TSfree(temp);
-      } else {
-        SDK_RPRINT(test, "TSHttpHdrParseReq", "TestCase1", TC_FAIL, "Parsing 
Error");
       }
     }
+
+    TSHandleMLocRelease(bufp, TS_NULL_MLOC, url_loc);
+    TSMBufferDestroy(bufp);
   }
 
-  if (test_passed_parse_url != true) {
-    *pstatus = REGRESSION_TEST_FAILED;
-  } else {
+  for (idx = 0; idx < num_urls; idx++) {
+    if (test_passed[idx] != true) {
+      *pstatus = REGRESSION_TEST_FAILED;
+      break;
+    }
+  }
+
+  if (idx >= num_urls) {
     *pstatus = REGRESSION_TEST_PASSED;
   }
 
-  TSHandleMLocRelease(bufp, TS_NULL_MLOC, url_loc);
-  TSMBufferDestroy(bufp);
 
   return;
 }

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/6e93cff1/proxy/hdrs/URL.cc
----------------------------------------------------------------------
diff --git a/proxy/hdrs/URL.cc b/proxy/hdrs/URL.cc
index 33416b5..9a9c754 100644
--- a/proxy/hdrs/URL.cc
+++ b/proxy/hdrs/URL.cc
@@ -1050,152 +1050,57 @@ url_unescapify(Arena * arena, const char *str, int 
length)
 }
 
 MIMEParseResult
-url_parse(HdrHeap * heap, URLImpl * url, const char **start, const char *end, 
bool copy_strings)
+url_parse_scheme(HdrHeap * heap, URLImpl * url, const char **start, const char 
*end, bool copy_strings_p)
 {
-  const char *cur;
+  const char *cur = *start;
   const char *scheme_wks;
   const char *scheme_start = NULL;
   const char *scheme_end = NULL;
   int scheme_wks_idx;
 
-  cur = *start;
-
-skip_ws:
-  if (ParseRules::is_ws(*cur)) {
-    GETNEXT(eof);
-    goto skip_ws;
-  }
-  /////////////////////////////////
-  // fast case starts with http: //
-  /////////////////////////////////
-
-  if ((end - cur >= 5) && (((cur[0] ^ 'h') | (cur[1] ^ 't') | (cur[2] ^ 't') | 
(cur[3] ^ 'p') | (cur[4] ^ ':')) == 0)) {
-    scheme_start = cur;
-    cur += 4;                   // point to colon
-    scheme_end = cur;
-    scheme_wks_idx = URL_WKSIDX_HTTP;
-    scheme_wks = URL_SCHEME_HTTP;
-  } else {
-    scheme_start = cur;
-
-  parse_scheme2:
-    if (*cur != ':') {
-      GETNEXT(done);
-      goto parse_scheme2;
+  while(' ' == *cur && ++cur < end)
+    ;
+  if (cur < end) {
+    scheme_start = scheme_end = cur;
+    // special case 'http:' for performance
+    if ((end - cur >= 5) && (((cur[0] ^ 'h') | (cur[1] ^ 't') | (cur[2] ^ 't') 
| (cur[3] ^ 'p') | (cur[4] ^ ':')) == 0)) {
+      scheme_end = cur + 4;                   // point to colon
+      url_scheme_set(heap, url, scheme_start, URL_WKSIDX_HTTP, 4, 
copy_strings_p);
+    } else { // some other scheme, try to parse it.
+      while (':' != *cur && ++cur < end)
+        ;
+      if (cur < end) { // found a colon
+        scheme_wks_idx = hdrtoken_tokenize(scheme_start, cur - scheme_start, 
&scheme_wks);
+    
+        /*  Distinguish between a scheme only and a username by looking past 
the colon. If it is missing
+            or it's a slash, presume scheme. Otherwise it's a username with a 
password.
+        */
+        if ((scheme_wks_idx > 0 && hdrtoken_wks_to_token_type(scheme_wks) == 
HDRTOKEN_TYPE_SCHEME) || // known scheme
+           (cur >= end-1 || cur[1] == '/')) // no more data or slash past colon
+        {
+          scheme_end = cur;
+          url_scheme_set(heap, url, scheme_start, scheme_wks_idx, scheme_end - 
scheme_start, copy_strings_p);
+        }
+      }
     }
-    scheme_end = cur;
-    scheme_wks_idx = hdrtoken_tokenize(scheme_start, scheme_end - 
scheme_start, &scheme_wks);
-
-    // FIX: need to improve parser to support unknown schemes --- this code is
-    //      a hack --- if no well-known scheme is found, the parser assumes
-    //      the scheme has been omitted, and the colon is really the colon
-    //      from "username:password" or "host:port", so it calls the scheme
-    //      "http" and continues parsing there.
-    //
-    //      If URLs can really arrive without a scheme:// prefix, then it's
-    //      difficult to disambiguate non-well-known schemes from missing
-    //      schemes.  I'm not sure how such malformed URLs could arrive at
-    //      the proxy --- perhaps (a) some web browsers request files with
-    //      no leading slash, or (b) some web browsers with explicit proxy
-    //      set may omit schemes if the user types in "www.apache.org", etc?
-
-    // if we parsed a scheme which is well known as a scheme, use it as the
-    // scheme, otherwise we'll assume the scheme is missing and treat as HTTP.
-
-    if ((scheme_wks_idx < 0) || (hdrtoken_wks_to_token_type(scheme_wks) != 
HDRTOKEN_TYPE_SCHEME))
-      goto done;
+    *start = scheme_end;
+    return PARSE_CONT;
   }
-
-  ///////////////////////////////////////////
-  // we get here if we have a legal scheme //
-  ///////////////////////////////////////////
-
-  url_scheme_set(heap, url, scheme_start, scheme_wks_idx, scheme_end - 
scheme_start, copy_strings);
-
-  *start = scheme_end;
-  return url_parse_http(heap, url, start, end, copy_strings);
-
-done:
-  *start = scheme_start;
-  return url_parse_http(heap, url, start, end, copy_strings);
-
-eof:
-  return PARSE_ERROR;
+  return PARSE_ERROR; // no non-whitespace found
 }
 
 MIMEParseResult
-url_parse_no_path_component_breakdown(HdrHeap * heap,
-                                      URLImpl * url, const char **start, const 
char *end, bool copy_strings)
+url_parse(HdrHeap * heap, URLImpl * url, const char **start, const char *end, 
bool copy_strings_p)
 {
-  const char *cur;
-  const char *scheme_wks;
-  const char *scheme_start = NULL;
-  const char *scheme_end = NULL;
-  int scheme_wks_idx;
-
-  cur = *start;
-
-skip_ws:
-  if (ParseRules::is_ws(*cur)) {
-    GETNEXT(eof);
-    goto skip_ws;
-  }
-  /////////////////////////////////
-  // fast case starts with http: //
-  /////////////////////////////////
-
-  if ((end - cur >= 5) && (((cur[0] ^ 'h') | (cur[1] ^ 't') | (cur[2] ^ 't') | 
(cur[3] ^ 'p') | (cur[4] ^ ':')) == 0)) {
-    scheme_start = cur;
-    cur += 4;                   // point to colon
-    scheme_end = cur;
-    scheme_wks_idx = URL_WKSIDX_HTTP;
-    scheme_wks = URL_SCHEME_HTTP;
-  } else {
-    scheme_start = cur;
-
-  parse_scheme2:
-    if (*cur != ':') {
-      GETNEXT(done);
-      goto parse_scheme2;
-    }
-    scheme_end = cur;
-    scheme_wks_idx = hdrtoken_tokenize(scheme_start, scheme_end - 
scheme_start, &scheme_wks);
-
-    // FIX: need to improve parser to support unknown schemes --- this code is
-    //      a hack --- if no well-known scheme is found, the parser assumes
-    //      the scheme has been omitted, and the colon is really the colon
-    //      from "username:password" or "host:port", so it calls the scheme
-    //      "http" and continues parsing there.
-    //
-    //      If URLs can really arrive without a scheme:// prefix, then it's
-    //      difficult to disambiguate non-well-known schemes from missing
-    //      schemes.  I'm not sure how such malformed URLs could arrive at
-    //      the proxy --- perhaps (a) some web browsers request files with
-    //      no leading slash, or (b) some web browsers with explicit proxy
-    //      set may omit schemes if the user types in "www.apache.org", etc?
-
-    // if we parsed a scheme which is well known as a scheme, use it as the
-    // scheme, otherwise we'll assume the scheme is missing and treat as HTTP.
-
-    if ((scheme_wks_idx < 0) || (hdrtoken_wks_to_token_type(scheme_wks) != 
HDRTOKEN_TYPE_SCHEME))
-      goto done;
-  }
-
-  ///////////////////////////////////////////
-  // we get here if we have a legal scheme //
-  ///////////////////////////////////////////
-
-  url_scheme_set(heap, url, scheme_start, scheme_wks_idx, scheme_end - 
scheme_start, copy_strings);
-
-  *start = scheme_end;
-  return url_parse_http_no_path_component_breakdown(heap, url, start, end, 
copy_strings);
-
-done:
-  *start = scheme_start;
-  return url_parse_http_no_path_component_breakdown(heap, url, start, end, 
copy_strings);
+  MIMEParseResult zret = url_parse_scheme(heap, url, start, end, 
copy_strings_p);
+  return PARSE_CONT == zret ? url_parse_http(heap, url, start, end, 
copy_strings_p) : zret;
+}
 
-eof:
-  return PARSE_ERROR;
+MIMEParseResult
+url_parse_no_path_component_breakdown(HdrHeap * heap, URLImpl * url, const 
char **start, const char *end, bool copy_strings_p)
+{
+  MIMEParseResult zret = url_parse_scheme(heap, url, start, end, 
copy_strings_p);
+  return PARSE_CONT == zret ? url_parse_http_no_path_component_breakdown(heap, 
url, start, end, copy_strings_p) : zret;
 }
 
 /**
@@ -1516,11 +1421,12 @@ url_print(URLImpl * url, char *buf_start, int 
buf_length, int *buf_index_inout,
   if (url->m_ptr_scheme) {
     TRY(mime_mem_print(url->m_ptr_scheme, url->m_len_scheme,
                        buf_start, buf_length, buf_index_inout, 
buf_chars_to_skip_inout));
-    if ((url->m_scheme_wks_idx >= 0) && 
(hdrtoken_index_to_wks(url->m_scheme_wks_idx) == URL_SCHEME_FILE)) {
-      TRY(mime_mem_print(":", 1, buf_start, buf_length, buf_index_inout, 
buf_chars_to_skip_inout));
-    } else {
+    // [amc] Why is "file:" special cased to be wrong?
+//    if ((url->m_scheme_wks_idx >= 0) && 
(hdrtoken_index_to_wks(url->m_scheme_wks_idx) == URL_SCHEME_FILE)) {
+//      TRY(mime_mem_print(":", 1, buf_start, buf_length, buf_index_inout, 
buf_chars_to_skip_inout));
+//    } else {
       TRY(mime_mem_print("://", 3, buf_start, buf_length, buf_index_inout, 
buf_chars_to_skip_inout));
-    }
+//    }
   }
 
   if (url->m_ptr_user) {

git commit: TS-1315 Fix URL parsing to handle non-HTTP schemes correctly.

Reply via email to