Title: [204544] trunk
Revision
204544
Author
[email protected]
Date
2016-08-16 17:41:30 -0700 (Tue, 16 Aug 2016)

Log Message

URLParser should parse URLs without credentials
https://bugs.webkit.org/show_bug.cgi?id=160913

Reviewed by Brady Eidson.

Source/WebCore:

When parsing a URL, after the scheme we do not know if we are parsing a username and password
or if we are parsing the host until we hit a '@' indicating the end of the credentials or a /, ?, or #
indicating the end of the host.  Because there are significantly different rules for serializing usernames,
passwords, and hosts (all of which have yet to be implemented in URLParser) we put the code points after the 
scheme in a special buffer that will be processed once we know what we are parsing.
        
In the future, this could be optimized by assuming that we are parsing the host and if we encounter a '@' character,
then do some extra work.  This would save us the effort of copying the host twice because most URLs don't have credentials.

This is covered by a new URLParser API test.

* platform/Logging.h:
* platform/URLParser.cpp:
(WebCore::isC0Control):
(WebCore::isC0ControlOrSpace):
(WebCore::isTabOrNewline):
(WebCore::isSpecialScheme):
(WebCore::URLParser::parse):
(WebCore::URLParser::authorityEndReached):
(WebCore::URLParser::hostEndReached):
(WebCore::URLParser::allValuesEqual):
(WebCore::isASCIIDigit): Deleted.
(WebCore::isASCIIAlpha): Deleted.
(WebCore::isASCIIAlphanumeric): Deleted.
* platform/URLParser.h:
(WebCore::URLParser::parse):

Tools:

* TestWebKitAPI/Tests/WebCore/URLParser.cpp:
(TestWebKitAPI::s):
(TestWebKitAPI::checkURL):
(TestWebKitAPI::TEST_F):

Modified Paths

Diff

Modified: trunk/Source/WebCore/ChangeLog (204543 => 204544)


--- trunk/Source/WebCore/ChangeLog	2016-08-17 00:25:36 UTC (rev 204543)
+++ trunk/Source/WebCore/ChangeLog	2016-08-17 00:41:30 UTC (rev 204544)
@@ -1,3 +1,37 @@
+2016-08-16  Alex Christensen  <[email protected]>
+
+        URLParser should parse URLs without credentials
+        https://bugs.webkit.org/show_bug.cgi?id=160913
+
+        Reviewed by Brady Eidson.
+
+        When parsing a URL, after the scheme we do not know if we are parsing a username and password
+        or if we are parsing the host until we hit a '@' indicating the end of the credentials or a /, ?, or #
+        indicating the end of the host.  Because there are significantly different rules for serializing usernames,
+        passwords, and hosts (all of which have yet to be implemented in URLParser) we put the code points after the 
+        scheme in a special buffer that will be processed once we know what we are parsing.
+        
+        In the future, this could be optimized by assuming that we are parsing the host and if we encounter a '@' character,
+        then do some extra work.  This would save us the effort of copying the host twice because most URLs don't have credentials.
+
+        This is covered by a new URLParser API test.
+
+        * platform/Logging.h:
+        * platform/URLParser.cpp:
+        (WebCore::isC0Control):
+        (WebCore::isC0ControlOrSpace):
+        (WebCore::isTabOrNewline):
+        (WebCore::isSpecialScheme):
+        (WebCore::URLParser::parse):
+        (WebCore::URLParser::authorityEndReached):
+        (WebCore::URLParser::hostEndReached):
+        (WebCore::URLParser::allValuesEqual):
+        (WebCore::isASCIIDigit): Deleted.
+        (WebCore::isASCIIAlpha): Deleted.
+        (WebCore::isASCIIAlphanumeric): Deleted.
+        * platform/URLParser.h:
+        (WebCore::URLParser::parse):
+
 2016-08-16  Chris Dumez  <[email protected]>
 
         Add support for ShadowRoot.mode attribute

Modified: trunk/Source/WebCore/platform/Logging.h (204543 => 204544)


--- trunk/Source/WebCore/platform/Logging.h	2016-08-17 00:25:36 UTC (rev 204543)
+++ trunk/Source/WebCore/platform/Logging.h	2016-08-17 00:41:30 UTC (rev 204544)
@@ -78,6 +78,7 @@
     M(StorageAPI) \
     M(TextAutosizing) \
     M(Threading) \
+    M(URLParser) \
     M(WebAudio) \
     M(WebGL) \
     M(WebReplay) \

Modified: trunk/Source/WebCore/platform/URLParser.cpp (204543 => 204544)


--- trunk/Source/WebCore/platform/URLParser.cpp	2016-08-17 00:25:36 UTC (rev 204543)
+++ trunk/Source/WebCore/platform/URLParser.cpp	2016-08-17 00:41:30 UTC (rev 204544)
@@ -25,18 +25,16 @@
 
 #include "config.h"
 #include "URLParser.h"
+
+#include "Logging.h"
 #include "NotImplemented.h"
-
 #include <wtf/text/StringBuilder.h>
 
 namespace WebCore {
 
-static bool isC0Control(const StringView::CodePoints::Iterator& c) { return *c <= 0x001F; }
-static bool isC0ControlOrSpace(const StringView::CodePoints::Iterator& c) { return isC0Control(c) || *c == 0x0020; }
-static bool isTabOrNewline(const StringView::CodePoints::Iterator& c) { return *c == 0x0009 || *c == 0x000A || *c == 0x000D; }
-static bool isASCIIDigit(const StringView::CodePoints::Iterator& c) { return *c >= 0x0030  && *c <= 0x0039; }
-static bool isASCIIAlpha(const StringView::CodePoints::Iterator& c) { return (*c >= 0x0041 && *c <= 0x005A) || (*c >= 0x0061 && *c <= 0x007A); }
-static bool isASCIIAlphanumeric(const StringView::CodePoints::Iterator& c) { return isASCIIDigit(c) || isASCIIAlpha(c); }
+template<typename CharacterType> static bool isC0Control(CharacterType character) { return character <= 0x0001F; }
+template<typename CharacterType> static bool isC0ControlOrSpace(CharacterType character) { return isC0Control(character) || character == 0x0020; }
+template<typename CharacterType> static bool isTabOrNewline(CharacterType character) { return character == 0x0009 || character == 0x000A || character == 0x000D; }
     
 static bool isSpecialScheme(const String& scheme)
 {
@@ -51,13 +49,14 @@
 
 Optional<URL> URLParser::parse(const String& input, const URL& base, const TextEncoding&)
 {
-    URL url;
-    
+    m_url = { };
+    m_buffer.clear();
+    m_authorityOrHostBuffer.clear();
+
     auto codePoints = StringView(input).codePoints();
     auto c = codePoints.begin();
     auto end = codePoints.end();
-    StringBuilder buffer;
-    while (isC0ControlOrSpace(c))
+    while (c != end && isC0ControlOrSpace(*c))
         ++c;
     
     enum class State : uint8_t {
@@ -71,10 +70,8 @@
         RelativeSlash,
         SpecialAuthoritySlashes,
         SpecialAuthorityIgnoreSlashes,
-        Authority,
+        AuthorityOrHost,
         Host,
-        Hostname,
-        Port,
         File,
         FileSlash,
         FileHost,
@@ -85,11 +82,12 @@
         Fragment,
     };
 
-#define LOG_STATE(x)
+#define LOG_STATE(x) LOG(URLParser, x)
+#define LOG_FINAL_STATE(x) LOG(URLParser, "Final State: %s", x)
 
     State state = State::SchemeStart;
     while (c != end) {
-        if (isTabOrNewline(c)) {
+        if (isTabOrNewline(*c)) {
             ++c;
             continue;
         }
@@ -97,8 +95,8 @@
         switch (state) {
         case State::SchemeStart:
             LOG_STATE("SchemeStart");
-            if (isASCIIAlpha(c)) {
-                buffer.append(toASCIILower(*c));
+            if (isASCIIAlpha(*c)) {
+                m_buffer.append(toASCIILower(*c));
                 state = State::Scheme;
             } else
                 state = State::NoScheme;
@@ -106,12 +104,12 @@
             break;
         case State::Scheme:
             LOG_STATE("Scheme");
-            if (isASCIIAlphanumeric(c) || *c == '+' || *c == '-' || *c == '.')
-                buffer.append(toASCIILower(*c));
+            if (isASCIIAlphanumeric(*c) || *c == '+' || *c == '-' || *c == '.')
+                m_buffer.append(toASCIILower(*c));
             else if (*c == ':') {
-                url.m_schemeEnd = buffer.length();
-                String urlScheme = buffer.toString(); // FIXME: Find a way to do this without shrinking the buffer.
-                url.m_protocolIsInHTTPFamily = urlScheme == "http" || urlScheme == "https";
+                m_url.m_schemeEnd = m_buffer.length();
+                String urlScheme = m_buffer.toString(); // FIXME: Find a way to do this without shrinking the m_buffer.
+                m_url.m_protocolIsInHTTPFamily = urlScheme == "http" || urlScheme == "https";
                 if (urlScheme == "file")
                     state = State::File;
                 else if (isSpecialScheme(urlScheme)) {
@@ -121,9 +119,9 @@
                         state = State::SpecialAuthoritySlashes;
                 } else
                     state = State::SchemeEndCheckForSlashes;
-                buffer.append(':');
+                m_buffer.append(':');
             } else {
-                buffer.clear();
+                m_buffer.clear();
                 state = State::NoScheme;
                 // FIXME: Find a way to start over here.
                 notImplemented();
@@ -179,9 +177,9 @@
                 ++c;
                 if (c == end)
                     return Nullopt;
-                buffer.append('/');
+                m_buffer.append('/');
                 if (*c == '/') {
-                    buffer.append('/');
+                    m_buffer.append('/');
                     state = State::SpecialAuthorityIgnoreSlashes;
                     ++c;
                     break;
@@ -193,59 +191,32 @@
             break;
         case State::SpecialAuthorityIgnoreSlashes:
             LOG_STATE("SpecialAuthorityIgnoreSlashes");
-            if (*c != '/' && *c != '\\') {
-                state = State::Authority;
-                break;
-            }
-            notImplemented();
-            ++c;
+            if (*c == '/' || *c == '\\')
+                ++c;
+            m_url.m_userStart = m_buffer.length();
+            state = State::AuthorityOrHost;
             break;
-        case State::Authority:
-            LOG_STATE("Authority");
-            if (!url.m_userStart)
-                url.m_userStart = buffer.length();
+        case State::AuthorityOrHost:
+            LOG_STATE("AuthorityOrHost");
             if (*c == '@') {
-                url.m_passwordEnd = buffer.length();
-                buffer.append('@');
+                authorityEndReached();
                 state = State::Host;
-                notImplemented();
-            } else if (*c == ':') {
-                url.m_userEnd = buffer.length();
-                buffer.append(*c);
-            } else {
-                if (*c == '/' || *c == '?' || *c == '#') {
-                    url.m_passwordEnd = buffer.length();
-                    state = State::Host;
-                }
-                buffer.append(*c);
-            }
-            ++c;
-            break;
-        case State::Host:
-        case State::Hostname:
-            LOG_STATE("Host/Hostname");
-            if (*c == ':') {
-                url.m_hostEnd = buffer.length();
-                buffer.append(':');
-                state = State::Port;
             } else if (*c == '/' || *c == '?' || *c == '#') {
-                url.m_hostEnd = buffer.length();
+                hostEndReached();
                 state = State::Path;
-                continue;
+                break;
             } else
-                buffer.append(*c);
+                m_authorityOrHostBuffer.append(*c);
             ++c;
             break;
-        case State::Port:
-            LOG_STATE("Port");
-            if (isASCIIDigit(c)) {
-                buffer.append(*c);
-            } else if (*c == '/' || *c == '?' || *c == '#') {
-                url.m_portEnd = buffer.length();
-                state = State::PathStart;
+        case State::Host:
+            LOG_STATE("Host");
+            if (*c == '/' || *c == '?' || *c == '#') {
+                hostEndReached();
+                state = State::Path;
                 continue;
-            } else
-                return Nullopt;
+            }
+            m_authorityOrHostBuffer.append(*c);
             ++c;
             break;
         case State::File:
@@ -270,8 +241,8 @@
         case State::Path:
             LOG_STATE("Path");
             if (*c == '/') {
-                buffer.append('/');
-                url.m_pathAfterLastSlash = buffer.length();
+                m_buffer.append('/');
+                m_url.m_pathAfterLastSlash = m_buffer.length();
                 ++c;
                 if (c == end)
                     break;
@@ -284,16 +255,16 @@
                     notImplemented();
                 }
             } else if (*c == '?') {
-                url.m_pathEnd = buffer.length();
+                m_url.m_pathEnd = m_buffer.length();
                 state = State::Query;
                 continue;
             } else if (*c == '#') {
-                url.m_pathEnd = buffer.length();
+                m_url.m_pathEnd = m_buffer.length();
                 state = State::Fragment;
                 continue;
             }
             // FIXME: Percent encode c
-            buffer.append(*c);
+            m_buffer.append(*c);
             ++c;
             break;
         case State::CannotBeABaseURLPath:
@@ -304,79 +275,177 @@
         case State::Query:
             LOG_STATE("Query");
             if (*c == '#') {
-                url.m_queryEnd = buffer.length();
+                m_url.m_queryEnd = m_buffer.length();
                 state = State::Fragment;
                 continue;
             }
-            buffer.append(*c);
+            m_buffer.append(*c);
             ++c;
             break;
         case State::Fragment:
             LOG_STATE("Fragment");
-            buffer.append(*c);
+            m_buffer.append(*c);
             ++c;
             break;
         }
     }
-    
+
     switch (state) {
     case State::SchemeStart:
+        LOG_FINAL_STATE("SchemeStart");
+        return Nullopt;
+        break;
     case State::Scheme:
+        LOG_FINAL_STATE("Scheme");
+        break;
     case State::SchemeEndCheckForSlashes:
+        LOG_FINAL_STATE("SchemeEndCheckForSlashes");
+        break;
     case State::NoScheme:
+        LOG_FINAL_STATE("NoScheme");
+        break;
     case State::SpecialRelativeOrAuthority:
+        LOG_FINAL_STATE("SpecialRelativeOrAuthority");
+        break;
     case State::PathOrAuthority:
+        LOG_FINAL_STATE("PathOrAuthority");
+        break;
     case State::Relative:
+        LOG_FINAL_STATE("Relative");
+        break;
     case State::RelativeSlash:
+        LOG_FINAL_STATE("RelativeSlash");
+        break;
     case State::SpecialAuthoritySlashes:
+        LOG_FINAL_STATE("SpecialAuthoritySlashes");
+        break;
     case State::SpecialAuthorityIgnoreSlashes:
-    case State::Authority:
+        LOG_FINAL_STATE("SpecialAuthorityIgnoreSlashes");
         break;
+    case State::AuthorityOrHost:
+        LOG_FINAL_STATE("AuthorityOrHost");
+        m_url.m_userEnd = m_buffer.length();
+        m_url.m_passwordEnd = m_url.m_userEnd;
+        FALLTHROUGH;
     case State::Host:
-    case State::Hostname:
-        url.m_hostEnd = buffer.length();
-        url.m_portEnd = url.m_hostEnd;
-        buffer.append('/');
-        url.m_pathEnd = url.m_hostEnd + 1;
-        url.m_pathAfterLastSlash = url.m_pathEnd;
-        url.m_queryEnd = url.m_pathEnd;
-        url.m_fragmentEnd = url.m_pathEnd;
+        if (state == State::Host)
+            LOG_FINAL_STATE("Host");
+        hostEndReached();
+        m_buffer.append('/');
+        m_url.m_pathEnd = m_url.m_portEnd + 1;
+        m_url.m_pathAfterLastSlash = m_url.m_pathEnd;
+        m_url.m_queryEnd = m_url.m_pathEnd;
+        m_url.m_fragmentEnd = m_url.m_pathEnd;
         break;
-    case State::Port:
-        url.m_portEnd = buffer.length();
-        buffer.append('/');
-        url.m_pathEnd = url.m_portEnd + 1;
-        url.m_pathAfterLastSlash = url.m_pathEnd;
-        url.m_queryEnd = url.m_pathEnd;
-        url.m_fragmentEnd = url.m_pathEnd;
+    case State::File:
+        LOG_FINAL_STATE("File");
         break;
-    case State::File:
     case State::FileSlash:
+        LOG_FINAL_STATE("FileSlash");
+        break;
     case State::FileHost:
+        LOG_FINAL_STATE("FileHost");
+        break;
     case State::PathStart:
+        LOG_FINAL_STATE("PathStart");
+        break;
     case State::Path:
-        url.m_pathEnd = buffer.length();
-        url.m_queryEnd = url.m_pathEnd;
-        url.m_fragmentEnd = url.m_pathEnd;
+        LOG_FINAL_STATE("Path");
+        m_url.m_pathEnd = m_buffer.length();
+        m_url.m_queryEnd = m_url.m_pathEnd;
+        m_url.m_fragmentEnd = m_url.m_pathEnd;
         break;
     case State::CannotBeABaseURLPath:
+        LOG_FINAL_STATE("CannotBeABaseURLPath");
         break;
     case State::Query:
-        url.m_queryEnd = buffer.length();
-        url.m_fragmentEnd = url.m_queryEnd;
+        LOG_FINAL_STATE("Query");
+        m_url.m_queryEnd = m_buffer.length();
+        m_url.m_fragmentEnd = m_url.m_queryEnd;
         break;
     case State::Fragment:
-        url.m_fragmentEnd = buffer.length();
+        LOG_FINAL_STATE("Fragment");
+        m_url.m_fragmentEnd = m_buffer.length();
         break;
     }
 
-    url.m_string = buffer.toString();
-    url.m_isValid = true;
-    return url;
+    m_url.m_string = m_buffer.toString();
+    m_url.m_isValid = true;
+    return m_url;
 }
 
+void URLParser::authorityEndReached()
+{
+    auto codePoints = StringView(m_authorityOrHostBuffer.toString()).codePoints();
+    auto iterator = codePoints.begin();
+    auto end = codePoints.end();
+    for (; iterator != end; ++iterator) {
+        m_buffer.append(*iterator);
+        if (*iterator == ':') {
+            ++iterator;
+            m_url.m_userEnd = m_buffer.length() - 1;
+            break;
+        }
+    }
+    for (; iterator != end; ++iterator)
+        m_buffer.append(*iterator);
+    m_url.m_passwordEnd = m_buffer.length();
+    m_buffer.append('@');
+    m_authorityOrHostBuffer.clear();
+}
+
+void URLParser::hostEndReached()
+{
+    auto codePoints = StringView(m_authorityOrHostBuffer.toString()).codePoints();
+    auto iterator = codePoints.begin();
+    auto end = codePoints.end();
+    for (; iterator != end; ++iterator) {
+        if (*iterator == ':') {
+            ++iterator;
+            m_url.m_hostEnd = m_buffer.length();
+            m_buffer.append(':');
+            for (; iterator != end; ++iterator)
+                m_buffer.append(*iterator);
+            m_url.m_portEnd = m_buffer.length();
+            return;
+        }
+        m_buffer.append(*iterator);
+    }
+    m_url.m_hostEnd = m_buffer.length();
+    m_url.m_portEnd = m_url.m_hostEnd;
+    m_authorityOrHostBuffer.clear();
+}
+
 bool URLParser::allValuesEqual(const URL& a, const URL& b)
 {
+    LOG(URLParser, "%d %d %d %d %d %d %d %d %d %d %d %d %s\n%d %d %d %d %d %d %d %d %d %d %d %d %s",
+        a.m_isValid,
+        a.m_protocolIsInHTTPFamily,
+        a.m_schemeEnd,
+        a.m_userStart,
+        a.m_userEnd,
+        a.m_passwordEnd,
+        a.m_hostEnd,
+        a.m_portEnd,
+        a.m_pathAfterLastSlash,
+        a.m_pathEnd,
+        a.m_queryEnd,
+        a.m_fragmentEnd,
+        a.m_string.utf8().data(),
+        b.m_isValid,
+        b.m_protocolIsInHTTPFamily,
+        b.m_schemeEnd,
+        b.m_userStart,
+        b.m_userEnd,
+        b.m_passwordEnd,
+        b.m_hostEnd,
+        b.m_portEnd,
+        b.m_pathAfterLastSlash,
+        b.m_pathEnd,
+        b.m_queryEnd,
+        b.m_fragmentEnd,
+        b.m_string.utf8().data());
+
     return a.m_string == b.m_string
         && a.m_isValid == b.m_isValid
         && a.m_protocolIsInHTTPFamily == b.m_protocolIsInHTTPFamily

Modified: trunk/Source/WebCore/platform/URLParser.h (204543 => 204544)


--- trunk/Source/WebCore/platform/URLParser.h	2016-08-17 00:25:36 UTC (rev 204543)
+++ trunk/Source/WebCore/platform/URLParser.h	2016-08-17 00:41:30 UTC (rev 204544)
@@ -28,13 +28,20 @@
 #include "TextEncoding.h"
 #include "URL.h"
 #include <wtf/Forward.h>
+#include <wtf/text/StringBuilder.h>
 
 namespace WebCore {
     
 class URLParser {
 public:
-    WEBCORE_EXPORT static Optional<URL> parse(const String&, const URL& = { }, const TextEncoding& = UTF8Encoding());
+    WEBCORE_EXPORT Optional<URL> parse(const String&, const URL& = { }, const TextEncoding& = UTF8Encoding());
     WEBCORE_EXPORT static bool allValuesEqual(const URL&, const URL&);
+private:
+    URL m_url;
+    StringBuilder m_buffer;
+    StringBuilder m_authorityOrHostBuffer;
+    void authorityEndReached();
+    void hostEndReached();
 };
 
 }

Modified: trunk/Tools/ChangeLog (204543 => 204544)


--- trunk/Tools/ChangeLog	2016-08-17 00:25:36 UTC (rev 204543)
+++ trunk/Tools/ChangeLog	2016-08-17 00:41:30 UTC (rev 204544)
@@ -1,3 +1,15 @@
+2016-08-16  Alex Christensen  <[email protected]>
+
+        URLParser should parse URLs without credentials
+        https://bugs.webkit.org/show_bug.cgi?id=160913
+
+        Reviewed by Brady Eidson.
+
+        * TestWebKitAPI/Tests/WebCore/URLParser.cpp:
+        (TestWebKitAPI::s):
+        (TestWebKitAPI::checkURL):
+        (TestWebKitAPI::TEST_F):
+
 2016-08-16  Anders Carlsson  <[email protected]>
 
         Add WTF::ScopeExit

Modified: trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp (204543 => 204544)


--- trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp	2016-08-17 00:25:36 UTC (rev 204543)
+++ trunk/Tools/TestWebKitAPI/Tests/WebCore/URLParser.cpp	2016-08-17 00:41:30 UTC (rev 204544)
@@ -53,7 +53,8 @@
 static const char* s(const String& s) { return s.utf8().data(); }
 static void checkURL(const String& urlString, const ExpectedParts& parts)
 {
-    auto url = ""
+    URLParser parser;
+    auto url = ""
     EXPECT_STREQ(s(parts.protocol), s(url->protocol()));
     EXPECT_STREQ(s(parts.user), s(url->user()));
     EXPECT_STREQ(s(parts.password), s(url->pass()));
@@ -86,6 +87,19 @@
     checkURL("http://user:[email protected]:123/", {"http", "user", "pass", "webkit.org", 123, "/", "", "", "http://user:[email protected]:123/"});
     checkURL("http://user:[email protected]:123", {"http", "user", "pass", "webkit.org", 123, "/", "", "", "http://user:[email protected]:123/"});
     checkURL("http://user:[email protected]", {"http", "user", "pass", "webkit.org", 0, "/", "", "", "http://user:[email protected]/"});
+    checkURL("http://webkit.org", {"http", "", "", "webkit.org", 0, "/", "", "", "http://webkit.org/"});
 }
 
+static void shouldFail(const String& urlString)
+{
+    URLParser parser;
+    auto invalidURL = parser.parse(urlString);
+    EXPECT_TRUE(invalidURL == Nullopt);
+}
+    
+TEST_F(URLParserTest, ParserFailures)
+{
+    shouldFail("    ");
+}
+
 } // namespace TestWebKitAPI
_______________________________________________
webkit-changes mailing list
[email protected]
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to