Title: [221917] trunk
Revision
221917
Author
[email protected]
Date
2017-09-12 08:35:10 -0700 (Tue, 12 Sep 2017)

Log Message

Show punycode to user if a URL mixes Armenian Seh or Vo with other scripts
https://bugs.webkit.org/show_bug.cgi?id=176578
<rdar://problem/33906231>

Reviewed by Alex Christensen.

Source/WebCore:

Revise our "lookalike character" logic to include the Armenian Vo and Seh
characters, which can be mistaken for 'n' and 'v' when displayed in
certain fonts.

Tested by new API tests.

* platform/mac/WebCoreNSURLExtras.mm:
(WebCore::isArmenianLookalikeCharacter): Added utility function.
(WebCore::isArmenianScriptCharacter): Ditto.
(WebCore::isLookalikeCharacter): Handle Armenian-lookalike cases.

Source/WTF:

* wtf/ASCIICType.h:
(WTF::isASCIIDigitOrPunctuation): Added helper function to recognize ASCII digits
and punctuation characters.

Tools:

* TestWebKitAPI/Tests/WebCore/cocoa/URLExtras.mm:
(TestWebKitAPI::TEST):

Modified Paths

Diff

Modified: trunk/Source/WTF/ChangeLog (221916 => 221917)


--- trunk/Source/WTF/ChangeLog	2017-09-12 15:31:41 UTC (rev 221916)
+++ trunk/Source/WTF/ChangeLog	2017-09-12 15:35:10 UTC (rev 221917)
@@ -1,3 +1,15 @@
+2017-09-12  Brent Fulgham  <[email protected]>
+
+        Show punycode to user if a URL mixes Armenian Seh or Vo with other scripts
+        https://bugs.webkit.org/show_bug.cgi?id=176578
+        <rdar://problem/33906231>
+
+        Reviewed by Alex Christensen.
+
+        * wtf/ASCIICType.h:
+        (WTF::isASCIIDigitOrPunctuation): Added helper function to recognize ASCII digits
+        and punctuation characters.
+
 2017-09-12  Sam Weinig  <[email protected]>
 
         [Cleanup] Follow up cleanup for DOMFormData implementation

Modified: trunk/Source/WTF/wtf/ASCIICType.h (221916 => 221917)


--- trunk/Source/WTF/wtf/ASCIICType.h	2017-09-12 15:31:41 UTC (rev 221916)
+++ trunk/Source/WTF/wtf/ASCIICType.h	2017-09-12 15:35:10 UTC (rev 221917)
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2007-2016 Apple Inc. All rights reserved.
+ * Copyright (C) 2007-2017 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
@@ -247,8 +247,13 @@
     return LIKELY(toASCIILowerUnchecked(inputCharacter) == expectedASCIILowercaseLetter);
 }
 
+template<typename CharacterType> inline bool isASCIIDigitOrPunctuation(CharacterType charCode)
+{
+    return (charCode >= '!' && charCode <= '@') || (charCode >= '[' && charCode <= '`') || (charCode >= '{' && charCode <= '~');
 }
 
+}
+
 using WTF::isASCII;
 using WTF::isASCIIAlpha;
 using WTF::isASCIIAlphaCaselessEqual;
@@ -255,6 +260,7 @@
 using WTF::isASCIIAlphanumeric;
 using WTF::isASCIIBinaryDigit;
 using WTF::isASCIIDigit;
+using WTF::isASCIIDigitOrPunctuation;
 using WTF::isASCIIHexDigit;
 using WTF::isASCIILower;
 using WTF::isASCIIOctalDigit;

Modified: trunk/Source/WebCore/ChangeLog (221916 => 221917)


--- trunk/Source/WebCore/ChangeLog	2017-09-12 15:31:41 UTC (rev 221916)
+++ trunk/Source/WebCore/ChangeLog	2017-09-12 15:35:10 UTC (rev 221917)
@@ -1,3 +1,22 @@
+2017-09-12  Brent Fulgham  <[email protected]>
+
+        Show punycode to user if a URL mixes Armenian Seh or Vo with other scripts
+        https://bugs.webkit.org/show_bug.cgi?id=176578
+        <rdar://problem/33906231>
+
+        Reviewed by Alex Christensen.
+
+        Revise our "lookalike character" logic to include the Armenian Vo and Seh
+        characters, which can be mistaken for 'n' and 'v' when displayed in
+        certain fonts.
+
+        Tested by new API tests.
+
+        * platform/mac/WebCoreNSURLExtras.mm:
+        (WebCore::isArmenianLookalikeCharacter): Added utility function.
+        (WebCore::isArmenianScriptCharacter): Ditto.
+        (WebCore::isLookalikeCharacter): Handle Armenian-lookalike cases.
+
 2017-09-12  Antti Koivisto  <[email protected]>
 
         Remove RenderElement::isCSSAnimating boolean

Modified: trunk/Source/WebCore/platform/mac/WebCoreNSURLExtras.mm (221916 => 221917)


--- trunk/Source/WebCore/platform/mac/WebCoreNSURLExtras.mm	2017-09-12 15:31:41 UTC (rev 221916)
+++ trunk/Source/WebCore/platform/mac/WebCoreNSURLExtras.mm	2017-09-12 15:35:10 UTC (rev 221917)
@@ -58,6 +58,48 @@
 
 namespace WebCore {
 
+static bool isArmenianLookalikeCharacter(UChar32 codePoint)
+{
+    return codePoint == 0x0548 || codePoint == 0x054D || codePoint == 0x0578 || codePoint == 0x057D;
+}
+
+static bool isArmenianScriptCharacter(UChar32 codePoint)
+{
+    UErrorCode error = U_ZERO_ERROR;
+    UScriptCode script = uscript_getScript(codePoint, &error);
+    if (error != U_ZERO_ERROR) {
+        LOG_ERROR("got ICU error while trying to look at scripts: %d", error);
+        return false;
+    }
+
+    return script == USCRIPT_ARMENIAN;
+}
+
+
+template<typename CharacterType> inline bool isASCIIDigitOrValidHostCharacter(CharacterType charCode)
+{
+    if (!isASCIIDigitOrPunctuation(charCode))
+        return false;
+
+    // Things the URL Parser rejects:
+    switch (charCode) {
+    case '#':
+    case '%':
+    case '/':
+    case ':':
+    case '?':
+    case '@':
+    case '[':
+    case '\\':
+    case ']':
+        return false;
+    default:
+        return true;
+    }
+}
+
+
+
 static BOOL isLookalikeCharacter(std::optional<UChar32> previousCodePoint, UChar32 charCode)
 {
     // This function treats the following as unsafe, lookalike characters:
@@ -186,8 +228,19 @@
         case 0x0307: /* COMBINING DOT ABOVE */
             return previousCodePoint == 0x0237 /* LATIN SMALL LETTER DOTLESS J */
                 || previousCodePoint == 0x0131; /* LATIN SMALL LETTER DOTLESS I */
+        case 0x0548: /* ARMENIAN CAPITAL LETTER VO */
+        case 0x054D: /* ARMENIAN CAPITAL LETTER SEH */
+        case 0x0578: /* ARMENIAN SMALL LETTER VO */
+        case 0x057D: /* ARMENIAN SMALL LETTER SEH */
+            return previousCodePoint
+                && !isASCIIDigitOrValidHostCharacter(previousCodePoint.value())
+                && !isArmenianScriptCharacter(previousCodePoint.value());
+        case '.':
+            return NO;
         default:
-            return NO;
+            return previousCodePoint
+                && isArmenianLookalikeCharacter(previousCodePoint.value())
+                && !(isArmenianScriptCharacter(charCode) || isASCIIDigitOrValidHostCharacter(charCode));
     }
 }
 

Modified: trunk/Tools/ChangeLog (221916 => 221917)


--- trunk/Tools/ChangeLog	2017-09-12 15:31:41 UTC (rev 221916)
+++ trunk/Tools/ChangeLog	2017-09-12 15:35:10 UTC (rev 221917)
@@ -1,3 +1,14 @@
+2017-09-12 Brent Fulgham  <[email protected]>
+
+        Show punycode to user if a URL mixes Armenian Seh or Vo with other scripts
+        https://bugs.webkit.org/show_bug.cgi?id=176578
+        <rdar://problem/33906231>
+
+        Reviewed by Alex Christensen.
+
+        * TestWebKitAPI/Tests/WebCore/cocoa/URLExtras.mm:
+        (TestWebKitAPI::TEST):
+
 2017-09-12  Carlos Garcia Campos  <[email protected]>
 
         [Freetype] Doesn't support coloured fonts

Modified: trunk/Tools/TestWebKitAPI/Tests/WebCore/cocoa/URLExtras.mm (221916 => 221917)


--- trunk/Tools/TestWebKitAPI/Tests/WebCore/cocoa/URLExtras.mm	2017-09-12 15:31:41 UTC (rev 221916)
+++ trunk/Tools/TestWebKitAPI/Tests/WebCore/cocoa/URLExtras.mm	2017-09-12 15:35:10 UTC (rev 221917)
@@ -90,6 +90,10 @@
         "xn--o8f", // U+1D21
         "xn--p8f", // U+1D22
         "xn--0na", // U+0261
+        "xn--cn-ded", // U+054D
+        "xn--ews-nfe.org", // U+054D
+        "xn--yotube-qkh", // U+0578
+        "xn--cla-7fe.edu", // U+0578
     };
     for (const String& host : punycodedSpoofHosts) {
         auto url = makeString("http://", host, "/").utf8();
@@ -97,6 +101,20 @@
     }
 }
 
+TEST(WebCore, URLExtras_NotSpoofed)
+{
+    // Valid mixtures of Armenian and other scripts
+    EXPECT_STREQ("https://en.wikipedia.org/wiki/.\u0570\u0561\u0575", userVisibleString(literalURL("https://en.wikipedia.org/wiki/.\u0570\u0561\u0575")));
+    EXPECT_STREQ("https://\u0573\u0574\u0578.\u0570\u0561\u0575", userVisibleString(literalURL("https://\u0573\u0574\u0578.\u0570\u0561\u0575")));
+    EXPECT_STREQ("https://\u0573-1-\u0574\u0578.\u0570\u0561\u0575", userVisibleString(literalURL("https://\u0573-1-\u0574\u0578.\u0570\u0561\u0575")));
+    EXPECT_STREQ("https://2\u0573_\u0574\u0578.\u0570\u0561\u0575", userVisibleString(literalURL("https://2\u0573_\u0574\u0578.\u0570\u0561\u0575")));
+    EXPECT_STREQ("https://\u0573_\u0574\u05783.\u0570\u0561\u0575", userVisibleString(literalURL("https://\u0573_\u0574\u05783.\u0570\u0561\u0575")));
+    EXPECT_STREQ("https://got\u0551\u0535\u0543.com", userVisibleString(literalURL("https://got\u0551\u0535\u0543.com")));
+    EXPECT_STREQ("https://\u0551\u0535\u0543fans.net", userVisibleString(literalURL("https://\u0551\u0535\u0543fans.net")));
+    EXPECT_STREQ("https://\u0551\u0535or\u0575\u0543.biz", userVisibleString(literalURL("https://\u0551\u0535or\u0575\u0543.biz")));
+    EXPECT_STREQ("https://\u0551\u0535and!$^&*()-~+={}or<>,.?\u0575\u0543.biz", userVisibleString(literalURL("https://\u0551\u0535and!$^&*()-~+={}or<>,.?\u0575\u0543.biz")));
+}
+
 TEST(WebCore, URLExtras_DivisionSign)
 {
     // Selected the division sign as an example of a non-ASCII character that is allowed in host names, since it's a lookalike character.
_______________________________________________
webkit-changes mailing list
[email protected]
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to