Title: [276225] trunk/Source/WebCore
Revision
276225
Author
[email protected]
Date
2021-04-18 00:43:33 -0700 (Sun, 18 Apr 2021)

Log Message

Use binary-search in LocaleToScriptMapping
https://bugs.webkit.org/show_bug.cgi?id=224727

Reviewed by Darin Adler.

This patch removes the HashMaps in LocaleToScriptMapping and performs binary search on constant data arrays instead.
These maps are not frequently used. Keys of the maps can be encoded into uint32_t or uint64_t so that
comparison becomes super cheap and we can initialize these arrays at compile time.

We introduce ScriptName (uint32_t) and LocaleName (uint64_t) instead of String, and use them to generate
sorted constant data arrays for the mappings. We use binary search to look entries up. Since the number of
entries is not huge (~200), comparisons are extremely cheap (uint32_t / uint64_t comparison), and this is not
hot code, we can just use binary search here and eliminate the HashMaps' memory allocation.

* platform/text/LocaleToScriptMapping.cpp:
(WebCore::PackedASCIILowerCodes::PackedASCIILowerCodes):
(WebCore::PackedASCIILowerCodes::parse):
(WebCore::PackedASCIILowerCodes::operator==):
(WebCore::PackedASCIILowerCodes::operator!=):
(WebCore::PackedASCIILowerCodes::operator<):
(WebCore::PackedASCIILowerCodes::operator<=):
(WebCore::PackedASCIILowerCodes::operator>):
(WebCore::PackedASCIILowerCodes::operator>=):
(WebCore::PackedASCIILowerCodes::value const):
(WebCore::scriptNameToCode):
(WebCore::localeToScriptCodeForFontSelection):

Modified Paths

Diff

Modified: trunk/Source/WebCore/ChangeLog (276224 => 276225)


--- trunk/Source/WebCore/ChangeLog	2021-04-18 07:14:07 UTC (rev 276224)
+++ trunk/Source/WebCore/ChangeLog	2021-04-18 07:43:33 UTC (rev 276225)
@@ -1,3 +1,32 @@
+2021-04-17  Yusuke Suzuki  <[email protected]>
+
+        Use binary-search in LocaleToScriptMapping
+        https://bugs.webkit.org/show_bug.cgi?id=224727
+
+        Reviewed by Darin Adler.
+
+        This patch removes HashMaps in LocaleToScriptMapping, and binary-search onto the constant data arrays.
+        These maps are not frequently used. Keys of the maps can be encoded into uint32_t or uint64_t so that
+        comparison becomes super cheap and we can initialize this array at compile-time.
+
+        We introduce ScriptName(uint32_t) and LocaleName(uint64_t) instead of String. And ues it and generate
+        sorted constant data array for mappings. We use binary-search to look entry up. Since # of entries are
+        not huge (~200), comparisons are extremely cheap (uint32_t / uint64_t comparison), and this is not a
+        hot code, we can just use binary-search here and eliminate HashMaps' memory allocation.
+
+        * platform/text/LocaleToScriptMapping.cpp:
+        (WebCore::PackedASCIILowerCodes::PackedASCIILowerCodes):
+        (WebCore::PackedASCIILowerCodes::parse):
+        (WebCore::PackedASCIILowerCodes::operator==):
+        (WebCore::PackedASCIILowerCodes::operator!=):
+        (WebCore::PackedASCIILowerCodes::operator<):
+        (WebCore::PackedASCIILowerCodes::operator<=):
+        (WebCore::PackedASCIILowerCodes::operator>):
+        (WebCore::PackedASCIILowerCodes::operator>=):
+        (WebCore::PackedASCIILowerCodes::value const):
+        (WebCore::scriptNameToCode):
+        (WebCore::localeToScriptCodeForFontSelection):
+
 2021-04-17  Wenson Hsieh  <[email protected]>
 
         [macOS] Add some support for webpage translation in WebKitLegacy

Modified: trunk/Source/WebCore/platform/text/LocaleToScriptMapping.cpp (276224 => 276225)


--- trunk/Source/WebCore/platform/text/LocaleToScriptMapping.cpp	2021-04-18 07:14:07 UTC (rev 276224)
+++ trunk/Source/WebCore/platform/text/LocaleToScriptMapping.cpp	2021-04-18 07:43:33 UTC (rev 276225)
@@ -1,5 +1,6 @@
 /*
  * Copyright (C) 2011 Google Inc. All rights reserved.
+ * Copyright (C) 2021 Apple Inc. All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions are
@@ -37,368 +38,460 @@
 
 namespace WebCore {
 
-struct ScriptNameCode {
-    ASCIILiteral name;
-    UScriptCode code;
-};
+template<typename StorageInteger>
+class PackedASCIILowerCodes {
+public:
+    static_assert(std::is_unsigned_v<StorageInteger>);
 
-// This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are
-// treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to
-// USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered
-// using the same font setting.
-static const ScriptNameCode scriptNameCodeList[] = {
-    { "zyyy"_s, USCRIPT_COMMON },
-    { "qaai"_s, USCRIPT_INHERITED },
-    { "arab"_s, USCRIPT_ARABIC },
-    { "armn"_s, USCRIPT_ARMENIAN },
-    { "beng"_s, USCRIPT_BENGALI },
-    { "bopo"_s, USCRIPT_BOPOMOFO },
-    { "cher"_s, USCRIPT_CHEROKEE },
-    { "copt"_s, USCRIPT_COPTIC },
-    { "cyrl"_s, USCRIPT_CYRILLIC },
-    { "dsrt"_s, USCRIPT_DESERET },
-    { "deva"_s, USCRIPT_DEVANAGARI },
-    { "ethi"_s, USCRIPT_ETHIOPIC },
-    { "geor"_s, USCRIPT_GEORGIAN },
-    { "goth"_s, USCRIPT_GOTHIC },
-    { "grek"_s, USCRIPT_GREEK },
-    { "gujr"_s, USCRIPT_GUJARATI },
-    { "guru"_s, USCRIPT_GURMUKHI },
-    { "hani"_s, USCRIPT_HAN },
-    { "hang"_s, USCRIPT_HANGUL },
-    { "hebr"_s, USCRIPT_HEBREW },
-    { "hira"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
-    { "knda"_s, USCRIPT_KANNADA },
-    { "kana"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
-    { "khmr"_s, USCRIPT_KHMER },
-    { "laoo"_s, USCRIPT_LAO },
-    { "latn"_s, USCRIPT_LATIN },
-    { "mlym"_s, USCRIPT_MALAYALAM },
-    { "mong"_s, USCRIPT_MONGOLIAN },
-    { "mymr"_s, USCRIPT_MYANMAR },
-    { "ogam"_s, USCRIPT_OGHAM },
-    { "ital"_s, USCRIPT_OLD_ITALIC },
-    { "orya"_s, USCRIPT_ORIYA },
-    { "runr"_s, USCRIPT_RUNIC },
-    { "sinh"_s, USCRIPT_SINHALA },
-    { "syrc"_s, USCRIPT_SYRIAC },
-    { "taml"_s, USCRIPT_TAMIL },
-    { "telu"_s, USCRIPT_TELUGU },
-    { "thaa"_s, USCRIPT_THAANA },
-    { "thai"_s, USCRIPT_THAI },
-    { "tibt"_s, USCRIPT_TIBETAN },
-    { "cans"_s, USCRIPT_CANADIAN_ABORIGINAL },
-    { "yiii"_s, USCRIPT_YI },
-    { "tglg"_s, USCRIPT_TAGALOG },
-    { "hano"_s, USCRIPT_HANUNOO },
-    { "buhd"_s, USCRIPT_BUHID },
-    { "tagb"_s, USCRIPT_TAGBANWA },
-    { "brai"_s, USCRIPT_BRAILLE },
-    { "cprt"_s, USCRIPT_CYPRIOT },
-    { "limb"_s, USCRIPT_LIMBU },
-    { "linb"_s, USCRIPT_LINEAR_B },
-    { "osma"_s, USCRIPT_OSMANYA },
-    { "shaw"_s, USCRIPT_SHAVIAN },
-    { "tale"_s, USCRIPT_TAI_LE },
-    { "ugar"_s, USCRIPT_UGARITIC },
-    { "hrkt"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
-    { "bugi"_s, USCRIPT_BUGINESE },
-    { "glag"_s, USCRIPT_GLAGOLITIC },
-    { "khar"_s, USCRIPT_KHAROSHTHI },
-    { "sylo"_s, USCRIPT_SYLOTI_NAGRI },
-    { "talu"_s, USCRIPT_NEW_TAI_LUE },
-    { "tfng"_s, USCRIPT_TIFINAGH },
-    { "xpeo"_s, USCRIPT_OLD_PERSIAN },
-    { "bali"_s, USCRIPT_BALINESE },
-    { "batk"_s, USCRIPT_BATAK },
-    { "blis"_s, USCRIPT_BLISSYMBOLS },
-    { "brah"_s, USCRIPT_BRAHMI },
-    { "cham"_s, USCRIPT_CHAM },
-    { "cirt"_s, USCRIPT_CIRTH },
-    { "cyrs"_s, USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC },
-    { "egyd"_s, USCRIPT_DEMOTIC_EGYPTIAN },
-    { "egyh"_s, USCRIPT_HIERATIC_EGYPTIAN },
-    { "egyp"_s, USCRIPT_EGYPTIAN_HIEROGLYPHS },
-    { "geok"_s, USCRIPT_KHUTSURI },
-    { "hans"_s, USCRIPT_SIMPLIFIED_HAN },
-    { "hant"_s, USCRIPT_TRADITIONAL_HAN },
-    { "hmng"_s, USCRIPT_PAHAWH_HMONG },
-    { "hung"_s, USCRIPT_OLD_HUNGARIAN },
-    { "inds"_s, USCRIPT_HARAPPAN_INDUS },
-    { "java"_s, USCRIPT_JAVANESE },
-    { "kali"_s, USCRIPT_KAYAH_LI },
-    { "latf"_s, USCRIPT_LATIN_FRAKTUR },
-    { "latg"_s, USCRIPT_LATIN_GAELIC },
-    { "lepc"_s, USCRIPT_LEPCHA },
-    { "lina"_s, USCRIPT_LINEAR_A },
-    { "mand"_s, USCRIPT_MANDAEAN },
-    { "maya"_s, USCRIPT_MAYAN_HIEROGLYPHS },
-    { "mero"_s, USCRIPT_MEROITIC },
-    { "nkoo"_s, USCRIPT_NKO },
-    { "orkh"_s, USCRIPT_ORKHON },
-    { "perm"_s, USCRIPT_OLD_PERMIC },
-    { "phag"_s, USCRIPT_PHAGS_PA },
-    { "phnx"_s, USCRIPT_PHOENICIAN },
-    { "plrd"_s, USCRIPT_PHONETIC_POLLARD },
-    { "roro"_s, USCRIPT_RONGORONGO },
-    { "sara"_s, USCRIPT_SARATI },
-    { "syre"_s, USCRIPT_ESTRANGELO_SYRIAC },
-    { "syrj"_s, USCRIPT_WESTERN_SYRIAC },
-    { "syrn"_s, USCRIPT_EASTERN_SYRIAC },
-    { "teng"_s, USCRIPT_TENGWAR },
-    { "vaii"_s, USCRIPT_VAI },
-    { "visp"_s, USCRIPT_VISIBLE_SPEECH },
-    { "xsux"_s, USCRIPT_CUNEIFORM },
-    { "jpan"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
-    { "kore"_s, USCRIPT_HANGUL },
-    { "zxxx"_s, USCRIPT_UNWRITTEN_LANGUAGES },
-    { "zzzz"_s, USCRIPT_UNKNOWN }
+    template<unsigned characterCountPlusOne>
+    constexpr PackedASCIILowerCodes(const char (&string)[characterCountPlusOne])
+    {
+        constexpr unsigned length = characterCountPlusOne - 1;
+        ASSERT_UNDER_CONSTEXPR_CONTEXT(length <= sizeof(StorageInteger));
+        ASSERT_UNDER_CONSTEXPR_CONTEXT(!string[length]);
+        StorageInteger result = 0;
+        for (unsigned index = 0; index < length; ++index) {
+            uint8_t code = static_cast<uint8_t>(string[index]);
+            result |= static_cast<StorageInteger>(code) << ((sizeof(StorageInteger) - index - 1) * 8);
+        }
+        m_value = result;
+    }
+
+    static Optional<PackedASCIILowerCodes> parse(StringView string)
+    {
+        if (string.length() > sizeof(StorageInteger))
+            return WTF::nullopt;
+        StorageInteger result = 0;
+        for (unsigned index = 0; index < string.length(); ++index) {
+            UChar code = string[index];
+            if (!isASCII(code))
+                return WTF::nullopt;
+            result |= static_cast<StorageInteger>(toASCIILower(code)) << ((sizeof(StorageInteger) - index - 1) * 8);
+        }
+        return PackedASCIILowerCodes(result);
+    }
+
+    friend constexpr bool operator==(PackedASCIILowerCodes lhs, PackedASCIILowerCodes rhs)
+    {
+        return lhs.m_value == rhs.m_value;
+    }
+
+    friend constexpr bool operator!=(PackedASCIILowerCodes lhs, PackedASCIILowerCodes rhs)
+    {
+        return lhs.m_value != rhs.m_value;
+    }
+
+    friend constexpr bool operator<(PackedASCIILowerCodes lhs, PackedASCIILowerCodes rhs)
+    {
+        return lhs.m_value < rhs.m_value;
+    }
+
+    friend constexpr bool operator<=(PackedASCIILowerCodes lhs, PackedASCIILowerCodes rhs)
+    {
+        return lhs.m_value <= rhs.m_value;
+    }
+
+    friend constexpr bool operator>(PackedASCIILowerCodes lhs, PackedASCIILowerCodes rhs)
+    {
+        return lhs.m_value > rhs.m_value;
+    }
+
+    friend constexpr bool operator>=(PackedASCIILowerCodes lhs, PackedASCIILowerCodes rhs)
+    {
+        return lhs.m_value >= rhs.m_value;
+    }
+
+    constexpr StorageInteger value() const { return m_value; }
+
+private:
+    explicit constexpr PackedASCIILowerCodes(StorageInteger value)
+        : m_value(value)
+    {
+    }
+
+    StorageInteger m_value { 0 };
 };
 
-struct ScriptNameCodeMapHashTraits : public HashTraits<String> {
-    static const int minimumTableSize = WTF::HashTableCapacityForSize<WTF_ARRAY_LENGTH(scriptNameCodeList)>::value;
+using ScriptName = PackedASCIILowerCodes<uint32_t>;
+struct ScriptNameCode {
+    ScriptName name;
+    UScriptCode code;
 };
 
 UScriptCode scriptNameToCode(const String& scriptName)
 {
-    static const auto scriptNameCodeMap = makeNeverDestroyed([] {
-        HashMap<String, UScriptCode, ASCIICaseInsensitiveHash, ScriptNameCodeMapHashTraits> map;
-        for (auto& nameAndCode : scriptNameCodeList)
-            map.add(nameAndCode.name, nameAndCode.code);
-        return map;
-    }());
+    // This generally maps an ISO 15924 script code to its UScriptCode, but certain families of script codes are
+    // treated as a single script for assigning a per-script font in Settings. For example, "hira" is mapped to
+    // USCRIPT_KATAKANA_OR_HIRAGANA instead of USCRIPT_HIRAGANA, since we want all Japanese scripts to be rendered
+    // using the same font setting.
+    static constexpr ScriptNameCode scriptNameCodeList[] = {
+        { "arab", USCRIPT_ARABIC },
+        { "armn", USCRIPT_ARMENIAN },
+        { "bali", USCRIPT_BALINESE },
+        { "batk", USCRIPT_BATAK },
+        { "beng", USCRIPT_BENGALI },
+        { "blis", USCRIPT_BLISSYMBOLS },
+        { "bopo", USCRIPT_BOPOMOFO },
+        { "brah", USCRIPT_BRAHMI },
+        { "brai", USCRIPT_BRAILLE },
+        { "bugi", USCRIPT_BUGINESE },
+        { "buhd", USCRIPT_BUHID },
+        { "cans", USCRIPT_CANADIAN_ABORIGINAL },
+        { "cham", USCRIPT_CHAM },
+        { "cher", USCRIPT_CHEROKEE },
+        { "cirt", USCRIPT_CIRTH },
+        { "copt", USCRIPT_COPTIC },
+        { "cprt", USCRIPT_CYPRIOT },
+        { "cyrl", USCRIPT_CYRILLIC },
+        { "cyrs", USCRIPT_OLD_CHURCH_SLAVONIC_CYRILLIC },
+        { "deva", USCRIPT_DEVANAGARI },
+        { "dsrt", USCRIPT_DESERET },
+        { "egyd", USCRIPT_DEMOTIC_EGYPTIAN },
+        { "egyh", USCRIPT_HIERATIC_EGYPTIAN },
+        { "egyp", USCRIPT_EGYPTIAN_HIEROGLYPHS },
+        { "ethi", USCRIPT_ETHIOPIC },
+        { "geok", USCRIPT_KHUTSURI },
+        { "geor", USCRIPT_GEORGIAN },
+        { "glag", USCRIPT_GLAGOLITIC },
+        { "goth", USCRIPT_GOTHIC },
+        { "grek", USCRIPT_GREEK },
+        { "gujr", USCRIPT_GUJARATI },
+        { "guru", USCRIPT_GURMUKHI },
+        { "hang", USCRIPT_HANGUL },
+        { "hani", USCRIPT_HAN },
+        { "hano", USCRIPT_HANUNOO },
+        { "hans", USCRIPT_SIMPLIFIED_HAN },
+        { "hant", USCRIPT_TRADITIONAL_HAN },
+        { "hebr", USCRIPT_HEBREW },
+        { "hira", USCRIPT_KATAKANA_OR_HIRAGANA },
+        { "hmng", USCRIPT_PAHAWH_HMONG },
+        { "hrkt", USCRIPT_KATAKANA_OR_HIRAGANA },
+        { "hung", USCRIPT_OLD_HUNGARIAN },
+        { "inds", USCRIPT_HARAPPAN_INDUS },
+        { "ital", USCRIPT_OLD_ITALIC },
+        { "java", USCRIPT_JAVANESE },
+        { "jpan", USCRIPT_KATAKANA_OR_HIRAGANA },
+        { "kali", USCRIPT_KAYAH_LI },
+        { "kana", USCRIPT_KATAKANA_OR_HIRAGANA },
+        { "khar", USCRIPT_KHAROSHTHI },
+        { "khmr", USCRIPT_KHMER },
+        { "knda", USCRIPT_KANNADA },
+        { "kore", USCRIPT_HANGUL },
+        { "laoo", USCRIPT_LAO },
+        { "latf", USCRIPT_LATIN_FRAKTUR },
+        { "latg", USCRIPT_LATIN_GAELIC },
+        { "latn", USCRIPT_LATIN },
+        { "lepc", USCRIPT_LEPCHA },
+        { "limb", USCRIPT_LIMBU },
+        { "lina", USCRIPT_LINEAR_A },
+        { "linb", USCRIPT_LINEAR_B },
+        { "mand", USCRIPT_MANDAEAN },
+        { "maya", USCRIPT_MAYAN_HIEROGLYPHS },
+        { "mero", USCRIPT_MEROITIC },
+        { "mlym", USCRIPT_MALAYALAM },
+        { "mong", USCRIPT_MONGOLIAN },
+        { "mymr", USCRIPT_MYANMAR },
+        { "nkoo", USCRIPT_NKO },
+        { "ogam", USCRIPT_OGHAM },
+        { "orkh", USCRIPT_ORKHON },
+        { "orya", USCRIPT_ORIYA },
+        { "osma", USCRIPT_OSMANYA },
+        { "perm", USCRIPT_OLD_PERMIC },
+        { "phag", USCRIPT_PHAGS_PA },
+        { "phnx", USCRIPT_PHOENICIAN },
+        { "plrd", USCRIPT_PHONETIC_POLLARD },
+        { "qaai", USCRIPT_INHERITED },
+        { "roro", USCRIPT_RONGORONGO },
+        { "runr", USCRIPT_RUNIC },
+        { "sara", USCRIPT_SARATI },
+        { "shaw", USCRIPT_SHAVIAN },
+        { "sinh", USCRIPT_SINHALA },
+        { "sylo", USCRIPT_SYLOTI_NAGRI },
+        { "syrc", USCRIPT_SYRIAC },
+        { "syre", USCRIPT_ESTRANGELO_SYRIAC },
+        { "syrj", USCRIPT_WESTERN_SYRIAC },
+        { "syrn", USCRIPT_EASTERN_SYRIAC },
+        { "tagb", USCRIPT_TAGBANWA },
+        { "tale", USCRIPT_TAI_LE },
+        { "talu", USCRIPT_NEW_TAI_LUE },
+        { "taml", USCRIPT_TAMIL },
+        { "telu", USCRIPT_TELUGU },
+        { "teng", USCRIPT_TENGWAR },
+        { "tfng", USCRIPT_TIFINAGH },
+        { "tglg", USCRIPT_TAGALOG },
+        { "thaa", USCRIPT_THAANA },
+        { "thai", USCRIPT_THAI },
+        { "tibt", USCRIPT_TIBETAN },
+        { "ugar", USCRIPT_UGARITIC },
+        { "vaii", USCRIPT_VAI },
+        { "visp", USCRIPT_VISIBLE_SPEECH },
+        { "xpeo", USCRIPT_OLD_PERSIAN },
+        { "xsux", USCRIPT_CUNEIFORM },
+        { "yiii", USCRIPT_YI },
+        { "zxxx", USCRIPT_UNWRITTEN_LANGUAGES },
+        { "zyyy", USCRIPT_COMMON },
+        { "zzzz", USCRIPT_UNKNOWN },
+    };
 
-    auto it = scriptNameCodeMap.get().find(scriptName);
-    if (it != scriptNameCodeMap.get().end())
-        return it->value;
+    static_assert(ScriptName("arab").value() == 0x61726162U);
+    static_assert(ScriptName("zzzz").value() == 0x7a7a7a7aU);
+
+    ASSERT(
+        std::is_sorted(std::begin(scriptNameCodeList), std::end(scriptNameCodeList),
+            [](const ScriptNameCode& a, const ScriptNameCode& b) {
+                return a.name < b.name;
+            }));
+
+    auto name = ScriptName::parse(scriptName);
+    if (!name)
+        return USCRIPT_INVALID_CODE;
+
+    auto* element = tryBinarySearch<ScriptNameCode>(scriptNameCodeList, std::size(scriptNameCodeList), name.value(),
+        [](const ScriptNameCode* scriptNameCode) {
+            return scriptNameCode->name;
+        });
+    if (element)
+        return element->code;
     return USCRIPT_INVALID_CODE;
 }
 
+using LocaleName = PackedASCIILowerCodes<uint64_t>;
 struct LocaleScript {
-    ASCIILiteral locale;
+    LocaleName locale;
     UScriptCode script;
 };
 
-static const LocaleScript localeScriptList[] = {
-    { "aa"_s, USCRIPT_LATIN },
-    { "ab"_s, USCRIPT_CYRILLIC },
-    { "ady"_s, USCRIPT_CYRILLIC },
-    { "af"_s, USCRIPT_LATIN },
-    { "ak"_s, USCRIPT_LATIN },
-    { "am"_s, USCRIPT_ETHIOPIC },
-    { "ar"_s, USCRIPT_ARABIC },
-    { "as"_s, USCRIPT_BENGALI },
-    { "ast"_s, USCRIPT_LATIN },
-    { "av"_s, USCRIPT_CYRILLIC },
-    { "ay"_s, USCRIPT_LATIN },
-    { "az"_s, USCRIPT_LATIN },
-    { "ba"_s, USCRIPT_CYRILLIC },
-    { "be"_s, USCRIPT_CYRILLIC },
-    { "bg"_s, USCRIPT_CYRILLIC },
-    { "bi"_s, USCRIPT_LATIN },
-    { "bn"_s, USCRIPT_BENGALI },
-    { "bo"_s, USCRIPT_TIBETAN },
-    { "bs"_s, USCRIPT_LATIN },
-    { "ca"_s, USCRIPT_LATIN },
-    { "ce"_s, USCRIPT_CYRILLIC },
-    { "ceb"_s, USCRIPT_LATIN },
-    { "ch"_s, USCRIPT_LATIN },
-    { "chk"_s, USCRIPT_LATIN },
-    { "cs"_s, USCRIPT_LATIN },
-    { "cy"_s, USCRIPT_LATIN },
-    { "da"_s, USCRIPT_LATIN },
-    { "de"_s, USCRIPT_LATIN },
-    { "dv"_s, USCRIPT_THAANA },
-    { "dz"_s, USCRIPT_TIBETAN },
-    { "ee"_s, USCRIPT_LATIN },
-    { "efi"_s, USCRIPT_LATIN },
-    { "el"_s, USCRIPT_GREEK },
-    { "en"_s, USCRIPT_LATIN },
-    { "es"_s, USCRIPT_LATIN },
-    { "et"_s, USCRIPT_LATIN },
-    { "eu"_s, USCRIPT_LATIN },
-    { "fa"_s, USCRIPT_ARABIC },
-    { "fi"_s, USCRIPT_LATIN },
-    { "fil"_s, USCRIPT_LATIN },
-    { "fj"_s, USCRIPT_LATIN },
-    { "fo"_s, USCRIPT_LATIN },
-    { "fr"_s, USCRIPT_LATIN },
-    { "fur"_s, USCRIPT_LATIN },
-    { "fy"_s, USCRIPT_LATIN },
-    { "ga"_s, USCRIPT_LATIN },
-    { "gaa"_s, USCRIPT_LATIN },
-    { "gd"_s, USCRIPT_LATIN },
-    { "gil"_s, USCRIPT_LATIN },
-    { "gl"_s, USCRIPT_LATIN },
-    { "gn"_s, USCRIPT_LATIN },
-    { "gsw"_s, USCRIPT_LATIN },
-    { "gu"_s, USCRIPT_GUJARATI },
-    { "ha"_s, USCRIPT_LATIN },
-    { "haw"_s, USCRIPT_LATIN },
-    { "he"_s, USCRIPT_HEBREW },
-    { "hi"_s, USCRIPT_DEVANAGARI },
-    { "hil"_s, USCRIPT_LATIN },
-    { "ho"_s, USCRIPT_LATIN },
-    { "hr"_s, USCRIPT_LATIN },
-    { "ht"_s, USCRIPT_LATIN },
-    { "hu"_s, USCRIPT_LATIN },
-    { "hy"_s, USCRIPT_ARMENIAN },
-    { "id"_s, USCRIPT_LATIN },
-    { "ig"_s, USCRIPT_LATIN },
-    { "ii"_s, USCRIPT_YI },
-    { "ilo"_s, USCRIPT_LATIN },
-    { "inh"_s, USCRIPT_CYRILLIC },
-    { "is"_s, USCRIPT_LATIN },
-    { "it"_s, USCRIPT_LATIN },
-    { "iu"_s, USCRIPT_CANADIAN_ABORIGINAL },
-    { "ja"_s, USCRIPT_KATAKANA_OR_HIRAGANA },
-    { "jv"_s, USCRIPT_LATIN },
-    { "ka"_s, USCRIPT_GEORGIAN },
-    { "kaj"_s, USCRIPT_LATIN },
-    { "kam"_s, USCRIPT_LATIN },
-    { "kbd"_s, USCRIPT_CYRILLIC },
-    { "kha"_s, USCRIPT_LATIN },
-    { "kk"_s, USCRIPT_CYRILLIC },
-    { "kl"_s, USCRIPT_LATIN },
-    { "km"_s, USCRIPT_KHMER },
-    { "kn"_s, USCRIPT_KANNADA },
-    { "ko"_s, USCRIPT_HANGUL },
-    { "kok"_s, USCRIPT_DEVANAGARI },
-    { "kos"_s, USCRIPT_LATIN },
-    { "kpe"_s, USCRIPT_LATIN },
-    { "krc"_s, USCRIPT_CYRILLIC },
-    { "ks"_s, USCRIPT_ARABIC },
-    { "ku"_s, USCRIPT_ARABIC },
-    { "kum"_s, USCRIPT_CYRILLIC },
-    { "ky"_s, USCRIPT_CYRILLIC },
-    { "la"_s, USCRIPT_LATIN },
-    { "lah"_s, USCRIPT_ARABIC },
-    { "lb"_s, USCRIPT_LATIN },
-    { "lez"_s, USCRIPT_CYRILLIC },
-    { "ln"_s, USCRIPT_LATIN },
-    { "lo"_s, USCRIPT_LAO },
-    { "lt"_s, USCRIPT_LATIN },
-    { "lv"_s, USCRIPT_LATIN },
-    { "mai"_s, USCRIPT_DEVANAGARI },
-    { "mdf"_s, USCRIPT_CYRILLIC },
-    { "mg"_s, USCRIPT_LATIN },
-    { "mh"_s, USCRIPT_LATIN },
-    { "mi"_s, USCRIPT_LATIN },
-    { "mk"_s, USCRIPT_CYRILLIC },
-    { "ml"_s, USCRIPT_MALAYALAM },
-    { "mn"_s, USCRIPT_CYRILLIC },
-    { "mr"_s, USCRIPT_DEVANAGARI },
-    { "ms"_s, USCRIPT_LATIN },
-    { "mt"_s, USCRIPT_LATIN },
-    { "my"_s, USCRIPT_MYANMAR },
-    { "myv"_s, USCRIPT_CYRILLIC },
-    { "na"_s, USCRIPT_LATIN },
-    { "nb"_s, USCRIPT_LATIN },
-    { "ne"_s, USCRIPT_DEVANAGARI },
-    { "niu"_s, USCRIPT_LATIN },
-    { "nl"_s, USCRIPT_LATIN },
-    { "nn"_s, USCRIPT_LATIN },
-    { "nr"_s, USCRIPT_LATIN },
-    { "nso"_s, USCRIPT_LATIN },
-    { "ny"_s, USCRIPT_LATIN },
-    { "oc"_s, USCRIPT_LATIN },
-    { "om"_s, USCRIPT_LATIN },
-    { "or"_s, USCRIPT_ORIYA },
-    { "os"_s, USCRIPT_CYRILLIC },
-    { "pa"_s, USCRIPT_GURMUKHI },
-    { "pag"_s, USCRIPT_LATIN },
-    { "pap"_s, USCRIPT_LATIN },
-    { "pau"_s, USCRIPT_LATIN },
-    { "pl"_s, USCRIPT_LATIN },
-    { "pon"_s, USCRIPT_LATIN },
-    { "ps"_s, USCRIPT_ARABIC },
-    { "pt"_s, USCRIPT_LATIN },
-    { "qu"_s, USCRIPT_LATIN },
-    { "rm"_s, USCRIPT_LATIN },
-    { "rn"_s, USCRIPT_LATIN },
-    { "ro"_s, USCRIPT_LATIN },
-    { "ru"_s, USCRIPT_CYRILLIC },
-    { "rw"_s, USCRIPT_LATIN },
-    { "sa"_s, USCRIPT_DEVANAGARI },
-    { "sah"_s, USCRIPT_CYRILLIC },
-    { "sat"_s, USCRIPT_LATIN },
-    { "sd"_s, USCRIPT_ARABIC },
-    { "se"_s, USCRIPT_LATIN },
-    { "sg"_s, USCRIPT_LATIN },
-    { "si"_s, USCRIPT_SINHALA },
-    { "sid"_s, USCRIPT_LATIN },
-    { "sk"_s, USCRIPT_LATIN },
-    { "sl"_s, USCRIPT_LATIN },
-    { "sm"_s, USCRIPT_LATIN },
-    { "so"_s, USCRIPT_LATIN },
-    { "sq"_s, USCRIPT_LATIN },
-    { "sr"_s, USCRIPT_CYRILLIC },
-    { "ss"_s, USCRIPT_LATIN },
-    { "st"_s, USCRIPT_LATIN },
-    { "su"_s, USCRIPT_LATIN },
-    { "sv"_s, USCRIPT_LATIN },
-    { "sw"_s, USCRIPT_LATIN },
-    { "ta"_s, USCRIPT_TAMIL },
-    { "te"_s, USCRIPT_TELUGU },
-    { "tet"_s, USCRIPT_LATIN },
-    { "tg"_s, USCRIPT_CYRILLIC },
-    { "th"_s, USCRIPT_THAI },
-    { "ti"_s, USCRIPT_ETHIOPIC },
-    { "tig"_s, USCRIPT_ETHIOPIC },
-    { "tk"_s, USCRIPT_LATIN },
-    { "tkl"_s, USCRIPT_LATIN },
-    { "tl"_s, USCRIPT_LATIN },
-    { "tn"_s, USCRIPT_LATIN },
-    { "to"_s, USCRIPT_LATIN },
-    { "tpi"_s, USCRIPT_LATIN },
-    { "tr"_s, USCRIPT_LATIN },
-    { "trv"_s, USCRIPT_LATIN },
-    { "ts"_s, USCRIPT_LATIN },
-    { "tt"_s, USCRIPT_CYRILLIC },
-    { "tvl"_s, USCRIPT_LATIN },
-    { "tw"_s, USCRIPT_LATIN },
-    { "ty"_s, USCRIPT_LATIN },
-    { "tyv"_s, USCRIPT_CYRILLIC },
-    { "udm"_s, USCRIPT_CYRILLIC },
-    { "ug"_s, USCRIPT_ARABIC },
-    { "uk"_s, USCRIPT_CYRILLIC },
-    { "und"_s, USCRIPT_LATIN },
-    { "ur"_s, USCRIPT_ARABIC },
-    { "uz"_s, USCRIPT_CYRILLIC },
-    { "ve"_s, USCRIPT_LATIN },
-    { "vi"_s, USCRIPT_LATIN },
-    { "wal"_s, USCRIPT_ETHIOPIC },
-    { "war"_s, USCRIPT_LATIN },
-    { "wo"_s, USCRIPT_LATIN },
-    { "xh"_s, USCRIPT_LATIN },
-    { "yap"_s, USCRIPT_LATIN },
-    { "yo"_s, USCRIPT_LATIN },
-    { "za"_s, USCRIPT_LATIN },
-    { "zh"_s, USCRIPT_HAN },
-    { "zh_hk"_s, USCRIPT_TRADITIONAL_HAN },
-    { "zh_tw"_s, USCRIPT_TRADITIONAL_HAN },
-    { "zu"_s, USCRIPT_LATIN }
-};
-
-struct LocaleScriptMapHashTraits : public HashTraits<String> {
-    static const int minimumTableSize = WTF::HashTableCapacityForSize<WTF_ARRAY_LENGTH(localeScriptList)>::value;
-};
-
 UScriptCode localeToScriptCodeForFontSelection(const String& locale)
 {
-    static const auto localeScriptMap = makeNeverDestroyed([] {
-        HashMap<String, UScriptCode, ASCIICaseInsensitiveHash, LocaleScriptMapHashTraits> map;
-        for (auto& localeAndScript : localeScriptList)
-            map.add(localeAndScript.locale, localeAndScript.script);
-        return map;
-    }());
+    static constexpr LocaleScript localeScriptList[] = {
+        { "aa", USCRIPT_LATIN },
+        { "ab", USCRIPT_CYRILLIC },
+        { "ady", USCRIPT_CYRILLIC },
+        { "af", USCRIPT_LATIN },
+        { "ak", USCRIPT_LATIN },
+        { "am", USCRIPT_ETHIOPIC },
+        { "ar", USCRIPT_ARABIC },
+        { "as", USCRIPT_BENGALI },
+        { "ast", USCRIPT_LATIN },
+        { "av", USCRIPT_CYRILLIC },
+        { "ay", USCRIPT_LATIN },
+        { "az", USCRIPT_LATIN },
+        { "ba", USCRIPT_CYRILLIC },
+        { "be", USCRIPT_CYRILLIC },
+        { "bg", USCRIPT_CYRILLIC },
+        { "bi", USCRIPT_LATIN },
+        { "bn", USCRIPT_BENGALI },
+        { "bo", USCRIPT_TIBETAN },
+        { "bs", USCRIPT_LATIN },
+        { "ca", USCRIPT_LATIN },
+        { "ce", USCRIPT_CYRILLIC },
+        { "ceb", USCRIPT_LATIN },
+        { "ch", USCRIPT_LATIN },
+        { "chk", USCRIPT_LATIN },
+        { "cs", USCRIPT_LATIN },
+        { "cy", USCRIPT_LATIN },
+        { "da", USCRIPT_LATIN },
+        { "de", USCRIPT_LATIN },
+        { "dv", USCRIPT_THAANA },
+        { "dz", USCRIPT_TIBETAN },
+        { "ee", USCRIPT_LATIN },
+        { "efi", USCRIPT_LATIN },
+        { "el", USCRIPT_GREEK },
+        { "en", USCRIPT_LATIN },
+        { "es", USCRIPT_LATIN },
+        { "et", USCRIPT_LATIN },
+        { "eu", USCRIPT_LATIN },
+        { "fa", USCRIPT_ARABIC },
+        { "fi", USCRIPT_LATIN },
+        { "fil", USCRIPT_LATIN },
+        { "fj", USCRIPT_LATIN },
+        { "fo", USCRIPT_LATIN },
+        { "fr", USCRIPT_LATIN },
+        { "fur", USCRIPT_LATIN },
+        { "fy", USCRIPT_LATIN },
+        { "ga", USCRIPT_LATIN },
+        { "gaa", USCRIPT_LATIN },
+        { "gd", USCRIPT_LATIN },
+        { "gil", USCRIPT_LATIN },
+        { "gl", USCRIPT_LATIN },
+        { "gn", USCRIPT_LATIN },
+        { "gsw", USCRIPT_LATIN },
+        { "gu", USCRIPT_GUJARATI },
+        { "ha", USCRIPT_LATIN },
+        { "haw", USCRIPT_LATIN },
+        { "he", USCRIPT_HEBREW },
+        { "hi", USCRIPT_DEVANAGARI },
+        { "hil", USCRIPT_LATIN },
+        { "ho", USCRIPT_LATIN },
+        { "hr", USCRIPT_LATIN },
+        { "ht", USCRIPT_LATIN },
+        { "hu", USCRIPT_LATIN },
+        { "hy", USCRIPT_ARMENIAN },
+        { "id", USCRIPT_LATIN },
+        { "ig", USCRIPT_LATIN },
+        { "ii", USCRIPT_YI },
+        { "ilo", USCRIPT_LATIN },
+        { "inh", USCRIPT_CYRILLIC },
+        { "is", USCRIPT_LATIN },
+        { "it", USCRIPT_LATIN },
+        { "iu", USCRIPT_CANADIAN_ABORIGINAL },
+        { "ja", USCRIPT_KATAKANA_OR_HIRAGANA },
+        { "jv", USCRIPT_LATIN },
+        { "ka", USCRIPT_GEORGIAN },
+        { "kaj", USCRIPT_LATIN },
+        { "kam", USCRIPT_LATIN },
+        { "kbd", USCRIPT_CYRILLIC },
+        { "kha", USCRIPT_LATIN },
+        { "kk", USCRIPT_CYRILLIC },
+        { "kl", USCRIPT_LATIN },
+        { "km", USCRIPT_KHMER },
+        { "kn", USCRIPT_KANNADA },
+        { "ko", USCRIPT_HANGUL },
+        { "kok", USCRIPT_DEVANAGARI },
+        { "kos", USCRIPT_LATIN },
+        { "kpe", USCRIPT_LATIN },
+        { "krc", USCRIPT_CYRILLIC },
+        { "ks", USCRIPT_ARABIC },
+        { "ku", USCRIPT_ARABIC },
+        { "kum", USCRIPT_CYRILLIC },
+        { "ky", USCRIPT_CYRILLIC },
+        { "la", USCRIPT_LATIN },
+        { "lah", USCRIPT_ARABIC },
+        { "lb", USCRIPT_LATIN },
+        { "lez", USCRIPT_CYRILLIC },
+        { "ln", USCRIPT_LATIN },
+        { "lo", USCRIPT_LAO },
+        { "lt", USCRIPT_LATIN },
+        { "lv", USCRIPT_LATIN },
+        { "mai", USCRIPT_DEVANAGARI },
+        { "mdf", USCRIPT_CYRILLIC },
+        { "mg", USCRIPT_LATIN },
+        { "mh", USCRIPT_LATIN },
+        { "mi", USCRIPT_LATIN },
+        { "mk", USCRIPT_CYRILLIC },
+        { "ml", USCRIPT_MALAYALAM },
+        { "mn", USCRIPT_CYRILLIC },
+        { "mr", USCRIPT_DEVANAGARI },
+        { "ms", USCRIPT_LATIN },
+        { "mt", USCRIPT_LATIN },
+        { "my", USCRIPT_MYANMAR },
+        { "myv", USCRIPT_CYRILLIC },
+        { "na", USCRIPT_LATIN },
+        { "nb", USCRIPT_LATIN },
+        { "ne", USCRIPT_DEVANAGARI },
+        { "niu", USCRIPT_LATIN },
+        { "nl", USCRIPT_LATIN },
+        { "nn", USCRIPT_LATIN },
+        { "nr", USCRIPT_LATIN },
+        { "nso", USCRIPT_LATIN },
+        { "ny", USCRIPT_LATIN },
+        { "oc", USCRIPT_LATIN },
+        { "om", USCRIPT_LATIN },
+        { "or", USCRIPT_ORIYA },
+        { "os", USCRIPT_CYRILLIC },
+        { "pa", USCRIPT_GURMUKHI },
+        { "pag", USCRIPT_LATIN },
+        { "pap", USCRIPT_LATIN },
+        { "pau", USCRIPT_LATIN },
+        { "pl", USCRIPT_LATIN },
+        { "pon", USCRIPT_LATIN },
+        { "ps", USCRIPT_ARABIC },
+        { "pt", USCRIPT_LATIN },
+        { "qu", USCRIPT_LATIN },
+        { "rm", USCRIPT_LATIN },
+        { "rn", USCRIPT_LATIN },
+        { "ro", USCRIPT_LATIN },
+        { "ru", USCRIPT_CYRILLIC },
+        { "rw", USCRIPT_LATIN },
+        { "sa", USCRIPT_DEVANAGARI },
+        { "sah", USCRIPT_CYRILLIC },
+        { "sat", USCRIPT_LATIN },
+        { "sd", USCRIPT_ARABIC },
+        { "se", USCRIPT_LATIN },
+        { "sg", USCRIPT_LATIN },
+        { "si", USCRIPT_SINHALA },
+        { "sid", USCRIPT_LATIN },
+        { "sk", USCRIPT_LATIN },
+        { "sl", USCRIPT_LATIN },
+        { "sm", USCRIPT_LATIN },
+        { "so", USCRIPT_LATIN },
+        { "sq", USCRIPT_LATIN },
+        { "sr", USCRIPT_CYRILLIC },
+        { "ss", USCRIPT_LATIN },
+        { "st", USCRIPT_LATIN },
+        { "su", USCRIPT_LATIN },
+        { "sv", USCRIPT_LATIN },
+        { "sw", USCRIPT_LATIN },
+        { "ta", USCRIPT_TAMIL },
+        { "te", USCRIPT_TELUGU },
+        { "tet", USCRIPT_LATIN },
+        { "tg", USCRIPT_CYRILLIC },
+        { "th", USCRIPT_THAI },
+        { "ti", USCRIPT_ETHIOPIC },
+        { "tig", USCRIPT_ETHIOPIC },
+        { "tk", USCRIPT_LATIN },
+        { "tkl", USCRIPT_LATIN },
+        { "tl", USCRIPT_LATIN },
+        { "tn", USCRIPT_LATIN },
+        { "to", USCRIPT_LATIN },
+        { "tpi", USCRIPT_LATIN },
+        { "tr", USCRIPT_LATIN },
+        { "trv", USCRIPT_LATIN },
+        { "ts", USCRIPT_LATIN },
+        { "tt", USCRIPT_CYRILLIC },
+        { "tvl", USCRIPT_LATIN },
+        { "tw", USCRIPT_LATIN },
+        { "ty", USCRIPT_LATIN },
+        { "tyv", USCRIPT_CYRILLIC },
+        { "udm", USCRIPT_CYRILLIC },
+        { "ug", USCRIPT_ARABIC },
+        { "uk", USCRIPT_CYRILLIC },
+        { "und", USCRIPT_LATIN },
+        { "ur", USCRIPT_ARABIC },
+        { "uz", USCRIPT_CYRILLIC },
+        { "ve", USCRIPT_LATIN },
+        { "vi", USCRIPT_LATIN },
+        { "wal", USCRIPT_ETHIOPIC },
+        { "war", USCRIPT_LATIN },
+        { "wo", USCRIPT_LATIN },
+        { "xh", USCRIPT_LATIN },
+        { "yap", USCRIPT_LATIN },
+        { "yo", USCRIPT_LATIN },
+        { "za", USCRIPT_LATIN },
+        { "zh", USCRIPT_HAN },
+        { "zh_hk", USCRIPT_TRADITIONAL_HAN },
+        { "zh_tw", USCRIPT_TRADITIONAL_HAN },
+        { "zu", USCRIPT_LATIN },
+    };
 
+    static_assert(LocaleName("aa").value() == 0x6161000000000000ULL);
+    static_assert(LocaleName("zh_tw").value() == 0x7a685f7477000000ULL);
+
+    ASSERT(
+        std::is_sorted(std::begin(localeScriptList), std::end(localeScriptList),
+            [](const LocaleScript& a, const LocaleScript& b) {
+                return a.locale < b.locale;
+            }));
+
+    auto tryFindScriptCode = [&] (const String& string) -> Optional<UScriptCode> {
+        auto localeName = LocaleName::parse(string);
+        if (!localeName)
+            return WTF::nullopt;
+
+        auto* element = tryBinarySearch<LocaleScript>(localeScriptList, std::size(localeScriptList), localeName.value(),
+            [](const LocaleScript* localeScript) {
+                return localeScript->locale;
+            });
+        if (element)
+            return element->script;
+        return WTF::nullopt;
+    };
+
     String canonicalLocale = locale;
     canonicalLocale.replace('-', '_');
     while (!canonicalLocale.isEmpty()) {
-        auto it = localeScriptMap.get().find(canonicalLocale);
-        if (it != localeScriptMap.get().end())
-            return it->value;
+        if (auto scriptCode = tryFindScriptCode(canonicalLocale))
+            return scriptCode.value();
         auto underscorePosition = canonicalLocale.reverseFind('_');
         if (underscorePosition == notFound)
             break;
_______________________________________________
webkit-changes mailing list
[email protected]
https://lists.webkit.org/mailman/listinfo/webkit-changes

Reply via email to