On Mon, Feb 9, 2026 at 9:43 PM Jonathan Wakely <[email protected]> wrote:
> Optimize std::regex_traits::lookup_collatename and
> std::regex_traits::lookup_classname.
>
> For lookup_collatename we can hoist the static array into a
> non-dependent function, then call that for the regex_traits<char>
> specialization without performing any narrowing operations via the ctype
> facet. For the regex_traits<wchar_t> specialization we populate a
> std::string for the narrowed result, and call the non-dependent function.
>
> For lookup_classname we can avoid the static array entirely, replacing
> the iteration over that array with a nested switch that implements a
> kind of manually-unrolled trie, only match one char at a time until we
> either match a full string or get a mismatch. This avoids narrowing the
> entire input and storing it in a temporary string. This improves
> performance by 2-3x for -O2 and below (the benefit is much smaller for
> -O3).
>
> We can also check the input length for random access iterators, and
> reject any strings longer than "xdigit" without doing any work at all.
> For silly cases like [:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx:]
> this gives a 100x improvement.
>
> libstdc++-v3/ChangeLog:
>
> * include/bits/regex.tcc (__detail::__lookup_collatename): New
> function.
> (regex_traits::lookup_collatename): Use new function. Elide
> redundant narrowing via ctype facet for regex_traits<char>.
> (regex_traits::lookup_classname): Replace lookup table with
> handwritten prefix match.
> ---
>
> This does not need to be backported to release branches.
>
Agreed.
>
> Tested x86_64-linux.
>
LGTM.
>
> libstdc++-v3/include/bits/regex.tcc | 428 ++++++++++++++++------------
> 1 file changed, 252 insertions(+), 176 deletions(-)
>
> diff --git a/libstdc++-v3/include/bits/regex.tcc
> b/libstdc++-v3/include/bits/regex.tcc
> index 71eadd20bfdb..63e4cf79910c 100644
> --- a/libstdc++-v3/include/bits/regex.tcc
> +++ b/libstdc++-v3/include/bits/regex.tcc
> @@ -113,163 +113,187 @@ namespace __detail
> }
> return __ret;
> }
> +
> + inline void
> + __lookup_collatename(string& __name) noexcept
> + {
> + static const char* const __collatenames[] =
> + {
> + "NUL",
> + "SOH",
> + "STX",
> + "ETX",
> + "EOT",
> + "ENQ",
> + "ACK",
> + "alert",
> + "backspace",
> + "tab",
> + "newline",
> + "vertical-tab",
> + "form-feed",
> + "carriage-return",
> + "SO",
> + "SI",
> + "DLE",
> + "DC1",
> + "DC2",
> + "DC3",
> + "DC4",
> + "NAK",
> + "SYN",
> + "ETB",
> + "CAN",
> + "EM",
> + "SUB",
> + "ESC",
> + "IS4",
> + "IS3",
> + "IS2",
> + "IS1",
> + "space",
> + "exclamation-mark",
> + "quotation-mark",
> + "number-sign",
> + "dollar-sign",
> + "percent-sign",
> + "ampersand",
> + "apostrophe",
> + "left-parenthesis",
> + "right-parenthesis",
> + "asterisk",
> + "plus-sign",
> + "comma",
> + "hyphen",
> + "period",
> + "slash",
> + "zero",
> + "one",
> + "two",
> + "three",
> + "four",
> + "five",
> + "six",
> + "seven",
> + "eight",
> + "nine",
> + "colon",
> + "semicolon",
> + "less-than-sign",
> + "equals-sign",
> + "greater-than-sign",
> + "question-mark",
> + "commercial-at",
> + "A",
> + "B",
> + "C",
> + "D",
> + "E",
> + "F",
> + "G",
> + "H",
> + "I",
> + "J",
> + "K",
> + "L",
> + "M",
> + "N",
> + "O",
> + "P",
> + "Q",
> + "R",
> + "S",
> + "T",
> + "U",
> + "V",
> + "W",
> + "X",
> + "Y",
> + "Z",
> + "left-square-bracket",
> + "backslash",
> + "right-square-bracket",
> + "circumflex",
> + "underscore",
> + "grave-accent",
> + "a",
> + "b",
> + "c",
> + "d",
> + "e",
> + "f",
> + "g",
> + "h",
> + "i",
> + "j",
> + "k",
> + "l",
> + "m",
> + "n",
> + "o",
> + "p",
> + "q",
> + "r",
> + "s",
> + "t",
> + "u",
> + "v",
> + "w",
> + "x",
> + "y",
> + "z",
> + "left-curly-bracket",
> + "vertical-line",
> + "right-curly-bracket",
> + "tilde",
> + "DEL",
> + };
> +
> + for (const auto& __it : __collatenames)
> + if (__name == __it)
> + {
> + __name.assign(1, static_cast<char>(&__it - __collatenames));
> + return;
> + }
> +
> + __name.clear();
> + }
> +
> /// @endcond
> } // namespace __detail
>
> +#pragma GCC diagnostic push
> +#pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr
> +
> template<typename _Ch_type>
> template<typename _Fwd_iter>
> typename regex_traits<_Ch_type>::string_type
> regex_traits<_Ch_type>::
> lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
> {
> - typedef std::ctype<char_type> __ctype_type;
> - const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
> -
> - static const char* const __collatenames[] =
> - {
> - "NUL",
> - "SOH",
> - "STX",
> - "ETX",
> - "EOT",
> - "ENQ",
> - "ACK",
> - "alert",
> - "backspace",
> - "tab",
> - "newline",
> - "vertical-tab",
> - "form-feed",
> - "carriage-return",
> - "SO",
> - "SI",
> - "DLE",
> - "DC1",
> - "DC2",
> - "DC3",
> - "DC4",
> - "NAK",
> - "SYN",
> - "ETB",
> - "CAN",
> - "EM",
> - "SUB",
> - "ESC",
> - "IS4",
> - "IS3",
> - "IS2",
> - "IS1",
> - "space",
> - "exclamation-mark",
> - "quotation-mark",
> - "number-sign",
> - "dollar-sign",
> - "percent-sign",
> - "ampersand",
> - "apostrophe",
> - "left-parenthesis",
> - "right-parenthesis",
> - "asterisk",
> - "plus-sign",
> - "comma",
> - "hyphen",
> - "period",
> - "slash",
> - "zero",
> - "one",
> - "two",
> - "three",
> - "four",
> - "five",
> - "six",
> - "seven",
> - "eight",
> - "nine",
> - "colon",
> - "semicolon",
> - "less-than-sign",
> - "equals-sign",
> - "greater-than-sign",
> - "question-mark",
> - "commercial-at",
> - "A",
> - "B",
> - "C",
> - "D",
> - "E",
> - "F",
> - "G",
> - "H",
> - "I",
> - "J",
> - "K",
> - "L",
> - "M",
> - "N",
> - "O",
> - "P",
> - "Q",
> - "R",
> - "S",
> - "T",
> - "U",
> - "V",
> - "W",
> - "X",
> - "Y",
> - "Z",
> - "left-square-bracket",
> - "backslash",
> - "right-square-bracket",
> - "circumflex",
> - "underscore",
> - "grave-accent",
> - "a",
> - "b",
> - "c",
> - "d",
> - "e",
> - "f",
> - "g",
> - "h",
> - "i",
> - "j",
> - "k",
> - "l",
> - "m",
> - "n",
> - "o",
> - "p",
> - "q",
> - "r",
> - "s",
> - "t",
> - "u",
> - "v",
> - "w",
> - "x",
> - "y",
> - "z",
> - "left-curly-bracket",
> - "vertical-line",
> - "right-curly-bracket",
> - "tilde",
> - "DEL",
> - };
> -
> - string __s;
> - for (; __first != __last; ++__first)
> - __s += __fctyp.narrow(*__first, 0);
> -
> - for (const auto& __it : __collatenames)
> - if (__s == __it)
> - return string_type(1, __fctyp.widen(
> - static_cast<char>(&__it - __collatenames)));
> -
> // TODO Add digraph support:
> // http://boost.sourceforge.net/libs/regex/doc/collating_names.html
>
> - return string_type();
> + if constexpr (is_same<char_type, char>::value)
> + {
> + string __s(__first, __last);
> + __detail::__lookup_collatename(__s);
> + return __s;
> + }
> + else
> + {
> + typedef std::ctype<char_type> __ctype_type;
> + const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
> +
> + string __s;
> + for (; __first != __last; ++__first)
> + __s += __fctyp.narrow(*__first, 0);
> + __detail::__lookup_collatename(__s);
> + if (__s.empty())
> + return string_type();
> + else
> + return string_type(1, __fctyp.widen(__s[0]));
> + }
> }
>
> template<typename _Ch_type>
> @@ -278,43 +302,97 @@ namespace __detail
> regex_traits<_Ch_type>::
> lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase)
> const
> {
> + if constexpr (__is_any_random_access_iter<_Fwd_iter>::value)
> + if ((__last - __first) > 6) [[__unlikely__]]
> + return {}; // "xdigit" is the longest classname
> +
> typedef std::ctype<char_type> __ctype_type;
> const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
>
> - // Mappings from class name to class mask.
> - static const pair<const char*, char_class_type> __classnames[] =
> - {
> - {"d", ctype_base::digit},
> - {"w", {ctype_base::alnum, _RegexMask::_S_under}},
> - {"s", ctype_base::space},
> - {"alnum", ctype_base::alnum},
> - {"alpha", ctype_base::alpha},
> - {"blank", ctype_base::blank},
> - {"cntrl", ctype_base::cntrl},
> - {"digit", ctype_base::digit},
> - {"graph", ctype_base::graph},
> - {"lower", ctype_base::lower},
> - {"print", ctype_base::print},
> - {"punct", ctype_base::punct},
> - {"space", ctype_base::space},
> - {"upper", ctype_base::upper},
> - {"xdigit", ctype_base::xdigit},
> + auto __read_ch = [&]() -> char {
> + if (__first == __last)
> + return '\0';
> + char __c = __fctyp.narrow(__fctyp.tolower(*__first), 0);
> + ++__first;
> + return __c;
> };
>
> - string __s;
> - for (; __first != __last; ++__first)
> - __s += __fctyp.narrow(__fctyp.tolower(*__first), 0);
> + auto __match = [&](const char* __s) -> bool {
> + do
> + if (__read_ch() != *__s)
> + return false;
> + while (*++__s);
> + return __first == __last;
> + };
>
> - for (const auto& __it : __classnames)
> - if (__s == __it.first)
> + switch(__read_ch())
>
That's a really cool idea, to switch on one character.
> + {
> + case 'a':
> + if (__read_ch() == 'l')
> + switch (__read_ch())
> + {
> + case 'n':
> + if (__match("um"))
>
I think adding comments like if (__match("um")) // alnum, would really
help here. I think they really helps for this two cases, but would
add them everywhere...
> + return ctype_base::alnum;
> + break;
> + case 'p':
> + if (__match("ha"))
+ return ctype_base::alpha;
> + break;
> + }
> + break;
> + case 'b':
> + if (__match("lank"))
> + return ctype_base::blank;
> + break;
> + case 'c':
> + if (__match("ntrl"))
> + return ctype_base::cntrl;
> + break;
> + case 'd':
> + if (__first == __last || __match("igit"))
>
And say // d or digit here.
> + return ctype_base::digit;
> + break;
> + case 'g':
> + if (__match("raph"))
> + return ctype_base::graph;
> + break;
> + case 'l':
> + if (__match("ower"))
> + return __icase ? ctype_base::alpha : ctype_base::lower;
> + break;
> + case 'p':
> + switch (__read_ch())
> {
> - if (__icase
> - && (__it.second._M_base == ctype_base::lower
> - || __it.second._M_base == ctype_base::upper))
> - return ctype_base::alpha;
> - return __it.second;
> + case 'r':
> + if (__match("int"))
> + return ctype_base::print;
> + break;
> + case 'u':
> + if (__match("nct"))
> + return ctype_base::punct;
> + break;
> }
> - return 0;
> + break;
> + case 's':
> + if (__first == __last || __match("pace"))
> + return ctype_base::space;
> + break;
> + case 'u':
> + if (__match("pper"))
> + return __icase ? ctype_base::alpha : ctype_base::upper;
> + break;
> + case 'w':
> + if (__first == __last)
> + return {ctype_base::alnum, char_class_type::_S_under};
> + break;
> + case 'x':
> + if (__match("digit"))
> + return ctype_base::xdigit;
> + break;
> + }
> +
> + return {};
> }
>
> template<typename _Ch_type>
> @@ -331,8 +409,6 @@ namespace __detail
> && __c == __fctyp.widen('_'));
> }
>
> -#pragma GCC diagnostic push
> -#pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr
> template<typename _Ch_type>
> int
> regex_traits<_Ch_type>::
> --
> 2.52.0
>
>