On Wed, Feb 11, 2026 at 1:21 AM Jonathan Wakely <[email protected]> wrote:
> Optimize regex_traits::lookup_collatename and > regex_traits::lookup_classname. > > For lookup_collatename we can hoist the static array into a > non-dependent function, then call that for the regex_traits<char> > specialization without performing any narrowing operations via the ctype > facet. For the regex_traits<wchar_t> specialization we populate a > std::string for the narrowed result, and call the non-dependent function. > > For lookup_classname we can avoid the static array entirely, replacing > the iteration over that array with a nested switch that implements a > kind of manually-unrolled trie, only match one char at a time until we > either match a full string or get a mismatch. This avoids narrowing the > entire input and storing it in a temporary string. This improves > performance by 2-3x for -O2 and below (the benefit is much smaller for > -O3). > > We can also check the input length for random access iterators, and > reject any strings longer than "xdigit" without doing any work at all. > For silly cases like [:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx:] > this gives a 100x improvement. > > libstdc++-v3/ChangeLog: > > * include/bits/regex.tcc (__detail::__lookup_collatename): New > function. > (regex_traits::lookup_collatename): Use new function. Elide > redundant narrowing via ctype facet for regex_traits<char>. > (regex_traits::lookup_classname): Replace lookup table with > handwritten prefix match. > > Reviewed-by: Tomasz Kamiński <[email protected]> > --- > > v2: Added comments to lookup_classname, e.g. "d" or "digit" > LGTM > > Tested x86_64-linux and aarch64-linux. > > libstdc++-v3/include/bits/regex.tcc | 428 ++++++++++++++++------------ > 1 file changed, 252 insertions(+), 176 deletions(-) > > diff --git a/libstdc++-v3/include/bits/regex.tcc > b/libstdc++-v3/include/bits/regex.tcc > index 7715d653df5d..2e7843537693 100644 > --- a/libstdc++-v3/include/bits/regex.tcc > +++ b/libstdc++-v3/include/bits/regex.tcc > @@ -106,163 +106,187 @@ namespace __detail > } > return __ret; > } > + > + inline void > + __lookup_collatename(string& __name) noexcept > + { > + static const char* const __collatenames[] = > + { > + "NUL", > + "SOH", > + "STX", > + "ETX", > + "EOT", > + "ENQ", > + "ACK", > + "alert", > + "backspace", > + "tab", > + "newline", > + "vertical-tab", > + "form-feed", > + "carriage-return", > + "SO", > + "SI", > + "DLE", > + "DC1", > + "DC2", > + "DC3", > + "DC4", > + "NAK", > + "SYN", > + "ETB", > + "CAN", > + "EM", > + "SUB", > + "ESC", > + "IS4", > + "IS3", > + "IS2", > + "IS1", > + "space", > + "exclamation-mark", > + "quotation-mark", > + "number-sign", > + "dollar-sign", > + "percent-sign", > + "ampersand", > + "apostrophe", > + "left-parenthesis", > + "right-parenthesis", > + "asterisk", > + "plus-sign", > + "comma", > + "hyphen", > + "period", > + "slash", > + "zero", > + "one", > + "two", > + "three", > + "four", > + "five", > + "six", > + "seven", > + "eight", > + "nine", > + "colon", > + "semicolon", > + "less-than-sign", > + "equals-sign", > + "greater-than-sign", > + "question-mark", > + "commercial-at", > + "A", > + "B", > + "C", > + "D", > + "E", > + "F", > + "G", > + "H", > + "I", > + "J", > + "K", > + "L", > + "M", > + "N", > + "O", > + "P", > + "Q", > + "R", > + "S", > + "T", > + "U", > + "V", > + "W", > + "X", > + "Y", > + "Z", > + "left-square-bracket", > + "backslash", > + "right-square-bracket", > + "circumflex", > + "underscore", > + "grave-accent", > + "a", > + "b", > + "c", > + "d", > + "e", > + "f", > + "g", > + "h", > + "i", > + "j", > + "k", > + "l", > + "m", > + "n", > + "o", > + "p", > + "q", > + "r", > + "s", > + "t", > + "u", > + "v", > + "w", > + "x", > + "y", > + "z", > + "left-curly-bracket", > + "vertical-line", > + "right-curly-bracket", > + "tilde", > + "DEL", > + }; > + > + for (const auto& __it : __collatenames) > + if (__name == __it) > + { > + __name.assign(1, static_cast<char>(&__it - __collatenames)); > + return; > + } > + > + __name.clear(); > + } > + > /// @endcond > } // namespace __detail > > +#pragma GCC diagnostic push > +#pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr > + > template<typename _Ch_type> > template<typename _Fwd_iter> > typename regex_traits<_Ch_type>::string_type > regex_traits<_Ch_type>:: > lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const > { > - typedef std::ctype<char_type> __ctype_type; > - const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale)); > - > - static const char* const __collatenames[] = > - { > - "NUL", > - "SOH", > - "STX", > - "ETX", > - "EOT", > - "ENQ", > - "ACK", > - "alert", > - "backspace", > - "tab", > - "newline", > - "vertical-tab", > - "form-feed", > - "carriage-return", > - "SO", > - "SI", > - "DLE", > - "DC1", > - "DC2", > - "DC3", > - "DC4", > - "NAK", > - "SYN", > - "ETB", > - "CAN", > - "EM", > - "SUB", > - "ESC", > - "IS4", > - "IS3", > - "IS2", > - "IS1", > - "space", > - "exclamation-mark", > - "quotation-mark", > - "number-sign", > - "dollar-sign", > - "percent-sign", > - "ampersand", > - "apostrophe", > - "left-parenthesis", > - "right-parenthesis", > - "asterisk", > - "plus-sign", > - "comma", > - "hyphen", > - "period", > - "slash", > - "zero", > - "one", > - "two", > - "three", > - "four", > - "five", > - "six", > - "seven", > - "eight", > - "nine", > - "colon", > - "semicolon", > - "less-than-sign", > - "equals-sign", > - "greater-than-sign", > - "question-mark", > - "commercial-at", > - "A", > - "B", > - "C", > - "D", > - "E", > - "F", > - "G", > - "H", > - "I", > - "J", > - "K", > - "L", > - "M", > - "N", > - "O", > - "P", > - "Q", > - "R", > - "S", > - "T", > - "U", > - "V", > - "W", > - "X", > - "Y", > - "Z", > - "left-square-bracket", > - "backslash", > - "right-square-bracket", > - "circumflex", > - "underscore", > - "grave-accent", > - "a", > - "b", > - "c", > - "d", > - "e", > - "f", > - "g", > - "h", > - "i", > - "j", > - "k", > - "l", > - "m", > - "n", > - "o", > - "p", > - "q", > - "r", > - "s", > - "t", > - "u", > - "v", > - "w", > - "x", > - "y", > - "z", > - "left-curly-bracket", > - "vertical-line", > - "right-curly-bracket", > - "tilde", > - "DEL", > - }; > - > - string __s; > - for (; __first != __last; ++__first) > - __s += __fctyp.narrow(*__first, 0); > - > - for (const auto& __it : __collatenames) > - if (__s == __it) > - return string_type(1, __fctyp.widen( > - static_cast<char>(&__it - __collatenames))); > - > // TODO Add digraph support: > // http://boost.sourceforge.net/libs/regex/doc/collating_names.html > > - return string_type(); > + if constexpr (is_same<char_type, char>::value) > + { > + string __s(__first, __last); > + __detail::__lookup_collatename(__s); > + return __s; > + } > + else > + { > + typedef std::ctype<char_type> __ctype_type; > + const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale)); > + > + string __s; > + for (; __first != __last; ++__first) > + __s += __fctyp.narrow(*__first, 0); > + __detail::__lookup_collatename(__s); > + if (__s.empty()) > + return string_type(); > + else > + return string_type(1, __fctyp.widen(__s[0])); > + } > } > > template<typename _Ch_type> > @@ -271,43 +295,97 @@ namespace __detail > regex_traits<_Ch_type>:: > lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) > const > { > + if constexpr (__is_any_random_access_iter<_Fwd_iter>::value) > + if ((__last - __first) > 6) [[__unlikely__]] > + return {}; // "xdigit" is the longest classname > + > typedef std::ctype<char_type> __ctype_type; > const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale)); > > - // Mappings from class name to class mask. > - static const pair<const char*, char_class_type> __classnames[] = > - { > - {"d", ctype_base::digit}, > - {"w", {ctype_base::alnum, _RegexMask::_S_under}}, > - {"s", ctype_base::space}, > - {"alnum", ctype_base::alnum}, > - {"alpha", ctype_base::alpha}, > - {"blank", ctype_base::blank}, > - {"cntrl", ctype_base::cntrl}, > - {"digit", ctype_base::digit}, > - {"graph", ctype_base::graph}, > - {"lower", ctype_base::lower}, > - {"print", ctype_base::print}, > - {"punct", ctype_base::punct}, > - {"space", ctype_base::space}, > - {"upper", ctype_base::upper}, > - {"xdigit", ctype_base::xdigit}, > + auto __read_ch = [&]() -> char { > + if (__first == __last) > + return '\0'; > + char __c = __fctyp.narrow(__fctyp.tolower(*__first), 0); > + ++__first; > + return __c; > }; > > - string __s; > - for (; __first != __last; ++__first) > - __s += __fctyp.narrow(__fctyp.tolower(*__first), 0); > + auto __match = [&](const char* __s) -> bool { > + do > + if (__read_ch() != *__s) > + return false; > + while (*++__s); > + return __first == __last; > + }; > > - for (const auto& __it : __classnames) > - if (__s == __it.first) > + switch(__read_ch()) > + { > + case 'a': > + if (__read_ch() == 'l') > + switch (__read_ch()) > + { > + case 'n': > + if (__match("um")) // "alnum" > + return ctype_base::alnum; > + break; > + case 'p': > + if (__match("ha")) // "alpha" > + return ctype_base::alpha; > + break; > + } > + break; > + case 'b': > + if (__match("lank")) // "blank" > + return ctype_base::blank; > + break; > + case 'c': > + if (__match("ntrl")) // "cntrl" > + return ctype_base::cntrl; > + break; > + case 'd': > + if (__first == __last || __match("igit")) // "d" or "digit" > + return ctype_base::digit; > + break; > + case 'g': > + if (__match("raph")) // "graph" > + return ctype_base::graph; > + break; > + case 'l': > + if (__match("ower")) // "lower" > + return __icase ? ctype_base::alpha : ctype_base::lower; > + break; > + case 'p': > + switch (__read_ch()) > { > - if (__icase > - && (__it.second._M_base == ctype_base::lower > - || __it.second._M_base == ctype_base::upper)) > - return ctype_base::alpha; > - return __it.second; > + case 'r': > + if (__match("int")) // "print" > + return ctype_base::print; > + break; > + case 'u': > + if (__match("nct")) // "punct" > + return ctype_base::punct; > + break; > } > - return 0; > + break; > + case 's': > + if (__first == __last || __match("pace")) // "s" or "space" > + return ctype_base::space; > + break; > + case 'u': > + if (__match("pper")) // "upper" > + return __icase ? ctype_base::alpha : ctype_base::upper; > + break; > + case 'w': > + if (__first == __last) // "w" > + return {ctype_base::alnum, char_class_type::_S_under}; > + break; > + case 'x': > + if (__match("digit")) // "xdigit" > + return ctype_base::xdigit; > + break; > + } > + > + return {}; > } > > template<typename _Ch_type> > @@ -324,8 +402,6 @@ namespace __detail > && __c == __fctyp.widen('_')); > } > > -#pragma GCC diagnostic push > -#pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr > template<typename _Ch_type> > int > regex_traits<_Ch_type>:: > -- > 2.52.0 > >
