Optimize std::regex_traits::lookup_collatename and
std::regex_traits::lookup_classname.
For lookup_collatename we can hoist the static array into a
non-dependent function, then call that for the regex_traits<char>
specialization without performing any narrowing operations via the ctype
facet. For the regex_traits<wchar_t> specialization we populate a
std::string for the narrowed result, and call the non-dependent function.
For lookup_classname we can avoid the static array entirely, replacing
the iteration over that array with a nested switch that implements a
kind of manually-unrolled trie, only match one char at a time until we
either match a full string or get a mismatch. This avoids narrowing the
entire input and storing it in a temporary string. This improves
performance by 2-3x for -O2 and below (the benefit is much smaller for
-O3).
We can also check the input length for random access iterators, and
reject any strings longer than "xdigit" without doing any work at all.
For silly cases like [:xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx:]
this gives a 100x improvement.
libstdc++-v3/ChangeLog:
* include/bits/regex.tcc (__detail::__lookup_collatename): New
function.
(regex_traits::lookup_collatename): Use new function. Elide
redundant narrowing via ctype facet for regex_traits<char>.
(regex_traits::lookup_classname): Replace lookup table with
handwritten prefix match.
---
This does not need to be backported to release branches.
Tested x86_64-linux.
libstdc++-v3/include/bits/regex.tcc | 428 ++++++++++++++++------------
1 file changed, 252 insertions(+), 176 deletions(-)
diff --git a/libstdc++-v3/include/bits/regex.tcc
b/libstdc++-v3/include/bits/regex.tcc
index 71eadd20bfdb..63e4cf79910c 100644
--- a/libstdc++-v3/include/bits/regex.tcc
+++ b/libstdc++-v3/include/bits/regex.tcc
@@ -113,163 +113,187 @@ namespace __detail
}
return __ret;
}
+
+ inline void
+ __lookup_collatename(string& __name) noexcept
+ {
+ static const char* const __collatenames[] =
+ {
+ "NUL",
+ "SOH",
+ "STX",
+ "ETX",
+ "EOT",
+ "ENQ",
+ "ACK",
+ "alert",
+ "backspace",
+ "tab",
+ "newline",
+ "vertical-tab",
+ "form-feed",
+ "carriage-return",
+ "SO",
+ "SI",
+ "DLE",
+ "DC1",
+ "DC2",
+ "DC3",
+ "DC4",
+ "NAK",
+ "SYN",
+ "ETB",
+ "CAN",
+ "EM",
+ "SUB",
+ "ESC",
+ "IS4",
+ "IS3",
+ "IS2",
+ "IS1",
+ "space",
+ "exclamation-mark",
+ "quotation-mark",
+ "number-sign",
+ "dollar-sign",
+ "percent-sign",
+ "ampersand",
+ "apostrophe",
+ "left-parenthesis",
+ "right-parenthesis",
+ "asterisk",
+ "plus-sign",
+ "comma",
+ "hyphen",
+ "period",
+ "slash",
+ "zero",
+ "one",
+ "two",
+ "three",
+ "four",
+ "five",
+ "six",
+ "seven",
+ "eight",
+ "nine",
+ "colon",
+ "semicolon",
+ "less-than-sign",
+ "equals-sign",
+ "greater-than-sign",
+ "question-mark",
+ "commercial-at",
+ "A",
+ "B",
+ "C",
+ "D",
+ "E",
+ "F",
+ "G",
+ "H",
+ "I",
+ "J",
+ "K",
+ "L",
+ "M",
+ "N",
+ "O",
+ "P",
+ "Q",
+ "R",
+ "S",
+ "T",
+ "U",
+ "V",
+ "W",
+ "X",
+ "Y",
+ "Z",
+ "left-square-bracket",
+ "backslash",
+ "right-square-bracket",
+ "circumflex",
+ "underscore",
+ "grave-accent",
+ "a",
+ "b",
+ "c",
+ "d",
+ "e",
+ "f",
+ "g",
+ "h",
+ "i",
+ "j",
+ "k",
+ "l",
+ "m",
+ "n",
+ "o",
+ "p",
+ "q",
+ "r",
+ "s",
+ "t",
+ "u",
+ "v",
+ "w",
+ "x",
+ "y",
+ "z",
+ "left-curly-bracket",
+ "vertical-line",
+ "right-curly-bracket",
+ "tilde",
+ "DEL",
+ };
+
+ for (const auto& __it : __collatenames)
+ if (__name == __it)
+ {
+ __name.assign(1, static_cast<char>(&__it - __collatenames));
+ return;
+ }
+
+ __name.clear();
+ }
+
/// @endcond
} // namespace __detail
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr
+
template<typename _Ch_type>
template<typename _Fwd_iter>
typename regex_traits<_Ch_type>::string_type
regex_traits<_Ch_type>::
lookup_collatename(_Fwd_iter __first, _Fwd_iter __last) const
{
- typedef std::ctype<char_type> __ctype_type;
- const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
-
- static const char* const __collatenames[] =
- {
- "NUL",
- "SOH",
- "STX",
- "ETX",
- "EOT",
- "ENQ",
- "ACK",
- "alert",
- "backspace",
- "tab",
- "newline",
- "vertical-tab",
- "form-feed",
- "carriage-return",
- "SO",
- "SI",
- "DLE",
- "DC1",
- "DC2",
- "DC3",
- "DC4",
- "NAK",
- "SYN",
- "ETB",
- "CAN",
- "EM",
- "SUB",
- "ESC",
- "IS4",
- "IS3",
- "IS2",
- "IS1",
- "space",
- "exclamation-mark",
- "quotation-mark",
- "number-sign",
- "dollar-sign",
- "percent-sign",
- "ampersand",
- "apostrophe",
- "left-parenthesis",
- "right-parenthesis",
- "asterisk",
- "plus-sign",
- "comma",
- "hyphen",
- "period",
- "slash",
- "zero",
- "one",
- "two",
- "three",
- "four",
- "five",
- "six",
- "seven",
- "eight",
- "nine",
- "colon",
- "semicolon",
- "less-than-sign",
- "equals-sign",
- "greater-than-sign",
- "question-mark",
- "commercial-at",
- "A",
- "B",
- "C",
- "D",
- "E",
- "F",
- "G",
- "H",
- "I",
- "J",
- "K",
- "L",
- "M",
- "N",
- "O",
- "P",
- "Q",
- "R",
- "S",
- "T",
- "U",
- "V",
- "W",
- "X",
- "Y",
- "Z",
- "left-square-bracket",
- "backslash",
- "right-square-bracket",
- "circumflex",
- "underscore",
- "grave-accent",
- "a",
- "b",
- "c",
- "d",
- "e",
- "f",
- "g",
- "h",
- "i",
- "j",
- "k",
- "l",
- "m",
- "n",
- "o",
- "p",
- "q",
- "r",
- "s",
- "t",
- "u",
- "v",
- "w",
- "x",
- "y",
- "z",
- "left-curly-bracket",
- "vertical-line",
- "right-curly-bracket",
- "tilde",
- "DEL",
- };
-
- string __s;
- for (; __first != __last; ++__first)
- __s += __fctyp.narrow(*__first, 0);
-
- for (const auto& __it : __collatenames)
- if (__s == __it)
- return string_type(1, __fctyp.widen(
- static_cast<char>(&__it - __collatenames)));
-
// TODO Add digraph support:
// http://boost.sourceforge.net/libs/regex/doc/collating_names.html
- return string_type();
+ if constexpr (is_same<char_type, char>::value)
+ {
+ string __s(__first, __last);
+ __detail::__lookup_collatename(__s);
+ return __s;
+ }
+ else
+ {
+ typedef std::ctype<char_type> __ctype_type;
+ const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
+
+ string __s;
+ for (; __first != __last; ++__first)
+ __s += __fctyp.narrow(*__first, 0);
+ __detail::__lookup_collatename(__s);
+ if (__s.empty())
+ return string_type();
+ else
+ return string_type(1, __fctyp.widen(__s[0]));
+ }
}
template<typename _Ch_type>
@@ -278,43 +302,97 @@ namespace __detail
regex_traits<_Ch_type>::
lookup_classname(_Fwd_iter __first, _Fwd_iter __last, bool __icase) const
{
+ if constexpr (__is_any_random_access_iter<_Fwd_iter>::value)
+ if ((__last - __first) > 6) [[__unlikely__]]
+ return {}; // "xdigit" is the longest classname
+
typedef std::ctype<char_type> __ctype_type;
const __ctype_type& __fctyp(use_facet<__ctype_type>(_M_locale));
- // Mappings from class name to class mask.
- static const pair<const char*, char_class_type> __classnames[] =
- {
- {"d", ctype_base::digit},
- {"w", {ctype_base::alnum, _RegexMask::_S_under}},
- {"s", ctype_base::space},
- {"alnum", ctype_base::alnum},
- {"alpha", ctype_base::alpha},
- {"blank", ctype_base::blank},
- {"cntrl", ctype_base::cntrl},
- {"digit", ctype_base::digit},
- {"graph", ctype_base::graph},
- {"lower", ctype_base::lower},
- {"print", ctype_base::print},
- {"punct", ctype_base::punct},
- {"space", ctype_base::space},
- {"upper", ctype_base::upper},
- {"xdigit", ctype_base::xdigit},
+ auto __read_ch = [&]() -> char {
+ if (__first == __last)
+ return '\0';
+ char __c = __fctyp.narrow(__fctyp.tolower(*__first), 0);
+ ++__first;
+ return __c;
};
- string __s;
- for (; __first != __last; ++__first)
- __s += __fctyp.narrow(__fctyp.tolower(*__first), 0);
+ auto __match = [&](const char* __s) -> bool {
+ do
+ if (__read_ch() != *__s)
+ return false;
+ while (*++__s);
+ return __first == __last;
+ };
- for (const auto& __it : __classnames)
- if (__s == __it.first)
+ switch(__read_ch())
+ {
+ case 'a':
+ if (__read_ch() == 'l')
+ switch (__read_ch())
+ {
+ case 'n':
+ if (__match("um"))
+ return ctype_base::alnum;
+ break;
+ case 'p':
+ if (__match("ha"))
+ return ctype_base::alpha;
+ break;
+ }
+ break;
+ case 'b':
+ if (__match("lank"))
+ return ctype_base::blank;
+ break;
+ case 'c':
+ if (__match("ntrl"))
+ return ctype_base::cntrl;
+ break;
+ case 'd':
+ if (__first == __last || __match("igit"))
+ return ctype_base::digit;
+ break;
+ case 'g':
+ if (__match("raph"))
+ return ctype_base::graph;
+ break;
+ case 'l':
+ if (__match("ower"))
+ return __icase ? ctype_base::alpha : ctype_base::lower;
+ break;
+ case 'p':
+ switch (__read_ch())
{
- if (__icase
- && (__it.second._M_base == ctype_base::lower
- || __it.second._M_base == ctype_base::upper))
- return ctype_base::alpha;
- return __it.second;
+ case 'r':
+ if (__match("int"))
+ return ctype_base::print;
+ break;
+ case 'u':
+ if (__match("nct"))
+ return ctype_base::punct;
+ break;
}
- return 0;
+ break;
+ case 's':
+ if (__first == __last || __match("pace"))
+ return ctype_base::space;
+ break;
+ case 'u':
+ if (__match("pper"))
+ return __icase ? ctype_base::alpha : ctype_base::upper;
+ break;
+ case 'w':
+ if (__first == __last)
+ return {ctype_base::alnum, char_class_type::_S_under};
+ break;
+ case 'x':
+ if (__match("digit"))
+ return ctype_base::xdigit;
+ break;
+ }
+
+ return {};
}
template<typename _Ch_type>
@@ -331,8 +409,6 @@ namespace __detail
&& __c == __fctyp.widen('_'));
}
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wc++17-extensions" // if constexpr
template<typename _Ch_type>
int
regex_traits<_Ch_type>::
--
2.52.0