On Sat, Apr 26, 2014 at 1:00 PM, Jonathan Wakely <jwak...@redhat.com> wrote: > Maybe a dumb question (I don't understand a lot of the regex code!) > but is it correct to set this to 1 in the case where > __rep_count.first != _M_current ? Could that result in the count > going downwards from 2 to 1? (Maybe that's correct?)
As I said in the comment, __rep_count records how many times (__rep_count.second) this node is visited *under certain input iterator* (__rep_count.first). That is to say, if the current iterator (_M_current) is not changing, but the executor keeps visiting this node, times of visit needs to be recorded. Once the _M_current changed, the count should be reset to 0. We simply set it to 1 for the "set to 0 then increase by 1" operations. The idea behind this is that, for example, regex "(?:)*" will form a self-epsilon-circle in the NFA graph. While it's executed, an infinite loop may be entered because this circle doesn't consumes any input (aka, the current iterator is not changing). So for given node, we need to prevent from entering infinite loop by checking the iterator since last visited. If the iterator didn't change, we shall refuse to follow the edge-on-the-circle. There's one more question: we shouldn't simply refuse to follow the edge that last visited under the same iterator, because visiting some node (begin capture, end capture) on the circle has side effects. To make sure these side effects happen, one more time of reentering is permitted. That is why we set the counter and permitted entering the node for at most 2 times. About the overall regex code, I think I can compose a blog entry to explain it, if necessary? -- Regards, Tim Shen
commit b188832f9bd85c71ef70693859e721974163c24b Author: tim <timshe...@gmail.com> Date: Fri Apr 25 01:50:11 2014 -0400 2014-04-25 Tim Shen <timshe...@gmail.com> * include/bits/regex_executor.h: Add _M_rep_count to track how many times this repeat node are visited. * include/bits/regex_executor.tcc (_Executor<>::_M_rep_once_more, _Executor<>::_M_dfs): Use _M_rep_count to prevent entering infinite loop. diff --git a/libstdc++-v3/include/bits/regex_executor.h b/libstdc++-v3/include/bits/regex_executor.h index 708c78e..c110b88 100644 --- a/libstdc++-v3/include/bits/regex_executor.h +++ b/libstdc++-v3/include/bits/regex_executor.h @@ -73,6 +73,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _M_results(__results), _M_match_queue(__dfs_mode ? nullptr : new vector<pair<_StateIdT, _ResultsVec>>()), + _M_rep_count(_M_nfa.size()), _M_visited(__dfs_mode ? nullptr : new vector<bool>(_M_nfa.size())), _M_flags((__flags & regex_constants::match_prev_avail) ? (__flags @@ -104,6 +105,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION private: template<bool __match_mode> void + _M_rep_once_more(_StateIdT); + + template<bool __match_mode> + void _M_dfs(_StateIdT __start); template<bool __match_mode> @@ -149,9 +154,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION _ResultsVec& _M_results; // Used in BFS, saving states that need to be considered for the next // character. - std::unique_ptr<vector<pair<_StateIdT, _ResultsVec>>> _M_match_queue; + unique_ptr<vector<pair<_StateIdT, _ResultsVec>>> _M_match_queue; // Used in BFS, indicating that which state is already visited. - std::unique_ptr<vector<bool>> _M_visited; + vector<pair<_BiIter, int>> _M_rep_count; + unique_ptr<vector<bool>> _M_visited; _FlagT _M_flags; // To record current solution. _StateIdT _M_start_state; diff --git a/libstdc++-v3/include/bits/regex_executor.tcc b/libstdc++-v3/include/bits/regex_executor.tcc index 68a5e04..0daf211 100644 --- a/libstdc++-v3/include/bits/regex_executor.tcc +++ b/libstdc++-v3/include/bits/regex_executor.tcc @@ -161,6 +161,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION return false; } + // __rep_count records how many times (__rep_count.second) + // this node is visited under certain input iterator + // (__rep_count.first). This prevent the executor from entering + // infinite loop by refusing to continue when it's already been + // visited more than twice. It's `twice` instead of `once` because + // we need to spare one more time for potential group capture. + template<typename _BiIter, typename _Alloc, typename _TraitsT, + bool __dfs_mode> + template<bool __match_mode> + void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>:: + _M_rep_once_more(_StateIdT __i) + { + const auto& __state = _M_nfa[__i]; + auto& __rep_count = _M_rep_count[__i]; + if (__rep_count.second == 0 || __rep_count.first != _M_current) + { + auto __back = __rep_count; + __rep_count.first = _M_current; + __rep_count.second = 1; + _M_dfs<__match_mode>(__state._M_alt); + __rep_count = __back; + } + else + { + if (__rep_count.second < 2) + { + __rep_count.second++; + _M_dfs<__match_mode>(__state._M_alt); + __rep_count.second--; + } + } + }; + // TODO: Use a function vector to dispatch, instead of using switch-case. template<typename _BiIter, typename _Alloc, typename _TraitsT, bool __dfs_mode> @@ -185,68 +218,60 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION // mean the same thing, and we need to choose the correct order under // given greedy mode. case _S_opcode_alternative: - // Greedy. - if (!__state._M_neg) - { - // "Once more" is preferred in greedy mode. - _M_dfs<__match_mode>(__state._M_alt); - // If it's DFS executor and already accepted, we're done. - if (!__dfs_mode || !_M_has_sol) - _M_dfs<__match_mode>(__state._M_next); - } - else // Non-greedy mode - { - if (__dfs_mode) - { - // vice-versa. + { + // Greedy. + if (!__state._M_neg) + { + _M_rep_once_more<__match_mode>(__i); + // If it's DFS executor and already accepted, we're done. + if (!__dfs_mode || !_M_has_sol) _M_dfs<__match_mode>(__state._M_next); - if (!_M_has_sol) - _M_dfs<__match_mode>(__state._M_alt); - } - else - { - // DON'T attempt anything, because there's already another - // state with higher priority accepted. This state cannot be - // better by attempting its next node. - if (!_M_has_sol) - { - _M_dfs<__match_mode>(__state._M_next); - // DON'T attempt anything if it's already accepted. An - // accepted state *must* be better than a solution that - // matches a non-greedy quantifier one more time. - if (!_M_has_sol) - _M_dfs<__match_mode>(__state._M_alt); - } - } + } + else // Non-greedy mode + { + if (__dfs_mode) + { + // vice-versa. + _M_dfs<__match_mode>(__state._M_next); + if (!_M_has_sol) + _M_rep_once_more<__match_mode>(__i); + } + else + { + // DON'T attempt anything, because there's already another + // state with higher priority accepted. This state cannot be + // better by attempting its next node. + if (!_M_has_sol) + { + _M_dfs<__match_mode>(__state._M_next); + // DON'T attempt anything if it's already accepted. An + // accepted state *must* be better than a solution that + // matches a non-greedy quantifier one more time. + if (!_M_has_sol) + _M_rep_once_more<__match_mode>(__i); + } + } + } } break; case _S_opcode_subexpr_begin: - // If there's nothing changed since last visit, do NOT continue. - // This prevents the executor from get into infinite loop when using - // "()*" to match "". - if (!_M_cur_results[__state._M_subexpr].matched - || _M_cur_results[__state._M_subexpr].first != _M_current) - { - auto& __res = _M_cur_results[__state._M_subexpr]; - auto __back = __res.first; - __res.first = _M_current; - _M_dfs<__match_mode>(__state._M_next); - __res.first = __back; - } + { + auto& __res = _M_cur_results[__state._M_subexpr]; + auto __back = __res.first; + __res.first = _M_current; + _M_dfs<__match_mode>(__state._M_next); + __res.first = __back; + } break; case _S_opcode_subexpr_end: - if (_M_cur_results[__state._M_subexpr].second != _M_current - || _M_cur_results[__state._M_subexpr].matched != true) - { - auto& __res = _M_cur_results[__state._M_subexpr]; - auto __back = __res; - __res.second = _M_current; - __res.matched = true; - _M_dfs<__match_mode>(__state._M_next); - __res = __back; - } - else + { + auto& __res = _M_cur_results[__state._M_subexpr]; + auto __back = __res; + __res.second = _M_current; + __res.matched = true; _M_dfs<__match_mode>(__state._M_next); + __res = __back; + } break; case _S_opcode_line_begin_assertion: if (_M_at_begin()) diff --git a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc index 3fdbdad..c33d1b6 100644 --- a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc +++ b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc @@ -50,6 +50,7 @@ test01() const char s[] = ""; VERIFY( regex_match_debug(s, m, re) ); } + VERIFY(regex_match_debug("", regex("(?:)*"))); } int