On Sat, Apr 26, 2014 at 1:00 PM, Jonathan Wakely <jwak...@redhat.com> wrote:
> Maybe a dumb question (I don't understand a lot of the regex code!)
> but is it correct to set this to 1 in the case where
> __rep_count.first != _M_current ?  Could that result in the count
> going downwards from 2 to 1? (Maybe that's correct?)

As I said in the comment, __rep_count records how many times
(__rep_count.second) this node is visited *under certain input
iterator* (__rep_count.first).

That is to say, if the current iterator (_M_current) is not changing,
but the executor keeps visiting this node, times of visit needs to be
recorded. Once the _M_current changed, the count should be reset to 0.
We simply set it to 1 for the "set to 0 then increase by 1"
operations.

The idea behind this is that, for example, regex "(?:)*" will form a
self-epsilon-circle in the NFA graph. While it's executed, an infinite
loop may be entered because this circle doesn't consumes any input
(aka, the current iterator is not changing). So for given node, we
need to prevent from entering infinite loop by checking the iterator
since last visited. If the iterator didn't change, we shall refuse to
follow the edge-on-the-circle.

There's one more question: we shouldn't simply refuse to follow the
edge that last visited under the same iterator, because visiting some
node (begin capture, end capture) on the circle has side effects. To
make sure these side effects happen, one more time of reentering is
permitted. That is why we set the counter and permitted entering the
node for at most 2 times.

About the overall regex code, I think I can compose a blog entry to
explain it, if necessary?


-- 
Regards,
Tim Shen
commit b188832f9bd85c71ef70693859e721974163c24b
Author: tim <timshe...@gmail.com>
Date:   Fri Apr 25 01:50:11 2014 -0400

    2014-04-25  Tim Shen  <timshe...@gmail.com>
    
        * include/bits/regex_executor.h: Add _M_rep_count to track how
        many times this repeat node are visited.
        * include/bits/regex_executor.tcc (_Executor<>::_M_rep_once_more,
        _Executor<>::_M_dfs): Use _M_rep_count to prevent entering
        infinite loop.

diff --git a/libstdc++-v3/include/bits/regex_executor.h 
b/libstdc++-v3/include/bits/regex_executor.h
index 708c78e..c110b88 100644
--- a/libstdc++-v3/include/bits/regex_executor.h
+++ b/libstdc++-v3/include/bits/regex_executor.h
@@ -73,6 +73,7 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       _M_results(__results),
       _M_match_queue(__dfs_mode ? nullptr
                     : new vector<pair<_StateIdT, _ResultsVec>>()),
+      _M_rep_count(_M_nfa.size()),
       _M_visited(__dfs_mode ? nullptr : new vector<bool>(_M_nfa.size())),
       _M_flags((__flags & regex_constants::match_prev_avail)
               ? (__flags
@@ -104,6 +105,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
     private:
       template<bool __match_mode>
        void
+       _M_rep_once_more(_StateIdT);
+
+      template<bool __match_mode>
+       void
        _M_dfs(_StateIdT __start);
 
       template<bool __match_mode>
@@ -149,9 +154,10 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       _ResultsVec&                                          _M_results;
       // Used in BFS, saving states that need to be considered for the next
       // character.
-      std::unique_ptr<vector<pair<_StateIdT, _ResultsVec>>> _M_match_queue;
+      unique_ptr<vector<pair<_StateIdT, _ResultsVec>>>      _M_match_queue;
       // Used in BFS, indicating that which state is already visited.
-      std::unique_ptr<vector<bool>>                         _M_visited;
+      vector<pair<_BiIter, int>>                            _M_rep_count;
+      unique_ptr<vector<bool>>                              _M_visited;
       _FlagT                                                _M_flags;
       // To record current solution.
       _StateIdT                                             _M_start_state;
diff --git a/libstdc++-v3/include/bits/regex_executor.tcc 
b/libstdc++-v3/include/bits/regex_executor.tcc
index 68a5e04..0daf211 100644
--- a/libstdc++-v3/include/bits/regex_executor.tcc
+++ b/libstdc++-v3/include/bits/regex_executor.tcc
@@ -161,6 +161,39 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
       return false;
     }
 
+  // __rep_count records how many times (__rep_count.second)
+  // this node is visited under certain input iterator
+  // (__rep_count.first). This prevent the executor from entering
+  // infinite loop by refusing to continue when it's already been
+  // visited more than twice. It's `twice` instead of `once` because
+  // we need to spare one more time for potential group capture.
+  template<typename _BiIter, typename _Alloc, typename _TraitsT,
+    bool __dfs_mode>
+  template<bool __match_mode>
+    void _Executor<_BiIter, _Alloc, _TraitsT, __dfs_mode>::
+    _M_rep_once_more(_StateIdT __i)
+    {
+      const auto& __state = _M_nfa[__i];
+      auto& __rep_count = _M_rep_count[__i];
+      if (__rep_count.second == 0 || __rep_count.first != _M_current)
+       {
+         auto __back = __rep_count;
+         __rep_count.first = _M_current;
+         __rep_count.second = 1;
+         _M_dfs<__match_mode>(__state._M_alt);
+         __rep_count = __back;
+       }
+      else
+       {
+         if (__rep_count.second < 2)
+           {
+             __rep_count.second++;
+             _M_dfs<__match_mode>(__state._M_alt);
+             __rep_count.second--;
+           }
+       }
+    };
+
   // TODO: Use a function vector to dispatch, instead of using switch-case.
   template<typename _BiIter, typename _Alloc, typename _TraitsT,
     bool __dfs_mode>
@@ -185,68 +218,60 @@ _GLIBCXX_BEGIN_NAMESPACE_VERSION
        // mean the same thing, and we need to choose the correct order under
        // given greedy mode.
        case _S_opcode_alternative:
-         // Greedy.
-         if (!__state._M_neg)
-           {
-             // "Once more" is preferred in greedy mode.
-             _M_dfs<__match_mode>(__state._M_alt);
-             // If it's DFS executor and already accepted, we're done.
-             if (!__dfs_mode || !_M_has_sol)
-               _M_dfs<__match_mode>(__state._M_next);
-           }
-         else // Non-greedy mode
-           {
-             if (__dfs_mode)
-               {
-                 // vice-versa.
+         {
+           // Greedy.
+           if (!__state._M_neg)
+             {
+               _M_rep_once_more<__match_mode>(__i);
+               // If it's DFS executor and already accepted, we're done.
+               if (!__dfs_mode || !_M_has_sol)
                  _M_dfs<__match_mode>(__state._M_next);
-                 if (!_M_has_sol)
-                   _M_dfs<__match_mode>(__state._M_alt);
-               }
-             else
-               {
-                 // DON'T attempt anything, because there's already another
-                 // state with higher priority accepted. This state cannot be
-                 // better by attempting its next node.
-                 if (!_M_has_sol)
-                   {
-                     _M_dfs<__match_mode>(__state._M_next);
-                     // DON'T attempt anything if it's already accepted. An
-                     // accepted state *must* be better than a solution that
-                     // matches a non-greedy quantifier one more time.
-                     if (!_M_has_sol)
-                       _M_dfs<__match_mode>(__state._M_alt);
-                   }
-               }
+             }
+           else // Non-greedy mode
+             {
+               if (__dfs_mode)
+                 {
+                   // vice-versa.
+                   _M_dfs<__match_mode>(__state._M_next);
+                   if (!_M_has_sol)
+                     _M_rep_once_more<__match_mode>(__i);
+                 }
+               else
+                 {
+                   // DON'T attempt anything, because there's already another
+                   // state with higher priority accepted. This state cannot be
+                   // better by attempting its next node.
+                   if (!_M_has_sol)
+                     {
+                       _M_dfs<__match_mode>(__state._M_next);
+                       // DON'T attempt anything if it's already accepted. An
+                       // accepted state *must* be better than a solution that
+                       // matches a non-greedy quantifier one more time.
+                       if (!_M_has_sol)
+                         _M_rep_once_more<__match_mode>(__i);
+                     }
+                 }
+             }
            }
          break;
        case _S_opcode_subexpr_begin:
-         // If there's nothing changed since last visit, do NOT continue.
-         // This prevents the executor from get into infinite loop when using
-         // "()*" to match "".
-         if (!_M_cur_results[__state._M_subexpr].matched
-             || _M_cur_results[__state._M_subexpr].first != _M_current)
-           {
-             auto& __res = _M_cur_results[__state._M_subexpr];
-             auto __back = __res.first;
-             __res.first = _M_current;
-             _M_dfs<__match_mode>(__state._M_next);
-             __res.first = __back;
-           }
+         {
+           auto& __res = _M_cur_results[__state._M_subexpr];
+           auto __back = __res.first;
+           __res.first = _M_current;
+           _M_dfs<__match_mode>(__state._M_next);
+           __res.first = __back;
+         }
          break;
        case _S_opcode_subexpr_end:
-         if (_M_cur_results[__state._M_subexpr].second != _M_current
-             || _M_cur_results[__state._M_subexpr].matched != true)
-           {
-             auto& __res = _M_cur_results[__state._M_subexpr];
-             auto __back = __res;
-             __res.second = _M_current;
-             __res.matched = true;
-             _M_dfs<__match_mode>(__state._M_next);
-             __res = __back;
-           }
-         else
+         {
+           auto& __res = _M_cur_results[__state._M_subexpr];
+           auto __back = __res;
+           __res.second = _M_current;
+           __res.matched = true;
            _M_dfs<__match_mode>(__state._M_next);
+           __res = __back;
+         }
          break;
        case _S_opcode_line_begin_assertion:
          if (_M_at_begin())
diff --git 
a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
 
b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
index 3fdbdad..c33d1b6 100644
--- 
a/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
+++ 
b/libstdc++-v3/testsuite/28_regex/algorithms/regex_match/ecma/char/emptygroup.cc
@@ -50,6 +50,7 @@ test01()
     const char s[] = "";
     VERIFY( regex_match_debug(s, m, re) );
   }
+  VERIFY(regex_match_debug("", regex("(?:)*")));
 }
 
 int

Reply via email to