Index: include/bits/regex.h
===================================================================
--- include/bits/regex.h	(revision 201824)
+++ include/bits/regex.h	(working copy)
@@ -2211,7 +2211,8 @@
     {
       if (__re._M_automaton == nullptr)
         return false;
-      if (__detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match())
+      __detail::__get_executor(__s, __e, __m, __re, __flags)->_M_match();
+      if (__m.size() > 0 && __m[0].matched)
         {
           for (auto __it : __m)
             if (!__it.matched)
@@ -2371,8 +2372,10 @@
       if (__re._M_automaton == nullptr)
         return false;
       for (auto __cur = __first; __cur != __last; ++__cur) // Any KMP-like algo?
-        if (__detail::__get_executor(__cur, __last, __m, __re, __flags)
-              ->_M_search_from_first())
+        {
+          __detail::__get_executor(__cur, __last, __m, __re, __flags)
+            ->_M_search_from_first();
+          if (__m.size() > 0 && __m[0].matched)
           {
             for (auto __it : __m)
               if (!__it.matched)
@@ -2387,6 +2390,7 @@
               (__m.suffix().first != __m.suffix().second);
             return true;
           }
+        }
       return false;
     }
 
Index: include/bits/regex_executor.h
===================================================================
--- include/bits/regex_executor.h	(revision 201825)
+++ include/bits/regex_executor.h	(working copy)
@@ -28,6 +28,8 @@
  *  Do not attempt to use it directly. @headername{regex}
  */
 
+// TODO: convert comments to doxygen form.
+
 namespace std _GLIBCXX_VISIBILITY(default)
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
@@ -34,6 +36,8 @@
   template<typename, typename>
     class basic_regex;
 
+  template<typename>
+    class sub_match;
   template<typename, typename>
     class match_results;
 _GLIBCXX_END_NAMESPACE_VERSION
@@ -52,8 +56,9 @@
     class _Executor
     {
     public:
-      typedef match_results<_BiIter, _Alloc>   _ResultsT;
-      typedef regex_constants::match_flag_type _FlagT;
+      typedef match_results<_BiIter, _Alloc>          _ResultsT;
+      typedef std::vector<sub_match<_BiIter>, _Alloc> _ResultsVec;
+      typedef regex_constants::match_flag_type        _FlagT;
 
       virtual
       ~_Executor()
@@ -60,11 +65,11 @@
       { }
 
       // Set matched when string exactly match the pattern.
-      virtual bool
+      virtual void
       _M_match() = 0;
 
       // Set matched when some prefix of the string matches the pattern.
-      virtual bool
+      virtual void
       _M_search_from_first() = 0;
 
     protected:
@@ -74,20 +79,34 @@
                 _ResultsT& __results,
                 _FlagT     __flags,
                 _SizeT     __size)
-      : _M_current(__begin), _M_end(__end),
-        _M_results(__results), _M_flags(__flags)
+      : _M_current(__begin), _M_end(__end), _M_results(__results),
+        _M_flags(__flags)
       {
-        __results.resize(__size + 2);
-        for (auto __it : __results)
-          __it.matched = false;
+        __size += 2;
+        _M_results.resize(__size);
+        for (auto __i = 0; __i < __size; __i++)
+          _M_results[__i].matched = false;
       }
 
-      _BiIter    _M_current;
-      _BiIter    _M_end;
-      _ResultsT& _M_results;
-      _FlagT     _M_flags;
+      _BiIter       _M_current;
+      _BiIter       _M_end;
+      _ResultsVec&  _M_results;
+      _FlagT        _M_flags;
     };
 
+  // A _DFSExecutor performs a DFS on the given NFA and input string. At the
+  // very beginning the executor stands in the start state, then it tries every
+  // possible state transition from the current state recursively. Some state
+  // transitions consume the input string, say, a single-char-matcher or a
+  // back-reference matcher; some do not, like assertion or other anchor nodes.
+  // When the input is exhausted and the current state is an accepting state,
+  // the whole executor returns true.
+  //
+  // TODO: This approach is exponentially slow for certain input.
+  //       Try to compile the NFA to a DFA.
+  //
+  // Time complexity: exponential
+  // Space complexity: O(__end - __begin)
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
     class _DFSExecutor
@@ -97,6 +116,7 @@
       typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
       typedef _NFA<_CharT, _TraitsT>                       _RegexT;
       typedef typename _BaseT::_ResultsT                   _ResultsT;
+      typedef typename _BaseT::_ResultsVec                 _ResultsVec;
       typedef regex_constants::match_flag_type             _FlagT;
 
       _DFSExecutor(_BiIter        __begin,
@@ -105,16 +125,16 @@
                    const _RegexT& __nfa,
                    _FlagT         __flags)
       : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
-        _M_traits(_TraitsT()), _M_nfa(__nfa)
+        _M_traits(_TraitsT()), _M_nfa(__nfa), _M_results_ret(this->_M_results)
       { }
 
-      bool
+      void
       _M_match()
-      { return _M_dfs<true>(_M_nfa._M_start()); }
+      { _M_dfs<true>(_M_nfa._M_start()); }
 
-      bool
+      void
       _M_search_from_first()
-      { return _M_dfs<false>(_M_nfa._M_start()); }
+      { _M_dfs<false>(_M_nfa._M_start()); }
 
     private:
       template<bool __match_mode>
@@ -121,21 +141,23 @@
         bool
         _M_dfs(_StateIdT __i);
 
+      _ResultsVec    _M_results_ret;
       _TraitsT       _M_traits;
       const _RegexT& _M_nfa;
     };
 
-  // It's essentially a variant of Single-Source-Shortest-Path problem, where,
-  // the matching results is the final distance and should be minimized.
-  // Instead of using Dijkstra Algorithm, I pick up the queue-optimizaed
-  // (BFS-like) Bellman-Ford algorithm,
-  // SPFA(http://en.wikipedia.org/wiki/Shortest_Path_Faster_Algorithm).
+  // Like the DFS approach, it tries every possible state transition; unlike
+  // DFS, it uses a queue instead of a stack to store matching states. It's a
+  // BFS approach.
   //
+  // Russ Cox's article (http://swtch.com/~rsc/regexp/regexp1.html) explains
+  // this algorithm clearly.
+  //
   // Every entry of _M_covered saves the solution(grouping status) for every
-  // matching head. When states transfer, solutions will be compared and
+  // matching head. On a state transition, solutions will be compared and
   // deduplicated(based on which greedy mode we have).
   //
-  // Time complexity: O(_M_str_cur.size() * _M_nfa.size())
+  // Time complexity: O((__end - __begin) * _M_nfa.size())
   // Space complexity: O(_M_nfa.size() * _M_nfa.mark_count())
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
@@ -146,12 +168,13 @@
       typedef _Executor<_BiIter, _Alloc, _CharT, _TraitsT> _BaseT;
       typedef _NFA<_CharT, _TraitsT>                       _RegexT;
       typedef typename _BaseT::_ResultsT                   _ResultsT;
-      typedef std::unique_ptr<_ResultsT>                   _ResultsPtr;
+      typedef typename _BaseT::_ResultsVec                 _ResultsVec;
+      typedef std::unique_ptr<_ResultsVec>                 _ResultsPtr;
       typedef regex_constants::match_flag_type             _FlagT;
 
       _BFSExecutor(_BiIter        __begin,
                    _BiIter        __end,
-                   _ResultsT&      __results,
+                   _ResultsT&     __results,
                    const _RegexT& __nfa,
                    _FlagT         __flags)
       : _BaseT(__begin, __end, __results, __flags, __nfa._M_sub_count()),
@@ -159,21 +182,21 @@
       {
         if (_M_nfa._M_start() != _S_invalid_state_id)
           _M_covered[_M_nfa._M_start()] =
-            _ResultsPtr(new _ResultsT(this->_M_results));
+            _ResultsPtr(new _ResultsVec(this->_M_results));
         _M_e_closure();
       }
 
-      bool
+      void
       _M_match()
-      { return _M_main_loop<true>(); }
+      { _M_main_loop<true>(); }
 
-      bool
+      void
       _M_search_from_first()
-      { return _M_main_loop<false>(); }
+      { _M_main_loop<false>(); }
 
     private:
       template<bool __match_mode>
-        bool
+        void
         _M_main_loop();
 
       void
@@ -183,13 +206,13 @@
       _M_move();
 
       bool
-      _M_match_less_than(_StateIdT __u, _StateIdT __v) const;
+      _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const;
 
       bool
       _M_includes_some() const;
 
       std::map<_StateIdT, _ResultsPtr>     _M_covered;
-      const _RegexT& _M_nfa;
+      const _RegexT&                       _M_nfa;
     };
 
  //@} regex-detail
Index: include/bits/regex_executor.tcc
===================================================================
--- include/bits/regex_executor.tcc	(revision 201825)
+++ include/bits/regex_executor.tcc	(working copy)
@@ -34,7 +34,6 @@
 {
 _GLIBCXX_BEGIN_NAMESPACE_VERSION
 
-  // TODO: This is too slow. Try to compile the NFA to a DFA.
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
   template<bool __match_mode>
@@ -41,12 +40,12 @@
     bool _DFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
     _M_dfs(_StateIdT __i)
     {
-      auto& __current = this->_M_current;
-      auto& __end = this->_M_end;
-      auto& __results = this->_M_results;
       if (__i == _S_invalid_state_id)
         // This is not that certain. Need deeper investigate.
         return false;
+      auto& __current = this->_M_current;
+      auto& __end = this->_M_end;
+      auto& __results = _M_results_ret;
       const auto& __state = _M_nfa[__i];
       bool __ret = false;
       switch (__state._M_opcode)
@@ -59,14 +58,27 @@
             || _M_dfs<__match_mode>(__state._M_next);
           break;
         case _S_opcode_subexpr_begin:
-          __results.at(__state._M_subexpr).first = __current;
-          __ret = _M_dfs<__match_mode>(__state._M_next);
+          if (!__results[__state._M_subexpr].matched
+              || __results[__state._M_subexpr].first != __current)
+            {
+              auto __back = __current;
+              __results[__state._M_subexpr].first = __current;
+              __ret = _M_dfs<__match_mode>(__state._M_next);
+              __results[__state._M_subexpr].first = __back;
+            }
           break;
         case _S_opcode_subexpr_end:
-          __results.at(__state._M_subexpr).second = __current;
-          __results.at(__state._M_subexpr).matched = true;
-          __ret = _M_dfs<__match_mode>(__state._M_next);
-          __results.at(__state._M_subexpr).matched = __ret;
+          if (__results[__state._M_subexpr].second != __current
+              || __results[__state._M_subexpr].matched != true)
+            {
+              auto __back = __results[__state._M_subexpr];
+              __results[__state._M_subexpr].second = __current;
+              __results[__state._M_subexpr].matched = true;
+              __ret = _M_dfs<__match_mode>(__state._M_next);
+              __results[__state._M_subexpr] = __back;
+            }
+          else
+            __ret = _M_dfs<__match_mode>(__state._M_next);
           break;
         case _S_opcode_match:
           if (__current != __end && __state._M_matches(*__current))
@@ -82,7 +94,7 @@
         // If matched, keep going; else just return to try another state.
         case _S_opcode_backref:
           {
-            auto& __submatch = __results.at(__state._M_backref_index);
+            auto& __submatch = __results[__state._M_backref_index];
             if (!__submatch.matched)
               break;
             auto __last = __current;
@@ -92,12 +104,15 @@
               ++__last;
             if (_M_traits.transform(__submatch.first, __submatch.second)
                 == _M_traits.transform(__current, __last))
-              {
-                auto __backup = __current;
-                __current = __last;
+              if (__last != __current)
+                {
+                  auto __backup = __current;
+                  __current = __last;
+                  __ret = _M_dfs<__match_mode>(__state._M_next);
+                  __current = __backup;
+                }
+              else
                 __ret = _M_dfs<__match_mode>(__state._M_next);
-                __current = __backup;
-              }
           }
           break;
         case _S_opcode_accept:
@@ -105,6 +120,8 @@
             __ret = __current == __end;
           else
             __ret = true;
+          if (__ret)
+            this->_M_results = __results;
           break;
         default:
           _GLIBCXX_DEBUG_ASSERT(false);
@@ -115,7 +132,7 @@
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
   template<bool __match_mode>
-    bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
+    void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
     _M_main_loop()
     {
       while (this->_M_current != this->_M_end)
@@ -122,15 +139,14 @@
         {
           if (!__match_mode)
             if (_M_includes_some())
-              return true;
+              return;  // Search mode: _M_includes_some() succeeded, stop.
           _M_move();
           ++this->_M_current;
           _M_e_closure();
         }
-      return _M_includes_some();
+      _M_includes_some();  // Result is no longer returned to the caller.
     }
 
-  // The SPFA approach.
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
     void _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
@@ -152,13 +168,17 @@
           const auto& __state = _M_nfa[__u];
 
           // Can be implemented using method, but there're too much arguments.
+          // Before C++11 I would have used a function-like macro, but a lambda
+          // is a better choice, since hopefully the compiler can inline it.
           auto __add_visited_state = [&](_StateIdT __v)
           {
             if (__v == _S_invalid_state_id)
               return;
-            if (_M_match_less_than(__u, __v))
+            if (_M_covered.count(__u) != 0
+                && (_M_covered.count(__v) == 0
+                    || _M_match_less_than(*_M_covered[__u], *_M_covered[__v])))
               {
-                _M_covered[__v] = _ResultsPtr(new _ResultsT(*_M_covered[__u]));
+                _M_covered[__v] = _ResultsPtr(new _ResultsVec(*_M_covered[__u]));
                 // if a state is updated, it's outgoing neighbors should be
                 // reconsidered too. Push them to the queue.
                 if (!__in_q[__v])
@@ -176,13 +196,21 @@
                 __add_visited_state(__state._M_alt);
                 break;
               case _S_opcode_subexpr_begin:
-                _M_covered[__u]->at(__state._M_subexpr).first = __current;
-                __add_visited_state(__state._M_next);
+                {
+                  auto __back = _M_covered[__u]->at(__state._M_subexpr).first;
+                  _M_covered[__u]->at(__state._M_subexpr).first = __current;
+                  __add_visited_state(__state._M_next);
+                  _M_covered[__u]->at(__state._M_subexpr).first = __back;
+                }
                 break;
               case _S_opcode_subexpr_end:
-                _M_covered[__u]->at(__state._M_subexpr).second = __current;
-                _M_covered[__u]->at(__state._M_subexpr).matched = true;
-                __add_visited_state(__state._M_next);
+                {
+                  auto __back = _M_covered[__u]->at(__state._M_subexpr);
+                  _M_covered[__u]->at(__state._M_subexpr).second = __current;
+                  _M_covered[__u]->at(__state._M_subexpr).matched = true;
+                  __add_visited_state(__state._M_next);
+                  _M_covered[__u]->at(__state._M_subexpr) = __back;
+                }
                 break;
               case _S_opcode_match:
                 break;
@@ -206,9 +234,10 @@
           const auto& __state = _M_nfa[__it.first];
           if (__state._M_opcode == _S_opcode_match
               && __state._M_matches(*this->_M_current))
-            if (_M_match_less_than(__it.first, __state._M_next)
-                && __state._M_next != _S_invalid_state_id)
-              __next[__state._M_next] = move(__it.second);
+            if (__state._M_next != _S_invalid_state_id)
+              if (__next.count(__state._M_next) == 0
+                  || _M_match_less_than(*__it.second, *__next[__state._M_next]))
+                __next[__state._M_next] = move(__it.second);
         }
       _M_covered = move(__next);
     }
@@ -216,14 +245,28 @@
   template<typename _BiIter, typename _Alloc,
     typename _CharT, typename _TraitsT>
     bool _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT>::
-    _M_match_less_than(_StateIdT __u, _StateIdT __v) const
+    _M_match_less_than(const _ResultsVec& __u, const _ResultsVec& __v) const
     {
-      if (_M_covered.count(__u) == 0)
-        return false;
-      if (_M_covered.count(__v) > 0)
-        return true;
+      // Return true iff __u orders strictly before __v. Entries are compared
+      // by index and the first entry that differs decides; equal -> false.
       // TODO: Greedy and Non-greedy support
-      return true;
+      _GLIBCXX_DEBUG_ASSERT(__u.size() == __v.size());
+      for (decltype(__u.size()) __i = 0; __i < __u.size(); ++__i)
+        {
+          const auto& __uit = __u[__i];
+          const auto& __vit = __v[__i];
+          if (__uit.matched != __vit.matched)
+            return __uit.matched;  // A matched entry beats an unmatched one.
+          if (!__uit.matched)
+            continue;
+          // GREEDY: the earlier start wins, then the later end.
+          // NOTE(review): < and > assume random-access _BiIter -- confirm.
+          if (__uit.first != __vit.first)
+            return __uit.first < __vit.first;
+          if (__uit.second != __vit.second)
+            return __uit.second > __vit.second;
+        }
+      return false;
     }
 
   template<typename _BiIter, typename _Alloc,
@@ -265,11 +308,12 @@
       typedef std::unique_ptr<_Executor<_BiIter, _Alloc, _CharT, _TraitsT>>
         _ExecutorPtr;
       typedef _DFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT> _DFSExecutorT;
+      typedef _BFSExecutor<_BiIter, _Alloc, _CharT, _TraitsT> _BFSExecutorT;
       auto __p = std::static_pointer_cast<_NFA<_CharT, _TraitsT>>
         (__re._M_automaton);
       if (__p->_M_has_backref)
         return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags));
-      return _ExecutorPtr(new _DFSExecutorT(__b, __e, __m, *__p, __flags));
+      return _ExecutorPtr(new _BFSExecutorT(__b, __e, __m, *__p, __flags));
     }
 
 _GLIBCXX_END_NAMESPACE_VERSION
Index: testsuite/28_regex/algorithms/regex_match/ecma/cstring_emptygroup.cc
===================================================================
--- testsuite/28_regex/algorithms/regex_match/ecma/cstring_emptygroup.cc	(revision 0)
+++ testsuite/28_regex/algorithms/regex_match/ecma/cstring_emptygroup.cc	(working copy)
@@ -0,0 +1,58 @@
+// { dg-options "-std=gnu++11" }
+
+//
+// 2013-08-08  Tim Shen <timshen91@gmail.com>
+//
+// Copyright (C) 2013 Free Software Foundation, Inc.
+//
+// This file is part of the GNU ISO C++ Library.  This library is free
+// software; you can redistribute it and/or modify it under the
+// terms of the GNU General Public License as published by the
+// Free Software Foundation; either version 3, or (at your option)
+// any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License along
+// with this library; see the file COPYING3.  If not see
+// <http://www.gnu.org/licenses/>.
+
+// 28.11.2 regex_match
+// Tests ECMAScript empty-grouping against a C-string.
+
+#include <regex>
+#include <testsuite_hooks.h>
+
+using namespace std;
+
+// Check that an empty capture group under '*' matches the empty string.
+void
+test01()
+{
+  bool test __attribute__((unused)) = true;
+  const char empty_input[] = "";  // Both patterns are matched against "".
+
+  { // Empty group under '*', followed by a back-reference to that group.
+    const regex backref_pattern("()*\\1");
+    cmatch found;
+    VERIFY( regex_match(empty_input, found, backref_pattern) );
+    VERIFY( found.size() == 2 );
+    VERIFY( found[0].matched );
+    VERIFY( found[1].matched );
+  }
+  { // The same empty group without the back-reference.
+    const regex group_pattern("()*");
+    cmatch found;
+    VERIFY( regex_match(empty_input, found, group_pattern) );
+  }
+}
+
+int
+main()
+{
+  test01();  // Run the empty-group regex_match checks.
+  return 0;
+}
